Files
sirh/venv/lib/python3.12/site-packages/pandas/tests/io/test_feather.py
2026-04-29 11:52:03 +02:00

292 lines
10 KiB
Python

"""test feather-format compat"""
from datetime import datetime
import zoneinfo
import numpy as np
import pytest
from pandas.compat.pyarrow import (
pa_version_under18p0,
pa_version_under19p0,
)
import pandas as pd
import pandas._testing as tm
from pandas.io.feather_format import read_feather, to_feather # isort:skip
# Module-wide pytest mark: for every test collected from this file, ignore
# DeprecationWarnings whose message starts with
# "Passing a BlockManager to DataFrame".
pytestmark = pytest.mark.filterwarnings(
"ignore:Passing a BlockManager to DataFrame:DeprecationWarning"
)
# Skip the whole module when pyarrow cannot be imported; otherwise bind the
# imported module to ``pa`` for use by the tests below.
pa = pytest.importorskip("pyarrow")
@pytest.mark.single_cpu
class TestFeather:
    """Round-trip and error-path tests for ``to_feather`` / ``read_feather``."""

    def check_error_on_write(self, df, exc, err_msg, temp_file):
        """Assert that writing ``df`` to ``temp_file`` raises ``exc``.

        Parameters
        ----------
        df : object
            Object handed to ``to_feather``.
        exc : type[Exception]
            Exception class expected from the write.
        err_msg : str
            Pattern forwarded to ``pytest.raises(match=...)``.
        temp_file : path-like
            Destination handed to ``to_feather``.
        """
        # check that we are raising the exception
        # on writing
        with pytest.raises(exc, match=err_msg):
            to_feather(df, temp_file)

    def check_external_error_on_write(self, df, temp_file):
        """Assert that writing ``df`` raises, using ``tm.external_error_raised``.

        Parameters
        ----------
        df : object
            Object handed to ``to_feather``.
        temp_file : path-like
            Destination handed to ``to_feather``.
        """
        # check that we are raising the exception
        # on writing
        with tm.external_error_raised(Exception):
            to_feather(df, temp_file)

    def check_round_trip(
        self, df, temp_file, expected=None, write_kwargs=None, **read_kwargs
    ):
        """Write ``df`` to ``temp_file``, read it back and compare.

        Parameters
        ----------
        df : DataFrame
            Frame to write.
        temp_file : path-like
            File used for both the write and the read.
        expected : DataFrame, optional
            Frame the read result must equal; defaults to a copy of ``df``.
        write_kwargs : dict, optional
            Extra keyword arguments for ``to_feather``; defaults to none.
        **read_kwargs
            Extra keyword arguments for ``read_feather``.
        """
        if write_kwargs is None:
            write_kwargs = {}
        if expected is None:
            expected = df.copy()

        to_feather(df, temp_file, **write_kwargs)
        result = read_feather(temp_file, **read_kwargs)
        tm.assert_frame_equal(result, expected)

    def test_error(self, temp_file):
        """Writing anything that is not a DataFrame raises ``ValueError``."""
        msg = "feather only support IO with DataFrames"
        for obj in [
            pd.Series([1, 2, 3]),
            1,
            "foo",
            pd.Timestamp("20130101"),
            np.array([1, 2, 3]),
        ]:
            self.check_error_on_write(obj, ValueError, msg, temp_file)

    def test_basic(self, temp_file):
        """Round-trip a frame that mixes many column dtypes."""
        tz = zoneinfo.ZoneInfo("US/Eastern")
        df = pd.DataFrame(
            {
                "string": list("abc"),
                "int": list(range(1, 4)),
                "uint": np.arange(3, 6).astype("u1"),
                "float": np.arange(4.0, 7.0, dtype="float64"),
                "float_with_null": [1.0, np.nan, 3],
                "bool": [True, False, True],
                "bool_with_null": [True, np.nan, False],
                "cat": pd.Categorical(list("abc")),
                "dt": pd.DatetimeIndex(
                    list(pd.date_range("20130101", periods=3)), freq=None
                ),
                "dttz": pd.DatetimeIndex(
                    list(pd.date_range("20130101", periods=3, tz=tz)),
                    freq=None,
                ),
                "dt_with_null": [
                    pd.Timestamp("20130101"),
                    pd.NaT,
                    pd.Timestamp("20130103"),
                ],
                "dtns": pd.DatetimeIndex(
                    list(pd.date_range("20130101", periods=3, freq="ns")), freq=None
                ),
            }
        )
        df["periods"] = pd.period_range("2013", freq="M", periods=3)
        df["timedeltas"] = pd.timedelta_range("1 day", periods=3)
        df["intervals"] = pd.interval_range(0, 3, 3)

        assert df.dttz.dtype.tz.key == "US/Eastern"

        # The missing value in "bool_with_null" is expected to be read back
        # as None rather than NaN.
        expected = df.copy()
        expected.loc[1, "bool_with_null"] = None
        self.check_round_trip(df, temp_file, expected=expected)

    def test_duplicate_columns(self, temp_file):
        """Writing a frame with duplicate column names raises."""
        # https://github.com/wesm/feather/issues/53
        # not currently able to handle duplicate columns
        df = pd.DataFrame(np.arange(12).reshape(4, 3), columns=list("aaa")).copy()
        self.check_external_error_on_write(df, temp_file)

    def test_read_columns(self, temp_file):
        """Reading with ``columns=`` returns only the requested columns."""
        # GH 24025
        df = pd.DataFrame(
            {
                "col1": list("abc"),
                "col2": list(range(1, 4)),
                "col3": list("xyz"),
                "col4": list(range(4, 7)),
            }
        )
        columns = ["col1", "col3"]
        self.check_round_trip(df, temp_file, expected=df[columns], columns=columns)

    def test_read_columns_different_order(self, temp_file):
        """Reading with ``columns=`` honours the requested column order."""
        # GH 33878
        df = pd.DataFrame({"A": [1, 2], "B": ["x", "y"], "C": [True, False]})
        expected = df[["B", "A"]]
        self.check_round_trip(df, temp_file, expected, columns=["B", "A"])

    def test_unsupported_other(self, temp_file):
        """Writing a column of mixed Python objects raises."""
        # mixed python objects
        df = pd.DataFrame({"a": ["a", 1, 2.0]})
        self.check_external_error_on_write(df, temp_file)

    def test_rw_use_threads(self, temp_file):
        """Round-trip reading with ``use_threads`` set to True and to False."""
        df = pd.DataFrame({"A": np.arange(100000)})
        self.check_round_trip(df, temp_file, use_threads=True)
        self.check_round_trip(df, temp_file, use_threads=False)

    def test_path_pathlib(self, temp_file):
        """Round-trip through ``tm.round_trip_pathlib`` with ``df.to_feather``."""
        df = pd.DataFrame(
            1.1 * np.arange(120).reshape((30, 4)),
            columns=pd.Index(list("ABCD")),
            index=pd.Index([f"i-{i}" for i in range(30)]),
        ).reset_index()
        result = tm.round_trip_pathlib(df.to_feather, read_feather, temp_file)
        tm.assert_frame_equal(df, result)

    def test_passthrough_keywords(self, temp_file):
        """Round-trip with ``version=1`` forwarded to the writer."""
        df = pd.DataFrame(
            1.1 * np.arange(120).reshape((30, 4)),
            columns=pd.Index(list("ABCD")),
            index=pd.Index([f"i-{i}" for i in range(30)]),
        ).reset_index()
        self.check_round_trip(df, temp_file, write_kwargs={"version": 1})

    @pytest.mark.network
    @pytest.mark.single_cpu
    def test_http_path(self, feather_file, httpserver):
        """Reading from an HTTP URL matches reading the same file from disk."""
        # GH 29055
        expected = read_feather(feather_file)
        with open(feather_file, "rb") as f:
            httpserver.serve_content(content=f.read())
        res = read_feather(httpserver.url)
        tm.assert_frame_equal(expected, res)

    def test_read_feather_dtype_backend(
        self, string_storage, dtype_backend, using_infer_string, temp_file
    ):
        """``read_feather(dtype_backend=...)`` returns the matching dtypes."""
        # GH#50765
        df = pd.DataFrame(
            {
                "a": pd.Series([1, pd.NA, 3], dtype="Int64"),
                "b": pd.Series([1, 2, 3], dtype="Int64"),
                "c": pd.Series([1.5, pd.NA, 2.5], dtype="Float64"),
                "d": pd.Series([1.5, 2.0, 2.5], dtype="Float64"),
                "e": [True, False, None],
                "f": [True, False, True],
                "g": ["a", "b", "c"],
                "h": ["a", "b", None],
            }
        )

        to_feather(df, temp_file)
        with pd.option_context("mode.string_storage", string_storage):
            result = read_feather(temp_file, dtype_backend=dtype_backend)

        # ``pa`` is the module-level pyarrow module; no local re-import needed.
        if dtype_backend == "pyarrow":
            if using_infer_string:
                string_dtype = pd.ArrowDtype(pa.large_string())
            else:
                string_dtype = pd.ArrowDtype(pa.string())
        else:
            string_dtype = pd.StringDtype(string_storage)

        expected = pd.DataFrame(
            {
                "a": pd.Series([1, pd.NA, 3], dtype="Int64"),
                "b": pd.Series([1, 2, 3], dtype="Int64"),
                "c": pd.Series([1.5, pd.NA, 2.5], dtype="Float64"),
                "d": pd.Series([1.5, 2.0, 2.5], dtype="Float64"),
                "e": pd.Series([True, False, pd.NA], dtype="boolean"),
                "f": pd.Series([True, False, True], dtype="boolean"),
                "g": pd.Series(["a", "b", "c"], dtype=string_dtype),
                "h": pd.Series(["a", "b", None], dtype=string_dtype),
            }
        )

        if dtype_backend == "pyarrow":
            from pandas.arrays import ArrowExtensionArray

            # Rebuild every expected column as an Arrow-backed array.
            expected = pd.DataFrame(
                {
                    col: ArrowExtensionArray(pa.array(expected[col], from_pandas=True))
                    for col in expected.columns
                }
            )

        if using_infer_string:
            expected.columns = expected.columns.astype(
                pd.StringDtype(string_storage, na_value=np.nan)
            )

        tm.assert_frame_equal(result, expected)

    def test_int_columns_and_index(self, temp_file):
        """Round-trip a frame that has a named integer index."""
        df = pd.DataFrame({"a": [1, 2, 3]}, index=pd.Index([3, 4, 5], name="test"))
        self.check_round_trip(df, temp_file)

    def test_invalid_dtype_backend(self, temp_file):
        """An unknown ``dtype_backend`` value raises ``ValueError``."""
        msg = (
            "dtype_backend numpy is invalid, only 'numpy_nullable' and "
            "'pyarrow' are allowed."
        )
        df = pd.DataFrame({"int": list(range(1, 4))})
        df.to_feather(temp_file)
        with pytest.raises(ValueError, match=msg):
            read_feather(temp_file, dtype_backend="numpy")

    def test_string_inference(self, temp_file, using_infer_string):
        """Reading under ``future.infer_string`` yields NaN-backed string dtype."""
        # GH#54431
        df = pd.DataFrame(data={"a": ["x", "y"]})
        df.to_feather(temp_file)
        with pd.option_context("future.infer_string", True):
            result = read_feather(temp_file)

        dtype = pd.StringDtype(na_value=np.nan)
        # The column Index is expected as object dtype only when pyarrow is
        # older than 19.0 and ``using_infer_string`` is false; otherwise it
        # carries the same string dtype as the data.
        expected = pd.DataFrame(
            data={"a": ["x", "y"]},
            dtype=dtype,
            columns=pd.Index(
                ["a"],
                dtype=object
                if pa_version_under19p0 and not using_infer_string
                else dtype,
            ),
        )
        tm.assert_frame_equal(result, expected)

    @pytest.mark.skipif(pa_version_under18p0, reason="not supported before 18.0")
    def test_string_inference_string_view_type(self, temp_file):
        """A pyarrow ``string_view`` column is read back as string dtype."""
        # GH#54798
        from pyarrow import feather

        # Written directly with pyarrow so the file holds a string_view column.
        table = pa.table({"a": pa.array([None, "b", "c"], pa.string_view())})
        feather.write_feather(table, temp_file)

        with pd.option_context("future.infer_string", True):
            result = read_feather(temp_file)
            # Built under the same option as the read.
            expected = pd.DataFrame(
                data={"a": [None, "b", "c"]}, dtype=pd.StringDtype(na_value=np.nan)
            )
        tm.assert_frame_equal(result, expected)

    def test_out_of_bounds_datetime_to_feather(self, temp_file):
        """Round-trip ``datetime`` values that include the year 1654."""
        # GH#47832
        df = pd.DataFrame(
            {
                "date": [
                    datetime.fromisoformat("1654-01-01"),
                    datetime.fromisoformat("1920-01-01"),
                ],
            }
        )
        self.check_round_trip(df, temp_file)