Ajout type contrat
This commit is contained in:
@@ -0,0 +1,222 @@
|
||||
"""
|
||||
Tests for the Apache Iceberg format.

Tests in this file use a simple Iceberg catalog based on SQLite, with the same
data used for Parquet tests (``pandas/tests/io/data/parquet/simple.parquet``).
|
||||
"""
|
||||
|
||||
import collections
|
||||
import importlib
|
||||
import pathlib
|
||||
|
||||
import pytest
|
||||
|
||||
import pandas as pd
|
||||
import pandas._testing as tm
|
||||
|
||||
from pandas.io.iceberg import read_iceberg
|
||||
|
||||
# Module-level ``pytestmark``: pytest applies this mark to every test below.
pytestmark = pytest.mark.single_cpu

# Optional dependencies: each ``importorskip`` skips the whole module when the
# named package cannot be imported, and otherwise returns the imported module.
pyiceberg = pytest.importorskip("pyiceberg")
pyiceberg_catalog = pytest.importorskip("pyiceberg.catalog")
pq = pytest.importorskip("pyarrow.parquet")

# Lightweight record describing the catalog that the ``catalog`` fixture builds.
Catalog = collections.namedtuple(
    "Catalog",
    field_names=("name", "uri", "warehouse"),
)
|
||||
|
||||
|
||||
@pytest.fixture
def catalog(request, tmp_path):
    """
    Create a temporary SQLite-backed Iceberg catalog containing ``ns.my_table``.

    The table is filled with the rows of ``data/parquet/simple.parquet``.
    When the fixture is parametrized (``indirect=True``), the parameter is used
    as the catalog name and a ``~/.pyiceberg.yaml`` file describing the catalog
    is written for the duration of the test. A configuration file that already
    exists there is saved first and put back afterwards.

    Yields
    ------
    Catalog
        Named tuple with the catalog ``name`` ("default" when the fixture is
        not parametrized), its SQLite ``uri`` and its ``warehouse`` location.
    """
    # the catalog stores the full path of data files, so the catalog needs to be
    # created dynamically, and not saved in pandas/tests/io/data as other formats
    uri = f"sqlite:///{tmp_path}/catalog.sqlite"
    warehouse = f"file://{tmp_path}"
    catalog_name = getattr(request, "param", None)

    catalog = pyiceberg_catalog.load_catalog(
        catalog_name or "default",
        type="sql",
        uri=uri,
        warehouse=warehouse,
    )
    catalog.create_namespace("ns")

    df = pq.read_table(
        pathlib.Path(__file__).parent / "data" / "parquet" / "simple.parquet"
    )
    table = catalog.create_table("ns.my_table", schema=df.schema)
    table.append(df)

    uses_config_file = catalog_name is not None
    config_path = pathlib.Path.home() / ".pyiceberg.yaml"
    previous_config = None

    if uses_config_file:
        # Keep whatever configuration the user already has, so that running
        # the test suite does not destroy it.
        try:
            previous_config = config_path.read_bytes()
        except FileNotFoundError:
            previous_config = None

        config_path.write_text(
            "catalog:\n"
            f"  {catalog_name}:\n"
            "    type: sql\n"
            f"    uri: {uri}\n"
            f"    warehouse: {warehouse}",
            encoding="utf-8",
        )

    try:
        if uses_config_file:
            importlib.reload(pyiceberg_catalog)  # needed to reload the config file

        yield Catalog(name=catalog_name or "default", uri=uri, warehouse=warehouse)
    finally:
        # Runs on normal teardown and also when the reload above fails, so the
        # home directory is always left as it was found.
        if uses_config_file:
            if previous_config is None:
                config_path.unlink()
            else:
                config_path.write_bytes(previous_config)
|
||||
|
||||
|
||||
class TestIceberg:
    """
    Read and write tests for the pandas Iceberg I/O functions.

    Every test receives the ``catalog`` fixture and uses its ``name``, ``uri``
    and ``warehouse`` attributes. The read tests expect the catalog to hold a
    table named ``ns.my_table``.
    """

    @staticmethod
    def _simple_frame(values=(1, 2, 3)):
        """Return a frame with ``values`` in column A and "foo" in column B."""
        values = list(values)
        return pd.DataFrame({"A": values, "B": ["foo"] * len(values)})

    @staticmethod
    def _by_uri(catalog):
        """Return a fresh ``catalog_properties`` mapping pointing at ``catalog``."""
        return {"uri": catalog.uri}

    def test_read(self, catalog):
        """Reading without options returns the three rows of the table."""
        got = read_iceberg(
            "ns.my_table",
            catalog_properties=self._by_uri(catalog),
        )
        tm.assert_frame_equal(got, self._simple_frame())

    @pytest.mark.parametrize("catalog", ["default", "pandas_tests"], indirect=True)
    def test_read_by_catalog_name(self, catalog):
        """The table can be read by passing only the catalog name."""
        got = read_iceberg("ns.my_table", catalog_name=catalog.name)
        tm.assert_frame_equal(got, self._simple_frame())

    def test_read_with_row_filter(self, catalog):
        """``row_filter="A > 1"`` keeps only the rows with A equal to 2 and 3."""
        got = read_iceberg(
            "ns.my_table",
            catalog_properties=self._by_uri(catalog),
            row_filter="A > 1",
        )
        tm.assert_frame_equal(got, self._simple_frame([2, 3]))

    def test_read_with_case_sensitive(self, catalog):
        """Column "a" matches "A" only when ``case_sensitive`` is False."""
        got = read_iceberg(
            "ns.my_table",
            catalog_properties=self._by_uri(catalog),
            columns=["a"],
            case_sensitive=False,
        )
        tm.assert_frame_equal(got, pd.DataFrame({"A": [1, 2, 3]}))

        with pytest.raises(ValueError, match="^Could not find column"):
            read_iceberg(
                "ns.my_table",
                catalog_properties=self._by_uri(catalog),
                columns=["a"],
                case_sensitive=True,
            )

    def test_read_with_limit(self, catalog):
        """``limit=2`` returns the first two rows."""
        got = read_iceberg(
            "ns.my_table",
            catalog_properties=self._by_uri(catalog),
            limit=2,
        )
        tm.assert_frame_equal(got, self._simple_frame([1, 2]))

    def test_write(self, catalog):
        """A frame written to a new table is read back unchanged."""
        frame = self._simple_frame()
        frame.to_iceberg(
            "ns.new_table",
            catalog_properties=self._by_uri(catalog),
            location=catalog.warehouse,
        )
        got = read_iceberg(
            "ns.new_table",
            catalog_properties=self._by_uri(catalog),
        )
        tm.assert_frame_equal(got, frame)

    @pytest.mark.parametrize("catalog", ["default", "pandas_tests"], indirect=True)
    def test_write_by_catalog_name(self, catalog):
        """Writing and reading back work when only the catalog name is given."""
        frame = self._simple_frame()
        frame.to_iceberg("ns.new_table", catalog_name=catalog.name)
        got = read_iceberg("ns.new_table", catalog_name=catalog.name)
        tm.assert_frame_equal(got, frame)

    def test_write_existing_table_with_append_true(self, catalog):
        """``append=True`` adds the new rows after the existing ones."""
        # The current content has to be captured before the write below.
        before = read_iceberg(
            "ns.my_table",
            catalog_properties=self._by_uri(catalog),
        )
        extra = self._simple_frame()
        want = pd.concat([before, extra], ignore_index=True)

        extra.to_iceberg(
            "ns.my_table",
            catalog_properties=self._by_uri(catalog),
            location=catalog.warehouse,
            append=True,
        )
        after = read_iceberg(
            "ns.my_table",
            catalog_properties=self._by_uri(catalog),
        )
        tm.assert_frame_equal(after, want)

    def test_write_existing_table_with_append_false(self, catalog):
        """``append=False`` leaves the table holding only the written frame."""
        frame = self._simple_frame()
        frame.to_iceberg(
            "ns.my_table",
            catalog_properties=self._by_uri(catalog),
            location=catalog.warehouse,
            append=False,
        )
        got = read_iceberg(
            "ns.my_table",
            catalog_properties=self._by_uri(catalog),
        )
        tm.assert_frame_equal(got, frame)
|
||||
Reference in New Issue
Block a user