Skip to content

Commit 85b4777

Browse files
committed
pandas default backend fixes
1 parent 9d6edb6 commit 85b4777

39 files changed

+663
-951
lines changed

tests/conftest.py

Lines changed: 21 additions & 79 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
import warnings
33
from importlib import import_module
44
from pathlib import Path
5-
from typing import Any, Union
5+
from typing import Union
66

77
import pytest
88

@@ -19,13 +19,27 @@
1919
pandas = None
2020
pyarrow_dtype = None
2121

22-
# Check if pandas has arrow dtypes enabled
23-
try:
24-
from pandas.compat import pa_version_under7p0
2522

26-
pyarrow_dtypes_enabled = not pa_version_under7p0
27-
except ImportError:
28-
pyarrow_dtypes_enabled = False
23+
# Version-aware helpers for Pandas 2.x vs 3.0 compatibility
24+
def _get_pandas_ge_3():
    """Return True when the imported pandas belongs to the 3.x series or newer.

    Returns:
        bool: False when the module-level ``pandas`` is None (pandas could not
        be imported) or its major version is below 3; True otherwise.
    """
    if pandas is None:
        return False
    from packaging.version import Version

    # Compare the major release component instead of ">= Version('3.0.0')":
    # under PEP 440 ordering, pre-releases such as "3.0.0rc1" or
    # "3.0.0.dev0+123" sort *before* "3.0.0", so the full-version comparison
    # would classify 3.0 release candidates and nightlies as pandas 2.x.
    return Version(pandas.__version__).release[0] >= 3
30+
31+
32+
PANDAS_GE_3 = _get_pandas_ge_3()
33+
34+
35+
def is_string_dtype(dtype):
    """Check if a dtype is a string dtype (works across Pandas 2.x and 3.0).

    Delegates to ``pandas.api.types.is_string_dtype()``, which handles:
    - Pandas 2.x: object dtype for strings
    - Pandas 3.0+: str (StringDtype) for strings

    Args:
        dtype: The dtype (or dtype-like object) to inspect.

    Returns:
        Whatever ``pandas.api.types.is_string_dtype(dtype)`` returns.

    Raises:
        ImportError: If the module-level ``pandas`` is None, i.e. pandas
            could not be imported.
    """
    # The module-level ``pandas`` may be None when the import failed; fail
    # with an explicit message rather than an AttributeError on NoneType.
    if pandas is None:
        raise ImportError("pandas is required to use is_string_dtype()")
    return pandas.api.types.is_string_dtype(dtype)
2943

3044

3145
def import_pandas():
@@ -113,78 +127,6 @@ def pandas_supports_arrow_backend():
113127
return pandas_2_or_higher()
114128

115129

116-
def numpy_pandas_df(*args, **kwargs):
117-
return import_pandas().DataFrame(*args, **kwargs)
118-
119-
120-
def arrow_pandas_df(*args, **kwargs):
121-
df = numpy_pandas_df(*args, **kwargs)
122-
return df.convert_dtypes(dtype_backend="pyarrow")
123-
124-
125-
class NumpyPandas:
126-
def __init__(self) -> None:
127-
self.backend = "numpy_nullable"
128-
self.DataFrame = numpy_pandas_df
129-
self.pandas = import_pandas()
130-
131-
def __getattr__(self, name: str) -> Any: # noqa: ANN401
132-
return getattr(self.pandas, name)
133-
134-
135-
def convert_arrow_to_numpy_backend(df):
136-
names = df.columns
137-
df_content = {}
138-
for name in names:
139-
df_content[name] = df[name].array.__arrow_array__()
140-
# This should convert the pyarrow chunked arrays into numpy arrays
141-
return import_pandas().DataFrame(df_content)
142-
143-
144-
def convert_to_numpy(df):
145-
if (
146-
pyarrow_dtypes_enabled
147-
and pyarrow_dtype is not None
148-
and any(True for x in df.dtypes if isinstance(x, pyarrow_dtype))
149-
):
150-
return convert_arrow_to_numpy_backend(df)
151-
return df
152-
153-
154-
def convert_and_equal(df1, df2, **kwargs):
155-
df1 = convert_to_numpy(df1)
156-
df2 = convert_to_numpy(df2)
157-
import_pandas().testing.assert_frame_equal(df1, df2, **kwargs)
158-
159-
160-
class ArrowMockTesting:
161-
def __init__(self) -> None:
162-
self.testing = import_pandas().testing
163-
self.assert_frame_equal = convert_and_equal
164-
165-
def __getattr__(self, name: str) -> Any: # noqa: ANN401
166-
return getattr(self.testing, name)
167-
168-
169-
# This converts dataframes constructed with 'DataFrame(...)' to pyarrow backed dataframes
170-
# Assert equal does the opposite, turning all pyarrow backed dataframes into numpy backed ones
171-
# this is done because we don't produce pyarrow backed dataframes yet
172-
class ArrowPandas:
173-
def __init__(self) -> None:
174-
self.pandas = import_pandas()
175-
if pandas_2_or_higher() and pyarrow_dtypes_enabled:
176-
self.backend = "pyarrow"
177-
self.DataFrame = arrow_pandas_df
178-
else:
179-
# For backwards compatible reasons, just mock regular pandas
180-
self.backend = "numpy_nullable"
181-
self.DataFrame = self.pandas.DataFrame
182-
self.testing = ArrowMockTesting()
183-
184-
def __getattr__(self, name: str) -> Any: # noqa: ANN401
185-
return getattr(self.pandas, name)
186-
187-
188130
@pytest.fixture
189131
def require():
190132
def _require(extension_name, db_name="") -> Union[duckdb.DuckDBPyConnection, None]:

tests/coverage/test_pandas_categorical_coverage.py

Lines changed: 11 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,4 @@
1-
import pytest
2-
from conftest import NumpyPandas
1+
import pandas as pd
32

43
import duckdb
54

@@ -9,23 +8,23 @@ def check_result_list(res):
98
assert res_item[0] == res_item[1]
109

1110

12-
def check_create_table(category, pandas):
11+
def check_create_table(category):
1312
conn = duckdb.connect()
1413

1514
conn.execute("PRAGMA enable_verification")
16-
df_in = pandas.DataFrame(
15+
df_in = pd.DataFrame(
1716
{
18-
"x": pandas.Categorical(category, ordered=True),
19-
"y": pandas.Categorical(category, ordered=True),
17+
"x": pd.Categorical(category, ordered=True),
18+
"y": pd.Categorical(category, ordered=True),
2019
"z": category,
2120
}
2221
)
2322

2423
category.append("bla")
2524

26-
df_in_diff = pandas.DataFrame( # noqa: F841
25+
df_in_diff = pd.DataFrame( # noqa: F841
2726
{
28-
"k": pandas.Categorical(category, ordered=True),
27+
"k": pd.Categorical(category, ordered=True),
2928
}
3029
)
3130

@@ -68,14 +67,11 @@ def check_create_table(category, pandas):
6867
conn.execute("DROP TABLE t1")
6968

7069

71-
# TODO: extend tests with ArrowPandas # noqa: TD002, TD003
7270
class TestCategory:
73-
@pytest.mark.parametrize("pandas", [NumpyPandas()])
74-
def test_category_string_uint16(self, duckdb_cursor, pandas):
71+
def test_category_string_uint16(self, duckdb_cursor):
7572
category = [str(i) for i in range(300)]
76-
check_create_table(category, pandas)
73+
check_create_table(category)
7774

78-
@pytest.mark.parametrize("pandas", [NumpyPandas()])
79-
def test_category_string_uint32(self, duckdb_cursor, pandas):
75+
def test_category_string_uint32(self, duckdb_cursor):
8076
category = [str(i) for i in range(70000)]
81-
check_create_table(category, pandas)
77+
check_create_table(category)

tests/extensions/test_httpfs.py

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
import datetime
22
import os
33

4+
import pandas as pd
45
import pytest
5-
from conftest import ArrowPandas, NumpyPandas
66

77
import duckdb
88

@@ -34,8 +34,7 @@ def test_s3fs(self, require):
3434
res = rel.fetchone()
3535
assert res == (1, 0, datetime.date(1965, 2, 28), 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 6, 0, 0, 0, 0)
3636

37-
@pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
38-
def test_httpfs(self, require, pandas):
37+
def test_httpfs(self, require):
3938
connection = require("httpfs")
4039
try:
4140
connection.execute("""
@@ -51,14 +50,14 @@ def test_httpfs(self, require, pandas):
5150
raise
5251

5352
result_df = connection.fetchdf()
54-
exp_result = pandas.DataFrame(
53+
exp_result = pd.DataFrame(
5554
{
56-
"id": pandas.Series([1, 2, 3], dtype="int32"),
55+
"id": pd.Series([1, 2, 3], dtype="int32"),
5756
"first_name": ["Amanda", "Albert", "Evelyn"],
5857
"last_name": ["Jordan", "Freeman", "Morgan"],
5958
}
6059
)
61-
pandas.testing.assert_frame_equal(result_df, exp_result)
60+
pd.testing.assert_frame_equal(result_df, exp_result, check_dtype=False)
6261

6362
def test_http_exception(self, require):
6463
connection = require("httpfs")

tests/fast/api/test_3654.py

Lines changed: 5 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
import pytest
1+
import pandas as pd
22

33
import duckdb
44

@@ -8,13 +8,11 @@
88
can_run = True
99
except Exception:
1010
can_run = False
11-
from conftest import ArrowPandas, NumpyPandas
1211

1312

1413
class Test3654:
15-
@pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
16-
def test_3654_pandas(self, duckdb_cursor, pandas):
17-
df1 = pandas.DataFrame(
14+
def test_3654_pandas(self, duckdb_cursor):
15+
df1 = pd.DataFrame(
1816
{
1917
"id": [1, 1, 2],
2018
}
@@ -25,12 +23,11 @@ def test_3654_pandas(self, duckdb_cursor, pandas):
2523
print(rel.execute().fetchall())
2624
assert rel.execute().fetchall() == [(1,), (1,), (2,)]
2725

28-
@pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
29-
def test_3654_arrow(self, duckdb_cursor, pandas):
26+
def test_3654_arrow(self, duckdb_cursor):
3027
if not can_run:
3128
return
3229

33-
df1 = pandas.DataFrame(
30+
df1 = pd.DataFrame(
3431
{
3532
"id": [1, 1, 2],
3633
}

tests/fast/api/test_config.py

Lines changed: 9 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -2,37 +2,32 @@
22
import os
33
import re
44

5-
import pytest
6-
from conftest import ArrowPandas, NumpyPandas
5+
import pandas as pd
76

87
import duckdb
98

109

1110
class TestDBConfig:
12-
@pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
13-
def test_default_order(self, duckdb_cursor, pandas):
14-
df = pandas.DataFrame({"a": [1, 2, 3]})
11+
def test_default_order(self, duckdb_cursor):
12+
df = pd.DataFrame({"a": [1, 2, 3]})
1513
con = duckdb.connect(":memory:", config={"default_order": "desc"})
1614
result = con.execute("select * from df order by a").fetchall()
1715
assert result == [(3,), (2,), (1,)]
1816

19-
@pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
20-
def test_null_order(self, duckdb_cursor, pandas):
21-
df = pandas.DataFrame({"a": [1, 2, 3, None]})
17+
def test_null_order(self, duckdb_cursor):
18+
df = pd.DataFrame({"a": [1, 2, 3, None]})
2219
con = duckdb.connect(":memory:", config={"default_null_order": "nulls_last"})
2320
result = con.execute("select * from df order by a").fetchall()
2421
assert result == [(1,), (2,), (3,), (None,)]
2522

26-
@pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
27-
def test_multiple_options(self, duckdb_cursor, pandas):
28-
df = pandas.DataFrame({"a": [1, 2, 3, None]})
23+
def test_multiple_options(self, duckdb_cursor):
24+
df = pd.DataFrame({"a": [1, 2, 3, None]})
2925
con = duckdb.connect(":memory:", config={"default_null_order": "nulls_last", "default_order": "desc"})
3026
result = con.execute("select * from df order by a").fetchall()
3127
assert result == [(3,), (2,), (1,), (None,)]
3228

33-
@pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
34-
def test_external_access(self, duckdb_cursor, pandas):
35-
df = pandas.DataFrame({"a": [1, 2, 3]})
29+
def test_external_access(self, duckdb_cursor):
30+
df = pd.DataFrame({"a": [1, 2, 3]})
3631
# this works (replacement scan)
3732
con_regular = duckdb.connect(":memory:", config={})
3833
con_regular.execute("select * from df")

tests/fast/api/test_dbapi00.py

Lines changed: 10 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,8 @@
11
# simple DB API testcase
22

33
import numpy
4+
import pandas as pd
45
import pytest
5-
from conftest import ArrowPandas, NumpyPandas
66

77

88
def assert_result_equal(result):
@@ -83,30 +83,29 @@ def test_numpy_selection(self, duckdb_cursor, integers, timestamps):
8383
arr.mask = [False, False, True]
8484
numpy.testing.assert_array_equal(result["t"], arr, "Incorrect result returned")
8585

86-
@pytest.mark.parametrize("pandas", [NumpyPandas(), ArrowPandas()])
87-
def test_pandas_selection(self, duckdb_cursor, pandas, integers, timestamps):
86+
def test_pandas_selection(self, duckdb_cursor, integers, timestamps):
8887
import datetime
8988

9089
from packaging.version import Version
9190

9291
# Unclear exactly when this changed: pandas <= 2.0.3 produces dtype 'object' for this Series, while recent versions handle it
93-
if Version(pandas.__version__) <= Version("2.0.3"):
92+
if Version(pd.__version__) <= Version("2.0.3"):
9493
pytest.skip("The resulting dtype is 'object' when given a Series with dtype Int32DType")
9594

9695
duckdb_cursor.execute("SELECT * FROM integers")
9796
result = duckdb_cursor.fetchdf()
9897
array = numpy.ma.masked_array(numpy.arange(11))
9998
array.mask = [False] * 10 + [True]
100-
arr = {"i": pandas.Series(array.data, dtype=pandas.Int32Dtype)}
101-
arr["i"][array.mask] = pandas.NA
102-
arr = pandas.DataFrame(arr)
103-
pandas.testing.assert_frame_equal(result, arr)
99+
arr = {"i": pd.Series(array.data, dtype=pd.Int32Dtype)}
100+
arr["i"][array.mask] = pd.NA
101+
arr = pd.DataFrame(arr)
102+
pd.testing.assert_frame_equal(result, arr)
104103

105104
duckdb_cursor.execute("SELECT * FROM timestamps")
106105
result = duckdb_cursor.fetchdf()
107-
df = pandas.DataFrame(
106+
df = pd.DataFrame(
108107
{
109-
"t": pandas.Series(
108+
"t": pd.Series(
110109
data=[
111110
datetime.datetime(year=1992, month=10, day=3, hour=18, minute=34, second=45),
112111
datetime.datetime(year=2010, month=1, day=1, hour=0, minute=0, second=1),
@@ -116,7 +115,7 @@ def test_pandas_selection(self, duckdb_cursor, pandas, integers, timestamps):
116115
)
117116
}
118117
)
119-
pandas.testing.assert_frame_equal(result, df)
118+
pd.testing.assert_frame_equal(result, df)
120119

121120
# def test_numpy_creation(self, duckdb_cursor):
122121
# # numpyarray = {'i': numpy.arange(10), 'v': numpy.random.randint(100, size=(1, 10))} # segfaults

tests/fast/api/test_dbapi08.py

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,19 @@
11
# test fetchdf with various types
2-
import pytest
3-
from conftest import NumpyPandas
2+
import pandas as pd
43

54
import duckdb
65

76

87
class TestType:
9-
@pytest.mark.parametrize("pandas", [NumpyPandas()])
10-
def test_fetchdf(self, pandas):
8+
def test_fetchdf(self):
119
con = duckdb.connect()
1210
con.execute("CREATE TABLE items(item VARCHAR)")
1311
con.execute("INSERT INTO items VALUES ('jeans'), (''), (NULL)")
1412
res = con.execute("SELECT item FROM items").fetchdf()
15-
assert isinstance(res, pandas.core.frame.DataFrame)
13+
assert isinstance(res, pd.core.frame.DataFrame)
1614

17-
df = pandas.DataFrame({"item": ["jeans", "", None]})
15+
df = pd.DataFrame({"item": ["jeans", "", None]})
1816

1917
print(res)
2018
print(df)
21-
pandas.testing.assert_frame_equal(res, df)
19+
pd.testing.assert_frame_equal(res, df, check_dtype=False)

0 commit comments

Comments
 (0)