# Source: ai-content-maker/.venv/Lib/site-packages/pandas/tests/extension/test_arrow.py
# (1710 lines, 65 KiB, Python)

"""
This file contains a minimal set of tests for compliance with the extension
array interface test suite, and should contain no other tests.
The test suite for the full functionality of the array is located in
`pandas/tests/arrays/`.
The tests in this file are inherited from the BaseExtensionTests, and only
minimal tweaks should be applied to get the tests passing (by overwriting a
parent method).
Additional tests should either be added to one of the BaseExtensionTests
classes (if they are relevant for the extension interface for all dtypes), or
be added to the array-specific tests in `pandas/tests/arrays/`.
"""
from datetime import (
date,
datetime,
time,
timedelta,
)
import numpy as np
import pytest
from pandas.compat import (
is_ci_environment,
is_platform_windows,
pa_version_under2p0,
pa_version_under3p0,
pa_version_under4p0,
pa_version_under6p0,
pa_version_under7p0,
pa_version_under8p0,
pa_version_under9p0,
)
from pandas.errors import PerformanceWarning
import pandas as pd
import pandas._testing as tm
from pandas.tests.extension import base
pa = pytest.importorskip("pyarrow", minversion="1.0.1")
from pandas.core.arrays.arrow.array import ArrowExtensionArray
from pandas.core.arrays.arrow.dtype import ArrowDtype # isort:skip
@pytest.fixture(params=tm.ALL_PYARROW_DTYPES, ids=str)
def dtype(request):
    """Return an ``ArrowDtype`` wrapping the parametrized pyarrow type."""
    pyarrow_type = request.param
    return ArrowDtype(pyarrow_dtype=pyarrow_type)
@pytest.fixture
def data(dtype):
    """
    Length-100 array for the parametrized dtype.

    The values are laid out as 4 repeats of a leading pair, one missing
    value, 44 repeats of a middle pair, one more missing value and a
    trailing pair.

    Raises
    ------
    NotImplementedError
        If no sample values are defined for the pyarrow type.
    """
    pa_type = dtype.pyarrow_dtype
    if pa.types.is_boolean(pa_type):
        head = [True, False]
        middle = [True, False]
        tail = [True, False]
    elif pa.types.is_floating(pa_type):
        head = [1.0, 0.0]
        middle = [-2.0, -1.0]
        tail = [0.5, 99.5]
    elif pa.types.is_signed_integer(pa_type):
        head = [1, 0]
        middle = [-2, -1]
        tail = [1, 99]
    elif pa.types.is_unsigned_integer(pa_type):
        head = [1, 0]
        middle = [2, 1]
        tail = [1, 99]
    elif pa.types.is_date(pa_type):
        head = [date(2022, 1, 1), date(1999, 12, 31)]
        middle = [date(2022, 1, 1), date(2022, 1, 1)]
        tail = [date(1999, 12, 31), date(1999, 12, 31)]
    elif pa.types.is_timestamp(pa_type):
        head = [datetime(2020, 1, 1, 1, 1, 1, 1), datetime(1999, 1, 1, 1, 1, 1, 1)]
        middle = [datetime(2020, 1, 1, 1), datetime(1999, 1, 1, 1)]
        tail = [datetime(2020, 1, 1), datetime(1999, 1, 1)]
    elif pa.types.is_duration(pa_type):
        head = [timedelta(1), timedelta(1, 1)]
        middle = [timedelta(-1), timedelta(0)]
        tail = [timedelta(-10), timedelta(10)]
    elif pa.types.is_time(pa_type):
        head = [time(12, 0), time(0, 12)]
        middle = [time(0, 0), time(1, 1)]
        tail = [time(0, 5), time(5, 0)]
    else:
        raise NotImplementedError

    # 8 + 1 + 88 + 1 + 2 == 100 elements in total.
    values = head * 4 + [None] + middle * 44 + [None] + tail
    return pd.array(values, dtype=dtype)
@pytest.fixture
def data_missing(data):
    """Length-2 array holding a missing value followed by ``data[0]``."""
    array_cls = type(data)
    first_valid = data[0]
    return array_cls._from_sequence([None, first_valid])
@pytest.fixture(params=["data", "data_missing"])
def all_data(request, data, data_missing):
    """Parametrized fixture returning the 'data' or 'data_missing' array.

    The array is selected by the fixture parameter name.
    """
    by_name = {"data": data, "data_missing": data_missing}
    # ``.get`` keeps the implicit ``None`` result for an unknown parameter.
    return by_name.get(request.param)
@pytest.fixture
def data_for_grouping(dtype):
    """
    Data for factorization, grouping, and unique tests.

    Laid out as [B, B, NA, NA, A, A, B, C] where NA is missing and the
    scalars are intended to satisfy A < B < C. For booleans B and C are
    both ``True`` because only two distinct values exist.

    Raises
    ------
    NotImplementedError
        If no sample scalars are defined for the pyarrow type.
    """
    pa_type = dtype.pyarrow_dtype
    if pa.types.is_boolean(pa_type):
        low, mid, high = False, True, True
    elif pa.types.is_floating(pa_type):
        low, mid, high = -1.1, 0.0, 1.1
    elif pa.types.is_signed_integer(pa_type):
        low, mid, high = -1, 0, 1
    elif pa.types.is_unsigned_integer(pa_type):
        low, mid, high = 0, 1, 10
    elif pa.types.is_date(pa_type):
        low, mid, high = date(1999, 12, 31), date(2010, 1, 1), date(2022, 1, 1)
    elif pa.types.is_timestamp(pa_type):
        low = datetime(1999, 1, 1, 1, 1, 1, 1)
        mid = datetime(2020, 1, 1)
        high = datetime(2020, 1, 1, 1)
    elif pa.types.is_duration(pa_type):
        low, mid, high = timedelta(-1), timedelta(0), timedelta(1, 4)
    elif pa.types.is_time(pa_type):
        low, mid, high = time(0, 0), time(0, 12), time(12, 12)
    else:
        raise NotImplementedError

    layout = [mid, mid, None, None, low, low, mid, high]
    return pd.array(layout, dtype=dtype)
@pytest.fixture
def data_for_sorting(data_for_grouping):
    """
    Length-3 array with a known sort order.

    Built from positions 0, 7 and 4 of ``data_for_grouping``, i.e. the
    items [B, C, A] with A < B < C.
    """
    grouped = data_for_grouping
    picked = [grouped[0], grouped[7], grouped[4]]
    return type(grouped)._from_sequence(picked)
@pytest.fixture
def data_missing_for_sorting(data_for_grouping):
    """
    Length-3 array with a known sort order and one missing value.

    Built from positions 0, 2 and 4 of ``data_for_grouping``, i.e. the
    items [B, NA, A] with A < B and NA missing.
    """
    grouped = data_for_grouping
    picked = [grouped[0], grouped[2], grouped[4]]
    return type(grouped)._from_sequence(picked)
@pytest.fixture
def data_for_twos(data):
    """Length-100 array of twos for integer/float types, else ``data`` itself."""
    pa_type = data.dtype.pyarrow_dtype
    is_numeric = pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type)
    if not is_numeric:
        # tests will be xfailed where 2 is not a valid scalar for pa_dtype
        return data
    return pd.array([2] * 100, dtype=data.dtype)
@pytest.fixture
def na_value():
    """The scalar missing value used for these arrays: ``pd.NA``."""
    missing = pd.NA
    return missing
class TestBaseCasting(base.BaseCastingTests):
    """Run the inherited casting tests unchanged."""
class TestConstructors(base.BaseConstructorsTests):
    """Constructor tests with xfail markers for known pyarrow gaps."""

    def test_from_dtype(self, data, request):
        """Run the base test; tz-aware timestamps are expected to fail."""
        pa_type = data.dtype.pyarrow_dtype
        if pa.types.is_timestamp(pa_type) and pa_type.tz:
            if pa_version_under2p0:
                mark = pytest.mark.xfail(
                    reason=f"timestamp data with tz={pa_type.tz} "
                    "converted to integer when pyarrow < 2.0",
                )
            else:
                mark = pytest.mark.xfail(
                    raises=NotImplementedError,
                    reason=f"pyarrow.type_for_alias cannot infer {pa_type}",
                )
            request.node.add_marker(mark)
        super().test_from_dtype(data)

    def test_from_sequence_pa_array(self, data, request):
        """Round-trip through ``_from_sequence`` from chunked and combined data."""
        # https://github.com/pandas-dev/pandas/pull/47034#discussion_r955500784
        # data._data = pa.ChunkedArray
        if pa_version_under3p0:
            request.node.add_marker(
                pytest.mark.xfail(
                    reason="ChunkedArray has no attribute combine_chunks",
                )
            )
        array_cls = type(data)

        def check_roundtrip(source):
            rebuilt = array_cls._from_sequence(source)
            tm.assert_extension_array_equal(rebuilt, data)
            assert isinstance(rebuilt._data, pa.ChunkedArray)

        check_roundtrip(data._data)
        check_roundtrip(data._data.combine_chunks())

    def test_from_sequence_pa_array_notimplemented(self, request):
        """Parsing strings into month_day_nano_interval must raise."""
        if pa_version_under6p0:
            request.node.add_marker(
                pytest.mark.xfail(
                    raises=AttributeError,
                    reason="month_day_nano_interval not implemented by pyarrow.",
                )
            )
        with pytest.raises(NotImplementedError, match="Converting strings to"):
            ArrowExtensionArray._from_sequence_of_strings(
                ["12-1"], dtype=pa.month_day_nano_interval()
            )

    def test_from_sequence_of_strings_pa_array(self, data, request):
        """Round-trip through a string cast and ``_from_sequence_of_strings``."""
        pa_type = data.dtype.pyarrow_dtype
        mark = None
        if pa_version_under3p0:
            mark = pytest.mark.xfail(
                reason="ChunkedArray has no attribute combine_chunks",
            )
        elif pa.types.is_time64(pa_type) and pa_type.equals("time64[ns]"):
            mark = pytest.mark.xfail(
                reason="Nanosecond time parsing not supported.",
            )
        elif pa.types.is_duration(pa_type):
            mark = pytest.mark.xfail(
                raises=pa.ArrowNotImplementedError,
                reason=f"pyarrow doesn't support parsing {pa_type}",
            )
        elif pa.types.is_boolean(pa_type):
            mark = pytest.mark.xfail(
                reason="Iterating over ChunkedArray[bool] returns PyArrow scalars.",
            )
        elif pa.types.is_timestamp(pa_type) and pa_type.tz is not None:
            # tz-aware timestamps never reach the generic temporal branch below.
            if pa_version_under7p0:
                mark = pytest.mark.xfail(
                    raises=pa.ArrowNotImplementedError,
                    reason=f"pyarrow doesn't support string cast from {pa_type}",
                )
            elif is_platform_windows() and is_ci_environment():
                mark = pytest.mark.xfail(
                    raises=pa.ArrowInvalid,
                    reason=(
                        "TODO: Set ARROW_TIMEZONE_DATABASE environment variable "
                        "on CI to path to the tzdata for pyarrow."
                    ),
                )
        elif pa_version_under6p0 and pa.types.is_temporal(pa_type):
            mark = pytest.mark.xfail(
                raises=pa.ArrowNotImplementedError,
                reason=f"pyarrow doesn't support string cast from {pa_type}",
            )
        if mark is not None:
            request.node.add_marker(mark)

        array_cls = type(data)

        def check_roundtrip(strings):
            rebuilt = array_cls._from_sequence_of_strings(strings, dtype=data.dtype)
            tm.assert_extension_array_equal(rebuilt, data)

        as_strings = data._data.cast(pa.string())
        check_roundtrip(as_strings)
        check_roundtrip(as_strings.combine_chunks())
class TestGetitemTests(base.BaseGetitemTests):
    """Getitem tests with xfail markers for old-pyarrow timestamp gaps."""

    @staticmethod
    def _xfail_tz_unit_under3p0(request, data, applies=True):
        """
        Mark the test as xfail for unsupported tz-aware timestamps.

        The marker is added when pyarrow < 3.0, ``applies`` is true, the
        dtype has a ``tz`` other than None/"UTC" and its ``unit`` is "ns"
        (or also "s", "ms", "us" when pyarrow < 2.0).

        Parameters
        ----------
        request : pytest fixture request for the running test.
        data : array whose ``dtype.pyarrow_dtype`` is inspected.
        applies : extra precondition supplied by the calling test.
        """
        pa_dtype = data.dtype.pyarrow_dtype
        # Non-timestamp types have neither attribute, hence the defaults.
        tz = getattr(pa_dtype, "tz", None)
        unit = getattr(pa_dtype, "unit", None)
        bad_units = ["ns"]
        if pa_version_under2p0:
            bad_units.extend(["s", "ms", "us"])
        if (
            pa_version_under3p0
            and applies
            and tz not in (None, "UTC")
            and unit in bad_units
        ):
            request.node.add_marker(
                pytest.mark.xfail(
                    reason=(
                        "Not supported by pyarrow < 3.0 "
                        f"with timestamp type {tz} and {unit}"
                    )
                )
            )

    @pytest.mark.xfail(
        reason=(
            "data.dtype.type return pyarrow.DataType "
            "but this (intentionally) returns "
            "Python scalars or pd.NA"
        )
    )
    def test_getitem_scalar(self, data):
        super().test_getitem_scalar(data)

    def test_take_series(self, request, data):
        self._xfail_tz_unit_under3p0(request, data)
        super().test_take_series(data)

    def test_reindex(self, request, data, na_value):
        self._xfail_tz_unit_under3p0(request, data)
        super().test_reindex(data, na_value)

    def test_loc_iloc_frame_single_dtype(self, request, using_array_manager, data):
        # The xfail only applies when the array manager is not in use.
        self._xfail_tz_unit_under3p0(
            request, data, applies=not using_array_manager
        )
        super().test_loc_iloc_frame_single_dtype(data)
class TestBaseNumericReduce(base.BaseNumericReduceTests):
    """Numeric reduction tests with xfails for unsupported type/op pairs."""

    def check_reduce(self, ser, op_name, skipna):
        """Compare the reduction against the same reduction on ``Float64``."""
        pa_type = ser.dtype.pyarrow_dtype
        # The pyarrow-backed reduction runs first, before any skip decision.
        result = getattr(ser, op_name)(skipna=skipna)
        if pa.types.is_boolean(pa_type):
            # Can't convert if ser contains NA
            pytest.skip(
                "pandas boolean data with NA does not fully support all reductions"
            )
        is_numeric = pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type)
        reference = ser.astype("Float64") if is_numeric else ser
        expected = getattr(reference, op_name)(skipna=skipna)
        tm.assert_almost_equal(result, expected)

    @pytest.mark.parametrize("skipna", [True, False])
    def test_reduce_series(self, data, all_numeric_reductions, skipna, request):
        """Add the xfail marker matching the op/type/version, then run the base test."""
        op = all_numeric_reductions
        pa_type = data.dtype.pyarrow_dtype
        type_error_mark = pytest.mark.xfail(
            raises=TypeError,
            reason=(
                f"{op} is not implemented in "
                f"pyarrow={pa.__version__} for {pa_type}"
            ),
        )
        is_numeric = pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type)
        is_bool = pa.types.is_boolean(pa_type)
        is_sum_or_mean = op in {"sum", "mean"}

        # Branch order is significant and mirrors the original chain.
        mark = None
        if op in {"skew", "kurt"}:
            mark = type_error_mark
        elif (
            op in {"median", "var", "std", "prod", "max", "min"}
            and pa_version_under6p0
        ):
            mark = type_error_mark
        elif is_sum_or_mean and pa_version_under2p0:
            mark = type_error_mark
        elif (
            is_sum_or_mean
            and skipna is False
            and pa_version_under6p0
            and is_numeric
        ):
            mark = pytest.mark.xfail(
                raises=AssertionError,
                reason=(
                    f"{op} with skip_nulls={skipna} did not "
                    f"return NA for {pa_type} with pyarrow={pa.__version__}"
                ),
            )
        elif not (is_numeric or is_bool) and not (
            op in {"min", "max"}
            and (pa.types.is_temporal(pa_type) and not pa.types.is_duration(pa_type))
        ):
            mark = type_error_mark
        elif is_bool and op in {"std", "var", "median"}:
            mark = type_error_mark

        if mark is not None:
            request.node.add_marker(mark)
        super().test_reduce_series(data, all_numeric_reductions, skipna)
class TestBaseBooleanReduce(base.BaseBooleanReduceTests):
    """Boolean reduction tests; only boolean dtypes are expected to pass."""

    @pytest.mark.parametrize("skipna", [True, False])
    def test_reduce_series(
        self, data, all_boolean_reductions, skipna, na_value, request
    ):
        """Check any/all on the data; xfail for non-boolean types or pyarrow < 3.0."""
        reduction = all_boolean_reductions
        pa_type = data.dtype.pyarrow_dtype
        not_implemented = pytest.mark.xfail(
            raises=TypeError,
            reason=(
                f"{reduction} is not implemented in "
                f"pyarrow={pa.__version__} for {pa_type}"
            ),
        )
        if not pa.types.is_boolean(pa_type) or pa_version_under3p0:
            request.node.add_marker(not_implemented)
        series = pd.Series(data)
        result = getattr(series, reduction)(skipna=skipna)
        # "any" must give True and every other reduction False.
        assert result is (reduction == "any")
class TestBaseGroupby(base.BaseGroupbyTests):
    """Groupby tests with xfails for boolean, duration and old-pyarrow gaps."""

    def test_groupby_agg_extension(self, data_for_grouping, request):
        tz = getattr(data_for_grouping.dtype.pyarrow_dtype, "tz", None)
        unsupported_tz = tz not in (None, "UTC")
        if pa_version_under2p0 and unsupported_tz:
            request.node.add_marker(
                pytest.mark.xfail(
                    reason=f"Not supported by pyarrow < 2.0 with timestamp type {tz}."
                )
            )
        super().test_groupby_agg_extension(data_for_grouping)

    def test_groupby_extension_no_sort(self, data_for_grouping, request):
        pa_type = data_for_grouping.dtype.pyarrow_dtype
        mark = None
        if pa.types.is_boolean(pa_type):
            mark = pytest.mark.xfail(
                reason=f"{pa_type} only has 2 unique possible values",
            )
        elif pa.types.is_duration(pa_type):
            mark = pytest.mark.xfail(
                raises=pa.ArrowNotImplementedError,
                reason=f"pyarrow doesn't support factorizing {pa_type}",
            )
        if mark is not None:
            request.node.add_marker(mark)
        super().test_groupby_extension_no_sort(data_for_grouping)

    def test_groupby_extension_transform(self, data_for_grouping, request):
        pa_type = data_for_grouping.dtype.pyarrow_dtype
        mark = None
        if pa.types.is_boolean(pa_type):
            mark = pytest.mark.xfail(
                reason=f"{pa_type} only has 2 unique possible values",
            )
        elif pa.types.is_duration(pa_type):
            mark = pytest.mark.xfail(
                raises=pa.ArrowNotImplementedError,
                reason=f"pyarrow doesn't support factorizing {pa_type}",
            )
        if mark is not None:
            request.node.add_marker(mark)
        super().test_groupby_extension_transform(data_for_grouping)

    def test_groupby_extension_apply(
        self, data_for_grouping, groupby_apply_op, request
    ):
        pa_type = data_for_grouping.dtype.pyarrow_dtype
        if pa.types.is_duration(pa_type):
            factorize_mark = pytest.mark.xfail(
                raises=pa.ArrowNotImplementedError,
                reason=f"pyarrow doesn't support factorizing {pa_type}",
            )
            request.node.add_marker(factorize_mark)
        # A PerformanceWarning may be emitted when pyarrow < 7.0.
        warning_ctx = tm.maybe_produces_warning(
            PerformanceWarning, pa_version_under7p0, check_stacklevel=False
        )
        with warning_ctx:
            super().test_groupby_extension_apply(data_for_grouping, groupby_apply_op)

    def test_in_numeric_groupby(self, data_for_grouping, request):
        pa_type = data_for_grouping.dtype.pyarrow_dtype
        is_numeric = pa.types.is_integer(pa_type) or pa.types.is_floating(pa_type)
        if is_numeric:
            request.node.add_marker(
                pytest.mark.xfail(
                    reason="ArrowExtensionArray doesn't support .sum() yet.",
                )
            )
        super().test_in_numeric_groupby(data_for_grouping)

    @pytest.mark.parametrize("as_index", [True, False])
    def test_groupby_extension_agg(self, as_index, data_for_grouping, request):
        pa_type = data_for_grouping.dtype.pyarrow_dtype
        mark = None
        if pa.types.is_boolean(pa_type):
            mark = pytest.mark.xfail(
                raises=ValueError,
                reason=f"{pa_type} only has 2 unique possible values",
            )
        elif pa.types.is_duration(pa_type):
            mark = pytest.mark.xfail(
                raises=pa.ArrowNotImplementedError,
                reason=f"pyarrow doesn't support factorizing {pa_type}",
            )
        if mark is not None:
            request.node.add_marker(mark)
        # A PerformanceWarning may be emitted when pyarrow < 7.0.
        warning_ctx = tm.maybe_produces_warning(
            PerformanceWarning, pa_version_under7p0, check_stacklevel=False
        )
        with warning_ctx:
            super().test_groupby_extension_agg(as_index, data_for_grouping)
class TestBaseDtype(base.BaseDtypeTests):
    """Dtype tests with xfails for types that cannot round-trip by name."""

    @staticmethod
    def _mark_tz_timestamp_xfail(dtype, request):
        """Add an xfail marker when ``dtype`` is a tz-aware timestamp."""
        pa_type = dtype.pyarrow_dtype
        if pa.types.is_timestamp(pa_type) and pa_type.tz is not None:
            request.node.add_marker(
                pytest.mark.xfail(
                    raises=NotImplementedError,
                    reason=f"pyarrow.type_for_alias cannot infer {pa_type}",
                )
            )

    def test_construct_from_string_own_name(self, dtype, request):
        self._mark_tz_timestamp_xfail(dtype, request)
        super().test_construct_from_string_own_name(dtype)

    def test_is_dtype_from_name(self, dtype, request):
        self._mark_tz_timestamp_xfail(dtype, request)
        super().test_is_dtype_from_name(dtype)

    def test_construct_from_string(self, dtype, request):
        self._mark_tz_timestamp_xfail(dtype, request)
        super().test_construct_from_string(dtype)

    def test_construct_from_string_another_type_raises(self, dtype):
        """A name without the ``[pyarrow]`` suffix must raise ``TypeError``."""
        expected_message = r"'another_type' must end with '\[pyarrow\]'"
        dtype_cls = type(dtype)
        with pytest.raises(TypeError, match=expected_message):
            dtype_cls.construct_from_string("another_type")

    def test_get_common_dtype(self, dtype, request):
        pa_type = dtype.pyarrow_dtype
        odd_timestamp = pa.types.is_timestamp(pa_type) and (
            pa_type.unit != "ns" or pa_type.tz is not None
        )
        odd_duration = pa.types.is_duration(pa_type) and pa_type.unit != "ns"
        if (
            pa.types.is_date(pa_type)
            or pa.types.is_time(pa_type)
            or odd_timestamp
            or odd_duration
        ):
            request.node.add_marker(
                pytest.mark.xfail(
                    reason=(
                        f"{pa_type} does not have associated numpy "
                        f"dtype findable by find_common_type"
                    )
                )
            )
        super().test_get_common_dtype(dtype)
class TestBaseIndex(base.BaseIndexTests):
    """Run the inherited index tests unchanged."""
class TestBaseInterface(base.BaseInterfaceTests):
    """Interface tests with xfails for old-pyarrow timestamps and views."""

    def test_contains(self, data, data_missing, request):
        pa_type = data.dtype.pyarrow_dtype
        tz = getattr(pa_type, "tz", None)
        unit = getattr(pa_type, "unit", None)
        unsupported = tz not in (None, "UTC") and unit == "us"
        if pa_version_under2p0 and unsupported:
            request.node.add_marker(
                pytest.mark.xfail(
                    reason=(
                        f"Not supported by pyarrow < 2.0 "
                        f"with timestamp type {tz} and {unit}"
                    )
                )
            )
        super().test_contains(data, data_missing)

    @pytest.mark.xfail(reason="pyarrow.ChunkedArray does not support views.")
    def test_view(self, data):
        """Expected to fail: the backing ChunkedArray has no view support."""
        super().test_view(data)
class TestBaseMissing(base.BaseMissingTests):
    """Missing-value tests; one override silences a fallback warning."""

    @pytest.mark.filterwarnings("ignore:Falling back:pandas.errors.PerformanceWarning")
    def test_dropna_array(self, data_missing):
        """Run the base test while ignoring the 'Falling back' PerformanceWarning."""
        super().test_dropna_array(data_missing)
class TestBasePrinting(base.BasePrintingTests):
    """Run the inherited printing tests unchanged."""
class TestBaseReshaping(base.BaseReshapingTests):
    """Reshaping tests with xfails for tz-aware timestamps on pyarrow < 2.0."""

    @staticmethod
    def _xfail_tz_under2p0(request, data):
        """
        Mark the test as xfail for unsupported tz-aware timestamps.

        The marker is added when pyarrow < 2.0 and the dtype has a ``tz``
        other than None/"UTC".

        Parameters
        ----------
        request : pytest fixture request for the running test.
        data : array whose ``dtype.pyarrow_dtype`` is inspected.
        """
        # Non-timestamp types have no ``tz`` attribute, hence the default.
        tz = getattr(data.dtype.pyarrow_dtype, "tz", None)
        if pa_version_under2p0 and tz not in (None, "UTC"):
            request.node.add_marker(
                pytest.mark.xfail(
                    reason=f"Not supported by pyarrow < 2.0 with timestamp type {tz}"
                )
            )

    def test_concat_columns(self, data, na_value, request):
        self._xfail_tz_under2p0(request, data)
        super().test_concat_columns(data, na_value)

    def test_concat_extension_arrays_copy_false(self, data, na_value, request):
        self._xfail_tz_under2p0(request, data)
        super().test_concat_extension_arrays_copy_false(data, na_value)

    def test_align(self, data, na_value, request):
        self._xfail_tz_under2p0(request, data)
        super().test_align(data, na_value)

    def test_align_frame(self, data, na_value, request):
        self._xfail_tz_under2p0(request, data)
        super().test_align_frame(data, na_value)

    def test_align_series_frame(self, data, na_value, request):
        self._xfail_tz_under2p0(request, data)
        super().test_align_series_frame(data, na_value)

    def test_merge(self, data, na_value, request):
        self._xfail_tz_under2p0(request, data)
        super().test_merge(data, na_value)

    def test_ravel(self, data, request):
        self._xfail_tz_under2p0(request, data)
        super().test_ravel(data)

    @pytest.mark.xfail(reason="GH 45419: pyarrow.ChunkedArray does not support views")
    def test_transpose(self, data):
        super().test_transpose(data)

    def test_transpose_frame(self, data, request):
        self._xfail_tz_under2p0(request, data)
        super().test_transpose_frame(data)
class TestBaseSetitem(base.BaseSetitemTests):
    """Setitem tests with xfails for tz-aware timestamps on pyarrow < 2.0."""

    @staticmethod
    def _xfail_tz_under2p0(request, data, applies=True):
        """
        Mark the test as xfail for unsupported tz-aware timestamps.

        The marker is added when pyarrow < 2.0, ``applies`` is true and the
        dtype has a ``tz`` other than None/"UTC".

        Parameters
        ----------
        request : pytest fixture request for the running test.
        data : array whose ``dtype.pyarrow_dtype`` is inspected.
        applies : extra precondition supplied by the calling test.
        """
        # Non-timestamp types have no ``tz`` attribute, hence the default.
        tz = getattr(data.dtype.pyarrow_dtype, "tz", None)
        if pa_version_under2p0 and applies and tz not in (None, "UTC"):
            request.node.add_marker(
                pytest.mark.xfail(
                    reason=f"Not supported by pyarrow < 2.0 with timestamp type {tz}"
                )
            )

    def test_setitem_scalar_series(self, data, box_in_series, request):
        self._xfail_tz_under2p0(request, data)
        super().test_setitem_scalar_series(data, box_in_series)

    def test_setitem_sequence(self, data, box_in_series, request):
        self._xfail_tz_under2p0(request, data)
        super().test_setitem_sequence(data, box_in_series)

    def test_setitem_sequence_broadcasts(self, data, box_in_series, request):
        self._xfail_tz_under2p0(request, data)
        super().test_setitem_sequence_broadcasts(data, box_in_series)

    @pytest.mark.parametrize("setter", ["loc", "iloc"])
    def test_setitem_scalar(self, data, setter, request):
        self._xfail_tz_under2p0(request, data)
        super().test_setitem_scalar(data, setter)

    def test_setitem_loc_scalar_mixed(self, data, request):
        self._xfail_tz_under2p0(request, data)
        super().test_setitem_loc_scalar_mixed(data)

    def test_setitem_loc_scalar_single(self, data, request):
        self._xfail_tz_under2p0(request, data)
        super().test_setitem_loc_scalar_single(data)

    def test_setitem_loc_scalar_multiple_homogoneous(self, data, request):
        self._xfail_tz_under2p0(request, data)
        super().test_setitem_loc_scalar_multiple_homogoneous(data)

    def test_setitem_iloc_scalar_mixed(self, data, request):
        self._xfail_tz_under2p0(request, data)
        super().test_setitem_iloc_scalar_mixed(data)

    def test_setitem_iloc_scalar_single(self, data, request):
        self._xfail_tz_under2p0(request, data)
        super().test_setitem_iloc_scalar_single(data)

    def test_setitem_iloc_scalar_multiple_homogoneous(self, data, request):
        self._xfail_tz_under2p0(request, data)
        super().test_setitem_iloc_scalar_multiple_homogoneous(data)

    @pytest.mark.parametrize(
        "mask",
        [
            np.array([True, True, True, False, False]),
            pd.array([True, True, True, False, False], dtype="boolean"),
            pd.array([True, True, True, pd.NA, pd.NA], dtype="boolean"),
        ],
        ids=["numpy-array", "boolean-array", "boolean-array-na"],
    )
    def test_setitem_mask(self, data, mask, box_in_series, request):
        self._xfail_tz_under2p0(request, data)
        super().test_setitem_mask(data, mask, box_in_series)

    def test_setitem_mask_boolean_array_with_na(self, data, box_in_series, request):
        # Only the microsecond unit is expected to fail here.
        unit = getattr(data.dtype.pyarrow_dtype, "unit", None)
        self._xfail_tz_under2p0(request, data, applies=unit == "us")
        super().test_setitem_mask_boolean_array_with_na(data, box_in_series)

    @pytest.mark.parametrize(
        "idx",
        [[0, 1, 2], pd.array([0, 1, 2], dtype="Int64"), np.array([0, 1, 2])],
        ids=["list", "integer-array", "numpy-array"],
    )
    def test_setitem_integer_array(self, data, idx, box_in_series, request):
        self._xfail_tz_under2p0(request, data)
        super().test_setitem_integer_array(data, idx, box_in_series)

    @pytest.mark.parametrize("as_callable", [True, False])
    @pytest.mark.parametrize("setter", ["loc", None])
    def test_setitem_mask_aligned(self, data, as_callable, setter, request):
        self._xfail_tz_under2p0(request, data)
        super().test_setitem_mask_aligned(data, as_callable, setter)

    @pytest.mark.parametrize("setter", ["loc", None])
    def test_setitem_mask_broadcast(self, data, setter, request):
        self._xfail_tz_under2p0(request, data)
        super().test_setitem_mask_broadcast(data, setter)

    def test_setitem_tuple_index(self, data, request):
        self._xfail_tz_under2p0(request, data)
        super().test_setitem_tuple_index(data)

    def test_setitem_slice(self, data, box_in_series, request):
        self._xfail_tz_under2p0(request, data)
        super().test_setitem_slice(data, box_in_series)

    def test_setitem_loc_iloc_slice(self, data, request):
        self._xfail_tz_under2p0(request, data)
        super().test_setitem_loc_iloc_slice(data)

    def test_setitem_slice_array(self, data, request):
        self._xfail_tz_under2p0(request, data)
        super().test_setitem_slice_array(data)

    def test_setitem_with_expansion_dataframe_column(self, data, full_indexer, request):
        # Is there a better way to get the full_indexer id "null_slice"?
        is_null_slice = "null_slice" in request.node.nodeid
        self._xfail_tz_under2p0(request, data, applies=not is_null_slice)
        super().test_setitem_with_expansion_dataframe_column(data, full_indexer)

    def test_setitem_with_expansion_row(self, data, na_value, request):
        self._xfail_tz_under2p0(request, data)
        super().test_setitem_with_expansion_row(data, na_value)

    def test_setitem_frame_2d_values(self, data, request):
        self._xfail_tz_under2p0(request, data)
        super().test_setitem_frame_2d_values(data)

    @pytest.mark.xfail(reason="GH 45419: pyarrow.ChunkedArray does not support views")
    def test_setitem_preserves_views(self, data):
        super().test_setitem_preserves_views(data)
class TestBaseParsing(base.BaseParsingTests):
    """Parsing tests with xfails for boolean and tz-aware timestamp types."""

    @pytest.mark.parametrize("engine", ["c", "python"])
    def test_EA_types(self, engine, data, request):
        pa_type = data.dtype.pyarrow_dtype
        mark = None
        if pa.types.is_boolean(pa_type):
            mark = pytest.mark.xfail(raises=TypeError, reason="GH 47534")
        elif pa.types.is_timestamp(pa_type) and pa_type.tz is not None:
            mark = pytest.mark.xfail(
                raises=NotImplementedError,
                reason=f"Parameterized types with tz={pa_type.tz} not supported.",
            )
        if mark is not None:
            request.node.add_marker(mark)
        super().test_EA_types(engine, data)
class TestBaseUnaryOps(base.BaseUnaryOpsTests):
    """Unary-op tests; invert is only expected to work for boolean types."""

    @pytest.mark.xfail(
        pa_version_under2p0,
        raises=NotImplementedError,
        reason="pyarrow.compute.invert not supported in pyarrow<2.0",
    )
    def test_invert(self, data, request):
        """Run the base invert test, xfailing for every non-boolean dtype."""
        pa_dtype = data.dtype.pyarrow_dtype
        if not pa.types.is_boolean(pa_dtype):
            request.node.add_marker(
                pytest.mark.xfail(
                    raises=pa.ArrowNotImplementedError,
                    # The test is expected to fail because the dtype is NOT supported.
                    reason=f"pyarrow.compute.invert does not support {pa_dtype}",
                )
            )
        super().test_invert(data)
class TestBaseMethods(base.BaseMethodsTests):
@pytest.mark.parametrize("periods", [1, -2])
def test_diff(self, data, periods, request):
    """Run the base diff test; unsigned integers with periods=1 are xfailed."""
    pa_type = data.dtype.pyarrow_dtype
    overflows = pa.types.is_unsigned_integer(pa_type) and periods == 1
    if overflows:
        overflow_mark = pytest.mark.xfail(
            raises=pa.ArrowInvalid,
            reason=(
                f"diff with {pa_type} and periods={periods} will overflow"
            ),
        )
        request.node.add_marker(overflow_mark)
    super().test_diff(data, periods)
@pytest.mark.parametrize("dropna", [True, False])
def test_value_counts(self, all_data, dropna, request):
    """Run the base value_counts test; duration types are xfailed."""
    pa_type = all_data.dtype.pyarrow_dtype
    if pa.types.is_duration(pa_type):
        no_kernel = pytest.mark.xfail(
            raises=pa.ArrowNotImplementedError,
            reason=f"value_count has no kernel for {pa_type}",
        )
        request.node.add_marker(no_kernel)
    super().test_value_counts(all_data, dropna)
def test_value_counts_with_normalize(self, data, request):
    """Run the base normalized value_counts test; duration types are xfailed."""
    pa_type = data.dtype.pyarrow_dtype
    if pa.types.is_duration(pa_type):
        no_kernel = pytest.mark.xfail(
            raises=pa.ArrowNotImplementedError,
            reason=f"value_count has no pyarrow kernel for {pa_type}",
        )
        request.node.add_marker(no_kernel)
    super().test_value_counts_with_normalize(data)
@pytest.mark.xfail(
    pa_version_under6p0,
    raises=NotImplementedError,
    reason="argmin/max only implemented for pyarrow version >= 6.0",
)
def test_argmin_argmax(
    self, data_for_sorting, data_missing_for_sorting, na_value, request
):
    """Run the base argmin/argmax test; boolean and duration types are xfailed."""
    pa_type = data_for_sorting.dtype.pyarrow_dtype
    mark = None
    if pa.types.is_boolean(pa_type):
        mark = pytest.mark.xfail(
            reason=f"{pa_type} only has 2 unique possible values",
        )
    elif pa.types.is_duration(pa_type):
        mark = pytest.mark.xfail(
            raises=pa.ArrowNotImplementedError,
            reason=f"min_max not supported in pyarrow for {pa_type}",
        )
    if mark is not None:
        request.node.add_marker(mark)
    super().test_argmin_argmax(data_for_sorting, data_missing_for_sorting, na_value)
@pytest.mark.parametrize(
"op_name, skipna, expected",
[
("idxmax", True, 0),
("idxmin", True, 2),
("argmax", True, 0),
("argmin", True, 2),
("idxmax", False, np.nan),
("idxmin", False, np.nan),
("argmax", False, -1),
("argmin", False, -1),
],
)
def test_argreduce_series(
self, data_missing_for_sorting, op_name, skipna, expected, request
):
pa_dtype = data_missing_for_sorting.dtype.pyarrow_dtype
if pa_version_under6p0 and skipna:
request.node.add_marker(
pytest.mark.xfail(
raises=NotImplementedError,
reason="min_max not supported in pyarrow",
)
)
elif not pa_version_under6p0 and pa.types.is_duration(pa_dtype) and skipna:
request.node.add_marker(
pytest.mark.xfail(
raises=pa.ArrowNotImplementedError,
reason=f"min_max not supported in pyarrow for {pa_dtype}",
)
)
super().test_argreduce_series(
data_missing_for_sorting, op_name, skipna, expected
)
@pytest.mark.parametrize("ascending", [True, False])
def test_sort_values(self, data_for_sorting, ascending, sort_by_key, request):
pa_dtype = data_for_sorting.dtype.pyarrow_dtype
if pa.types.is_duration(pa_dtype) and not ascending and not pa_version_under2p0:
request.node.add_marker(
pytest.mark.xfail(
raises=pa.ArrowNotImplementedError,
reason=(
f"unique has no pyarrow kernel "
f"for {pa_dtype} when ascending={ascending}"
),
)
)
super().test_sort_values(data_for_sorting, ascending, sort_by_key)
@pytest.mark.parametrize("ascending", [True, False])
def test_sort_values_frame(self, data_for_sorting, ascending, request):
pa_dtype = data_for_sorting.dtype.pyarrow_dtype
if pa.types.is_duration(pa_dtype):
request.node.add_marker(
pytest.mark.xfail(
raises=pa.ArrowNotImplementedError,
reason=(
f"dictionary_encode has no pyarrow kernel "
f"for {pa_dtype} when ascending={ascending}"
),
)
)
super().test_sort_values_frame(data_for_sorting, ascending)
@pytest.mark.parametrize("box", [pd.Series, lambda x: x])
@pytest.mark.parametrize("method", [lambda x: x.unique(), pd.unique])
def test_unique(self, data, box, method, request):
pa_dtype = data.dtype.pyarrow_dtype
if pa.types.is_duration(pa_dtype) and not pa_version_under2p0:
request.node.add_marker(
pytest.mark.xfail(
raises=pa.ArrowNotImplementedError,
reason=f"unique has no pyarrow kernel for {pa_dtype}.",
)
)
super().test_unique(data, box, method)
@pytest.mark.parametrize("na_sentinel", [-1, -2])
def test_factorize(self, data_for_grouping, na_sentinel, request):
pa_dtype = data_for_grouping.dtype.pyarrow_dtype
if pa.types.is_duration(pa_dtype):
request.node.add_marker(
pytest.mark.xfail(
raises=pa.ArrowNotImplementedError,
reason=f"dictionary_encode has no pyarrow kernel for {pa_dtype}",
)
)
elif pa.types.is_boolean(pa_dtype):
request.node.add_marker(
pytest.mark.xfail(
reason=f"{pa_dtype} only has 2 unique possible values",
)
)
super().test_factorize(data_for_grouping, na_sentinel)
@pytest.mark.parametrize("na_sentinel", [-1, -2])
def test_factorize_equivalence(self, data_for_grouping, na_sentinel, request):
pa_dtype = data_for_grouping.dtype.pyarrow_dtype
if pa.types.is_duration(pa_dtype):
request.node.add_marker(
pytest.mark.xfail(
raises=pa.ArrowNotImplementedError,
reason=f"dictionary_encode has no pyarrow kernel for {pa_dtype}",
)
)
super().test_factorize_equivalence(data_for_grouping, na_sentinel)
def test_factorize_empty(self, data, request):
pa_dtype = data.dtype.pyarrow_dtype
if pa.types.is_duration(pa_dtype):
request.node.add_marker(
pytest.mark.xfail(
raises=pa.ArrowNotImplementedError,
reason=f"dictionary_encode has no pyarrow kernel for {pa_dtype}",
)
)
super().test_factorize_empty(data)
def test_shift_fill_value(self, data, request):
pa_dtype = data.dtype.pyarrow_dtype
tz = getattr(pa_dtype, "tz", None)
if pa_version_under2p0 and tz not in (None, "UTC"):
request.node.add_marker(
pytest.mark.xfail(
reason=f"Not supported by pyarrow < 2.0 with timestamp type {tz}"
)
)
super().test_shift_fill_value(data)
@pytest.mark.parametrize("repeats", [0, 1, 2, [1, 2, 3]])
def test_repeat(self, data, repeats, as_series, use_numpy, request):
pa_dtype = data.dtype.pyarrow_dtype
tz = getattr(pa_dtype, "tz", None)
if pa_version_under2p0 and tz not in (None, "UTC") and repeats != 0:
request.node.add_marker(
pytest.mark.xfail(
reason=(
f"Not supported by pyarrow < 2.0 with "
f"timestamp type {tz} when repeats={repeats}"
)
)
)
super().test_repeat(data, repeats, as_series, use_numpy)
def test_insert(self, data, request):
pa_dtype = data.dtype.pyarrow_dtype
tz = getattr(pa_dtype, "tz", None)
if pa_version_under2p0 and tz not in (None, "UTC"):
request.node.add_marker(
pytest.mark.xfail(
reason=f"Not supported by pyarrow < 2.0 with timestamp type {tz}"
)
)
super().test_insert(data)
def test_combine_first(self, data, request):
pa_dtype = data.dtype.pyarrow_dtype
tz = getattr(pa_dtype, "tz", None)
if pa_version_under2p0 and tz not in (None, "UTC"):
request.node.add_marker(
pytest.mark.xfail(
reason=f"Not supported by pyarrow < 2.0 with timestamp type {tz}"
)
)
super().test_combine_first(data)
@pytest.mark.xfail(
reason="result dtype pyarrow[bool] better than expected dtype object"
)
def test_combine_le(self, data_repeated):
super().test_combine_le(data_repeated)
def test_combine_add(self, data_repeated, request):
pa_dtype = next(data_repeated(1)).dtype.pyarrow_dtype
if pa.types.is_temporal(pa_dtype):
request.node.add_marker(
pytest.mark.xfail(
raises=TypeError,
reason=f"{pa_dtype} cannot be added to {pa_dtype}",
)
)
super().test_combine_add(data_repeated)
def test_searchsorted(self, data_for_sorting, as_series, request):
pa_dtype = data_for_sorting.dtype.pyarrow_dtype
if pa.types.is_boolean(pa_dtype):
request.node.add_marker(
pytest.mark.xfail(
reason=f"{pa_dtype} only has 2 unique possible values",
)
)
super().test_searchsorted(data_for_sorting, as_series)
def test_where_series(self, data, na_value, as_frame, request):
pa_dtype = data.dtype.pyarrow_dtype
if pa.types.is_temporal(pa_dtype):
request.node.add_marker(
pytest.mark.xfail(
raises=pa.ArrowNotImplementedError,
reason=f"Unsupported cast from double to {pa_dtype}",
)
)
super().test_where_series(data, na_value, as_frame)
class TestBaseArithmeticOps(base.BaseArithmeticOpsTests):
    """Run the shared arithmetic-operator tests against Arrow-backed data.

    The three ``test_arith_*`` overrides share the same decision logic for the
    exception the base test should expect and for the ``xfail`` marker to attach,
    so that logic lives in private helpers below.
    """

    divmod_exc = NotImplementedError

    @staticmethod
    def _is_arrow_numeric(pa_dtype):
        """Return True when ``pa_dtype`` is a pyarrow floating or integer type."""
        return pa.types.is_floating(pa_dtype) or pa.types.is_integer(pa_dtype)

    @staticmethod
    def _is_temporal_supported(opname, pa_dtype):
        """Return True for the temporal operator/type pairs treated as supported.

        Requires pyarrow >= 8.0 and either add/radd on a duration type or
        sub/rsub on any temporal type.
        """
        return not pa_version_under8p0 and (
            (opname in ("__add__", "__radd__") and pa.types.is_duration(pa_dtype))
            or (opname in ("__sub__", "__rsub__") and pa.types.is_temporal(pa_dtype))
        )

    def _get_expected_exception(self, opname, pa_dtype):
        """Return the exception class the base test should expect, or None."""
        if opname in {"__mod__", "__rmod__"} or pa_version_under2p0:
            return NotImplementedError
        if self._is_temporal_supported(opname, pa_dtype):
            return None
        if not self._is_arrow_numeric(pa_dtype):
            return pa.ArrowNotImplementedError
        return None

    def _get_arith_xfail_marker(self, opname, pa_dtype, *, array_operand=False):
        """Return the ``xfail`` marker for this operator/type pair, or None.

        Parameters
        ----------
        opname : str
            Dunder operator name, e.g. ``"__add__"``.
        pa_dtype : pyarrow.DataType
            Arrow type of the data under test.
        array_operand : bool, default False
            When True, also consider the unsigned-integer subtraction overflow
            case that only the array-operand test checks.
        """
        numeric_on_new_pyarrow = (
            self._is_arrow_numeric(pa_dtype) and not pa_version_under2p0
        )
        if opname == "__rpow__" and numeric_on_new_pyarrow:
            return pytest.mark.xfail(
                reason=(
                    f"GH 29997: 1**pandas.NA == 1 while 1**pyarrow.NA == NULL "
                    f"for {pa_dtype}"
                )
            )
        if (
            array_operand
            and opname in ("__sub__", "__rsub__")
            and pa.types.is_unsigned_integer(pa_dtype)
            and not pa_version_under2p0
        ):
            return pytest.mark.xfail(
                raises=pa.ArrowInvalid,
                reason=(
                    f"Implemented pyarrow.compute.subtract_checked "
                    f"which raises on overflow for {pa_dtype}"
                ),
            )
        if self._is_temporal_supported(opname, pa_dtype):
            return pytest.mark.xfail(
                raises=TypeError,
                # The trailing space keeps "between" and "pd.NA" apart.
                reason=(
                    f"{opname} not supported between "
                    f"pd.NA and {pa_dtype} Python scalar"
                ),
            )
        if opname in {"__rtruediv__", "__rfloordiv__"} and numeric_on_new_pyarrow:
            return pytest.mark.xfail(
                raises=pa.ArrowInvalid,
                reason="divide by 0",
            )
        return None

    def _patch_combine(self, obj, other, op):
        """Compute the expected result, cast back to the original Arrow dtype.

        Returns a ``DataFrame`` when the base ``_combine`` result is a
        ``DataFrame`` and a ``Series`` otherwise.
        """
        # BaseOpsUtil._combine can upcast expected dtype
        # (because it generates expected on python scalars)
        # while ArrowExtensionArray maintains original type
        expected = base.BaseArithmeticOpsTests._combine(self, obj, other, op)
        was_frame = isinstance(expected, pd.DataFrame)
        if was_frame:
            expected_data = expected.iloc[:, 0]
            original_dtype = obj.iloc[:, 0].dtype
        else:
            expected_data = expected
            original_dtype = obj.dtype
        pa_array = pa.array(expected_data._values).cast(original_dtype.pyarrow_dtype)
        pd_array = type(expected_data._values)(pa_array)
        if was_frame:
            expected = pd.DataFrame(
                pd_array, index=expected.index, columns=expected.columns
            )
        else:
            expected = pd.Series(pd_array)
        return expected

    def test_arith_series_with_scalar(
        self, data, all_arithmetic_operators, request, monkeypatch
    ):
        """Run the inherited Series-with-scalar test with Arrow expectations."""
        pa_dtype = data.dtype.pyarrow_dtype
        self.series_scalar_exc = self._get_expected_exception(
            all_arithmetic_operators, pa_dtype
        )
        mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype)
        if mark is not None:
            request.node.add_marker(mark)
        if all_arithmetic_operators == "__floordiv__" and pa.types.is_integer(pa_dtype):
            # BaseOpsUtil._combine always returns int64, while ArrowExtensionArray does
            # not upcast
            monkeypatch.setattr(TestBaseArithmeticOps, "_combine", self._patch_combine)
        super().test_arith_series_with_scalar(data, all_arithmetic_operators)

    def test_arith_frame_with_scalar(
        self, data, all_arithmetic_operators, request, monkeypatch
    ):
        """Run the inherited DataFrame-with-scalar test with Arrow expectations."""
        pa_dtype = data.dtype.pyarrow_dtype
        self.frame_scalar_exc = self._get_expected_exception(
            all_arithmetic_operators, pa_dtype
        )
        mark = self._get_arith_xfail_marker(all_arithmetic_operators, pa_dtype)
        if mark is not None:
            request.node.add_marker(mark)
        if all_arithmetic_operators == "__floordiv__" and pa.types.is_integer(pa_dtype):
            # BaseOpsUtil._combine always returns int64, while ArrowExtensionArray does
            # not upcast
            monkeypatch.setattr(TestBaseArithmeticOps, "_combine", self._patch_combine)
        super().test_arith_frame_with_scalar(data, all_arithmetic_operators)

    def test_arith_series_with_array(
        self, data, all_arithmetic_operators, request, monkeypatch
    ):
        """Check Series-with-array arithmetic using an Arrow-typed ``other``."""
        pa_dtype = data.dtype.pyarrow_dtype
        op_name = all_arithmetic_operators
        self.series_array_exc = self._get_expected_exception(op_name, pa_dtype)
        mark = self._get_arith_xfail_marker(op_name, pa_dtype, array_operand=True)
        if mark is not None:
            request.node.add_marker(mark)
        ser = pd.Series(data)
        # pd.Series([ser.iloc[0]] * len(ser)) may not return ArrowExtensionArray
        # since ser.iloc[0] is a python scalar
        other = pd.Series(pd.array([ser.iloc[0]] * len(ser), dtype=data.dtype))
        if pa.types.is_floating(pa_dtype) or (
            pa.types.is_integer(pa_dtype) and op_name != "__truediv__"
        ):
            monkeypatch.setattr(TestBaseArithmeticOps, "_combine", self._patch_combine)
        self.check_opname(ser, op_name, other, exc=self.series_array_exc)

    def test_add_series_with_extension_array(self, data, request):
        """Run the inherited Series + array test with Arrow-specific xfails."""
        pa_dtype = data.dtype.pyarrow_dtype
        addable = self._is_arrow_numeric(pa_dtype) or (
            not pa_version_under8p0 and pa.types.is_duration(pa_dtype)
        )
        if not addable or pa_version_under2p0:
            request.node.add_marker(
                pytest.mark.xfail(
                    raises=NotImplementedError,
                    reason=f"add_checked not implemented for {pa_dtype}",
                )
            )
        elif pa_dtype.equals("int8"):
            request.node.add_marker(
                pytest.mark.xfail(
                    raises=pa.ArrowInvalid,
                    reason=f"raises on overflow for {pa_dtype}",
                )
            )
        super().test_add_series_with_extension_array(data)
class TestBaseComparisonOps(base.BaseComparisonOpsTests):
    """Run the shared comparison-operator tests against Arrow-backed data."""

    def assert_series_equal(self, left, right, *args, **kwargs):
        """Compare ``left`` with ``right`` after rebuilding ``right`` as "boolean"."""
        # Series.combine for "expected" retains bool[pyarrow] dtype
        # While "result" return "boolean" dtype
        right_as_boolean = pd.Series(right._values.to_numpy(), dtype="boolean")
        super().assert_series_equal(left, right_as_boolean, *args, **kwargs)

    def test_compare_array(self, data, comparison_op, na_value, request):
        """Compare a Series with a same-length Series repeating its first value."""
        arrow_type = data.dtype.pyarrow_dtype
        ser = pd.Series(data)
        # pd.Series([ser.iloc[0]] * len(ser)) may not return ArrowExtensionArray
        # since ser.iloc[0] is a python scalar
        repeated_first = [ser.iloc[0]] * len(ser)
        other = pd.Series(pd.array(repeated_first, dtype=data.dtype))
        # Labels at which the result is asserted to be ``na_value``.
        na_labels = [8, 97]

        if comparison_op.__name__ in ["eq", "ne"]:
            # comparison should match point-wise comparisons
            result = comparison_op(ser, other)
            # Series.combine does not calculate the NA mask correctly
            # when comparing over an array
            for label in na_labels:
                assert result[label] is na_value
            expected = ser.combine(other, comparison_op)
            for label in na_labels:
                expected[label] = na_value
            self.assert_series_equal(result, expected)
            return

        try:
            result = comparison_op(ser, other)
        except Exception as err:
            raised_type = type(err)
        else:
            raised_type = None

        if raised_type is not None:
            # The vectorised comparison raised, so the point-wise one must raise
            # the same exception type.
            with pytest.raises(raised_type):
                ser.combine(other, comparison_op)
            return

        # Didn't error, then should match point-wise behavior
        if pa.types.is_temporal(arrow_type):
            # point-wise comparison with pd.NA raises TypeError
            for label in na_labels:
                assert result[label] is na_value
            result = result.drop(na_labels).reset_index(drop=True)
            ser = ser.drop(na_labels)
            other = other.drop(na_labels)
        expected = ser.combine(other, comparison_op)
        self.assert_series_equal(result, expected)

    def test_invalid_other_comp(self, data, comparison_op):
        """Comparing against a bare ``object()`` must raise NotImplementedError."""
        # GH 48833
        expected_message = ".* not implemented for <class 'object'>"
        with pytest.raises(NotImplementedError, match=expected_message):
            comparison_op(data, object())
def test_arrowdtype_construct_from_string_type_with_unsupported_parameters():
    """``construct_from_string`` must reject this parameterized timestamp alias."""
    parameterized_alias = "timestamp[s, tz=UTC][pyarrow]"
    expected_message = "Passing pyarrow type"
    with pytest.raises(NotImplementedError, match=expected_message):
        ArrowDtype.construct_from_string(parameterized_alias)
@pytest.mark.xfail(
    pa_version_under4p0,
    raises=NotImplementedError,
    reason="quantile only supported for pyarrow version >= 4.0",
)
@pytest.mark.parametrize(
    "interpolation", ["linear", "lower", "higher", "nearest", "midpoint"]
)
@pytest.mark.parametrize("quantile", [0.5, [0.5, 0.5]])
def test_quantile(data, interpolation, quantile, request):
    """Quantile of three copies of the first element equals that element.

    Non-integer, non-floating Arrow types are marked xfail before the check.
    """
    arrow_type = data.dtype.pyarrow_dtype
    if not pa.types.is_integer(arrow_type) and not pa.types.is_floating(arrow_type):
        unsupported_mark = pytest.mark.xfail(
            raises=pa.ArrowNotImplementedError,
            reason=f"quantile not supported by pyarrow for {arrow_type}",
        )
        request.node.add_marker(unsupported_mark)

    data = data.take([0, 0, 0])
    result = pd.Series(data).quantile(q=quantile, interpolation=interpolation)

    if quantile == 0.5:
        # Scalar quantile: the result is compared to the repeated element.
        assert result == data[0]
        return

    # Just check the values
    float_alias = "float64[pyarrow]"
    result = result.astype(float_alias)
    expected_values = data.take([0, 0]).astype(float_alias)
    expected = pd.Series(expected_values, index=[0.5, 0.5])
    tm.assert_series_equal(result, expected)
@pytest.mark.xfail(
    pa_version_under6p0,
    raises=NotImplementedError,
    reason="mode only supported for pyarrow version >= 6.0",
)
@pytest.mark.parametrize("dropna", [True, False])
@pytest.mark.parametrize(
    "take_idx, exp_idx",
    [[[0, 0, 2, 2, 4, 4], [4, 0]], [[0, 0, 0, 2, 4, 4], [0]]],
    ids=["multi_mode", "single_mode"],
)
def test_mode(data_for_grouping, dropna, take_idx, exp_idx, request):
    """``Series.mode`` on values taken at ``take_idx`` matches those at ``exp_idx``.

    Temporal types, and boolean data in the "multi_mode" case on pyarrow < 9.0,
    are marked xfail before the check.
    """
    arrow_type = data_for_grouping.dtype.pyarrow_dtype
    expected_failure = None
    if pa.types.is_temporal(arrow_type):
        expected_failure = pytest.mark.xfail(
            raises=pa.ArrowNotImplementedError,
            reason=f"mode not supported by pyarrow for {arrow_type}",
        )
    elif (
        pa.types.is_boolean(arrow_type)
        and "multi_mode" in request.node.nodeid
        and pa_version_under9p0
    ):
        expected_failure = pytest.mark.xfail(
            reason="https://issues.apache.org/jira/browse/ARROW-17096",
        )
    if expected_failure is not None:
        request.node.add_marker(expected_failure)

    sample = data_for_grouping.take(take_idx)
    result = pd.Series(sample).mode(dropna=dropna)
    expected = pd.Series(data_for_grouping.take(exp_idx))
    tm.assert_series_equal(result, expected)