1115 lines
37 KiB
Python
1115 lines
37 KiB
Python
""" test fancy indexing & misc """
|
|
|
|
import array
|
|
from datetime import datetime
|
|
import re
|
|
import weakref
|
|
|
|
import numpy as np
|
|
import pytest
|
|
|
|
from pandas.errors import IndexingError
|
|
|
|
from pandas.core.dtypes.common import (
|
|
is_float_dtype,
|
|
is_integer_dtype,
|
|
)
|
|
|
|
import pandas as pd
|
|
from pandas import (
|
|
DataFrame,
|
|
Index,
|
|
NaT,
|
|
Series,
|
|
date_range,
|
|
offsets,
|
|
timedelta_range,
|
|
)
|
|
import pandas._testing as tm
|
|
from pandas.core.api import Float64Index
|
|
from pandas.tests.indexing.common import _mklbl
|
|
from pandas.tests.indexing.test_floats import gen_obj
|
|
|
|
# ------------------------------------------------------------------------
|
|
# Indexing test cases
|
|
|
|
|
|
class TestFancy:
|
|
"""pure get/set item & fancy indexing"""
|
|
|
|
def test_setitem_ndarray_1d(self):
|
|
# GH5508
|
|
|
|
# len of indexer vs length of the 1d ndarray
|
|
df = DataFrame(index=Index(np.arange(1, 11)))
|
|
df["foo"] = np.zeros(10, dtype=np.float64)
|
|
df["bar"] = np.zeros(10, dtype=complex)
|
|
|
|
# invalid
|
|
msg = "Must have equal len keys and value when setting with an iterable"
|
|
with pytest.raises(ValueError, match=msg):
|
|
df.loc[df.index[2:5], "bar"] = np.array([2.33j, 1.23 + 0.1j, 2.2, 1.0])
|
|
|
|
# valid
|
|
df.loc[df.index[2:6], "bar"] = np.array([2.33j, 1.23 + 0.1j, 2.2, 1.0])
|
|
|
|
result = df.loc[df.index[2:6], "bar"]
|
|
expected = Series(
|
|
[2.33j, 1.23 + 0.1j, 2.2, 1.0], index=[3, 4, 5, 6], name="bar"
|
|
)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
def test_setitem_ndarray_1d_2(self):
|
|
# GH5508
|
|
|
|
# dtype getting changed?
|
|
df = DataFrame(index=Index(np.arange(1, 11)))
|
|
df["foo"] = np.zeros(10, dtype=np.float64)
|
|
df["bar"] = np.zeros(10, dtype=complex)
|
|
|
|
msg = "Must have equal len keys and value when setting with an iterable"
|
|
with pytest.raises(ValueError, match=msg):
|
|
with tm.assert_produces_warning(FutureWarning, match="label-based"):
|
|
df[2:5] = np.arange(1, 4) * 1j
|
|
|
|
def test_getitem_ndarray_3d(
|
|
self, index, frame_or_series, indexer_sli, using_array_manager
|
|
):
|
|
# GH 25567
|
|
obj = gen_obj(frame_or_series, index)
|
|
idxr = indexer_sli(obj)
|
|
nd3 = np.random.randint(5, size=(2, 2, 2))
|
|
|
|
msgs = []
|
|
if frame_or_series is Series and indexer_sli in [tm.setitem, tm.iloc]:
|
|
msgs.append(r"Wrong number of dimensions. values.ndim > ndim \[3 > 1\]")
|
|
if using_array_manager:
|
|
msgs.append("Passed array should be 1-dimensional")
|
|
if frame_or_series is Series or indexer_sli is tm.iloc:
|
|
msgs.append(r"Buffer has wrong number of dimensions \(expected 1, got 3\)")
|
|
if using_array_manager:
|
|
msgs.append("indexer should be 1-dimensional")
|
|
if indexer_sli is tm.loc or (
|
|
frame_or_series is Series and indexer_sli is tm.setitem
|
|
):
|
|
msgs.append("Cannot index with multidimensional key")
|
|
if frame_or_series is DataFrame and indexer_sli is tm.setitem:
|
|
msgs.append("Index data must be 1-dimensional")
|
|
if isinstance(index, pd.IntervalIndex) and indexer_sli is tm.iloc:
|
|
msgs.append("Index data must be 1-dimensional")
|
|
if isinstance(index, (pd.TimedeltaIndex, pd.DatetimeIndex, pd.PeriodIndex)):
|
|
msgs.append("Data must be 1-dimensional")
|
|
if len(index) == 0 or isinstance(index, pd.MultiIndex):
|
|
msgs.append("positional indexers are out-of-bounds")
|
|
if type(index) is Index and not isinstance(index._values, np.ndarray):
|
|
# e.g. Int64
|
|
msgs.append("values must be a 1D array")
|
|
|
|
# string[pyarrow]
|
|
msgs.append("only handle 1-dimensional arrays")
|
|
|
|
msg = "|".join(msgs)
|
|
|
|
potential_errors = (IndexError, ValueError, NotImplementedError)
|
|
with pytest.raises(potential_errors, match=msg):
|
|
idxr[nd3]
|
|
|
|
def test_setitem_ndarray_3d(self, index, frame_or_series, indexer_sli):
|
|
# GH 25567
|
|
obj = gen_obj(frame_or_series, index)
|
|
idxr = indexer_sli(obj)
|
|
nd3 = np.random.randint(5, size=(2, 2, 2))
|
|
|
|
if indexer_sli is tm.iloc:
|
|
err = ValueError
|
|
msg = f"Cannot set values with ndim > {obj.ndim}"
|
|
else:
|
|
err = ValueError
|
|
msg = "|".join(
|
|
[
|
|
r"Buffer has wrong number of dimensions \(expected 1, got 3\)",
|
|
"Cannot set values with ndim > 1",
|
|
"Index data must be 1-dimensional",
|
|
"Data must be 1-dimensional",
|
|
"Array conditional must be same shape as self",
|
|
]
|
|
)
|
|
|
|
with pytest.raises(err, match=msg):
|
|
idxr[nd3] = 0
|
|
|
|
def test_getitem_ndarray_0d(self):
|
|
# GH#24924
|
|
key = np.array(0)
|
|
|
|
# dataframe __getitem__
|
|
df = DataFrame([[1, 2], [3, 4]])
|
|
result = df[key]
|
|
expected = Series([1, 3], name=0)
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# series __getitem__
|
|
ser = Series([1, 2])
|
|
result = ser[key]
|
|
assert result == 1
|
|
|
|
def test_inf_upcast(self):
|
|
# GH 16957
|
|
# We should be able to use np.inf as a key
|
|
# np.inf should cause an index to convert to float
|
|
|
|
# Test with np.inf in rows
|
|
df = DataFrame(columns=[0])
|
|
df.loc[1] = 1
|
|
df.loc[2] = 2
|
|
df.loc[np.inf] = 3
|
|
|
|
# make sure we can look up the value
|
|
assert df.loc[np.inf, 0] == 3
|
|
|
|
result = df.index
|
|
expected = Float64Index([1, 2, np.inf])
|
|
tm.assert_index_equal(result, expected)
|
|
|
|
def test_setitem_dtype_upcast(self):
|
|
|
|
# GH3216
|
|
df = DataFrame([{"a": 1}, {"a": 3, "b": 2}])
|
|
df["c"] = np.nan
|
|
assert df["c"].dtype == np.float64
|
|
|
|
df.loc[0, "c"] = "foo"
|
|
expected = DataFrame(
|
|
[{"a": 1, "b": np.nan, "c": "foo"}, {"a": 3, "b": 2, "c": np.nan}]
|
|
)
|
|
tm.assert_frame_equal(df, expected)
|
|
|
|
@pytest.mark.parametrize("val", [3.14, "wxyz"])
|
|
def test_setitem_dtype_upcast2(self, val):
|
|
|
|
# GH10280
|
|
df = DataFrame(
|
|
np.arange(6, dtype="int64").reshape(2, 3),
|
|
index=list("ab"),
|
|
columns=["foo", "bar", "baz"],
|
|
)
|
|
|
|
left = df.copy()
|
|
left.loc["a", "bar"] = val
|
|
right = DataFrame(
|
|
[[0, val, 2], [3, 4, 5]],
|
|
index=list("ab"),
|
|
columns=["foo", "bar", "baz"],
|
|
)
|
|
|
|
tm.assert_frame_equal(left, right)
|
|
assert is_integer_dtype(left["foo"])
|
|
assert is_integer_dtype(left["baz"])
|
|
|
|
def test_setitem_dtype_upcast3(self):
|
|
left = DataFrame(
|
|
np.arange(6, dtype="int64").reshape(2, 3) / 10.0,
|
|
index=list("ab"),
|
|
columns=["foo", "bar", "baz"],
|
|
)
|
|
left.loc["a", "bar"] = "wxyz"
|
|
|
|
right = DataFrame(
|
|
[[0, "wxyz", 0.2], [0.3, 0.4, 0.5]],
|
|
index=list("ab"),
|
|
columns=["foo", "bar", "baz"],
|
|
)
|
|
|
|
tm.assert_frame_equal(left, right)
|
|
assert is_float_dtype(left["foo"])
|
|
assert is_float_dtype(left["baz"])
|
|
|
|
def test_dups_fancy_indexing(self):
|
|
|
|
# GH 3455
|
|
|
|
df = tm.makeCustomDataframe(10, 3)
|
|
df.columns = ["a", "a", "b"]
|
|
result = df[["b", "a"]].columns
|
|
expected = Index(["b", "a", "a"])
|
|
tm.assert_index_equal(result, expected)
|
|
|
|
def test_dups_fancy_indexing_across_dtypes(self):
|
|
|
|
# across dtypes
|
|
df = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]], columns=list("aaaaaaa"))
|
|
df.head()
|
|
str(df)
|
|
result = DataFrame([[1, 2, 1.0, 2.0, 3.0, "foo", "bar"]])
|
|
result.columns = list("aaaaaaa") # GH#3468
|
|
|
|
# GH#3509 smoke tests for indexing with duplicate columns
|
|
df.iloc[:, 4]
|
|
result.iloc[:, 4]
|
|
|
|
tm.assert_frame_equal(df, result)
|
|
|
|
def test_dups_fancy_indexing_not_in_order(self):
|
|
# GH 3561, dups not in selected order
|
|
df = DataFrame(
|
|
{"test": [5, 7, 9, 11], "test1": [4.0, 5, 6, 7], "other": list("abcd")},
|
|
index=["A", "A", "B", "C"],
|
|
)
|
|
rows = ["C", "B"]
|
|
expected = DataFrame(
|
|
{"test": [11, 9], "test1": [7.0, 6], "other": ["d", "c"]}, index=rows
|
|
)
|
|
result = df.loc[rows]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
result = df.loc[Index(rows)]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
rows = ["C", "B", "E"]
|
|
with pytest.raises(KeyError, match="not in index"):
|
|
df.loc[rows]
|
|
|
|
# see GH5553, make sure we use the right indexer
|
|
rows = ["F", "G", "H", "C", "B", "E"]
|
|
with pytest.raises(KeyError, match="not in index"):
|
|
df.loc[rows]
|
|
|
|
def test_dups_fancy_indexing_only_missing_label(self):
|
|
|
|
# List containing only missing label
|
|
dfnu = DataFrame(np.random.randn(5, 3), index=list("AABCD"))
|
|
with pytest.raises(
|
|
KeyError,
|
|
match=re.escape(
|
|
"\"None of [Index(['E'], dtype='object')] are in the [index]\""
|
|
),
|
|
):
|
|
dfnu.loc[["E"]]
|
|
|
|
@pytest.mark.parametrize("vals", [[0, 1, 2], list("abc")])
|
|
def test_dups_fancy_indexing_missing_label(self, vals):
|
|
|
|
# GH 4619; duplicate indexer with missing label
|
|
df = DataFrame({"A": vals})
|
|
with pytest.raises(KeyError, match="not in index"):
|
|
df.loc[[0, 8, 0]]
|
|
|
|
def test_dups_fancy_indexing_non_unique(self):
|
|
|
|
# non unique with non unique selector
|
|
df = DataFrame({"test": [5, 7, 9, 11]}, index=["A", "A", "B", "C"])
|
|
with pytest.raises(KeyError, match="not in index"):
|
|
df.loc[["A", "A", "E"]]
|
|
|
|
def test_dups_fancy_indexing2(self):
|
|
# GH 5835
|
|
# dups on index and missing values
|
|
df = DataFrame(np.random.randn(5, 5), columns=["A", "B", "B", "B", "A"])
|
|
|
|
with pytest.raises(KeyError, match="not in index"):
|
|
df.loc[:, ["A", "B", "C"]]
|
|
|
|
def test_dups_fancy_indexing3(self):
|
|
|
|
# GH 6504, multi-axis indexing
|
|
df = DataFrame(
|
|
np.random.randn(9, 2), index=[1, 1, 1, 2, 2, 2, 3, 3, 3], columns=["a", "b"]
|
|
)
|
|
|
|
expected = df.iloc[0:6]
|
|
result = df.loc[[1, 2]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
expected = df
|
|
result = df.loc[:, ["a", "b"]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
expected = df.iloc[0:6, :]
|
|
result = df.loc[[1, 2], ["a", "b"]]
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_duplicate_int_indexing(self, indexer_sl):
|
|
# GH 17347
|
|
ser = Series(range(3), index=[1, 1, 3])
|
|
expected = Series(range(2), index=[1, 1])
|
|
result = indexer_sl(ser)[[1]]
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
def test_indexing_mixed_frame_bug(self):
|
|
|
|
# GH3492
|
|
df = DataFrame(
|
|
{"a": {1: "aaa", 2: "bbb", 3: "ccc"}, "b": {1: 111, 2: 222, 3: 333}}
|
|
)
|
|
|
|
# this works, new column is created correctly
|
|
df["test"] = df["a"].apply(lambda x: "_" if x == "aaa" else x)
|
|
|
|
# this does not work, ie column test is not changed
|
|
idx = df["test"] == "_"
|
|
temp = df.loc[idx, "a"].apply(lambda x: "-----" if x == "aaa" else x)
|
|
df.loc[idx, "test"] = temp
|
|
assert df.iloc[0, 2] == "-----"
|
|
|
|
def test_multitype_list_index_access(self):
|
|
# GH 10610
|
|
df = DataFrame(np.random.random((10, 5)), columns=["a"] + [20, 21, 22, 23])
|
|
|
|
with pytest.raises(KeyError, match=re.escape("'[26, -8] not in index'")):
|
|
df[[22, 26, -8]]
|
|
assert df[21].shape[0] == df.shape[0]
|
|
|
|
def test_set_index_nan(self):
|
|
|
|
# GH 3586
|
|
df = DataFrame(
|
|
{
|
|
"PRuid": {
|
|
17: "nonQC",
|
|
18: "nonQC",
|
|
19: "nonQC",
|
|
20: "10",
|
|
21: "11",
|
|
22: "12",
|
|
23: "13",
|
|
24: "24",
|
|
25: "35",
|
|
26: "46",
|
|
27: "47",
|
|
28: "48",
|
|
29: "59",
|
|
30: "10",
|
|
},
|
|
"QC": {
|
|
17: 0.0,
|
|
18: 0.0,
|
|
19: 0.0,
|
|
20: np.nan,
|
|
21: np.nan,
|
|
22: np.nan,
|
|
23: np.nan,
|
|
24: 1.0,
|
|
25: np.nan,
|
|
26: np.nan,
|
|
27: np.nan,
|
|
28: np.nan,
|
|
29: np.nan,
|
|
30: np.nan,
|
|
},
|
|
"data": {
|
|
17: 7.9544899999999998,
|
|
18: 8.0142609999999994,
|
|
19: 7.8591520000000008,
|
|
20: 0.86140349999999999,
|
|
21: 0.87853110000000001,
|
|
22: 0.8427041999999999,
|
|
23: 0.78587700000000005,
|
|
24: 0.73062459999999996,
|
|
25: 0.81668560000000001,
|
|
26: 0.81927080000000008,
|
|
27: 0.80705009999999999,
|
|
28: 0.81440240000000008,
|
|
29: 0.80140849999999997,
|
|
30: 0.81307740000000006,
|
|
},
|
|
"year": {
|
|
17: 2006,
|
|
18: 2007,
|
|
19: 2008,
|
|
20: 1985,
|
|
21: 1985,
|
|
22: 1985,
|
|
23: 1985,
|
|
24: 1985,
|
|
25: 1985,
|
|
26: 1985,
|
|
27: 1985,
|
|
28: 1985,
|
|
29: 1985,
|
|
30: 1986,
|
|
},
|
|
}
|
|
).reset_index()
|
|
|
|
result = (
|
|
df.set_index(["year", "PRuid", "QC"])
|
|
.reset_index()
|
|
.reindex(columns=df.columns)
|
|
)
|
|
tm.assert_frame_equal(result, df)
|
|
|
|
def test_multi_assign(self):
|
|
|
|
# GH 3626, an assignment of a sub-df to a df
|
|
df = DataFrame(
|
|
{
|
|
"FC": ["a", "b", "a", "b", "a", "b"],
|
|
"PF": [0, 0, 0, 0, 1, 1],
|
|
"col1": list(range(6)),
|
|
"col2": list(range(6, 12)),
|
|
}
|
|
)
|
|
df.iloc[1, 0] = np.nan
|
|
df2 = df.copy()
|
|
|
|
mask = ~df2.FC.isna()
|
|
cols = ["col1", "col2"]
|
|
|
|
dft = df2 * 2
|
|
dft.iloc[3, 3] = np.nan
|
|
|
|
expected = DataFrame(
|
|
{
|
|
"FC": ["a", np.nan, "a", "b", "a", "b"],
|
|
"PF": [0, 0, 0, 0, 1, 1],
|
|
"col1": Series([0, 1, 4, 6, 8, 10]),
|
|
"col2": [12, 7, 16, np.nan, 20, 22],
|
|
}
|
|
)
|
|
|
|
# frame on rhs
|
|
df2.loc[mask, cols] = dft.loc[mask, cols]
|
|
tm.assert_frame_equal(df2, expected)
|
|
|
|
# with an ndarray on rhs
|
|
# coerces to float64 because values has float64 dtype
|
|
# GH 14001
|
|
expected = DataFrame(
|
|
{
|
|
"FC": ["a", np.nan, "a", "b", "a", "b"],
|
|
"PF": [0, 0, 0, 0, 1, 1],
|
|
"col1": [0.0, 1.0, 4.0, 6.0, 8.0, 10.0],
|
|
"col2": [12, 7, 16, np.nan, 20, 22],
|
|
}
|
|
)
|
|
df2 = df.copy()
|
|
df2.loc[mask, cols] = dft.loc[mask, cols].values
|
|
tm.assert_frame_equal(df2, expected)
|
|
|
|
def test_multi_assign_broadcasting_rhs(self):
|
|
# broadcasting on the rhs is required
|
|
df = DataFrame(
|
|
{
|
|
"A": [1, 2, 0, 0, 0],
|
|
"B": [0, 0, 0, 10, 11],
|
|
"C": [0, 0, 0, 10, 11],
|
|
"D": [3, 4, 5, 6, 7],
|
|
}
|
|
)
|
|
|
|
expected = df.copy()
|
|
mask = expected["A"] == 0
|
|
for col in ["A", "B"]:
|
|
expected.loc[mask, col] = df["D"]
|
|
|
|
df.loc[df["A"] == 0, ["A", "B"]] = df["D"]
|
|
tm.assert_frame_equal(df, expected)
|
|
|
|
def test_setitem_list(self):
|
|
|
|
# GH 6043
|
|
# iloc with a list
|
|
df = DataFrame(index=[0, 1], columns=[0])
|
|
df.iloc[1, 0] = [1, 2, 3]
|
|
df.iloc[1, 0] = [1, 2]
|
|
|
|
result = DataFrame(index=[0, 1], columns=[0])
|
|
result.iloc[1, 0] = [1, 2]
|
|
|
|
tm.assert_frame_equal(result, df)
|
|
|
|
def test_string_slice(self):
|
|
# GH 14424
|
|
# string indexing against datetimelike with object
|
|
# dtype should properly raises KeyError
|
|
df = DataFrame([1], Index([pd.Timestamp("2011-01-01")], dtype=object))
|
|
assert df.index._is_all_dates
|
|
with pytest.raises(KeyError, match="'2011'"):
|
|
df["2011"]
|
|
|
|
with pytest.raises(KeyError, match="'2011'"):
|
|
df.loc["2011", 0]
|
|
|
|
def test_string_slice_empty(self):
|
|
# GH 14424
|
|
|
|
df = DataFrame()
|
|
assert not df.index._is_all_dates
|
|
with pytest.raises(KeyError, match="'2011'"):
|
|
df["2011"]
|
|
|
|
with pytest.raises(KeyError, match="^0$"):
|
|
df.loc["2011", 0]
|
|
|
|
def test_astype_assignment(self):
|
|
|
|
# GH4312 (iloc)
|
|
df_orig = DataFrame(
|
|
[["1", "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
|
|
)
|
|
|
|
df = df_orig.copy()
|
|
msg = "will attempt to set the values inplace instead"
|
|
with tm.assert_produces_warning(DeprecationWarning, match=msg):
|
|
df.iloc[:, 0:2] = df.iloc[:, 0:2].astype(np.int64)
|
|
expected = DataFrame(
|
|
[[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
|
|
)
|
|
tm.assert_frame_equal(df, expected)
|
|
|
|
df = df_orig.copy()
|
|
with tm.assert_produces_warning(DeprecationWarning, match=msg):
|
|
df.iloc[:, 0:2] = df.iloc[:, 0:2]._convert(datetime=True, numeric=True)
|
|
expected = DataFrame(
|
|
[[1, 2, "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
|
|
)
|
|
tm.assert_frame_equal(df, expected)
|
|
|
|
# GH5702 (loc)
|
|
df = df_orig.copy()
|
|
with tm.assert_produces_warning(DeprecationWarning, match=msg):
|
|
df.loc[:, "A"] = df.loc[:, "A"].astype(np.int64)
|
|
expected = DataFrame(
|
|
[[1, "2", "3", ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
|
|
)
|
|
tm.assert_frame_equal(df, expected)
|
|
|
|
df = df_orig.copy()
|
|
with tm.assert_produces_warning(DeprecationWarning, match=msg):
|
|
df.loc[:, ["B", "C"]] = df.loc[:, ["B", "C"]].astype(np.int64)
|
|
expected = DataFrame(
|
|
[["1", 2, 3, ".4", 5, 6.0, "foo"]], columns=list("ABCDEFG")
|
|
)
|
|
tm.assert_frame_equal(df, expected)
|
|
|
|
def test_astype_assignment_full_replacements(self):
|
|
# full replacements / no nans
|
|
df = DataFrame({"A": [1.0, 2.0, 3.0, 4.0]})
|
|
msg = "will attempt to set the values inplace instead"
|
|
with tm.assert_produces_warning(DeprecationWarning, match=msg):
|
|
df.iloc[:, 0] = df["A"].astype(np.int64)
|
|
expected = DataFrame({"A": [1, 2, 3, 4]})
|
|
tm.assert_frame_equal(df, expected)
|
|
|
|
df = DataFrame({"A": [1.0, 2.0, 3.0, 4.0]})
|
|
with tm.assert_produces_warning(DeprecationWarning, match=msg):
|
|
df.loc[:, "A"] = df["A"].astype(np.int64)
|
|
expected = DataFrame({"A": [1, 2, 3, 4]})
|
|
tm.assert_frame_equal(df, expected)
|
|
|
|
@pytest.mark.parametrize("indexer", [tm.getitem, tm.loc])
|
|
def test_index_type_coercion(self, indexer):
|
|
|
|
# GH 11836
|
|
# if we have an index type and set it with something that looks
|
|
# to numpy like the same, but is actually, not
|
|
# (e.g. setting with a float or string '0')
|
|
# then we need to coerce to object
|
|
|
|
# integer indexes
|
|
for s in [Series(range(5)), Series(range(5), index=range(1, 6))]:
|
|
|
|
assert s.index.is_integer()
|
|
|
|
s2 = s.copy()
|
|
indexer(s2)[0.1] = 0
|
|
assert s2.index.is_floating()
|
|
assert indexer(s2)[0.1] == 0
|
|
|
|
s2 = s.copy()
|
|
indexer(s2)[0.0] = 0
|
|
exp = s.index
|
|
if 0 not in s:
|
|
exp = Index(s.index.tolist() + [0])
|
|
tm.assert_index_equal(s2.index, exp)
|
|
|
|
s2 = s.copy()
|
|
indexer(s2)["0"] = 0
|
|
assert s2.index.is_object()
|
|
|
|
for s in [Series(range(5), index=np.arange(5.0))]:
|
|
|
|
assert s.index.is_floating()
|
|
|
|
s2 = s.copy()
|
|
indexer(s2)[0.1] = 0
|
|
assert s2.index.is_floating()
|
|
assert indexer(s2)[0.1] == 0
|
|
|
|
s2 = s.copy()
|
|
indexer(s2)[0.0] = 0
|
|
tm.assert_index_equal(s2.index, s.index)
|
|
|
|
s2 = s.copy()
|
|
indexer(s2)["0"] = 0
|
|
assert s2.index.is_object()
|
|
|
|
|
|
class TestMisc:
|
|
def test_float_index_to_mixed(self):
|
|
df = DataFrame({0.0: np.random.rand(10), 1.0: np.random.rand(10)})
|
|
df["a"] = 10
|
|
|
|
expected = DataFrame({0.0: df[0.0], 1.0: df[1.0], "a": [10] * 10})
|
|
tm.assert_frame_equal(expected, df)
|
|
|
|
def test_float_index_non_scalar_assignment(self):
|
|
df = DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}, index=[1.0, 2.0, 3.0])
|
|
df.loc[df.index[:2]] = 1
|
|
expected = DataFrame({"a": [1, 1, 3], "b": [1, 1, 5]}, index=df.index)
|
|
tm.assert_frame_equal(expected, df)
|
|
|
|
def test_loc_setitem_fullindex_views(self):
|
|
df = DataFrame({"a": [1, 2, 3], "b": [3, 4, 5]}, index=[1.0, 2.0, 3.0])
|
|
df2 = df.copy()
|
|
df.loc[df.index] = df.loc[df.index]
|
|
tm.assert_frame_equal(df, df2)
|
|
|
|
def test_rhs_alignment(self):
|
|
# GH8258, tests that both rows & columns are aligned to what is
|
|
# assigned to. covers both uniform data-type & multi-type cases
|
|
def run_tests(df, rhs, right_loc, right_iloc):
|
|
# label, index, slice
|
|
lbl_one, idx_one, slice_one = list("bcd"), [1, 2, 3], slice(1, 4)
|
|
lbl_two, idx_two, slice_two = ["joe", "jolie"], [1, 2], slice(1, 3)
|
|
|
|
left = df.copy()
|
|
left.loc[lbl_one, lbl_two] = rhs
|
|
tm.assert_frame_equal(left, right_loc)
|
|
|
|
left = df.copy()
|
|
left.iloc[idx_one, idx_two] = rhs
|
|
tm.assert_frame_equal(left, right_iloc)
|
|
|
|
left = df.copy()
|
|
left.iloc[slice_one, slice_two] = rhs
|
|
tm.assert_frame_equal(left, right_iloc)
|
|
|
|
xs = np.arange(20).reshape(5, 4)
|
|
cols = ["jim", "joe", "jolie", "joline"]
|
|
df = DataFrame(xs, columns=cols, index=list("abcde"), dtype="int64")
|
|
|
|
# right hand side; permute the indices and multiplpy by -2
|
|
rhs = -2 * df.iloc[3:0:-1, 2:0:-1]
|
|
|
|
# expected `right` result; just multiply by -2
|
|
right_iloc = df.copy()
|
|
right_iloc["joe"] = [1, 14, 10, 6, 17]
|
|
right_iloc["jolie"] = [2, 13, 9, 5, 18]
|
|
right_iloc.iloc[1:4, 1:3] *= -2
|
|
right_loc = df.copy()
|
|
right_loc.iloc[1:4, 1:3] *= -2
|
|
|
|
# run tests with uniform dtypes
|
|
run_tests(df, rhs, right_loc, right_iloc)
|
|
|
|
# make frames multi-type & re-run tests
|
|
for frame in [df, rhs, right_loc, right_iloc]:
|
|
frame["joe"] = frame["joe"].astype("float64")
|
|
frame["jolie"] = frame["jolie"].map("@{}".format)
|
|
right_iloc["joe"] = [1.0, "@-28", "@-20", "@-12", 17.0]
|
|
right_iloc["jolie"] = ["@2", -26.0, -18.0, -10.0, "@18"]
|
|
run_tests(df, rhs, right_loc, right_iloc)
|
|
|
|
@pytest.mark.parametrize(
|
|
"idx", [_mklbl("A", 20), np.arange(20) + 100, np.linspace(100, 150, 20)]
|
|
)
|
|
def test_str_label_slicing_with_negative_step(self, idx):
|
|
SLC = pd.IndexSlice
|
|
|
|
idx = Index(idx)
|
|
ser = Series(np.arange(20), index=idx)
|
|
tm.assert_indexing_slices_equivalent(ser, SLC[idx[9] :: -1], SLC[9::-1])
|
|
tm.assert_indexing_slices_equivalent(ser, SLC[: idx[9] : -1], SLC[:8:-1])
|
|
tm.assert_indexing_slices_equivalent(
|
|
ser, SLC[idx[13] : idx[9] : -1], SLC[13:8:-1]
|
|
)
|
|
tm.assert_indexing_slices_equivalent(ser, SLC[idx[9] : idx[13] : -1], SLC[:0])
|
|
|
|
def test_slice_with_zero_step_raises(self, index, indexer_sl, frame_or_series):
|
|
obj = frame_or_series(np.arange(len(index)), index=index)
|
|
with pytest.raises(ValueError, match="slice step cannot be zero"):
|
|
indexer_sl(obj)[::0]
|
|
|
|
def test_loc_setitem_indexing_assignment_dict_already_exists(self):
|
|
index = Index([-5, 0, 5], name="z")
|
|
df = DataFrame({"x": [1, 2, 6], "y": [2, 2, 8]}, index=index)
|
|
expected = df.copy()
|
|
rhs = {"x": 9, "y": 99}
|
|
df.loc[5] = rhs
|
|
expected.loc[5] = [9, 99]
|
|
tm.assert_frame_equal(df, expected)
|
|
|
|
# GH#38335 same thing, mixed dtypes
|
|
df = DataFrame({"x": [1, 2, 6], "y": [2.0, 2.0, 8.0]}, index=index)
|
|
df.loc[5] = rhs
|
|
expected = DataFrame({"x": [1, 2, 9], "y": [2.0, 2.0, 99.0]}, index=index)
|
|
tm.assert_frame_equal(df, expected)
|
|
|
|
def test_iloc_getitem_indexing_dtypes_on_empty(self):
|
|
# Check that .iloc returns correct dtypes GH9983
|
|
df = DataFrame({"a": [1, 2, 3], "b": ["b", "b2", "b3"]})
|
|
df2 = df.iloc[[], :]
|
|
|
|
assert df2.loc[:, "a"].dtype == np.int64
|
|
tm.assert_series_equal(df2.loc[:, "a"], df2.iloc[:, 0])
|
|
|
|
@pytest.mark.parametrize("size", [5, 999999, 1000000])
|
|
def test_loc_range_in_series_indexing(self, size):
|
|
# range can cause an indexing error
|
|
# GH 11652
|
|
s = Series(index=range(size), dtype=np.float64)
|
|
s.loc[range(1)] = 42
|
|
tm.assert_series_equal(s.loc[range(1)], Series(42.0, index=[0]))
|
|
|
|
s.loc[range(2)] = 43
|
|
tm.assert_series_equal(s.loc[range(2)], Series(43.0, index=[0, 1]))
|
|
|
|
def test_partial_boolean_frame_indexing(self):
|
|
# GH 17170
|
|
df = DataFrame(
|
|
np.arange(9.0).reshape(3, 3), index=list("abc"), columns=list("ABC")
|
|
)
|
|
index_df = DataFrame(1, index=list("ab"), columns=list("AB"))
|
|
result = df[index_df.notnull()]
|
|
expected = DataFrame(
|
|
np.array([[0.0, 1.0, np.nan], [3.0, 4.0, np.nan], [np.nan] * 3]),
|
|
index=list("abc"),
|
|
columns=list("ABC"),
|
|
)
|
|
tm.assert_frame_equal(result, expected)
|
|
|
|
def test_no_reference_cycle(self):
|
|
df = DataFrame({"a": [0, 1], "b": [2, 3]})
|
|
for name in ("loc", "iloc", "at", "iat"):
|
|
getattr(df, name)
|
|
wr = weakref.ref(df)
|
|
del df
|
|
assert wr() is None
|
|
|
|
def test_label_indexing_on_nan(self, nulls_fixture):
|
|
# GH 32431
|
|
df = Series([1, "{1,2}", 1, nulls_fixture])
|
|
vc = df.value_counts(dropna=False)
|
|
result1 = vc.loc[nulls_fixture]
|
|
result2 = vc[nulls_fixture]
|
|
|
|
expected = 1
|
|
assert result1 == expected
|
|
assert result2 == expected
|
|
|
|
|
|
class TestDataframeNoneCoercion:
|
|
EXPECTED_SINGLE_ROW_RESULTS = [
|
|
# For numeric series, we should coerce to NaN.
|
|
([1, 2, 3], [np.nan, 2, 3]),
|
|
([1.0, 2.0, 3.0], [np.nan, 2.0, 3.0]),
|
|
# For datetime series, we should coerce to NaT.
|
|
(
|
|
[datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
|
|
[NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)],
|
|
),
|
|
# For objects, we should preserve the None value.
|
|
(["foo", "bar", "baz"], [None, "bar", "baz"]),
|
|
]
|
|
|
|
@pytest.mark.parametrize("expected", EXPECTED_SINGLE_ROW_RESULTS)
|
|
def test_coercion_with_loc(self, expected):
|
|
start_data, expected_result = expected
|
|
|
|
start_dataframe = DataFrame({"foo": start_data})
|
|
start_dataframe.loc[0, ["foo"]] = None
|
|
|
|
expected_dataframe = DataFrame({"foo": expected_result})
|
|
tm.assert_frame_equal(start_dataframe, expected_dataframe)
|
|
|
|
@pytest.mark.parametrize("expected", EXPECTED_SINGLE_ROW_RESULTS)
|
|
def test_coercion_with_setitem_and_dataframe(self, expected):
|
|
start_data, expected_result = expected
|
|
|
|
start_dataframe = DataFrame({"foo": start_data})
|
|
start_dataframe[start_dataframe["foo"] == start_dataframe["foo"][0]] = None
|
|
|
|
expected_dataframe = DataFrame({"foo": expected_result})
|
|
tm.assert_frame_equal(start_dataframe, expected_dataframe)
|
|
|
|
@pytest.mark.parametrize("expected", EXPECTED_SINGLE_ROW_RESULTS)
|
|
def test_none_coercion_loc_and_dataframe(self, expected):
|
|
start_data, expected_result = expected
|
|
|
|
start_dataframe = DataFrame({"foo": start_data})
|
|
start_dataframe.loc[start_dataframe["foo"] == start_dataframe["foo"][0]] = None
|
|
|
|
expected_dataframe = DataFrame({"foo": expected_result})
|
|
tm.assert_frame_equal(start_dataframe, expected_dataframe)
|
|
|
|
def test_none_coercion_mixed_dtypes(self):
|
|
start_dataframe = DataFrame(
|
|
{
|
|
"a": [1, 2, 3],
|
|
"b": [1.0, 2.0, 3.0],
|
|
"c": [datetime(2000, 1, 1), datetime(2000, 1, 2), datetime(2000, 1, 3)],
|
|
"d": ["a", "b", "c"],
|
|
}
|
|
)
|
|
start_dataframe.iloc[0] = None
|
|
|
|
exp = DataFrame(
|
|
{
|
|
"a": [np.nan, 2, 3],
|
|
"b": [np.nan, 2.0, 3.0],
|
|
"c": [NaT, datetime(2000, 1, 2), datetime(2000, 1, 3)],
|
|
"d": [None, "b", "c"],
|
|
}
|
|
)
|
|
tm.assert_frame_equal(start_dataframe, exp)
|
|
|
|
|
|
class TestDatetimelikeCoercion:
|
|
def test_setitem_dt64_string_scalar(self, tz_naive_fixture, indexer_sli):
|
|
# dispatching _can_hold_element to underlying DatetimeArray
|
|
tz = tz_naive_fixture
|
|
|
|
dti = date_range("2016-01-01", periods=3, tz=tz)
|
|
ser = Series(dti)
|
|
|
|
values = ser._values
|
|
|
|
newval = "2018-01-01"
|
|
values._validate_setitem_value(newval)
|
|
|
|
indexer_sli(ser)[0] = newval
|
|
|
|
if tz is None:
|
|
# TODO(EA2D): we can make this no-copy in tz-naive case too
|
|
assert ser.dtype == dti.dtype
|
|
assert ser._values._data is values._data
|
|
else:
|
|
assert ser._values is values
|
|
|
|
@pytest.mark.parametrize("box", [list, np.array, pd.array, pd.Categorical, Index])
|
|
@pytest.mark.parametrize(
|
|
"key", [[0, 1], slice(0, 2), np.array([True, True, False])]
|
|
)
|
|
def test_setitem_dt64_string_values(self, tz_naive_fixture, indexer_sli, key, box):
|
|
# dispatching _can_hold_element to underling DatetimeArray
|
|
tz = tz_naive_fixture
|
|
|
|
if isinstance(key, slice) and indexer_sli is tm.loc:
|
|
key = slice(0, 1)
|
|
|
|
dti = date_range("2016-01-01", periods=3, tz=tz)
|
|
ser = Series(dti)
|
|
|
|
values = ser._values
|
|
|
|
newvals = box(["2019-01-01", "2010-01-02"])
|
|
values._validate_setitem_value(newvals)
|
|
|
|
indexer_sli(ser)[key] = newvals
|
|
|
|
if tz is None:
|
|
# TODO(EA2D): we can make this no-copy in tz-naive case too
|
|
assert ser.dtype == dti.dtype
|
|
assert ser._values._data is values._data
|
|
else:
|
|
assert ser._values is values
|
|
|
|
@pytest.mark.parametrize("scalar", ["3 Days", offsets.Hour(4)])
|
|
def test_setitem_td64_scalar(self, indexer_sli, scalar):
|
|
# dispatching _can_hold_element to underling TimedeltaArray
|
|
tdi = timedelta_range("1 Day", periods=3)
|
|
ser = Series(tdi)
|
|
|
|
values = ser._values
|
|
values._validate_setitem_value(scalar)
|
|
|
|
indexer_sli(ser)[0] = scalar
|
|
assert ser._values._data is values._data
|
|
|
|
@pytest.mark.parametrize("box", [list, np.array, pd.array, pd.Categorical, Index])
|
|
@pytest.mark.parametrize(
|
|
"key", [[0, 1], slice(0, 2), np.array([True, True, False])]
|
|
)
|
|
def test_setitem_td64_string_values(self, indexer_sli, key, box):
|
|
# dispatching _can_hold_element to underling TimedeltaArray
|
|
if isinstance(key, slice) and indexer_sli is tm.loc:
|
|
key = slice(0, 1)
|
|
|
|
tdi = timedelta_range("1 Day", periods=3)
|
|
ser = Series(tdi)
|
|
|
|
values = ser._values
|
|
|
|
newvals = box(["10 Days", "44 hours"])
|
|
values._validate_setitem_value(newvals)
|
|
|
|
indexer_sli(ser)[key] = newvals
|
|
assert ser._values._data is values._data
|
|
|
|
|
|
def test_extension_array_cross_section():
|
|
# A cross-section of a homogeneous EA should be an EA
|
|
df = DataFrame(
|
|
{
|
|
"A": pd.array([1, 2], dtype="Int64"),
|
|
"B": pd.array([3, 4], dtype="Int64"),
|
|
},
|
|
index=["a", "b"],
|
|
)
|
|
expected = Series(pd.array([1, 3], dtype="Int64"), index=["A", "B"], name="a")
|
|
result = df.loc["a"]
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = df.iloc[0]
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
|
|
def test_extension_array_cross_section_converts():
|
|
# all numeric columns -> numeric series
|
|
df = DataFrame(
|
|
{
|
|
"A": pd.array([1, 2], dtype="Int64"),
|
|
"B": np.array([1, 2], dtype="int64"),
|
|
},
|
|
index=["a", "b"],
|
|
)
|
|
result = df.loc["a"]
|
|
expected = Series([1, 1], dtype="Int64", index=["A", "B"], name="a")
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = df.iloc[0]
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
# mixed columns -> object series
|
|
df = DataFrame(
|
|
{"A": pd.array([1, 2], dtype="Int64"), "B": np.array(["a", "b"])},
|
|
index=["a", "b"],
|
|
)
|
|
result = df.loc["a"]
|
|
expected = Series([1, "a"], dtype=object, index=["A", "B"], name="a")
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
result = df.iloc[0]
|
|
tm.assert_series_equal(result, expected)
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"ser, keys",
|
|
[(Series([10]), (0, 0)), (Series([1, 2, 3], index=list("abc")), (0, 1))],
|
|
)
|
|
def test_ser_tup_indexer_exceeds_dimensions(ser, keys, indexer_li):
|
|
# GH#13831
|
|
exp_err, exp_msg = IndexingError, "Too many indexers"
|
|
with pytest.raises(exp_err, match=exp_msg):
|
|
indexer_li(ser)[keys]
|
|
|
|
if indexer_li == tm.iloc:
|
|
# For iloc.__setitem__ we let numpy handle the error reporting.
|
|
exp_err, exp_msg = IndexError, "too many indices for array"
|
|
|
|
with pytest.raises(exp_err, match=exp_msg):
|
|
indexer_li(ser)[keys] = 0
|
|
|
|
|
|
def test_ser_list_indexer_exceeds_dimensions(indexer_li):
|
|
# GH#13831
|
|
# Make sure an exception is raised when a tuple exceeds the dimension of the series,
|
|
# but not list when a list is used.
|
|
ser = Series([10])
|
|
res = indexer_li(ser)[[0, 0]]
|
|
exp = Series([10, 10], index=Index([0, 0]))
|
|
tm.assert_series_equal(res, exp)
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"value", [(0, 1), [0, 1], np.array([0, 1]), array.array("b", [0, 1])]
|
|
)
|
|
def test_scalar_setitem_with_nested_value(value):
|
|
# For numeric data, we try to unpack and thus raise for mismatching length
|
|
df = DataFrame({"A": [1, 2, 3]})
|
|
msg = "|".join(
|
|
[
|
|
"Must have equal len keys and value",
|
|
"setting an array element with a sequence",
|
|
]
|
|
)
|
|
with pytest.raises(ValueError, match=msg):
|
|
df.loc[0, "B"] = value
|
|
|
|
# TODO For object dtype this happens as well, but should we rather preserve
|
|
# the nested data and set as such?
|
|
df = DataFrame({"A": [1, 2, 3], "B": np.array([1, "a", "b"], dtype=object)})
|
|
with pytest.raises(ValueError, match="Must have equal len keys and value"):
|
|
df.loc[0, "B"] = value
|
|
# if isinstance(value, np.ndarray):
|
|
# assert (df.loc[0, "B"] == value).all()
|
|
# else:
|
|
# assert df.loc[0, "B"] == value
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"value", [(0, 1), [0, 1], np.array([0, 1]), array.array("b", [0, 1])]
|
|
)
|
|
def test_scalar_setitem_series_with_nested_value(value, indexer_sli):
|
|
# For numeric data, we try to unpack and thus raise for mismatching length
|
|
ser = Series([1, 2, 3])
|
|
with pytest.raises(ValueError, match="setting an array element with a sequence"):
|
|
indexer_sli(ser)[0] = value
|
|
|
|
# but for object dtype we preserve the nested data and set as such
|
|
ser = Series([1, "a", "b"], dtype=object)
|
|
indexer_sli(ser)[0] = value
|
|
if isinstance(value, np.ndarray):
|
|
assert (ser.loc[0] == value).all()
|
|
else:
|
|
assert ser.loc[0] == value
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"value", [(0.0,), [0.0], np.array([0.0]), array.array("d", [0.0])]
|
|
)
|
|
def test_scalar_setitem_with_nested_value_length1(value):
|
|
# https://github.com/pandas-dev/pandas/issues/46268
|
|
|
|
# For numeric data, assigning length-1 array to scalar position gets unpacked
|
|
df = DataFrame({"A": [1, 2, 3]})
|
|
df.loc[0, "B"] = value
|
|
expected = DataFrame({"A": [1, 2, 3], "B": [0.0, np.nan, np.nan]})
|
|
tm.assert_frame_equal(df, expected)
|
|
|
|
# but for object dtype we preserve the nested data
|
|
df = DataFrame({"A": [1, 2, 3], "B": np.array([1, "a", "b"], dtype=object)})
|
|
df.loc[0, "B"] = value
|
|
if isinstance(value, np.ndarray):
|
|
assert (df.loc[0, "B"] == value).all()
|
|
else:
|
|
assert df.loc[0, "B"] == value
|
|
|
|
|
|
@pytest.mark.parametrize(
|
|
"value", [(0.0,), [0.0], np.array([0.0]), array.array("d", [0.0])]
|
|
)
|
|
def test_scalar_setitem_series_with_nested_value_length1(value, indexer_sli):
|
|
# For numeric data, assigning length-1 array to scalar position gets unpacked
|
|
# TODO this only happens in case of ndarray, should we make this consistent
|
|
# for all list-likes? (as happens for DataFrame.(i)loc, see test above)
|
|
ser = Series([1.0, 2.0, 3.0])
|
|
if isinstance(value, np.ndarray):
|
|
indexer_sli(ser)[0] = value
|
|
expected = Series([0.0, 2.0, 3.0])
|
|
tm.assert_series_equal(ser, expected)
|
|
else:
|
|
with pytest.raises(
|
|
ValueError, match="setting an array element with a sequence"
|
|
):
|
|
indexer_sli(ser)[0] = value
|
|
|
|
# but for object dtype we preserve the nested data
|
|
ser = Series([1, "a", "b"], dtype=object)
|
|
indexer_sli(ser)[0] = value
|
|
if isinstance(value, np.ndarray):
|
|
assert (ser.loc[0] == value).all()
|
|
else:
|
|
assert ser.loc[0] == value
|