ai-content-maker/.venv/Lib/site-packages/pandas/tests/frame/indexing/test_setitem.py

1256 lines
44 KiB
Python
Raw Normal View History

2024-05-03 04:18:51 +03:00
from datetime import datetime
import numpy as np
import pytest
import pandas.util._test_decorators as td
from pandas.core.dtypes.base import _registry as ea_registry
from pandas.core.dtypes.common import (
is_categorical_dtype,
is_interval_dtype,
is_object_dtype,
)
from pandas.core.dtypes.dtypes import (
CategoricalDtype,
DatetimeTZDtype,
IntervalDtype,
PeriodDtype,
)
import pandas as pd
from pandas import (
Categorical,
DataFrame,
DatetimeIndex,
Index,
Interval,
IntervalIndex,
MultiIndex,
NaT,
Period,
PeriodIndex,
Series,
Timestamp,
cut,
date_range,
notna,
period_range,
)
import pandas._testing as tm
from pandas.core.arrays import SparseArray
from pandas.tseries.offsets import BDay
class TestDataFrameSetItem:
def test_setitem_str_subclass(self):
# GH#37366
class mystring(str):
pass
data = ["2020-10-22 01:21:00+00:00"]
index = DatetimeIndex(data)
df = DataFrame({"a": [1]}, index=index)
df["b"] = 2
df[mystring("c")] = 3
expected = DataFrame({"a": [1], "b": [2], mystring("c"): [3]}, index=index)
tm.assert_equal(df, expected)
@pytest.mark.parametrize(
"dtype", ["int32", "int64", "uint32", "uint64", "float32", "float64"]
)
def test_setitem_dtype(self, dtype, float_frame):
arr = np.random.randn(len(float_frame))
float_frame[dtype] = np.array(arr, dtype=dtype)
assert float_frame[dtype].dtype.name == dtype
def test_setitem_list_not_dataframe(self, float_frame):
data = np.random.randn(len(float_frame), 2)
float_frame[["A", "B"]] = data
tm.assert_almost_equal(float_frame[["A", "B"]].values, data)
def test_setitem_error_msmgs(self):
# GH 7432
df = DataFrame(
{"bar": [1, 2, 3], "baz": ["d", "e", "f"]},
index=Index(["a", "b", "c"], name="foo"),
)
ser = Series(
["g", "h", "i", "j"],
index=Index(["a", "b", "c", "a"], name="foo"),
name="fiz",
)
msg = "cannot reindex on an axis with duplicate labels"
with pytest.raises(ValueError, match=msg):
with tm.assert_produces_warning(FutureWarning, match="non-unique"):
df["newcol"] = ser
# GH 4107, more descriptive error message
df = DataFrame(np.random.randint(0, 2, (4, 4)), columns=["a", "b", "c", "d"])
msg = "Cannot set a DataFrame with multiple columns to the single column gr"
with pytest.raises(ValueError, match=msg):
df["gr"] = df.groupby(["b", "c"]).count()
def test_setitem_benchmark(self):
# from the vb_suite/frame_methods/frame_insert_columns
N = 10
K = 5
df = DataFrame(index=range(N))
new_col = np.random.randn(N)
for i in range(K):
df[i] = new_col
expected = DataFrame(np.repeat(new_col, K).reshape(N, K), index=range(N))
tm.assert_frame_equal(df, expected)
def test_setitem_different_dtype(self):
df = DataFrame(
np.random.randn(5, 3), index=np.arange(5), columns=["c", "b", "a"]
)
df.insert(0, "foo", df["a"])
df.insert(2, "bar", df["c"])
# diff dtype
# new item
df["x"] = df["a"].astype("float32")
result = df.dtypes
expected = Series(
[np.dtype("float64")] * 5 + [np.dtype("float32")],
index=["foo", "c", "bar", "b", "a", "x"],
)
tm.assert_series_equal(result, expected)
# replacing current (in different block)
df["a"] = df["a"].astype("float32")
result = df.dtypes
expected = Series(
[np.dtype("float64")] * 4 + [np.dtype("float32")] * 2,
index=["foo", "c", "bar", "b", "a", "x"],
)
tm.assert_series_equal(result, expected)
df["y"] = df["a"].astype("int32")
result = df.dtypes
expected = Series(
[np.dtype("float64")] * 4 + [np.dtype("float32")] * 2 + [np.dtype("int32")],
index=["foo", "c", "bar", "b", "a", "x", "y"],
)
tm.assert_series_equal(result, expected)
def test_setitem_empty_columns(self):
# GH 13522
df = DataFrame(index=["A", "B", "C"])
df["X"] = df.index
df["X"] = ["x", "y", "z"]
exp = DataFrame(data={"X": ["x", "y", "z"]}, index=["A", "B", "C"])
tm.assert_frame_equal(df, exp)
def test_setitem_dt64_index_empty_columns(self):
rng = date_range("1/1/2000 00:00:00", "1/1/2000 1:59:50", freq="10s")
df = DataFrame(index=np.arange(len(rng)))
df["A"] = rng
assert df["A"].dtype == np.dtype("M8[ns]")
def test_setitem_timestamp_empty_columns(self):
# GH#19843
df = DataFrame(index=range(3))
df["now"] = Timestamp("20130101", tz="UTC")
expected = DataFrame(
[[Timestamp("20130101", tz="UTC")]] * 3, index=[0, 1, 2], columns=["now"]
)
tm.assert_frame_equal(df, expected)
def test_setitem_wrong_length_categorical_dtype_raises(self):
# GH#29523
cat = Categorical.from_codes([0, 1, 1, 0, 1, 2], ["a", "b", "c"])
df = DataFrame(range(10), columns=["bar"])
msg = (
rf"Length of values \({len(cat)}\) "
rf"does not match length of index \({len(df)}\)"
)
with pytest.raises(ValueError, match=msg):
df["foo"] = cat
def test_setitem_with_sparse_value(self):
# GH#8131
df = DataFrame({"c_1": ["a", "b", "c"], "n_1": [1.0, 2.0, 3.0]})
sp_array = SparseArray([0, 0, 1])
df["new_column"] = sp_array
expected = Series(sp_array, name="new_column")
tm.assert_series_equal(df["new_column"], expected)
def test_setitem_with_unaligned_sparse_value(self):
df = DataFrame({"c_1": ["a", "b", "c"], "n_1": [1.0, 2.0, 3.0]})
sp_series = Series(SparseArray([0, 0, 1]), index=[2, 1, 0])
df["new_column"] = sp_series
expected = Series(SparseArray([1, 0, 0]), name="new_column")
tm.assert_series_equal(df["new_column"], expected)
def test_setitem_period_preserves_dtype(self):
# GH: 26861
data = [Period("2003-12", "D")]
result = DataFrame([])
result["a"] = data
expected = DataFrame({"a": data})
tm.assert_frame_equal(result, expected)
def test_setitem_dict_preserves_dtypes(self):
# https://github.com/pandas-dev/pandas/issues/34573
expected = DataFrame(
{
"a": Series([0, 1, 2], dtype="int64"),
"b": Series([1, 2, 3], dtype=float),
"c": Series([1, 2, 3], dtype=float),
"d": Series([1, 2, 3], dtype="uint32"),
}
)
df = DataFrame(
{
"a": Series([], dtype="int64"),
"b": Series([], dtype=float),
"c": Series([], dtype=float),
"d": Series([], dtype="uint32"),
}
)
for idx, b in enumerate([1, 2, 3]):
df.loc[df.shape[0]] = {
"a": int(idx),
"b": float(b),
"c": float(b),
"d": np.uint32(b),
}
tm.assert_frame_equal(df, expected)
@pytest.mark.parametrize(
"obj,dtype",
[
(Period("2020-01"), PeriodDtype("M")),
(Interval(left=0, right=5), IntervalDtype("int64", "right")),
(
Timestamp("2011-01-01", tz="US/Eastern"),
DatetimeTZDtype(tz="US/Eastern"),
),
],
)
def test_setitem_extension_types(self, obj, dtype):
# GH: 34832
expected = DataFrame({"idx": [1, 2, 3], "obj": Series([obj] * 3, dtype=dtype)})
df = DataFrame({"idx": [1, 2, 3]})
df["obj"] = obj
tm.assert_frame_equal(df, expected)
@pytest.mark.parametrize(
"ea_name",
[
dtype.name
for dtype in ea_registry.dtypes
# property would require instantiation
if not isinstance(dtype.name, property)
]
# mypy doesn't allow adding lists of different types
# https://github.com/python/mypy/issues/5492
+ ["datetime64[ns, UTC]", "period[D]"], # type: ignore[list-item]
)
def test_setitem_with_ea_name(self, ea_name):
# GH 38386
result = DataFrame([0])
result[ea_name] = [1]
expected = DataFrame({0: [0], ea_name: [1]})
tm.assert_frame_equal(result, expected)
def test_setitem_dt64_ndarray_with_NaT_and_diff_time_units(self):
# GH#7492
data_ns = np.array([1, "nat"], dtype="datetime64[ns]")
result = Series(data_ns).to_frame()
result["new"] = data_ns
expected = DataFrame({0: [1, None], "new": [1, None]}, dtype="datetime64[ns]")
tm.assert_frame_equal(result, expected)
# OutOfBoundsDatetime error shouldn't occur
data_s = np.array([1, "nat"], dtype="datetime64[s]")
result["new"] = data_s
expected = DataFrame({0: [1, None], "new": [1e9, None]}, dtype="datetime64[ns]")
tm.assert_frame_equal(result, expected)
@pytest.mark.parametrize("unit", ["h", "m", "s", "ms", "D", "M", "Y"])
def test_frame_setitem_datetime64_col_other_units(self, unit):
# Check that non-nano dt64 values get cast to dt64 on setitem
# into a not-yet-existing column
n = 100
dtype = np.dtype(f"M8[{unit}]")
vals = np.arange(n, dtype=np.int64).view(dtype)
ex_vals = vals.astype("datetime64[ns]")
df = DataFrame({"ints": np.arange(n)}, index=np.arange(n))
df[unit] = vals
assert df[unit].dtype == np.dtype("M8[ns]")
assert (df[unit].values == ex_vals).all()
@pytest.mark.parametrize("unit", ["h", "m", "s", "ms", "D", "M", "Y"])
def test_frame_setitem_existing_datetime64_col_other_units(self, unit):
# Check that non-nano dt64 values get cast to dt64 on setitem
# into an already-existing dt64 column
n = 100
dtype = np.dtype(f"M8[{unit}]")
vals = np.arange(n, dtype=np.int64).view(dtype)
ex_vals = vals.astype("datetime64[ns]")
df = DataFrame({"ints": np.arange(n)}, index=np.arange(n))
df["dates"] = np.arange(n, dtype=np.int64).view("M8[ns]")
# We overwrite existing dt64 column with new, non-nano dt64 vals
df["dates"] = vals
assert (df["dates"].values == ex_vals).all()
def test_setitem_dt64tz(self, timezone_frame):
df = timezone_frame
idx = df["B"].rename("foo")
# setitem
df["C"] = idx
tm.assert_series_equal(df["C"], Series(idx, name="C"))
df["D"] = "foo"
df["D"] = idx
tm.assert_series_equal(df["D"], Series(idx, name="D"))
del df["D"]
# assert that A & C are not sharing the same base (e.g. they
# are copies)
v1 = df._mgr.arrays[1]
v2 = df._mgr.arrays[2]
tm.assert_extension_array_equal(v1, v2)
v1base = v1._data.base
v2base = v2._data.base
assert v1base is None or (id(v1base) != id(v2base))
# with nan
df2 = df.copy()
df2.iloc[1, 1] = NaT
df2.iloc[1, 2] = NaT
result = df2["B"]
tm.assert_series_equal(notna(result), Series([True, False, True], name="B"))
tm.assert_series_equal(df2.dtypes, df.dtypes)
def test_setitem_periodindex(self):
rng = period_range("1/1/2000", periods=5, name="index")
df = DataFrame(np.random.randn(5, 3), index=rng)
df["Index"] = rng
rs = Index(df["Index"])
tm.assert_index_equal(rs, rng, check_names=False)
assert rs.name == "Index"
assert rng.name == "index"
rs = df.reset_index().set_index("index")
assert isinstance(rs.index, PeriodIndex)
tm.assert_index_equal(rs.index, rng)
def test_setitem_complete_column_with_array(self):
# GH#37954
df = DataFrame({"a": ["one", "two", "three"], "b": [1, 2, 3]})
arr = np.array([[1, 1], [3, 1], [5, 1]])
df[["c", "d"]] = arr
expected = DataFrame(
{
"a": ["one", "two", "three"],
"b": [1, 2, 3],
"c": [1, 3, 5],
"d": [1, 1, 1],
}
)
expected["c"] = expected["c"].astype(arr.dtype)
expected["d"] = expected["d"].astype(arr.dtype)
assert expected["c"].dtype == arr.dtype
assert expected["d"].dtype == arr.dtype
tm.assert_frame_equal(df, expected)
@pytest.mark.parametrize("dtype", ["f8", "i8", "u8"])
def test_setitem_bool_with_numeric_index(self, dtype):
# GH#36319
cols = Index([1, 2, 3], dtype=dtype)
df = DataFrame(np.random.randn(3, 3), columns=cols)
df[False] = ["a", "b", "c"]
expected_cols = Index([1, 2, 3, False], dtype=object)
if dtype == "f8":
expected_cols = Index([1.0, 2.0, 3.0, False], dtype=object)
tm.assert_index_equal(df.columns, expected_cols)
@pytest.mark.parametrize("indexer", ["B", ["B"]])
def test_setitem_frame_length_0_str_key(self, indexer):
# GH#38831
df = DataFrame(columns=["A", "B"])
other = DataFrame({"B": [1, 2]})
df[indexer] = other
expected = DataFrame({"A": [np.nan] * 2, "B": [1, 2]})
expected["A"] = expected["A"].astype("object")
tm.assert_frame_equal(df, expected)
def test_setitem_frame_duplicate_columns(self, using_array_manager):
# GH#15695
warn = DeprecationWarning if using_array_manager else None
msg = "will attempt to set the values inplace"
cols = ["A", "B", "C"] * 2
df = DataFrame(index=range(3), columns=cols)
df.loc[0, "A"] = (0, 3)
with tm.assert_produces_warning(warn, match=msg):
df.loc[:, "B"] = (1, 4)
df["C"] = (2, 5)
expected = DataFrame(
[
[0, 1, 2, 3, 4, 5],
[np.nan, 1, 2, np.nan, 4, 5],
[np.nan, 1, 2, np.nan, 4, 5],
],
dtype="object",
)
if using_array_manager:
# setitem replaces column so changes dtype
expected.columns = cols
expected["C"] = expected["C"].astype("int64")
# TODO(ArrayManager) .loc still overwrites
expected["B"] = expected["B"].astype("int64")
else:
# set these with unique columns to be extra-unambiguous
expected[2] = expected[2].astype(np.int64)
expected[5] = expected[5].astype(np.int64)
expected.columns = cols
tm.assert_frame_equal(df, expected)
def test_setitem_frame_duplicate_columns_size_mismatch(self):
# GH#39510
cols = ["A", "B", "C"] * 2
df = DataFrame(index=range(3), columns=cols)
with pytest.raises(ValueError, match="Columns must be same length as key"):
df[["A"]] = (0, 3, 5)
df2 = df.iloc[:, :3] # unique columns
with pytest.raises(ValueError, match="Columns must be same length as key"):
df2[["A"]] = (0, 3, 5)
@pytest.mark.parametrize("cols", [["a", "b", "c"], ["a", "a", "a"]])
def test_setitem_df_wrong_column_number(self, cols):
# GH#38604
df = DataFrame([[1, 2, 3]], columns=cols)
rhs = DataFrame([[10, 11]], columns=["d", "e"])
msg = "Columns must be same length as key"
with pytest.raises(ValueError, match=msg):
df["a"] = rhs
def test_setitem_listlike_indexer_duplicate_columns(self):
# GH#38604
df = DataFrame([[1, 2, 3]], columns=["a", "b", "b"])
rhs = DataFrame([[10, 11, 12]], columns=["a", "b", "b"])
df[["a", "b"]] = rhs
expected = DataFrame([[10, 11, 12]], columns=["a", "b", "b"])
tm.assert_frame_equal(df, expected)
df[["c", "b"]] = rhs
expected = DataFrame([[10, 11, 12, 10]], columns=["a", "b", "b", "c"])
tm.assert_frame_equal(df, expected)
def test_setitem_listlike_indexer_duplicate_columns_not_equal_length(self):
# GH#39403
df = DataFrame([[1, 2, 3]], columns=["a", "b", "b"])
rhs = DataFrame([[10, 11]], columns=["a", "b"])
msg = "Columns must be same length as key"
with pytest.raises(ValueError, match=msg):
df[["a", "b"]] = rhs
def test_setitem_intervals(self):
df = DataFrame({"A": range(10)})
ser = cut(df["A"], 5)
assert isinstance(ser.cat.categories, IntervalIndex)
# B & D end up as Categoricals
# the remainder are converted to in-line objects
# containing an IntervalIndex.values
df["B"] = ser
df["C"] = np.array(ser)
df["D"] = ser.values
df["E"] = np.array(ser.values)
df["F"] = ser.astype(object)
assert is_categorical_dtype(df["B"].dtype)
assert is_interval_dtype(df["B"].cat.categories)
assert is_categorical_dtype(df["D"].dtype)
assert is_interval_dtype(df["D"].cat.categories)
# These go through the Series constructor and so get inferred back
# to IntervalDtype
assert is_interval_dtype(df["C"])
assert is_interval_dtype(df["E"])
# But the Series constructor doesn't do inference on Series objects,
# so setting df["F"] doesn't get cast back to IntervalDtype
assert is_object_dtype(df["F"])
# they compare equal as Index
# when converted to numpy objects
c = lambda x: Index(np.array(x))
tm.assert_index_equal(c(df.B), c(df.B))
tm.assert_index_equal(c(df.B), c(df.C), check_names=False)
tm.assert_index_equal(c(df.B), c(df.D), check_names=False)
tm.assert_index_equal(c(df.C), c(df.D), check_names=False)
# B & D are the same Series
tm.assert_series_equal(df["B"], df["B"])
tm.assert_series_equal(df["B"], df["D"], check_names=False)
# C & E are the same Series
tm.assert_series_equal(df["C"], df["C"])
tm.assert_series_equal(df["C"], df["E"], check_names=False)
def test_setitem_categorical(self):
# GH#35369
df = DataFrame({"h": Series(list("mn")).astype("category")})
df.h = df.h.cat.reorder_categories(["n", "m"])
expected = DataFrame(
{"h": Categorical(["m", "n"]).reorder_categories(["n", "m"])}
)
tm.assert_frame_equal(df, expected)
def test_setitem_with_empty_listlike(self):
# GH#17101
index = Index([], name="idx")
result = DataFrame(columns=["A"], index=index)
result["A"] = []
expected = DataFrame(columns=["A"], index=index)
tm.assert_index_equal(result.index, expected.index)
@pytest.mark.parametrize(
"cols, values, expected",
[
(["C", "D", "D", "a"], [1, 2, 3, 4], 4), # with duplicates
(["D", "C", "D", "a"], [1, 2, 3, 4], 4), # mixed order
(["C", "B", "B", "a"], [1, 2, 3, 4], 4), # other duplicate cols
(["C", "B", "a"], [1, 2, 3], 3), # no duplicates
(["B", "C", "a"], [3, 2, 1], 1), # alphabetical order
(["C", "a", "B"], [3, 2, 1], 2), # in the middle
],
)
def test_setitem_same_column(self, cols, values, expected):
# GH#23239
df = DataFrame([values], columns=cols)
df["a"] = df["a"]
result = df["a"].values[0]
assert result == expected
def test_setitem_multi_index(self):
# GH#7655, test that assigning to a sub-frame of a frame
# with multi-index columns aligns both rows and columns
it = ["jim", "joe", "jolie"], ["first", "last"], ["left", "center", "right"]
cols = MultiIndex.from_product(it)
index = date_range("20141006", periods=20)
vals = np.random.randint(1, 1000, (len(index), len(cols)))
df = DataFrame(vals, columns=cols, index=index)
i, j = df.index.values.copy(), it[-1][:]
np.random.shuffle(i)
df["jim"] = df["jolie"].loc[i, ::-1]
tm.assert_frame_equal(df["jim"], df["jolie"])
np.random.shuffle(j)
df[("joe", "first")] = df[("jolie", "last")].loc[i, j]
tm.assert_frame_equal(df[("joe", "first")], df[("jolie", "last")])
np.random.shuffle(j)
df[("joe", "last")] = df[("jolie", "first")].loc[i, j]
tm.assert_frame_equal(df[("joe", "last")], df[("jolie", "first")])
@pytest.mark.parametrize(
"columns,box,expected",
[
(
["A", "B", "C", "D"],
7,
DataFrame(
[[7, 7, 7, 7], [7, 7, 7, 7], [7, 7, 7, 7]],
columns=["A", "B", "C", "D"],
),
),
(
["C", "D"],
[7, 8],
DataFrame(
[[1, 2, 7, 8], [3, 4, 7, 8], [5, 6, 7, 8]],
columns=["A", "B", "C", "D"],
),
),
(
["A", "B", "C"],
np.array([7, 8, 9], dtype=np.int64),
DataFrame([[7, 8, 9], [7, 8, 9], [7, 8, 9]], columns=["A", "B", "C"]),
),
(
["B", "C", "D"],
[[7, 8, 9], [10, 11, 12], [13, 14, 15]],
DataFrame(
[[1, 7, 8, 9], [3, 10, 11, 12], [5, 13, 14, 15]],
columns=["A", "B", "C", "D"],
),
),
(
["C", "A", "D"],
np.array([[7, 8, 9], [10, 11, 12], [13, 14, 15]], dtype=np.int64),
DataFrame(
[[8, 2, 7, 9], [11, 4, 10, 12], [14, 6, 13, 15]],
columns=["A", "B", "C", "D"],
),
),
(
["A", "C"],
DataFrame([[7, 8], [9, 10], [11, 12]], columns=["A", "C"]),
DataFrame(
[[7, 2, 8], [9, 4, 10], [11, 6, 12]], columns=["A", "B", "C"]
),
),
],
)
def test_setitem_list_missing_columns(self, columns, box, expected):
# GH#29334
df = DataFrame([[1, 2], [3, 4], [5, 6]], columns=["A", "B"])
df[columns] = box
tm.assert_frame_equal(df, expected)
def test_setitem_list_of_tuples(self, float_frame):
tuples = list(zip(float_frame["A"], float_frame["B"]))
float_frame["tuples"] = tuples
result = float_frame["tuples"]
expected = Series(tuples, index=float_frame.index, name="tuples")
tm.assert_series_equal(result, expected)
def test_setitem_iloc_generator(self):
# GH#39614
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
indexer = (x for x in [1, 2])
df.iloc[indexer] = 1
expected = DataFrame({"a": [1, 1, 1], "b": [4, 1, 1]})
tm.assert_frame_equal(df, expected)
def test_setitem_iloc_two_dimensional_generator(self):
df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6]})
indexer = (x for x in [1, 2])
df.iloc[indexer, 1] = 1
expected = DataFrame({"a": [1, 2, 3], "b": [4, 1, 1]})
tm.assert_frame_equal(df, expected)
def test_setitem_dtypes_bytes_type_to_object(self):
# GH 20734
index = Series(name="id", dtype="S24")
df = DataFrame(index=index)
df["a"] = Series(name="a", index=index, dtype=np.uint32)
df["b"] = Series(name="b", index=index, dtype="S64")
df["c"] = Series(name="c", index=index, dtype="S64")
df["d"] = Series(name="d", index=index, dtype=np.uint8)
result = df.dtypes
expected = Series([np.uint32, object, object, np.uint8], index=list("abcd"))
tm.assert_series_equal(result, expected)
def test_boolean_mask_nullable_int64(self):
# GH 28928
result = DataFrame({"a": [3, 4], "b": [5, 6]}).astype(
{"a": "int64", "b": "Int64"}
)
mask = Series(False, index=result.index)
result.loc[mask, "a"] = result["a"]
result.loc[mask, "b"] = result["b"]
expected = DataFrame({"a": [3, 4], "b": [5, 6]}).astype(
{"a": "int64", "b": "Int64"}
)
tm.assert_frame_equal(result, expected)
def test_setitem_ea_dtype_rhs_series(self):
# GH#47425
df = DataFrame({"a": [1, 2]})
df["a"] = Series([1, 2], dtype="Int64")
expected = DataFrame({"a": [1, 2]}, dtype="Int64")
tm.assert_frame_equal(df, expected)
# TODO(ArrayManager) set column with 2d column array, see #44788
@td.skip_array_manager_not_yet_implemented
def test_setitem_npmatrix_2d(self):
# GH#42376
# for use-case df["x"] = sparse.random(10, 10).mean(axis=1)
expected = DataFrame(
{"np-array": np.ones(10), "np-matrix": np.ones(10)}, index=np.arange(10)
)
a = np.ones((10, 1))
df = DataFrame(index=np.arange(10))
df["np-array"] = a
# Instantiation of `np.matrix` gives PendingDeprecationWarning
with tm.assert_produces_warning(PendingDeprecationWarning):
df["np-matrix"] = np.matrix(a)
tm.assert_frame_equal(df, expected)
@pytest.mark.parametrize("vals", [{}, {"d": "a"}])
def test_setitem_aligning_dict_with_index(self, vals):
# GH#47216
df = DataFrame({"a": [1, 2], "b": [3, 4], **vals})
df.loc[:, "a"] = {1: 100, 0: 200}
df.loc[:, "c"] = {0: 5, 1: 6}
df.loc[:, "e"] = {1: 5}
expected = DataFrame(
{"a": [200, 100], "b": [3, 4], **vals, "c": [5, 6], "e": [np.nan, 5]}
)
tm.assert_frame_equal(df, expected)
def test_setitem_rhs_dataframe(self):
# GH#47578
df = DataFrame({"a": [1, 2]})
df["a"] = DataFrame({"a": [10, 11]}, index=[1, 2])
expected = DataFrame({"a": [np.nan, 10]})
tm.assert_frame_equal(df, expected)
df = DataFrame({"a": [1, 2]})
df.isetitem(0, DataFrame({"a": [10, 11]}, index=[1, 2]))
tm.assert_frame_equal(df, expected)
def test_setitem_frame_overwrite_with_ea_dtype(self, any_numeric_ea_dtype):
# GH#46896
df = DataFrame(columns=["a", "b"], data=[[1, 2], [3, 4]])
df["a"] = DataFrame({"a": [10, 11]}, dtype=any_numeric_ea_dtype)
expected = DataFrame(
{
"a": Series([10, 11], dtype=any_numeric_ea_dtype),
"b": [2, 4],
}
)
tm.assert_frame_equal(df, expected)
class TestSetitemTZAwareValues:
@pytest.fixture
def idx(self):
naive = DatetimeIndex(["2013-1-1 13:00", "2013-1-2 14:00"], name="B")
idx = naive.tz_localize("US/Pacific")
return idx
@pytest.fixture
def expected(self, idx):
expected = Series(np.array(idx.tolist(), dtype="object"), name="B")
assert expected.dtype == idx.dtype
return expected
def test_setitem_dt64series(self, idx, expected):
# convert to utc
df = DataFrame(np.random.randn(2, 1), columns=["A"])
df["B"] = idx
with tm.assert_produces_warning(FutureWarning) as m:
df["B"] = idx.to_series(keep_tz=False, index=[0, 1])
msg = "do 'idx.tz_convert(None)' before calling"
assert msg in str(m[0].message)
result = df["B"]
comp = Series(idx.tz_convert("UTC").tz_localize(None), name="B")
tm.assert_series_equal(result, comp)
def test_setitem_datetimeindex(self, idx, expected):
# setting a DataFrame column with a tzaware DTI retains the dtype
df = DataFrame(np.random.randn(2, 1), columns=["A"])
# assign to frame
df["B"] = idx
result = df["B"]
tm.assert_series_equal(result, expected)
def test_setitem_object_array_of_tzaware_datetimes(self, idx, expected):
# setting a DataFrame column with a tzaware DTI retains the dtype
df = DataFrame(np.random.randn(2, 1), columns=["A"])
# object array of datetimes with a tz
df["B"] = idx.to_pydatetime()
result = df["B"]
tm.assert_series_equal(result, expected)
class TestDataFrameSetItemWithExpansion:
def test_setitem_listlike_views(self, using_copy_on_write):
# GH#38148
df = DataFrame({"a": [1, 2, 3], "b": [4, 4, 6]})
# get one column as a view of df
ser = df["a"]
# add columns with list-like indexer
df[["c", "d"]] = np.array([[0.1, 0.2], [0.3, 0.4], [0.4, 0.5]])
# edit in place the first column to check view semantics
df.iloc[0, 0] = 100
if using_copy_on_write:
expected = Series([1, 2, 3], name="a")
else:
expected = Series([100, 2, 3], name="a")
tm.assert_series_equal(ser, expected)
def test_setitem_string_column_numpy_dtype_raising(self):
# GH#39010
df = DataFrame([[1, 2], [3, 4]])
df["0 - Name"] = [5, 6]
expected = DataFrame([[1, 2, 5], [3, 4, 6]], columns=[0, 1, "0 - Name"])
tm.assert_frame_equal(df, expected)
def test_setitem_empty_df_duplicate_columns(self, using_copy_on_write):
# GH#38521
df = DataFrame(columns=["a", "b", "b"], dtype="float64")
df.loc[:, "a"] = list(range(2))
expected = DataFrame(
[[0, np.nan, np.nan], [1, np.nan, np.nan]], columns=["a", "b", "b"]
)
tm.assert_frame_equal(df, expected)
def test_setitem_with_expansion_categorical_dtype(self):
# assignment
df = DataFrame(
{"value": np.array(np.random.randint(0, 10000, 100), dtype="int32")}
)
labels = Categorical([f"{i} - {i + 499}" for i in range(0, 10000, 500)])
df = df.sort_values(by=["value"], ascending=True)
ser = cut(df.value, range(0, 10500, 500), right=False, labels=labels)
cat = ser.values
# setting with a Categorical
df["D"] = cat
str(df)
result = df.dtypes
expected = Series(
[np.dtype("int32"), CategoricalDtype(categories=labels, ordered=False)],
index=["value", "D"],
)
tm.assert_series_equal(result, expected)
# setting with a Series
df["E"] = ser
str(df)
result = df.dtypes
expected = Series(
[
np.dtype("int32"),
CategoricalDtype(categories=labels, ordered=False),
CategoricalDtype(categories=labels, ordered=False),
],
index=["value", "D", "E"],
)
tm.assert_series_equal(result, expected)
result1 = df["D"]
result2 = df["E"]
tm.assert_categorical_equal(result1._mgr.array, cat)
# sorting
ser.name = "E"
tm.assert_series_equal(result2.sort_index(), ser.sort_index())
def test_setitem_scalars_no_index(self):
# GH#16823 / GH#17894
df = DataFrame()
df["foo"] = 1
expected = DataFrame(columns=["foo"]).astype(np.int64)
tm.assert_frame_equal(df, expected)
def test_setitem_newcol_tuple_key(self, float_frame):
assert (
"A",
"B",
) not in float_frame.columns
float_frame["A", "B"] = float_frame["A"]
assert ("A", "B") in float_frame.columns
result = float_frame["A", "B"]
expected = float_frame["A"]
tm.assert_series_equal(result, expected, check_names=False)
def test_frame_setitem_newcol_timestamp(self):
# GH#2155
columns = date_range(start="1/1/2012", end="2/1/2012", freq=BDay())
data = DataFrame(columns=columns, index=range(10))
t = datetime(2012, 11, 1)
ts = Timestamp(t)
data[ts] = np.nan # works, mostly a smoke-test
assert np.isnan(data[ts]).all()
def test_frame_setitem_rangeindex_into_new_col(self):
# GH#47128
df = DataFrame({"a": ["a", "b"]})
df["b"] = df.index
df.loc[[False, True], "b"] = 100
result = df.loc[[1], :]
expected = DataFrame({"a": ["b"], "b": [100]}, index=[1])
tm.assert_frame_equal(result, expected)
def test_setitem_frame_keep_ea_dtype(self, any_numeric_ea_dtype):
# GH#46896
df = DataFrame(columns=["a", "b"], data=[[1, 2], [3, 4]])
df["c"] = DataFrame({"a": [10, 11]}, dtype=any_numeric_ea_dtype)
expected = DataFrame(
{
"a": [1, 3],
"b": [2, 4],
"c": Series([10, 11], dtype=any_numeric_ea_dtype),
}
)
tm.assert_frame_equal(df, expected)
class TestDataFrameSetItemSlicing:
def test_setitem_slice_position(self):
# GH#31469
df = DataFrame(np.zeros((100, 1)))
df[-4:] = 1
arr = np.zeros((100, 1))
arr[-4:] = 1
expected = DataFrame(arr)
tm.assert_frame_equal(df, expected)
@pytest.mark.parametrize("indexer", [tm.setitem, tm.iloc])
@pytest.mark.parametrize("box", [Series, np.array, list, pd.array])
@pytest.mark.parametrize("n", [1, 2, 3])
def test_setitem_slice_indexer_broadcasting_rhs(self, n, box, indexer):
# GH#40440
df = DataFrame([[1, 3, 5]] + [[2, 4, 6]] * n, columns=["a", "b", "c"])
indexer(df)[1:] = box([10, 11, 12])
expected = DataFrame([[1, 3, 5]] + [[10, 11, 12]] * n, columns=["a", "b", "c"])
tm.assert_frame_equal(df, expected)
@pytest.mark.parametrize("box", [Series, np.array, list, pd.array])
@pytest.mark.parametrize("n", [1, 2, 3])
def test_setitem_list_indexer_broadcasting_rhs(self, n, box):
# GH#40440
df = DataFrame([[1, 3, 5]] + [[2, 4, 6]] * n, columns=["a", "b", "c"])
df.iloc[list(range(1, n + 1))] = box([10, 11, 12])
expected = DataFrame([[1, 3, 5]] + [[10, 11, 12]] * n, columns=["a", "b", "c"])
tm.assert_frame_equal(df, expected)
@pytest.mark.parametrize("indexer", [tm.setitem, tm.iloc])
@pytest.mark.parametrize("box", [Series, np.array, list, pd.array])
@pytest.mark.parametrize("n", [1, 2, 3])
def test_setitem_slice_broadcasting_rhs_mixed_dtypes(self, n, box, indexer):
# GH#40440
df = DataFrame(
[[1, 3, 5], ["x", "y", "z"]] + [[2, 4, 6]] * n, columns=["a", "b", "c"]
)
indexer(df)[1:] = box([10, 11, 12])
expected = DataFrame(
[[1, 3, 5]] + [[10, 11, 12]] * (n + 1),
columns=["a", "b", "c"],
dtype="object",
)
tm.assert_frame_equal(df, expected)
class TestDataFrameSetItemCallable:
def test_setitem_callable(self):
# GH#12533
df = DataFrame({"A": [1, 2, 3, 4], "B": [5, 6, 7, 8]})
df[lambda x: "A"] = [11, 12, 13, 14]
exp = DataFrame({"A": [11, 12, 13, 14], "B": [5, 6, 7, 8]})
tm.assert_frame_equal(df, exp)
def test_setitem_other_callable(self):
# GH#13299
def inc(x):
return x + 1
df = DataFrame([[-1, 1], [1, -1]])
df[df > 0] = inc
expected = DataFrame([[-1, inc], [inc, -1]])
tm.assert_frame_equal(df, expected)
class TestDataFrameSetItemBooleanMask:
@td.skip_array_manager_invalid_test # TODO(ArrayManager) rewrite not using .values
@pytest.mark.parametrize(
"mask_type",
[lambda df: df > np.abs(df) / 2, lambda df: (df > np.abs(df) / 2).values],
ids=["dataframe", "array"],
)
def test_setitem_boolean_mask(self, mask_type, float_frame):
# Test for issue #18582
df = float_frame.copy()
mask = mask_type(df)
# index with boolean mask
result = df.copy()
result[mask] = np.nan
expected = df.copy()
expected.values[np.array(mask)] = np.nan
tm.assert_frame_equal(result, expected)
@pytest.mark.xfail(reason="Currently empty indexers are treated as all False")
@pytest.mark.parametrize("box", [list, np.array, Series])
def test_setitem_loc_empty_indexer_raises_with_non_empty_value(self, box):
# GH#37672
df = DataFrame({"a": ["a"], "b": [1], "c": [1]})
if box == Series:
indexer = box([], dtype="object")
else:
indexer = box([])
msg = "Must have equal len keys and value when setting with an iterable"
with pytest.raises(ValueError, match=msg):
df.loc[indexer, ["b"]] = [1]
@pytest.mark.parametrize("box", [list, np.array, Series])
def test_setitem_loc_only_false_indexer_dtype_changed(self, box):
# GH#37550
# Dtype is only changed when value to set is a Series and indexer is
# empty/bool all False
df = DataFrame({"a": ["a"], "b": [1], "c": [1]})
indexer = box([False])
df.loc[indexer, ["b"]] = 10 - df["c"]
expected = DataFrame({"a": ["a"], "b": [1], "c": [1]})
tm.assert_frame_equal(df, expected)
df.loc[indexer, ["b"]] = 9
tm.assert_frame_equal(df, expected)
@pytest.mark.parametrize("indexer", [tm.setitem, tm.loc])
def test_setitem_boolean_mask_aligning(self, indexer):
# GH#39931
df = DataFrame({"a": [1, 4, 2, 3], "b": [5, 6, 7, 8]})
expected = df.copy()
mask = df["a"] >= 3
indexer(df)[mask] = indexer(df)[mask].sort_values("a")
tm.assert_frame_equal(df, expected)
def test_setitem_mask_categorical(self):
# assign multiple rows (mixed values) (-> array) -> exp_multi_row
# changed multiple rows
cats2 = Categorical(["a", "a", "b", "b", "a", "a", "a"], categories=["a", "b"])
idx2 = Index(["h", "i", "j", "k", "l", "m", "n"])
values2 = [1, 1, 2, 2, 1, 1, 1]
exp_multi_row = DataFrame({"cats": cats2, "values": values2}, index=idx2)
catsf = Categorical(
["a", "a", "c", "c", "a", "a", "a"], categories=["a", "b", "c"]
)
idxf = Index(["h", "i", "j", "k", "l", "m", "n"])
valuesf = [1, 1, 3, 3, 1, 1, 1]
df = DataFrame({"cats": catsf, "values": valuesf}, index=idxf)
exp_fancy = exp_multi_row.copy()
with tm.assert_produces_warning(FutureWarning, check_stacklevel=False):
# issue #37643 inplace kwarg deprecated
return_value = exp_fancy["cats"].cat.set_categories(
["a", "b", "c"], inplace=True
)
assert return_value is None
mask = df["cats"] == "c"
df[mask] = ["b", 2]
# category c is kept in .categories
tm.assert_frame_equal(df, exp_fancy)
@pytest.mark.parametrize("dtype", ["float", "int64"])
@pytest.mark.parametrize("kwargs", [{}, {"index": [1]}, {"columns": ["A"]}])
def test_setitem_empty_frame_with_boolean(self, dtype, kwargs):
# see GH#10126
kwargs["dtype"] = dtype
df = DataFrame(**kwargs)
df2 = df.copy()
df[df > df2] = 47
tm.assert_frame_equal(df, df2)
def test_setitem_boolean_indexing(self):
idx = list(range(3))
cols = ["A", "B", "C"]
df1 = DataFrame(
index=idx,
columns=cols,
data=np.array(
[[0.0, 0.5, 1.0], [1.5, 2.0, 2.5], [3.0, 3.5, 4.0]], dtype=float
),
)
df2 = DataFrame(index=idx, columns=cols, data=np.ones((len(idx), len(cols))))
expected = DataFrame(
index=idx,
columns=cols,
data=np.array([[0.0, 0.5, 1.0], [1.5, 2.0, -1], [-1, -1, -1]], dtype=float),
)
df1[df1 > 2.0 * df2] = -1
tm.assert_frame_equal(df1, expected)
with pytest.raises(ValueError, match="Item wrong length"):
df1[df1.index[:-1] > 2] = -1
def test_loc_setitem_all_false_boolean_two_blocks(self):
# GH#40885
df = DataFrame({"a": [1, 2], "b": [3, 4], "c": "a"})
expected = df.copy()
indexer = Series([False, False], name="c")
df.loc[indexer, ["b"]] = DataFrame({"b": [5, 6]}, index=[0, 1])
tm.assert_frame_equal(df, expected)
class TestDataFrameSetitemCopyViewSemantics:
def test_setitem_always_copy(self, float_frame):
assert "E" not in float_frame.columns
s = float_frame["A"].copy()
float_frame["E"] = s
float_frame["E"][5:10] = np.nan
assert notna(s[5:10]).all()
@pytest.mark.parametrize("consolidate", [True, False])
def test_setitem_partial_column_inplace(
self, consolidate, using_array_manager, using_copy_on_write
):
# This setting should be in-place, regardless of whether frame is
# single-block or multi-block
# GH#304 this used to be incorrectly not-inplace, in which case
# we needed to ensure _item_cache was cleared.
df = DataFrame(
{"x": [1.1, 2.1, 3.1, 4.1], "y": [5.1, 6.1, 7.1, 8.1]}, index=[0, 1, 2, 3]
)
df.insert(2, "z", np.nan)
if not using_array_manager:
if consolidate:
df._consolidate_inplace()
assert len(df._mgr.blocks) == 1
else:
assert len(df._mgr.blocks) == 2
zvals = df["z"]._values
df.loc[2:, "z"] = 42
expected = Series([np.nan, np.nan, 42, 42], index=df.index, name="z")
tm.assert_series_equal(df["z"], expected)
# check setting occurred in-place
if not using_copy_on_write:
tm.assert_numpy_array_equal(zvals, expected.values)
assert np.shares_memory(zvals, df["z"]._values)
def test_setitem_duplicate_columns_not_inplace(self):
# GH#39510
cols = ["A", "B"] * 2
df = DataFrame(0.0, index=[0], columns=cols)
df_copy = df.copy()
df_view = df[:]
df["B"] = (2, 5)
expected = DataFrame([[0.0, 2, 0.0, 5]], columns=cols)
tm.assert_frame_equal(df_view, df_copy)
tm.assert_frame_equal(df, expected)
@pytest.mark.parametrize(
"value", [1, np.array([[1], [1]], dtype="int64"), [[1], [1]]]
)
def test_setitem_same_dtype_not_inplace(self, value, using_array_manager):
# GH#39510
cols = ["A", "B"]
df = DataFrame(0, index=[0, 1], columns=cols)
df_copy = df.copy()
df_view = df[:]
df[["B"]] = value
expected = DataFrame([[0, 1], [0, 1]], columns=cols)
tm.assert_frame_equal(df, expected)
tm.assert_frame_equal(df_view, df_copy)
@pytest.mark.parametrize("value", [1.0, np.array([[1.0], [1.0]]), [[1.0], [1.0]]])
def test_setitem_listlike_key_scalar_value_not_inplace(self, value):
# GH#39510
cols = ["A", "B"]
df = DataFrame(0, index=[0, 1], columns=cols)
df_copy = df.copy()
df_view = df[:]
df[["B"]] = value
expected = DataFrame([[0, 1.0], [0, 1.0]], columns=cols)
tm.assert_frame_equal(df_view, df_copy)
tm.assert_frame_equal(df, expected)
@pytest.mark.parametrize(
"indexer",
[
"a",
["a"],
pytest.param(
[True, False],
marks=pytest.mark.xfail(
reason="Boolean indexer incorrectly setting inplace",
strict=False, # passing on some builds, no obvious pattern
),
),
],
)
@pytest.mark.parametrize(
"value, set_value",
[
(1, 5),
(1.0, 5.0),
(Timestamp("2020-12-31"), Timestamp("2021-12-31")),
("a", "b"),
],
)
def test_setitem_not_operating_inplace(self, value, set_value, indexer):
# GH#43406
df = DataFrame({"a": value}, index=[0, 1])
expected = df.copy()
view = df[:]
df[indexer] = set_value
tm.assert_frame_equal(view, expected)
@td.skip_array_manager_invalid_test
def test_setitem_column_update_inplace(self, using_copy_on_write):
# https://github.com/pandas-dev/pandas/issues/47172
labels = [f"c{i}" for i in range(10)]
df = DataFrame({col: np.zeros(len(labels)) for col in labels}, index=labels)
values = df._mgr.blocks[0].values
for label in df.columns:
df[label][label] = 1
if not using_copy_on_write:
# diagonal values all updated
assert np.all(values[np.arange(10), np.arange(10)] == 1)
else:
# original dataframe not updated
assert np.all(values[np.arange(10), np.arange(10)] == 0)