import numpy as np
import pytest

from pandas import (
    DataFrame,
    Series,
)
import pandas._testing as tm
from pandas.tests.copy_view.util import get_array

# NOTE: every test takes the ``using_copy_on_write`` fixture, which is defined
# outside this file; each test branches on it to assert either the
# copy-on-write (CoW) expectations or the non-CoW expectations.


def test_copy(using_copy_on_write):
    # Case: a default (deep) copy is fully independent of the original
    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
    df_copy = df.copy()

    # the deep copy doesn't share memory
    assert not np.shares_memory(get_array(df_copy, "a"), get_array(df, "a"))
    if using_copy_on_write:
        assert df_copy._mgr.refs is None

    # mutating copy doesn't mutate original
    df_copy.iloc[0, 0] = 0
    assert df.iloc[0, 0] == 1


def test_copy_shallow(using_copy_on_write):
    # Case: a shallow copy (deep=False) initially shares memory with the
    # original + afterwards modifying the shallow copy
    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
    df_copy = df.copy(deep=False)

    # the shallow copy still shares memory
    assert np.shares_memory(get_array(df_copy, "a"), get_array(df, "a"))

    if using_copy_on_write:
        assert df_copy._mgr.refs is not None

        # mutating shallow copy doesn't mutate original
        df_copy.iloc[0, 0] = 0
        assert df.iloc[0, 0] == 1
        # mutating triggered a copy-on-write -> no longer shares memory
        assert not np.shares_memory(get_array(df_copy, "a"), get_array(df, "a"))
        # but still shares memory for the other columns/blocks
        assert np.shares_memory(get_array(df_copy, "c"), get_array(df, "c"))
    else:
        # mutating shallow copy does mutate original
        df_copy.iloc[0, 0] = 0
        assert df.iloc[0, 0] == 0
        # and still shares memory
        assert np.shares_memory(get_array(df_copy, "a"), get_array(df, "a"))


# -----------------------------------------------------------------------------
# DataFrame methods returning new DataFrame using shallow copy


def test_reset_index(using_copy_on_write):
    # Case: resetting the index (i.e. adding a new column) + mutating the
    # resulting dataframe
    df = DataFrame(
        {"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]}, index=[10, 11, 12]
    )
    df_orig = df.copy()
    df2 = df.reset_index()
    df2._mgr._verify_integrity()

    if using_copy_on_write:
        # still shares memory (df2 is a shallow copy)
        assert np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
        assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
    # mutating df2 triggers a copy-on-write for that column / block
    # (position 2 in df2 is column "b": the former index now sits at position 0)
    df2.iloc[0, 2] = 0
    assert not np.shares_memory(get_array(df2, "b"), get_array(df, "b"))
    if using_copy_on_write:
        assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
    tm.assert_frame_equal(df, df_orig)


def test_rename_columns(using_copy_on_write):
    # Case: renaming columns returns a new dataframe
    # + afterwards modifying the result
    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
    df_orig = df.copy()
    df2 = df.rename(columns=str.upper)

    if using_copy_on_write:
        assert np.shares_memory(get_array(df2, "A"), get_array(df, "a"))
    df2.iloc[0, 0] = 0
    assert not np.shares_memory(get_array(df2, "A"), get_array(df, "a"))
    if using_copy_on_write:
        assert np.shares_memory(get_array(df2, "C"), get_array(df, "c"))
    expected = DataFrame({"A": [0, 2, 3], "B": [4, 5, 6], "C": [0.1, 0.2, 0.3]})
    tm.assert_frame_equal(df2, expected)
    tm.assert_frame_equal(df, df_orig)


def test_rename_columns_modify_parent(using_copy_on_write):
    # Case: renaming columns returns a new dataframe
    # + afterwards modifying the original (parent) dataframe
    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
    df2 = df.rename(columns=str.upper)
    df2_orig = df2.copy()

    if using_copy_on_write:
        assert np.shares_memory(get_array(df2, "A"), get_array(df, "a"))
    else:
        assert not np.shares_memory(get_array(df2, "A"), get_array(df, "a"))
    df.iloc[0, 0] = 0
    assert not np.shares_memory(get_array(df2, "A"), get_array(df, "a"))
    if using_copy_on_write:
        assert np.shares_memory(get_array(df2, "C"), get_array(df, "c"))
    expected = DataFrame({"a": [0, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
    tm.assert_frame_equal(df, expected)
    tm.assert_frame_equal(df2, df2_orig)


def test_reindex_columns(using_copy_on_write):
    # Case: reindexing the column returns a new dataframe
    # + afterwards modifying the result
    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
    df_orig = df.copy()
    df2 = df.reindex(columns=["a", "c"])

    if using_copy_on_write:
        # still shares memory (df2 is a shallow copy)
        assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
    else:
        assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
    # mutating df2 triggers a copy-on-write for that column
    df2.iloc[0, 0] = 0
    assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
    if using_copy_on_write:
        assert np.shares_memory(get_array(df2, "c"), get_array(df, "c"))
    tm.assert_frame_equal(df, df_orig)


def test_select_dtypes(using_copy_on_write):
    # Case: selecting columns using `select_dtypes()` returns a new dataframe
    # + afterwards modifying the result
    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
    df_orig = df.copy()
    df2 = df.select_dtypes("int64")
    df2._mgr._verify_integrity()

    if using_copy_on_write:
        assert np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
    else:
        assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))

    # mutating df2 triggers a copy-on-write for that column/block
    df2.iloc[0, 0] = 0
    if using_copy_on_write:
        assert not np.shares_memory(get_array(df2, "a"), get_array(df, "a"))
    tm.assert_frame_equal(df, df_orig)


def test_to_frame(using_copy_on_write):
    # Case: converting a Series to a DataFrame with to_frame
    ser = Series([1, 2, 3])
    ser_orig = ser.copy()

    df = ser[:].to_frame()

    # currently this always returns a "view"
    assert np.shares_memory(ser.values, get_array(df, 0))

    df.iloc[0, 0] = 0

    if using_copy_on_write:
        # mutating df triggers a copy-on-write for that column
        assert not np.shares_memory(ser.values, get_array(df, 0))
        tm.assert_series_equal(ser, ser_orig)
    else:
        # but currently to_frame() actually returns a view -> mutates parent
        expected = ser_orig.copy()
        expected.iloc[0] = 0
        tm.assert_series_equal(ser, expected)

    # modify original series -> don't modify dataframe
    df = ser[:].to_frame()
    ser.iloc[0] = 0

    if using_copy_on_write:
        tm.assert_frame_equal(df, ser_orig.to_frame())
    else:
        # without CoW the frame is a view, so the write to `ser` shows up in it
        expected = ser_orig.copy().to_frame()
        expected.iloc[0, 0] = 0
        tm.assert_frame_equal(df, expected)


@pytest.mark.parametrize(
    "method, idx",
    [
        (lambda df: df.copy(deep=False).copy(deep=False), 0),
        (lambda df: df.reset_index().reset_index(), 2),
        (lambda df: df.rename(columns=str.upper).rename(columns=str.lower), 0),
        (lambda df: df.copy(deep=False).select_dtypes(include="number"), 0),
    ],
    ids=["shallow-copy", "reset_index", "rename", "select_dtypes"],
)
def test_chained_methods(request, method, idx, using_copy_on_write):
    # Case: chaining two methods that each return a new dataframe; neither the
    # parent nor the result may observe a mutation of the other.
    # `idx` is the position in the result at which the comparison against the
    # parent's columns starts (2 for the double reset_index, 0 otherwise).
    df = DataFrame({"a": [1, 2, 3], "b": [4, 5, 6], "c": [0.1, 0.2, 0.3]})
    df_orig = df.copy()

    # when not using CoW, only the copy() variant actually gives a view
    df2_is_view = not using_copy_on_write and request.node.callspec.id == "shallow-copy"

    # modify df2 -> don't modify df
    df2 = method(df)
    df2.iloc[0, idx] = 0
    if not df2_is_view:
        tm.assert_frame_equal(df, df_orig)

    # modify df -> don't modify df2
    df2 = method(df)
    df.iloc[0, 0] = 0
    if not df2_is_view:
        tm.assert_frame_equal(df2.iloc[:, idx:], df_orig)


def test_putmask(using_copy_on_write):
    # Case: assigning through a boolean-frame mask on a parent that has a
    # live `df[:]` view
    df = DataFrame({"a": [1, 2], "b": 1, "c": 2})
    view = df[:]
    df_orig = df.copy()
    # `df == df` is used as the mask for the assignment
    df[df == df] = 5

    if using_copy_on_write:
        assert not np.shares_memory(get_array(view, "a"), get_array(df, "a"))
        tm.assert_frame_equal(view, df_orig)
    else:
        # Without CoW the original will be modified
        assert np.shares_memory(get_array(view, "a"), get_array(df, "a"))
        assert view.iloc[0, 0] == 5