ai-content-maker/.venv/Lib/site-packages/pandas/tests/frame/methods/test_explode.py

import numpy as np
import pytest

import pandas as pd
import pandas._testing as tm


def test_error():
    """Verify the ValueError messages raised by ``DataFrame.explode``."""
    nested = pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd"))
    frame = pd.DataFrame({"A": nested, "B": 1})

    # A list nested inside the column list is not an accepted specifier.
    bad_spec_msg = "column must be a scalar, tuple, or list thereof"
    with pytest.raises(ValueError, match=bad_spec_msg):
        frame.explode([list("AA")])

    # Requesting the same label twice is rejected.
    with pytest.raises(ValueError, match="column must be unique"):
        frame.explode(list("AA"))

    # Duplicate labels on the frame itself are rejected as well.
    frame.columns = list("AA")
    with pytest.raises(ValueError, match="columns must be unique"):
        frame.explode("A")


@pytest.mark.parametrize(
    "input_subset, error_message",
    [
        # "A" and "C" hold a different number of items in row "d" (2 vs 3).
        (
            list("AC"),
            "columns must have matching element counts",
        ),
        # An empty list of columns is not a valid request.
        (
            [],
            "column must be nonempty",
        ),
    ],
)
def test_error_multi_columns(input_subset, error_message):
    """Check the ValueError raised for invalid multi-column requests.

    Parameters
    ----------
    input_subset : list
        Column labels passed to ``DataFrame.explode``.
    error_message : str
        Pattern the raised ``ValueError`` message must match.
    """
    # GH 39240
    df = pd.DataFrame(
        {
            "A": [[0, 1, 2], np.nan, [], (3, 4)],
            "B": 1,
            "C": [["a", "b", "c"], "foo", [], ["d", "e", "f"]],
        },
        index=list("abcd"),
    )
    with pytest.raises(ValueError, match=error_message):
        df.explode(input_subset)


@pytest.mark.parametrize(
    "scalar",
    [
        "a",
        0,
        1.5,
        pd.Timedelta("1 days"),
        pd.Timestamp("2019-12-31"),
    ],
)
def test_basic(scalar):
    """Explode a list-like column whose label is ``scalar``."""
    source = pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd"))
    df = pd.DataFrame({scalar: source, "B": 1})

    # Index labels repeat once per produced element.
    flattened = pd.Series(
        [0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object
    )
    expected = pd.DataFrame({scalar: flattened, "B": 1})

    result = df.explode(scalar)
    tm.assert_frame_equal(result, expected)


def test_multi_index_rows():
    """Explode column ``A`` of a frame indexed by a row ``MultiIndex``."""
    source_keys = [("a", 1), ("a", 2), ("b", 1), ("b", 2)]
    cells = np.array([[0, 1, 2], np.nan, [], (3, 4)], dtype=object)
    df = pd.DataFrame(
        {"A": cells, "B": 1}, index=pd.MultiIndex.from_tuples(source_keys)
    )

    # Expected keys: three rows for the 3-item list, one each for the NaN and
    # the empty list, two for the 2-tuple.
    exploded_keys = [("a", 1)] * 3 + [("a", 2), ("b", 1)] + [("b", 2)] * 2
    exploded = pd.Series(
        [0, 1, 2, np.nan, np.nan, 3, 4],
        index=pd.MultiIndex.from_tuples(exploded_keys),
        dtype=object,
    )
    expected = pd.DataFrame({"A": exploded, "B": 1})

    tm.assert_frame_equal(df.explode("A"), expected)


def test_multi_index_columns():
    """Explode a column that is addressed by a tuple label."""
    target, other = ("A", 1), ("A", 2)
    cells = np.array([[0, 1, 2], np.nan, [], (3, 4)], dtype=object)
    df = pd.DataFrame({target: cells, other: 1})

    exploded = pd.Series(
        [0, 1, 2, np.nan, np.nan, 3, 4],
        index=pd.Index([0, 0, 0, 1, 2, 3, 3]),
        dtype=object,
    )
    expected = pd.DataFrame({target: exploded, other: 1})

    result = df.explode(target)
    tm.assert_frame_equal(result, expected)


def test_usecase():
    """Exercise ``explode`` on the scenarios from gh-10511 and gh-8517."""
    # gh-10511: explode a single column
    records = [[11, range(5), 10], [22, range(3), 20]]
    df = pd.DataFrame(records, columns=list("ABC")).set_index("C")
    result = df.explode("B")

    flat = pd.DataFrame(
        {
            "A": [11, 11, 11, 11, 11, 22, 22, 22],
            "B": np.array([0, 1, 2, 3, 4, 0, 1, 2], dtype=object),
            "C": [10, 10, 10, 10, 10, 20, 20, 20],
        },
        columns=list("ABC"),
    )
    expected = flat.set_index("C")

    tm.assert_frame_equal(result, expected)

    # gh-8517: split a text column on spaces, then explode the pieces
    labels = ["dt", "name", "text"]
    df = pd.DataFrame(
        [["2014-01-01", "Alice", "A B"], ["2014-01-02", "Bob", "C D"]],
        columns=labels,
    )
    tokens = df.text.str.split(" ")
    result = df.assign(text=tokens).explode("text")

    expected_rows = [
        ["2014-01-01", "Alice", "A"],
        ["2014-01-01", "Alice", "B"],
        ["2014-01-02", "Bob", "C"],
        ["2014-01-02", "Bob", "D"],
    ]
    expected = pd.DataFrame(expected_rows, columns=labels, index=[0, 0, 1, 1])
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "input_dict, input_index, expected_dict, expected_index",
    [
        # Plain list used as the index.
        (
            {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},
            [0, 0],
            {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},
            [0, 0, 0, 0],
        ),
        # Named flat index.
        (
            {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},
            pd.Index([0, 0], name="my_index"),
            {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},
            pd.Index([0, 0, 0, 0], name="my_index"),
        ),
        # MultiIndex with both levels named.
        (
            {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},
            pd.MultiIndex.from_arrays(
                [[0, 0], [1, 1]], names=["my_first_index", "my_second_index"]
            ),
            {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},
            pd.MultiIndex.from_arrays(
                [[0, 0, 0, 0], [1, 1, 1, 1]],
                names=["my_first_index", "my_second_index"],
            ),
        ),
        # MultiIndex with only the first level named.
        (
            {"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},
            pd.MultiIndex.from_arrays([[0, 0], [1, 1]], names=["my_index", None]),
            {"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},
            pd.MultiIndex.from_arrays(
                [[0, 0, 0, 0], [1, 1, 1, 1]], names=["my_index", None]
            ),
        ),
    ],
)
def test_duplicate_index(input_dict, input_index, expected_dict, expected_index):
    """Explode ``col1`` of a frame whose index repeats a label (GH 28005)."""
    expected = pd.DataFrame(expected_dict, index=expected_index, dtype=object)
    source = pd.DataFrame(input_dict, index=input_index)
    tm.assert_frame_equal(source.explode("col1"), expected)


def test_ignore_index():
    """Explode with ``ignore_index=True`` and expect index 0..3 (GH 34932)."""
    letters = [["a", "b"], ["c", "d"]]
    df = pd.DataFrame({"id": range(0, 20, 10), "values": letters})

    expected = pd.DataFrame(
        {"id": [0, 0, 10, 10], "values": ["a", "b", "c", "d"]},
        index=[0, 1, 2, 3],
    )

    result = df.explode("values", ignore_index=True)
    tm.assert_frame_equal(result, expected)


def test_explode_sets():
    """Explode a cell that holds a ``set`` and compare the sorted result."""
    # https://github.com/pandas-dev/pandas/issues/35614
    expected = pd.DataFrame({"a": ["x", "y"], "b": [1, 1]}, index=[1, 1])
    df = pd.DataFrame({"a": [{"x", "y"}], "b": [1]}, index=[1])

    # Sort by "a" before comparing against the "x", "y" ordered expectation.
    exploded = df.explode(column="a")
    result = exploded.sort_values(by="a")
    tm.assert_frame_equal(result, expected)


@pytest.mark.parametrize(
    "input_subset, expected_dict, expected_index",
    [
        # "A" and "C" are both flattened in the expected frame.
        (
            ["A", "C"],
            {
                "A": pd.Series(
                    [0, 1, 2, np.nan, np.nan, 3, 4, np.nan],
                    index=list("aaabcdde"),
                    dtype=object,
                ),
                "B": 1,
                "C": ["a", "b", "c", "foo", np.nan, "d", "e", np.nan],
            },
            list("aaabcdde"),
        ),
        # Only "A" is flattened; the "C" cells are repeated unchanged.
        (
            ["A"],
            {
                "A": pd.Series(
                    [0, 1, 2, np.nan, np.nan, 3, 4, np.nan],
                    index=list("aaabcdde"),
                    dtype=object,
                ),
                "B": 1,
                "C": [
                    ["a", "b", "c"],
                    ["a", "b", "c"],
                    ["a", "b", "c"],
                    "foo",
                    [],
                    ["d", "e"],
                    ["d", "e"],
                    np.nan,
                ],
            },
            list("aaabcdde"),
        ),
    ],
)
def test_multi_columns(input_subset, expected_dict, expected_index):
    """Explode the columns in ``input_subset`` and compare (GH 39240)."""
    source = pd.DataFrame(
        {
            "A": [[0, 1, 2], np.nan, [], (3, 4), np.nan],
            "B": 1,
            "C": [["a", "b", "c"], "foo", [], ["d", "e"], np.nan],
        },
        index=list("abcde"),
    )
    expected = pd.DataFrame(expected_dict, index=expected_index)
    tm.assert_frame_equal(source.explode(input_subset), expected)
first commit 2024-05-03 04:18:51 +03:00			`import numpy as np`
			`import pytest`

			`import pandas as pd`
			`import pandas._testing as tm`


			`def test_error():`
			`df = pd.DataFrame(`
			`{"A": pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1}`
			`)`
			`with pytest.raises(`
			`ValueError, match="column must be a scalar, tuple, or list thereof"`
			`):`
			`df.explode([list("AA")])`

			`with pytest.raises(ValueError, match="column must be unique"):`
			`df.explode(list("AA"))`

			`df.columns = list("AA")`
			`with pytest.raises(ValueError, match="columns must be unique"):`
			`df.explode("A")`


			`@pytest.mark.parametrize(`
			`"input_subset, error_message",`
			`[`
			`(`
			`list("AC"),`
			`"columns must have matching element counts",`
			`),`
			`(`
			`[],`
			`"column must be nonempty",`
			`),`
			`(`
			`list("AC"),`
			`"columns must have matching element counts",`
			`),`
			`],`
			`)`
			`def test_error_multi_columns(input_subset, error_message):`
			`# GH 39240`
			`df = pd.DataFrame(`
			`{`
			`"A": [[0, 1, 2], np.nan, [], (3, 4)],`
			`"B": 1,`
			`"C": [["a", "b", "c"], "foo", [], ["d", "e", "f"]],`
			`},`
			`index=list("abcd"),`
			`)`
			`with pytest.raises(ValueError, match=error_message):`
			`df.explode(input_subset)`


			`@pytest.mark.parametrize(`
			`"scalar",`
			`["a", 0, 1.5, pd.Timedelta("1 days"), pd.Timestamp("2019-12-31")],`
			`)`
			`def test_basic(scalar):`
			`df = pd.DataFrame(`
			`{scalar: pd.Series([[0, 1, 2], np.nan, [], (3, 4)], index=list("abcd")), "B": 1}`
			`)`
			`result = df.explode(scalar)`
			`expected = pd.DataFrame(`
			`{`
			`scalar: pd.Series(`
			`[0, 1, 2, np.nan, np.nan, 3, 4], index=list("aaabcdd"), dtype=object`
			`),`
			`"B": 1,`
			`}`
			`)`
			`tm.assert_frame_equal(result, expected)`


			`def test_multi_index_rows():`
			`df = pd.DataFrame(`
			`{"A": np.array([[0, 1, 2], np.nan, [], (3, 4)], dtype=object), "B": 1},`
			`index=pd.MultiIndex.from_tuples([("a", 1), ("a", 2), ("b", 1), ("b", 2)]),`
			`)`

			`result = df.explode("A")`
			`expected = pd.DataFrame(`
			`{`
			`"A": pd.Series(`
			`[0, 1, 2, np.nan, np.nan, 3, 4],`
			`index=pd.MultiIndex.from_tuples(`
			`[`
			`("a", 1),`
			`("a", 1),`
			`("a", 1),`
			`("a", 2),`
			`("b", 1),`
			`("b", 2),`
			`("b", 2),`
			`]`
			`),`
			`dtype=object,`
			`),`
			`"B": 1,`
			`}`
			`)`
			`tm.assert_frame_equal(result, expected)`


			`def test_multi_index_columns():`
			`df = pd.DataFrame(`
			`{("A", 1): np.array([[0, 1, 2], np.nan, [], (3, 4)], dtype=object), ("A", 2): 1}`
			`)`

			`result = df.explode(("A", 1))`
			`expected = pd.DataFrame(`
			`{`
			`("A", 1): pd.Series(`
			`[0, 1, 2, np.nan, np.nan, 3, 4],`
			`index=pd.Index([0, 0, 0, 1, 2, 3, 3]),`
			`dtype=object,`
			`),`
			`("A", 2): 1,`
			`}`
			`)`
			`tm.assert_frame_equal(result, expected)`


			`def test_usecase():`
			`# explode a single column`
			`# gh-10511`
			`df = pd.DataFrame(`
			`[[11, range(5), 10], [22, range(3), 20]], columns=list("ABC")`
			`).set_index("C")`
			`result = df.explode("B")`

			`expected = pd.DataFrame(`
			`{`
			`"A": [11, 11, 11, 11, 11, 22, 22, 22],`
			`"B": np.array([0, 1, 2, 3, 4, 0, 1, 2], dtype=object),`
			`"C": [10, 10, 10, 10, 10, 20, 20, 20],`
			`},`
			`columns=list("ABC"),`
			`).set_index("C")`

			`tm.assert_frame_equal(result, expected)`

			`# gh-8517`
			`df = pd.DataFrame(`
			`[["2014-01-01", "Alice", "A B"], ["2014-01-02", "Bob", "C D"]],`
			`columns=["dt", "name", "text"],`
			`)`
			`result = df.assign(text=df.text.str.split(" ")).explode("text")`
			`expected = pd.DataFrame(`
			`[`
			`["2014-01-01", "Alice", "A"],`
			`["2014-01-01", "Alice", "B"],`
			`["2014-01-02", "Bob", "C"],`
			`["2014-01-02", "Bob", "D"],`
			`],`
			`columns=["dt", "name", "text"],`
			`index=[0, 0, 1, 1],`
			`)`
			`tm.assert_frame_equal(result, expected)`


			`@pytest.mark.parametrize(`
			`"input_dict, input_index, expected_dict, expected_index",`
			`[`
			`(`
			`{"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},`
			`[0, 0],`
			`{"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},`
			`[0, 0, 0, 0],`
			`),`
			`(`
			`{"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},`
			`pd.Index([0, 0], name="my_index"),`
			`{"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},`
			`pd.Index([0, 0, 0, 0], name="my_index"),`
			`),`
			`(`
			`{"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},`
			`pd.MultiIndex.from_arrays(`
			`[[0, 0], [1, 1]], names=["my_first_index", "my_second_index"]`
			`),`
			`{"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},`
			`pd.MultiIndex.from_arrays(`
			`[[0, 0, 0, 0], [1, 1, 1, 1]],`
			`names=["my_first_index", "my_second_index"],`
			`),`
			`),`
			`(`
			`{"col1": [[1, 2], [3, 4]], "col2": ["foo", "bar"]},`
			`pd.MultiIndex.from_arrays([[0, 0], [1, 1]], names=["my_index", None]),`
			`{"col1": [1, 2, 3, 4], "col2": ["foo", "foo", "bar", "bar"]},`
			`pd.MultiIndex.from_arrays(`
			`[[0, 0, 0, 0], [1, 1, 1, 1]], names=["my_index", None]`
			`),`
			`),`
			`],`
			`)`
			`def test_duplicate_index(input_dict, input_index, expected_dict, expected_index):`
			`# GH 28005`
			`df = pd.DataFrame(input_dict, index=input_index)`
			`result = df.explode("col1")`
			`expected = pd.DataFrame(expected_dict, index=expected_index, dtype=object)`
			`tm.assert_frame_equal(result, expected)`


			`def test_ignore_index():`
			`# GH 34932`
			`df = pd.DataFrame({"id": range(0, 20, 10), "values": [list("ab"), list("cd")]})`
			`result = df.explode("values", ignore_index=True)`
			`expected = pd.DataFrame(`
			`{"id": [0, 0, 10, 10], "values": list("abcd")}, index=[0, 1, 2, 3]`
			`)`
			`tm.assert_frame_equal(result, expected)`


			`def test_explode_sets():`
			`# https://github.com/pandas-dev/pandas/issues/35614`
			`df = pd.DataFrame({"a": [{"x", "y"}], "b": [1]}, index=[1])`
			`result = df.explode(column="a").sort_values(by="a")`
			`expected = pd.DataFrame({"a": ["x", "y"], "b": [1, 1]}, index=[1, 1])`
			`tm.assert_frame_equal(result, expected)`


			`@pytest.mark.parametrize(`
			`"input_subset, expected_dict, expected_index",`
			`[`
			`(`
			`list("AC"),`
			`{`
			`"A": pd.Series(`
			`[0, 1, 2, np.nan, np.nan, 3, 4, np.nan],`
			`index=list("aaabcdde"),`
			`dtype=object,`
			`),`
			`"B": 1,`
			`"C": ["a", "b", "c", "foo", np.nan, "d", "e", np.nan],`
			`},`
			`list("aaabcdde"),`
			`),`
			`(`
			`list("A"),`
			`{`
			`"A": pd.Series(`
			`[0, 1, 2, np.nan, np.nan, 3, 4, np.nan],`
			`index=list("aaabcdde"),`
			`dtype=object,`
			`),`
			`"B": 1,`
			`"C": [`
			`["a", "b", "c"],`
			`["a", "b", "c"],`
			`["a", "b", "c"],`
			`"foo",`
			`[],`
			`["d", "e"],`
			`["d", "e"],`
			`np.nan,`
			`],`
			`},`
			`list("aaabcdde"),`
			`),`
			`],`
			`)`
			`def test_multi_columns(input_subset, expected_dict, expected_index):`
			`# GH 39240`
			`df = pd.DataFrame(`
			`{`
			`"A": [[0, 1, 2], np.nan, [], (3, 4), np.nan],`
			`"B": 1,`
			`"C": [["a", "b", "c"], "foo", [], ["d", "e"], np.nan],`
			`},`
			`index=list("abcde"),`
			`)`
			`result = df.explode(input_subset)`
			`expected = pd.DataFrame(expected_dict, expected_index)`
			`tm.assert_frame_equal(result, expected)`