# pandas/tests/io/sas/test_sas7bdat.py
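"""
Tests for pandas.read_sas with SAS7BDAT files.

Most tests read a .sas7bdat file from the test data directory and compare the
result against a CSV export of the same data, covering encodings, iterator and
chunked reads, date/datetime handling, and decompression edge cases.
"""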
import contextlib
from datetime import datetime
import io
import os
from pathlib import Path

import dateutil.parser
import numpy as np
import pytest

from pandas.errors import EmptyDataError
import pandas.util._test_decorators as td

import pandas as pd
import pandas._testing as tm


@pytest.fixture
def dirpath(datapath):
    return datapath("io", "sas", "data")
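

# Expected frames are built from CSV exports of the SAS files. SAS stores
# dates and datetimes as counts of days (or seconds) elapsed since the epoch
# 1960-01-01, so the raw numeric day counts in the CSV are converted back to
# timestamps here, and integer columns are cast to float64 to match the
# floating-point values that read_sas returns.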
@pytest.fixture(params=[(1, range(1, 16)), (2, [16])])
def data_test_ix(request, dirpath):
    i, test_ix = request.param
    fname = os.path.join(dirpath, f"test_sas7bdat_{i}.csv")
    df = pd.read_csv(fname)
    epoch = datetime(1960, 1, 1)
    t1 = pd.to_timedelta(df["Column4"], unit="d")
    df["Column4"] = epoch + t1
    t2 = pd.to_timedelta(df["Column12"], unit="d")
    df["Column12"] = epoch + t2
    for k in range(df.shape[1]):
        col = df.iloc[:, k]
        if col.dtype == np.int64:
            df.isetitem(k, df.iloc[:, k].astype(np.float64))
    return df, test_ix


# https://github.com/cython/cython/issues/1720
@pytest.mark.filterwarnings("ignore:can't resolve package:ImportWarning")
class TestSAS7BDAT:
    @pytest.mark.slow
    def test_from_file(self, dirpath, data_test_ix):
        df0, test_ix = data_test_ix
        for k in test_ix:
            fname = os.path.join(dirpath, f"test{k}.sas7bdat")
            df = pd.read_sas(fname, encoding="utf-8")
            tm.assert_frame_equal(df, df0)

    @pytest.mark.slow
    def test_from_buffer(self, dirpath, data_test_ix):
        df0, test_ix = data_test_ix
        for k in test_ix:
            fname = os.path.join(dirpath, f"test{k}.sas7bdat")
            with open(fname, "rb") as f:
                byts = f.read()
            buf = io.BytesIO(byts)
            with pd.read_sas(
                buf, format="sas7bdat", iterator=True, encoding="utf-8"
            ) as rdr:
                df = rdr.read()
            tm.assert_frame_equal(df, df0, check_exact=False)

    @pytest.mark.slow
    def test_from_iterator(self, dirpath, data_test_ix):
        df0, test_ix = data_test_ix
        for k in test_ix:
            fname = os.path.join(dirpath, f"test{k}.sas7bdat")
            with pd.read_sas(fname, iterator=True, encoding="utf-8") as rdr:
                df = rdr.read(2)
                tm.assert_frame_equal(df, df0.iloc[0:2, :])
                df = rdr.read(3)
                tm.assert_frame_equal(df, df0.iloc[2:5, :])

    @pytest.mark.slow
    def test_path_pathlib(self, dirpath, data_test_ix):
        df0, test_ix = data_test_ix
        for k in test_ix:
            fname = Path(os.path.join(dirpath, f"test{k}.sas7bdat"))
            df = pd.read_sas(fname, encoding="utf-8")
            tm.assert_frame_equal(df, df0)

    @td.skip_if_no("py.path")
    @pytest.mark.slow
    def test_path_localpath(self, dirpath, data_test_ix):
        from py.path import local as LocalPath

        df0, test_ix = data_test_ix
        for k in test_ix:
            fname = LocalPath(os.path.join(dirpath, f"test{k}.sas7bdat"))
            df = pd.read_sas(fname, encoding="utf-8")
            tm.assert_frame_equal(df, df0)

    @pytest.mark.slow
    @pytest.mark.parametrize("chunksize", (3, 5, 10, 11))
    @pytest.mark.parametrize("k", range(1, 17))
    def test_iterator_loop(self, dirpath, k, chunksize):
        # github #13654
        fname = os.path.join(dirpath, f"test{k}.sas7bdat")
        with pd.read_sas(fname, chunksize=chunksize, encoding="utf-8") as rdr:
            y = 0
            for x in rdr:
                y += x.shape[0]
        assert y == rdr.row_count

    def test_iterator_read_too_much(self, dirpath):
        # github #14734
        fname = os.path.join(dirpath, "test1.sas7bdat")
        with pd.read_sas(
            fname, format="sas7bdat", iterator=True, encoding="utf-8"
        ) as rdr:
            d1 = rdr.read(rdr.row_count + 20)

        with pd.read_sas(fname, iterator=True, encoding="utf-8") as rdr:
            d2 = rdr.read(rdr.row_count + 20)
        tm.assert_frame_equal(d1, d2)
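

# Without an explicit encoding, read_sas returns string data as raw bytes;
# passing encoding="utf-8" decodes both the values and the column names.
# With convert_header_text=False on the low-level reader, column names are
# left as bytes as well.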
def test_encoding_options(datapath):
    fname = datapath("io", "sas", "data", "test1.sas7bdat")
    df1 = pd.read_sas(fname)
    df2 = pd.read_sas(fname, encoding="utf-8")
    for col in df1.columns:
        try:
            df1[col] = df1[col].str.decode("utf-8")
        except AttributeError:
            pass
    tm.assert_frame_equal(df1, df2)

    from pandas.io.sas.sas7bdat import SAS7BDATReader

    with contextlib.closing(SAS7BDATReader(fname, convert_header_text=False)) as rdr:
        df3 = rdr.read()
    for x, y in zip(df1.columns, df3.columns):
        assert x == y.decode()


def test_productsales(datapath):
    fname = datapath("io", "sas", "data", "productsales.sas7bdat")
    df = pd.read_sas(fname, encoding="utf-8")
    fname = datapath("io", "sas", "data", "productsales.csv")
    df0 = pd.read_csv(fname, parse_dates=["MONTH"])
    vn = ["ACTUAL", "PREDICT", "QUARTER", "YEAR"]
    df0[vn] = df0[vn].astype(np.float64)
    tm.assert_frame_equal(df, df0)


def test_12659(datapath):
    fname = datapath("io", "sas", "data", "test_12659.sas7bdat")
    df = pd.read_sas(fname)
    fname = datapath("io", "sas", "data", "test_12659.csv")
    df0 = pd.read_csv(fname)
    df0 = df0.astype(np.float64)
    tm.assert_frame_equal(df, df0)


def test_airline(datapath):
    fname = datapath("io", "sas", "data", "airline.sas7bdat")
    df = pd.read_sas(fname)
    fname = datapath("io", "sas", "data", "airline.csv")
    df0 = pd.read_csv(fname)
    df0 = df0.astype(np.float64)
    tm.assert_frame_equal(df, df0, check_exact=False)


def test_date_time(datapath):
    # Support of different SAS date/datetime formats (PR #15871)
    fname = datapath("io", "sas", "data", "datetime.sas7bdat")
    df = pd.read_sas(fname)
    fname = datapath("io", "sas", "data", "datetime.csv")
    df0 = pd.read_csv(
        fname, parse_dates=["Date1", "Date2", "DateTime", "DateTimeHi", "Taiw"]
    )
    # GH 19732: Timestamps imported from sas will incur floating point errors
    df[df.columns[3]] = df.iloc[:, 3].dt.round("us")
    tm.assert_frame_equal(df, df0)
@pytest.mark.parametrize("column", ["WGT", "CYL"])
def test_compact_numerical_values(datapath, column):
# Regression test for #21616
fname = datapath("io", "sas", "data", "cars.sas7bdat")
df = pd.read_sas(fname, encoding="latin-1")
# The two columns CYL and WGT in cars.sas7bdat have column
# width < 8 and only contain integral values.
# Test that pandas doesn't corrupt the numbers by adding
# decimals.
result = df[column]
expected = df[column].round()
tm.assert_series_equal(result, expected, check_exact=True)


def test_many_columns(datapath):
    # Test for looking for column information in more places (PR #22628)
    fname = datapath("io", "sas", "data", "many_columns.sas7bdat")
    df = pd.read_sas(fname, encoding="latin-1")
    fname = datapath("io", "sas", "data", "many_columns.csv")
    df0 = pd.read_csv(fname, encoding="latin-1")
    tm.assert_frame_equal(df, df0)


def test_inconsistent_number_of_rows(datapath):
    # Regression test for issue #16615. (PR #22628)
    fname = datapath("io", "sas", "data", "load_log.sas7bdat")
    df = pd.read_sas(fname, encoding="latin-1")
    assert len(df) == 2097


def test_zero_variables(datapath):
    # Check if the SAS file has zero variables (PR #18184)
    fname = datapath("io", "sas", "data", "zero_variables.sas7bdat")
    with pytest.raises(EmptyDataError, match="No columns to parse from file"):
        pd.read_sas(fname)


def test_zero_rows(datapath):
    # GH 18198
    fname = datapath("io", "sas", "data", "zero_rows.sas7bdat")
    result = pd.read_sas(fname)
    expected = pd.DataFrame([{"char_field": "a", "num_field": 1.0}]).iloc[:0]
    tm.assert_frame_equal(result, expected)


def test_corrupt_read(datapath):
    # We don't really care about the exact failure, the important thing is
    # that the resource should be cleaned up afterwards (BUG #35566)
    fname = datapath("io", "sas", "data", "corrupt.sas7bdat")
    msg = "'SAS7BDATReader' object has no attribute 'row_count'"
    with pytest.raises(AttributeError, match=msg):
        pd.read_sas(fname)
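

# Helper for the max-SAS-date tests below: rounds the microsecond component
# of a datetime (or a parseable datetime string) to the nearest millisecond,
# and passes any other value through unchanged.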
def round_datetime_to_ms(ts):
    if isinstance(ts, datetime):
        return ts.replace(microsecond=int(round(ts.microsecond, -3) / 1000) * 1000)
    elif isinstance(ts, str):
        _ts = dateutil.parser.parse(timestr=ts)
        return _ts.replace(microsecond=int(round(_ts.microsecond, -3) / 1000) * 1000)
    else:
        return ts


def test_max_sas_date(datapath):
    # GH 20927
    # NB. max datetime in SAS dataset is 31DEC9999:23:59:59.999
    # but this is read as 29DEC9999:23:59:59.998993 by a buggy
    # sas7bdat module
    fname = datapath("io", "sas", "data", "max_sas_date.sas7bdat")
    df = pd.read_sas(fname, encoding="iso-8859-1")

    # SAS likes to left pad strings with spaces - lstrip before comparing
    df = df.applymap(lambda x: x.lstrip() if isinstance(x, str) else x)
    # GH 19732: Timestamps imported from sas will incur floating point errors
    try:
        df["dt_as_dt"] = df["dt_as_dt"].dt.round("us")
    except pd._libs.tslibs.np_datetime.OutOfBoundsDatetime:
        df = df.applymap(round_datetime_to_ms)
    except AttributeError:
        df["dt_as_dt"] = df["dt_as_dt"].apply(round_datetime_to_ms)
    # if there are any date/times > pandas.Timestamp.max then ALL in that chunk
    # are returned as datetime.datetime
    expected = pd.DataFrame(
        {
            "text": ["max", "normal"],
            "dt_as_float": [253717747199.999, 1880323199.999],
            "dt_as_dt": [
                datetime(9999, 12, 29, 23, 59, 59, 999000),
                datetime(2019, 8, 1, 23, 59, 59, 999000),
            ],
            "date_as_float": [2936547.0, 21762.0],
            "date_as_date": [datetime(9999, 12, 29), datetime(2019, 8, 1)],
        },
        columns=["text", "dt_as_float", "dt_as_dt", "date_as_float", "date_as_date"],
    )
    tm.assert_frame_equal(df, expected)


def test_max_sas_date_iterator(datapath):
    # GH 20927
    # when called as an iterator, only those chunks with a date > pd.Timestamp.max
    # are returned as datetime.datetime, if this happens that whole chunk is returned
    # as datetime.datetime
    col_order = ["text", "dt_as_float", "dt_as_dt", "date_as_float", "date_as_date"]
    fname = datapath("io", "sas", "data", "max_sas_date.sas7bdat")
    results = []
    for df in pd.read_sas(fname, encoding="iso-8859-1", chunksize=1):
        # SAS likes to left pad strings with spaces - lstrip before comparing
        df = df.applymap(lambda x: x.lstrip() if isinstance(x, str) else x)
        # GH 19732: Timestamps imported from sas will incur floating point errors
        try:
            df["dt_as_dt"] = df["dt_as_dt"].dt.round("us")
        except pd._libs.tslibs.np_datetime.OutOfBoundsDatetime:
            df = df.applymap(round_datetime_to_ms)
        except AttributeError:
            df["dt_as_dt"] = df["dt_as_dt"].apply(round_datetime_to_ms)
        df.reset_index(inplace=True, drop=True)
        results.append(df)
    expected = [
        pd.DataFrame(
            {
                "text": ["max"],
                "dt_as_float": [253717747199.999],
                "dt_as_dt": [datetime(9999, 12, 29, 23, 59, 59, 999000)],
                "date_as_float": [2936547.0],
                "date_as_date": [datetime(9999, 12, 29)],
            },
            columns=col_order,
        ),
        pd.DataFrame(
            {
                "text": ["normal"],
                "dt_as_float": [1880323199.999],
                "dt_as_dt": [np.datetime64("2019-08-01 23:59:59.999")],
                "date_as_float": [21762.0],
                "date_as_date": [np.datetime64("2019-08-01")],
            },
            columns=col_order,
        ),
    ]
    for result, expected in zip(results, expected):
        tm.assert_frame_equal(result, expected)
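

# Missing SAS dates and datetimes should come through as NaT.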
def test_null_date(datapath):
    fname = datapath("io", "sas", "data", "dates_null.sas7bdat")
    df = pd.read_sas(fname, encoding="utf-8")

    expected = pd.DataFrame(
        {
            "datecol": [
                datetime(9999, 12, 29),
                pd.NaT,
            ],
            "datetimecol": [
                datetime(9999, 12, 29, 23, 59, 59, 998993),
                pd.NaT,
            ],
        },
    )
    tm.assert_frame_equal(df, expected)


def test_meta2_page(datapath):
    # GH 35545
    fname = datapath("io", "sas", "data", "test_meta2_page.sas7bdat")
    df = pd.read_sas(fname)
    assert len(df) == 1000
@pytest.mark.parametrize("test_file", ["test2.sas7bdat", "test3.sas7bdat"])
def test_exception_propagation_rdc_rle_decompress(datapath, monkeypatch, test_file):
"""Errors in RLE/RDC decompression should propagate the same error."""
orig_np_zeros = np.zeros
def _patched_zeros(size, dtype):
if isinstance(size, int):
# np.zeros() call in {rdc,rle}_decompress
raise Exception("Test exception")
else:
# Other calls to np.zeros
return orig_np_zeros(size, dtype)
monkeypatch.setattr(np, "zeros", _patched_zeros)
with pytest.raises(Exception, match="^Test exception$"):
pd.read_sas(datapath("io", "sas", "data", test_file))


def test_exception_propagation_rle_decompress(tmp_path, datapath):
    """Illegal control byte in RLE decompressor should raise the correct ValueError."""
    with open(datapath("io", "sas", "data", "test2.sas7bdat"), "rb") as f:
        data = bytearray(f.read())

    invalid_control_byte = 0x10
    page_offset = 0x10000
    control_byte_pos = 55229
    data[page_offset + control_byte_pos] = invalid_control_byte

    tmp_file = tmp_path / "test2.sas7bdat"
    tmp_file.write_bytes(data)
    with pytest.raises(ValueError, match="unknown control byte"):
        pd.read_sas(tmp_file)


def test_0x40_control_byte(datapath):
    # GH 31243
    fname = datapath("io", "sas", "data", "0x40controlbyte.sas7bdat")
    df = pd.read_sas(fname, encoding="ascii")
    fname = datapath("io", "sas", "data", "0x40controlbyte.csv")
    df0 = pd.read_csv(fname, dtype="object")
    tm.assert_frame_equal(df, df0)


def test_0x00_control_byte(datapath):
    # GH 47099
    fname = datapath("io", "sas", "data", "0x00controlbyte.sas7bdat.bz2")
    df = next(pd.read_sas(fname, chunksize=11_000))
    assert df.shape == (11_000, 20)