ai-content-maker/.venv/Lib/site-packages/pandas/_libs/tslib.pyx

858 lines
28 KiB
Cython
Raw Permalink Normal View History

2024-05-03 04:18:51 +03:00
import warnings
cimport cython
from cpython.datetime cimport (
PyDate_Check,
PyDateTime_Check,
datetime,
import_datetime,
tzinfo,
)
from pandas.util._exceptions import find_stack_level
# import datetime C API
import_datetime()
cimport numpy as cnp
from numpy cimport (
float64_t,
int64_t,
ndarray,
)
import numpy as np
cnp.import_array()
import pytz
from pandas._libs.tslibs.np_datetime cimport (
NPY_DATETIMEUNIT,
NPY_FR_ns,
check_dts_bounds,
dtstruct_to_dt64,
get_datetime64_value,
npy_datetimestruct,
pandas_datetime_to_datetimestruct,
pydate_to_dt64,
pydatetime_to_dt64,
string_to_dts,
)
from pandas._libs.util cimport (
is_datetime64_object,
is_float_object,
is_integer_object,
)
from pandas._libs.tslibs.np_datetime import OutOfBoundsDatetime
from pandas._libs.tslibs.parsing import parse_datetime_string
from pandas._libs.tslibs.conversion cimport (
_TSObject,
cast_from_unit,
convert_datetime_to_tsobject,
get_datetime64_nanos,
precision_from_unit,
)
from pandas._libs.tslibs.nattype cimport (
NPY_NAT,
c_NaT as NaT,
c_nat_strings as nat_strings,
)
from pandas._libs.tslibs.timestamps cimport _Timestamp
from pandas._libs.tslibs.timezones cimport tz_compare
from pandas._libs.tslibs import (
Resolution,
get_resolution,
)
from pandas._libs.tslibs.timestamps import Timestamp
# Note: this is the only non-tslibs intra-pandas dependency here
from pandas._libs.missing cimport checknull_with_nat_and_na
from pandas._libs.tslibs.tzconversion cimport tz_localize_to_utc_single
def _test_parse_iso8601(ts: str):
"""
TESTING ONLY: Parse string into Timestamp using iso8601 parser. Used
only for testing, actual construction uses `convert_str_to_tsobject`
"""
cdef:
_TSObject obj
int out_local = 0, out_tzoffset = 0
NPY_DATETIMEUNIT out_bestunit
obj = _TSObject()
if ts == 'now':
return Timestamp.utcnow()
elif ts == 'today':
return Timestamp.now().normalize()
string_to_dts(ts, &obj.dts, &out_bestunit, &out_local, &out_tzoffset, True)
obj.value = dtstruct_to_dt64(&obj.dts)
check_dts_bounds(&obj.dts)
if out_local == 1:
obj.tzinfo = pytz.FixedOffset(out_tzoffset)
obj.value = tz_localize_to_utc_single(obj.value, obj.tzinfo)
return Timestamp(obj.value, tz=obj.tzinfo)
else:
return Timestamp(obj.value)
@cython.wraparound(False)
@cython.boundscheck(False)
def format_array_from_datetime(
ndarray values,
tzinfo tz=None,
str format=None,
object na_rep=None,
NPY_DATETIMEUNIT reso=NPY_FR_ns,
) -> np.ndarray:
"""
return a np object array of the string formatted values
Parameters
----------
values : a 1-d i8 array
tz : tzinfo or None, default None
format : str or None, default None
a strftime capable string
na_rep : optional, default is None
a nat format
reso : NPY_DATETIMEUNIT, default NPY_FR_ns
Returns
-------
np.ndarray[object]
"""
cdef:
int64_t val, ns, N = values.size
bint show_ms = False, show_us = False, show_ns = False
bint basic_format = False, basic_format_day = False
_Timestamp ts
object res
npy_datetimestruct dts
# Note that `result` (and thus `result_flat`) is C-order and
# `it` iterates C-order as well, so the iteration matches
# See discussion at
# github.com/pandas-dev/pandas/pull/46886#discussion_r860261305
ndarray result = cnp.PyArray_EMPTY(values.ndim, values.shape, cnp.NPY_OBJECT, 0)
object[::1] res_flat = result.ravel() # should NOT be a copy
cnp.flatiter it = cnp.PyArray_IterNew(values)
if na_rep is None:
na_rep = 'NaT'
if tz is None:
# if we don't have a format nor tz, then choose
# a format based on precision
basic_format = format is None
if basic_format:
reso_obj = get_resolution(values, tz=tz, reso=reso)
show_ns = reso_obj == Resolution.RESO_NS
show_us = reso_obj == Resolution.RESO_US
show_ms = reso_obj == Resolution.RESO_MS
elif format == "%Y-%m-%d %H:%M:%S":
# Same format as default, but with hardcoded precision (s)
basic_format = True
show_ns = show_us = show_ms = False
elif format == "%Y-%m-%d %H:%M:%S.%f":
# Same format as default, but with hardcoded precision (us)
basic_format = show_us = True
show_ns = show_ms = False
elif format == "%Y-%m-%d":
# Default format for dates
basic_format_day = True
assert not (basic_format_day and basic_format)
for i in range(N):
# Analogous to: utc_val = values[i]
val = (<int64_t*>cnp.PyArray_ITER_DATA(it))[0]
if val == NPY_NAT:
res = na_rep
elif basic_format_day:
pandas_datetime_to_datetimestruct(val, reso, &dts)
res = f'{dts.year}-{dts.month:02d}-{dts.day:02d}'
elif basic_format:
pandas_datetime_to_datetimestruct(val, reso, &dts)
res = (f'{dts.year}-{dts.month:02d}-{dts.day:02d} '
f'{dts.hour:02d}:{dts.min:02d}:{dts.sec:02d}')
if show_ns:
ns = dts.ps // 1000
res += f'.{ns + dts.us * 1000:09d}'
elif show_us:
res += f'.{dts.us:06d}'
elif show_ms:
res += f'.{dts.us // 1000:03d}'
else:
ts = Timestamp._from_value_and_reso(val, reso=reso, tz=tz)
if format is None:
# Use datetime.str, that returns ts.isoformat(sep=' ')
res = str(ts)
else:
# invalid format string
# requires dates > 1900
try:
# Note: dispatches to pydatetime
res = ts.strftime(format)
except ValueError:
# Use datetime.str, that returns ts.isoformat(sep=' ')
res = str(ts)
# Note: we can index result directly instead of using PyArray_MultiIter_DATA
# like we do for the other functions because result is known C-contiguous
# and is the first argument to PyArray_MultiIterNew2. The usual pattern
# does not seem to work with object dtype.
# See discussion at
# github.com/pandas-dev/pandas/pull/46886#discussion_r860261305
res_flat[i] = res
cnp.PyArray_ITER_NEXT(it)
return result
def array_with_unit_to_datetime(
ndarray values,
str unit,
str errors="coerce"
):
"""
Convert the ndarray to datetime according to the time unit.
This function converts an array of objects into a numpy array of
datetime64[ns]. It returns the converted array
and also returns the timezone offset
if errors:
- raise: return converted values or raise OutOfBoundsDatetime
if out of range on the conversion or
ValueError for other conversions (e.g. a string)
- ignore: return non-convertible values as the same unit
- coerce: NaT for non-convertibles
Parameters
----------
values : ndarray
Date-like objects to convert.
unit : str
Time unit to use during conversion.
errors : str, default 'raise'
Error behavior when parsing.
Returns
-------
result : ndarray of m8 values
tz : parsed timezone offset or None
"""
cdef:
Py_ssize_t i, j, n=len(values)
int64_t mult
int prec = 0
ndarray[float64_t] fvalues
bint is_ignore = errors=='ignore'
bint is_coerce = errors=='coerce'
bint is_raise = errors=='raise'
bint need_to_iterate = True
ndarray[int64_t] iresult
ndarray[object] oresult
ndarray mask
object tz = None
assert is_ignore or is_coerce or is_raise
if unit == "ns":
if issubclass(values.dtype.type, (np.integer, np.float_)):
result = values.astype("M8[ns]", copy=False)
else:
result, tz = array_to_datetime(
values.astype(object, copy=False),
errors=errors,
)
return result, tz
mult, _ = precision_from_unit(unit)
if is_raise:
# try a quick conversion to i8/f8
# if we have nulls that are not type-compat
# then need to iterate
if values.dtype.kind in ["i", "f", "u"]:
iresult = values.astype("i8", copy=False)
# fill missing values by comparing to NPY_NAT
mask = iresult == NPY_NAT
iresult[mask] = 0
fvalues = iresult.astype("f8") * mult
need_to_iterate = False
if not need_to_iterate:
# check the bounds
if (fvalues < Timestamp.min.value).any() or (
(fvalues > Timestamp.max.value).any()
):
raise OutOfBoundsDatetime(f"cannot convert input with unit '{unit}'")
if values.dtype.kind in ["i", "u"]:
result = (iresult * mult).astype("M8[ns]")
elif values.dtype.kind == "f":
fresult = (values * mult).astype("f8")
fresult[mask] = 0
if prec:
fresult = round(fresult, prec)
result = fresult.astype("M8[ns]", copy=False)
iresult = result.view("i8")
iresult[mask] = NPY_NAT
return result, tz
result = np.empty(n, dtype='M8[ns]')
iresult = result.view('i8')
try:
for i in range(n):
val = values[i]
if checknull_with_nat_and_na(val):
iresult[i] = NPY_NAT
elif is_integer_object(val) or is_float_object(val):
if val != val or val == NPY_NAT:
iresult[i] = NPY_NAT
else:
try:
iresult[i] = cast_from_unit(val, unit)
except OverflowError:
if is_raise:
raise OutOfBoundsDatetime(
f"cannot convert input {val} with the unit '{unit}'"
)
elif is_ignore:
raise AssertionError
iresult[i] = NPY_NAT
elif isinstance(val, str):
if len(val) == 0 or val in nat_strings:
iresult[i] = NPY_NAT
else:
try:
iresult[i] = cast_from_unit(float(val), unit)
except ValueError:
if is_raise:
raise ValueError(
f"non convertible value {val} with the unit '{unit}'"
)
elif is_ignore:
raise AssertionError
iresult[i] = NPY_NAT
except OverflowError:
if is_raise:
raise OutOfBoundsDatetime(
f"cannot convert input {val} with the unit '{unit}'"
)
elif is_ignore:
raise AssertionError
iresult[i] = NPY_NAT
else:
if is_raise:
raise ValueError(
f"unit='{unit}' not valid with non-numerical val='{val}'"
)
if is_ignore:
raise AssertionError
iresult[i] = NPY_NAT
return result, tz
except AssertionError:
pass
# we have hit an exception
# and are in ignore mode
# redo as object
oresult = cnp.PyArray_EMPTY(values.ndim, values.shape, cnp.NPY_OBJECT, 0)
for i in range(n):
val = values[i]
if checknull_with_nat_and_na(val):
oresult[i] = <object>NaT
elif is_integer_object(val) or is_float_object(val):
if val != val or val == NPY_NAT:
oresult[i] = <object>NaT
else:
try:
oresult[i] = Timestamp(cast_from_unit(val, unit))
except OverflowError:
oresult[i] = val
elif isinstance(val, str):
if len(val) == 0 or val in nat_strings:
oresult[i] = <object>NaT
else:
oresult[i] = val
return oresult, tz
@cython.wraparound(False)
@cython.boundscheck(False)
cpdef array_to_datetime(
ndarray[object] values,
str errors='raise',
bint dayfirst=False,
bint yearfirst=False,
bint utc=False,
bint require_iso8601=False,
bint allow_mixed=False,
):
"""
Converts a 1D array of date-like values to a numpy array of either:
1) datetime64[ns] data
2) datetime.datetime objects, if OutOfBoundsDatetime or TypeError
is encountered
Also returns a pytz.FixedOffset if an array of strings with the same
timezone offset is passed and utc=True is not passed. Otherwise, None
is returned
Handles datetime.date, datetime.datetime, np.datetime64 objects, numeric,
strings
Parameters
----------
values : ndarray of object
date-like objects to convert
errors : str, default 'raise'
error behavior when parsing
dayfirst : bool, default False
dayfirst parsing behavior when encountering datetime strings
yearfirst : bool, default False
yearfirst parsing behavior when encountering datetime strings
utc : bool, default False
indicator whether the dates should be UTC
require_iso8601 : bool, default False
indicator whether the datetime string should be iso8601
allow_mixed : bool, default False
Whether to allow mixed datetimes and integers.
Returns
-------
np.ndarray
May be datetime64[ns] or object dtype
tzinfo or None
"""
cdef:
Py_ssize_t i, n = len(values)
object val, tz
ndarray[int64_t] iresult
ndarray[object] oresult
npy_datetimestruct dts
NPY_DATETIMEUNIT out_bestunit
bint utc_convert = bool(utc)
bint seen_integer = False
bint seen_string = False
bint seen_datetime = False
bint seen_datetime_offset = False
bint is_raise = errors=='raise'
bint is_ignore = errors=='ignore'
bint is_coerce = errors=='coerce'
bint is_same_offsets
_TSObject _ts
int64_t value
int out_local = 0, out_tzoffset = 0
float offset_seconds, tz_offset
set out_tzoffset_vals = set()
bint string_to_dts_failed
datetime py_dt
tzinfo tz_out = None
bint found_tz = False, found_naive = False
# specify error conditions
assert is_raise or is_ignore or is_coerce
result = np.empty(n, dtype='M8[ns]')
iresult = result.view('i8')
try:
for i in range(n):
val = values[i]
try:
if checknull_with_nat_and_na(val):
iresult[i] = NPY_NAT
elif PyDateTime_Check(val):
seen_datetime = True
if val.tzinfo is not None:
found_tz = True
if utc_convert:
_ts = convert_datetime_to_tsobject(val, None)
iresult[i] = _ts.value
elif found_naive:
raise ValueError('Tz-aware datetime.datetime '
'cannot be converted to '
'datetime64 unless utc=True')
elif tz_out is not None and not tz_compare(tz_out, val.tzinfo):
raise ValueError('Tz-aware datetime.datetime '
'cannot be converted to '
'datetime64 unless utc=True')
else:
found_tz = True
tz_out = val.tzinfo
_ts = convert_datetime_to_tsobject(val, None)
iresult[i] = _ts.value
else:
found_naive = True
if found_tz and not utc_convert:
raise ValueError('Cannot mix tz-aware with '
'tz-naive values')
if isinstance(val, _Timestamp):
iresult[i] = val.value
else:
iresult[i] = pydatetime_to_dt64(val, &dts)
check_dts_bounds(&dts)
elif PyDate_Check(val):
seen_datetime = True
iresult[i] = pydate_to_dt64(val, &dts)
check_dts_bounds(&dts)
elif is_datetime64_object(val):
seen_datetime = True
iresult[i] = get_datetime64_nanos(val)
elif is_integer_object(val) or is_float_object(val):
# these must be ns unit by-definition
seen_integer = True
if val != val or val == NPY_NAT:
iresult[i] = NPY_NAT
elif is_raise or is_ignore:
iresult[i] = val
else:
# coerce
# we now need to parse this as if unit='ns'
# we can ONLY accept integers at this point
# if we have previously (or in future accept
# datetimes/strings, then we must coerce)
try:
iresult[i] = cast_from_unit(val, 'ns')
except OverflowError:
iresult[i] = NPY_NAT
elif isinstance(val, str):
# string
seen_string = True
if type(val) is not str:
# GH#32264 np.str_ object
val = str(val)
if len(val) == 0 or val in nat_strings:
iresult[i] = NPY_NAT
continue
string_to_dts_failed = string_to_dts(
val, &dts, &out_bestunit, &out_local,
&out_tzoffset, False
)
if string_to_dts_failed:
# An error at this point is a _parsing_ error
# specifically _not_ OutOfBoundsDatetime
if _parse_today_now(val, &iresult[i], utc):
continue
elif require_iso8601:
# if requiring iso8601 strings, skip trying
# other formats
if is_coerce:
iresult[i] = NPY_NAT
continue
elif is_raise:
raise ValueError(
f"time data \"{val}\" at position {i} doesn't match format specified"
)
return values, tz_out
try:
py_dt = parse_datetime_string(val,
dayfirst=dayfirst,
yearfirst=yearfirst)
# If the dateutil parser returned tzinfo, capture it
# to check if all arguments have the same tzinfo
tz = py_dt.utcoffset()
except (ValueError, OverflowError):
if is_coerce:
iresult[i] = NPY_NAT
continue
raise TypeError(f"invalid string coercion to datetime for \"{val}\" at position {i}")
if tz is not None:
seen_datetime_offset = True
# dateutil timezone objects cannot be hashed, so
# store the UTC offsets in seconds instead
out_tzoffset_vals.add(tz.total_seconds())
else:
# Add a marker for naive string, to track if we are
# parsing mixed naive and aware strings
out_tzoffset_vals.add('naive')
_ts = convert_datetime_to_tsobject(py_dt, None)
iresult[i] = _ts.value
if not string_to_dts_failed:
# No error reported by string_to_dts, pick back up
# where we left off
value = dtstruct_to_dt64(&dts)
if out_local == 1:
seen_datetime_offset = True
# Store the out_tzoffset in seconds
# since we store the total_seconds of
# dateutil.tz.tzoffset objects
out_tzoffset_vals.add(out_tzoffset * 60.)
tz = pytz.FixedOffset(out_tzoffset)
value = tz_localize_to_utc_single(value, tz)
out_local = 0
out_tzoffset = 0
else:
# Add a marker for naive string, to track if we are
# parsing mixed naive and aware strings
out_tzoffset_vals.add('naive')
iresult[i] = value
check_dts_bounds(&dts)
else:
if is_coerce:
iresult[i] = NPY_NAT
else:
raise TypeError(f"{type(val)} is not convertible to datetime")
except OutOfBoundsDatetime as ex:
ex.args = (str(ex) + f" present at position {i}", )
if is_coerce:
iresult[i] = NPY_NAT
continue
elif require_iso8601 and isinstance(val, str):
# GH#19382 for just-barely-OutOfBounds falling back to
# dateutil parser will return incorrect result because
# it will ignore nanoseconds
if is_raise:
# Still raise OutOfBoundsDatetime,
# as error message is informative.
raise
assert is_ignore
return values, tz_out
raise
except OutOfBoundsDatetime:
if is_raise:
raise
return ignore_errors_out_of_bounds_fallback(values), tz_out
except TypeError:
return _array_to_datetime_object(values, errors, dayfirst, yearfirst)
if seen_datetime and seen_integer:
# we have mixed datetimes & integers
if is_coerce:
# coerce all of the integers/floats to NaT, preserve
# the datetimes and other convertibles
for i in range(n):
val = values[i]
if is_integer_object(val) or is_float_object(val):
result[i] = NPY_NAT
elif allow_mixed:
pass
elif is_raise:
raise ValueError("mixed datetimes and integers in passed array")
else:
return _array_to_datetime_object(values, errors, dayfirst, yearfirst)
if seen_datetime_offset and not utc_convert:
# GH#17697
# 1) If all the offsets are equal, return one offset for
# the parsed dates to (maybe) pass to DatetimeIndex
# 2) If the offsets are different, then force the parsing down the
# object path where an array of datetimes
# (with individual dateutil.tzoffsets) are returned
is_same_offsets = len(out_tzoffset_vals) == 1
if not is_same_offsets:
return _array_to_datetime_object(values, errors, dayfirst, yearfirst)
else:
tz_offset = out_tzoffset_vals.pop()
tz_out = pytz.FixedOffset(tz_offset / 60.)
return result, tz_out
@cython.wraparound(False)
@cython.boundscheck(False)
cdef ndarray[object] ignore_errors_out_of_bounds_fallback(ndarray[object] values):
"""
Fallback for array_to_datetime if an OutOfBoundsDatetime is raised
and errors == "ignore"
Parameters
----------
values : ndarray[object]
Returns
-------
ndarray[object]
"""
cdef:
Py_ssize_t i, n = len(values)
object val
oresult = cnp.PyArray_EMPTY(values.ndim, values.shape, cnp.NPY_OBJECT, 0)
for i in range(n):
val = values[i]
# set as nan except if its a NaT
if checknull_with_nat_and_na(val):
if isinstance(val, float):
oresult[i] = np.nan
else:
oresult[i] = NaT
elif is_datetime64_object(val):
if get_datetime64_value(val) == NPY_NAT:
oresult[i] = NaT
else:
oresult[i] = val.item()
else:
oresult[i] = val
return oresult
@cython.wraparound(False)
@cython.boundscheck(False)
cdef _array_to_datetime_object(
ndarray[object] values,
str errors,
bint dayfirst=False,
bint yearfirst=False,
):
"""
Fall back function for array_to_datetime
Attempts to parse datetime strings with dateutil to return an array
of datetime objects
Parameters
----------
values : ndarray[object]
date-like objects to convert
errors : str
error behavior when parsing
dayfirst : bool, default False
dayfirst parsing behavior when encountering datetime strings
yearfirst : bool, default False
yearfirst parsing behavior when encountering datetime strings
Returns
-------
np.ndarray[object]
Literal[None]
"""
cdef:
Py_ssize_t i, n = len(values)
object val
bint is_ignore = errors == 'ignore'
bint is_coerce = errors == 'coerce'
bint is_raise = errors == 'raise'
ndarray[object] oresult
npy_datetimestruct dts
assert is_raise or is_ignore or is_coerce
oresult = cnp.PyArray_EMPTY(values.ndim, values.shape, cnp.NPY_OBJECT, 0)
# We return an object array and only attempt to parse:
# 1) NaT or NaT-like values
# 2) datetime strings, which we return as datetime.datetime
# 3) special strings - "now" & "today"
for i in range(n):
val = values[i]
if checknull_with_nat_and_na(val) or PyDateTime_Check(val):
# GH 25978. No need to parse NaT-like or datetime-like vals
oresult[i] = val
elif isinstance(val, str):
if type(val) is not str:
# GH#32264 np.str_ objects
val = str(val)
if len(val) == 0 or val in nat_strings:
oresult[i] = 'NaT'
continue
try:
oresult[i] = parse_datetime_string(val, dayfirst=dayfirst,
yearfirst=yearfirst)
pydatetime_to_dt64(oresult[i], &dts)
check_dts_bounds(&dts)
except (ValueError, OverflowError) as ex:
ex.args = (f"{ex} present at position {i}", )
if is_coerce:
oresult[i] = <object>NaT
continue
if is_raise:
raise
return values, None
else:
if is_raise:
raise
return values, None
return oresult, None
cdef inline bint _parse_today_now(str val, int64_t* iresult, bint utc):
# We delay this check for as long as possible
# because it catches relatively rare cases
if val == "now":
iresult[0] = Timestamp.utcnow().value
if not utc:
# GH#18705 make sure to_datetime("now") matches Timestamp("now")
warnings.warn(
"The parsing of 'now' in pd.to_datetime without `utc=True` is "
"deprecated. In a future version, this will match Timestamp('now') "
"and Timestamp.now()",
FutureWarning,
stacklevel=find_stack_level(),
)
return True
elif val == "today":
iresult[0] = Timestamp.today().value
return True
return False