1203 lines
37 KiB
Cython
1203 lines
37 KiB
Cython
|
"""
|
|||
|
Parsing functions for datetime and datetime-like strings.
|
|||
|
"""
|
|||
|
import re
|
|||
|
import time
|
|||
|
import warnings
|
|||
|
|
|||
|
from pandas.util._exceptions import find_stack_level
|
|||
|
|
|||
|
cimport cython
|
|||
|
from cpython.datetime cimport (
|
|||
|
datetime,
|
|||
|
datetime_new,
|
|||
|
import_datetime,
|
|||
|
)
|
|||
|
from cpython.object cimport PyObject_Str
|
|||
|
from cython cimport Py_ssize_t
|
|||
|
from libc.string cimport strchr
|
|||
|
|
|||
|
import_datetime()
|
|||
|
|
|||
|
import numpy as np
|
|||
|
|
|||
|
cimport numpy as cnp
|
|||
|
from numpy cimport (
|
|||
|
PyArray_GETITEM,
|
|||
|
PyArray_ITER_DATA,
|
|||
|
PyArray_ITER_NEXT,
|
|||
|
PyArray_IterNew,
|
|||
|
flatiter,
|
|||
|
float64_t,
|
|||
|
)
|
|||
|
|
|||
|
cnp.import_array()
|
|||
|
|
|||
|
# dateutil compat
|
|||
|
|
|||
|
from dateutil.parser import (
|
|||
|
DEFAULTPARSER,
|
|||
|
parse as du_parse,
|
|||
|
)
|
|||
|
from dateutil.relativedelta import relativedelta
|
|||
|
from dateutil.tz import (
|
|||
|
tzlocal as _dateutil_tzlocal,
|
|||
|
tzoffset,
|
|||
|
tzutc as _dateutil_tzutc,
|
|||
|
)
|
|||
|
|
|||
|
from pandas._config import get_option
|
|||
|
|
|||
|
from pandas._libs.tslibs.ccalendar cimport c_MONTH_NUMBERS
|
|||
|
from pandas._libs.tslibs.nattype cimport (
|
|||
|
c_NaT as NaT,
|
|||
|
c_nat_strings as nat_strings,
|
|||
|
)
|
|||
|
from pandas._libs.tslibs.np_datetime cimport (
|
|||
|
NPY_DATETIMEUNIT,
|
|||
|
npy_datetimestruct,
|
|||
|
string_to_dts,
|
|||
|
)
|
|||
|
from pandas._libs.tslibs.offsets cimport is_offset_object
|
|||
|
from pandas._libs.tslibs.util cimport (
|
|||
|
get_c_string_buf_and_size,
|
|||
|
is_array,
|
|||
|
)
|
|||
|
|
|||
|
|
|||
|
cdef extern from "../src/headers/portable.h":
|
|||
|
int getdigit_ascii(char c, int default) nogil
|
|||
|
|
|||
|
cdef extern from "../src/parser/tokenizer.h":
|
|||
|
double xstrtod(const char *p, char **q, char decimal, char sci, char tsep,
|
|||
|
int skip_trailing, int *error, int *maybe_int)
|
|||
|
|
|||
|
|
|||
|
# ----------------------------------------------------------------------
|
|||
|
# Constants
|
|||
|
|
|||
|
|
|||
|
class DateParseError(ValueError):
|
|||
|
pass
|
|||
|
|
|||
|
|
|||
|
_DEFAULT_DATETIME = datetime(1, 1, 1).replace(hour=0, minute=0,
|
|||
|
second=0, microsecond=0)
|
|||
|
|
|||
|
PARSING_WARNING_MSG = (
|
|||
|
"Parsing dates in {format} format when dayfirst={dayfirst} was specified. "
|
|||
|
"This may lead to inconsistently parsed dates! Specify a format "
|
|||
|
"to ensure consistent parsing."
|
|||
|
)
|
|||
|
|
|||
|
cdef:
|
|||
|
set _not_datelike_strings = {'a', 'A', 'm', 'M', 'p', 'P', 't', 'T'}
|
|||
|
|
|||
|
# ----------------------------------------------------------------------
|
|||
|
cdef:
|
|||
|
const char* delimiters = " /-."
|
|||
|
int MAX_DAYS_IN_MONTH = 31, MAX_MONTH = 12
|
|||
|
|
|||
|
|
|||
|
cdef inline bint _is_delimiter(const char ch):
|
|||
|
return strchr(delimiters, ch) != NULL
|
|||
|
|
|||
|
|
|||
|
cdef inline int _parse_1digit(const char* s):
|
|||
|
cdef int result = 0
|
|||
|
result += getdigit_ascii(s[0], -10) * 1
|
|||
|
return result
|
|||
|
|
|||
|
|
|||
|
cdef inline int _parse_2digit(const char* s):
|
|||
|
cdef int result = 0
|
|||
|
result += getdigit_ascii(s[0], -10) * 10
|
|||
|
result += getdigit_ascii(s[1], -100) * 1
|
|||
|
return result
|
|||
|
|
|||
|
|
|||
|
cdef inline int _parse_4digit(const char* s):
|
|||
|
cdef int result = 0
|
|||
|
result += getdigit_ascii(s[0], -10) * 1000
|
|||
|
result += getdigit_ascii(s[1], -100) * 100
|
|||
|
result += getdigit_ascii(s[2], -1000) * 10
|
|||
|
result += getdigit_ascii(s[3], -10000) * 1
|
|||
|
return result
|
|||
|
|
|||
|
|
|||
|
cdef inline object _parse_delimited_date(str date_string, bint dayfirst):
|
|||
|
"""
|
|||
|
Parse special cases of dates: MM/DD/YYYY, DD/MM/YYYY, MM/YYYY.
|
|||
|
|
|||
|
At the beginning function tries to parse date in MM/DD/YYYY format, but
|
|||
|
if month > 12 - in DD/MM/YYYY (`dayfirst == False`).
|
|||
|
With `dayfirst == True` function makes an attempt to parse date in
|
|||
|
DD/MM/YYYY, if an attempt is wrong - in DD/MM/YYYY
|
|||
|
|
|||
|
For MM/DD/YYYY, DD/MM/YYYY: delimiter can be a space or one of /-.
|
|||
|
For MM/YYYY: delimiter can be a space or one of /-
|
|||
|
If `date_string` can't be converted to date, then function returns
|
|||
|
None, None
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
date_string : str
|
|||
|
dayfirst : bool
|
|||
|
|
|||
|
Returns:
|
|||
|
--------
|
|||
|
datetime or None
|
|||
|
str or None
|
|||
|
Describing resolution of the parsed string.
|
|||
|
"""
|
|||
|
cdef:
|
|||
|
const char* buf
|
|||
|
Py_ssize_t length
|
|||
|
int day = 1, month = 1, year
|
|||
|
bint can_swap = 0
|
|||
|
|
|||
|
buf = get_c_string_buf_and_size(date_string, &length)
|
|||
|
if length == 10 and _is_delimiter(buf[2]) and _is_delimiter(buf[5]):
|
|||
|
# parsing MM?DD?YYYY and DD?MM?YYYY dates
|
|||
|
month = _parse_2digit(buf)
|
|||
|
day = _parse_2digit(buf + 3)
|
|||
|
year = _parse_4digit(buf + 6)
|
|||
|
reso = 'day'
|
|||
|
can_swap = 1
|
|||
|
elif length == 9 and _is_delimiter(buf[1]) and _is_delimiter(buf[4]):
|
|||
|
# parsing M?DD?YYYY and D?MM?YYYY dates
|
|||
|
month = _parse_1digit(buf)
|
|||
|
day = _parse_2digit(buf + 2)
|
|||
|
year = _parse_4digit(buf + 5)
|
|||
|
reso = 'day'
|
|||
|
can_swap = 1
|
|||
|
elif length == 9 and _is_delimiter(buf[2]) and _is_delimiter(buf[4]):
|
|||
|
# parsing MM?D?YYYY and DD?M?YYYY dates
|
|||
|
month = _parse_2digit(buf)
|
|||
|
day = _parse_1digit(buf + 3)
|
|||
|
year = _parse_4digit(buf + 5)
|
|||
|
reso = 'day'
|
|||
|
can_swap = 1
|
|||
|
elif length == 8 and _is_delimiter(buf[1]) and _is_delimiter(buf[3]):
|
|||
|
# parsing M?D?YYYY and D?M?YYYY dates
|
|||
|
month = _parse_1digit(buf)
|
|||
|
day = _parse_1digit(buf + 2)
|
|||
|
year = _parse_4digit(buf + 4)
|
|||
|
reso = 'day'
|
|||
|
can_swap = 1
|
|||
|
elif length == 7 and _is_delimiter(buf[2]):
|
|||
|
# parsing MM?YYYY dates
|
|||
|
if buf[2] == b'.':
|
|||
|
# we cannot reliably tell whether e.g. 10.2010 is a float
|
|||
|
# or a date, thus we refuse to parse it here
|
|||
|
return None, None
|
|||
|
month = _parse_2digit(buf)
|
|||
|
year = _parse_4digit(buf + 3)
|
|||
|
reso = 'month'
|
|||
|
else:
|
|||
|
return None, None
|
|||
|
|
|||
|
if month < 0 or day < 0 or year < 1000:
|
|||
|
# some part is not an integer, so
|
|||
|
# date_string can't be converted to date, above format
|
|||
|
return None, None
|
|||
|
|
|||
|
swapped_day_and_month = False
|
|||
|
if 1 <= month <= MAX_DAYS_IN_MONTH and 1 <= day <= MAX_DAYS_IN_MONTH \
|
|||
|
and (month <= MAX_MONTH or day <= MAX_MONTH):
|
|||
|
if (month > MAX_MONTH or (day <= MAX_MONTH and dayfirst)) and can_swap:
|
|||
|
day, month = month, day
|
|||
|
swapped_day_and_month = True
|
|||
|
if dayfirst and not swapped_day_and_month:
|
|||
|
warnings.warn(
|
|||
|
PARSING_WARNING_MSG.format(
|
|||
|
format='MM/DD/YYYY',
|
|||
|
dayfirst='True',
|
|||
|
),
|
|||
|
stacklevel=find_stack_level(),
|
|||
|
)
|
|||
|
elif not dayfirst and swapped_day_and_month:
|
|||
|
warnings.warn(
|
|||
|
PARSING_WARNING_MSG.format(
|
|||
|
format='DD/MM/YYYY',
|
|||
|
dayfirst='False (the default)',
|
|||
|
),
|
|||
|
stacklevel=find_stack_level(),
|
|||
|
)
|
|||
|
# In Python <= 3.6.0 there is no range checking for invalid dates
|
|||
|
# in C api, thus we call faster C version for 3.6.1 or newer
|
|||
|
return datetime_new(year, month, day, 0, 0, 0, 0, None), reso
|
|||
|
|
|||
|
raise DateParseError(f"Invalid date specified ({month}/{day})")
|
|||
|
|
|||
|
|
|||
|
cdef inline bint does_string_look_like_time(str parse_string):
|
|||
|
"""
|
|||
|
Checks whether given string is a time: it has to start either from
|
|||
|
H:MM or from HH:MM, and hour and minute values must be valid.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
parse_string : str
|
|||
|
|
|||
|
Returns:
|
|||
|
--------
|
|||
|
bool
|
|||
|
Whether given string is potentially a time.
|
|||
|
"""
|
|||
|
cdef:
|
|||
|
const char* buf
|
|||
|
Py_ssize_t length
|
|||
|
int hour = -1, minute = -1
|
|||
|
|
|||
|
buf = get_c_string_buf_and_size(parse_string, &length)
|
|||
|
if length >= 4:
|
|||
|
if buf[1] == b':':
|
|||
|
# h:MM format
|
|||
|
hour = getdigit_ascii(buf[0], -1)
|
|||
|
minute = _parse_2digit(buf + 2)
|
|||
|
elif buf[2] == b':':
|
|||
|
# HH:MM format
|
|||
|
hour = _parse_2digit(buf)
|
|||
|
minute = _parse_2digit(buf + 3)
|
|||
|
|
|||
|
return 0 <= hour <= 23 and 0 <= minute <= 59
|
|||
|
|
|||
|
|
|||
|
def parse_datetime_string(
|
|||
|
# NB: This will break with np.str_ (GH#32264) even though
|
|||
|
# isinstance(npstrobj, str) evaluates to True, so caller must ensure
|
|||
|
# the argument is *exactly* 'str'
|
|||
|
str date_string,
|
|||
|
bint dayfirst=False,
|
|||
|
bint yearfirst=False,
|
|||
|
**kwargs,
|
|||
|
) -> datetime:
|
|||
|
"""
|
|||
|
Parse datetime string, only returns datetime.
|
|||
|
Also cares special handling matching time patterns.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
datetime
|
|||
|
"""
|
|||
|
|
|||
|
cdef:
|
|||
|
datetime dt
|
|||
|
|
|||
|
if not _does_string_look_like_datetime(date_string):
|
|||
|
raise ValueError(f'Given date string {date_string} not likely a datetime')
|
|||
|
|
|||
|
if does_string_look_like_time(date_string):
|
|||
|
# use current datetime as default, not pass _DEFAULT_DATETIME
|
|||
|
dt = du_parse(date_string, dayfirst=dayfirst,
|
|||
|
yearfirst=yearfirst, **kwargs)
|
|||
|
return dt
|
|||
|
|
|||
|
dt, _ = _parse_delimited_date(date_string, dayfirst)
|
|||
|
if dt is not None:
|
|||
|
return dt
|
|||
|
|
|||
|
# Handling special case strings today & now
|
|||
|
if date_string == "now":
|
|||
|
dt = datetime.now()
|
|||
|
return dt
|
|||
|
elif date_string == "today":
|
|||
|
dt = datetime.today()
|
|||
|
return dt
|
|||
|
|
|||
|
try:
|
|||
|
dt, _ = _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq=None)
|
|||
|
return dt
|
|||
|
except DateParseError:
|
|||
|
raise
|
|||
|
except ValueError:
|
|||
|
pass
|
|||
|
|
|||
|
try:
|
|||
|
dt = du_parse(date_string, default=_DEFAULT_DATETIME,
|
|||
|
dayfirst=dayfirst, yearfirst=yearfirst, **kwargs)
|
|||
|
except TypeError:
|
|||
|
# following may be raised from dateutil
|
|||
|
# TypeError: 'NoneType' object is not iterable
|
|||
|
raise ValueError(f'Given date string {date_string} not likely a datetime')
|
|||
|
|
|||
|
return dt
|
|||
|
|
|||
|
|
|||
|
def parse_time_string(arg, freq=None, dayfirst=None, yearfirst=None):
|
|||
|
"""
|
|||
|
Try hard to parse datetime string, leveraging dateutil plus some extra
|
|||
|
goodies like quarter recognition.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
arg : str
|
|||
|
freq : str or DateOffset, default None
|
|||
|
Helps with interpreting time string if supplied
|
|||
|
dayfirst : bool, default None
|
|||
|
If None uses default from print_config
|
|||
|
yearfirst : bool, default None
|
|||
|
If None uses default from print_config
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
datetime
|
|||
|
str
|
|||
|
Describing resolution of parsed string.
|
|||
|
"""
|
|||
|
if type(arg) is not str:
|
|||
|
# GH#45580 np.str_ satisfies isinstance(obj, str) but if we annotate
|
|||
|
# arg as "str" this raises here
|
|||
|
if not isinstance(arg, np.str_):
|
|||
|
raise TypeError(
|
|||
|
"Argument 'arg' has incorrect type "
|
|||
|
f"(expected str, got {type(arg).__name__})"
|
|||
|
)
|
|||
|
arg = str(arg)
|
|||
|
|
|||
|
if is_offset_object(freq):
|
|||
|
freq = freq.rule_code
|
|||
|
|
|||
|
if dayfirst is None:
|
|||
|
dayfirst = get_option("display.date_dayfirst")
|
|||
|
if yearfirst is None:
|
|||
|
yearfirst = get_option("display.date_yearfirst")
|
|||
|
|
|||
|
res = parse_datetime_string_with_reso(arg, freq=freq,
|
|||
|
dayfirst=dayfirst,
|
|||
|
yearfirst=yearfirst)
|
|||
|
return res
|
|||
|
|
|||
|
|
|||
|
cdef parse_datetime_string_with_reso(
|
|||
|
str date_string, str freq=None, bint dayfirst=False, bint yearfirst=False,
|
|||
|
):
|
|||
|
"""
|
|||
|
Parse datetime string and try to identify its resolution.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
datetime
|
|||
|
str
|
|||
|
Inferred resolution of the parsed string.
|
|||
|
|
|||
|
Raises
|
|||
|
------
|
|||
|
ValueError : preliminary check suggests string is not datetime
|
|||
|
DateParseError : error within dateutil
|
|||
|
"""
|
|||
|
cdef:
|
|||
|
object parsed, reso
|
|||
|
bint string_to_dts_failed
|
|||
|
npy_datetimestruct dts
|
|||
|
NPY_DATETIMEUNIT out_bestunit
|
|||
|
int out_local
|
|||
|
int out_tzoffset
|
|||
|
|
|||
|
if not _does_string_look_like_datetime(date_string):
|
|||
|
raise ValueError(f'Given date string {date_string} not likely a datetime')
|
|||
|
|
|||
|
parsed, reso = _parse_delimited_date(date_string, dayfirst)
|
|||
|
if parsed is not None:
|
|||
|
return parsed, reso
|
|||
|
|
|||
|
# Try iso8601 first, as it handles nanoseconds
|
|||
|
# TODO: does this render some/all of parse_delimited_date redundant?
|
|||
|
string_to_dts_failed = string_to_dts(
|
|||
|
date_string, &dts, &out_bestunit, &out_local,
|
|||
|
&out_tzoffset, False
|
|||
|
)
|
|||
|
if not string_to_dts_failed:
|
|||
|
if dts.ps != 0 or out_local:
|
|||
|
# TODO: the not-out_local case we could do without Timestamp;
|
|||
|
# avoid circular import
|
|||
|
from pandas import Timestamp
|
|||
|
parsed = Timestamp(date_string)
|
|||
|
else:
|
|||
|
parsed = datetime(dts.year, dts.month, dts.day, dts.hour, dts.min, dts.sec, dts.us)
|
|||
|
reso = {
|
|||
|
NPY_DATETIMEUNIT.NPY_FR_Y: "year",
|
|||
|
NPY_DATETIMEUNIT.NPY_FR_M: "month",
|
|||
|
NPY_DATETIMEUNIT.NPY_FR_D: "day",
|
|||
|
NPY_DATETIMEUNIT.NPY_FR_h: "hour",
|
|||
|
NPY_DATETIMEUNIT.NPY_FR_m: "minute",
|
|||
|
NPY_DATETIMEUNIT.NPY_FR_s: "second",
|
|||
|
NPY_DATETIMEUNIT.NPY_FR_ms: "millisecond",
|
|||
|
NPY_DATETIMEUNIT.NPY_FR_us: "microsecond",
|
|||
|
NPY_DATETIMEUNIT.NPY_FR_ns: "nanosecond",
|
|||
|
}[out_bestunit]
|
|||
|
return parsed, reso
|
|||
|
|
|||
|
try:
|
|||
|
return _parse_dateabbr_string(date_string, _DEFAULT_DATETIME, freq)
|
|||
|
except DateParseError:
|
|||
|
raise
|
|||
|
except ValueError:
|
|||
|
pass
|
|||
|
|
|||
|
try:
|
|||
|
parsed, reso = dateutil_parse(date_string, _DEFAULT_DATETIME,
|
|||
|
dayfirst=dayfirst, yearfirst=yearfirst,
|
|||
|
ignoretz=False)
|
|||
|
except (ValueError, OverflowError) as err:
|
|||
|
# TODO: allow raise of errors within instead
|
|||
|
raise DateParseError(err)
|
|||
|
if parsed is None:
|
|||
|
raise DateParseError(f"Could not parse {date_string}")
|
|||
|
return parsed, reso
|
|||
|
|
|||
|
|
|||
|
cpdef bint _does_string_look_like_datetime(str py_string):
|
|||
|
"""
|
|||
|
Checks whether given string is a datetime: it has to start with '0' or
|
|||
|
be greater than 1000.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
py_string: str
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
bool
|
|||
|
Whether given string is potentially a datetime.
|
|||
|
"""
|
|||
|
cdef:
|
|||
|
const char *buf
|
|||
|
char *endptr = NULL
|
|||
|
Py_ssize_t length = -1
|
|||
|
double converted_date
|
|||
|
char first
|
|||
|
int error = 0
|
|||
|
|
|||
|
buf = get_c_string_buf_and_size(py_string, &length)
|
|||
|
if length >= 1:
|
|||
|
first = buf[0]
|
|||
|
if first == b'0':
|
|||
|
# Strings starting with 0 are more consistent with a
|
|||
|
# date-like string than a number
|
|||
|
return True
|
|||
|
elif py_string in _not_datelike_strings:
|
|||
|
return False
|
|||
|
else:
|
|||
|
# xstrtod with such parameters copies behavior of python `float`
|
|||
|
# cast; for example, " 35.e-1 " is valid string for this cast so,
|
|||
|
# for correctly xstrtod call necessary to pass these params:
|
|||
|
# b'.' - a dot is used as separator, b'e' - an exponential form of
|
|||
|
# a float number can be used, b'\0' - not to use a thousand
|
|||
|
# separator, 1 - skip extra spaces before and after,
|
|||
|
converted_date = xstrtod(buf, &endptr,
|
|||
|
b'.', b'e', b'\0', 1, &error, NULL)
|
|||
|
# if there were no errors and the whole line was parsed, then ...
|
|||
|
if error == 0 and endptr == buf + length:
|
|||
|
return converted_date >= 1000
|
|||
|
|
|||
|
return True
|
|||
|
|
|||
|
|
|||
|
cdef inline object _parse_dateabbr_string(object date_string, datetime default,
|
|||
|
str freq=None):
|
|||
|
cdef:
|
|||
|
object ret
|
|||
|
# year initialized to prevent compiler warnings
|
|||
|
int year = -1, quarter = -1, month, mnum
|
|||
|
Py_ssize_t date_len
|
|||
|
|
|||
|
# special handling for possibilities eg, 2Q2005, 2Q05, 2005Q1, 05Q1
|
|||
|
assert isinstance(date_string, str)
|
|||
|
|
|||
|
if date_string in nat_strings:
|
|||
|
return NaT, ''
|
|||
|
|
|||
|
date_string = date_string.upper()
|
|||
|
date_len = len(date_string)
|
|||
|
|
|||
|
if date_len == 4:
|
|||
|
# parse year only like 2000
|
|||
|
try:
|
|||
|
ret = default.replace(year=int(date_string))
|
|||
|
return ret, 'year'
|
|||
|
except ValueError:
|
|||
|
pass
|
|||
|
|
|||
|
try:
|
|||
|
if 4 <= date_len <= 7:
|
|||
|
i = date_string.index('Q', 1, 6)
|
|||
|
if i == 1:
|
|||
|
quarter = int(date_string[0])
|
|||
|
if date_len == 4 or (date_len == 5
|
|||
|
and date_string[i + 1] == '-'):
|
|||
|
# r'(\d)Q-?(\d\d)')
|
|||
|
year = 2000 + int(date_string[-2:])
|
|||
|
elif date_len == 6 or (date_len == 7
|
|||
|
and date_string[i + 1] == '-'):
|
|||
|
# r'(\d)Q-?(\d\d\d\d)')
|
|||
|
year = int(date_string[-4:])
|
|||
|
else:
|
|||
|
raise ValueError
|
|||
|
elif i == 2 or i == 3:
|
|||
|
# r'(\d\d)-?Q(\d)'
|
|||
|
if date_len == 4 or (date_len == 5
|
|||
|
and date_string[i - 1] == '-'):
|
|||
|
quarter = int(date_string[-1])
|
|||
|
year = 2000 + int(date_string[:2])
|
|||
|
else:
|
|||
|
raise ValueError
|
|||
|
elif i == 4 or i == 5:
|
|||
|
if date_len == 6 or (date_len == 7
|
|||
|
and date_string[i - 1] == '-'):
|
|||
|
# r'(\d\d\d\d)-?Q(\d)'
|
|||
|
quarter = int(date_string[-1])
|
|||
|
year = int(date_string[:4])
|
|||
|
else:
|
|||
|
raise ValueError
|
|||
|
|
|||
|
if not (1 <= quarter <= 4):
|
|||
|
raise DateParseError(f'Incorrect quarterly string is given, '
|
|||
|
f'quarter must be '
|
|||
|
f'between 1 and 4: {date_string}')
|
|||
|
|
|||
|
try:
|
|||
|
# GH#1228
|
|||
|
year, month = quarter_to_myear(year, quarter, freq)
|
|||
|
except KeyError:
|
|||
|
raise DateParseError("Unable to retrieve month "
|
|||
|
"information from given "
|
|||
|
f"freq: {freq}")
|
|||
|
|
|||
|
ret = default.replace(year=year, month=month)
|
|||
|
return ret, 'quarter'
|
|||
|
|
|||
|
except DateParseError:
|
|||
|
raise
|
|||
|
except ValueError:
|
|||
|
pass
|
|||
|
|
|||
|
if date_len == 6 and freq == 'M':
|
|||
|
year = int(date_string[:4])
|
|||
|
month = int(date_string[4:6])
|
|||
|
try:
|
|||
|
ret = default.replace(year=year, month=month)
|
|||
|
return ret, 'month'
|
|||
|
except ValueError:
|
|||
|
pass
|
|||
|
|
|||
|
for pat in ['%Y-%m', '%b %Y', '%b-%Y']:
|
|||
|
try:
|
|||
|
ret = datetime.strptime(date_string, pat)
|
|||
|
return ret, 'month'
|
|||
|
except ValueError:
|
|||
|
pass
|
|||
|
|
|||
|
raise ValueError(f'Unable to parse {date_string}')
|
|||
|
|
|||
|
|
|||
|
cpdef quarter_to_myear(int year, int quarter, str freq):
|
|||
|
"""
|
|||
|
A quarterly frequency defines a "year" which may not coincide with
|
|||
|
the calendar-year. Find the calendar-year and calendar-month associated
|
|||
|
with the given year and quarter under the `freq`-derived calendar.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
year : int
|
|||
|
quarter : int
|
|||
|
freq : str or None
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
year : int
|
|||
|
month : int
|
|||
|
|
|||
|
See Also
|
|||
|
--------
|
|||
|
Period.qyear
|
|||
|
"""
|
|||
|
if quarter <= 0 or quarter > 4:
|
|||
|
raise ValueError("Quarter must be 1 <= q <= 4")
|
|||
|
|
|||
|
if freq is not None:
|
|||
|
mnum = c_MONTH_NUMBERS[get_rule_month(freq)] + 1
|
|||
|
month = (mnum + (quarter - 1) * 3) % 12 + 1
|
|||
|
if month > mnum:
|
|||
|
year -= 1
|
|||
|
else:
|
|||
|
month = (quarter - 1) * 3 + 1
|
|||
|
|
|||
|
return year, month
|
|||
|
|
|||
|
|
|||
|
cdef dateutil_parse(
|
|||
|
str timestr,
|
|||
|
object default,
|
|||
|
bint ignoretz=False,
|
|||
|
bint dayfirst=False,
|
|||
|
bint yearfirst=False,
|
|||
|
):
|
|||
|
""" lifted from dateutil to get resolution"""
|
|||
|
|
|||
|
cdef:
|
|||
|
str attr
|
|||
|
datetime ret
|
|||
|
object res
|
|||
|
object reso = None
|
|||
|
dict repl = {}
|
|||
|
|
|||
|
res, _ = DEFAULTPARSER._parse(timestr, dayfirst=dayfirst, yearfirst=yearfirst)
|
|||
|
|
|||
|
if res is None:
|
|||
|
raise ValueError(f"Unknown datetime string format, unable to parse: {timestr}")
|
|||
|
|
|||
|
for attr in ["year", "month", "day", "hour",
|
|||
|
"minute", "second", "microsecond"]:
|
|||
|
value = getattr(res, attr)
|
|||
|
if value is not None:
|
|||
|
repl[attr] = value
|
|||
|
reso = attr
|
|||
|
|
|||
|
if reso is None:
|
|||
|
raise ValueError(f"Unable to parse datetime string: {timestr}")
|
|||
|
|
|||
|
if reso == 'microsecond':
|
|||
|
if repl['microsecond'] == 0:
|
|||
|
reso = 'second'
|
|||
|
elif repl['microsecond'] % 1000 == 0:
|
|||
|
reso = 'millisecond'
|
|||
|
|
|||
|
ret = default.replace(**repl)
|
|||
|
if res.weekday is not None and not res.day:
|
|||
|
ret = ret + relativedelta.relativedelta(weekday=res.weekday)
|
|||
|
if not ignoretz:
|
|||
|
if res.tzname and res.tzname in time.tzname:
|
|||
|
ret = ret.replace(tzinfo=_dateutil_tzlocal())
|
|||
|
elif res.tzoffset == 0:
|
|||
|
ret = ret.replace(tzinfo=_dateutil_tzutc())
|
|||
|
elif res.tzoffset:
|
|||
|
ret = ret.replace(tzinfo=tzoffset(res.tzname, res.tzoffset))
|
|||
|
return ret, reso
|
|||
|
|
|||
|
|
|||
|
# ----------------------------------------------------------------------
|
|||
|
# Parsing for type-inference
|
|||
|
|
|||
|
|
|||
|
def try_parse_dates(
|
|||
|
object[:] values, parser=None, bint dayfirst=False, default=None,
|
|||
|
) -> np.ndarray:
|
|||
|
cdef:
|
|||
|
Py_ssize_t i, n
|
|||
|
object[::1] result
|
|||
|
|
|||
|
n = len(values)
|
|||
|
result = np.empty(n, dtype='O')
|
|||
|
|
|||
|
if parser is None:
|
|||
|
if default is None: # GH2618
|
|||
|
date = datetime.now()
|
|||
|
default = datetime(date.year, date.month, 1)
|
|||
|
|
|||
|
parse_date = lambda x: du_parse(x, dayfirst=dayfirst, default=default)
|
|||
|
|
|||
|
# EAFP here
|
|||
|
try:
|
|||
|
for i in range(n):
|
|||
|
if values[i] == '':
|
|||
|
result[i] = np.nan
|
|||
|
else:
|
|||
|
result[i] = parse_date(values[i])
|
|||
|
except Exception:
|
|||
|
# Since parser is user-defined, we can't guess what it might raise
|
|||
|
return values
|
|||
|
else:
|
|||
|
parse_date = parser
|
|||
|
|
|||
|
for i in range(n):
|
|||
|
if values[i] == '':
|
|||
|
result[i] = np.nan
|
|||
|
else:
|
|||
|
result[i] = parse_date(values[i])
|
|||
|
|
|||
|
return result.base # .base to access underlying ndarray
|
|||
|
|
|||
|
|
|||
|
def try_parse_date_and_time(
|
|||
|
object[:] dates,
|
|||
|
object[:] times,
|
|||
|
date_parser=None,
|
|||
|
time_parser=None,
|
|||
|
bint dayfirst=False,
|
|||
|
default=None,
|
|||
|
) -> np.ndarray:
|
|||
|
cdef:
|
|||
|
Py_ssize_t i, n
|
|||
|
object[::1] result
|
|||
|
|
|||
|
n = len(dates)
|
|||
|
# TODO(cython3): Use len instead of `shape[0]`
|
|||
|
if times.shape[0] != n:
|
|||
|
raise ValueError('Length of dates and times must be equal')
|
|||
|
result = np.empty(n, dtype='O')
|
|||
|
|
|||
|
if date_parser is None:
|
|||
|
if default is None: # GH2618
|
|||
|
date = datetime.now()
|
|||
|
default = datetime(date.year, date.month, 1)
|
|||
|
|
|||
|
parse_date = lambda x: du_parse(x, dayfirst=dayfirst, default=default)
|
|||
|
|
|||
|
else:
|
|||
|
parse_date = date_parser
|
|||
|
|
|||
|
if time_parser is None:
|
|||
|
parse_time = lambda x: du_parse(x)
|
|||
|
|
|||
|
else:
|
|||
|
parse_time = time_parser
|
|||
|
|
|||
|
for i in range(n):
|
|||
|
d = parse_date(str(dates[i]))
|
|||
|
t = parse_time(str(times[i]))
|
|||
|
result[i] = datetime(d.year, d.month, d.day,
|
|||
|
t.hour, t.minute, t.second)
|
|||
|
|
|||
|
return result.base # .base to access underlying ndarray
|
|||
|
|
|||
|
|
|||
|
def try_parse_year_month_day(
|
|||
|
object[:] years, object[:] months, object[:] days
|
|||
|
) -> np.ndarray:
|
|||
|
cdef:
|
|||
|
Py_ssize_t i, n
|
|||
|
object[::1] result
|
|||
|
|
|||
|
n = len(years)
|
|||
|
# TODO(cython3): Use len instead of `shape[0]`
|
|||
|
if months.shape[0] != n or days.shape[0] != n:
|
|||
|
raise ValueError('Length of years/months/days must all be equal')
|
|||
|
result = np.empty(n, dtype='O')
|
|||
|
|
|||
|
for i in range(n):
|
|||
|
result[i] = datetime(int(years[i]), int(months[i]), int(days[i]))
|
|||
|
|
|||
|
return result.base # .base to access underlying ndarray
|
|||
|
|
|||
|
|
|||
|
def try_parse_datetime_components(object[:] years,
|
|||
|
object[:] months,
|
|||
|
object[:] days,
|
|||
|
object[:] hours,
|
|||
|
object[:] minutes,
|
|||
|
object[:] seconds) -> np.ndarray:
|
|||
|
|
|||
|
cdef:
|
|||
|
Py_ssize_t i, n
|
|||
|
object[::1] result
|
|||
|
int secs
|
|||
|
double float_secs
|
|||
|
double micros
|
|||
|
|
|||
|
n = len(years)
|
|||
|
# TODO(cython3): Use len instead of `shape[0]`
|
|||
|
if (
|
|||
|
months.shape[0] != n
|
|||
|
or days.shape[0] != n
|
|||
|
or hours.shape[0] != n
|
|||
|
or minutes.shape[0] != n
|
|||
|
or seconds.shape[0] != n
|
|||
|
):
|
|||
|
raise ValueError('Length of all datetime components must be equal')
|
|||
|
result = np.empty(n, dtype='O')
|
|||
|
|
|||
|
for i in range(n):
|
|||
|
float_secs = float(seconds[i])
|
|||
|
secs = int(float_secs)
|
|||
|
|
|||
|
micros = float_secs - secs
|
|||
|
if micros > 0:
|
|||
|
micros = micros * 1000000
|
|||
|
|
|||
|
result[i] = datetime(int(years[i]), int(months[i]), int(days[i]),
|
|||
|
int(hours[i]), int(minutes[i]), secs,
|
|||
|
int(micros))
|
|||
|
|
|||
|
return result.base # .base to access underlying ndarray
|
|||
|
|
|||
|
|
|||
|
# ----------------------------------------------------------------------
|
|||
|
# Miscellaneous
|
|||
|
|
|||
|
|
|||
|
# Class copied verbatim from https://github.com/dateutil/dateutil/pull/732
|
|||
|
#
|
|||
|
# We use this class to parse and tokenize date strings. However, as it is
|
|||
|
# a private class in the dateutil library, relying on backwards compatibility
|
|||
|
# is not practical. In fact, using this class issues warnings (xref gh-21322).
|
|||
|
# Thus, we port the class over so that both issues are resolved.
|
|||
|
#
|
|||
|
# Copyright (c) 2017 - dateutil contributors
|
|||
|
class _timelex:
|
|||
|
def __init__(self, instream):
|
|||
|
if getattr(instream, 'decode', None) is not None:
|
|||
|
instream = instream.decode()
|
|||
|
|
|||
|
if isinstance(instream, str):
|
|||
|
self.stream = instream
|
|||
|
elif getattr(instream, 'read', None) is None:
|
|||
|
raise TypeError(
|
|||
|
'Parser must be a string or character stream, not '
|
|||
|
f'{type(instream).__name__}')
|
|||
|
else:
|
|||
|
self.stream = instream.read()
|
|||
|
|
|||
|
def get_tokens(self):
|
|||
|
"""
|
|||
|
This function breaks the time string into lexical units (tokens), which
|
|||
|
can be parsed by the parser. Lexical units are demarcated by changes in
|
|||
|
the character set, so any continuous string of letters is considered
|
|||
|
one unit, any continuous string of numbers is considered one unit.
|
|||
|
The main complication arises from the fact that dots ('.') can be used
|
|||
|
both as separators (e.g. "Sep.20.2009") or decimal points (e.g.
|
|||
|
"4:30:21.447"). As such, it is necessary to read the full context of
|
|||
|
any dot-separated strings before breaking it into tokens; as such, this
|
|||
|
function maintains a "token stack", for when the ambiguous context
|
|||
|
demands that multiple tokens be parsed at once.
|
|||
|
"""
|
|||
|
cdef:
|
|||
|
Py_ssize_t n
|
|||
|
|
|||
|
stream = self.stream.replace('\x00', '')
|
|||
|
|
|||
|
# TODO: Change \s --> \s+ (this doesn't match existing behavior)
|
|||
|
# TODO: change the punctuation block to punc+ (does not match existing)
|
|||
|
# TODO: can we merge the two digit patterns?
|
|||
|
tokens = re.findall(r"\s|"
|
|||
|
r"(?<![\.\d])\d+\.\d+(?![\.\d])"
|
|||
|
r"|\d+"
|
|||
|
r"|[a-zA-Z]+"
|
|||
|
r"|[\./:]+"
|
|||
|
r"|[^\da-zA-Z\./:\s]+", stream)
|
|||
|
|
|||
|
# Re-combine token tuples of the form ["59", ",", "456"] because
|
|||
|
# in this context the "," is treated as a decimal
|
|||
|
# (e.g. in python's default logging format)
|
|||
|
for n, token in enumerate(tokens[:-2]):
|
|||
|
# Kludge to match ,-decimal behavior; it'd be better to do this
|
|||
|
# later in the process and have a simpler tokenization
|
|||
|
if (token is not None and token.isdigit() and
|
|||
|
tokens[n + 1] == ',' and tokens[n + 2].isdigit()):
|
|||
|
# Have to check None b/c it might be replaced during the loop
|
|||
|
# TODO: I _really_ don't faking the value here
|
|||
|
tokens[n] = token + '.' + tokens[n + 2]
|
|||
|
tokens[n + 1] = None
|
|||
|
tokens[n + 2] = None
|
|||
|
|
|||
|
tokens = [x for x in tokens if x is not None]
|
|||
|
return tokens
|
|||
|
|
|||
|
@classmethod
|
|||
|
def split(cls, s):
|
|||
|
return cls(s).get_tokens()
|
|||
|
|
|||
|
|
|||
|
_DATEUTIL_LEXER_SPLIT = _timelex.split
|
|||
|
|
|||
|
|
|||
|
def format_is_iso(f: str) -> bint:
|
|||
|
"""
|
|||
|
Does format match the iso8601 set that can be handled by the C parser?
|
|||
|
Generally of form YYYY-MM-DDTHH:MM:SS - date separator can be different
|
|||
|
but must be consistent. Leading 0s in dates and times are optional.
|
|||
|
"""
|
|||
|
iso_template = '%Y{date_sep}%m{date_sep}%d{time_sep}%H:%M:%S{micro_or_tz}'.format
|
|||
|
excluded_formats = ['%Y%m%d', '%Y%m', '%Y']
|
|||
|
|
|||
|
for date_sep in [' ', '/', '\\', '-', '.', '']:
|
|||
|
for time_sep in [' ', 'T']:
|
|||
|
for micro_or_tz in ['', '%z', '%Z', '.%f', '.%f%z', '.%f%Z']:
|
|||
|
if (iso_template(date_sep=date_sep,
|
|||
|
time_sep=time_sep,
|
|||
|
micro_or_tz=micro_or_tz,
|
|||
|
).startswith(f) and f not in excluded_formats):
|
|||
|
return True
|
|||
|
return False
|
|||
|
|
|||
|
|
|||
|
def guess_datetime_format(dt_str, bint dayfirst=False):
|
|||
|
"""
|
|||
|
Guess the datetime format of a given datetime string.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
dt_str : str
|
|||
|
Datetime string to guess the format of.
|
|||
|
dayfirst : bool, default False
|
|||
|
If True parses dates with the day first, eg 20/01/2005
|
|||
|
Warning: dayfirst=True is not strict, but will prefer to parse
|
|||
|
with day first (this is a known bug).
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
ret : datetime format string (for `strftime` or `strptime`)
|
|||
|
"""
|
|||
|
|
|||
|
if not isinstance(dt_str, str):
|
|||
|
return None
|
|||
|
|
|||
|
day_attribute_and_format = (('day',), '%d', 2)
|
|||
|
|
|||
|
# attr name, format, padding (if any)
|
|||
|
datetime_attrs_to_format = [
|
|||
|
(('year', 'month', 'day'), '%Y%m%d', 0),
|
|||
|
(('year',), '%Y', 0),
|
|||
|
(('month',), '%B', 0),
|
|||
|
(('month',), '%b', 0),
|
|||
|
(('month',), '%m', 2),
|
|||
|
day_attribute_and_format,
|
|||
|
(('hour',), '%H', 2),
|
|||
|
(('minute',), '%M', 2),
|
|||
|
(('second',), '%S', 2),
|
|||
|
(('microsecond',), '%f', 6),
|
|||
|
(('second', 'microsecond'), '%S.%f', 0),
|
|||
|
(('tzinfo',), '%z', 0),
|
|||
|
(('tzinfo',), '%Z', 0),
|
|||
|
(('day_of_week',), '%a', 0),
|
|||
|
(('day_of_week',), '%A', 0),
|
|||
|
(('meridiem',), '%p', 0),
|
|||
|
]
|
|||
|
|
|||
|
if dayfirst:
|
|||
|
datetime_attrs_to_format.remove(day_attribute_and_format)
|
|||
|
datetime_attrs_to_format.insert(0, day_attribute_and_format)
|
|||
|
|
|||
|
try:
|
|||
|
parsed_datetime = du_parse(dt_str, dayfirst=dayfirst)
|
|||
|
except (ValueError, OverflowError):
|
|||
|
# In case the datetime can't be parsed, its format cannot be guessed
|
|||
|
return None
|
|||
|
|
|||
|
if parsed_datetime is None:
|
|||
|
return None
|
|||
|
|
|||
|
# _DATEUTIL_LEXER_SPLIT from dateutil will never raise here
|
|||
|
tokens = _DATEUTIL_LEXER_SPLIT(dt_str)
|
|||
|
|
|||
|
# Normalize offset part of tokens.
|
|||
|
# There are multiple formats for the timezone offset.
|
|||
|
# To pass the comparison condition between the output of `strftime` and
|
|||
|
# joined tokens, which is carried out at the final step of the function,
|
|||
|
# the offset part of the tokens must match the '%z' format like '+0900'
|
|||
|
# instead of ‘+09:00’.
|
|||
|
if parsed_datetime.tzinfo is not None:
|
|||
|
offset_index = None
|
|||
|
if len(tokens) > 0 and tokens[-1] == 'Z':
|
|||
|
# the last 'Z' means zero offset
|
|||
|
offset_index = -1
|
|||
|
elif len(tokens) > 1 and tokens[-2] in ('+', '-'):
|
|||
|
# ex. [..., '+', '0900']
|
|||
|
offset_index = -2
|
|||
|
elif len(tokens) > 3 and tokens[-4] in ('+', '-'):
|
|||
|
# ex. [..., '+', '09', ':', '00']
|
|||
|
offset_index = -4
|
|||
|
|
|||
|
if offset_index is not None:
|
|||
|
# If the input string has a timezone offset like '+0900',
|
|||
|
# the offset is separated into two tokens, ex. ['+', '0900’].
|
|||
|
# This separation will prevent subsequent processing
|
|||
|
# from correctly parsing the time zone format.
|
|||
|
# So in addition to the format nomalization, we rejoin them here.
|
|||
|
tokens[offset_index] = parsed_datetime.strftime("%z")
|
|||
|
tokens = tokens[:offset_index + 1 or None]
|
|||
|
|
|||
|
format_guess = [None] * len(tokens)
|
|||
|
found_attrs = set()
|
|||
|
|
|||
|
for attrs, attr_format, padding in datetime_attrs_to_format:
|
|||
|
# If a given attribute has been placed in the format string, skip
|
|||
|
# over other formats for that same underlying attribute (IE, month
|
|||
|
# can be represented in multiple different ways)
|
|||
|
if set(attrs) & found_attrs:
|
|||
|
continue
|
|||
|
|
|||
|
if parsed_datetime.tzinfo is None and attr_format in ("%Z", "%z"):
|
|||
|
continue
|
|||
|
|
|||
|
parsed_formatted = parsed_datetime.strftime(attr_format)
|
|||
|
for i, token_format in enumerate(format_guess):
|
|||
|
token_filled = tokens[i].zfill(padding)
|
|||
|
if token_format is None and token_filled == parsed_formatted:
|
|||
|
format_guess[i] = attr_format
|
|||
|
tokens[i] = token_filled
|
|||
|
found_attrs.update(attrs)
|
|||
|
break
|
|||
|
|
|||
|
# Only consider it a valid guess if we have a year, month and day
|
|||
|
if len({'year', 'month', 'day'} & found_attrs) != 3:
|
|||
|
return None
|
|||
|
|
|||
|
output_format = []
|
|||
|
for i, guess in enumerate(format_guess):
|
|||
|
if guess is not None:
|
|||
|
# Either fill in the format placeholder (like %Y)
|
|||
|
output_format.append(guess)
|
|||
|
else:
|
|||
|
# Or just the token separate (IE, the dashes in "01-01-2013")
|
|||
|
try:
|
|||
|
# If the token is numeric, then we likely didn't parse it
|
|||
|
# properly, so our guess is wrong
|
|||
|
float(tokens[i])
|
|||
|
return None
|
|||
|
except ValueError:
|
|||
|
pass
|
|||
|
|
|||
|
output_format.append(tokens[i])
|
|||
|
|
|||
|
guessed_format = ''.join(output_format)
|
|||
|
|
|||
|
# rebuild string, capturing any inferred padding
|
|||
|
dt_str = ''.join(tokens)
|
|||
|
if parsed_datetime.strftime(guessed_format) == dt_str:
|
|||
|
return guessed_format
|
|||
|
else:
|
|||
|
return None
|
|||
|
|
|||
|
|
|||
|
@cython.wraparound(False)
|
|||
|
@cython.boundscheck(False)
|
|||
|
cdef inline object convert_to_unicode(object item, bint keep_trivial_numbers):
|
|||
|
"""
|
|||
|
Convert `item` to str.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
item : object
|
|||
|
keep_trivial_numbers : bool
|
|||
|
if True, then conversion (to string from integer/float zero)
|
|||
|
is not performed
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
str or int or float
|
|||
|
"""
|
|||
|
cdef:
|
|||
|
float64_t float_item
|
|||
|
|
|||
|
if keep_trivial_numbers:
|
|||
|
if isinstance(item, int):
|
|||
|
if <int>item == 0:
|
|||
|
return item
|
|||
|
elif isinstance(item, float):
|
|||
|
float_item = item
|
|||
|
if float_item == 0.0 or float_item != float_item:
|
|||
|
return item
|
|||
|
|
|||
|
if not isinstance(item, str):
|
|||
|
item = PyObject_Str(item)
|
|||
|
|
|||
|
return item
|
|||
|
|
|||
|
|
|||
|
@cython.wraparound(False)
|
|||
|
@cython.boundscheck(False)
|
|||
|
def concat_date_cols(tuple date_cols, bint keep_trivial_numbers=True) -> np.ndarray:
|
|||
|
"""
|
|||
|
Concatenates elements from numpy arrays in `date_cols` into strings.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
date_cols : tuple[ndarray]
|
|||
|
keep_trivial_numbers : bool, default True
|
|||
|
if True and len(date_cols) == 1, then
|
|||
|
conversion (to string from integer/float zero) is not performed
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
arr_of_rows : ndarray[object]
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
>>> dates=np.array(['3/31/2019', '4/31/2019'], dtype=object)
|
|||
|
>>> times=np.array(['11:20', '10:45'], dtype=object)
|
|||
|
>>> result = concat_date_cols((dates, times))
|
|||
|
>>> result
|
|||
|
array(['3/31/2019 11:20', '4/31/2019 10:45'], dtype=object)
|
|||
|
"""
|
|||
|
cdef:
|
|||
|
Py_ssize_t rows_count = 0, col_count = len(date_cols)
|
|||
|
Py_ssize_t col_idx, row_idx
|
|||
|
list list_to_join
|
|||
|
cnp.ndarray[object] iters
|
|||
|
object[::1] iters_view
|
|||
|
flatiter it
|
|||
|
cnp.ndarray[object] result
|
|||
|
object[::1] result_view
|
|||
|
|
|||
|
if col_count == 0:
|
|||
|
return np.zeros(0, dtype=object)
|
|||
|
|
|||
|
if not all(is_array(array) for array in date_cols):
|
|||
|
raise ValueError("not all elements from date_cols are numpy arrays")
|
|||
|
|
|||
|
rows_count = min(len(array) for array in date_cols)
|
|||
|
result = np.zeros(rows_count, dtype=object)
|
|||
|
result_view = result
|
|||
|
|
|||
|
if col_count == 1:
|
|||
|
array = date_cols[0]
|
|||
|
it = <flatiter>PyArray_IterNew(array)
|
|||
|
for row_idx in range(rows_count):
|
|||
|
item = PyArray_GETITEM(array, PyArray_ITER_DATA(it))
|
|||
|
result_view[row_idx] = convert_to_unicode(item,
|
|||
|
keep_trivial_numbers)
|
|||
|
PyArray_ITER_NEXT(it)
|
|||
|
else:
|
|||
|
# create fixed size list - more efficient memory allocation
|
|||
|
list_to_join = [None] * col_count
|
|||
|
iters = np.zeros(col_count, dtype=object)
|
|||
|
|
|||
|
# create memoryview of iters ndarray, that will contain some
|
|||
|
# flatiter's for each array in `date_cols` - more efficient indexing
|
|||
|
iters_view = iters
|
|||
|
for col_idx, array in enumerate(date_cols):
|
|||
|
iters_view[col_idx] = PyArray_IterNew(array)
|
|||
|
|
|||
|
# array elements that are on the same line are converted to one string
|
|||
|
for row_idx in range(rows_count):
|
|||
|
for col_idx, array in enumerate(date_cols):
|
|||
|
# this cast is needed, because we did not find a way
|
|||
|
# to efficiently store `flatiter` type objects in ndarray
|
|||
|
it = <flatiter>iters_view[col_idx]
|
|||
|
item = PyArray_GETITEM(array, PyArray_ITER_DATA(it))
|
|||
|
list_to_join[col_idx] = convert_to_unicode(item, False)
|
|||
|
PyArray_ITER_NEXT(it)
|
|||
|
result_view[row_idx] = " ".join(list_to_join)
|
|||
|
|
|||
|
return result
|
|||
|
|
|||
|
|
|||
|
cpdef str get_rule_month(str source):
|
|||
|
"""
|
|||
|
Return starting month of given freq, default is December.
|
|||
|
|
|||
|
Parameters
|
|||
|
----------
|
|||
|
source : str
|
|||
|
Derived from `freq.rule_code` or `freq.freqstr`.
|
|||
|
|
|||
|
Returns
|
|||
|
-------
|
|||
|
rule_month: str
|
|||
|
|
|||
|
Examples
|
|||
|
--------
|
|||
|
>>> get_rule_month('D')
|
|||
|
'DEC'
|
|||
|
|
|||
|
>>> get_rule_month('A-JAN')
|
|||
|
'JAN'
|
|||
|
"""
|
|||
|
source = source.upper()
|
|||
|
if "-" not in source:
|
|||
|
return "DEC"
|
|||
|
else:
|
|||
|
return source.split("-")[1]
|