3148 lines
90 KiB
Cython
3148 lines
90 KiB
Cython
from collections import abc
|
|
from decimal import Decimal
|
|
from enum import Enum
|
|
from typing import (
|
|
Literal,
|
|
_GenericAlias,
|
|
)
|
|
import warnings
|
|
|
|
cimport cython
|
|
from cpython.datetime cimport (
|
|
PyDate_Check,
|
|
PyDateTime_Check,
|
|
PyDelta_Check,
|
|
PyTime_Check,
|
|
import_datetime,
|
|
)
|
|
from cpython.iterator cimport PyIter_Check
|
|
from cpython.number cimport PyNumber_Check
|
|
from cpython.object cimport (
|
|
Py_EQ,
|
|
PyObject_RichCompareBool,
|
|
PyTypeObject,
|
|
)
|
|
from cpython.ref cimport Py_INCREF
|
|
from cpython.sequence cimport PySequence_Check
|
|
from cpython.tuple cimport (
|
|
PyTuple_New,
|
|
PyTuple_SET_ITEM,
|
|
)
|
|
from cython cimport (
|
|
Py_ssize_t,
|
|
floating,
|
|
)
|
|
|
|
from pandas.util._exceptions import find_stack_level
|
|
|
|
import_datetime()
|
|
|
|
import numpy as np
|
|
|
|
cimport numpy as cnp
|
|
from numpy cimport (
|
|
NPY_OBJECT,
|
|
PyArray_Check,
|
|
PyArray_GETITEM,
|
|
PyArray_ITER_DATA,
|
|
PyArray_ITER_NEXT,
|
|
PyArray_IterNew,
|
|
complex128_t,
|
|
flatiter,
|
|
float32_t,
|
|
float64_t,
|
|
int64_t,
|
|
intp_t,
|
|
ndarray,
|
|
uint8_t,
|
|
uint64_t,
|
|
)
|
|
|
|
cnp.import_array()
|
|
|
|
cdef extern from "Python.h":
|
|
# Note: importing extern-style allows us to declare these as nogil
|
|
# functions, whereas `from cpython cimport` does not.
|
|
bint PyObject_TypeCheck(object obj, PyTypeObject* type) nogil
|
|
|
|
cdef extern from "numpy/arrayobject.h":
|
|
# cython's numpy.dtype specification is incorrect, which leads to
|
|
# errors in issubclass(self.dtype.type, np.bool_), so we directly
|
|
# include the correct version
|
|
# https://github.com/cython/cython/issues/2022
|
|
|
|
ctypedef class numpy.dtype [object PyArray_Descr]:
|
|
# Use PyDataType_* macros when possible, however there are no macros
|
|
# for accessing some of the fields, so some are defined. Please
|
|
# ask on cython-dev if you need more.
|
|
cdef:
|
|
int type_num
|
|
int itemsize "elsize"
|
|
char byteorder
|
|
object fields
|
|
tuple names
|
|
|
|
PyTypeObject PySignedIntegerArrType_Type
|
|
PyTypeObject PyUnsignedIntegerArrType_Type
|
|
|
|
cdef extern from "numpy/ndarrayobject.h":
|
|
bint PyArray_CheckScalar(obj) nogil
|
|
|
|
|
|
cdef extern from "src/parse_helper.h":
|
|
int floatify(object, float64_t *result, int *maybe_int) except -1
|
|
|
|
from pandas._libs cimport util
|
|
from pandas._libs.util cimport (
|
|
INT64_MAX,
|
|
INT64_MIN,
|
|
UINT64_MAX,
|
|
is_nan,
|
|
)
|
|
|
|
from pandas._libs.tslib import array_to_datetime
|
|
from pandas._libs.tslibs import (
|
|
OutOfBoundsDatetime,
|
|
OutOfBoundsTimedelta,
|
|
)
|
|
from pandas._libs.tslibs.period import Period
|
|
|
|
from pandas._libs.missing cimport (
|
|
C_NA,
|
|
checknull,
|
|
is_matching_na,
|
|
is_null_datetime64,
|
|
is_null_timedelta64,
|
|
)
|
|
from pandas._libs.tslibs.conversion cimport convert_to_tsobject
|
|
from pandas._libs.tslibs.nattype cimport (
|
|
NPY_NAT,
|
|
c_NaT as NaT,
|
|
checknull_with_nat,
|
|
)
|
|
from pandas._libs.tslibs.offsets cimport is_offset_object
|
|
from pandas._libs.tslibs.period cimport is_period_object
|
|
from pandas._libs.tslibs.timedeltas cimport convert_to_timedelta64
|
|
from pandas._libs.tslibs.timezones cimport tz_compare
|
|
|
|
# constants that will be compared to potentially arbitrarily large
|
|
# python int
|
|
cdef:
|
|
object oINT64_MAX = <int64_t>INT64_MAX
|
|
object oINT64_MIN = <int64_t>INT64_MIN
|
|
object oUINT64_MAX = <uint64_t>UINT64_MAX
|
|
|
|
float64_t NaN = <float64_t>np.NaN
|
|
|
|
# python-visible
|
|
i8max = <int64_t>INT64_MAX
|
|
u8max = <uint64_t>UINT64_MAX
|
|
|
|
|
|
@cython.wraparound(False)
|
|
@cython.boundscheck(False)
|
|
def memory_usage_of_objects(arr: object[:]) -> int64_t:
|
|
"""
|
|
Return the memory usage of an object array in bytes.
|
|
|
|
Does not include the actual bytes of the pointers
|
|
"""
|
|
i: Py_ssize_t
|
|
n: Py_ssize_t
|
|
size: int64_t
|
|
|
|
size = 0
|
|
n = len(arr)
|
|
for i in range(n):
|
|
size += arr[i].__sizeof__()
|
|
return size
|
|
|
|
|
|
# ----------------------------------------------------------------------
|
|
|
|
|
|
def is_scalar(val: object) -> bool:
|
|
"""
|
|
Return True if given object is scalar.
|
|
|
|
Parameters
|
|
----------
|
|
val : object
|
|
This includes:
|
|
|
|
- numpy array scalar (e.g. np.int64)
|
|
- Python builtin numerics
|
|
- Python builtin byte arrays and strings
|
|
- None
|
|
- datetime.datetime
|
|
- datetime.timedelta
|
|
- Period
|
|
- decimal.Decimal
|
|
- Interval
|
|
- DateOffset
|
|
- Fraction
|
|
- Number.
|
|
|
|
Returns
|
|
-------
|
|
bool
|
|
Return True if given object is scalar.
|
|
|
|
Examples
|
|
--------
|
|
>>> import datetime
|
|
>>> dt = datetime.datetime(2018, 10, 3)
|
|
>>> pd.api.types.is_scalar(dt)
|
|
True
|
|
|
|
>>> pd.api.types.is_scalar([2, 3])
|
|
False
|
|
|
|
>>> pd.api.types.is_scalar({0: 1, 2: 3})
|
|
False
|
|
|
|
>>> pd.api.types.is_scalar((0, 2))
|
|
False
|
|
|
|
pandas supports PEP 3141 numbers:
|
|
|
|
>>> from fractions import Fraction
|
|
>>> pd.api.types.is_scalar(Fraction(3, 5))
|
|
True
|
|
"""
|
|
|
|
# Start with C-optimized checks
|
|
if (cnp.PyArray_IsAnyScalar(val)
|
|
# PyArray_IsAnyScalar is always False for bytearrays on Py3
|
|
or PyDate_Check(val)
|
|
or PyDelta_Check(val)
|
|
or PyTime_Check(val)
|
|
# We differ from numpy, which claims that None is not scalar;
|
|
# see np.isscalar
|
|
or val is C_NA
|
|
or val is None):
|
|
return True
|
|
|
|
# Next use C-optimized checks to exclude common non-scalars before falling
|
|
# back to non-optimized checks.
|
|
if PySequence_Check(val):
|
|
# e.g. list, tuple
|
|
# includes np.ndarray, Series which PyNumber_Check can return True for
|
|
return False
|
|
|
|
# Note: PyNumber_Check check includes Decimal, Fraction, numbers.Number
|
|
return (PyNumber_Check(val)
|
|
or is_period_object(val)
|
|
or is_interval(val)
|
|
or is_offset_object(val))
|
|
|
|
|
|
cdef inline int64_t get_itemsize(object val):
|
|
"""
|
|
Get the itemsize of a NumPy scalar, -1 if not a NumPy scalar.
|
|
|
|
Parameters
|
|
----------
|
|
val : object
|
|
|
|
Returns
|
|
-------
|
|
is_ndarray : bool
|
|
"""
|
|
if PyArray_CheckScalar(val):
|
|
return cnp.PyArray_DescrFromScalar(val).itemsize
|
|
else:
|
|
return -1
|
|
|
|
|
|
def is_iterator(obj: object) -> bool:
|
|
"""
|
|
Check if the object is an iterator.
|
|
|
|
This is intended for generators, not list-like objects.
|
|
|
|
Parameters
|
|
----------
|
|
obj : The object to check
|
|
|
|
Returns
|
|
-------
|
|
is_iter : bool
|
|
Whether `obj` is an iterator.
|
|
|
|
Examples
|
|
--------
|
|
>>> import datetime
|
|
>>> is_iterator((x for x in []))
|
|
True
|
|
>>> is_iterator([1, 2, 3])
|
|
False
|
|
>>> is_iterator(datetime.datetime(2017, 1, 1))
|
|
False
|
|
>>> is_iterator("foo")
|
|
False
|
|
>>> is_iterator(1)
|
|
False
|
|
"""
|
|
return PyIter_Check(obj)
|
|
|
|
|
|
def item_from_zerodim(val: object) -> object:
|
|
"""
|
|
If the value is a zerodim array, return the item it contains.
|
|
|
|
Parameters
|
|
----------
|
|
val : object
|
|
|
|
Returns
|
|
-------
|
|
object
|
|
|
|
Examples
|
|
--------
|
|
>>> item_from_zerodim(1)
|
|
1
|
|
>>> item_from_zerodim('foobar')
|
|
'foobar'
|
|
>>> item_from_zerodim(np.array(1))
|
|
1
|
|
>>> item_from_zerodim(np.array([1]))
|
|
array([1])
|
|
"""
|
|
if cnp.PyArray_IsZeroDim(val):
|
|
return cnp.PyArray_ToScalar(cnp.PyArray_DATA(val), val)
|
|
return val
|
|
|
|
|
|
@cython.wraparound(False)
|
|
@cython.boundscheck(False)
|
|
def fast_unique_multiple(list arrays, sort: bool = True):
|
|
"""
|
|
Generate a list of unique values from a list of arrays.
|
|
|
|
Parameters
|
|
----------
|
|
list : array-like
|
|
List of array-like objects.
|
|
sort : bool
|
|
Whether or not to sort the resulting unique list.
|
|
|
|
Returns
|
|
-------
|
|
list of unique values
|
|
"""
|
|
cdef:
|
|
ndarray[object] buf
|
|
Py_ssize_t k = len(arrays)
|
|
Py_ssize_t i, j, n
|
|
list uniques = []
|
|
dict table = {}
|
|
object val, stub = 0
|
|
|
|
for i in range(k):
|
|
buf = arrays[i]
|
|
n = len(buf)
|
|
for j in range(n):
|
|
val = buf[j]
|
|
if val not in table:
|
|
table[val] = stub
|
|
uniques.append(val)
|
|
|
|
if sort is None:
|
|
try:
|
|
uniques.sort()
|
|
except TypeError:
|
|
warnings.warn(
|
|
"The values in the array are unorderable. "
|
|
"Pass `sort=False` to suppress this warning.",
|
|
RuntimeWarning,
|
|
stacklevel=find_stack_level(),
|
|
)
|
|
pass
|
|
|
|
return uniques
|
|
|
|
|
|
@cython.wraparound(False)
|
|
@cython.boundscheck(False)
|
|
def fast_unique_multiple_list(lists: list, sort: bool | None = True) -> list:
|
|
cdef:
|
|
list buf
|
|
Py_ssize_t k = len(lists)
|
|
Py_ssize_t i, j, n
|
|
list uniques = []
|
|
dict table = {}
|
|
object val, stub = 0
|
|
|
|
for i in range(k):
|
|
buf = lists[i]
|
|
n = len(buf)
|
|
for j in range(n):
|
|
val = buf[j]
|
|
if val not in table:
|
|
table[val] = stub
|
|
uniques.append(val)
|
|
if sort:
|
|
try:
|
|
uniques.sort()
|
|
except TypeError:
|
|
pass
|
|
|
|
return uniques
|
|
|
|
|
|
@cython.wraparound(False)
|
|
@cython.boundscheck(False)
|
|
def fast_unique_multiple_list_gen(object gen, bint sort=True) -> list:
|
|
"""
|
|
Generate a list of unique values from a generator of lists.
|
|
|
|
Parameters
|
|
----------
|
|
gen : generator object
|
|
Generator of lists from which the unique list is created.
|
|
sort : bool
|
|
Whether or not to sort the resulting unique list.
|
|
|
|
Returns
|
|
-------
|
|
list of unique values
|
|
"""
|
|
cdef:
|
|
list buf
|
|
Py_ssize_t j, n
|
|
list uniques = []
|
|
dict table = {}
|
|
object val, stub = 0
|
|
|
|
for buf in gen:
|
|
n = len(buf)
|
|
for j in range(n):
|
|
val = buf[j]
|
|
if val not in table:
|
|
table[val] = stub
|
|
uniques.append(val)
|
|
if sort:
|
|
try:
|
|
uniques.sort()
|
|
except TypeError:
|
|
pass
|
|
|
|
return uniques
|
|
|
|
|
|
@cython.wraparound(False)
|
|
@cython.boundscheck(False)
|
|
def dicts_to_array(dicts: list, columns: list):
|
|
cdef:
|
|
Py_ssize_t i, j, k, n
|
|
ndarray[object, ndim=2] result
|
|
dict row
|
|
object col, onan = np.nan
|
|
|
|
k = len(columns)
|
|
n = len(dicts)
|
|
|
|
result = np.empty((n, k), dtype='O')
|
|
|
|
for i in range(n):
|
|
row = dicts[i]
|
|
for j in range(k):
|
|
col = columns[j]
|
|
if col in row:
|
|
result[i, j] = row[col]
|
|
else:
|
|
result[i, j] = onan
|
|
|
|
return result
|
|
|
|
|
|
def fast_zip(list ndarrays) -> ndarray[object]:
|
|
"""
|
|
For zipping multiple ndarrays into an ndarray of tuples.
|
|
"""
|
|
cdef:
|
|
Py_ssize_t i, j, k, n
|
|
ndarray[object, ndim=1] result
|
|
flatiter it
|
|
object val, tup
|
|
|
|
k = len(ndarrays)
|
|
n = len(ndarrays[0])
|
|
|
|
result = np.empty(n, dtype=object)
|
|
|
|
# initialize tuples on first pass
|
|
arr = ndarrays[0]
|
|
it = <flatiter>PyArray_IterNew(arr)
|
|
for i in range(n):
|
|
val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it))
|
|
tup = PyTuple_New(k)
|
|
|
|
PyTuple_SET_ITEM(tup, 0, val)
|
|
Py_INCREF(val)
|
|
result[i] = tup
|
|
PyArray_ITER_NEXT(it)
|
|
|
|
for j in range(1, k):
|
|
arr = ndarrays[j]
|
|
it = <flatiter>PyArray_IterNew(arr)
|
|
if len(arr) != n:
|
|
raise ValueError("all arrays must be same length")
|
|
|
|
for i in range(n):
|
|
val = PyArray_GETITEM(arr, PyArray_ITER_DATA(it))
|
|
PyTuple_SET_ITEM(result[i], j, val)
|
|
Py_INCREF(val)
|
|
PyArray_ITER_NEXT(it)
|
|
|
|
return result
|
|
|
|
|
|
def get_reverse_indexer(const intp_t[:] indexer, Py_ssize_t length) -> ndarray:
|
|
"""
|
|
Reverse indexing operation.
|
|
|
|
Given `indexer`, make `indexer_inv` of it, such that::
|
|
|
|
indexer_inv[indexer[x]] = x
|
|
|
|
Parameters
|
|
----------
|
|
indexer : np.ndarray[np.intp]
|
|
length : int
|
|
|
|
Returns
|
|
-------
|
|
np.ndarray[np.intp]
|
|
|
|
Notes
|
|
-----
|
|
If indexer is not unique, only first occurrence is accounted.
|
|
"""
|
|
cdef:
|
|
Py_ssize_t i, n = len(indexer)
|
|
ndarray[intp_t, ndim=1] rev_indexer
|
|
intp_t idx
|
|
|
|
rev_indexer = np.empty(length, dtype=np.intp)
|
|
rev_indexer[:] = -1
|
|
for i in range(n):
|
|
idx = indexer[i]
|
|
if idx != -1:
|
|
rev_indexer[idx] = i
|
|
|
|
return rev_indexer
|
|
|
|
|
|
@cython.wraparound(False)
|
|
@cython.boundscheck(False)
|
|
# Can add const once https://github.com/cython/cython/issues/1772 resolved
|
|
def has_infs(floating[:] arr) -> bool:
|
|
cdef:
|
|
Py_ssize_t i, n = len(arr)
|
|
floating inf, neginf, val
|
|
bint ret = False
|
|
|
|
inf = np.inf
|
|
neginf = -inf
|
|
with nogil:
|
|
for i in range(n):
|
|
val = arr[i]
|
|
if val == inf or val == neginf:
|
|
ret = True
|
|
break
|
|
return ret
|
|
|
|
|
|
def maybe_indices_to_slice(ndarray[intp_t, ndim=1] indices, int max_len):
|
|
cdef:
|
|
Py_ssize_t i, n = len(indices)
|
|
intp_t k, vstart, vlast, v
|
|
|
|
if n == 0:
|
|
return slice(0, 0)
|
|
|
|
vstart = indices[0]
|
|
if vstart < 0 or max_len <= vstart:
|
|
return indices
|
|
|
|
if n == 1:
|
|
return slice(vstart, <intp_t>(vstart + 1))
|
|
|
|
vlast = indices[n - 1]
|
|
if vlast < 0 or max_len <= vlast:
|
|
return indices
|
|
|
|
k = indices[1] - indices[0]
|
|
if k == 0:
|
|
return indices
|
|
else:
|
|
for i in range(2, n):
|
|
v = indices[i]
|
|
if v - indices[i - 1] != k:
|
|
return indices
|
|
|
|
if k > 0:
|
|
return slice(vstart, <intp_t>(vlast + 1), k)
|
|
else:
|
|
if vlast == 0:
|
|
return slice(vstart, None, k)
|
|
else:
|
|
return slice(vstart, <intp_t>(vlast - 1), k)
|
|
|
|
|
|
@cython.wraparound(False)
|
|
@cython.boundscheck(False)
|
|
def maybe_booleans_to_slice(ndarray[uint8_t, ndim=1] mask):
|
|
cdef:
|
|
Py_ssize_t i, n = len(mask)
|
|
Py_ssize_t start = 0, end = 0
|
|
bint started = False, finished = False
|
|
|
|
for i in range(n):
|
|
if mask[i]:
|
|
if finished:
|
|
return mask.view(np.bool_)
|
|
if not started:
|
|
started = True
|
|
start = i
|
|
else:
|
|
if finished:
|
|
continue
|
|
|
|
if started:
|
|
end = i
|
|
finished = True
|
|
|
|
if not started:
|
|
return slice(0, 0)
|
|
if not finished:
|
|
return slice(start, None)
|
|
else:
|
|
return slice(start, end)
|
|
|
|
|
|
@cython.wraparound(False)
|
|
@cython.boundscheck(False)
|
|
def array_equivalent_object(left: object[:], right: object[:]) -> bool:
|
|
"""
|
|
Perform an element by element comparison on 1-d object arrays
|
|
taking into account nan positions.
|
|
"""
|
|
cdef:
|
|
Py_ssize_t i, n = left.shape[0]
|
|
object x, y
|
|
|
|
for i in range(n):
|
|
x = left[i]
|
|
y = right[i]
|
|
|
|
# we are either not equal or both nan
|
|
# I think None == None will be true here
|
|
try:
|
|
if PyArray_Check(x) and PyArray_Check(y):
|
|
if not array_equivalent_object(x, y):
|
|
return False
|
|
elif (x is C_NA) ^ (y is C_NA):
|
|
return False
|
|
elif not (
|
|
PyObject_RichCompareBool(x, y, Py_EQ)
|
|
or is_matching_na(x, y, nan_matches_none=True)
|
|
):
|
|
return False
|
|
except ValueError:
|
|
# Avoid raising ValueError when comparing Numpy arrays to other types
|
|
if cnp.PyArray_IsAnyScalar(x) != cnp.PyArray_IsAnyScalar(y):
|
|
# Only compare scalars to scalars and non-scalars to non-scalars
|
|
return False
|
|
elif (not (cnp.PyArray_IsPythonScalar(x) or cnp.PyArray_IsPythonScalar(y))
|
|
and not (isinstance(x, type(y)) or isinstance(y, type(x)))):
|
|
# Check if non-scalars have the same type
|
|
return False
|
|
raise
|
|
return True
|
|
|
|
|
|
ctypedef fused ndarr_object:
|
|
ndarray[object, ndim=1]
|
|
ndarray[object, ndim=2]
|
|
|
|
# TODO: get rid of this in StringArray and modify
|
|
# and go through ensure_string_array instead
|
|
@cython.wraparound(False)
|
|
@cython.boundscheck(False)
|
|
def convert_nans_to_NA(ndarr_object arr) -> ndarray:
|
|
"""
|
|
Helper for StringArray that converts null values that
|
|
are not pd.NA(e.g. np.nan, None) to pd.NA. Assumes elements
|
|
have already been validated as null.
|
|
"""
|
|
cdef:
|
|
Py_ssize_t i, m, n
|
|
object val
|
|
ndarr_object result
|
|
result = np.asarray(arr, dtype="object")
|
|
if arr.ndim == 2:
|
|
m, n = arr.shape[0], arr.shape[1]
|
|
for i in range(m):
|
|
for j in range(n):
|
|
val = arr[i, j]
|
|
if not isinstance(val, str):
|
|
result[i, j] = <object>C_NA
|
|
else:
|
|
n = len(arr)
|
|
for i in range(n):
|
|
val = arr[i]
|
|
if not isinstance(val, str):
|
|
result[i] = <object>C_NA
|
|
return result
|
|
|
|
|
|
@cython.wraparound(False)
|
|
@cython.boundscheck(False)
|
|
cpdef ndarray[object] ensure_string_array(
|
|
arr,
|
|
object na_value=np.nan,
|
|
bint convert_na_value=True,
|
|
bint copy=True,
|
|
bint skipna=True,
|
|
):
|
|
"""
|
|
Returns a new numpy array with object dtype and only strings and na values.
|
|
|
|
Parameters
|
|
----------
|
|
arr : array-like
|
|
The values to be converted to str, if needed.
|
|
na_value : Any, default np.nan
|
|
The value to use for na. For example, np.nan or pd.NA.
|
|
convert_na_value : bool, default True
|
|
If False, existing na values will be used unchanged in the new array.
|
|
copy : bool, default True
|
|
Whether to ensure that a new array is returned.
|
|
skipna : bool, default True
|
|
Whether or not to coerce nulls to their stringified form
|
|
(e.g. if False, NaN becomes 'nan').
|
|
|
|
Returns
|
|
-------
|
|
np.ndarray[object]
|
|
An array with the input array's elements casted to str or nan-like.
|
|
"""
|
|
cdef:
|
|
Py_ssize_t i = 0, n = len(arr)
|
|
|
|
if hasattr(arr, "to_numpy"):
|
|
|
|
if hasattr(arr, "dtype") and arr.dtype.kind in ["m", "M"]:
|
|
# dtype check to exclude DataFrame
|
|
# GH#41409 TODO: not a great place for this
|
|
out = arr.astype(str).astype(object)
|
|
out[arr.isna()] = na_value
|
|
return out
|
|
|
|
arr = arr.to_numpy()
|
|
elif not util.is_array(arr):
|
|
arr = np.array(arr, dtype="object")
|
|
|
|
result = np.asarray(arr, dtype="object")
|
|
|
|
if copy and result is arr:
|
|
result = result.copy()
|
|
|
|
for i in range(n):
|
|
val = arr[i]
|
|
|
|
if isinstance(val, str):
|
|
continue
|
|
|
|
if not checknull(val):
|
|
if not util.is_float_object(val):
|
|
# f"{val}" is faster than str(val)
|
|
result[i] = f"{val}"
|
|
else:
|
|
# f"{val}" is not always equivalent to str(val) for floats
|
|
result[i] = str(val)
|
|
else:
|
|
if convert_na_value:
|
|
val = na_value
|
|
if skipna:
|
|
result[i] = val
|
|
else:
|
|
result[i] = f"{val}"
|
|
|
|
return result
|
|
|
|
|
|
def is_all_arraylike(obj: list) -> bool:
|
|
"""
|
|
Should we treat these as levels of a MultiIndex, as opposed to Index items?
|
|
"""
|
|
cdef:
|
|
Py_ssize_t i, n = len(obj)
|
|
object val
|
|
bint all_arrays = True
|
|
|
|
for i in range(n):
|
|
val = obj[i]
|
|
if not (isinstance(val, list) or
|
|
util.is_array(val) or hasattr(val, '_data')):
|
|
# TODO: EA?
|
|
# exclude tuples, frozensets as they may be contained in an Index
|
|
all_arrays = False
|
|
break
|
|
|
|
return all_arrays
|
|
|
|
|
|
# ------------------------------------------------------------------------------
|
|
# Groupby-related functions
|
|
|
|
# TODO: could do even better if we know something about the data. eg, index has
|
|
# 1-min data, binner has 5-min data, then bins are just strides in index. This
|
|
# is a general, O(max(len(values), len(binner))) method.
|
|
@cython.boundscheck(False)
|
|
@cython.wraparound(False)
|
|
def generate_bins_dt64(ndarray[int64_t, ndim=1] values, const int64_t[:] binner,
|
|
object closed='left', bint hasnans=False):
|
|
"""
|
|
Int64 (datetime64) version of generic python version in ``groupby.py``.
|
|
"""
|
|
cdef:
|
|
Py_ssize_t lenidx, lenbin, i, j, bc, vc
|
|
ndarray[int64_t, ndim=1] bins
|
|
int64_t l_bin, r_bin, nat_count
|
|
bint right_closed = closed == 'right'
|
|
|
|
nat_count = 0
|
|
if hasnans:
|
|
mask = values == NPY_NAT
|
|
nat_count = np.sum(mask)
|
|
values = values[~mask]
|
|
|
|
lenidx = len(values)
|
|
lenbin = len(binner)
|
|
|
|
if lenidx <= 0 or lenbin <= 0:
|
|
raise ValueError("Invalid length for values or for binner")
|
|
|
|
# check binner fits data
|
|
if values[0] < binner[0]:
|
|
raise ValueError("Values falls before first bin")
|
|
|
|
if values[lenidx - 1] > binner[lenbin - 1]:
|
|
raise ValueError("Values falls after last bin")
|
|
|
|
bins = np.empty(lenbin - 1, dtype=np.int64)
|
|
|
|
j = 0 # index into values
|
|
bc = 0 # bin count
|
|
|
|
# linear scan
|
|
if right_closed:
|
|
for i in range(0, lenbin - 1):
|
|
r_bin = binner[i + 1]
|
|
# count values in current bin, advance to next bin
|
|
while j < lenidx and values[j] <= r_bin:
|
|
j += 1
|
|
bins[bc] = j
|
|
bc += 1
|
|
else:
|
|
for i in range(0, lenbin - 1):
|
|
r_bin = binner[i + 1]
|
|
# count values in current bin, advance to next bin
|
|
while j < lenidx and values[j] < r_bin:
|
|
j += 1
|
|
bins[bc] = j
|
|
bc += 1
|
|
|
|
if nat_count > 0:
|
|
# shift bins by the number of NaT
|
|
bins = bins + nat_count
|
|
bins = np.insert(bins, 0, nat_count)
|
|
|
|
return bins
|
|
|
|
|
|
@cython.boundscheck(False)
|
|
@cython.wraparound(False)
|
|
def get_level_sorter(
|
|
ndarray[int64_t, ndim=1] codes, const intp_t[:] starts
|
|
) -> ndarray:
|
|
"""
|
|
Argsort for a single level of a multi-index, keeping the order of higher
|
|
levels unchanged. `starts` points to starts of same-key indices w.r.t
|
|
to leading levels; equivalent to:
|
|
np.hstack([codes[starts[i]:starts[i+1]].argsort(kind='mergesort')
|
|
+ starts[i] for i in range(len(starts) - 1)])
|
|
|
|
Parameters
|
|
----------
|
|
codes : np.ndarray[int64_t, ndim=1]
|
|
starts : np.ndarray[intp, ndim=1]
|
|
|
|
Returns
|
|
-------
|
|
np.ndarray[np.int, ndim=1]
|
|
"""
|
|
cdef:
|
|
Py_ssize_t i, l, r
|
|
ndarray[intp_t, ndim=1] out = cnp.PyArray_EMPTY(1, codes.shape, cnp.NPY_INTP, 0)
|
|
|
|
for i in range(len(starts) - 1):
|
|
l, r = starts[i], starts[i + 1]
|
|
out[l:r] = l + codes[l:r].argsort(kind='mergesort')
|
|
|
|
return out
|
|
|
|
|
|
@cython.boundscheck(False)
|
|
@cython.wraparound(False)
|
|
def count_level_2d(ndarray[uint8_t, ndim=2, cast=True] mask,
|
|
const intp_t[:] labels,
|
|
Py_ssize_t max_bin,
|
|
int axis):
|
|
cdef:
|
|
Py_ssize_t i, j, k, n
|
|
ndarray[int64_t, ndim=2] counts
|
|
|
|
assert (axis == 0 or axis == 1)
|
|
n, k = (<object>mask).shape
|
|
|
|
if axis == 0:
|
|
counts = np.zeros((max_bin, k), dtype='i8')
|
|
with nogil:
|
|
for i in range(n):
|
|
for j in range(k):
|
|
if mask[i, j]:
|
|
counts[labels[i], j] += 1
|
|
|
|
else: # axis == 1
|
|
counts = np.zeros((n, max_bin), dtype='i8')
|
|
with nogil:
|
|
for i in range(n):
|
|
for j in range(k):
|
|
if mask[i, j]:
|
|
counts[i, labels[j]] += 1
|
|
|
|
return counts
|
|
|
|
|
|
@cython.wraparound(False)
|
|
@cython.boundscheck(False)
|
|
def generate_slices(const intp_t[:] labels, Py_ssize_t ngroups):
|
|
cdef:
|
|
Py_ssize_t i, group_size, n, start
|
|
intp_t lab
|
|
int64_t[::1] starts, ends
|
|
|
|
n = len(labels)
|
|
|
|
starts = np.zeros(ngroups, dtype=np.int64)
|
|
ends = np.zeros(ngroups, dtype=np.int64)
|
|
|
|
start = 0
|
|
group_size = 0
|
|
with nogil:
|
|
for i in range(n):
|
|
lab = labels[i]
|
|
if lab < 0:
|
|
start += 1
|
|
else:
|
|
group_size += 1
|
|
if i == n - 1 or lab != labels[i + 1]:
|
|
starts[lab] = start
|
|
ends[lab] = start + group_size
|
|
start += group_size
|
|
group_size = 0
|
|
|
|
return np.asarray(starts), np.asarray(ends)
|
|
|
|
|
|
def indices_fast(ndarray[intp_t, ndim=1] index, const int64_t[:] labels, list keys,
|
|
list sorted_labels) -> dict:
|
|
"""
|
|
Parameters
|
|
----------
|
|
index : ndarray[intp]
|
|
labels : ndarray[int64]
|
|
keys : list
|
|
sorted_labels : list[ndarray[int64]]
|
|
"""
|
|
cdef:
|
|
Py_ssize_t i, j, k, lab, cur, start, n = len(labels)
|
|
dict result = {}
|
|
object tup
|
|
|
|
k = len(keys)
|
|
|
|
# Start at the first non-null entry
|
|
j = 0
|
|
for j in range(0, n):
|
|
if labels[j] != -1:
|
|
break
|
|
else:
|
|
return result
|
|
cur = labels[j]
|
|
start = j
|
|
|
|
for i in range(j+1, n):
|
|
lab = labels[i]
|
|
|
|
if lab != cur:
|
|
if lab != -1:
|
|
if k == 1:
|
|
# When k = 1 we do not want to return a tuple as key
|
|
tup = keys[0][sorted_labels[0][i - 1]]
|
|
else:
|
|
tup = PyTuple_New(k)
|
|
for j in range(k):
|
|
val = keys[j][sorted_labels[j][i - 1]]
|
|
PyTuple_SET_ITEM(tup, j, val)
|
|
Py_INCREF(val)
|
|
result[tup] = index[start:i]
|
|
start = i
|
|
cur = lab
|
|
|
|
if k == 1:
|
|
# When k = 1 we do not want to return a tuple as key
|
|
tup = keys[0][sorted_labels[0][n - 1]]
|
|
else:
|
|
tup = PyTuple_New(k)
|
|
for j in range(k):
|
|
val = keys[j][sorted_labels[j][n - 1]]
|
|
PyTuple_SET_ITEM(tup, j, val)
|
|
Py_INCREF(val)
|
|
result[tup] = index[start:]
|
|
|
|
return result
|
|
|
|
|
|
# core.common import for fast inference checks
|
|
|
|
def is_float(obj: object) -> bool:
|
|
"""
|
|
Return True if given object is float.
|
|
|
|
Returns
|
|
-------
|
|
bool
|
|
"""
|
|
return util.is_float_object(obj)
|
|
|
|
|
|
def is_integer(obj: object) -> bool:
|
|
"""
|
|
Return True if given object is integer.
|
|
|
|
Returns
|
|
-------
|
|
bool
|
|
"""
|
|
return util.is_integer_object(obj)
|
|
|
|
|
|
def is_bool(obj: object) -> bool:
|
|
"""
|
|
Return True if given object is boolean.
|
|
|
|
Returns
|
|
-------
|
|
bool
|
|
"""
|
|
return util.is_bool_object(obj)
|
|
|
|
|
|
def is_complex(obj: object) -> bool:
|
|
"""
|
|
Return True if given object is complex.
|
|
|
|
Returns
|
|
-------
|
|
bool
|
|
"""
|
|
return util.is_complex_object(obj)
|
|
|
|
|
|
cpdef bint is_decimal(object obj):
|
|
return isinstance(obj, Decimal)
|
|
|
|
|
|
cpdef bint is_interval(object obj):
|
|
return getattr(obj, '_typ', '_typ') == 'interval'
|
|
|
|
|
|
def is_period(val: object) -> bool:
|
|
"""
|
|
Return True if given object is Period.
|
|
|
|
Returns
|
|
-------
|
|
bool
|
|
"""
|
|
return is_period_object(val)
|
|
|
|
|
|
def is_list_like(obj: object, allow_sets: bool = True) -> bool:
|
|
"""
|
|
Check if the object is list-like.
|
|
|
|
Objects that are considered list-like are for example Python
|
|
lists, tuples, sets, NumPy arrays, and Pandas Series.
|
|
|
|
Strings and datetime objects, however, are not considered list-like.
|
|
|
|
Parameters
|
|
----------
|
|
obj : object
|
|
Object to check.
|
|
allow_sets : bool, default True
|
|
If this parameter is False, sets will not be considered list-like.
|
|
|
|
Returns
|
|
-------
|
|
bool
|
|
Whether `obj` has list-like properties.
|
|
|
|
Examples
|
|
--------
|
|
>>> import datetime
|
|
>>> is_list_like([1, 2, 3])
|
|
True
|
|
>>> is_list_like({1, 2, 3})
|
|
True
|
|
>>> is_list_like(datetime.datetime(2017, 1, 1))
|
|
False
|
|
>>> is_list_like("foo")
|
|
False
|
|
>>> is_list_like(1)
|
|
False
|
|
>>> is_list_like(np.array([2]))
|
|
True
|
|
>>> is_list_like(np.array(2))
|
|
False
|
|
"""
|
|
return c_is_list_like(obj, allow_sets)
|
|
|
|
|
|
cdef inline bint c_is_list_like(object obj, bint allow_sets) except -1:
|
|
# first, performance short-cuts for the most common cases
|
|
if util.is_array(obj):
|
|
# exclude zero-dimensional numpy arrays, effectively scalars
|
|
return not cnp.PyArray_IsZeroDim(obj)
|
|
elif isinstance(obj, list):
|
|
return True
|
|
# then the generic implementation
|
|
return (
|
|
# equiv: `isinstance(obj, abc.Iterable)`
|
|
getattr(obj, "__iter__", None) is not None and not isinstance(obj, type)
|
|
# we do not count strings/unicode/bytes as list-like
|
|
# exclude Generic types that have __iter__
|
|
and not isinstance(obj, (str, bytes, _GenericAlias))
|
|
# exclude zero-dimensional duck-arrays, effectively scalars
|
|
and not (hasattr(obj, "ndim") and obj.ndim == 0)
|
|
# exclude sets if allow_sets is False
|
|
and not (allow_sets is False and isinstance(obj, abc.Set))
|
|
)
|
|
|
|
|
|
_TYPE_MAP = {
|
|
"categorical": "categorical",
|
|
"category": "categorical",
|
|
"int8": "integer",
|
|
"int16": "integer",
|
|
"int32": "integer",
|
|
"int64": "integer",
|
|
"i": "integer",
|
|
"uint8": "integer",
|
|
"uint16": "integer",
|
|
"uint32": "integer",
|
|
"uint64": "integer",
|
|
"u": "integer",
|
|
"float32": "floating",
|
|
"float64": "floating",
|
|
"f": "floating",
|
|
"complex64": "complex",
|
|
"complex128": "complex",
|
|
"c": "complex",
|
|
"string": "string",
|
|
str: "string",
|
|
"S": "bytes",
|
|
"U": "string",
|
|
"bool": "boolean",
|
|
"b": "boolean",
|
|
"datetime64[ns]": "datetime64",
|
|
"M": "datetime64",
|
|
"timedelta64[ns]": "timedelta64",
|
|
"m": "timedelta64",
|
|
"interval": "interval",
|
|
Period: "period",
|
|
}
|
|
|
|
# types only exist on certain platform
|
|
try:
|
|
np.float128
|
|
_TYPE_MAP['float128'] = 'floating'
|
|
except AttributeError:
|
|
pass
|
|
try:
|
|
np.complex256
|
|
_TYPE_MAP['complex256'] = 'complex'
|
|
except AttributeError:
|
|
pass
|
|
try:
|
|
np.float16
|
|
_TYPE_MAP['float16'] = 'floating'
|
|
except AttributeError:
|
|
pass
|
|
|
|
|
|
@cython.internal
|
|
cdef class Seen:
|
|
"""
|
|
Class for keeping track of the types of elements
|
|
encountered when trying to perform type conversions.
|
|
"""
|
|
|
|
cdef:
|
|
bint int_ # seen_int
|
|
bint nat_ # seen nat
|
|
bint bool_ # seen_bool
|
|
bint null_ # seen_null
|
|
bint nan_ # seen_np.nan
|
|
bint uint_ # seen_uint (unsigned integer)
|
|
bint sint_ # seen_sint (signed integer)
|
|
bint float_ # seen_float
|
|
bint object_ # seen_object
|
|
bint complex_ # seen_complex
|
|
bint datetime_ # seen_datetime
|
|
bint coerce_numeric # coerce data to numeric
|
|
bint timedelta_ # seen_timedelta
|
|
bint datetimetz_ # seen_datetimetz
|
|
bint period_ # seen_period
|
|
bint interval_ # seen_interval
|
|
|
|
def __cinit__(self, bint coerce_numeric=False):
|
|
"""
|
|
Initialize a Seen instance.
|
|
|
|
Parameters
|
|
----------
|
|
coerce_numeric : bool, default False
|
|
Whether or not to force conversion to a numeric data type if
|
|
initial methods to convert to numeric fail.
|
|
"""
|
|
self.int_ = False
|
|
self.nat_ = False
|
|
self.bool_ = False
|
|
self.null_ = False
|
|
self.nan_ = False
|
|
self.uint_ = False
|
|
self.sint_ = False
|
|
self.float_ = False
|
|
self.object_ = False
|
|
self.complex_ = False
|
|
self.datetime_ = False
|
|
self.timedelta_ = False
|
|
self.datetimetz_ = False
|
|
self.period_ = False
|
|
self.interval_ = False
|
|
self.coerce_numeric = coerce_numeric
|
|
|
|
cdef inline bint check_uint64_conflict(self) except -1:
|
|
"""
|
|
Check whether we can safely convert a uint64 array to a numeric dtype.
|
|
|
|
There are two cases when conversion to numeric dtype with a uint64
|
|
array is not safe (and will therefore not be performed)
|
|
|
|
1) A NaN element is encountered.
|
|
|
|
uint64 cannot be safely cast to float64 due to truncation issues
|
|
at the extreme ends of the range.
|
|
|
|
2) A negative number is encountered.
|
|
|
|
There is no numerical dtype that can hold both negative numbers
|
|
and numbers greater than INT64_MAX. Hence, at least one number
|
|
will be improperly cast if we convert to a numeric dtype.
|
|
|
|
Returns
|
|
-------
|
|
bool
|
|
Whether or not we should return the original input array to avoid
|
|
data truncation.
|
|
|
|
Raises
|
|
------
|
|
ValueError
|
|
uint64 elements were detected, and at least one of the
|
|
two conflict cases was also detected. However, we are
|
|
trying to force conversion to a numeric dtype.
|
|
"""
|
|
return (self.uint_ and (self.null_ or self.sint_)
|
|
and not self.coerce_numeric)
|
|
|
|
cdef inline saw_null(self):
|
|
"""
|
|
Set flags indicating that a null value was encountered.
|
|
"""
|
|
self.null_ = True
|
|
self.float_ = True
|
|
|
|
cdef saw_int(self, object val):
|
|
"""
|
|
Set flags indicating that an integer value was encountered.
|
|
|
|
In addition to setting a flag that an integer was seen, we
|
|
also set two flags depending on the type of integer seen:
|
|
|
|
1) sint_ : a signed numpy integer type or a negative (signed) number in the
|
|
range of [-2**63, 0) was encountered
|
|
2) uint_ : an unsigned numpy integer type or a positive number in the range of
|
|
[2**63, 2**64) was encountered
|
|
|
|
Parameters
|
|
----------
|
|
val : Python int
|
|
Value with which to set the flags.
|
|
"""
|
|
self.int_ = True
|
|
self.sint_ = (
|
|
self.sint_
|
|
or (oINT64_MIN <= val < 0)
|
|
# Cython equivalent of `isinstance(val, np.signedinteger)`
|
|
or PyObject_TypeCheck(val, &PySignedIntegerArrType_Type)
|
|
)
|
|
self.uint_ = (
|
|
self.uint_
|
|
or (oINT64_MAX < val <= oUINT64_MAX)
|
|
# Cython equivalent of `isinstance(val, np.unsignedinteger)`
|
|
or PyObject_TypeCheck(val, &PyUnsignedIntegerArrType_Type)
|
|
)
|
|
|
|
@property
|
|
def numeric_(self):
|
|
return self.complex_ or self.float_ or self.int_
|
|
|
|
@property
|
|
def is_bool(self):
|
|
return not (self.datetime_ or self.numeric_ or self.timedelta_
|
|
or self.nat_)
|
|
|
|
@property
|
|
def is_float_or_complex(self):
|
|
return not (self.bool_ or self.datetime_ or self.timedelta_
|
|
or self.nat_)
|
|
|
|
|
|
cdef object _try_infer_map(object dtype):
|
|
"""
|
|
If its in our map, just return the dtype.
|
|
"""
|
|
cdef:
|
|
object val
|
|
str attr
|
|
for attr in ["name", "kind", "base", "type"]:
|
|
val = getattr(dtype, attr, None)
|
|
if val in _TYPE_MAP:
|
|
return _TYPE_MAP[val]
|
|
return None
|
|
|
|
|
|
def infer_dtype(value: object, skipna: bool = True) -> str:
|
|
"""
|
|
Return a string label of the type of a scalar or list-like of values.
|
|
|
|
Parameters
|
|
----------
|
|
value : scalar, list, ndarray, or pandas type
|
|
skipna : bool, default True
|
|
Ignore NaN values when inferring the type.
|
|
|
|
Returns
|
|
-------
|
|
str
|
|
Describing the common type of the input data.
|
|
Results can include:
|
|
|
|
- string
|
|
- bytes
|
|
- floating
|
|
- integer
|
|
- mixed-integer
|
|
- mixed-integer-float
|
|
- decimal
|
|
- complex
|
|
- categorical
|
|
- boolean
|
|
- datetime64
|
|
- datetime
|
|
- date
|
|
- timedelta64
|
|
- timedelta
|
|
- time
|
|
- period
|
|
- mixed
|
|
- unknown-array
|
|
|
|
Raises
|
|
------
|
|
TypeError
|
|
If ndarray-like but cannot infer the dtype
|
|
|
|
Notes
|
|
-----
|
|
- 'mixed' is the catchall for anything that is not otherwise
|
|
specialized
|
|
- 'mixed-integer-float' are floats and integers
|
|
- 'mixed-integer' are integers mixed with non-integers
|
|
- 'unknown-array' is the catchall for something that *is* an array (has
|
|
a dtype attribute), but has a dtype unknown to pandas (e.g. external
|
|
extension array)
|
|
|
|
Examples
|
|
--------
|
|
>>> import datetime
|
|
>>> infer_dtype(['foo', 'bar'])
|
|
'string'
|
|
|
|
>>> infer_dtype(['a', np.nan, 'b'], skipna=True)
|
|
'string'
|
|
|
|
>>> infer_dtype(['a', np.nan, 'b'], skipna=False)
|
|
'mixed'
|
|
|
|
>>> infer_dtype([b'foo', b'bar'])
|
|
'bytes'
|
|
|
|
>>> infer_dtype([1, 2, 3])
|
|
'integer'
|
|
|
|
>>> infer_dtype([1, 2, 3.5])
|
|
'mixed-integer-float'
|
|
|
|
>>> infer_dtype([1.0, 2.0, 3.5])
|
|
'floating'
|
|
|
|
>>> infer_dtype(['a', 1])
|
|
'mixed-integer'
|
|
|
|
>>> infer_dtype([Decimal(1), Decimal(2.0)])
|
|
'decimal'
|
|
|
|
>>> infer_dtype([True, False])
|
|
'boolean'
|
|
|
|
>>> infer_dtype([True, False, np.nan])
|
|
'boolean'
|
|
|
|
>>> infer_dtype([pd.Timestamp('20130101')])
|
|
'datetime'
|
|
|
|
>>> infer_dtype([datetime.date(2013, 1, 1)])
|
|
'date'
|
|
|
|
>>> infer_dtype([np.datetime64('2013-01-01')])
|
|
'datetime64'
|
|
|
|
>>> infer_dtype([datetime.timedelta(0, 1, 1)])
|
|
'timedelta'
|
|
|
|
>>> infer_dtype(pd.Series(list('aabc')).astype('category'))
|
|
'categorical'
|
|
"""
|
|
cdef:
|
|
Py_ssize_t i, n
|
|
object val
|
|
ndarray values
|
|
bint seen_pdnat = False
|
|
bint seen_val = False
|
|
flatiter it
|
|
|
|
if util.is_array(value):
|
|
values = value
|
|
elif hasattr(value, "inferred_type") and skipna is False:
|
|
# Index, use the cached attribute if possible, populate the cache otherwise
|
|
return value.inferred_type
|
|
elif hasattr(value, "dtype"):
|
|
# this will handle ndarray-like
|
|
# e.g. categoricals
|
|
dtype = value.dtype
|
|
if not cnp.PyArray_DescrCheck(dtype):
|
|
# i.e. not isinstance(dtype, np.dtype)
|
|
inferred = _try_infer_map(value.dtype)
|
|
if inferred is not None:
|
|
return inferred
|
|
return "unknown-array"
|
|
|
|
# Unwrap Series/Index
|
|
values = np.asarray(value)
|
|
|
|
else:
|
|
if not isinstance(value, list):
|
|
value = list(value)
|
|
if not value:
|
|
return "empty"
|
|
|
|
from pandas.core.dtypes.cast import construct_1d_object_array_from_listlike
|
|
values = construct_1d_object_array_from_listlike(value)
|
|
|
|
val = _try_infer_map(values.dtype)
|
|
if val is not None:
|
|
# Anything other than object-dtype should return here.
|
|
return val
|
|
|
|
if values.descr.type_num != NPY_OBJECT:
|
|
# i.e. values.dtype != np.object
|
|
# This should not be reached
|
|
values = values.astype(object)
|
|
|
|
n = cnp.PyArray_SIZE(values)
|
|
if n == 0:
|
|
return "empty"
|
|
|
|
# Iterate until we find our first valid value. We will use this
|
|
# value to decide which of the is_foo_array functions to call.
|
|
it = PyArray_IterNew(values)
|
|
for i in range(n):
|
|
# The PyArray_GETITEM and PyArray_ITER_NEXT are faster
|
|
# equivalents to `val = values[i]`
|
|
val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))
|
|
PyArray_ITER_NEXT(it)
|
|
|
|
# do not use checknull to keep
|
|
# np.datetime64('nat') and np.timedelta64('nat')
|
|
if val is None or util.is_nan(val) or val is C_NA:
|
|
pass
|
|
elif val is NaT:
|
|
seen_pdnat = True
|
|
else:
|
|
seen_val = True
|
|
break
|
|
|
|
# if all values are nan/NaT
|
|
if seen_val is False and seen_pdnat is True:
|
|
return "datetime"
|
|
# float/object nan is handled in latter logic
|
|
if seen_val is False and skipna:
|
|
return "empty"
|
|
|
|
if util.is_datetime64_object(val):
|
|
if is_datetime64_array(values, skipna=skipna):
|
|
return "datetime64"
|
|
|
|
elif is_timedelta(val):
|
|
if is_timedelta_or_timedelta64_array(values, skipna=skipna):
|
|
return "timedelta"
|
|
|
|
elif util.is_integer_object(val):
|
|
# ordering matters here; this check must come after the is_timedelta
|
|
# check otherwise numpy timedelta64 objects would come through here
|
|
|
|
if is_integer_array(values, skipna=skipna):
|
|
return "integer"
|
|
elif is_integer_float_array(values, skipna=skipna):
|
|
if is_integer_na_array(values, skipna=skipna):
|
|
return "integer-na"
|
|
else:
|
|
return "mixed-integer-float"
|
|
return "mixed-integer"
|
|
|
|
elif PyDateTime_Check(val):
|
|
if is_datetime_array(values, skipna=skipna):
|
|
return "datetime"
|
|
elif is_date_array(values, skipna=skipna):
|
|
return "date"
|
|
|
|
elif PyDate_Check(val):
|
|
if is_date_array(values, skipna=skipna):
|
|
return "date"
|
|
|
|
elif PyTime_Check(val):
|
|
if is_time_array(values, skipna=skipna):
|
|
return "time"
|
|
|
|
elif is_decimal(val):
|
|
if is_decimal_array(values, skipna=skipna):
|
|
return "decimal"
|
|
|
|
elif util.is_complex_object(val):
|
|
if is_complex_array(values):
|
|
return "complex"
|
|
|
|
elif util.is_float_object(val):
|
|
if is_float_array(values):
|
|
return "floating"
|
|
elif is_integer_float_array(values, skipna=skipna):
|
|
if is_integer_na_array(values, skipna=skipna):
|
|
return "integer-na"
|
|
else:
|
|
return "mixed-integer-float"
|
|
|
|
elif util.is_bool_object(val):
|
|
if is_bool_array(values, skipna=skipna):
|
|
return "boolean"
|
|
|
|
elif isinstance(val, str):
|
|
if is_string_array(values, skipna=skipna):
|
|
return "string"
|
|
|
|
elif isinstance(val, bytes):
|
|
if is_bytes_array(values, skipna=skipna):
|
|
return "bytes"
|
|
|
|
elif is_period_object(val):
|
|
if is_period_array(values, skipna=skipna):
|
|
return "period"
|
|
|
|
elif is_interval(val):
|
|
if is_interval_array(values):
|
|
return "interval"
|
|
|
|
cnp.PyArray_ITER_RESET(it)
|
|
for i in range(n):
|
|
val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))
|
|
PyArray_ITER_NEXT(it)
|
|
|
|
if util.is_integer_object(val):
|
|
return "mixed-integer"
|
|
|
|
return "mixed"
|
|
|
|
|
|
def infer_datetimelike_array(arr: ndarray[object]) -> tuple[str, bool]:
|
|
"""
|
|
Infer if we have a datetime or timedelta array.
|
|
- date: we have *only* date and maybe strings, nulls
|
|
- datetime: we have *only* datetimes and maybe strings, nulls
|
|
- timedelta: we have *only* timedeltas and maybe strings, nulls
|
|
- nat: we do not have *any* date, datetimes or timedeltas, but do have
|
|
at least a NaT
|
|
- mixed: other objects (strings, a mix of tz-aware and tz-naive, or
|
|
actual objects)
|
|
|
|
Parameters
|
|
----------
|
|
arr : ndarray[object]
|
|
|
|
Returns
|
|
-------
|
|
str: {datetime, timedelta, date, nat, mixed}
|
|
bool
|
|
"""
|
|
cdef:
|
|
Py_ssize_t i, n = len(arr)
|
|
bint seen_timedelta = False, seen_date = False, seen_datetime = False
|
|
bint seen_tz_aware = False, seen_tz_naive = False
|
|
bint seen_nat = False, seen_str = False
|
|
bint seen_period = False, seen_interval = False
|
|
list objs = []
|
|
object v
|
|
|
|
for i in range(n):
|
|
v = arr[i]
|
|
if isinstance(v, str):
|
|
objs.append(v)
|
|
seen_str = True
|
|
|
|
if len(objs) == 3:
|
|
break
|
|
|
|
elif v is None or util.is_nan(v):
|
|
# nan or None
|
|
pass
|
|
elif v is NaT:
|
|
seen_nat = True
|
|
elif PyDateTime_Check(v):
|
|
# datetime
|
|
seen_datetime = True
|
|
|
|
# disambiguate between tz-naive and tz-aware
|
|
if v.tzinfo is None:
|
|
seen_tz_naive = True
|
|
else:
|
|
seen_tz_aware = True
|
|
|
|
if seen_tz_naive and seen_tz_aware:
|
|
return "mixed", seen_str
|
|
elif util.is_datetime64_object(v):
|
|
# np.datetime64
|
|
seen_datetime = True
|
|
elif PyDate_Check(v):
|
|
seen_date = True
|
|
elif is_timedelta(v):
|
|
# timedelta, or timedelta64
|
|
seen_timedelta = True
|
|
elif is_period_object(v):
|
|
seen_period = True
|
|
break
|
|
elif is_interval(v):
|
|
seen_interval = True
|
|
break
|
|
else:
|
|
return "mixed", seen_str
|
|
|
|
if seen_period:
|
|
if is_period_array(arr):
|
|
return "period", seen_str
|
|
return "mixed", seen_str
|
|
|
|
if seen_interval:
|
|
if is_interval_array(arr):
|
|
return "interval", seen_str
|
|
return "mixed", seen_str
|
|
|
|
if seen_date and not (seen_datetime or seen_timedelta):
|
|
return "date", seen_str
|
|
elif seen_datetime and not seen_timedelta:
|
|
return "datetime", seen_str
|
|
elif seen_timedelta and not seen_datetime:
|
|
return "timedelta", seen_str
|
|
elif seen_nat:
|
|
return "nat", seen_str
|
|
|
|
# short-circuit by trying to
|
|
# actually convert these strings
|
|
# this is for performance as we don't need to try
|
|
# convert *every* string array
|
|
if len(objs):
|
|
try:
|
|
# require_iso8601 as in maybe_infer_to_datetimelike
|
|
array_to_datetime(objs, errors="raise", require_iso8601=True)
|
|
return "datetime", seen_str
|
|
except (ValueError, TypeError):
|
|
pass
|
|
|
|
# we are *not* going to infer from strings
|
|
# for timedelta as too much ambiguity
|
|
|
|
return "mixed", seen_str
|
|
|
|
|
|
cdef inline bint is_timedelta(object o):
|
|
return PyDelta_Check(o) or util.is_timedelta64_object(o)
|
|
|
|
|
|
@cython.internal
|
|
cdef class Validator:
|
|
|
|
cdef:
|
|
Py_ssize_t n
|
|
dtype dtype
|
|
bint skipna
|
|
|
|
def __cinit__(self, Py_ssize_t n, dtype dtype=np.dtype(np.object_),
|
|
bint skipna=False):
|
|
self.n = n
|
|
self.dtype = dtype
|
|
self.skipna = skipna
|
|
|
|
cdef bint validate(self, ndarray values) except -1:
|
|
if not self.n:
|
|
return False
|
|
|
|
if self.is_array_typed():
|
|
# i.e. this ndarray is already of the desired dtype
|
|
return True
|
|
elif self.dtype.type_num == NPY_OBJECT:
|
|
if self.skipna:
|
|
return self._validate_skipna(values)
|
|
else:
|
|
return self._validate(values)
|
|
else:
|
|
return False
|
|
|
|
@cython.wraparound(False)
|
|
@cython.boundscheck(False)
|
|
cdef bint _validate(self, ndarray values) except -1:
|
|
cdef:
|
|
Py_ssize_t i
|
|
Py_ssize_t n = values.size
|
|
flatiter it = PyArray_IterNew(values)
|
|
|
|
for i in range(n):
|
|
# The PyArray_GETITEM and PyArray_ITER_NEXT are faster
|
|
# equivalents to `val = values[i]`
|
|
val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))
|
|
PyArray_ITER_NEXT(it)
|
|
if not self.is_valid(val):
|
|
return False
|
|
|
|
return True
|
|
|
|
@cython.wraparound(False)
|
|
@cython.boundscheck(False)
|
|
cdef bint _validate_skipna(self, ndarray values) except -1:
|
|
cdef:
|
|
Py_ssize_t i
|
|
Py_ssize_t n = values.size
|
|
flatiter it = PyArray_IterNew(values)
|
|
|
|
for i in range(n):
|
|
# The PyArray_GETITEM and PyArray_ITER_NEXT are faster
|
|
# equivalents to `val = values[i]`
|
|
val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))
|
|
PyArray_ITER_NEXT(it)
|
|
if not self.is_valid_skipna(val):
|
|
return False
|
|
|
|
return True
|
|
|
|
cdef bint is_valid(self, object value) except -1:
|
|
return self.is_value_typed(value)
|
|
|
|
cdef bint is_valid_skipna(self, object value) except -1:
|
|
return self.is_valid(value) or self.is_valid_null(value)
|
|
|
|
cdef bint is_value_typed(self, object value) except -1:
|
|
raise NotImplementedError(f"{type(self).__name__} child class "
|
|
"must define is_value_typed")
|
|
|
|
cdef bint is_valid_null(self, object value) except -1:
|
|
return value is None or value is C_NA or util.is_nan(value)
|
|
|
|
cdef bint is_array_typed(self) except -1:
|
|
return False
|
|
|
|
|
|
@cython.internal
|
|
cdef class BoolValidator(Validator):
|
|
cdef inline bint is_value_typed(self, object value) except -1:
|
|
return util.is_bool_object(value)
|
|
|
|
cdef inline bint is_array_typed(self) except -1:
|
|
return issubclass(self.dtype.type, np.bool_)
|
|
|
|
|
|
cpdef bint is_bool_array(ndarray values, bint skipna=False):
|
|
cdef:
|
|
BoolValidator validator = BoolValidator(len(values),
|
|
values.dtype,
|
|
skipna=skipna)
|
|
return validator.validate(values)
|
|
|
|
|
|
@cython.internal
|
|
cdef class IntegerValidator(Validator):
|
|
cdef inline bint is_value_typed(self, object value) except -1:
|
|
return util.is_integer_object(value)
|
|
|
|
cdef inline bint is_array_typed(self) except -1:
|
|
return issubclass(self.dtype.type, np.integer)
|
|
|
|
|
|
# Note: only python-exposed for tests
|
|
cpdef bint is_integer_array(ndarray values, bint skipna=True):
|
|
cdef:
|
|
IntegerValidator validator = IntegerValidator(len(values),
|
|
values.dtype,
|
|
skipna=skipna)
|
|
return validator.validate(values)
|
|
|
|
|
|
@cython.internal
|
|
cdef class IntegerNaValidator(Validator):
|
|
cdef inline bint is_value_typed(self, object value) except -1:
|
|
return (util.is_integer_object(value)
|
|
or (util.is_nan(value) and util.is_float_object(value)))
|
|
|
|
|
|
cdef bint is_integer_na_array(ndarray values, bint skipna=True):
|
|
cdef:
|
|
IntegerNaValidator validator = IntegerNaValidator(len(values),
|
|
values.dtype, skipna=skipna)
|
|
return validator.validate(values)
|
|
|
|
|
|
@cython.internal
|
|
cdef class IntegerFloatValidator(Validator):
|
|
cdef inline bint is_value_typed(self, object value) except -1:
|
|
return util.is_integer_object(value) or util.is_float_object(value)
|
|
|
|
cdef inline bint is_array_typed(self) except -1:
|
|
return issubclass(self.dtype.type, np.integer)
|
|
|
|
|
|
cdef bint is_integer_float_array(ndarray values, bint skipna=True):
|
|
cdef:
|
|
IntegerFloatValidator validator = IntegerFloatValidator(len(values),
|
|
values.dtype,
|
|
skipna=skipna)
|
|
return validator.validate(values)
|
|
|
|
|
|
@cython.internal
|
|
cdef class FloatValidator(Validator):
|
|
cdef inline bint is_value_typed(self, object value) except -1:
|
|
return util.is_float_object(value)
|
|
|
|
cdef inline bint is_array_typed(self) except -1:
|
|
return issubclass(self.dtype.type, np.floating)
|
|
|
|
|
|
# Note: only python-exposed for tests
|
|
cpdef bint is_float_array(ndarray values):
|
|
cdef:
|
|
FloatValidator validator = FloatValidator(len(values), values.dtype)
|
|
return validator.validate(values)
|
|
|
|
|
|
@cython.internal
|
|
cdef class ComplexValidator(Validator):
|
|
cdef inline bint is_value_typed(self, object value) except -1:
|
|
return (
|
|
util.is_complex_object(value)
|
|
or (util.is_float_object(value) and is_nan(value))
|
|
)
|
|
|
|
cdef inline bint is_array_typed(self) except -1:
|
|
return issubclass(self.dtype.type, np.complexfloating)
|
|
|
|
|
|
cdef bint is_complex_array(ndarray values):
|
|
cdef:
|
|
ComplexValidator validator = ComplexValidator(len(values), values.dtype)
|
|
return validator.validate(values)
|
|
|
|
|
|
@cython.internal
|
|
cdef class DecimalValidator(Validator):
|
|
cdef inline bint is_value_typed(self, object value) except -1:
|
|
return is_decimal(value)
|
|
|
|
|
|
cdef bint is_decimal_array(ndarray values, bint skipna=False):
|
|
cdef:
|
|
DecimalValidator validator = DecimalValidator(
|
|
len(values), values.dtype, skipna=skipna
|
|
)
|
|
return validator.validate(values)
|
|
|
|
|
|
@cython.internal
|
|
cdef class StringValidator(Validator):
|
|
cdef inline bint is_value_typed(self, object value) except -1:
|
|
return isinstance(value, str)
|
|
|
|
cdef inline bint is_array_typed(self) except -1:
|
|
return issubclass(self.dtype.type, np.str_)
|
|
|
|
|
|
cpdef bint is_string_array(ndarray values, bint skipna=False):
|
|
cdef:
|
|
StringValidator validator = StringValidator(len(values),
|
|
values.dtype,
|
|
skipna=skipna)
|
|
return validator.validate(values)
|
|
|
|
|
|
@cython.internal
|
|
cdef class BytesValidator(Validator):
|
|
cdef inline bint is_value_typed(self, object value) except -1:
|
|
return isinstance(value, bytes)
|
|
|
|
cdef inline bint is_array_typed(self) except -1:
|
|
return issubclass(self.dtype.type, np.bytes_)
|
|
|
|
|
|
cdef bint is_bytes_array(ndarray values, bint skipna=False):
|
|
cdef:
|
|
BytesValidator validator = BytesValidator(len(values), values.dtype,
|
|
skipna=skipna)
|
|
return validator.validate(values)
|
|
|
|
|
|
@cython.internal
|
|
cdef class TemporalValidator(Validator):
|
|
cdef:
|
|
bint all_generic_na
|
|
|
|
def __cinit__(self, Py_ssize_t n, dtype dtype=np.dtype(np.object_),
|
|
bint skipna=False):
|
|
self.n = n
|
|
self.dtype = dtype
|
|
self.skipna = skipna
|
|
self.all_generic_na = True
|
|
|
|
cdef inline bint is_valid(self, object value) except -1:
|
|
return self.is_value_typed(value) or self.is_valid_null(value)
|
|
|
|
cdef bint is_valid_null(self, object value) except -1:
|
|
raise NotImplementedError(f"{type(self).__name__} child class "
|
|
"must define is_valid_null")
|
|
|
|
cdef inline bint is_valid_skipna(self, object value) except -1:
|
|
cdef:
|
|
bint is_typed_null = self.is_valid_null(value)
|
|
bint is_generic_null = value is None or util.is_nan(value)
|
|
if not is_generic_null:
|
|
self.all_generic_na = False
|
|
return self.is_value_typed(value) or is_typed_null or is_generic_null
|
|
|
|
cdef bint _validate_skipna(self, ndarray values) except -1:
|
|
"""
|
|
If we _only_ saw non-dtype-specific NA values, even if they are valid
|
|
for this dtype, we do not infer this dtype.
|
|
"""
|
|
return Validator._validate_skipna(self, values) and not self.all_generic_na
|
|
|
|
|
|
@cython.internal
|
|
cdef class DatetimeValidator(TemporalValidator):
|
|
cdef bint is_value_typed(self, object value) except -1:
|
|
return PyDateTime_Check(value)
|
|
|
|
cdef inline bint is_valid_null(self, object value) except -1:
|
|
return is_null_datetime64(value)
|
|
|
|
|
|
cpdef bint is_datetime_array(ndarray values, bint skipna=True):
|
|
cdef:
|
|
DatetimeValidator validator = DatetimeValidator(len(values),
|
|
skipna=skipna)
|
|
return validator.validate(values)
|
|
|
|
|
|
@cython.internal
|
|
cdef class Datetime64Validator(DatetimeValidator):
|
|
cdef inline bint is_value_typed(self, object value) except -1:
|
|
return util.is_datetime64_object(value)
|
|
|
|
|
|
# Note: only python-exposed for tests
|
|
cpdef bint is_datetime64_array(ndarray values, bint skipna=True):
|
|
cdef:
|
|
Datetime64Validator validator = Datetime64Validator(len(values),
|
|
skipna=skipna)
|
|
return validator.validate(values)
|
|
|
|
|
|
@cython.internal
|
|
cdef class AnyDatetimeValidator(DatetimeValidator):
|
|
cdef inline bint is_value_typed(self, object value) except -1:
|
|
return util.is_datetime64_object(value) or (
|
|
PyDateTime_Check(value) and value.tzinfo is None
|
|
)
|
|
|
|
|
|
cdef bint is_datetime_or_datetime64_array(ndarray values, bint skipna=True):
|
|
cdef:
|
|
AnyDatetimeValidator validator = AnyDatetimeValidator(len(values),
|
|
skipna=skipna)
|
|
return validator.validate(values)
|
|
|
|
|
|
# Note: only python-exposed for tests
|
|
def is_datetime_with_singletz_array(values: ndarray) -> bool:
|
|
"""
|
|
Check values have the same tzinfo attribute.
|
|
Doesn't check values are datetime-like types.
|
|
"""
|
|
cdef:
|
|
Py_ssize_t i = 0, j, n = len(values)
|
|
object base_val, base_tz, val, tz
|
|
|
|
if n == 0:
|
|
return False
|
|
|
|
# Get a reference timezone to compare with the rest of the tzs in the array
|
|
for i in range(n):
|
|
base_val = values[i]
|
|
if base_val is not NaT and base_val is not None and not util.is_nan(base_val):
|
|
base_tz = getattr(base_val, 'tzinfo', None)
|
|
break
|
|
|
|
for j in range(i, n):
|
|
# Compare val's timezone with the reference timezone
|
|
# NaT can coexist with tz-aware datetimes, so skip if encountered
|
|
val = values[j]
|
|
if val is not NaT and val is not None and not util.is_nan(val):
|
|
tz = getattr(val, 'tzinfo', None)
|
|
if not tz_compare(base_tz, tz):
|
|
return False
|
|
|
|
# Note: we should only be called if a tzaware datetime has been seen,
|
|
# so base_tz should always be set at this point.
|
|
return True
|
|
|
|
|
|
@cython.internal
|
|
cdef class TimedeltaValidator(TemporalValidator):
|
|
cdef bint is_value_typed(self, object value) except -1:
|
|
return PyDelta_Check(value)
|
|
|
|
cdef inline bint is_valid_null(self, object value) except -1:
|
|
return is_null_timedelta64(value)
|
|
|
|
|
|
@cython.internal
|
|
cdef class AnyTimedeltaValidator(TimedeltaValidator):
|
|
cdef inline bint is_value_typed(self, object value) except -1:
|
|
return is_timedelta(value)
|
|
|
|
|
|
# Note: only python-exposed for tests
|
|
cpdef bint is_timedelta_or_timedelta64_array(ndarray values, bint skipna=True):
|
|
"""
|
|
Infer with timedeltas and/or nat/none.
|
|
"""
|
|
cdef:
|
|
AnyTimedeltaValidator validator = AnyTimedeltaValidator(len(values),
|
|
skipna=skipna)
|
|
return validator.validate(values)
|
|
|
|
|
|
@cython.internal
|
|
cdef class DateValidator(Validator):
|
|
cdef inline bint is_value_typed(self, object value) except -1:
|
|
return PyDate_Check(value)
|
|
|
|
|
|
# Note: only python-exposed for tests
|
|
cpdef bint is_date_array(ndarray values, bint skipna=False):
|
|
cdef:
|
|
DateValidator validator = DateValidator(len(values), skipna=skipna)
|
|
return validator.validate(values)
|
|
|
|
|
|
@cython.internal
|
|
cdef class TimeValidator(Validator):
|
|
cdef inline bint is_value_typed(self, object value) except -1:
|
|
return PyTime_Check(value)
|
|
|
|
|
|
# Note: only python-exposed for tests
|
|
cpdef bint is_time_array(ndarray values, bint skipna=False):
|
|
cdef:
|
|
TimeValidator validator = TimeValidator(len(values), skipna=skipna)
|
|
return validator.validate(values)
|
|
|
|
|
|
# FIXME: actually use skipna
|
|
cdef bint is_period_array(ndarray values, bint skipna=True):
|
|
"""
|
|
Is this an ndarray of Period objects (or NaT) with a single `freq`?
|
|
"""
|
|
# values should be object-dtype, but ndarray[object] assumes 1D, while
|
|
# this _may_ be 2D.
|
|
cdef:
|
|
Py_ssize_t i, N = values.size
|
|
int dtype_code = -10000 # i.e. c_FreqGroup.FR_UND
|
|
object val
|
|
flatiter it
|
|
|
|
if N == 0:
|
|
return False
|
|
|
|
it = PyArray_IterNew(values)
|
|
for i in range(N):
|
|
# The PyArray_GETITEM and PyArray_ITER_NEXT are faster
|
|
# equivalents to `val = values[i]`
|
|
val = PyArray_GETITEM(values, PyArray_ITER_DATA(it))
|
|
PyArray_ITER_NEXT(it)
|
|
|
|
if is_period_object(val):
|
|
if dtype_code == -10000:
|
|
dtype_code = val._dtype._dtype_code
|
|
elif dtype_code != val._dtype._dtype_code:
|
|
# mismatched freqs
|
|
return False
|
|
elif checknull_with_nat(val):
|
|
pass
|
|
else:
|
|
# Not a Period or NaT-like
|
|
return False
|
|
|
|
if dtype_code == -10000:
|
|
# we saw all-NaTs, no actual Periods
|
|
return False
|
|
return True
|
|
|
|
|
|
# Note: only python-exposed for tests
|
|
cpdef bint is_interval_array(ndarray values):
|
|
"""
|
|
Is this an ndarray of Interval (or np.nan) with a single dtype?
|
|
"""
|
|
cdef:
|
|
Py_ssize_t i, n = len(values)
|
|
str closed = None
|
|
bint numeric = False
|
|
bint dt64 = False
|
|
bint td64 = False
|
|
object val
|
|
|
|
if len(values) == 0:
|
|
return False
|
|
|
|
for i in range(n):
|
|
val = values[i]
|
|
|
|
if is_interval(val):
|
|
if closed is None:
|
|
closed = val.closed
|
|
numeric = (
|
|
util.is_float_object(val.left)
|
|
or util.is_integer_object(val.left)
|
|
)
|
|
td64 = is_timedelta(val.left)
|
|
dt64 = PyDateTime_Check(val.left)
|
|
elif val.closed != closed:
|
|
# mismatched closedness
|
|
return False
|
|
elif numeric:
|
|
if not (
|
|
util.is_float_object(val.left)
|
|
or util.is_integer_object(val.left)
|
|
):
|
|
# i.e. datetime64 or timedelta64
|
|
return False
|
|
elif td64:
|
|
if not is_timedelta(val.left):
|
|
return False
|
|
elif dt64:
|
|
if not PyDateTime_Check(val.left):
|
|
return False
|
|
else:
|
|
raise ValueError(val)
|
|
elif util.is_nan(val) or val is None:
|
|
pass
|
|
else:
|
|
return False
|
|
|
|
if closed is None:
|
|
# we saw all-NAs, no actual Intervals
|
|
return False
|
|
return True
|
|
|
|
|
|
@cython.boundscheck(False)
|
|
@cython.wraparound(False)
|
|
def maybe_convert_numeric(
|
|
ndarray[object, ndim=1] values,
|
|
set na_values,
|
|
bint convert_empty=True,
|
|
bint coerce_numeric=False,
|
|
bint convert_to_masked_nullable=False,
|
|
) -> tuple[np.ndarray, np.ndarray | None]:
|
|
"""
|
|
Convert object array to a numeric array if possible.
|
|
|
|
Parameters
|
|
----------
|
|
values : ndarray[object]
|
|
Array of object elements to convert.
|
|
na_values : set
|
|
Set of values that should be interpreted as NaN.
|
|
convert_empty : bool, default True
|
|
If an empty array-like object is encountered, whether to interpret
|
|
that element as NaN or not. If set to False, a ValueError will be
|
|
raised if such an element is encountered and 'coerce_numeric' is False.
|
|
coerce_numeric : bool, default False
|
|
If initial attempts to convert to numeric have failed, whether to
|
|
force conversion to numeric via alternative methods or by setting the
|
|
element to NaN. Otherwise, an Exception will be raised when such an
|
|
element is encountered.
|
|
|
|
This boolean also has an impact on how conversion behaves when a
|
|
numeric array has no suitable numerical dtype to return (i.e. uint64,
|
|
int32, uint8). If set to False, the original object array will be
|
|
returned. Otherwise, a ValueError will be raised.
|
|
convert_to_masked_nullable : bool, default False
|
|
Whether to return a mask for the converted values. This also disables
|
|
upcasting for ints with nulls to float64.
|
|
Returns
|
|
-------
|
|
np.ndarray
|
|
Array of converted object values to numerical ones.
|
|
|
|
Optional[np.ndarray]
|
|
If convert_to_masked_nullable is True,
|
|
returns a boolean mask for the converted values, otherwise returns None.
|
|
"""
|
|
if len(values) == 0:
|
|
return (np.array([], dtype='i8'), None)
|
|
|
|
# fastpath for ints - try to convert all based on first value
|
|
cdef:
|
|
object val = values[0]
|
|
|
|
if util.is_integer_object(val):
|
|
try:
|
|
maybe_ints = values.astype('i8')
|
|
if (maybe_ints == values).all():
|
|
return (maybe_ints, None)
|
|
except (ValueError, OverflowError, TypeError):
|
|
pass
|
|
|
|
# Otherwise, iterate and do full inference.
|
|
cdef:
|
|
int status, maybe_int
|
|
Py_ssize_t i, n = values.size
|
|
Seen seen = Seen(coerce_numeric)
|
|
ndarray[float64_t, ndim=1] floats = cnp.PyArray_EMPTY(1, values.shape, cnp.NPY_FLOAT64, 0)
|
|
ndarray[complex128_t, ndim=1] complexes = cnp.PyArray_EMPTY(1, values.shape, cnp.NPY_COMPLEX128, 0)
|
|
ndarray[int64_t, ndim=1] ints = cnp.PyArray_EMPTY(1, values.shape, cnp.NPY_INT64, 0)
|
|
ndarray[uint64_t, ndim=1] uints = cnp.PyArray_EMPTY(1, values.shape, cnp.NPY_UINT64, 0)
|
|
ndarray[uint8_t, ndim=1] bools = cnp.PyArray_EMPTY(1, values.shape, cnp.NPY_UINT8, 0)
|
|
ndarray[uint8_t, ndim=1] mask = np.zeros(n, dtype="u1")
|
|
float64_t fval
|
|
bint allow_null_in_int = convert_to_masked_nullable
|
|
|
|
for i in range(n):
|
|
val = values[i]
|
|
# We only want to disable NaNs showing as float if
|
|
# a) convert_to_masked_nullable = True
|
|
# b) no floats have been seen ( assuming an int shows up later )
|
|
# However, if no ints present (all null array), we need to return floats
|
|
allow_null_in_int = convert_to_masked_nullable and not seen.float_
|
|
|
|
if val.__hash__ is not None and val in na_values:
|
|
if allow_null_in_int:
|
|
seen.null_ = True
|
|
mask[i] = 1
|
|
else:
|
|
if convert_to_masked_nullable:
|
|
mask[i] = 1
|
|
seen.saw_null()
|
|
floats[i] = complexes[i] = NaN
|
|
elif util.is_float_object(val):
|
|
fval = val
|
|
if fval != fval:
|
|
seen.null_ = True
|
|
if allow_null_in_int:
|
|
mask[i] = 1
|
|
else:
|
|
if convert_to_masked_nullable:
|
|
mask[i] = 1
|
|
seen.float_ = True
|
|
else:
|
|
seen.float_ = True
|
|
floats[i] = complexes[i] = fval
|
|
elif util.is_integer_object(val):
|
|
floats[i] = complexes[i] = val
|
|
|
|
val = int(val)
|
|
seen.saw_int(val)
|
|
|
|
if val >= 0:
|
|
if val <= oUINT64_MAX:
|
|
uints[i] = val
|
|
else:
|
|
seen.float_ = True
|
|
|
|
if oINT64_MIN <= val <= oINT64_MAX:
|
|
ints[i] = val
|
|
|
|
if val < oINT64_MIN or (seen.sint_ and seen.uint_):
|
|
seen.float_ = True
|
|
|
|
elif util.is_bool_object(val):
|
|
floats[i] = uints[i] = ints[i] = bools[i] = val
|
|
seen.bool_ = True
|
|
elif val is None or val is C_NA:
|
|
if allow_null_in_int:
|
|
seen.null_ = True
|
|
mask[i] = 1
|
|
else:
|
|
if convert_to_masked_nullable:
|
|
mask[i] = 1
|
|
seen.saw_null()
|
|
floats[i] = complexes[i] = NaN
|
|
elif hasattr(val, '__len__') and len(val) == 0:
|
|
if convert_empty or seen.coerce_numeric:
|
|
seen.saw_null()
|
|
floats[i] = complexes[i] = NaN
|
|
else:
|
|
raise ValueError("Empty string encountered")
|
|
elif util.is_complex_object(val):
|
|
complexes[i] = val
|
|
seen.complex_ = True
|
|
elif is_decimal(val):
|
|
floats[i] = complexes[i] = val
|
|
seen.float_ = True
|
|
else:
|
|
try:
|
|
status = floatify(val, &fval, &maybe_int)
|
|
|
|
if fval in na_values:
|
|
seen.saw_null()
|
|
floats[i] = complexes[i] = NaN
|
|
mask[i] = 1
|
|
else:
|
|
if fval != fval:
|
|
seen.null_ = True
|
|
mask[i] = 1
|
|
|
|
floats[i] = fval
|
|
|
|
if maybe_int:
|
|
as_int = int(val)
|
|
|
|
if as_int in na_values:
|
|
mask[i] = 1
|
|
seen.null_ = True
|
|
if not allow_null_in_int:
|
|
seen.float_ = True
|
|
else:
|
|
seen.saw_int(as_int)
|
|
|
|
if as_int not in na_values:
|
|
if as_int < oINT64_MIN or as_int > oUINT64_MAX:
|
|
if seen.coerce_numeric:
|
|
seen.float_ = True
|
|
else:
|
|
raise ValueError("Integer out of range.")
|
|
else:
|
|
if as_int >= 0:
|
|
uints[i] = as_int
|
|
|
|
if as_int <= oINT64_MAX:
|
|
ints[i] = as_int
|
|
|
|
seen.float_ = seen.float_ or (seen.uint_ and seen.sint_)
|
|
else:
|
|
seen.float_ = True
|
|
except (TypeError, ValueError) as err:
|
|
if not seen.coerce_numeric:
|
|
raise type(err)(f"{err} at position {i}")
|
|
|
|
seen.saw_null()
|
|
floats[i] = NaN
|
|
|
|
if seen.check_uint64_conflict():
|
|
return (values, None)
|
|
|
|
# This occurs since we disabled float nulls showing as null in anticipation
|
|
# of seeing ints that were never seen. So then, we return float
|
|
if allow_null_in_int and seen.null_ and not seen.int_:
|
|
seen.float_ = True
|
|
|
|
if seen.complex_:
|
|
return (complexes, None)
|
|
elif seen.float_:
|
|
if seen.null_ and convert_to_masked_nullable:
|
|
return (floats, mask.view(np.bool_))
|
|
return (floats, None)
|
|
elif seen.int_:
|
|
if seen.null_ and convert_to_masked_nullable:
|
|
if seen.uint_:
|
|
return (uints, mask.view(np.bool_))
|
|
else:
|
|
return (ints, mask.view(np.bool_))
|
|
if seen.uint_:
|
|
return (uints, None)
|
|
else:
|
|
return (ints, None)
|
|
elif seen.bool_:
|
|
return (bools.view(np.bool_), None)
|
|
elif seen.uint_:
|
|
return (uints, None)
|
|
return (ints, None)
|
|
|
|
|
|
@cython.boundscheck(False)
|
|
@cython.wraparound(False)
|
|
def maybe_convert_objects(ndarray[object] objects,
|
|
*,
|
|
bint try_float=False,
|
|
bint safe=False,
|
|
bint convert_datetime=False,
|
|
bint convert_timedelta=False,
|
|
bint convert_period=False,
|
|
bint convert_interval=False,
|
|
bint convert_to_nullable_integer=False,
|
|
object dtype_if_all_nat=None) -> "ArrayLike":
|
|
"""
|
|
Type inference function-- convert object array to proper dtype
|
|
|
|
Parameters
|
|
----------
|
|
objects : ndarray[object]
|
|
Array of object elements to convert.
|
|
try_float : bool, default False
|
|
If an array-like object contains only float or NaN values is
|
|
encountered, whether to convert and return an array of float dtype.
|
|
safe : bool, default False
|
|
Whether to upcast numeric type (e.g. int cast to float). If set to
|
|
True, no upcasting will be performed.
|
|
convert_datetime : bool, default False
|
|
If an array-like object contains only datetime values or NaT is
|
|
encountered, whether to convert and return an array of M8[ns] dtype.
|
|
convert_timedelta : bool, default False
|
|
If an array-like object contains only timedelta values or NaT is
|
|
encountered, whether to convert and return an array of m8[ns] dtype.
|
|
convert_period : bool, default False
|
|
If an array-like object contains only (homogeneous-freq) Period values
|
|
or NaT, whether to convert and return a PeriodArray.
|
|
convert_interval : bool, default False
|
|
If an array-like object contains only Interval objects (with matching
|
|
dtypes and closedness) or NaN, whether to convert to IntervalArray.
|
|
convert_to_nullable_integer : bool, default False
|
|
If an array-like object contains only integer values (and NaN) is
|
|
encountered, whether to convert and return an IntegerArray.
|
|
dtype_if_all_nat : np.dtype, ExtensionDtype, or None, default None
|
|
Dtype to cast to if we have all-NaT.
|
|
|
|
Returns
|
|
-------
|
|
np.ndarray or ExtensionArray
|
|
Array of converted object values to more specific dtypes if applicable.
|
|
"""
|
|
cdef:
|
|
Py_ssize_t i, n, itemsize_max = 0
|
|
ndarray[float64_t] floats
|
|
ndarray[complex128_t] complexes
|
|
ndarray[int64_t] ints
|
|
ndarray[uint64_t] uints
|
|
ndarray[uint8_t] bools
|
|
int64_t[::1] idatetimes
|
|
int64_t[::1] itimedeltas
|
|
Seen seen = Seen()
|
|
object val
|
|
float64_t fval, fnan = np.nan
|
|
|
|
n = len(objects)
|
|
|
|
floats = cnp.PyArray_EMPTY(1, objects.shape, cnp.NPY_FLOAT64, 0)
|
|
complexes = cnp.PyArray_EMPTY(1, objects.shape, cnp.NPY_COMPLEX128, 0)
|
|
ints = cnp.PyArray_EMPTY(1, objects.shape, cnp.NPY_INT64, 0)
|
|
uints = cnp.PyArray_EMPTY(1, objects.shape, cnp.NPY_UINT64, 0)
|
|
bools = cnp.PyArray_EMPTY(1, objects.shape, cnp.NPY_UINT8, 0)
|
|
mask = np.full(n, False)
|
|
|
|
if convert_datetime:
|
|
datetimes = np.empty(n, dtype='M8[ns]')
|
|
idatetimes = datetimes.view(np.int64)
|
|
|
|
if convert_timedelta:
|
|
timedeltas = np.empty(n, dtype='m8[ns]')
|
|
itimedeltas = timedeltas.view(np.int64)
|
|
|
|
for i in range(n):
|
|
val = objects[i]
|
|
if itemsize_max != -1:
|
|
itemsize = get_itemsize(val)
|
|
if itemsize > itemsize_max or itemsize == -1:
|
|
itemsize_max = itemsize
|
|
|
|
if val is None:
|
|
seen.null_ = True
|
|
floats[i] = complexes[i] = fnan
|
|
mask[i] = True
|
|
elif val is NaT:
|
|
seen.nat_ = True
|
|
if convert_datetime:
|
|
idatetimes[i] = NPY_NAT
|
|
if convert_timedelta:
|
|
itimedeltas[i] = NPY_NAT
|
|
if not (convert_datetime or convert_timedelta or convert_period):
|
|
seen.object_ = True
|
|
break
|
|
elif val is np.nan:
|
|
seen.nan_ = True
|
|
mask[i] = True
|
|
floats[i] = complexes[i] = val
|
|
elif util.is_bool_object(val):
|
|
seen.bool_ = True
|
|
bools[i] = val
|
|
elif util.is_float_object(val):
|
|
floats[i] = complexes[i] = val
|
|
seen.float_ = True
|
|
elif is_timedelta(val):
|
|
if convert_timedelta:
|
|
seen.timedelta_ = True
|
|
try:
|
|
itimedeltas[i] = convert_to_timedelta64(val, "ns").view("i8")
|
|
except OutOfBoundsTimedelta:
|
|
seen.object_ = True
|
|
break
|
|
break
|
|
else:
|
|
seen.object_ = True
|
|
break
|
|
elif util.is_integer_object(val):
|
|
seen.int_ = True
|
|
floats[i] = <float64_t>val
|
|
complexes[i] = <double complex>val
|
|
if not seen.null_:
|
|
seen.saw_int(val)
|
|
|
|
if ((seen.uint_ and seen.sint_) or
|
|
val > oUINT64_MAX or val < oINT64_MIN):
|
|
seen.object_ = True
|
|
break
|
|
|
|
if seen.uint_:
|
|
uints[i] = val
|
|
elif seen.sint_:
|
|
ints[i] = val
|
|
else:
|
|
uints[i] = val
|
|
ints[i] = val
|
|
|
|
elif util.is_complex_object(val):
|
|
complexes[i] = val
|
|
seen.complex_ = True
|
|
elif PyDateTime_Check(val) or util.is_datetime64_object(val):
|
|
|
|
# if we have an tz's attached then return the objects
|
|
if convert_datetime:
|
|
if getattr(val, 'tzinfo', None) is not None:
|
|
seen.datetimetz_ = True
|
|
break
|
|
else:
|
|
seen.datetime_ = True
|
|
try:
|
|
idatetimes[i] = convert_to_tsobject(
|
|
val, None, None, 0, 0).value
|
|
except OutOfBoundsDatetime:
|
|
seen.object_ = True
|
|
break
|
|
else:
|
|
seen.object_ = True
|
|
break
|
|
elif is_period_object(val):
|
|
if convert_period:
|
|
seen.period_ = True
|
|
break
|
|
else:
|
|
seen.object_ = True
|
|
break
|
|
elif try_float and not isinstance(val, str):
|
|
# this will convert Decimal objects
|
|
try:
|
|
floats[i] = float(val)
|
|
complexes[i] = complex(val)
|
|
seen.float_ = True
|
|
except (ValueError, TypeError):
|
|
seen.object_ = True
|
|
break
|
|
elif is_interval(val):
|
|
if convert_interval:
|
|
seen.interval_ = True
|
|
break
|
|
else:
|
|
seen.object_ = True
|
|
break
|
|
else:
|
|
seen.object_ = True
|
|
break
|
|
|
|
# we try to coerce datetime w/tz but must all have the same tz
|
|
if seen.datetimetz_:
|
|
if is_datetime_with_singletz_array(objects):
|
|
from pandas import DatetimeIndex
|
|
dti = DatetimeIndex(objects)
|
|
|
|
# unbox to DatetimeArray
|
|
return dti._data
|
|
seen.object_ = True
|
|
|
|
elif seen.datetime_:
|
|
if is_datetime_or_datetime64_array(objects):
|
|
from pandas import DatetimeIndex
|
|
|
|
try:
|
|
dti = DatetimeIndex(objects)
|
|
except OutOfBoundsDatetime:
|
|
pass
|
|
else:
|
|
# unbox to ndarray[datetime64[ns]]
|
|
return dti._data._ndarray
|
|
seen.object_ = True
|
|
|
|
elif seen.timedelta_:
|
|
if is_timedelta_or_timedelta64_array(objects):
|
|
from pandas import TimedeltaIndex
|
|
|
|
try:
|
|
tdi = TimedeltaIndex(objects)
|
|
except OutOfBoundsTimedelta:
|
|
pass
|
|
else:
|
|
# unbox to ndarray[timedelta64[ns]]
|
|
return tdi._data._ndarray
|
|
seen.object_ = True
|
|
|
|
if seen.period_:
|
|
if is_period_array(objects):
|
|
from pandas import PeriodIndex
|
|
pi = PeriodIndex(objects)
|
|
|
|
# unbox to PeriodArray
|
|
return pi._data
|
|
seen.object_ = True
|
|
|
|
if seen.interval_:
|
|
if is_interval_array(objects):
|
|
from pandas import IntervalIndex
|
|
ii = IntervalIndex(objects)
|
|
|
|
# unbox to IntervalArray
|
|
return ii._data
|
|
|
|
seen.object_ = True
|
|
|
|
if not seen.object_:
|
|
result = None
|
|
if not safe:
|
|
if seen.null_ or seen.nan_:
|
|
if seen.is_float_or_complex:
|
|
if seen.complex_:
|
|
result = complexes
|
|
elif seen.float_:
|
|
result = floats
|
|
elif seen.int_:
|
|
if convert_to_nullable_integer:
|
|
from pandas.core.arrays import IntegerArray
|
|
result = IntegerArray(ints, mask)
|
|
else:
|
|
result = floats
|
|
elif seen.nan_:
|
|
result = floats
|
|
else:
|
|
if not seen.bool_:
|
|
if seen.datetime_:
|
|
if not seen.numeric_ and not seen.timedelta_:
|
|
result = datetimes
|
|
elif seen.timedelta_:
|
|
if not seen.numeric_:
|
|
result = timedeltas
|
|
elif seen.nat_:
|
|
if not seen.numeric_:
|
|
if convert_datetime and convert_timedelta:
|
|
dtype = dtype_if_all_nat
|
|
if dtype is not None:
|
|
# otherwise we keep object dtype
|
|
result = _infer_all_nats(
|
|
dtype, datetimes, timedeltas
|
|
)
|
|
|
|
elif convert_datetime:
|
|
result = datetimes
|
|
elif convert_timedelta:
|
|
result = timedeltas
|
|
else:
|
|
if seen.complex_:
|
|
result = complexes
|
|
elif seen.float_:
|
|
result = floats
|
|
elif seen.int_:
|
|
if seen.uint_:
|
|
result = uints
|
|
else:
|
|
result = ints
|
|
elif seen.is_bool:
|
|
result = bools.view(np.bool_)
|
|
|
|
else:
|
|
# don't cast int to float, etc.
|
|
if seen.null_:
|
|
if seen.is_float_or_complex:
|
|
if seen.complex_:
|
|
if not seen.int_:
|
|
result = complexes
|
|
elif seen.float_ or seen.nan_:
|
|
if not seen.int_:
|
|
result = floats
|
|
else:
|
|
if not seen.bool_:
|
|
if seen.datetime_:
|
|
if not seen.numeric_ and not seen.timedelta_:
|
|
result = datetimes
|
|
elif seen.timedelta_:
|
|
if not seen.numeric_:
|
|
result = timedeltas
|
|
elif seen.nat_:
|
|
if not seen.numeric_:
|
|
if convert_datetime and convert_timedelta:
|
|
dtype = dtype_if_all_nat
|
|
if dtype is not None:
|
|
# otherwise we keep object dtype
|
|
result = _infer_all_nats(
|
|
dtype, datetimes, timedeltas
|
|
)
|
|
|
|
elif convert_datetime:
|
|
result = datetimes
|
|
elif convert_timedelta:
|
|
result = timedeltas
|
|
else:
|
|
if seen.complex_:
|
|
if not seen.int_:
|
|
result = complexes
|
|
elif seen.float_ or seen.nan_:
|
|
if not seen.int_:
|
|
result = floats
|
|
elif seen.int_:
|
|
if seen.uint_:
|
|
result = uints
|
|
else:
|
|
result = ints
|
|
elif seen.is_bool and not seen.nan_:
|
|
result = bools.view(np.bool_)
|
|
|
|
if result is uints or result is ints or result is floats or result is complexes:
|
|
# cast to the largest itemsize when all values are NumPy scalars
|
|
if itemsize_max > 0 and itemsize_max != result.dtype.itemsize:
|
|
result = result.astype(result.dtype.kind + str(itemsize_max))
|
|
return result
|
|
elif result is not None:
|
|
return result
|
|
|
|
return objects
|
|
|
|
|
|
cdef _infer_all_nats(dtype, ndarray datetimes, ndarray timedeltas):
|
|
"""
|
|
If we have all-NaT values, cast these to the given dtype.
|
|
"""
|
|
if cnp.PyArray_DescrCheck(dtype):
|
|
# i.e. isinstance(dtype, np.dtype):
|
|
if dtype == "M8[ns]":
|
|
result = datetimes
|
|
elif dtype == "m8[ns]":
|
|
result = timedeltas
|
|
else:
|
|
raise ValueError(dtype)
|
|
else:
|
|
# ExtensionDtype
|
|
cls = dtype.construct_array_type()
|
|
i8vals = cnp.PyArray_EMPTY(1, datetimes.shape, cnp.NPY_INT64, 0)
|
|
i8vals.fill(NPY_NAT)
|
|
result = cls(i8vals, dtype=dtype)
|
|
return result
|
|
|
|
|
|
class _NoDefault(Enum):
|
|
# We make this an Enum
|
|
# 1) because it round-trips through pickle correctly (see GH#40397)
|
|
# 2) because mypy does not understand singletons
|
|
no_default = "NO_DEFAULT"
|
|
|
|
def __repr__(self) -> str:
|
|
return "<no_default>"
|
|
|
|
|
|
# Note: no_default is exported to the public API in pandas.api.extensions
|
|
no_default = _NoDefault.no_default # Sentinel indicating the default value.
|
|
NoDefault = Literal[_NoDefault.no_default]
|
|
|
|
|
|
@cython.boundscheck(False)
|
|
@cython.wraparound(False)
|
|
def map_infer_mask(ndarray arr, object f, const uint8_t[:] mask, bint convert=True,
|
|
object na_value=no_default, cnp.dtype dtype=np.dtype(object)
|
|
) -> np.ndarray:
|
|
"""
|
|
Substitute for np.vectorize with pandas-friendly dtype inference.
|
|
|
|
Parameters
|
|
----------
|
|
arr : ndarray
|
|
f : function
|
|
mask : ndarray
|
|
uint8 dtype ndarray indicating values not to apply `f` to.
|
|
convert : bool, default True
|
|
Whether to call `maybe_convert_objects` on the resulting ndarray
|
|
na_value : Any, optional
|
|
The result value to use for masked values. By default, the
|
|
input value is used
|
|
dtype : numpy.dtype
|
|
The numpy dtype to use for the result ndarray.
|
|
|
|
Returns
|
|
-------
|
|
np.ndarray
|
|
"""
|
|
cdef:
|
|
Py_ssize_t i, n
|
|
ndarray result
|
|
object val
|
|
|
|
n = len(arr)
|
|
result = np.empty(n, dtype=dtype)
|
|
for i in range(n):
|
|
if mask[i]:
|
|
if na_value is no_default:
|
|
val = arr[i]
|
|
else:
|
|
val = na_value
|
|
else:
|
|
val = f(arr[i])
|
|
|
|
if cnp.PyArray_IsZeroDim(val):
|
|
# unbox 0-dim arrays, GH#690
|
|
val = val.item()
|
|
|
|
result[i] = val
|
|
|
|
if convert:
|
|
return maybe_convert_objects(result,
|
|
try_float=False,
|
|
convert_datetime=False,
|
|
convert_timedelta=False)
|
|
|
|
return result
|
|
|
|
|
|
@cython.boundscheck(False)
|
|
@cython.wraparound(False)
|
|
def map_infer(
|
|
ndarray arr, object f, bint convert=True, bint ignore_na=False
|
|
) -> np.ndarray:
|
|
"""
|
|
Substitute for np.vectorize with pandas-friendly dtype inference.
|
|
|
|
Parameters
|
|
----------
|
|
arr : ndarray
|
|
f : function
|
|
convert : bint
|
|
ignore_na : bint
|
|
If True, NA values will not have f applied
|
|
|
|
Returns
|
|
-------
|
|
np.ndarray
|
|
"""
|
|
cdef:
|
|
Py_ssize_t i, n
|
|
ndarray[object] result
|
|
object val
|
|
|
|
n = len(arr)
|
|
result = cnp.PyArray_EMPTY(1, arr.shape, cnp.NPY_OBJECT, 0)
|
|
for i in range(n):
|
|
if ignore_na and checknull(arr[i]):
|
|
result[i] = arr[i]
|
|
continue
|
|
val = f(arr[i])
|
|
|
|
if cnp.PyArray_IsZeroDim(val):
|
|
# unbox 0-dim arrays, GH#690
|
|
val = val.item()
|
|
|
|
result[i] = val
|
|
|
|
if convert:
|
|
return maybe_convert_objects(result,
|
|
try_float=False,
|
|
convert_datetime=False,
|
|
convert_timedelta=False)
|
|
|
|
return result
|
|
|
|
|
|
def to_object_array(rows: object, min_width: int = 0) -> ndarray:
|
|
"""
|
|
Convert a list of lists into an object array.
|
|
|
|
Parameters
|
|
----------
|
|
rows : 2-d array (N, K)
|
|
List of lists to be converted into an array.
|
|
min_width : int
|
|
Minimum width of the object array. If a list
|
|
in `rows` contains fewer than `width` elements,
|
|
the remaining elements in the corresponding row
|
|
will all be `NaN`.
|
|
|
|
Returns
|
|
-------
|
|
np.ndarray[object, ndim=2]
|
|
"""
|
|
cdef:
|
|
Py_ssize_t i, j, n, k, tmp
|
|
ndarray[object, ndim=2] result
|
|
list row
|
|
|
|
rows = list(rows)
|
|
n = len(rows)
|
|
|
|
k = min_width
|
|
for i in range(n):
|
|
tmp = len(rows[i])
|
|
if tmp > k:
|
|
k = tmp
|
|
|
|
result = np.empty((n, k), dtype=object)
|
|
|
|
for i in range(n):
|
|
row = list(rows[i])
|
|
|
|
for j in range(len(row)):
|
|
result[i, j] = row[j]
|
|
|
|
return result
|
|
|
|
|
|
def tuples_to_object_array(ndarray[object] tuples):
|
|
cdef:
|
|
Py_ssize_t i, j, n, k, tmp
|
|
ndarray[object, ndim=2] result
|
|
tuple tup
|
|
|
|
n = len(tuples)
|
|
k = len(tuples[0])
|
|
result = np.empty((n, k), dtype=object)
|
|
for i in range(n):
|
|
tup = tuples[i]
|
|
for j in range(k):
|
|
result[i, j] = tup[j]
|
|
|
|
return result
|
|
|
|
|
|
def to_object_array_tuples(rows: object) -> np.ndarray:
|
|
"""
|
|
Convert a list of tuples into an object array. Any subclass of
|
|
tuple in `rows` will be casted to tuple.
|
|
|
|
Parameters
|
|
----------
|
|
rows : 2-d array (N, K)
|
|
List of tuples to be converted into an array.
|
|
|
|
Returns
|
|
-------
|
|
np.ndarray[object, ndim=2]
|
|
"""
|
|
cdef:
|
|
Py_ssize_t i, j, n, k, tmp
|
|
ndarray[object, ndim=2] result
|
|
tuple row
|
|
|
|
rows = list(rows)
|
|
n = len(rows)
|
|
|
|
k = 0
|
|
for i in range(n):
|
|
tmp = 1 if checknull(rows[i]) else len(rows[i])
|
|
if tmp > k:
|
|
k = tmp
|
|
|
|
result = np.empty((n, k), dtype=object)
|
|
|
|
try:
|
|
for i in range(n):
|
|
row = rows[i]
|
|
for j in range(len(row)):
|
|
result[i, j] = row[j]
|
|
except TypeError:
|
|
# e.g. "Expected tuple, got list"
|
|
# upcast any subclasses to tuple
|
|
for i in range(n):
|
|
row = (rows[i],) if checknull(rows[i]) else tuple(rows[i])
|
|
for j in range(len(row)):
|
|
result[i, j] = row[j]
|
|
|
|
return result
|
|
|
|
|
|
@cython.wraparound(False)
|
|
@cython.boundscheck(False)
|
|
def fast_multiget(dict mapping, ndarray keys, default=np.nan) -> np.ndarray:
|
|
cdef:
|
|
Py_ssize_t i, n = len(keys)
|
|
object val
|
|
ndarray[object] output = np.empty(n, dtype='O')
|
|
|
|
if n == 0:
|
|
# kludge, for Series
|
|
return np.empty(0, dtype='f8')
|
|
|
|
for i in range(n):
|
|
val = keys[i]
|
|
if val in mapping:
|
|
output[i] = mapping[val]
|
|
else:
|
|
output[i] = default
|
|
|
|
return maybe_convert_objects(output)
|
|
|
|
|
|
def is_bool_list(obj: list) -> bool:
|
|
"""
|
|
Check if this list contains only bool or np.bool_ objects.
|
|
|
|
This is appreciably faster than checking `np.array(obj).dtype == bool`
|
|
|
|
obj1 = [True, False] * 100
|
|
obj2 = obj1 * 100
|
|
obj3 = obj2 * 100
|
|
obj4 = [True, None] + obj1
|
|
|
|
for obj in [obj1, obj2, obj3, obj4]:
|
|
%timeit is_bool_list(obj)
|
|
%timeit np.array(obj).dtype.kind == "b"
|
|
|
|
340 ns ± 8.22 ns
|
|
8.78 µs ± 253 ns
|
|
|
|
28.8 µs ± 704 ns
|
|
813 µs ± 17.8 µs
|
|
|
|
3.4 ms ± 168 µs
|
|
78.4 ms ± 1.05 ms
|
|
|
|
48.1 ns ± 1.26 ns
|
|
8.1 µs ± 198 ns
|
|
"""
|
|
cdef:
|
|
object item
|
|
|
|
for item in obj:
|
|
if not util.is_bool_object(item):
|
|
return False
|
|
|
|
# Note: we return True for empty list
|
|
return True
|
|
|
|
|
|
cpdef ndarray eq_NA_compat(ndarray[object] arr, object key):
|
|
"""
|
|
Check for `arr == key`, treating all values as not-equal to pd.NA.
|
|
|
|
key is assumed to have `not isna(key)`
|
|
"""
|
|
cdef:
|
|
ndarray[uint8_t, cast=True] result = cnp.PyArray_EMPTY(arr.ndim, arr.shape, cnp.NPY_BOOL, 0)
|
|
Py_ssize_t i
|
|
object item
|
|
|
|
for i in range(len(arr)):
|
|
item = arr[i]
|
|
if item is C_NA:
|
|
result[i] = False
|
|
else:
|
|
result[i] = item == key
|
|
|
|
return result
|
|
|
|
|
|
def dtypes_all_equal(list types not None) -> bool:
|
|
"""
|
|
Faster version for:
|
|
|
|
first = types[0]
|
|
all(is_dtype_equal(first, t) for t in types[1:])
|
|
|
|
And assuming all elements in the list are np.dtype/ExtensionDtype objects
|
|
|
|
See timings at https://github.com/pandas-dev/pandas/pull/44594
|
|
"""
|
|
first = types[0]
|
|
for t in types[1:]:
|
|
try:
|
|
if not t == first:
|
|
return False
|
|
except (TypeError, AttributeError):
|
|
return False
|
|
else:
|
|
return True
|