1021 lines
34 KiB
Python
1021 lines
34 KiB
Python
"""Implements operations on bytes and str (unicode) array items."""
|
|
import operator
|
|
import numpy as np
|
|
from llvmlite import ir
|
|
|
|
from numba.core import types, cgutils
|
|
from numba.core.extending import (overload, intrinsic, overload_method,
|
|
lower_cast, register_jitable)
|
|
from numba.core.cgutils import is_nonelike
|
|
from numba.cpython import unicode
|
|
|
|
# Items of bytes and str arrays are of type CharSeq and UnicodeCharSeq,
# respectively. See numpy/types/npytypes.py for the CharSeq and
# UnicodeCharSeq definitions. The corresponding data models are
# defined in numpy/datamodel/models.py. Boxing/unboxing of the item types
# is defined in numpy/targets/boxing.py, see box_unicodecharseq,
# unbox_unicodecharseq, box_charseq, unbox_charseq.
|
|
|
|
# NumPy dtype describing a one-character bytes item.
s1_dtype = np.dtype('S1')
# Sanity check: a single 'S1' item must occupy exactly one byte.
assert s1_dtype.itemsize == 1
# Numba Bytes type used by this module when it constructs bytes values
# (see _make_constant_bytes and _unicode_to_bytes).
# NOTE(review): the arguments presumably mean uint8 items, 1-d, C layout,
# read-only -- confirm against the types.Bytes constructor.
bytes_type = types.Bytes(types.uint8, 1, "C", readonly=True)
|
|
|
|
# Currently, NumPy supports only UTF-32 arrays but this may change in
|
|
# future and the approach used here for supporting str arrays may need
|
|
# a revision depending on how NumPy will support UTF-8 and UTF-16
|
|
# arrays.
|
|
# NumPy dtype describing a one-character str item.
u1_dtype = np.dtype('U1')
# Number of bytes NumPy uses to store one character of a str item.
unicode_byte_width = u1_dtype.itemsize
# Unsigned integer NumPy type having the same width as one character.
unicode_uint = {1: np.uint8, 2: np.uint16, 4: np.uint32}[unicode_byte_width]
# Numba unicode "kind" constant matching that character width.
unicode_kind = {1: unicode.PY_UNICODE_1BYTE_KIND,
                2: unicode.PY_UNICODE_2BYTE_KIND,
                4: unicode.PY_UNICODE_4BYTE_KIND}[unicode_byte_width]
|
|
|
|
|
|
# This is a modified version of numba.unicode.make_deref_codegen
|
|
def make_deref_codegen(bitsize):
    """Build a lowering function that reads one ``bitsize``-bit item.

    The returned ``codegen(context, builder, signature, args)`` stores the
    ``data`` argument in a stack slot, reinterprets that slot as a pointer
    to ``bitsize``-bit integers, loads the item at ``idx`` and zero-extends
    it to a 32-bit integer.
    """
    def codegen(context, builder, signature, args):
        value, index = args
        # Spill the value to the stack so that its items are addressable.
        slot = cgutils.alloca_once_value(builder, value=value)
        item_ptr_type = ir.IntType(bitsize).as_pointer()
        items = builder.bitcast(slot, item_ptr_type)
        item_ptr = builder.gep(items, [index])
        item = builder.load(item_ptr)
        return builder.zext(item, ir.IntType(32))

    return codegen
|
|
|
|
|
|
@intrinsic
def deref_uint8(typingctx, data, offset):
    """Intrinsic: read item ``offset`` of ``data`` as an 8-bit integer.

    The result is typed as ``uint32``.
    """
    signature = types.uint32(data, types.intp)
    lowering = make_deref_codegen(8)
    return signature, lowering
|
|
|
|
|
|
@intrinsic
def deref_uint16(typingctx, data, offset):
    """Intrinsic: read item ``offset`` of ``data`` as a 16-bit integer.

    The result is typed as ``uint32``.
    """
    signature = types.uint32(data, types.intp)
    lowering = make_deref_codegen(16)
    return signature, lowering
|
|
|
|
|
|
@intrinsic
def deref_uint32(typingctx, data, offset):
    """Intrinsic: read item ``offset`` of ``data`` as a 32-bit integer.

    The result is typed as ``uint32``.
    """
    signature = types.uint32(data, types.intp)
    lowering = make_deref_codegen(32)
    return signature, lowering
|
|
|
|
|
|
@register_jitable(_nrt=False)
def charseq_get_code(a, i):
    """Return the integer code stored at position ``i`` of CharSeq ``a``.
    """
    code = deref_uint8(a, i)
    return code
|
|
|
|
|
|
@register_jitable
def charseq_get_value(a, i):
    """Return the code at position ``i`` of CharSeq ``a``.

    A null code marks a position past the end of the stored value, so it
    is reported as an IndexError.
    """
    value = charseq_get_code(a, i)
    if value != 0:
        return value
    raise IndexError('index out of range')
|
|
|
|
|
|
@register_jitable(_nrt=False)
def unicode_charseq_get_code(a, i):
    """Return the integer code stored at position ``i`` of
    UnicodeCharSeq ``a``.

    The item width used for the read follows ``unicode_byte_width``.
    """
    if unicode_byte_width == 4:
        return deref_uint32(a, i)
    if unicode_byte_width == 2:
        return deref_uint16(a, i)
    if unicode_byte_width == 1:
        return deref_uint8(a, i)
    raise NotImplementedError(
        'unicode_charseq_get_code: unicode_byte_width not in [1, 2, 4]')
|
|
|
|
|
|
@register_jitable
def unicode_get_code(a, i):
    """Return the code point at position ``i`` of UnicodeType ``a``.
    """
    code_point = unicode._get_code_point(a, i)
    return code_point
|
|
|
|
|
|
@register_jitable
def bytes_get_code(a, i):
    """Return the item at position ``i`` of Bytes ``a``.
    """
    item = a[i]
    return item
|
|
|
|
|
|
def _get_code_impl(a):
    """Pick the jitable code accessor matching the Numba type ``a``.

    Returns None when ``a`` is none of CharSeq, Bytes, UnicodeCharSeq or
    UnicodeType.
    """
    accessors = (
        (types.CharSeq, charseq_get_code),
        (types.Bytes, bytes_get_code),
        (types.UnicodeCharSeq, unicode_charseq_get_code),
        (types.UnicodeType, unicode_get_code),
    )
    for type_class, accessor in accessors:
        if isinstance(a, type_class):
            return accessor
    return None
|
|
|
|
|
|
def _same_kind(a, b):
    """Tell whether ``a`` and ``b`` are both bytes-like or both str-like.

    Bytes-like types are CharSeq and Bytes; str-like types are
    UnicodeCharSeq and UnicodeType.
    """
    families = (
        (types.CharSeq, types.Bytes),
        (types.UnicodeCharSeq, types.UnicodeType),
    )
    return any(isinstance(a, family) and isinstance(b, family)
               for family in families)
|
|
|
|
|
|
def _is_bytes(a):
    """Tell whether ``a`` is a bytes-like Numba type (CharSeq or Bytes)."""
    bytes_like = (types.CharSeq, types.Bytes)
    return isinstance(a, bytes_like)
|
|
|
|
|
|
def is_default(x, default):
    """Tell whether argument ``x`` was left at its default.

    That is the case when ``x`` compares equal to ``default`` or when it
    is an ``types.Omitted`` instance.
    """
    matches = x == default
    if matches:
        return matches
    return isinstance(x, types.Omitted)
|
|
|
|
|
|
@register_jitable
def unicode_charseq_get_value(a, i):
    """Return the character at position ``i`` of UnicodeCharSeq ``a``.

    A null code marks a position past the end of the stored value, so it
    is reported as an IndexError.
    """
    code = unicode_charseq_get_code(a, i)
    if code == 0:
        raise IndexError('index out of range')
    # NumPy equivalent of `chr(code)`: reinterpret the integer as a
    # one-character str item and extract it.
    as_integer = np.array(code, unicode_uint)
    as_character = as_integer.view(u1_dtype)
    return as_character[()]
|
|
|
|
|
|
#
|
|
# CAST
|
|
#
|
|
# Currently, the following casting operations are supported:
|
|
# Bytes -> CharSeq (ex: a=np.array(b'abc'); a[()] = b'123')
|
|
# UnicodeType -> UnicodeCharSeq (ex: a=np.array('abc'); a[()] = '123')
|
|
# CharSeq -> Bytes (ex: a=np.array(b'abc'); b = bytes(a[()]))
|
|
# UnicodeType -> Bytes (ex: str('123')._to_bytes())
|
|
#
|
|
# The following casting operations can be implemented when required:
|
|
# Bytes -> UnicodeCharSeq (ex: a=np.array('abc'); a[()] = b'123')
|
|
# UnicodeType -> CharSeq (ex: a=np.array(b'abc'); a[()] = '123')
|
|
# UnicodeType -> Bytes (ex: bytes('123', 'utf8'))
|
|
#
|
|
|
|
|
|
@lower_cast(types.Bytes, types.CharSeq)
def bytes_to_charseq(context, builder, fromty, toty, val):
    """Lower the cast of a Bytes value to a CharSeq value.

    At most ``toty.count`` items are copied from the source buffer into a
    stack-allocated CharSeq.  When the source holds fewer items than the
    destination, the destination is zero-filled before copying.  Returns
    the loaded CharSeq value.
    """
    # Wrap the incoming Bytes value to reach its data pointer and length.
    barr = cgutils.create_struct_proxy(fromty)(context, builder, value=val)
    src = builder.bitcast(barr.data, ir.IntType(8).as_pointer())
    src_length = barr.nitems

    # Reserve a stack slot for the result and address it as 8-bit items.
    lty = context.get_value_type(toty)
    dstint_t = ir.IntType(8)
    dst_ptr = cgutils.alloca_once(builder, lty)
    dst = builder.bitcast(dst_ptr, dstint_t.as_pointer())

    # Number of items to copy: the smaller of source and destination size.
    dst_length = ir.Constant(src_length.type, toty.count)
    is_shorter_value = builder.icmp_unsigned('<', src_length, dst_length)
    count = builder.select(is_shorter_value, src_length, dst_length)
    # The copy below will not reach the end of the destination, so clear
    # all of it first.
    with builder.if_then(is_shorter_value):
        cgutils.memset(builder,
                       dst,
                       ir.Constant(src_length.type,
                                   toty.count), 0)
    # Item-by-item copy of the first `count` items.
    with cgutils.for_range(builder, count) as loop:
        in_ptr = builder.gep(src, [loop.index])
        in_val = builder.zext(builder.load(in_ptr), dstint_t)
        builder.store(in_val, builder.gep(dst, [loop.index]))

    return builder.load(dst_ptr)
|
|
|
|
|
|
def _make_constant_bytes(context, builder, nbytes):
    """Emit code that allocates a ``bytes_type`` struct of ``nbytes`` items.

    ``nbytes`` is either a Python int, which is turned into an IR
    constant, or a value that is used as is.  The struct proxy is
    returned; this function itself does not write to the data buffer.
    """
    bstr_ctor = cgutils.create_struct_proxy(bytes_type)
    bstr = bstr_ctor(context, builder)

    # Accept a compile-time length as well as a runtime one.
    if isinstance(nbytes, int):
        nbytes = ir.Constant(bstr.nitems.type, nbytes)

    bstr.meminfo = context.nrt.meminfo_alloc(builder, nbytes)
    bstr.nitems = nbytes
    # Items are single bytes.
    bstr.itemsize = ir.Constant(bstr.itemsize.type, 1)
    bstr.data = context.nrt.meminfo_data(builder, bstr.meminfo)
    bstr.parent = cgutils.get_null_value(bstr.parent.type)
    # bstr.shape and bstr.strides are not used
    bstr.shape = cgutils.get_null_value(bstr.shape.type)
    bstr.strides = cgutils.get_null_value(bstr.strides.type)
    return bstr
|
|
|
|
|
|
@lower_cast(types.CharSeq, types.Bytes)
def charseq_to_bytes(context, builder, fromty, toty, val):
    """Lower the cast of a CharSeq value to a Bytes value.

    A new bytes struct sized after the LLVM value's ``count`` is
    allocated and the CharSeq content is copied into it.
    """
    result = _make_constant_bytes(context, builder, val.type.count)
    # Spill the CharSeq to the stack to obtain an address to copy from.
    slot = cgutils.alloca_once_value(builder, value=val)
    source = builder.bitcast(slot, result.data.type)
    cgutils.memcpy(builder, result.data, source, result.nitems)
    return result
|
|
|
|
|
|
@lower_cast(types.UnicodeType, types.Bytes)
def unicode_to_bytes_cast(context, builder, fromty, toty, val):
    """Lower the cast of a unicode_type value to a Bytes value.

    Emits a ValueError return when the string's kind is not 1; otherwise
    a new bytes struct of the string's length is allocated and the string
    data is copied into it.  The struct proxy is returned.
    """
    uni_str = cgutils.create_struct_proxy(fromty)(context, builder, value=val)
    src1 = builder.bitcast(uni_str.data, ir.IntType(8).as_pointer())
    # Only kind == 1 strings can be copied item for item into bytes.
    notkind1 = builder.icmp_unsigned('!=', uni_str.kind,
                                     ir.Constant(uni_str.kind.type, 1))
    src_length = uni_str.length

    with builder.if_then(notkind1):
        context.call_conv.return_user_exc(
            builder, ValueError,
            ("cannot cast higher than 8-bit unicode_type to bytes",))

    # One output item per input character.
    bstr = _make_constant_bytes(context, builder, src_length)
    cgutils.memcpy(builder, bstr.data, src1, bstr.nitems)
    return bstr
|
|
|
|
|
|
@intrinsic
def _unicode_to_bytes(typingctx, s):
    """Intrinsic converting a unicode_type value to ``bytes_type``.

    Used by the ``_to_bytes`` method; lowering is delegated to
    ``unicode_to_bytes_cast``.
    """
    assert s == types.unicode_type
    signature_out = bytes_type(s)

    def codegen(context, builder, signature, args):
        value = args[0]
        converted = unicode_to_bytes_cast(
            context, builder, s, bytes_type, value)
        return converted._getvalue()

    return signature_out, codegen
|
|
|
|
|
|
@lower_cast(types.UnicodeType, types.UnicodeCharSeq)
def unicode_to_unicode_charseq(context, builder, fromty, toty, val):
    """Lower the cast of a unicode_type value to a UnicodeCharSeq value.

    At most ``toty.count`` characters are copied from the string into a
    stack-allocated UnicodeCharSeq, each one zero-extended to
    ``unicode_byte_width`` bytes.  When the string is shorter than the
    destination, the destination is zero-filled before copying.  If the
    string's kind is wider than ``unicode_byte_width``, a ValueError
    return is emitted instead of a copy.  Returns the loaded value.
    """
    uni_str = cgutils.create_struct_proxy(fromty)(context, builder, value=val)
    # The same data pointer viewed with each possible source item width.
    src1 = builder.bitcast(uni_str.data, ir.IntType(8).as_pointer())
    src2 = builder.bitcast(uni_str.data, ir.IntType(16).as_pointer())
    src4 = builder.bitcast(uni_str.data, ir.IntType(32).as_pointer())
    kind1 = builder.icmp_unsigned('==', uni_str.kind,
                                  ir.Constant(uni_str.kind.type, 1))
    kind2 = builder.icmp_unsigned('==', uni_str.kind,
                                  ir.Constant(uni_str.kind.type, 2))
    kind4 = builder.icmp_unsigned('==', uni_str.kind,
                                  ir.Constant(uni_str.kind.type, 4))
    src_length = uni_str.length

    # Reserve a stack slot for the result and address it item by item.
    lty = context.get_value_type(toty)
    dstint_t = ir.IntType(8 * unicode_byte_width)
    dst_ptr = cgutils.alloca_once(builder, lty)
    dst = builder.bitcast(dst_ptr, dstint_t.as_pointer())

    # Number of items to copy: the smaller of source and destination size.
    dst_length = ir.Constant(src_length.type, toty.count)
    is_shorter_value = builder.icmp_unsigned('<', src_length, dst_length)
    count = builder.select(is_shorter_value, src_length, dst_length)
    # The copy below will not reach the end of the destination, so clear
    # all of it (size given in bytes) first.
    with builder.if_then(is_shorter_value):
        cgutils.memset(builder,
                       dst,
                       ir.Constant(src_length.type,
                                   toty.count * unicode_byte_width), 0)

    with builder.if_then(kind1):
        with cgutils.for_range(builder, count) as loop:
            in_ptr = builder.gep(src1, [loop.index])
            in_val = builder.zext(builder.load(in_ptr), dstint_t)
            builder.store(in_val, builder.gep(dst, [loop.index]))

    with builder.if_then(kind2):
        if unicode_byte_width >= 2:
            with cgutils.for_range(builder, count) as loop:
                in_ptr = builder.gep(src2, [loop.index])
                in_val = builder.zext(builder.load(in_ptr), dstint_t)
                builder.store(in_val, builder.gep(dst, [loop.index]))
        else:
            # The exception arguments must be a tuple (note the trailing
            # comma), as in unicode_to_bytes_cast.
            context.call_conv.return_user_exc(
                builder, ValueError,
                ("cannot cast 16-bit unicode_type to %s-bit %s"
                 % (unicode_byte_width * 8, toty),))

    with builder.if_then(kind4):
        if unicode_byte_width >= 4:
            with cgutils.for_range(builder, count) as loop:
                in_ptr = builder.gep(src4, [loop.index])
                in_val = builder.zext(builder.load(in_ptr), dstint_t)
                builder.store(in_val, builder.gep(dst, [loop.index]))
        else:
            # The exception arguments must be a tuple (note the trailing
            # comma), as in unicode_to_bytes_cast.
            context.call_conv.return_user_exc(
                builder, ValueError,
                ("cannot cast 32-bit unicode_type to %s-bit %s"
                 % (unicode_byte_width * 8, toty),))

    return builder.load(dst_ptr)
|
|
|
|
#
|
|
# Operations on bytes/str array items
|
|
#
|
|
# Implementation note: while some operations need
|
|
# CharSeq/UnicodeCharSeq specific implementations (getitem, len, str,
|
|
# etc), many operations can be supported by casting
|
|
# CharSeq/UnicodeCharSeq objects to Bytes/UnicodeType objects and
|
|
# re-use existing operations.
|
|
#
|
|
# However, in numba more operations are implemented for UnicodeType
|
|
# than for Bytes objects, hence the support for operations with bytes
|
|
# array items will be less complete than for str arrays. Although, in
|
|
# some cases (hash, contains, etc) the UnicodeType implementations can
|
|
# be reused for Bytes objects via using `_to_str` method.
|
|
#
|
|
|
|
|
|
@overload(operator.getitem)
def charseq_getitem(s, i):
    """Integer indexing of CharSeq and UnicodeCharSeq items.

    Valid indices are ``0 <= i < s.count``; anything else raises
    IndexError.  No implementation is provided for other type
    combinations.
    """
    if not isinstance(i, types.Integer):
        return None

    if isinstance(s, types.UnicodeCharSeq):
        fetch = unicode_charseq_get_value
    elif isinstance(s, types.CharSeq):
        fetch = charseq_get_value
    else:
        return None

    upper_bound = s.count
    message = 'index out of range [0, %s]' % (upper_bound - 1)

    def getitem_impl(s, i):
        if i < 0 or i >= upper_bound:
            raise IndexError(message)
        return fetch(s, i)

    return getitem_impl
|
|
|
|
|
|
@overload(len)
def charseq_len(s):
    """Length of CharSeq and UnicodeCharSeq items.

    Trailing null codes are not counted (numpy behavior).
    """
    if not isinstance(s, (types.CharSeq, types.UnicodeCharSeq)):
        return None

    read_code = _get_code_impl(s)
    capacity = s.count

    if capacity == 0:
        def empty_len(s):
            return 0

        return empty_len

    def trimmed_len(s):
        # Shrink from the full capacity until the last counted item is
        # a non-null code, or nothing is left.
        length = capacity
        while length > 0:
            if read_code(s, length - 1) != 0:
                break
            length -= 1
        return length

    return trimmed_len
|
|
|
|
|
|
@overload(operator.add)
@overload(operator.iadd)
def charseq_concat(a, b):
    """Concatenation involving bytes/str array items.

    Str-like operands are concatenated as strings; bytes-like operands
    are concatenated via their string form and converted back to bytes.
    """
    if not _same_kind(a, b):
        return None

    a_is_ucs = isinstance(a, types.UnicodeCharSeq)
    b_is_ucs = isinstance(b, types.UnicodeCharSeq)

    if a_is_ucs and isinstance(b, types.UnicodeType):
        def concat_item_str(a, b):
            left = str(a)
            return left + b

        return concat_item_str

    if b_is_ucs and isinstance(a, types.UnicodeType):
        def concat_str_item(a, b):
            right = str(b)
            return a + right

        return concat_str_item

    if a_is_ucs and b_is_ucs:
        def concat_items(a, b):
            left = str(a)
            right = str(b)
            return left + right

        return concat_items

    bytes_like = (types.CharSeq, types.Bytes)
    if isinstance(a, bytes_like) and isinstance(b, bytes_like):
        def concat_bytes(a, b):
            left = a._to_str()
            right = b._to_str()
            return (left + right)._to_bytes()

        return concat_bytes

    return None
|
|
|
|
|
|
@overload(operator.mul)
def charseq_repeat(a, b):
    """Repetition (``*``) involving bytes/str array items.

    The item operand is converted to a string, multiplied, and for
    bytes-like operands the result is converted back to bytes.
    """
    bytes_like = (types.CharSeq, types.Bytes)

    if isinstance(a, types.UnicodeCharSeq):
        def repeat_left_str(a, b):
            text = str(a)
            return text * b

        return repeat_left_str
    elif isinstance(b, types.UnicodeCharSeq):
        def repeat_right_str(a, b):
            text = str(b)
            return a * text

        return repeat_right_str
    elif isinstance(a, bytes_like):
        def repeat_left_bytes(a, b):
            text = a._to_str()
            return (text * b)._to_bytes()

        return repeat_left_bytes
    elif isinstance(b, bytes_like):
        def repeat_right_bytes(a, b):
            text = b._to_str()
            return (a * text)._to_bytes()

        return repeat_right_bytes

    return None
|
|
|
|
|
|
@overload(operator.not_)
def charseq_not(a):
    """Logical negation: true exactly when the item has length zero."""
    supported = (types.UnicodeCharSeq, types.CharSeq, types.Bytes)
    if not isinstance(a, supported):
        return None

    def not_impl(a):
        size = len(a)
        return size == 0

    return not_impl
|
|
|
|
|
|
@overload(operator.eq)
def charseq_eq(a, b):
    """Equality of two operands of the same kind (bytes-like or str-like).

    Operands are equal when they have the same length and equal codes at
    every position.
    """
    if not _same_kind(a, b):
        return None

    code_of_a = _get_code_impl(a)
    code_of_b = _get_code_impl(b)
    if code_of_a is None or code_of_b is None:
        return None

    def eq_impl(a, b):
        size = len(a)
        if len(b) != size:
            return False
        for pos in range(size):
            if code_of_a(a, pos) != code_of_b(b, pos):
                return False
        return True

    return eq_impl
|
|
|
|
|
|
@overload(operator.ne)
def charseq_ne(a, b):
    """Inequality of two same-kind operands, as the negation of ``==``."""
    if not _same_kind(a, b):
        return None

    code_of_a = _get_code_impl(a)
    code_of_b = _get_code_impl(b)
    if code_of_a is None or code_of_b is None:
        return None

    def ne_impl(a, b):
        equal = a == b
        return not equal

    return ne_impl
|
|
|
|
|
|
@overload(operator.lt)
def charseq_lt(a, b):
    """Lexicographic ``<`` of two same-kind operands.

    The first differing code decides; if one operand is a prefix of the
    other, the shorter one is the smaller.
    """
    if not _same_kind(a, b):
        return None

    code_of_a = _get_code_impl(a)
    code_of_b = _get_code_impl(b)
    if code_of_a is None or code_of_b is None:
        return None

    def lt_impl(a, b):
        size_a = len(a)
        size_b = len(b)
        for pos in range(min(size_a, size_b)):
            left = code_of_a(a, pos)
            right = code_of_b(b, pos)
            if left != right:
                return left < right
        return size_a < size_b

    return lt_impl
|
|
|
|
|
|
@overload(operator.gt)
def charseq_gt(a, b):
    """``a > b`` for two same-kind operands, evaluated as ``b < a``."""
    if not _same_kind(a, b):
        return None

    code_of_a = _get_code_impl(a)
    code_of_b = _get_code_impl(b)
    if code_of_a is None or code_of_b is None:
        return None

    def gt_impl(a, b):
        swapped_is_less = b < a
        return swapped_is_less

    return gt_impl
|
|
|
|
|
|
@overload(operator.le)
def charseq_le(a, b):
    """``a <= b`` for two same-kind operands, as the negation of ``>``."""
    if not _same_kind(a, b):
        return None

    code_of_a = _get_code_impl(a)
    code_of_b = _get_code_impl(b)
    if code_of_a is None or code_of_b is None:
        return None

    def le_impl(a, b):
        greater = a > b
        return not greater

    return le_impl
|
|
|
|
|
|
@overload(operator.ge)
def charseq_ge(a, b):
    """``a >= b`` for two same-kind operands, as the negation of ``<``."""
    if not _same_kind(a, b):
        return None

    code_of_a = _get_code_impl(a)
    code_of_b = _get_code_impl(b)
    if code_of_a is None or code_of_b is None:
        return None

    def ge_impl(a, b):
        less = a < b
        return not less

    return ge_impl
|
|
|
|
|
|
@overload(operator.contains)
def charseq_contains(a, b):
    """``b in a`` for two same-kind operands.

    Both operands are converted to strings and the string containment
    test is used.
    """
    if not _same_kind(a, b):
        return None

    code_of_a = _get_code_impl(a)
    code_of_b = _get_code_impl(b)
    if code_of_a is None or code_of_b is None:
        return None

    if _is_bytes(a):
        def contains_bytes(a, b):
            # `bytes(b) in bytes(a)` would be the natural spelling, but
            # numba Bytes does not implement contains, so the
            # `unicode_type` implementation is used instead.
            container = a._to_str()
            return b._to_str() in container

        return contains_bytes

    def contains_str(a, b):
        container = str(a)
        return str(b) in container

    return contains_str
|
|
|
|
|
|
@overload_method(types.UnicodeCharSeq, 'isascii')
@overload_method(types.CharSeq, 'isascii')
@overload_method(types.Bytes, 'isascii')
def charseq_isascii(s):
    """``isascii`` method: true when no code of ``s`` exceeds 127."""
    read_code = _get_code_impl(s)

    def impl(s):
        size = len(s)
        for pos in range(size):
            if read_code(s, pos) > 0x7f:
                return False
        return True

    return impl
|
|
|
|
|
|
@overload_method(types.UnicodeCharSeq, '_get_kind')
@overload_method(types.CharSeq, '_get_kind')
def charseq_get_kind(s):
    """``_get_kind`` method: smallest unicode kind able to hold ``s``.

    The kind is chosen from the largest code found in ``s``.
    """
    read_code = _get_code_impl(s)

    def impl(s):
        largest = 0
        for pos in range(len(s)):
            current = read_code(s, pos)
            if current > largest:
                largest = current
        if largest > 0xffff:
            return unicode.PY_UNICODE_4BYTE_KIND
        elif largest > 0xff:
            return unicode.PY_UNICODE_2BYTE_KIND
        else:
            return unicode.PY_UNICODE_1BYTE_KIND

    return impl
|
|
|
|
|
|
@overload_method(types.UnicodeType, '_to_bytes')
def unicode_to_bytes_mth(s):
    """``_to_bytes`` method: turn a unicode_type object into Bytes.

    Note: this method becomes unnecessary once all Python bytes
    operations are implemented for numba Bytes objects.
    """
    def impl(s):
        as_bytes = _unicode_to_bytes(s)
        return as_bytes

    return impl
|
|
|
|
|
|
@overload_method(types.CharSeq, '_to_str')
@overload_method(types.Bytes, '_to_str')
def charseq_to_str_mth(s):
    """``_to_str`` method: build a string holding the codes of ``s``.

    The result is a 1-byte-kind unicode string of the same length.

    Note: this method becomes unnecessary once all Python bytes
    operations are implemented for numba Bytes objects.
    """
    read_code = _get_code_impl(s)

    def tostr_impl(s):
        size = len(s)
        ascii_only = s.isascii()
        text = unicode._empty_string(
            unicode.PY_UNICODE_1BYTE_KIND, size, ascii_only)
        for pos in range(size):
            unicode._set_code_point(text, pos, read_code(s, pos))
        return text

    return tostr_impl
|
|
|
|
|
|
@overload_method(types.UnicodeCharSeq, "__str__")
def charseq_str(s):
    """``__str__`` method: build a unicode string from the codes of ``s``.

    The string kind is the one reported by ``s._get_kind()``.
    """
    read_code = _get_code_impl(s)

    def str_impl(s):
        size = len(s)
        kind = s._get_kind()
        ascii_only = kind == 1 and s.isascii()
        text = unicode._empty_string(kind, size, ascii_only)
        for pos in range(size):
            unicode._set_code_point(text, pos, read_code(s, pos))
        return text

    return str_impl
|
|
|
|
|
|
@overload(bytes)
def charseq_bytes(s):
    """``bytes(s)`` for CharSeq items: the argument is returned as is."""
    if not isinstance(s, types.CharSeq):
        return None
    return lambda s: s
|
|
|
|
|
|
@overload_method(types.UnicodeCharSeq, '__hash__')
def unicode_charseq_hash(s):
    """``__hash__`` method: hash of the item's string form."""
    def impl(s):
        text = str(s)
        return hash(text)

    return impl
|
|
|
|
|
|
@overload_method(types.CharSeq, '__hash__')
def charseq_hash(s):
    """``__hash__`` method: hash of the item converted via ``_to_str``."""
    def impl(s):
        # `hash(bytes(s))` would be the natural spelling, but numba Bytes
        # does not implement hash (yet). For a UTF-8 string `s` we have
        # hash(bytes(s)) == hash(s), so the CharSeq object is converted
        # to unicode_type and its hash implementation is reused.
        text = s._to_str()
        return hash(text)

    return impl
|
|
|
|
|
|
@overload_method(types.UnicodeCharSeq, 'isupper')
def unicode_charseq_isupper(s):
    """``isupper`` method, delegated to the item's string form."""
    def impl(s):
        flag = str(s).isupper()
        # unicode_type.isupper returns an int value (a bug there); the
        # double negation works around it.
        return not not flag

    return impl
|
|
|
|
|
|
@overload_method(types.CharSeq, 'isupper')
def charseq_isupper(s):
    """``isupper`` method, delegated to the string form of the item."""
    def impl(s):
        # TODO: implement isupper for Bytes and use bytes(s).isupper()
        flag = s._to_str().isupper()
        return not not flag

    return impl
|
|
|
|
|
|
@overload_method(types.UnicodeCharSeq, 'upper')
def unicode_charseq_upper(s):
    """``upper`` method, delegated to the item's string form."""
    def impl(s):
        text = str(s)
        return text.upper()

    return impl
|
|
|
|
|
|
@overload_method(types.CharSeq, 'upper')
def charseq_upper(s):
    """``upper`` method: upper-case via the string form, back to bytes."""
    def impl(s):
        # TODO: implement upper for Bytes and use bytes(s).upper()
        uppered = s._to_str().upper()
        return uppered._to_bytes()

    return impl
|
|
|
|
|
|
@overload_method(types.UnicodeCharSeq, 'find')
@overload_method(types.CharSeq, 'find')
@overload_method(types.Bytes, 'find')
def unicode_charseq_find(a, b):
    """``find`` method for bytes/str array items.

    Operands are converted to strings and ``str.find`` is used.  No
    implementation is provided for unsupported type combinations.
    """
    a_is_ucs = isinstance(a, types.UnicodeCharSeq)
    b_is_ucs = isinstance(b, types.UnicodeCharSeq)

    if a_is_ucs and b_is_ucs:
        def find_items(a, b):
            haystack = str(a)
            needle = str(b)
            return haystack.find(needle)

        return find_items

    if a_is_ucs and isinstance(b, types.UnicodeType):
        def find_str_in_item(a, b):
            haystack = str(a)
            return haystack.find(b)

        return find_str_in_item

    if isinstance(a, types.UnicodeType) and b_is_ucs:
        def find_item_in_str(a, b):
            needle = str(b)
            return a.find(needle)

        return find_item_in_str

    charseq_left = (isinstance(a, types.CharSeq)
                    and isinstance(b, (types.CharSeq, types.Bytes)))
    charseq_right = (isinstance(a, types.Bytes)
                     and isinstance(b, types.CharSeq))
    if charseq_left or charseq_right:
        def find_bytes(a, b):
            haystack = a._to_str()
            needle = b._to_str()
            return haystack.find(needle)

        return find_bytes

    return None
|
|
|
|
|
|
@overload_method(types.UnicodeCharSeq, 'rfind')
@overload_method(types.CharSeq, 'rfind')
@overload_method(types.Bytes, 'rfind')
def unicode_charseq_rfind(a, b):
    """``rfind`` method for bytes/str array items.

    Operands are converted to strings and ``str.rfind`` is used.  No
    implementation is provided for unsupported type combinations.
    """
    a_is_ucs = isinstance(a, types.UnicodeCharSeq)
    b_is_ucs = isinstance(b, types.UnicodeCharSeq)

    if a_is_ucs and b_is_ucs:
        def rfind_items(a, b):
            haystack = str(a)
            needle = str(b)
            return haystack.rfind(needle)

        return rfind_items

    if a_is_ucs and isinstance(b, types.UnicodeType):
        def rfind_str_in_item(a, b):
            haystack = str(a)
            return haystack.rfind(b)

        return rfind_str_in_item

    if isinstance(a, types.UnicodeType) and b_is_ucs:
        def rfind_item_in_str(a, b):
            needle = str(b)
            return a.rfind(needle)

        return rfind_item_in_str

    charseq_left = (isinstance(a, types.CharSeq)
                    and isinstance(b, (types.CharSeq, types.Bytes)))
    charseq_right = (isinstance(a, types.Bytes)
                     and isinstance(b, types.CharSeq))
    if charseq_left or charseq_right:
        def rfind_bytes(a, b):
            haystack = a._to_str()
            needle = b._to_str()
            return haystack.rfind(needle)

        return rfind_bytes

    return None
|
|
|
|
|
|
@overload_method(types.UnicodeCharSeq, 'startswith')
@overload_method(types.CharSeq, 'startswith')
@overload_method(types.Bytes, 'startswith')
def unicode_charseq_startswith(a, b):
    """``startswith`` method for bytes/str array items.

    Operands are converted to strings and ``str.startswith`` is used.
    """
    bytes_like = (types.CharSeq, types.Bytes)

    if isinstance(a, types.UnicodeCharSeq):
        if isinstance(b, types.UnicodeCharSeq):
            def startswith_item(a, b):
                text = str(a)
                prefix = str(b)
                return text.startswith(prefix)

            return startswith_item
        if isinstance(b, types.UnicodeType):
            def startswith_str(a, b):
                text = str(a)
                return text.startswith(b)

            return startswith_str
    elif isinstance(a, bytes_like) and isinstance(b, bytes_like):
        def startswith_bytes(a, b):
            text = a._to_str()
            prefix = b._to_str()
            return text.startswith(prefix)

        return startswith_bytes

    return None
|
|
|
|
|
|
@overload_method(types.UnicodeCharSeq, 'endswith')
@overload_method(types.CharSeq, 'endswith')
@overload_method(types.Bytes, 'endswith')
def unicode_charseq_endswith(a, b):
    """``endswith`` method for bytes/str array items.

    Operands are converted to strings and ``str.endswith`` is used.
    """
    bytes_like = (types.CharSeq, types.Bytes)

    if isinstance(a, types.UnicodeCharSeq):
        if isinstance(b, types.UnicodeCharSeq):
            def endswith_item(a, b):
                text = str(a)
                suffix = str(b)
                return text.endswith(suffix)

            return endswith_item
        if isinstance(b, types.UnicodeType):
            def endswith_str(a, b):
                text = str(a)
                return text.endswith(b)

            return endswith_str
    elif isinstance(a, bytes_like) and isinstance(b, bytes_like):
        def endswith_bytes(a, b):
            text = a._to_str()
            suffix = b._to_str()
            return text.endswith(suffix)

        return endswith_bytes

    return None
|
|
|
|
|
|
@register_jitable
def _map_bytes(seq):
    """Return a list with ``_to_bytes()`` applied to each item of ``seq``.
    """
    converted = [item._to_bytes() for item in seq]
    return converted
|
|
|
|
|
|
@overload_method(types.UnicodeCharSeq, 'split')
@overload_method(types.CharSeq, 'split')
@overload_method(types.Bytes, 'split')
def unicode_charseq_split(a, sep=None, maxsplit=-1):
    """``split`` method for bytes/str array items.

    The work is delegated to ``str.split``; for bytes-like operands the
    resulting parts are converted back to bytes.  No implementation is
    provided for unsupported ``sep``/``maxsplit`` types.
    """
    maxsplit_types = (types.Omitted, types.Integer, types.IntegerLiteral)
    maxsplit_ok = maxsplit == -1 or isinstance(maxsplit, maxsplit_types)
    if not maxsplit_ok:
        return None

    bytes_like = (types.CharSeq, types.Bytes)

    if isinstance(a, types.UnicodeCharSeq):
        if isinstance(sep, types.UnicodeCharSeq):
            def split_on_item(a, sep=None, maxsplit=-1):
                text = str(a)
                return text.split(sep=str(sep), maxsplit=maxsplit)

            return split_on_item

        if isinstance(sep, types.UnicodeType):
            def split_on_str(a, sep=None, maxsplit=-1):
                text = str(a)
                return text.split(sep=sep, maxsplit=maxsplit)

            return split_on_str

        if is_nonelike(sep):
            if is_default(maxsplit, -1):
                def split_whitespace(a, sep=None, maxsplit=-1):
                    text = str(a)
                    return text.split()

                return split_whitespace

            def split_whitespace_limited(a, sep=None, maxsplit=-1):
                text = str(a)
                return text.split(maxsplit=maxsplit)

            return split_whitespace_limited

    if isinstance(a, bytes_like):
        if isinstance(sep, bytes_like):
            def split_bytes_on_sep(a, sep=None, maxsplit=-1):
                text = a._to_str()
                parts = text.split(sep._to_str(), maxsplit=maxsplit)
                return _map_bytes(parts)

            return split_bytes_on_sep

        if is_nonelike(sep):
            if is_default(maxsplit, -1):
                def split_bytes_whitespace(a, sep=None, maxsplit=-1):
                    parts = a._to_str().split()
                    return _map_bytes(parts)

                return split_bytes_whitespace

            def split_bytes_whitespace_limited(a, sep=None, maxsplit=-1):
                parts = a._to_str().split(maxsplit=maxsplit)
                return _map_bytes(parts)

            return split_bytes_whitespace_limited

    return None
|
|
|
|
# NOT IMPLEMENTED: rsplit
|
|
|
|
|
|
@overload_method(types.UnicodeCharSeq, 'ljust')
@overload_method(types.CharSeq, 'ljust')
@overload_method(types.Bytes, 'ljust')
def unicode_charseq_ljust(a, width, fillchar=' '):
    """``ljust`` method for bytes/str array items.

    The work is delegated to ``str.ljust``; bytes-like operands get a
    bytes result.
    """
    bytes_like = (types.CharSeq, types.Bytes)

    if isinstance(a, types.UnicodeCharSeq):
        if is_default(fillchar, ' '):
            def ljust_default(a, width, fillchar=' '):
                text = str(a)
                return text.ljust(width)

            return ljust_default
        if isinstance(fillchar, types.UnicodeCharSeq):
            def ljust_item_fill(a, width, fillchar=' '):
                text = str(a)
                return text.ljust(width, str(fillchar))

            return ljust_item_fill
        if isinstance(fillchar, types.UnicodeType):
            def ljust_str_fill(a, width, fillchar=' '):
                text = str(a)
                return text.ljust(width, fillchar)

            return ljust_str_fill

    if isinstance(a, bytes_like):
        if is_default(fillchar, ' ') or is_default(fillchar, b' '):
            def ljust_bytes_default(a, width, fillchar=' '):
                padded = a._to_str().ljust(width)
                return padded._to_bytes()

            return ljust_bytes_default
        if isinstance(fillchar, bytes_like):
            def ljust_bytes_fill(a, width, fillchar=' '):
                padded = a._to_str().ljust(width, fillchar._to_str())
                return padded._to_bytes()

            return ljust_bytes_fill

    return None
|
|
|
|
|
|
@overload_method(types.UnicodeCharSeq, 'rjust')
@overload_method(types.CharSeq, 'rjust')
@overload_method(types.Bytes, 'rjust')
def unicode_charseq_rjust(a, width, fillchar=' '):
    """``rjust`` method for bytes/str array items.

    The work is delegated to ``str.rjust``; bytes-like operands get a
    bytes result.
    """
    bytes_like = (types.CharSeq, types.Bytes)

    if isinstance(a, types.UnicodeCharSeq):
        if is_default(fillchar, ' '):
            def rjust_default(a, width, fillchar=' '):
                text = str(a)
                return text.rjust(width)

            return rjust_default
        if isinstance(fillchar, types.UnicodeCharSeq):
            def rjust_item_fill(a, width, fillchar=' '):
                text = str(a)
                return text.rjust(width, str(fillchar))

            return rjust_item_fill
        if isinstance(fillchar, types.UnicodeType):
            def rjust_str_fill(a, width, fillchar=' '):
                text = str(a)
                return text.rjust(width, fillchar)

            return rjust_str_fill

    if isinstance(a, bytes_like):
        if is_default(fillchar, ' ') or is_default(fillchar, b' '):
            def rjust_bytes_default(a, width, fillchar=' '):
                padded = a._to_str().rjust(width)
                return padded._to_bytes()

            return rjust_bytes_default
        if isinstance(fillchar, bytes_like):
            def rjust_bytes_fill(a, width, fillchar=' '):
                padded = a._to_str().rjust(width, fillchar._to_str())
                return padded._to_bytes()

            return rjust_bytes_fill

    return None
|
|
|
|
|
|
@overload_method(types.UnicodeCharSeq, 'center')
@overload_method(types.CharSeq, 'center')
@overload_method(types.Bytes, 'center')
def unicode_charseq_center(a, width, fillchar=' '):
    """``center`` method for bytes/str array items.

    The work is delegated to ``str.center``; bytes-like operands get a
    bytes result.
    """
    bytes_like = (types.CharSeq, types.Bytes)

    if isinstance(a, types.UnicodeCharSeq):
        if is_default(fillchar, ' '):
            def center_default(a, width, fillchar=' '):
                text = str(a)
                return text.center(width)

            return center_default
        if isinstance(fillchar, types.UnicodeCharSeq):
            def center_item_fill(a, width, fillchar=' '):
                text = str(a)
                return text.center(width, str(fillchar))

            return center_item_fill
        if isinstance(fillchar, types.UnicodeType):
            def center_str_fill(a, width, fillchar=' '):
                text = str(a)
                return text.center(width, fillchar)

            return center_str_fill

    if isinstance(a, bytes_like):
        if is_default(fillchar, ' ') or is_default(fillchar, b' '):
            def center_bytes_default(a, width, fillchar=' '):
                padded = a._to_str().center(width)
                return padded._to_bytes()

            return center_bytes_default
        if isinstance(fillchar, bytes_like):
            def center_bytes_fill(a, width, fillchar=' '):
                padded = a._to_str().center(width, fillchar._to_str())
                return padded._to_bytes()

            return center_bytes_fill

    return None
|
|
|
|
|
|
@overload_method(types.UnicodeCharSeq, 'zfill')
@overload_method(types.CharSeq, 'zfill')
@overload_method(types.Bytes, 'zfill')
def unicode_charseq_zfill(a, width):
    """``zfill`` method for bytes/str array items.

    The work is delegated to ``str.zfill``; bytes-like operands get a
    bytes result.
    """
    if isinstance(a, types.UnicodeCharSeq):
        def zfill_str(a, width):
            text = str(a)
            return text.zfill(width)

        return zfill_str

    if isinstance(a, (types.CharSeq, types.Bytes)):
        def zfill_bytes(a, width):
            filled = a._to_str().zfill(width)
            return filled._to_bytes()

        return zfill_bytes

    return None
|
|
|
|
|
|
@overload_method(types.UnicodeCharSeq, 'lstrip')
@overload_method(types.CharSeq, 'lstrip')
@overload_method(types.Bytes, 'lstrip')
def unicode_charseq_lstrip(a, chars=None):
    """``lstrip`` method for bytes/str array items.

    The work is delegated to ``str.lstrip``; bytes-like operands get a
    bytes result.
    """
    bytes_like = (types.CharSeq, types.Bytes)

    if isinstance(a, types.UnicodeCharSeq):
        if is_nonelike(chars):
            def lstrip_default(a, chars=None):
                text = str(a)
                return text.lstrip()

            return lstrip_default
        if isinstance(chars, types.UnicodeCharSeq):
            def lstrip_item_chars(a, chars=None):
                text = str(a)
                return text.lstrip(str(chars))

            return lstrip_item_chars
        if isinstance(chars, types.UnicodeType):
            def lstrip_str_chars(a, chars=None):
                text = str(a)
                return text.lstrip(chars)

            return lstrip_str_chars

    if isinstance(a, bytes_like):
        if is_nonelike(chars):
            def lstrip_bytes_default(a, chars=None):
                stripped = a._to_str().lstrip()
                return stripped._to_bytes()

            return lstrip_bytes_default
        if isinstance(chars, bytes_like):
            def lstrip_bytes_chars(a, chars=None):
                stripped = a._to_str().lstrip(chars._to_str())
                return stripped._to_bytes()

            return lstrip_bytes_chars

    return None
|
|
|
|
|
|
@overload_method(types.UnicodeCharSeq, 'rstrip')
@overload_method(types.CharSeq, 'rstrip')
@overload_method(types.Bytes, 'rstrip')
def unicode_charseq_rstrip(a, chars=None):
    """``rstrip`` method for bytes/str array items.

    The work is delegated to ``str.rstrip``; bytes-like operands get a
    bytes result.
    """
    bytes_like = (types.CharSeq, types.Bytes)

    if isinstance(a, types.UnicodeCharSeq):
        if is_nonelike(chars):
            def rstrip_default(a, chars=None):
                text = str(a)
                return text.rstrip()

            return rstrip_default
        if isinstance(chars, types.UnicodeCharSeq):
            def rstrip_item_chars(a, chars=None):
                text = str(a)
                return text.rstrip(str(chars))

            return rstrip_item_chars
        if isinstance(chars, types.UnicodeType):
            def rstrip_str_chars(a, chars=None):
                text = str(a)
                return text.rstrip(chars)

            return rstrip_str_chars

    if isinstance(a, bytes_like):
        if is_nonelike(chars):
            def rstrip_bytes_default(a, chars=None):
                stripped = a._to_str().rstrip()
                return stripped._to_bytes()

            return rstrip_bytes_default
        if isinstance(chars, bytes_like):
            def rstrip_bytes_chars(a, chars=None):
                stripped = a._to_str().rstrip(chars._to_str())
                return stripped._to_bytes()

            return rstrip_bytes_chars

    return None
|
|
|
|
|
|
@overload_method(types.UnicodeCharSeq, 'strip')
@overload_method(types.CharSeq, 'strip')
@overload_method(types.Bytes, 'strip')
def unicode_charseq_strip(a, chars=None):
    """``strip`` method for bytes/str array items.

    The work is delegated to ``str.strip``; bytes-like operands get a
    bytes result.
    """
    bytes_like = (types.CharSeq, types.Bytes)

    if isinstance(a, types.UnicodeCharSeq):
        if is_nonelike(chars):
            def strip_default(a, chars=None):
                text = str(a)
                return text.strip()

            return strip_default
        if isinstance(chars, types.UnicodeCharSeq):
            def strip_item_chars(a, chars=None):
                text = str(a)
                return text.strip(str(chars))

            return strip_item_chars
        if isinstance(chars, types.UnicodeType):
            def strip_str_chars(a, chars=None):
                text = str(a)
                return text.strip(chars)

            return strip_str_chars

    if isinstance(a, bytes_like):
        if is_nonelike(chars):
            def strip_bytes_default(a, chars=None):
                stripped = a._to_str().strip()
                return stripped._to_bytes()

            return strip_bytes_default
        if isinstance(chars, bytes_like):
            def strip_bytes_chars(a, chars=None):
                stripped = a._to_str().strip(chars._to_str())
                return stripped._to_bytes()

            return strip_bytes_chars

    return None
|
|
|
|
|
|
@overload_method(types.UnicodeCharSeq, 'join')
@overload_method(types.CharSeq, 'join')
@overload_method(types.Bytes, 'join')
def unicode_charseq_join(a, parts):
    """``join`` method for bytes/str array items.

    Separator and parts are converted to strings and ``str.join`` is
    used; bytes-like separators get a bytes result.
    """
    if isinstance(a, types.UnicodeCharSeq):
        # parts is assumed to hold UnicodeCharSeq or UnicodeType objects
        def join_str(a, parts):
            pieces = [str(part) for part in parts]
            separator = str(a)
            return separator.join(pieces)

        return join_str

    if isinstance(a, (types.CharSeq, types.Bytes)):
        # parts is assumed to hold CharSeq or Bytes objects
        def join_bytes(a, parts):
            pieces = [part._to_str() for part in parts]
            joined = a._to_str().join(pieces)
            return joined._to_bytes()

        return join_bytes

    return None
|