# ai-content-maker/.venv/Lib/site-packages/thinc/backends/cupy_ops.py


import numpy
from .. import registry
from ..compat import cublas, cupy, cupyx
from ..types import DeviceTypes
from ..util import (
is_cupy_array,
is_mxnet_gpu_array,
is_tensorflow_gpu_array,
is_torch_cuda_array,
mxnet2xp,
tensorflow2xp,
torch2xp,
)
from . import _custom_kernels
from .numpy_ops import NumpyOps
from .ops import Ops


@registry.ops("CupyOps")
class CupyOps(Ops):
name = "cupy"
xp = cupy
_xp2 = cupyx
def __init__(
self, device_type: DeviceTypes = "gpu", device_id: int = 0, **kwargs
) -> None:
self.device_type = device_type
self.device_id = device_id
def to_numpy(self, data, *, byte_order=None):
if not isinstance(data, numpy.ndarray):
data = data.get()
if byte_order:
dtype = data.dtype.newbyteorder(byte_order)
data = numpy.asarray(data, dtype=dtype)
return data
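# Example (illustrative, not part of the library): copy a device array to host
# memory and force big-endian byte order, e.g. for a serializer that needs it.
#
#     ops = CupyOps()
#     x = ops.xp.arange(4, dtype="float32")
#     host = ops.to_numpy(x, byte_order=">")  # numpy array with dtype ">f4"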
def gather_add(self, table, indices):
if table.dtype in ("float32", "float64"):
return _custom_kernels.gather_add(table, indices)
else:
return super().gather_add(table, indices)
def dish(self, X, inplace=False):
if X.dtype in ("float32", "float64"):
return _custom_kernels.dish(X, inplace=inplace)
else:
return super().dish(X, inplace=inplace)
def backprop_dish(self, dY, X, inplace=False):
if X.dtype == dY.dtype and X.dtype in ("float32", "float64"):
return _custom_kernels.backprop_dish(dY, X, inplace=inplace)
else:
return super().backprop_dish(dY, X, inplace=inplace)
def gelu(self, X, inplace=False):
if X.dtype in ("float32", "float64"):
return _custom_kernels.gelu(X, inplace=inplace, threshold=6.0)
else:
return super().gelu(X, inplace=inplace)
def backprop_gelu(self, dY, X, inplace=False):
if X.dtype == dY.dtype and X.dtype in ("float32", "float64"):
return _custom_kernels.backprop_gelu(dY, X, inplace=inplace, threshold=6.0)
else:
return super().backprop_gelu(dY, X, inplace=inplace)
def gemm(self, x, y, out=None, trans1=False, trans2=False):
if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray):
raise ValueError(
"Encountered a numpy array when processing with cupy. "
"Did you call model.ops.asarray on your data?"
)
if trans1:
x = x.T
if trans2:
y = y.T
if out is None:
return self.xp.dot(x, y)
else:
self.xp.dot(x, y, out=out)
return out
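# Example (illustrative shapes): GPU matrix multiply with an optional transpose.
#
#     ops = CupyOps()
#     a = ops.xp.ones((8, 4), dtype="float32")
#     b = ops.xp.ones((8, 3), dtype="float32")
#     c = ops.gemm(a, b, trans1=True)  # (4, 8) @ (8, 3) -> shape (4, 3)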
def asarray(self, data, dtype=None):
# Perform a zero-copy conversion where possible (e.g. for GPU-backed framework arrays).
if is_cupy_array(data):
array = self.xp.asarray(data, dtype=dtype)
elif is_torch_cuda_array(data):
array = torch2xp(data)
elif is_tensorflow_gpu_array(data):
array = tensorflow2xp(data)
elif is_mxnet_gpu_array(data):
array = mxnet2xp(data)
else:
array = self.xp.array(data, dtype=dtype)
if dtype is not None:
array = array.astype(dtype=dtype, copy=False)
return array
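# Example (sketch, assuming PyTorch with CUDA is installed): a GPU tensor is
# converted through torch2xp, avoiding a host round-trip where the zero-copy
# path applies.
#
#     import torch
#     t = torch.zeros(2, 3, device="cuda")
#     x = CupyOps().asarray(t, dtype="float32")  # cupy.ndarray on the same device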
def pad(self, seqs, round_to=1):
"""Perform padding on a list of arrays so that they each have the same
length, by taking the maximum dimension across each axis. This only
works on non-empty sequences with the same `ndim` and `dtype`.
"""
# TODO: This should be generalized to handle different ranks
if not seqs:
raise ValueError("Cannot pad empty sequence")
if len(set(seq.ndim for seq in seqs)) != 1:
raise ValueError("Cannot pad sequences with different ndims")
if len(set(seq.dtype for seq in seqs)) != 1:
raise ValueError("Cannot pad sequences with different dtypes")
if len(set(seq.shape[1:] for seq in seqs)) != 1:
raise ValueError("Cannot pad sequences that differ on other dimensions")
# Our CUDA kernel can currently only handle C contiguous arrays.
if not all(seq.flags["C_CONTIGUOUS"] for seq in seqs) or seqs[0].dtype not in (
"float32",
"float64",
"int32",
"int64",
):
return super().pad(seqs, round_to)
return _custom_kernels.pad(seqs, round_to)
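# Example (illustrative): two float32 sequences of lengths 3 and 5 are padded
# to the longest length rounded up to a multiple of 4.
#
#     ops = CupyOps()
#     seqs = [ops.xp.ones((3, 2), dtype="float32"),
#             ops.xp.ones((5, 2), dtype="float32")]
#     padded = ops.pad(seqs, round_to=4)  # shape (2, 8, 2), zero-padded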
def maxout(self, X):
if X.dtype in ("float32", "float64"):
return _custom_kernels.maxout(X)
else:
return super().maxout(X)
def backprop_maxout(self, dY, which, P):
if dY.dtype in ("float32", "float64") and which.dtype == "int32":
return _custom_kernels.backprop_maxout(dY, which, P)
else:
return super().backprop_maxout(dY, which, P)
def relu(self, X, inplace=False):
if not inplace:
return X * (X > 0)
else:
X *= X > 0
return X
def backprop_relu(self, dY, Y, inplace=False):
if not inplace:
return dY * (Y > 0)
dY *= Y > 0
return dY
def clipped_linear(
self,
X,
slope: float = 1.0,
offset: float = 0.0,
min_val: float = 0.0,
max_val: float = 1.0,
inplace: bool = False,
):
if X.dtype in ("float32", "float64"):
return _custom_kernels.clipped_linear(
X,
inplace=inplace,
slope=slope,
offset=offset,
min_val=min_val,
max_val=max_val,
)
else:
return super().clipped_linear(
X,
inplace=inplace,
slope=slope,
offset=offset,
min_val=min_val,
max_val=max_val,
)
def backprop_clipped_linear(
self,
dY,
X,
slope: float = 1.0,
offset: float = 0.0,
min_val: float = 0.0,
max_val: float = 1.0,
inplace: bool = False,
):
if X.dtype == dY.dtype and X.dtype in ("float32", "float64"):
return _custom_kernels.backprop_clipped_linear(
dY,
X,
slope=slope,
offset=offset,
min_val=min_val,
max_val=max_val,
inplace=inplace,
)
else:
return super().backprop_clipped_linear(
dY=dY,
X=X,
slope=slope,
offset=offset,
min_val=min_val,
max_val=max_val,
inplace=inplace,
)
def backprop_hard_swish(self, dY, X, inplace: bool = False):
if X.dtype == dY.dtype and X.dtype in ("float32", "float64"):
return _custom_kernels.backprop_hard_swish(dY, X, inplace=inplace)
else:
return super().backprop_hard_swish(dY, X, inplace=inplace)
def backprop_hard_swish_mobilenet(self, dY, X, inplace: bool = False):
if X.dtype == dY.dtype and X.dtype in ("float32", "float64"):
return _custom_kernels.backprop_hard_swish_mobilenet(dY, X, inplace=inplace)
else:
return super().backprop_hard_swish_mobilenet(dY, X, inplace=inplace)
def mish(self, X, threshold=20.0, inplace=False):
if X.dtype in ("float32", "float64"):
return _custom_kernels.mish(X, inplace=inplace, threshold=threshold)
else:
return super().mish(X, threshold, inplace)
def backprop_mish(self, dY, X, threshold=20.0, inplace=False):
if X.dtype == dY.dtype and X.dtype in ("float32", "float64"):
return _custom_kernels.backprop_mish(
dY, X, inplace=inplace, threshold=threshold
)
else:
return super().backprop_mish(dY, X, threshold, inplace)
def swish(self, X, inplace=False):
if X.dtype in ("float32", "float64"):
return _custom_kernels.swish(X, inplace=inplace, threshold=17.0)
else:
return super().swish(X, inplace=inplace)
def backprop_swish(self, dY, X, Y, inplace=False):
if X.dtype == dY.dtype == Y.dtype and X.dtype in ("float32", "float64"):
return _custom_kernels.backprop_swish(
dY, X, Y, inplace=inplace, threshold=17.0
)
else:
return super().backprop_swish(dY, X, Y, inplace=inplace)
def clip_gradient(self, gradient, threshold):
# We do not use CuPy's linalg.norm here, since it performs the reduction
# in a single CUDA block, which is much slower than the cuBLAS
# implementation used by cublas.nrm2.
def frobenius_norm(X):
X_vec = X.reshape(-1)
return cublas.nrm2(X_vec)
grad_norm = cupy.maximum(frobenius_norm(gradient), 1e-12)
gradient *= cupy.minimum(threshold, grad_norm) / grad_norm
return gradient
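# Worked example (illustrative numbers): a gradient with Frobenius norm 10.0 and
# threshold 5.0 is scaled by min(5.0, 10.0) / 10.0 = 0.5; when the norm is below
# the threshold, the factor is norm / norm = 1.0 and the gradient is unchanged.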
def seq2col(self, seq, nW, *, lengths=None):
"""Given an (M, N) sequence of vectors, return an (M, N*(nW*2+1)) sequence.
The new sequence is constructed by concatenating nW preceding and succeeding
vectors onto each column in the sequence, to extract a window of features.
"""
if seq.dtype in ("float32", "float64") and (
lengths is None or lengths.dtype == "int32"
):
return _custom_kernels.seq2col(seq, nW, lengths=lengths)
else:
return super().seq2col(seq, nW, lengths=lengths)
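# Example (illustrative shapes): with nW=1 each row is concatenated with its
# immediate neighbours, so an (M, N) input becomes (M, 3 * N); missing
# neighbours at the sequence edges are filled with zeros.
#
#     ops = CupyOps()
#     X = ops.xp.ones((10, 4), dtype="float32")
#     W = ops.seq2col(X, 1)  # shape (10, 12)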
def backprop_seq2col(self, dY, nW, *, lengths=None):
if dY.dtype in ("float32", "float64") and (
lengths is None or lengths.dtype == "int32"
):
return _custom_kernels.backprop_seq2col(dY, nW, lengths=lengths)
else:
return super().backprop_seq2col(dY, nW, lengths=lengths)
def reduce_mean(self, X, lengths):
if X.dtype in ("float32", "float64") and lengths.dtype == "int32":
return _custom_kernels.reduce_mean(X, lengths=lengths)
else:
return super().reduce_mean(X, lengths)
def backprop_reduce_mean(self, d_means, lengths):
if d_means.dtype in ("float32", "float64") and lengths.dtype == "int32":
return _custom_kernels.backprop_reduce_mean(d_means, lengths)
else:
return super().backprop_reduce_mean(d_means, lengths)
def reduce_max(self, X, lengths):
if X.dtype in ("float32", "float64") and lengths.dtype == "int32":
return _custom_kernels.reduce_max(X, lengths)
else:
return super().reduce_max(X, lengths)
def backprop_reduce_max(self, d_maxes, which, lengths):
if (
d_maxes.dtype in ("float32", "float64")
and which.dtype == "int32"
and lengths.dtype == "int32"
):
return _custom_kernels.backprop_reduce_max(d_maxes, which, lengths)
else:
return super().backprop_reduce_max(d_maxes, which, lengths)
def reduce_sum(self, X, lengths):
if X.dtype in ("float32", "float64") and lengths.dtype == "int32":
return _custom_kernels.reduce_sum(X, lengths)
else:
return super().reduce_sum(X, lengths)
def backprop_reduce_sum(self, d_sums, lengths):
if d_sums.dtype in ("float32", "float64") and lengths.dtype == "int32":
return _custom_kernels.backprop_reduce_sum(d_sums, lengths)
else:
return super().backprop_reduce_sum(d_sums, lengths)
def hash(self, ids, seed):
return _custom_kernels.hash(ids, seed)
def scatter_add(self, table, indices, values):
self._xp2.scatter_add(table, indices, values)
def adam(
self, weights, gradient, mom1, mom2, beta1, beta2, eps, learn_rate, mod_rate=1.0
):
_check_compatible_shape(weights, gradient)
_check_compatible_shape(weights, mom1)
_check_compatible_shape(weights, mom2)
adam_kernel(
gradient, learn_rate, 1 - beta1, 1 - beta2, eps, weights, mom1, mom2
)
gradient.fill(0)
return weights, gradient, mom1, mom2
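# Usage sketch (hypothetical sizes, not part of thinc): one optimizer step over
# flat float32 parameters; note that adam() zeroes the gradient buffer in place.
#
#     ops = CupyOps()
#     W, dW = ops.alloc1f(128), ops.alloc1f(128)
#     m, v = ops.alloc1f(128), ops.alloc1f(128)
#     W, dW, m, v = ops.adam(W, dW, m, v, 0.9, 0.999, 1e-8, 0.001)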
def position_encode(self, N, D, period=10000, out=None):
positions = NumpyOps().position_encode(N, D, period=period, out=out)
return self.asarray(positions)


if cupy is not None:
adam_kernel = cupy.ElementwiseKernel(
"T grad, T lr, T one_minus_beta1, T one_minus_beta2, T eps",
"T param, T m, T v",
"""m += one_minus_beta1 * (grad - m);
v += one_minus_beta2 * (grad * grad - v);
param -= lr * m / (sqrt(v) + eps);""",
"adam",
)
else:
adam_kernel = None
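# The elementwise kernel above applies the Adam update without bias correction:
#     m     <- m + (1 - beta1) * (grad - m)      # i.e. beta1 * m + (1 - beta1) * grad
#     v     <- v + (1 - beta2) * (grad^2 - v)
#     param <- param - lr * m / (sqrt(v) + eps)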


def _check_compatible_shape(u, v):
if u.shape != v.shape:
msg = f"arrays have incompatible shapes: {u.shape} and {v.shape}"
raise ValueError(msg)