import numpy

from .. import registry
from ..compat import cublas, cupy, cupyx
from ..types import DeviceTypes
from ..util import (
    is_cupy_array,
    is_mxnet_gpu_array,
    is_tensorflow_gpu_array,
    is_torch_cuda_array,
    mxnet2xp,
    tensorflow2xp,
    torch2xp,
)
from . import _custom_kernels
from .numpy_ops import NumpyOps
from .ops import Ops


@registry.ops("CupyOps")
class CupyOps(Ops):
    name = "cupy"
    xp = cupy
    _xp2 = cupyx

    def __init__(
        self, device_type: DeviceTypes = "gpu", device_id: int = 0, **kwargs
    ) -> None:
        self.device_type = device_type
        self.device_id = device_id

    def to_numpy(self, data, *, byte_order=None):
        if not isinstance(data, numpy.ndarray):
            data = data.get()
        if byte_order:
            dtype = data.dtype.newbyteorder(byte_order)
            data = numpy.asarray(data, dtype=dtype)
        return data

    def gather_add(self, table, indices):
        if table.dtype in ("float32", "float64"):
            return _custom_kernels.gather_add(table, indices)
        else:
            return super().gather_add(table, indices)

    def dish(self, X, inplace=False):
        if X.dtype in ("float32", "float64"):
            return _custom_kernels.dish(X, inplace=inplace)
        else:
            return super().dish(X, inplace=inplace)

    def backprop_dish(self, dY, X, inplace=False):
        if X.dtype == dY.dtype and X.dtype in ("float32", "float64"):
            return _custom_kernels.backprop_dish(dY, X, inplace=inplace)
        else:
            return super().backprop_dish(dY, X, inplace=inplace)

    def gelu(self, X, inplace=False):
        if X.dtype in ("float32", "float64"):
            return _custom_kernels.gelu(X, inplace=inplace, threshold=6.0)
        else:
            return super().gelu(X, inplace=inplace)

    def backprop_gelu(self, dY, X, inplace=False):
        if X.dtype == dY.dtype and X.dtype in ("float32", "float64"):
            return _custom_kernels.backprop_gelu(dY, X, inplace=inplace, threshold=6.0)
        else:
            return super().backprop_gelu(dY, X, inplace=inplace)

    def gemm(self, x, y, out=None, trans1=False, trans2=False):
        if isinstance(x, numpy.ndarray) or isinstance(y, numpy.ndarray):
            raise ValueError(
                "Encountered a numpy array when processing with cupy. "
                "Did you call model.ops.asarray on your data?"
            )
        if trans1:
            x = x.T
        if trans2:
            y = y.T
        if out is None:
            return self.xp.dot(x, y)
        else:
            self.xp.dot(x, y, out=out)
            return out

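    # For example (illustrative shapes, not taken from the source): with X of
    # shape (n, d) and W of shape (m, d), `ops.gemm(X, W, trans2=True)` first
    # transposes W and then returns the (n, m) product X @ W.T, computed on
    # the GPU via cupy.dot.
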
    def asarray(self, data, dtype=None):
        # We'll try to perform a zero-copy conversion if possible.
        if is_cupy_array(data):
            array = self.xp.asarray(data, dtype=dtype)
        elif is_torch_cuda_array(data):
            array = torch2xp(data)
        elif is_tensorflow_gpu_array(data):
            array = tensorflow2xp(data)
        elif is_mxnet_gpu_array(data):
            array = mxnet2xp(data)
        else:
            array = self.xp.array(data, dtype=dtype)

        if dtype is not None:
            array = array.astype(dtype=dtype, copy=False)

        return array

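    # Note on the zero-copy branch in `asarray` above: CUDA tensors from
    # PyTorch, TensorFlow and MXNet are wrapped as CuPy views of the same
    # device memory where the frameworks allow it, so no copy is made unless
    # the requested `dtype` differs and `astype` has to allocate a new array.
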
    def pad(self, seqs, round_to=1):
        """Perform padding on a list of arrays so that they each have the same
        length, by taking the maximum dimension across each axis. This only
        works on non-empty sequences with the same `ndim` and `dtype`.
        """
        # TODO: This should be generalized to handle different ranks
        if not seqs:
            raise ValueError("Cannot pad empty sequence")
        if len(set(seq.ndim for seq in seqs)) != 1:
            raise ValueError("Cannot pad sequences with different ndims")
        if len(set(seq.dtype for seq in seqs)) != 1:
            raise ValueError("Cannot pad sequences with different dtypes")
        if len(set(seq.shape[1:] for seq in seqs)) != 1:
            raise ValueError("Cannot pad sequences that differ on other dimensions")

        # Our CUDA kernel can currently only handle C contiguous arrays.
        if not all(seq.flags["C_CONTIGUOUS"] for seq in seqs) or seqs[0].dtype not in (
            "float32",
            "float64",
            "int32",
            "int64",
        ):
            return super().pad(seqs, round_to)

        return _custom_kernels.pad(seqs, round_to)

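    # A small illustration of the behaviour documented above (made-up shapes):
    # padding sequences of shape (2, 5) and (4, 5) with round_to=8 produces a
    # single array of shape (2, 8, 5), with rows beyond each sequence's length
    # left zero-filled.
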
    def maxout(self, X):
        if X.dtype in ("float32", "float64"):
            return _custom_kernels.maxout(X)
        else:
            return super().maxout(X)

    def backprop_maxout(self, dY, which, P):
        if dY.dtype in ("float32", "float64") and which.dtype == "int32":
            return _custom_kernels.backprop_maxout(dY, which, P)
        else:
            return super().backprop_maxout(dY, which, P)

    def relu(self, X, inplace=False):
        if not inplace:
            return X * (X > 0)
        else:
            X *= X > 0
            return X

    def backprop_relu(self, dY, Y, inplace=False):
        if not inplace:
            return dY * (Y > 0)
        dY *= Y > 0
        return dY

    def clipped_linear(
        self,
        X,
        slope: float = 1.0,
        offset: float = 0.0,
        min_val: float = 0.0,
        max_val: float = 1.0,
        inplace: bool = False,
    ):
        if X.dtype in ("float32", "float64"):
            return _custom_kernels.clipped_linear(
                X,
                inplace=inplace,
                slope=slope,
                offset=offset,
                min_val=min_val,
                max_val=max_val,
            )
        else:
            return super().clipped_linear(
                X,
                inplace=inplace,
                slope=slope,
                offset=offset,
                min_val=min_val,
                max_val=max_val,
            )

    def backprop_clipped_linear(
        self,
        dY,
        X,
        slope: float = 1.0,
        offset: float = 0.0,
        min_val: float = 0.0,
        max_val: float = 1.0,
        inplace: bool = False,
    ):
        if X.dtype == dY.dtype and X.dtype in ("float32", "float64"):
            return _custom_kernels.backprop_clipped_linear(
                dY,
                X,
                slope=slope,
                offset=offset,
                min_val=min_val,
                max_val=max_val,
                inplace=inplace,
            )
        else:
            return super().backprop_clipped_linear(
                dY=dY,
                X=X,
                slope=slope,
                offset=offset,
                min_val=min_val,
                max_val=max_val,
                inplace=inplace,
            )

    def backprop_hard_swish(self, dY, X, inplace: bool = False):
        if X.dtype == dY.dtype and X.dtype in ("float32", "float64"):
            return _custom_kernels.backprop_hard_swish(dY, X, inplace=inplace)
        else:
            return super().backprop_hard_swish(dY, X, inplace=inplace)

    def backprop_hard_swish_mobilenet(self, dY, X, inplace: bool = False):
        if X.dtype == dY.dtype and X.dtype in ("float32", "float64"):
            return _custom_kernels.backprop_hard_swish_mobilenet(dY, X, inplace=inplace)
        else:
            return super().backprop_hard_swish_mobilenet(dY, X, inplace=inplace)

    def mish(self, X, threshold=20.0, inplace=False):
        if X.dtype in ("float32", "float64"):
            return _custom_kernels.mish(X, inplace=inplace, threshold=threshold)
        else:
            return super().mish(X, threshold, inplace)

    def backprop_mish(self, dY, X, threshold=20.0, inplace=False):
        if X.dtype == dY.dtype and X.dtype in ("float32", "float64"):
            return _custom_kernels.backprop_mish(
                dY, X, inplace=inplace, threshold=threshold
            )
        else:
            return super().backprop_mish(dY, X, threshold, inplace)

    def swish(self, X, inplace=False):
        if X.dtype in ("float32", "float64"):
            return _custom_kernels.swish(X, inplace=inplace, threshold=17.0)
        else:
            return super().swish(X, inplace=inplace)

    def backprop_swish(self, dY, X, Y, inplace=False):
        if X.dtype == dY.dtype == Y.dtype and X.dtype in ("float32", "float64"):
            return _custom_kernels.backprop_swish(
                dY, X, Y, inplace=inplace, threshold=17.0
            )
        else:
            return super().backprop_swish(dY, X, Y, inplace=inplace)

    def clip_gradient(self, gradient, threshold):
        # We do not use CuPy's linalg.norm, since it performs scalar reductions
        # with a single CUDA block, which is a lot slower than the cuBLAS
        # implementation.
        def frobenius_norm(X):
            X_vec = X.reshape(-1)
            return cublas.nrm2(X_vec)

        grad_norm = cupy.maximum(frobenius_norm(gradient), 1e-12)
        gradient *= cupy.minimum(threshold, grad_norm) / grad_norm
        return gradient

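    # Worked example of the clipping rule above: if the gradient's L2 norm is
    # 10 and the threshold is 5, the gradient is scaled by min(5, 10) / 10 = 0.5;
    # if the norm is already below the threshold, the factor is norm / norm = 1
    # and the gradient is left unchanged.
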
    def seq2col(self, seq, nW, *, lengths=None):
        """Given an (M, N) sequence of vectors, return an (M, N*(nW*2+1)) sequence.
        The new sequence is constructed by concatenating nW preceding and succeeding
        vectors onto each column in the sequence, to extract a window of features.
        """
        if seq.dtype in ("float32", "float64") and (
            lengths is None or lengths.dtype == "int32"
        ):
            return _custom_kernels.seq2col(seq, nW, lengths=lengths)
        else:
            return super().seq2col(seq, nW, lengths=lengths)

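    # Sketch of the windowing described above (illustrative): for an (M, N)
    # input and nW=1, output row i is the concatenation of seq[i-1], seq[i] and
    # seq[i+1], giving shape (M, N*3); positions outside the sequence (or
    # outside the boundaries given by `lengths`) contribute zero vectors.
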
    def backprop_seq2col(self, dY, nW, *, lengths=None):
        if dY.dtype in ("float32", "float64") and (
            lengths is None or lengths.dtype == "int32"
        ):
            return _custom_kernels.backprop_seq2col(dY, nW, lengths=lengths)
        else:
            return super().backprop_seq2col(dY, nW, lengths=lengths)

    def reduce_mean(self, X, lengths):
        if X.dtype in ("float32", "float64") and lengths.dtype == "int32":
            return _custom_kernels.reduce_mean(X, lengths=lengths)
        else:
            return super().reduce_mean(X, lengths)

    def backprop_reduce_mean(self, d_means, lengths):
        if d_means.dtype in ("float32", "float64") and lengths.dtype == "int32":
            return _custom_kernels.backprop_reduce_mean(d_means, lengths)
        else:
            return super().backprop_reduce_mean(d_means, lengths)

    def reduce_max(self, X, lengths):
        if X.dtype in ("float32", "float64") and lengths.dtype == "int32":
            return _custom_kernels.reduce_max(X, lengths)
        else:
            return super().reduce_max(X, lengths)

    def backprop_reduce_max(self, d_maxes, which, lengths):
        if (
            d_maxes.dtype in ("float32", "float64")
            and which.dtype == "int32"
            and lengths.dtype == "int32"
        ):
            return _custom_kernels.backprop_reduce_max(d_maxes, which, lengths)
        else:
            return super().backprop_reduce_max(d_maxes, which, lengths)

    def reduce_sum(self, X, lengths):
        if X.dtype in ("float32", "float64") and lengths.dtype == "int32":
            return _custom_kernels.reduce_sum(X, lengths)
        else:
            return super().reduce_sum(X, lengths)

    def backprop_reduce_sum(self, d_sums, lengths):
        if d_sums.dtype in ("float32", "float64") and lengths.dtype == "int32":
            return _custom_kernels.backprop_reduce_sum(d_sums, lengths)
        else:
            return super().backprop_reduce_sum(d_sums, lengths)

    def hash(self, ids, seed):
        return _custom_kernels.hash(ids, seed)

    def scatter_add(self, table, indices, values):
        self._xp2.scatter_add(table, indices, values)

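    # Note on `scatter_add` above: unlike plain fancy-index assignment,
    # cupyx.scatter_add accumulates, so duplicate entries in `indices` add
    # their corresponding rows of `values` into the same row of `table`.
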
    def adam(
        self, weights, gradient, mom1, mom2, beta1, beta2, eps, learn_rate, mod_rate=1.0
    ):
        _check_compatible_shape(weights, gradient)
        _check_compatible_shape(weights, mom1)
        _check_compatible_shape(weights, mom2)

        adam_kernel(
            gradient, learn_rate, 1 - beta1, 1 - beta2, eps, weights, mom1, mom2
        )
        gradient.fill(0)
        return weights, gradient, mom1, mom2

    def position_encode(self, N, D, period=10000, out=None):
        positions = NumpyOps().position_encode(N, D, period=period, out=out)
        return self.asarray(positions)

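# The fused element-wise kernel below performs one Adam step per parameter:
#     m <- m + (1 - beta1) * (grad - m)       (== beta1 * m + (1 - beta1) * grad)
#     v <- v + (1 - beta2) * (grad^2 - v)     (== beta2 * v + (1 - beta2) * grad^2)
#     param <- param - lr * m / (sqrt(v) + eps)
# Bias correction of the moment estimates, if required, is left to the caller.
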
if cupy is not None:
    adam_kernel = cupy.ElementwiseKernel(
        "T grad, T lr, T one_minus_beta1, T one_minus_beta2, T eps",
        "T param, T m, T v",
        """m += one_minus_beta1 * (grad - m);
        v += one_minus_beta2 * (grad * grad - v);
        param -= lr * m / (sqrt(v) + eps);""",
        "adam",
    )
else:
    adam_kernel = None


def _check_compatible_shape(u, v):
    if u.shape != v.shape:
        msg = f"arrays have incompatible shapes: {u.shape} and {v.shape}"
        raise ValueError(msg)