# cython: cdivision=True
# cython: infer_types=True
from collections.abc import Sized
from typing import Optional

import numpy

cimport blis.cy
cimport cython
cimport numpy as np
from cymem.cymem cimport Pool
from libc.math cimport isnan
from libc.stdint cimport uint32_t, uint64_t
from libc.stdlib cimport calloc, free, malloc
from libc.string cimport memcpy, memset
from murmurhash.mrmr cimport hash64
from preshed.maps cimport PreshMap

from .. import registry
from ..types import ArrayXd, DeviceTypes, DTypes, Shape
from ..util import copy_array, get_array_module
from .cblas cimport CBlas, daxpy, saxpy
from .linalg cimport Vec, VecVec
from .ops import Ops

try:
    import blis.py
    has_blis = True
except ImportError:
    has_blis = False


ctypedef float weight_t

# Typed memoryviews and fused types used by the kernels below.
ctypedef float[:, ::1] float2d_t
ctypedef double[:, ::1] double2d_t
ctypedef float[:, :, ::1] float3d_t
ctypedef double[:, :, ::1] double3d_t
ctypedef int[:, ::1] int2d_t

ctypedef fused reals2d_ft:
    float2d_t
    double2d_t

ctypedef fused reals3d_ft:
    float3d_t
    double3d_t

ctypedef fused ints2d_ft:
    int2d_t


cdef extern from "math.h":
    float logf(float x) nogil
    float sqrtf(float x) nogil
    float expf(float x) nogil
    float tanhf(float x) nogil
    float sinf(float x) nogil
    float cosf(float x) nogil


@registry.ops("NumpyOps")
class NumpyOps(Ops):
    name = "numpy"
    xp = numpy

    def __init__(
        self,
        device_type: DeviceTypes = "cpu",
        device_id: int = -1,
        *,
        use_blis: bool = True
    ) -> None:
        self.device_type = device_type
        self.device_id = device_id
        self.use_blis = use_blis
        if self.use_blis and not has_blis:
            raise ValueError("BLIS support requires blis: pip install blis")

    def asarray(self, data, dtype=None):
        if isinstance(data, self.xp.ndarray):
            array = data
        elif hasattr(data, 'numpy'):
            # Handles PyTorch Tensor
            array = data.numpy()
        elif hasattr(data, "get"):
            array = data.get()
        else:
            array = self.xp.array(data, dtype=dtype)
        if dtype is not None:
            array = array.astype(dtype=dtype, copy=False)
        return array

    def alloc(self, shape: Shape, *, dtype: Optional[DTypes] = "float32",
              zeros: bool = True) -> ArrayXd:
        if zeros:
            return self.xp.zeros(shape, dtype=dtype)
        else:
            return self.xp.empty(shape, dtype=dtype)

    def cblas(self) -> CBlas:
        return CBlas()

    def gemm(self, np.ndarray x, np.ndarray y, *, np.ndarray out=None,
             trans1=False, trans2=False):
        if x.ndim != 2:
            raise ValueError(f"Provided 'x' array should be 2-dimensional, but found {x.ndim} dimension(s).")
        if y.ndim != 2:
            raise ValueError(f"Provided 'y' array should be 2-dimensional, but found {y.ndim} dimension(s).")
        if not self.use_blis:  # delegate to base Ops
            return super().gemm(x, y, out=out, trans1=trans1, trans2=trans2)
        x = self.as_contig(x)
        y = self.as_contig(y)
        if out is not None:
            out = self.as_contig(out)
        return blis.py.gemm(x, y, out=out, trans1=trans1, trans2=trans2, beta=0.)

    def relu(self, np.ndarray X, inplace=False):
        cdef np.ndarray Y
        if X.dtype == "float32":
            Y = _inplace_or_copy(X, inplace)
            cpu_relu(<float*>Y.data, Y.size)
            return Y
        elif X.dtype == "float64":
            Y = _inplace_or_copy(X, inplace)
            cpu_relu(<double*>Y.data, Y.size)
            return Y
        else:
            return super().relu(X, inplace=inplace)

    def backprop_relu(self, np.ndarray dY, np.ndarray Y, inplace=False):
        _check_compatible_shape(dY, Y)
        cdef size_t size = Y.size
        cdef weight_t* dX_ptr
        cdef const weight_t* Y_ptr = <const weight_t*>Y.data
        cdef np.ndarray dX
        if dY.dtype == "float32" and Y.dtype == "float32":
            dX = _inplace_or_copy(dY, inplace)
            dX_ptr = <weight_t*>dX.data
            for i in range(size):
                if Y_ptr[i] <= 0:
                    dX_ptr[i] = 0.
            return dX
        else:
            return super().backprop_relu(dY, Y, inplace)
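    # Usage sketch for the BLIS-backed gemm above. Illustrative only: the
    # shapes and values are made up.
    #
    #     ops = NumpyOps()
    #     a = numpy.ones((2, 3), dtype="float32")
    #     b = numpy.ones((4, 3), dtype="float32")
    #     c = ops.gemm(a, b, trans2=True)  # computes a @ b.T, shape (2, 4)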
    def lstm_forward_training(
        self, np.ndarray params, np.ndarray H0, np.ndarray C0, np.ndarray X,
        np.ndarray size_at_t
    ):
        assert H0.shape[0] == C0.shape[0]
        assert H0.shape[1] == C0.shape[1]
        Y, fwd_state = lstm_forward_training(params, H0, C0, X, size_at_t)
        return Y, fwd_state

    def lstm_forward_inference(
        self, np.ndarray params, np.ndarray H0, np.ndarray C0, np.ndarray X,
        np.ndarray size_at_t
    ):
        Y, _ = lstm_forward_training(params, H0, C0, X, size_at_t)
        return Y

    def backprop_lstm(
        self, np.ndarray dY, np.ndarray lengths, np.ndarray params, fwd_state
    ):
        dX, d_params = backprop_lstm(dY, lengths, params, fwd_state)
        return dX, d_params

    def maxout(self, reals3d_ft X):
        cdef int B = X.shape[0]
        cdef int O = X.shape[1]
        cdef int P = X.shape[2]

        cdef np.ndarray best
        cdef np.ndarray which = self.alloc(shape=(B, O), dtype='int32', zeros=False)
        if reals3d_ft is float3d_t:
            best = self.alloc(shape=(B, O), dtype="float32", zeros=False)
            if len(X) > 0:
                cpu_maxout(<float*>best.data, <int*>which.data, &X[0, 0, 0], B, O, P)
        else:
            best = self.alloc(shape=(B, O), dtype="float64", zeros=False)
            if len(X) > 0:
                cpu_maxout(<double*>best.data, <int*>which.data, &X[0, 0, 0], B, O, P)
        return best, which

    def backprop_maxout(self, reals2d_ft dY, int[:, ::1] which, int P):
        cdef int B = dY.shape[0]
        cdef int O = dY.shape[1]

        cdef np.ndarray dX
        if reals2d_ft is float2d_t:
            dX = numpy.zeros((B, O, P), dtype='float32')
            cpu_backprop_maxout(<float*>dX.data, &dY[0, 0], &which[0, 0], B, O, P)
        else:
            dX = numpy.zeros((B, O, P), dtype='float64')
            cpu_backprop_maxout(<double*>dX.data, &dY[0, 0], &which[0, 0], B, O, P)
        return dX

    def mish(self, np.ndarray X, threshold=20.0, inplace: bool = False):
        cdef np.ndarray Y
        if X.dtype == "float32":
            Y = _inplace_or_copy(X, inplace)
            cpu_mish(<float*>Y.data, Y.size, threshold)
            return Y
        elif X.dtype == "float64":
            Y = _inplace_or_copy(X, inplace)
            cpu_mish(<double*>Y.data, Y.size, threshold)
            return Y
        else:
            return super().mish(X, inplace=inplace)

    def backprop_mish(self, np.ndarray dY, np.ndarray X, threshold=20.0, inplace=False):
        _check_compatible_shape(dY, X)
        cdef np.ndarray dX
        if dY.dtype == "float32" and X.dtype == "float32":
            dX = _inplace_or_copy(dY, inplace)
            cpu_backprop_mish(<float*>dX.data, <const float*>X.data, X.size, threshold)
            return dX
        elif dY.dtype == "float64" and X.dtype == "float64":
            dX = _inplace_or_copy(dY, inplace)
            cpu_backprop_mish(<double*>dX.data, <const double*>X.data, X.size, threshold)
            return dX
        else:
            return super().backprop_mish(dY, X, threshold, inplace)

    def seq2col(self, np.ndarray seq, int nW, *, int[::1] lengths=None):
        """Given an (M, N) sequence of vectors, return an (M, N*(nW*2+1))
        sequence. The new sequence is constructed by concatenating nW preceding
        and succeeding vectors onto each column in the sequence, to extract a
        window of features.
        """
        # Note: the type of seq should be changed to reals2d_ft once
        # cython/cython#4697 is fixed. The following checks can then be
        # removed, because they are guaranteed by the reals2d_ft type.
        if seq.ndim != 2:
            msg = f"seq2col requires sequence array of dimensionality 2, was {seq.ndim}"
            raise ValueError(msg)
        if not seq.flags.c_contiguous:
            msg = "seq2col requires sequence array that is in C order and contiguous"
            raise ValueError(msg)

        cdef int B = seq.shape[0]
        cdef int I = seq.shape[1]
        lengths = check_seq2col_lengths(self, lengths, B)
        cdef int nL = lengths.shape[0]

        cdef np.ndarray cols
        if seq.dtype == "float32":
            cols = self.alloc((B, (2*nW + 1) * I), dtype="float32")
            if seq.size != 0 and lengths.size != 0:
                seq2col(<float*>cols.data, <const float*>seq.data, &lengths[0], nW, B, I, nL)
            return cols
        elif seq.dtype == "float64":
            cols = self.alloc((B, (2*nW + 1) * I), dtype="float64")
            if seq.size != 0 and lengths.size != 0:
                seq2col(<double*>cols.data, <const double*>seq.data, &lengths[0], nW, B, I, nL)
            return cols
        else:
            return super().seq2col(seq, nW, lengths=lengths)
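    # A quick sketch of what seq2col produces, for intuition. Illustrative
    # only; the values are made up:
    #
    #     ops = NumpyOps()
    #     seq = numpy.asarray([[1.], [2.], [3.]], dtype="float32")
    #     ops.seq2col(seq, 1)
    #     # array([[0., 1., 2.],
    #     #        [1., 2., 3.],
    #     #        [2., 3., 0.]], dtype=float32)  zero-padded at the edges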
    def backprop_seq2col(self, np.ndarray dY, int nW, *, int[::1] lengths=None):
        # Note: the type of dY should be changed to reals2d_ft once
        # cython/cython#4697 is fixed. The following checks can then be
        # removed, because they are guaranteed by the reals2d_ft type.
        if dY.ndim != 2:
            msg = f"backprop_seq2col requires gradient array of dimensionality 2, was {dY.ndim}"
            raise ValueError(msg)
        if not dY.flags.c_contiguous:
            msg = "backprop_seq2col requires gradient array that is in C order and contiguous"
            raise ValueError(msg)

        cdef int B = dY.shape[0]
        cdef int nF = nW*2+1
        cdef int I = dY.shape[1] / nF
        lengths = check_seq2col_lengths(self, lengths, B)
        cdef int nL = lengths.shape[0]

        cdef np.ndarray dX
        if dY.dtype == "float32":
            dX = self.alloc((B, I), dtype='float32')
            if dY.size != 0 and lengths.size != 0:
                backprop_seq2col(<float*>dX.data, <const float*>dY.data, &lengths[0], B, I, nW, nL)
            return dX
        elif dY.dtype == "float64":
            dX = self.alloc((B, I), dtype='float64')
            if dY.size != 0 and lengths.size != 0:
                backprop_seq2col(<double*>dX.data, <const double*>dY.data, &lengths[0], B, I, nW, nL)
            return dX
        else:
            return super().backprop_seq2col(dY, nW, lengths=lengths)

    @cython.boundscheck(False)
    @cython.wraparound(False)
    def hash(self, const uint64_t[::1] ids, uint32_t seed):
        """Hash a sequence of 64-bit keys into a table with 4 32-bit keys."""
        # Written to mirror the GPU implementation
        cdef np.ndarray[uint32_t, ndim=2] keys = self.alloc((ids.shape[0], 4), dtype='uint32')
        cdef int i
        cdef uint32_t* dest = <uint32_t*>keys.data
        for i in range(len(ids)):
            MurmurHash3_x86_128_uint64(ids[i], seed, &dest[i*4])
        return keys

    def reduce_mean(self, reals2d_ft X, int[::1] lengths):
        cdef int B = lengths.shape[0]
        cdef int O = X.shape[1]
        cdef int T = X.shape[0]

        if B == 0 or O == 0:
            if reals2d_ft is float2d_t:
                return numpy.zeros(shape=(B, O), dtype="float32")
            else:
                return numpy.zeros(shape=(B, O), dtype="float64")

        cdef np.ndarray means
        if reals2d_ft is float2d_t:
            means = numpy.zeros(shape=(B, O), dtype="float32")
            cpu_reduce_mean(<float*>means.data, &X[0, 0], &lengths[0], B, T, O)
        else:
            means = numpy.zeros(shape=(B, O), dtype="float64")
            cpu_reduce_mean(<double*>means.data, &X[0, 0], &lengths[0], B, T, O)
        return means

    def backprop_reduce_mean(self, reals2d_ft d_means, int[::1] lengths):
        cdef int B = lengths.shape[0]
        cdef int O = d_means.shape[1]
        cdef int T = 0
        for length in lengths[:B]:
            if length < 0:
                raise ValueError(f"all sequence lengths must be >= 0, got {length}")
            T += length

        if T == 0 or O == 0:
            if reals2d_ft is float2d_t:
                return numpy.zeros(shape=(T, O), dtype="float32")
            else:
                return numpy.zeros(shape=(T, O), dtype="float64")

        cdef np.ndarray dX
        if reals2d_ft is float2d_t:
            dX = numpy.zeros((T, O), dtype="float32")
            cpu_backprop_reduce_mean(<float*>dX.data, &d_means[0, 0],
                &lengths[0], B, T, O)
        else:
            dX = numpy.zeros((T, O), dtype="float64")
            cpu_backprop_reduce_mean(<double*>dX.data, &d_means[0, 0],
                &lengths[0], B, T, O)
        return dX
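    # reduce_mean averages over the variable-length rows of a concatenated
    # batch. Illustrative sketch (values made up):
    #
    #     ops = NumpyOps()
    #     X = numpy.asarray([[1., 1.], [3., 3.], [5., 7.]], dtype="float32")
    #     lengths = ops.asarray1i([2, 1])
    #     ops.reduce_mean(X, lengths)
    #     # array([[2., 2.],    mean of rows 0-1
    #     #        [5., 7.]])   mean of row 2
    #
    # backprop_reduce_mean spreads d_means[i] / lengths[i] back over each
    # sequence's rows.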
    def reduce_sum(self, reals2d_ft X, int[::1] lengths):
        cdef int B = lengths.shape[0]
        cdef int O = X.shape[1]
        cdef int T = X.shape[0]

        if B == 0 or O == 0:
            if reals2d_ft is float2d_t:
                return numpy.zeros(shape=(B, O), dtype="float32")
            else:
                return numpy.zeros(shape=(B, O), dtype="float64")

        cdef np.ndarray sums
        if reals2d_ft is float2d_t:
            sums = numpy.zeros(shape=(B, O), dtype="float32")
            cpu_reduce_sum(<float*>sums.data, &X[0, 0], &lengths[0], B, T, O)
        else:
            sums = numpy.zeros(shape=(B, O), dtype="float64")
            cpu_reduce_sum(<double*>sums.data, &X[0, 0], &lengths[0], B, T, O)
        return sums

    def backprop_reduce_sum(self, reals2d_ft d_sums, int[::1] lengths):
        cdef int B = lengths.shape[0]
        cdef int O = d_sums.shape[1]
        cdef int T = 0
        for length in lengths[:B]:
            if length < 0:
                raise ValueError(f"all sequence lengths must be >= 0, got {length}")
            T += length

        if T == 0 or O == 0:
            if reals2d_ft is float2d_t:
                return numpy.zeros(shape=(T, O), dtype="float32")
            else:
                return numpy.zeros(shape=(T, O), dtype="float64")

        cdef np.ndarray dX
        if reals2d_ft is float2d_t:
            dX = numpy.zeros((T, O), dtype="float32")
            cpu_backprop_reduce_sum(<float*>dX.data, &d_sums[0, 0], &lengths[0], B, T, O)
        else:
            dX = numpy.zeros((T, O), dtype="float64")
            cpu_backprop_reduce_sum(<double*>dX.data, &d_sums[0, 0], &lengths[0], B, T, O)
        return dX

    def reduce_max(self, reals2d_ft X, int[::1] lengths):
        cdef int B = lengths.shape[0]
        cdef int O = X.shape[1]
        cdef int T = X.shape[0]

        # Needs to be zero-initialized as we start by assuming that the first
        # element is the max value.
        cdef np.ndarray which = self.alloc(shape=(B, O), dtype="i", zeros=True)

        if B == 0 or O == 0:
            if reals2d_ft is float2d_t:
                return numpy.zeros(shape=(B, O), dtype="float32"), which
            else:
                return numpy.zeros(shape=(B, O), dtype="float64"), which

        cdef np.ndarray maxes
        if reals2d_ft is float2d_t:
            maxes = self.alloc(shape=(B, O), dtype="float32", zeros=False)
            cpu_reduce_max(<float*>maxes.data, <int*>which.data, &X[0, 0],
                &lengths[0], B, T, O)
        else:
            maxes = self.alloc(shape=(B, O), dtype="float64", zeros=False)
            cpu_reduce_max(<double*>maxes.data, <int*>which.data, &X[0, 0],
                &lengths[0], B, T, O)
        return maxes, which

    def backprop_reduce_max(self, reals2d_ft d_maxes, int[:, ::1] which, int[::1] lengths):
        cdef int B = lengths.shape[0]
        cdef int O = d_maxes.shape[1]
        cdef int T = 0
        for length in lengths[:B]:
            if length <= 0:
                raise ValueError(f"all sequence lengths must be > 0, got {length}")
            T += length

        if T == 0 or O == 0:
            if reals2d_ft is float2d_t:
                return numpy.zeros(shape=(T, O), dtype="float32")
            else:
                return numpy.zeros(shape=(T, O), dtype="float64")

        cdef np.ndarray dX
        if reals2d_ft is float2d_t:
            dX = numpy.zeros((T, O), dtype="float32")
            cpu_backprop_reduce_max(<float*>dX.data, &d_maxes[0, 0], &which[0, 0],
                &lengths[0], B, T, O)
        else:
            dX = numpy.zeros((T, O), dtype="float64")
            cpu_backprop_reduce_max(<double*>dX.data, &d_maxes[0, 0], &which[0, 0],
                &lengths[0], B, T, O)
        return dX

    def gather_add(self, reals2d_ft table, ints2d_ft indices):
        cdef CBlas cblas = self.cblas()
        rows = indices.shape[0]
        dims = table.shape[1]
        cdef np.ndarray output
        if reals2d_ft is float2d_t:
            output = self.xp.zeros((rows, dims), dtype="float32")
            cpu_gather_add(saxpy(cblas), <float*>output.data, &table[0, 0],
                &indices[0, 0], table.shape[0], dims, rows, indices.shape[1])
        else:
            output = self.xp.zeros((rows, dims), dtype="float64")
            cpu_gather_add(daxpy(cblas), <double*>output.data, &table[0, 0],
                &indices[0, 0], table.shape[0], dims, rows, indices.shape[1])
        return output
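    # gather_add sums rows of an embedding-style table for each row of ids.
    # Illustrative sketch (values made up):
    #
    #     ops = NumpyOps()
    #     table = numpy.asarray([[1., 1.], [2., 2.], [4., 4.]], dtype="float32")
    #     ids = numpy.asarray([[0, 2], [1, 1]], dtype="int32")
    #     ops.gather_add(table, ids)
    #     # array([[5., 5.],    table[0] + table[2]
    #     #        [4., 4.]])   table[1] + table[1]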
    def scatter_add(self, np.ndarray table, np.ndarray indices, np.ndarray values):
        if table.dtype == 'float32' \
        and indices.dtype == 'int32' \
        and values.dtype == 'float32' \
        and table.flags.c_contiguous \
        and indices.flags.c_contiguous \
        and values.flags.c_contiguous \
        and indices.ndim == 1 \
        and table.ndim == 2 \
        and values.ndim == 2 \
        and values.shape[0] == indices.shape[0] \
        and values.shape[1] == table.shape[1]:
            cpu_scatter_add(<float*>table.data, <int*>indices.data,
                <float*>values.data, indices.shape[0], table.shape[1])
        else:
            self.xp.add.at(table, indices, values)

    @cython.boundscheck(False)
    @cython.wraparound(False)
    def adam(self, np.ndarray[np.float32_t] weights, np.ndarray[np.float32_t] gradient,
             np.ndarray[np.float32_t] mom1, np.ndarray[np.float32_t] mom2,
             const float beta1, const float beta2, float eps,
             float learn_rate, float mod_rate=1.):
        _check_compatible_shape(weights, gradient)
        _check_compatible_shape(weights, mom1)
        _check_compatible_shape(weights, mom2)

        _adam_momentum(<weight_t*>gradient.data, <weight_t*>mom1.data,
            <weight_t*>mom2.data, weights.shape[0], beta1, beta2, eps, learn_rate)
        VecVec.add_i(<weight_t*>weights.data, <weight_t*>gradient.data,
            -learn_rate, weights.shape[0])
        memset(gradient.data, 0, gradient.size * sizeof(float))
        return weights, gradient, mom1, mom2

    def ngrams(self, int n, const uint64_t[::1] keys):
        if n < 1:
            return self.alloc((0,), dtype="uint64")
        keys_ = &keys[0]
        length = max(0, keys.shape[0]-(n-1))
        cdef np.ndarray output_ = self.alloc((length,), dtype="uint64")
        output = <uint64_t*>output_.data
        for i in range(keys.shape[0]-(n-1)):
            output[i] = hash64(<void*>&keys_[i], n*sizeof(keys_[0]), 0)
        return output_

    def position_encode(self, int N, int D, int period=10000, out=None):
        cdef np.ndarray out_
        if out is None:
            out_ = self.alloc((N, D), zeros=False)
        else:
            out_ = out
        assert out_.shape[0] == N
        assert out_.shape[1] == D
        cpu_position_encode(<float*>out_.data, period, N, D)
        return out_


def check_seq2col_lengths(ops, lengths, B):
    if lengths is None:
        lengths = ops.asarray1i([B])
    else:
        assert ops.xp.all(ops.xp.array(lengths) >= 0), "All sequence lengths must be >= 0"
        assert ops.xp.sum(lengths) == B, "The lengths must sum up to the batch length"
    return lengths


def cpu_clip_gradient(weight_t[::1] gradient, weight_t threshold):
    grad_norm = Vec.norm(&gradient[0], gradient.shape[0])
    if grad_norm >= threshold:
        Vec.mul_i(&gradient[0], threshold / grad_norm, gradient.shape[0])


def add_gradient_noise(float[::1] gradient, weight_t noise_level,
                       weight_t timestep):
    cdef weight_t variance = noise_level / ((1 + timestep) ** 0.55)
    if variance >= 0.000001:
        gradient += numpy.asarray(
            numpy.random.normal(scale=variance, loc=0., size=len(gradient)),
            dtype='float32')


cdef void cpu_position_encode(float* output, float period, int N, int D) nogil:
    cdef float pos, d
    cdef int j
    cdef float dimensions = D
    for i in range(N):
        pos = i
        j = 0
        d = 0
        while (j+1) < D:
            d = j
            output[j] = sinf(pos / period ** (2 * d / dimensions))
            output[j+1] = cosf(pos / period ** (2 * d / dimensions))
            j += 2
        if j < D:
            output[j] = sinf(pos / period ** (2 * d / dimensions))
        output += D


cdef void cpu_scatter_add(float* dest, const int* indices, const float* src,
                          int nr_id, int nr_col) nogil:
    cdef int i
    for i in range(nr_id):
        id_ = indices[i]
        if id_ >= 0:
            VecVec.add_i(&dest[id_*nr_col], &src[i*nr_col], 1., nr_col)
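# For reference, cpu_position_encode above fills the usual sinusoidal position
# table, with sin in the even columns and cos in the odd ones. A numpy sketch
# of the same table (illustrative only; N, D, period as in position_encode):
#
#     pos = numpy.arange(N).reshape((-1, 1))
#     d = 2 * (numpy.arange(D) // 2)
#     angles = pos / numpy.power(float(period), 2 * d / D)
#     table = numpy.where(numpy.arange(D) % 2 == 0,
#                         numpy.sin(angles), numpy.cos(angles))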
@cython.cdivision(True)
cdef void _adam_momentum(weight_t* gradient, weight_t* mom1, weight_t* mom2,
        int nr_weight, weight_t beta1, weight_t beta2, weight_t eps,
        weight_t learn_rate) nogil:
    # Calculate Adam on CPU, fused.
    # Assumes the learning rate adjustment is calculated by the caller;
    # a_t = learn_rate * sqrt(1-beta2**timestep) / (1-beta1**timestep)
    cdef weight_t one_minus_beta1 = 1-beta1
    cdef weight_t one_minus_beta2 = 1-beta2
    cdef weight_t m1, m2, g
    cdef int i
    # Blockwise implementation is a bit faster. Adam is slooow :(
    cdef weight_t[64] buff
    cdef int steps = nr_weight // 64
    if steps * 64 < nr_weight:
        steps += 1
    idx = 0
    for i in range(steps):
        step_size = min(64, nr_weight-idx)
        Vec.mul_i(mom1, beta1, step_size)
        VecVec.add_i(mom1, gradient, one_minus_beta1, step_size)
        Vec.mul_i(mom2, beta2, step_size)
        for j in range(step_size):
            mom2[j] += one_minus_beta2 * gradient[j] ** 2
        for j in range(step_size):
            buff[j] = sqrtf(mom2[j])
        for j in range(step_size):
            buff[j] += eps
        for j in range(step_size):
            buff[j] = mom1[j] / buff[j]
        for j in range(step_size):
            gradient[j] = buff[j]
        mom1 += step_size
        mom2 += step_size
        gradient += step_size
        idx += step_size


@cython.cdivision(True)
cdef void cpu_update_averages(weight_t* ema, const weight_t* weights,
        int nr_weight, weight_t t, weight_t max_decay) nogil:
    cdef weight_t decay = (1.0 + t) / (10.0 + t)
    if decay > max_decay:
        decay = max_decay
    cdef weight_t one_minus_decay = 1-decay
    cdef int i
    for i in range(nr_weight):  # num_threads=4, schedule='static'):
        ema[i] -= one_minus_decay * (ema[i] - weights[i])
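# A plain numpy reference for the fused update _adam_momentum applies 64
# weights at a time (illustrative, for checking the blockwise code above):
#
#     mom1 = beta1 * mom1 + (1 - beta1) * gradient
#     mom2 = beta2 * mom2 + (1 - beta2) * gradient ** 2
#     gradient = mom1 / (numpy.sqrt(mom2) + eps)
#
# The caller then applies weights -= learn_rate * gradient, with any bias
# correction already folded into learn_rate.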
def lstm_forward_training(
    np.ndarray params, np.ndarray c_init, np.ndarray h_init,
    np.ndarray X, np.ndarray lengths
):
    xp = numpy
    depth = c_init.shape[0]
    dirs = c_init.shape[1]
    nO = c_init.shape[2]
    N = X.shape[0]
    nI = X.shape[1]
    nT = lengths.shape[0]
    cdef int batch_size = lengths[0]
    # Preallocate these so we can pass them through for loop.
    cdef np.ndarray G = xp.zeros((depth, dirs, X.shape[0], nO * 4), dtype="f")
    cdef np.ndarray Y = xp.zeros((depth, dirs, X.shape[0], nO), dtype="f")
    cdef np.ndarray C = xp.zeros((depth, dirs, X.shape[0], nO), dtype="f")
    cdef np.ndarray Yt2 = numpy.zeros((batch_size, nO), dtype="f")
    cdef np.ndarray Ct2 = numpy.zeros((batch_size, nO), dtype="f")

    cdef int params_i = 0
    cdef int seq_i = 0
    orig_X = X
    cdef int i
    cdef np.ndarray Yid
    cdef np.ndarray Cid
    cdef np.ndarray Gid
    cdef np.ndarray Wx
    cdef np.ndarray Wh
    cdef np.ndarray bias
    for i in range(depth):
        nI = X.shape[1]
        for d in range(dirs):
            # The inits are shaped (depth, dirs, nO). We add the internal
            # dimension to make them set correctly.
            Yt2[:] = h_init[i, d].reshape((1, nO))
            Ct2[:] = c_init[i, d].reshape((1, nO))
            layer_params, params_i = _split_weights(params, i, nO, nI, params_i)
            Wx, Wh, bias = _transpose_weights(layer_params)
            Yid = Y[i, d]
            Cid = C[i, d]
            Gid = G[i, d]
            _lstm_forward_training(
                d, N, nO, nI, nT,
                Gid,
                <float*>Yid.data,
                <float*>Cid.data,
                <float*>X.data,
                <float*>Wx.data,
                <float*>Wh.data,
                bias,
                <int*>lengths.data,
                <float*>Yt2.data,
                <float*>Ct2.data
            )
        H = Y[i].transpose((1, 0, 2)).reshape((N, -1))
        if dirs == 2:
            H = xp.ascontiguousarray(H)
        X = H
    return H, (Y, G, C, orig_X)


cdef int _lstm_forward_training(
    int d, int N, int nO, int nI, int nT,
    np.ndarray G,
    float* Y,
    float* C,
    float* X,
    float* Wx,
    float* Wh,
    np.ndarray bias,
    int* lengths,
    float* Yt2,
    float* Ct2,
) except -1:
    cdef double one = 1.0
    blis.cy.gemm(
        blis.cy.NO_TRANSPOSE, blis.cy.TRANSPOSE,
        N, nO*4, nI,
        one,
        X, nI, 1,
        Wx, nI, 1,
        one,
        <float*>G.data, nO*4, 1
    )
    cdef int t, batch_size
    cdef int seq_i = 0 if d == 0 else N
    cdef int i, j
    cdef np.ndarray Gt3_
    for t in range(nT):
        if d == 0:
            batch_size = lengths[t]
        else:
            batch_size = lengths[nT-(t+1)]
            seq_i -= batch_size
        # Prepare the inputs
        Yt3 = &Y[seq_i*nO]
        Ct3 = &C[seq_i*nO]
        Gt3_ = G[seq_i : seq_i+batch_size]
        Gt3 = <float*>Gt3_.data
        # Now do the actual calculation
        blis.cy.gemm(
            blis.cy.NO_TRANSPOSE, blis.cy.TRANSPOSE,
            batch_size, nO*4, nO,
            one,
            Yt2, nO, 1,
            Wh, nO, 1,
            one,
            Gt3, nO*4, 1
        )
        # This is super weird: if we remove this add, it gets slower? I guess
        # it does cache prefetching or something? It's annoying though --- it
        # means I can't really refactor further, because speed goes down if I
        # remove this.
        Gt3_ += bias
        #for i in range(batch_size):
        #    for j in range(nO*4):
        #        Gt3[i*nO*4+j] += bias[j]
        cpu_lstm_activate_fwd(Gt3, batch_size, nO)
        cpu_lstm_gates_fwd(Yt3, Ct3, Gt3, Ct2, batch_size, nO)
        if d == 0:
            seq_i += batch_size
        # We need to keep a full-sized array here, padded with the
        # sequence-start values. This isn't necessary for the l2r part, but
        # for the r2l part it's necessary, as we otherwise would have the
        # previous step smaller than the current.
        memcpy(Yt2, Yt3, sizeof(Yt3[0]) * batch_size * nO)
        memcpy(Ct2, Ct3, sizeof(Ct3[0]) * batch_size * nO)
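# Per timestep, the kernels above implement the standard LSTM recurrence, with
# the four gates interleaved in the last dimension of G in the order
# (forget, input, output, candidate). An illustrative numpy view:
#
#     f, i, o = sigmoid(G[..., 0]), sigmoid(G[..., 1]), sigmoid(G[..., 2])
#     c_tilde = tanh(G[..., 3])            # cpu_lstm_activate_fwd
#     C_t = f * C_prev + i * c_tilde       # cpu_lstm_gates_fwd
#     Y_t = o * tanh(C_t)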
def backprop_lstm(np.ndarray dY, np.ndarray lengths, np.ndarray params, fwd_state):
    xp = numpy
    cdef np.ndarray Y
    cdef np.ndarray G
    cdef np.ndarray C
    cdef np.ndarray X
    cdef np.ndarray Yid
    cdef np.ndarray Cid
    cdef np.ndarray Gid
    cdef np.ndarray Wx, Wh, bias
    cdef np.ndarray dWx, dWh, d_bias
    cdef np.ndarray dYid

    Y, G, C, X = fwd_state
    cdef int depth = C.shape[0]
    cdef int dirs = C.shape[1]
    cdef int N = C.shape[2]
    cdef int nO = C.shape[3]
    cdef int nI = X.shape[1]
    cdef int batch_size = lengths[0]
    cdef int nT = lengths.shape[0]

    # We don't need to store all the cells for all the layers.
    cdef np.ndarray dC = xp.zeros((N, nO), dtype=C.dtype)
    cdef np.ndarray dG = xp.zeros((N, nO*4), dtype=C.dtype)
    cdef np.ndarray d_params = xp.zeros((params.shape[0],), dtype=params.dtype)
    # Collect the params and slices. It makes it a bit easier to get the
    # indexing right, when we're iterating backwards.
    params_i = 0
    all_layer_params = []
    for i in range(depth):
        all_layer_params.append([])
        n_inputs = nI if i == 0 else (nO * dirs)
        for d in range(dirs):
            layer_params, params_i = _split_weights(params, i, nO, n_inputs, params_i)
            layer_params = _transpose_weights(layer_params)
            all_layer_params[-1].append((layer_params, params_i))
    params_i = 0
    all_layer_grads = []
    for i in range(depth):
        all_layer_grads.append([])
        n_inputs = nI if i == 0 else (nO * dirs)
        for d in range(dirs):
            layer_grads, params_i = _split_weights(params, i, nO, n_inputs, params_i)
            layer_grads = _transpose_weights(layer_grads)
            all_layer_grads[-1].append((layer_grads, params_i))
    # Similarly, we want to compute the indices first
    indices = []
    seq_i = 0
    for batch_size in lengths:
        indices.append((seq_i, batch_size))
        seq_i += batch_size

    cdef np.ndarray dX
    Xs = [X] + [Y[i].transpose(1, 0, 2).reshape((N, -1)) for i in range(depth-1)]
    dXs = [xp.zeros((X.shape[0], X.shape[1]), dtype=X.dtype) for X in Xs]
    # Okay, now do the actual looping
    for i in reversed(range(depth)):
        dY = dY.reshape((N, dirs, nO)).transpose((1, 0, 2))
        dX = dXs[i]
        X = Xs[i]
        if dirs >= 2:
            dY = numpy.ascontiguousarray(dY)
        for d in range(dirs):
            Wx, Wh, bias = all_layer_params[i][d][0]
            dWx, dWh, d_bias = all_layer_grads[i][d][0]
            assert Wx.shape[1] == dWx.shape[1] == X.shape[1] == dX.shape[1], (
                Wx.shape[1], dWx.shape[1], X.shape[1], dX.shape[1])
            dYid = dY[d]
            dC.fill(0.)
            dG.fill(0.)
            Cid = C[i, d]
            Gid = G[i, d]
            Yid = Y[i, d]
            assert (Cid.shape[0], Cid.shape[1]) == (N, nO)
            assert (Yid.shape[0], Yid.shape[1]) == (N, nO)
            assert (Gid.shape[0], Gid.shape[1]) == (N, nO*4)
            assert (dYid.shape[0], dYid.shape[1]) == (N, nO)
            assert (dC.shape[0], dC.shape[1]) == (N, nO)
            assert (dG.shape[0], dG.shape[1]) == (N, nO*4)
            _lstm_backward_training(d, N, nO, dX.shape[1], nT,
                <float*>dX.data,
                <float*>dYid.data,
                <float*>dC.data,
                <float*>dG.data,
                <float*>dWx.data,
                <float*>dWh.data,
                <float*>d_bias.data,
                <const float*>Cid.data,
                <const float*>Gid.data,
                <const float*>Yid.data,
                <const float*>X.data,
                <const float*>Wx.data,
                <const float*>Wh.data,
                list(indices)
            )
        dY = dX
        assert dX.shape[1] == X.shape[1]
    grad_parts = []
    for layer_grads in all_layer_grads:
        for dir_grads, _ in layer_grads:
            grad_parts.append(_untranspose_unsplit_weights(dir_grads))
    return dX, numpy.concatenate(grad_parts)


def _split_directions(X, dirs):
    if dirs == 1:
        return [X]
    else:
        X_ = X.reshape((X.shape[0], -1, dirs))
        Xs = []
        for d in range(dirs):
            Xs.append(numpy.ascontiguousarray(X_[:, d]))
        return Xs


cdef int _lstm_backward_training(
    int d, int N, int nO, int nI, int nT,
    float* dX,
    float* dY,
    float* dC,
    float* dG,
    float* dWx,
    float* dWh,
    float* d_bias,
    const float* C,
    const float* G,
    const float* Y,
    const float* X,
    const float* Wx,
    const float* Wh,
    indices,
) except -1:
    cdef int seq_t2
    cdef int seq_t3
    cdef double one = 1.0
    if d == 0:
        seq_t3, size_t3 = indices[-1]
        indices = indices[:-1]
        indices.reverse()
    else:
        seq_t3, size_t3 = indices[0]
        indices = indices[1:]
    cdef int batch_size
    for seq_t2, size_t2 in indices:
        dGt3 = &dG[seq_t3*nO*4]
        dXt3 = &dX[seq_t3*nI]
        dYt3 = &dY[seq_t3*nO]
        dCt3 = &dC[seq_t3*nO]
        dYt2 = &dY[seq_t2*nO]
        dCt2 = &dC[seq_t2*nO]
        Ct3 = &C[seq_t3*nO]
        Gt3 = &G[seq_t3*nO*4]
        Ct2 = &C[seq_t2*nO]
        batch_size = min(size_t2, size_t3)
        cpu_lstm_gates_bwd(dGt3, dCt2, dYt3, dCt3, Gt3, Ct3, Ct2, batch_size * nO)
        # Backprop hidden-to-hidden w.r.t. hidden.
        # dYt2 += dGt3 @ Wh
        blis.cy.gemm(
            blis.cy.NO_TRANSPOSE, blis.cy.NO_TRANSPOSE,
            batch_size, nO, nO*4,
            one,
            dGt3, nO*4, 1,
            <float*>Wh, nO, 1,
            one,
            dYt2, nO, 1
        )
        seq_t3 = seq_t2
        size_t3 = size_t2

    # Backprop input-to-hidden w.r.t. weights.
    # dWx += dG @ X
    blis.cy.gemm(
        blis.cy.TRANSPOSE, blis.cy.NO_TRANSPOSE,
        nO*4, nI, N,
        one,
        dG, nO*4, 1,
        <float*>X, nI, 1,
        one,
        dWx, nI, 1
    )
    # Backprop hidden-to-hidden w.r.t weights.
    # dWh += dG @ Y
    blis.cy.gemm(
        blis.cy.TRANSPOSE, blis.cy.NO_TRANSPOSE,
        nO*4, nO, N,
        one,
        dG, nO*4, 1,
        <float*>Y, nO, 1,
        one,
        dWh, nO, 1
    )
    # Backprop bias
    for i in range(N):
        for j in range(nO*4):
            d_bias[j] += dG[i*nO*4+j]

    # Backprop input-to-hidden w.r.t. input
    blis.cy.gemm(
        blis.cy.NO_TRANSPOSE, blis.cy.NO_TRANSPOSE,
        N, nI, nO*4,
        one,
        dG, nO*4, 1,
        <float*>Wx, nI, 1,
        one,
        dX, nI, 1
    )
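# A numpy sketch of the weight-gradient accumulation the GEMMs above perform
# (illustrative; dG, X, Y, Wx are the arrays those calls operate on):
#
#     dWx += dG.T @ X            # (nO*4, nI)
#     dWh += dG.T @ Y            # (nO*4, nO)
#     d_bias += dG.sum(axis=0)   # (nO*4,)
#     dX += dG @ Wx              # (N, nI)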
def _split_weights(np.ndarray params, int i, int nO, int nI, int params_i):
    Wx_size = 4 * nO * nI
    bx_size = 4 * nO
    Wh_size = 4 * nO * nO
    bh_size = 4 * nO
    Wx = params[params_i : params_i + Wx_size].reshape((4 * nO, nI))
    params_i += Wx_size
    bx = params[params_i : params_i + bx_size].reshape((4 * nO,))
    params_i += bx_size
    Wh = params[params_i : params_i + Wh_size].reshape((4 * nO, nO))
    params_i += Wh_size
    bh = params[params_i : params_i + bh_size].reshape((4 * nO,))
    params_i += bh_size
    return ((Wx, bx), (Wh, bh)), params_i


def _transpose_weights(params):
    # Transpose the parameters so that the gates are the last dimension. This
    # makes it easier to fuse.
    (Wx, bx), (Wh, bh) = params
    Wx = Wx.reshape((4, -1, Wx.shape[-1]))
    Wx = Wx.transpose((1, 0, 2)).reshape((-1, Wx.shape[-1]))
    bx = bx.reshape((4, -1)).transpose((1, 0)).reshape((-1,))
    Wh = Wh.reshape((4, -1, Wh.shape[-1]))
    Wh = Wh.transpose((1, 0, 2)).reshape((-1, Wh.shape[-1]))
    bh = bh.reshape((4, -1)).transpose((1, 0)).reshape((-1,))
    ascontig = numpy.ascontiguousarray
    Wx = ascontig(Wx)
    Wh = ascontig(Wh)
    bias = ascontig(bx) + bh
    return Wx, Wh, bias


def _untranspose_unsplit_weights(params):
    Wx, Wh, bias = params
    nO = Wh.shape[1]
    nI = Wx.shape[1]
    Wx = Wx.reshape((-1, 4, nI)).transpose((1, 0, 2)).reshape((-1, nI))
    Wh = Wh.reshape((-1, 4, nO)).transpose((1, 0, 2)).reshape((-1, nO))
    bias = bias.reshape((-1, 4)).transpose((1, 0)).reshape((-1,))
    zeros = numpy.zeros(bias.shape, dtype="f")
    return numpy.concatenate((Wx.ravel(), bias, Wh.ravel(), zeros))
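# Layout sketch for a single layer/direction's parameter slab, following the
# sizes in _split_weights above (nO = hidden width, nI = input width):
#
#     offset 0                  : Wx, 4*nO*nI values, shape (4*nO, nI)
#     offset 4*nO*nI            : bx, 4*nO values
#     offset 4*nO*nI + 4*nO     : Wh, 4*nO*nO values, shape (4*nO, nO)
#     offset 4*nO*(nI + 1 + nO) : bh, 4*nO values
#
# _transpose_weights then interleaves the four gates into the last dimension
# and folds bx + bh into a single bias vector.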
cdef inline float sigmoid(float X) nogil:
    return 1./(1. + expf(-X))


cdef inline float dsigmoid(float y) nogil:
    return y*(1-y)


cdef inline float dtanh(float y) nogil:
    return 1-y**2


cdef void cpu_lstm_activate_fwd(float* gates, int B, int N) nogil:
    """Apply sigmoid activation in-place to columns 0, 1, 2 and tanh to
    column 3. The data is assumed to have the gates in the last dimension.
    """
    # This just does the following, but unrolled slightly to give
    # a better chance at simd.
    #
    # gates[g+i+0] = sigmoid(gates[g+i+0])
    # gates[g+i+1] = sigmoid(gates[g+i+1])
    # gates[g+i+2] = sigmoid(gates[g+i+2])
    # gates[g+i+3] = tanh(gates[g+i+3])
    #
    # I would've hoped the compiler would find this itself? It seems to make
    # it like, 10% faster. It feels like a dumb thing to do but it's not much
    # code. The problem with this sort of thing is it needs to be
    # rebenchmarked later... It's fine to revert this at a later date to the
    # simpler loop. Shrug. The weird thing is, why should the batch entries
    # be a good loop stride here? Surely something to do with cache lines
    # would make more sense?
    cdef int i, b, g
    g = 0
    for b in range(B):
        g = b * N * 4
        end = g + N*4
        while g < end:
            gates[g+0] = expf(-gates[g+0])
            gates[g+1] = expf(-gates[g+1])
            gates[g+2] = expf(-gates[g+2])
            g += 4
        g = b * N * 4
        while g < end:
            gates[g+0] += 1
            gates[g+1] += 1
            gates[g+2] += 1
            g += 4
        g = b * N * 4
        while g < end:
            gates[g+0] = 1.0 / gates[g+0]
            gates[g+1] = 1.0 / gates[g+1]
            gates[g+2] = 1.0 / gates[g+2]
            g += 4
        g = b * N * 4
        while g < end:
            gates[g+3] = tanhf(gates[g+3])
            g += 4


cdef void cpu_lstm_gates_fwd(float* hiddens, float* cells,
        const float* gates, const float* prevcells, int B, int N) nogil:
    cdef float hf, hi, ho, hc, ct2, ct3
    cdef int i, b, g, c, h
    g = 0
    c = 0
    h = 0
    while g < B*N*4:
        hf = gates[g+0]
        hi = gates[g+1]
        ho = gates[g+2]
        hc = gates[g+3]
        ct2 = prevcells[c]
        ct3 = hf * ct2 + hi * hc
        hiddens[h] = tanhf(ct3) * ho
        cells[c] = ct3
        g += 4
        c += 1
        h += 1


cdef void cpu_lstm_gates_bwd(
    float* dGt3,
    float* dCt2,
    const float* dYt3,
    const float* dCt3,
    const float* Gt3,
    const float* Ct3,
    const float* Ct2,
    int N
) nogil:
    cdef int i
    cdef float ct2, ct3, hf, hi, ho, hc, tanh_ct3
    cdef float d_ho, d_tanh_ct3, dct3, d_hi, d_hc, d_hf
    for i in range(N):
        ct2 = Ct2[i]
        ct3 = Ct3[i]
        dct3 = dCt3[i]
        dyt3 = dYt3[i]
        hf = Gt3[i*4+0]
        hi = Gt3[i*4+1]
        ho = Gt3[i*4+2]
        hc = Gt3[i*4+3]
        tanh_ct3 = tanhf(ct3)
        # 3b: Yt3 = tanhCt3 * ho
        d_ho = dyt3 * tanh_ct3
        d_tanh_ct3 = dyt3 * ho
        # 3a: tanhCt3 = tanh(Ct3)
        dct3 += d_tanh_ct3 * dtanh(tanh_ct3)
        # 2b: Ct3 += hi * hc
        d_hi = dct3 * hc
        d_hc = dct3 * hi
        # 2a: Ct3 = hf * Ct2
        d_hf = dct3 * ct2
        dCt2[i] = dct3 * hf
        dGt3[i*4+0] = d_hf * dsigmoid(hf)  # 1a
        dGt3[i*4+1] = d_hi * dsigmoid(hi)  # 1b
        dGt3[i*4+2] = d_ho * dsigmoid(ho)  # 1c
        dGt3[i*4+3] = d_hc * dtanh(hc)     # 1d


cdef void MurmurHash3_x86_128_uint64(
    const uint64_t val,
    const uint32_t seed,
    uint32_t *out
) nogil:
    cdef uint64_t h1, h2

    h1 = val
    h1 *= 0x87c37b91114253d5ull
    h1 = (h1 << 31) | (h1 >> 33)
    h1 *= 0x4cf5ad432745937full
    h1 ^= seed
    h1 ^= 8
    h2 = seed
    h2 ^= 8
    h1 += h2
    h2 += h1
    h1 ^= h1 >> 33
    h1 *= 0xff51afd7ed558ccdull
    h1 ^= h1 >> 33
    h1 *= 0xc4ceb9fe1a85ec53ull
    h1 ^= h1 >> 33
    h2 ^= h2 >> 33
    h2 *= 0xff51afd7ed558ccdull
    h2 ^= h2 >> 33
    h2 *= 0xc4ceb9fe1a85ec53ull
    h2 ^= h2 >> 33
    h1 += h2
    h2 += h1

    out[0] = h1 & 0xffffffffu
    out[1] = h1 >> 32
    out[2] = h2 & 0xffffffffu
    out[3] = h2 >> 32


def _check_compatible_shape(u: np.ndarray, v: np.ndarray):
    if u.shape != v.shape:
        msg = f"arrays have incompatible shapes: {u.shape} and {v.shape}"
        raise ValueError(msg)


cdef inline np.ndarray _inplace_or_copy(np.ndarray X, inplace):
    if inplace:
        return X
    else:
        return numpy.array(X)
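# Usage sketch for the hashing helpers above (illustrative; the output values
# are whatever MurmurHash3 produces for these inputs, not shown here):
#
#     ops = NumpyOps()
#     ids = numpy.asarray([10, 20, 30], dtype="uint64")
#     keys = ops.hash(ids, 0)        # shape (3, 4), dtype uint32
#     bigrams = ops.ngrams(2, ids)   # shape (2,), dtype uint64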