#ifndef CPU_KERNELS_HH #define CPU_KERNELS_HH #include #include #include #include #include #include // Ideally we'd use an alias declaration for a generic definition of // *axpy. But Cython doesn't support alias declarations yet: // // https://github.com/cython/cython/issues/3272 // // template // using axpy = void (*)(int N, T alpha, const T* X, int incX, // T *Y, int incY); // // So, instead we'll do this the pre-C++11 way: template struct axpy { typedef void (*ptr)(int N, T alpha, const T* X, int incX, T *Y, int incY); }; // All elementwise functions, such as most activations, work in-place. template struct argmax_result { T max; L max_idx; }; template argmax_result argmax(T const *arr, L len) { static_assert(std::is_floating_point::value, "Array should be floating point"); static_assert(std::is_integral::value, "Array length should be integral"); argmax_result r { arr[0], 0 }; for (L i = 1; i < len; ++i) { if (arr[i] > r.max) { r.max = arr[i]; r.max_idx = i; } } return r; } // The next two templates define argmax for a fixed number of elements. template argmax_result argmax(T a) { static_assert(std::is_floating_point::value, "Argument should be floating point"); argmax_result acc { a, 0 }; return acc; } template argmax_result argmax(T a, Args... args) { static_assert(std::is_floating_point::value, "Arguments should be floating point"); auto acc = argmax(args...); if (acc.max > a) { acc.max_idx += 1; } else { acc.max_idx = 0; acc.max = a; } return acc; } template void vec_add(A* X, const A* Y, A scale, L N) { static_assert(std::is_floating_point::value, "Array should be floating point"); static_assert(std::is_integral::value, "Array length should be integral"); for (L i = 0; i < N; ++i) X[i] += scale * Y[i]; } template void cpu_maxout(A* best__bo, L* which__bo, const A* cands__bop, L B, L O, L P) { static_assert(std::is_floating_point::value, "Array should be floating point"); static_assert(std::is_integral::value, "Array length should be integral"); // For small inputs, we use an unrolled argmax. if (P == 2) { for (int i = 0; i < B * O; ++i) { A const *input = cands__bop + i * P; auto r = argmax(input[0], input[1]); which__bo[i] = r.max_idx; best__bo[i] = r.max; } } else if (P == 3) { for (int i = 0; i < B * O; ++i) { A const *input = cands__bop + i * P; auto r = argmax(input[0], input[1], input[2]); which__bo[i] = r.max_idx; best__bo[i] = r.max; } } else { for (int i = 0; i < B * O; ++i) { auto r = argmax(cands__bop + i * P, P); which__bo[i] = r.max_idx; best__bo[i] = r.max; } } } template void cpu_backprop_maxout(A* dX__bop, const A* dX__bo, const L* which__bo, L B, L O, L P) { static_assert(std::is_floating_point::value, "Array should be floating point"); static_assert(std::is_integral::value, "Array length should be integral"); for (L b = 0; b < B; ++b) { for (L o = 0; o < O; ++o) { if (*which__bo >= P) { throw std::out_of_range(std::string("index ") + std::to_string(*which__bo) + " is out of bounds for maxout with size " + std::to_string(P)); } dX__bop[*which__bo] = *dX__bo; dX__bop += P; dX__bo += 1; which__bo += 1; } } } template void cpu_reduce_max(A* maxes__bo, L* which__bo, const A* X__to, const L* lengths__b, L B, L T, L O) { static_assert(std::is_floating_point::value, "Array should be floating point"); static_assert(std::is_integral::value, "Array length should be integral"); for (const L* length = lengths__b; length < lengths__b + B; ++length) { if (*length <= 0) throw std::invalid_argument(std::string("all sequence lengths must be > 0, was: ") + std::to_string(*length)); else if (*length > T) { throw std::out_of_range("lengths must sum up to the number of rows"); } T -= *length; std::memcpy(maxes__bo, X__to, O * sizeof(*maxes__bo)); X__to += O; for (L i = 1; i < *length; ++i) { for (L j = 0; j < O; ++j) { if (X__to[j] > maxes__bo[j]) { maxes__bo[j] = X__to[j]; which__bo[j] = i; } } X__to += O; } maxes__bo += O; which__bo += O; } } template void cpu_backprop_reduce_max(A* dX__to, const A* d_maxes__bo, const L* which__bo, const L* lengths__b, L B, L T, L O) { static_assert(std::is_floating_point::value, "Array should be floating point"); static_assert(std::is_integral::value, "Array length should be integral"); for (const L* length = lengths__b; length < lengths__b + B; ++length) { for (L i = 0; i < O; ++i) { L item = which__bo[i]; if (item >= *length) { throw std::out_of_range(std::string("index ") + std::to_string(item) + " is out of bounds for maxout with length " + std::to_string(*length)); } dX__to[item * O + i] = d_maxes__bo[i]; } dX__to += *length * O; d_maxes__bo += O; which__bo += O; } } template void cpu_reduce_mean(A* means__bo, const A* X__to, const L* lengths__b, L B, L T, L O) { static_assert(std::is_floating_point::value, "Array should be floating point"); static_assert(std::is_integral::value, "Array length should be integral"); for (const L* length = lengths__b; length < lengths__b + B; ++length) { if (*length < 0) { throw std::invalid_argument(std::string("all sequence lengths must be >= 0, was: ") + std::to_string(*length)); } else if (length == 0) { means__bo += O; continue; } else if (*length > T) { throw std::out_of_range("lengths must sum up to the number of rows"); } T -= *length; A scale = 1. / *length; for (L i = 0; i < *length; ++i) { vec_add(means__bo, X__to, scale, O); X__to += O; } means__bo += O; } } template void cpu_backprop_reduce_mean(A* dX__to, const A* d_means__bo, const L* lengths__b, L B, L T, L O) { static_assert(std::is_floating_point::value, "Array should be floating point"); static_assert(std::is_integral::value, "Array length should be integral"); for (const L* length = lengths__b; length < lengths__b + B; ++length) { A scale = 1. / *length; for (L i = 0; i < *length; ++i) { vec_add(dX__to, d_means__bo, scale, O); dX__to += O; } d_means__bo += O; } } template void cpu_mish(A* Y, L N, A threshold) { static_assert(std::is_floating_point::value, "Array should be floating point"); static_assert(std::is_integral::value, "Array length should be integral"); for (L i = 0; i < N; ++i) { if (Y[i] < threshold) { Y[i] *= std::tanh(std::log(1.0 + std::exp(Y[i]))); } } } template void cpu_backprop_mish(A* dX, const A* X, L N, A threshold) { static_assert(std::is_floating_point::value, "Array should be floating point"); static_assert(std::is_integral::value, "Array length should be integral"); for (L i = 0; i < N; ++i) { A x = X[i]; if (x < threshold) { A exp_x = std::exp(x); A exp_2x = std::exp(2 * x); A exp_3x = std::exp(3 * x); A omega = (4. * (x + 1)) + (4 * exp_2x) + exp_3x + exp_x * (4. * x + 6); A delta = 2. * exp_x + exp_2x + 2.; dX[i] = dX[i] * ((exp_x * omega) / (delta * delta)); } } } template void cpu_reduce_sum(A* sums__bo, const A* X__to, const L* lengths__b, L B, L T, L O) { static_assert(std::is_floating_point::value, "Array should be floating point"); static_assert(std::is_integral::value, "Array length should be integral"); for (const L* length = lengths__b; length < lengths__b + B; ++length) { if (*length < 0) { throw std::invalid_argument(std::string("all sequence lengths must be >= 0, was: ") + std::to_string(*length)); } else if (length == 0) { sums__bo += O; continue; } else if (*length > T) { throw std::out_of_range("lengths must sum up to the number of rows"); } T -= *length; for (L i = 0; i < *length; ++i) { vec_add(sums__bo, X__to, static_cast(1.0), O); X__to += O; } sums__bo += O; } } template void cpu_backprop_reduce_sum(A* dX__to, const A* d_sums__bo, const L* lengths__b, L B, L T, L O) { static_assert(std::is_floating_point::value, "Array should be floating point"); static_assert(std::is_integral::value, "Array length should be integral"); for (const L* length = lengths__b; length < lengths__b + B; ++length) { for (L i = 0; i < *length; ++i) { vec_add(dX__to, d_sums__bo, static_cast(1.0), O); dX__to += O; } d_sums__bo += O; } } template void cpu_relu(A* X, L N) { static_assert(std::is_floating_point::value, "Array should be floating point"); static_assert(std::is_integral::value, "Array length should be integral"); for (L i = 0; i < N; ++i) { if (X[i] <= 0.0) { X[i] = 0.0; } } } template void seq2col(A* output, const A* X, const L* lengths, L nW, L B, L I, L nL) { // Let's say nW is 1 (it usually is). Then we want to take: // 1a 1b 1c // 2a 2b 2c // 3a 3b 3c // And make // __ __ __ 1a 1b 1c 2a 2b 2c // 1a 1b 1c 2a 2b 2c 3a 3b 3c // 2a 2b 2c 3a 3b 3c __ __ __ // Where __ is padding. // Now let's say nW is 2. Then we want to take: // 1a 1b 1c // 2a 2b 2c // 3a 3b 3c // And make // __ __ __ __ __ __ 1a 1b 1c 2a 2b 2c 3a 3b 3c // __ __ __ 1a 1b 1c 2a 2b 2c 3a 3b 3c __ __ __ // 1a 1b 1c 2a 2b 2c 3a 3b 3c __ __ __ __ __ __ // * x_start=-6, x_end=9 : (0-2) * 3, (0+2+1) * 3 // * x_start=-3, x_end=13 : (1-2) * 3, (1+2+1) * 3 // * x_start=0, x_end=16 : (2-2) * 3, (2+2+1) * 3 // If lengths > 1, then the sequence lengths dictate // the boundaries/padding rather than the begin/end // of X. static_assert(std::is_floating_point::value, "Array should be floating point"); static_assert(std::is_integral::value, "Array length should be integral"); L nF = nW * 2 + 1; L seq_start = 0; for (L i = 0; i < nL; ++i) { // Calculate the bounds of the next sequence. L seq_end = seq_start + lengths[i]; for (L j = seq_start; j < seq_end; ++j) { // Find the unconstrained window around b, which // may be out of the sequence bounds. L window_start = j - nW; L window_end = j + nW + 1; // Find the sequence-constrained window around b. L x_start = std::max(seq_start, window_start); L x_end = std::min(seq_end, window_end); L n_elems = x_end - x_start; L out_offset = x_start - window_start; std::memcpy(output + (j * nF * I) + (out_offset * I), X + (x_start * I), n_elems * I * sizeof(*output)); } seq_start += lengths[i]; } } template void backprop_seq2col(A* d_seqs, const A* d_cols, const L* lengths, L B, L I, L nW, L nL) { // here's what we're doing, if we had 2d indexing. // for i in range(b): // d_seq[i] += d_cols[i-2, 4] // d_seq[i] += d_cols[i-1, 3] // d_seq[i] += d_cols[i, 2] // d_seq[i] += d_cols[i+1, 1] // d_seq[i] += d_cols[i+2, 0] static_assert(std::is_floating_point::value, "Array should be floating point"); static_assert(std::is_integral::value, "Array length should be integral"); L nF = nW * 2 + 1; L seq_start = 0; for (L i = 0; i < nL; ++i) { // Calculate the bounds of the next sequence. L seq_end = seq_start + lengths[i]; for (L j = seq_start; j < seq_end; ++j) { // Find the unconstrained window around b, which // may be out of the sequence bounds. L window_begin = j - nW; L window_end = j + nW + 1; // Find the sequence-constrained window around b. L d_seqs_begin = std::max(seq_start, window_begin); L d_seqs_end = std::min(seq_end, window_end); L n_elems = d_seqs_end - d_seqs_begin; // If the left window is cut short, we want to // start by the same amount in the output. L out_offset = d_seqs_begin - window_begin; vec_add(d_seqs + d_seqs_begin * I, d_cols + (j * nF * I) + (out_offset * I), static_cast(1.), n_elems * I); } seq_start += lengths[i]; } } template void cpu_gather_add(typename axpy::ptr axpy, F* out_bo, const F* table_to, const I* indices_bk, L T, L O, L B, L K) { for (L b = 0; b < B; ++b) { for (L k = 0; k < K; ++k) { I idx = indices_bk[b * K + k]; if (idx > T) { throw std::out_of_range("Embedding index out-of-bounds"); } axpy(O, 1.0, table_to + idx * O, 1, out_bo + b * O, 1); } } } #endif // CPU_KERNELS_HH