// ai-content-maker/.venv/Lib/site-packages/thinc/backends/cpu_kernels.hh
//
// 483 lines
// 14 KiB
// C++

#ifndef CPU_KERNELS_HH
#define CPU_KERNELS_HH
#include <algorithm>
#include <cmath>
#include <cstring>
#include <stdexcept>
#include <string>
#include <type_traits>
// Ideally we'd use an alias declaration for a generic definition of
// *axpy. But Cython doesn't support alias declarations yet:
//
// https://github.com/cython/cython/issues/3272
//
// template <typename T>
// using axpy = void (*)(int N, T alpha, const T* X, int incX,
// T *Y, int incY);
//
// So, instead we'll do this the pre-C++11 way:
// Holder for the function-pointer type of an *axpy-style routine over element
// type T; `axpy<T>::ptr` is the type used to pass such a routine around.
// NOTE(review): the parameter names mirror BLAS ?axpy (Y += alpha * X with
// strides incX/incY) -- presumably that is the expected contract; confirm
// against the routine actually supplied by the caller.
template <typename T>
struct axpy {
// Signature: (N, alpha, X, incX, Y, incY); X is read-only, Y is writable.
typedef void (*ptr)(int N, T alpha, const T* X, int incX, T *Y, int incY);
};
// All elementwise functions, such as most activations, work in-place.
// Result of an argmax search as produced by the argmax templates in this
// file: the largest value found and the position it was found at.
// T is the element type, L the type used for positions.
template <typename T, typename L>
struct argmax_result {
// Largest value seen.
T max;
// Zero-based position of `max` within the searched values.
L max_idx;
};
// Scans arr[0..len) and returns the largest value together with its position.
// Only a strictly greater element replaces the current best, so ties keep the
// earliest position. arr[0] is read unconditionally, so the array must hold
// at least one element.
template <typename T, typename L>
argmax_result<T, L> argmax(T const *arr, L len)
{
    static_assert(std::is_floating_point<T>::value,
        "Array should be floating point");
    static_assert(std::is_integral<L>::value, "Array length should be integral");

    // Seed the search with the first element.
    argmax_result<T, L> best { arr[0], 0 };

    L pos = 1;
    while (pos < len) {
        T const candidate = arr[pos];
        if (candidate > best.max) {
            best.max = candidate;
            best.max_idx = pos;
        }
        ++pos;
    }

    return best;
}
// The next two templates define argmax for a fixed number of elements.
// Base case of the fixed-arity argmax: a single value is its own maximum and
// sits at position 0.
template <typename T, typename L>
argmax_result<T, L> argmax(T a) {
    static_assert(std::is_floating_point<T>::value, "Argument should be floating point");

    return argmax_result<T, L> { a, 0 };
}
// Recursive case of the fixed-arity argmax: find the best of the trailing
// arguments first, then decide whether the leading argument `a` beats it.
// `a` wins unless the best of the rest is strictly greater, so on ties the
// earliest argument is reported.
template<typename T, typename L, typename... Args>
argmax_result<T, L> argmax(T a, Args... args) {
    static_assert(std::is_floating_point<T>::value, "Arguments should be floating point");

    argmax_result<T, L> best = argmax<T, L>(args...);

    if (!(best.max > a)) {
        // The leading argument takes over and becomes position 0.
        best.max = a;
        best.max_idx = 0;
        return best;
    }

    // The winner is among the trailing arguments; shift its position by one
    // to account for `a` in front of them.
    best.max_idx += 1;
    return best;
}
// In-place scaled accumulation: X[i] += scale * Y[i] for every i in [0, N).
// Nothing is touched when N is zero or negative.
template <typename A, typename L>
void vec_add(A* X, const A* Y, A scale, L N)
{
    static_assert(std::is_floating_point<A>::value,
        "Array should be floating point");
    static_assert(std::is_integral<L>::value, "Array length should be integral");

    L idx = 0;
    while (idx < N) {
        X[idx] += scale * Y[idx];
        ++idx;
    }
}
// Maxout forward pass: for each of the B * O cells, pick the largest of its P
// consecutive candidates in `cands__bop`.
//
// Cell i reads cands__bop[i * P .. i * P + P) and writes the winning value to
// best__bo[i] and its position within the cell to which__bo[i]. On ties the
// earliest candidate wins.
template <typename A, typename L>
void cpu_maxout(A* best__bo, L* which__bo, const A* cands__bop, L B, L O, L P)
{
    static_assert(std::is_floating_point<A>::value,
        "Array should be floating point");
    static_assert(std::is_integral<L>::value, "Array length should be integral");

    // Count cells in the type the product B * O naturally has, so the loop
    // counter can never be narrower than the bound it is compared against.
    using index_t = decltype(B * O);
    const index_t n_cells = B * O;

    // For small inputs, we use an unrolled argmax.
    if (P == 2) {
        for (index_t i = 0; i < n_cells; ++i) {
            A const *input = cands__bop + i * P;
            auto r = argmax<A, L>(input[0], input[1]);
            which__bo[i] = r.max_idx;
            best__bo[i] = r.max;
        }
    } else if (P == 3) {
        for (index_t i = 0; i < n_cells; ++i) {
            A const *input = cands__bop + i * P;
            auto r = argmax<A, L>(input[0], input[1], input[2]);
            which__bo[i] = r.max_idx;
            best__bo[i] = r.max;
        }
    } else {
        // General case: linear scan over the P candidates of the cell.
        for (index_t i = 0; i < n_cells; ++i) {
            auto r = argmax<A, L>(cands__bop + i * P, P);
            which__bo[i] = r.max_idx;
            best__bo[i] = r.max;
        }
    }
}
// Maxout backward pass: routes each of the B * O incoming gradients in
// `dX__bo` to the candidate slot recorded in `which__bo`.
//
// Cell i owns the P consecutive entries dX__bop[i * P .. i * P + P); only the
// entry selected by which__bo[i] is written, the others keep whatever the
// caller put there.
//
// Throws std::out_of_range when a recorded index is negative or not below P.
// Cells processed before the offending one have already been written.
template <typename A, typename L>
void cpu_backprop_maxout(A* dX__bop, const A* dX__bo, const L* which__bo,
    L B, L O, L P)
{
    static_assert(std::is_floating_point<A>::value,
        "Array should be floating point");
    static_assert(std::is_integral<L>::value, "Array length should be integral");

    for (L b = 0; b < B; ++b) {
        for (L o = 0; o < O; ++o) {
            const L piece = *which__bo;
            // Reject both ends: a negative index would write before the
            // cell's slice just as a too-large one writes past it.
            if (piece < 0 || piece >= P) {
                throw std::out_of_range(std::string("index ") + std::to_string(piece) + " is out of bounds for maxout with size " + std::to_string(P));
            }
            dX__bop[piece] = *dX__bo;

            // Advance to the next cell in all three arrays.
            dX__bop += P;
            dX__bo += 1;
            which__bo += 1;
        }
    }
}
// Column-wise maximum over the rows of each sequence.
//
// `X__to` is read as consecutive rows of O values; `lengths__b` holds B
// sequence lengths and T is the number of rows available. For every sequence
// one row of O maxima is written to `maxes__bo` and the row offset (within
// the sequence) of each maximum to `which__bo`. On ties the earliest row wins.
//
// Throws std::invalid_argument when a length is not positive and
// std::out_of_range when a length exceeds the rows not yet consumed.
template <typename A, typename L>
void cpu_reduce_max(A* maxes__bo, L* which__bo, const A* X__to,
    const L* lengths__b, L B, L T, L O)
{
    static_assert(std::is_floating_point<A>::value,
        "Array should be floating point");
    static_assert(std::is_integral<L>::value, "Array length should be integral");

    for (const L* length = lengths__b; length < lengths__b + B; ++length) {
        if (*length <= 0)
            throw std::invalid_argument(std::string("all sequence lengths must be > 0, was: ") + std::to_string(*length));
        else if (*length > T) {
            throw std::out_of_range("lengths must sum up to the number of rows");
        }

        // T tracks how many rows are still unconsumed.
        T -= *length;

        // The first row seeds both outputs: its values are the maxima so far
        // and they all live at row offset 0. Writing the offsets explicitly
        // keeps the result independent of how the caller initialised
        // `which__bo`.
        std::memcpy(maxes__bo, X__to, O * sizeof(*maxes__bo));
        std::fill_n(which__bo, O, static_cast<L>(0));
        X__to += O;

        for (L i = 1; i < *length; ++i) {
            for (L j = 0; j < O; ++j) {
                if (X__to[j] > maxes__bo[j]) {
                    maxes__bo[j] = X__to[j];
                    which__bo[j] = i;
                }
            }
            X__to += O;
        }

        maxes__bo += O;
        which__bo += O;
    }
}
// Backward pass of the per-sequence column-wise maximum: each incoming
// gradient in `d_maxes__bo` is written to the row of `dX__to` recorded in
// `which__bo`.
//
// `dX__to` is addressed as consecutive rows of O values, split into B
// sequences by `lengths__b`; `which__bo` holds row offsets relative to the
// start of each sequence. Entries of `dX__to` that are not selected are left
// as the caller initialised them. T is not used by this function.
//
// Throws std::out_of_range when a recorded offset is negative or not below
// the length of its sequence. Entries handled before the offending one have
// already been written.
template <typename A, typename L>
void cpu_backprop_reduce_max(A* dX__to, const A* d_maxes__bo, const L* which__bo,
    const L* lengths__b, L B, L T, L O)
{
    static_assert(std::is_floating_point<A>::value,
        "Array should be floating point");
    static_assert(std::is_integral<L>::value, "Array length should be integral");

    for (const L* length = lengths__b; length < lengths__b + B; ++length) {
        for (L i = 0; i < O; ++i) {
            L item = which__bo[i];
            // Reject both ends: a negative offset would land in a preceding
            // sequence (or before the buffer) instead of this one.
            if (item < 0 || item >= *length) {
                throw std::out_of_range(std::string("index ") + std::to_string(item) + " is out of bounds for maxout with length " + std::to_string(*length));
            }
            dX__to[item * O + i] = d_maxes__bo[i];
        }

        // Move on to the next sequence in all three arrays.
        dX__to += *length * O;
        d_maxes__bo += O;
        which__bo += O;
    }
}
// Averages the rows of each sequence into one output row.
//
// `X__to` is read as consecutive rows of O values; `lengths__b` holds B
// sequence lengths and T is the number of rows available. Every row of a
// sequence is scaled by 1 / length and accumulated into the matching row of
// `means__bo`, so the output must already hold the values the means are added
// to (presumably zeros -- confirm against the caller). The output row of an
// empty sequence is skipped untouched.
//
// Throws std::invalid_argument for a negative length and std::out_of_range
// when a length exceeds the rows not yet consumed.
template <typename A, typename L>
void cpu_reduce_mean(A* means__bo, const A* X__to, const L* lengths__b,
    L B, L T, L O)
{
    static_assert(std::is_floating_point<A>::value,
        "Array should be floating point");
    static_assert(std::is_integral<L>::value, "Array length should be integral");

    for (const L* length = lengths__b; length < lengths__b + B; ++length) {
        if (*length < 0) {
            throw std::invalid_argument(std::string("all sequence lengths must be >= 0, was: ") + std::to_string(*length));
        }
        else if (*length == 0) {
            // Empty sequence: nothing to accumulate, and skipping here also
            // avoids forming 1 / 0 below.
            means__bo += O;
            continue;
        }
        else if (*length > T) {
            throw std::out_of_range("lengths must sum up to the number of rows");
        }

        // T tracks how many rows are still unconsumed.
        T -= *length;

        A scale = 1. / *length;
        for (L i = 0; i < *length; ++i) {
            vec_add(means__bo, X__to, scale, O);
            X__to += O;
        }

        means__bo += O;
    }
}
// Backward pass of the per-sequence mean: every row of a sequence receives
// the sequence's incoming gradient row scaled by 1 / length.
//
// `dX__to` is walked as consecutive rows of O values, split into B sequences
// by `lengths__b`; the scaled gradient is accumulated (added) into each row.
// T is not used by this function.
template <typename A, typename L>
void cpu_backprop_reduce_mean(A* dX__to, const A* d_means__bo, const L* lengths__b,
    L B, L T, L O)
{
    static_assert(std::is_floating_point<A>::value,
        "Array should be floating point");
    static_assert(std::is_integral<L>::value, "Array length should be integral");

    for (L seq = 0; seq < B; ++seq) {
        const L n_rows = lengths__b[seq];
        const A scale = 1. / n_rows;

        for (L row = 0; row < n_rows; ++row) {
            vec_add(dX__to, d_means__bo, scale, O);
            dX__to += O;
        }

        // One gradient row per sequence.
        d_means__bo += O;
    }
}
// In-place Mish activation: y <- y * tanh(log(1 + exp(y))) for each of the N
// entries that lie strictly below `threshold`; all other entries are left
// unchanged.
template <typename A, typename L>
void cpu_mish(A* Y, L N, A threshold)
{
    static_assert(std::is_floating_point<A>::value,
        "Array should be floating point");
    static_assert(std::is_integral<L>::value, "Array length should be integral");

    for (L idx = 0; idx < N; ++idx) {
        A& y = Y[idx];
        if (!(y < threshold)) {
            continue;
        }
        const auto softplus = std::log(1.0 + std::exp(y));
        y *= std::tanh(softplus);
    }
}
// Backward pass of Mish: scales each of the N gradients in `dX` in place by
// exp(x) * omega / delta^2 (terms defined below), evaluated at the matching
// input X[i]. Gradients whose input is not strictly below `threshold` are
// left unchanged.
template <typename A, typename L>
void cpu_backprop_mish(A* dX, const A* X, L N, A threshold)
{
    static_assert(std::is_floating_point<A>::value,
        "Array should be floating point");
    static_assert(std::is_integral<L>::value, "Array length should be integral");

    for (L idx = 0; idx < N; ++idx) {
        const A x = X[idx];
        if (!(x < threshold)) {
            continue;
        }

        const A e1 = std::exp(x);
        const A e2 = std::exp(2 * x);
        const A e3 = std::exp(3 * x);

        const A omega = (4. * (x + 1)) + (4 * e2) + e3 + e1 * (4. * x + 6);
        const A delta = 2. * e1 + e2 + 2.;

        dX[idx] = dX[idx] * ((e1 * omega) / (delta * delta));
    }
}
// Sums the rows of each sequence into one output row.
//
// `X__to` is read as consecutive rows of O values; `lengths__b` holds B
// sequence lengths and T is the number of rows available. Every row of a
// sequence is accumulated into the matching row of `sums__bo`, so the output
// must already hold the values the sums are added to (presumably zeros --
// confirm against the caller). The output row of an empty sequence is skipped
// untouched.
//
// Throws std::invalid_argument for a negative length and std::out_of_range
// when a length exceeds the rows not yet consumed.
template <typename A, typename L>
void cpu_reduce_sum(A* sums__bo, const A* X__to, const L* lengths__b,
    L B, L T, L O)
{
    static_assert(std::is_floating_point<A>::value,
        "Array should be floating point");
    static_assert(std::is_integral<L>::value, "Array length should be integral");

    for (const L* length = lengths__b; length < lengths__b + B; ++length) {
        if (*length < 0) {
            throw std::invalid_argument(std::string("all sequence lengths must be >= 0, was: ") + std::to_string(*length));
        }
        else if (*length == 0) {
            // Empty sequence: nothing to accumulate for this output row.
            sums__bo += O;
            continue;
        }
        else if (*length > T) {
            throw std::out_of_range("lengths must sum up to the number of rows");
        }

        // T tracks how many rows are still unconsumed.
        T -= *length;

        for (L i = 0; i < *length; ++i) {
            vec_add(sums__bo, X__to, static_cast<A>(1.0), O);
            X__to += O;
        }

        sums__bo += O;
    }
}
// Backward pass of the per-sequence sum: every row of a sequence receives the
// sequence's incoming gradient row unchanged.
//
// `dX__to` is walked as consecutive rows of O values, split into B sequences
// by `lengths__b`; the gradient is accumulated (added) into each row.
// T is not used by this function.
template <typename A, typename L>
void cpu_backprop_reduce_sum(A* dX__to, const A* d_sums__bo, const L* lengths__b,
    L B, L T, L O)
{
    static_assert(std::is_floating_point<A>::value,
        "Array should be floating point");
    static_assert(std::is_integral<L>::value, "Array length should be integral");

    for (L seq = 0; seq < B; ++seq) {
        const L n_rows = lengths__b[seq];

        for (L row = 0; row < n_rows; ++row) {
            vec_add(dX__to, d_sums__bo, static_cast<A>(1.0), O);
            dX__to += O;
        }

        // One gradient row per sequence.
        d_sums__bo += O;
    }
}
// In-place ReLU over N entries: every entry that compares <= 0 is replaced by
// 0.0; everything else (including NaN, which fails the comparison) is kept.
template <typename A, typename L>
void cpu_relu(A* X, L N)
{
    static_assert(std::is_floating_point<A>::value,
        "Array should be floating point");
    static_assert(std::is_integral<L>::value, "Array length should be integral");

    L idx = 0;
    while (idx < N) {
        A& value = X[idx];
        if (value <= 0.0) {
            value = 0.0;
        }
        ++idx;
    }
}
// Builds, for every row of X, the concatenation of the rows inside a window
// of nW rows on either side, never reaching across a sequence boundary.
//
// X is read as rows of I values and `lengths` holds nL sequence lengths that
// partition those rows. Row j of `output` consists of 2 * nW + 1 slots of I
// values each; only the slots whose source row lies in the same sequence are
// written, the remaining slots are left as the caller initialised them (the
// padding). B is not used by this function.
//
// With nW = 1 and a single three-row sequence
//     1a 1b 1c
//     2a 2b 2c
//     3a 3b 3c
// the output is
//     __ __ __ 1a 1b 1c 2a 2b 2c
//     1a 1b 1c 2a 2b 2c 3a 3b 3c
//     2a 2b 2c 3a 3b 3c __ __ __
// where __ marks padding. With nW = 2 the same input gives
//     __ __ __ __ __ __ 1a 1b 1c 2a 2b 2c 3a 3b 3c
//     __ __ __ 1a 1b 1c 2a 2b 2c 3a 3b 3c __ __ __
//     1a 1b 1c 2a 2b 2c 3a 3b 3c __ __ __ __ __ __
template <typename A, typename L>
void seq2col(A* output, const A* X, const L* lengths, L nW, L B, L I, L nL)
{
    static_assert(std::is_floating_point<A>::value,
        "Array should be floating point");
    static_assert(std::is_integral<L>::value, "Array length should be integral");

    // Slots per output row: the row itself plus nW neighbours on each side.
    const L slots = nW * 2 + 1;

    L first = 0;
    for (L s = 0; s < nL; ++s) {
        // Rows [first, last) belong to the current sequence.
        const L last = first + lengths[s];

        for (L row = first; row < last; ++row) {
            // Window around `row`, ignoring sequence bounds for now.
            const L win_lo = row - nW;
            const L win_hi = row + nW + 1;

            // Part of the window that actually lies inside the sequence.
            const L src_lo = std::max(first, win_lo);
            const L src_hi = std::min(last, win_hi);
            const L count = src_hi - src_lo;

            // Slots cut off on the left stay padding, so the copy starts
            // that many slots into the output row.
            const L skipped = src_lo - win_lo;

            std::memcpy(output + (row * slots * I) + (skipped * I),
                        X + (src_lo * I),
                        count * I * sizeof(*output));
        }

        first = last;
    }
}
// Backward pass of seq2col: folds the windowed gradient `d_cols` back onto
// the rows it was gathered from.
//
// `d_cols` is read as rows of 2 * nW + 1 slots of I values each, `d_seqs` as
// rows of I values, and `lengths` holds nL sequence lengths partitioning the
// rows. For each row j, the slots of d_cols row j that correspond to rows of
// the same sequence are added onto those rows of `d_seqs`; slots that fall
// outside the sequence are ignored. Results are accumulated, so `d_seqs`
// must already hold the values to add to. B is not used by this function.
template <typename A, typename L>
void backprop_seq2col(A* d_seqs, const A* d_cols, const L* lengths, L B, L I, L nW, L nL)
{
    static_assert(std::is_floating_point<A>::value,
        "Array should be floating point");
    static_assert(std::is_integral<L>::value, "Array length should be integral");

    // Slots per d_cols row: the row itself plus nW neighbours on each side.
    const L slots = nW * 2 + 1;

    L first = 0;
    for (L s = 0; s < nL; ++s) {
        // Rows [first, last) belong to the current sequence.
        const L last = first + lengths[s];

        for (L row = first; row < last; ++row) {
            // Window around `row`, ignoring sequence bounds for now.
            const L win_lo = row - nW;
            const L win_hi = row + nW + 1;

            // Part of the window that actually lies inside the sequence.
            const L dst_lo = std::max(first, win_lo);
            const L dst_hi = std::min(last, win_hi);
            const L count = dst_hi - dst_lo;

            // If the window is cut short on the left, skip the same number
            // of slots at the start of the d_cols row.
            const L skipped = dst_lo - win_lo;

            vec_add(d_seqs + dst_lo * I,
                    d_cols + (row * slots * I) + (skipped * I),
                    static_cast<A>(1.), count * I);
        }

        first = last;
    }
}
// Gathers table rows by index and hands each one to `axpy` together with the
// output row of its batch item.
//
// `indices_bk` holds K indices for each of the B batch items. For every index
// the call is axpy(O, 1.0, table_to + idx * O, 1, out_bo + b * O, 1), i.e. the
// selected row of O values and the b-th output row, both with unit stride.
// NOTE(review): presumably `axpy` adds the table row into the output row
// (BLAS ?axpy semantics), making each output row the sum of its K gathered
// rows -- confirm against the routine passed in.
//
// Throws std::out_of_range when an index is negative or not below T; rows
// gathered before the offending index have already been processed.
template <typename F, typename I, typename L>
void cpu_gather_add(typename axpy<F>::ptr axpy, F* out_bo, const F* table_to, const I* indices_bk, L T, L O, L B, L K) {
    for (L b = 0; b < B; ++b) {
        F* out_row = out_bo + b * O;
        for (L k = 0; k < K; ++k) {
            const I idx = indices_bk[b * K + k];
            // Valid rows are 0 .. T - 1: idx == T already addresses the row
            // one past the end of the table.
            if (idx < 0 || idx >= T) {
                throw std::out_of_range("Embedding index out-of-bounds");
            }
            axpy(O, 1.0, table_to + idx * O, 1, out_row, 1);
        }
    }
}
#endif // CPU_KERNELS_HH