1038 lines
26 KiB
Python
1038 lines
26 KiB
Python
# Author: Leland McInnes <leland.mcinnes@gmail.com>
|
|
#
|
|
# License: BSD 3 clause
|
|
import numpy as np
|
|
import numba
|
|
|
|
from pynndescent.optimal_transport import (
|
|
allocate_graph_structures,
|
|
initialize_graph_structures,
|
|
initialize_supply,
|
|
initialize_cost,
|
|
network_simplex_core,
|
|
total_cost,
|
|
ProblemStatus,
|
|
sinkhorn_transport_plan,
|
|
)
|
|
|
|
_mock_identity = np.eye(2, dtype=np.float32)
|
|
_mock_ones = np.ones(2, dtype=np.float32)
|
|
_dummy_cost = np.zeros((2, 2), dtype=np.float64)
|
|
|
|
FLOAT32_EPS = np.finfo(np.float32).eps
|
|
FLOAT32_MAX = np.finfo(np.float32).max
|
|
|
|
popcnt = np.array(
|
|
[bin(i).count('1') for i in range(256)],
|
|
dtype=np.float32
|
|
)
|
|
|
|
|
|
@numba.njit(fastmath=True)
|
|
def euclidean(x, y):
|
|
r"""Standard euclidean distance.
|
|
|
|
.. math::
|
|
D(x, y) = \\sqrt{\sum_i (x_i - y_i)^2}
|
|
"""
|
|
result = 0.0
|
|
for i in range(x.shape[0]):
|
|
result += (x[i] - y[i]) ** 2
|
|
return np.sqrt(result)
|
|
|
|
|
|
@numba.njit(
|
|
[
|
|
"f4(f4[::1],f4[::1])",
|
|
numba.types.float32(
|
|
numba.types.Array(numba.types.float32, 1, "C", readonly=True),
|
|
numba.types.Array(numba.types.float32, 1, "C", readonly=True),
|
|
),
|
|
],
|
|
fastmath=True,
|
|
locals={
|
|
"result": numba.types.float32,
|
|
"diff": numba.types.float32,
|
|
"dim": numba.types.intp,
|
|
"i": numba.types.uint16,
|
|
},
|
|
)
|
|
def squared_euclidean(x, y):
|
|
r"""Squared euclidean distance.
|
|
|
|
.. math::
|
|
D(x, y) = \sum_i (x_i - y_i)^2
|
|
"""
|
|
result = 0.0
|
|
dim = x.shape[0]
|
|
for i in range(dim):
|
|
diff = x[i] - y[i]
|
|
result += diff * diff
|
|
|
|
return result
|
|
|
|
|
|
@numba.njit(fastmath=True)
|
|
def standardised_euclidean(x, y, sigma=_mock_ones):
|
|
r"""Euclidean distance standardised against a vector of standard
|
|
deviations per coordinate.
|
|
|
|
.. math::
|
|
D(x, y) = \sqrt{\sum_i \frac{(x_i - y_i)**2}{v_i}}
|
|
"""
|
|
result = 0.0
|
|
for i in range(x.shape[0]):
|
|
result += ((x[i] - y[i]) ** 2) / sigma[i]
|
|
|
|
return np.sqrt(result)
|
|
|
|
|
|
@numba.njit(fastmath=True)
|
|
def manhattan(x, y):
|
|
r"""Manhattan, taxicab, or l1 distance.
|
|
|
|
.. math::
|
|
D(x, y) = \sum_i |x_i - y_i|
|
|
"""
|
|
result = 0.0
|
|
for i in range(x.shape[0]):
|
|
result += np.abs(x[i] - y[i])
|
|
|
|
return result
|
|
|
|
|
|
@numba.njit(fastmath=True)
|
|
def chebyshev(x, y):
|
|
r"""Chebyshev or l-infinity distance.
|
|
|
|
.. math::
|
|
D(x, y) = \max_i |x_i - y_i|
|
|
"""
|
|
result = 0.0
|
|
for i in range(x.shape[0]):
|
|
result = max(result, np.abs(x[i] - y[i]))
|
|
|
|
return result
|
|
|
|
|
|
@numba.njit(fastmath=True)
|
|
def minkowski(x, y, p=2):
|
|
r"""Minkowski distance.
|
|
|
|
.. math::
|
|
D(x, y) = \left(\sum_i |x_i - y_i|^p\right)^{\frac{1}{p}}
|
|
|
|
This is a general distance. For p=1 it is equivalent to
|
|
manhattan distance, for p=2 it is Euclidean distance, and
|
|
for p=infinity it is Chebyshev distance. In general it is better
|
|
to use the more specialised functions for those distances.
|
|
"""
|
|
result = 0.0
|
|
for i in range(x.shape[0]):
|
|
result += (np.abs(x[i] - y[i])) ** p
|
|
|
|
return result ** (1.0 / p)
|
|
|
|
|
|
@numba.njit(fastmath=True)
|
|
def weighted_minkowski(x, y, w=_mock_ones, p=2):
|
|
r"""A weighted version of Minkowski distance.
|
|
|
|
.. math::
|
|
D(x, y) = \left(\sum_i w_i |x_i - y_i|^p\right)^{\frac{1}{p}}
|
|
|
|
If weights w_i are inverse standard deviations of graph_data in each dimension
|
|
then this represented a standardised Minkowski distance (and is
|
|
equivalent to standardised Euclidean distance for p=1).
|
|
"""
|
|
result = 0.0
|
|
for i in range(x.shape[0]):
|
|
result += w[i] * np.abs(x[i] - y[i]) ** p
|
|
|
|
return result ** (1.0 / p)
|
|
|
|
|
|
@numba.njit(fastmath=True)
|
|
def mahalanobis(x, y, vinv=_mock_identity):
|
|
result = 0.0
|
|
|
|
diff = np.empty(x.shape[0], dtype=np.float32)
|
|
|
|
for i in range(x.shape[0]):
|
|
diff[i] = x[i] - y[i]
|
|
|
|
for i in range(x.shape[0]):
|
|
tmp = 0.0
|
|
for j in range(x.shape[0]):
|
|
tmp += vinv[i, j] * diff[j]
|
|
result += tmp * diff[i]
|
|
|
|
return np.sqrt(result)
|
|
|
|
|
|
@numba.njit(fastmath=True)
|
|
def hamming(x, y):
|
|
result = 0.0
|
|
for i in range(x.shape[0]):
|
|
if x[i] != y[i]:
|
|
result += 1.0
|
|
|
|
return float(result) / x.shape[0]
|
|
|
|
|
|
@numba.njit(fastmath=True)
|
|
def canberra(x, y):
|
|
result = 0.0
|
|
for i in range(x.shape[0]):
|
|
denominator = np.abs(x[i]) + np.abs(y[i])
|
|
if denominator > 0:
|
|
result += np.abs(x[i] - y[i]) / denominator
|
|
|
|
return result
|
|
|
|
|
|
@numba.njit(fastmath=True)
|
|
def bray_curtis(x, y):
|
|
numerator = 0.0
|
|
denominator = 0.0
|
|
for i in range(x.shape[0]):
|
|
numerator += np.abs(x[i] - y[i])
|
|
denominator += np.abs(x[i] + y[i])
|
|
|
|
if denominator > 0.0:
|
|
return float(numerator) / denominator
|
|
else:
|
|
return 0.0
|
|
|
|
|
|
@numba.njit(fastmath=True)
|
|
def jaccard(x, y):
|
|
num_non_zero = 0.0
|
|
num_equal = 0.0
|
|
for i in range(x.shape[0]):
|
|
x_true = x[i] != 0
|
|
y_true = y[i] != 0
|
|
num_non_zero += x_true or y_true
|
|
num_equal += x_true and y_true
|
|
|
|
if num_non_zero == 0.0:
|
|
return 0.0
|
|
else:
|
|
return float(num_non_zero - num_equal) / num_non_zero
|
|
|
|
|
|
@numba.njit(
|
|
[
|
|
"f4(f4[::1],f4[::1])",
|
|
numba.types.float32(
|
|
numba.types.Array(numba.types.float32, 1, "C", readonly=True),
|
|
numba.types.Array(numba.types.float32, 1, "C", readonly=True),
|
|
),
|
|
],
|
|
fastmath=True,
|
|
locals={
|
|
"result": numba.types.float32,
|
|
"num_non_zero": numba.types.float32,
|
|
"num_equal": numba.types.float32,
|
|
"x_true": numba.types.uint8,
|
|
"y_true": numba.types.uint8,
|
|
"dim": numba.types.intp,
|
|
"i": numba.types.uint16,
|
|
},
|
|
)
|
|
def alternative_jaccard(x, y):
|
|
num_non_zero = 0.0
|
|
num_equal = 0.0
|
|
dim = x.shape[0]
|
|
for i in range(dim):
|
|
x_true = x[i] != 0
|
|
y_true = y[i] != 0
|
|
num_non_zero += x_true or y_true
|
|
num_equal += x_true and y_true
|
|
|
|
if num_non_zero == 0.0:
|
|
return 0.0
|
|
else:
|
|
return -np.log2(num_equal / num_non_zero)
|
|
|
|
|
|
@numba.vectorize(fastmath=True)
|
|
def correct_alternative_jaccard(v):
|
|
return 1.0 - pow(2.0, -v)
|
|
|
|
|
|
@numba.njit(fastmath=True)
|
|
def matching(x, y):
|
|
num_not_equal = 0.0
|
|
for i in range(x.shape[0]):
|
|
x_true = x[i] != 0
|
|
y_true = y[i] != 0
|
|
num_not_equal += x_true != y_true
|
|
|
|
return float(num_not_equal) / x.shape[0]
|
|
|
|
|
|
@numba.njit(fastmath=True)
|
|
def dice(x, y):
|
|
num_true_true = 0.0
|
|
num_not_equal = 0.0
|
|
for i in range(x.shape[0]):
|
|
x_true = x[i] != 0
|
|
y_true = y[i] != 0
|
|
num_true_true += x_true and y_true
|
|
num_not_equal += x_true != y_true
|
|
|
|
if num_not_equal == 0.0:
|
|
return 0.0
|
|
else:
|
|
return num_not_equal / (2.0 * num_true_true + num_not_equal)
|
|
|
|
|
|
@numba.njit(fastmath=True)
|
|
def kulsinski(x, y):
|
|
num_true_true = 0.0
|
|
num_not_equal = 0.0
|
|
for i in range(x.shape[0]):
|
|
x_true = x[i] != 0
|
|
y_true = y[i] != 0
|
|
num_true_true += x_true and y_true
|
|
num_not_equal += x_true != y_true
|
|
|
|
if num_not_equal == 0:
|
|
return 0.0
|
|
else:
|
|
return float(num_not_equal - num_true_true + x.shape[0]) / (
|
|
num_not_equal + x.shape[0]
|
|
)
|
|
|
|
|
|
@numba.njit(fastmath=True)
|
|
def rogers_tanimoto(x, y):
|
|
num_not_equal = 0.0
|
|
for i in range(x.shape[0]):
|
|
x_true = x[i] != 0
|
|
y_true = y[i] != 0
|
|
num_not_equal += x_true != y_true
|
|
|
|
return (2.0 * num_not_equal) / (x.shape[0] + num_not_equal)
|
|
|
|
|
|
@numba.njit(fastmath=True)
|
|
def russellrao(x, y):
|
|
num_true_true = 0.0
|
|
for i in range(x.shape[0]):
|
|
x_true = x[i] != 0
|
|
y_true = y[i] != 0
|
|
num_true_true += x_true and y_true
|
|
|
|
if num_true_true == np.sum(x != 0) and num_true_true == np.sum(y != 0):
|
|
return 0.0
|
|
else:
|
|
return float(x.shape[0] - num_true_true) / (x.shape[0])
|
|
|
|
|
|
@numba.njit(fastmath=True)
|
|
def sokal_michener(x, y):
|
|
num_not_equal = 0.0
|
|
for i in range(x.shape[0]):
|
|
x_true = x[i] != 0
|
|
y_true = y[i] != 0
|
|
num_not_equal += x_true != y_true
|
|
|
|
return (2.0 * num_not_equal) / (x.shape[0] + num_not_equal)
|
|
|
|
|
|
@numba.njit(fastmath=True)
|
|
def sokal_sneath(x, y):
|
|
num_true_true = 0.0
|
|
num_not_equal = 0.0
|
|
for i in range(x.shape[0]):
|
|
x_true = x[i] != 0
|
|
y_true = y[i] != 0
|
|
num_true_true += x_true and y_true
|
|
num_not_equal += x_true != y_true
|
|
|
|
if num_not_equal == 0.0:
|
|
return 0.0
|
|
else:
|
|
return num_not_equal / (0.5 * num_true_true + num_not_equal)
|
|
|
|
|
|
@numba.njit(fastmath=True)
|
|
def haversine(x, y):
|
|
if x.shape[0] != 2:
|
|
raise ValueError("haversine is only defined for 2 dimensional graph_data")
|
|
sin_lat = np.sin(0.5 * (x[0] - y[0]))
|
|
sin_long = np.sin(0.5 * (x[1] - y[1]))
|
|
result = np.sqrt(sin_lat**2 + np.cos(x[0]) * np.cos(y[0]) * sin_long**2)
|
|
return 2.0 * np.arcsin(result)
|
|
|
|
|
|
@numba.njit(fastmath=True)
|
|
def yule(x, y):
|
|
num_true_true = 0.0
|
|
num_true_false = 0.0
|
|
num_false_true = 0.0
|
|
for i in range(x.shape[0]):
|
|
x_true = x[i] != 0
|
|
y_true = y[i] != 0
|
|
num_true_true += x_true and y_true
|
|
num_true_false += x_true and (not y_true)
|
|
num_false_true += (not x_true) and y_true
|
|
|
|
num_false_false = x.shape[0] - num_true_true - num_true_false - num_false_true
|
|
|
|
if num_true_false == 0.0 or num_false_true == 0.0:
|
|
return 0.0
|
|
else:
|
|
return (2.0 * num_true_false * num_false_true) / (
|
|
num_true_true * num_false_false + num_true_false * num_false_true
|
|
)
|
|
|
|
|
|
@numba.njit(fastmath=True)
|
|
def cosine(x, y):
|
|
result = 0.0
|
|
norm_x = 0.0
|
|
norm_y = 0.0
|
|
for i in range(x.shape[0]):
|
|
result += x[i] * y[i]
|
|
norm_x += x[i] ** 2
|
|
norm_y += y[i] ** 2
|
|
|
|
if norm_x == 0.0 and norm_y == 0.0:
|
|
return 0.0
|
|
elif norm_x == 0.0 or norm_y == 0.0:
|
|
return 1.0
|
|
else:
|
|
return 1.0 - (result / np.sqrt(norm_x * norm_y))
|
|
|
|
|
|
@numba.njit(
|
|
[
|
|
"f4(f4[::1],f4[::1])",
|
|
numba.types.float32(
|
|
numba.types.Array(numba.types.float32, 1, "C", readonly=True),
|
|
numba.types.Array(numba.types.float32, 1, "C", readonly=True),
|
|
),
|
|
],
|
|
fastmath=True,
|
|
locals={
|
|
"result": numba.types.float32,
|
|
"norm_x": numba.types.float32,
|
|
"norm_y": numba.types.float32,
|
|
"dim": numba.types.intp,
|
|
"i": numba.types.uint16,
|
|
},
|
|
)
|
|
def alternative_cosine(x, y):
|
|
result = 0.0
|
|
norm_x = 0.0
|
|
norm_y = 0.0
|
|
dim = x.shape[0]
|
|
for i in range(dim):
|
|
result += x[i] * y[i]
|
|
norm_x += x[i] * x[i]
|
|
norm_y += y[i] * y[i]
|
|
|
|
if norm_x == 0.0 and norm_y == 0.0:
|
|
return 0.0
|
|
elif norm_x == 0.0 or norm_y == 0.0:
|
|
return FLOAT32_MAX
|
|
elif result <= 0.0:
|
|
return FLOAT32_MAX
|
|
else:
|
|
result = np.sqrt(norm_x * norm_y) / result
|
|
return np.log2(result)
|
|
|
|
|
|
@numba.njit(
|
|
"f4(f4[::1],f4[::1])",
|
|
fastmath=True,
|
|
locals={
|
|
"result": numba.types.float32,
|
|
"dim": numba.types.intp,
|
|
"i": numba.types.uint16,
|
|
},
|
|
)
|
|
def dot(x, y):
|
|
result = 0.0
|
|
dim = x.shape[0]
|
|
for i in range(dim):
|
|
result += x[i] * y[i]
|
|
|
|
if result <= 0.0:
|
|
return 1.0
|
|
else:
|
|
return 1.0 - result
|
|
|
|
|
|
@numba.njit(
|
|
[
|
|
"f4(f4[::1],f4[::1])",
|
|
numba.types.float32(
|
|
numba.types.Array(numba.types.float32, 1, "C", readonly=True),
|
|
numba.types.Array(numba.types.float32, 1, "C", readonly=True),
|
|
),
|
|
],
|
|
fastmath=True,
|
|
locals={
|
|
"result": numba.types.float32,
|
|
"dim": numba.types.intp,
|
|
"i": numba.types.uint16,
|
|
},
|
|
)
|
|
def alternative_dot(x, y):
|
|
result = 0.0
|
|
dim = x.shape[0]
|
|
for i in range(dim):
|
|
result += x[i] * y[i]
|
|
|
|
if result <= 0.0:
|
|
return FLOAT32_MAX
|
|
else:
|
|
return -np.log2(result)
|
|
|
|
|
|
@numba.vectorize(fastmath=True)
|
|
def correct_alternative_cosine(d):
|
|
return 1.0 - pow(2.0, -d)
|
|
|
|
|
|
@numba.njit(fastmath=True)
|
|
def tsss(x, y):
|
|
d_euc_squared = 0.0
|
|
d_cos = 0.0
|
|
norm_x = 0.0
|
|
norm_y = 0.0
|
|
dim = x.shape[0]
|
|
|
|
for i in range(dim):
|
|
diff = x[i] - y[i]
|
|
d_euc_squared += diff * diff
|
|
d_cos += x[i] * y[i]
|
|
norm_x += x[i] * x[i]
|
|
norm_y += y[i] * y[i]
|
|
|
|
norm_x = np.sqrt(norm_x)
|
|
norm_y = np.sqrt(norm_y)
|
|
magnitude_difference = np.abs(norm_x - norm_y)
|
|
d_cos /= norm_x * norm_y
|
|
theta = np.arccos(d_cos) + np.radians(10) # Add 10 degrees as an "epsilon" to
|
|
# avoid problems
|
|
sector = ((np.sqrt(d_euc_squared) + magnitude_difference) ** 2) * theta
|
|
triangle = norm_x * norm_y * np.sin(theta) / 2.0
|
|
return triangle * sector
|
|
|
|
|
|
@numba.njit(fastmath=True)
|
|
def true_angular(x, y):
|
|
result = 0.0
|
|
norm_x = 0.0
|
|
norm_y = 0.0
|
|
dim = x.shape[0]
|
|
for i in range(dim):
|
|
result += x[i] * y[i]
|
|
norm_x += x[i] * x[i]
|
|
norm_y += y[i] * y[i]
|
|
|
|
if norm_x == 0.0 and norm_y == 0.0:
|
|
return 0.0
|
|
elif norm_x == 0.0 or norm_y == 0.0:
|
|
return FLOAT32_MAX
|
|
elif result <= 0.0:
|
|
return FLOAT32_MAX
|
|
else:
|
|
result = result / np.sqrt(norm_x * norm_y)
|
|
return 1.0 - (np.arccos(result) / np.pi)
|
|
|
|
|
|
@numba.vectorize(fastmath=True)
|
|
def true_angular_from_alt_cosine(d):
|
|
return 1.0 - (np.arccos(pow(2.0, -d)) / np.pi)
|
|
|
|
|
|
@numba.njit(fastmath=True)
|
|
def correlation(x, y):
|
|
mu_x = 0.0
|
|
mu_y = 0.0
|
|
norm_x = 0.0
|
|
norm_y = 0.0
|
|
dot_product = 0.0
|
|
|
|
for i in range(x.shape[0]):
|
|
mu_x += x[i]
|
|
mu_y += y[i]
|
|
|
|
mu_x /= x.shape[0]
|
|
mu_y /= x.shape[0]
|
|
|
|
for i in range(x.shape[0]):
|
|
shifted_x = x[i] - mu_x
|
|
shifted_y = y[i] - mu_y
|
|
norm_x += shifted_x**2
|
|
norm_y += shifted_y**2
|
|
dot_product += shifted_x * shifted_y
|
|
|
|
if norm_x == 0.0 and norm_y == 0.0:
|
|
return 0.0
|
|
elif dot_product == 0.0:
|
|
return 1.0
|
|
else:
|
|
return 1.0 - (dot_product / np.sqrt(norm_x * norm_y))
|
|
|
|
|
|
@numba.njit(
|
|
[
|
|
"f4(f4[::1],f4[::1])",
|
|
numba.types.float32(
|
|
numba.types.Array(numba.types.float32, 1, "C", readonly=True),
|
|
numba.types.Array(numba.types.float32, 1, "C", readonly=True),
|
|
),
|
|
],
|
|
fastmath=True,
|
|
locals={
|
|
"result": numba.types.float32,
|
|
"l1_norm_x": numba.types.float32,
|
|
"l1_norm_y": numba.types.float32,
|
|
"dim": numba.types.intp,
|
|
"i": numba.types.uint16,
|
|
},
|
|
)
|
|
def hellinger(x, y):
|
|
result = 0.0
|
|
l1_norm_x = 0.0
|
|
l1_norm_y = 0.0
|
|
dim = x.shape[0]
|
|
|
|
for i in range(dim):
|
|
result += np.sqrt(x[i] * y[i])
|
|
l1_norm_x += x[i]
|
|
l1_norm_y += y[i]
|
|
|
|
if l1_norm_x == 0 and l1_norm_y == 0:
|
|
return 0.0
|
|
elif l1_norm_x == 0 or l1_norm_y == 0:
|
|
return 1.0
|
|
else:
|
|
return np.sqrt(1 - result / np.sqrt(l1_norm_x * l1_norm_y))
|
|
|
|
|
|
@numba.njit(
|
|
[
|
|
"f4(f4[::1],f4[::1])",
|
|
numba.types.float32(
|
|
numba.types.Array(numba.types.float32, 1, "C", readonly=True),
|
|
numba.types.Array(numba.types.float32, 1, "C", readonly=True),
|
|
),
|
|
],
|
|
fastmath=True,
|
|
locals={
|
|
"result": numba.types.float32,
|
|
"l1_norm_x": numba.types.float32,
|
|
"l1_norm_y": numba.types.float32,
|
|
"dim": numba.types.intp,
|
|
"i": numba.types.uint16,
|
|
},
|
|
)
|
|
def alternative_hellinger(x, y):
|
|
result = 0.0
|
|
l1_norm_x = 0.0
|
|
l1_norm_y = 0.0
|
|
dim = x.shape[0]
|
|
|
|
for i in range(dim):
|
|
result += np.sqrt(x[i] * y[i])
|
|
l1_norm_x += x[i]
|
|
l1_norm_y += y[i]
|
|
|
|
if l1_norm_x == 0 and l1_norm_y == 0:
|
|
return 0.0
|
|
elif l1_norm_x == 0 or l1_norm_y == 0:
|
|
return FLOAT32_MAX
|
|
elif result <= 0:
|
|
return FLOAT32_MAX
|
|
else:
|
|
result = np.sqrt(l1_norm_x * l1_norm_y) / result
|
|
return np.log2(result)
|
|
|
|
|
|
@numba.vectorize(fastmath=True)
|
|
def correct_alternative_hellinger(d):
|
|
return np.sqrt(1.0 - pow(2.0, -d))
|
|
|
|
|
|
@numba.njit()
|
|
def rankdata(a, method="average"):
|
|
arr = np.ravel(np.asarray(a))
|
|
if method == "ordinal":
|
|
sorter = arr.argsort(kind="mergesort")
|
|
else:
|
|
sorter = arr.argsort(kind="quicksort")
|
|
|
|
inv = np.empty(sorter.size, dtype=np.intp)
|
|
inv[sorter] = np.arange(sorter.size)
|
|
|
|
if method == "ordinal":
|
|
return (inv + 1).astype(np.float64)
|
|
|
|
arr = arr[sorter]
|
|
obs = np.ones(arr.size, np.bool_)
|
|
obs[1:] = arr[1:] != arr[:-1]
|
|
dense = obs.cumsum()[inv]
|
|
|
|
if method == "dense":
|
|
return dense.astype(np.float64)
|
|
|
|
# cumulative counts of each unique value
|
|
nonzero = np.nonzero(obs)[0]
|
|
count = np.concatenate((nonzero, np.array([len(obs)], nonzero.dtype)))
|
|
|
|
if method == "max":
|
|
return count[dense].astype(np.float64)
|
|
|
|
if method == "min":
|
|
return (count[dense - 1] + 1).astype(np.float64)
|
|
|
|
# average method
|
|
return 0.5 * (count[dense] + count[dense - 1] + 1)
|
|
|
|
|
|
@numba.njit(fastmath=True)
|
|
def spearmanr(x, y):
|
|
x_rank = rankdata(x)
|
|
y_rank = rankdata(y)
|
|
|
|
return correlation(x_rank, y_rank)
|
|
|
|
|
|
@numba.njit(nogil=True)
|
|
def kantorovich(x, y, cost=_dummy_cost, max_iter=100000):
|
|
|
|
row_mask = x != 0
|
|
col_mask = y != 0
|
|
|
|
a = x[row_mask].astype(np.float64)
|
|
b = y[col_mask].astype(np.float64)
|
|
|
|
a_sum = a.sum()
|
|
b_sum = b.sum()
|
|
|
|
# if not isclose(a_sum, b_sum):
|
|
# raise ValueError(
|
|
# "Kantorovich distance inputs must be valid probability distributions."
|
|
# )
|
|
|
|
a /= a_sum
|
|
b /= b_sum
|
|
|
|
sub_cost = cost[row_mask, :][:, col_mask]
|
|
|
|
node_arc_data, spanning_tree, graph = allocate_graph_structures(
|
|
a.shape[0], b.shape[0], False
|
|
)
|
|
initialize_supply(a, -b, graph, node_arc_data.supply)
|
|
initialize_cost(sub_cost, graph, node_arc_data.cost)
|
|
# initialize_cost(cost, graph, node_arc_data.cost)
|
|
init_status = initialize_graph_structures(graph, node_arc_data, spanning_tree)
|
|
if init_status == False:
|
|
raise ValueError(
|
|
"Kantorovich distance inputs must be valid probability distributions."
|
|
)
|
|
solve_status = network_simplex_core(node_arc_data, spanning_tree, graph, max_iter)
|
|
# if solve_status == ProblemStatus.MAX_ITER_REACHED:
|
|
# print("WARNING: RESULT MIGHT BE INACCURATE\nMax number of iteration reached!")
|
|
if solve_status == ProblemStatus.INFEASIBLE:
|
|
raise ValueError(
|
|
"Optimal transport problem was INFEASIBLE. Please check inputs."
|
|
)
|
|
elif solve_status == ProblemStatus.UNBOUNDED:
|
|
raise ValueError(
|
|
"Optimal transport problem was UNBOUNDED. Please check inputs."
|
|
)
|
|
result = total_cost(node_arc_data.flow, node_arc_data.cost)
|
|
|
|
return result
|
|
|
|
|
|
@numba.njit(fastmath=True)
|
|
def sinkhorn(x, y, cost=_dummy_cost, regularization=1.0):
|
|
row_mask = x != 0
|
|
col_mask = y != 0
|
|
|
|
a = x[row_mask].astype(np.float64)
|
|
b = y[col_mask].astype(np.float64)
|
|
|
|
a_sum = a.sum()
|
|
b_sum = b.sum()
|
|
|
|
a /= a_sum
|
|
b /= b_sum
|
|
|
|
sub_cost = cost[row_mask, :][:, col_mask]
|
|
|
|
transport_plan = sinkhorn_transport_plan(
|
|
x, y, cost=sub_cost, regularization=regularization
|
|
)
|
|
dim_i = transport_plan.shape[0]
|
|
dim_j = transport_plan.shape[1]
|
|
result = 0.0
|
|
for i in range(dim_i):
|
|
for j in range(dim_j):
|
|
result += transport_plan[i, j] * cost[i, j]
|
|
|
|
return result
|
|
|
|
|
|
@numba.njit()
|
|
def jensen_shannon_divergence(x, y):
|
|
result = 0.0
|
|
l1_norm_x = 0.0
|
|
l1_norm_y = 0.0
|
|
dim = x.shape[0]
|
|
|
|
for i in range(dim):
|
|
l1_norm_x += x[i]
|
|
l1_norm_y += y[i]
|
|
|
|
l1_norm_x += FLOAT32_EPS * dim
|
|
l1_norm_y += FLOAT32_EPS * dim
|
|
|
|
pdf_x = (x + FLOAT32_EPS) / l1_norm_x
|
|
pdf_y = (y + FLOAT32_EPS) / l1_norm_y
|
|
m = 0.5 * (pdf_x + pdf_y)
|
|
|
|
for i in range(dim):
|
|
result += 0.5 * (
|
|
pdf_x[i] * np.log(pdf_x[i] / m[i]) + pdf_y[i] * np.log(pdf_y[i] / m[i])
|
|
)
|
|
|
|
return result
|
|
|
|
|
|
@numba.njit()
|
|
def wasserstein_1d(x, y, p=1):
|
|
x_sum = 0.0
|
|
y_sum = 0.0
|
|
for i in range(x.shape[0]):
|
|
x_sum += x[i]
|
|
y_sum += y[i]
|
|
|
|
x_cdf = x / x_sum
|
|
y_cdf = y / y_sum
|
|
|
|
for i in range(1, x_cdf.shape[0]):
|
|
x_cdf[i] += x_cdf[i - 1]
|
|
y_cdf[i] += y_cdf[i - 1]
|
|
|
|
return minkowski(x_cdf, y_cdf, p)
|
|
|
|
|
|
@numba.njit()
|
|
def circular_kantorovich(x, y, p=1):
|
|
x_sum = 0.0
|
|
y_sum = 0.0
|
|
for i in range(x.shape[0]):
|
|
x_sum += x[i]
|
|
y_sum += y[i]
|
|
|
|
x_cdf = x / x_sum
|
|
y_cdf = y / y_sum
|
|
|
|
for i in range(1, x_cdf.shape[0]):
|
|
x_cdf[i] += x_cdf[i - 1]
|
|
y_cdf[i] += y_cdf[i - 1]
|
|
|
|
mu = np.median((x_cdf - y_cdf) ** p)
|
|
|
|
# Now we just want minkowski distance on the CDFs shifted by mu
|
|
result = 0.0
|
|
if p > 2:
|
|
for i in range(x_cdf.shape[0]):
|
|
result += np.abs(x_cdf[i] - y_cdf[i] - mu) ** p
|
|
|
|
return result ** (1.0 / p)
|
|
|
|
elif p == 2:
|
|
for i in range(x_cdf.shape[0]):
|
|
val = x_cdf[i] - y_cdf[i] - mu
|
|
result += val * val
|
|
|
|
return np.sqrt(result)
|
|
|
|
elif p == 1:
|
|
for i in range(x_cdf.shape[0]):
|
|
result += np.abs(x_cdf[i] - y_cdf[i] - mu)
|
|
|
|
return result
|
|
|
|
else:
|
|
raise ValueError("Invalid p supplied to Kantorvich distance")
|
|
|
|
|
|
@numba.njit()
|
|
def symmetric_kl_divergence(x, y):
|
|
result = 0.0
|
|
l1_norm_x = 0.0
|
|
l1_norm_y = 0.0
|
|
dim = x.shape[0]
|
|
|
|
for i in range(dim):
|
|
l1_norm_x += x[i]
|
|
l1_norm_y += y[i]
|
|
|
|
l1_norm_x += FLOAT32_EPS * dim
|
|
l1_norm_y += FLOAT32_EPS * dim
|
|
|
|
pdf_x = (x + FLOAT32_EPS) / l1_norm_x
|
|
pdf_y = (y + FLOAT32_EPS) / l1_norm_y
|
|
|
|
for i in range(dim):
|
|
result += pdf_x[i] * np.log(pdf_x[i] / pdf_y[i]) + pdf_y[i] * np.log(
|
|
pdf_y[i] / pdf_x[i]
|
|
)
|
|
|
|
return result
|
|
|
|
|
|
@numba.njit(
|
|
[
|
|
"f4(u1[::1],u1[::1])",
|
|
numba.types.float32(
|
|
numba.types.Array(numba.types.uint8, 1, "C", readonly=True),
|
|
numba.types.Array(numba.types.uint8, 1, "C", readonly=True),
|
|
),
|
|
],
|
|
fastmath=True,
|
|
locals={
|
|
"result": numba.types.float32,
|
|
"intersection": numba.types.uint8,
|
|
"dim": numba.types.intp,
|
|
"i": numba.types.uint16,
|
|
},
|
|
)
|
|
def bit_hamming(x, y):
|
|
result = 0.0
|
|
dim = x.shape[0]
|
|
|
|
for i in range(dim):
|
|
intersection = x[i] ^ y[i]
|
|
result += popcnt[intersection]
|
|
|
|
return result
|
|
|
|
|
|
@numba.njit(
|
|
[
|
|
"f4(u1[::1],u1[::1])",
|
|
numba.types.float32(
|
|
numba.types.Array(numba.types.uint8, 1, "C", readonly=True),
|
|
numba.types.Array(numba.types.uint8, 1, "C", readonly=True),
|
|
),
|
|
],
|
|
fastmath=True,
|
|
locals={
|
|
"result": numba.types.float32,
|
|
"denom": numba.types.float32,
|
|
"and_": numba.types.uint8,
|
|
"or_": numba.types.uint8,
|
|
"dim": numba.types.intp,
|
|
"i": numba.types.uint16,
|
|
},
|
|
)
|
|
def bit_jaccard(x, y):
|
|
result = 0.0
|
|
denom = 0.0
|
|
dim = x.shape[0]
|
|
|
|
for i in range(dim):
|
|
and_ = x[i] & y[i]
|
|
or_ = x[i] | y[i]
|
|
result += popcnt[and_]
|
|
denom += popcnt[or_]
|
|
|
|
return -np.log(result / denom)
|
|
|
|
|
|
named_distances = {
|
|
# general minkowski distances
|
|
"euclidean": euclidean,
|
|
"l2": euclidean,
|
|
"sqeuclidean": squared_euclidean,
|
|
"manhattan": manhattan,
|
|
"taxicab": manhattan,
|
|
"l1": manhattan,
|
|
"chebyshev": chebyshev,
|
|
"linfinity": chebyshev,
|
|
"linfty": chebyshev,
|
|
"linf": chebyshev,
|
|
"minkowski": minkowski,
|
|
# Standardised/weighted distances
|
|
"seuclidean": standardised_euclidean,
|
|
"standardised_euclidean": standardised_euclidean,
|
|
"wminkowski": weighted_minkowski,
|
|
"weighted_minkowski": weighted_minkowski,
|
|
"mahalanobis": mahalanobis,
|
|
# Other distances
|
|
"canberra": canberra,
|
|
"cosine": cosine,
|
|
"dot": dot,
|
|
"correlation": correlation,
|
|
"haversine": haversine,
|
|
"braycurtis": bray_curtis,
|
|
"spearmanr": spearmanr,
|
|
"tsss": tsss,
|
|
"true_angular": true_angular,
|
|
# Distribution distances
|
|
"hellinger": hellinger,
|
|
"kantorovich": kantorovich,
|
|
"wasserstein": kantorovich,
|
|
"wasserstein_1d": wasserstein_1d,
|
|
"wasserstein-1d": wasserstein_1d,
|
|
"kantorovich-1d": wasserstein_1d,
|
|
"kantorovich_1d": wasserstein_1d,
|
|
"circular_kantorovich": circular_kantorovich,
|
|
"circular_wasserstein": circular_kantorovich,
|
|
"sinkhorn": sinkhorn,
|
|
"jensen-shannon": jensen_shannon_divergence,
|
|
"jensen_shannon": jensen_shannon_divergence,
|
|
"symmetric-kl": symmetric_kl_divergence,
|
|
"symmetric_kl": symmetric_kl_divergence,
|
|
"symmetric_kullback_liebler": symmetric_kl_divergence,
|
|
# Binary distances
|
|
"hamming": hamming,
|
|
"jaccard": jaccard,
|
|
"dice": dice,
|
|
"matching": matching,
|
|
"kulsinski": kulsinski,
|
|
"rogerstanimoto": rogers_tanimoto,
|
|
"russellrao": russellrao,
|
|
"sokalsneath": sokal_sneath,
|
|
"sokalmichener": sokal_michener,
|
|
"yule": yule,
|
|
"bit_hamming": bit_hamming,
|
|
"bit_jaccard": bit_jaccard,
|
|
}
|
|
|
|
# Some distances have a faster to compute alternative that
|
|
# retains the same ordering of distances. We can compute with
|
|
# this instead, and then correct the final distances when complete.
|
|
# This provides a list of distances that have such an alternative
|
|
# along with the alternative distance function and the correction
|
|
# function to be applied.
|
|
fast_distance_alternatives = {
|
|
"euclidean": {"dist": squared_euclidean, "correction": np.sqrt},
|
|
"l2": {"dist": squared_euclidean, "correction": np.sqrt},
|
|
"cosine": {"dist": alternative_cosine, "correction": correct_alternative_cosine},
|
|
"dot": {"dist": alternative_dot, "correction": correct_alternative_cosine},
|
|
"true_angular": {
|
|
"dist": alternative_cosine,
|
|
"correction": true_angular_from_alt_cosine,
|
|
},
|
|
"hellinger": {
|
|
"dist": alternative_hellinger,
|
|
"correction": correct_alternative_hellinger,
|
|
},
|
|
"jaccard": {"dist": alternative_jaccard, "correction": correct_alternative_jaccard},
|
|
}
|