500 lines
18 KiB
Python
500 lines
18 KiB
Python
import warnings
|
|
import numpy as np
|
|
from . import distributions
|
|
from .._lib._bunch import _make_tuple_bunch
|
|
from ._stats_pythran import siegelslopes as siegelslopes_pythran
|
|
from . import _mstats_basic
|
|
|
|
__all__ = ['_find_repeats', 'linregress', 'theilslopes', 'siegelslopes']
|
|
|
|
# This is not a namedtuple for backwards compatibility. See PR #12983
|
|
LinregressResult = _make_tuple_bunch('LinregressResult',
|
|
['slope', 'intercept', 'rvalue',
|
|
'pvalue', 'stderr'],
|
|
extra_field_names=['intercept_stderr'])
|
|
TheilslopesResult = _make_tuple_bunch('TheilslopesResult',
|
|
['slope', 'intercept',
|
|
'low_slope', 'high_slope'])
|
|
SiegelslopesResult = _make_tuple_bunch('SiegelslopesResult',
|
|
['slope', 'intercept'])
|
|
|
|
|
|
def linregress(x, y=None, alternative='two-sided'):
|
|
"""
|
|
Calculate a linear least-squares regression for two sets of measurements.
|
|
|
|
Parameters
|
|
----------
|
|
x, y : array_like
|
|
Two sets of measurements. Both arrays should have the same length. If
|
|
only `x` is given (and ``y=None``), then it must be a two-dimensional
|
|
array where one dimension has length 2. The two sets of measurements
|
|
are then found by splitting the array along the length-2 dimension. In
|
|
the case where ``y=None`` and `x` is a 2x2 array, ``linregress(x)`` is
|
|
equivalent to ``linregress(x[0], x[1])``.
|
|
alternative : {'two-sided', 'less', 'greater'}, optional
|
|
Defines the alternative hypothesis. Default is 'two-sided'.
|
|
The following options are available:
|
|
|
|
* 'two-sided': the slope of the regression line is nonzero
|
|
* 'less': the slope of the regression line is less than zero
|
|
* 'greater': the slope of the regression line is greater than zero
|
|
|
|
.. versionadded:: 1.7.0
|
|
|
|
Returns
|
|
-------
|
|
result : ``LinregressResult`` instance
|
|
The return value is an object with the following attributes:
|
|
|
|
slope : float
|
|
Slope of the regression line.
|
|
intercept : float
|
|
Intercept of the regression line.
|
|
rvalue : float
|
|
The Pearson correlation coefficient. The square of ``rvalue``
|
|
is equal to the coefficient of determination.
|
|
pvalue : float
|
|
The p-value for a hypothesis test whose null hypothesis is
|
|
that the slope is zero, using Wald Test with t-distribution of
|
|
the test statistic. See `alternative` above for alternative
|
|
hypotheses.
|
|
stderr : float
|
|
Standard error of the estimated slope (gradient), under the
|
|
assumption of residual normality.
|
|
intercept_stderr : float
|
|
Standard error of the estimated intercept, under the assumption
|
|
of residual normality.
|
|
|
|
See Also
|
|
--------
|
|
scipy.optimize.curve_fit :
|
|
Use non-linear least squares to fit a function to data.
|
|
scipy.optimize.leastsq :
|
|
Minimize the sum of squares of a set of equations.
|
|
|
|
Notes
|
|
-----
|
|
Missing values are considered pair-wise: if a value is missing in `x`,
|
|
the corresponding value in `y` is masked.
|
|
|
|
For compatibility with older versions of SciPy, the return value acts
|
|
like a ``namedtuple`` of length 5, with fields ``slope``, ``intercept``,
|
|
``rvalue``, ``pvalue`` and ``stderr``, so one can continue to write::
|
|
|
|
slope, intercept, r, p, se = linregress(x, y)
|
|
|
|
With that style, however, the standard error of the intercept is not
|
|
available. To have access to all the computed values, including the
|
|
standard error of the intercept, use the return value as an object
|
|
with attributes, e.g.::
|
|
|
|
result = linregress(x, y)
|
|
print(result.intercept, result.intercept_stderr)
|
|
|
|
Examples
|
|
--------
|
|
>>> import numpy as np
|
|
>>> import matplotlib.pyplot as plt
|
|
>>> from scipy import stats
|
|
>>> rng = np.random.default_rng()
|
|
|
|
Generate some data:
|
|
|
|
>>> x = rng.random(10)
|
|
>>> y = 1.6*x + rng.random(10)
|
|
|
|
Perform the linear regression:
|
|
|
|
>>> res = stats.linregress(x, y)
|
|
|
|
Coefficient of determination (R-squared):
|
|
|
|
>>> print(f"R-squared: {res.rvalue**2:.6f}")
|
|
R-squared: 0.717533
|
|
|
|
Plot the data along with the fitted line:
|
|
|
|
>>> plt.plot(x, y, 'o', label='original data')
|
|
>>> plt.plot(x, res.intercept + res.slope*x, 'r', label='fitted line')
|
|
>>> plt.legend()
|
|
>>> plt.show()
|
|
|
|
Calculate 95% confidence interval on slope and intercept:
|
|
|
|
>>> # Two-sided inverse Students t-distribution
|
|
>>> # p - probability, df - degrees of freedom
|
|
>>> from scipy.stats import t
|
|
>>> tinv = lambda p, df: abs(t.ppf(p/2, df))
|
|
|
|
>>> ts = tinv(0.05, len(x)-2)
|
|
>>> print(f"slope (95%): {res.slope:.6f} +/- {ts*res.stderr:.6f}")
|
|
slope (95%): 1.453392 +/- 0.743465
|
|
>>> print(f"intercept (95%): {res.intercept:.6f}"
|
|
... f" +/- {ts*res.intercept_stderr:.6f}")
|
|
intercept (95%): 0.616950 +/- 0.544475
|
|
|
|
"""
|
|
TINY = 1.0e-20
|
|
if y is None: # x is a (2, N) or (N, 2) shaped array_like
|
|
x = np.asarray(x)
|
|
if x.shape[0] == 2:
|
|
x, y = x
|
|
elif x.shape[1] == 2:
|
|
x, y = x.T
|
|
else:
|
|
raise ValueError("If only `x` is given as input, it has to "
|
|
"be of shape (2, N) or (N, 2); provided shape "
|
|
f"was {x.shape}.")
|
|
else:
|
|
x = np.asarray(x)
|
|
y = np.asarray(y)
|
|
|
|
if x.size == 0 or y.size == 0:
|
|
raise ValueError("Inputs must not be empty.")
|
|
|
|
if np.amax(x) == np.amin(x) and len(x) > 1:
|
|
raise ValueError("Cannot calculate a linear regression "
|
|
"if all x values are identical")
|
|
|
|
n = len(x)
|
|
xmean = np.mean(x, None)
|
|
ymean = np.mean(y, None)
|
|
|
|
# Average sums of square differences from the mean
|
|
# ssxm = mean( (x-mean(x))^2 )
|
|
# ssxym = mean( (x-mean(x)) * (y-mean(y)) )
|
|
ssxm, ssxym, _, ssym = np.cov(x, y, bias=1).flat
|
|
|
|
# R-value
|
|
# r = ssxym / sqrt( ssxm * ssym )
|
|
if ssxm == 0.0 or ssym == 0.0:
|
|
# If the denominator was going to be 0
|
|
r = 0.0
|
|
else:
|
|
r = ssxym / np.sqrt(ssxm * ssym)
|
|
# Test for numerical error propagation (make sure -1 < r < 1)
|
|
if r > 1.0:
|
|
r = 1.0
|
|
elif r < -1.0:
|
|
r = -1.0
|
|
|
|
slope = ssxym / ssxm
|
|
intercept = ymean - slope*xmean
|
|
if n == 2:
|
|
# handle case when only two points are passed in
|
|
if y[0] == y[1]:
|
|
prob = 1.0
|
|
else:
|
|
prob = 0.0
|
|
slope_stderr = 0.0
|
|
intercept_stderr = 0.0
|
|
else:
|
|
df = n - 2 # Number of degrees of freedom
|
|
# n-2 degrees of freedom because 2 has been used up
|
|
# to estimate the mean and standard deviation
|
|
t = r * np.sqrt(df / ((1.0 - r + TINY)*(1.0 + r + TINY)))
|
|
t, prob = _mstats_basic._ttest_finish(df, t, alternative)
|
|
|
|
slope_stderr = np.sqrt((1 - r**2) * ssym / ssxm / df)
|
|
|
|
# Also calculate the standard error of the intercept
|
|
# The following relationship is used:
|
|
# ssxm = mean( (x-mean(x))^2 )
|
|
# = ssx - sx*sx
|
|
# = mean( x^2 ) - mean(x)^2
|
|
intercept_stderr = slope_stderr * np.sqrt(ssxm + xmean**2)
|
|
|
|
return LinregressResult(slope=slope, intercept=intercept, rvalue=r,
|
|
pvalue=prob, stderr=slope_stderr,
|
|
intercept_stderr=intercept_stderr)
|
|
|
|
|
|
def theilslopes(y, x=None, alpha=0.95, method='separate'):
|
|
r"""
|
|
Computes the Theil-Sen estimator for a set of points (x, y).
|
|
|
|
`theilslopes` implements a method for robust linear regression. It
|
|
computes the slope as the median of all slopes between paired values.
|
|
|
|
Parameters
|
|
----------
|
|
y : array_like
|
|
Dependent variable.
|
|
x : array_like or None, optional
|
|
Independent variable. If None, use ``arange(len(y))`` instead.
|
|
alpha : float, optional
|
|
Confidence degree between 0 and 1. Default is 95% confidence.
|
|
Note that `alpha` is symmetric around 0.5, i.e. both 0.1 and 0.9 are
|
|
interpreted as "find the 90% confidence interval".
|
|
method : {'joint', 'separate'}, optional
|
|
Method to be used for computing estimate for intercept.
|
|
Following methods are supported,
|
|
|
|
* 'joint': Uses np.median(y - slope * x) as intercept.
|
|
* 'separate': Uses np.median(y) - slope * np.median(x)
|
|
as intercept.
|
|
|
|
The default is 'separate'.
|
|
|
|
.. versionadded:: 1.8.0
|
|
|
|
Returns
|
|
-------
|
|
result : ``TheilslopesResult`` instance
|
|
The return value is an object with the following attributes:
|
|
|
|
slope : float
|
|
Theil slope.
|
|
intercept : float
|
|
Intercept of the Theil line.
|
|
low_slope : float
|
|
Lower bound of the confidence interval on `slope`.
|
|
high_slope : float
|
|
Upper bound of the confidence interval on `slope`.
|
|
|
|
See Also
|
|
--------
|
|
siegelslopes : a similar technique using repeated medians
|
|
|
|
Notes
|
|
-----
|
|
The implementation of `theilslopes` follows [1]_. The intercept is
|
|
not defined in [1]_, and here it is defined as ``median(y) -
|
|
slope*median(x)``, which is given in [3]_. Other definitions of
|
|
the intercept exist in the literature such as ``median(y - slope*x)``
|
|
in [4]_. The approach to compute the intercept can be determined by the
|
|
parameter ``method``. A confidence interval for the intercept is not
|
|
given as this question is not addressed in [1]_.
|
|
|
|
For compatibility with older versions of SciPy, the return value acts
|
|
like a ``namedtuple`` of length 4, with fields ``slope``, ``intercept``,
|
|
``low_slope``, and ``high_slope``, so one can continue to write::
|
|
|
|
slope, intercept, low_slope, high_slope = theilslopes(y, x)
|
|
|
|
References
|
|
----------
|
|
.. [1] P.K. Sen, "Estimates of the regression coefficient based on
|
|
Kendall's tau", J. Am. Stat. Assoc., Vol. 63, pp. 1379-1389, 1968.
|
|
.. [2] H. Theil, "A rank-invariant method of linear and polynomial
|
|
regression analysis I, II and III", Nederl. Akad. Wetensch., Proc.
|
|
53:, pp. 386-392, pp. 521-525, pp. 1397-1412, 1950.
|
|
.. [3] W.L. Conover, "Practical nonparametric statistics", 2nd ed.,
|
|
John Wiley and Sons, New York, pp. 493.
|
|
.. [4] https://en.wikipedia.org/wiki/Theil%E2%80%93Sen_estimator
|
|
|
|
Examples
|
|
--------
|
|
>>> import numpy as np
|
|
>>> from scipy import stats
|
|
>>> import matplotlib.pyplot as plt
|
|
|
|
>>> x = np.linspace(-5, 5, num=150)
|
|
>>> y = x + np.random.normal(size=x.size)
|
|
>>> y[11:15] += 10 # add outliers
|
|
>>> y[-5:] -= 7
|
|
|
|
Compute the slope, intercept and 90% confidence interval. For comparison,
|
|
also compute the least-squares fit with `linregress`:
|
|
|
|
>>> res = stats.theilslopes(y, x, 0.90, method='separate')
|
|
>>> lsq_res = stats.linregress(x, y)
|
|
|
|
Plot the results. The Theil-Sen regression line is shown in red, with the
|
|
dashed red lines illustrating the confidence interval of the slope (note
|
|
that the dashed red lines are not the confidence interval of the regression
|
|
as the confidence interval of the intercept is not included). The green
|
|
line shows the least-squares fit for comparison.
|
|
|
|
>>> fig = plt.figure()
|
|
>>> ax = fig.add_subplot(111)
|
|
>>> ax.plot(x, y, 'b.')
|
|
>>> ax.plot(x, res[1] + res[0] * x, 'r-')
|
|
>>> ax.plot(x, res[1] + res[2] * x, 'r--')
|
|
>>> ax.plot(x, res[1] + res[3] * x, 'r--')
|
|
>>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')
|
|
>>> plt.show()
|
|
|
|
"""
|
|
if method not in ['joint', 'separate']:
|
|
raise ValueError("method must be either 'joint' or 'separate'."
|
|
f"'{method}' is invalid.")
|
|
# We copy both x and y so we can use _find_repeats.
|
|
y = np.array(y, dtype=float, copy=True).ravel()
|
|
if x is None:
|
|
x = np.arange(len(y), dtype=float)
|
|
else:
|
|
x = np.array(x, dtype=float, copy=True).ravel()
|
|
if len(x) != len(y):
|
|
raise ValueError(f"Incompatible lengths ! ({len(y)}<>{len(x)})")
|
|
|
|
# Compute sorted slopes only when deltax > 0
|
|
deltax = x[:, np.newaxis] - x
|
|
deltay = y[:, np.newaxis] - y
|
|
slopes = deltay[deltax > 0] / deltax[deltax > 0]
|
|
if not slopes.size:
|
|
msg = "All `x` coordinates are identical."
|
|
warnings.warn(msg, RuntimeWarning, stacklevel=2)
|
|
slopes.sort()
|
|
medslope = np.median(slopes)
|
|
if method == 'joint':
|
|
medinter = np.median(y - medslope * x)
|
|
else:
|
|
medinter = np.median(y) - medslope * np.median(x)
|
|
# Now compute confidence intervals
|
|
if alpha > 0.5:
|
|
alpha = 1. - alpha
|
|
|
|
z = distributions.norm.ppf(alpha / 2.)
|
|
# This implements (2.6) from Sen (1968)
|
|
_, nxreps = _find_repeats(x)
|
|
_, nyreps = _find_repeats(y)
|
|
nt = len(slopes) # N in Sen (1968)
|
|
ny = len(y) # n in Sen (1968)
|
|
# Equation 2.6 in Sen (1968):
|
|
sigsq = 1/18. * (ny * (ny-1) * (2*ny+5) -
|
|
sum(k * (k-1) * (2*k + 5) for k in nxreps) -
|
|
sum(k * (k-1) * (2*k + 5) for k in nyreps))
|
|
# Find the confidence interval indices in `slopes`
|
|
try:
|
|
sigma = np.sqrt(sigsq)
|
|
Ru = min(int(np.round((nt - z*sigma)/2.)), len(slopes)-1)
|
|
Rl = max(int(np.round((nt + z*sigma)/2.)) - 1, 0)
|
|
delta = slopes[[Rl, Ru]]
|
|
except (ValueError, IndexError):
|
|
delta = (np.nan, np.nan)
|
|
|
|
return TheilslopesResult(slope=medslope, intercept=medinter,
|
|
low_slope=delta[0], high_slope=delta[1])
|
|
|
|
|
|
def _find_repeats(arr):
|
|
# This function assumes it may clobber its input.
|
|
if len(arr) == 0:
|
|
return np.array(0, np.float64), np.array(0, np.intp)
|
|
|
|
# XXX This cast was previously needed for the Fortran implementation,
|
|
# should we ditch it?
|
|
arr = np.asarray(arr, np.float64).ravel()
|
|
arr.sort()
|
|
|
|
# Taken from NumPy 1.9's np.unique.
|
|
change = np.concatenate(([True], arr[1:] != arr[:-1]))
|
|
unique = arr[change]
|
|
change_idx = np.concatenate(np.nonzero(change) + ([arr.size],))
|
|
freq = np.diff(change_idx)
|
|
atleast2 = freq > 1
|
|
return unique[atleast2], freq[atleast2]
|
|
|
|
|
|
def siegelslopes(y, x=None, method="hierarchical"):
|
|
r"""
|
|
Computes the Siegel estimator for a set of points (x, y).
|
|
|
|
`siegelslopes` implements a method for robust linear regression
|
|
using repeated medians (see [1]_) to fit a line to the points (x, y).
|
|
The method is robust to outliers with an asymptotic breakdown point
|
|
of 50%.
|
|
|
|
Parameters
|
|
----------
|
|
y : array_like
|
|
Dependent variable.
|
|
x : array_like or None, optional
|
|
Independent variable. If None, use ``arange(len(y))`` instead.
|
|
method : {'hierarchical', 'separate'}
|
|
If 'hierarchical', estimate the intercept using the estimated
|
|
slope ``slope`` (default option).
|
|
If 'separate', estimate the intercept independent of the estimated
|
|
slope. See Notes for details.
|
|
|
|
Returns
|
|
-------
|
|
result : ``SiegelslopesResult`` instance
|
|
The return value is an object with the following attributes:
|
|
|
|
slope : float
|
|
Estimate of the slope of the regression line.
|
|
intercept : float
|
|
Estimate of the intercept of the regression line.
|
|
|
|
See Also
|
|
--------
|
|
theilslopes : a similar technique without repeated medians
|
|
|
|
Notes
|
|
-----
|
|
With ``n = len(y)``, compute ``m_j`` as the median of
|
|
the slopes from the point ``(x[j], y[j])`` to all other `n-1` points.
|
|
``slope`` is then the median of all slopes ``m_j``.
|
|
Two ways are given to estimate the intercept in [1]_ which can be chosen
|
|
via the parameter ``method``.
|
|
The hierarchical approach uses the estimated slope ``slope``
|
|
and computes ``intercept`` as the median of ``y - slope*x``.
|
|
The other approach estimates the intercept separately as follows: for
|
|
each point ``(x[j], y[j])``, compute the intercepts of all the `n-1`
|
|
lines through the remaining points and take the median ``i_j``.
|
|
``intercept`` is the median of the ``i_j``.
|
|
|
|
The implementation computes `n` times the median of a vector of size `n`
|
|
which can be slow for large vectors. There are more efficient algorithms
|
|
(see [2]_) which are not implemented here.
|
|
|
|
For compatibility with older versions of SciPy, the return value acts
|
|
like a ``namedtuple`` of length 2, with fields ``slope`` and
|
|
``intercept``, so one can continue to write::
|
|
|
|
slope, intercept = siegelslopes(y, x)
|
|
|
|
References
|
|
----------
|
|
.. [1] A. Siegel, "Robust Regression Using Repeated Medians",
|
|
Biometrika, Vol. 69, pp. 242-244, 1982.
|
|
|
|
.. [2] A. Stein and M. Werman, "Finding the repeated median regression
|
|
line", Proceedings of the Third Annual ACM-SIAM Symposium on
|
|
Discrete Algorithms, pp. 409-413, 1992.
|
|
|
|
Examples
|
|
--------
|
|
>>> import numpy as np
|
|
>>> from scipy import stats
|
|
>>> import matplotlib.pyplot as plt
|
|
|
|
>>> x = np.linspace(-5, 5, num=150)
|
|
>>> y = x + np.random.normal(size=x.size)
|
|
>>> y[11:15] += 10 # add outliers
|
|
>>> y[-5:] -= 7
|
|
|
|
Compute the slope and intercept. For comparison, also compute the
|
|
least-squares fit with `linregress`:
|
|
|
|
>>> res = stats.siegelslopes(y, x)
|
|
>>> lsq_res = stats.linregress(x, y)
|
|
|
|
Plot the results. The Siegel regression line is shown in red. The green
|
|
line shows the least-squares fit for comparison.
|
|
|
|
>>> fig = plt.figure()
|
|
>>> ax = fig.add_subplot(111)
|
|
>>> ax.plot(x, y, 'b.')
|
|
>>> ax.plot(x, res[1] + res[0] * x, 'r-')
|
|
>>> ax.plot(x, lsq_res[1] + lsq_res[0] * x, 'g-')
|
|
>>> plt.show()
|
|
|
|
"""
|
|
if method not in ['hierarchical', 'separate']:
|
|
raise ValueError("method can only be 'hierarchical' or 'separate'")
|
|
y = np.asarray(y).ravel()
|
|
if x is None:
|
|
x = np.arange(len(y), dtype=float)
|
|
else:
|
|
x = np.asarray(x, dtype=float).ravel()
|
|
if len(x) != len(y):
|
|
raise ValueError(f"Incompatible lengths ! ({len(y)}<>{len(x)})")
|
|
dtype = np.result_type(x, y, np.float32) # use at least float32
|
|
y, x = y.astype(dtype), x.astype(dtype)
|
|
medslope, medinter = siegelslopes_pythran(y, x, method)
|
|
return SiegelslopesResult(slope=medslope, intercept=medinter)
|