"""Core functions to calculate statistics."""
import math
from functools import partial
from typing import Callable, Literal
import numpy as np
import pandas as pd
[docs]
def mad(data: np.ndarray, axis=None):
"""Compute mean absolute deviation.
Parameters
----------
data
The data from which to calculate the mean absolute deviation.
axis
The axis along which to calculate the mean absolute deviation.
Returns
-------
numpy.ndarray
The mean absolute deviation.
"""
return np.mean(np.absolute(data - np.mean(data, axis)), axis)
[docs]
def variation(
data: pd.Series, error: Literal["raise", "coerce", "omit"] = "coerce"
) -> float:
"""Compute the coefficient of variation of a series.
Parameters
----------
data
A pandas series for which to compute the coefficient of variation.
error
Defines how to handle when the data mean is null is raised. The
following options are available (default is ``coerce``)
- ``raise``: ``ZeroDivisionError`` will be raised.
- ``coerce``: variation will be set as 0.
- ``omit``: variation will return nan.
Returns
-------
float
The coefficient of variation of the data using an unbiased standard
deviation computation.
Raises
------
ZeroDivisionError
If the data mean is null and the argument ``error`` is set to
``raised``.
ValueError
If the argument ``error`` is given an unsupported value.
Examples
--------
Here are a few usage examples:
.. testsetup:: variation
import warnings
import pandas as pd
warnings.simplefilter('ignore', RuntimeWarning)
.. doctest:: variation
>>> import pandas as pd
>>> from dispel.stats.core import variation
>>> x = pd.Series([3.2, 4.1, 0., 1., -6.])
>>> variation(x)
8.626902135395195
In case of ``ZeroDivisionError``, one can use the ``error`` argument to
control the output.
.. doctest:: variation
>>> x = pd.Series([3., -4., 0., 2., -1.])
>>> variation(x)
0.0
>>> x = pd.Series([3., -4., 0., 2., -1.])
>>> variation(x, error='raise')
Traceback (most recent call last):
...
ZeroDivisionError: Cannot divide by null mean.
>>> x = pd.Series([3., -4., 0., 2., -1.])
>>> variation(x, error='omit')
nan
"""
res = data.std() / data.mean()
if math.isinf(res):
if error == "coerce":
return 0.0
if error == "omit":
return np.nan
if error == "raise":
raise ZeroDivisionError("Cannot divide by null mean.")
raise ValueError("Unsupported ``error`` value.")
return res
# Setting variation name acronym
variation.__name__ = "cv"
[docs]
def variation_increase(
data: pd.Series, error: Literal["raise", "coerce", "omit"] = "coerce"
) -> float:
"""Compute the coefficient of variation increase for a series.
The coefficient of variation increase corresponds to the the CV of the
second half of the data minus that of the first half.
Parameters
----------
data
A pandas series for which to compute the coefficient of variation
increase.
error
Defines how to handle when the data mean is null is raised. The
following options are available (default is ``coerce``)
- ``raise``: ``ZeroDivisionError`` will be raised.
- ``coerce``: variation will be set as 0.
- ``omit``: variation will return nan.
Returns
-------
float
The coefficient of variation increase of the data using an unbiased
standard deviation computation.
Examples
--------
Here are a few usage examples:
.. testsetup:: variation_increase
import warnings
import pandas as pd
warnings.simplefilter('ignore', RuntimeWarning)
.. doctest:: variation_increase
>>> import pandas as pd
>>> from dispel.stats.core import variation_increase
>>> x = pd.Series([3.2, 4.1, 0., 1., -6.])
>>> variation_increase(x)
-2.4459184350510386
In case of ``ZeroDivisionError``, one can use the ``error`` argument to
control the output.
.. doctest:: variation_increase
>>> x = pd.Series([3., -4., 0., 1., -1.])
>>> variation_increase(x, error='raise')
Traceback (most recent call last):
...
ZeroDivisionError: Cannot divide by null mean.
>>> x = pd.Series([3., -3., 0., 2., -1.])
>>> variation_increase(x, error='omit')
nan
"""
first_half = data[: (half_idx := len(data) // 2)]
second_half = data[half_idx:]
return variation(second_half, error) - variation(first_half, error)
# Setting variation increase name acronym
variation_increase.__name__ = "cvi"
[docs]
def q_factory(percentile: float, name: str) -> Callable[[pd.Series], float]:
"""Create percentile aggregation method.
Parameters
----------
percentile
The percentile used in the aggregation. This is passed to
:meth:`pandas.Series.quantile`.
name
The name of the method.
Returns
-------
Callable[[pandas.Series], float]
Returns a callable aggregation method that returns the percentile
specified in `percentile`.
"""
func = partial(pd.Series.quantile, q=percentile)
func.__name__ = name # type: ignore
return func
#: First quartile (Q1) aggregation
q1 = q_factory(0.25, "q1")
#: Third quartile (Q3) aggregation
q3 = q_factory(0.75, "q3")
#: Percentile 0.5 aggregation
percentile_05 = q_factory(0.05, "q05")
#: Percentile 0.95 aggregation
percentile_95 = q_factory(0.95, "q95")
[docs]
def freq_nan(data: pd.Series) -> float:
"""Get the frequency of null values."""
return data.isnull().sum() / len(data)
[docs]
def iqr(data: pd.Series) -> float:
"""Compute the inter-quartile range."""
return data.quantile(q=0.75) - data.quantile(q=0.25)
[docs]
def npcv(data: pd.Series) -> float:
"""Compute the non-parametric coefficient of variation of a series."""
return mad(data) / data.abs().mean()