Source code for dispel.stats.core

"""Core functions to calculate statistics."""
import math
from functools import partial
from typing import Callable, Literal

import numpy as np
import pandas as pd


def mad(data: np.ndarray, axis=None):
    """Compute mean absolute deviation.

    Parameters
    ----------
    data
        The data from which to calculate the mean absolute deviation.
    axis
        The axis along which to calculate the mean absolute deviation.

    Returns
    -------
    numpy.ndarray
        The mean absolute deviation.
    """
    return np.mean(np.absolute(data - np.mean(data, axis)), axis)

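# A brief usage sketch for ``mad`` (the data values below are illustrative
# and not part of the documented examples):
#
#     >>> import numpy as np
#     >>> float(mad(np.array([1., 2., 3., 4.])))
#     1.0
#
# The deviations from the mean (2.5) are 1.5, 0.5, 0.5 and 1.5, whose mean
# is 1.0.
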
def variation(
    data: pd.Series, error: Literal["raise", "coerce", "omit"] = "coerce"
) -> float:
    """Compute the coefficient of variation of a series.

    Parameters
    ----------
    data
        A pandas series for which to compute the coefficient of variation.
    error
        Defines how to handle the case where the data mean is null. The
        following options are available (default is ``coerce``):

        - ``raise``: a ``ZeroDivisionError`` will be raised.
        - ``coerce``: the variation will be set to 0.
        - ``omit``: the variation will be set to nan.

    Returns
    -------
    float
        The coefficient of variation of the data using an unbiased standard
        deviation computation.

    Raises
    ------
    ZeroDivisionError
        If the data mean is null and the argument ``error`` is set to
        ``raise``.
    ValueError
        If the argument ``error`` is given an unsupported value.

    Examples
    --------
    Here are a few usage examples:

    .. testsetup:: variation

        import warnings
        import pandas as pd

        warnings.simplefilter('ignore', RuntimeWarning)

    .. doctest:: variation

        >>> import pandas as pd
        >>> from dispel.stats.core import variation
        >>> x = pd.Series([3.2, 4.1, 0., 1., -6.])
        >>> variation(x)
        8.626902135395195

    In case of ``ZeroDivisionError``, one can use the ``error`` argument to
    control the output.

    .. doctest:: variation

        >>> x = pd.Series([3., -4., 0., 2., -1.])
        >>> variation(x)
        0.0
        >>> x = pd.Series([3., -4., 0., 2., -1.])
        >>> variation(x, error='raise')
        Traceback (most recent call last):
            ...
        ZeroDivisionError: Cannot divide by null mean.
        >>> x = pd.Series([3., -4., 0., 2., -1.])
        >>> variation(x, error='omit')
        nan
    """
    res = data.std() / data.mean()
    if math.isinf(res):
        if error == "coerce":
            return 0.0
        if error == "omit":
            return np.nan
        if error == "raise":
            raise ZeroDivisionError("Cannot divide by null mean.")
        raise ValueError("Unsupported ``error`` value.")
    return res

# Setting variation name acronym
variation.__name__ = "cv"

def variation_increase(
    data: pd.Series, error: Literal["raise", "coerce", "omit"] = "coerce"
) -> float:
    """Compute the coefficient of variation increase for a series.

    The coefficient of variation increase corresponds to the CV of the
    second half of the data minus that of the first half.

    Parameters
    ----------
    data
        A pandas series for which to compute the coefficient of variation
        increase.
    error
        Defines how to handle the case where the data mean is null. The
        following options are available (default is ``coerce``):

        - ``raise``: a ``ZeroDivisionError`` will be raised.
        - ``coerce``: the variation will be set to 0.
        - ``omit``: the variation will be set to nan.

    Returns
    -------
    float
        The coefficient of variation increase of the data using an unbiased
        standard deviation computation.

    Examples
    --------
    Here are a few usage examples:

    .. testsetup:: variation_increase

        import warnings
        import pandas as pd

        warnings.simplefilter('ignore', RuntimeWarning)

    .. doctest:: variation_increase

        >>> import pandas as pd
        >>> from dispel.stats.core import variation_increase
        >>> x = pd.Series([3.2, 4.1, 0., 1., -6.])
        >>> variation_increase(x)
        -2.4459184350510386

    In case of ``ZeroDivisionError``, one can use the ``error`` argument to
    control the output.

    .. doctest:: variation_increase

        >>> x = pd.Series([3., -4., 0., 1., -1.])
        >>> variation_increase(x, error='raise')
        Traceback (most recent call last):
            ...
        ZeroDivisionError: Cannot divide by null mean.
        >>> x = pd.Series([3., -3., 0., 2., -1.])
        >>> variation_increase(x, error='omit')
        nan
    """
    first_half = data[: (half_idx := len(data) // 2)]
    second_half = data[half_idx:]
    return variation(second_half, error) - variation(first_half, error)

# Setting variation increase name acronym
variation_increase.__name__ = "cvi"

def q_factory(percentile: float, name: str) -> Callable[[pd.Series], float]:
    """Create percentile aggregation method.

    Parameters
    ----------
    percentile
        The percentile used in the aggregation. This is passed to
        :meth:`pandas.Series.quantile`.
    name
        The name of the method.

    Returns
    -------
    Callable[[pandas.Series], float]
        Returns a callable aggregation method that returns the percentile
        specified in `percentile`.
    """
    func = partial(pd.Series.quantile, q=percentile)
    func.__name__ = name  # type: ignore
    return func

#: First quartile (Q1) aggregation
q1 = q_factory(0.25, "q1")

#: Third quartile (Q3) aggregation
q3 = q_factory(0.75, "q3")

#: 5th percentile aggregation
percentile_05 = q_factory(0.05, "q05")

#: 95th percentile aggregation
percentile_95 = q_factory(0.95, "q95")

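# A brief usage sketch for the ``q_factory``-generated aggregations (the
# data values below are illustrative):
#
#     >>> x = pd.Series([1., 2., 3., 4., 5.])
#     >>> float(q1(x))
#     2.0
#     >>> float(q3(x))
#     4.0
#
# Each aggregator also carries the name passed to ``q_factory``, e.g.
# ``q1.__name__ == 'q1'``.
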
def freq_nan(data: pd.Series) -> float:
    """Get the frequency of null values."""
    return data.isnull().sum() / len(data)

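# A brief usage sketch for ``freq_nan`` (illustrative values): two of the
# four entries below are null, giving a frequency of 0.5.
#
#     >>> float(freq_nan(pd.Series([1., None, 3., None])))
#     0.5
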
def iqr(data: pd.Series) -> float:
    """Compute the inter-quartile range."""
    return data.quantile(q=0.75) - data.quantile(q=0.25)

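# A brief usage sketch for ``iqr`` (illustrative values): with pandas'
# default linear interpolation, Q1 is 2.0 and Q3 is 4.0.
#
#     >>> float(iqr(pd.Series([1., 2., 3., 4., 5.])))
#     2.0
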
def npcv(data: pd.Series) -> float:
    """Compute the non-parametric coefficient of variation of a series."""
    return mad(data) / data.abs().mean()
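

# A brief usage sketch for ``npcv`` (illustrative values): the mean absolute
# deviation of ``[1., 2., 3., 4.]`` is 1.0 and the mean absolute value is
# 2.5, giving a ratio of 0.4.
#
#     >>> float(npcv(pd.Series([1., 2., 3., 4.])))
#     0.4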