Source code for dispel.stats.learning

"""Inter-session Learning analysis.

A module providing functions to compute and extract learning parameters from measure
collections containing processed measures. It offers classes and functions to fit a
learning curve with :func:`scipy.optimize.curve_fit`, extract the fitted parameters,
and compute the related learning measures.
"""
from dataclasses import dataclass
from typing import Dict, Optional, Tuple, Union, cast

import numpy as np
import pandas as pd
from scipy.optimize import curve_fit
from scipy.stats import norm, zscore
from sklearn.metrics import r2_score

from dispel.data.collections import MeasureCollection

#: Numeric inputs accepted by the learning curve function: plain scalars, numpy
#: arrays or pandas series.
NumericType = Union[float, int, np.ndarray, pd.Series]



[docs]
@dataclass(frozen=True)
class LearningCurve:
    """Class ensemble of learning curve parameters.

    The curve has the form ``asymptote - slope / x`` where ``x`` is the trial number
    (see :meth:`compute_learning`).
    """

    #: The asymptote of the fitted learning curve.
    asymptote: float

    #: The slope of the fitted learning curve.
    slope: float

    def to_dict(self):
        """Convert learning curve information to dictionary format.

        Returns
        -------
        dict
            A dictionary with the keys ``optimal_performance`` (the asymptote),
            ``slope_coefficient`` (the slope) and ``learning_rate``.
        """
        return {
            "optimal_performance": self.asymptote,
            "slope_coefficient": self.slope,
            "learning_rate": self.learning_rate,
        }

    @staticmethod
    def compute_learning(x: NumericType, a: NumericType, b: NumericType) -> NumericType:
        """Compute learning curve function.

        Parameters
        ----------
        x
            The trial number(s).
        a
            The asymptote of the curve.
        b
            The slope of the curve.

        Returns
        -------
        NumericType
            The value ``a - b / x``.
        """
        return a - b / x

    def __call__(self, data: NumericType) -> NumericType:
        """Compute the learning curve for a given input (trial number)."""
        return self.compute_learning(data, self.asymptote, self.slope)

    @property
    def learning_rate(self) -> float:
        """Get the learning rate related to curve (slope divided by asymptote)."""
        return self.slope / self.asymptote

    @classmethod
    def fit(cls, x: np.ndarray, y: np.ndarray) -> "LearningCurve":
        """Fit learning curve using :func:`scipy.optimize.curve_fit`.

        See :meth:`dispel.stats.learning.LearningCurve.compute_learning`.

        Parameters
        ----------
        x
            The trial numbers associated with data points.
        y
            The measure data points.

        Returns
        -------
        LearningCurve
            The fitted learning curve.
        """
        # ``curve_fit`` returns the optimal parameters first; the remaining outputs
        # (covariance, ...) are not needed here.
        (asymptote, slope), *_ = curve_fit(cls.compute_learning, x, y)
        return cls(asymptote=asymptote, slope=slope)

    def get_warm_up(self, data: Union[pd.Series, np.ndarray]) -> int:
        """Compute the warm-up argmax for measure values and fitted parameters.

        The ``warm_up`` here is actually the minimum number of trials the user has to
        perform in order to reach 90% of the optimal performance (asymptote value) given
        by the model.

        Parameters
        ----------
        data
            A numpy array or pandas series containing ordered measure values.

        Returns
        -------
        int
            The first occurrence of the measure values that reaches 90% of the
            optimal performance given by the model: the positional index for a numpy
            array, the index label for a pandas series. ``nan`` is returned if
            ``data`` is empty or if no value ever reaches the threshold.

        Raises
        ------
        TypeError
            If the given data is not a pandas series nor a numpy array.
        """
        # Validate the type first so that unsupported inputs fail with the documented
        # ``TypeError`` instead of an ``AttributeError`` on ``data.size``.
        if not isinstance(data, (pd.Series, np.ndarray)):
            raise TypeError(
                f"Unsupported data type {type(data)}. Only a ``pandas.Series`` or "
                "``numpy.ndarray`` is allowed."
            )

        if data.size == 0:
            return cast(int, np.nan)

        if self.slope < 0:
            # With a negative slope ``a - b / x`` decreases towards the asymptote for
            # positive trial numbers, so the threshold is approached from the maximum.
            start = data.max()
            reached = data < 0.9 * (self.asymptote - start) + start
        else:
            start = data.min()
            reached = data > 0.9 * (self.asymptote - start) + start

        reached = np.asarray(reached)
        # ``np.argmax`` yields 0 for an all-False mask, which would wrongly report the
        # very first trial as the warm-up point when the threshold is never reached.
        if not reached.any():
            return cast(int, np.nan)

        position = int(np.argmax(reached))
        if isinstance(data, pd.Series):
            return data.index[position]
        return position

    @classmethod
    def empty(cls):
        """Return empty learning curve (both parameters set to ``nan``)."""
        return cls(asymptote=np.nan, slope=np.nan)





[docs]
@dataclass(frozen=True)
class LearningModel:
    """Fitted learning curve bundled with the data and diagnostics of the fit."""

    #: The fitted learning curve.
    curve: LearningCurve

    #: The data points without outliers
    new_data: pd.Series

    #: The R squared score of the fitted learning model.
    r2_score: Optional[float]

    #: The number of outliers rejected during the model fitting.
    nb_outliers: Optional[int] = 0

    def to_dict(self):
        """Flatten the model into a dictionary.

        The curve entries come first, followed by ``warm_up`` (computed by the curve
        on the outlier-free data), ``r2_score`` and ``nb_outliers``.
        """
        return dict(
            self.curve.to_dict(),
            warm_up=self.curve.get_warm_up(self.new_data),
            r2_score=self.r2_score,
            nb_outliers=self.nb_outliers,
        )

    @classmethod
    def empty(cls) -> "LearningModel":
        """Build a placeholder model: empty curve, no data, no score, no outliers."""
        no_data = pd.Series(dtype="float64")
        return cls(
            curve=LearningCurve.empty(),
            new_data=no_data,
            r2_score=None,
            nb_outliers=None,
        )





[docs]
@dataclass(frozen=True)
class DelayParameters:
    """Summary statistics of the delay between sessions."""

    #: The mean delay between sessions.
    mean: Optional[float]

    #: The median delay between sessions.
    median: Optional[float]

    #: The maximum delay between sessions.
    max: Optional[float]

    def to_dict(self) -> Dict[str, Optional[float]]:
        """Export the delays under ``delay_``-prefixed keys."""
        return {
            "delay_mean": self.mean,
            "delay_median": self.median,
            "delay_max": self.max,
        }

    @classmethod
    def empty(cls) -> "DelayParameters":
        """Build delay parameters where every statistic is ``None``."""
        return cls(mean=None, median=None, max=None)





[docs]
@dataclass(frozen=True)
class LearningParameters:
    """Learning model and delay parameters of one subject for one measure."""

    #: The subject's id
    subject_id: str

    #: The measure id
    measure_id: str

    #: The learning model
    model: LearningModel

    #: The delay parameters in days
    delay_parameters: DelayParameters

    def to_dict(self) -> Dict[str, Optional[Union[float, int, str]]]:
        """Flatten identifiers, model and delay parameters into one dictionary.

        The identifiers come first, followed by the entries of the model and then
        those of the delay parameters.
        """
        identifiers = dict(subject_id=self.subject_id, measure_id=self.measure_id)
        model_entries = self.model.to_dict()
        delay_entries = self.delay_parameters.to_dict()
        return dict(**identifiers, **model_entries, **delay_entries)





[docs]
def compute_delay(data: pd.Series) -> DelayParameters:
    """Summarize the delay in days between consecutive sessions.

    Parameters
    ----------
    data
        A pandas series containing timestamps.

    Returns
    -------
    DelayParameters
        A :class:`dispel.stats.learning.DelayParameters` holding the mean, median and
        maximum of the differences between consecutive timestamps, expressed in days.
    """
    elapsed_seconds = data.diff().dt.total_seconds()
    # 86 400 seconds per day.
    elapsed_days = elapsed_seconds / 86_400
    summary = elapsed_days.agg(["mean", "median", "max"])
    return DelayParameters(
        mean=summary["mean"], median=summary["median"], max=summary["max"]
    )




[docs]
def reject_outliers(data: pd.Series, sigma: float) -> pd.Series:
    """Reject outliers with Z-score outside the tolerated bounds.

    A learning curve is fitted on the data and the Z-scores of the absolute residuals
    are computed. While at least one Z-score reaches ``sigma``, the point with the
    largest Z-score is dropped and the procedure is repeated on the remaining data.
    Series with three points or fewer are returned unchanged without fitting.

    Parameters
    ----------
    data
        A pandas series composed of measure values for only one measure and only one
        user and trials numbers as index.
    sigma
        The standard deviation threshold above which the data points are to be
        considered as outliers and therefore rejected.

    Returns
    -------
    pandas.Series
        A copy of the data without the detected outliers (if any) with the same
        structure as the entry.
    """
    kept = data
    # Iterate instead of recursing: one point is dropped per pass, so a long series
    # with many outliers cannot exhaust the interpreter's recursion limit. Checking
    # the length before fitting also avoids a useless (or failing) curve fit on
    # series that are too short to have any point rejected.
    while len(kept) > 3:
        trials = kept.index.values

        # Fit of the model by curve fit.
        baseline_curve = LearningCurve.fit(trials, kept.values)

        # Absolute residuals between the data and the fitted curve.
        residuals = (kept - baseline_curve(trials)).abs()
        if residuals.std() == 0.0:
            break

        # Compute Z-scores of residuals
        zscores = pd.Series(zscore(residuals), index=residuals.index)
        if not (zscores >= sigma).any():
            break

        # Drop only the most extreme point, then re-fit on the remaining data.
        kept = kept.drop(zscores.idxmax())

    return kept.copy()




[docs]
def compute_learning_model(
    data: pd.Series, tolerance: float = 0.99, reset_trials: bool = True
) -> Tuple[LearningModel, DelayParameters]:
    """Fit a learning model on outlier-free data and compute session delays.

    Parameters
    ----------
    data
        A pandas series composed of measure values for only one measure and only one
        user, indexed by a multi-index with the levels ``start_date`` and ``trial``.
    tolerance
        The tolerance threshold above which the data points are to be considered
        outliers and therefore rejected. Should be between ``0`` and ``1``. It is
        converted to a Z-score threshold with :func:`scipy.stats.norm.ppf`.
    reset_trials
        ``True`` if the trial numbers are to be reset for the new data (without
        outliers). ``False`` otherwise.

    Returns
    -------
    Tuple[LearningModel, DelayParameters]
        The output contains the following information:

            - The fitted learning model.
            - The delay parameters.

        Both are empty if ``data`` holds fewer than two values.

    Raises
    ------
    ValueError
        If the threshold tolerance is outside the legal bounds i.e. [0, 1].
    """
    if tolerance > 1 or tolerance < 0:
        message = (
            f"Unsupported tolerance threshold value: {tolerance}. Must be between 0 "
            f"and 1."
        )
        raise ValueError(message)

    # Not enough points to fit the two curve parameters.
    if data.size < 2:
        return LearningModel.empty(), DelayParameters.empty()

    # Index the values by trial number only, then discard the outliers.
    by_trial = data.copy()
    by_trial.index = by_trial.index.get_level_values("trial")
    z_threshold = norm.ppf(tolerance)
    kept = reject_outliers(by_trial, sigma=z_threshold)

    # Delays are computed on the start dates of the trials that were kept; this has
    # to happen before the trial numbers are possibly reset below.
    dates_by_trial = data.index.to_frame().set_index("trial")
    is_kept = dates_by_trial.index.isin(kept.index)
    delays = compute_delay(dates_by_trial[is_kept]["start_date"])

    if reset_trials:
        # Renumber the remaining trials consecutively starting at 1.
        kept.index = pd.Series(np.arange(1, len(kept) + 1), name="trial")

    fitted_curve = LearningCurve.fit(kept.index.values, kept.values)
    fitted_values = fitted_curve(kept.index)
    rejected_count = len(by_trial) - len(kept)

    learning_model = LearningModel(
        curve=fitted_curve,
        new_data=kept,
        nb_outliers=rejected_count,
        r2_score=r2_score(kept, fitted_values),
    )
    return learning_model, delays




[docs]
class LearningResult:
    """The learning results for one measure and one or multiple subjects."""

    _COLUMNS = [
        "subject_id",
        "optimal_performance",
        "slope_coefficient",
        "learning_rate",
        "warm_up",
        "r2_score",
        "nb_outliers",
        "delay_mean",
        "delay_median",
        "delay_max",
    ]


[docs]
    def __init__(self):
        self.measure_id = None
        self._parameters = pd.DataFrame(columns=self._COLUMNS)
        self._new_data: Dict[str, pd.Series] = {}


    def _add_learning_result(self, other: "LearningResult"):
        # pylint: disable=protected-access
        if other.measure_id:
            if self.measure_id and self.measure_id != other.measure_id:
                raise ValueError(
                    "Cannot append learning results for different measures."
                )
            self.measure_id = other.measure_id
            self._parameters = self._parameters.append(
                other._parameters, ignore_index=True
            )
            self._new_data = {
                **self._new_data,
                **other._new_data,
            }

    def __add__(self, other):
        if isinstance(other, LearningResult):
            (res := LearningResult())._add_learning_result(self)
            res._add_learning_result(other)
            return res
        raise TypeError("Can only add LearningResults.")

    def __iadd__(self, other):
        if isinstance(other, LearningResult):
            self._add_learning_result(other)
            return self

        raise TypeError("Can only add LearningResults.")


[docs]
    @classmethod
    def from_parameters(cls, learning_parameters: LearningParameters):
        """Initialize learning result from parameters.

        Parameters
        ----------
        learning_parameters
            The learning parameters for the measure and subject in question.

        Returns
        -------
        LearningResult
            The learning result regrouping the given information.
        """
        (res := cls()).append(learning_parameters)
        return res



[docs]
    def append(self, learning_parameters: LearningParameters):
        """Append new learning results for one subject to learning results.

        Parameters
        ----------
        learning_parameters
            The learning parameters for the measure and subject in question.

        Raises
        ------
        ValueError
            If the learning parameters are for a different measure than the one
            concerning the learning result.
        """
        self.measure_id = self.measure_id or learning_parameters.measure_id
        if self.measure_id != learning_parameters.measure_id:
            raise ValueError("Cannot append learning results for different measures.")

        self._parameters = self._parameters.append(
            learning_parameters.to_dict(), ignore_index=True
        )
        self._new_data[
            learning_parameters.subject_id
        ] = learning_parameters.model.new_data



[docs]
    def get_parameters(
        self, subject_id: Optional[str] = None
    ) -> Union[pd.Series, pd.DataFrame]:
        """Get learning results for one or all subjects.

        Parameters
        ----------
        subject_id
            The subject identifier for which the learning is to be retrieved. If
            ``None`` is provided all learning results will be given.

        Returns
        -------
        Union[pandas.Series, pandas.DataFrame]
            If a valid subject id is given, the output is a pandas series summarizing
            learning results. If ``None`` is given the output will be a pandas data
            frame summarizing all learning results.

        Raises
        ------
        ValueError
            If the subject identifier is not found in the learning analysis results.
        """
        if subject_id is None:
            return self._parameters.copy()
        if subject_id not in (ids := set(self._parameters["subject_id"])):
            raise ValueError(
                f"The given subject id {subject_id} is not found. Must be within the "
                f"following values: {ids}."
            )
        return self._parameters[self._parameters["subject_id"] == subject_id]



[docs]
    def get_new_data(self, subject_id: str) -> pd.Series:
        """Get the new data points without outliers.

        Parameters
        ----------
        subject_id
            The identifier of the subject for which the new data is to be retrieved.

        Returns
        -------
        pandas.Series
            A pandas series containing the new data points for the measure in question
            (without outliers).

        Raises
        ------
        ValueError
            If the subject identifier is not found in the learning analysis results.
        """
        if subject_id not in self._new_data:
            raise ValueError(
                f"Subject not found {subject_id}. Must be withing following values: "
                f"{self._new_data.keys()}"
            )
        return self._new_data[subject_id]





[docs]
def extract_learning_for_one_subject(
    measure_collection: MeasureCollection,
    subject_id: str,
    measure_id: str,
    tolerance: float = 0.99,
    reset_trials: bool = True,
) -> LearningResult:
    """Compute the learning result of one subject for one measure.

    Parameters
    ----------
    measure_collection
        A measure collection containing any measures and any subjects.
    subject_id
        The identifier of the subject for which the learning is to be computed.
    measure_id
        The identifier of the measure for which the learning is to be computed.
    tolerance
        The tolerance threshold above which the data points are to be considered
        outliers and therefore rejected. Should be between ``0`` and ``1``.
    reset_trials
        ``True`` if the trial numbers are to be reset for the new data (without
        outliers). ``False`` otherwise.

    Returns
    -------
    LearningResult
        The learning result for one subject of the measure in question. See:
        :class:`dispel.stats.learning.LearningResult`.
    """
    # Measure values over time indexed by start date and trial, missing values
    # removed.
    values_over_time = measure_collection.get_measure_values_over_time(
        subject_id=subject_id, measure_id=measure_id, index=["start_date", "trial"]
    )
    values = values_over_time.dropna()

    fitted_model, delays = compute_learning_model(values, tolerance, reset_trials)

    parameters = LearningParameters(
        subject_id=subject_id,
        measure_id=measure_id,
        model=fitted_model,
        delay_parameters=delays,
    )
    return LearningResult.from_parameters(parameters)




[docs]
def extract_learning_for_all_subjects(
    measure_collection: MeasureCollection,
    measure_id: str,
    tolerance: float = 0.99,
    reset_trials: bool = True,
) -> LearningResult:
    """Compute the learning result of every subject for one measure.

    Parameters
    ----------
    measure_collection
        A measure collection containing any measures and any subjects.
    measure_id
        The measure id on which the learning parameters are to be computed.
    tolerance
        The tolerance threshold above which the data points are to be considered
        outliers and therefore rejected. Should be between ``0`` and ``1``.
    reset_trials
        ``True`` if the trial numbers are to be reset for the new data (without
        outliers). ``False`` otherwise.

    Returns
    -------
    LearningResult
        The learning result for all subjects of the measure in question. See:
        :class:`dispel.stats.learning.LearningResult`.
    """
    # Start from an empty result and merge in the result of each subject in turn.
    combined = LearningResult()
    for subject_id in measure_collection.subject_ids:
        combined += extract_learning_for_one_subject(
            measure_collection,
            subject_id=subject_id,
            measure_id=measure_id,
            tolerance=tolerance,
            reset_trials=reset_trials,
        )
    return combined