Source code for dispel.data.raw
"""A module containing models for raw data sets."""
from dataclasses import InitVar, dataclass, field
from enum import Enum
from operator import eq
from typing import Dict, Iterable, Optional
import pandas as pd
from dispel.data.flags import FlagMixIn
from dispel.data.validators import RangeValidator
from dispel.data.values import DefinitionId, ValueDefinition
#: Default column ids used for three-axis sensors
DEFAULT_COLUMNS = ["x", "y", "z"]
#: Column ids of the accelerometer sensor (user acceleration per axis)
ACCELEROMETER_COLUMNS = [f"userAcceleration{axis}" for axis in ("X", "Y", "Z")]
#: Column ids of the gravity sensor (gravity per axis)
GRAVITY_COLUMNS = [f"gravity{axis}" for axis in ("X", "Y", "Z")]
#: Mapping from user acceleration column ids to the default column ids; the
#: ``ts`` column is mapped onto itself
USER_ACC_MAP = dict(zip(ACCELEROMETER_COLUMNS, DEFAULT_COLUMNS), ts="ts")
#: Maximum pressure value in screen measurements
PRESSURE_MAX = 6.666666666666667
#: A validator for pressure in screen measurements, bounded below by ``0`` and
#: above by :data:`PRESSURE_MAX`
PRESSURE_VALIDATOR = RangeValidator(lower_bound=0, upper_bound=PRESSURE_MAX)
[docs]
class SensorType(str, Enum):
    """Base enumeration for sensor types.

    The class defines no members itself. It mixes ``str`` into ``Enum`` so that
    members of enumerations deriving from it are also string instances.
    """
[docs]
class EmptyDataError(ValueError):
    """Exception raised when a pandas dataframe is empty.

    Parameters
    ----------
    message
        An optional message appended to the standard error text.

    Attributes
    ----------
    message
        The optional message passed at construction (``None`` if omitted).
    """

    def __init__(self, message: Optional[str] = None):
        self.message = message
        super().__init__(self.message)

    def __str__(self):
        """Pretty print error message.

        Returns
        -------
        str
            ``"Data frame is empty."`` followed, when a message was given, by a
            single space and that message.
        """
        res = "Data frame is empty."
        if self.message:
            # Separate the fixed text from the custom message; leading
            # whitespace of the message is dropped so callers that already
            # compensated for the missing separator keep a single space.
            res = f"{res} {self.message.lstrip()}"
        return res
[docs]
class MissingColumnError(ValueError):
    """Exception raised when a pandas dataframe lacks required column(s).

    Parameters
    ----------
    columns
        The names of the missing columns.
    message
        An optional message appended to the standard error text.

    Attributes
    ----------
    columns
        The missing column names exactly as passed at construction.
    message
        The optional message passed at construction (``None`` if omitted).
    """

    def __init__(self, columns: Iterable[str], message: Optional[str] = None):
        super().__init__(message)
        self.columns = columns
        self.message = message

    def __str__(self):
        """Pretty print error message."""
        prefix = f"Data frame is missing the following columns: {self.columns}. "
        # Only a non-empty message is appended to the fixed prefix.
        return (prefix + self.message) if self.message else prefix
[docs]
class RawDataValueDefinition(ValueDefinition):
    """The definition of raw data set values.

    All constructor arguments except ``is_index`` are forwarded by keyword to
    the :class:`ValueDefinition` constructor.

    Parameters
    ----------
    id_
        The identifier of the value definition.
    name
        The name of the value definition.
    unit
        Optional unit, forwarded to the base class.
    description
        Optional description, forwarded to the base class.
    data_type
        Optional data type, forwarded to the base class.
    precision
        Optional precision, forwarded to the base class.
    is_index
        Whether the values are part of the raw data set index.

    Attributes
    ----------
    is_index
        ``True`` if the values are part of the raw data set index. Otherwise,
        ``False``.
    """

    def __init__(
        self,
        id_: str,
        name: str,
        unit: Optional[str] = None,
        description: Optional[str] = None,
        data_type: Optional[str] = None,
        precision: Optional[int] = None,
        is_index: bool = False,
    ):
        # The base class handles everything but the index marker.
        super().__init__(
            id_=id_,
            name=name,
            unit=unit,
            description=description,
            data_type=data_type,
            precision=precision,
        )
        self.is_index = is_index
[docs]
@dataclass
class RawDataSetSource:
    """Describe where a raw data set originates from."""

    #: The manufacturer producing the raw data set source
    manufacturer: str
def _create_value_definition_dict(
    definitions: Optional[Iterable[ValueDefinition]],
) -> Dict[DefinitionId, ValueDefinition]:
    """Index an iterable of definitions by their ``id``.

    Parameters
    ----------
    definitions
        The definitions to index. ``None`` or an empty collection yields an
        empty dictionary.

    Returns
    -------
    Dict[DefinitionId, ValueDefinition]
        The definitions keyed by their ``id``, in iteration order.

    Raises
    ------
    ValueError
        If two definitions share the same ``id``.
    """
    by_id: Dict[DefinitionId, ValueDefinition] = {}
    if not definitions:
        return by_id

    for item in definitions:
        key = item.id
        if key in by_id:
            raise ValueError(f"Duplicate measure value definition for {key}")
        by_id[key] = item
    return by_id
[docs]
@dataclass
class RawDataSetDefinition:
    """The definition of a raw data set.

    The value definitions are handed over once at construction through
    ``value_definitions_list`` and are kept internally in a dictionary keyed by
    definition id. Duplicate ids are rejected with a ``ValueError``.
    """

    #: The identifier of the raw data set definition
    id: str
    #: The source of the raw data set
    source: RawDataSetSource
    #: Init-only iterable of the value definitions of the raw data set
    value_definitions_list: InitVar[Iterable[RawDataValueDefinition]]
    #: ``True`` if the raw data source is computed. ``False`` if it is a
    #: measured source without transformation, e.g. acceleration recorded from
    #: the low level APIs.
    is_computed: bool = False
    _value_definitions: Dict[DefinitionId, ValueDefinition] = field(init=False)

    def __post_init__(self, value_definitions_list):
        """Index the given value definitions by their id."""
        self._value_definitions = _create_value_definition_dict(value_definitions_list)

    @property
    def value_definitions(self):
        """Get the value definitions of the raw data set."""
        return self._value_definitions.values()

    def get_value_definition(self, id_: DefinitionId):
        """Get the value definition registered under ``id_``."""
        return self._value_definitions[id_]

    def __hash__(self):
        """Hash on the identifier only."""
        return hash(self.id)

    def __eq__(self, other):
        """Compare id, source, set of value definitions and computed flag."""
        if not isinstance(other, RawDataSetDefinition):
            return False
        return (
            self.id == other.id
            and self.source == other.source
            and set(self.value_definitions) == set(other.value_definitions)
            and self.is_computed == other.is_computed
        )
[docs]
class RawDataSet(FlagMixIn):
    """A raw data set.

    Parameters
    ----------
    definition
        The definition of the raw data set
    data
        The data set. Its columns must match exactly the ids of the non-index
        value definitions of ``definition``.

    Attributes
    ----------
    definition
        The definition passed at construction.
    data
        The data frame passed at construction. Columns whose definition carries
        a precision are rounded in place to that precision.
    raw_data
        A copy of the data taken before rounding. Only set when at least one
        value definition has a precision.

    Raises
    ------
    ValueError
        If ``data`` has columns without a definition, or if a non-index
        definition has no matching column.
    """

    def __init__(self, definition: RawDataSetDefinition, data: pd.DataFrame):
        super().__init__()
        self.definition = definition
        self.data = data

        value_definitions = list(self.definition.value_definitions)

        # Columns and non-index definitions have to match one-to-one.
        def_ids = {d.id for d in value_definitions if not d.is_index}
        data_ids = set(data.columns)

        diff_data_columns = data_ids - def_ids
        if diff_data_columns:
            raise ValueError(f"Missing definition for column(s): {diff_data_columns}")

        diff_def_ids = def_ids - data_ids
        if diff_def_ids:
            raise ValueError(f"Missing columns for definition(s): {diff_def_ids}")

        with_precision = [d for d in value_definitions if d.precision is not None]
        if with_precision:
            # Keep the values as they were prior to any rounding. A copy is
            # required because the rounding below assigns into ``self.data``,
            # which is the very same frame object as ``data``.
            self.raw_data = data.copy()

        # Apply the configured precision column by column.
        for col_def in with_precision:
            column = col_def.id.id
            self.data[column] = self.data[column].round(col_def.precision)

    @property
    def id(self) -> str:
        """Get the identifier from the definition of the raw data set."""
        return self.definition.id

    def __repr__(self) -> str:
        return f"<RawDataSet: {self.id} ({self.flag_count_repr})>"

    def concat(self, other: "RawDataSet") -> "RawDataSet":
        """Concatenate two raw data sets.

        Parameters
        ----------
        other
            The data set whose rows are appended to the rows of this one.

        Returns
        -------
        RawDataSet
            A new data set built from this definition and the row-wise
            concatenation of both data frames.

        Raises
        ------
        ValueError
            If the definitions of both data sets are not equal.
        """
        if self.definition != other.definition:
            raise ValueError("Can only concatenate data sets with equal definitions")
        return RawDataSet(self.definition, pd.concat([self.data, other.data]))