# Source code for dispel.io.raw
"""Common functionality to read files."""
from typing import Dict, List, Optional, Sequence, Union
import numpy as np
import pandas as pd
from dispel.data.raw import (
    RawDataSet,
    RawDataSetDefinition,
    RawDataSetSource,
    RawDataValueDefinition,
    SensorType,
)
from dispel.data.values import ValueDefinition
[docs]
def raw_data_set_definition_to_columns(definition: RawDataSetDefinition) -> List[str]:
    """Collect the ids of all value definitions of a data set definition.

    Parameters
    ----------
    definition
        The raw data set definition whose ``value_definitions`` are read.

    Returns
    -------
    List[str]
        The ``id`` of every value definition, in the order in which
        ``definition.value_definitions`` yields them.
    """
    column_names = []
    for value_definition in definition.value_definitions:
        column_names.append(value_definition.id)
    return column_names
[docs]
def raw_data_set_definition_to_index(definition: RawDataSetDefinition) -> List[str]:
    """Collect the ids of the value definitions that are flagged as index.

    Parameters
    ----------
    definition
        The raw data set definition whose ``value_definitions`` are read.

    Returns
    -------
    List[str]
        The ``id`` of every value definition whose ``is_index`` is truthy, in
        iteration order. The list is empty if no value definition is part of
        the index.
    """
    index_names = []
    for value_definition in definition.value_definitions:
        # only value definitions marked as index contribute a name
        if not value_definition.is_index:
            continue
        index_names.append(value_definition.id)
    return index_names
[docs]
def convert_types(
    sensor_data: pd.DataFrame, value_definition: ValueDefinition
) -> pd.DataFrame:
    """Cast one column of the sensor data to the type of its value definition.

    Nothing is converted when ``value_definition.data_type`` is ``None``.
    Otherwise the column named ``value_definition.id`` is overwritten in
    ``sensor_data`` itself, and that same data frame object is returned.

    Parameters
    ----------
    sensor_data
        The data frame holding the column to convert. It is modified in place.
    value_definition
        The definition of a value; its ``data_type`` and ``id`` are read.

    Returns
    -------
    pandas.DataFrame
        The ``sensor_data`` object passed in, with the column converted if a
        data type was specified.
    """
    target_type = value_definition.data_type
    # without a declared data type there is nothing to convert
    if target_type is None:
        return sensor_data

    column = value_definition.id
    if np.issubdtype(target_type, np.timedelta64):
        # time deltas need the unit encoded in the type string, e.g. ``[us]``
        unit = get_unit_from_datatype(target_type)
        converted = pd.to_timedelta(sensor_data[column], unit=unit)
    elif np.issubdtype(target_type, np.datetime64):
        unit = get_unit_from_datatype(target_type)
        timestamps = pd.to_datetime(sensor_data[column], unit=unit)
        # parsed timestamps are always stored with nanosecond resolution
        converted = timestamps.astype("datetime64[ns]")
    else:
        # plain cast; ``errors="ignore"`` is forwarded to ``astype`` as-is
        converted = sensor_data[column].astype(target_type, errors="ignore")

    sensor_data[column] = converted
    return sensor_data
[docs]
def get_unit_from_datatype(data_type: str) -> Union[str, None]:
    """Extract the unit enclosed in brackets from a data type string.

    Parameters
    ----------
    data_type
        The data type, e.g., ``'timedelta64[us]'``.

    Returns
    -------
    Union[str, None]
        The text between the first ``[`` and the next ``[`` (or the end of
        the string) with its last character dropped, e.g. ``'us'`` for
        ``'timedelta64[us]'``. ``None`` if ``data_type`` contains no ``[``.
    """
    pieces = data_type.split("[")
    # more than one piece means an opening bracket was present
    has_unit = len(pieces) > 1
    # the slice strips the trailing character, normally the closing bracket
    return pieces[1][:-1] if has_unit else None
[docs]
def generate_raw_data_value_definition(column: str, unit: Optional[str] = None):
    """Build a basic ``RawDataValueDefinition`` for a column.

    Parameters
    ----------
    column
        Passed as the first argument of the definition and used to derive
        the second argument, ``'<column> data'``.
    unit
        Optional unit, passed through as the third argument.

    Returns
    -------
    RawDataValueDefinition
        The definition created from ``column`` and ``unit``.
    """
    label = f"{column} data"
    return RawDataValueDefinition(column, label, unit)
[docs]
def generate_raw_data_set_definition(
    data_set_id: str,
    columns: Sequence[str],
):
    """Build a basic ``RawDataSetDefinition`` for the given columns.

    Parameters
    ----------
    data_set_id
        Passed as the first argument of the data set definition.
    columns
        The column names. One ``RawDataValueDefinition`` is created per
        column, from the column name and its upper-cased form.

    Returns
    -------
    RawDataSetDefinition
        A definition built from ``data_set_id``, a ``RawDataSetSource``
        created with ``'example'`` and the per-column value definitions.
    """
    value_definitions = []
    for column in columns:
        value_definitions.append(RawDataValueDefinition(column, column.upper()))
    # the source is created after the value definitions, as before
    source = RawDataSetSource("example")
    return RawDataSetDefinition(data_set_id, source, value_definitions)
[docs]
def generate_raw_data_set(data_set_id: str, columns: Sequence[str]) -> RawDataSet:
    """Generate a small placeholder raw data set.

    Parameters
    ----------
    data_set_id
        Forwarded to :func:`generate_raw_data_set_definition`.
    columns
        The column names, used for both the definition and the data frame.

    Returns
    -------
    RawDataSet
        A data set built from the generated definition and a two-row data
        frame (index ``0`` and ``1``) in which every column is filled with
        ``0``.
    """
    definition = generate_raw_data_set_definition(data_set_id, columns)
    # constant zero-filled frame: two rows, one column per requested name
    zero_frame = pd.DataFrame(0, index=range(2), columns=columns)
    return RawDataSet(definition, zero_frame)