Source code for dispel.processing.utils
"""Utility functions around data processing."""
from abc import ABCMeta
from itertools import zip_longest
from typing import Union
import pandas as pd
from dispel.data.values import AbbreviatedValue as AV
[docs]
class TaskMixin(metaclass=ABCMeta):
"""A mixin class for entities related to tasks."""
#: The task name
task_name: Union[AV, str]
[docs]
def get_task_name(self, **kwargs) -> Union[str, AV]:
"""Get the task name."""
task_name = kwargs.get("task_name", None) or getattr(self, "task_name")
if isinstance(task_name, (str, AV)):
return task_name
raise ValueError("Missing task name.")
[docs]
def parallel_explode(data: pd.DataFrame, dtype="float64") -> pd.DataFrame:
"""Transform each element of a list-like to a row for all columns.
Parameters
----------
data
The data pandas data frame to be exploded.
dtype
The type of the data frame values.
Returns
-------
pandas.DataFrame
Exploded lists to rows of all columns.
Examples
--------
.. testsetup:: parallel_explode
import pandas as pd
from dispel.processing.utils import parallel_explode
.. doctest:: parallel_explode
>>> df = pd.DataFrame({
... 'a': [[2.], [3., 4.], [], [6., 7.]],
... 'b': [[8.], [9., 10.], [11.], [12., 13.]],
... })
>>> df
a b
0 [2.0] [8.0]
1 [3.0, 4.0] [9.0, 10.0]
2 [] [11.0]
3 [6.0, 7.0] [12.0, 13.0]
>>> parallel_explode(df)
a b
0 2.0 8.0
1 3.0 9.0
2 4.0 10.0
3 6.0 11.0
4 7.0 12.0
5 NaN 13.0
"""
return pd.DataFrame(
(
exploded_column
for exploded_column in zip_longest(
*[data[column].explode().dropna() for column in data]
)
),
columns=data.columns,
dtype=dtype,
)