Source code for dispel.providers.bdh.tasks.typing.keyboard

"""Keyboard functions to extract relevant properties for Typing."""
from difflib import SequenceMatcher
from typing import List, Optional, Tuple, Union

import numpy as np
import pandas as pd
from unidecode import unidecode

from dispel.providers.bdh.tasks.typing.const import KeyState


def transform_reaction_time(
    word: pd.DataFrame, key_typed: pd.DataFrame
) -> pd.DataFrame:
    """Compute per-word reaction times.

    For every displayed word two durations (in seconds) are computed from the
    moment the word appeared: the time until the first key of any kind, and
    the time until the first key equal to the first character of the word.
    Only keys whose ``timestamp_out`` is strictly after the appearance and at
    or before the disappearance of the word are considered.

    Parameters
    ----------
    word : pd.DataFrame
        One row per displayed word with the columns ``word``,
        ``appearance_timestamp`` and ``disappearance_timestamp``.
    key_typed : pd.DataFrame
        One row per key pressed with the columns ``timestamp_out`` and
        ``key``.

    Returns
    -------
    pd.DataFrame
        A data frame indexed by the appearance timestamps of the words with
        the columns ``reaction_time`` and ``reaction_time_first_correct``.
    """

    def _elapsed_until_key(shown: pd.Series, correct_only: bool) -> float:
        """Return seconds from word appearance to the first matching key."""
        appeared_at = shown["appearance_timestamp"]
        stamps = key_typed["timestamp_out"]
        in_window = (stamps > appeared_at) & (
            stamps <= shown["disappearance_timestamp"]
        )
        candidates = key_typed.loc[in_window]
        if correct_only:
            # Restrict to keys equal to the first character of the word.
            hits = candidates["key"] == shown["word"][0]
            candidates = candidates[hits]
        first_stamp = candidates["timestamp_out"].min()
        return (first_stamp - appeared_at).total_seconds()

    any_key = word.apply(lambda shown: _elapsed_until_key(shown, False), axis=1)
    correct_key = word.apply(
        lambda shown: _elapsed_until_key(shown, True), axis=1
    )
    reaction_times = pd.DataFrame(
        {
            "reaction_time": any_key,
            "reaction_time_first_correct": correct_key,
        }
    )
    return reaction_times.set_index(word["appearance_timestamp"])
def compute_rt_slope(df_reaction_time: pd.DataFrame, col: str) -> float:
    """Compute the mean slope of a reaction time column.

    The slope between two consecutive rows is the change of the column value
    divided by the number of seconds separating their index entries.

    Parameters
    ----------
    df_reaction_time : pd.DataFrame
        The reaction time data frame; its index must hold timestamps.
    col : str
        The column to use, e.g. ``reaction_time`` or
        ``reaction_time_first_correct``.

    Returns
    -------
    float
        The mean of the row-to-row slopes.
    """
    value_steps = df_reaction_time[col].diff()
    elapsed_seconds = df_reaction_time.index.to_series().diff().dt.total_seconds()
    slopes = value_steps / elapsed_seconds
    return slopes.mean()
def total_words_typed(word: pd.DataFrame) -> int:
    """Count the words that were typed successfully.

    Parameters
    ----------
    word : pd.DataFrame
        A data frame containing the word information with the boolean column
        ``success``.

    Returns
    -------
    int
        The sum of the ``success`` column, i.e. the number of ``True``
        entries.
    """
    successes = word["success"]
    return successes.sum()
def time_intervals(
    data: pd.DataFrame, interval_column: Optional[str] = None
) -> pd.Series:
    """Find time intervals between consecutive elements of a data frame.

    Parameters
    ----------
    data : pd.DataFrame
        Any data frame with a timestamp index (or a timestamp column when
        ``interval_column`` is given).
    interval_column : Optional[str]
        When specified, the intervals are computed from the values of this
        column instead of the index. ``None`` (the default) or an empty
        string selects the index.

    Returns
    -------
    pd.Series
        Time intervals in seconds between consecutive samples. The first
        entry is ``NaN`` because it has no predecessor.
    """
    if not interval_column:
        timestamps = data.index.to_series()
    else:
        timestamps = data[interval_column]
    return timestamps.diff().dt.total_seconds()
[docs] def word_duration(word: pd.DataFrame) -> pd.DataFrame: """Compute the time spent per word. Parameters ---------- word: pd.DataFrame A data frame containing the word information with the column ``success``. Returns ------- pd.Series A data frame containing the time spent per word. """ return ( word["disappearance_timestamp"] - word["appearance_timestamp"] ).dt.total_seconds()
def get_submission_state(word: pd.DataFrame, key_typed: pd.DataFrame) -> pd.DataFrame:
    """Compute the submission state.

    Create a data frame that contains the state at each time a user types a
    key. Available states are the following: ``Correct``, ``Incorrect``, and
    ``Correcting``. The state is ``Correct`` if the textbox content is a
    prefix of the (unidecoded) word displayed on the screen, or if the
    displayed word just changed. The state is ``Correcting`` when the
    previous keystroke left an incorrect textbox and the current key is
    backspace. Every other keystroke is ``Incorrect``.

    Parameters
    ----------
    word : pd.DataFrame
        A data frame listing all the words displayed on screen during the
        test with the following columns: ``word``, ``success``,
        ``appearance_timestamp`` and ``disappearance_timestamp``.
    key_typed : pd.DataFrame
        The data frame with the list of keys pressed by the user; it should
        contain the columns ``timestamp_out``, ``key`` and
        ``displayed_input``.

    Returns
    -------
    pd.DataFrame
        A data frame indexed by ``timestamp_out`` that contains the user
        state after each keystroke, with the columns ``submission_state``,
        ``displayed_word``, ``displayed_input``, ``key`` and ``success``.

    Raises
    ------
    IndexError
        If a keystroke does not fall strictly inside the display window of
        any word, or if a displayed word is missing from ``word``.
    """

    def _displayed_word(timestamp):
        """Find the word displayed on screen at ``timestamp``."""
        on_screen = (timestamp < word["disappearance_timestamp"]) & (
            timestamp > word["appearance_timestamp"]
        )
        return word["word"].values[on_screen][0]

    input_and_screen = pd.DataFrame(
        {
            "displayed_input": key_typed["displayed_input"],
            "displayed_word": key_typed["timestamp_out"].apply(_displayed_word),
        }
    )
    is_correct = input_and_screen.apply(
        lambda x: unidecode(x["displayed_word"]).startswith(x["displayed_input"]),
        axis=1,
    )
    backspace_pressed = key_typed["key"] == "Backspace"
    word_changed = input_and_screen["displayed_word"] != input_and_screen[
        "displayed_word"
    ].shift(1)
    # The first keystroke has no predecessor and must not count as a word
    # change. Positional access is required: a label-based ``[0]`` would
    # append a new entry whenever the index does not contain the label 0.
    if len(word_changed) > 0:
        word_changed.iloc[0] = False

    # Update the states keystroke by keystroke.
    submission_state: List[KeyState] = []
    # There is no keystroke before the first one, so it can never be a
    # correction; treating the "previous" input as correct avoids wrapping
    # around to the last keystroke.
    previous_correct = True
    for changed, correct, backspace in zip(
        word_changed, is_correct, backspace_pressed
    ):
        if changed or correct:
            state = KeyState.CORRECT
        elif not previous_correct and backspace:
            state = KeyState.CORRECTING
        else:
            state = KeyState.INCORRECT
        submission_state.append(state)
        previous_correct = correct

    res = pd.DataFrame(
        {
            "submission_state": submission_state,
            "displayed_word": input_and_screen["displayed_word"],
            "displayed_input": input_and_screen["displayed_input"],
            "key": key_typed["key"],
        }
    )
    # Flag each keystroke with the success of the word it belongs to (the
    # first matching row of ``word`` is used).
    res["success"] = res["displayed_word"].apply(
        lambda x: word.loc[word["word"] == x, "success"].values[0]
    )
    return res.set_index(key_typed["timestamp_out"])
def detect_key_autocompletion(submission_state: pd.DataFrame) -> pd.DataFrame:
    """Detect the autocompletion at the key level.

    A key is considered typed by hand when it is a single character or when
    it matches ``Backspace``; every other key is flagged as autocompleted.

    Parameters
    ----------
    submission_state : pd.DataFrame
        A data frame that contains the user state after each keystroke, with
        at least the columns ``displayed_word`` and ``key``.

    Returns
    -------
    pd.DataFrame
        A new data frame with the columns ``displayed_word``, ``key`` and the
        boolean column ``autocomplete_per_key``. The input is not modified.
    """
    keys = submission_state["key"]
    single_character = keys.str.len() == 1
    backspace = keys.str.match("Backspace")
    typed_by_hand = single_character | backspace
    return submission_state[["displayed_word", "key"]].assign(
        autocomplete_per_key=~typed_by_hand
    )
def detect_word_autocompletion(keys_with_autocompletion: pd.DataFrame) -> pd.DataFrame:
    """Detect the autocompletion for all the words.

    A word is flagged as autocompleted when at least one of its keys is
    flagged as autocompleted.

    Parameters
    ----------
    keys_with_autocompletion : pd.DataFrame
        A data frame with one row per key, containing the columns
        ``displayed_word`` and the boolean ``autocomplete_per_key``.

    Returns
    -------
    pd.DataFrame
        A data frame indexed by ``displayed_word`` (in order of first
        appearance) with the boolean column ``autocomplete_per_word``.
    """
    # Aggregate the single flag column directly instead of applying a lambda
    # to every group frame.
    per_word_autocompletion = keys_with_autocompletion.groupby(
        "displayed_word", sort=False
    )["autocomplete_per_key"].any()
    return per_word_autocompletion.to_frame(name="autocomplete_per_word")
def total_autocomplete(keys_with_autocompletion: pd.DataFrame) -> int:
    """Get the total number of autocompletions.

    Parameters
    ----------
    keys_with_autocompletion : pd.DataFrame
        A data frame with the boolean column ``autocomplete_per_key``
        indicating whether a key was autocompleted or not.

    Returns
    -------
    int
        The sum of the ``autocomplete_per_key`` column.
    """
    flags = keys_with_autocompletion["autocomplete_per_key"]
    return flags.sum()
def get_state_durations(
    submission_state: pd.DataFrame,
    key_intervals: pd.DataFrame,
    word: pd.DataFrame,
) -> pd.DataFrame:
    """Return submission state duration.

    Consecutive keystrokes that share the same displayed word and the same
    submission state form one group. For each group the key intervals are
    summed to obtain the time spent in that state.

    Parameters
    ----------
    submission_state : pd.DataFrame
        Data frame containing the submission state each time a user types a
        key. It must be indexed by timestamps and contain the columns
        ``submission_state`` and ``displayed_word``.
    key_intervals : pd.DataFrame
        Time intervals between samples in the column ``key_intervals``; the
        values are matched to ``submission_state`` by position.
    word : pd.DataFrame
        A data frame listing all the words displayed on screen during the
        test with at least the columns ``word`` and ``success``.

    Returns
    -------
    pd.DataFrame
        One row per group, indexed by the timestamp of the first keystroke of
        the group and sorted by it, with the columns ``submission_state``,
        ``duration``, ``success``, ``t_between_states`` (seconds between the
        first keystroke of the group and the keystroke before it) and
        ``displayed_word``.
    """

    def _first(values: pd.Series):
        """Return the first element of a group, including ``NaN``."""
        return values.iloc[0]

    temp_df = submission_state.copy()
    temp_df["key_intervals"] = key_intervals["key_intervals"].values
    temp_df["success"] = submission_state["displayed_word"].apply(
        lambda x: word.loc[word["word"] == x, "success"].values[0]
    )
    temp_df["t_between_states"] = (
        submission_state.index.to_series().diff().dt.total_seconds()
    )
    temp_df["index"] = submission_state.index

    # We group by displayed word and by run of identical submission states.
    # Without the word key, the last state of one word and the first state of
    # the next word would merge into a single group when they are equal.
    states = submission_state["submission_state"]
    state_run = (states.shift() != states).cumsum().rename("state_run")
    groups = temp_df.groupby(["displayed_word", state_run])

    # Aggregate column by column: summing the whole frame would also try to
    # sum the timestamp and state columns, which is not a valid operation.
    durations = groups["key_intervals"].sum()
    res = pd.DataFrame(
        {
            "submission_state": groups["submission_state"].apply(_first).values,
            "duration": durations.values,
            "success": groups["success"].all().values,
            "t_between_states": groups["t_between_states"].apply(_first).values,
            "displayed_word": durations.index.get_level_values(0).values,
            "index": groups["index"].apply(_first).values,
        }
    )
    return res.set_index("index").sort_index()
def get_reaction_duration(state_durations: pd.DataFrame) -> pd.DataFrame:
    """Select the durations of the incorrect states.

    The reaction duration is the time elapsed in an incorrect state. The
    ``success`` column is kept so that callers can restrict the result to
    successfully typed words.

    Parameters
    ----------
    state_durations : pd.DataFrame
        Data frame containing the columns ``submission_state``, ``duration``
        and ``success``.

    Returns
    -------
    pd.DataFrame
        The ``duration`` and ``success`` columns of the rows whose submission
        state is ``KeyState.INCORRECT``.
    """
    is_incorrect = state_durations["submission_state"] == KeyState.INCORRECT
    return state_durations.loc[is_incorrect, ["duration", "success"]]
def get_correct_reaction_duration(
    reaction_duration: pd.DataFrame,
) -> pd.Series:
    """Return reaction durations for correct words.

    Parameters
    ----------
    reaction_duration : pd.DataFrame
        Data frame containing the reaction durations in the column
        ``duration`` and the boolean column ``success``.

    Returns
    -------
    pd.Series
        The ``duration`` values of the rows where ``success`` is true.
    """
    succeeded = reaction_duration["success"]
    return reaction_duration.loc[succeeded, "duration"]
def get_correcting_duration(state_durations: pd.DataFrame) -> pd.DataFrame:
    """Select the durations of the correcting states.

    The correcting duration is the time elapsed in a correcting state, in
    other words, the time spent correcting a mistake. The ``success`` column
    is kept so that callers can restrict the result to successfully typed
    words.

    Parameters
    ----------
    state_durations : pd.DataFrame
        Data frame containing the columns ``submission_state``, ``duration``
        and ``success``.

    Returns
    -------
    pd.DataFrame
        The ``duration`` and ``success`` columns of the rows whose submission
        state is ``KeyState.CORRECTING``.
    """
    is_correcting = state_durations["submission_state"] == KeyState.CORRECTING
    return state_durations.loc[is_correcting, ["duration", "success"]]
def get_correct_correcting_duration(
    correcting_duration: pd.DataFrame,
) -> pd.Series:
    """Return correcting duration for correct words.

    Parameters
    ----------
    correcting_duration : pd.DataFrame
        Data frame containing the correcting durations in the column
        ``duration`` and the boolean column ``success``.

    Returns
    -------
    pd.Series
        The ``duration`` values of the rows where ``success`` is true.
    """
    succeeded = correcting_duration["success"]
    return correcting_duration.loc[succeeded, "duration"]
def get_reacting_times(state_durations: pd.DataFrame) -> pd.DataFrame:
    """Select the reacting times.

    The reacting time is the time elapsed between an incorrect state and the
    correcting state that follows it, in other words, the time to react to a
    mistake. It is read from the ``t_between_states`` column of the
    correcting states. The ``success`` column is kept so that callers can
    restrict the result to successfully typed words.

    Parameters
    ----------
    state_durations : pd.DataFrame
        Data frame containing the columns ``submission_state``,
        ``t_between_states`` and ``success``.

    Returns
    -------
    pd.DataFrame
        The ``t_between_states`` and ``success`` columns of the rows whose
        submission state is ``KeyState.CORRECTING``.
    """
    is_correcting = state_durations["submission_state"] == KeyState.CORRECTING
    return state_durations.loc[is_correcting, ["t_between_states", "success"]]
def get_correct_reacting_time(reacting_times: pd.DataFrame) -> pd.Series:
    """Return reacting times for correct words.

    Parameters
    ----------
    reacting_times : pd.DataFrame
        Data frame containing the reacting times in the column
        ``t_between_states`` and the boolean column ``success``.

    Returns
    -------
    pd.Series
        The ``t_between_states`` values of the rows where ``success`` is
        true.
    """
    succeeded = reacting_times["success"]
    return reacting_times.loc[succeeded, "t_between_states"]
def find_consec_element_and_count(data: pd.DataFrame) -> pd.DataFrame:
    """Find runs of consecutive rows and count them.

    Consecutive rows sharing the same ``is_error_free`` value form one run.

    Parameters
    ----------
    data : pd.DataFrame
        Data frame with the boolean columns ``is_error_free`` and
        ``success``.

    Returns
    -------
    pd.DataFrame
        One row per run, indexed by the run number ``subgroup``, with three
        columns: ``count`` (number of rows in the run), ``success`` (whether
        any row of the run was successful) and ``is_error_free`` (whether any
        row of the run was free of error).
    """
    error_free = data["is_error_free"]
    # A new run starts every time the flag differs from the previous row.
    run_number = (error_free != error_free.shift(1)).cumsum()
    runs = data.assign(subgroup=run_number).groupby("subgroup")
    summary = {
        "count": runs["is_error_free"].count(),
        "success": runs["success"].any(),
        "is_error_free": runs["is_error_free"].any(),
    }
    return pd.DataFrame(summary)
def count_words_typed_in_row(data: pd.DataFrame, error_free: bool = True) -> int:
    """Return the longest streak of successfully typed words.

    The parameter ``error_free`` selects which streaks are considered:
    streaks typed without error (True) or streaks typed with error (False).

    Parameters
    ----------
    data
        Data frame of streaks with the columns ``count``, ``success`` and
        ``is_error_free``.
    error_free
        If True, only consider streaks typed with no error; otherwise only
        streaks typed with error.

    Returns
    -------
    The largest ``count`` among the successful streaks of the requested kind,
    or 0 when there is none.
    """
    wanted = data.is_error_free if error_free else ~data.is_error_free
    streaks = data.loc[data.success & wanted, "count"]
    return streaks.max() if len(streaks) > 0 else 0
def similarity_ratio(x: Union[pd.Series, pd.DataFrame]) -> float:
    """Compute the similarity ratio between the two values held by ``x``.

    ``x`` must contain exactly two sequences; their similarity is measured
    with :class:`difflib.SequenceMatcher`.
    """
    first, second = x.values
    matcher = SequenceMatcher(a=first, b=second)
    return matcher.ratio()
def apply_similarity_ratio(submission_state: pd.DataFrame) -> pd.DataFrame:
    """Apply similarity ratio.

    The similarity ratio is computed row by row between the word being typed
    ``displayed_input`` and the word on screen ``displayed_word``.

    Parameters
    ----------
    submission_state
        A data frame that contains the user state after each keystroke, with
        the columns ``displayed_input`` and ``displayed_word``.

    Returns
    -------
    A new data frame with the columns ``displayed_input``,
    ``displayed_word`` and ``similarity_ratio``.
    """
    res = submission_state[["displayed_input", "displayed_word"]].copy()
    # Each row holds exactly the two strings expected by similarity_ratio.
    res["similarity_ratio"] = res.apply(similarity_ratio, axis=1)
    return res
def count_incorrect_words(word: pd.DataFrame) -> int:
    """Count the number of incorrect words.

    Parameters
    ----------
    word : pd.DataFrame
        A data frame of the words with the boolean column ``success``.

    Returns
    -------
    int
        The number of words whose ``success`` flag is false.
    """
    failures = ~word["success"]
    return int(failures.sum())
def count_key_pressed(key_pressed: pd.Series, alphabet: bool = False) -> int:
    """Count the number of keys pressed.

    Parameters
    ----------
    key_pressed
        A pd.Series indicating the key pressed during the test. It usually
        holds the key names as strings; a boolean or numeric series is summed
        instead (e.g. a series of flags counts the true entries).
    alphabet
        An optional argument indicating if the count should be done on
        alphabetic keys only, i.e. keys for which ``str.isalpha`` is true.

    Returns
    -------
    int
        The number of keys pressed.
    """
    if alphabet:
        return int(key_pressed.apply(lambda x: x.isalpha()).sum())
    if pd.api.types.is_numeric_dtype(key_pressed):
        # Flags or counts: keep the summing behaviour.
        return key_pressed.sum()
    # Summing a series of key names would concatenate the strings; count the
    # non-missing entries instead.
    return int(key_pressed.count())
def ratio_key_pressed(
    submission_state: pd.DataFrame, word: pd.DataFrame
) -> pd.DataFrame:
    """Compute the ratio of keys counted per word over the word length.

    Keys are counted per displayed word with ``count_key_pressed`` in
    alphabet mode, and only the words flagged as successful are kept.

    Parameters
    ----------
    submission_state
        A data frame that contains the user state after each keystroke, with
        the columns ``displayed_word`` and ``key``.
    word
        A data frame listing all the words displayed on screen during the
        test with at least the columns ``word`` and ``success``.

    Returns
    -------
    pd.DataFrame
        A data frame indexed by word with a column
        ``letters_typed_over_length`` containing the following ratio: number
        of keys counted divided by the length of the word (for completed
        words).
    """
    keys_per_word = submission_state[["displayed_word", "key"]].groupby(
        by="displayed_word"
    )
    typed = keys_per_word.apply(lambda keys: count_key_pressed(keys["key"], True))

    # Restrict the counts to the words that were completed.
    completed = word.loc[word["success"], "word"]
    typed = typed[completed]

    # The index holds the words themselves, so their length is the target.
    lengths = typed.index.to_series().apply(len)
    ratio = typed / lengths
    return pd.DataFrame({"letters_typed_over_length": ratio}).set_index(
        typed.index
    )
def letter_interval(
    key_is_letter: pd.Series,
    key_intervals: pd.Series,
    extra_mask: Optional[pd.Series] = None,
) -> Tuple[pd.Series, pd.Series]:
    """Compute the interval between two consecutive letters.

    Parameters
    ----------
    key_is_letter
        A boolean series indicating if a key is a letter.
    key_intervals
        A series of float indicating the time between two consecutive keys.
    extra_mask
        An optional series of boolean that will be used as an extra mask to
        filter intervals.

    Returns
    -------
    Tuple[pd.Series, pd.Series]
        The first element of the tuple is the time between each consecutive
        letter keys. The second element is the mask that has been used to
        filter the `key_intervals` to find the letters of interest.
    """
    # The first key has no predecessor; filling with False keeps the mask
    # boolean instead of introducing a missing value.
    previous_is_letter = key_is_letter.shift(1, fill_value=False)
    mask = key_is_letter & previous_is_letter
    if extra_mask is not None:
        mask = mask & extra_mask
    return key_intervals[mask], mask
def keep_top_ten(data: pd.DataFrame) -> pd.Series:
    """Keep the letter intervals at or above the 90th percentile.

    Parameters
    ----------
    data
        A data frame of letter intervals with the column `letter_intervals`.

    Returns
    -------
    The letter intervals greater than or equal to their 90th percentile. An
    empty column is returned unchanged.
    """
    intervals = data["letter_intervals"]
    if not intervals.empty:
        threshold = np.percentile(intervals, q=90)
        return intervals[intervals >= threshold]
    return intervals
def max_letter_interval_dev(data: pd.DataFrame) -> float:
    """Compute the maximum squared letter interval deviation from the mean.

    Parameters
    ----------
    data
        A data frame of letter intervals with the column `letter_intervals`.

    Returns
    -------
    float
        The largest squared difference between a letter interval and the mean
        of all letter intervals.
    """
    intervals = data["letter_intervals"]
    squared_deviation = (intervals - intervals.mean()) ** 2
    return squared_deviation.max()
def filter_intervals(data: pd.DataFrame, differentiate: bool) -> pd.Series:
    """Compute letter intervals prior to a mistake within a word.

    Parameters
    ----------
    data
        The keystrokes of a single word, indexed by timestamp, with the
        columns ``submission_state`` and ``is_letter``.
    differentiate
        If True, return the difference between consecutive intervals instead
        of the intervals themselves.

    Returns
    -------
    pd.Series
        The intervals in seconds (or their differences) for the letter keys
        typed in a ``KeyState.CORRECT`` state before the first keystroke in
        any other state.
    """
    # An explicit copy is required because the mask is modified in place
    # below and ``to_numpy`` may otherwise hand back a read-only view.
    mask = (data["submission_state"] == KeyState.CORRECT).to_numpy(copy=True)
    # Find first non-correct element if there is one
    not_correct = np.nonzero(~mask)[0]
    if len(not_correct) > 0:
        # turn to False every Correct letters typed after a mistake
        mask[not_correct[0] :] = False
    mask = mask & data["is_letter"].to_numpy()
    res = data.index.to_series().diff()
    if differentiate:
        res = res.diff()
    return res[mask].dt.total_seconds()
def interval_until_mistake(
    submission_state: pd.DataFrame,
    key_is_letter: pd.DataFrame,
    differentiate: bool,
) -> pd.DataFrame:
    """Compute letter intervals prior to a mistake for each word.

    Parameters
    ----------
    submission_state
        A data frame that contains the user state after each keystroke, with
        the columns ``displayed_word`` and ``submission_state``.
    key_is_letter
        A data frame whose column ``key_is_letter`` flags the letter keys;
        the values are matched to ``submission_state`` by position.
    differentiate
        Forwarded to ``filter_intervals``; it also selects the name of the
        output column.

    Returns
    -------
    pd.DataFrame
        A single-column data frame named ``derived_interval_until_mistake``
        when ``differentiate`` is true and ``interval_until_mistake``
        otherwise, holding the non-missing values of all words.
    """
    prefix = "derived_" if differentiate else ""
    col = f"{prefix}interval_until_mistake"

    keystrokes = submission_state[["displayed_word", "submission_state"]].assign(
        is_letter=key_is_letter["key_is_letter"].values
    )
    per_word = (
        keystrokes.groupby(by="displayed_word")
        .apply(lambda group: filter_intervals(group, differentiate))
        .dropna()
    )
    values = [] if per_word.empty else per_word.values
    return pd.DataFrame({col: values})
def compute_typing_speed_slope(
    word: pd.DataFrame, submission_state: pd.DataFrame
) -> float:
    """Compute the slope of the typing speed.

    The typing speed of a word is the mean number of seconds between its
    consecutive keystrokes. The slope between two consecutive words is the
    change of that speed divided by the number of seconds between their
    appearance timestamps.

    Parameters
    ----------
    word
        A data frame with the columns ``word`` and ``appearance_timestamp``.
    submission_state
        A data frame indexed by keystroke timestamp with the column
        ``displayed_word``.

    Returns
    -------
    float
        The mean of the word-to-word slopes.
    """

    def _mean_key_gap(keystrokes: pd.DataFrame):
        """Return the mean interval between the keystrokes of one word."""
        gaps = keystrokes.index.to_series().diff().dt.total_seconds()
        return gaps.mean()

    # One mean interval per displayed word.
    speed_per_word = pd.DataFrame(
        submission_state.groupby("displayed_word").apply(_mean_key_gap),
        columns=["mean_char_speed"],
    )

    # Index the appearance timestamps by word so both frames share a key.
    appearance_per_word = (
        word[["appearance_timestamp", "word"]].copy().set_index("word")
    )

    # The inner join keeps only the words present in both frames.
    joined = pd.merge(
        appearance_per_word, speed_per_word, left_index=True, right_index=True
    )

    speed_change = joined["mean_char_speed"].diff()
    seconds_between_words = joined["appearance_timestamp"].diff().dt.total_seconds()
    return (speed_change / seconds_between_words).mean()