# Source code for niimpy.preprocessing.survey

# Utilities for dealing with survey data

import pandas as pd
import numpy as np


# Below, we provide some mappings between standardized survey raw questions and their respective codes
# You will need to adjust these mappings to your own needs if your questions do not match with these values.

# PHQ2 (Patient Health Questionnaire): raw question text -> question id.
# Reference: https://en.wikipedia.org/wiki/Patient_Health_Questionnaire
# The ids are numbered in the order the questions are listed.
PHQ2_MAP = {
    question: 'PHQ2_{}'.format(number)
    for number, question in enumerate((
        'Little interest or pleasure in doing things',
        'Feeling down depressed or hopeless',
    ), start=1)
}

# PHQ9 (Patient Health Questionnaire): raw question text -> question id.
# Reference: https://en.wikipedia.org/wiki/PHQ-9
# The ids are numbered in the order the questions are listed.
# NOTE(review): the PHQ9_8 text contains a '.', which
# clean_survey_column_names removes from column names -- confirm whether
# columns are renamed before or after cleaning.
PHQ9_MAP = dict(zip(
    (
        'Little interest or pleasure in doing things',
        'Feeling down depressed or hopeless',
        'Trouble falling or staying asleep or sleeping too much',
        'Feeling tired or having little energy',
        'Poor appetite or overeating',
        'Feeling bad about yourself or that you are a failure or have let yourself or your family down',
        'Trouble concentrating on things such as reading the newspaper or watching television',
        'Moving or speaking so slowly that other people could have noticed. Or the opposite being so fidgety or restless that you have been moving around a lot more than usual',
        'Thoughts that you would be better off dead or of hurting yourself',
    ),
    ("PHQ9_{}".format(number) for number in range(1, 10)),
))

# PSQI (Pittsburgh Sleep Quality Index): raw question text -> question id.
# Reference: https://en.wikipedia.org/wiki/Pittsburgh_Sleep_Quality_Index
# The ids are numbered in the order the questions are listed.
# NOTE(review): several keys contain punctuation ('?', '(', ')', ':', '.')
# that clean_survey_column_names removes from column names -- confirm whether
# columns are renamed before or after cleaning.
PSQI_MAP = {
    question: 'PSQI_{}'.format(number)
    for number, question in enumerate((
        'Currently is your sleep typically interrupted? (For example for attending to a child or due to loud neighbours or medical reasons.)',
        'During the past month how often have you taken medicine (prescribed or “over the counter”) to help you sleep',
        'During the past month how often have you had trouble staying awake while driving eating meals or engaging in social activity',
        'During the past month how much of a problem has it been for you to keep up enthusiasm to get things done',
        'During the past month how would you rate your sleep quality overall',
        'When have you usually gone to bed? (hh:mm)',
        'What time have you usually gotten up in the morning? (hh:mm)',
        'How long (in minutes) has it taken you to fall asleep each night',
        'How many hours of actual sleep did you get at night',
    ), start=1)
}

# PSS-10 (Perceived Stress Scale): raw question text -> question id.
# Reference: https://en.wikipedia.org/wiki/Perceived_Stress_Scale
# The ids are numbered in the order the questions are listed.
PSS10_MAP = dict(zip(
    (
        'In the last month how often have you been upset because of something that happened unexpectedly',
        'In the last month how often have you felt that you were unable to control the important things in your life',
        'In the last month how often have you felt nervous and “stressed”',
        'In the last month how often have you felt confident about your ability to handle your personal problems',
        'In the last month how often have you felt that things were going your way',
        'In the last month how often have you been able to control irritations in your life',
        'In the last month how often have you felt that you were on top of things',
        'In the last month how often have you been angered because of things that were outside of your control',
        'In the last month how often have you felt difficulties were piling up so high that you could not overcome them',
        'In the last month how often have you found that you could not cope with all the things that you had to do',
    ),
    ('PSS10_{}'.format(number) for number in range(1, 11)),
))

# PANAS (Positive and Negative Affect Schedule): raw item label -> item id.
# Reference: https://en.wikipedia.org/wiki/Positive_and_Negative_Affect_Schedule
# Every id is simply the lower-cased item label.
PANAS_MAP = {
    label: label.lower()
    for label in (
        'Upset',
        'Hostile',
        'Alert',
        'Ashamed',
        'Inspired',
        'Nervous',
        'Determined',
        'Attentive',
        'Afraid',
        'Active',
    )
}

# GAD2 (Generalized anxiety disorder questions): raw question text -> question id.
# Reference: https://en.wikipedia.org/wiki/Generalized_anxiety_disorder
# The ids are numbered in the order the questions are listed.
GAD2_MAP = {
    question: 'GAD2_{}'.format(number)
    for number, question in enumerate((
        'Feeling nervous anxious or on edge',
        'Not being able to stop or control worrying',
    ), start=1)
}

# Answer maps: textual questionnaire answer -> numerical score.
# Each score is the position of the answer in the listed order, starting at 0.
# Adjust these maps if the answers in your data are worded differently.
PSS_ANSWER_MAP = {
    answer: score
    for score, answer in enumerate((
        'never',
        'almost never',
        'sometimes',
        'fairly often',
        'very often',
    ))
}

# Lower-case answer wording.
PHQ2_ANSWER_MAP = dict(zip(
    ('not at all', 'several days', 'more than half the days', 'nearly every day'),
    range(4),
))

# Same answers as PHQ2_ANSWER_MAP, but with a capitalised first letter.
PHQ9_ANSWER_MAP = dict(zip(
    ("Not at all", "Several days", "More than half the days", "Nearly every day"),
    range(4),
))

# Prefix-keyed answer maps: use this with the prefix option so that several
# question ids sharing a prefix can be processed simultaneously.
ID_MAP_PREFIX = dict(
    PSS=PSS_ANSWER_MAP,
    PHQ2=PHQ2_ANSWER_MAP,
    GAD2=PHQ2_ANSWER_MAP,
)

# Explicit per-question answer maps: use this when every question id should be
# listed individually. Covers PSS10_1 ... PSS10_10.
ID_MAP = {
    'PSS10_{}'.format(number): PSS_ANSWER_MAP
    for number in range(1, 11)
}

# Column names that identify whose data a row belongs to. Only the ones that
# are actually present in a dataframe are used for grouping.
group_by_columns = set(["user", "device"])


def group_data(df):
    """Group the dataframe by the standard grouping columns it contains.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe to group.

    Returns
    -------
    grouped : pandas DataFrameGroupBy
        ``df`` grouped by every column listed in ``group_by_columns`` that
        is present in ``df``.
    """
    # Follow the dataframe's own column order instead of set iteration order,
    # so the grouping keys (and the resulting index levels) are the same on
    # every run.
    columns = [column for column in df.columns if column in group_by_columns]
    return df.groupby(columns)

def reset_groups(df):
    """Move the standard grouping levels from the index back into columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe whose index may contain levels named after entries of
        ``group_by_columns``.

    Returns
    -------
    df : pandas.DataFrame
        Result of ``df.reset_index`` applied to those levels only; other
        index levels are left in the index.
    """
    levels = [name for name in df.index.names if name in group_by_columns]
    return df.reset_index(level=levels)


def clean_survey_column_names(df):
    """Clean the column names of a survey dataframe.

    The characters ``. , : ; ! ? ( ) [ ] { }`` are removed from every column
    name, and each ``-``, ``_`` and ``—`` is replaced by a space.

    Parameters
    ----------
    df : pandas.DataFrame
        The input DataFrame with column names to be cleaned. Its columns are
        renamed in place.

    Returns
    -------
    df : pandas.DataFrame
        The same DataFrame object, with cleaned column names.
    """
    # One character-class pass per rule instead of one pass per character.
    # ``regex`` is given explicitly because its default differs between
    # pandas versions.
    cleaned = df.columns.str.replace(r"[.,:;!?()\[\]{}]", "", regex=True)
    cleaned = cleaned.str.replace(r"[-_—]", " ", regex=True)
    df.columns = cleaned
    return df


def convert_survey_to_numerical_answer(df, id_map, use_prefix=False):
    """Convert text answers into numerical values.

    Every selected column is first cleaned: the characters
    ``, : ; ! ? ( ) [ ] { }`` are removed and each ``-``, ``_`` and ``—`` is
    replaced by a space. The cleaned answers are then looked up in the
    answer map belonging to that column. Answers that are not found in the
    map become NaN. Columns that are not selected by ``id_map`` are left
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe with one column per question. The selected columns are
        overwritten in place and must contain strings.

    id_map : dict
        Answer map (value) for each question id (key), or for each question
        id prefix if ``use_prefix`` is True. Each answer map translates a
        cleaned answer text to its numerical value.

    use_prefix : bool
        If False (default), a key selects the column with exactly that name.
        If True, a key selects every column whose name starts with it, so
        that several questions (e.g. PSS10_1, PSS10_2, ...) can be converted
        with one map.

    Returns
    -------
    df : pandas.DataFrame
        The same DataFrame object, with the selected columns converted.
    """
    assert isinstance(df, pd.DataFrame), "df is not a pandas dataframe."
    assert isinstance(id_map, dict), "id_map is not a dictionary."
    assert isinstance(use_prefix, bool), "use_prefix is not a bool."

    for key, answer_map in id_map.items():
        if use_prefix:
            columns = [c for c in df.columns if c.startswith(key)]
        else:
            columns = [c for c in df.columns if c == key]
        for col in columns:
            # ``regex`` is given explicitly because its default differs
            # between pandas versions.
            answers = df[col].str.replace(r"[,:;!?()\[\]{}]", "", regex=True)
            answers = answers.str.replace(r"[-_—]", " ", regex=True)
            df[col] = answers.map(answer_map)
    return df

def survey_statistic(df, config):
    '''
    Return statistics for a single survey question or a list of questions.
    Assuming that each of the columns contains numerical values representing
    answers, this function returns the mean, minimum, maximum and standard
    deviation for each question in separate columns, computed per group
    (see ``group_data``) and per resampling bin.

    Parameters
    ----------
    df: pandas.DataFrame
        Input data frame. It is resampled, so it presumably needs a
        datetime-like index.
    config: dict
        Dictionary containing the arguments for the computation.

        configuration options include:
            columns: string or list(string), optional
                A column or a list of columns to process. If not given,
                the prefix is used to identify columns.
            prefix: string or list(string)
                Required unless columns is given. The function will process
                columns whose name starts with the prefix (QID_0, QID_1, ...)
            resample_args: dict, optional
                Keyword arguments passed to ``resample``. The default is
                ``{"rule": "1D"}``.

    Returns
    -------
    res: pandas.DataFrame
        A dataframe with the columns ``<question>_mean``, ``<question>_min``,
        ``<question>_max`` and ``<question>_std`` for each processed question.

    Raises
    ------
    ValueError
        If neither columns nor prefix is specified.
    '''

    columns = config.get('columns', None)
    prefix = config.get('prefix', None)
    resample_args = config.get('resample_args', {"rule": "1D"})

    assert isinstance(df, pd.DataFrame), "df is not a pandas dataframe."
    if columns is not None:
        assert isinstance(columns, (str, list)), "columns is not a string or a list of strings."
    if prefix is not None:
        assert isinstance(prefix, (str, list)), "prefix is not a string or a list of strings."
    if columns is None and prefix is None:
        raise ValueError("Either columns or prefix must be specified.")

    if columns is None:
        prefixes = [prefix] if isinstance(prefix, str) else prefix
        # Keep prefix-major order: all columns of the first prefix come first.
        columns = [
            c for pref in prefixes for c in df.columns if c.startswith(pref)
        ]
    elif isinstance(columns, str):
        columns = [columns]

    def calculate_statistic(group):
        # Summarise every selected question within one group / time bin.
        result = {}
        for answer_col in columns:
            result[answer_col + "_mean"] = group[answer_col].mean()
            result[answer_col + "_min"] = group[answer_col].min()
            result[answer_col + "_max"] = group[answer_col].max()
            result[answer_col + "_std"] = group[answer_col].std()
        return pd.Series(result)

    res = group_data(df).resample(**resample_args).apply(calculate_statistic)
    res = reset_groups(res)
    return res


def sum_survey_scores(df, survey_prefix=None):
    """Sum all columns (like ``PHQ9_*``) to get a survey score.

    Parameters
    ----------
    df: pandas DataFrame
        DataFrame with a "user" column and one numeric column per question,
        named like "PHQ9_1", "PHQ9_2", etc.

    survey_prefix: string or list of strings
        The survey prefix (or prefixes) of the question columns, e.g. 'PHQ9'.
        Required, despite the default of None. Every column whose name starts
        with the prefix is included; nothing is appended to the prefix, so
        'PHQ' selects both ``PHQ2_*`` and ``PHQ9_*`` columns.

    Returns
    -------
    survey_score: pandas DataFrame
        DataFrame with the "user" column and one column per prefix holding
        the row-wise sum of the matching question columns. Missing values
        are not skipped (``skipna=False``).
    """

    assert isinstance(survey_prefix, (str, list)), "survey_prefix is not a string or a list of strings."

    result = pd.DataFrame(df["user"])

    prefixes = [survey_prefix] if isinstance(survey_prefix, str) else survey_prefix

    for prefix in prefixes:
        columns = [c for c in df.columns if c.startswith(prefix)]
        result[prefix] = df[columns].sum(axis=1, skipna=False)

    return result


# Default feature set: every module-level object whose name starts with
# 'survey_' at this point of the module, each paired with an empty argument
# dictionary.
ALL_FEATURES = {
    feature: {}
    for feature_name, feature in list(globals().items())
    if feature_name.startswith('survey_')
}

def extract_features_survey(df, features=None):
    """Calculates survey features

    Parameters
    ----------
    df : pd.DataFrame
        dataframe of survey data. Must follow Niimpy format. In addition,
        each survey question must be in a single column and the column name
        must be formatted as survey-id_question-number (for example PHQ9_3).
    features : map (dictionary) of functions that compute features.
        The keys are the functions that compute features and each value is
        the argument dictionary passed to that function as its second
        argument. If there are no arguments use an empty map.
        Default is None. If None, all the available functions are used.
        Those functions are in the dict `survey.ALL_FEATURES`.
        You can implement your own function and use it instead or add it
        to the mentioned map.

    Returns
    -------
    features : pd.DataFrame
        Dataframe of computed features. The results of all feature functions
        are joined column-wise; the grouping columns listed in
        ``group_by_columns`` are returned as regular columns. If ``df`` has a
        'group' column, a 'group' column with each user's first group value
        is added.
    """
    if features is None:
        features = ALL_FEATURES
    else:
        assert isinstance(features, dict), "Please input the features as a dictionary"

    computed_features = []
    for feature_function, feature_arg in features.items():
        computed_feature = feature_function(df, feature_arg)
        # Use column order rather than set iteration order, so that every
        # feature gets its index levels in the same order and the frames
        # line up when concatenated.
        index_by = [
            column for column in computed_feature.columns
            if column in group_by_columns
        ]
        computed_feature = computed_feature.set_index(index_by, append=True)
        computed_features.append(computed_feature)

    computed_features = pd.concat(computed_features, axis=1)
    computed_features = computed_features.loc[:, ~computed_features.columns.duplicated()]

    if 'group' in df:
        user_groups = df.groupby('user')['group'].first()
        if 'user' in computed_features.index.names:
            # user_groups is indexed by user only, while the features carry a
            # multi-level index. Look the group up through the 'user' level
            # instead of relying on index alignment.
            users = computed_features.index.get_level_values('user')
            computed_features['group'] = users.map(user_groups).to_numpy()
        else:
            computed_features['group'] = user_groups

    computed_features = reset_groups(computed_features)
    return computed_features