# Source code for niimpy.preprocessing.application

import numpy as np
import pandas as pd

import niimpy
from niimpy.preprocessing import battery as b
from niimpy.preprocessing import screen as s

# Names of the columns / index levels that identify a group when features
# are aggregated. Only the names actually present in a frame are used.
group_by_columns = {"user", "device", "app_group"}

def group_data(df):
    """Group the dataframe by the standard grouping columns it contains.

    Parameters
    ----------
    df: pandas.DataFrame
        Frame to group. Every column whose name is listed in
        ``group_by_columns`` is used as a grouping key.

    Returns
    -------
    pandas.core.groupby.DataFrameGroupBy
        ``df`` grouped by the matching columns.
    """
    # Take the keys in the frame's own column order instead of iterating a
    # set: string hashing is randomised per interpreter run, so the order of
    # ``list(set & set)`` (and therefore the level order and sort order of
    # every aggregated result) used to change from run to run.
    columns = [name for name in df.columns if name in group_by_columns]
    return df.groupby(columns)
def reset_groups(df):
    """Move the standard grouping levels from the index back into columns.

    Parameters
    ----------
    df: pandas.DataFrame
        Frame whose index may contain levels named after entries of
        ``group_by_columns``.

    Returns
    -------
    pandas.DataFrame
        ``df`` with those index levels reset; all other index levels are
        left in place.
    """
    index_level_names = set(df.index.names)
    group_levels = [
        name for name in group_by_columns if name in index_level_names
    ]
    return df.reset_index(group_levels)
# Default mapping from application name (as reported by the device) to the
# app group used by the feature functions. Keys are matched exactly, so
# localised names and spelling variants are listed separately.
# Each key is listed once: the original literal repeated several keys (always
# with the same group), which only hid potential conflicts.
MAP_APP = {
    # sports
    "CrossCycle": "sports",
    "Runtastic": "sports",
    "Polar Flow": "sports",
    "Pedometer - Step Counter": "sports",
    "STAMINA-tila": "sports",
    "Fit": "sports",
    "Modo STAMINA": "sports",
    "7 MINUTE WORKOUT": "sports",
    "Moves": "sports",
    "Six Pack in 30 Days": "sports",
    "Bodyweight": "sports",
    "Sports Tracker": "sports",
    "Pedometer Step Counter": "sports",
    "MyFitnessPal": "sports",
    "Endomondo": "sports",
    "Health Mate": "sports",
    "Upsi": "sports",
    # games
    "Mahjong": "games",
    "Solitaire": "games",
    "Solitaire Collection": "games",
    "Paradise Island 2": "games",
    "Steam": "games",
    "Hidden City": "games",
    "Dokkan Battle": "games",
    "Super Mario Run": "games",
    "Candy Crush Saga": "games",
    "Jeopardy!": "games",
    "Clash Royale": "games",
    "Calcy IV": "games",
    "QuizTaisto PREMIUM": "games",
    "PlayStation": "games",
    "Cleopatra Jewels": "games",
    "DraStic": "games",
    "XCOM": "games",
    "The Secret Society": "games",
    "Pokémon GO": "games",
    "Hearthstone": "games",
    "I Love Hue": "games",
    "Destiny": "games",
    "Castle Cats": "games",
    "Pocket Camp": "games",
    "Hatch": "games",
    "1010!": "games",
    "AirConsole": "games",
    "Sanapala": "games",
    "Head Ball 2": "games",
    "Wordfeud FREE": "games",
    "Pyramid Solitaire Saga": "games",
    "Match and Explore": "games",
    "Twitch": "games",
    "Game Launcher": "games",
    "Hay Day": "games",
    "State of Survival": "games",
    "Wordfeud Free": "games",
    # communication
    "Sähköposti": "comm",
    "Gmail": "comm",
    "Puhelin – puheluiden hallinta": "comm",
    "Teléfono": "comm",
    "Email": "comm",
    "Outlook": "comm",
    "Skype": "comm",
    "Romantic love messages": "comm",
    "Dialer": "comm",
    "Discord": "comm",
    "WhatsApp": "comm",
    "Telegram": "comm",
    "Phone": "comm",
    "Messages": "comm",
    "Messenger Lite": "comm",
    "Puhelin": "comm",
    "Mensajería": "comm",
    "Numerovalitsin": "comm",
    "Messenger": "comm",
    "LINE": "comm",
    "Dual Messenger": "comm",
    "Telegeram": "comm",
    "Googlen tekstistä puheeksi -moottori": "comm",
    "LINE Camera": "comm",
    "Signal": "comm",
    "Viber": "comm",
    "Viestit": "comm",
    "Amino": "comm",
    "Fonecta Caller": "comm",
    "ICE - In Case of Emergency": "comm",
    "Orbot": "comm",
    "Puhelu": "comm",
    "Puhelutallennin": "comm",
    "TikTok": "comm",
    "MysticMessenger": "comm",
    # social media
    "Pinterest": "socialmedia",
    "Tumblr": "socialmedia",
    "Snapchat": "socialmedia",
    "Twitter": "socialmedia",
    "Hootsuite": "socialmedia",
    "We Heart It": "socialmedia",
    "Instagram": "socialmedia",
    "Jodel": "socialmedia",
    "happn": "socialmedia",
    "LinkedIn": "socialmedia",
    "Facebook": "socialmedia",
    "Tinder": "socialmedia",
    "SDP Kansalaispaneeli": "socialmedia",
    "Grindr": "socialmedia",
    "ROMEO UNCUT": "socialmedia",
    # news
    "Geo News": "news",
    "Helsingin Sanomat": "news",
    "Yle Areena": "news",
    "Uutisvahti": "news",
    "Flipboard": "news",
    "Kauppalehti": "news",
    "Ilta-Sanomat": "news",
    "Iltalehti": "news",
    "mtv": "news",
    "upday": "news",
    "MTV Uutiset": "news",
    "Sää": "news",
    "Weather": "news",
    # travel
    "Booking.com Hotellit": "travel",
    "Airbnb": "travel",
    "Booking.com": "travel",
    "TripAdvisor": "travel",
    "Couchsurfing": "travel",
    "Bonusway": "travel",
    "TUI Suomi": "travel",
    "Norwegian": "travel",
    # shopping
    "OPSkin": "shop",
    "Iso Omena": "shop",
    "Lunchie Market": "shop",
    "AliExpress": "shop",
    "Frank App": "shop",
    "Hesburger": "shop",
    "MobilePay": "shop",
    "Zalando": "shop",
    "WeShare": "shop",
    "Wish": "shop",
    "eBay": "shop",
    "Aktia Wallet": "shop",
    "S-mobiili": "shop",
    "Klarna": "shop",
    "PINS": "shop",
    "McDonalds": "shop",
    "K-Ruoka": "shop",
    "Wrapp": "shop",
    "Wolt": "shop",
    "Ticketmaster": "shop",
    "H&M": "shop",
    "EspressoHouse": "shop",
    "ResQ Club": "shop",
    "Momotoko": "shop",
    "Pivo": "shop",
    # work
    "Sheets": "work",
    "Slack": "work",
    "My Files": "work",
    "Dropbox": "work",
    "Moodle": "work",
    "Knudge.me": "work",
    "Wilma": "work",
    "Docs": "work",
    "Zoom": "work",
    "Teams": "work",
    "KDE Connect": "work",
    "Linkity Pro": "work",
    "Timely": "work",
    "OneDrive": "work",
    # transport
    "Uber": "transport",
    "VR Lähijunat": "transport",
    "HSL": "transport",
    "HSL Mobiililippu": "transport",
    "CityTrack": "transport",
    # leisure
    "Podcast Player": "leisure",
    "Samsung Music": "leisure",
    "Google Play Music": "leisure",
    "Shazam": "leisure",
    "Photos": "leisure",
    "Player FM": "leisure",
    "Crowst": "leisure",
    "Leffapeli": "leisure",
    "WEBTOON": "leisure",
    "Tarot Reading": "leisure",
    "Duolingo": "leisure",
    "Crunchyroll": "leisure",
    "SoundHound": "leisure",
    "LiveTulokset": "leisure",
    "Youtify": "leisure",
    "Kuvakaappaus": "leisure",
    "Tarot Universe": "leisure",
    "Norstat": "leisure",
    "Enkeli-tarot": "leisure",
    "Podcast Republic": "leisure",
    "Audiobooks": "leisure",
    "9GAG": "leisure",
    "Netflix": "leisure",
    "Pornhub": "leisure",
    "Musiikki": "leisure",
    "YouTube": "leisure",
    "Imgur": "leisure",
    "Google-sovellus": "leisure",
    "Chrome": "leisure",
    "YouTube Music": "leisure",
    "Peel Remote": "leisure",
    "Music Center": "leisure",
    "SoundCloud": "leisure",
    "Spotify": "leisure",
    "Google Play Musiikki": "leisure",
    "MadLipz": "leisure",
    "HAVEN KBH": "leisure",
    "Internet": "leisure",
    "Podcast Go": "leisure",
    "TuneIn Radio": "leisure",
    "pixiv": "leisure",
    "Pic Collage": "leisure",
    "Radio": "leisure",
    "myTuner Free": "leisure",
    "FaceApp": "leisure",
    "Libby": "leisure",
    "Headspace": "leisure",
    "BookBeat": "leisure",
    "Edge": "leisure",
    "Google": "leisure",
    "Nextory": "leisure",
    # system
    "Android System": "system",
    "Android system": "system",
    "Android-järjestelmä": "system",
    "Android-süsteem": "system",
    "Download Manager": "system",
    "Järj. UI": "system",
    "Käyttöliitt.": "system",
    "Latauksen hallinta": "system",
    "Lataustenhallinta": "system",
    "System UI": "system",
    "Ohjelmistopäivitys": "system",
    "Optimoija": "system",
    # security
    "Avast Mobile Security": "security",
    "Elisa Turvapaketti": "security",
    "F-Secure SAFE": "security",
    "Freedome": "security",
    "Telia Turvapaketti": "security",
    "McAfee Security": "security",
    # utility
    "Camera": "utility",
    "Clock": "utility",
    "Galleria": "utility",
    "Google Play Kauppa": "utility",
    "Google Play Palvelut": "utility",
    "Google Play Store": "utility",
    "Galaxy Store": "utility",
    "Kalenteri": "utility",
    "Kamera": "utility",
    "Kello": "utility",
    "Kuvat": "utility",
    # NOTE(review): this key looks like a mis-decoded "Smart<zero-width
    # space>Things"; kept verbatim because it must match the stored data.
    # Confirm against the raw application names before changing it.
    "Smartâ\\x80\\x8bThings": "utility",
    "Maps": "utility",
    "Samsung capture": "utility",
    # wellbeing
    "Daylio": "wellbeing",
    "MoMoMood": "wellbeing",
}
def classify_app(df, config):
    """Label every row of ``df`` with the app group of its application.

    Helper for the application feature functions. A column ``app_group`` is
    written to ``df`` (the input frame itself is modified and returned):
    rows whose application name is a key of the group map get the mapped
    group, all other rows get ``'na'``.

    Parameters
    ----------
    df: pandas.DataFrame
        Input data frame.
    config: dict
        Optional settings:

        - ``app_column_name``: name of the column holding the application
          name. Defaults to ``"application_name"``.
        - ``group_map``: dict mapping application name to app group
          (e.g. ``{'my_app': 'my_app_group'}``). Defaults to ``MAP_APP``.

    Returns
    -------
    df: pandas.DataFrame
        The input frame with the ``app_group`` column added.
    """
    assert isinstance(df, pd.DataFrame), "df is not a pandas dataframe."
    assert isinstance(config, dict), "config is not a dictionary"

    col_name = config.get("app_column_name", "application_name")
    if "group_map" in config:
        group_map = config["group_map"]
    else:
        group_map = MAP_APP

    # One vectorised lookup instead of one boolean-mask assignment per map
    # entry. The previous ``df.app_group[mask] = value`` was chained
    # assignment, which pandas warns about and which has no effect at all
    # under Copy-on-Write. A plain dict copy is used so only exact key
    # matches are mapped, as before.
    lookup = dict(group_map.items())
    df["app_group"] = df[col_name].map(lookup).fillna("na")
    return df
def _resolve_config(config):
    """Return a private copy of ``config`` with the shared defaults filled in.

    The caller's dictionary is never modified, so defaults written here do
    not leak into later calls or into module-level feature tables.
    """
    assert config is None or isinstance(config, dict), "config is not a dictionary"
    resolved = {} if config is None else dict(config)
    resolved.setdefault("group_map", MAP_APP)
    # "30min" is the non-deprecated spelling of the old "30T" alias.
    resolved.setdefault("resample_args", {"rule": "30min"})
    return resolved


def _append_off_events(df2, events, columns):
    """Append ``events`` to ``df2``, label their rows 'off', keep ``columns``."""
    if isinstance(events.index, pd.MultiIndex):
        events = events.reset_index().set_index("index")
    merged = pd.concat([df2, events])
    merged = merged.sort_values(by=["user", "device", "datetime"])
    # Rows coming from ``events`` have no app group yet. Assign the filled
    # column back instead of a chained ``inplace`` fill, which is a no-op
    # under pandas Copy-on-Write.
    merged["app_group"] = merged["app_group"].fillna("off")
    return merged[columns]


def _insert_off_periods(df2, bat, screen, config, columns):
    """Add screen-off / shutdown rows to the classified app events.

    Screen data is used when available; battery data is used when there is
    no screen data.
    """
    if not screen.empty:
        screen = s.screen_off(screen, bat, config)
        df2 = _append_off_events(df2, screen, columns)
    # ``screen`` is deliberately re-tested after the call above: when
    # ``screen_off`` yields nothing the battery data is used as well, which
    # is what the original statement order did.
    if screen.empty and not bat.empty:
        # NOTE(review): -1 / -2 are presumably the battery-status codes for
        # shutdown / reboot — confirm against ``battery.shutdown_info``.
        shutdown = b.shutdown_info(bat, config).replace([-1, -2], "off")
        df2 = _append_off_events(df2, shutdown, columns)
    return df2[columns]


def app_count(df, bat, screen, config=None):
    """Count how many times each app group has been used per time window.

    The app groups are defined by a dictionary within ``config``. Examples
    of app groups are social media, sports, games, etc. If no mapping is
    given, ``MAP_APP`` is used. If no resampling window is given, a 30 min
    window is used. Counts are aggregated by the grouping columns present
    (user, device, app group) and by time window.

    Parameters
    ----------
    df: pandas.DataFrame
        Input data frame. Note that ``classify_app`` adds an ``app_group``
        column to it.
    bat: pandas.DataFrame
        Dataframe with the battery information. If no data is available, an
        empty dataframe should be passed.
    screen: pandas.DataFrame
        Dataframe with the screen information. If no data is available, an
        empty dataframe should be passed.
    config: dict, optional
        Optional arguments. ``group_map`` holds the app-name to app-group
        mapping, ``app_column_name`` the column with the application name
        (default ``"application_name"``), and ``resample_args`` the keyword
        arguments forwarded to ``resample`` (default ``{"rule": "30min"}``).
        The dictionary is also forwarded to the screen and battery helpers.
        It is not modified.

    Returns
    -------
    result: pandas.DataFrame or None
        Frame with a ``count`` column, or ``None`` when no rows remain after
        dropping incomplete ones.
    """
    assert isinstance(df, pd.DataFrame), "Please input data as a pandas DataFrame type"
    assert isinstance(bat, pd.DataFrame), "Please input data as a pandas DataFrame type"
    assert isinstance(screen, pd.DataFrame), "Please input data as a pandas DataFrame type"
    config = _resolve_config(config)

    # Honour a custom application column here too; classify_app already does.
    app_column = config.get("app_column_name", "application_name")
    columns = ["user", "device", "datetime", "app_group", app_column]

    df2 = classify_app(df, config)
    # Insert missing data due to the screen being off or battery depleted.
    df2 = _insert_off_periods(df2, bat, screen, config, columns)
    df2 = df2.dropna()
    if df2.empty:
        return None

    df2 = df2.assign(datetime=pd.to_datetime(df2["datetime"])).set_index("datetime")
    result = group_data(df2)["app_group"].resample(**config["resample_args"]).count()
    result = pd.DataFrame(result).rename(columns={"app_group": "count"})
    return reset_groups(result)


def app_duration(df, bat, screen, config=None):
    """Sum the duration of use of each app group per time window.

    The app groups are defined by a dictionary within ``config``. Examples
    of app groups are social media, sports, games, etc. If no mapping is
    given, ``MAP_APP`` is used. If no resampling window is given, a 30 min
    window is used. Durations (in seconds) are aggregated by the grouping
    columns present (user, device, app group) and by time window.

    Parameters
    ----------
    df: pandas.DataFrame
        Input data frame. Note that ``classify_app`` adds an ``app_group``
        column to it.
    bat: pandas.DataFrame
        Dataframe with the battery information. If no data is available, an
        empty dataframe should be passed.
    screen: pandas.DataFrame
        Dataframe with the screen information. If no data is available, an
        empty dataframe should be passed.
    config: dict, optional
        Optional arguments. ``group_map`` holds the app-name to app-group
        mapping, ``app_column_name`` the column with the application name
        (default ``"application_name"``), and ``resample_args`` the keyword
        arguments forwarded to ``resample`` (default ``{"rule": "30min"}``).
        The dictionary is also forwarded to the screen and battery helpers.
        It is not modified.

    Returns
    -------
    result: pandas.DataFrame or None
        Frame with a ``duration`` column in seconds, or ``None`` when no
        rows remain.
    """
    assert isinstance(df, pd.DataFrame), "Please input data as a pandas DataFrame type"
    assert isinstance(bat, pd.DataFrame), "Please input data as a pandas DataFrame type"
    assert isinstance(screen, pd.DataFrame), "Please input data as a pandas DataFrame type"
    # ``config=None`` (the declared default) used to fail the dict assertion.
    config = _resolve_config(config)

    columns = ["user", "device", "time", "datetime", "app_group"]

    df2 = classify_app(df, config)
    # Insert missing data due to the screen being off or battery depleted.
    df2 = _insert_off_periods(df2, bat, screen, config, columns)
    if df2.empty:
        return None

    # An event lasts until the next row starts.
    # NOTE(review): the difference is taken over the whole frame, so the last
    # event of one user/device is measured against the first event of the
    # next one (and rows are only sorted when off-events were merged).
    # Confirm whether this should be computed per user/device.
    df2 = df2.assign(duration=df2["datetime"].diff().shift(-1))

    # Discard any datapoints whose duration is more than 10 hours because
    # they may be artifacts.
    thr = pd.Timedelta("10 hours")
    df2 = df2[~(df2["duration"] > thr)]

    df2 = df2.assign(duration=df2["duration"].dt.total_seconds()).dropna()
    if df2.empty:
        return None

    df2 = df2.assign(datetime=pd.to_datetime(df2["datetime"])).set_index("datetime")
    result = group_data(df2)["duration"].resample(**config["resample_args"]).sum()
    return reset_groups(pd.DataFrame(result))
# Default feature table: every module-level object whose name starts with
# "app_" is treated as a feature function and paired with an empty config.
# The namespace is snapshotted first so it is not iterated while being read.
ALL_FEATURES = {
    obj: {}
    for name, obj in list(globals().items())
    if name.startswith('app_')
}
def extract_features_app(df, bat, screen, features=None):
    """Compute and combine the selected application features.

    Each feature function is called as ``feature(df, bat, screen, config)``
    and the results are joined column-wise on their time index plus the
    grouping columns they contain (user, device, app group). If no time
    window is specified in a feature's config, the feature functions use
    30 min non-overlapping windows; if no ``group_map`` is provided they use
    the default one. By default all features in ``ALL_FEATURES`` are
    computed (``app_count`` and ``app_duration``).

    Parameters
    ----------
    df: pandas.DataFrame
        Input data frame.
    bat: pandas.DataFrame
        Battery data, forwarded to every feature function.
    screen: pandas.DataFrame
        Screen data, forwarded to every feature function.
    features: dict, optional
        Maps each feature function to the config dictionary passed to it.
        If none is given, all features are computed.

    Returns
    -------
    result: pandas.DataFrame
        Combined features, indexed by the original (datetime) index with the
        grouping levels moved back to columns. Empty when no feature
        produced any data.
    """
    assert isinstance(df, pd.DataFrame), "Please input data as a pandas DataFrame type"

    if features is None:
        features = ALL_FEATURES
    else:
        assert isinstance(features, dict), "Please input the features as a dictionary"

    computed_features = []
    for feature, feature_arg in features.items():
        print(f'computing {feature}...')
        computed_feature = feature(df, bat, screen, feature_arg)
        # Feature functions return None when they have no rows to report;
        # skip those instead of failing on ``None.columns``.
        if computed_feature is None:
            continue
        # Frame column order (not set order) keeps the index levels in the
        # same, reproducible order for every feature.
        index_by = [
            name for name in computed_feature.columns if name in group_by_columns
        ]
        computed_features.append(computed_feature.set_index(index_by, append=True))

    if not computed_features:
        # pd.concat refuses an empty list; report "no data" as an empty frame.
        return pd.DataFrame()

    combined = pd.concat(computed_features, axis=1)
    # Index the result only by the original index (datetime).
    return reset_groups(combined)