Source code for niimpy.exploration.eda.punchcard

# -*- coding: utf-8 -*-
"""
Created on Thu Nov 18 16:14:47 2021

@author: arsii
"""
import pandas as pd
import numpy as np
import plotly.express as px

def get_timerange_(df, resample):
    """Build a regular DatetimeIndex spanning the dataframe's time range.

    The smallest and largest index values of ``df`` are formatted as
    strings truncated to the requested resolution (day or hour) and a
    ``pd.date_range`` is generated between them. The format strings carry
    no time zone directive, so the range is built from plain date/hour
    strings.

    Parameters
    ----------
    df : Pandas Dataframe
        Dataframe containing the data. Its index must support ``min()``,
        ``max()`` and ``strftime`` on the resulting values.
    resample : str
        Resample parameter: 'D' for daily or 'H' for hourly resolution.

    Returns
    -------
    date_index : pd.DatetimeIndex
        Resampled DatetimeIndex
    """
    resample_options = ['D', 'H']
    assert isinstance(df, pd.DataFrame), "df is not a pandas dataframe"
    assert isinstance(resample, str), "resample is not a string"
    assert (resample in resample_options), f"resample option: {resample} is not valid. Available options: {resample_options}."

    # Each supported frequency maps to the string format used to truncate
    # the boundary timestamps to that resolution.
    boundary_formats = {'D': '%Y-%m-%d', 'H': '%Y-%m-%d-%H'}
    stamp_format = boundary_formats[resample]

    first_stamp = df.index.min()
    last_stamp = df.index.max()

    return pd.date_range(
        start=first_stamp.strftime(stamp_format),
        end=last_stamp.strftime(stamp_format),
        freq=resample,
    )
def combine_dataframe_(df, user_list, columns, res, date_index, agg_func=np.mean):
    """Resample values from multiple users into a new dataframe.

    For every user in ``user_list`` the rows whose ``'user'`` column equals
    that user are selected, ``columns`` are resampled with ``res`` and
    aggregated with ``agg_func``. The result is aligned to ``date_index``
    and stored in the output under a column named after the user.

    Parameters
    ----------
    df : Pandas Dataframe
        Dataframe containing the data. Must have a ``'user'`` column and an
        index that supports ``resample``.
    user_list : list
        List containing user names/id's (str)
    columns : list
        List of column names (str) to be combined. The per-user result is
        assigned to a single output column, so this is expected to hold
        exactly one column name.
    res : str
        Resample parameter e.g., 'D' for resampling by day
    date_index : pd.DatetimeIndex
        Date range used as the index of the returned dataframe. Bins that
        do not appear in ``date_index`` are dropped; entries of
        ``date_index`` without data become NaN.
    agg_func : numpy function
        Aggregation function used with resample. The default is np.mean

    Returns
    -------
    df_comb : pd.DataFrame
        Resampled and combined dataframe, one column per user.
    """
    assert isinstance(df, pd.DataFrame), "df is not a pandas dataframe."
    assert isinstance(user_list, list), "user_list is not a list."
    assert isinstance(columns, list), "columns is not a list"
    assert isinstance(res, str), "res is not a string."
    assert isinstance(date_index, pd.core.indexes.datetimes.DatetimeIndex), "date_index is not a DatetimeIndex."

    df_comb = pd.DataFrame(index=date_index)

    for u in user_list:
        df_temp = df[df['user'] == u][columns].resample(res).agg(agg_func)
        # Round-trip the bin labels through strings to obtain plain
        # timestamps comparable with date_index. The time of day is kept:
        # formatting with the date alone collapsed every sub-daily bin
        # (e.g. hourly) onto its day, producing duplicate labels that made
        # the reindex below fail. Daily or coarser bins sit at midnight, so
        # their labels are the same as with a date-only format.
        df_temp.index = pd.to_datetime(df_temp.index.strftime('%Y-%m-%d %H:%M:%S'))
        df_temp = df_temp.reindex(date_index)
        df_comb[u] = df_temp

    return df_comb
def punchcard_(df, title, n_xticks, xtitle, ytitle):
    """Draw a dataframe as a punchcard (heatmap-style) figure.

    The dataframe is rendered with ``plotly.express.imshow``; the y axis is
    forced to a categorical axis with ticks on the labels and grid lines
    are switched off on both axes.

    Parameters
    ----------
    df : Pandas Dataframe
        Dataframe containing the data
    title : str
        Plot title.
    n_xticks : int or None
        Number of xaxis ticks. If None (or any other falsy value such as
        0), the tick count is not set and plotly scales it automatically.
    xtitle : str
        Plot xaxis title
    ytitle : str
        Plot yaxis title

    Returns
    -------
    fig : plotly.graph_objs._figure.Figure
        Punchcard plot
    """
    assert isinstance(df, pd.DataFrame), "df is not a pandas dataframe."
    assert isinstance(title, str), "title is not a string."
    assert isinstance(n_xticks, (int, type(None))), "n_ticks is not an integer or None"
    assert isinstance(xtitle, str), "xtitle is not a string."
    assert isinstance(ytitle, str), "ytitle is not a string."

    axis_labels = {'x': xtitle, 'y': ytitle, 'color': 'Value'}
    fig = px.imshow(df, aspect='auto', labels=axis_labels)

    # The tick count is only forwarded when the caller asked for one.
    layout_options = {'title': title, 'xaxis_title': xtitle, 'yaxis_title': ytitle}
    if n_xticks:
        layout_options['xaxis_nticks'] = n_xticks
    fig.update_layout(**layout_options)

    fig.update_yaxes(tickson="labels", type='category', showgrid=False)
    fig.update_xaxes(showgrid=False)
    return fig
def _min_max_scale_(frame):
    """Min-max scale each column of ``frame``: (x - min) / (max - min)."""
    lowest = frame.min()
    return (frame - lowest) / (frame.max() - lowest)


def punchcard_plot(df, user_list=None, columns=None, title="Punchcard Plot",
                   resample='D', normalize=False, agg_func=np.mean,
                   timerange=False):
    """Punchcard plot for given users and columns with optional resampling.

    Three layouts are produced depending on the selection:

    * one user, one column: month (rows) by day-of-month (columns);
    * one user, several columns: one heatmap column per data column,
      indexed by the resampled timestamps;
    * any other number of users: one row per user over a common date
      range (expects a single column).

    Parameters
    ----------
    df : Pandas Dataframe
        Dataframe containing the data. Must have a ``'user'`` column and an
        index that supports ``resample``.
    user_list : list, optional
        List containing user id's as string. The default is None, in which
        case every distinct non-null value of ``df['user']`` is used.
    columns : list, optional
        List containing columns as strings. The default is None, in which
        case every numeric column other than ``'user'`` is used.
    title : str, optional
        Plot title. The default is "Punchcard Plot".
    resample : str, optional
        Indicator for resampling frequency. The default is 'D' (day).
        The multi-user layout builds its date range with ``get_timerange_``,
        which accepts only 'D' and 'H'.
    normalize : boolean, optional
        If true, data is normalized using min-max-scaling.
        The default is False.
    agg_func : numpy function
        Aggregation function used with resample. The default is np.mean
    timerange : boolean or tuple, optional
        If false, timerange is not filtered. If a tuple ``(start, end)``,
        the combined data is sliced to that range. The filter is applied
        only in the multi-user layout; the single-user layouts do not use
        it. The default is False.

    Returns
    -------
    fig : plotly.graph_objs._figure.Figure
        Punchcard plot
    """
    assert isinstance(df, pd.DataFrame), "df is not a pandas dataframe."
    assert isinstance(user_list, (list, type(None))), "user_list is not a list or None."
    assert isinstance(columns, (list, type(None))), "columns is not a list or None"
    assert isinstance(title, str), "title is not a string."
    assert isinstance(resample, str), "resample is not a string."
    assert callable(agg_func), "agg_function is not a callable."
    assert isinstance(normalize, bool), "normalize is not a boolean."
    assert isinstance(timerange, (bool, tuple)), "timerange is not a boolean or tuple."

    # None is an accepted (and default) value for both selections; resolve
    # it here so the len() checks below do not fail on NoneType.
    if user_list is None:
        user_list = list(df['user'].dropna().unique())
    if columns is None:
        columns = [
            col for col in df.columns
            if col != 'user' and pd.api.types.is_numeric_dtype(df[col])
        ]

    if len(user_list) == 1:
        # one user, one column
        if len(columns) == 1:
            df_sel = df[df['user'] == user_list[0]][[columns[0]]].resample(resample).agg(agg_func)
            if normalize:
                df_sel[columns] = _min_max_scale_(df_sel[columns])
            # Rows are months, columns are days of the month; entries that
            # share a month and day are combined by pivot_table.
            fp = pd.pivot_table(df_sel,
                                index=df_sel.index.month,
                                values=columns[0],
                                columns=df_sel.index.day)
            return punchcard_(fp, title, n_xticks=31, xtitle='Day', ytitle='Month')

        # one user, multiple columns
        df_user = df[df['user'].isin(user_list)]
        selected = []
        for col in columns:
            resampler = df_user[col].resample(resample)
            if pd.api.types.is_numeric_dtype(df[col]):
                selected.append(resampler.agg(agg_func))
            else:
                # Non-numeric columns cannot be aggregated numerically;
                # the first value of each bin is taken instead.
                selected.append(resampler.first())
        df_sel = pd.concat(selected, axis=1)
        if normalize:
            df_sel[columns] = _min_max_scale_(df_sel[columns])
        return punchcard_(df_sel, title, n_xticks=None, xtitle='Column', ytitle='Date')

    # multiple users, one column
    date_index = get_timerange_(df, resample)
    df_comb = combine_dataframe_(df, user_list, columns, resample, date_index, agg_func)
    if normalize:
        df_comb = _min_max_scale_(df_comb)
    if timerange:
        df_comb = df_comb.loc[timerange[0]:timerange[1]]
    return punchcard_(df_comb.transpose(), title, n_xticks=None, xtitle='Date', ytitle='User')