Source code for niimpy.exploration.eda.countplot

# -*- coding: utf-8 -*-
"""
Created on Mon Nov  8 14:42:18 2021

@author: arsii
"""

import pandas as pd
import plotly.express as px

def get_counts(df, aggregation):
    """Calculate datapoint counts by group or by user.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to count. Must contain the column named by ``aggregation``.
    aggregation : str
        'group' to count rows per group, 'user' to count rows per user.

    Returns
    -------
    n_events : pandas.DataFrame
        One row per distinct key, with the key column (named after
        ``aggregation``) and a 'values' column holding the row count.

    Raises
    ------
    AssertionError
        If ``df`` is not a DataFrame or ``aggregation`` is not a string.
    ValueError
        If ``aggregation`` is neither 'group' nor 'user'.
    """
    assert isinstance(df, pd.DataFrame), "df is not a pandas dataframe."
    assert isinstance(aggregation, str), "aggregation is not a string"

    # Fail with a clear message instead of falling through both branches
    # and hitting an unbound local at the return statement.
    if aggregation not in ('group', 'user'):
        raise ValueError(
            "aggregation must be 'group' or 'user', got {!r}".format(aggregation)
        )

    # size() counts rows per key; reset_index(name=...) turns the resulting
    # Series into a two-column frame: <aggregation>, 'values'.
    return df.groupby(aggregation).size().reset_index(name='values')
def calculate_bins(df, binning):
    """Label every row of a dataframe with the time bin it falls into.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe whose index supports ``to_period`` (e.g. a DatetimeIndex).
    binning : str
        Period frequency string passed to ``index.to_period``.

    Returns
    -------
    bins : pandas.Series
        The period of each row, converted to its string representation.
        The series is indexed by the period index itself.
    """
    assert isinstance(df, pd.DataFrame), "df is not a pandas dataframe."
    assert isinstance(binning, str), "binning is not a string"

    period_index = df.index.to_period(binning)
    return period_index.to_series().astype(str)
def boxplot_(df, fig_title, points='outliers', y='values',
             xlabel="Group", ylabel="Count", binning=False):
    """Plot a boxplot, one box per group or per time bin.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing the data. Must have a 'group' column and the
        column named by ``y``. When ``binning`` is given, the index must
        support ``to_period`` (e.g. a DatetimeIndex).
    fig_title : str
        Plot title.
    points : str
        If 'all', show all observations next to boxplots.
        If 'outliers', show only outlying points.
        The default is 'outliers'.
    y : str
        A dataframe column to plot.
    xlabel : str
        Plot xlabel.
    ylabel : str
        Plot ylabel.
    binning : str or bool
        If False (default), boxes are drawn per group. If a string, it is
        used as a period frequency and boxes are drawn per time bin,
        coloured by group.

    Returns
    -------
    fig : plotly figure
        The figure produced by ``plotly.express.box``.

    Raises
    ------
    AssertionError
        If any argument has the wrong type.
    """
    assert isinstance(df, pd.DataFrame), "df is not a pandas dataframe."
    assert isinstance(points, str), "points is not a string"
    assert isinstance(y, str), "y is not a string"
    assert isinstance(fig_title, str), "Title is not a string."
    assert isinstance(xlabel, str), "Xlabel is not a string."
    assert isinstance(ylabel, str), "Ylabel is not a string."
    # The previous check `(binning, bool)` was a non-empty tuple and therefore
    # always true; this actually validates the type.
    assert isinstance(binning, (str, bool)), "binning is not a string or bool."

    if binning is not False:
        # assign() returns a new frame, so the caller's dataframe does not
        # silently gain a 'bin' column.
        plot_df = df.assign(bin=calculate_bins(df, binning).values)
        fig = px.box(plot_df,
                     x="bin",
                     y=y,
                     color="group",
                     points=points)
    else:
        fig = px.box(df,
                     x="group",
                     y=y,
                     color="group",
                     points=points)

    # plotly accepts "linear" (its default), "exclusive" or "inclusive".
    fig.update_traces(quartilemethod='inclusive')

    fig.update_layout(title=fig_title,
                      xaxis_title=xlabel,
                      yaxis_title=ylabel,
                      autotypenumbers='convert types')

    return fig
def barplot_(df, fig_title, xlabel, ylabel):
    """Draw a bar chart of observation counts, one bar per user.

    The dataframe must have a column named 'user' (plotted on the x axis
    and used for colouring) and a column named 'values' (bar heights).

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing the data.
    fig_title : str
        Plot title.
    xlabel : str
        Plot xlabel.
    ylabel : str
        Plot ylabel.

    Returns
    -------
    fig : plotly figure
        The figure produced by ``plotly.express.bar``.
    """
    assert isinstance(df, pd.DataFrame), "df is not a pandas dataframe."

    # Validate the three text arguments in signature order.
    text_checks = (
        (fig_title, "Title is not a string."),
        (xlabel, "Xlabel is not a string."),
        (ylabel, "Ylabel is not a string."),
    )
    for text, message in text_checks:
        assert isinstance(text, str), message

    bars = px.bar(df, x="user", y="values", color="user")
    layout = {
        "title": fig_title,
        "xaxis_title": xlabel,
        "yaxis_title": ylabel,
    }
    bars.update_layout(**layout)
    return bars
def countplot(df, fig_title, plot_type='count', points='outliers',
              aggregation='group', user=None, column=None, binning=False):
    """Create a plot comparing groups or individual users.

    Parameters
    ----------
    df : pandas.DataFrame
        A DataFrame to be visualized.
    fig_title : str
        The plot title.
    plot_type : str
        If 'count', plot observation count per group (boxplot) or by
        user (barplot).
        If 'value', plot observation values per group (boxplot).
        The default is 'count'.
    points : str
        Forwarded to the boxplot: 'all' shows all observations,
        'outliers' shows only outlying points. The default is 'outliers'.
    aggregation : str
        If 'group', plot group level summary.
        If 'user', plot user level summary (only with plot_type 'count').
        The default is 'group'.
    user : str, optional
        Type-checked but currently not used by this function.
        The default is None.
    column : str, optional
        Name of the column whose values are plotted when plot_type is
        'value'. Not used when plot_type is 'count'. The default is None.
    binning : str or bool
        Forwarded to the boxplot. If a string, boxes are drawn per time
        bin instead of per group. The default is False.

    Returns
    -------
    fig : plotly figure

    Raises
    ------
    AssertionError
        If an argument has the wrong type, or ``column`` is missing for
        a 'value' plot.
    ValueError
        If the ``plot_type`` / ``aggregation`` combination is not supported.
    """
    assert isinstance(df, pd.DataFrame), "df is not a pandas dataframe."
    assert isinstance(fig_title, str), "Title is not a string."
    assert isinstance(plot_type, str), "plot_type is not a string."
    assert isinstance(points, str), "points is not a string"
    assert isinstance(aggregation, str), "aggregation is not a string"
    assert ((isinstance(user, str)) or (user is None)), "user is not a string or None type."
    assert ((isinstance(column, str)) or (column is None)), "column in not a string or None type."

    # Plot counts
    if plot_type == 'count':
        if aggregation == 'group':
            n_events = get_counts(df, aggregation)
            # NOTE(review): the counts frame has a plain integer index, so a
            # non-False `binning` looks unsupported on this path -- confirm
            # the intended behaviour.
            return boxplot_(n_events,
                            fig_title,
                            points,
                            y='values',
                            xlabel="Group",
                            ylabel="Count",
                            binning=binning)
        if aggregation == 'user':
            n_events = get_counts(df, aggregation)
            return barplot_(n_events,
                            fig_title,
                            xlabel="User",
                            ylabel="Count")
        raise ValueError(
            "aggregation must be 'group' or 'user' when plot_type is "
            "'count', got {!r}".format(aggregation)
        )

    # Plot values
    if plot_type == 'value':
        if aggregation == 'group':
            assert column is not None, "column must be given when plot_type is 'value'."
            return boxplot_(df,
                            fig_title,
                            points,
                            y=column,
                            xlabel="Group",
                            ylabel="Value",
                            binning=binning)
        raise ValueError(
            "aggregation must be 'group' when plot_type is 'value', "
            "got {!r}".format(aggregation)
        )

    # Previously an unknown plot_type fell through to `return fig` with
    # `fig` never assigned.
    raise ValueError(
        "plot_type must be 'count' or 'value', got {!r}".format(plot_type)
    )