Source code for niimpy.exploration.eda.categorical

# -*- coding: utf-8 -*-
"""
Created on Thu Nov 18 14:49:22 2021

@author: arsii
"""
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px

def get_xticks_(ser):
    """Convert a series index into xtick values and labels.

    Helper function for plot_categories.

    Parameters
    ----------
    ser : pandas.Series
        Series containing the categorized counts.

    Returns
    -------
    vals : array-like
        The values of the series index (``ser.index.values``), used as
        xtick positions.
    text : list of str
        String form of every index value, used as xtick labels.

    Raises
    ------
    AssertionError
        If ``ser`` is not a pandas Series.
    """
    assert isinstance(ser, pd.Series), "ser is not a pandas series."

    vals = ser.index.values
    text = [str(i) for i in vals]
    return vals, text
def categorize_answers(df, question):
    """Count how often each distinct answer to a question occurs.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing questionnaire data.
    question : str
        Name of the dataframe column that holds the answers to count.

    Returns
    -------
    category_counts : pandas.Series
        Number of occurrences of every answer category in the ``question``
        column. The counts are not sorted by frequency.

    Raises
    ------
    AssertionError
        If ``df`` is not a pandas DataFrame or ``question`` is not a string.
    """
    assert isinstance(df, pd.DataFrame), "df is not a pandas dataframe."
    assert isinstance(question, str), "question is not a string."

    # Casting to a categorical dtype makes value_counts report per category.
    answers = df[question].astype("category")
    category_counts = answers.value_counts(sort=False)
    return category_counts
def plot_categories(
    df,
    title=None,
    xlabel=None,
    ylabel=None,
    width=900,
    height=900
):
    """Create a barplot of categorical data.

    Parameters
    ----------
    df : pandas.Series
        Series containing the categorized counts; its index supplies the
        xtick values and labels.
    title : str or None
        Plot title.
    xlabel : str or None
        Plot xlabel.
    ylabel : str or None
        Plot ylabel.
    width : int
        Plot width.
    height : int
        Plot height.

    Returns
    -------
    fig : plotly Figure
        A barplot of the input data.

    Raises
    ------
    AssertionError
        If any argument has the wrong type.
    """
    assert isinstance(df, pd.Series), "df is not a pandas series."
    assert isinstance(title, (str, type(None))), 'title is not a string or None type.'
    assert isinstance(xlabel, (str, type(None))), "xlabel is not a string or None type."
    assert isinstance(ylabel, (str, type(None))), "ylabel is not a string or None type."
    assert isinstance(width, int), "width is not an integer."
    assert isinstance(height, int), "height is not an integer."

    # xtick labels and values
    vals, text = get_xticks_(df)

    fig = px.bar(df)
    fig.update_layout(
        title=title,
        xaxis_title=xlabel,
        yaxis_title=ylabel,
        width=width,
        height=height,
        xaxis=dict(tickmode='array', tickvals=vals, ticktext=text),
    )
    return fig
def questionnaire_summary(
    df,
    question,
    title=None,
    xlabel=None,
    ylabel=None,
    user=None,
    width=900,
    height=900
):
    """Plot summary barplot for questionnaire data.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing questionnaire data. When ``user`` is given, the
        dataframe must have a ``'user'`` column.
    question : str
        Question id, i.e. the column whose answers are counted.
    title : str or None
        Plot title.
    xlabel : str or None
        Plot xlabel.
    ylabel : str or None
        Plot ylabel.
    user : str or None
        If str, plot only the rows whose ``'user'`` column equals this value.
        If None, plot the data of all rows.
    width : int
        Plot width.
    height : int
        Plot height.

    Returns
    -------
    fig : plotly Figure
        A barplot summary of the questionnaire.

    Raises
    ------
    AssertionError
        If any argument has the wrong type.
    """
    assert isinstance(df, pd.DataFrame), "df is not a pandas dataframe."
    assert isinstance(question, str), "question is not a string."
    # Only str or None is accepted; the message now says so.
    assert isinstance(user, (str, type(None))), "user is not a string or None type."
    assert isinstance(title, (str, type(None))), "title is not a string or None type."
    assert isinstance(xlabel, (str, type(None))), "xlabel is not a string or None type."
    assert isinstance(ylabel, (str, type(None))), "ylabel is not a string or None type."
    assert isinstance(width, int), "width is not an integer."
    assert isinstance(height, int), "height is not an integer."

    if user is not None:
        df = df[df['user'] == user]

    counts = categorize_answers(df, question)
    fig = plot_categories(counts, title, xlabel, ylabel, width, height)
    return fig
def question_by_group(df, question, group='group'):
    """Count the answers to a question separately for every group.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing questionnaire data.
    question : str
        Question id, i.e. the column holding the answers.
    group : str
        Name of the column to group by.

    Returns
    -------
    grouped : pandas.DataFrame
        One row per (group, answer) combination with three columns: the
        ``group`` column, ``'answer'`` and ``'count'``.

    Raises
    ------
    AssertionError
        If ``df`` is not a pandas DataFrame, or ``question`` or ``group`` is
        not a string.
    """
    assert isinstance(df, pd.DataFrame), "df is not a pandas dataframe."
    assert isinstance(question, str), "question is not a string."
    # group is used as a column label below, so None cannot work here.
    assert isinstance(group, str), "group is not a string."

    answers = df[[question, group]].reset_index(drop=True)
    grouped = (
        answers.groupby([group, question])
        .size()
        .reset_index(name='count')
        .rename(columns={question: 'answer'})
    )
    return grouped
def plot_grouped_categories(df, group, title=None, xlabel=None, ylabel=None,
                            width=900, height=900):
    """Plot grouped barplots of categorical answer counts.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe with an ``'answer'`` column, a ``'count'`` column and the
        column named by ``group``.
    group : str
        Column used to describe group; it determines the bar colors.
    title : str or None
        Plot title.
    xlabel : str or None
        Plot xlabel.
    ylabel : str or None
        Plot ylabel.
    width : int
        Plot width.
    height : int
        Plot height.

    Returns
    -------
    fig : plotly Figure
        Figure containing barplots of the data in each group.

    Raises
    ------
    AssertionError
        If ``df``, ``title``, ``xlabel``, ``ylabel``, ``width`` or ``height``
        has the wrong type.
    """
    assert isinstance(df, pd.DataFrame), "df is not a pandas dataframe."
    assert isinstance(title, (type(None), str)), "title is not a string or none type."
    assert isinstance(xlabel, (type(None), str)), "xlabel is not a string or none type."
    assert isinstance(ylabel, (type(None), str)), "ylabel is not a string or none type."
    assert isinstance(width, int), "width is not an integer."
    assert isinstance(height, int), "height is not an integer."

    # barmode is set once here; repeating it in update_layout is redundant.
    fig = px.bar(df, x="answer", y="count", color=group, barmode='group')
    fig.update_layout(
        xaxis={'categoryorder': 'category ascending'},
        title=title,
        legend_title="Groups",
        xaxis_title=xlabel,
        yaxis_title=ylabel,
        width=width,
        height=height,
    )
    return fig
def questionnaire_grouped_summary(
    df,
    question,
    group='group',
    title=None,
    xlabel=None,
    ylabel=None,
    width=900,
    height=900
):
    """Create a grouped barplot of the answers to a question.

    Parameters
    ----------
    df : pandas.DataFrame
        Dataframe containing questionnaire data.
    question : str
        Question id, i.e. the column holding the answers.
    group : str
        Name of the column to group by.
    title : str or None
        Plot title.
    xlabel : str or None
        Plot xlabel.
    ylabel : str or None
        Plot ylabel.
    width : int
        Plot width.
    height : int
        Plot height.

    Returns
    -------
    fig : plotly Figure
        A barplot of the answer counts in each group.

    Raises
    ------
    AssertionError
        If any argument has the wrong type.
    """
    assert isinstance(df, pd.DataFrame), "df is not a pandas dataframe."
    assert isinstance(question, str), "question is not a string."
    assert isinstance(group, str), "group is not a string."
    assert isinstance(title, (str, type(None))), "title is not a string or None type."
    assert isinstance(xlabel, (str, type(None))), "xlabel is not a string or None type."
    assert isinstance(ylabel, (str, type(None))), "ylabel is not a string or None type."
    assert isinstance(width, int), "width is not an integer."
    assert isinstance(height, int), "height is not an integer."

    df_filt = question_by_group(df, question, group)
    fig = plot_grouped_categories(
        df_filt,
        group=group,
        title=title,
        xlabel=xlabel,
        ylabel=ylabel,
        width=width,
        height=height,
    )
    return fig