# Source code for niimpy.exploration.eda.missingness

'''
This module is rewritten based on the missingno package.
The original files can be found here: https://github.com/ResidentMario/missingno
'''

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff

from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import squareform

def bar_count(df, columns=None, title='Data frequency', xaxis_title='', yaxis_title='', sampling_freq='H'):
    ''' Display a bar chart of how many non-null observations fall in each
    hour of the day (or minute of the hour).

    The dataframe is resampled at ``sampling_freq`` counting non-null values,
    and the counts are then summed over all bins sharing the same hour
    (respectively minute) value.

    Parameters
    ----------
    df: pandas Dataframe
        Dataframe to plot. Must have a datetime-like index.
    columns: list, optional
        Columns from input dataframe to investigate. If none is given, uses all columns.
    title: str
        Figure's title
    xaxis_title: str, optional
        x_axis's label
    yaxis_title: str, optional
        y_axis's label
    sampling_freq: str, optional
        Frequency to resample the data. Possible values: 'H' (or 'h') for
        hourly bins, 'T' (or 'min') for minutely bins. The value is passed
        unchanged to ``DataFrame.resample``.

    Returns
    -------
    fig: Plotly figure.

    Raises
    ------
    ValueError
        If ``sampling_freq`` is not one of the supported values.
    '''

    assert isinstance(df, pd.DataFrame), "df is not a pandas dataframe."

    # Resolve the frequency up front so that an unsupported value fails with a
    # clear message instead of an unbound ``fig`` further down.
    if sampling_freq in ('H', 'h'):
        index_attr, n_bins, label_format, axis_extras = 'hour', 24, "{:02d}:00:00", {'tickangle': 90}
    elif sampling_freq in ('T', 'min'):
        index_attr, n_bins, label_format, axis_extras = 'minute', 60, "{:02d}:00", {}
    else:
        raise ValueError(
            "Unsupported sampling_freq {!r}. Possible values: 'H', 'T'.".format(sampling_freq)
        )

    # ``is None`` rather than ``== None``: comparing an Index/array with ``==``
    # is element-wise and cannot be used as a condition.
    if columns is None:
        columns = df.columns

    # Non-null observations per resampled bin ...
    counts = df.resample(sampling_freq).count()
    # ... folded onto the hour-of-day / minute-of-hour axis.
    folded = counts.groupby([getattr(counts.index, index_attr)])[columns].sum()

    fig = px.bar(folded)

    # One labelled tick per possible hour/minute value
    tickvals = list(range(n_bins))
    ticktext = [label_format.format(tick) for tick in tickvals]

    fig.update_layout(
        xaxis=dict(
            tickmode='array',
            tickvals=tickvals,
            ticktext=ticktext,
            dtick=5,
            **axis_extras
        )
    )

    fig.update_layout(title=title, xaxis_title=xaxis_title, yaxis_title=yaxis_title, showlegend=False)

    return fig

def bar(df, columns=None, title='Data frequency', xaxis_title='', yaxis_title='', sampling_freq=None, sampling_method='mean'):
    ''' Display a bar chart of data completeness for the given DataFrame.

    Each bar shows the fraction (between 0 and 1) of non-null values.
    Without resampling there is one bar per selected column. With
    ``sampling_freq`` the data is resampled first and there is one bar per
    time bin, giving the fraction of selected columns that are non-null
    in that bin.

    Parameters
    ----------
    df: pandas Dataframe
        Dataframe to plot
    columns: list, optional
        Columns from input dataframe to investigate missingness. If none is given, uses all columns.
    title: str
        Figure's title
    xaxis_title: str, optional
        x_axis's label
    yaxis_title: str, optional
        y_axis's label
    sampling_freq: str, optional
        Frequency to resample the data. Requires the dataframe to have datetime-like index. Possible values: 'H', 'T'
    sampling_method: str, optional
        Resampling method. Possible values: 'sum', 'mean'. Default value is 'mean'.
        Only checked when ``sampling_freq`` is given.
        NOTE: pandas' resampled ``sum`` (default ``min_count=0``) yields 0
        rather than NaN for empty bins, so with 'sum' such bins count as present.

    Returns
    -------
    fig: Plotly figure.
    '''

    assert isinstance(df, pd.DataFrame), "df is not a pandas dataframe."

    def _present_fraction(data):
        # Fraction of non-null entries per column of ``data``.
        return data.count() / len(data)

    # ``is None`` rather than ``== None``: comparing an Index/array with ``==``
    # is element-wise and cannot be used as a condition.
    if columns is None:
        columns = df.columns

    # Select first so that columns the caller did not ask for cannot
    # interfere with the resampling aggregation.
    selected = df[columns]

    if sampling_freq:
        assert sampling_method in ['mean', 'sum'], 'Cannot recognize sampling method. Possible values: "mean", "sum".'
        resampler = selected.resample(sampling_freq)
        resampled = resampler.mean() if sampling_method == 'mean' else resampler.sum()

        # Transpose so that the timestamps become columns: completeness is
        # then computed per time bin across the selected columns.
        selected = resampled.transpose()

    fig = px.bar(_present_fraction(selected))

    fig.update_layout(title=title, xaxis_title=xaxis_title, yaxis_title=yaxis_title, showlegend=False)
    return fig


def matrix(df, height=500, title='Data frequency', xaxis_title='', yaxis_title='', sampling_freq=None, sampling_method='mean', columns=None):
    ''' Return matrix visualization of the nullity of data.
    For now, this function assumes that the data frame is datetime indexed.

    Parameters
    ----------
    df: pandas Dataframe
        Dataframe to plot
    height: int
        Figure's height
    title: str
        Figure's title
    xaxis_title: str, optional
        x_axis's label
    yaxis_title: str, optional
        y_axis's label
    sampling_freq: str, optional
        Frequency to resample the data. Requires the dataframe to have datetime-like index. Possible values: 'H', 'T'
    sampling_method: str, optional
        Resampling method. Possible values: 'sum', 'mean'. Default value is 'mean'.
        Only checked when ``sampling_freq`` is given.
    columns: list, optional
        Columns from input dataframe to investigate missingness. If none is given, uses all columns.

    Returns
    -------
    fig: Plotly figure.
    '''

    assert isinstance(df, pd.DataFrame), "df is not a pandas dataframe."

    if columns is not None:
        df = df[columns]

    if sampling_freq:
        assert sampling_method in ['mean', 'sum'], 'Cannot recognize sampling method. Possible values: "mean", "sum".'
        resampler = df.resample(sampling_freq)
        data = resampler.mean() if sampling_method == 'mean' else resampler.sum()
    else:
        # No copy needed: isna() below builds a new frame and leaves df untouched.
        data = df

    # Boolean mask of the data: True where a value is null, False where present
    nullity_mask = data.isna()

    # Plot the mask as pixels, one cell per (row, column) of the data
    fig = px.imshow(nullity_mask, color_continuous_scale='gray')

    # Update layout
    fig.update_layout(title=title, xaxis_title=xaxis_title, yaxis_title=yaxis_title,
                      coloraxis_showscale=False, height=height)

    return fig

def heatmap(df, height=800, width=800, title='', xaxis_title='', yaxis_title=''):
    ''' Return 'plotly' heatmap visualization of the nullity correlation of the Dataframe.

    Columns that are completely filled or completely empty are dropped, the
    pairwise correlation of the remaining columns' null masks is computed, and
    the result is drawn as a heatmap with a dendrogram on top. A second
    (hidden) dendrogram provides the row order and tick positions.

    Parameters
    ----------
    df: pandas Dataframe
        Dataframe to plot
    height: int
        Figure's height
    width: int
        Figure's width
    title: str, optional
        Figure's title
    xaxis_title: str, optional
        x_axis's label
    yaxis_title: str, optional
        y_axis's label

    Returns
    -------
    fig: Plotly figure.

    Raises
    ------
    ValueError
        If fewer than two columns contain both null and non-null values,
        since no correlation structure can be clustered in that case.
    '''

    assert isinstance(df, pd.DataFrame), "df is not a pandas dataframe."

    # Remove completely filled or completely empty variables: their null mask
    # is constant, so its correlation with anything is undefined.
    nullity = df.isnull()
    is_mixed = (nullity.any() & ~nullity.all()).to_numpy(dtype=bool)
    nullity = nullity.iloc[:, is_mixed]

    if nullity.shape[1] < 2:
        raise ValueError(
            "heatmap requires at least two columns containing both missing "
            "and non-missing values; found {}.".format(nullity.shape[1])
        )

    # Correlation between the null masks of each pair of columns
    corr_mat = nullity.corr()

    # Calculate dissimilarity distance.
    # Dissimilarity is close to zero if correlation is close to 1 or -1.
    dissimilarity = 1 - corr_mat.abs()
    labels = list(nullity.columns)

    # Initialize figure by creating upper dendrogram
    # NOTE(review): the dissimilarity matrix is handed to create_dendrogram as
    # its data argument, as in the original implementation — confirm this is
    # the intended clustering input.
    fig = ff.create_dendrogram(dissimilarity, orientation='bottom', labels=labels)
    for trace in fig['data']:
        trace['yaxis'] = 'y2'

    # Create side dendrogram. Its traces are hidden; it is used for the leaf
    # order and tick positions of the heatmap rows.
    dendro_side = ff.create_dendrogram(dissimilarity, orientation='right', labels=labels)
    for trace in dendro_side['data']:
        trace['visible'] = False
        trace['xaxis'] = 'x2'
        fig.add_trace(trace)

    # Reorder the correlation matrix to follow the dendrogram leaves
    dendro_leaves = list(dendro_side['layout']['yaxis']['ticktext'])
    dendro_vals = dendro_side['layout']['yaxis']['tickvals']
    heat_data = corr_mat.reindex(index=dendro_leaves, columns=dendro_leaves)

    # Place the heatmap cells on the dendrograms' tick positions so that the
    # cells line up with the leaves.
    fig.add_trace(
        go.Heatmap(
            x=fig['layout']['xaxis']['tickvals'],
            y=dendro_vals,
            z=heat_data,
            colorscale='Blues'
        )
    )

    # Edit Layout
    fig.update_layout({'width': width,
                       'height': height,
                       'showlegend': False,
                       'hovermode': 'closest',
                       'title': title,
                       'xaxis_title': xaxis_title,
                       'yaxis_title': yaxis_title})

    # Axis styling shared by all four axes
    bare_axis = {'mirror': False,
                 'showgrid': False,
                 'showline': False,
                 'zeroline': False,
                 'ticks': ""}

    # xaxis / yaxis hold the heatmap; xaxis2 / yaxis2 hold the dendrograms
    fig.update_layout(xaxis=dict(bare_axis, domain=[.15, 1]))
    fig.update_layout(xaxis2=dict(bare_axis, domain=[0, .15], showticklabels=False))
    fig.update_layout(yaxis=dict(bare_axis,
                                 domain=[0, .85],
                                 tickmode='array',
                                 ticktext=dendro_leaves,
                                 tickvals=dendro_vals))
    fig.update_layout(yaxis2=dict(bare_axis, domain=[.825, .975], showticklabels=False))

    return fig