'''
This module is rewritten based on the missingno package.
The original files can be found here: https://github.com/ResidentMario/missingno
'''
import numpy as np
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import plotly.figure_factory as ff
from scipy.cluster.hierarchy import linkage, dendrogram, fcluster
from scipy.spatial.distance import squareform
def bar_count(df, columns=None, title='Data frequency', xaxis_title='', yaxis_title='', sampling_freq='H'):
    ''' Display a bar chart of how many non-null samples fall in each hour of day or minute of hour.

    The dataframe is resampled at ``sampling_freq`` and the non-null values of every bin are
    counted. The counts are then summed over all bins that share the same hour of the day
    (hourly sampling) or the same minute of the hour (minutely sampling).

    Parameters
    ----------
    df: pandas Dataframe
        Dataframe to plot. It is resampled, so it needs a datetime-like index.
    columns: list, optional
        Columns from input dataframe to investigate. If none is given, uses all columns.
    title: str
        Figure's title
    xaxis_title: str, optional
        x_axis's label
    yaxis_title: str, optional
        y_axis's label
    sampling_freq: str, optional
        Frequency to resample the data. Possible values: 'H' (or 'h') for hourly,
        'T' (or 'min') for minutely. Default value is 'H'.

    Returns
    -------
    fig: Plotly figure.

    Raises
    ------
    AssertionError
        If df is not a pandas dataframe.
    ValueError
        If sampling_freq is not one of the supported values.
    '''
    assert isinstance(df, pd.DataFrame), "df is not a pandas dataframe."

    # Validate before doing any work: an unsupported frequency used to skip both
    # plotting branches and fail afterwards because ``fig`` was never assigned.
    if sampling_freq in ('H', 'h'):
        hourly = True
    elif sampling_freq in ('T', 'min'):
        hourly = False
    else:
        raise ValueError(
            "Cannot recognize sampling_freq {!r}. Possible values: 'H', 'T'.".format(sampling_freq)
        )

    # Identity test on purpose: ``columns == None`` is evaluated element-wise when
    # ``columns`` is an Index or an array, and the result has no single truth value.
    if columns is None:
        columns = df.columns

    # Number of non-null values per resampling bin.
    resampled_df = df.resample(sampling_freq).count()

    if hourly:
        group_key = resampled_df.index.hour
        tickvals = list(range(24))
        tick_format = "{:02d}:00:00"
        # Hour labels are long, so they are drawn vertically.
        xaxis_layout = dict(tickangle=90)
    else:
        group_key = resampled_df.index.minute
        tickvals = list(range(60))
        tick_format = "{:02d}:00"
        xaxis_layout = dict()

    # Collapse the bins onto the hour-of-day / minute-of-hour axis.
    counts = resampled_df.groupby([group_key])[columns].sum()
    fig = px.bar(counts)

    # Define xticks: one labelled tick per hour (or minute).
    xaxis_layout.update(
        tickmode='array',
        tickvals=tickvals,
        ticktext=[tick_format.format(tick) for tick in tickvals],
        dtick=5,
    )
    fig.update_layout(xaxis=xaxis_layout)

    fig.update_layout(title=title, xaxis_title=xaxis_title, yaxis_title=yaxis_title, showlegend=False)
    return fig
def bar(df, columns=None, title='Data frequency', xaxis_title='', yaxis_title='', sampling_freq=None, sampling_method='mean'):
    ''' Display a bar chart of the fraction of non-null values in the given DataFrame.

    Without resampling, one bar is drawn per selected column, showing the fraction of its
    rows that are not null. With ``sampling_freq`` set, the dataframe is resampled first
    and one bar is drawn per resampled timestamp, showing the fraction of the selected
    columns that are not null in that bin.

    Parameters
    ----------
    df: pandas Dataframe
        Dataframe to plot
    columns: list, optional
        Columns from input dataframe to investigate missingness. If none is given, uses all columns.
    title: str
        Figure's title
    xaxis_title: str, optional
        x_axis's label
    yaxis_title: str, optional
        y_axis's label
    sampling_freq: str, optional
        Frequency to resample the data. Requires the dataframe to have datetime-like index. Possible values: 'H', 'T'
    sampling_method: str, optional
        Resampling method. Possible values: 'sum', 'mean'. Default value is 'mean'.
        Only checked when sampling_freq is given.

    Returns
    -------
    fig: Plotly figure.

    Raises
    ------
    AssertionError
        If df is not a pandas dataframe, or if sampling_freq is given and
        sampling_method is not 'mean' or 'sum'.
    '''
    assert isinstance(df, pd.DataFrame), "df is not a pandas dataframe."

    def _non_null_fraction(frame):
        # Fraction of non-null entries in each column of ``frame``.
        return frame.notnull().sum() / len(frame)

    # Identity test on purpose: ``columns == None`` is evaluated element-wise when
    # ``columns`` is an Index or an array, and the result has no single truth value.
    if columns is None:
        columns = df.columns

    if sampling_freq:
        assert sampling_method in ['mean', 'sum'], 'Cannot recognize sampling method. Possible values: "mean", "sum".'
        resampler = df.resample(sampling_freq)
        if sampling_method == 'mean':
            resampled_df = resampler.mean()
        else:
            # min_count=1 keeps bins without any valid value as NaN. The default
            # (min_count=0) turns them into 0, which would hide the missing data
            # this chart is meant to show.
            resampled_df = resampler.sum(min_count=1)
        # Transpose the dataframe so that timestamp index become columns
        resampled_df = resampled_df[columns].transpose()
        fig = px.bar(_non_null_fraction(resampled_df))
    else:
        fig = px.bar(_non_null_fraction(df[columns]))

    fig.update_layout(title=title, xaxis_title=xaxis_title, yaxis_title=yaxis_title, showlegend=False)
    return fig
def matrix(df, height=500, title='Data frequency', xaxis_title='', yaxis_title='', sampling_freq=None, sampling_method='mean'):
    ''' Return matrix visualization of the nullity of data.

    Every cell of the (optionally resampled) dataframe is drawn as one pixel of a
    gray-scale image built from its null mask.
    For now, this function assumes that the data frame is datetime indexed.

    Parameters
    ----------
    df: pandas Dataframe
        Dataframe to plot
    height: int, optional
        Figure's height. Default value is 500.
    title: str
        Figure's title
    xaxis_title: str, optional
        x_axis's label
    yaxis_title: str, optional
        y_axis's label
    sampling_freq: str, optional
        Frequency to resample the data. Requires the dataframe to have datetime-like index. Possible values: 'H', 'T'
    sampling_method: str, optional
        Resampling method. Possible values: 'sum', 'mean'. Default value is 'mean'.
        Only checked when sampling_freq is given.

    Returns
    -------
    fig: Plotly figure.

    Raises
    ------
    AssertionError
        If df is not a pandas dataframe, or if sampling_freq is given and
        sampling_method is not 'mean' or 'sum'.
    '''
    assert isinstance(df, pd.DataFrame), "df is not a pandas dataframe."

    if sampling_freq:
        assert sampling_method in ['mean', 'sum'], 'Cannot recognize sampling method. Possible values: "mean", "sum".'
        resampler = df.resample(sampling_freq)
        if sampling_method == 'mean':
            resampled_df = resampler.mean()
        else:
            # min_count=1 keeps bins without any valid value as NaN. The default
            # (min_count=0) turns them into 0, which would make every cell look
            # present in the matrix.
            resampled_df = resampler.sum(min_count=1)
    else:
        # isna() below builds a new frame, so the input does not need to be copied.
        resampled_df = df

    # Boolean mask of the dataframe: True where a value is null, False where it is present.
    bool_mask = resampled_df.isna()

    # Plot the mask as pixels.
    fig = px.imshow(bool_mask, color_continuous_scale='gray')

    # Update layout
    fig.update_layout(title=title, xaxis_title=xaxis_title, yaxis_title=yaxis_title,
                      coloraxis_showscale=False, height=height)
    return fig
def heatmap(df, height=800, width=800, title='', xaxis_title='', yaxis_title=''):
    ''' Return 'plotly' heatmap visualization of the nullity correlation of the Dataframe.

    Columns that are completely filled or completely empty are dropped. The correlation
    between the null masks of the remaining columns is drawn as a heatmap whose rows and
    columns follow the leaf order of a dendrogram built from ``1 - abs(correlation)``.
    The dendrogram is drawn above the heatmap.

    Parameters
    ----------
    df: pandas Dataframe
        Dataframe to plot
    height: int
        Figure's height
    width: int
        Figure's width
    title: str, optional
        Figure's title
    xaxis_title: str, optional
        x_axis's label
    yaxis_title: str, optional
        y_axis's label

    Returns
    -------
    fig: Plotly figure.

    Raises
    ------
    AssertionError
        If df is not a pandas dataframe.
    ValueError
        If fewer than two columns contain both null and non-null values, because
        there is then nothing to correlate or cluster.
    '''
    assert isinstance(df, pd.DataFrame), "df is not a pandas dataframe."

    # Remove completely filled or completely empty variables: keep only the columns
    # whose null mask holds both True and False (i.e. has a non-zero variance).
    null_mask = df.isnull()
    has_both = (null_mask.any() & ~null_mask.all()).to_numpy()
    df = df.iloc[:, np.flatnonzero(has_both)]

    if df.shape[1] < 2:
        raise ValueError(
            "heatmap requires at least two columns that contain both null and non-null values."
        )

    # Correlation between the null masks of the remaining columns.
    corr_mat = df.isnull().corr()

    # Calculate dissimilarity distance.
    # Dissimilarity is close to zero if correlation is close to 1 or -1.
    dissimilarity = 1 - abs(corr_mat)
    labels = df.columns

    # NOTE(review): create_dendrogram receives the dissimilarity matrix as its matrix
    # of observations, so the clustering works on distances between the rows of that
    # matrix rather than on the dissimilarity values themselves - confirm this is intended.

    # Initialize figure by creating upper dendrogram
    fig = ff.create_dendrogram(dissimilarity, orientation='bottom', labels=labels)
    for trace in fig['data']:
        trace['yaxis'] = 'y2'

    # Create Side Dendrogram. Its traces are added hidden; its layout supplies the
    # leaf order and the tick positions used for the heatmap rows.
    dendro_side = ff.create_dendrogram(dissimilarity, orientation='right', labels=labels)
    dendro_side.for_each_trace(lambda trace: trace.update(visible=False))
    for trace in dendro_side['data']:
        trace['xaxis'] = 'x2'

    # Add Side Dendrogram Data to Figure
    for trace in dendro_side['data']:
        fig.add_trace(trace)

    # Create Heatmap: reorder the correlation matrix to follow the dendrogram leaves
    # and place the cells on the dendrograms' tick positions.
    dendro_leaves = dendro_side['layout']['yaxis']['ticktext']
    dendro_vals = dendro_side['layout']['yaxis']['tickvals']
    heat_data = corr_mat.reindex(columns=dendro_leaves)
    heat_data = heat_data.reindex(dendro_leaves)
    heat_trace = go.Heatmap(
        x=fig['layout']['xaxis']['tickvals'],
        y=dendro_vals,
        z=heat_data,
        colorscale='Blues'
    )

    # Add Heatmap Data to Figure
    fig.add_trace(heat_trace)

    # Edit Layout
    fig.update_layout({'width': width,
                       'height': height,
                       'showlegend': False,
                       'hovermode': 'closest',
                       'title': title,
                       'xaxis_title': xaxis_title,
                       'yaxis_title': yaxis_title})

    # Settings shared by all four axes: no frame, grid, zero line or tick marks.
    bare_axis = {'mirror': False,
                 'showgrid': False,
                 'showline': False,
                 'zeroline': False,
                 'ticks': ""}

    fig.update_layout(
        # xaxis: heatmap columns and upper dendrogram
        xaxis=dict(bare_axis, domain=[.15, 1]),
        # xaxis2: strip on the left reserved for the (hidden) side dendrogram
        xaxis2=dict(bare_axis, domain=[0, .15], showticklabels=False),
        # yaxis: heatmap rows, labelled with the dendrogram leaves
        yaxis=dict(bare_axis, domain=[0, .85], tickmode='array',
                   ticktext=dendro_leaves, tickvals=dendro_vals),
        # yaxis2: upper dendrogram
        yaxis2=dict(bare_axis, domain=[.825, .975], showticklabels=False),
    )
    return fig