Source code for niimpy.exploration.setup_dataframe

import pandas as pd
import numpy as np
from datetime import date, datetime

[docs]def create_dataframe():
    """Create a sample Pandas dataframe used by the test functions.
    
    Returns
    -------
    df : pandas.DataFrame
        Pandas dataframe containing sample data.
        
    """
    
    dti = pd.date_range("2018-01-01", periods=9, freq="H")
    
    d = {'user': ['user_1','user_2','user_3','user_4','user_5','user_6','user_7','user_8','user_9'], 
         'group': ['group_1','group_1','group_1','group_2','group_2','group_2','group_3','group_3','group_3'],
         'col_1': [1, 2, 3,4,5,6,7,8,9], 
         'col_2': [10, 11, 12, 13, 14, 15, 16, 17, 18]}
    
    df = pd.DataFrame(data=d,index=dti)
    
    return df

[docs]def create_categorical_dataframe():
    """Create a sample Pandas dataframe used by the test functions.
    
    Returns
    -------
    df : pandas.DataFrame
        Pandas dataframe containing sample data.
        
    """
    
    dti = pd.date_range("2018-01-01", periods=9, freq="H")
    
    d = {
        'user': ['user_1','user_2','user_3','user_4','user_5','user_6','user_7','user_8','user_9'],
        'group': ['group_1','group_1','group_1','group_2','group_2','group_2','group_3','group_3','group_3'],
        'id_1': ["str_1","str_2","str_3","str_1","str_2","str_3","str_1","str_2",""],
        'id_2': ["str_1","str_2","str_3","","str_2","str_3","str_1","str_2",""],
        'id_3': ["","","str_3","","str_2","str_3","str_1","str_2",""],
    }
    
    df = pd.DataFrame(data=d,index=dti)
    
    return df

[docs]def create_timeindex_dataframe(nrows, ncols, random_state=None, freq=None):
    """Create a datetime index Pandas dataframe 
    
    Parameters
    ----------
    nrows : int
        Number of rows
    ncols : int
        Number of columns
    random_state: float, optional
        Random seed. If not given, default to 33.
    freq: string, optional:
        Sampling frequency.
    Returns
    -------
    df : pandas.DataFrame
        Pandas dataframe containing sample data with random missing rows.    
    """
    
    # Create a nrows x ncols matrix
    data = np.random.uniform(100, size=(nrows, ncols))
    df = pd.DataFrame(data)
    
    if freq is None:
        freq='h'
    idx = _makeDatetimeIndex(nrows, freq=freq)
    df = df.set_index(idx)
    
    return df

'''
def create_missing_dataframe():
    """Create a Pandas dataframe with random missingness.
    
    Returns
    -------
    df : pandas.DataFrame
        Pandas dataframe containing sample data with random missing rows.
        
    """
    
    dti = pd.date_range("2018-01-01", periods=9, freq="H")
    
    d = {'user': ['user_1','user_2','user_3','user_4','user_5','user_6','user_7','user_8','user_9'], 
         'group': ['group_1','group_1','group_1','group_2','group_2','group_2','group_3','group_3','group_3'],
         'col_1': [1, 2, 3,4,5,6,7,8,9], 
         'col_2': [10, 11, 12, 13, 14, 15, 16, 17, 18]}
    
    df = pd.DataFrame(data=d,index=dti)
    
    # Randomly set some values to NaN
    for col in df.columns:
        df.loc[df.sample(frac=0.25).index, col] = pd.np.nan

    data = (np.random.random(1000).reshape((50, 20)) > 0.5).astype(bool)
    df = pd.DataFrame(data).replace({False: None})
    df = df.set_index(pd.period_range('1/1/2011', '2/1/2015', freq='M'))
    df.index = df.index.to_timestamp()

    return df
'''

[docs]def create_missing_dataframe(nrows, ncols, density=.9, random_state=None, index_type=None, freq=None):
    """Create a Pandas dataframe with random missingness.
    
    Parameters
    ----------
    nrows : int
        Number of rows
    ncols : int
        Number of columns
    density: float
        Amount of available data
    random_state: float, optional
        Random seed. If not given, default to 33.
    index_type: float, optional
        Accepts the following values: "dt" for timestamp, "int" for integer.
    freq: string, optional:
        Sampling frequency. This option is only available is index_type is "dt". 
        
    Returns
    -------
    df : pandas.DataFrame
        Pandas dataframe containing sample data with random missing rows.    
    """
    
    # Create a nrows x ncols matrix
    data = np.random.uniform(100, size=(nrows, ncols))
    df = pd.DataFrame(data)
    
    if index_type:
        if index_type == "dt":
            if freq is None:
                freq='h'
            idx = _makeDatetimeIndex(nrows, freq=freq)
            df = df.set_index(idx)
        elif index_type == "int":
            return
        else: 
            raise ValueError("Can't recognize index_type. Try the following values: 'dt', 'int'.")
            
    i_idx, j_idx = _create_missing_idx(nrows, ncols, density, random_state)
    df.values[i_idx, j_idx] = None
    return df

def _makeDatetimeIndex(k=10, freq='B', name=None):
    dt = datetime(2022, 1, 1)
    dr = pd.bdate_range(dt, periods=k, freq=freq, name=name)
    return pd.DatetimeIndex(dr, name=name)

def _create_missing_idx(nrows, ncols, density, random_state=None):
    if random_state is None:
        random_state = np.random
    else:
        random_state = np.random.RandomState(random_state)

    # below is cribbed from scipy.sparse
    size = int(np.round((1 - density) * nrows * ncols))
    # generate a few more to ensure unique values
    min_rows = 5
    fac = 1.02
    extra_size = min(size + min_rows, fac * size)

    def _gen_unique_rand(rng, _extra_size):
        ind = rng.rand(int(_extra_size))
        return np.unique(np.floor(ind * nrows * ncols))[:size]

    ind = _gen_unique_rand(random_state, extra_size)
    while ind.size < size:
        extra_size *= 1.05
        ind = _gen_unique_rand(random_state, extra_size)

    j = np.floor(ind * 1. / nrows).astype(int)
    i = (ind - j * nrows).astype(int)
    return i.tolist(), j.tolist()