Module `datasist.timeseries`

This module contains functions for working with Date features. It can help you extract new features from Dates and also help you visualize them.

Expand source code

'''
This module contains all functions relating to time series data

'''

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from IPython.display import display
from .structdata import get_cat_feats, get_num_feats, get_date_cols



def extract_dates(data=None, date_cols=None, subset=None, drop=True):
    '''
    Extracts date information in a dataframe and append to the original data as new columns.
    For extracting only time features, use datasist.timeseries.extract_time function
    
    Parameters:
    -----------
        data: DataFrame

            The data set to extract date information from. It is copied,
            the caller's object is never modified.

        date_cols: str, List, Array

            Name of date columns/features in data set. A single column name
            may also be passed as a plain string.

        subset: List, Array

            Date features to return. Any of:
            ['dow' ==> day of the week (day name, e.g. 'Monday')
            'doy' ==> day of the year
            'dom' ==> day of the month
            'hr' ==> hour
            'min', ==> minute
            'is_wkd' ==> is weekend? (1 for Saturday/Sunday, else 0)
            'yr' ==> year
            'qtr' ==> quarter
            'mth' ==> month ]
            Defaults to all of the above.

        drop: bool, Default True

            Drops the original date columns from the data set.

    Return:
    -------
        DataFrame with one new column named '<date_col>_<feature>' for every
        requested feature of every date column.

    Raises:
    -------
        ValueError: if data or date_cols is None.
        KeyError: if subset contains an unknown feature name.
    '''

    if data is None:
        raise ValueError("data: Expecting a DataFrame, got 'None'")

    if date_cols is None:
        raise ValueError("date_cols: Expecting a list of date column names, got 'None'")

    # A bare string would otherwise be iterated character by character.
    if isinstance(date_cols, str):
        date_cols = [date_cols]
    else:
        date_cols = list(date_cols)

    if subset is None:
        #return all features
        subset = ['dow', 'doy', 'dom', 'hr', 'min', 'is_wkd', 'yr', 'qtr', 'mth']

    # Extractors are callables so that only the requested features are computed.
    extractors = {
        "dow": lambda dates: dates.dt.day_name(),
        "doy": lambda dates: dates.dt.dayofyear,
        "dom": lambda dates: dates.dt.day,
        "hr": lambda dates: dates.dt.hour,
        "min": lambda dates: dates.dt.minute,
        # dayofweek counts Monday as 0, so 5 and 6 are Saturday and Sunday
        "is_wkd": lambda dates: dates.dt.dayofweek.isin([5, 6]).astype(int),
        "yr": lambda dates: dates.dt.year,
        "qtr": lambda dates: dates.dt.quarter,
        "mth": lambda dates: dates.dt.month,
    }

    df = data.copy()

    for date_col in date_cols:
        #Convert date feature to Pandas DateTime
        df[date_col] = pd.to_datetime(df[date_col])
        dates = df[date_col]

        for date_ft in subset:
            df[date_col + '_' + date_ft] = extractors[date_ft](dates)

    #Drops original date columns from the dataset
    if drop:
        df.drop(date_cols, axis=1, inplace=True)

    return df



def extract_time(data=None, time_cols=None, subset=None, drop=True):
    '''
    Splits time columns into their components and returns a copy of the
    data with the components added as new columns.
    For extracting DateTime features, use datasist.timeseries.extract_dates function
    
    Parameters:
    -----------
        data: DataFrame or named Series

            The data set holding the time columns. It is copied first.

        time_cols: List, Array

            Names of the time columns/features in data. Each one is
            converted with pandas.to_timedelta.

        subset: List, Array

            Names of the timedelta components to extract. Defaults to
            ['hours', 'minutes', 'seconds'].

        drop: bool, Default True

            Removes the original time columns from the returned data.

    Return:
    -------
        DataFrame with one '<time_col>_<component>' column per requested
        component of every time column.

    Raises:
    -------
        ValueError: if data or time_cols is None.
    '''

    if data is None:
        raise ValueError("data: Expecting a DataFrame/ numpy2d array, got 'None'")

    if time_cols is None:
        raise ValueError("time_cols: Expecting a list, series/ numpy1D array, got 'None'")

    wanted = ['hours', 'minutes', 'seconds'] if subset is None else subset
    result = data.copy()

    for col in time_cols:
        # Time values are handled as pandas timedeltas
        result[col] = pd.to_timedelta(result[col])
        parts = result[col].dt.components

        for part in wanted:
            result[col + "_" + part] = parts[part]

    if drop:
        # Leave only the derived columns behind
        result = result.drop(time_cols, axis=1)

    return result


def get_time_elapsed(data=None, date_cols=None, by='s', col_name=None):
    '''
    Calculates the time elapsed between two specified date columns 
    and returns the value in either seconds (s), minute (m) or hours (h).
    The result is computed as first date minus second date.
    
    Parameter:
    ----------
        data: DataFrame or None.

            The data where the Date features are located. When None,
            date_cols must hold the two date sequences themselves instead
            of column names.
        
        date_cols: List

            Exactly two entries. With data: the names of the two date
            columns. Without data: the two date values/sequences.

        by: str

            specifies how time elapsed is calculated. Can be one of [h,m,s] corresponding to
            hour, minute and seconds respectively.
        
        col_name: str

            Name to use for the created column. When omitted a name is
            derived from `by` (and from the column names when data is given).

                
    Returns:
    --------
        With data: a copy of data with the new elapsed-time column added.
        Without data: a new single-column DataFrame of elapsed times.

    Raises:
    -------
        ValueError: if date_cols is None or does not hold exactly two entries.
        KeyError: if col_name is omitted and `by` is not one of 'h', 'm', 's'.
    '''

    if date_cols is None:
        raise ValueError("date_col: Expecting a list of Date columns, got 'None'")
    
    if len(date_cols) != 2:
        raise ValueError("date_col: lenght of date_cols should be 2, got '{}'".format(len(date_cols)))
    
    by_mapping = {'h': 'hrs', 'm': 'mins', 's': 'secs'}

    if data is None:
        date1 = pd.to_datetime(date_cols[0])
        date2 = pd.to_datetime(date_cols[1])

        if col_name is None:
            col_name = 'time_elapsed_' + by_mapping[by]

        time_elapsed = (date1 - date2) / np.timedelta64(1, by)

        if isinstance(time_elapsed, pd.Series):
            # to_frame keeps the index and always applies the requested name
            return time_elapsed.to_frame(name=col_name)
        # Index or scalar result: atleast_1d lets a single pair of dates work too
        return pd.DataFrame({col_name: np.atleast_1d(time_elapsed)})

    df = data.copy()

    #convert to Pandas DateTime format
    date1 = pd.to_datetime(df[date_cols[0]])
    date2 = pd.to_datetime(df[date_cols[1]])

    if col_name is None:
        col_name = by_mapping[by] + '_btw_' + date_cols[0] + '_' + date_cols[1]

    # Subtract the converted values, not the raw columns, so string dates work
    df[col_name] = (date1 - date2) / np.timedelta64(1, by)
    return df


def get_period_of_day(date_col=None):
    '''
    Maps every value of a datetime (or hour) feature to the period of the day:
    hours 0 to 12 are mapped to 'morning', hours 13 to 16 to 'afternoon' and
    every other hour to 'evening'.
    
    Parameter:
    ------------
        date_col: Series

            The datetime feature. A Series of integer dtype is taken to
            already contain hours of the day and is mapped directly; any
            other Series is converted with pandas.to_datetime first and its
            hour component is used.

    Returns:
    ----------
        Series of mapped values

    Raises:
    -------
        ValueError: if date_col is None.
    
    '''

    if date_col is None:
        raise ValueError("date_cols: Expect a date columns, got 'None'")

    # np.int no longer exists in current NumPy; is_integer_dtype also
    # recognises every integer width rather than only the platform default.
    if pd.api.types.is_integer_dtype(date_col):
        return date_col.map(_map_hours)

    date_col_hr = pd.to_datetime(date_col).dt.hour
    return date_col_hr.map(_map_hours)




def describe_date(data=None, date_col=None):
    '''
    Calculate statistics of the date feature.

    The date features of date_col are extracted with extract_dates and the
    summary statistics (DataFrame.describe) of the resulting data are
    displayed. Nothing is returned.

    Parameter:
    ---------
        data: DataFrame.

            The data to describe.

        date_col: str

            Name of date column to describe. A list of column names is
            passed through unchanged.

    Raises:
    -------
        ValueError: if data or date_col is None.
    '''
    if data is None:
        raise ValueError("data: Expecting a DataFrame or Series, got 'None'")

    if date_col is None:
        raise ValueError("date_col: Expecting a string, got 'None'")

    # extract_dates expects a collection of column names, so wrap a single name.
    date_cols = [date_col] if isinstance(date_col, str) else date_col

    df = extract_dates(data, date_cols)
    display(df.describe())




def timeplot(data=None, num_cols=None, time_col=None, subplots=True, marker='.', 
                    figsize=(15,10), y_label='Daily Totals',save_fig=False, alpha=0.5, linestyle='None'):
    '''
    Plot all numeric features against the time column. Interpreted as a time series plot.

    Parameters:
    -----------
        data: DataFrame.

            The data used in plotting. A copy is used internally, so the
            caller's data is left untouched.

        num_cols: list, 1-D array.

            Numerical columns in the data set. If not provided, we automatically infer them from the data set.
            The time column is always excluded from this list.

        time_col: str.

            The time column to plot numerical features against. We set this column as the index before plotting.

        subplots: bool, Default True.

            If True all features are drawn as subplots of a single figure,
            otherwise every feature gets its own figure.

        marker: str

            matplotlib supported marker to use in line decoration.

        figsize: tuple of ints, Default (15,10)

            The figure size of the plot.

        y_label: str.

            Name of the Y-axis (used in subplot mode; otherwise the feature
            name is used).

        save_fig: bool, Default False

            Saves the figure to the current working directory, using one
            file name per feature.

        alpha: float, Default 0.5

            Opacity of the plotted markers/lines.

        linestyle: str, Default 'None'

            matplotlib line style.
        
    Returns:
    --------
        None. The plots are shown with matplotlib.

    Raises:
    -------
        ValueError: if data or time_col is None.
    
    '''

    if data is None:
        raise ValueError("data: Expecting a DataFrame or Series, got 'None'")

    # Validate time_col before it is needed to filter num_cols.
    if time_col is None:
        raise ValueError("time_col: Expecting a string name of time column, got 'None'")

    if num_cols is None:
        num_cols = get_num_feats(data)

    # time_col becomes the index below and cannot be plotted as a feature.
    # Filtering (instead of list.remove) also copes with time_col not being
    # in the list and leaves a caller-supplied list unmodified.
    num_cols = [col for col in num_cols if col != time_col]

    # Work on a copy so converting the time column does not alter the caller's data
    data = data.copy()
    data[time_col] = pd.to_datetime(data[time_col])
    #Set time_col as DataFrame index
    data = data.set_index(time_col)
    
    if subplots:
        axes = data[num_cols].plot(marker=marker,subplots=True, figsize=figsize, alpha=alpha, linestyle=linestyle) 
        for feature, ax in zip(num_cols, axes):
            ax.set_ylabel(y_label)
            ax.set_title("Timeseries Plot of '{}'".format(time_col))
            if save_fig:
                # All subplots share one figure; it is saved under each feature's name
                plt.savefig('fig_timeseries_plot_against_{}'.format(feature))
        # Show once, after every axis has been labelled and the files written
        plt.show()
    else:
        for feature in num_cols:
            fig = plt.figure()
            ax = fig.gca()
            data[feature].plot(marker=marker,subplots=False, figsize=figsize, alpha=alpha, linestyle=linestyle, ax=ax) 
            plt.ylabel(feature)
            ax.set_title("Timeseries Plot of '{}' vs. '{}' ".format(time_col, feature))
            if save_fig:
                plt.savefig('fig_timeseries_plot_against_{}'.format(feature))
            plt.show()



# def time_boxplot(data=None, features=None, x=None, subplots=True, figsize=(12,10)):
#     '''
#     Makes a box plot of features against a specified column
    
#     '''

#     if subplots:
#         fig, axes = plt.subplots(len(features), 1, figsize=figsize, sharex=True)
#         for feature, ax in zip(features, axes):
#             sns.boxplot(data=data, x=x, y=feature, ax=ax)
#             ax.set_ylabel('Count')
#             ax.set_title("Boxplot of '{}' vs. {} ".format(feature, x))
#             plt.tight_layout()
#             # Remove the automatic x-axis label from all but the bottom subplot
#             if ax != axes[-1]:
#                 ax.set_xlabel('')
#     else:
#         for feature in features:
#             fig = plt.figure(figsize=figsize) # define plot area
#             ax = fig.gca() # define axis 
#             sns.boxplot(data=data, x=x, y=feature, ax=ax)
#             ax.set_ylabel('Count')
#             ax.set_title("Boxplot of '{}' vs. {} ".format(feature, x))
#             plt.tight_layout()
#             ax.set_xlabel('')
#             plt.show()



def set_date_index(data, date_col):
    '''
    Sets a specified date column as the index of the DataFrame.

    Parameters:
    -----------
        data: DataFrame

            The dataset which index we want to change. It is not modified.

        date_col: str

            The date column to set as index. Its values are converted with
            pandas.to_datetime.

    Returns:
    --------
        A new DataFrame indexed by the converted date column.
    '''
    # Convert on a copy so the caller's column keeps its original values
    converted = data.copy()
    converted[date_col] = pd.to_datetime(converted[date_col])
    return converted.set_index(date_col)


def _map_hours(x):
    '''
    Returns the period of the day for an hour value: 'morning' for the
    hours 0 to 12, 'afternoon' for 13 to 16 and 'evening' for anything else.
    '''
    # Membership (equality) tests, so values between two whole hours
    # fall through to 'evening'.
    if x in range(0, 13):
        return 'morning'
    if x in range(13, 17):
        return 'afternoon'
    return 'evening'

Functions

def describe_date(data=None, date_col=None)

Calculate statistics of the date feature

Parameter:

data: DataFrame or name series.

    The data to describe.

date_col: str

    Name of date column to describe

Expand source code

def describe_date(data=None, date_col=None):
    '''
    Calculate statistics of the date feature.

    The date features of date_col are extracted with extract_dates and the
    summary statistics (DataFrame.describe) of the resulting data are
    displayed. Nothing is returned.

    Parameter:
    ---------
        data: DataFrame.

            The data to describe.

        date_col: str

            Name of date column to describe. A list of column names is
            passed through unchanged.

    Raises:
    -------
        ValueError: if data or date_col is None.
    '''
    if data is None:
        raise ValueError("data: Expecting a DataFrame or Series, got 'None'")

    if date_col is None:
        raise ValueError("date_col: Expecting a string, got 'None'")

    # extract_dates expects a collection of column names, so wrap a single name.
    date_cols = [date_col] if isinstance(date_col, str) else date_col

    df = extract_dates(data, date_cols)
    display(df.describe())

def extract_dates(data=None, date_cols=None, subset=None, drop=True)

Extracts date information in a dataframe and append to the original data as new columns. For extracting only time features, use datasist.timeseries.extract_time function

Parameters:

data: DataFrame or named Series

    The data set to extract date information from.

date_cols: List, Array

    Name of date columns/features in data set.

subset: List, Array

    Date features to return. One of:
    ['dow' ==> day of the week
    'doy' ==> day of the year
    'dom' ==> day of the month
    'hr' ==> hour
    'min', ==> minute
    'is_wkd' ==> is weekend?
    'yr' ==> year
    'qtr' ==> quarter
    'mth' ==> month ]

drop: bool, Default True

    Drops the original date columns from the data set.

Return:

DataFrame or Series.

Expand source code

def extract_dates(data=None, date_cols=None, subset=None, drop=True):
    '''
    Extracts date information in a dataframe and append to the original data as new columns.
    For extracting only time features, use datasist.timeseries.extract_time function
    
    Parameters:
    -----------
        data: DataFrame

            The data set to extract date information from. It is copied,
            the caller's object is never modified.

        date_cols: str, List, Array

            Name of date columns/features in data set. A single column name
            may also be passed as a plain string.

        subset: List, Array

            Date features to return. Any of:
            ['dow' ==> day of the week (day name, e.g. 'Monday')
            'doy' ==> day of the year
            'dom' ==> day of the month
            'hr' ==> hour
            'min', ==> minute
            'is_wkd' ==> is weekend? (1 for Saturday/Sunday, else 0)
            'yr' ==> year
            'qtr' ==> quarter
            'mth' ==> month ]
            Defaults to all of the above.

        drop: bool, Default True

            Drops the original date columns from the data set.

    Return:
    -------
        DataFrame with one new column named '<date_col>_<feature>' for every
        requested feature of every date column.

    Raises:
    -------
        ValueError: if data or date_cols is None.
        KeyError: if subset contains an unknown feature name.
    '''

    if data is None:
        raise ValueError("data: Expecting a DataFrame, got 'None'")

    if date_cols is None:
        raise ValueError("date_cols: Expecting a list of date column names, got 'None'")

    # A bare string would otherwise be iterated character by character.
    if isinstance(date_cols, str):
        date_cols = [date_cols]
    else:
        date_cols = list(date_cols)

    if subset is None:
        #return all features
        subset = ['dow', 'doy', 'dom', 'hr', 'min', 'is_wkd', 'yr', 'qtr', 'mth']

    # Extractors are callables so that only the requested features are computed.
    extractors = {
        "dow": lambda dates: dates.dt.day_name(),
        "doy": lambda dates: dates.dt.dayofyear,
        "dom": lambda dates: dates.dt.day,
        "hr": lambda dates: dates.dt.hour,
        "min": lambda dates: dates.dt.minute,
        # dayofweek counts Monday as 0, so 5 and 6 are Saturday and Sunday
        "is_wkd": lambda dates: dates.dt.dayofweek.isin([5, 6]).astype(int),
        "yr": lambda dates: dates.dt.year,
        "qtr": lambda dates: dates.dt.quarter,
        "mth": lambda dates: dates.dt.month,
    }

    df = data.copy()

    for date_col in date_cols:
        #Convert date feature to Pandas DateTime
        df[date_col] = pd.to_datetime(df[date_col])
        dates = df[date_col]

        for date_ft in subset:
            df[date_col + '_' + date_ft] = extractors[date_ft](dates)

    #Drops original date columns from the dataset
    if drop:
        df.drop(date_cols, axis=1, inplace=True)

    return df

def extract_time(data=None, time_cols=None, subset=None, drop=True)

Returns time information in a pandas dataframe as a new set of columns added to the original data frame. For extracting DateTime features, use datasist.timeseries.extract_dates function

Parameters:

data: DataFrame or named Series

    The data set to extract time information from.

time_cols: List, Array

    Name of time columns/features in data set.

subset: List, Array

    Time features to return default to [hours, minutes and seconds].

drop: bool, Default True

    Drops the original time features from the data set.

Return:

DataFrame or Series.

Expand source code

def extract_time(data=None, time_cols=None, subset=None, drop=True):
    '''
    Splits time columns into their components and returns a copy of the
    data with the components added as new columns.
    For extracting DateTime features, use datasist.timeseries.extract_dates function
    
    Parameters:
    -----------
        data: DataFrame or named Series

            The data set holding the time columns. It is copied first.

        time_cols: List, Array

            Names of the time columns/features in data. Each one is
            converted with pandas.to_timedelta.

        subset: List, Array

            Names of the timedelta components to extract. Defaults to
            ['hours', 'minutes', 'seconds'].

        drop: bool, Default True

            Removes the original time columns from the returned data.

    Return:
    -------
        DataFrame with one '<time_col>_<component>' column per requested
        component of every time column.

    Raises:
    -------
        ValueError: if data or time_cols is None.
    '''

    if data is None:
        raise ValueError("data: Expecting a DataFrame/ numpy2d array, got 'None'")

    if time_cols is None:
        raise ValueError("time_cols: Expecting a list, series/ numpy1D array, got 'None'")

    wanted = ['hours', 'minutes', 'seconds'] if subset is None else subset
    result = data.copy()

    for col in time_cols:
        # Time values are handled as pandas timedeltas
        result[col] = pd.to_timedelta(result[col])
        parts = result[col].dt.components

        for part in wanted:
            result[col + "_" + part] = parts[part]

    if drop:
        # Leave only the derived columns behind
        result = result.drop(time_cols, axis=1)

    return result

def get_period_of_day(date_col=None)

Returns the period of the day (morning, afternoon or evening) for each value. Hours of the day that fall in [0,1,2,3,4,5,6,7,8,9,10,11,12] are mapped to mornings, [13,14,15,16] are mapped to afternoons and [17,18,19,20,21,22,23] are mapped to evenings.

Parameter:

date_col: Series, 1-D DataFrame

    The datetime feature

Returns:

Series of mapped values

Expand source code

def get_period_of_day(date_col=None):
    '''
    Maps every value of a datetime (or hour) feature to the period of the day:
    hours 0 to 12 are mapped to 'morning', hours 13 to 16 to 'afternoon' and
    every other hour to 'evening'.
    
    Parameter:
    ------------
        date_col: Series

            The datetime feature. A Series of integer dtype is taken to
            already contain hours of the day and is mapped directly; any
            other Series is converted with pandas.to_datetime first and its
            hour component is used.

    Returns:
    ----------
        Series of mapped values

    Raises:
    -------
        ValueError: if date_col is None.
    
    '''

    if date_col is None:
        raise ValueError("date_cols: Expect a date columns, got 'None'")

    # np.int no longer exists in current NumPy; is_integer_dtype also
    # recognises every integer width rather than only the platform default.
    if pd.api.types.is_integer_dtype(date_col):
        return date_col.map(_map_hours)

    date_col_hr = pd.to_datetime(date_col).dt.hour
    return date_col_hr.map(_map_hours)

def get_time_elapsed(data=None, date_cols=None, by='s', col_name=None)

Calculates the time elapsed between two specified date columns and returns the value in either seconds (s), minute (m) or hours (h).

Parameter:

data: DataFrame or name series.

    The data where the Date features are located

date_cols: List

    list of the two Date columns on which to calculate time elapsed

by: str

    specifies how time elapsed is calculated. Can be one of [h,m,s] corresponding to
    hour, minute and seconds respectively.

col_name: str

    Name to use for the created column.

Returns:

Pandas DataFrame with new column for elapsed time.

Expand source code

def get_time_elapsed(data=None, date_cols=None, by='s', col_name=None):
    '''
    Calculates the time elapsed between two specified date columns 
    and returns the value in either seconds (s), minute (m) or hours (h).
    The result is computed as first date minus second date.
    
    Parameter:
    ----------
        data: DataFrame or None.

            The data where the Date features are located. When None,
            date_cols must hold the two date sequences themselves instead
            of column names.
        
        date_cols: List

            Exactly two entries. With data: the names of the two date
            columns. Without data: the two date values/sequences.

        by: str

            specifies how time elapsed is calculated. Can be one of [h,m,s] corresponding to
            hour, minute and seconds respectively.
        
        col_name: str

            Name to use for the created column. When omitted a name is
            derived from `by` (and from the column names when data is given).

                
    Returns:
    --------
        With data: a copy of data with the new elapsed-time column added.
        Without data: a new single-column DataFrame of elapsed times.

    Raises:
    -------
        ValueError: if date_cols is None or does not hold exactly two entries.
        KeyError: if col_name is omitted and `by` is not one of 'h', 'm', 's'.
    '''

    if date_cols is None:
        raise ValueError("date_col: Expecting a list of Date columns, got 'None'")
    
    if len(date_cols) != 2:
        raise ValueError("date_col: lenght of date_cols should be 2, got '{}'".format(len(date_cols)))
    
    by_mapping = {'h': 'hrs', 'm': 'mins', 's': 'secs'}

    if data is None:
        date1 = pd.to_datetime(date_cols[0])
        date2 = pd.to_datetime(date_cols[1])

        if col_name is None:
            col_name = 'time_elapsed_' + by_mapping[by]

        time_elapsed = (date1 - date2) / np.timedelta64(1, by)

        if isinstance(time_elapsed, pd.Series):
            # to_frame keeps the index and always applies the requested name
            return time_elapsed.to_frame(name=col_name)
        # Index or scalar result: atleast_1d lets a single pair of dates work too
        return pd.DataFrame({col_name: np.atleast_1d(time_elapsed)})

    df = data.copy()

    #convert to Pandas DateTime format
    date1 = pd.to_datetime(df[date_cols[0]])
    date2 = pd.to_datetime(df[date_cols[1]])

    if col_name is None:
        col_name = by_mapping[by] + '_btw_' + date_cols[0] + '_' + date_cols[1]

    # Subtract the converted values, not the raw columns, so string dates work
    df[col_name] = (date1 - date2) / np.timedelta64(1, by)
    return df

def set_date_index(data, date_col)

Sets a specified date column as the index of the DataFrame


Parameters:
-----------------
data: DataFrame
    The dataset which index we want to change.

date_col: string
    The date column to set as index

Returns:
    DataFrame

Expand source code

def set_date_index(data, date_col):
    '''
    Sets a specified date column as the index of the DataFrame.

    Parameters:
    -----------
        data: DataFrame

            The dataset which index we want to change. It is not modified.

        date_col: str

            The date column to set as index. Its values are converted with
            pandas.to_datetime.

    Returns:
    --------
        A new DataFrame indexed by the converted date column.
    '''
    # Convert on a copy so the caller's column keeps its original values
    converted = data.copy()
    converted[date_col] = pd.to_datetime(converted[date_col])
    return converted.set_index(date_col)

def timeplot(data=None, num_cols=None, time_col=None, subplots=True, marker='.', figsize=(15, 10), y_label='Daily Totals', save_fig=False, alpha=0.5, linestyle='None')

Plot all numeric features against the time column. Interpreted as a time series plot.

Parameters:

data: DataFrame, Series.

    The data used in plotting.

num_cols: list, 1-D array.

    Numerical columns in the data set. If not provided, we automatically infer them from the data set.

time_col: str.

    The time column to plot numerical features against. We set this column as the index before plotting.

subplots: bool, Default True.

    Uses matplotlib subplots to make plots.

marker: str

    matplotlib supported marker to use in line decoration.

figsize: tuple of ints, Default (15,10)

    The figure size of the plot.

y_label: str.

    Name of the Y-axis.

save_fig: bool, Default False

    Saves the figure to the current working directory.

Returns:

matplotlib figure

Expand source code

def timeplot(data=None, num_cols=None, time_col=None, subplots=True, marker='.', 
                    figsize=(15,10), y_label='Daily Totals',save_fig=False, alpha=0.5, linestyle='None'):
    '''
    Plot all numeric features against the time column. Interpreted as a time series plot.

    Parameters:
    -----------
        data: DataFrame.

            The data used in plotting. A copy is used internally, so the
            caller's data is left untouched.

        num_cols: list, 1-D array.

            Numerical columns in the data set. If not provided, we automatically infer them from the data set.
            The time column is always excluded from this list.

        time_col: str.

            The time column to plot numerical features against. We set this column as the index before plotting.

        subplots: bool, Default True.

            If True all features are drawn as subplots of a single figure,
            otherwise every feature gets its own figure.

        marker: str

            matplotlib supported marker to use in line decoration.

        figsize: tuple of ints, Default (15,10)

            The figure size of the plot.

        y_label: str.

            Name of the Y-axis (used in subplot mode; otherwise the feature
            name is used).

        save_fig: bool, Default False

            Saves the figure to the current working directory, using one
            file name per feature.

        alpha: float, Default 0.5

            Opacity of the plotted markers/lines.

        linestyle: str, Default 'None'

            matplotlib line style.
        
    Returns:
    --------
        None. The plots are shown with matplotlib.

    Raises:
    -------
        ValueError: if data or time_col is None.
    
    '''

    if data is None:
        raise ValueError("data: Expecting a DataFrame or Series, got 'None'")

    # Validate time_col before it is needed to filter num_cols.
    if time_col is None:
        raise ValueError("time_col: Expecting a string name of time column, got 'None'")

    if num_cols is None:
        num_cols = get_num_feats(data)

    # time_col becomes the index below and cannot be plotted as a feature.
    # Filtering (instead of list.remove) also copes with time_col not being
    # in the list and leaves a caller-supplied list unmodified.
    num_cols = [col for col in num_cols if col != time_col]

    # Work on a copy so converting the time column does not alter the caller's data
    data = data.copy()
    data[time_col] = pd.to_datetime(data[time_col])
    #Set time_col as DataFrame index
    data = data.set_index(time_col)
    
    if subplots:
        axes = data[num_cols].plot(marker=marker,subplots=True, figsize=figsize, alpha=alpha, linestyle=linestyle) 
        for feature, ax in zip(num_cols, axes):
            ax.set_ylabel(y_label)
            ax.set_title("Timeseries Plot of '{}'".format(time_col))
            if save_fig:
                # All subplots share one figure; it is saved under each feature's name
                plt.savefig('fig_timeseries_plot_against_{}'.format(feature))
        # Show once, after every axis has been labelled and the files written
        plt.show()
    else:
        for feature in num_cols:
            fig = plt.figure()
            ax = fig.gca()
            data[feature].plot(marker=marker,subplots=False, figsize=figsize, alpha=alpha, linestyle=linestyle, ax=ax) 
            plt.ylabel(feature)
            ax.set_title("Timeseries Plot of '{}' vs. '{}' ".format(time_col, feature))
            if save_fig:
                plt.savefig('fig_timeseries_plot_against_{}'.format(feature))
            plt.show()