Module `datasist.feature_engineering`

The feature engineering module contains functions for extracting important features from raw data and transforming them into formats that are suitable for machine learning models.

Expand source code

'''
'''
import datetime as dt
import re
import platform
import pandas as pd
import numpy as np

# On macOS the TkAgg backend has to be selected *before* pyplot is imported.
# ``plt`` must refer to ``matplotlib.pyplot`` on every platform, because the
# plotting helpers in this module call ``plt.figure`` / ``plt.subplot``.
if platform.system() == "Darwin":
    import matplotlib
    matplotlib.use('TkAgg')

import matplotlib.pyplot as plt

import seaborn as sns

from .structdata import get_cat_feats, get_num_feats, get_date_cols
from dateutil.parser import parse


def drop_missing(data=None, percent=99):
    '''
    Remove every column whose share of missing values reaches [percent].

    Parameters:
    -------------------------
        data: Pandas DataFrame.

            Data set to inspect. Required; None is rejected.

        percent: float, Default 99

            Threshold in percent of rows. A column is removed when its percentage of missing values is greater than or equal to this value.

    Returns:

        Pandas DataFrame without the removed columns. The names of the removed columns are printed.
    '''
    if data is None:
        raise ValueError("data: Expecting a DataFrame/ numpy2d array, got 'None'")

    n_rows = data.shape[0]
    # Percentage of missing cells per column.
    pct_missing = data.isna().sum().div(n_rows).mul(100)
    too_sparse = pct_missing.index[pct_missing.values >= percent]
    print("Dropped {}".format(list(too_sparse)))
    return data.drop(too_sparse, axis=1)



def drop_redundant(data):
    '''
    Drop the columns that `_nan_in_class` reports as redundant, i.e. columns holding a single
    value in every cell, optionally mixed with NaN.

    Parameters:
    -----------------------------
        data: DataFrame.

            Data set to clean. Required; None is rejected.

    Returns:

        DataFrame without the redundant columns. The names of the removed columns are printed.
    '''
    if data is None:
        raise ValueError("data: Expecting a DataFrame/ numpy2d array, got 'None'")

    redundant = _nan_in_class(data)
    print("Dropped {}".format(redundant))
    return data.drop(redundant, axis=1)
    

# def fill_with_model(xtrain, xtest, estimator):

    

def _nan_in_class(data):
    cols = []
    for col in data.columns:
        if len(data[col].unique()) == 1:
            cols.append(col)

        if len(data[col].unique()) == 2:
            if np.nan in list(data[col].unique()):
                cols.append(col)

    return cols


def fill_missing_cats(data=None, cat_features=None, missing_encoding=None, missing_col=False):
    '''
    Fill missing values using the mode of the categorical features.

    Parameters:
    ------------------------
        data: DataFrame.

            Data set to perform operation on. It is not modified; a new DataFrame is returned.

        cat_features: List, Series, Array.

            categorical features to perform operation on. If not provided, they are inferred from the dataset with get_cat_feats.

        missing_encoding: List, Series, Array. Default ['', ' ', -99, -999]

            Values used in place of missing. They are replaced by NaN in every column of the data set before filling.

        missing_col: bool, Default False

            Creates a new column [feature]_missing_value to capture the missing values. 1 if missing and 0 otherwise. This can sometimes help a machine learning model.

    Returns:

        DataFrame with the categorical features filled. A feature that has no observed value at all is left unfilled.
    '''

    if data is None:
        raise ValueError("data: Expecting a DataFrame/ numpy2d array, got 'None'")

    if cat_features is None:
        cat_features = get_cat_feats(data)

    if missing_encoding is None:
        missing_encoding = ['', ' ', -99, -999]

    # Change all possible missing values to NaN. `np.nan` is used because the
    # `np.NaN` alias was removed in NumPy 2.0. replace() returns a new frame,
    # so the caller's data is left untouched.
    df = data.replace(missing_encoding, np.nan)

    for feat in cat_features:
        if missing_col:
            df[feat + '_missing_value'] = df[feat].isna().astype('int64')
        modes = df[feat].mode()
        if modes.empty:
            # Every value is missing, so there is no mode to fill with.
            continue
        df[feat] = df[feat].fillna(modes.iloc[0])

    return df


def fill_missing_num(data=None, num_features=None, method='mean', missing_col=False):
    '''
    Fill missing values in numerical columns with the specified [method] value.

    Parameters:
        ------------------------------
        data: DataFrame.

            The data set to fill. It is not modified; a filled copy is returned.

        num_features: list.

            List of columns to fill. If not provided, the numerical columns are inferred with get_num_feats and only those containing missing values are filled.

        method: str, Default 'mean'.

            method to use in calculating fill value. One of 'mean', 'median' or 'mode'.

        missing_col: bool, Default False

            Creates a new column [feature]_missing_value to capture the missing values. 1 if missing and 0 otherwise. This can sometimes help a machine learning model.

    Returns:

        DataFrame with the selected columns filled.

    Raises:

        ValueError if data is None or method is not one of the supported values.
    '''
    if data is None:
        raise ValueError("data: Expecting a DataFrame/ numpy2d array, got 'None'")

    # Validate once up front, using equality rather than `is`, which only
    # worked through string interning.
    if method not in ('mean', 'median', 'mode'):
        raise ValueError("method: must specify a fill method, one of [mean, mode or median]'")

    if num_features is None:
        num_features = get_num_feats(data)
        # get numerical features with missing values
        missing_counts = data[num_features].isna().sum()
        features = list(missing_counts[missing_counts > 0].index)
    else:
        # Explicitly requested columns used to hit an unbound `features` name.
        features = list(num_features)

    df = data.copy()
    for feat in features:
        if missing_col:
            df[feat + '_missing_value'] = df[feat].isna().astype('int64')

        if method == 'mean':
            fill_value = df[feat].mean()
        elif method == 'median':
            fill_value = df[feat].median()
        else:
            modes = df[feat].mode()
            if modes.empty:
                # Column is entirely missing: there is no mode to fill with.
                continue
            fill_value = modes.iloc[0]

        # Assign the result back instead of a chained in-place fillna, which
        # does not update the frame under pandas copy-on-write.
        df[feat] = df[feat].fillna(fill_value)

    return df


   


def merge_groupby(data=None, cat_features=None, statistics=None, col_to_merge=None):
    '''
    Performs a groupby on the specified categorical features and merges
    the result to the original dataframe.

    Parameter:
    -----------------------

        data: DataFrame

            Data set to perform operation on. It is not modified.

        cat_features: list, series, 1D-array

            categorical features to groupby. If not provided, they are inferred with get_cat_feats.

        statistics: list, series, 1D-array, Default ['mean', 'count']

            aggregates to perform on grouped data.

        col_to_merge: str

            The column to aggregate within each group. Must be present in the data set.

    Returns:

        Dataframe with one extra column per (feature, statistic) pair, named
        [feature]_[col_to_merge]_[statistic].

    Raises:

        ValueError if data or col_to_merge is None.
    '''
    if data is None:
        raise ValueError("data: Expecting a DataFrame/ numpy2d array, got 'None'")

    if col_to_merge is None:
        raise ValueError("col_to_merge: Expecting a string [column to merge on], got 'None'")

    if statistics is None:
        statistics = ['mean', 'count']
    elif isinstance(statistics, str):
        statistics = [statistics]

    if cat_features is None:
        # Group by the categorical columns; the numerical ones used to be
        # picked here by mistake.
        cat_features = get_cat_feats(data)

    df = data.copy()

    for cat in cat_features:
        # Select the column before aggregating so unrelated (possibly
        # non-numeric) columns are never aggregated.
        grouped = data.groupby(cat)[col_to_merge].agg(list(statistics))
        # Prefix every statistic so results for different features never collide.
        grouped.columns = ['{}_{}_{}'.format(cat, col_to_merge, stat) for stat in grouped.columns]
        # merge the data sets
        df = df.merge(grouped.reset_index(), how='left', on=cat)

    return df


def get_qcut(data=None, col=None, q=None, duplicates='drop', return_type='float64'):
    '''
    Bin a column into quantiles with pandas qcut and return, for every row,
    the left edge of the bin the row falls into.

    Parameter:
    -------------

        data: DataFrame

            Data set to perform operation on.

        col: str

            column to cut/binnarize.

        q: integer or array of quantiles

            Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
            array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles.

        duplicates: Default 'drop',

            Passed to pandas qcut; 'drop' removes non-unique bin edges.

        return_type: dtype, Default (float64)

            Dtype the left edges are cast to before being returned.

    Returns:
    --------

        Series named [col] holding the left bin edge of each row.

    '''
    binned = pd.qcut(data[col], q=q, duplicates=duplicates)
    # Work on the textual form of each interval, e.g. "(0.999, 2.5]".
    interval_text = binned.to_frame().astype('str')[col]
    # Keep what precedes the comma and strip the opening bracket.
    left_edges = interval_text.str.split(',').apply(lambda parts: parts[0][1:])
    return left_edges.astype(return_type)


def create_balanced_data(data=None, target=None, categories=None, class_sizes=None, replacement=False ):
    '''
    Creates a balanced data set from an imbalanced one. Used in a classification task.

    Parameter:
    ----------------------------
        data: DataFrame.

            The imbalanced dataset. It is not modified.

        target: str

            Name of the target column.

        categories: list

            Unique categories in the target column. If not set, the unique values of the target column are used.

        class_sizes: list

            Number of rows to draw for each class. Must have one entry per category, in the same order as the categories parameter. If not set, every class gets int(number of rows / number of unique target values) rows.

        replacement: bool, Default False.

            Sample with or without replacement. A class that has fewer rows than its requested size is always sampled with replacement, since the size could not be reached otherwise.

    Returns:

        DataFrame with the sampled rows in shuffled order and a fresh 0..n-1 index.

    Raises:

        ValueError if data or target is None, or if class_sizes and categories differ in length.
    '''
    if data is None:
        raise ValueError("data: Expecting a DataFrame/ numpy2d array, got 'None'")

    if target is None:
        raise ValueError("target: Expecting a String got 'None'")

    if categories is None:
        categories = list(data[target].unique())

    if class_sizes is None:
        # set size for each class to same value
        per_class = int(data.shape[0] / len(data[target].unique()))
        class_sizes = [per_class] * len(categories)
    elif len(class_sizes) != len(categories):
        raise ValueError("class_sizes: Expecting one size per category")

    sampled = []
    for cat, class_size in zip(categories, class_sizes):
        members = data[data[target] == cat]
        # Honour `replacement` (it used to be ignored), but fall back to
        # sampling with replacement when the class is too small to supply
        # `class_size` distinct rows.
        with_replacement = bool(replacement) or class_size > len(members)
        sampled.append(members.sample(class_size, replace=with_replacement))

    # concat data together, then shuffle the rows
    balanced = pd.concat(sampled, ignore_index=True)
    return balanced.sample(frac=1).reset_index(drop=True)



def to_date(data):
    '''
    Convert every column reported by get_date_cols to pandas Datetime format.

    The conversion is written back into `data` itself, and the same object is returned.
    '''
    for date_col in get_date_cols(data):
        data[date_col] = pd.to_datetime(data[date_col])

    return data


def haversine_distance(lat1, long1, lat2, long2):
    '''
    Great-circle (Haversine) distance between a start and an end location.

    Coordinates are given in degrees and converted to radians internally. The
    result is in kilometres, based on an average Earth radius of 6371 km.

    Parameter:
    ---------------------------
        lat1: scalar,float

            Start point latitude of the location.

        long1: scalar,float

            Start point longitude of the location.

        lat2: scalar,float

            End point latitude of the location.

        long2: scalar,float

            End point longitude of the location.

    Returns:

        Series: The Haversine distance between (lat1, long1) and (lat2, long2)

    '''
    earth_radius_km = 6371  # average radius
    phi1, lam1, phi2, lam2 = (np.radians(value) for value in (lat1, long1, lat2, long2))
    d_phi = phi2 - phi1
    d_lam = lam2 - lam1
    lat_term = np.sin(d_phi * 0.5) ** 2
    lng_term = np.cos(phi1) * np.cos(phi2) * np.sin(d_lam * 0.5) ** 2
    central_angle = np.arcsin(np.sqrt(lat_term + lng_term))
    return pd.Series(2 * earth_radius_km * central_angle)


def manhattan_distance(lat1, long1, lat2, long2):
    '''
    Manhattan distance between two points: the absolute latitude difference
    plus the absolute longitude difference.

    Parameter:
    -------------------
        lat1: scalar,float

            Start point latitude of the location.

        long1: scalar,float

            Start point longitude of the location.

        lat2: scalar,float

            End point latitude of the location.

        long2: scalar,float

            End point longitude of the location.

    Returns: Series

        The Manhattan distance between (lat1, long1) and (lat2, long2), in the units of the inputs.

    '''
    lat_gap = np.abs(lat2 - lat1)
    long_gap = np.abs(long1 - long2)
    return pd.Series(lat_gap + long_gap)
    

def bearing(lat1, long1, lat2, long2):
    '''
    Calculates the Bearing between two points.
    The bearing is the compass direction to travel from a starting point, expressed in degrees
    clockwise from north and normalised to the range 0 to 360.

    Parameter:
    -------------------------
        lat1: scalar,float

            Start point latitude of the location, in degrees.

        long1: scalar,float

            Start point longitude of the location, in degrees.

        lat2: scalar,float

            End point latitude of the location, in degrees.

        long2: scalar,float

            End point longitude of the location, in degrees.

    Returns: Series

        The Bearing from (lat1, long1) to (lat2, long2)

    '''
    # Convert first so the longitude delta also works for list inputs.
    lat1, long1, lat2, long2 = map(np.radians, (lat1, long1, lat2, long2))
    long_delta = long2 - long1
    y = np.sin(long_delta) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(long_delta)
    # arctan2 yields (-180, 180]; wrap negative angles into the documented 0 to 360 range.
    bearing_deg = np.mod(np.degrees(np.arctan2(y, x)), 360)
    return pd.Series(bearing_deg)
    

def get_location_center(point1, point2):
    '''
    Calculates the center between two points as their element-wise average.

    Parameter:
    ---------------------------
        point1: list, series, scalar

            Coordinate(s) of the first point.

        point2: list, series, scalar

            Coordinate(s) of the second point.

    Returns: Series

        The center between point1 and point2

    '''
    # `+` concatenates plain lists/tuples instead of adding them, so turn
    # those into arrays first. Series and scalars are used as they are.
    if isinstance(point1, (list, tuple)):
        point1 = np.asarray(point1)
    if isinstance(point2, (list, tuple)):
        point2 = np.asarray(point2)

    center = (point1 + point2) / 2
    return pd.Series(center)

def log_transform(data, columns, plot=True, figsize=(12,6)):
    '''
    Applies a log(1 + x) transformation to the given columns to bring their
    distribution closer to the gaussian distribution.

    Parameter:
    -----------------------------------------
    data: DataFrame.
        Data to Log transform. It is not modified.

    columns: List, Series
        Columns to be transformed to normality using log transformation

    plot: bool, default True
        Plots a before and after log transformation plot for every column

    figsize: tuple, default (12, 6)
        Size of each before/after figure

    Returns:
        Log-transformed dataframe

    Raises:
        ValueError if data or columns is None.
    '''

    if data is None:
        raise ValueError("data: Expecting a DataFrame/ numpy2d array, got 'None'")

    if columns is None:
        raise ValueError("columns: Expecting at least a column in the list of columns but got 'None'")

    df = data.copy()
    for col in columns:
        df[col] = np.log1p(df[col])

    if plot:
        for col in columns:
            _ = plt.figure(figsize = figsize)
            plt.subplot(1, 2, 1)
            # The "before" panel reports the skewness of the untransformed data.
            sns.distplot(data[col], color="m", label="Skewness : %.2f" % (data[col].skew()))
            plt.title('Distribution of ' + col + " before Log transformation")
            plt.legend(loc='best')

            plt.subplot(1, 2, 2)
            sns.distplot(df[col], color="m", label="Skewness : %.2f" % (df[col].skew()))
            plt.title('Distribution of ' + col + " after Log transformation")
            plt.legend(loc='best')
            # pad has to be given by keyword.
            plt.tight_layout(pad=2)
            plt.show()

    return df


def convert_dtype(df):
    '''
    Convert text columns that actually hold numbers or dates to the matching datatype.

    Every text column is stripped of leading/trailing whitespace, then its
    smallest string decides the target type: all digits -> int, digits with a
    single '.' -> float, parseable by dateutil -> datetime. Any other text
    column stays text (stripped). Non-text columns are returned unchanged.

    Parameter:
    ---------------------------
    df: DataFrame
        Dataset to convert data type. Must not contain missing values.


    Returns:
    -----------------
        DataFrame with the same columns, in the same order.

    Raises:
    -----------------
        ValueError if df contains missing values, or if a column cannot be
        converted to the type inferred from its smallest string.

    Example:
    df = pd.DataFrame({'Age': ['20', '21', '19'], 'Height': ['1.5', '2.0', '1.75']})
    conv = convert_dtype(df)   # 'Age' becomes integer, 'Height' becomes float
    '''
    if df.isnull().any().any():
        raise ValueError("DataFrame contain missing values")

    #Function to handle datetime dtype
    def is_date(string, fuzzy=False):
        try:
            parse(string, fuzzy=fuzzy)
            return True
        except ValueError:
            return False

    # Names under which pandas reports text columns (object, plus the dedicated string dtypes).
    text_dtypes = ('object', 'str', 'string')
    converted = []
    for i in range(df.shape[1]):
        val = df.iloc[:, i]
        if str(val.dtype) in text_dtypes and len(val) > 0:
            # Strip leading and trailing whitespace from every string.
            val = val.apply(lambda x: re.sub(r"^\s+|\s+$", "", x, flags=re.UNICODE))
            smallest = val.min()
            try:
                if smallest.isdigit():  # integer-like strings
                    val = val.astype(int)
                elif smallest.replace('.', '', 1).isdigit():  # float-like strings
                    val = val.astype(float)
                elif is_date(smallest, fuzzy=False):  # date-like strings
                    val = pd.to_datetime(val)
            except ValueError as err:
                # Keep the underlying error attached; any other exception
                # propagates unchanged rather than being replaced by an
                # empty Exception.
                raise ValueError("DataFrame columns contain one or more DataType") from err
        converted.append(val)

    if not converted:
        # A frame without columns has nothing to concatenate.
        return df.copy()

    # axis must be passed by keyword.
    return pd.concat(converted, axis=1)
            


def bin_age(data, feature, bins, labels, fill_missing = None, drop_original = False):

    '''
    Categorize age data into separate bins

    Parameter:
    -----------------------------------------
    data: DataFrame.
        Data in which the feature(s) to be binned exist. It is not modified.

    feature: List, Series or str
        Column(s) to be binned. A single column name may be given as a string.

    bins: List, numpy.ndarray
        Bin edges passed to pandas cut.
        The number of edges must be one greater than the number of labels.

    labels: List, Series
        Names given to the various categories

    fill_missing: str, default None
        How to fill missing values before binning. The fill value is truncated to an integer.
        None : do not fill; missing values raise a ValueError
        mean : feature average
        mode : most occurring value in the feature (the smallest one when tied)
        median : middle point in the feature

    drop_original: bool
        Drops original feature after binning


    Returns:
        A copy of the data with a [feature]_binned column per binned feature.

    Raises:
        ValueError if fill_missing is not a supported value, if a feature has
        missing values while fill_missing is None, or if a feature to fill has
        no non-missing value.
    '''

    # Reject unknown options up front; they used to skip the binning silently.
    if fill_missing not in (None, 'mean', 'mode', 'median'):
        raise ValueError("fill_missing: Expecting one of [None, 'mean', 'mode', 'median']")

    if isinstance(feature, str):
        # Iterating a bare string would treat each character as a column.
        feature = [feature]

    df = data.copy()
    for col in feature:
        if fill_missing is None:
            if df[col].isnull().any():
                raise ValueError("data: Mising Value found in table")
        else:
            if not df[col].notna().any():
                raise ValueError("data: cannot fill '{}', it has no non-missing values".format(col))
            if fill_missing == 'mean':
                fill_value = df[col].mean()
            elif fill_missing == 'median':
                fill_value = df[col].median()
            else:
                # mode() can return several values; take the first one.
                fill_value = df[col].mode().iloc[0]
            # Assign back instead of a chained in-place fillna, which does
            # not update the frame under pandas copy-on-write.
            df[col] = df[col].fillna(int(fill_value))

        df[col + '_binned'] = pd.cut(x=df[col], bins=bins, labels=labels)

        if drop_original:
            df.drop(columns = col, inplace = True)

    return df

Functions

def bearing(lat1, long1, lat2, long2)

Calculates the Bearing between two points. The bearing is the compass direction to travel from a starting point, and must be within the range 0 to 360.

Parameter:

lat1: scalar,float

    Start point latitude of the location.

lat2: scalar,float

    End point latitude of the location.

long1: scalar,float

    Start point longitude of the location.

long2: scalar,float

    End point longitude of the location.

Returns: Series

The Bearing between (lat1, lat2) and (long1, long2)

Expand source code

def bearing(lat1, long1, lat2, long2):
    '''
    Calculates the Bearing between two points.
    The bearing is the compass direction to travel from a starting point, expressed in degrees
    clockwise from north and normalised to the range 0 to 360.

    Parameter:
    -------------------------
        lat1: scalar,float

            Start point latitude of the location, in degrees.

        long1: scalar,float

            Start point longitude of the location, in degrees.

        lat2: scalar,float

            End point latitude of the location, in degrees.

        long2: scalar,float

            End point longitude of the location, in degrees.

    Returns: Series

        The Bearing from (lat1, long1) to (lat2, long2)

    '''
    # Convert first so the longitude delta also works for list inputs.
    lat1, long1, lat2, long2 = map(np.radians, (lat1, long1, lat2, long2))
    long_delta = long2 - long1
    y = np.sin(long_delta) * np.cos(lat2)
    x = np.cos(lat1) * np.sin(lat2) - np.sin(lat1) * np.cos(lat2) * np.cos(long_delta)
    # arctan2 yields (-180, 180]; wrap negative angles into the documented 0 to 360 range.
    bearing_deg = np.mod(np.degrees(np.arctan2(y, x)), 360)
    return pd.Series(bearing_deg)

def bin_age(data, feature, bins, labels, fill_missing=None, drop_original=False)


Categorize age data into separate bins
    
Parameter:
--------------------
    data: DataFrame, Series.

        Data for which feature to be binned exist.
    
    feature: List, Series

        Columns to be binned

    Bins: List, numpy.ndarray

        Specifies the bin edges that separate the categories. The number of bin edges must be one greater than the number of labels.

    labels: List, Series

        Name identified to the various categories

    fill_missing: str, Default None

        mean : feature average
        mode : most occurring value in the feature
        median : middle point in the feature

    drop_original: bool
        Drops the original feature after binning

Returns:

    Binned DataFrame of age

Expand source code

def bin_age(data, feature, bins, labels, fill_missing = None, drop_original = False):

    '''
    Categorize age data into separate bins

    Parameter:
    -----------------------------------------
    data: DataFrame.
        Data in which the feature(s) to be binned exist. It is not modified.

    feature: List, Series or str
        Column(s) to be binned. A single column name may be given as a string.

    bins: List, numpy.ndarray
        Bin edges passed to pandas cut.
        The number of edges must be one greater than the number of labels.

    labels: List, Series
        Names given to the various categories

    fill_missing: str, default None
        How to fill missing values before binning. The fill value is truncated to an integer.
        None : do not fill; missing values raise a ValueError
        mean : feature average
        mode : most occurring value in the feature (the smallest one when tied)
        median : middle point in the feature

    drop_original: bool
        Drops original feature after binning


    Returns:
        A copy of the data with a [feature]_binned column per binned feature.

    Raises:
        ValueError if fill_missing is not a supported value, if a feature has
        missing values while fill_missing is None, or if a feature to fill has
        no non-missing value.
    '''

    # Reject unknown options up front; they used to skip the binning silently.
    if fill_missing not in (None, 'mean', 'mode', 'median'):
        raise ValueError("fill_missing: Expecting one of [None, 'mean', 'mode', 'median']")

    if isinstance(feature, str):
        # Iterating a bare string would treat each character as a column.
        feature = [feature]

    df = data.copy()
    for col in feature:
        if fill_missing is None:
            if df[col].isnull().any():
                raise ValueError("data: Mising Value found in table")
        else:
            if not df[col].notna().any():
                raise ValueError("data: cannot fill '{}', it has no non-missing values".format(col))
            if fill_missing == 'mean':
                fill_value = df[col].mean()
            elif fill_missing == 'median':
                fill_value = df[col].median()
            else:
                # mode() can return several values; take the first one.
                fill_value = df[col].mode().iloc[0]
            # Assign back instead of a chained in-place fillna, which does
            # not update the frame under pandas copy-on-write.
            df[col] = df[col].fillna(int(fill_value))

        df[col + '_binned'] = pd.cut(x=df[col], bins=bins, labels=labels)

        if drop_original:
            df.drop(columns = col, inplace = True)

    return df

def convert_dtype(df)

Converts the datatype of a feature to its original datatype.
If a feature is represented as a string while its underlying datatype is an integer, a float,
or a datetime, the convert_dtype() function detects this. It iterates over the feature(s) in a
pandas dataframe and converts each feature to its appropriate datatype.

Parameter:
---------------
df: DataFrame, Series
    Dataset to convert data type

Returns:

    DataFrame or Series.

Expand source code

def convert_dtype(df):
    '''
    Convert text columns that actually hold numbers or dates to the matching datatype.

    Every text column is stripped of leading/trailing whitespace, then its
    smallest string decides the target type: all digits -> int, digits with a
    single '.' -> float, parseable by dateutil -> datetime. Any other text
    column stays text (stripped). Non-text columns are returned unchanged.

    Parameter:
    ---------------------------
    df: DataFrame
        Dataset to convert data type. Must not contain missing values.


    Returns:
    -----------------
        DataFrame with the same columns, in the same order.

    Raises:
    -----------------
        ValueError if df contains missing values, or if a column cannot be
        converted to the type inferred from its smallest string.

    Example:
    df = pd.DataFrame({'Age': ['20', '21', '19'], 'Height': ['1.5', '2.0', '1.75']})
    conv = convert_dtype(df)   # 'Age' becomes integer, 'Height' becomes float
    '''
    if df.isnull().any().any():
        raise ValueError("DataFrame contain missing values")

    #Function to handle datetime dtype
    def is_date(string, fuzzy=False):
        try:
            parse(string, fuzzy=fuzzy)
            return True
        except ValueError:
            return False

    # Names under which pandas reports text columns (object, plus the dedicated string dtypes).
    text_dtypes = ('object', 'str', 'string')
    converted = []
    for i in range(df.shape[1]):
        val = df.iloc[:, i]
        if str(val.dtype) in text_dtypes and len(val) > 0:
            # Strip leading and trailing whitespace from every string.
            val = val.apply(lambda x: re.sub(r"^\s+|\s+$", "", x, flags=re.UNICODE))
            smallest = val.min()
            try:
                if smallest.isdigit():  # integer-like strings
                    val = val.astype(int)
                elif smallest.replace('.', '', 1).isdigit():  # float-like strings
                    val = val.astype(float)
                elif is_date(smallest, fuzzy=False):  # date-like strings
                    val = pd.to_datetime(val)
            except ValueError as err:
                # Keep the underlying error attached; any other exception
                # propagates unchanged rather than being replaced by an
                # empty Exception.
                raise ValueError("DataFrame columns contain one or more DataType") from err
        converted.append(val)

    if not converted:
        # A frame without columns has nothing to concatenate.
        return df.copy()

    # axis must be passed by keyword.
    return pd.concat(converted, axis=1)

def create_balanced_data(data=None, target=None, categories=None, class_sizes=None, replacement=False)

Creates a balanced data set from an imbalanced one. Used in a classification task.

Parameter:

data: DataFrame, name series.

    The imbalanced dataset.

target: str

    Name of the target column.

categories: list

    Unique categories in the target column. If not set, we infer the unique categories from the column.

class_sizes: list

    Size of each specified class. Must be in the same order as the categories parameter.

replacement: bool, Default False.

    samples with or without replacement.

Expand source code

def create_balanced_data(data=None, target=None, categories=None, class_sizes=None, replacement=False ):
    '''
    Creates a balanced data set from an imbalanced one. Used in a classification task.

    Parameter:
    ----------------------------
        data: DataFrame.

            The imbalanced dataset. It is not modified.

        target: str

            Name of the target column.

        categories: list

            Unique categories in the target column. If not set, the unique values of the target column are used.

        class_sizes: list

            Number of rows to draw for each class. Must have one entry per category, in the same order as the categories parameter. If not set, every class gets int(number of rows / number of unique target values) rows.

        replacement: bool, Default False.

            Sample with or without replacement. A class that has fewer rows than its requested size is always sampled with replacement, since the size could not be reached otherwise.

    Returns:

        DataFrame with the sampled rows in shuffled order and a fresh 0..n-1 index.

    Raises:

        ValueError if data or target is None, or if class_sizes and categories differ in length.
    '''
    if data is None:
        raise ValueError("data: Expecting a DataFrame/ numpy2d array, got 'None'")

    if target is None:
        raise ValueError("target: Expecting a String got 'None'")

    if categories is None:
        categories = list(data[target].unique())

    if class_sizes is None:
        # set size for each class to same value
        per_class = int(data.shape[0] / len(data[target].unique()))
        class_sizes = [per_class] * len(categories)
    elif len(class_sizes) != len(categories):
        raise ValueError("class_sizes: Expecting one size per category")

    sampled = []
    for cat, class_size in zip(categories, class_sizes):
        members = data[data[target] == cat]
        # Honour `replacement` (it used to be ignored), but fall back to
        # sampling with replacement when the class is too small to supply
        # `class_size` distinct rows.
        with_replacement = bool(replacement) or class_size > len(members)
        sampled.append(members.sample(class_size, replace=with_replacement))

    # concat data together, then shuffle the rows
    balanced = pd.concat(sampled, ignore_index=True)
    return balanced.sample(frac=1).reset_index(drop=True)

def drop_missing(data=None, percent=99)

Drops missing columns with [percent] of missing data.

Parameters:

data: Pandas DataFrame or Series.

percent: float, Default 99

    Percentage of missing values to be in a column before it is eligible for removal.

Returns

Pandas DataFrame or Series.

Expand source code

def drop_missing(data=None, percent=99):
    '''
    Remove every column whose share of missing values reaches [percent].

    Parameters:
    -------------------------
        data: Pandas DataFrame.

            Data set to inspect. Required; None is rejected.

        percent: float, Default 99

            Threshold in percent of rows. A column is removed when its percentage of missing values is greater than or equal to this value.

    Returns:

        Pandas DataFrame without the removed columns. The names of the removed columns are printed.
    '''
    if data is None:
        raise ValueError("data: Expecting a DataFrame/ numpy2d array, got 'None'")

    n_rows = data.shape[0]
    # Percentage of missing cells per column.
    pct_missing = data.isna().sum().div(n_rows).mul(100)
    too_sparse = pct_missing.index[pct_missing.values >= percent]
    print("Dropped {}".format(list(too_sparse)))
    return data.drop(too_sparse, axis=1)

def drop_redundant(data)

Removes features that have the same value in every cell. A feature is also dropped if NaN is its only other unique value.

Parameters:

data: DataFrame or named series.

Returns

DataFrame or named series.

Expand source code

def drop_redundant(data):
    '''
    Drop the features that carry a single value in every cell
    (features whose only other value is NaN are dropped too).

    The columns to remove are the ones reported by the module helper
    `_nan_in_class`; their names are printed before removal.

    Parameters:
    -----------------------------
        data: DataFrame or named series.

    Returns:

        DataFrame or named series without the redundant columns.
    '''
    if data is None:
        raise ValueError("data: Expecting a DataFrame/ numpy2d array, got 'None'")

    redundant = _nan_in_class(data)
    print("Dropped {}".format(redundant))

    return data.drop(redundant, axis=1)

def fill_missing_cats(data=None, cat_features=None, missing_encoding=None, missing_col=False)

Fill missing values using the mode of the categorical features.

Parameters:

data: DataFrame or name Series.

    Data set to perform operation on.

cat_features: List, Series, Array.

    categorical features to perform operation on. If not provided, we automatically infer the categoricals from the dataset.

missing_encoding: List, Series, Array.

    Values used in place of missing. Popular formats are [-1, -999, -99, '', ' ']

missing_col: bool, Default False
    Creates a new column to capture the missing values. 1 if missing and 0 otherwise. This can sometimes help a machine learning model.

Expand source code

def fill_missing_cats(data=None, cat_features=None, missing_encoding=None, missing_col=False):
    '''
    Fill missing values using the mode of the categorical features.

    Parameters:
    ------------------------
        data: DataFrame or name Series.

            Data set to perform operation on. It is copied, never modified.

        cat_features: List, Series, Array.

            categorical features to perform operation on. If not provided, we automatically infer the categoricals from the dataset.

        missing_encoding: List, Series, Array.

            Values used in place of missing. Popular formats are [-1, -999, -99, '', ' '].
            Defaults to ['', ' ', -99, -999]. These values are turned into NaN in
            every column of the copy, not only in the categorical ones.

        missing_col: bool, Default False

            Creates a new column to capture the missing values. 1 if missing and 0 otherwise. This can sometimes help a machine learning model.

    Returns:

        DataFrame with the categorical features filled. A feature that contains
        only missing values has no mode and is left unfilled.
    '''

    if data is None:
        raise ValueError("data: Expecting a DataFrame/ numpy2d array, got 'None'")

    if cat_features is None:
        cat_features = get_cat_feats(data)

    if missing_encoding is None:
        missing_encoding = ['', ' ', -99, -999]

    df = data.copy()
    # change all possible missing values to NaN
    # (np.nan is used because the np.NaN alias no longer exists in NumPy >= 2.0)
    df = df.replace(missing_encoding, np.nan)

    for feat in cat_features:
        if missing_col:
            df[feat + '_missing_value'] = df[feat].isna().astype('int64')

        modes = df[feat].mode()
        if modes.empty:
            # nothing but missing values in this feature: no mode to fill with
            continue
        df[feat] = df[feat].fillna(modes.iloc[0])

    return df

def fill_missing_num(data=None, num_features=None, method='mean', missing_col=False)

fill missing values in numerical columns with specified [method] value

Parameters

data

DataFrame or name Series.

The data set to fill

num_features

list.

List of columns to fill

method

str, Default 'mean'.

method to use in calculating fill value.

missing_col

bool, Default False

Creates a new column to capture the missing values. 1 if missing and 0 otherwise. This can sometimes help a machine learning model.

Expand source code

def fill_missing_num(data=None, num_features=None, method='mean', missing_col=False):
    '''
    fill missing values in numerical columns with specified [method] value

    Parameters:
        ------------------------------
        data: DataFrame or name Series.

            The data set to fill. It is copied, never modified.

        num_features: list.

            List of columns to fill. If not provided, the numerical columns
            that contain at least one missing value are used.

        method: str, Default 'mean'.

            method to use in calculating fill value. One of 'mean', 'median' or 'mode'.

        missing_col: bool, Default False

            Creates a new column to capture the missing values. 1 if missing and 0 otherwise. This can sometimes help a machine learning model.

    Returns:

        DataFrame with the selected columns filled.

    Raises:

        ValueError if data is None or method is not one of the supported names.
    '''
    if data is None:
        raise ValueError("data: Expecting a DataFrame/ numpy2d array, got 'None'")

    # validate once, up front, instead of per column inside the loop
    if method not in ('mean', 'median', 'mode'):
        raise ValueError("method: must specify a fill method, one of [mean, mode or median]'")

    if num_features is None:
        num_features = get_num_feats(data)
        # keep only the numerical features that actually have missing values
        missing_counts = data[num_features].isna().sum()
        features = list(missing_counts[missing_counts > 0].index)
    elif isinstance(num_features, str):
        # a single column name: do not iterate over its characters
        features = [num_features]
    else:
        features = list(num_features)

    df = data.copy()
    for feat in features:
        if missing_col:
            df[feat + '_missing_value'] = df[feat].isna().astype('int64')

        if method == 'mean':
            fill_value = df[feat].mean()
        elif method == 'median':
            fill_value = df[feat].median()
        else:
            modes = df[feat].mode()
            if modes.empty:
                # column holds only missing values: there is no mode
                continue
            fill_value = modes.iloc[0]

        # assign back instead of an in-place fill on a column selection,
        # which does not reach the frame under pandas Copy-on-Write
        df[feat] = df[feat].fillna(fill_value)

    return df

def get_location_center(point1, point2)

Calculates the center between two points.

Parameter:

point1: list, series, scalar

    First point (for example the start latitude or longitude of the location).

point2: list, series, scalar

    Second point (for example the end latitude or longitude of the location).

Returns: Series

The center between point1 and point2

Expand source code

def get_location_center(point1, point2):
    '''
    Calculates the center between two points.

    Parameter:
    ---------------------------
        point1: list, series, scalar

            First point (for example the start latitude or longitude).

        point2: list, series, scalar

            Second point (for example the end latitude or longitude).

    Returns: Series

        The center between point1 and point2, i.e. (point1 + point2) / 2
        computed element-wise.

    '''
    def _as_arithmetic(point):
        # `+` concatenates plain lists/tuples; convert them so the sum is element-wise
        if isinstance(point, (list, tuple)):
            return np.asarray(point)
        return point

    center = (_as_arithmetic(point1) + _as_arithmetic(point2)) / 2
    return pd.Series(center)

def get_qcut(data=None, col=None, q=None, duplicates='drop', return_type='float64')

Cuts a series into bins using the pandas qcut function and returns the resulting bins as a series for merging.

Parameter:

data: DataFrame, named Series

    Data set to perform operation on.

col: str

    column to cut/binnarize.

q: integer or array of quantiles

    Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
    array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles.

duplicates: Default 'drop',

    If bin edges are not unique drop non-uniques.

return_type: dtype, Default (float64)

    Dtype of series to return. One of [float64, str, int64]

Returns:

Series, 1D-Array

Expand source code

def get_qcut(data=None, col=None, q=None, duplicates='drop', return_type='float64'):
    '''
    Bins a column with pandas qcut and returns, for every row, the left
    edge of the bin it falls in, so the result can be merged as a feature.

    Parameter:
    -------------

        data: DataFrame, named Series

            Data set to perform operation on.

        col: str

            column to cut/binnarize.

        q: integer or array of quantiles

            Number of quantiles. 10 for deciles, 4 for quartiles, etc. Alternately
            array of quantiles, e.g. [0, .25, .5, .75, 1.] for quartiles.

        duplicates: Default 'drop',

            If bin edges are not unique drop non-uniques.

        return_type: dtype, Default (float64)

            Dtype of series to return. One of [float64, str, int64]

    Returns:
    --------

        Series, 1D-Array

    '''

    def _left_edge(parts):
        # parts[0] is the text before the comma, e.g. "(0.999"; strip the bracket
        return parts[0][1:]

    binned = pd.qcut(data[col], q=q, duplicates=duplicates)
    # work on the textual form of the intervals, e.g. "(0.999, 2.75]"
    interval_text = binned.to_frame().astype('str')[col]
    left_edges = interval_text.str.split(',').apply(_left_edge)

    return left_edges.astype(return_type)

def haversine_distance(lat1, long1, lat2, long2)

Calculates the Haversine distance between two location with latitude and longitude. The haversine distance is the great-circle distance between two points on a sphere given their longitudes and latitudes.

Parameter:

lat1: scalar,float

    Start point latitude of the location.

lat2: scalar,float

    End point latitude of the location.

long1: scalar,float

    Start point longitude of the location.

long2: scalar,float

    End point longitude of the location.

Returns:

Series: The Haversine distance between (lat1, long1) and (lat2, long2)

Expand source code

def haversine_distance(lat1, long1, lat2, long2):
    '''
    Great-circle (Haversine) distance between two locations given as
    latitude/longitude pairs in degrees, on a sphere of radius 6371 km.

    Parameter:
    ---------------------------
        lat1: scalar,float

            Start point latitude of the location.

        lat2: scalar,float

            End point latitude of the location.

        long1: scalar,float

            Start point longitude of the location.

        long2: scalar,float

            End point longitude of the location.

    Returns:

        Series: The Haversine distance, in km, between (lat1, long1) and (lat2, long2)

    '''
    earth_radius_km = 6371  # average earth radius

    # the trigonometric functions below expect radians
    phi1 = np.radians(lat1)
    lam1 = np.radians(long1)
    phi2 = np.radians(lat2)
    lam2 = np.radians(long2)

    d_phi = phi2 - phi1
    d_lam = lam2 - lam1

    # haversine of the central angle between the two points
    hav = np.sin(d_phi * 0.5) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(d_lam * 0.5) ** 2
    arc_length = 2 * earth_radius_km * np.arcsin(np.sqrt(hav))

    return pd.Series(arc_length)

def log_transform(data, columns, plot=True, figsize=(12, 6))

Normalizes the dataset to be as close to the Gaussian distribution as possible.

Parameter:

data: DataFrame, Series. Data to Log transform.

columns: List, Series Columns to be transformed to normality using log transformation

plot: bool, default True Plots a before and after log transformation plot

Returns

Log-transformed dataframe

Expand source code

def log_transform(data, columns, plot=True, figsize=(12,6)):
    '''
    Normalizes the dataset to be as close to the gaussian distribution
    by applying log(1 + x) to the selected columns.

    Parameter:
    -----------------------------------------
    data: DataFrame, Series.
        Data to Log transform. It is copied, never modified.

    columns: List, Series
        Columns to be transformed to normality using log transformation

    plot: bool, default True
        Plots a before and after log transformation plot for every column

    figsize: tuple, default (12, 6)
        Size of each before/after figure.

    Returns:
        Log-transformed dataframe

    Raises:
        ValueError if data or columns is None.
    '''

    if data is None:
        raise ValueError("data: Expecting a DataFrame/ numpy2d array, got 'None'")

    if columns is None:
        raise ValueError("columns: Expecting at least a column in the list of columns but got 'None'")

    df = data.copy()
    for col in columns:
        df[col] = np.log1p(df[col])

    if plot:
        # NOTE(review): sns.distplot is deprecated in recent seaborn releases;
        # consider histplot/displot when the plotting code is next revisited.
        for col in columns:
            _ = plt.figure(figsize = figsize)
            plt.subplot(1, 2, 1)
            # left panel: original values, labelled with the skewness of the original column
            sns.distplot(data[col], color="m", label="Skewness : %.2f" % (data[col].skew()))
            plt.title('Distribution of ' + col + " before Log transformation")
            plt.legend(loc='best')

            plt.subplot(1, 2, 2)
            # right panel: transformed values and their skewness
            sns.distplot(df[col], color="m", label="Skewness : %.2f" % (df[col].skew()))
            plt.title('Distribution of ' + col + " after Log transformation")
            plt.legend(loc='best')
            # pad must be passed by keyword on current Matplotlib
            plt.tight_layout(pad=2)
            plt.show()

    return df

def manhattan_distance(lat1, long1, lat2, long2)

Calculates the Manhattan distance between two points. It is the sum of horizontal and vertical distance between any two points given their latitudes and longitudes.

Parameter:

lat1: scalar,float

    Start point latitude of the location.

lat2: scalar,float

    End point latitude of the location.

long1: scalar,float

    Start point longitude of the location.

long2: scalar,float

    End point longitude of the location.

Returns: Series

The Manhattan distance between (lat1, long1) and (lat2, long2)

Expand source code

def manhattan_distance(lat1, long1, lat2, long2):
    '''
    Manhattan distance between two points: the absolute latitude
    difference plus the absolute longitude difference.

    Parameter:
    -------------------
        lat1: scalar,float

            Start point latitude of the location.

        lat2: scalar,float

            End point latitude of the location.

        long1: scalar,float

            Start point longitude of the location.

        long2: scalar,float

            End point longitude of the location.

    Returns: Series

        The Manhattan distance between (lat1, long1) and (lat2, long2)

    '''
    lat_gap = np.abs(lat2 - lat1)
    long_gap = np.abs(long1 - long2)

    return pd.Series(lat_gap + long_gap)

def merge_groupby(data=None, cat_features=None, statistics=None, col_to_merge=None)

Performs a groupby on the specified categorical features and merges the result to the original dataframe.

Parameter:

data: DataFrame

    Data set to perform operation on.

cat_features: list, series, 1D-array

    categorical features to groupby.

statistics: list, series, 1D-array, Default ['mean', 'count']

    aggregates to perform on grouped data.

col_to_merge: str

    The column to merge on the dataset. Must be present in the data set.

Returns

Dataframe.

Expand source code

def merge_groupby(data=None, cat_features=None, statistics=None, col_to_merge=None):
    '''
    Performs a groupby on the specified categorical features and merges
    the result to the original dataframe.

    Parameter:
    -----------------------

        data: DataFrame

            Data set to perform operation on. It is copied, never modified.

        cat_features: list, series, 1D-array

            categorical features to groupby. If not provided, the categorical
            features are inferred from the data set.

        statistics: list, series, 1D-array, Default ['mean', 'count']

            aggregates to perform on grouped data. The 'mean' and 'count'
            results are named <feature>_<col_to_merge>_mean / _count; any other
            aggregate keeps the column name pandas gives it.

        col_to_merge: str

            The column to merge on the dataset. Must be present in the data set.

    Returns:

        Dataframe.

    Raises:

        ValueError if data or col_to_merge is None.

    '''
    if data is None:
        raise ValueError("data: Expecting a DataFrame/ numpy2d array, got 'None'")

    if statistics is None:
        statistics = ['mean', 'count']

    if cat_features is None:
        # group by the categorical columns, as the parameter name promises
        cat_features = get_cat_feats(data)

    if col_to_merge is None:
        raise ValueError("col_to_merge: Expecting a string [column to merge on], got 'None'")

    df = data.copy()

    for cat in cat_features:
        # select the target column before aggregating so that unrelated
        # (e.g. text) columns are never fed to the aggregates
        temp = df.groupby([cat])[col_to_merge].agg(statistics)
        # rename columns
        prefix = cat + '_' + col_to_merge
        temp = temp.rename(columns={'mean': prefix + '_mean', 'count': prefix + "_count"})
        # merge the data sets
        df = df.merge(temp, how='left', on=cat)

    return df

def to_date(data)

Automatically convert all date time columns to pandas Datetime format

Expand source code

def to_date(data):
    '''
    Convert every date time column of [data] to the pandas Datetime format.

    The columns to convert are the ones returned by `get_date_cols`. They are
    overwritten on the object passed in, which is also returned.
    '''
    for date_col in get_date_cols(data):
        data[date_col] = pd.to_datetime(data[date_col])

    return data