Module datasist.model

This module contains functions used in training, testing and comparing machine learning models. The current version of datasist supports only scikit-learn estimators and trained Keras models.
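A quick illustrative example (the toy dataset and estimator below are arbitrary choices, not part of datasist):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from datasist.model import train_classifier

# Toy data purely for illustration
X, y = make_classification(n_samples=200, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=0)

# Fits the estimator and prints accuracy, F1, precision and recall on the validation set
train_classifier(X_train=X_train, y_train=y_train, X_val=X_val, y_val=y_val,
                 estimator=LogisticRegression(max_iter=1000), show_roc_plot=False)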

Source code:
'''
This module contains all functions relating to modeling using the sklearn library.
'''
import platform
import numpy as np
import pandas as pd
import seaborn as sns
from .visualizations import plot_auc
from sklearn.metrics import roc_curve, confusion_matrix, precision_score, accuracy_score, recall_score, f1_score, make_scorer, mean_absolute_error, mean_squared_error, r2_score, mean_squared_log_error
from sklearn.model_selection import KFold, cross_val_score


# On macOS, select the TkAgg backend before importing pyplot
if platform.system() == "Darwin":
    import matplotlib
    matplotlib.use('TkAgg')

import matplotlib.pyplot as plt


def train_classifier(X_train = None, y_train=None, X_val=None, y_val=None, estimator=None, cross_validate=False, cv=5, show_roc_plot=True, save_plot=False):
    '''
    Train a classification estimator and calculate numerous performance metrics.

    Parameters:
    ----------------------------
        X_train: Array, DataFrame, Series.

            The feature set (x) to use in training an estimator in order to predict the outcome (y).

        y_train: Series, 1-d array, list

            The ground truth value for the train dataset

        X_val: Array, DataFrame, Series.

            The feature set (x) to use in validating a trained estimator.

        y_val: Series, 1-d array, list

            The ground truth value for the validation dataset.

        estimator: sklearn estimator.

            Sklearn estimator that implements the fit and predict functions.

        cross_validate: Bool, default False

            Use a cross validation strategy or not.
        
        cv: Int, default 5

            Number of folds to use in cross validation.
        
        show_roc_plot: Bool, default True.

            Plot a ROC plot showing estimator performance.
        
        save_plot: Bool, default False.
            Save the plot as a png file.
    
'''
    if X_train is None:
        raise ValueError("X_train: Expecting a DataFrame/ numpy2d array, got 'None'")
    
    if y_train is None:
        raise ValueError("y_train: Expecting a Series/ numpy1D array, got 'None'")

    #initialize variables to hold calculations
    pred, acc, f1, precision, recall, confusion_mat = 0, 0, 0, 0, 0, None

    if cross_validate:
        dict_scorers  = {'acc' : accuracy_score,
                        'f1' : f1_score,
                        'precision': precision_score, 
                        'recall' : recall_score
                        }


        metric_names = ['Accuracy', 'F1_score', 'Precision', 'Recall']

        for metric_name, scorer in zip(metric_names, dict_scorers.values()):
            cv_score = np.mean(cross_val_score(estimator, X_train, y_train, scoring=make_scorer(scorer), cv=cv))
            print("{} is {}".format(metric_name,  round(cv_score * 100, 4)))

        #TODO Add cross validation function for confusion matrix
  
    else:
        if X_val is None:
            raise ValueError("X_val: Expecting a DataFrame/ numpy array, got 'None'")
        
        if y_val is None:
            raise ValueError("y_val: Expecting a Series/ numpy1D array, got 'None'")

        estimator.fit(X_train, y_train)
        pred = estimator.predict(X_val)
        get_classification_report(y_val, pred, show_roc_plot, save_plot)



def plot_feature_importance(estimator=None, col_names=None):
    '''
    Plots the feature importance from a trained scikit-learn estimator
    as a bar chart.

    Parameters:
    -----------
        estimator: scikit-learn estimator.

            Model that has been fit and contains the feature_importances_ attribute.

        col_names: list

            The names of the columns. Must map onto the feature importance array.

    Returns:
    --------
        Matplotlib figure showing feature importances
    '''
    if estimator is None:
        raise ValueError("estimator: Expecting an estimator that implements the fit api, got None")
    if col_names is None:
        raise ValueError("col_names: Expecting a list of column names, got 'None'")
    
    if len(col_names) != len(estimator.feature_importances_):
        raise ValueError("col_names: Lenght of col_names must match lenght of feature importances")

    imps = estimator.feature_importances_
    feats_imp = pd.DataFrame({"features": col_names, "importance": imps}).sort_values(by='importance', ascending=False)
    sns.barplot(x='features', y='importance', data=feats_imp)
    plt.xticks(rotation=90)
    plt.title("Feature importance plot")
    plt.show()


def make_submission_csv(estimator=None, X_train=None, y_train=None, test_data=None,
                  sample_submision_file=None, sub_col_name=None, name="final_submission"):
    '''
    Train an estimator and make predictions with it on the final test set. Also
    writes a submission CSV file for data science competitions.
    
    Parameters:
    -----------------------
    estimator: Sklearn estimator.

        An sklearn estimator that implements the fit and predict functions.

    X_train: Array, DataFrame, Series.

        The feature set (x) to use in training an estimator to predict the outcome (y).

    y_train: Series, 1-d array, list

        The ground truth value for the train dataset

    test_data: Array, DataFrame, Series.

        The final test set (x) to predict on.
    
    sample_submision_file: DataFrame, Series.

        A sample csv file provided by data science competition hosts to use in creating your final submission.

    sub_col_name: String.

        Name of the prediction column in the sample submission file.

    name: String.

        Name of the created submission file.

    Returns:

        A CSV file saved to the current working directory.
    '''
    estimator.fit(X_train, y_train)
    pred = estimator.predict(test_data)

    sub = sample_submision_file
    sub[sub_col_name] = pred
    sub.to_csv(name + '.csv', index=False)
    print("File has been saved to current working directory")



def get_classification_report(y_train=None, prediction=None, show_roc_plot=True, save_plot=False):
    '''
    Generates performance report for a classification problem.

    Parameters:
    ------------------
    y_train: Array, series, list.

        The ground truth values of the data set.
    
    prediction: Array, series, list.

        The predicted value by a trained model.

    show_roc_plot: Bool, default True.

        Show the model ROC curve.

    save_plot: Bool, default False.

        Save the plot to the current working directory.

    '''
    acc = accuracy_score(y_train, prediction)
    f1 = f1_score(y_train, prediction)
    precision = precision_score(y_train, prediction)
    recall = recall_score(y_train, prediction)
    confusion_mat = confusion_matrix(y_train, prediction)

    print("Accuracy is ", round(acc * 100))
    print("F1 score is ", round(f1 * 100))
    print("Precision is ", round(precision * 100))
    print("Recall is ", round(recall * 100))
    print("*" * 100)
    print("confusion Matrix")
    print('                 Score positive    Score negative')
    print('Actual positive    %6d' % confusion_mat[0,0] + '             %5d' % confusion_mat[0,1])
    print('Actual negative    %6d' % confusion_mat[1,0] + '             %5d' % confusion_mat[1,1])
    print('')

    if show_roc_plot:        
        plot_auc(y_train, prediction)

        if save_plot:
            plt.savefig("roc_plot.png")


def get_regression_report(y_true=None, prediction=None, show_r2_plot=True, save_plot=False):
    '''
    Generates performance report for a regression problem.

    Parameters:
    ------------------
    y_true: Array, series, list.

        The ground truth values of the data set.
    
    prediction: Array, series, list.

        The predicted value by a trained model.

    show_r2_plot: Bool, default True.

        Show a scatter plot of true vs. predicted values annotated with the R-squared score.

    save_plot: Bool, default False.

        Save the plot to the current working directory.

    '''
    mae = mean_absolute_error(y_true, prediction)
    mse = mean_squared_error(y_true, prediction)
    msle = mean_squared_log_error(y_true, prediction)
    r2 = r2_score(y_true, prediction)

    print("Mean Absolute Error: ", round(mae, 5))
    print("Mean Squared Error: ", round(mse, 5))
    print("Mean Squared Log Error: ", round(msle, 5))
    print("R-squared: ", round(r2, 5))
    print("*" * 100)

    if show_r2_plot:
        plt.scatter(y_true, prediction)
        plt.xlabel('Truth values')
        plt.ylabel('Predicted values')
        # line of best fit through the (truth, prediction) pairs
        plt.plot(np.unique(y_true), np.poly1d(np.polyfit(y_true, prediction, 1))(np.unique(y_true)))
        plt.text(0.7, 0.2, 'R-squared = %0.2f' % r2, transform=plt.gca().transAxes)
        plt.show()

        if save_plot:
            plt.savefig("r2_plot.png")





def compare_model(models_list=None, x_train=None, y_train=None, scoring_metric=None, scoring_cv=3, silenced=True, plot=True):
    """
    Train multiple user-defined models and display a report based on the chosen metric. Enables the user to pick the best base model for a problem.

    Parameters
    ----------------
        models_list: list

            a list of models to be trained

        x_train: Array, DataFrame, Series

            The feature set (x) to use in training an estimator to predict the outcome (y).

        y_train: Series, 1-d array, list

            The ground truth value for the train dataset

        scoring_metric: str

            Metric to use in scoring the model

        scoring_cv: int, default 3

            Number of cross validation folds.

        silenced: Bool, default True

            Whether to suppress progress messages while fitting.

        plot: Bool, default True

            Plot a comparison chart of the model scores.

    Returns
    ---------------
    A tuple of the fitted models and their evaluation scores.
    """
    # if not _same_model:
    #     raise ValueError("model_list: models must be of the same class. Cannot use a classifier and a regressor.")


    if models_list is None or len(models_list) < 1:
        raise ValueError("models_list: models_list can't be 'None' or empty")

    if x_train is None:
        raise ValueError("x_train: features can't be 'None' or empty")

    if y_train is None:
        raise ValueError("y_train: target values can't be 'None' or empty")



    fitted_model = []
    model_scores = []
    model_names = []

    for model in models_list:
        if not silenced:
            print("Fitting {}".format(type(model).__name__))
        model.fit(x_train, y_train)
        # keep the fitted model, its cross-validated score and its name
        fitted_model.append(model)
        model_score = np.mean(cross_val_score(model, x_train, y_train, scoring=scoring_metric, cv=scoring_cv))
        model_scores.append(model_score)
        model_names.append(type(model).__name__)
            
    if plot:
        sns.pointplot(y=model_scores, x=model_names)
        plt.xticks(rotation=90)
        plt.title("Model comparison plot")
        plt.show()

    return fitted_model, model_scores

Functions

def compare_model(models_list=None, x_train=None, y_train=None, scoring_metric=None, scoring_cv=3, silenced=True, plot=True)

Train multiple user-defined models and display a report based on the chosen metric. Enables the user to pick the best base model for a problem.

Parameters

models_list: list

    a list of models to be trained

x_train: Array, DataFrame, Series

    The feature set (x) to use in training an estimator to predict the outcome (y).

y_train: Series, 1-d array, list

    The ground truth value for the train dataset

scoring_metric: str

    Metric to use in scoring the model

scoring_cv: int, default 3

    Number of cross validation folds.

silenced: Bool, default True

    Whether to suppress progress messages while fitting.

plot: Bool, default True

    Plot a comparison chart of the model scores.

Returns

A tuple of the fitted models and their evaluation scores.
 
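A short usage sketch (not from the datasist docs; the toy dataset and estimators are illustrative choices):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from datasist.model import compare_model

# Toy binary classification data, purely for illustration
X, y = make_classification(n_samples=200, random_state=0)

models = [LogisticRegression(max_iter=1000), RandomForestClassifier(random_state=0)]
fitted, scores = compare_model(models_list=models, x_train=X, y_train=y,
                               scoring_metric="accuracy", scoring_cv=3)
print(scores)  # mean cross-validated accuracy for each model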
def get_classification_report(y_train=None, prediction=None, show_roc_plot=True, save_plot=False)

Generates performance report for a classification problem.

Parameters:

y_train: Array, series, list.

The ground truth values of the data set.

prediction: Array, series, list.

The predicted value by a trained model.

show_roc_plot: Bool, default True.

Show the model ROC curve.

save_plot: Bool, default False.

Save the plot to the current working directory.
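A minimal sketch of calling the report directly on labels and predictions (the arrays below are illustrative):

from datasist.model import get_classification_report

y_true = [0, 1, 1, 0, 1, 0, 1, 0]
y_pred = [0, 1, 0, 0, 1, 0, 1, 1]

# Prints accuracy, F1, precision, recall and the confusion matrix
get_classification_report(y_train=y_true, prediction=y_pred, show_roc_plot=False)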
def get_regression_report(y_true=None, prediction=None, show_r2_plot=True, save_plot=False)

Generates performance report for a regression problem.

Parameters:

y_true: Array, series, list.

The ground truth values of the data set.

prediction: Array, series, list.

The predicted value by a trained model.

show_r2_plot: Bool, default True.

Show a scatter plot of true vs. predicted values annotated with the R-squared score.

save_plot: Bool, default False.

Save the plot to the current working directory.
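A minimal sketch with illustrative values (note that the mean squared log error requires non-negative targets and predictions):

from datasist.model import get_regression_report

y_true = [3.0, 5.0, 2.5, 7.0, 4.2]
y_pred = [2.8, 5.3, 2.9, 6.6, 4.0]

# Prints MAE, MSE, MSLE and R-squared
get_regression_report(y_true=y_true, prediction=y_pred, show_r2_plot=False)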
def make_submission_csv(estimator=None, X_train=None, y_train=None, test_data=None, sample_submision_file=None, sub_col_name=None, name='final_submission')

Train an estimator and make predictions with it on the final test set. Also writes a submission CSV file for data science competitions.

Parameters:

estimator: Sklearn estimator.

An sklearn estimator that implements the fit and predict functions.

X_train: Array, DataFrame, Series.

The feature set (x) to use in training an estimator to predict the outcome (y).

y_train: Series, 1-d array, list

The ground truth value for the train dataset

test_data: Array, DataFrame, Series.

The final test set (x) to predict on.

sample_submision_file: DataFrame, Series.

A sample csv file provided by data science competition hosts to use in creating your final submission.

sub_col_name: String.

Name of the prediction column in the sample submission file.

name: String.

Name of the created submission file.

Returns

A CSV file saved to the current working directory.

def plot_feature_importance(estimator=None, col_names=None)

Plots the feature importance from a trained scikit-learn estimator as a bar chart.

Parameters:

estimator: scikit-learn estimator.

    Model that has been fit and contains the feature_importances_ attribute.

col_names: list

    The names of the columns. Must map onto the feature importance array.

Returns:

Matplotlib figure showing feature importances
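A short sketch using a random forest (an illustrative choice of estimator that exposes feature_importances_):

from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from datasist.model import plot_feature_importance

X, y = make_classification(n_samples=100, n_features=4, random_state=0)
rf = RandomForestClassifier(random_state=0).fit(X, y)

# col_names must have one entry per feature
plot_feature_importance(estimator=rf, col_names=["f0", "f1", "f2", "f3"])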
def train_classifier(X_train=None, y_train=None, X_val=None, y_val=None, estimator=None, cross_validate=False, cv=5, show_roc_plot=True, save_plot=False)

Train a classification estimator and calculate numerous performance metrics.

Parameters:

X_train: Array, DataFrame, Series.

    The feature set (x) to use in training an estimator in order to predict the outcome (y).

y_train: Series, 1-d array, list

    The ground truth value for the train dataset

X_val: Array, DataFrame, Series.

    The feature set (x) to use in validating a trained estimator.

y_val: Series, 1-d array, list

    The ground truth value for the validation dataset.

estimator: sklearn estimator.

    Sklearn estimator that implements the fit and predict functions.

cross_validate: Bool, default False

    Use a cross validation strategy or not.

cv: Int, default 5

    Number of folds to use in cross validation.

show_roc_plot: Bool, default True.

    Plot a ROC plot showing estimator performance.

save_plot: Bool, default False.
    Save the plot as a png file.
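A sketch of the cross-validation path (the toy dataset and estimator are illustrative choices):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from datasist.model import train_classifier

X, y = make_classification(n_samples=200, random_state=0)

# With cross_validate=True no validation set is needed; each metric is averaged over cv folds
train_classifier(X_train=X, y_train=y, estimator=LogisticRegression(max_iter=1000),
                 cross_validate=True, cv=5)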