import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
#Import the Datasist Library
import datasist as ds
#Read in data set
# Load the training and test splits from disk into DataFrames.
train_data, test_data = (
    pd.read_csv(name) for name in ('training.csv', 'test.csv')
)
Quick summary of the dataset using the describe function in the structdata module
# Print an overview of the training set (dtypes, stats, missing values, etc.
# as reported by datasist's structdata.describe).
ds.structdata.describe(train_data)
Remove features that contain only one unique value, as these features are redundant
#Drop redundant features
# NOTE(review): the return value of drop_redundant is discarded here, so this
# assumes datasist mutates the passed DataFrame in place — confirm against the
# datasist docs; if it returns a new frame instead, these calls are no-ops.
ds.feature_engineering.drop_redundant(data=train_data)
ds.feature_engineering.drop_redundant(data=test_data)
Check for missing values in the dataset with the display function in the structdata module
EXPLORATION OF CATEGORICAL FEATURES
# Retrieve the list of categorical column names as detected by datasist.
cat_feats = ds.structdata.get_cat_feats(train_data)
cat_feats
# Show the number of unique values per categorical column, to spot
# high-cardinality features that should not be one-hot encoded.
ds.structdata.get_unique_counts(train_data)
From the unique-counts output, we notice that TransactionId and BatchId contain too many classes, so we can drop them
# TransactionId and BatchId have far too many distinct values to be useful
# categorical features, so remove them from both splits.
for frame in (train_data, test_data):
    frame.drop(columns=['TransactionId', 'BatchId'], inplace=True)
VISUALIZATION FOR CATEGORICAL FEATURES
# Bar plots of value counts for each categorical feature.
ds.visualizations.countplot(train_data)
# Per-class counts for the categorical features.
ds.visualizations.class_count(train_data)
train_data.columns
# Box-style plots of each categorical feature against the FraudResult target.
ds.visualizations.catbox(data=train_data, target='FraudResult', fig_size=(7,7))
VISUALIZATION OF NUMERICAL FEATURES
# Histograms of the numeric features (coarse view: 5 bins).
ds.visualizations.histogram(train_data, fig_size=(5,5), bins=5)
# Box plots of numeric features split by the FraudResult target.
ds.visualizations.boxplot(data=train_data, target='FraudResult', fig_size=(5,5))
TIME FEATURES
# Retrieve the list of datetime column names as detected by datasist.
date_feats = ds.structdata.get_date_cols(train_data)
date_feats
VISUALIZATION OF TIME FEATURES
# Peek at the datetime columns.
train_data[date_feats].head()
# Plot each numeric feature over time using TransactionStartTime as the axis.
num_feats = ds.structdata.get_num_feats(train_data)
ds.timeseries.num_timeplot(data=train_data,num_cols=num_feats, time_col='TransactionStartTime')
The extract_dates function in the timeseries module can extract date information from a datetime column automatically.
# Expand TransactionStartTime into derived date features (day, month, etc. —
# whatever datasist's extract_dates produces) on both splits.
train_data = ds.timeseries.extract_dates(data=train_data, date_cols=['TransactionStartTime'])
test_data = ds.timeseries.extract_dates(data=test_data, date_cols=['TransactionStartTime'])
# Transposed preview of the first two rows to inspect the new columns.
train_data.head(2).T
#perform merge
# Concatenate train and test so encoding is applied consistently to both;
# ntrain/ntest record the row counts needed to split them back apart later.
all_data, ntrain, ntest = ds.structdata.join_train_and_test(train_data, test_data)
# Label-encode the high-cardinality categorical features: one-hot encoding
# these would explode the column count, so map each to integer codes instead.
large_cats = ['AccountId', 'SubscriptionId', 'CustomerId', 'ProductId']
from sklearn.preprocessing import LabelEncoder
lb = LabelEncoder()
for column in large_cats:
    all_data[column] = lb.fit_transform(all_data[column])
# One-hot encode the remaining (low-cardinality) categorical features.
all_data = pd.get_dummies(all_data, drop_first=True)
#Get train and test set back
# Slice the concatenated frame back into the original splits. .copy() makes
# each slice an independent DataFrame so the inplace drops below cannot
# trigger pandas' chained-assignment warning or silently fail to mutate.
train = all_data[:ntrain].copy()
test = all_data[ntrain:].copy()
#Get target and drop it
target = train['FraudResult']
train.drop('FraudResult', axis=1, inplace=True)
test.drop('FraudResult', axis=1, inplace=True)
# Sanity-check the resulting shapes.
print("Shape of training datasets is {}".format(train.shape))
# BUG FIX: this line printed test.shape but labelled it "training target".
print("Shape of test dataset is {}".format(test.shape))
print("Shape of target is {}".format(target.shape))
# Candidate classifiers plus evaluation helpers.
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score, train_test_split
# Hold out 30% of the training data for evaluation; fixed seeds for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(train, target, test_size=0.3, random_state=2)
rf_model = RandomForestClassifier(n_estimators=100,random_state=232)
# NOTE(review): max_iter=100 is sklearn's default and may not be enough for
# lbfgs to converge on this data — watch for a ConvergenceWarning.
lg_model = LogisticRegression(max_iter=100, random_state=2, solver='lbfgs')
Next, we will train two classifiers and get a detailed report with the datasist classification report function. This can be used to pick the best model for a task.
# Fit the logistic-regression baseline, then evaluate it on the hold-out set.
lg_model.fit(X_train, y_train)
pred = lg_model.predict(X_test)
#Get report from true and predicted values
ds.model.get_classification_report(y_test, pred)
# Fit the random-forest model and evaluate it on the same hold-out set.
rf_model.fit(X_train, y_train)
pred = rf_model.predict(X_test)
#Get report from true and predicted values
ds.model.get_classification_report(y_test, pred)
From the metrics reported, we can see that the RandomForest model does better than Logistic Regression. Next, we can use a cross-validation function to detect overfitting.
# 3-fold cross-validated training of the random forest: comparing fold scores
# against the hold-out score above helps reveal overfitting.
ds.model.train_classifier(train_data=train, target=target, model=rf_model, cross_validate=True, cv=3)
Finally, we can plot the feature importances to see the most important features
# Plot feature importances of the fitted random forest, labelled with the
# training columns (order matches the columns the model was fit on).
feats = train.columns
ds.model.plot_feature_importance(estimator=rf_model, col_names=feats)