Module datasist.visualizations
This module contains functions that help you create aesthetic and colorful plots with minimal code.
Expand source code
'''
This module contains all functions relating to visualization.
'''
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from . import structdata
from IPython.display import display
from sklearn.metrics import confusion_matrix
from sklearn.utils.multiclass import unique_labels
import sklearn.metrics as sklm
def countplot(data=None, features=None, separate_by=None, fig_size=(5,5), save_fig=False):
    '''
    Makes a bar plot of each categorical feature to show its value counts.

    Parameters
    ------------
    data : DataFrame.
        The data to plot. Columns are accessed as data[feature].
    features: str or list of str, default None.
        The categorical feature(s) to plot. A single column name is accepted
        and treated as a one-element list. If not provided, the features are
        taken from structdata.get_cat_feats(data).
    separate_by: str, default None.
        The feature used to separate the plot. Called hue in seaborn.
    fig_size: tuple, Default (5,5)
        The size of each figure object.
    save_fig: bool, Default False.
        If True, saves each plot as 'Countplot_<feature>' in the current
        working directory.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If data is None.
    '''
    if data is None:
        raise ValueError("data: Expecting a DataFrame or Series, got 'None'")

    if features is None:
        features = structdata.get_cat_feats(data)
    elif isinstance(features, str):
        # A bare column name would otherwise be iterated character by character.
        features = [features]

    for feature in features:
        # Features with more than 30 unique values are reported and skipped.
        if len(data[feature].unique()) > 30:
            print("Unique Values in {} is too large to plot".format(feature))
            print('\n')
            continue

        fig = plt.figure(figsize=fig_size)
        ax = fig.gca()

        # Draw on the axes created above rather than on whatever is "current".
        sns.countplot(x=feature, hue=separate_by, data=data, ax=ax)
        plt.xticks(rotation=90)
        ax.set_title("Count plot for " + feature)

        if save_fig:
            plt.savefig('Countplot_{}'.format(feature))
def plot_missing(data=None):
    '''
    Shows where values are missing by drawing the null mask of the data as a heatmap.

    Parameters
    ----------
    data: DataFrame.
        The data to inspect. Its isnull() result is what gets plotted.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If data is None.
    '''
    if data is not None:
        # The boolean mask is True wherever a value is missing.
        null_mask = data.isnull()
        sns.heatmap(null_mask, cbar=True)
        plt.show()
        return

    raise ValueError("data: Expecting a DataFrame or Series, got 'None'")
def boxplot(data=None, num_features=None, target=None, fig_size=(5,5), large_data=False, save_fig=False):
    '''
    Makes a box plot of all numerical features against a specified categorical target column.

    A box plot (or box-and-whisker plot) shows the distribution of quantitative
    data in a way that facilitates comparisons between variables or across
    levels of a categorical variable. The box shows the quartiles of the
    dataset while the whiskers extend to show the rest of the distribution,
    except for points that are determined to be "outliers" using a method
    that is a function of the inter-quartile range.

    Parameters
    ------------
    data : DataFrame.
        Dataset for plotting.
    num_features: str or list of str, default None.
        The numerical feature(s) to plot. A single column name is accepted.
        If None, the features are taken from structdata.get_num_feats(data).
    target: str.
        Name of the categorical target column. At most 10 unique values
        are allowed.
    fig_size: tuple, Default (5,5)
        The size of each figure object.
    large_data: bool, Default False.
        If True, sns.boxenplot is used instead of sns.boxplot.
    save_fig: bool, Default False.
        If True, saves each plot as 'fig_<feature>_vs_<target>' in the
        current working directory.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If target or data is None.
    AttributeError
        If the target column has more than 10 unique values.
    '''
    if target is None:
        raise ValueError('Target value cannot be None')

    # Validate data before indexing it; indexing None would raise TypeError.
    if data is None:
        raise ValueError("data: Expecting a DataFrame or Series, got 'None'")

    if len(data[target].unique()) > 10:
        raise AttributeError("Target categories must be less than 10")

    if num_features is None:
        num_features = structdata.get_num_feats(data)
    elif isinstance(num_features, str):
        # A bare column name would otherwise be iterated character by character.
        num_features = [num_features]

    # The two modes differ only in the seaborn function and the title wording.
    if large_data:
        draw = sns.boxenplot
        title_template = 'Box plot of {} against {}'
    else:
        draw = sns.boxplot
        title_template = "Box plot of '{}' vs. '{}'"

    # Set the style before any axes exist so the first figure also gets it.
    sns.set_style("whitegrid")

    for feature in num_features:
        fig = plt.figure(figsize=fig_size)
        ax = fig.gca()

        # x/y are passed by keyword: recent seaborn no longer accepts them positionally.
        draw(x=target, y=feature, data=data, ax=ax)

        plt.ylabel(feature)  # Set text for the y axis
        plt.xlabel(target)  # Set text for the x axis
        plt.xticks(rotation=90)
        plt.title(title_template.format(feature, target))

        if save_fig:
            plt.savefig('fig_{}_vs_{}'.format(feature, target))

        plt.show()
def violinplot(data=None, num_features=None, target=None, fig_size=(5,5), save_fig=False):
    '''
    Makes a violin plot of all numerical features against a specified categorical target column.

    A violin plot plays a similar role as a box and whisker plot. It shows the
    distribution of quantitative data across several levels of one (or more)
    categorical variables such that those distributions can be compared. Unlike
    a box plot, in which all of the plot components correspond to actual
    datapoints, the violin plot features a kernel density estimation of the
    underlying distribution.

    Parameters
    ------------
    data : DataFrame.
        Dataset for plotting.
    num_features: str or list of str, default None.
        The numerical feature(s) to plot. A single column name is accepted.
        If None, the features are taken from structdata.get_num_feats(data).
    target: str.
        Name of the categorical target column. At most 10 unique values
        are allowed.
    fig_size: tuple, Default (5,5)
        The size of each figure object.
    save_fig: bool, Default False.
        If True, saves each plot as 'fig_<feature>_vs_<target>' in the
        current working directory.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If target or data is None.
    AttributeError
        If the target column has more than 10 unique values.
    '''
    if target is None:
        raise ValueError('Target value cannot be None')

    # Validate data before indexing it; indexing None would raise TypeError.
    if data is None:
        raise ValueError("data: Expecting a DataFrame or Series, got 'None'")

    if len(data[target].unique()) > 10:
        raise AttributeError("Target categories must be less than 10")

    if num_features is None:
        num_features = structdata.get_num_feats(data)
    elif isinstance(num_features, str):
        # A bare column name would otherwise be iterated character by character.
        num_features = [num_features]

    # Set the style before any axes exist so the first figure also gets it.
    sns.set_style("whitegrid")

    for feature in num_features:
        fig = plt.figure(figsize=fig_size)
        ax = fig.gca()

        # x/y are passed by keyword: recent seaborn no longer accepts them positionally.
        sns.violinplot(x=target, y=feature, data=data, ax=ax)

        plt.xticks(rotation=90)
        plt.ylabel(feature)  # Set text for the y axis
        plt.xlabel(target)  # Set text for the x axis
        plt.title("Violin plot of '{}' vs. '{}'".format(feature, target))

        if save_fig:
            #TODO Add function to save to a specified directory
            plt.savefig('fig_{}_vs_{}'.format(feature, target))

        plt.show()
def histogram(data=None, num_features=None, bins=None, show_dist_type=False, fig_size=(5,5), save_fig=False):
    '''
    Makes a histogram plot of all numerical features.
    Helps to show the univariate distribution of the features.

    Parameters
    ------------
    data : DataFrame.
        Dataset for plotting.
    num_features: str or list of str, default None.
        The numerical feature(s) to plot. A single column name is accepted.
        If None, the features are taken from structdata.get_num_feats(data).
    bins: int
        The number of bins to use.
    show_dist_type: bool, Default False
        If True, the sample skewness of the feature is computed and the title
        is suffixed with one of 'Left skewed', 'Right skewed', 'Normal' or
        'Undetermined' (when the skewness cannot be computed).
    fig_size: tuple, Default (5,5).
        The size of each figure object.
    save_fig: bool, Default False.
        If True, saves each plot as 'fig_hist_<feature>' in the current
        working directory.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If data is None.
    '''
    if data is None:
        raise ValueError("data: Expecting a DataFrame or Series, got 'None'")

    if num_features is None:
        num_features = structdata.get_num_feats(data)
    elif isinstance(num_features, str):
        # A bare column name would otherwise be iterated character by character.
        num_features = [num_features]

    for feature in num_features:
        fig = plt.figure(figsize=fig_size)
        ax = fig.gca()

        # NOTE: sns.distplot is deprecated in recent seaborn releases; it is kept
        # here so the rendered output stays the same.
        sns.distplot(data[feature].values, ax=ax, bins=bins)
        ax.set_xlabel(feature)  # Set text for the x axis
        ax.set_ylabel('Count')  # Set text for the y axis

        # Always set a title; previously show_dist_type=True left the plot untitled.
        title = 'Histogram of ' + feature
        if show_dist_type:
            title += ' ({})'.format(_skew_label(data[feature]))
        ax.set_title(title)

        if save_fig:
            #TODO Add function to save to a user specified directory
            plt.savefig('fig_hist_{}'.format(feature))

        plt.show()


def _skew_label(series, tolerance=0.5):
    '''Classify a Series by its sample skewness using a rule-of-thumb tolerance.'''
    skewness = series.skew()
    if pd.isna(skewness):
        # pandas returns NaN when there are too few values to estimate skewness.
        return 'Undetermined'
    if skewness > tolerance:
        return 'Right skewed'
    if skewness < -tolerance:
        return 'Left skewed'
    return 'Normal'
def catbox(data=None, cat_features=None, target=None, fig_size=(10,5), save_fig=False):
    '''
    Makes a side by side bar plot of all categorical features against a categorical target feature.

    One figure is produced per feature, with one subplot per target class
    showing the counts of the feature's categories within that class.

    Parameters
    ------------
    data: DataFrame.
        Dataset for plotting. It is not modified.
    cat_features: str or list of str, default None.
        The categorical feature(s) to plot. A single column name is accepted.
        If None, the features are taken from structdata.get_cat_feats(data).
        The target is excluded; the list passed by the caller is not modified.
    target: str.
        Name of the categorical target column to plot against.
    fig_size: tuple, Default (10,5)
        The size of each figure object.
    save_fig: bool, Default False.
        If True, saves each plot as 'fig_catbox_<feature>' in the current
        working directory.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If data or target is None, or if a feature is about to be plotted and
        the target column has more than 6 unique classes.
    AttributeError
        If the target column has more than 8 unique classes.
    '''
    if data is None:
        raise ValueError("data: Expecting a DataFrame or Series, got 'None'")

    if target is None:
        raise ValueError('Target value cannot be None')

    if cat_features is None:
        cat_features = structdata.get_cat_feats(data)
    elif isinstance(cat_features, str):
        # A bare column name would otherwise be iterated character by character.
        cat_features = [cat_features]

    # Build a new list instead of calling .remove() so the caller's list is untouched.
    cat_features = [feature for feature in cat_features if feature != target]

    # The target classes do not change between features, so compute them once.
    cats = list(data[target].unique())

    if len(cats) > 8:
        #TODO Plot only a subset of the features say top 10
        raise AttributeError("Target categories must be less than seven")

    # Loop over each categorical feature and plot the counts for each target class.
    for feature in cat_features:
        # Only features with at most 15 unique values are plotted, because of speed.
        if len(data[feature].unique()) > 15:
            print("{} feature has too many categories and will not be ploted".format(feature))
            continue

        # Raised only once a feature is actually going to be plotted, as before.
        if len(cats) > 6:
            raise ValueError("Target column: '{}' must contain less than six unique classes".format(target))

        # Count rows per (target, feature) pair on a temporary copy; the helper
        # column never touches the caller's DataFrame.
        counts = (data[[target, feature]]
                  .assign(dummy_count=1.0)
                  .groupby([target, feature], as_index=False)
                  .count())

        plt.figure(figsize=fig_size)

        for i, cat in enumerate(cats):
            plt.subplot(1, len(cats), i + 1)
            # Counts of the feature's categories within this target class.
            temp = counts[counts[target] == cat][[feature, 'dummy_count']]
            sns.barplot(x=feature, y='dummy_count', data=temp)
            plt.xticks(rotation=90)
            plt.title('Counts for {} \n class {}'.format(feature, cat))
            plt.ylabel('count')

        # pad must be passed by keyword in recent matplotlib; one call per figure suffices.
        plt.tight_layout(pad=2)

        if save_fig:
            plt.savefig('fig_catbox_{}'.format(feature))
def class_count(data=None, features=None, plot=False, save_fig=False):
    '''
    Displays the value counts of each class in the given categorical features.

    Parameters:
        data: Pandas DataFrame
            Dataset to summarise.
        features: list.
            The categorical features to summarise. If None, the features are
            taken from structdata.get_cat_feats(data).
        plot: bool, Default False.
            If True, also draws the counts by calling countplot.
        save_fig: bool, Default False.
            Forwarded to countplot; only relevant when plot is True.

    Returns:
        None

    Raises:
        ValueError: if data is None.
    '''
    if data is None:
        raise ValueError("data: Expecting a DataFrame or Series, got 'None'")

    columns = structdata.get_cat_feats(data) if features is None else features

    for column in columns:
        # Columns with more than 15 distinct classes are reported, not tabulated.
        if data[column].nunique() > 15:
            print("Unique classes in {} too large".format(column))
            continue

        print('Class Count for', column)
        counts_table = pd.DataFrame(data[column].value_counts())
        display(counts_table)

    if plot:
        countplot(data, columns, save_fig=save_fig)
def scatterplot(data=None, num_features=None, target=None, separate_by=None, fig_size=(5,5), save_fig=False):
    '''
    Draws one scatter plot per numerical feature, each against a numerical target.
    Useful for inspecting the relationship between a feature and the target.

    Parameters
    ------------
    data : DataFrame.
        The data to plot.
    num_features: list.
        The numeric features to plot on the x axis. If not provided, the
        features are taken from structdata.get_num_feats(data).
    target: str.
        Name of the numerical target column plotted on the y axis.
    separate_by: str, default None.
        Column used to colour the points. Called hue in seaborn.
    fig_size: tuple, Default (5,5)
        The size of each figure object.
    save_fig: bool, Default False.
        If True, saves each plot as 'fig_scatterplot_<feature>' in the current
        working directory.

    Returns
    -------
    None

    Raises
    ------
    ValueError
        If data is None, if separate_by is given but is not a column of data,
        or if target is None.
    '''
    if data is None:
        raise ValueError("data: Expecting a DataFrame or Series, got 'None'")

    if separate_by is not None and separate_by not in data.columns:
        raise ValueError("{} not found in data columns".format(separate_by))

    if target is None:
        raise ValueError('Target value cannot be None')

    feature_names = num_features if num_features is not None else structdata.get_num_feats(data)

    for name in feature_names:
        figure = plt.figure(figsize=fig_size)  # plot area
        axis = figure.gca()  # axes the title is attached to

        sns.scatterplot(x=name, y=target, data=data, hue=separate_by)

        heading = "Scatter Plot of '{}' vs. '{}' \n Separated by: '{}'".format(name, target, separate_by)
        axis.set_title(heading)

        if save_fig:
            plt.savefig('fig_scatterplot_{}'.format(name))
def plot_confusion_matrix(y_true, y_pred, classes,
                          normalize=False,
                          title=None,
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.

    Parameters:
        y_true: array-like
            The true labels.
        y_pred: array-like
            The predicted labels.
        classes: array-like
            Display names of the classes. The labels found in y_true and
            y_pred are used as indices into this sequence, so they are
            expected to be integers (assumption carried over from the
            original indexing; confirm against callers). Lists are accepted.
        normalize: bool, Default False.
            If True, each row is divided by its total. Rows whose total is
            zero are left as zeros.
        title: str, Default None.
            Plot title. If not given, a default depending on `normalize` is used.
        cmap: matplotlib colormap, Default plt.cm.Blues.
            Colormap used for the matrix image.

    Returns:
        The matplotlib Axes the matrix was drawn on.
    """
    if not title:
        if normalize:
            title = 'Normalized confusion matrix'
        else:
            title = 'Confusion matrix, without normalization'

    # Compute confusion matrix
    cm = confusion_matrix(y_true, y_pred)

    # Only use the labels that appear in the data. np.asarray lets a plain
    # list of names be indexed with the array of labels.
    classes = np.asarray(classes)[unique_labels(y_true, y_pred)]

    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # A class that never occurs in y_true has a zero row total; keep that
        # row at 0 instead of dividing by zero and filling it with NaN.
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype='float'),
                       where=row_totals != 0)
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)

    # We want to show all ticks...
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           # ... and label them with the respective list entries
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    # Rotate the tick labels and set their alignment.
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Loop over data dimensions and create text annotations.
    fmt = '.2f' if normalize else 'd'
    # Cells darker than half the maximum get white text for contrast.
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")

    fig.tight_layout()
    return ax
def plot_auc(labels, predictions):
    '''
    Plots the ROC curve (true positive rate against false positive rate) and
    reports the area under it in the legend.

    Parameters:
    --------------------
    labels:
        The true values (in the binary case either 0 or 1).
    predictions:
        The scores or probabilities passed to sklearn's roc_curve.

    Return:
        None. The Receiver Operating Characteristic plot is shown with plt.show().
    '''
    # The thresholds returned by roc_curve are not needed for the plot.
    false_pos_rate, true_pos_rate, _ = sklm.roc_curve(labels, predictions)
    area = sklm.auc(false_pos_rate, true_pos_rate)

    plt.title('Receiver Operating Characteristic')
    plt.plot(false_pos_rate, true_pos_rate, color='orange', label='AUC = %0.2f' % area)
    plt.legend(loc='lower right')

    # Dashed diagonal from (0, 0) to (1, 1) as a visual reference line.
    plt.plot([0, 1], [0, 1], 'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()
def plot_scatter_shape(data = None, cols = None, shape_col = '', col_y = '', alpha = 0.2):
    '''
    Makes a scatter plot of data using shape_col as separation.

    One plot is shown per column in cols; within each plot every category of
    shape_col is drawn with its own marker shape.

    Parameter:
        data: DataFrame
            The data to plot.
        cols: str or list of str
            The column(s) plotted on the x axis, one plot per column.
        shape_col: str
            The categorical column whose categories get distinct markers and
            appear in the legend.
        col_y: str
            The column plotted on the y axis.
        alpha: float, Default 0.2
            Transparency of the scatter points.

    Return:
        None. Each plot is shown with plt.show().

    Raises:
        ValueError: if data or cols is None.
    '''
    if data is None:
        raise ValueError("data: Expecting a DataFrame or Series, got 'None'")

    if cols is None:
        raise ValueError("cols: Expecting a list of column names, got 'None'")

    if isinstance(cols, str):
        # A bare column name would otherwise be iterated character by character.
        cols = [cols]

    # pick distinctive shapes
    shapes = ['+', 'o', 's', 'x', '^']
    unique_cats = data[shape_col].unique()

    # loop over the columns to plot
    for col in cols:
        sns.set_style("whitegrid")

        # loop over the unique categories
        for i, cat in enumerate(unique_cats):
            temp = data[data[shape_col] == cat]
            # Markers are reused cyclically when there are more categories than
            # shapes. x/y are passed by keyword because recent seaborn no
            # longer accepts them positionally.
            sns.regplot(x=col, y=col_y, data=temp, marker=shapes[i % len(shapes)], label=cat,
                        scatter_kws={"alpha": alpha}, fit_reg=False, color='blue')

        # Give the plot a main title
        plt.title('Scatter plot of ' + col_y + ' vs. ' + col)
        # Set text for the x axis
        plt.xlabel(col)
        # Set text for y axis
        plt.ylabel(col_y)
        plt.legend()
        plt.show()
def autoviz(data):
    '''
    Hands the data set to the optional autoviz package for automatic visualization.

    Parameter:
    --------------------
        data: Dataframe
            The data to plot; passed to AutoViz as `dfte`.

    Return:
        None. If autoviz is not installed, an error is logged and nothing is plotted.
    '''
    import importlib.util
    import logging

    logging.basicConfig()
    package_name = 'autoviz'

    # autoviz is an optional dependency: look it up without importing it first.
    if importlib.util.find_spec(package_name) is None:
        err_msg = "is not installed, to use this function, you must install " + package_name + ". \n To install, use 'pip install autoviz'"
        logging.error(package_name + " " + err_msg)
        return

    from autoviz.AutoViz_Class import AutoViz_Class

    visualizer = AutoViz_Class()
    visualizer.AutoViz(filename='', dfte=data, max_cols_analyzed=50)
Functions
def autoviz(data)
-
Automatically visualize a data set. If dataset is large, autoViz uses a statistically valid sample for plotting. Parameter:
data: Dataframe The data to plot
Return
Matplotlib figure
Expand source code
def autoviz(data): ''' Automatically visualize a data set. If dataset is large, autoViz uses a statistically valid sample for plotting. Parameter: -------------------- data: Dataframe The data to plot Return: Matplotlib figure ''' #First check if autoviz is installed, if not installed, prompt the user to install it. import importlib.util import logging logging.basicConfig() package_name = 'autoviz' err_msg = "is not installed, to use this function, you must install " + package_name + ". \n To install, use 'pip install autoviz'" package_stat = importlib.util.find_spec(package_name) if package_stat is None: logging.error(package_name + " " + err_msg) else: from autoviz.AutoViz_Class import AutoViz_Class av = AutoViz_Class() av.AutoViz(filename='', dfte=data, max_cols_analyzed=50)
def boxplot(data=None, num_features=None, target=None, fig_size=(5, 5), large_data=False, save_fig=False)
-
Makes a box plot of all numerical features against a specified categorical target column.
A box plot (or box-and-whisker plot) shows the distribution of quantitative data in a way that facilitates comparisons between variables or across levels of a categorical variable. The box shows the quartiles of the dataset while the whiskers extend to show the rest of the distribution, except for points that are determined to be "outliers" using a method that is a function of the inter-quartile range
Parameters
data : DataFrame, array, or list of arrays. Dataset for plotting. num_features: Scalar, array, or list. The numerical features in the dataset; if None, we try to infer the numerical columns from the dataframe. target: array, pandas series, list. A categorical target column. Maximum number of categories is 10 and minimum is 1. fig_size: tuple, Default (5,5). The size of the figure object. large_data: bool, Default False. If True, then sns boxenplot is used instead of normal boxplot. Boxenplot is better for large datasets. save_fig: bool, Default False. If True, saves the current plot to the current working directory.
Expand source code
def boxplot(data=None, num_features=None, target=None, fig_size=(5,5), large_data=False, save_fig=False): ''' Makes a box plot of all numerical features against a specified categorical target column. A box plot (or box-and-whisker plot) shows the distribution of quantitative data in a way that facilitates comparisons between variables or across levels of a categorical variable. The box shows the quartiles of the dataset while the whiskers extend to show the rest of the distribution, except for points that are determined to be "outliers" using a method that is a function of the inter-quartile range Parameters ------------ data : DataFrame, array, or list of arrays. Dataset for plotting. num_features: Scalar, array, or list. The numerical features in the dataset, if not None, we try to infer the numerical columns from the dataframe. target: array, pandas series, list. A categorical target column. Maximun number of categories is 10 and minimum is 1 fig_size: tuple, Default (8,8) The size of the figure object. large_data: bool, Default False. If True, then sns boxenplot is used instead of normal boxplot. Boxenplot is better for large dataset. save_fig: bool, Default False. 
If True, saves the current plot to the current working directory ''' if target is None: raise ValueError('Target value cannot be None') if len(data[target].unique()) > 10: raise AttributeError("Target categories must be less than 10") if data is None: raise ValueError("data: Expecting a DataFrame or Series, got 'None'") if num_features is None: num_features = structdata.get_num_feats(data) if large_data: #use advanced sns boxenplot for feature in num_features: fig = plt.figure(figsize=fig_size) ax = fig.gca() sns.set_style("whitegrid") sns.boxenplot(target, feature, data=data, ax=ax) plt.ylabel(feature) # Set text for the x axis plt.xlabel(target)# Set text for y axis plt.xticks(rotation=90) plt.title('Box plot of {} against {}'.format(feature, target)) if save_fig: plt.savefig('fig_{}_vs_{}'.format(feature,target)) plt.show() else: for feature in num_features: fig = plt.figure(figsize=fig_size) ax = fig.gca() sns.set_style("whitegrid") sns.boxplot(target, feature, data=data, ax=ax) plt.ylabel(feature) # Set text for the x axis plt.xlabel(target)# Set text for y axis plt.xticks(rotation=90) plt.title("Box plot of '{}' vs. '{}'".format(feature, target)) if save_fig: plt.savefig('fig_{}_vs_{}'.format(feature,target)) plt.show()
def catbox(data=None, cat_features=None, target=None, fig_size=(10, 5), save_fig=False)
-
Makes a side by side bar plot of all categorical features against a categorical target feature.
Parameters
data: DataFrame, array, or list of arrays. Dataset for plotting. cat_features: Scalar, array, or list. The categorical features in the dataset, if None, we try to infer the categorical columns from the dataframe. target: Scalar, array or list. Categorical target to plot against. fig_size: tuple, Default (12,6) The size of the figure object. save_fig: bool, Default False. If True, saves the plot to the current working directory.
Expand source code
def catbox(data=None, cat_features=None, target=None, fig_size=(10,5), save_fig=False): ''' Makes a side by side bar plot of all categorical features against a categorical target feature. Parameters ------------ data: DataFrame, array, or list of arrays. Dataset for plotting. cat_features: Scalar, array, or list. The categorical features in the dataset, if None, we try to infer the categorical columns from the dataframe. target: Scalar, array or list. Categorical target to plot against. fig_size: tuple, Default (12,6) The size of the figure object. save_fig: bool, Default False. If True, saves the plot to the current working directory. ''' if data is None: raise ValueError("data: Expecting a DataFrame or Series, got 'None'") if cat_features is None: cat_features = structdata.get_cat_feats(data) #remove target from cat_features try: cat_features.remove(target) except: pass if len(data[target].unique()) > 8: #TODO Plot only a subset of the features say top 10 raise AttributeError("Target categories must be less than seven") #Create a dummy column to hold count of values data['dummy_count'] = np.ones(shape = data.shape[0]) #Loop over each categorical feature and plot the rate for each category. 
for feature in cat_features: #Plots are made for only categories with less than 15 unique values because of speed if len(data[feature].unique()) > 15 : print("{} feature has too many categories and will not be ploted".format(feature)) else: counts = data[['dummy_count', target, feature]].groupby([target, feature], as_index = False).count() #get the categories cats = list(data[target].unique()) if len(cats) > 6: raise ValueError("Target column: '{}' must contain less than six unique classes".format(target)) #create new figure _ = plt.figure(figsize = fig_size) for i, cat in enumerate(cats): plt.subplot(1, len(cats), i+1) #Get the counts each category in target temp = counts[counts[target] == cat][[feature, 'dummy_count']] sns.barplot(x=feature, y='dummy_count', data=temp) plt.xticks(rotation=90) plt.title('Counts for {} \n class {}'.format(feature, cat)) plt.ylabel('count') plt.tight_layout(2) if save_fig: plt.savefig('fig_catbox_{}'.format(feature)) #Drop the dummy_count column from data data.drop(['dummy_count'], axis=1, inplace = True)
def class_count(data=None, features=None, plot=False, save_fig=False)
-
Displays the number of classes in a categorical feature.
Parameters
data
-
Pandas DataFrame or Series
Dataset for plotting.
features
-
Scalar, array, or list.
The categorical features in the dataset, if None, we try to infer the categorical columns from the dataframe.
plot
-
bool, Default False.
Plots the class counts as a barplot
save_fig
-
bool, Default False.
Saves the plot to the current working directory.
Expand source code
def class_count(data=None, features=None, plot=False, save_fig=False): ''' Displays the number of classes in a categorical feature. Parameters: data: Pandas DataFrame or Series Dataset for plotting. features: Scalar, array, or list. The categorical features in the dataset, if None, we try to infer the categorical columns from the dataframe. plot: bool, Default False. Plots the class counts as a barplot save_fig: bool, Default False. Saves the plot to the current working directory. ''' if data is None: raise ValueError("data: Expecting a DataFrame or Series, got 'None'") if features is None: features = structdata.get_cat_feats(data) for feature in features: if data[feature].nunique() > 15: print("Unique classes in {} too large".format(feature)) else: print('Class Count for', feature) display(pd.DataFrame(data[feature].value_counts())) if plot: countplot(data, features, save_fig=save_fig)
def countplot(data=None, features=None, separate_by=None, fig_size=(5, 5), save_fig=False)
-
Makes a bar plot of all categorical features to show their counts.
Parameters
data : DataFrame, array, or list of arrays. The data to plot. features: str, scalar, array, or list. The categorical features in the dataset; if not provided, we try to infer the categorical columns from the dataframe. separate_by: str, default None. The feature used to separate the plot. Called hue in seaborn. fig_size: tuple, Default (5,5). The size of the figure object. save_fig: bool, Default False. Saves the plot to the current working directory.
Returns
None
Expand source code
def countplot(data=None, features=None, separate_by=None, fig_size=(5,5), save_fig=False): ''' Makes a bar plot of all categorical features to show their counts. Parameters ------------ data : DataFrame, array, or list of arrays. The data to plot. features: str, scalar, array, or list. The categorical features in the dataset, if not provided, we try to infer the categorical columns from the dataframe. separate_by: str, default None. The feature used to seperate the plot. Called hue in seaborn. fig_size: tuple, Default (5,5) The size of the figure object. save_fig: bool, Default False. Saves the plot to the current working directory Returns ------- None ''' if data is None: raise ValueError("data: Expecting a DataFrame or Series, got 'None'") if features is None: features = structdata.get_cat_feats(data) for feature in features: #Check the size of categories in the feature: Anything greater than 20 is not plotted if len(data[feature].unique()) > 30: print("Unique Values in {} is too large to plot".format(feature)) print('\n') else: fig = plt.figure(figsize=fig_size) ax = fig.gca() sns.countplot(x=feature, hue=separate_by, data=data) plt.xticks(rotation=90) ax.set_title("Count plot for " + feature) if save_fig: plt.savefig('Countplot_{}'.format(feature))
def histogram(data=None, num_features=None, bins=None, show_dist_type=False, fig_size=(5, 5), save_fig=False)
-
Makes an histogram plot of all numerical features. Helps to show univariate distribution of the features.
Parameters
data : DataFrame, array, or list of arrays. Dataset for plotting. num_features: Scalar, array, or list. The numerical features in the dataset, if not None, we try to infer the numerical columns from the dataframe. bins: int The number of bins to use. show_dist_type: bool, Default False If True, Calculates the skewness of the data and display one of (Left skewed, right skewed or normal) fig_size: tuple, Default (8,8). The size of the figure object. save_fig: bool, Default False. If True, saves the current plot to the current working directory
Expand source code
def histogram(data=None, num_features=None, bins=None, show_dist_type=False, fig_size=(5,5), save_fig=False): ''' Makes an histogram plot of all numerical features. Helps to show univariate distribution of the features. Parameters ------------ data : DataFrame, array, or list of arrays. Dataset for plotting. num_features: Scalar, array, or list. The numerical features in the dataset, if not None, we try to infer the numerical columns from the dataframe. bins: int The number of bins to use. show_dist_type: bool, Default False If True, Calculates the skewness of the data and display one of (Left skewed, right skewed or normal) fig_size: tuple, Default (8,8). The size of the figure object. save_fig: bool, Default False. If True, saves the current plot to the current working directory ''' if data is None: raise ValueError("data: Expecting a DataFrame or Series, got 'None'") if num_features is None: num_features = structdata.get_num_feats(data) for feature in num_features: fig = plt.figure(figsize=fig_size) ax = fig.gca() sns.distplot(data[feature].values, ax=ax, bins=bins) ax.set_xlabel(feature) # Set text for the x axis ax.set_ylabel('Count')# Set text for y axis if show_dist_type: ##TODO Add Code to calculate skewness pass else: ax.set_title('Histogram of ' + feature) if save_fig: #TODO Add function to save to a user specified directory plt.savefig('fig_hist_{}'.format(feature)) plt.show()
def plot_auc(labels, predictions)
-
Computes and plots the false positive rate, true positive rate and threshold along with the AUC. Parameters:
labels: This is the true value ( in the case of binary either 0 or 1)
predictions:
This is the probability that shows the likelihood of a value being 0 or 1
Return
plots the Receiver operating characteristics.
Expand source code
def plot_auc(labels, predictions): ''' Computes and plot the false positive rate, true positive rate and threshold along with the AUC Parameters: -------------------- labels: This is the true value ( in the case of binary either 0 or 1) predictions: This is the probability that shows the likelihood of a value being 0 or 1 Return: plots the Receiver operating characteristics. ''' fpr, tpr, threshold = sklm.roc_curve(labels, predictions) auc = sklm.auc(fpr, tpr) ## Plot the result plt.title('Receiver Operating Characteristic') plt.plot(fpr, tpr, color='orange', label='AUC = %0.2f' % auc) plt.legend(loc='lower right') plt.plot([0, 1], [0, 1], 'r--') plt.xlim([0, 1]) plt.ylim([0, 1]) plt.ylabel('True Positive Rate') plt.xlabel('False Positive Rate') plt.show()
def plot_confusion_matrix(y_true, y_pred, classes, normalize=False, title=None, cmap=<matplotlib.colors.LinearSegmentedColormap object>)
-
This function prints and plots the confusion matrix. Normalization can be applied by setting `normalize=True`.
Parameters:
Expand source code
def plot_confusion_matrix(y_true, y_pred, classes, normalize=False, title=None, cmap=plt.cm.Blues): """ This function prints and plots the confusion matrix. Normalization can be applied by setting `normalize=True`. Parameters: """ if not title: if normalize: title = 'Normalized confusion matrix' else: title = 'Confusion matrix, without normalization' # Compute confusion matrix cm = confusion_matrix(y_true, y_pred) # Only use the labels that appear in the data classes = classes[unique_labels(y_true, y_pred)] if normalize: cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] print("Normalized confusion matrix") else: print('Confusion matrix, without normalization') print(cm) fig, ax = plt.subplots() im = ax.imshow(cm, interpolation='nearest', cmap=cmap) ax.figure.colorbar(im, ax=ax) # We want to show all ticks... ax.set(xticks=np.arange(cm.shape[1]), yticks=np.arange(cm.shape[0]), # ... and label them with the respective list entries xticklabels=classes, yticklabels=classes, title=title, ylabel='True label', xlabel='Predicted label') # Rotate the tick labels and set their alignment. plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor") # Loop over data dimensions and create text annotations. fmt = '.2f' if normalize else 'd' thresh = cm.max() / 2. for i in range(cm.shape[0]): for j in range(cm.shape[1]): ax.text(j, i, format(cm[i, j], fmt), ha="center", va="center", color="white" if cm[i, j] > thresh else "black") fig.tight_layout() return ax
def plot_missing(data=None)
-
Plots the data as a heatmap to show missing values
Parameters
data: DataFrame, array, or list of arrays. The data to plot.
Expand source code
def plot_missing(data=None):
    '''
    Shows where values are missing by drawing the null mask of the data as a heatmap.

    Parameters
    ----------
    data: DataFrame, array, or list of arrays.
        The data to plot. Its `isnull()` method is used to build the mask.
    '''
    if data is None:
        raise ValueError("data: Expecting a DataFrame or Series, got 'None'")

    null_mask = data.isnull()
    sns.heatmap(null_mask, cbar=True)
    plt.show()
def plot_scatter_shape(data=None, cols=None, shape_col='', col_y='', alpha=0.2)
-
Makes a scatter plot of data using shape_col as separation.
Parameter
data
- Dataframe The data that is being imported using pandas.
cols
- list The chosen number of columns in the DataFrame.
shape_col
- The categorical column you want it to show as legend.
col_y
- The y axis of the plot
Return
Matplotlib figure
Expand source code
def plot_scatter_shape(data = None, cols = None, shape_col = '', col_y = '', alpha = 0.2):
    '''
    Makes a scatter plot of data using shape_col as separation.

    Parameter:
        data: Dataframe
            The data that is being imported using pandas.

        cols: list
            The columns of the DataFrame to plot on the x axis, one plot per column.

        shape_col:
            The categorical column you want it to show as legend.
            Each category is drawn with its own marker shape.

        col_y:
            The y axis of the plot

        alpha: float, Default 0.2
            Transparency of the scatter points.

    Return:
        None. One plot per entry of cols is displayed.

    Raises:
        ValueError if data is None.
    '''
    if data is None:
        raise ValueError("data: Expecting a DataFrame or Series, got 'None'")

    # pick distinctive shapes
    shapes = ['+', 'o', 's', 'x', '^']
    unique_cats = data[shape_col].unique()

    # loop over the columns to plot
    for col in cols:
        sns.set_style("whitegrid")
        # loop over the unique categories
        for i, cat in enumerate(unique_cats):
            temp = data[data[shape_col] == cat]
            # x/y are passed by keyword because newer seaborn releases no
            # longer accept them positionally. The marker index wraps around
            # so more categories than shapes do not raise an IndexError.
            sns.regplot(x=col, y=col_y, data=temp,
                        marker=shapes[i % len(shapes)], label=cat,
                        scatter_kws={"alpha": alpha}, fit_reg=False,
                        color='blue')

        # Give the plot a main title
        plt.title('Scatter plot of ' + col_y + ' vs. ' + col)
        # Set text for the x axis
        plt.xlabel(col)
        # Set text for y axis
        plt.ylabel(col_y)
        plt.legend()
        plt.show()
def scatterplot(data=None, num_features=None, target=None, separate_by=None, fig_size=(5, 5), save_fig=False)
-
Makes a scatter plot of numerical features against a numerical target. Helps to show the relationship between features.
Parameters
data : DataFrame, array, or list of arrays. The data to plot. num_features: int/floats, scalar, array, or list. The numeric features in the dataset, if not provided, we try to infer the numeric columns from the dataframe. target: int/float, scalar, array or list. Numerical target feature to plot against. separate_by: str, default None. The feature used to separate the plot. Called hue in seaborn. fig_size: tuple, Default (5,5) The size of the figure object. save_fig: bool, Default False. Saves the plot to the current working directory
Expand source code
def scatterplot(data=None, num_features=None, target=None, separate_by=None, fig_size=(5,5), save_fig=False):
    '''
    Draws one scatter plot per numerical feature against a numerical target,
    which helps to show the relationship between features.

    Parameters
    ------------
    data : DataFrame, array, or list of arrays.
        The data to plot.
    num_features: int/floats, scalar, array, or list.
        The numeric features in the dataset, if not provided,
        we try to infer the numeric columns from the dataframe.
    target: int/float, scalar, array or list.
        Numerical target feature to plot against.
    separate_by: str, default None.
        The feature used to separate the plot. Called hue in seaborn.
        Must be one of the data columns when given.
    fig_size: tuple, Default (5,5)
        The size of the figure object.
    save_fig: bool, Default False.
        Saves the plot to the current working directory
    '''
    if data is None:
        raise ValueError("data: Expecting a DataFrame or Series, got 'None'")

    if separate_by is not None and separate_by not in data.columns:
        raise ValueError("{} not found in data columns".format(separate_by))

    if target is None:
        raise ValueError('Target value cannot be None')

    # Fall back to the inferred numeric columns when none were requested.
    features_to_plot = structdata.get_num_feats(data) if num_features is None else num_features

    title_template = "Scatter Plot of '{}' vs. '{}' \n Separated by: '{}'"
    for feature in features_to_plot:
        # A fresh figure per feature; seaborn draws onto its current axes.
        axis = plt.figure(figsize=fig_size).gca()
        sns.scatterplot(x=feature, y=target, data=data, hue=separate_by)
        axis.set_title(title_template.format(feature, target, separate_by))
        if save_fig:
            plt.savefig('fig_scatterplot_{}'.format(feature))
def violinplot(data=None, num_features=None, target=None, fig_size=(5, 5), save_fig=False)
-
Makes a violin plot of all numerical features against a specified categorical target column.
A violin plot plays a similar role as a box and whisker plot. It shows the distribution of quantitative data across several levels of one (or more) categorical variables such that those distributions can be compared. Unlike a box plot, in which all of the plot components correspond to actual datapoints, the violin plot features a kernel density estimation of the underlying distribution. Parameters
data : DataFrame, array, or list of arrays. Dataset for plotting. num_features: Scalar, array, or list. The numerical features in the dataset, if None, we try to infer the numerical columns from the dataframe. target: array, pandas series, list. A categorical target column. Maximum number of categories is 10 and minimum is 1. fig_size: tuple, Default (5,5) The size of the figure object. save_fig: bool, Default False. If True, saves the current plot to the current working directory
Expand source code
def violinplot(data=None, num_features=None, target=None, fig_size=(5,5), save_fig=False):
    '''
    Makes a violin plot of all numerical features against a specified categorical target column.

    A violin plot plays a similar role as a box and whisker plot.
    It shows the distribution of quantitative data across several levels of one (or more)
    categorical variables such that those distributions can be compared. Unlike a box plot,
    in which all of the plot components correspond to actual datapoints, the violin plot
    features a kernel density estimation of the underlying distribution.

    Parameters
    ------------
    data : DataFrame, array, or list of arrays.
        Dataset for plotting.
    num_features: Scalar, array, or list.
        The numerical features in the dataset, if None,
        we try to infer the numerical columns from the dataframe.
    target: array, pandas series, list.
        A categorical target column. Maximum number of categories is 10 and minimum is 1.
    fig_size: tuple, Default (5,5)
        The size of the figure object.
    save_fig: bool, Default False.
        If True, saves the current plot to the current working directory

    Raises
    ------
    ValueError
        If data or target is None.
    AttributeError
        If the target column has more than 10 unique values.
    '''
    # data must be validated before it is indexed with target below,
    # otherwise a missing data argument surfaces as an unrelated TypeError.
    if data is None:
        raise ValueError("data: Expecting a DataFrame or Series, got 'None'")

    if target is None:
        raise ValueError('Target value cannot be None')

    if len(data[target].unique()) > 10:
        raise AttributeError("Target categories must be less than 10")

    if num_features is None:
        num_features = structdata.get_num_feats(data)

    for feature in num_features:
        fig = plt.figure(figsize=fig_size)
        ax = fig.gca()

        sns.set_style("whitegrid")
        # x/y are passed by keyword because newer seaborn releases no longer
        # accept them positionally.
        sns.violinplot(x=target, y=feature, data=data, ax=ax)
        plt.xticks(rotation=90)
        plt.ylabel(feature)  # Set text for the y axis
        plt.xlabel(target)  # Set text for the x axis
        plt.title("Violin plot of '{}' vs. '{}'".format(feature, target))

        if save_fig:
            #TODO Add function to save to a specified directory
            plt.savefig('fig_{}_vs_{}'.format(feature,target))

        plt.show()