Source code for datto.Eda

import os
from collections import Counter

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import progressbar
import seaborn as sns

class Eda:
    """
    Exploratory data analysis (EDA)
    """

    def separate_cols_by_type(self, df):
        """
        Split the DataFrame into two groups by type

        Columns whose name contains "_id" are dropped from both groups.
        Object/bool columns named "date" or containing "timestamp" are
        dropped from the categorical group.

        Parameters
        --------
        df: DataFrame

        Returns
        --------
        numerical_vals: DataFrame
            Columns that are not object, bool or datetime typed.
        categorical_vals: DataFrame
            Columns that are object or bool typed.
        """
        numerical_cols = [
            col
            for col in df.select_dtypes(
                exclude=["object", "bool", "datetime"]
            ).columns
            # ID columns values aren't particularly important to examine
            if "_id" not in str(col)
        ]
        categorical_cols = [
            col
            for col in df.select_dtypes(include=["object", "bool"]).columns
            if "_id" not in str(col)
            and str(col) != "date"
            and "timestamp" not in str(col)
        ]
        return df[numerical_cols], df[categorical_cols]

    def check_for_mistyped_cols(self, numerical_vals, categorical_vals):
        """
        Check for columns coded incorrectly

        A numerical column is reported when it has 20 or fewer distinct
        values. A categorical column is reported when two sampled non-null
        values can both be converted with ``float``.

        Parameters
        --------
        numerical_vals: DataFrame
        categorical_vals: DataFrame

        Returns
        --------
        mistyped_cols: list
            Labels of the columns that were reported.
        """
        divider = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
        mistyped_cols = []

        for col in numerical_vals.columns:
            if numerical_vals[col].nunique() <= 20:
                print("Coded as numerical, is this actually an object / bool?\n")
                print(col)
                print(numerical_vals[col].unique())
                print(divider)
                mistyped_cols.append(col)

        for col in categorical_vals.columns:
            # str() keeps non-string column labels from raising TypeError
            if "_id" in str(col):
                continue
            # Booleans can be recoded as floats but still are good as booleans
            if categorical_vals[col].dtypes == bool:
                continue

            non_null = categorical_vals[col].dropna()
            if non_null.empty:
                continue

            # Probe by position (first value and the sixth, or the last one
            # for short columns) so short frames and frames without a default
            # integer index are still checked.
            probes = (
                non_null.iloc[0],
                non_null.iloc[min(5, len(non_null) - 1)],
            )
            try:
                for value in probes:
                    float(value)
            except (TypeError, ValueError):
                continue

            print("Coded as categorical, is this actually an int / float?\n")
            print(col)
            print(categorical_vals[col].unique()[:10])
            print(divider)
            mistyped_cols.append(col)

        return mistyped_cols

    def find_cols_to_exclude(self, df):
        """
        Returns columns that may not be helpful for model building.

        Exclusion criteria (first match wins):

        - Possible PII (address, name, username, date, etc. in col name)
        - Large proportion of nulls (at least half of the rows)
        - Only 1 value in entire col
        - Dates
        - Low variance in col values
        - Large number of categorical values (more than 500)

        Every finding is also printed.

        Parameters
        --------
        df: DataFrame

        Returns
        --------
        lst: list
            One single-entry dict per flagged column, mapping the column
            label to the reason it was flagged.
        """
        pii_markers = (
            "address",
            "first_name",
            "last_name",
            "username",
            "_id",
            "date",
            "time",
        )
        num_rows = df.shape[0]
        lst = []

        for col in df.columns:
            col_name = str(col)
            series = df[col]

            if any(marker in col_name for marker in pii_markers):
                lst.append({col: "Considering excluding because potential PII column."})
                continue

            # Guard the division so a zero-row frame does not emit a warning
            null_fraction = (
                series.isnull().sum() / float(num_rows) if num_rows else 0.0
            )
            if null_fraction >= 0.5:
                lst.append(
                    {
                        col: "Considering excluding because {}% of column is null.".format(
                            round(null_fraction * 100.0, 2)
                        )
                    }
                )
                continue

            num_unique = len(series.unique())
            if num_unique <= 1:
                lst.append(
                    {
                        col: "Considering excluding because column includes only one value."
                    }
                )
            elif pd.api.types.is_datetime64_any_dtype(series):
                # Covers timezone-aware datetimes too, which a comparison
                # against "datetime64[ns]" does not match.
                lst.append(
                    {col: "Considering excluding because column is a timestamp."}
                )
            elif series.dtype in ["object", "bool"]:
                if num_unique > 500:
                    lst.append(
                        {
                            col: "Considering excluding because object column has large number of unique values ({})".format(
                                num_unique
                            )
                        }
                    )
            elif pd.api.types.is_numeric_dtype(series):
                # Only numeric dtypes support var(); other dtypes are skipped
                variance = series.var()
                if variance < 0.00001:
                    lst.append(
                        {
                            col: "Considering excluding because column variance is low ({})".format(
                                round(variance, 2)
                            )
                        }
                    )

        for finding in lst:
            print(finding)

        return lst

    def sample_unique_vals(self, df):
        """
        Examine a few unique vals in each column

        Prints, for every column, its label, up to 20 distinct values and the
        number of distinct values.

        Parameters
        --------
        df: DataFrame
        """
        for col in df:
            print(col)
            try:
                print(df[col].unique()[:20])
                print(df[col].nunique())
            except Exception as err:
                # Best effort: keep going, but say why the column was skipped
                print(f"Could not list unique values: {err}")

            print("\n------------------------------------\n")

    def find_correlated_features(self, df):
        """
        Find & sort correlated features

        Only numeric (including bool) columns are considered. Each pair
        appears twice, once in each order.

        Parameters
        --------
        df: DataFrame

        Returns
        --------
        s_df: DataFrame
            Columns "feature_1", "feature_2" and "corr" (absolute
            correlation), sorted from most to least correlated. An empty
            DataFrame is returned when there is nothing to correlate.
        """
        if df.empty:
            return pd.DataFrame()

        # Select numeric columns explicitly: corr() raises on non-numeric
        # columns in recent pandas versions.
        numeric_mask = [pd.api.types.is_numeric_dtype(dtype) for dtype in df.dtypes]
        numeric_df = df.loc[:, numeric_mask]
        if numeric_df.empty:
            return pd.DataFrame()

        c = numeric_df.corr().abs()
        s = c.unstack()
        # Drops each feature's correlation with itself. NOTE: this also drops
        # distinct pairs whose absolute correlation exceeds 0.99999.
        s = s[s <= 0.99999]
        s = s.sort_values(ascending=False)

        return s.rename_axis(["feature_1", "feature_2"]).reset_index(name="corr")

    def check_unique_by_identifier_col(self, df, identifier_col):
        """
        Check if there are duplicates by entity (e.g. user, item).

        Parameters
        --------
        df: DataFrame
        identifier_col: str
            Column (or columns) identifying an entity.

        Returns
        --------
        dup_rows: DataFrame
            Rows whose identifier occurs more than once, sorted by the
            identifier. The string "No duplicate rows found." is returned
            when there are none.

        Raises
        --------
        KeyError
            If identifier_col is not a column of df.
        """
        dup_groups = [
            group for _, group in df.groupby(identifier_col) if len(group) > 1
        ]
        if not dup_groups:
            return "No duplicate rows found."

        return pd.concat(dup_groups).sort_values(identifier_col)

    def violin_plots_by_col(self, df, path="../images/", group_by_var=None):
        """
        Makes a violin plot for each numerical column.

        One PNG per column is written to `path`, which is created if needed.

        Parameters
        --------
        df: DataFrame
        path: str
            Directory to write the images to.
        group_by_var: str
            Variable to group violin plots by

        Returns
        --------
        The string "No numerical columns to graph." when there is nothing to
        plot, otherwise None.
        """
        numerical_vals, _ = self.separate_cols_by_type(df)
        if numerical_vals.empty:
            return "No numerical columns to graph."

        # Need to fill zeros to get accurate percentile numbers
        numerical_vals = numerical_vals.fillna(0)

        os.makedirs(path, exist_ok=True)

        iter_bar = progressbar.ProgressBar()
        # Wrap the list of column labels so the bar's length is the number of
        # columns rather than the number of rows.
        for col in iter_bar(list(numerical_vals.columns)):
            # Filter out some extreme outliers for cleaner plot
            lower_bound = df[col].quantile(0.01)
            upper_bound = df[col].quantile(0.99)
            filtered_df = df[(df[col] <= upper_bound) & (df[col] >= lower_bound)]

            fig = plt.figure(figsize=(9, 9))
            try:
                ax = fig.add_subplot(111)
                ax.set_title(col)

                if group_by_var:
                    sns.violinplot(x=group_by_var, y=col, data=filtered_df, ax=ax)
                    file_name = f"violinplot_{col}_by_{group_by_var}.png"
                else:
                    sns.violinplot(x=col, data=filtered_df, ax=ax)
                    file_name = f"violinplot_{col}.png"

                text = "75th Percentile: {}\nMedian: {}\n25th Percentile: {}".format(
                    round(np.percentile(numerical_vals[col], 75), 2),
                    round(np.median(numerical_vals[col]), 2),
                    round(np.percentile(numerical_vals[col], 25), 2),
                )

                # Place a text box in upper left in axes coords
                props = dict(boxstyle="round", facecolor="white", alpha=0.5)
                ax.text(
                    0.05,
                    0.95,
                    text,
                    transform=ax.transAxes,
                    fontsize=14,
                    verticalalignment="top",
                    bbox=props,
                )

                fig.tight_layout()
                fig.savefig(os.path.join(path, file_name))
            finally:
                # Release the figure so long column lists do not pile up
                # open figures in memory.
                plt.close(fig)

    def bar_graphs_by_col(self, df, path="../images/", group_by_var=None):
        """
        Makes a bar graph for each categorical column.

        One PNG per column is written to `path`, which is created if needed.
        Columns with a single distinct value are skipped. A column that fails
        to plot is reported and skipped; the remaining columns are still
        plotted.

        Parameters
        --------
        df: DataFrame
        path: str
            Directory to write the images to.
        group_by_var: str
            Variable to group bar graphs by

        Returns
        --------
        The string "No categorical columns to graph." when there is nothing
        to plot, otherwise None.

        Raises
        --------
        KeyError
            If group_by_var is given but is not a column of df.
        """
        _, categorical_vals = self.separate_cols_by_type(df)
        if categorical_vals.empty:
            return "No categorical columns to graph."

        os.makedirs(path, exist_ok=True)

        # More values than this doesn't display well, just show the top values
        if group_by_var:
            # Group bys are hard to read unless this is smaller
            num_groups = len(df[group_by_var].unique())
            most_vals_allowed = max(5, round(50 / num_groups))
        else:
            most_vals_allowed = 50

        iter_bar = progressbar.ProgressBar()
        # Wrap the list of column labels so the bar's length is the number of
        # columns rather than the number of rows.
        for col in iter_bar(list(categorical_vals.columns)):
            if col == group_by_var:
                continue

            num_unique_vals = len(df[col].unique())
            if num_unique_vals == 1:
                continue

            fig = None
            try:
                if num_unique_vals > most_vals_allowed:
                    top_vals = df[col].value_counts().index[:most_vals_allowed]
                    adjust_vals = df[df[col].isin(top_vals)]
                else:
                    adjust_vals = df

                fig = plt.figure(figsize=(9, 9))
                ax = fig.add_subplot(111)
                ax.set_title(col)

                if group_by_var:
                    plot_df = self._grouped_proportions(
                        df, adjust_vals, col, group_by_var
                    )
                    plot_df.plot(
                        x=col,
                        kind="barh",
                        ylabel=f"proportion_{col}_within_{group_by_var}",
                        ax=ax,
                    )
                    file_name = f"bargraph_proportion_{col}_within_{group_by_var}.png"
                else:
                    plot_df = self._overall_proportions(adjust_vals, col)
                    plot_df.plot(
                        x=col, kind="barh", legend=None, ylabel="proportion", ax=ax
                    )
                    file_name = f"bargraph_{col}.png"

                fig.tight_layout()
                fig.savefig(os.path.join(path, file_name))
            except Exception as err:
                # Best effort per column: report the failure and move on
                print(f"Skipping bar graph for {col}: {err}")
            finally:
                if fig is not None:
                    plt.close(fig)

    @staticmethod
    def _overall_proportions(adjust_vals, col):
        """Return each value of `col` with its share of rows, smallest first."""
        # size() counts rows per value regardless of nulls in other columns
        proportions = adjust_vals.groupby(col).size() / adjust_vals.shape[0]
        proportions = proportions.reset_index()
        proportions.columns = [col, "proportion"]
        return proportions.sort_values(by="proportion", ascending=True)

    @staticmethod
    def _grouped_proportions(df, adjust_vals, col, group_by_var):
        """Return a value-by-group table of within-group proportions.

        Proportions are computed over all of `df`; only the values of `col`
        present in `adjust_vals` are kept, ordered from least to most common.
        """
        counts = df.groupby([group_by_var, col]).size()
        # Change to proportions by group instead of straight counts
        # (misleading by sample size)
        proportions = counts / counts.groupby(level=0).transform("sum")
        grouped_df_pcts = proportions.reset_index()
        grouped_df_pcts.columns = [group_by_var, col, "proportion"]

        pivot_df = pd.pivot_table(
            grouped_df_pcts,
            values="proportion",
            index=col,
            columns=group_by_var,
        ).reset_index()

        # Sort pivot table by most common cols
        value_order = adjust_vals.groupby(col).size().sort_values(ascending=True).index
        sorter_index = {value: rank for rank, value in enumerate(value_order)}
        pivot_df["rank"] = pivot_df[col].map(sorter_index)
        # Values outside the displayed top values have no rank; leave them out
        pivot_df = pivot_df.dropna(subset=["rank"])
        pivot_df = pivot_df.sort_values(by="rank", ascending=True)
        return pivot_df.drop("rank", axis=1)