Case Study Four

Financial Delinquency Project

David Grijalva, Nicole Norelli, & Mingyang Nick YU

10/15/2021

Abstract
This deliverable investigated and predicted company bankruptcy based on a variety of financial factors. The predictive models explored were Random Forest and XGBoost. A comparison of the models and their performance resulted in a recommendation of XGBoost over Random Forest.

1. Introduction

This case study explored predictive models in the finance and bankruptcy domain. Every year, hundreds of companies go bankrupt due to a variety of financial and macroeconomic factors. Certain key financial indicators could potentially predict the likelihood of bankruptcy. The goal of this study was to predict whether a particular company would go bankrupt based on the financial data available.

This dataset contains 43,405 data points describing 64 financial indicators as independent variables. The dependent variable the model predicted was whether the company went bankrupt within five years. It is important to note that this dataset is not meant to be a time series for each company, but rather a general overview of each company's financial performance within the five-year time frame. Because the dataset was built from yearly information about the same companies, there is high correlation between many of the variables.

Random Forest

A Random Forest is an ensemble statistical learning method composed of several decision trees (known as weak learners) whose individual predictions are combined to make a final overall prediction. Random Forest uses the concept of bootstrap aggregation, also called bagging. In simple terms, the bootstrap is sampling with replacement (the same instance can appear in multiple samples), meaning it randomly samples subsets of the training dataset. Bagging simply means training a weak learner (a Decision Tree) on each of the subsamples selected. Unlike plain bagging, Random Forest also randomly subsamples the features considered at each split. This produces a forest of weak learners that are less correlated with one another and usually generates better prediction results than bagging alone.

To understand how Random Forest works, it is important to understand how Decision Trees work. Decision Trees are a supervised method that can be used for regression or classification. The key concept is that, for every feature available, the Decision Tree splits the data into two branches and measures the information gain of each split. The algorithm recursively repeats this operation until a stopping criterion is met.

There are two main criteria used to measure information gain: Gini impurity and Entropy.

Gini Impurity

Gini Impurity is a measurement of the probability of incorrect classification for a new data point. The lower the Gini impurity, the more information is gained.

$$ G = \sum \limits _{i=1} ^{C} P(i) \cdot (1-P(i)) $$

Entropy

Entropy is a mathematical measure of randomness. The lower the Entropy, the more information is gained, because there is less randomness.

$$ H(X) = -\sum \limits _{i=1} ^{n} P(x_{i}) \log_{b} P(x_{i}) $$
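For illustration, here is a minimal sketch (not part of the original analysis) of how Gini impurity and entropy could be computed for a vector of class labels using NumPy:

```python
import numpy as np

def gini_impurity(labels):
    """Probability of misclassifying a randomly drawn label from this node."""
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return np.sum(p * (1 - p))

def entropy(labels, base=2):
    """Shannon entropy of the label distribution in this node."""
    _, counts = np.unique(labels, return_counts=True)
    p = counts / counts.sum()
    return -np.sum(p * (np.log(p) / np.log(base)))

# Example: a node containing 3 "not bankrupt" (0) and 1 "bankrupt" (1) label
node = np.array([0, 0, 0, 1])
print(gini_impurity(node))  # 0.375
print(entropy(node))        # ~0.811
```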

Generally, due to their simplicity, Random Forests are a great starting point as a baseline model which can be used to compare prediction performance with more complex models such as XGBoost.

XGBoost

XGBoost is a popular algorithm created by Tianqi Chen and released in 2016. The algorithm can be found in many production applications used by major companies. There are three key aspects that make XGBoost an effective learner:
1) The use of boosting
2) The use of gradients
3) L2 regularization

Similar to Random Forest, boosting is an ensemble method that uses weak learners to generate a strong one. Unlike Random Forest, which builds multiple uncorrelated weak learners in parallel, boosting builds weak learners in sequential rounds, where each round tries to improve on the previous prediction. Each iteration learns from the previous errors by using the gradient of the loss function: after the first iteration, the weak learners no longer fit the dependent variable directly but rather the residuals. These iterations continue until a stopping criterion is met.

To account for prediction errors, XGBoost uses an approximation of the loss function based on its first and second-order partial derivatives.

$$ J = \sum \limits _{i} l(p_{i}, y_{i}) + \sum \limits _{k} \Omega (f_{k}) $$

The first equation displays the overall objective: a general loss function measuring the residuals, or the difference between the dependent variable and the prediction, plus a regularization term.

$$ \Omega (f) = \gamma T + \frac{1}{2} \lambda \lVert w \rVert^2 $$

The second equation displays the penalty imposed to reduce complexity in the terminal leaves, where $T$ is the number of leaves and $w$ the vector of leaf weights.
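For reference, and following the notation of the XGBoost paper (not used elsewhere in this report), the objective at boosting iteration $t$ is approximated with a second-order Taylor expansion:

$$ J^{(t)} \approx \sum \limits _{i=1} ^{n} \left[ g_{i} f_{t}(x_{i}) + \frac{1}{2} h_{i} f_{t}(x_{i})^{2} \right] + \Omega (f_{t}) $$

where $g_{i}$ and $h_{i}$ are the first and second derivatives of the loss $l$ with respect to the prediction from the previous iteration.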

Another key feature of XGBoost is that it applies L2 regularization by default. The L2 penalty is the sum of the squared coefficients multiplied by lambda, which controls the strength of the penalty. Unlike L1 regularization, L2 does not provide feature selection: all coefficients are penalized uniformly and shrunk toward zero, but they never reach exactly zero. In general, L2 is the primary regularization method used to prevent overfitting the model.

$$ \lambda \sum \limits _{j=0} ^{k} m_{j}^{2} $$


Where $\lambda$ is the strength of the penalty. If $\lambda = 0$, no penalty is applied.
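As a minimal illustration, the L2 penalty strength is exposed as reg_lambda in the XGBoost Scikit-learn wrapper; the parameter values below are arbitrary and not those used later in this study:

```python
from xgboost import XGBClassifier

# reg_lambda is the L2 penalty strength (lambda above); the library default is 1.0.
# Setting it to 0 removes the penalty; larger values shrink the leaf weights harder.
model = XGBClassifier(
    objective="binary:logistic",
    reg_lambda=10.0,     # illustrative value only
    n_estimators=200,
    learning_rate=0.1,
)
```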

Model Selection

Each learning algorithm has an array of hyperparameters that are used to tune it in order to prevent over- and underfitting of the data. Model selection is the practice of automating this fine-tuning to select the model with the best parameters. There are several ways to do this; in this study, two of the most commonly used automated hyperparameter search techniques were explored. It is very common to combine hyperparameter search methods with cross-validation in order to obtain less biased performance estimates.

Grid Search is an exhaustive hyperparameter search method in which every combination of the hyperparameter values passed is considered. Because grid search attempts every possible combination, it is a very expensive method in both time and resources. On the upside, grid search guarantees that the best parameter combination within the specified grid will be found.

Randomized Search only considers a sample of possible hyperparameter combinations from the grid space, drawn without replacement (the same parameter combination will not be chosen twice). This makes it a much more efficient method in both time and computing resources because it does not have to fit every possible combination of hyperparameters. The downside of a randomized search is that it is not guaranteed to find the best possible combination, only an approximation. In most cases, the approximation is good enough, with marginal differences in performance.

A good strategy is to use both randomized and grid searches together. One can narrow the hyperparameter combinations’ possibilities by using randomized search and then further tune the model using grid search with a much narrower parameter space.
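A generic sketch of this two-stage strategy follows; the estimator and parameter values are placeholders, not the ones used later in this study, and X_train and y_train are assumed to exist:

```python
from scipy.stats import randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, StratifiedKFold

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Stage 1: broad randomized search over a wide parameter space.
wide_space = {"n_estimators": randint(100, 1000), "max_depth": randint(3, 20)}
random_search = RandomizedSearchCV(
    RandomForestClassifier(), wide_space, n_iter=10,
    scoring="roc_auc", cv=cv, random_state=42,
)
# random_search.fit(X_train, y_train)

# Stage 2: exhaustive grid search in a narrow region around the best result.
narrow_grid = {"n_estimators": [300, 350, 400], "max_depth": [10, 15, 20]}
grid_search = GridSearchCV(
    RandomForestClassifier(), narrow_grid, scoring="roc_auc", cv=cv,
)
# grid_search.fit(X_train, y_train)
```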

2. Methods

Data Retrieval

Five years of bankruptcy data were provided in five separate files in the .arff format, one file per year. In order to read the files and preserve the defined data formats, the arff.loadarff() function from the scipy.io package was used. pandas.concat() was then utilized to vertically concatenate all five years of data. The data was concatenated in order from the first to the fifth year; to accomplish this, the file names were sorted before being read in. The initial target class labels were in an inconvenient format, so the method convert_target was created to convert the target class to the int data type, where 0 represents "Not Bankrupt" and 1 represents "Bankrupt".
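A sketch of this retrieval step is shown below; the file pattern, the target column name, and the exact body of convert_target are assumptions based on the description above:

```python
import glob
import pandas as pd
from scipy.io import arff

def convert_target(value):
    # The raw .arff target loads as bytes (e.g. b'0' / b'1');
    # convert it to int: 0 = "Not Bankrupt", 1 = "Bankrupt".
    return int(value.decode("utf-8"))

frames = []
for path in sorted(glob.glob("*.arff")):      # sorted so year 1 is read first
    data, meta = arff.loadarff(path)          # preserves the defined data formats
    frames.append(pd.DataFrame(data))

df = pd.concat(frames, ignore_index=True)     # vertically stack the five years
df["class"] = df["class"].apply(convert_target)   # assumed target column name
```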

EDA & Data Preparation

The entire dataset consisted of 43,405 entries (companies) and 64 columns (attributes), not including the target class column. Initial observation of the summary statistics indicated that the mean and median differed for most attributes. This may be due to the nature of financial data, which can be skewed. This observation promoted the median as a more reliable imputation strategy than the mean, since the mean is easily influenced by outliers in a distribution. It was also observed that the target class was heavily imbalanced, with 2,091 bankrupt companies to 41,314 non-bankrupt companies, almost a 1-to-20 ratio. Measures such as stratified splits on the target class and methods to balance class weights during prediction were therefore considered.

Many columns had missing data. In particular, two attributes had over 10% missing data: Attr21 (sales (n) / sales (n-1)) was 13.4869% missing, and Attr37 ((current assets - inventories) / long-term liabilities) was 43.7369% missing. The rest of the attributes had much smaller percentages of missing data, mostly within one to two percent. Additionally, many attributes had strong correlations with each other (>0.99), such as Attr14 ((gross profit + interest) / total assets) and Attr18 (gross profit / total assets), or Attr18 and Attr7 (EBIT / total assets). Considering this was company financial data, it makes intuitive sense that many of the variables were extremely highly correlated.

To address some of the missing data issues and the very high correlations between variables at the same time, the variables were first ordered in ascending order of missing-data percentage. An algorithm was then deployed to delete, for every pair of variables with correlation greater than 0.95, the variable with more missing data, retaining the one with less. Correlation thresholds of 0.99 and 0.95 were both explored; prediction performance was better with the 0.95 threshold.
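A sketch of this correlation-based elimination procedure is shown below; the function name is illustrative, and the original implementation may differ in detail:

```python
import pandas as pd

def drop_correlated(df, threshold=0.95):
    """Drop one column of each highly correlated pair, keeping the one with less missing data."""
    # Order columns so those with the least missing data come first.
    ordered = df.isna().mean().sort_values().index.tolist()
    corr = df[ordered].corr().abs()
    to_drop = set()
    for i, keep in enumerate(ordered):
        if keep in to_drop:
            continue
        for other in ordered[i + 1:]:
            # 'other' has at least as much missing data as 'keep', so drop it.
            if other not in to_drop and corr.loc[keep, other] > threshold:
                to_drop.add(other)
    return df.drop(columns=list(to_drop))

# X_reduced = drop_correlated(X, threshold=0.95)
```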


Helper Functions & Data Split

A few helper functions were created to avoid repetition because many operations were performed repeatedly. get_acc_score retrieves the model's default scoring using model.score(). plot_roc_curve_custom plots the receiver operating characteristic (ROC) curve on the test data. get_classification_report produces a comprehensive overall report: it prints the training accuracy, the test accuracy, and the Scikit-learn classification_report on the test data (which includes each target class's precision, recall, and F1 score), and plots the ROC curve and the confusion matrix for the test data. cv_common and cv_summary assist in printing either GridSearchCV or RandomizedSearchCV results in a pandas data frame for easier comparison.
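A rough sketch of what get_classification_report might look like follows; the original notebook's implementation is not reproduced here, and this version relies on Scikit-learn's display utilities:

```python
from sklearn.metrics import (ConfusionMatrixDisplay, RocCurveDisplay,
                             classification_report)

def get_classification_report(model, X_train, y_train, X_test, y_test):
    # Training and test accuracy from the model's default scorer.
    print("Train accuracy:", model.score(X_train, y_train))
    print("Test accuracy: ", model.score(X_test, y_test))
    # Per-class precision, recall, and F1 on the test data.
    print(classification_report(y_test, model.predict(X_test)))
    # ROC curve and confusion matrix on the test data.
    RocCurveDisplay.from_estimator(model, X_test, y_test)
    ConfusionMatrixDisplay.from_estimator(model, X_test, y_test)
```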

Initial exploration indicated that predictions on the training data for Random Forest or XGBoost generated very high scores that were unrealistic for future predictions. In order to evaluate performance more objectively, a train/test split was created. The entire dataset was first shuffled because the data were in time order. A stratified split based on the target class was also utilized due to the heavy target class imbalance. Twenty percent of the data was saved as X_test and y_test, and 80 percent as X_train and y_train. The random state was set to ensure reproducible work.

A data pipeline called preprocessing was created. It used a median strategy to impute missing data for each attribute and then used the MinMaxScaler provided by Scikit-learn to scale variables into a range of zero to one.
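A sketch of the split and the preprocessing pipeline follows; variable names mirror the description above, while the random_state value and X and y are assumptions:

```python
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler

# X holds the remaining attributes, y the bankruptcy target built earlier.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, shuffle=True, stratify=y, random_state=42)

preprocessing = Pipeline([
    ("impute", SimpleImputer(strategy="median")),  # median imputation per attribute
    ("scale", MinMaxScaler()),                     # scale each variable to [0, 1]
])
```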


First Model - Random Forest

Random Forest was utilized as a baseline model. A pipeline incorporating both the preprocessing pipeline and the RandomForestClassifier was created under the variable rf_pipeline. RandomizedSearchCV was utilized due to the many different parameter combinations for tuning. Ten sets of parameters were sampled from the defined grid without replacement, and StratifiedKFold was used for 10-fold cross validation due to the heavy target class imbalance. roc_auc (area under the ROC curve) was used as the scoring metric to evaluate both classes more fairly during the search and to extract the best parameter set, because the area under the curve considers the trade-off between the True Positive Rate and the False Positive Rate across thresholds. max_features was set to "auto," which uses the square root of the number of features and has empirically generated good results for the Random Forest Classifier. class_weight was set to "balanced" to automatically adjust weights inversely proportional to class frequencies, helping address the target class imbalance. The other parameters sampled by RandomizedSearchCV are displayed under the variable params_rf:
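The original params_rf grid is not reproduced here; the sketch below shows the general structure of the search, with illustrative grid values chosen around the best combination reported next:

```python
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline

rf_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("rf", RandomForestClassifier(max_features="auto", class_weight="balanced",
                                  random_state=42)),
])

# Illustrative search space; the exact values in params_rf may have differed.
params_rf = {
    "rf__n_estimators": [150, 250, 350, 450],
    "rf__criterion": ["gini", "entropy"],
    "rf__max_depth": [5, 10, 15, 20],
    "rf__min_samples_split": [2, 5, 9, 12],
}

rf_search = RandomizedSearchCV(
    rf_pipeline, params_rf, n_iter=10, scoring="roc_auc",
    cv=StratifiedKFold(n_splits=10), random_state=42, n_jobs=-1)
# rf_search.fit(X_train, y_train)
```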

The best parameter combination was 350 estimators, the entropy criterion, a maximum depth of 15 for each individual tree in the forest, and a minimum samples split of 9.

Explore XGBoost early stopping round

Before tuning XGBoost using the Scikit-learn wrapper and RandomizedSearchCV to search for a near-best parameter combination, early stopping rounds were explored to estimate the time for each run. xgb.cv, the cross-validation method from the XGBoost library, was utilized for this exploration. To utilize xgb.cv, X_train was first fit_transformed with the previously defined preprocessing pipeline into the variable X_train_xg, and X_test was transformed into the variable X_test_xg. The dtrain and dtest variables were generated with xgb.DMatrix and built into the evaluation list under the variable evallist.

An initial 1000 rounds were tried, with a max_depth of 10, the objective function set to 'binary:logistic' (as this was a binary classification problem), the eval_metric parameter set to 'logloss', and a learning rate of 0.1. The early stopping criterion was set to five rounds, meaning that when the test set performance stops improving for five consecutive rounds, early stopping is triggered. Stratified five-fold cross validation was utilized due to the target class imbalance. See Fig. 1 for a plot of the train/test error versus the number of rounds. The train and test error initially dropped, with the drop slowing at around 15 rounds, and early stopping triggered at around 70 rounds. The 5-fold cross validation took 15.1 seconds to run, so training on this dataset was not expected to take very long for each set of parameters.

Figure 1: Training and Test Error versus Number of Rounds for XGBoost Exploration Model
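A sketch of the xgb.cv early stopping exploration described above follows; evallist is built as described even though xgb.cv performs its own internal folds, and the seed value is an assumption:

```python
import xgboost as xgb

X_train_xg = preprocessing.fit_transform(X_train)
X_test_xg = preprocessing.transform(X_test)

dtrain = xgb.DMatrix(X_train_xg, label=y_train)
dtest = xgb.DMatrix(X_test_xg, label=y_test)
evallist = [(dtrain, "train"), (dtest, "test")]

params = {
    "max_depth": 10,
    "objective": "binary:logistic",
    "eval_metric": "logloss",
    "eta": 0.1,               # learning rate
}

# Stratified 5-fold cross validation, stopping after 5 non-improving rounds.
cv_results = xgb.cv(
    params, dtrain, num_boost_round=1000, nfold=5, stratified=True,
    early_stopping_rounds=5, seed=42)
```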

Second Model - XGBoost

A pipeline incorporating both the preprocessing pipeline and the XGBClassifier (the XGBoost Scikit-learn wrapper) was created under the variable xgb_pipeline to be used for the search. RandomizedSearchCV was utilized due to the many different parameter combinations needed for tuning. Ten sets of parameters were sampled from the defined grid without replacement, and StratifiedKFold was used for 10-fold cross validation due to the heavy target class imbalance. As with the Random Forest search, roc_auc was used as the scoring metric to evaluate both classes more fairly and to extract the best parameter set. The objective function for XGBoost was set to "binary:logistic" because predicted probabilities were needed to calculate roc_auc. n_estimators for the XGBClassifier was set to 1000. This did not prolong the running time because early_stopping_rounds was set during the fit, monitoring the eval_set (X_test_xg, y_test) to stop the fitting process early once the validation score stops improving for the specified number of rounds. A value of 20 was chosen for early_stopping_rounds after some experimentation, as it allowed continued improvement in the cross validation scores compared to smaller values. The eval_metric was set to "auc" to keep the evaluation consistent with the cross validation scoring.

After trial and error exploration, the final parameters sampled by RandomizedSearchCV are displayed under the variable search_space:
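As above, the exact search_space is not reproduced; the sketch below mirrors the described setup with illustrative grid values, and the early-stopping fit arguments assume an XGBoost version that accepts them through fit:

```python
from sklearn.model_selection import RandomizedSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

xgb_pipeline = Pipeline([
    ("preprocessing", preprocessing),
    ("xgb", XGBClassifier(objective="binary:logistic", n_estimators=1000,
                          random_state=42)),
])

# Illustrative search space; the exact values in search_space may have differed.
search_space = {
    "xgb__learning_rate": [0.05, 0.1, 0.2],
    "xgb__max_depth": [4, 6, 8, 10],
    "xgb__subsample": [0.7, 0.8, 0.9, 1.0],
    "xgb__gamma": [0, 0.2, 0.4, 0.6],
}

xgb_search = RandomizedSearchCV(
    xgb_pipeline, search_space, n_iter=10, scoring="roc_auc",
    cv=StratifiedKFold(n_splits=10), random_state=42, n_jobs=-1)

# Early stopping is passed through to the XGBClassifier step at fit time.
# xgb_search.fit(X_train, y_train,
#                xgb__early_stopping_rounds=20,
#                xgb__eval_set=[(X_test_xg, y_test)],
#                xgb__eval_metric="auc",
#                xgb__verbose=False)
```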

The best parameter combination was a learning_rate of 0.1, max_depth (maximum tree depth for base learners) of 8, subsample (subsample ratio of the training instances) of 90%, and gamma (minimum loss reduction required to make a further partition on a leaf node of the tree) of 0.4. The ten-fold cross validation score was similar to the final test set score, as discussed in the Results section.

The final XGBoost model's best iteration occurred at 178 boosting rounds with an early stopping criterion of 20 rounds, meaning the validation score did not improve during the 20 rounds following round 178.

3. Results

The Random Forest and XGBoost models were assessed using the test set held out at the beginning of the case study. This test set was composed of 20% of the original data, selected after a shuffle to avoid any time order effects and then stratified so the test set contained a similar ratio of bankrupt to not bankrupt data as the training set. After tuning the hyperparameters, the optimal parameter combination for each model was used to make predictions on the test set. The ten-fold cross validation auc score for the best Random Forest model was 0.86, and the test score was 0.85. The ten-fold cross validation auc score for the best XGBoost Model was 0.94, and the test score was 0.96. As expected, validation and test scores were very similar.

A Random Forest model was constructed as a baseline, with the expectation that a properly tuned XGBoost model would outperform it. A comparison of the two models can be seen in Table 1.


XGBoost vs Random Forest Models
Model            Accuracy   Precision (not bankrupt)   Recall (not bankrupt)   Precision (bankrupt)   Recall (bankrupt)   AUC
XGBoost          0.97       0.97                       1.00                    0.88                   0.45                0.95585
Random Forest    0.94       0.97                       0.97                    0.34                   0.32                0.85353

Table 1: Comparison of XGBoost and Random Forest Models


The XGBoost model performed better than, or equal to, the Random Forest model on all metrics. Although overall accuracy was better with the XGBoost model, the large improvement in precision and recall for the bankrupt category is more important. Bankruptcy prediction was the essential function of the model, so improvement in these specific metrics was most relevant for assessing the best model. There was a slight improvement in predictions of non-bankrupt businesses with the XGBoost model as well. Of the 418 bankrupt businesses in the test set, the XGBoost model identified 190 (Fig. 2), a recall of 0.45. This was superior to the Random Forest model (Fig. 3), which identified only 133 (recall 0.32). The difference in precision between the two models was even greater. Of the 215 companies the XGBoost model predicted to be bankrupt, 190 were correct. The Random Forest model was much less precise, predicting 395 companies to be bankrupt when only 133 were correct.

Figure 2: Confusion Matrix for XGBoost Model
Figure 3: Confusion Matrix for Random Forest Model

4. Conclusion

The final XGBoost model is recommended to the finance department for the financial delinquency project. It provides the best classification predictions of all models explored; in particular, it correctly identifies more bankrupt companies than the Random Forest model. Several methods could be attempted to improve results further based on additional feedback from the finance department. Expanding the search grid to include more parameters to tune, in combination with GridSearchCV, which attempts all parameter combinations rather than a random selection, could result in a slightly better model. However, it is worth noting that this would require more time and company resources, and the improvement over the current result may not be significant. Another, more efficient approach to fine-tuning the XGBoost model would involve trade-offs and guidance from the finance department regarding preferences for balancing risk: the cut-off for predicting bankruptcy can be adjusted to identify more true positives at the cost of more false positives, if so desired.

The final model to be deployed could be retrained on the entire available dataset once the finance department agrees with the current estimated performance. Future models could also incorporate time series techniques to account for the time series nature of financial data. This option could be explored with the goal of further improving bankruptcy predictions.

Appendix - Code

Helper Function Models

Building Random Forest Model using MinMaxScaling and SimpleImputer

Explore XGBoost early stopping round

Search for best parameters for XGBoost using SKLearn wrapper

Early Stopping at 10

Early Stopping at 20