Case Study Two

Diabetes Readmission Study

David Grijalva, Nicole Norelli, & Mingyang Nick YU

9/17/2021

Abstract

The following deliverable investigated hospital readmission of diabetes patients and built a model to predict it. Different aspects of each patient's visit were collected, and logistic regression with regularization was the primary machine learning method used. Due to the nature of the domain, missing data and ethical concerns were addressed. Cross validation was used for model selection and for evaluating different tuning parameters. Feature importance, highlighting the features contributing most to each target category, as well as the results and benefits of the prediction model, are discussed.

1. Introduction

This case study focused on predicting whether an admitted diabetes patient would be readmitted, based on the characteristics of each patient and the outcomes of the hospital stay. Information was extracted based on several criteria, including whether it was a diabetic encounter, whether the stay at the hospital was between 1 and 14 days, whether lab tests were performed, and whether medications were administered. Details can be found here.

The data comes from 130 U.S. hospitals over the years 1999-2008. It includes over 100,000 hospital admissions and 47 attributes, including the target. The three target outcomes were "<30" (readmitted in less than 30 days), ">30" (readmitted after more than 30 days), and "NO" (no readmission). There were eight numeric variables, including time in hospital, number of lab procedures, and number of medications, and 42 categorical variables, including encounter id (unique to each encounter), patient number (unique to each patient), race, and gender.


Logistic Regression

Logistic Regression was utilized to perform the classification task for this case study. It applies the sigmoid function to a linear combination of the predictors, converting it into a probability between zero and one. Similar to Multiple Linear Regression, Logistic Regression can utilize both categorical and numeric variables as predictors. Another benefit of using Logistic Regression is the interpretability of variable importance: when the predictors are transformed to the same range, the coefficient of each variable indicates how much it contributes to predicting the relevant outcome.

For binary classification problems, Logistic Regression uses a two-part loss function: when the target $y$ is 1 the loss is $-\log(p)$, and when the target is 0 the loss is $-\log(1-p)$, which combine into the cross-entropy loss $-\left[y\log(p) + (1-y)\log(1-p)\right]$. Multi-class classification problems can be broken down into binary classification problems using a strategy such as One-vs-Rest (OvR): for each class, the model is trained to separate that class from all of the others. The Scikit-learn package used for this study supports this option. One potential downside of this method is that it can suffer from unbalanced negative examples when the class being compared has few observations.
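A minimal sketch of this loss in NumPy (not the code used in the study; the linear scores below are made up purely for illustration):

```python
import numpy as np

def sigmoid(z):
    # Map a linear combination of predictors to a probability in (0, 1)
    return 1.0 / (1.0 + np.exp(-z))

def cross_entropy(y, p, eps=1e-15):
    # Combined two-part loss: -[y*log(p) + (1-y)*log(1-p)]
    p = np.clip(p, eps, 1 - eps)
    return -np.mean(y * np.log(p) + (1 - y) * np.log(1 - p))

# Toy example: two observations, one of each class
y = np.array([1, 0])
p = sigmoid(np.array([2.0, -1.5]))   # predicted probabilities of class 1
print(cross_entropy(y, p))
```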

Another Logistic Regression option for multi-class classification supplied by Scikit-learn is the multinomial solver. It is based on maximum likelihood, specifically the conditional likelihood of $G$ given $X$. Since $Pr(G|X)$ completely specifies the conditional distribution, the multinomial distribution is appropriate. Below is the log-likelihood for $N$ observations (see details in textbook section 4.4.1):

$\ell(\Theta) = \sum_{i=1}^{N} \log p_{g_{i}}(x_{i};\Theta)$


where $p_{k}(x_{i};\Theta) = Pr(G = k|X= x_{i};\Theta)$.
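As a toy numeric illustration of this log-likelihood (the scores below are invented and are not part of the study's code), the multinomial probabilities can be computed with a softmax and the log-probability of each observation's true class summed:

```python
import numpy as np

def softmax(scores):
    # Row-wise softmax: Pr(G = k | X = x_i; Theta)
    e = np.exp(scores - scores.max(axis=1, keepdims=True))
    return e / e.sum(axis=1, keepdims=True)

# Toy linear scores for N = 3 observations and K = 3 classes
scores = np.array([[ 2.0, 0.5, -1.0],
                   [ 0.1, 1.5,  0.3],
                   [-0.5, 0.2,  2.2]])
g = np.array([0, 1, 2])              # observed class g_i of each observation

probs = softmax(scores)
log_likelihood = np.sum(np.log(probs[np.arange(len(g)), g]))
print(log_likelihood)
```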


Ridge Regularization (L2)

Ridge regularization, or L2, is the default penalty term for Logistic Regression in Scikit-learn. The penalty is the sum of the squared coefficients multiplied by $\lambda$, which controls the strength of the penalty. Unlike L1 regularization, L2 does not provide feature selection: all coefficients are shrunk toward zero, but none are driven exactly to zero. In general, L2 is the primary regularization method used to prevent overfitting the model.

Penalty term:

$\lambda \sum\limits_{j=1}^{k} m_j^2$

where $\lambda$ is the strength of the penalty applied to the $k$ coefficients. If $\lambda = 0$, no penalty is applied and the original coefficients are returned.
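In Scikit-learn, the strength of this penalty is set through the parameter C, which acts as the inverse of $\lambda$: a small C means a strong penalty, a large C a weak one. A small illustrative sketch on synthetic data (not part of the study's code):

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

# Toy data just to illustrate the effect of the penalty strength
rng = np.random.default_rng(0)
X = rng.normal(size=(200, 5))
y = (X[:, 0] + 0.5 * X[:, 1] + rng.normal(size=200) > 0).astype(int)

# Small C = strong L2 penalty (coefficients shrink); large C = weak penalty
for C in (0.01, 30):
    model = LogisticRegression(penalty="l2", C=C, max_iter=1000).fit(X, y)
    print(C, np.round(model.coef_, 3))
```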


Target Class Imbalance

Target class imbalance was a potential concern for this study, with class "NO" being the majority class (around 54% of all encounters). To give each minority class a fair chance to be evaluated during the grid search for the best-performing parameters, scoring="roc_auc_ovr_weighted" (area under the curve using one-vs-rest comparison with average metrics of each label weighted by support) was utilized. The goal was to predict the target classes "<30", ">30", and "NO" equally well, instead of achieving a higher accuracy score simply by predicting the "NO" majority class for everything.
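The same metric can be computed directly with Scikit-learn's roc_auc_score; the labels and probabilities below are hypothetical and only illustrate the one-vs-rest, support-weighted averaging:

```python
import numpy as np
from sklearn.metrics import roc_auc_score

# Hypothetical true labels and predicted class probabilities
classes = np.array(["<30", ">30", "NO"])       # lexicographic label order
y_true = np.array(["NO", "<30", ">30", "NO", "NO", ">30"])
y_proba = np.array([[0.1, 0.2, 0.7],
                    [0.6, 0.3, 0.1],
                    [0.2, 0.5, 0.3],
                    [0.2, 0.2, 0.6],
                    [0.3, 0.3, 0.4],
                    [0.1, 0.6, 0.3]])          # columns follow `classes`

# One-vs-rest AUC per class, averaged with weights equal to class support
auc = roc_auc_score(y_true, y_proba, multi_class="ovr",
                    average="weighted", labels=classes)
print(auc)
```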

2. Methods

Initial Data Observations

This dataset contained 49 features for 101,766 patient encounters. The feature "encounter_id" was unique to each encounter, and the feature "patient_nbr" identified each of the 71,518 unique patients. Of these patients, 54,745 had only one encounter, while the remaining patients had multiple encounters. Repeat encounters from the same individual violate the independence assumption of logistic regression; however, the information available in these repeat encounters was valuable. An initial exploration of repeat encounters showed the same individuals receiving different types of diagnoses, medications, and tests from different specialties, with different outcomes. Although it potentially biases the results of the study, the data from these repeat patients was retained.

The features "encounter_id" and "patient_nbr" were identifiers specific to each encounter and patient rather than informative predictors, so they were deleted from the dataset prior to analysis.

Consolidation of Categories

Four of the features (medical_specialty, diag_1, diag_2, and diag_3) had a large number of categories. The medical_specialty feature contained many categories with only a few encounters each. The 15 specialties with the largest number of encounters were retained, and all other (non-missing) categories were consolidated into a single category, "Others."
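A sketch of this consolidation in pandas, assuming the encounters DataFrame is named df and that the raw file marks missing values with "?" (both assumptions, not taken from the report's code):

```python
import pandas as pd

# The 15 most frequent non-missing specialties are kept as-is
top_15 = (df.loc[df["medical_specialty"] != "?", "medical_specialty"]
            .value_counts()
            .nlargest(15)
            .index)

# Keep the top 15 and the missing marker; collapse everything else into "Others"
df["medical_specialty"] = df["medical_specialty"].where(
    df["medical_specialty"].isin(top_15) | (df["medical_specialty"] == "?"),
    "Others")
```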

The "diag_1", "diag_2", and "diag_3" features had 717, 749, and 790 categories, respectively. Values in each of these diagnoses categories are International Classification of Diseases (ICD-9) codes. These codes can be grouped into general categories (example found here), and inspiration for this technique was taken from Strack et al. (article). For each of the three diagnoses features, values were consolidated into 19 categories plus one category for missing values. A diagnosis of diabetes mellitus was the only specific diagnosis to keep its own category, as it was the focus of the study.

Missing Data

Seven features contained missing values (Table 1). Because the feature "weight" was missing more than 96% of its values, imputation was not attempted and it was deleted from the dataset prior to analysis. For the remaining six features with missing data, two different imputation strategies were attempted, and the results were compared after fitting the logistic regression model.

The first imputation strategy involved replacing the missing values in each of the six features with a flag value ("NA") to mark values as missing.

The second imputation strategy used Scikit-learn's SimpleImputer to replace each of the missing values with the most frequent value in each feature, as only categorical data was missing.

Feature              Percentage Missing
race                 2.2336%
weight               96.8585%
payer_code           39.5574%
medical_specialty    49.0822%
diag_1               0.0206%
diag_2               0.3518%
diag_3               1.3983%

Table 1: Percentage of missing data for each variable
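The two imputation strategies described above could be sketched as follows (column and variable names are assumptions, as is the "?" missing marker; in the actual study the SimpleImputer was applied inside the pipeline, as described in the next section):

```python
from sklearn.impute import SimpleImputer

missing_cols = ["race", "payer_code", "medical_specialty",
                "diag_1", "diag_2", "diag_3"]

# Strategy 1: flag missing values with an explicit "NA" level
df_flag = df.copy()
df_flag[missing_cols] = df_flag[missing_cols].replace("?", "NA").fillna("NA")

# Strategy 2: mode (most frequent value) imputation, to be placed inside the
# second pipeline so the mode is learned from the training folds only
mode_imputer = SimpleImputer(strategy="most_frequent")
```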


Creating Models Using Pipeline

Each model was created using a pipeline. A pipeline allowed for streamlined scaling with MinMaxScaler(), one-hot encoding with OneHotEncoder(), and tuning of the logistic regression hyperparameters. Additionally, SimpleImputer() was incorporated into one pipeline to implement and compare the different imputation methods. Pipelines prevent data leakage when using grid search with 10-fold cross validation to narrow down the best regularization parameter. Because the outcome classes were imbalanced, stratified cross validation was used and the logistic regression class weight parameter was set to 'balanced'. A random state was also set for reproducibility.
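A sketch of such a pipeline, with assumed names for the feature matrix (X), the column selections, and the random state; the actual preprocessing details of the study may differ:

```python
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

# X is the feature DataFrame (assumed name)
numeric_cols = X.select_dtypes(include="number").columns
categorical_cols = X.select_dtypes(exclude="number").columns

# Scale numeric features to a common range and one-hot encode categoricals
preprocess = ColumnTransformer([
    ("num", MinMaxScaler(), numeric_cols),
    ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols),
])

pipeline_lr = Pipeline([
    ("preprocess", preprocess),
    ("clf", LogisticRegression(class_weight="balanced",
                               max_iter=1000, random_state=42)),
])

# Stratified folds preserve the class proportions in each split
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)
```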

First Model: Logistic Regression with Flag Value Imputation

To compare the results of two imputation methods, two pipelines were created. The first used flag values for each of the variables with missing data. A pipeline was created under the variable 'pipeline_lr' to tune the inverse regularization strength parameter (C) as well as the multiclass method ('ovr' or 'multinomial'), each with an appropriate solver algorithm ('liblinear' or 'lbfgs'). Values of [10, 15, 20, 25, 30] for C were tried, and stratified 10-fold cross validation was used. The best combination (highest value for area under the curve using one-vs-rest comparison with average metrics of each label weighted by support) was C = 30 using 'ovr.'
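A sketch of the corresponding grid search, reusing the pipeline_lr and cv objects from the sketch above; the parameter values follow the text, but the exact grid layout and data variable names (X, y) are assumptions:

```python
from sklearn.model_selection import GridSearchCV

# liblinear only supports one-vs-rest, while lbfgs supports multinomial,
# so the two combinations are listed as separate grids
param_grid = [
    {"clf__C": [10, 15, 20, 25, 30],
     "clf__multi_class": ["ovr"],
     "clf__solver": ["liblinear"]},
    {"clf__C": [10, 15, 20, 25, 30],
     "clf__multi_class": ["multinomial"],
     "clf__solver": ["lbfgs"]},
]

grid = GridSearchCV(pipeline_lr, param_grid,
                    scoring="roc_auc_ovr_weighted", cv=cv, n_jobs=-1)
grid.fit(X, y)
print(grid.best_params_, grid.best_score_)
```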

Second Model: Logistic Regression with SimpleImputer (Mode Imputation)

The second model used imputation of the most common value for each variable with missing data. This pipeline included Scikit-learn's SimpleImputer. The imputed values for the features "race", "payer_code", "diag_1", "diag_2", and "diag_3" were "Caucasian", "MC", "DOCS", "DOCS", and "DOCS", respectively. Of note, imputing "Caucasian" in the race feature raised some ethical concerns for this model. A pipeline was created under the variable 'pipeline_lr_impute' to tune the inverse regularization strength parameter (C) as well as the multiclass method ('ovr' or 'multinomial'), each with an appropriate solver algorithm ('liblinear' or 'lbfgs'). Values of [10, 15, 20, 25, 30] for C were tried, and stratified 10-fold cross validation was used. The best combination (highest value for area under the curve using one-vs-rest comparison with average metrics of each label weighted by support) was C = 30 using 'ovr.'

Alternative Model

Concerns over the use of a "race" feature resulted in the creation of an alternative model. Race was deleted from the dataset, and flag values were used to impute the remaining features with missing data. The pipeline from Model 1 was implemented. The best combination (highest value for area under the curve using one-vs-rest comparison with average metrics of each label weighted by support) was C = 30 using 'ovr.'

3. Results

Models

Table 2 compares the two imputation models built. The best-performing model was Model 1, which used the flag value imputation method described above. For both models, performance was assessed using the weighted AUC. This metric was chosen because of the target class imbalance in the dataset: the weighted AUC calculates the AUC for each class and then averages the values, weighted by the number of instances in each class. The best mean score was taken from the mean test score (weighted AUC) across the 10-fold cross-validation provided by the Scikit-learn GridSearchCV object. Model 1 had the best mean test score, 0.67998. Model 2 performed slightly, but not significantly, worse, with a weighted AUC of 0.67372.


Model      Weighted AUC
Model 1    0.67998
Model 2    0.67372
Table 2: Weighted AUC per model

The confusion matrices for Model 1 (Fig. 1) and Model 2 (Fig. 2) show the prediction details of each model when making predictions on the entire dataset. Precision and recall for each class label can be calculated from the matrix if needed. For example, under Model 1, precision for the "Not Readmitted" class can be calculated by:
$ Precision = 40829\div(40829+17530+5045) = 0.64 $


Recall for the "Not Readmitted" class can be calculated by:

$ Recall = 40829\div(40829+11149+2886) = 0.74 $
Figure 1: Confusion matrix for Model 1 (flag value imputation)
Figure 2: Confusion matrix for Model 2 (mode imputation)
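A quick check of these figures using the counts reported above (the row/column assignments follow the formulas in the text):

```python
# Counts taken from the Model 1 confusion matrix (Fig. 1)
predicted_no = [40829, 17530, 5045]   # column: encounters predicted "Not Readmitted"
actual_no = [40829, 11149, 2886]      # row: encounters that were actually "Not Readmitted"

precision_no = 40829 / sum(predicted_no)   # 0.64
recall_no = 40829 / sum(actual_no)         # 0.74
print(round(precision_no, 2), round(recall_no, 2))
```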

Feature Importance

There was a great deal of overlap between the important features contributing to each target outcome in the two models. For the "<30" days readmission target, Model 1 (Fig. 3) and Model 2 (Fig. 4) had the same top 10 features with slight differences in coefficients, suggesting that the imputation method did not significantly change which features the model considered important. For the ">30" days readmission target, Model 1 (Fig. 5) and Model 2 (Fig. 6) had the same top five features. Among the remaining five features there were some slight differences in order; for example, glyburide_metformin_Down was at position 10 in Model 1 and position 8 in Model 2. For the "No Readmission" target, the top 8 features were the same in Model 1 (Fig. 7) and Model 2 (Fig. 8), with some slight differences in coefficients.

Figure 3: Top ten most important features for Model 1 readmission under 30 days outcome
Figure 4: Top ten most important features for Model 2 readmission under 30 days outcome
Figure 5: Top ten most important features for Model 1 readmission over 30 days outcome
Figure 6: Top ten most important features for Model 2 readmission over 30 days outcome
Figure 7: Top ten most important features for Model 1 no readmission outcome
Figure 8: Top ten most important features for Model 2 no readmission outcome
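Per-class coefficients like those plotted above can be pulled from a fitted pipeline along these lines (a sketch assuming the step names and grid object from the earlier sketches; features are ranked here by absolute coefficient value):

```python
import pandas as pd

# `grid` is the fitted GridSearchCV from the grid-search sketch
best_pipe = grid.best_estimator_
feature_names = best_pipe.named_steps["preprocess"].get_feature_names_out()
coefs = best_pipe.named_steps["clf"].coef_       # one row of coefficients per class
classes = best_pipe.named_steps["clf"].classes_

# Top ten features for each target class
for label, row in zip(classes, coefs):
    top = pd.Series(row, index=feature_names).abs().nlargest(10)
    print(label)
    print(top, "\n")
```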

As can be seen in Tables 3 and 4, both models had the same top five features for the ">30" and "<30" target variables, while the "No Readmission" target had a somewhat different set of top five features. This was to be expected, as the ">30" and "<30" targets both correspond to patients who were actually readmitted. For the top five features, there appear to be no differences in important characteristics between the two readmitted targets.

Model 1                        Model 2
discharge_disposition_id_11    discharge_disposition_id_11
number_inpatient               number_inpatient
number_emergency               number_emergency
admission_type_id_7            admission_type_id_7
discharge_disposition_id_12    discharge_disposition_id_12

Table 3: Top 5 features per model for <30 and >30 targets

Model 1                        Model 2
number_emergency               number_emergency
discharge_disposition_id_11    discharge_disposition_id_11
number_inpatient               number_inpatient
admission_type_id_7            admission_type_id_7
number_outpatient              number_outpatient

Table 4: Top 5 features per model for No Readmission target

Alternative Model

To explore the necessity of retaining a race feature, an alternative model was built from Model 1, the best-performing model. The only difference between them was the deletion of the "race" feature in the alternative model. Results were very similar to those of Model 1, with a weighted AUC of 0.67904 for the alternative model versus 0.67998 for Model 1.

This study chose to retain the race feature, as the subject matter was medical and different demographics can be affected by medical issues differently. Also, retaining the race feature allowed for the possibility that patients of different races could have been given different qualities of medical care. An examination of feature importance in Models 1 and 2 showed that race was not one of the more important features. The very small effect its removal had on model results also supports this. While it may still contribute a small amount of useful information for prediction, a race feature does not appear to be essential. While the recommended model (Model 1) retains it, the alternative model is a viable option if there are ethical concerns about the implementation and use of the model.

4. Conclusion

Model 1 was the best model for predicting the hospital readmission of diabetes patients, although both models had very similar performance. This suggests that, for this problem and dataset, the imputation method contributed very little to a higher AUC score. No feature selection was performed in this study, so both models used the same number of variables.

Regarding the imputation methods, the AUC scores show very little advantage for either one. It is worth noting that the mode imputation used in Model 2 is the kind of step that could introduce data leakage, meaning that information from the test data leaks into the training process. Because the imputation was placed inside the pipeline, this was avoided: in each cross-validation split created by the GridSearchCV object, the data were divided into training and test groups, the imputation values were calculated from the training group only, and those same values were then applied to the test group, so no information from the test group was used to calculate the imputation values. Future improvements to increase the accuracy of this classification problem could include feature selection or trying different algorithms such as XGBoost, which can handle missing data without imputation.

Appendix - Code

Initial Data Read In and Conversion

Exploratory Data Analysis

Numerical Data

Categorical Data

Data Deletion, Imputation and Conversion & Fitting Logistic Regression Model - Missing Data Converted to Individual Levels

Data Deletion, Imputation and Conversion & Fitting Logistic Regression Model - Imputation Using SimpleImputer

Method 2 of imputation