import shap
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import eli5
from eli5.sklearn import PermutationImportance
Reading in the data
# Read the hospital-readmissions training set into a DataFrame.
# Each row is one patient encounter; the target column is `readmitted`.
data = pd.read_csv('../input/hospital-readmissions/train.csv')
Training our model
# Separate the target from the predictors.
y = data.readmitted
# Use every column except the target as a feature.
base_features = [c for c in data.columns if c != "readmitted"]
X = data[base_features]

# Hold out a validation split (default 25%) with a fixed seed for reproducibility.
train_X, val_X, train_y, val_y = train_test_split(X, y, random_state=1)

# Fit a small random forest; 30 trees keeps training fast for this exploration.
my_model = RandomForestClassifier(n_estimators=30, random_state=1).fit(train_X, train_y)
Performing some permutation importance analysis first
# Compute permutation importance on the *validation* split so the scores
# reflect generalization, not training-set memorization.
perm1 = PermutationImportance(my_model, random_state=1).fit(val_X, val_y)
# Render the importance table with human-readable feature names.
eli5.show_weights(perm1, feature_names=val_X.columns.tolist())
Weight | Feature |
---|---|
0.0451 ± 0.0068 | number_inpatient |
0.0087 ± 0.0046 | number_emergency |
0.0062 ± 0.0053 | number_outpatient |
0.0033 ± 0.0016 | payer_code_MC |
0.0020 ± 0.0016 | diag_3_401 |
0.0016 ± 0.0031 | medical_specialty_Emergency/Trauma |
0.0014 ± 0.0024 | A1Cresult_None |
0.0014 ± 0.0021 | medical_specialty_Family/GeneralPractice |
0.0013 ± 0.0010 | diag_2_427 |
0.0013 ± 0.0011 | diag_2_276 |
0.0011 ± 0.0022 | age_[50-60) |
0.0010 ± 0.0022 | age_[80-90) |
0.0007 ± 0.0006 | repaglinide_No |
0.0006 ± 0.0010 | diag_1_428 |
0.0006 ± 0.0022 | payer_code_SP |
0.0005 ± 0.0030 | insulin_No |
0.0004 ± 0.0028 | diabetesMed_Yes |
0.0004 ± 0.0021 | diag_3_250 |
0.0003 ± 0.0018 | diag_2_250 |
0.0003 ± 0.0015 | glipizide_No |
… 44 more … |
# Explain a single prediction: take the first validation row.
data_for_prediction = val_X.iloc[0, :]

# Creating an object that can calculate shap values.
# TreeExplainer is the fast, exact explainer for tree ensembles.
explainer = shap.TreeExplainer(my_model)
shap_values = explainer.shap_values(data_for_prediction)

# Load the JS visualization machinery, then draw the force plot.
# Index [0] selects the first class's expected value / SHAP values
# (for a binary classifier, shap returns one array per class).
shap.initjs()
shap.force_plot(explainer.expected_value[0], shap_values[0], data_for_prediction)
Looking at both the permutation importance table and the Shapley (SHAP) values, it would be safe to conclude that "number_inpatient" is a really important feature