[R] R을 이용한 신용평가모형 개발

GermanCredit data
EDA
Modeling with caret
Evaluation
- ROC curve

library(caret) # GermanCredit
library(tidyverse)
library(ggthemes)
library(ggmosaic)
library(gridExtra)

모델링에 사용할 데이터는 GermanCredit이다. caret 패키지의 내장 데이터이고, 패키지에 있는 데이터 설명을 보면 아래와 같다.

Description

Data from Dr.<u+00a0>Hans Hofmann of the University of Hamburg.</u+00a0>

Details

These data have two classes for the credit worthiness: good or bad. There are predictors related to attributes, such as: checking account status, duration, credit history, purpose of the loan, amount of the loan, savings accounts or bonds, employment duration, Installment rate in percentage of disposable income, personal information, other debtors/guarantors, residence duration, property, age, other installment plans, housing, number of existing credits, job information, Number of people being liable to provide maintenance for, telephone, and foreign worker status.

Many of these predictors are discrete and have been expanded into several 0/1 indicator variables

Source

UCI Machine Learning Repository

data("GermanCredit")

Class 변수가 우량(Good)과 불량(Bad) 라벨이 붙은 타겟변수이고, 우불량 여부에 영향을 미칠 수 있는 독립변수들이 포함되어 있다. 범주형 변수는 모형이 바로 투입하기 용이하도록 원핫인코딩 되어 있다.

GermanCredit$Class %>% table

.
 Bad Good 
 300  700

연속형 변수에 대해서는 density plot, 범주형 변수에 대해서는 mosaic plot을 그려볼 수 있다.

GermanCredit %>% ggplot(aes(Duration, fill=Class)) + geom_density(alpha=0.5) + theme_bw() + scale_fill_tableau()

GermanCredit %>% ggplot() + 
  geom_mosaic(aes(product(NumberExistingCredits), fill=Class)) + 
  theme_bw() + scale_fill_tableau() + xlab('NumberExistingCredits') + ylab('')

원 데이터에는 범주형 변수들이 원핫인코딩 되어 있지만 모자이크 플랏을 그리기 편하게 하기 위해 다시 하나의 변수로 바꿔주었다.

data_reform <- GermanCredit %>% 
  mutate(CheckingAccountStatus = case_when(CheckingAccountStatus.lt.0==1~"0미만",
                                           CheckingAccountStatus.0.to.200==1~"0~200",
                                           CheckingAccountStatus.gt.200==1~"200초과",
                                           CheckingAccountStatus.none==1~"정보없음"),
         CreditHistory = case_when(CreditHistory.NoCredit.AllPaid==1~"NoCredit.AllPaid",
                                   CreditHistory.ThisBank.AllPaid==1~"ThisBank.AllPaid",
                                   CreditHistory.PaidDuly==1~"PaidDuly",
                                   CreditHistory.Delay==1~"Delay",
                                   CreditHistory.Critical==1~"Critical"),
         Purpose = case_when(Purpose.NewCar==1~"NewCar",
                             Purpose.UsedCar==1~"UsedCar",
                             Purpose.Furniture.Equipment==1~"Furniture.Equipment",
                             Purpose.Radio.Television==1~"Radio.Television",
                             Purpose.DomesticAppliance==1~"DomesticAppliance",
                             Purpose.Repairs==1~"Repairs",
                             Purpose.Education==1~"Education",
                             Purpose.Vacation==1~"Vacation",
                             Purpose.Retraining==1~"Retraining",
                             Purpose.Business==1~"Business",
                             Purpose.Other==1~"Other"),
         SavingsAccountBonds = case_when(SavingsAccountBonds.lt.100==1~"100미만",
                                         SavingsAccountBonds.100.to.500==1~"100~500",
                                         SavingsAccountBonds.500.to.1000==1~"500~1000",
                                         SavingsAccountBonds.gt.1000==1~"1000초과",
                                         SavingsAccountBonds.Unknown==1~"정보없음"),
         EmploymentDuration = case_when(EmploymentDuration.lt.1==1~"1년미만",
                                        EmploymentDuration.1.to.4==1~"1년~4년",
                                        EmploymentDuration.4.to.7==1~"4년~7년",
                                        EmploymentDuration.gt.7==1~"7년초과",
                                        EmploymentDuration.Unemployed==1~"미고용"),
         Personal = case_when(Personal.Male.Divorced.Seperated==1~"Male.Divorced.Seperated",
                              Personal.Female.NotSingle==1~"Female.NotSingle",
                              Personal.Male.Single==1~"Male.Single",
                              Personal.Male.Married.Widowed==1~"Male.Married.Widowed",
                              Personal.Female.Single==1~"Female.Single"),
         OtherDebtorsGuarantors = case_when(OtherDebtorsGuarantors.None==1~"None",
                                            OtherDebtorsGuarantors.CoApplicant==1~"CoApplicant",
                                            OtherDebtorsGuarantors.Guarantor==1~"Guarantor"),
         Property = case_when(Property.RealEstate==1~"RealEstate",
                              Property.Insurance==1~"Insurance",
                              Property.CarOther==1~"CarOther",
                              Property.Unknown==1~"Unknown"),
         OtherInstallmentPlans = case_when(OtherInstallmentPlans.Bank==1~"Bank",
                                           OtherInstallmentPlans.Stores==1~"Stores",
                                           OtherInstallmentPlans.None==1~"None"),
         Housing = case_when(Housing.Rent==1~"Rent",
                             Housing.Own==1~"Own",
                             Housing.ForFree==1~"ForeFree"),
         Job = case_when(Job.UnemployedUnskilled==1~"UnemployedUnskilled",
                         Job.UnskilledResident==1~"UnskilledResident",
                         Job.SkilledEmployee==1~"SkilledEmployee",
                         Job.Management.SelfEmp.HighlyQualified==1~"Management.SelfEmp.HighlyQualified")
  )

data_reform %>% ggplot() + 
  geom_mosaic(aes(product(CheckingAccountStatus), fill=Class)) + 
  theme_bw() + scale_fill_tableau() + xlab('CheckingAccountStatus') + ylab('')

다른 변수들도 같은 방식으로 탐색해 볼 수 있다.

sel_vars <- c("Duration", "Amount", "InstallmentRatePercentage", "ResidenceDuration", 
  "Age", "NumberExistingCredits", "NumberPeopleMaintenance", "Telephone",
  "ForeignWorker", "Class", names(data_reform)[sapply(data_reform, class) == "character"])

dat <- data_reform %>% select(sel_vars)

Make imbalance dataset

# dat_b <- dat %>% filter(Class=="Bad") 
# dat_g <- dat %>% filter(Class=="Good")
# 
# set.seed(42)
# dat <- rbind(dat_b[sample(nrow(dat_b), 20),], dat_g)

Split dataset

set.seed(42)
train_idx <- sample(nrow(dat), nrow(dat)*0.8)
dtrain <- dat[train_idx, ]
dtest <- dat[-train_idx, ]

trainControl setup

ctrl <- trainControl(method="cv",
                     number=5, # 5-folds CV
                     classProbs = TRUE,
                     summaryFunction = twoClassSummary,
                     verboseIter = TRUE)

Logistic Regression

set.seed(42)
fit_logit <- train(Class~., data=dtrain,
                   method="glm", family="binomial", maxit=100, epsilon=0.0001,
                   trControl=ctrl, metric="ROC")

+ Fold1: parameter=none 
- Fold1: parameter=none 
+ Fold2: parameter=none 
- Fold2: parameter=none 
+ Fold3: parameter=none 
- Fold3: parameter=none 
+ Fold4: parameter=none 
- Fold4: parameter=none 
+ Fold5: parameter=none 
- Fold5: parameter=none 
Aggregating results
Fitting final model on full training set

fit_logit

Generalized Linear Model 

800 samples
 20 predictor
  2 classes: 'Bad', 'Good' 

No pre-processing
Resampling: Cross-Validated (5 fold) 
Summary of sample sizes: 640, 640, 640, 640, 640 
Resampling results:

  ROC        Sens       Spec    
  0.7661335  0.4857143  0.845045

pred_logit <- predict(fit_logit, newdata=dtest, type="prob")

Random Forest

set.seed(42)
fit_rf <- train(Class~., data=dtrain,
                method="ranger", 
                trControl=ctrl, metric="ROC")

+ Fold1: mtry= 2, min.node.size=1, splitrule=gini 
- Fold1: mtry= 2, min.node.size=1, splitrule=gini 
+ Fold1: mtry=25, min.node.size=1, splitrule=gini 
- Fold1: mtry=25, min.node.size=1, splitrule=gini 
+ Fold1: mtry=48, min.node.size=1, splitrule=gini 
- Fold1: mtry=48, min.node.size=1, splitrule=gini 
+ Fold1: mtry= 2, min.node.size=1, splitrule=extratrees 
- Fold1: mtry= 2, min.node.size=1, splitrule=extratrees 
+ Fold1: mtry=25, min.node.size=1, splitrule=extratrees 
- Fold1: mtry=25, min.node.size=1, splitrule=extratrees 
+ Fold1: mtry=48, min.node.size=1, splitrule=extratrees 
- Fold1: mtry=48, min.node.size=1, splitrule=extratrees 
+ Fold2: mtry= 2, min.node.size=1, splitrule=gini 
- Fold2: mtry= 2, min.node.size=1, splitrule=gini 
+ Fold2: mtry=25, min.node.size=1, splitrule=gini 
- Fold2: mtry=25, min.node.size=1, splitrule=gini 
+ Fold2: mtry=48, min.node.size=1, splitrule=gini 
- Fold2: mtry=48, min.node.size=1, splitrule=gini 
+ Fold2: mtry= 2, min.node.size=1, splitrule=extratrees 
- Fold2: mtry= 2, min.node.size=1, splitrule=extratrees 
+ Fold2: mtry=25, min.node.size=1, splitrule=extratrees 
- Fold2: mtry=25, min.node.size=1, splitrule=extratrees 
+ Fold2: mtry=48, min.node.size=1, splitrule=extratrees 
- Fold2: mtry=48, min.node.size=1, splitrule=extratrees 
+ Fold3: mtry= 2, min.node.size=1, splitrule=gini 
- Fold3: mtry= 2, min.node.size=1, splitrule=gini 
+ Fold3: mtry=25, min.node.size=1, splitrule=gini 
- Fold3: mtry=25, min.node.size=1, splitrule=gini 
+ Fold3: mtry=48, min.node.size=1, splitrule=gini 
- Fold3: mtry=48, min.node.size=1, splitrule=gini 
+ Fold3: mtry= 2, min.node.size=1, splitrule=extratrees 
- Fold3: mtry= 2, min.node.size=1, splitrule=extratrees 
+ Fold3: mtry=25, min.node.size=1, splitrule=extratrees 
- Fold3: mtry=25, min.node.size=1, splitrule=extratrees 
+ Fold3: mtry=48, min.node.size=1, splitrule=extratrees 
- Fold3: mtry=48, min.node.size=1, splitrule=extratrees 
+ Fold4: mtry= 2, min.node.size=1, splitrule=gini 
- Fold4: mtry= 2, min.node.size=1, splitrule=gini 
+ Fold4: mtry=25, min.node.size=1, splitrule=gini 
- Fold4: mtry=25, min.node.size=1, splitrule=gini 
+ Fold4: mtry=48, min.node.size=1, splitrule=gini 
- Fold4: mtry=48, min.node.size=1, splitrule=gini 
+ Fold4: mtry= 2, min.node.size=1, splitrule=extratrees 
- Fold4: mtry= 2, min.node.size=1, splitrule=extratrees 
+ Fold4: mtry=25, min.node.size=1, splitrule=extratrees 
- Fold4: mtry=25, min.node.size=1, splitrule=extratrees 
+ Fold4: mtry=48, min.node.size=1, splitrule=extratrees 
- Fold4: mtry=48, min.node.size=1, splitrule=extratrees 
+ Fold5: mtry= 2, min.node.size=1, splitrule=gini 
- Fold5: mtry= 2, min.node.size=1, splitrule=gini 
+ Fold5: mtry=25, min.node.size=1, splitrule=gini 
- Fold5: mtry=25, min.node.size=1, splitrule=gini 
+ Fold5: mtry=48, min.node.size=1, splitrule=gini 
- Fold5: mtry=48, min.node.size=1, splitrule=gini 
+ Fold5: mtry= 2, min.node.size=1, splitrule=extratrees 
- Fold5: mtry= 2, min.node.size=1, splitrule=extratrees 
+ Fold5: mtry=25, min.node.size=1, splitrule=extratrees 
- Fold5: mtry=25, min.node.size=1, splitrule=extratrees 
+ Fold5: mtry=48, min.node.size=1, splitrule=extratrees 
- Fold5: mtry=48, min.node.size=1, splitrule=extratrees 
Aggregating results
Selecting tuning parameters
Fitting mtry = 2, splitrule = gini, min.node.size = 1 on full training set

fit_rf

Random Forest 

800 samples
 20 predictor
  2 classes: 'Bad', 'Good' 

No pre-processing
Resampling: Cross-Validated (5 fold) 
Summary of sample sizes: 640, 640, 640, 640, 640 
Resampling results across tuning parameters:

  mtry  splitrule   ROC        Sens       Spec     
   2    gini        0.7766133  0.1551020  0.9837838
   2    extratrees  0.7719066  0.1061224  0.9819820
  25    gini        0.7752712  0.4489796  0.8846847
  25    extratrees  0.7557455  0.4163265  0.8720721
  48    gini        0.7763743  0.4775510  0.8720721
  48    extratrees  0.7547343  0.4408163  0.8594595

Tuning parameter 'min.node.size' was held constant at a value of 1
ROC was used to select the optimal model using the largest value.
The final values used for the model were mtry = 2, splitrule = gini
 and min.node.size = 1.

pred_rf <- predict(fit_rf, newdata = dtest, type = "prob")

Support Vector Machine

set.seed(42)
fit_svm <- train(Class~., data=dtrain,
                method="svmRadial", 
                trControl=ctrl, metric="ROC")

+ Fold1: sigma=0.01236, C=0.25 
- Fold1: sigma=0.01236, C=0.25 
+ Fold1: sigma=0.01236, C=0.50 
- Fold1: sigma=0.01236, C=0.50 
+ Fold1: sigma=0.01236, C=1.00 
- Fold1: sigma=0.01236, C=1.00 
+ Fold2: sigma=0.01236, C=0.25 
- Fold2: sigma=0.01236, C=0.25 
+ Fold2: sigma=0.01236, C=0.50 
- Fold2: sigma=0.01236, C=0.50 
+ Fold2: sigma=0.01236, C=1.00 
- Fold2: sigma=0.01236, C=1.00 
+ Fold3: sigma=0.01236, C=0.25 
- Fold3: sigma=0.01236, C=0.25 
+ Fold3: sigma=0.01236, C=0.50 
- Fold3: sigma=0.01236, C=0.50 
+ Fold3: sigma=0.01236, C=1.00 
- Fold3: sigma=0.01236, C=1.00 
+ Fold4: sigma=0.01236, C=0.25 
- Fold4: sigma=0.01236, C=0.25 
+ Fold4: sigma=0.01236, C=0.50 
- Fold4: sigma=0.01236, C=0.50 
+ Fold4: sigma=0.01236, C=1.00 
- Fold4: sigma=0.01236, C=1.00 
+ Fold5: sigma=0.01236, C=0.25 
- Fold5: sigma=0.01236, C=0.25 
+ Fold5: sigma=0.01236, C=0.50 
- Fold5: sigma=0.01236, C=0.50 
+ Fold5: sigma=0.01236, C=1.00 
- Fold5: sigma=0.01236, C=1.00 
Aggregating results
Selecting tuning parameters
Fitting sigma = 0.0124, C = 0.5 on full training set

fit_svm

Support Vector Machines with Radial Basis Function Kernel 

800 samples
 20 predictor
  2 classes: 'Bad', 'Good' 

No pre-processing
Resampling: Cross-Validated (5 fold) 
Summary of sample sizes: 640, 640, 640, 640, 640 
Resampling results across tuning parameters:

  C     ROC        Sens       Spec     
  0.25  0.7762824  0.4571429  0.8648649
  0.50  0.7765030  0.4530612  0.8666667
  1.00  0.7743335  0.4204082  0.8810811

Tuning parameter 'sigma' was held constant at a value of 0.01235994
ROC was used to select the optimal model using the largest value.
The final values used for the model were sigma = 0.01235994 and C = 0.5.

pred_svm <- predict(fit_svm, newdata = dtest, type = "prob")

ROC curve

TPR = sensitivity = recall = \(\frac{TP}{TP+FN}\) = 실제 불량인 것을 불량으로 예측

FPR = 1-specificity = \(\frac{FP}{FP+TN}\) = 실제는 우량인데 불량으로 예측

simple_roc <- function(true, pred){
  true <- true[order(pred, decreasing=TRUE)]
  pred <- pred[order(pred, decreasing=TRUE)]
  data.frame(TPR=cumsum(true)/sum(true), FPR=cumsum(!true)/sum(!true), true, pred, 
             Bad=cumsum(true), Bad_TOT=sum(true), Good=cumsum(!true), Good_TOT=sum(!true))
}
roc_df <- simple_roc(if_else(dtest$Class=="Bad", 1, 0), pred_logit$Bad)
roc_df %>% ggplot(aes(x=FPR, y=TPR)) + geom_line() + theme(aspect.ratio = 1) + theme_bw()

library(pROC)
roc_logit <- roc(dtest$Class, pred_logit$Bad)
cat(paste("AUC(로지스틱회귀):", pROC::auc(roc_logit)))

AUC(로지스틱회귀): 0.787962382445141

roc_rf <- roc(dtest$Class, pred_rf$Bad)
cat(paste("AUC(랜덤포레스트):", pROC::auc(roc_rf)))

AUC(랜덤포레스트): 0.82307210031348

roc_svm <- roc(dtest$Class, pred_svm$Bad)
cat(paste("AUC(서포트벡터머신):", pROC::auc(roc_svm)))

AUC(서포트벡터머신): 0.799623824451411

plot(roc_logit, legacy.axes=TRUE, 
     print.auc=TRUE, print.auc.adj=c(1, -1), 
     auc.polygon=TRUE)

plot(roc_rf, col="blue", legacy.axes=TRUE, 
     print.auc=TRUE, add=T, print.auc.adj=c(1, 1), 
     auc.polygon=TRUE, auc.polygon.col=adjustcolor("blue", alpha.f = 0.1))

plot(roc_svm, col="red", legacy.axes=TRUE, 
     print.auc=TRUE, add=T, print.auc.adj=c(1, 3), 
     auc.polygon=TRUE, auc.polygon.col=adjustcolor("red", alpha.f = 0.1))

res_logit <- data.frame(pred=pred_logit$Bad, true=if_else(dtest$Class=="Bad", 1, 0))
# write.csv(res_logit, file="res_logit.csv", row.names = F)

res_rf <- data.frame(pred=pred_rf$Bad, true=if_else(dtest$Class=="Bad", 1, 0))
# write.csv(res_rf, file="res_rf.csv", row.names = F)

res_svm <- data.frame(pred=pred_svm$Bad, true=if_else(dtest$Class=="Bad", 1, 0))
# write.csv(res_svm, file="res_svm.csv", row.names = F)

library(MLmetrics)
AUC(pred_logit$Bad, if_else(dtest$Class=="Bad", 1, 0))

[1] 0.7879624

AUC(pred_rf$Bad, if_else(dtest$Class=="Bad", 1, 0))

[1] 0.8230721

AUC(pred_svm$Bad, if_else(dtest$Class=="Bad", 1, 0))

[1] 0.7996238

AUC <- function (y_pred, y_true){
  rank <- rank(y_pred)
  n_pos <- sum(y_true == 1)
  n_neg <- sum(y_true == 0)
  AUC <- (sum(rank[y_true == 1]) - n_pos*(n_pos+1)/2) / (n_pos*n_neg) # wilcoxon rank sum statitic
  return(AUC)
}

library(xgboost)
dtrain <- xgb.DMatrix(model.matrix(~.-1, data=dat %>% select(-Class)), label=ifelse(dat$Class=="Bad", 1, 0))
param <- list(objective="binary:logistic", eta=0.1, gamma=1, seed=42)
fit_xgb <- xgboost(data=dtrain, verbose=0, params = param, nrounds = 100)

xgb.ggplot.importance(xgb.importance(colnames(dtrain), model=fit_xgb), rel_to_first = T) + theme_fivethirtyeight() + scale_fill_tableau()

library(pdp)
train <- model.matrix(~.-1, data=dat %>% select(-Class))
pdp_Amount <- pdp::partial(fit_xgb, pred.var="Amount", plot=T, rug=T, prob=T, chull=T, 
                           
                           plot.engine='ggplot2', train=train) + theme_bw() + ylim(0, 1)
ice_Amount <- pdp::partial(fit_xgb, pred.var="Amount", plot=T, rug=T, prob=T, chull=T, 
                           ice=T, alpha=0.3,  
                           plot.engine='ggplot2', 
                           train=train[sample(nrow(train), nrow(train)*0.1),]) + theme_bw() + ylim(0, 1)
c_ice_Amount <- pdp::partial(fit_xgb, pred.var="Amount", plot=T, rug=T, prob=T, chull=T, 
                             ice=T, alpha=0.3, center=T,  
                             plot.engine='ggplot2', 
                             train=train[sample(nrow(train), nrow(train)*0.1),]) + theme_bw() + ylim(0, 1)
grid.arrange(pdp_Amount, ice_Amount, c_ice_Amount)

pdp_Duration <- pdp::partial(fit_xgb, pred.var="Duration", plot=T, rug=T, prob=T, chull=T, 
                           
                           plot.engine='ggplot2', train=train) + theme_bw() + ylim(0, 1)
ice_Duration <- pdp::partial(fit_xgb, pred.var="Duration", plot=T, rug=T, prob=T, chull=T, 
                           ice=T, alpha=0.3,  
                           plot.engine='ggplot2', 
                           train=train[sample(nrow(train), nrow(train)*0.1),]) + theme_bw() + ylim(0, 1)
c_ice_Duration <- pdp::partial(fit_xgb, pred.var="Duration", plot=T, rug=T, prob=T, chull=T, 
                             ice=T, alpha=0.3, center=T,  
                             plot.engine='ggplot2', 
                             train=train[sample(nrow(train), nrow(train)*0.1),]) + theme_bw() + ylim(0, 1)
grid.arrange(pdp_Duration, ice_Duration, c_ice_Duration)

pdp_Age <- pdp::partial(fit_xgb, pred.var="Age", plot=T, rug=T, prob=T, chull=T, 
                             
                             plot.engine='ggplot2', train=train) + theme_bw() + ylim(0, 1)
ice_Age <- pdp::partial(fit_xgb, pred.var="Age", plot=T, rug=T, prob=T, chull=T, 
                             ice=T, alpha=0.3,  
                             plot.engine='ggplot2', 
                             train=train[sample(nrow(train), nrow(train)*0.1),]) + theme_bw() + ylim(0, 1)
c_ice_Age <- pdp::partial(fit_xgb, pred.var="Age", plot=T, rug=T, prob=T, chull=T, 
                               ice=T, alpha=0.3, center=T,  
                               plot.engine='ggplot2', 
                               train=train[sample(nrow(train), nrow(train)*0.1),]) + theme_bw() + ylim(0, 1)
grid.arrange(pdp_Age, ice_Age, c_ice_Age)

library(ICEbox)
ice_Amount <- ice(object = fit_xgb, X=train, y=ifelse(dat$Class=="Bad", 1, 0), 
                  predictor="Amount", frac_to_build = 0.1)

....................................................................................................

plot(ice_Amount, plot_orig_pts_preds=T, pts_preds_size=1)

dice_Amount <- dice(ice_Amount)
plot(dice_Amount, pts_preds_size=1)

NULL

cat_vars <- names(dat)[sapply(dat, class)=="character"]
dat[cat_vars] <- lapply(dat[cat_vars], factor)

grid <- expand.grid(eta=0.1, gamma=1, nrounds=100, max_depth=3, colsample_bytree=0.8, min_child_weight=1, subsample=0.8)
ctrl <- trainControl(method="none", number = 2, verboseIter = T ,classProbs = T, summaryFunction = twoClassSummary)
fit_xgb2 <- train(Class~., data=dat, method="xgbTree", trControl=ctrl, tuneGrid=grid, metric="ROC")

Fitting eta = 0.1, gamma = 1, nrounds = 100, max_depth = 3, colsample_bytree = 0.8, min_child_weight = 1, subsample = 0.8 on full training set

xgb_imp <- varImp(fit_xgb2)$importance %>% data.frame()
xgb_imp_df <- data.frame(feature=rownames(xgb_imp), importance=xgb_imp$Overall)
xgb_imp_df %>% ggplot(aes(x=reorder(feature, importance), y=importance)) + 
  geom_bar(stat="identity") + coord_flip() + theme_fivethirtyeight()

library(pdp)
pdp_Amount <- pdp::partial(fit_xgb2, pred.var="Amount", plot=T, rug=T, prob=T, chull=T, 
                           
                           plot.engine='ggplot2', train=dat) + theme_bw() + ylim(0, 1)
ice_Amount <- pdp::partial(fit_xgb2, pred.var="Amount", plot=T, rug=T, prob=T, chull=T, 
                           ice=T, alpha=0.3,  
                           plot.engine='ggplot2', 
                           train=dat[sample(nrow(dat), nrow(dat)*0.1),]) + theme_bw() + ylim(0, 1)
c_ice_Amount <- pdp::partial(fit_xgb2, pred.var="Amount", plot=T, rug=T, prob=T, chull=T, 
                             ice=T, alpha=0.3, center=T,  
                             plot.engine='ggplot2', 
                             train=dat[sample(nrow(dat), nrow(dat)*0.1),]) + theme_bw() + ylim(0, 1)
grid.arrange(pdp_Amount, ice_Amount, c_ice_Amount)

library(pdp)
pdp_CheckingAccountStatus <- pdp::partial(fit_xgb2, pred.var="CheckingAccountStatus", plot=T, rug=T, prob=T, chull=T, 
                            cats=cat_vars, 
                           plot.engine='ggplot2', train=dat) + theme_bw() + ylim(0, 1)
ice_CheckingAccountStatus <- pdp::partial(fit_xgb2, pred.var="CheckingAccountStatus", plot=T, rug=T, prob=T, chull=T, 
                           ice=T, alpha=0.3,  
                           plot.engine='ggplot2', cats=cat_vars,
                           train=dat[sample(nrow(dat), nrow(dat)*0.1),]) + theme_bw() + ylim(0, 1)
c_ice_CheckingAccountStatus <- pdp::partial(fit_xgb2, pred.var="CheckingAccountStatus", plot=T, rug=T, prob=T, chull=T, 
                             ice=T, alpha=0.3, center=T,  
                             plot.engine='ggplot2', cats=cat_vars,
                             train=dat[sample(nrow(dat), nrow(dat)*0.1),]) + theme_bw() + ylim(0, 1)
grid.arrange(pdp_CheckingAccountStatus, ice_CheckingAccountStatus, c_ice_CheckingAccountStatus)

library(iml)
model <- Predictor$new(fit_xgb2, data=dat)

pdp_Amount <- Partial$new(model, "Amount", ice=F)
pdp_Amount$results %>% filter(.class=="Bad") %>% 
  ggplot(aes(x=Amount, y=.y.hat)) + geom_line() + ylim(0,1)

pdp_CheckingAccountStatus <- Partial$new(model, "CheckingAccountStatus", ice=F)
pdp_CheckingAccountStatus$results %>% filter(.class=="Bad") %>% 
  ggplot(aes(x=CheckingAccountStatus, y=.y.hat)) + geom_bar(stat="identity") + ylim(0,1)