library(caret) # GermanCredit
library(tidyverse)
library(ggthemes)
library(ggmosaic)
library(gridExtra)
The data used for modeling is GermanCredit, a dataset built into the caret package; the package's description of it is below.
Description
Data from Dr. Hans Hofmann of the University of Hamburg.
Details
These data have two classes for the credit worthiness: good or bad. There are predictors related to attributes, such as: checking account status, duration, credit history, purpose of the loan, amount of the loan, savings accounts or bonds, employment duration, Installment rate in percentage of disposable income, personal information, other debtors/guarantors, residence duration, property, age, other installment plans, housing, number of existing credits, job information, Number of people being liable to provide maintenance for, telephone, and foreign worker status.
Many of these predictors are discrete and have been expanded into several 0/1 indicator variables.
Source
UCI Machine Learning Repository
data("GermanCredit")
- The Class variable is the target, labeled Good or Bad, and the data includes predictors that may influence credit worthiness. The categorical variables come one-hot encoded so they can be fed straight into a model (a quick look at one indicator group follows the class table below).
GermanCredit$Class %>% table
.
Bad Good
300 700
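As mentioned, each categorical variable has been expanded into 0/1 indicator columns; a quick illustrative peek at one group:
GermanCredit %>% select(starts_with("CheckingAccountStatus")) %>% head()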
We can draw density plots for the continuous variables and mosaic plots for the categorical ones.
GermanCredit %>% ggplot(aes(Duration, fill=Class)) + geom_density(alpha=0.5) + theme_bw() + scale_fill_tableau()
GermanCredit %>% ggplot() +
geom_mosaic(aes(product(NumberExistingCredits), fill=Class)) +
theme_bw() + scale_fill_tableau() + xlab('NumberExistingCredits') + ylab('')
The raw data has the categorical variables one-hot encoded, but to make the mosaic plots easier to draw they were collapsed back into single variables; a generic version of this collapse is sketched after the plot below.
data_reform <- GermanCredit %>%
mutate(CheckingAccountStatus = case_when(CheckingAccountStatus.lt.0==1~"under 0",
CheckingAccountStatus.0.to.200==1~"0~200",
CheckingAccountStatus.gt.200==1~"over 200",
CheckingAccountStatus.none==1~"no info"),
CreditHistory = case_when(CreditHistory.NoCredit.AllPaid==1~"NoCredit.AllPaid",
CreditHistory.ThisBank.AllPaid==1~"ThisBank.AllPaid",
CreditHistory.PaidDuly==1~"PaidDuly",
CreditHistory.Delay==1~"Delay",
CreditHistory.Critical==1~"Critical"),
Purpose = case_when(Purpose.NewCar==1~"NewCar",
Purpose.UsedCar==1~"UsedCar",
Purpose.Furniture.Equipment==1~"Furniture.Equipment",
Purpose.Radio.Television==1~"Radio.Television",
Purpose.DomesticAppliance==1~"DomesticAppliance",
Purpose.Repairs==1~"Repairs",
Purpose.Education==1~"Education",
Purpose.Vacation==1~"Vacation",
Purpose.Retraining==1~"Retraining",
Purpose.Business==1~"Business",
Purpose.Other==1~"Other"),
SavingsAccountBonds = case_when(SavingsAccountBonds.lt.100==1~"under 100",
SavingsAccountBonds.100.to.500==1~"100~500",
SavingsAccountBonds.500.to.1000==1~"500~1000",
SavingsAccountBonds.gt.1000==1~"over 1000",
SavingsAccountBonds.Unknown==1~"unknown"),
EmploymentDuration = case_when(EmploymentDuration.lt.1==1~"under 1 yr",
EmploymentDuration.1.to.4==1~"1~4 yrs",
EmploymentDuration.4.to.7==1~"4~7 yrs",
EmploymentDuration.gt.7==1~"over 7 yrs",
EmploymentDuration.Unemployed==1~"unemployed"),
Personal = case_when(Personal.Male.Divorced.Seperated==1~"Male.Divorced.Seperated",
Personal.Female.NotSingle==1~"Female.NotSingle",
Personal.Male.Single==1~"Male.Single",
Personal.Male.Married.Widowed==1~"Male.Married.Widowed",
Personal.Female.Single==1~"Female.Single"),
OtherDebtorsGuarantors = case_when(OtherDebtorsGuarantors.None==1~"None",
OtherDebtorsGuarantors.CoApplicant==1~"CoApplicant",
OtherDebtorsGuarantors.Guarantor==1~"Guarantor"),
Property = case_when(Property.RealEstate==1~"RealEstate",
Property.Insurance==1~"Insurance",
Property.CarOther==1~"CarOther",
Property.Unknown==1~"Unknown"),
OtherInstallmentPlans = case_when(OtherInstallmentPlans.Bank==1~"Bank",
OtherInstallmentPlans.Stores==1~"Stores",
OtherInstallmentPlans.None==1~"None"),
Housing = case_when(Housing.Rent==1~"Rent",
Housing.Own==1~"Own",
Housing.ForFree==1~"ForFree"),
Job = case_when(Job.UnemployedUnskilled==1~"UnemployedUnskilled",
Job.UnskilledResident==1~"UnskilledResident",
Job.SkilledEmployee==1~"SkilledEmployee",
Job.Management.SelfEmp.HighlyQualified==1~"Management.SelfEmp.HighlyQualified")
)
data_reform %>% ggplot() +
geom_mosaic(aes(product(CheckingAccountStatus), fill=Class)) +
theme_bw() + scale_fill_tableau() + xlab('CheckingAccountStatus') + ylab('')
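The explicit case_when above is verbose; the same collapse can be written generically. A minimal sketch, assuming every indicator group shares a dot-separated prefix and each row has exactly one 1 among its indicators (collapse_onehot is a hypothetical helper, not part of any package):
collapse_onehot <- function(df, prefix) {
  cols <- grep(paste0("^", prefix, "\\."), names(df), value = TRUE)
  labels <- sub(paste0("^", prefix, "\\."), "", cols)
  # pick, for each row, the label of the indicator column that is 1
  labels[max.col(as.matrix(df[cols]), ties.method = "first")]
}
# e.g. collapse_onehot(GermanCredit, "Purpose") should match the Purpose column built above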
The other variables can be explored the same way.
sel_vars <- c("Duration", "Amount", "InstallmentRatePercentage", "ResidenceDuration",
"Age", "NumberExistingCredits", "NumberPeopleMaintenance", "Telephone",
"ForeignWorker", "Class", names(data_reform)[sapply(data_reform, class) == "character"])
dat <- data_reform %>% select(all_of(sel_vars))
Make an imbalanced dataset (optional; left commented out)
# dat_b <- dat %>% filter(Class=="Bad")
# dat_g <- dat %>% filter(Class=="Good")
#
# set.seed(42)
# dat <- rbind(dat_b[sample(nrow(dat_b), 20),], dat_g)
Split dataset
set.seed(42)
train_idx <- sample(nrow(dat), nrow(dat)*0.8)
dtrain <- dat[train_idx, ]
dtest <- dat[-train_idx, ]
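The split above is purely random; since Bad is the minority class, a class-stratified split is a common alternative. A hedged sketch using caret's createDataPartition, left commented out like the imbalance block above:
# set.seed(42)
# train_idx <- createDataPartition(dat$Class, p=0.8, list=FALSE)
# dtrain <- dat[train_idx, ]
# dtest <- dat[-train_idx, ]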
trainControl setup
ctrl <- trainControl(method="cv",
number=5, # 5-folds CV
classProbs = TRUE,
summaryFunction = twoClassSummary,
verboseIter = TRUE)
Logistic Regression
set.seed(42)
fit_logit <- train(Class~., data=dtrain,
method="glm", family="binomial", maxit=100, epsilon=0.0001,
trControl=ctrl, metric="ROC")
+ Fold1: parameter=none
- Fold1: parameter=none
+ Fold2: parameter=none
- Fold2: parameter=none
+ Fold3: parameter=none
- Fold3: parameter=none
+ Fold4: parameter=none
- Fold4: parameter=none
+ Fold5: parameter=none
- Fold5: parameter=none
Aggregating results
Fitting final model on full training set
fit_logit
Generalized Linear Model
800 samples
20 predictor
2 classes: 'Bad', 'Good'
No pre-processing
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 640, 640, 640, 640, 640
Resampling results:
ROC Sens Spec
0.7661335 0.4857143 0.845045
pred_logit <- predict(fit_logit, newdata=dtest, type="prob")
Random Forest
set.seed(42)
fit_rf <- train(Class~., data=dtrain,
method="ranger",
trControl=ctrl, metric="ROC")
+ Fold1: mtry= 2, min.node.size=1, splitrule=gini
- Fold1: mtry= 2, min.node.size=1, splitrule=gini
... (the same six mtry/splitrule combinations are trained and released on each of the 5 folds) ...
- Fold5: mtry=48, min.node.size=1, splitrule=extratrees
Aggregating results
Selecting tuning parameters
Fitting mtry = 2, splitrule = gini, min.node.size = 1 on full training set
fit_rf
Random Forest
800 samples
20 predictor
2 classes: 'Bad', 'Good'
No pre-processing
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 640, 640, 640, 640, 640
Resampling results across tuning parameters:
mtry splitrule ROC Sens Spec
2 gini 0.7766133 0.1551020 0.9837838
2 extratrees 0.7719066 0.1061224 0.9819820
25 gini 0.7752712 0.4489796 0.8846847
25 extratrees 0.7557455 0.4163265 0.8720721
48 gini 0.7763743 0.4775510 0.8720721
48 extratrees 0.7547343 0.4408163 0.8594595
Tuning parameter 'min.node.size' was held constant at a value of 1
ROC was used to select the optimal model using the largest value.
The final values used for the model were mtry = 2, splitrule = gini
and min.node.size = 1.
pred_rf <- predict(fit_rf, newdata = dtest, type = "prob")
Support Vector Machine
set.seed(42)
fit_svm <- train(Class~., data=dtrain,
method="svmRadial",
trControl=ctrl, metric="ROC")
+ Fold1: sigma=0.01236, C=0.25
- Fold1: sigma=0.01236, C=0.25
... (C in {0.25, 0.50, 1.00} is evaluated the same way on each of the 5 folds) ...
- Fold5: sigma=0.01236, C=1.00
Aggregating results
Selecting tuning parameters
Fitting sigma = 0.0124, C = 0.5 on full training set
fit_svm
Support Vector Machines with Radial Basis Function Kernel
800 samples
20 predictor
2 classes: 'Bad', 'Good'
No pre-processing
Resampling: Cross-Validated (5 fold)
Summary of sample sizes: 640, 640, 640, 640, 640
Resampling results across tuning parameters:
C ROC Sens Spec
0.25 0.7762824 0.4571429 0.8648649
0.50 0.7765030 0.4530612 0.8666667
1.00 0.7743335 0.4204082 0.8810811
Tuning parameter 'sigma' was held constant at a value of 0.01235994
ROC was used to select the optimal model using the largest value.
The final values used for the model were sigma = 0.01235994 and C = 0.5.
pred_svm <- predict(fit_svm, newdata = dtest, type = "prob")
ROC curve
TPR = sensitivity = recall = \(\frac{TP}{TP+FN}\) = the proportion of actually Bad cases predicted as Bad
FPR = 1 - specificity = \(\frac{FP}{FP+TN}\) = the proportion of actually Good cases predicted as Bad
simple_roc <- function(true, pred){
  # sort cases by predicted score, most confident "Bad" first
  true <- true[order(pred, decreasing=TRUE)]
  pred <- pred[order(pred, decreasing=TRUE)]
  # walking down the ranking, the cumulative hit/false-alarm rates trace out the ROC curve
  data.frame(TPR=cumsum(true)/sum(true), FPR=cumsum(!true)/sum(!true), true, pred,
             Bad=cumsum(true), Bad_TOT=sum(true), Good=cumsum(!true), Good_TOT=sum(!true))
}
roc_df <- simple_roc(if_else(dtest$Class=="Bad", 1, 0), pred_logit$Bad)
roc_df %>% ggplot(aes(x=FPR, y=TPR)) + geom_line() + theme(aspect.ratio = 1) + theme_bw()
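To tie these rates back to raw counts, a hedged check at an illustrative 0.5 cutoff (the threshold is ours; the ROC curve itself sweeps over all thresholds):
pred_label <- if_else(pred_logit$Bad >= 0.5, "Bad", "Good")
table(predicted=pred_label, actual=dtest$Class) # TPR = predicted-Bad/actual-Bad cell over the actual-Bad column total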
library(pROC)
roc_logit <- roc(dtest$Class, pred_logit$Bad)
cat(paste("AUC(로지스틱회귀):", pROC::auc(roc_logit)))
AUC(로지스틱회귀): 0.787962382445141
roc_rf <- roc(dtest$Class, pred_rf$Bad)
cat(paste("AUC(랜덤포레스트):", pROC::auc(roc_rf)))
AUC(랜덤포레스트): 0.82307210031348
roc_svm <- roc(dtest$Class, pred_svm$Bad)
cat(paste("AUC(서포트벡터머신):", pROC::auc(roc_svm)))
AUC(서포트벡터머신): 0.799623824451411
plot(roc_logit, legacy.axes=TRUE,
print.auc=TRUE, print.auc.adj=c(1, -1),
auc.polygon=TRUE)
plot(roc_rf, col="blue", legacy.axes=TRUE,
print.auc=TRUE, add=T, print.auc.adj=c(1, 1),
auc.polygon=TRUE, auc.polygon.col=adjustcolor("blue", alpha.f = 0.1))
plot(roc_svm, col="red", legacy.axes=TRUE,
print.auc=TRUE, add=T, print.auc.adj=c(1, 3),
auc.polygon=TRUE, auc.polygon.col=adjustcolor("red", alpha.f = 0.1))
res_logit <- data.frame(pred=pred_logit$Bad, true=if_else(dtest$Class=="Bad", 1, 0))
# write.csv(res_logit, file="res_logit.csv", row.names = F)
res_rf <- data.frame(pred=pred_rf$Bad, true=if_else(dtest$Class=="Bad", 1, 0))
# write.csv(res_rf, file="res_rf.csv", row.names = F)
res_svm <- data.frame(pred=pred_svm$Bad, true=if_else(dtest$Class=="Bad", 1, 0))
# write.csv(res_svm, file="res_svm.csv", row.names = F)
library(MLmetrics)
AUC(pred_logit$Bad, if_else(dtest$Class=="Bad", 1, 0))
[1] 0.7879624
AUC(pred_rf$Bad, if_else(dtest$Class=="Bad", 1, 0))
[1] 0.8230721
AUC(pred_svm$Bad, if_else(dtest$Class=="Bad", 1, 0))
[1] 0.7996238
# AUC computed directly as the Wilcoxon rank-sum (Mann-Whitney) statistic:
# the probability that a random positive case is ranked above a random negative one
AUC <- function (y_pred, y_true){
  rank <- rank(y_pred)
  n_pos <- sum(y_true == 1)
  n_neg <- sum(y_true == 0)
  AUC <- (sum(rank[y_true == 1]) - n_pos*(n_pos+1)/2) / (n_pos*n_neg) # Wilcoxon rank-sum statistic
  return(AUC)
}
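Applied to the stored test predictions, this rank-based version should reproduce the MLmetrics values above (note that defining it masks MLmetrics::AUC):
AUC(pred_logit$Bad, if_else(dtest$Class=="Bad", 1, 0)) # should again give ~0.788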
library(xgboost)
# note: built from the full dat (not the train/test split); the name dtrain overwrites the earlier training data frame
dtrain <- xgb.DMatrix(model.matrix(~.-1, data=dat %>% select(-Class)), label=ifelse(dat$Class=="Bad", 1, 0))
param <- list(objective="binary:logistic", eta=0.1, gamma=1, seed=42)
fit_xgb <- xgboost(data=dtrain, verbose=0, params = param, nrounds = 100)
xgb.ggplot.importance(xgb.importance(colnames(dtrain), model=fit_xgb), rel_to_first = T) + theme_fivethirtyeight() + scale_fill_tableau()
library(pdp)
train <- model.matrix(~.-1, data=dat %>% select(-Class))
pdp_Amount <- pdp::partial(fit_xgb, pred.var="Amount", plot=T, rug=T, prob=T, chull=T,
plot.engine='ggplot2', train=train) + theme_bw() + ylim(0, 1)
ice_Amount <- pdp::partial(fit_xgb, pred.var="Amount", plot=T, rug=T, prob=T, chull=T,
ice=T, alpha=0.3,
plot.engine='ggplot2',
train=train[sample(nrow(train), nrow(train)*0.1),]) + theme_bw() + ylim(0, 1)
c_ice_Amount <- pdp::partial(fit_xgb, pred.var="Amount", plot=T, rug=T, prob=T, chull=T,
ice=T, alpha=0.3, center=T,
plot.engine='ggplot2',
train=train[sample(nrow(train), nrow(train)*0.1),]) + theme_bw() + ylim(0, 1)
grid.arrange(pdp_Amount, ice_Amount, c_ice_Amount)
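For reference: the partial dependence of the prediction on a feature \(x_S\) averages the fitted model over the observed values of the remaining features, \(\hat{f}_S(x_S) = \frac{1}{n}\sum_{i=1}^{n}\hat{f}(x_S, x_C^{(i)})\). Each ICE curve is a single term of that average drawn for one observation, and centering (center=T) subtracts each curve's value at the left edge of the grid, which makes heterogeneity between observations easier to see.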
pdp_Duration <- pdp::partial(fit_xgb, pred.var="Duration", plot=T, rug=T, prob=T, chull=T,
plot.engine='ggplot2', train=train) + theme_bw() + ylim(0, 1)
ice_Duration <- pdp::partial(fit_xgb, pred.var="Duration", plot=T, rug=T, prob=T, chull=T,
ice=T, alpha=0.3,
plot.engine='ggplot2',
train=train[sample(nrow(train), nrow(train)*0.1),]) + theme_bw() + ylim(0, 1)
c_ice_Duration <- pdp::partial(fit_xgb, pred.var="Duration", plot=T, rug=T, prob=T, chull=T,
ice=T, alpha=0.3, center=T,
plot.engine='ggplot2',
train=train[sample(nrow(train), nrow(train)*0.1),]) + theme_bw() + ylim(0, 1)
grid.arrange(pdp_Duration, ice_Duration, c_ice_Duration)
pdp_Age <- pdp::partial(fit_xgb, pred.var="Age", plot=T, rug=T, prob=T, chull=T,
plot.engine='ggplot2', train=train) + theme_bw() + ylim(0, 1)
ice_Age <- pdp::partial(fit_xgb, pred.var="Age", plot=T, rug=T, prob=T, chull=T,
ice=T, alpha=0.3,
plot.engine='ggplot2',
train=train[sample(nrow(train), nrow(train)*0.1),]) + theme_bw() + ylim(0, 1)
c_ice_Age <- pdp::partial(fit_xgb, pred.var="Age", plot=T, rug=T, prob=T, chull=T,
ice=T, alpha=0.3, center=T,
plot.engine='ggplot2',
train=train[sample(nrow(train), nrow(train)*0.1),]) + theme_bw() + ylim(0, 1)
grid.arrange(pdp_Age, ice_Age, c_ice_Age)
library(ICEbox)
ice_Amount <- ice(object = fit_xgb, X=train, y=ifelse(dat$Class=="Bad", 1, 0),
predictor="Amount", frac_to_build = 0.1)
plot(ice_Amount, plot_orig_pts_preds=T, pts_preds_size=1)
dice_Amount <- dice(ice_Amount)
plot(dice_Amount, pts_preds_size=1)
cat_vars <- names(dat)[sapply(dat, class)=="character"]
dat[cat_vars] <- lapply(dat[cat_vars], factor)
grid <- expand.grid(eta=0.1, gamma=1, nrounds=100, max_depth=3, colsample_bytree=0.8, min_child_weight=1, subsample=0.8)
ctrl <- trainControl(method="none", number = 2, verboseIter = T ,classProbs = T, summaryFunction = twoClassSummary)
fit_xgb2 <- train(Class~., data=dat, method="xgbTree", trControl=ctrl, tuneGrid=grid, metric="ROC")
Fitting eta = 0.1, gamma = 1, nrounds = 100, max_depth = 3, colsample_bytree = 0.8, min_child_weight = 1, subsample = 0.8 on full training set
xgb_imp <- varImp(fit_xgb2)$importance %>% data.frame()
xgb_imp_df <- data.frame(feature=rownames(xgb_imp), importance=xgb_imp$Overall)
xgb_imp_df %>% ggplot(aes(x=reorder(feature, importance), y=importance)) +
geom_bar(stat="identity") + coord_flip() + theme_fivethirtyeight()
library(pdp)
pdp_Amount <- pdp::partial(fit_xgb2, pred.var="Amount", plot=T, rug=T, prob=T, chull=T,
plot.engine='ggplot2', train=dat) + theme_bw() + ylim(0, 1)
ice_Amount <- pdp::partial(fit_xgb2, pred.var="Amount", plot=T, rug=T, prob=T, chull=T,
ice=T, alpha=0.3,
plot.engine='ggplot2',
train=dat[sample(nrow(dat), nrow(dat)*0.1),]) + theme_bw() + ylim(0, 1)
c_ice_Amount <- pdp::partial(fit_xgb2, pred.var="Amount", plot=T, rug=T, prob=T, chull=T,
ice=T, alpha=0.3, center=T,
plot.engine='ggplot2',
train=dat[sample(nrow(dat), nrow(dat)*0.1),]) + theme_bw() + ylim(0, 1)
grid.arrange(pdp_Amount, ice_Amount, c_ice_Amount)
library(pdp)
pdp_CheckingAccountStatus <- pdp::partial(fit_xgb2, pred.var="CheckingAccountStatus", plot=T, rug=T, prob=T, chull=T,
cats=cat_vars,
plot.engine='ggplot2', train=dat) + theme_bw() + ylim(0, 1)
ice_CheckingAccountStatus <- pdp::partial(fit_xgb2, pred.var="CheckingAccountStatus", plot=T, rug=T, prob=T, chull=T,
ice=T, alpha=0.3,
plot.engine='ggplot2', cats=cat_vars,
train=dat[sample(nrow(dat), nrow(dat)*0.1),]) + theme_bw() + ylim(0, 1)
c_ice_CheckingAccountStatus <- pdp::partial(fit_xgb2, pred.var="CheckingAccountStatus", plot=T, rug=T, prob=T, chull=T,
ice=T, alpha=0.3, center=T,
plot.engine='ggplot2', cats=cat_vars,
train=dat[sample(nrow(dat), nrow(dat)*0.1),]) + theme_bw() + ylim(0, 1)
grid.arrange(pdp_CheckingAccountStatus, ice_CheckingAccountStatus, c_ice_CheckingAccountStatus)
library(iml)
model <- Predictor$new(fit_xgb2, data=dat)
pdp_Amount <- Partial$new(model, "Amount", ice=F)
pdp_Amount$results %>% filter(.class=="Bad") %>%
ggplot(aes(x=Amount, y=.y.hat)) + geom_line() + ylim(0,1)
pdp_CheckingAccountStatus <- Partial$new(model, "CheckingAccountStatus", ice=F)
pdp_CheckingAccountStatus$results %>% filter(.class=="Bad") %>%
ggplot(aes(x=CheckingAccountStatus, y=.y.hat)) + geom_bar(stat="identity") + ylim(0,1)
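As an aside, newer iml releases deprecate Partial in favour of FeatureEffect; a hedged equivalent, assuming a recent iml where the PDP value is stored in .value:
# eff_Amount <- FeatureEffect$new(model, feature="Amount", method="pdp")
# eff_Amount$results %>% filter(.class=="Bad") %>%
#   ggplot(aes(x=Amount, y=.value)) + geom_line() + ylim(0, 1)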