[R] 신용평가 모형 평가지표

library(tidyverse) library(ggthemes)

Functions in MLmetrics Package

#### ROC-AUC #### ROCAUC <- function(y_pred, y_true){   rank <- rank(y_pred) # 예측값을 오름차순 정렬   n_pos <- sum(y_true == 1)   n_neg <- sum(y_true == 0)   AUC <- (sum(rank[y_true == 1]) - n_pos * (n_pos + 1)/2)/(n_pos * n_neg)   return(AUC) }  #### PRAUC #### if (!require(ROCR)) install.packages("ROCR", type="source") PRAUC <- function (y_pred, y_true){   pred_obj <- ROCR::prediction(y_pred, y_true)   perf_obj <- ROCR::performance(pred_obj, measure = "prec", x.measure = "rec")   PRAUC <- Area_Under_Curve(perf_obj@x.values[[1]], perf_obj@y.values[[1]], method = "trapezoid", na.rm = TRUE)   return(PRAUC) }  Area_Under_Curve <- function(x, y, method = c("trapezoid", "step", "spline"), na.rm = FALSE){   if (na.rm == TRUE) {     xy_cbind <- na.omit(cbind(x, y))     x <- xy_cbind[, 1]     y <- xy_cbind[, 2]   }   if (length(x) != length(y)) {     stop("length x must equal length y")   }   idx <- order(x)   x <- x[idx]   y <- y[idx]   switch(match.arg(arg = method, choices = c("trapezoid", "step", "spline")),          trapezoid = {AUC <- sum((apply(cbind(y[-length(y)], y[-1]), 1, mean)) * (x[-1] - x[-length(x)]))},          step = {AUC <- sum(y[-length(y)] * (x[-1] - x[-length(x)]))},          spline = {AUC <- integrate(splinefun(x, y, method = "natural"), lower = min(x), upper = max(x))$value})   return(AUC) }  #### GINI #### Gini <- function(y_pred, y_true){   SumGini <- function(y_pred, y_true){     y_true_sort <- y_true[order(y_pred, decreasing = TRUE)]     y_random <- 1:length(y_pred)/length(y_pred)     y_Lorentz <- cumsum(y_true_sort)/sum(y_true_sort)     SumGini <- sum(y_Lorentz - y_random)     return(SumGini)   }   NormalizedGini <- SumGini(y_pred, y_true)/SumGini(y_true, y_true)   return(NormalizedGini) }   #### KS #### KS_Stat <- function(y_pred, y_true){   n_pos <- sum(y_true == 0)   n_neg <- sum(y_true == 1)   rpp_vec <- numeric(length = 0)   rnp_vec <- numeric(length = 0)   for (i in sort(y_pred)) {     pos_neg <- y_true[y_pred <= i]     rpp_vec <- append(rpp_vec, sum(pos_neg == 0)/n_pos)     rnp_vec <- append(rnp_vec, sum(pos_neg == 1)/n_neg)   }   KS_Stat <- max(rpp_vec - rnp_vec) * 100   return(KS_Stat) }

User defined functions for gini and ks

# 20개 그룹으로 나눠서 계산 get_gini_df <- function(y_pred, y_true){   library(dplyr)   temp <- data.frame(y_pred, y_true= if_else(y_true==1, "Bad", "Good"))   # 20개 그릅으로 분할   temp <- temp %>%  mutate(group=cut(y_pred, breaks = quantile(y_pred, p=seq(0, 1, 0.05)),                                      right=F, include.lowest=T, labels=paste0("G", 1:20)))    temp <- temp %>% group_by(group, y_true) %>% summarise(N=n())   temp <- temp %>% spread(y_true, N, fill=0)   temp$Cum_Bad <- cumsum(temp$Bad)   temp$Cum_Good <- cumsum(temp$Good)   temp <- temp %>% mutate(P_Cum_Bad=Cum_Bad/sum(temp$Bad),                           P_Cum_Good=Cum_Good/sum(temp$Good))   return(temp) }  gini_index <- function(gini_df){   # 20개 그룹 나누어서 구한 값   gini <- (Area_Under_Curve(gini_df$P_Cum_Bad, gini_df$P_Cum_Good) - 0.5)/0.5    # 불량확률 기준으로 잘라서 score로 그렸을 때랑 반대로 생각해야 함   # G1 -> G20으로 갈수록 불량 확률이 높아짐   # score로 변환하면 G1 -> G20으로 갈수록 점수가 낮아지는 형태가 될 것   return(gini) }   # 20개 그룹으로 나눠서 계산 get_ks_df <- function(y_pred, y_true){   library(dplyr)   temp <- data.frame(y_pred, y_true= if_else(y_true==1, "Bad", "Good"))   # 20개 그릅으로 분할   temp <- temp %>%  mutate(group=cut(y_pred, breaks = quantile(y_pred, p=seq(0, 1, 0.05)),                                      right=F, include.lowest=T, labels=paste0("G", 1:20)))    temp <- temp %>% group_by(group, y_true) %>% summarise(N=n())   temp <- temp %>% spread(y_true, N, fill=0)   temp$Cum_Bad <- cumsum(temp$Bad)   temp$Cum_Good <- cumsum(temp$Good)   temp <- temp %>% mutate(P_Cum_Bad=Cum_Bad/sum(temp$Bad),                           P_Cum_Good=Cum_Good/sum(temp$Good),                           KS=abs(P_Cum_Good-P_Cum_Bad)*100)   return(temp) }  ks_stat <- function(ks_df){   ks_stat <- max(ks_df$KS)   return(ks_stat) }
res_logit <- read.csv('res_logit.csv') res_rf <- read.csv('res_rf.csv') res_svm <- read.csv('res_svm.csv')  # roc-auc ROCAUC(res_logit$pred, res_logit$true)
[1] 0.7879624
# pr-auc PRAUC(res_logit$pred, res_logit$true)
[1] 0.6010135
# gini Gini(res_logit$pred, res_logit$true) # MLmetrics 패키지 함수
[1] 0.5759248
gini_index(get_gini_df(res_logit$pred, res_logit$true)) # 20개 그룹 나누어서 구한 값
[1] 0.5755486
# ks KS_Stat(res_logit$pred, res_logit$true) # MLmetrics 패키지 함수
[1] 50.59561
ks_stat(get_ks_df(res_logit$pred, res_logit$true)) # 20개 그룹 나누어서 구한 값
[1] 48.27586
# ROC curve get_roc_df <- function(y_pred, y_true){   y_true <- y_true[order(y_pred, decreasing=TRUE)]   y_pred <- y_pred[order(y_pred, decreasing=TRUE)]   roc_df <- data.frame(TPR=cumsum(y_true)/sum(y_true), FPR=cumsum(!y_true)/sum(!y_true), y_true, y_pred,                        Bad=cumsum(y_true), Bad_TOT=sum(y_true), Good=cumsum(!y_true), Good_TOT=sum(!y_true))   return(roc_df) }  df <- rbind(data.frame(get_roc_df(res_logit$pred, res_logit$true), model="LR"),             data.frame(get_roc_df(res_rf$pred, res_rf$true), model="RF"),             data.frame(get_roc_df(res_svm$pred, res_svm$true), model="SVM"))  ggplot(df, aes(x=FPR, y=TPR, group=model, color=model)) +   geom_ribbon(aes(ymin=0, ymax=TPR, group=model, fill=model), alpha=0.1) + geom_line(size=1) +   theme_bw() + ylim(0,1) + xlim(0,1) +   ggtitle("ROC curve") +   theme(aspect.ratio=1,          plot.title = element_text(hjust=0.5, size=15, face="bold"),         axis.text = element_text(size=11),         axis.title = element_text(size=15),         legend.text = element_text(size=11))
# precision-recall curve get_pr_df <- function(y_pred, y_true){   pred_obj <- ROCR::prediction(y_pred, y_true)   perf_obj <- ROCR::performance(pred_obj, measure = "prec", x.measure = "rec")   pr_df <- data.frame(Recall=perf_obj@x.values[[1]], Precision=perf_obj@y.values[[1]]) }  df <- rbind(data.frame(get_pr_df(res_logit$pred, res_logit$true), model="LR"),             data.frame(get_pr_df(res_rf$pred, res_rf$true), model="RF"),             data.frame(get_pr_df(res_svm$pred, res_svm$true), model="SVM"))  ggplot(df, aes(x=Precision, y=Recall, group=model, color=model)) +   geom_ribbon(aes(ymin=0, ymax=Recall, group=model, fill=model), alpha=0.1) +    geom_line(size=0.7) +   theme_bw() + ylim(0,1) + xlim(0,1) +   ggtitle("Precision-Recall curve") +   theme(aspect.ratio=1,          plot.title = element_text(hjust=0.5, size=15, face="bold"),         axis.text = element_text(size=11),         axis.title = element_text(size=15),         legend.text = element_text(size=11))
ggplot(df, aes(x=Precision, y=Recall, group=model, color=model)) +   geom_line(size=0.7) +   theme_bw() + ylim(0,1) + xlim(0,1) +   ggtitle("Precision-Recall curve") +   theme(aspect.ratio=1,          plot.title = element_text(hjust=0.5, size=15, face="bold"),         axis.text = element_text(size=11),         axis.title = element_text(size=15),         legend.text = element_text(size=11))
# Accuracy/Cutoff 그래프 get_acc_cutoff_df <- function(y_pred, y_true){   pred_obj <- ROCR::prediction(y_pred, y_true)   perf_obj <- ROCR::performance(pred_obj, measure = "acc", x.measure = "cutoff")   pr_df <- data.frame(Cutoff=perf_obj@x.values[[1]], Accuracy=perf_obj@y.values[[1]]) }  df <- rbind(data.frame(get_acc_cutoff_df(res_logit$pred, res_logit$true), model="LR"),             data.frame(get_acc_cutoff_df(res_rf$pred, res_rf$true), model="RF"),             data.frame(get_acc_cutoff_df(res_svm$pred, res_svm$true), model="SVM"))  ggplot(df, aes(x=Cutoff, y=Accuracy, group=model, color=model)) +   geom_line(size=0.7) +   theme_bw() +   ggtitle("Accuracy/Cutoff") +   theme(aspect.ratio=1,          plot.title = element_text(hjust=0.5, size=15, face="bold"),         axis.text = element_text(size=11),         axis.title = element_text(size=15),         legend.text = element_text(size=11))
# K-S temp <- get_ks_df(res_logit$pred, res_logit$true) %>%   select(group, P_Cum_Bad, P_Cum_Good) %>%   gather(key="Target", value="Cum_percent", -group) temp$Target <- factor(temp$Target, labels=c("불량", "우량")) ggplot(temp, aes(x=group, y=Cum_percent, group=Target, color=Target)) +   geom_ribbon(aes(ymin=0, ymax=Cum_percent, group=Target, fill=Target), alpha=0.1) +   geom_line(size=1.2) +   scale_color_fivethirtyeight() +   scale_fill_fivethirtyeight() +   theme_bw() + ylab("누적구성비") + xlab("그룹") + ggtitle("K-S") +   theme(aspect.ratio=0.6,          plot.title = element_text(hjust=0.5, size=15, face="bold"),         axis.text = element_text(size=11),         axis.title = element_text(size=15),         legend.text = element_text(size=11))
# Gini temp <- get_gini_df(res_logit$pred, res_logit$true) ggplot(temp, aes(x=P_Cum_Bad, y=P_Cum_Good)) +   geom_line(size=0.7) + geom_abline(intercept = 0, slope=1, size=1) +   theme_bw() +   xlab("누적불량구성비") + ylab("누적우량구성비") +   theme(aspect.ratio=1,          plot.title = element_text(hjust=0.5, size=15, face="bold"),         axis.text = element_text(size=11),         axis.title = element_text(size=15),         legend.text = element_text(size=11))
# Divergence plot df <- data.frame(y_pred=res_logit$pred, y_true=if_else(res_logit$true==1, "불량", "우량")) ggplot(df, aes(x=y_pred, fill=y_true)) + geom_density(alpha=0.3) +   theme_bw() + scale_fill_fivethirtyeight() +   xlab("불량확률") + ylab("density") + labs(fill="Target") +   theme(aspect.ratio=1,          plot.title = element_text(hjust=0.5, size=15, face="bold"),         axis.text = element_text(size=11),         axis.title = element_text(size=15),         legend.text = element_text(size=11))