library(tidyverse) library(ggthemes)
Functions in MLmetrics
Package
#### ROC-AUC #### ROCAUC <- function(y_pred, y_true){ rank <- rank(y_pred) # 예측값을 오름차순 정렬 n_pos <- sum(y_true == 1) n_neg <- sum(y_true == 0) AUC <- (sum(rank[y_true == 1]) - n_pos * (n_pos + 1)/2)/(n_pos * n_neg) return(AUC) } #### PRAUC #### if (!require(ROCR)) install.packages("ROCR", type="source") PRAUC <- function (y_pred, y_true){ pred_obj <- ROCR::prediction(y_pred, y_true) perf_obj <- ROCR::performance(pred_obj, measure = "prec", x.measure = "rec") PRAUC <- Area_Under_Curve(perf_obj@x.values[[1]], perf_obj@y.values[[1]], method = "trapezoid", na.rm = TRUE) return(PRAUC) } Area_Under_Curve <- function(x, y, method = c("trapezoid", "step", "spline"), na.rm = FALSE){ if (na.rm == TRUE) { xy_cbind <- na.omit(cbind(x, y)) x <- xy_cbind[, 1] y <- xy_cbind[, 2] } if (length(x) != length(y)) { stop("length x must equal length y") } idx <- order(x) x <- x[idx] y <- y[idx] switch(match.arg(arg = method, choices = c("trapezoid", "step", "spline")), trapezoid = {AUC <- sum((apply(cbind(y[-length(y)], y[-1]), 1, mean)) * (x[-1] - x[-length(x)]))}, step = {AUC <- sum(y[-length(y)] * (x[-1] - x[-length(x)]))}, spline = {AUC <- integrate(splinefun(x, y, method = "natural"), lower = min(x), upper = max(x))$value}) return(AUC) } #### GINI #### Gini <- function(y_pred, y_true){ SumGini <- function(y_pred, y_true){ y_true_sort <- y_true[order(y_pred, decreasing = TRUE)] y_random <- 1:length(y_pred)/length(y_pred) y_Lorentz <- cumsum(y_true_sort)/sum(y_true_sort) SumGini <- sum(y_Lorentz - y_random) return(SumGini) } NormalizedGini <- SumGini(y_pred, y_true)/SumGini(y_true, y_true) return(NormalizedGini) } #### KS #### KS_Stat <- function(y_pred, y_true){ n_pos <- sum(y_true == 0) n_neg <- sum(y_true == 1) rpp_vec <- numeric(length = 0) rnp_vec <- numeric(length = 0) for (i in sort(y_pred)) { pos_neg <- y_true[y_pred <= i] rpp_vec <- append(rpp_vec, sum(pos_neg == 0)/n_pos) rnp_vec <- append(rnp_vec, sum(pos_neg == 1)/n_neg) } KS_Stat <- max(rpp_vec - rnp_vec) * 100 return(KS_Stat) }
User defined functions for gini and ks
# 20개 그룹으로 나눠서 계산 get_gini_df <- function(y_pred, y_true){ library(dplyr) temp <- data.frame(y_pred, y_true= if_else(y_true==1, "Bad", "Good")) # 20개 그릅으로 분할 temp <- temp %>% mutate(group=cut(y_pred, breaks = quantile(y_pred, p=seq(0, 1, 0.05)), right=F, include.lowest=T, labels=paste0("G", 1:20))) temp <- temp %>% group_by(group, y_true) %>% summarise(N=n()) temp <- temp %>% spread(y_true, N, fill=0) temp$Cum_Bad <- cumsum(temp$Bad) temp$Cum_Good <- cumsum(temp$Good) temp <- temp %>% mutate(P_Cum_Bad=Cum_Bad/sum(temp$Bad), P_Cum_Good=Cum_Good/sum(temp$Good)) return(temp) } gini_index <- function(gini_df){ # 20개 그룹 나누어서 구한 값 gini <- (Area_Under_Curve(gini_df$P_Cum_Bad, gini_df$P_Cum_Good) - 0.5)/0.5 # 불량확률 기준으로 잘라서 score로 그렸을 때랑 반대로 생각해야 함 # G1 -> G20으로 갈수록 불량 확률이 높아짐 # score로 변환하면 G1 -> G20으로 갈수록 점수가 낮아지는 형태가 될 것 return(gini) } # 20개 그룹으로 나눠서 계산 get_ks_df <- function(y_pred, y_true){ library(dplyr) temp <- data.frame(y_pred, y_true= if_else(y_true==1, "Bad", "Good")) # 20개 그릅으로 분할 temp <- temp %>% mutate(group=cut(y_pred, breaks = quantile(y_pred, p=seq(0, 1, 0.05)), right=F, include.lowest=T, labels=paste0("G", 1:20))) temp <- temp %>% group_by(group, y_true) %>% summarise(N=n()) temp <- temp %>% spread(y_true, N, fill=0) temp$Cum_Bad <- cumsum(temp$Bad) temp$Cum_Good <- cumsum(temp$Good) temp <- temp %>% mutate(P_Cum_Bad=Cum_Bad/sum(temp$Bad), P_Cum_Good=Cum_Good/sum(temp$Good), KS=abs(P_Cum_Good-P_Cum_Bad)*100) return(temp) } ks_stat <- function(ks_df){ ks_stat <- max(ks_df$KS) return(ks_stat) }
res_logit <- read.csv('res_logit.csv') res_rf <- read.csv('res_rf.csv') res_svm <- read.csv('res_svm.csv') # roc-auc ROCAUC(res_logit$pred, res_logit$true)
[1] 0.7879624
# pr-auc PRAUC(res_logit$pred, res_logit$true)
[1] 0.6010135
# gini Gini(res_logit$pred, res_logit$true) # MLmetrics 패키지 함수
[1] 0.5759248
gini_index(get_gini_df(res_logit$pred, res_logit$true)) # 20개 그룹 나누어서 구한 값
[1] 0.5755486
# ks KS_Stat(res_logit$pred, res_logit$true) # MLmetrics 패키지 함수
[1] 50.59561
ks_stat(get_ks_df(res_logit$pred, res_logit$true)) # 20개 그룹 나누어서 구한 값
[1] 48.27586
# ROC curve get_roc_df <- function(y_pred, y_true){ y_true <- y_true[order(y_pred, decreasing=TRUE)] y_pred <- y_pred[order(y_pred, decreasing=TRUE)] roc_df <- data.frame(TPR=cumsum(y_true)/sum(y_true), FPR=cumsum(!y_true)/sum(!y_true), y_true, y_pred, Bad=cumsum(y_true), Bad_TOT=sum(y_true), Good=cumsum(!y_true), Good_TOT=sum(!y_true)) return(roc_df) } df <- rbind(data.frame(get_roc_df(res_logit$pred, res_logit$true), model="LR"), data.frame(get_roc_df(res_rf$pred, res_rf$true), model="RF"), data.frame(get_roc_df(res_svm$pred, res_svm$true), model="SVM")) ggplot(df, aes(x=FPR, y=TPR, group=model, color=model)) + geom_ribbon(aes(ymin=0, ymax=TPR, group=model, fill=model), alpha=0.1) + geom_line(size=1) + theme_bw() + ylim(0,1) + xlim(0,1) + ggtitle("ROC curve") + theme(aspect.ratio=1, plot.title = element_text(hjust=0.5, size=15, face="bold"), axis.text = element_text(size=11), axis.title = element_text(size=15), legend.text = element_text(size=11))
# precision-recall curve get_pr_df <- function(y_pred, y_true){ pred_obj <- ROCR::prediction(y_pred, y_true) perf_obj <- ROCR::performance(pred_obj, measure = "prec", x.measure = "rec") pr_df <- data.frame(Recall=perf_obj@x.values[[1]], Precision=perf_obj@y.values[[1]]) } df <- rbind(data.frame(get_pr_df(res_logit$pred, res_logit$true), model="LR"), data.frame(get_pr_df(res_rf$pred, res_rf$true), model="RF"), data.frame(get_pr_df(res_svm$pred, res_svm$true), model="SVM")) ggplot(df, aes(x=Precision, y=Recall, group=model, color=model)) + geom_ribbon(aes(ymin=0, ymax=Recall, group=model, fill=model), alpha=0.1) + geom_line(size=0.7) + theme_bw() + ylim(0,1) + xlim(0,1) + ggtitle("Precision-Recall curve") + theme(aspect.ratio=1, plot.title = element_text(hjust=0.5, size=15, face="bold"), axis.text = element_text(size=11), axis.title = element_text(size=15), legend.text = element_text(size=11))
ggplot(df, aes(x=Precision, y=Recall, group=model, color=model)) + geom_line(size=0.7) + theme_bw() + ylim(0,1) + xlim(0,1) + ggtitle("Precision-Recall curve") + theme(aspect.ratio=1, plot.title = element_text(hjust=0.5, size=15, face="bold"), axis.text = element_text(size=11), axis.title = element_text(size=15), legend.text = element_text(size=11))
# Accuracy/Cutoff 그래프 get_acc_cutoff_df <- function(y_pred, y_true){ pred_obj <- ROCR::prediction(y_pred, y_true) perf_obj <- ROCR::performance(pred_obj, measure = "acc", x.measure = "cutoff") pr_df <- data.frame(Cutoff=perf_obj@x.values[[1]], Accuracy=perf_obj@y.values[[1]]) } df <- rbind(data.frame(get_acc_cutoff_df(res_logit$pred, res_logit$true), model="LR"), data.frame(get_acc_cutoff_df(res_rf$pred, res_rf$true), model="RF"), data.frame(get_acc_cutoff_df(res_svm$pred, res_svm$true), model="SVM")) ggplot(df, aes(x=Cutoff, y=Accuracy, group=model, color=model)) + geom_line(size=0.7) + theme_bw() + ggtitle("Accuracy/Cutoff") + theme(aspect.ratio=1, plot.title = element_text(hjust=0.5, size=15, face="bold"), axis.text = element_text(size=11), axis.title = element_text(size=15), legend.text = element_text(size=11))
# K-S temp <- get_ks_df(res_logit$pred, res_logit$true) %>% select(group, P_Cum_Bad, P_Cum_Good) %>% gather(key="Target", value="Cum_percent", -group) temp$Target <- factor(temp$Target, labels=c("불량", "우량")) ggplot(temp, aes(x=group, y=Cum_percent, group=Target, color=Target)) + geom_ribbon(aes(ymin=0, ymax=Cum_percent, group=Target, fill=Target), alpha=0.1) + geom_line(size=1.2) + scale_color_fivethirtyeight() + scale_fill_fivethirtyeight() + theme_bw() + ylab("누적구성비") + xlab("그룹") + ggtitle("K-S") + theme(aspect.ratio=0.6, plot.title = element_text(hjust=0.5, size=15, face="bold"), axis.text = element_text(size=11), axis.title = element_text(size=15), legend.text = element_text(size=11))
# Gini temp <- get_gini_df(res_logit$pred, res_logit$true) ggplot(temp, aes(x=P_Cum_Bad, y=P_Cum_Good)) + geom_line(size=0.7) + geom_abline(intercept = 0, slope=1, size=1) + theme_bw() + xlab("누적불량구성비") + ylab("누적우량구성비") + theme(aspect.ratio=1, plot.title = element_text(hjust=0.5, size=15, face="bold"), axis.text = element_text(size=11), axis.title = element_text(size=15), legend.text = element_text(size=11))
# Divergence plot df <- data.frame(y_pred=res_logit$pred, y_true=if_else(res_logit$true==1, "불량", "우량")) ggplot(df, aes(x=y_pred, fill=y_true)) + geom_density(alpha=0.3) + theme_bw() + scale_fill_fivethirtyeight() + xlab("불량확률") + ylab("density") + labs(fill="Target") + theme(aspect.ratio=1, plot.title = element_text(hjust=0.5, size=15, face="bold"), axis.text = element_text(size=11), axis.title = element_text(size=15), legend.text = element_text(size=11))