[設定所需的函式庫(libraries)以及載入資料]
# Point the session at the course data directory; the relative "./" paths
# passed to read.csv() in this script are resolved against it.
setwd("/home/m600/Working Area/Rdata Practice/Customer Course/riskC5.0")
# Load the customer risk data. stringsAsFactors = TRUE is set explicitly:
# since R 4.0 strings are no longer converted to factors by default, and
# C5.0() requires the RISK outcome to be a factor. TRUE is spelled out
# because T is an ordinary variable that can be reassigned.
risk <- read.csv("./Risk.csv", header = TRUE, sep = ",",
                 stringsAsFactors = TRUE)
[Part 1].Data-ETL
1-1.取得資料集的初探
# Preview the first six rows of the loaded data.
head(risk, n = 6L)
## ID AGE INCOME GENDER MARITAL NUMKIDS NUMCARDS HOWPAID MORTGAGE
## 1 100756 44 59944 m married 1 2 monthly y
## 2 100668 35 59692 m married 1 1 monthly y
## 3 100418 34 59508 m married 1 1 monthly y
## 4 100416 34 59463 m married 0 2 monthly y
## 5 100590 39 59393 f married 0 2 monthly y
## 6 100657 41 59276 m married 1 2 monthly y
## STORECAR LOANS RISK
## 1 2 0 good risk
## 2 1 0 bad loss
## 3 2 1 good risk
## 4 1 1 bad loss
## 5 1 0 good risk
## 6 1 1 good risk
# Derived feature: per-row sum of the NUMCARDS, STORECAR and LOANS columns.
sumdebt <- with(risk, NUMCARDS + STORECAR + LOANS)
# Show the first five values as a quick check.
sumdebt[seq_len(5)]
## [1] 4 2 4 4 3
# Earlier alternative, kept for reference: drop columns 1, 7, 10 and 11
# (ID, NUMCARDS, STORECAR, LOANS) instead of picking the ones to keep.
# risk <- risk[, -c(1, 7, 10, 11)]

# Keep only the modelling columns. They are selected by name rather than by
# position (previously c(3, 5, 6, 12)) so the selection is self-documenting
# and stays correct if the column order of the CSV changes.
risk <- risk[, c("INCOME", "MARITAL", "NUMKIDS", "RISK")]
# Append the derived sumdebt column to form the modelling data set.
risk2 <- data.frame(risk, sumdebt)
head(risk2)
## INCOME MARITAL NUMKIDS RISK sumdebt
## 1 59944 married 1 good risk 4
## 2 59692 married 1 bad loss 2
## 3 59508 married 1 good risk 4
## 4 59463 married 0 bad loss 4
## 5 59393 married 0 good risk 3
## 6 59276 married 1 good risk 4
1-2.Test group 建立
# Fix the RNG seed so the random train/test split, and every figure derived
# from it, is reproducible from run to run.
set.seed(1234)
# Hold out 30% of the rows for testing. floor() makes the truncation of the
# fractional row count explicit instead of leaving it to sample().
n <- floor(0.3 * nrow(risk2))
# seq_len() is safe even for an empty data frame, unlike 1:nrow().
test.index <- sample(seq_len(nrow(risk2)), n)
# Rows drawn into test.index form the test set; all others are training rows.
risk2.train <- risk2[-test.index, ]
risk2.test <- risk2[test.index, ]
[Part 2].C5.0 模式建立
2-1.C5.0 決策樹模型建立
# One-time setup if the C50 package is not installed yet:
# install.packages("C50")
library(C50)
# Fit a C5.0 decision tree predicting RISK from all other columns of the
# training set.
risk.tree <- C5.0(formula = RISK ~ ., data = risk2.train)
# Print the fitted tree together with its training-set evaluation.
summary(risk.tree)
##
## Call:
## C5.0.formula(formula = RISK ~ ., data = risk2.train)
##
##
## C5.0 [Release 2.07 GPL Edition] Sat Feb 13 20:56:45 2016
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 2882 cases (5 attributes) from undefined.data
##
## Decision tree:
##
## INCOME <= 25728:
## :...sumdebt <= 7: bad profit (1425/162)
## : sumdebt > 7:
## : :...MARITAL = divsepwid: bad profit (401/228)
## : MARITAL in {married,single}: bad loss (179/21)
## INCOME > 25728:
## :...NUMKIDS <= 1: good risk (556/174)
## NUMKIDS > 1:
## :...MARITAL = divsepwid: bad profit (234/122)
## MARITAL in {married,single}: bad loss (87/12)
##
##
## Evaluation on training data (2882 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 6 719(24.9%) <<
##
##
## (a) (b) (c) <-classified as
## ---- ---- ----
## 233 331 84 (a): class bad loss
## 27 1548 90 (b): class bad profit
## 6 181 382 (c): class good risk
##
##
## Attribute usage:
##
## 100.00% INCOME
## 69.57% sumdebt
## 31.26% MARITAL
## 30.43% NUMKIDS
##
##
## Time: 0.0 secs
# Draw the fitted decision tree.
plot(x = risk.tree)
2-2.Train confusion matrix
# Actual RISK labels of the training rows (those not listed in test.index).
RISK.train <- risk2[-test.index, "RISK"]
# Predicted class for every training row.
train.pred <- predict(risk.tree, newdata = risk2.train, type = "class")
# Confusion matrix: actual classes in rows, predicted classes in columns.
table.train <- table(RISK.train, train.pred)
print(table.train)
## train.pred
## RISK.train bad loss bad profit good risk
## bad loss 233 331 84
## bad profit 27 1548 90
## good risk 6 181 382
# Report how many rows the training set contains.
cat("Total records(train)=", nrow(risk2.train), "\n", sep = " ")
## Total records(train)= 2882
# Training accuracy: correctly classified cases (the diagonal of the
# confusion matrix) over all cases, printed as a percentage.
cat(
  "Correct Classification Ratio(train)=",
  sum(diag(table.train)) / sum(table.train) * 100,
  "%\n"
)
## Correct Classification Ratio(train)= 75.05205 %
2-3.Test confusion matrix
# Actual RISK labels of the held-out rows listed in test.index.
RISK.test <- risk2[test.index, "RISK"]
# Predicted class for every test row.
test.pred <- predict(risk.tree, newdata = risk2.test, type = "class")
# Confusion matrix: actual classes in rows, predicted classes in columns.
table.test <- table(RISK.test, test.pred)
print(table.test)
## test.pred
## RISK.test bad loss bad profit good risk
## bad loss 100 122 36
## bad profit 6 690 46
## good risk 4 76 155
# Report how many rows the test set contains.
cat("Total records(test)=", nrow(risk2.test), "\n", sep = " ")
## Total records(test)= 1235
# Test accuracy: correctly classified cases (the diagonal of the confusion
# matrix) over all cases, printed as a percentage.
cat(
  "Correct Classification Ratio(test)=",
  sum(diag(table.test)) / sum(table.test) * 100,
  "%\n"
)
## Correct Classification Ratio(test)= 76.51822 %
[Part 3].預測資料集
3-1.以RiskNew.csv 當成新的預測資料
# Load the new customers to be scored. stringsAsFactors = TRUE is set
# explicitly so categorical columns such as MARITAL are read as factors
# regardless of the R version's default (it changed to FALSE in R 4.0).
# TRUE is spelled out because T is an ordinary, reassignable variable.
riskNew <- read.csv("./RiskNew.csv", header = TRUE, sep = ",",
                    stringsAsFactors = TRUE)
# Preview the first rows of the new data.
head(riskNew)
## AGE INCOME GENDER MARITAL NUMKIDS NUMCARDS HOWPAID MORTGAGE STORECARDS
## 1 34 59463 m married 0 2 monthly y 1
## 2 34 59463 m married 0 2 monthly y 1
## 3 34 59463 m married 0 2 monthly y 1
## 4 34 59463 m married 0 2 monthly y 1
## 5 34 59463 m married 0 2 monthly y 1
## 6 34 59463 m married 0 2 monthly y 1
## LOANS
## 1 1
## 2 1
## 3 1
## 4 1
## 5 1
## 6 1
# Rebuild the derived sumdebt feature for the new data (the store-card
# column is named STORECARDS in this file). Note that this overwrites the
# global sumdebt vector computed earlier for Risk.csv.
sumdebt <- riskNew$NUMCARDS + riskNew$STORECARDS + riskNew$LOANS
# Keep the predictor columns. They are selected by name rather than by
# position (previously c(2, 4, 5)) so the selection is self-documenting and
# stays correct if the column order of the CSV changes.
riskNew <- riskNew[, c("INCOME", "MARITAL", "NUMKIDS")]
# Append sumdebt so the columns match the predictors the tree was fitted on.
riskNew2 <- data.frame(riskNew, sumdebt)
head(riskNew2)
## INCOME MARITAL NUMKIDS sumdebt
## 1 59463 married 0 4
## 2 59463 married 0 4
## 3 59463 married 0 4
## 4 59463 married 0 4
## 5 59463 married 0 4
## 6 59463 married 0 4
3-2.Predict risknew2
# Predict the risk class of every new customer with the fitted tree.
risk.pred <- predict(risk.tree, newdata = riskNew2, type = "class")
3-3.Merge predict result and output
# Attach the predicted class to the new data as column RiskPred.
riskNewAll <- cbind(riskNew2, RiskPred = risk.pred)
# Show the first 20 scored rows.
head(riskNewAll, n = 20L)
## INCOME MARITAL NUMKIDS sumdebt RiskPred
## 1 59463 married 0 4 good risk
## 2 59463 married 0 4 good risk
## 3 59463 married 0 4 good risk
## 4 59463 married 0 4 good risk
## 5 59463 married 0 4 good risk
## 6 59463 married 0 4 good risk
## 7 59463 married 0 4 good risk
## 8 59463 married 0 4 good risk
## 9 59463 married 0 4 good risk
## 10 59463 married 0 4 good risk
## 11 59276 married 1 4 good risk
## 12 59276 married 1 4 good risk
## 13 59276 married 1 4 good risk
## 14 59276 married 1 4 good risk
## 15 59276 married 1 4 good risk
## 16 59276 married 1 4 good risk
## 17 59276 married 1 4 good risk
## 18 59276 married 1 4 good risk
## 19 59276 married 1 4 good risk
## 20 59276 married 1 4 good risk