資料說明

1.本單元主題僅在介紹建立C5.0模型與決策樹

2.從Risk.csv作資料準備

- 以新資料RiskNew.csv來預測

[設定工作目錄(working directory)以及載入資料]

# Point the session at the course folder so the relative CSV paths used in
# this script resolve.
# NOTE(review): hard-coded absolute path -- adjust it for your machine.
setwd("/home/m600/Working Area/Rdata Practice/Customer Course/riskC5.0")
# Read the customer risk data. stringsAsFactors = TRUE keeps the text columns
# (e.g. MARITAL, RISK) as factors regardless of R version (the read.csv
# default became FALSE in R 4.0); C5.0() requires a factor outcome.
risk <- read.csv("./Risk.csv", header = TRUE, sep = ",",
                 stringsAsFactors = TRUE)

[Part 1].Data-ETL

1-1.取得資料集的初探

# Preview the first six rows of the raw data.
head(x = risk, n = 6L)
##       ID AGE INCOME GENDER MARITAL NUMKIDS NUMCARDS HOWPAID MORTGAGE
## 1 100756  44  59944      m married       1        2 monthly        y
## 2 100668  35  59692      m married       1        1 monthly        y
## 3 100418  34  59508      m married       1        1 monthly        y
## 4 100416  34  59463      m married       0        2 monthly        y
## 5 100590  39  59393      f married       0        2 monthly        y
## 6 100657  41  59276      m married       1        2 monthly        y
##   STORECAR LOANS      RISK
## 1        2     0 good risk
## 2        1     0  bad loss
## 3        2     1 good risk
## 4        1     1  bad loss
## 5        1     0 good risk
## 6        1     1 good risk
# Per-row sum of the NUMCARDS, STORECAR and LOANS columns.
sumdebt <- risk[["NUMCARDS"]] + risk[["STORECAR"]] + risk[["LOANS"]]
# Show the first five totals as a quick sanity check.
sumdebt[seq_len(5)]
## [1] 4 2 4 4 3
# Keep only the modelling columns. They are selected by name rather than by
# position so the selection cannot silently pick the wrong columns if the
# CSV column order changes.
risk <- risk[, c("INCOME", "MARITAL", "NUMKIDS", "RISK")]
# Append the derived sumdebt column to form the modelling data set.
risk2 <- data.frame(risk, sumdebt = sumdebt)
head(risk2)
##   INCOME MARITAL NUMKIDS      RISK sumdebt
## 1  59944 married       1 good risk       4
## 2  59692 married       1  bad loss       2
## 3  59508 married       1 good risk       4
## 4  59463 married       0  bad loss       4
## 5  59393 married       0 good risk       3
## 6  59276 married       1 good risk       4

1-2.Test group 建立

# Hold out 30% of the rows as a test set; the remaining rows are used for
# training.
# Fix the RNG seed so the split, and every result derived from it, is
# reproducible from run to run.
set.seed(2016)
# Make the sample size an explicit whole number instead of passing a
# fractional value to sample().
n <- floor(0.3 * nrow(risk2))
# seq_len() avoids the 1:0 pitfall of 1:nrow() on an empty data frame.
test.index <- sample(seq_len(nrow(risk2)), n)
risk2.train <- risk2[-test.index, ]
risk2.test <- risk2[test.index, ]

[Part 2].C5.0 模式建立

2-1.建立C5.0決策樹模型

# One-time setup if the package is missing: install.packages("C50")
library("C50")
# Fit a C5.0 decision tree that predicts RISK from all other columns of the
# training data.
risk.tree <- C5.0(RISK ~ ., data = risk2.train)
# Print the fitted tree together with its evaluation on the training data.
summary(object = risk.tree)
## 
## Call:
## C5.0.formula(formula = RISK ~ ., data = risk2.train)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Sat Feb 13 20:56:45 2016
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 2882 cases (5 attributes) from undefined.data
## 
## Decision tree:
## 
## INCOME <= 25728:
## :...sumdebt <= 7: bad profit (1425/162)
## :   sumdebt > 7:
## :   :...MARITAL = divsepwid: bad profit (401/228)
## :       MARITAL in {married,single}: bad loss (179/21)
## INCOME > 25728:
## :...NUMKIDS <= 1: good risk (556/174)
##     NUMKIDS > 1:
##     :...MARITAL = divsepwid: bad profit (234/122)
##         MARITAL in {married,single}: bad loss (87/12)
## 
## 
## Evaluation on training data (2882 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##       6  719(24.9%)   <<
## 
## 
##     (a)   (b)   (c)    <-classified as
##    ----  ----  ----
##     233   331    84    (a): class bad loss
##      27  1548    90    (b): class bad profit
##       6   181   382    (c): class good risk
## 
## 
##  Attribute usage:
## 
##  100.00% INCOME
##   69.57% sumdebt
##   31.26% MARITAL
##   30.43% NUMKIDS
## 
## 
## Time: 0.0 secs
# Draw the fitted decision tree.
plot(x = risk.tree)

2-2.Train confusion matrix

# Actual classes of the training rows (the same rows that make up
# risk2.train).
RISK.train <- risk2.train[["RISK"]]
# Classes the tree predicts for those rows.
train.pred <- predict(risk.tree, newdata = risk2.train, type = "class")
# Cross-tabulate actual classes (rows) against predicted classes (columns).
table.train <- table(RISK.train, train.pred)
table.train
##             train.pred
## RISK.train   bad loss bad profit good risk
##   bad loss        233        331        84
##   bad profit       27       1548        90
##   good risk         6        181       382
# Report the number of training rows.
cat("Total records(train)=", nrow(risk2.train), "\n")
## Total records(train)= 2882
# Accuracy in percent: correctly classified cases (the diagonal of the
# confusion matrix) divided by all cases.
cat(
  "Correct Classification Ratio(train)=",
  sum(diag(table.train)) / sum(table.train) * 100,
  "%\n"
)
## Correct Classification Ratio(train)= 75.05205 %

2-3.Test confusion matrix

# Actual classes of the held-out rows (the same rows, in the same order, as
# risk2.test).
RISK.test <- risk2.test[["RISK"]]
# Classes the tree predicts for the held-out rows.
test.pred <- predict(risk.tree, newdata = risk2.test, type = "class")
# Cross-tabulate actual classes (rows) against predicted classes (columns).
table.test <- table(RISK.test, test.pred)
table.test
##             test.pred
## RISK.test    bad loss bad profit good risk
##   bad loss        100        122        36
##   bad profit        6        690        46
##   good risk         4         76       155
# Report the number of held-out test rows.
cat("Total records(test)=", nrow(risk2.test), "\n")
## Total records(test)= 1235
# Accuracy in percent: correctly classified cases (the diagonal of the
# confusion matrix) divided by all cases.
cat(
  "Correct Classification Ratio(test)=",
  sum(diag(table.test)) / sum(table.test) * 100,
  "%\n"
)
## Correct Classification Ratio(test)= 76.51822 %

[Part 3].預測資料集

3-1.以RiskNew.csv 當成新的預測資料

# Load the new customers to be scored. stringsAsFactors = TRUE keeps the text
# columns as factors regardless of R version (the read.csv default became
# FALSE in R 4.0).
riskNew <- read.csv("./RiskNew.csv", header = TRUE, sep = ",",
                    stringsAsFactors = TRUE)

# Preview the first six rows of the new data.
head(riskNew)
##   AGE INCOME GENDER MARITAL NUMKIDS NUMCARDS HOWPAID MORTGAGE STORECARDS
## 1  34  59463      m married       0        2 monthly        y          1
## 2  34  59463      m married       0        2 monthly        y          1
## 3  34  59463      m married       0        2 monthly        y          1
## 4  34  59463      m married       0        2 monthly        y          1
## 5  34  59463      m married       0        2 monthly        y          1
## 6  34  59463      m married       0        2 monthly        y          1
##   LOANS
## 1     1
## 2     1
## 3     1
## 4     1
## 5     1
## 6     1
# Per-row sum of the NUMCARDS, STORECARDS and LOANS columns. Note that this
# file names the store-card column STORECARDS.
sumdebt <- riskNew[["NUMCARDS"]] +
  riskNew[["STORECARDS"]] +
  riskNew[["LOANS"]]
# Keep the predictor columns, selected by name rather than by position so the
# selection cannot silently pick the wrong columns if the CSV layout changes.
riskNew <- riskNew[, c("INCOME", "MARITAL", "NUMKIDS")]
# Append the derived sumdebt column to form the data set to be scored.
riskNew2 <- data.frame(riskNew, sumdebt = sumdebt)
head(riskNew2)
##   INCOME MARITAL NUMKIDS sumdebt
## 1  59463 married       0       4
## 2  59463 married       0       4
## 3  59463 married       0       4
## 4  59463 married       0       4
## 5  59463 married       0       4
## 6  59463 married       0       4

3-2.Predict riskNew2

# Score the new rows with the fitted tree, returning predicted classes.
risk.pred <- predict(risk.tree, newdata = riskNew2, type = "class")

3-3.Merge predict result and output

# Attach the predicted class to the scored rows as a RiskPred column.
riskNewAll <- cbind(riskNew2, RiskPred = risk.pred)
# Show the first 20 rows with their predictions.
head(riskNewAll, n = 20L)
##    INCOME MARITAL NUMKIDS sumdebt  RiskPred
## 1   59463 married       0       4 good risk
## 2   59463 married       0       4 good risk
## 3   59463 married       0       4 good risk
## 4   59463 married       0       4 good risk
## 5   59463 married       0       4 good risk
## 6   59463 married       0       4 good risk
## 7   59463 married       0       4 good risk
## 8   59463 married       0       4 good risk
## 9   59463 married       0       4 good risk
## 10  59463 married       0       4 good risk
## 11  59276 married       1       4 good risk
## 12  59276 married       1       4 good risk
## 13  59276 married       1       4 good risk
## 14  59276 married       1       4 good risk
## 15  59276 married       1       4 good risk
## 16  59276 married       1       4 good risk
## 17  59276 married       1       4 good risk
## 18  59276 married       1       4 good risk
## 19  59276 married       1       4 good risk
## 20  59276 married       1       4 good risk