setwd("F:/Customer Course/UCI_Bank")
data=read.csv("bank.csv",header=TRUE,sep=";")
head(data)
## age job marital education default balance housing loan contact
## 1 30 unemployed married primary no 1787 no no cellular
## 2 33 services married secondary no 4789 yes yes cellular
## 3 35 management single tertiary no 1350 yes no cellular
## 4 30 management married tertiary no 1476 yes yes unknown
## 5 59 blue-collar married secondary no 0 yes no unknown
## 6 35 management single tertiary no 747 no no cellular
## day month duration campaign pdays previous poutcome y
## 1 19 oct 79 1 -1 0 unknown no
## 2 11 may 220 1 339 4 failure no
## 3 16 apr 185 1 330 1 failure no
## 4 3 jun 199 4 -1 0 unknown no
## 5 5 may 226 1 -1 0 unknown no
## 6 23 feb 141 2 176 3 failure no
dim(data)
## [1] 4521 17
summary(data)
## age job marital education
## Min. :19.00 management :969 divorced: 528 primary : 678
## 1st Qu.:33.00 blue-collar:946 married :2797 secondary:2306
## Median :39.00 technician :768 single :1196 tertiary :1350
## Mean :41.17 admin. :478 unknown : 187
## 3rd Qu.:49.00 services :417
## Max. :87.00 retired :230
## (Other) :713
## default balance housing loan contact
## no :4445 Min. :-3313 no :1962 no :3830 cellular :2896
## yes: 76 1st Qu.: 69 yes:2559 yes: 691 telephone: 301
## Median : 444 unknown :1324
## Mean : 1423
## 3rd Qu.: 1480
## Max. :71188
##
## day month duration campaign
## Min. : 1.00 may :1398 Min. : 4 Min. : 1.000
## 1st Qu.: 9.00 jul : 706 1st Qu.: 104 1st Qu.: 1.000
## Median :16.00 aug : 633 Median : 185 Median : 2.000
## Mean :15.92 jun : 531 Mean : 264 Mean : 2.794
## 3rd Qu.:21.00 nov : 389 3rd Qu.: 329 3rd Qu.: 3.000
## Max. :31.00 apr : 293 Max. :3025 Max. :50.000
## (Other): 571
## pdays previous poutcome y
## Min. : -1.00 Min. : 0.0000 failure: 490 no :4000
## 1st Qu.: -1.00 1st Qu.: 0.0000 other : 197 yes: 521
## Median : -1.00 Median : 0.0000 success: 129
## Mean : 39.77 Mean : 0.5426 unknown:3705
## 3rd Qu.: -1.00 3rd Qu.: 0.0000
## Max. :871.00 Max. :25.0000
##
sum(data$y=="yes"); sum(data$y=="no")
## [1] 521
## [1] 4000
sub=sample(1:nrow(data),round(nrow(data)/4)) #隨機取出1/4樣本
length(sub) #顯示這1/4的樣本序號個數
## [1] 1130
data_train=data[-sub,] #將不包含在sub中的資料建為訓練集
data_test=data[sub,] #將包含在sub中的資料建為測試集
dim(data_train);dim(data_test) #顯示訓練集及測試集的維度
## [1] 3391 17
## [1] 1130 17
#install.packages("adabag")
#install.packages("rpart")
library(adabag)
library(rpart)
bag=bagging(y~.,data_train,mfinal=5) #建立模型,產生五棵決策樹
names(bag) #顯示模型bag的輸出項名稱
## [1] "formula" "trees" "votes" "prob" "class"
## [6] "samples" "importance" "terms" "call"
bag$formula #模型bag的所依據的公式
## y ~ .
bag$trees[2] #顯示兩棵樹來看看
## [[1]]
## n= 3391
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 3391 417 no (0.87702743 0.12297257)
## 2) duration< 414.5 2796 181 no (0.93526466 0.06473534)
## 4) month=apr,aug,feb,jan,jul,jun,may,nov,sep 2704 134 no (0.95044379 0.04955621)
## 8) poutcome=failure,other,unknown 2657 111 no (0.95822356 0.04177644) *
## 9) poutcome=success 47 23 no (0.51063830 0.48936170)
## 18) job=blue-collar,self-employed,services,student,technician 27 6 no (0.77777778 0.22222222) *
## 19) job=admin.,entrepreneur,housemaid,management,retired,unemployed,unknown 20 3 yes (0.15000000 0.85000000) *
## 5) month=dec,mar,oct 92 45 yes (0.48913043 0.51086957)
## 10) duration< 197.5 33 5 no (0.84848485 0.15151515) *
## 11) duration>=197.5 59 17 yes (0.28813559 0.71186441) *
## 3) duration>=414.5 595 236 no (0.60336134 0.39663866)
## 6) duration< 638.5 324 90 no (0.72222222 0.27777778)
## 12) poutcome=failure,unknown 284 63 no (0.77816901 0.22183099)
## 24) pdays< 260.5 263 51 no (0.80608365 0.19391635)
## 48) balance< 1295.5 181 22 no (0.87845304 0.12154696) *
## 49) balance>=1295.5 82 29 no (0.64634146 0.35365854)
## 98) month=apr,jul,jun,mar,may,nov,oct 71 20 no (0.71830986 0.28169014)
## 196) day< 19.5 52 7 no (0.86538462 0.13461538) *
## 197) day>=19.5 19 6 yes (0.31578947 0.68421053) *
## 99) month=aug,feb 11 2 yes (0.18181818 0.81818182) *
## 25) pdays>=260.5 21 9 yes (0.42857143 0.57142857)
## 50) day< 13.5 11 2 no (0.81818182 0.18181818) *
## 51) day>=13.5 10 0 yes (0.00000000 1.00000000) *
## 13) poutcome=other,success 40 13 yes (0.32500000 0.67500000)
## 26) job=blue-collar 8 0 no (1.00000000 0.00000000) *
## 27) job=admin.,management,retired,self-employed,student,technician,unemployed 32 5 yes (0.15625000 0.84375000) *
## 7) duration>=638.5 271 125 yes (0.46125461 0.53874539)
## 14) month=jan,jul,may,nov,oct 157 65 no (0.58598726 0.41401274)
## 28) contact=telephone,unknown 53 11 no (0.79245283 0.20754717) *
## 29) contact=cellular 104 50 yes (0.48076923 0.51923077)
## 58) job=entrepreneur,housemaid,self-employed,unemployed 10 0 no (1.00000000 0.00000000) *
## 59) job=admin.,blue-collar,management,retired,services,student,technician 94 40 yes (0.42553191 0.57446809) *
## 15) month=apr,aug,dec,feb,jun 114 33 yes (0.28947368 0.71052632)
## 30) day< 16.5 69 27 yes (0.39130435 0.60869565)
## 60) job=blue-collar,housemaid,retired,self-employed 31 11 no (0.64516129 0.35483871)
## 120) balance>=147.5 22 4 no (0.81818182 0.18181818) *
## 121) balance< 147.5 9 2 yes (0.22222222 0.77777778) *
## 61) job=admin.,entrepreneur,management,services,student,technician,unemployed 38 7 yes (0.18421053 0.81578947) *
## 31) day>=16.5 45 6 yes (0.13333333 0.86666667) *
bag$votes[105:115,] #模型bag中第105-115個樣本的投票狀況
## [,1] [,2]
## [1,] 4 1
## [2,] 5 0
## [3,] 5 0
## [4,] 5 0
## [5,] 5 0
## [6,] 4 1
## [7,] 4 1
## [8,] 5 0
## [9,] 5 0
## [10,] 4 1
## [11,] 5 0
bag$prob[105:115,] #模型bag中第105-115個樣本被預測為各種別的機率
## [,1] [,2]
## [1,] 0.8 0.2
## [2,] 1.0 0.0
## [3,] 1.0 0.0
## [4,] 1.0 0.0
## [5,] 1.0 0.0
## [6,] 0.8 0.2
## [7,] 0.8 0.2
## [8,] 1.0 0.0
## [9,] 1.0 0.0
## [10,] 0.8 0.2
## [11,] 1.0 0.0
bag$class[105:115] #模型bag中第105-115個樣本的預測類別
## [1] "no" "no" "no" "no" "no" "no" "no" "no" "no" "no" "no"
bag$samples[105:115,] #模型bag中第105-115個樣本在五次反覆運算過程中的抽樣情況
## [,1] [,2] [,3] [,4] [,5]
## [1,] 2788 57 346 331 25
## [2,] 128 3219 104 1583 3090
## [3,] 2317 778 2538 2111 733
## [4,] 2099 1704 1536 2673 2605
## [5,] 19 2321 2312 2701 2031
## [6,] 650 2523 1279 2867 938
## [7,] 1685 1872 1259 1204 1588
## [8,] 3016 3287 1029 1147 651
## [9,] 2814 1266 995 2064 3347
## [10,] 1737 2958 776 3350 2453
## [11,] 2996 535 1211 3387 1481
bag$importance #模型bag中各變數的相對重要性
## age balance campaign contact day default
## 1.0806487 4.0033237 0.8451415 0.4986750 4.1639277 0.0000000
## duration education housing job loan marital
## 44.5640233 0.4905595 0.0000000 7.0659681 0.6963961 1.2401696
## month pdays poutcome previous
## 13.2141546 2.3739104 18.6115690 1.1515325
barplot(bag$importance)
#透過control參數控制基礎分類的覆雜度
# maxdepth=3是控制樹的深度為3,所以子樹的複雜度明顯降低
bag1=bagging(y~.,data_train,mfinal=5,control=rpart.control(maxdepth=3))
bag1$trees[2] #檢視第二顆子樹的實際結構
## [[1]]
## n= 3391
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 3391 424 no (0.87496314 0.12503686)
## 2) duration< 648 3103 277 no (0.91073155 0.08926845)
## 4) poutcome=failure,other,unknown 3001 214 no (0.92869044 0.07130956) *
## 5) poutcome=success 102 39 yes (0.38235294 0.61764706)
## 10) month=apr,dec,feb,may,oct 44 15 no (0.65909091 0.34090909) *
## 11) month=aug,jan,jul,jun,mar,nov,sep 58 10 yes (0.17241379 0.82758621) *
## 3) duration>=648 288 141 yes (0.48958333 0.51041667)
## 6) marital=married 165 64 no (0.61212121 0.38787879)
## 12) job=admin.,blue-collar,entrepreneur,housemaid,management,retired,self-employed,services,unemployed 140 44 no (0.68571429 0.31428571) *
## 13) job=student,technician 25 5 yes (0.20000000 0.80000000) *
## 7) marital=divorced,single 123 40 yes (0.32520325 0.67479675) *
pre_bag=predict(bag,data_test) #使用bag模型對data_test進行預測
names(pre_bag) #顯示pre_bag的輸出項名稱
## [1] "formula" "votes" "prob" "class" "confusion" "error"
pre_bag$votes[1:10,] #前10個樣本的投票狀況
## [,1] [,2]
## [1,] 5 0
## [2,] 5 0
## [3,] 5 0
## [4,] 5 0
## [5,] 5 0
## [6,] 5 0
## [7,] 5 0
## [8,] 5 0
## [9,] 5 0
## [10,] 4 1
pre_bag$prob[1:10,] #前10個樣本被預測為各種別的機率
## [,1] [,2]
## [1,] 1.0 0.0
## [2,] 1.0 0.0
## [3,] 1.0 0.0
## [4,] 1.0 0.0
## [5,] 1.0 0.0
## [6,] 1.0 0.0
## [7,] 1.0 0.0
## [8,] 1.0 0.0
## [9,] 1.0 0.0
## [10,] 0.8 0.2
pre_bag$class[1:10] #前10個樣本的預測類別
## [1] "no" "no" "no" "no" "no" "no" "no" "no" "no" "no"
pre_bag$confusion #預測集的混淆矩陣
## Observed Class
## Predicted Class no yes
## no 986 72
## yes 31 41
pre_bag$error #預測集的錯誤機率
## [1] 0.09115044
#我們用少數類別與多數類別做檢視,以做為後續兩種類別錯誤率的計算
sub_minor=which(data_test$y=="yes") #少數類別yes在測試集中的編號
sub_major=which(data_test$y=="no") #多數類別no在測試集中的編號
length(sub_minor); length(sub_major) #檢視兩種類別的個數
## [1] 113
## [1] 1017
#分別計算整體與兩種錯誤率
err_bag=sum(pre_bag$class!=data_test$y)/nrow(data_test) #計算整體錯誤率
err_minor_bag=sum(pre_bag$class[sub_minor]!=data_test$y[sub_minor])/length(sub_minor)#計算yes的錯誤率
err_major_bag=sum(pre_bag$class[sub_major]!=data_test$y[sub_major])/length(sub_major)#計算no錯誤率
err_bag; err_minor_bag; err_major_bag
## [1] 0.09115044
## [1] 0.6371681
## [1] 0.03048181
boo=boosting(y~.,data_train,mfinal=5) #建立Adaboost模型
pre_boo=predict(boo,data_test)
err_boo=sum(pre_boo$class!=data_test$y)/nrow(data_test)#計算整體錯誤率
err_minor_boo=sum(pre_boo$class[sub_minor]!=data_test$y[sub_minor])/length(sub_minor)#計算yes的錯誤率
err_major_boo=sum(pre_boo$class[sub_major]!=data_test$y[sub_major])/length(sub_major)#計算no錯誤率
err_boo; err_minor_boo; err_major_boo
## [1] 0.1079646
## [1] 0.7168142
## [1] 0.04031465