資料說明

1.This course introduces “Cluster” analysis.

2.The data set has 1477 records and 16 fields.

[Setting libraries and Input Data]

# Point the session at the folder that holds the churn data.
# NOTE(review): this absolute path is machine-specific; adjust it before
# running on another machine.
setwd("/home/m600/Working Area/Rdata Practice/Customer Course/churn")

# Read the comma-separated file; the first line supplies the column names.
# TRUE is spelled out because the T alias is an ordinary variable that can be
# reassigned.
churnall <- read.table("./churn.txt", header = TRUE, sep = ",")

[Part 1].Data-ETL

1-1.Data Distribution

# Preview the first six rows of the raw table.
head(churnall, n = 6L)
##   ID LONGDIST International    LOCAL DROPPED PAY_MTHD LocalBillType
## 1  0  5.24640       7.51510 86.32780       0       CH     FreeLocal
## 2  3  0.00000       0.00000  3.94229       0       CC        Budget
## 3  4  5.55564       0.00000  9.36347       1       CC        Budget
## 4  8 14.01930       5.68043 29.80650       0       CC        Budget
## 5 10 13.66400       2.95642 32.63810       0       CC     FreeLocal
## 6 11  0.00000       0.00000  1.41294       0       CC     FreeLocal
##   LongDistanceBillType AGE SEX STATUS CHILDREN Est_Income Car_Owner
## 1             Standard  57   F      M        2    27535.3         Y
## 2       Intnl_discount  50   F      S        2    64632.3         N
## 3       Intnl_discount  68   F      M        2    81000.9         N
## 4             Standard  34   M      S        0    87467.1         Y
## 5       Intnl_discount  60   M      M        2    83220.6         N
## 6             Standard  84   F      S        0    50290.7         N
##   CHURNED
## 1     Vol
## 2   InVol
## 3     Vol
## 4 Current
## 5     Vol
## 6   InVol
# Drop every row that contains a missing value.
churnall <- na.exclude(churnall)
# Keep columns 2 through 4 (LONGDIST, International, LOCAL) as the numeric
# input for clustering.
churn <- churnall[, 2:4]
head(churn, n = 6L)
##   LONGDIST International    LOCAL
## 1  5.24640       7.51510 86.32780
## 2  0.00000       0.00000  3.94229
## 3  5.55564       0.00000  9.36347
## 4 14.01930       5.68043 29.80650
## 5 13.66400       2.95642 32.63810
## 6  0.00000       0.00000  1.41294

[Part 2].Cluster analysis

2-1.K-means

# kmeans() chooses its starting centres at random, so fix the seed first;
# without it the cluster numbering and sizes can change from run to run, and
# any later code that refers to a cluster by number becomes unreliable.
set.seed(123)
# Partition the usage columns into 5 clusters.
churn.result <- kmeans(churn, centers = 5)
# Number of customers assigned to each cluster.
table(churn.result$cluster)
## 
##   1   2   3   4   5 
## 655 267 425  27 103

**2-2.Table and Figure display**

# Pie chart of the cluster sizes.
pie(x = table(churn.result[["cluster"]]))

# Cross-tabulate churn status (rows) against cluster membership (columns).
table(churnall[["CHURNED"]], churn.result[["cluster"]])
##          
##             1   2   3   4   5
##   Current 326 162 262  16  66
##   InVol   132   0   0   0   0
##   Vol     197 105 163  11  37
# Stacked bar chart: one bar per cluster, one segment per churn status.
churn_by_cluster <- table(churnall$CHURNED, churn.result$cluster)
status_cols <- 2:4
barplot(churn_by_cluster, col = status_cols)
# Take the legend labels from the table's row names so they always match the
# stacking order and spelling of the data (the hand-typed "Invol" did not
# match the level "InVol").
legend(0, 600, legend = rownames(churn_by_cluster), col = status_cols, pch = 15)

# Show the first rows of the customers that were assigned to cluster 3.
head(churnall[churn.result[["cluster"]] == 3, , drop = FALSE], n = 6L)
##    ID LONGDIST International   LOCAL DROPPED PAY_MTHD LocalBillType
## 4   8 14.01930       5.68043 29.8065       0       CC        Budget
## 5  10 13.66400       2.95642 32.6381       0       CC     FreeLocal
## 9  19 11.03070       0.00000 34.2777       0       CC        Budget
## 16 31 26.49520       0.00000 31.0847       0       CC     FreeLocal
## 19 40 24.34560       0.00000 62.8260       0       CC        Budget
## 20 42  8.86499       4.43676 43.6439       0       CH        Budget
##    LongDistanceBillType AGE SEX STATUS CHILDREN Est_Income Car_Owner
## 4              Standard  34   M      S        0   87467.10         Y
## 5        Intnl_discount  60   M      M        2   83220.60         N
## 9              Standard  87   F      S        2    3776.12         N
## 16       Intnl_discount  81   M      S        1    3968.54         N
## 19             Standard  37   F      S        2    4988.14         N
## 20       Intnl_discount  55   M      S        2   85753.80         N
##    CHURNED
## 4  Current
## 5      Vol
## 9      Vol
## 16 Current
## 19 Current
## 20     Vol

[Part 3].Outlier Scan

# Values of the third column (LOCAL) that boxplot.stats() reports as outliers.
boxplot.stats(churn[[3]])[["out"]]
##  [1] 180.967 188.555 241.049 212.356 287.295 337.564 450.624 220.977
##  [9] 160.666 296.629 184.525 165.434 163.975 159.885 311.290 163.414
## [17] 257.192 332.464 255.236 208.196 161.815 249.961 410.611 169.071
## [25] 312.202 171.469 216.295 165.868 193.892 167.321 262.916 203.516
## [33] 160.422 190.782 172.533 189.518 160.272 207.598 218.123 173.619
## [41] 295.308 183.624 176.485 234.595 213.184 183.138 188.736 161.387
## [49] 291.729 166.827 177.741 189.931 241.720 191.490 284.106 286.242
## [57] 294.599 174.155 268.958 285.176 262.644 233.810 233.985 167.637
## [65] 187.505 186.103 404.415 202.438 164.820 163.848 226.998
# Row positions whose LOCAL value is among the boxplot outliers.
outliers <- which(is.element(churn[[3]], boxplot.stats(churn[[3]])[["out"]]))
outliers
##  [1]   75  166  186  229  244  262  279  373  388  420  460  470  507  520
## [15]  522  526  538  593  646  664  665  706  714  722  744  757  775  777
## [29]  779  785  789  801  812  814  816  818  827  836  864  892  909  911
## [43] 1028 1045 1054 1065 1074 1100 1109 1116 1118 1125 1150 1175 1179 1189
## [57] 1190 1229 1244 1252 1269 1284 1285 1289 1303 1343 1381 1387 1396 1403
## [71] 1429
# Rows whose LOCAL value exceeds five standard deviations of that column.
# The multiplier is written outside sd() so the threshold reads as intended;
# sd(x * 5) and 5 * sd(x) are the same quantity.
# NOTE(review): the threshold is not centred on the mean (no mean(x) term) --
# confirm whether mean + 5 * sd was intended.
extreme_cutoff <- 5 * sd(churn[, 3])
extremes <- subset(churn, churn[, 3] > extreme_cutoff)
extremes
##      LONGDIST International   LOCAL
## 244  14.82570      7.795010 287.295
## 262  13.79680      0.000000 337.564
## 279  29.23120      0.000000 450.624
## 420   4.44961      7.273760 296.629
## 522  12.53720      0.000000 311.290
## 593  29.41840      0.000000 332.464
## 714  22.55980      0.000000 410.611
## 744  29.94820      0.000000 312.202
## 909  15.19550      0.000000 295.308
## 1109 23.30150      0.000000 291.729
## 1179 25.11570      0.000000 284.106
## 1189 10.65790      0.600762 286.242
## 1190  5.73078      0.000000 294.599
## 1252 11.04650      0.000000 285.176
## 1381  5.72491      0.000000 404.415
# Box-and-whisker plot of the third column (LOCAL).
boxplot(churn[[3]])