資料說明
1. This course introduces "Cluster" analysis.
2. The data set has 1477 records and 16 fields.
[Part 1].Data-ETL
1-1.Data Distribution
# Preview the first six rows of the raw data to see the fields and their values.
head(churnall, n = 6L)
## ID LONGDIST International LOCAL DROPPED PAY_MTHD LocalBillType
## 1 0 5.24640 7.51510 86.32780 0 CH FreeLocal
## 2 3 0.00000 0.00000 3.94229 0 CC Budget
## 3 4 5.55564 0.00000 9.36347 1 CC Budget
## 4 8 14.01930 5.68043 29.80650 0 CC Budget
## 5 10 13.66400 2.95642 32.63810 0 CC FreeLocal
## 6 11 0.00000 0.00000 1.41294 0 CC FreeLocal
## LongDistanceBillType AGE SEX STATUS CHILDREN Est_Income Car_Owner
## 1 Standard 57 F M 2 27535.3 Y
## 2 Intnl_discount 50 F S 2 64632.3 N
## 3 Intnl_discount 68 F M 2 81000.9 N
## 4 Standard 34 M S 0 87467.1 Y
## 5 Intnl_discount 60 M M 2 83220.6 N
## 6 Standard 84 F S 0 50290.7 N
## CHURNED
## 1 Vol
## 2 InVol
## 3 Vol
## 4 Current
## 5 Vol
## 6 InVol
# Drop every row that has a missing value in any field.
churnall <- na.exclude(churnall)
# Keep only the three usage measures used for clustering. Selecting them by
# name (rather than by position 2:4) keeps the subset correct even if the
# column order of churnall changes.
churn <- churnall[, c("LONGDIST", "International", "LOCAL")]
head(churn)
## LONGDIST International LOCAL
## 1 5.24640 7.51510 86.32780
## 2 0.00000 0.00000 3.94229
## 3 5.55564 0.00000 9.36347
## 4 14.01930 5.68043 29.80650
## 5 13.66400 2.95642 32.63810
## 6 0.00000 0.00000 1.41294
[Part 2].Cluster analysis
2-1.K-means
# Fix the RNG seed first: kmeans() picks its starting centers at random, so
# without a seed the cluster labels and sizes can change from run to run.
set.seed(1234)
# Partition the usage columns into 5 clusters, then tabulate the cluster sizes.
churn.result <- kmeans(churn, centers = 5)
table(churn.result$cluster)
##
## 1 2 3 4 5
## 655 267 425 27 103
[Part 3].Outlier Scan
# LOCAL values that fall beyond the boxplot whiskers.
boxplot.stats(churn[["LOCAL"]])$out
## [1] 180.967 188.555 241.049 212.356 287.295 337.564 450.624 220.977
## [9] 160.666 296.629 184.525 165.434 163.975 159.885 311.290 163.414
## [17] 257.192 332.464 255.236 208.196 161.815 249.961 410.611 169.071
## [25] 312.202 171.469 216.295 165.868 193.892 167.321 262.916 203.516
## [33] 160.422 190.782 172.533 189.518 160.272 207.598 218.123 173.619
## [41] 295.308 183.624 176.485 234.595 213.184 183.138 188.736 161.387
## [49] 291.729 166.827 177.741 189.931 241.720 191.490 284.106 286.242
## [57] 294.599 174.155 268.958 285.176 262.644 233.810 233.985 167.637
## [65] 187.505 186.103 404.415 202.438 164.820 163.848 226.998
# Row positions in churn whose LOCAL value is one of the values that
# boxplot.stats() reports as an outlier.
outliers <- which(
  is.element(churn[["LOCAL"]], boxplot.stats(churn[["LOCAL"]])$out)
)
print(outliers)
## [1] 75 166 186 229 244 262 279 373 388 420 460 470 507 520
## [15] 522 526 538 593 646 664 665 706 714 722 744 757 775 777
## [29] 779 785 789 801 812 814 816 818 827 836 864 892 909 911
## [43] 1028 1045 1054 1065 1074 1100 1109 1116 1118 1125 1150 1175 1179 1189
## [57] 1190 1229 1244 1252 1269 1284 1285 1289 1303 1343 1381 1387 1396 1403
## [71] 1429
# Rows whose LOCAL usage exceeds five standard deviations of LOCAL.
# The factor is applied outside sd() so the cutoff reads as "5 * sd";
# sd(x * 5) and 5 * sd(x) are the same quantity.
# NOTE(review): the cutoff is 5 * sd measured from zero, not mean + 5 * sd --
# confirm that this is the intended definition of "extreme".
extremes <- subset(churn, LOCAL > 5 * sd(LOCAL))
extremes
## LONGDIST International LOCAL
## 244 14.82570 7.795010 287.295
## 262 13.79680 0.000000 337.564
## 279 29.23120 0.000000 450.624
## 420 4.44961 7.273760 296.629
## 522 12.53720 0.000000 311.290
## 593 29.41840 0.000000 332.464
## 714 22.55980 0.000000 410.611
## 744 29.94820 0.000000 312.202
## 909 15.19550 0.000000 295.308
## 1109 23.30150 0.000000 291.729
## 1179 25.11570 0.000000 284.106
## 1189 10.65790 0.600762 286.242
## 1190 5.73078 0.000000 294.599
## 1252 11.04650 0.000000 285.176
## 1381 5.72491 0.000000 404.415
# Box-and-whisker plot of LOCAL; points beyond the whiskers are the outliers.
boxplot(churn[["LOCAL"]])