資料說明

1.本單元主題僅在介紹基本指令,以及變數的轉換

2.Insurance是某保險公司在1973年第三季車險投保人資訊,共有五個變數

- District 投保人家庭住址區域(因子,取值 1–4)

- Group 汽車排氣量分組(有序因子:<1l、1-1.5l、1.5-2l、>2l)

- Age 投保人年齡分組(有序因子:<25、25-29、30-35、>35)

- Holders 投保人數量

- Claims 索賠件數(投保人提出索賠的數量)


[設定所需的函式庫(libraries)以及載入資料]

# Project directory used on the original author's machine. Nothing visible in
# this script reads or writes files, so the directory is entered only when it
# exists; on any other machine the script keeps running in the current
# directory instead of aborting with "cannot change working directory".
project_dir <- "/home/m600/Working Area/Rdata Practice/Customer Course/Insurance"
if (dir.exists(project_dir)) {
  setwd(project_dir)
}
library(MASS)    # package that ships the Insurance data set
data(Insurance)  # copy the Insurance data frame into the workspace

[Part 1].Data-ETL

1-1.取得資料集的維度

# dim() returns c(number of rows, number of columns).
dim(Insurance)                 # the whole data set
## [1] 64  5
dim(Insurance[seq_len(10), ])  # only the first 10 observations
## [1] 10  5
dim(Insurance[, 2:4])          # only the 2nd, 3rd and 4th variables
## [1] 64  3
dim(Insurance)[1L]             # first element of the dimension vector: rows
## [1] 64
dim(Insurance)[2L]             # second element of the dimension vector: columns
## [1] 5

1-2.篩選資料

# Character vector naming the two columns to keep.
vars <- c("District", "Age")
# Rows 20 to 25 of the District and Age columns.
Insurance[20:25, vars]
##    District   Age
## 20        2   >35
## 21        2   <25
## 22        2 25-29
## 23        2 30-35
## 24        2   >35
## 25        2   <25
# Names of all variables in the data set.
names(Insurance)
## [1] "District" "Group"    "Age"      "Holders"  "Claims"
# Only the first two variable names.
head(names(Insurance), n = 2)
## [1] "District" "Group"
# Only the last two variable names.
tail(names(Insurance), n = 2)
## [1] "Holders" "Claims"
# First few values of Age (head() shows six by default).
head(Insurance$Age)
## [1] <25   25-29 30-35 >35   <25   25-29
## Levels: <25 < 25-29 < 30-35 < >35

1-3.變數型態

# District is an unordered factor.
class(Insurance$District)
## [1] "factor"
# Age is an ordered factor, so it carries two classes.
class(Insurance$Age)
## [1] "ordered" "factor"
# Holders is stored as integer counts.
class(Insurance$Holders)
## [1] "integer"

1-4.修改變數

# The four levels of Age, in order.
levels(Insurance$Age)
## [1] "<25"   "25-29" "30-35" ">35"
# Only the first level.
levels(Insurance$Age)[1]
## [1] "<25"
# Relabel the first level as "young"; every observation in that level follows.
levels(Insurance$Age)[1] <- "young"
# Check the first few values of Age after the relabelling.
head(Insurance$Age)
## [1] young 25-29 30-35 >35   young 25-29
## Levels: young < 25-29 < 30-35 < >35

1-5.再次判斷變數型態

# Age is a factor, not a character vector.
is.character(Insurance$Age)
## [1] FALSE
# Claims is stored as integer.
class(Insurance$Claims)
## [1] "integer"
# as.numeric() returns a converted copy; Insurance$Claims itself is unchanged.
class(as.numeric(Insurance$Claims))
## [1] "numeric"

[Part 3].資料視覺化

3-1.數字化探索

# Column names of the data frame.
names(Insurance)
## [1] "District" "Group"    "Age"      "Holders"  "Claims"
# All attributes of the data frame: column names, row names and class.
attributes(Insurance)
## $names
## [1] "District" "Group"    "Age"      "Holders"  "Claims"  
## 
## $row.names
##  [1]  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [24] 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
## [47] 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
## 
## $class
## [1] "data.frame"
# Compact structure: type and first values of every column.
str(Insurance)
## 'data.frame':    64 obs. of  5 variables:
##  $ District: Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Group   : Ord.factor w/ 4 levels "<1l"<"1-1.5l"<..: 1 1 1 1 2 2 2 2 3 3 ...
##  $ Age     : Ord.factor w/ 4 levels "young"<"25-29"<..: 1 2 3 4 1 2 3 4 1 2 ...
##  $ Holders : int  197 264 246 1680 284 536 696 3582 133 286 ...
##  $ Claims  : int  38 35 20 156 63 84 89 400 19 52 ...
# Per-column summary: level counts for factors, quartiles and mean for integers.
summary(Insurance)
##  District    Group       Age        Holders            Claims      
##  1:16     <1l   :16   young:16   Min.   :   3.00   Min.   :  0.00  
##  2:16     1-1.5l:16   25-29:16   1st Qu.:  46.75   1st Qu.:  9.50  
##  3:16     1.5-2l:16   30-35:16   Median : 136.00   Median : 22.00  
##  4:16     >2l   :16   >35  :16   Mean   : 364.98   Mean   : 49.23  
##                                  3rd Qu.: 327.50   3rd Qu.: 55.50  
##                                  Max.   :3582.00   Max.   :400.00
# install.packages("Hmisc")  # run once if Hmisc is not installed
library(Hmisc)
# Describe the three factor columns (District, Group, Age). The printed header
# echoes the call text, so the expression is kept exactly as written.
describe(Insurance[,1:3])
## Insurance[, 1:3] 
## 
##  3  Variables      64  Observations
## ---------------------------------------------------------------------------
## District 
##       n missing  unique 
##      64       0       4 
## 
## 1 (16, 25%), 2 (16, 25%), 3 (16, 25%), 4 (16, 25%) 
## ---------------------------------------------------------------------------
## Group 
##       n missing  unique 
##      64       0       4 
## 
## <1l (16, 25%), 1-1.5l (16, 25%), 1.5-2l (16, 25%) 
## >2l (16, 25%) 
## ---------------------------------------------------------------------------
## Age 
##       n missing  unique 
##      64       0       4 
## 
## young (16, 25%), 25-29 (16, 25%), 30-35 (16, 25%) 
## >35 (16, 25%) 
## ---------------------------------------------------------------------------
# Describe the two integer columns (Holders, Claims): quantiles and extremes.
describe(Insurance[,4:5])
## Insurance[, 4:5] 
## 
##  2  Variables      64  Observations
## ---------------------------------------------------------------------------
## Holders 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##      64       0      63       1     365   16.30   24.00   46.75  136.00 
##     .75     .90     .95 
##  327.50  868.90 1639.25 
## 
## lowest :    3    7    9   16   18, highest: 1635 1640 1680 2443 3582 
## ---------------------------------------------------------------------------
## Claims 
##       n missing  unique    Info    Mean     .05     .10     .25     .50 
##      64       0      46       1   49.23    3.15    4.30    9.50   22.00 
##     .75     .90     .95 
##   55.50  101.70  182.35 
## 
## lowest :   0   2   3   4   5, highest: 156 187 233 290 400 
## ---------------------------------------------------------------------------

3-2.直方圖 MASS

# Frequency histogram of Claims with default breaks.
hist(
  Insurance$Claims,
  main = "Histogram of Freq of Insurance$Claims"
)

# Density-scaled histogram with shaded bars, overlaid with a kernel density
# estimate of the same variable.
hist(
  Insurance$Claims,
  freq = FALSE,
  density = 20,
  main = "Histogram of Density of Insurance$Claims"
)
lines(density(Insurance$Claims))

# Histogram with about 20 bins and count labels; str() shows the structure of
# the "histogram" object that hist() returns.
str(
  hist(
    Insurance$Claims,
    breaks = 20,
    labels = TRUE,
    col = "black",
    border = "white",
    main = "Histogram of Insurance$Claims with 20 bars"
  )
)

## List of 6
##  $ breaks  : num [1:21] 0 20 40 60 80 100 120 140 160 180 ...
##  $ counts  : int [1:20] 30 13 5 5 3 2 0 2 0 1 ...
##  $ density : num [1:20] 0.02344 0.01016 0.00391 0.00391 0.00234 ...
##  $ mids    : num [1:20] 10 30 50 70 90 110 130 150 170 190 ...
##  $ xname   : chr "Insurance$Claims"
##  $ equidist: logi TRUE
##  - attr(*, "class")= chr "histogram"

3-3.累積分佈圖 Hmisc

library(Hmisc)  # provides Ecdf()

# Empirical cumulative distribution of Claims.
Ecdf(
  Insurance$Claims,
  xlab = "Claims",
  main = "Cumulative Distribution of Claims"
)

3-4.箱型圖

# Draw the box plot and keep the object boxplot() returns.
Claims_bp <- boxplot(Insurance$Claims, main = "Distribution of Claims")

# Five-number summary used to draw the box: lower whisker, lower hinge,
# median, upper hinge, upper whisker.
Claims_bp[["stats"]]
##      [,1]
## [1,]    0
## [2,]    9
## [3,]   22
## [4,]   58
## [5,]  102
## attr(,"class")
##           
## "integer"

3-5.條形圖

# Total claims per age group. tapply() sums within each level of Age in level
# order (youngest group first), so the result does not depend on the level
# labels. Matching on the literal "<25" returned 0 for the first group, because
# that level was relabelled "young" in section 1-4.
Claims_Age <- as.vector(tapply(Insurance$Claims, Insurance$Age, sum))

# Shaded bar chart of total claims by age group.
barplot(Claims_Age, names.arg = c("<25", "25-29", "30-35", ">35"),
        density = rep(20, 4),
        main = "Distribution of Age by Claims", xlab = "Age", ylab = "Claims")

# Total policyholders per age group, computed the same way.
Holders_Age <- as.vector(tapply(Insurance$Holders, Insurance$Age, sum))
Holders_Age
## [1]  1138  2336  3007 16878
# One row per measure, one column per age group.
data_bar <- rbind(Claims_Age, Holders_Age)
data_bar
##             [,1] [,2] [,3]  [,4]
## Claims_Age   229  404  453  2065
## Holders_Age 1138 2336 3007 16878
# Grouped bars: claims and holders side by side within each age group.
barplot(data_bar, names.arg = c("<25", "25-29", "30-35", ">35"), beside = TRUE,
        main = "Age Distribution by Claims and Holders",
        xlab = "Age", ylab = "Claims&Holders", col = c("black", "darkgrey"))
legend(x = "topleft", rownames(data_bar), fill = c("black", "darkgrey"))

# Stacked bars: claims and holders stacked within each age group.
barplot(data_bar, names.arg = c("<25", "25-29", "30-35", ">35"),
        main = "Age Distribution by Claims and Holders",
        ylab = "Claims&Holders", col = c("black", "darkgrey"))
legend(x = "topleft", rownames(data_bar), fill = c("black", "darkgrey"))

3-6.點圖(dot chart)

# Dot chart of claims and holders, one panel per column of data_bar.
dotchart(
  data_bar,
  xlab = "Claims&Holders",
  pch = 1:2,
  main = "Age Distribution by Claims and Holders"
)

# Label each panel with its age group: one box-less legend per label, placed at
# fixed plot coordinates (x = 14000; y = 15, 11, 7, 3 from top to bottom).
invisible(Map(
  function(y_pos, age_label) legend(x = 14000, y = y_pos, age_label, bty = "n"),
  c(15, 11, 7, 3),
  c("<25", "25-29", "30-35", ">35")
))

3-7.圓餅圖

# Plain pie chart of total claims by age group.
pie(
  Claims_Age,
  labels = c("<25", "25-29", "30-35", ">35"),
  main = "Pie Chart of Age by Claims",
  col = c("white", "lightgray", "darkgrey", "black")
)

# Same chart with each slice labelled "<group> :<percent>%".
percent <- round(Claims_Age / sum(Claims_Age) * 100)
label <- paste0(paste(c("<25", "25-29", "30-35", ">35"), ":"), percent, "%")
pie(
  Claims_Age,
  labels = label,
  main = "Pie Chart of Age by Claims",
  col = c("white", "lightgray", "darkgrey", "black")
)

# install.packages("plotrix")  # run once if plotrix is not installed
library(plotrix)

# 3D pie chart with slightly exploded slices.
pie3D(
  Claims_Age,
  labels = c("<25", "25-29", "30-35", ">35"),
  explode = 0.05,
  main = "3D Pie Chart of Age by Claims",
  labelcex = 0.8,
  col = c("white", "lightgray", "darkgrey", "black")
)