[Part 1].Data-ETL
1-1.取得資料集的維度
# Dimensions of the whole data set, returned as c(rows, columns).
dim(Insurance)
## [1] 64 5
# Dimensions of the first 10 records only.
dim(Insurance[seq_len(10), ])
## [1] 10 5
# Dimensions when only the 2nd, 3rd and 4th variables are kept.
dim(Insurance[, c(2, 3, 4)])
## [1] 64 3
# First element of the dimension vector: the number of rows.
dim(Insurance)[1]
## [1] 64
# Second element of the dimension vector: the number of columns.
dim(Insurance)[2]
## [1] 5
1-2.篩選資料
# Character vector holding the two variable names of interest.
vars <- c("District", "Age")
# Rows 20 to 25 of the District and Age variables.
Insurance[20:25, vars]
## District Age
## 20 2 >35
## 21 2 <25
## 22 2 25-29
## 23 2 30-35
## 24 2 >35
## 25 2 <25
# All variable names of the Insurance data set.
names(Insurance)
## [1] "District" "Group" "Age" "Holders" "Claims"
# Only the first two variable names.
head(names(Insurance), n = 2)
## [1] "District" "Group"
# Only the last two variable names.
tail(names(Insurance), n = 2)
## [1] "Holders" "Claims"
# First few values of the Age variable.
head(Insurance$Age)
## [1] <25 25-29 30-35 >35 <25 25-29
## Levels: <25 < 25-29 < 30-35 < >35
1-3.變數型態
# Class of District: an unordered factor.
class(Insurance[["District"]])
## [1] "factor"
# Class of Age: an ordered factor.
class(Insurance[["Age"]])
## [1] "ordered" "factor"
# Class of Holders: integer.
class(Insurance[["Holders"]])
## [1] "integer"
1-4.修改變數
# The four levels of the ordered factor Age.
levels(Insurance$Age)
## [1] "<25" "25-29" "30-35" ">35"
# The first level only.
levels(Insurance$Age)[1]
## [1] "<25"
# Rename the first level to "young". This changes Insurance itself, so any
# later comparison of Age against the old label "<25" will no longer match.
levels(Insurance$Age)[1] <- "young"
# Look at the first few values of Age again after the rename.
head(Insurance$Age)
## [1] young 25-29 30-35 >35 young 25-29
## Levels: young < 25-29 < 30-35 < >35
1-5.再次判斷變數型態
# Age is a factor, so it is not a character vector.
is.character(Insurance[["Age"]])
## [1] FALSE
# Claims is stored as integer.
class(Insurance[["Claims"]])
## [1] "integer"
# Class of Claims after coercion to numeric. Nothing is assigned here, so the
# Claims column inside Insurance keeps its original type.
class(as.numeric(Insurance[["Claims"]]))
## [1] "numeric"
[Part 3].資料視覺化
3-1.數字化探索
# Variable (column) names of the data set.
names(Insurance)
## [1] "District" "Group" "Age" "Holders" "Claims"
# All attributes of the data frame: column names, row names and class.
attributes(Insurance)
## $names
## [1] "District" "Group" "Age" "Holders" "Claims"
##
## $row.names
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
## [24] 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46
## [47] 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64
##
## $class
## [1] "data.frame"
# Compact structure overview: type and first values of every variable.
# Age already shows "young" as its first level because it was renamed earlier.
str(Insurance)
## 'data.frame': 64 obs. of 5 variables:
## $ District: Factor w/ 4 levels "1","2","3","4": 1 1 1 1 1 1 1 1 1 1 ...
## $ Group : Ord.factor w/ 4 levels "<1l"<"1-1.5l"<..: 1 1 1 1 2 2 2 2 3 3 ...
## $ Age : Ord.factor w/ 4 levels "young"<"25-29"<..: 1 2 3 4 1 2 3 4 1 2 ...
## $ Holders : int 197 264 246 1680 284 536 696 3582 133 286 ...
## $ Claims : int 38 35 20 156 63 84 89 400 19 52 ...
# Per-variable summary: level counts for the factors, and min / quartiles /
# mean / max for the integer variables.
summary(Insurance)
## District Group Age Holders Claims
## 1:16 <1l :16 young:16 Min. : 3.00 Min. : 0.00
## 2:16 1-1.5l:16 25-29:16 1st Qu.: 46.75 1st Qu.: 9.50
## 3:16 1.5-2l:16 30-35:16 Median : 136.00 Median : 22.00
## 4:16 >2l :16 >35 :16 Mean : 364.98 Mean : 49.23
## 3rd Qu.: 327.50 3rd Qu.: 55.50
## Max. :3582.00 Max. :400.00
# describe() is used from the Hmisc package; uncomment the next line to
# install the package first if it is not available.
#install.packages("Hmisc")
library(Hmisc)
# Describe the first three variables (District, Group, Age): number of
# observations, missing values, distinct values and the share of each level.
describe(Insurance[,1:3])
## Insurance[, 1:3]
##
## 3 Variables 64 Observations
## ---------------------------------------------------------------------------
## District
## n missing unique
## 64 0 4
##
## 1 (16, 25%), 2 (16, 25%), 3 (16, 25%), 4 (16, 25%)
## ---------------------------------------------------------------------------
## Group
## n missing unique
## 64 0 4
##
## <1l (16, 25%), 1-1.5l (16, 25%), 1.5-2l (16, 25%)
## >2l (16, 25%)
## ---------------------------------------------------------------------------
## Age
## n missing unique
## 64 0 4
##
## young (16, 25%), 25-29 (16, 25%), 30-35 (16, 25%)
## >35 (16, 25%)
## ---------------------------------------------------------------------------
# Describe the two integer variables (Holders, Claims): counts, mean,
# selected quantiles and the five lowest / highest values.
describe(Insurance[,4:5])
## Insurance[, 4:5]
##
## 2 Variables 64 Observations
## ---------------------------------------------------------------------------
## Holders
## n missing unique Info Mean .05 .10 .25 .50
## 64 0 63 1 365 16.30 24.00 46.75 136.00
## .75 .90 .95
## 327.50 868.90 1639.25
##
## lowest : 3 7 9 16 18, highest: 1635 1640 1680 2443 3582
## ---------------------------------------------------------------------------
## Claims
## n missing unique Info Mean .05 .10 .25 .50
## 64 0 46 1 49.23 3.15 4.30 9.50 22.00
## .75 .90 .95
## 55.50 101.70 182.35
##
## lowest : 0 2 3 4 5, highest: 156 187 233 290 400
## ---------------------------------------------------------------------------
3-2.直方圖 MASS
# Frequency histogram of Claims with the default binning.
hist(Insurance$Claims, main = "Histogram of Freq of Insurance$Claims")
# Density-scaled histogram with shaded bars, with a kernel density curve
# drawn on top of it.
hist(
  Insurance$Claims,
  main = "Histogram of Density of Insurance$Claims",
  density = 20,
  freq = FALSE
)
lines(density(Insurance$Claims))
# Histogram with 20 requested bins and a count label above each bar; str()
# prints the structure of the object that hist() returns.
str(
  hist(
    Insurance$Claims,
    main = "Histogram of Insurance$Claims with 20 bars",
    breaks = 20,
    labels = TRUE,
    border = "white",
    col = "black"
  )
)
## List of 6
## $ breaks : num [1:21] 0 20 40 60 80 100 120 140 160 180 ...
## $ counts : int [1:20] 30 13 5 5 3 2 0 2 0 1 ...
## $ density : num [1:20] 0.02344 0.01016 0.00391 0.00391 0.00234 ...
## $ mids : num [1:20] 10 30 50 70 90 110 130 150 170 190 ...
## $ xname : chr "Insurance$Claims"
## $ equidist: logi TRUE
## - attr(*, "class")= chr "histogram"
3-3.累積分佈圖 Hmisc
# Ecdf() is used from Hmisc; it plots the cumulative distribution of Claims.
library(Hmisc)
Ecdf(
  Insurance$Claims,
  main = "Cumulative Distribution of Claims",
  xlab = "Claims"
)
3-4.箱型圖
# Draw the box plot of Claims and keep the object that boxplot() returns.
Claims_bp <- boxplot(Insurance$Claims, main = "Distribution of Claims")
# The five statistics behind the box: lower whisker, lower hinge, median,
# upper hinge and upper whisker.
Claims_bp[["stats"]]
## [,1]
## [1,] 0
## [2,] 9
## [3,] 22
## [4,] 58
## [5,] 102
## attr(,"class")
##
## "integer"
3-5.條形圖
# Total Claims per Age level, in level order (youngest to oldest).
# tapply() groups by the factor's current levels, so the totals stay correct
# even after the first level has been renamed (e.g. "<25" -> "young").
# Comparing Age against the hard-coded string "<25" matched nothing in that
# case and silently produced a total of 0 for the youngest group.
# as.vector() drops the level names so the result is a plain unnamed vector.
Claims_Age <- with(Insurance, as.vector(tapply(Claims, Age, sum)))
# The bar labels are given explicitly, so they do not depend on level names.
barplot(Claims_Age, names.arg = c("<25", "25-29", "30-35", ">35"),
        density = rep(20, 4),
        main = "Distribution of Age by Claims", xlab = "Age", ylab = "Claims")
# Total Holders per Age level, computed the same way.
Holders_Age <- with(Insurance, as.vector(tapply(Holders, Age, sum)))
Holders_Age
## [1] 1138 2336 3007 16878
# Combine both totals into a 2 x 4 matrix: one row per measure, one column
# per Age level.
data_bar <- rbind(Claims_Age, Holders_Age)
data_bar
## [,1] [,2] [,3] [,4]
## Claims_Age 229 404 453 2065
## Holders_Age 1138 2336 3007 16878
# Grouped bar chart: Claims and Holders side by side for each Age level.
barplot(data_bar, names.arg = c("<25", "25-29", "30-35", ">35"), beside = TRUE,
        main = "Age Distribution by Claims and Holders",
        xlab = "Age", ylab = "Claims&Holders", col = c("black", "darkgrey"))
legend(x = "topleft", rownames(data_bar), fill = c("black", "darkgrey"))
# Stacked bar chart of the same matrix (beside is left at its default).
barplot(data_bar, names.arg = c("<25", "25-29", "30-35", ">35"),
        main = "Age Distribution by Claims and Holders",
        ylab = "Claims&Holders", col = c("black", "darkgrey"))
legend(x = "topleft", rownames(data_bar), fill = c("black", "darkgrey"))
3-6.點陣圖
# Dot chart of the Claims / Holders totals, one plotting symbol per measure.
dotchart(
  data_bar,
  main = "Age Distribution by Claims and Holders",
  xlab = "Claims&Holders",
  pch = 1:2
)
# Write the Age label next to each group of dots. The four labels share the
# same x position and differ only in their y position, so one borderless
# legend is drawn per (label, y) pair, top to bottom.
invisible(Map(
  function(age_label, y_pos) {
    legend(x = 14000, y = y_pos, age_label, bty = "n")
  },
  c("<25", "25-29", "30-35", ">35"),
  c(15, 11, 7, 3)
))
3-7.圓餅圖
# Pie chart of total Claims per Age group, labelled with the group names.
pie(
  Claims_Age,
  labels = c("<25", "25-29", "30-35", ">35"),
  col = c("white", "lightgray", "darkgrey", "black"),
  main = "Pie Chart of Age by Claims"
)
# Share of each Age group in the total Claims, rounded to whole percent.
percent <- round(Claims_Age / sum(Claims_Age) * 100)
# Slice labels of the form "<group> :<percent>%".
label <- paste0(c("<25", "25-29", "30-35", ">35"), " :", percent, "%")
# Same pie chart again, now labelled with group name and percentage.
pie(
  Claims_Age,
  labels = label,
  col = c("white", "lightgray", "darkgrey", "black"),
  main = "Pie Chart of Age by Claims"
)
# pie3D() is used from the plotrix package; uncomment the next line to
# install the package first if it is not available.
#install.packages("plotrix")
library(plotrix)
# 3D pie chart of total Claims per Age group, with slightly separated slices.
pie3D(
  Claims_Age,
  labels = c("<25", "25-29", "30-35", ">35"),
  col = c("white", "lightgray", "darkgrey", "black"),
  main = "3D Pie Chart of Age by Claims",
  explode = 0.05,
  labelcex = 0.8
)