Data Description

[設定所需的函式庫(libraries)以及載入資料]

options(stringsAsFactors = FALSE)
op <- getwd()
setwd("/media/hsusir/DATA/Github/Diabetes-mellitus/")
load("/media/hsusir/DATA/Github/Diabetes-mellitus/h_nhi_opdte95.rda")
install.packages("data.table")
library(data.table)
system.time(load("h_nhi_opdte95.rda"))
##    user  system elapsed 
##  16.528   0.140  16.679
dim(h_nhi_opdte95)
## [1] 960497     44
head(h_nhi_opdte95)
##   fee_ym appl_type appl_date case_type seq_no cure_item_no1 cure_item_no2
## 1 200601         1  20060210        09      1                          19
## 2 200601         1  20060210        04      2            82            14
## 3 200601         1  20060210        09      3                          09
## 4 200601         1  20060210        09      4            02              
## 5 200601         1  20060210        04      5            02              
## 6 200601         1  20060210        09      6            A6            07
##   cure_item_no3 cure_item_no4 func_type func_date treat_end_date birth_ym
## 1                                    02  20060123                  193211
## 2                                    06  20060113                  193602
## 3                                    06  20060101                  195106
## 4                                    02  20060110                  193211
## 5                                    02  20060130                  193211
## 6                                    02  20060116                  193609
##   card_seq_no gave_kind part_no icd9cm_1 icd9cm_2 icd9cm_3 icd_op_code1
## 1        0006         4     C10      401      272      345             
## 2        0002         4     C20      715                               
## 3        0002         4     C20      715      726      721             
## 4        0011         4     C20      401      274      362             
## 5        0005         4     C10      401      200      300             
## 6        0002         4     004      401      305      375             
##   drug_day med_type drug_dot treat_dot treat_code diag_dot dsvc_no
## 1        0        0      848       323     00102B      198  05229B
## 2        7        0      266       539     00102B      206  05229B
## 3        0        0      162       552     00102B      203  05229B
## 4        2        0      778       413     00102B      211        
## 5       15        0      740       355     00102B      201  05229B
## 6       30        0      809       465                 198        
##   dsvc_dot by_pass_code t_dot part_dot t_appl_dot        id id_roc id_s
## 1       50               1419      120       1299 003110863      0    2
## 2       32               1043       80        963 022665874      0    1
## 3       40                957      280        677 009574823      0    2
## 4       51               1453        0       1453 003110863      0    2
## 5       48               1344       80       1264 003110863      0    2
## 6       46               1518      250       1268 008009787      0    1
##     prsn_id prsn_roc prsn_s   phar_id phar_roc phar_s hosp_id city hos
## 1 002139138        0      1 023908719        1      9  010366 4129  01
## 2 010905690        0      1 003791070        0      1  010366 4129  01
## 3 010905690        0      1 003791070        0      1  010366 4129  01
## 4 002139138        0      1 003791070        0      1  010366 4129  01
## 5 002139138        0      1 003791070        0      1  010366 4129  01
## 6 002139138        0      1 003791070        0      1  010366 4129  01
names(h_nhi_opdte95)
##  [1] "fee_ym"         "appl_type"      "appl_date"      "case_type"     
##  [5] "seq_no"         "cure_item_no1"  "cure_item_no2"  "cure_item_no3" 
##  [9] "cure_item_no4"  "func_type"      "func_date"      "treat_end_date"
## [13] "birth_ym"       "card_seq_no"    "gave_kind"      "part_no"       
## [17] "icd9cm_1"       "icd9cm_2"       "icd9cm_3"       "icd_op_code1"  
## [21] "drug_day"       "med_type"       "drug_dot"       "treat_dot"     
## [25] "treat_code"     "diag_dot"       "dsvc_no"        "dsvc_dot"      
## [29] "by_pass_code"   "t_dot"          "part_dot"       "t_appl_dot"    
## [33] "id"             "id_roc"         "id_s"           "prsn_id"       
## [37] "prsn_roc"       "prsn_s"         "phar_id"        "phar_roc"      
## [41] "phar_s"         "hosp_id"        "city"           "hos"
temp1 <- data.table(h_nhi_opdte95)
setDT(temp1)[substr(icd9cm_1,1,3) %in% "250" | 
             substr(icd9cm_2,1,3) %in% "250" |
             substr(icd9cm_3,1,3) %in% "250", dm:=1]; gc()
##            used  (Mb) gc trigger   (Mb)  max used   (Mb)
## Ncells  1498025  80.1    2637877  140.9   1550641   82.9
## Vcells 87786267 669.8  148216463 1130.9 140315423 1070.6
table(temp1$dm)
## 
##     1 
## 45148
ttt <- temp1[!duplicated( temp1[, list(temp1$id) ])==T,]
setnames(temp1,"func_date","dm_date")

dm1 <- subset(temp1, dm==1, select=c("id", "dm_date"))
dm2 <- dm1[order(dm1$id, dm1$dm_date), ]
dm3 <- dm2[!duplicated( dm2[, list(dm2$id) ])==T,]
dim(dm3) 
## [1] 8501    2

[第一部份].資料整理

dt1 <- data.table(h_nhi_opdte95, key="id")
dt2 <- data.table(dm3, key="id")
temp2 <- merge(dt1, dt2, all = TRUE)
temp2[, dm:=1]
setDT(temp2)[is.na(dm_date), dm:=0]
table(temp2$dm)
## 
##      0      1 
## 748726 211771
temp2$age <- as.numeric(as.Date("20060101", "%Y%m%d") - as.Date(paste0(temp2$birth_ym, "15"), "%Y%m%d") + 1) / 365.25
temp2[1:20, names(temp2) %in% c("id", "birth_ym", "age"), with = FALSE]
##            id birth_ym      age
##  1: 000000837   200305  2.63655
##  2: 000000837   200305  2.63655
##  3: 000000837   200305  2.63655
##  4: 000000837   200305  2.63655
##  5: 000000837   200305  2.63655
##  6: 000000960   198604 19.71800
##  7: 000000960   198604 19.71800
##  8: 000000960   198604 19.71800
##  9: 000001326   195506 50.55168
## 10: 000001326   195506 50.55168
## 11: 000001326   195506 50.55168
## 12: 000001326   195506 50.55168
## 13: 000001326   195506 50.55168
## 14: 000001326   195506 50.55168
## 15: 000001326   195506 50.55168
## 16: 000001370   199710  8.21629
## 17: 000001370   199710  8.21629
## 18: 000001370   199710  8.21629
## 19: 000001370   199710  8.21629
## 20: 000001370   199710  8.21629
summary(temp2$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   -0.95   21.63   43.80   41.65   61.72  101.40
temp2$agegp <- as.numeric(cut(temp2$age, breaks = c(-Inf, 20, 65, Inf), labels=1:3), right = F)
# temp2$agegp <- ifelse(temp2$age<20, 1, ifelse(20<=temp2$age & temp2$age<65, 2, 3))
table(temp2$agegp)
## 
##      1      2      3 
## 227139 531980 201378
# count the number of instances of an attribute each patient had in their record
mydata <- temp2[order(temp2$id, temp2$func_date), ]
mydata[ , ':='( count = .N , idx = 1:.N ) , by = id ]
# mydata[1:10, c("id", "func_date", "idx", "count"), with = FALSE]
dm <- mydata[!duplicated( mydata[, list(mydata$id) ])==T,]
# dm[1:10, c("id", "func_date", "idx", "count"), with = FALSE]
dim(dm)
## [1] 82544    50

[第二部份].基本糖尿病資訊

2-1.診斷代碼(欄位序號17、18、19)之 ICD9CM 的前三碼為250 (糖尿病)的病患人數,共有幾人(去重覆)

table(dm$dm) # for Q1
## 
##     0     1 
## 74043  8501

2-2.以2006年1月1日為基準日,計算所有人的年齡,並分成年齡層< 20yr, 20-65yr,>=65yr等3組,各有幾人(去重覆))

table(dm$agegp) # for Q2
## 
##     1     2     3 
## 22596 51810  8138

2-3.男性與女性各有多少人有糖尿病(去重覆))

table(dm$dm, dm$id_s) # for Q3
##    
##         1     2     9
##   0 35301 37158  1584
##   1  3939  4515    47

[第三部份].去除性別不明者、年齡<=0歲(2006年1月1日後出生)及重覆後,有糖尿病診斷的人與沒有糖尿病診斷的人 (小數位數四捨五入後兩位))

3-1.各是多少人?

dmsub <- as.data.frame(subset(dm, (id_s =="1" | id_s == "2") & (age > 0)))
table(dmsub$dm)
## 
##     0     1 
## 71835  8452

3-2.男性與女性各有多少人及其比例各是多少?有沒有差異?

(mytable1 <- table(dmsub$id_s, dmsub$dm))
##    
##         0     1
##   1 34981  3938
##   2 36854  4514
round(prop.table(mytable1, 2)*100, 2)
##    
##         0     1
##   1 48.70 46.59
##   2 51.30 53.41
# prop.table(mytable1, 2)*100
summary(mytable1)
## Number of cases in table: 80287 
## Number of factors: 2 
## Test for independence of all factors:
##  Chisq = 13.401, df = 1, p-value = 0.0002516

3-3.年齡有沒有差異?

# dstats <- function(x)(c(mean=mean(x), sd=sd(x), 
#                         median=median(x), quantile(x, probs = 0.25), quantile(x, probs = 0.75)))
# xx <- by(dmsub$age, dmsub$dm, dstats)
# xx[] <- lapply(xx, round, 2)
# xx

dstats <- function(x)(as.numeric(format(c(mean=mean(x), sd=sd(x), 
                        median=median(x), quantile(x, probs = 0.25), quantile(x, probs = 0.75)), digits=4)))
by(dmsub$age, dmsub$dm, dstats) 
## dmsub$dm: 0
## [1] 33.14 20.18 31.47 16.55 47.22
## -------------------------------------------------------- 
## dmsub$dm: 1
## [1] 55.61 17.05 56.22 45.30 68.38
aggregate(dmsub$age, by=list(dm=dmsub$dm), dstats)
##   dm   x.1   x.2   x.3   x.4   x.5
## 1  0 33.14 20.18 31.47 16.55 47.22
## 2  1 55.61 17.05 56.22 45.30 68.38
var.test(age ~ dm, data=dmsub)
## 
##  F test to compare two variances
## 
## data:  age by dm
## F = 1.4011, num df = 71834, denom df = 8451, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  1.356837 1.446159
## sample estimates:
## ratio of variances 
##           1.401071
# t.test(age ~ dm, data=dmsub, var.equal=T) 
t.test(age ~ dm, data=dmsub) 
## 
##  Welch Two Sample t-test
## 
## data:  age by dm
## t = -112.31, df = 11430, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -22.86570 -22.08121
## sample estimates:
## mean in group 0 mean in group 1 
##        33.14029        55.61374
wilcox.test(age ~ dm, data=dmsub)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  age by dm
## W = 120980000, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0

3-4.年齡層各有多少人及其比例各是多少?有沒有差異?

mytable2 <- table(dmsub$agegp, dmsub$dm)
round(prop.table(mytable2, 2)*100, 2)
##    
##         0     1
##   1 30.15  3.03
##   2 62.27 65.20
##   3  7.58 31.77
summary(mytable2)
## Number of cases in table: 80287 
## Number of factors: 2 
## Test for independence of all factors:
##  Chisq = 6415, df = 2, p-value = 0

3-5.全年的就醫次數平均、標準差各是多少?次數有沒有差異?

dstats <- function(x)(c(mean=mean(x), sd=sd(x), 
                        median=median(x), quantile(x, probs = 0.25), quantile(x, probs = 0.75)))
xx <- by(dmsub$count, dmsub$dm, dstats)
xx[] <- lapply(xx, round, 2)
xx
## dmsub$dm: 0
##   mean     sd median    25%    75% 
##  10.23  11.04   7.00   3.00  13.00 
## -------------------------------------------------------- 
## dmsub$dm: 1
##   mean     sd median    25%    75% 
##  24.95  18.65  20.00  13.00  32.00
var.test(count ~ dm, data=dmsub)
## 
##  F test to compare two variances
## 
## data:  count by dm
## F = 0.35014, num df = 71834, denom df = 8451, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.3390812 0.3614034
## sample estimates:
## ratio of variances 
##          0.3501356
# t.test(count ~ dm, data=dmsub, var.equal=T) 
t.test(count ~ dm, data=dmsub) 
## 
##  Welch Two Sample t-test
## 
## data:  count by dm
## t = -71.136, df = 9159.8, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -15.13254 -14.32091
## sample estimates:
## mean in group 0 mean in group 1 
##        10.22713        24.95386
wilcox.test(count ~ dm, data=dmsub)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  count by dm
## W = 114500000, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0