Data Description
- 2006 (95)年的門急診資料
- 診斷代碼(欄位序號17、18、19)之 ICD9CM 的前三碼為250 (糖尿病)的病患人數
[設定所需的函式庫(libraries)以及載入資料]
options(stringsAsFactors = FALSE)
op <- getwd()
setwd("/media/hsusir/DATA/Github/Diabetes-mellitus/")
load("/media/hsusir/DATA/Github/Diabetes-mellitus/h_nhi_opdte95.rda")
install.packages("data.table")
library(data.table)
system.time(load("h_nhi_opdte95.rda"))
## user system elapsed
## 16.528 0.140 16.679
dim(h_nhi_opdte95)
## [1] 960497 44
head(h_nhi_opdte95)
## fee_ym appl_type appl_date case_type seq_no cure_item_no1 cure_item_no2
## 1 200601 1 20060210 09 1 19
## 2 200601 1 20060210 04 2 82 14
## 3 200601 1 20060210 09 3 09
## 4 200601 1 20060210 09 4 02
## 5 200601 1 20060210 04 5 02
## 6 200601 1 20060210 09 6 A6 07
## cure_item_no3 cure_item_no4 func_type func_date treat_end_date birth_ym
## 1 02 20060123 193211
## 2 06 20060113 193602
## 3 06 20060101 195106
## 4 02 20060110 193211
## 5 02 20060130 193211
## 6 02 20060116 193609
## card_seq_no gave_kind part_no icd9cm_1 icd9cm_2 icd9cm_3 icd_op_code1
## 1 0006 4 C10 401 272 345
## 2 0002 4 C20 715
## 3 0002 4 C20 715 726 721
## 4 0011 4 C20 401 274 362
## 5 0005 4 C10 401 200 300
## 6 0002 4 004 401 305 375
## drug_day med_type drug_dot treat_dot treat_code diag_dot dsvc_no
## 1 0 0 848 323 00102B 198 05229B
## 2 7 0 266 539 00102B 206 05229B
## 3 0 0 162 552 00102B 203 05229B
## 4 2 0 778 413 00102B 211
## 5 15 0 740 355 00102B 201 05229B
## 6 30 0 809 465 198
## dsvc_dot by_pass_code t_dot part_dot t_appl_dot id id_roc id_s
## 1 50 1419 120 1299 003110863 0 2
## 2 32 1043 80 963 022665874 0 1
## 3 40 957 280 677 009574823 0 2
## 4 51 1453 0 1453 003110863 0 2
## 5 48 1344 80 1264 003110863 0 2
## 6 46 1518 250 1268 008009787 0 1
## prsn_id prsn_roc prsn_s phar_id phar_roc phar_s hosp_id city hos
## 1 002139138 0 1 023908719 1 9 010366 4129 01
## 2 010905690 0 1 003791070 0 1 010366 4129 01
## 3 010905690 0 1 003791070 0 1 010366 4129 01
## 4 002139138 0 1 003791070 0 1 010366 4129 01
## 5 002139138 0 1 003791070 0 1 010366 4129 01
## 6 002139138 0 1 003791070 0 1 010366 4129 01
names(h_nhi_opdte95)
## [1] "fee_ym" "appl_type" "appl_date" "case_type"
## [5] "seq_no" "cure_item_no1" "cure_item_no2" "cure_item_no3"
## [9] "cure_item_no4" "func_type" "func_date" "treat_end_date"
## [13] "birth_ym" "card_seq_no" "gave_kind" "part_no"
## [17] "icd9cm_1" "icd9cm_2" "icd9cm_3" "icd_op_code1"
## [21] "drug_day" "med_type" "drug_dot" "treat_dot"
## [25] "treat_code" "diag_dot" "dsvc_no" "dsvc_dot"
## [29] "by_pass_code" "t_dot" "part_dot" "t_appl_dot"
## [33] "id" "id_roc" "id_s" "prsn_id"
## [37] "prsn_roc" "prsn_s" "phar_id" "phar_roc"
## [41] "phar_s" "hosp_id" "city" "hos"
temp1 <- data.table(h_nhi_opdte95)
setDT(temp1)[substr(icd9cm_1,1,3) %in% "250" |
substr(icd9cm_2,1,3) %in% "250" |
substr(icd9cm_3,1,3) %in% "250", dm:=1]; gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 1498025 80.1 2637877 140.9 1550641 82.9
## Vcells 87786267 669.8 148216463 1130.9 140315423 1070.6
table(temp1$dm)
##
## 1
## 45148
ttt <- temp1[!duplicated( temp1[, list(temp1$id) ])==T,]
setnames(temp1,"func_date","dm_date")
dm1 <- subset(temp1, dm==1, select=c("id", "dm_date"))
dm2 <- dm1[order(dm1$id, dm1$dm_date), ]
dm3 <- dm2[!duplicated( dm2[, list(dm2$id) ])==T,]
dim(dm3)
## [1] 8501 2
[第一部份].資料整理
dt1 <- data.table(h_nhi_opdte95, key="id")
dt2 <- data.table(dm3, key="id")
temp2 <- merge(dt1, dt2, all = TRUE)
temp2[, dm:=1]
setDT(temp2)[is.na(dm_date), dm:=0]
table(temp2$dm)
##
## 0 1
## 748726 211771
temp2$age <- as.numeric(as.Date("20060101", "%Y%m%d") - as.Date(paste0(temp2$birth_ym, "15"), "%Y%m%d") + 1) / 365.25
temp2[1:20, names(temp2) %in% c("id", "birth_ym", "age"), with = FALSE]
## id birth_ym age
## 1: 000000837 200305 2.63655
## 2: 000000837 200305 2.63655
## 3: 000000837 200305 2.63655
## 4: 000000837 200305 2.63655
## 5: 000000837 200305 2.63655
## 6: 000000960 198604 19.71800
## 7: 000000960 198604 19.71800
## 8: 000000960 198604 19.71800
## 9: 000001326 195506 50.55168
## 10: 000001326 195506 50.55168
## 11: 000001326 195506 50.55168
## 12: 000001326 195506 50.55168
## 13: 000001326 195506 50.55168
## 14: 000001326 195506 50.55168
## 15: 000001326 195506 50.55168
## 16: 000001370 199710 8.21629
## 17: 000001370 199710 8.21629
## 18: 000001370 199710 8.21629
## 19: 000001370 199710 8.21629
## 20: 000001370 199710 8.21629
summary(temp2$age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -0.95 21.63 43.80 41.65 61.72 101.40
temp2$agegp <- as.numeric(cut(temp2$age, breaks = c(-Inf, 20, 65, Inf), labels=1:3), right = F)
# temp2$agegp <- ifelse(temp2$age<20, 1, ifelse(20<=temp2$age & temp2$age<65, 2, 3))
table(temp2$agegp)
##
## 1 2 3
## 227139 531980 201378
# count the number of instances of an attribute each patient had in their record
mydata <- temp2[order(temp2$id, temp2$func_date), ]
mydata[ , ':='( count = .N , idx = 1:.N ) , by = id ]
# mydata[1:10, c("id", "func_date", "idx", "count"), with = FALSE]
dm <- mydata[!duplicated( mydata[, list(mydata$id) ])==T,]
# dm[1:10, c("id", "func_date", "idx", "count"), with = FALSE]
dim(dm)
## [1] 82544 50
[第三部份].去除性別不明者、年齡<=0歲(2006年1月1日後出生)及重覆後,有糖尿病診斷的人與沒有糖尿病診斷的人 (小數位數四捨五入後兩位))
3-1.各是多少人?
dmsub <- as.data.frame(subset(dm, (id_s =="1" | id_s == "2") & (age > 0)))
table(dmsub$dm)
##
## 0 1
## 71835 8452
3-2.男性與女性各有多少人及其比例各是多少?有沒有差異?
(mytable1 <- table(dmsub$id_s, dmsub$dm))
##
## 0 1
## 1 34981 3938
## 2 36854 4514
round(prop.table(mytable1, 2)*100, 2)
##
## 0 1
## 1 48.70 46.59
## 2 51.30 53.41
# prop.table(mytable1, 2)*100
summary(mytable1)
## Number of cases in table: 80287
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 13.401, df = 1, p-value = 0.0002516
3-3.年齡有沒有差異?
# dstats <- function(x)(c(mean=mean(x), sd=sd(x),
# median=median(x), quantile(x, probs = 0.25), quantile(x, probs = 0.75)))
# xx <- by(dmsub$age, dmsub$dm, dstats)
# xx[] <- lapply(xx, round, 2)
# xx
dstats <- function(x)(as.numeric(format(c(mean=mean(x), sd=sd(x),
median=median(x), quantile(x, probs = 0.25), quantile(x, probs = 0.75)), digits=4)))
by(dmsub$age, dmsub$dm, dstats)
## dmsub$dm: 0
## [1] 33.14 20.18 31.47 16.55 47.22
## --------------------------------------------------------
## dmsub$dm: 1
## [1] 55.61 17.05 56.22 45.30 68.38
aggregate(dmsub$age, by=list(dm=dmsub$dm), dstats)
## dm x.1 x.2 x.3 x.4 x.5
## 1 0 33.14 20.18 31.47 16.55 47.22
## 2 1 55.61 17.05 56.22 45.30 68.38
var.test(age ~ dm, data=dmsub)
##
## F test to compare two variances
##
## data: age by dm
## F = 1.4011, num df = 71834, denom df = 8451, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 1.356837 1.446159
## sample estimates:
## ratio of variances
## 1.401071
# t.test(age ~ dm, data=dmsub, var.equal=T)
t.test(age ~ dm, data=dmsub)
##
## Welch Two Sample t-test
##
## data: age by dm
## t = -112.31, df = 11430, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -22.86570 -22.08121
## sample estimates:
## mean in group 0 mean in group 1
## 33.14029 55.61374
wilcox.test(age ~ dm, data=dmsub)
##
## Wilcoxon rank sum test with continuity correction
##
## data: age by dm
## W = 120980000, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0
3-4.年齡層各有多少人及其比例各是多少?有沒有差異?
mytable2 <- table(dmsub$agegp, dmsub$dm)
round(prop.table(mytable2, 2)*100, 2)
##
## 0 1
## 1 30.15 3.03
## 2 62.27 65.20
## 3 7.58 31.77
summary(mytable2)
## Number of cases in table: 80287
## Number of factors: 2
## Test for independence of all factors:
## Chisq = 6415, df = 2, p-value = 0
3-5.全年的就醫次數平均、標準差各是多少?次數有沒有差異?
dstats <- function(x)(c(mean=mean(x), sd=sd(x),
median=median(x), quantile(x, probs = 0.25), quantile(x, probs = 0.75)))
xx <- by(dmsub$count, dmsub$dm, dstats)
xx[] <- lapply(xx, round, 2)
xx
## dmsub$dm: 0
## mean sd median 25% 75%
## 10.23 11.04 7.00 3.00 13.00
## --------------------------------------------------------
## dmsub$dm: 1
## mean sd median 25% 75%
## 24.95 18.65 20.00 13.00 32.00
var.test(count ~ dm, data=dmsub)
##
## F test to compare two variances
##
## data: count by dm
## F = 0.35014, num df = 71834, denom df = 8451, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.3390812 0.3614034
## sample estimates:
## ratio of variances
## 0.3501356
# t.test(count ~ dm, data=dmsub, var.equal=T)
t.test(count ~ dm, data=dmsub)
##
## Welch Two Sample t-test
##
## data: count by dm
## t = -71.136, df = 9159.8, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
## -15.13254 -14.32091
## sample estimates:
## mean in group 0 mean in group 1
## 10.22713 24.95386
wilcox.test(count ~ dm, data=dmsub)
##
## Wilcoxon rank sum test with continuity correction
##
## data: count by dm
## W = 114500000, p-value < 2.2e-16
## alternative hypothesis: true location shift is not equal to 0