R語言程式設計導論

林嶔 (Lin, Chin)

Lesson 8 進階驗證篩選與外部套件應用

第一節:進階驗證篩選(1)

– 我們再使用CKD門診衛教計劃的範例資料,請在這裡下載範例資料。

dat = read.csv("validated_example.csv", header = TRUE, fileEncoding = 'CP950') 
head(dat)
##   Patient       Date MDRD.GFR Stage   WBC  RBC  HB  Hct  MCV Urea.Nitrogen
## 1    1472  2011/5/31     15.9     4 10.10 2.76 8.7 27.0 98.0            75
## 2    1472   2011/8/8     24.0     4  5.79 1.80 5.7 17.9 99.4            41
## 3    1472 2011/10/17     22.7     4  5.20 2.44 7.8 23.0 94.3            49
## 4    1472  2012/1/10     18.7     4 12.99 2.95 9.4 28.2 95.6            46
## 5    1472  2013/5/14     30.4     3  6.78 3.02 9.7 30.0 99.3            48
## 6    1472  2013/8/22     26.8     4  6.88 3.73 8.6 26.0 95.2            82
##   Creatinine Uric.Acid  Na   K Albumin
## 1        3.0       3.9 139 3.3     4.2
## 2        2.1       4.8 143 4.6     4.2
## 3        2.2       4.7 146 4.8     4.5
## 4        2.6       4.0 141 4.2     4.6
## 5        1.7       3.9 141 4.4     4.1
## 6        1.9       3.0 138 4.1     4.3

– 我們先進行一個前處理,刪掉日期不符的變項

dat$Date = as.Date(dat[,"Date"])
dat = dat[!dat$Date %in% NA,]
levels.Patient = levels(as.factor(dat$Patient))

第一節:進階驗證篩選(2)

– 首先,我們還是先創造一個新變項

dat$Wrong.Date_interval = NA

– 我們選擇第5個人

i = 5
dat[dat$Patient==levels.Patient[i],c("Patient", "Date", "Wrong.Date_interval")]
##    Patient       Date Wrong.Date_interval
## 41     566 2008-03-21                  NA
## 42     566 2008-05-17                  NA
## 43     566 2008-06-10                  NA
## 44     566 2008-06-16                  NA
## 46     566 2008-07-12                  NA
i = 5
dat[dat$Patient==levels.Patient[i],"Wrong.Date_interval"][1] = FALSE
dat[dat$Patient==levels.Patient[i],c("Patient", "Date", "Wrong.Date_interval")]
##    Patient       Date Wrong.Date_interval
## 41     566 2008-03-21               FALSE
## 42     566 2008-05-17                  NA
## 43     566 2008-06-10                  NA
## 44     566 2008-06-16                  NA
## 46     566 2008-07-12                  NA

第一節:進階驗證篩選(3)

– 值得注意的是,這位病患在『2008-03-21』、『2008-05-17』、『2008-06-10』這三天分別被申報,其間格分別是27與65,雖然『2008-05-17』必須被核刪,但考慮到刪除這天後,『2008-03-21』與『2008-06-10』就相距81天,這是一個可以接受的日期,所以我們必須想一下,該怎麼解決這件事。

– 記得避免遺漏值!

i = 5
dat[dat$Patient==levels.Patient[i],"Wrong.Date_interval"][1] = FALSE
n.date = length(dat[dat$Patient==levels.Patient[i],"Date"])

k = 2
false.dates = dat[dat$Patient==levels.Patient[i] & dat$Wrong.Date_interval == FALSE & !is.na(dat$Wrong.Date_interval),"Date"]
last.date = false.dates[length(false.dates)]
dif = dat[dat$Patient==levels.Patient[i],"Date"][k] - last.date
dat[dat$Patient==levels.Patient[i],"Wrong.Date_interval"][k] = dif < 60

dat[dat$Patient==levels.Patient[i],c("Patient", "Date", "Wrong.Date_interval")]
##    Patient       Date Wrong.Date_interval
## 41     566 2008-03-21               FALSE
## 42     566 2008-05-17                TRUE
## 43     566 2008-06-10                  NA
## 44     566 2008-06-16                  NA
## 46     566 2008-07-12                  NA

第一節:進階驗證篩選(4)

– 需要注意的是,如果n.date>1才需要做後續檢查

levels.Patient = levels(as.factor(dat$Patient))
n.Patient = length(levels.Patient)
dat$Wrong.Date_interval = NA

pb = txtProgressBar(max = n.Patient, style=3)

for (i in 1:n.Patient) {
  dat[dat$Patient==levels.Patient[i],"Wrong.Date_interval"][1] = FALSE
  n.date = length(dat[dat$Patient==levels.Patient[i],"Date"])
  if (n.date>1) {
    for (k in 2:n.date) {
      false.dates = dat[dat$Patient==levels.Patient[i] & dat$Wrong.Date_interval == FALSE & !is.na(dat$Wrong.Date_interval),"Date"]
      last.date = false.dates[length(false.dates)]
      dif = dat[dat$Patient==levels.Patient[i],"Date"][k] - last.date
      dat[dat$Patient==levels.Patient[i],"Wrong.Date_interval"][k] = dif < 60
    }
  }
  setTxtProgressBar(pb, i)
}

close(pb)

練習1:把剛剛學到的推廣到GFR的變化量

– 因此請透過類似的方法找出GFR的異常值,如果出現短時間內變化過大的值請把它找出來(單月斜率超過2以上),之後在檢查時請忽略它!

練習1答案

#Read data
dat = read.csv("validated_example.csv", header = TRUE, fileEncoding = 'CP950') 

#Rule 1: check date-format
dat$Date = as.Date(dat[,"Date"])
dat$Wrong.Date = is.na(dat$Date)
dat = dat[dat$Wrong.Date == FALSE,]

#Rule 2: check eGFR change
levels.Patient = levels(as.factor(dat$Patient))
n.Patient = length(levels.Patient)
dat$Wrong.eGFR_change = NA

pb = txtProgressBar(max = n.Patient, style=3)

for (i in 1:n.Patient) {
  dat[dat$Patient==levels.Patient[i], "Wrong.eGFR_change"][1] = FALSE
  n.date = length(dat[dat$Patient==levels.Patient[i],"Date"])
  if (n.date>1) {
    for (k in 2:n.date) {
      false.dates = dat[dat$Patient==levels.Patient[i] & dat$Wrong.eGFR_change == FALSE & !is.na(dat$Wrong.eGFR_change),"Date"]
      last.date = false.dates[length(false.dates)]
      diff_date = dat[dat$Patient==levels.Patient[i],"Date"][k] - last.date
      diff_date = as.numeric(diff_date, units = 'days') # 注意要轉成數字,否則無法相除

      false.eGFRs = dat[dat$Patient==levels.Patient[i] & dat$Wrong.eGFR_change == FALSE & !is.na(dat$Wrong.eGFR_change),"MDRD.GFR"]
      last.eGFR = false.eGFRs[length(false.eGFRs)]
      diff_eGFR = dat[dat$Patient==levels.Patient[i],"MDRD.GFR"][k] - last.eGFR
      
      slope = diff_eGFR / diff_date * 30
      
      dat[dat$Patient==levels.Patient[i],"Wrong.eGFR_change"][k] = abs(slope) > 2
    }
  }
  setTxtProgressBar(pb, i)
}

close(pb)

第二節:快速讀寫檔案(1)

– 不同的主因其實牽涉到程式碼執行過程,但目前我們的知識很有限,做出來就不容易了更別說還要求速度快,因此比較可行的方式是去「抄」程式碼!

第二節:快速讀寫檔案(2)

– 至於安裝套件的方法,假設你是用Rstudio,可以看到右下角有個Packages分頁,點選後你能看到Install按鍵,透過這種方式就能安裝套件了

F01

install.packages("data.table")
library(data.table)

第二節:快速讀寫檔案(3)

– 至於大Data在哪,我們可以使用第六節課用到的大檔案,請在這裡下載。

t0 = Sys.time()
dat1 = read.csv('laboratory_2.csv', header = TRUE, fileEncoding = 'CP950')
Sys.time() - t0
## Time difference of 1.707349 secs
t0 = Sys.time()
dat2 = fread('laboratory_2.csv', header = TRUE)
Sys.time() - t0
## Time difference of 0.07944632 secs

第二節:快速讀寫檔案(4)

class(dat1)
## [1] "data.frame"
class(dat2)
## [1] "data.table" "data.frame"
t0 = Sys.time()
dat2 = fread('laboratory_2.csv', header = TRUE, data.table = FALSE)
Sys.time() - t0
## Time difference of 0.1181285 secs
class(dat2)
## [1] "data.frame"

第二節:快速讀寫檔案(5)

all.equal(dat1, dat2)
## [1] "Component \"COLLECTIONDATE\": 'current' is not a factor"
## [2] "Component \"TESTNAME\": 'current' is not a factor"      
## [3] "Component \"UNITS\": 'current' is not a factor"
dat1 = read.csv('laboratory_2.csv', header = TRUE, stringsAsFactors = FALSE, fileEncoding = 'CP950')
all.equal(dat1, dat2)
## [1] "Component \"COLLECTIONDATE\": 181483 string mismatches"

第二節:快速讀寫檔案(6)

– 這是使用「write.csv」寫出的速度:

t0 = Sys.time()
write.csv(dat1, 'laboratory_test.csv', row.names = FALSE, quote = TRUE)
Sys.time() - t0
## Time difference of 0.9308543 secs

– 這是使用「fwrite」寫出的速度:

t0 = Sys.time()
fwrite(dat2, 'laboratory_test.csv', row.names = FALSE, quote = TRUE)
Sys.time() - t0
## Time difference of 0.0409317 secs

第二節:快速讀寫檔案(7)

– 這是儲存

t0 = Sys.time()
save(dat1, file = 'laboratory_test.RData')
Sys.time() - t0
## Time difference of 0.6009369 secs

– 這是載入

t0 = Sys.time()
load('laboratory_test.RData')
Sys.time() - t0
## Time difference of 0.204529 secs

練習2:讀取圖片及顯示圖片

– 透過Google搜尋「R display image」後,你將可以找到這個頁面,其中第一個連結進去後你會發現這裡已經有人發問和回答了:

F02

練習2答案

library("jpeg")
img <- readJPEG(system.file("img", "Rlogo.jpg", package = "jpeg"))
plot(0:1, 0:1, type = "n", ann = FALSE, axes = FALSE)
rasterImage(img, -0.04, -0.04, 1.04, 1.04)

第三節:增加程式可讀性(1)

length(levels(factor(dat1$TESTNAME)))
## [1] 25
factorized_TESTNAME = factor(dat1$TESTNAME)
lvl_TESTNAME = levels(factorized_TESTNAME)
length(lvl_TESTNAME)
## [1] 25

第三節:增加程式可讀性(2)

– 後面函數的「.」代表上一步的結果

library(magrittr)

n.TESTNAME = dat1$TESTNAME %>% factor %>% levels %>% length
n.TESTNAME
## [1] 25
n.TESTNAME = dat1$TESTNAME %>% factor() %>% levels() %>% length()
n.TESTNAME
## [1] 25
n.TESTNAME = dat1$TESTNAME %>% factor(.) %>% levels(.) %>% length(.)
n.TESTNAME
## [1] 25
f = function(x, a, b) {a*x^2 + b}
1:5 %>% f(., 2, 5)
## [1]  7 13 23 37 55
1:5 %>% f(2, ., 5)
## [1]  9 13 17 21 25
1:5 %>% f(2, 5, .)
## [1] 21 22 23 24 25

第三節:增加程式可讀性(3)

– 「%<>%」:不要顯示結果,而是改變物件內容

a = 1
a %<>% add(1)
a
## [1] 2

– 「%$%」:指定物件內的索引格式

n.TESTNAME = dat1 %$% TESTNAME %>% factor %>% levels %>% length
n.TESTNAME
## [1] 25

第三節:增加程式可讀性(4)

t0 = Sys.time()

dat1$COLLECTIONDATE = dat1[,3] %>% as.Date

levels.TESTNAME = dat1[,4] %>% factor %>% levels
n.TESTNAME = levels.TESTNAME %>% length
levels.PATNUMBER = dat1[,1] %>% factor %>% levels
n.PATNUMBER = levels.PATNUMBER %>% length

dat_list = list()

for (i in 1:n.PATNUMBER) {
  
  subdat = dat1[dat1[,1]==levels.PATNUMBER[i],]
  levels.COLLECTIONDATE = subdat[,3] %>% factor %>% levels
  n.COLLECTIONDATE = levels.COLLECTIONDATE %>% length

  submatrix = matrix(NA, nrow = n.COLLECTIONDATE, ncol = n.TESTNAME + 2)
  colnames(submatrix) = c("PATNUMBER", "COLLECTIONDATE", levels.TESTNAME)

  submatrix[,1] = levels.PATNUMBER[i]
  submatrix[,2] = levels.COLLECTIONDATE

  for (j in 1:n.COLLECTIONDATE) {
    subsubdat = subdat[subdat[,3]==levels.COLLECTIONDATE[j],]
    for (k in 1:nrow(subsubdat)) {
      NAME = subsubdat[k,4]
      position = which(NAME == levels.TESTNAME) + 2
      submatrix[j, position] = subsubdat[k,5]
    }
  }
  
  dat_list[[i]] = submatrix
}

final.data = do.call("rbind", dat_list)

Sys.time() - t0
## Time difference of 1.657264 mins
head(final.data)
##      PATNUMBER COLLECTIONDATE Albumin Albumin body fluid AST BUN BUN Fluid
## [1,] "26"      "2011-05-12"   NA      NA                 NA  NA  NA       
## [2,] "26"      "2011-05-30"   NA      NA                 NA  NA  NA       
## [3,] "26"      "2011-05-31"   NA      NA                 NA  NA  NA       
## [4,] "26"      "2011-06-01"   NA      NA                 NA  NA  NA       
## [5,] "26"      "2011-06-02"   NA      NA                 NA  NA  NA       
## [6,] "26"      "2011-06-06"   NA      NA                 NA  NA  NA       
##      Cholesterol Fluid Creatinine Creatinine Fluid GLU(AC) HDL-Cholesterol
## [1,] NA                "1.8"      NA               NA      NA             
## [2,] NA                "3"        NA               NA      NA             
## [3,] NA                "2.9"      NA               NA      NA             
## [4,] NA                "2.9"      NA               NA      NA             
## [5,] NA                "2.4"      NA               NA      NA             
## [6,] NA                "1.9"      NA               NA      NA             
##      IP    K  LDL-Cholesterol Na    NA Fluid Total Calcium Total Cholesterol
## [1,] NA    NA NA              "140" NA       NA            NA               
## [2,] NA    NA NA              "139" NA       NA            NA               
## [3,] "4.6" NA NA              "145" NA       "7.8"         "134"            
## [4,] NA    NA NA              "144" NA       "6.1"         NA               
## [5,] NA    NA NA              "138" NA       "7.1"         NA               
## [6,] NA    NA NA              "134" NA       "8"           NA               
##      Triglyceride Triglycerol Fluid Uric Acid urine Calcium urine Phosphorus
## [1,] NA           NA                NA        NA            NA              
## [2,] NA           NA                NA        NA            NA              
## [3,] "131"        NA                NA        "5.6"         "41.4"          
## [4,] NA           NA                NA        NA            NA              
## [5,] NA           NA                NA        NA            NA              
## [6,] NA           NA                NA        NA            NA              
##      urine Potassium urine Sodium urine Uric Acid
## [1,] NA              NA           NA             
## [2,] NA              NA           NA             
## [3,] "29"            "61"         NA             
## [4,] NA              NA           NA             
## [5,] NA              NA           NA             
## [6,] NA              NA           NA
final.data = as.data.frame(final.data, stringsAsFactors = FALSE)

fwrite(final.data, 'final_data.csv', row.names = FALSE, quote = TRUE)

小結

– 但你可以直接把「輪子」買來,但你也要有能力確保若廠商不供貨時,你也可以靠自己把「輪子」製造出來!