機器學習及演算法

林嶔 (Lin, Chin)

Lesson 9 人工智慧基礎6(存活分析模型)

第一節:Cox比例風險模型簡介(1)

– 請至這裡下載範例資料

dat <- read.csv("ECG_train.csv", header = TRUE, fileEncoding = 'CP950', stringsAsFactors = FALSE, na.strings = "")
library(survival)

第一節:Cox比例風險模型簡介(2)

subdat <- dat[!(dat[,'death'] %in% NA) & !(dat[,'time'] %in% NA) & !(dat[,'GENDER'] %in% NA) & !(dat[,'AGE'] %in% NA),]
subdat[,"GENDER"] <- as.factor(subdat[,"GENDER"])

Result <- coxph(Surv(subdat[,"time"], subdat[,"death"]) ~ ., data = subdat[,c("GENDER", "AGE")])
summary(Result)
## Call:
## coxph(formula = Surv(subdat[, "time"], subdat[, "death"]) ~ ., 
##     data = subdat[, c("GENDER", "AGE")])
## 
##   n= 4668, number of events= 456 
## 
##                coef exp(coef) se(coef)      z Pr(>|z|)    
## GENDERmale 0.051859  1.053228 0.095127  0.545    0.586    
## AGE        0.041815  1.042701 0.003151 13.271   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
##            exp(coef) exp(-coef) lower .95 upper .95
## GENDERmale     1.053     0.9495    0.8741     1.269
## AGE            1.043     0.9590    1.0363     1.049
## 
## Concordance= 0.673  (se = 0.014 )
## Rsquare= 0.042   (max possible= 0.784 )
## Likelihood ratio test= 199.5  on 2 df,   p=<2e-16
## Wald test            = 177  on 2 df,   p=<2e-16
## Score (logrank) test = 185.2  on 2 df,   p=<2e-16

第一節:Cox比例風險模型簡介(3)

\[log(\frac{h_i(t)}{h_0(t)})= b_{1}x_{1,i} + b_{2}x_{2,i}\]

– 這樣對於第\(i\)個人而言,他的「hazard」就會是baseline的\(exp(b_{1}x_{1,i} + b_{2}x_{2,i})\)倍。

\[log(\frac{h_i(t)}{h_0(t)})= 0.051859 \times Gender_i + 0.041815 \times Age_i\] - 因此我們可以翻譯成這樣:

– 對於\(Gender\)而言,男生相較於女生的hazard ratio是\(exp(0.051859) = 1.053\)

– 對於\(Age\)而言,每提升一個單位(歲)的hazard ratio是\(exp(0.041815) = 1.043\)

第一節:Cox比例風險模型簡介(4)

– 「hazard」是Cox比例風險模型所定義的特殊參數,可以跟第\(t\)個時間點的累積存活率\(S(t)\)做下列公式的轉換:

\[S(t) = exp(-h_i(t))\]

h0.hazard <- basehaz(Result, centered = FALSE)
head(h0.hazard)
##         hazard time
## 1 0.0002760165    1
## 2 0.0005456561    2
## 3 0.0007269227    3
## 4 0.0008842276    4
## 5 0.0010015822    5
## 6 0.0011663935    6

– 他在第1個時間點\(t=1\)時累積存活率是\(S(t) = exp(-0.0002760165) = 0.999724\)

– 他在第2個時間點\(t=2\)時他的時累積存活率是\(S(t) = exp(-0.0005456561) = 0.9994545\),依此類推。

– 他在第1個時間點\(t=1\)時累積存活率是\(S(t) = exp(-0.0002760165 \times 12.94564) = 0.9964332\)

– 他在第2個時間點\(t=2\)時他的時累積存活率是\(S(t) = exp(-0.0005456561 \times 12.94564) = 0.992961\),依此類推。

第一節:Cox比例風險模型簡介(5)

Predict.Surv <- survfit(Result, newdata = data.frame(GENDER = c('female', 'male'), AGE = c(60, 80)))
plot(Predict.Surv, col = c('blue', 'red'), xlab = 'Time (days)',  ylab = 'Survival (%)', mark.time = FALSE, lwd = 2)

第一節:Cox比例風險模型簡介(6)

– 但他有多個不同時間點,所以我們要把每個時間點的AUC做平均,這樣能獲得一個叫index of concordance(C-index)的東西,這是存活分析的模型評估指標

– 我們可以透過函數「concordance()」計算出來

concordance(Result)
## Call:
## concordance.coxph(object = Result)
## 
## n= 4668 
## Concordance= 0.6728 se= 0.01374
## discordant concordant     tied.x     tied.y    tied.xy 
##     851331     413995          0       1250          0

\[ \text{C-index} = \frac{ \sum_{i, j} \mathbb{1}_{T_j < T_i} \cdot \mathbb{1}_{\eta_j > \eta_i} \cdot \delta_j }{\sum_{i, j} \mathbb{1}_{T_j < T_i}\cdot \delta_j } \]

其中,\(\eta_i\)是第\(i\)個人的風險分數,\(\mathbb{1}_{\eta_j > \eta_i}\)代表判斷是否發生\(\eta_j > \eta_i\)\(\mathbb{1}_{T_j < T_i}\)代表判斷是否發生\(T_j < T_i\)

練習1:使用套件完成資料科學實驗流程

subdat <- dat[!(dat[,"time"] %in% NA) & !(dat[,"death"] %in% NA) & !(dat[,"AGE"] %in% NA) & !(dat[,"Rate"] %in% NA),]
set.seed(0)
all_idx <- 1:nrow(subdat)

train_idx <- sample(all_idx, nrow(subdat) * 0.6)
valid_idx <- sample(all_idx[!all_idx %in% train_idx], nrow(subdat) * 0.2)
test_idx <- all_idx[!all_idx %in% c(train_idx, valid_idx)]

train_dat <- subdat[train_idx,]
valid_dat <- subdat[valid_idx,]
test_dat <- subdat[test_idx,]

練習1答案(1)

fit <- coxph(Surv(time, death) ~ AGE + Rate, data = train_dat)
fit
## Call:
## coxph(formula = Surv(time, death) ~ AGE + Rate, data = train_dat)
## 
##          coef exp(coef) se(coef)     z       p
## AGE  0.037304  1.038009 0.004099 9.101 < 2e-16
## Rate 0.014929  1.015041 0.002035 7.336 2.2e-13
## 
## Likelihood ratio test=144.2  on 2 df, p=< 2.2e-16
## n= 2800, number of events= 267

\[\text{risk score} = 0.037304 \times Age_i + 0.014929 \times Rate_i\]

練習1答案(2)

valid_dat[,'score'] <- predict(fit, valid_dat)
test_dat[,'score'] <- predict(fit, test_dat)
library(pROC)

par(mfrow = c(1, 3))

for (current_t in c(365, 730, 1825)) {
  
  valid_dat[,'y'] <- valid_dat[,'death']
  valid_dat[valid_dat[,'time'] > current_t, 'y'] <- 0
  valid_dat[valid_dat[,'time'] < current_t & valid_dat[,'death'] %in% 0, 'y'] <- NA
  
  test_dat[,'y'] <- test_dat[,'death']
  test_dat[test_dat[,'time'] > current_t, 'y'] <- 0
  test_dat[test_dat[,'time'] < current_t & test_dat[,'death'] %in% 0, 'y'] <- NA
  
  roc_valid <- roc(y ~ score, data = valid_dat)
  best_pos <- which.max(roc_valid$sensitivities + roc_valid$specificities)
  best_cut <- roc_valid$thresholds[best_pos]
  
  tab_test <- table(test_dat$score >= best_cut, test_dat$y)
  sens <- tab_test[2,2] / sum(tab_test[,2])
  spec <- tab_test[1,1] / sum(tab_test[,1])
  
  roc_test <- roc(y ~ score, data = test_dat)
  plot(roc_test)
  
  points(spec, sens, pch = 19, main = paste0('t = ', formatC(current_t, digits = 1, format = 'f')))
  text(0.5, 0.5, paste0('Cut = ', formatC(best_cut, digits = 3, format = 'f'),
                        '\nSens = ', formatC(sens, digits = 3, format = 'f'),
                        '\nSpec = ', formatC(spec, digits = 3, format = 'f'),
                        '\nAUC = ', formatC(roc_test$auc, digits = 3, format = 'f')), col = 'red')
  
}

第二節:Cox模型推導求解(1)

先依據追蹤時間將所有樣本進行排序,並且在每個事件發生時,去計算該事件發生的個體其風險機率相較於全體機率的比例

將每個事件的比例累加起來,期望這個總和最大化

x <- c(10, 5, 7, 8, 3, 2, 1, 4, 9, 6)
event <- c(1, 1, 0, 1, 0, 1, 0, 0, 1, 0)
time <- c(1, 1, 2, 3, 4, 4, 5, 5, 5, 6)

survival_model <- coxph(Surv(time, event) ~ x)
coef(survival_model)
##         x 
## 0.2441335

第二節:Cox模型推導求解(2)

\[\eta_i = log(\frac{h_i(t)}{h_0(t)})= b_{1}x_{i}\]

F01

\[P_{event_1} = \begin{pmatrix} p_1 & p_3 & p_4 & p_5 & p_6 & p_7 & p_8 & p_9 & p_{10} \end{pmatrix} = \frac{exp(\eta_1)}{\sum \limits_{i=1,3,4,5,6,7,8,9,10} exp(\eta_i)}\] - 依此類推,我們可以獲得所有的\(P\)

\[ \begin{align} P_{event_2} & = \begin{pmatrix} p_2 & p_3 & p_4 & p_5 & p_6 & p_7 & p_8 & p_9 & p_{10} \end{pmatrix} = \frac{exp(\eta_i)}{\sum \limits_{i=2,3,4,5,6,7,8,9,10} exp(\eta_i)} \\ P_{event_3} & = \begin{pmatrix} p_4 & p_5 & p_6 & p_7 & p_8 & p_9 & p_{10} \end{pmatrix} = \frac{exp(\eta_i)}{\sum \limits_{i=4,5,6,7,8,9,10} exp(\eta_i)} \\ P_{event_4} & = \begin{pmatrix} p_5 & p_6 & p_7 & p_8 & p_9 & p_{10} \end{pmatrix} = \frac{exp(\eta_i)}{\sum \limits_{i=5,6,7,8,9,10} exp(\eta_i)} \\ P_{event_5} & = \begin{pmatrix} p_7 & p_8 & p_9 & p_{10} \end{pmatrix} = \frac{exp(\eta_i)}{\sum \limits_{i=7,8,9,10} exp(\eta_i)} \end{align} \]

\[ \begin{align} Y_{event_1} & = \begin{pmatrix} y_1 & y_3 & y_4 & y_5 & y_6 & y_7 & y_8 & y_9 & y_{10} \end{pmatrix} = \begin{pmatrix} 1 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \end{pmatrix} \\ Y_{event_2} & = \begin{pmatrix} y_2 & y_3 & y_4 & y_5 & y_6 & y_7 & y_8 & y_9 & y_{10} \end{pmatrix} = \begin{pmatrix} 1 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \end{pmatrix} \\ Y_{event_3} & = \begin{pmatrix} y_4 & y_5 & y_6 & y_7 & y_8 & y_9 & y_{10} \end{pmatrix} = \begin{pmatrix} 1 & 0 & 0 & 0 & 0 & 0 & 0 \end{pmatrix} \\ Y_{event_4} & = \begin{pmatrix} y_5 & y_6 & y_7 & y_8 & y_9 & y_{10} \end{pmatrix} = \begin{pmatrix} 0 & 1 & 0 & 0 & 0 & 0 \end{pmatrix} \\ Y_{event_5} & = \begin{pmatrix} y_7 & y_8 & y_9 & y_{10} \end{pmatrix} = \begin{pmatrix} 0 & 0 & 1 & 0 \end{pmatrix} \\ \end{align} \]

\[ \begin{align} loss &= \sum mlogloss(Y_{event_m}, P_{event_m})\\\\ mlogloss(Y_{event_1}, P_{event_1}) & = - \sum y_{i} log(p_{i}) \\ mlogloss(Y_{event_2}, P_{event_2}) & = - \sum y_{i} log(p_{i}) \\ mlogloss(Y_{event_3}, P_{event_3}) & = - \sum y_{i} log(p_{i}) \\ mlogloss(Y_{event_4}, P_{event_4}) & = - \sum y_{i} log(p_{i}) \\ mlogloss(Y_{event_5}, P_{event_5}) & = - \sum y_{i} log(p_{i}) \\ \end{align} \]

第二節:Cox模型推導求解(3)

– 首先先定義log-hazard-ratio matrix(\(H\)):

\[\begin{align} H & = \begin{pmatrix} (XB)^T \\ (XB)^T \\ (XB)^T \\ (XB)^T \\ (XB)^T \\ (XB)^T \\ (XB)^T \\ (XB)^T \\ (XB)^T \\ (XB)^T \end{pmatrix} = \begin{pmatrix} \eta_1 & \eta_2 & \eta_3 & \eta_4 & \eta_5 & \eta_6 & \eta_7 & \eta_8 & \eta_9 & \eta_{10} \\ \eta_1 & \eta_2 & \eta_3 & \eta_4 & \eta_5 & \eta_6 & \eta_7 & \eta_8 & \eta_9 & \eta_{10} \\ \eta_1 & \eta_2 & \eta_3 & \eta_4 & \eta_5 & \eta_6 & \eta_7 & \eta_8 & \eta_9 & \eta_{10} \\ \eta_1 & \eta_2 & \eta_3 & \eta_4 & \eta_5 & \eta_6 & \eta_7 & \eta_8 & \eta_9 & \eta_{10} \\ \eta_1 & \eta_2 & \eta_3 & \eta_4 & \eta_5 & \eta_6 & \eta_7 & \eta_8 & \eta_9 & \eta_{10} \\ \eta_1 & \eta_2 & \eta_3 & \eta_4 & \eta_5 & \eta_6 & \eta_7 & \eta_8 & \eta_9 & \eta_{10} \\ \eta_1 & \eta_2 & \eta_3 & \eta_4 & \eta_5 & \eta_6 & \eta_7 & \eta_8 & \eta_9 & \eta_{10} \\ \eta_1 & \eta_2 & \eta_3 & \eta_4 & \eta_5 & \eta_6 & \eta_7 & \eta_8 & \eta_9 & \eta_{10} \\ \eta_1 & \eta_2 & \eta_3 & \eta_4 & \eta_5 & \eta_6 & \eta_7 & \eta_8 & \eta_9 & \eta_{10} \\ \eta_1 & \eta_2 & \eta_3 & \eta_4 & \eta_5 & \eta_6 & \eta_7 & \eta_8 & \eta_9 & \eta_{10} \end{pmatrix} = \begin{pmatrix} b_{1}x_1 & b_{1}x_2 & b_{1}x_3 & b_{1}x_4 & b_{1}x_5 & b_{1}x_6 & b_{1}x_7 & b_{1}x_8 & b_{1}x_9 & b_{1}x_{10} \\ b_{1}x_1 & b_{1}x_2 & b_{1}x_3 & b_{1}x_4 & b_{1}x_5 & b_{1}x_6 & b_{1}x_7 & b_{1}x_8 & b_{1}x_9 & b_{1}x_{10} \\ b_{1}x_1 & b_{1}x_2 & b_{1}x_3 & b_{1}x_4 & b_{1}x_5 & b_{1}x_6 & b_{1}x_7 & b_{1}x_8 & b_{1}x_9 & b_{1}x_{10} \\ b_{1}x_1 & b_{1}x_2 & b_{1}x_3 & b_{1}x_4 & b_{1}x_5 & b_{1}x_6 & b_{1}x_7 & b_{1}x_8 & b_{1}x_9 & b_{1}x_{10} \\ b_{1}x_1 & b_{1}x_2 & b_{1}x_3 & b_{1}x_4 & b_{1}x_5 & b_{1}x_6 & b_{1}x_7 & b_{1}x_8 & b_{1}x_9 & b_{1}x_{10} \\ b_{1}x_1 & b_{1}x_2 & b_{1}x_3 & b_{1}x_4 & b_{1}x_5 & b_{1}x_6 & b_{1}x_7 & b_{1}x_8 & b_{1}x_9 & b_{1}x_{10} \\ b_{1}x_1 & b_{1}x_2 & b_{1}x_3 & b_{1}x_4 & b_{1}x_5 & b_{1}x_6 & b_{1}x_7 & b_{1}x_8 & b_{1}x_9 & b_{1}x_{10} \\ b_{1}x_1 & b_{1}x_2 & b_{1}x_3 & b_{1}x_4 & b_{1}x_5 & b_{1}x_6 & b_{1}x_7 & b_{1}x_8 & b_{1}x_9 & b_{1}x_{10} \\ b_{1}x_1 & b_{1}x_2 & b_{1}x_3 & b_{1}x_4 & b_{1}x_5 & b_{1}x_6 & b_{1}x_7 & b_{1}x_8 & b_{1}x_9 & b_{1}x_{10} \\ b_{1}x_1 & b_{1}x_2 & b_{1}x_3 & b_{1}x_4 & b_{1}x_5 & b_{1}x_6 & b_{1}x_7 & b_{1}x_8 & b_{1}x_9 & b_{1}x_{10} \end{pmatrix} \end{align}\]

– 接著定義label matrix(\(Y\))以及mask matrix(\(M\)):

\[\begin{align} Y & = \begin{pmatrix} 1 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\ 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\ 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\ 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 & 0 \\ 0 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 \\ 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 & 0 & 0 \\ 0 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 0 \\ 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 1 & 0 \\ 0 & 0 & 0 & 0 & 0 & 0 & 1 & 0 & 0 & 0 \\ 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 1 \end{pmatrix} \space \space \space M = \begin{pmatrix} 1 & 0 & 1 & 1 & 1 & 1 & 1 & 1 & 1 & 1 \\ 0 & 1 & 1 & 1 & 1 & 1 & 1 & 1 & 1 & 1 \\ 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\ 0 & 0 & 0 & 1 & 1 & 1 & 1 & 1 & 1 & 1 \\ 0 & 0 & 0 & 0 & 1 & 1 & 1 & 1 & 1 & 1 \\ 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\ 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\ 0 & 0 & 0 & 0 & 0 & 0 & 1 & 1 & 1 & 1 \\ 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \\ 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 & 0 \end{pmatrix} \end{align}\]

\[\begin{align} loss & = - \sum \limits_{\text{by column}} log( \frac {\sum \limits_{\text{by row}} M \otimes Y \otimes exp(H)}{\sum \limits_{\text{by row}} M \otimes exp(H)} ) = \sum \limits_{\text{by column}} log( \sum \limits_{\text{by row}} M \otimes exp(H) ) - \sum \limits_{\text{by column}} log( \sum \limits_{\text{by row}} M \otimes Y \otimes exp(H) ) \end{align}\]

第二節:Cox模型推導求解(4)

\[\begin{align} \frac{\partial}{\partial B} loss & = \frac{\partial}{\partial B} \sum \limits_{\text{by column}} log( \sum \limits_{\text{by row}} M \otimes exp(H) ) - \frac{\partial}{\partial B} \sum \limits_{\text{by column}} log( \sum \limits_{\text{by row}} M \otimes Y \otimes exp(H) ) \\ & = \sum \limits_{\text{by column}} \frac{\partial}{\partial B} log( \sum \limits_{\text{by row}} M \otimes exp(H) ) - \sum \limits_{\text{by column}} \frac{\partial}{\partial B} log( \sum \limits_{\text{by row}} M \otimes Y \otimes exp(H) ) \\ & = \sum \limits_{\text{by column}} \frac{ \sum \limits_{\text{by row}} M \otimes \frac{\partial}{\partial B} exp(H) } {\sum \limits_{\text{by row}} M \otimes exp(H) } - \sum \limits_{\text{by column}} \frac{ \sum \limits_{\text{by row}} M \otimes Y \otimes \frac{\partial}{\partial B} exp(H)}{\sum \limits_{\text{by row}} M \otimes Y \otimes exp(H)} \\ & = \sum \limits_{\text{by column}} \frac{ \left( M \otimes exp(H) \right) \odot X} {\left( M \otimes exp(H) \right) \odot 1_{\text{shape}=X}} - \sum \limits_{\text{by column}} \frac{ \left( M \otimes Y \otimes exp(H) \right) \odot X}{\left( M \otimes Y \otimes exp(H) \right) \odot 1_{\text{shape}=X}} \\\\\\ X & = \begin{pmatrix} x_1 \\ x_2 \\ x_3 \\ \vdots \\ x_n \end{pmatrix} \end{align}\]

第二節:Cox模型推導求解(5)

survival_encode <- function (event, time) {
  
  sub_dat <- data.frame(id = 1:length(event), event = event, time = time, stringsAsFactors = FALSE)
  sub_dat <- sub_dat[order(sub_dat[,'event'], decreasing = TRUE),]
  sub_dat <- sub_dat[order(sub_dat[,'time']),]
  
  label <- array(0L, dim = rep(nrow(sub_dat), 2))
  mask <- array(0L, dim = rep(nrow(sub_dat), 2))
  
  diag(label) <- 1L
  
  for (m in 1:nrow(sub_dat)) {
    
    mask[m,m:nrow(sub_dat)] <- 1L
    
    if (sub_dat[m,'event'] == 0) {
      
      mask[m,] <- 0L
      
    } else if (m > 1) {
      
      zero_pos <- which(sub_dat[1:(m-1),'time'] == sub_dat[m,'time'])
      
      if (length(zero_pos) > 0) {
        
        mask[zero_pos,m] <- 0L
        
      }
      
    }
    
  }
  
  original_pos <- order(sub_dat[,'id'])
  
  return(list(label = label[,original_pos,drop = FALSE], mask = mask[,original_pos,drop = FALSE]))
  
}
X_mat <- matrix(x, ncol = 1)
Y_list <- survival_encode(event = event, time = time)
Y_mat <- Y_list$label
M_mat <- Y_list$mask

第二節:Cox模型推導求解(6)

num.iteration <- 2000
lr <- 0.01
B_mat <- matrix(0, nrow = ncol(X_mat), ncol = 1)

for (i in 1:num.iteration) {
  
  H_mat <- matrix(X_mat %*% B_mat, nrow = nrow(Y_mat), ncol = ncol(Y_mat), byrow = TRUE)
  
  P11_mat <- M_mat * exp(H_mat)
  P12_mat <- P11_mat %*% X_mat
  P13_mat <- P11_mat %*% matrix(1, nrow = nrow(X_mat), ncol = ncol(X_mat))
  P14_mat <- apply(P12_mat / P13_mat, 2, sum, na.rm = TRUE)
  
  P21_mat <- M_mat * Y_mat * exp(H_mat)
  P22_mat <- P21_mat %*% X_mat
  P23_mat <- P21_mat %*% matrix(1, nrow = nrow(X_mat), ncol = ncol(X_mat))
  P24_mat <- apply(P22_mat / P23_mat, 2, sum, na.rm = TRUE)
  
  grad_B <- P14_mat - P24_mat

  B_mat <- B_mat - lr * grad_B / nrow(X_mat)
  
}

B_mat
##           [,1]
## [1,] 0.2557805

– 我們試試不同處理方法的答案:

coef(coxph(Surv(time, event) ~ x, method = 'efron'))
##         x 
## 0.2441335
coef(coxph(Surv(time, event) ~ x, method = 'breslow'))
##         x 
## 0.2359488
coef(coxph(Surv(time, event) ~ x, method = 'exact'))
##         x 
## 0.2476218

第二節:Cox模型推導求解(7)

– 只用50筆資料,免得速度太慢:

subdat <- dat[!(dat[,"time"] %in% NA) & !(dat[,"death"] %in% NA) & !(dat[,"AGE"] %in% NA) & !(dat[,"Rate"] %in% NA),]
subdat <- subdat[1:50,]
survival_model <- coxph(Surv(time, death) ~ AGE + Rate, data = subdat)
coef(survival_model)
##         AGE        Rate 
## 0.037072488 0.008077607
X_mat <- matrix(as.matrix(subdat[,c("AGE", "Rate")]), ncol = 2)
Y_list <- survival_encode(event = subdat[,'death'], time = subdat[,'time'])
Y_mat <- Y_list$label
M_mat <- Y_list$mask

num.iteration <- 2000 
lr <- 0.01
B_mat <- matrix(0, nrow = ncol(X_mat), ncol = 1)

for (i in 1:num.iteration) {
  
  H_mat <- matrix(X_mat %*% B_mat, nrow = nrow(Y_mat), ncol = ncol(Y_mat), byrow = TRUE)

  P11_mat <- M_mat * exp(H_mat)
  P12_mat <- P11_mat %*% X_mat
  P13_mat <- P11_mat %*% matrix(1, nrow = nrow(X_mat), ncol = ncol(X_mat))
  P14_mat <- apply(P12_mat / P13_mat, 2, sum, na.rm = TRUE)
  
  P21_mat <- M_mat * Y_mat * exp(H_mat)
  P22_mat <- P21_mat %*% X_mat
  P23_mat <- P21_mat %*% matrix(1, nrow = nrow(X_mat), ncol = ncol(X_mat))
  P24_mat <- apply(P22_mat / P23_mat, 2, sum, na.rm = TRUE)

  grad_B <- P14_mat - P24_mat
  
  B_mat <- B_mat - lr * grad_B / nrow(X_mat)
  
}

print(B_mat)
##             [,1]
## [1,] 0.037072488
## [2,] 0.008077607

第三節:非線性擬合曲線(1)

  1. Linear regression

\[\hat{y} = b_{0} + b_{1}x\]

  1. Logistic regression

\[log(\frac{{p}}{1-p}) = b_{0} + b_{1}x\]

  1. Softmax regression

\[softmax^{-1}(P) = XB\]

  1. Cox proportional hazards model

\[log(\frac{h(t)}{h_0(t)})= b_{1}x\]

第三節:非線性擬合曲線(2)

– 我們先以線性回歸為例:

fit <- lm(K ~ pspline(PR), data = dat)

lines.x <- seq(60, 300, by = 0.1)
lines.y <- predict(fit, data.frame(PR = lines.x))

plot(K ~ PR, data = dat, ylab = "K", xlab = "PR", main = "Scatter plot of PR and K", pch = 19, col = "#00000030")
lines(lines.x, lines.y, col = 'red', lwd = 2)

第三節:非線性擬合曲線(3)

fit <- coxph(Surv(time, death) ~ pspline(Rate, df = 4), data = dat)

lines.x <- seq(30, 180, by = 0.1)

pred_list <- predict(fit, data.frame(Rate = lines.x), type = 'risk', se.fit = TRUE)

lines.y <- pred_list$fit
lines.y_low <- lines.y - qnorm(0.975) * pred_list$se.fit
lines.y_up <- lines.y + qnorm(0.975) * pred_list$se.fit

plot(lines.x, lines.y, xlim = c(40, 170), ylim = c(0.1, 10), type = 'l', ylab = "Hazard ratio", xlab = "Rate", log = 'y', col = 'red', lwd = 2)
lines(lines.x, lines.y_low, col = 'red', lty = 2)
lines(lines.x, lines.y_up, col = 'red', lty = 2)

練習2:以資料科學實驗流程決定模型參數

subdat <- dat[!(dat[,"time"] %in% NA) & !(dat[,"death"] %in% NA) & !(dat[,"AGE"] %in% NA) & !(dat[,"Rate"] %in% NA),]
set.seed(0)
all_idx <- 1:nrow(subdat)

train_idx <- sample(all_idx, nrow(subdat) * 0.6)
valid_idx <- sample(all_idx[!all_idx %in% train_idx], nrow(subdat) * 0.2)
test_idx <- all_idx[!all_idx %in% c(train_idx, valid_idx)]

train_dat <- subdat[train_idx,]
valid_dat <- subdat[valid_idx,]
test_dat <- subdat[test_idx,]

練習2答案(1)

fit <- coxph(Surv(time, death) ~ AGE + Rate, data = train_dat)

valid_dat[,'score'] <- predict(fit, valid_dat, type = 'lp')
concordance(coxph(Surv(time, death) ~ score, data = valid_dat))[['concordance']]
## [1] 0.7478008
test_dat[,'score'] <- predict(fit, test_dat, type = 'lp')
concordance(coxph(Surv(time, death) ~ score, data = test_dat))[['concordance']]
## [1] 0.7577791

練習2答案(2)

result_dat <- data.frame(df1 = rep(2:5, each = 3), df2 = rep(2:5, 3), idx = NA)

for (i in 1:nrow(result_dat)) {
  
  fit <- coxph(Surv(time, death) ~ pspline(AGE, df = result_dat[i,'df1']) + pspline(Rate, df = result_dat[i,'df2']), data = train_dat)
  
  valid_dat[,'score'] <- predict(fit, valid_dat)
  result_dat[i,'idx'] <- concordance(coxph(Surv(time, death) ~ score, data = valid_dat))[['concordance']]
  
}

result_dat
##    df1 df2       idx
## 1    2   2 0.7571997
## 2    2   3 0.7608130
## 3    2   4 0.7617378
## 4    3   5 0.7631788
## 5    3   2 0.7607270
## 6    3   3 0.7636305
## 7    4   4 0.7655447
## 8    4   5 0.7649855
## 9    4   2 0.7614152
## 10   5   3 0.7645553
## 11   5   4 0.7655017
## 12   5   5 0.7646414
fit <- coxph(Surv(time, death) ~ pspline(AGE, df = 4) + pspline(Rate, df = 4), data = train_dat)

test_dat[,'score'] <- predict(fit, test_dat)
concordance(coxph(Surv(time, death) ~ score, data = test_dat))[['concordance']]
## [1] 0.7654466

我們需要特別注意,參數多調一定會造成「驗證組」上的準確度上升,但是否有效一定要看「測試組」上的結果。

課程小結

– 目前我們手上有的工具:linear regression、logistic regression、softmax regression以及Cox proportional hazards model能分別對Outcome為「連續變項」、「二元類別變項」、「多元類別變項」、「存活變項」進行分析

– 雖然所有分析都有相對應的套件能夠進行,但所有推導過程仍然是非常重要,特別是今天教的Cox proportional hazards model,未來我們在深度學習的擴展上就沒有相對應的套件能協助你了,如果你不了解後面的計算基礎那就沒有辦法用深度學習進行存活分析

– 下節課開始我們會開始教導機器學習的部分,從這裡開始就跟生物統計學有一些不同了,有一些模型的算法甚至獨樹一格沒有辦法套用之前的經驗!