Deep Learning: Theory and Practice

林嶔 (Lin, Chin)

Lesson 4: Introduction to Programming with the MxNet Framework

Section 1: Basic Introduction to MxNet (1)

– From the derivations in the previous lessons, we should see that such a development framework is actually not hard to implement: no matter how deep or complex an artificial neural network is, it can always be defined as a composition of functions, and by working out the individual derivatives and applying the chain rule we can obtain the gradient of every parameter, which is all that gradient descent needs to train the network.
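
To make the idea concrete, here is a tiny plain-R illustration (toy numbers, not MxNet code): the gradient of a composed prediction function is a product of simple derivatives along the chain, and we can verify it numerically:

sigmoid <- function(z) {1 / (1 + exp(-z))}

x <- 1.5; y <- 1           # one toy sample
w <- 0.3; b <- -0.1        # toy parameter values

p <- sigmoid(w * x + b)    # forward pass: a linear function composed with a sigmoid
loss <- (p - y)^2          # squared-error loss

# Chain rule: dloss/dw = dloss/dp * dp/dz * dz/dw
grad_w <- 2 * (p - y) * p * (1 - p) * x

# Numerical check by finite differences
eps <- 1e-6
num_grad_w <- ((sigmoid((w + eps) * x + b) - y)^2 - loss) / eps
c(grad_w, num_grad_w)      # the two values agree closely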

– One caveat: MxNet can only be installed on 64-bit operating systems.

– Its installation method is somewhat unusual, and a GPU build is also available. Here is how to install the CPU version on Windows:

install.packages("https://s3.ca-central-1.amazonaws.com/jeremiedb/share/mxnet/CPU/3.6/mxnet.zip", repos = NULL)

– Or use this method instead:

cran <- getOption("repos")
cran["dmlc"] <- "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/CRAN/"
options(repos = cran)
install.packages("mxnet")
library(mxnet)

Section 1: Basic Introduction to MxNet (2)

– Training a neural network requires three essential components:

  1. Data structure

  2. Prediction function (network structure)

  3. Optimization method

– In MxNet, these correspond to the three parts we will write ourselves:

  1. Iterator

  2. Model architecture

  3. Optimizer

Section 1: Basic Introduction to MxNet (3)

– This time we will demonstrate analysis syntax for many different kinds of dependent variables, so we will not apply any special preprocessing first.

# Read the ECG dataset and keep the complete cases of K, AGE, and Rate
dat <- read.csv("ECG_train.csv", header = TRUE, fileEncoding = 'CP950', stringsAsFactors = FALSE, na.strings = "")
subdat <- dat[!dat[,'K'] %in% NA & !dat[,'AGE'] %in% NA & !dat[,'Rate'] %in% NA, c('K', 'AGE', 'Rate')]

used_dat.x <- model.matrix(~ ., data = subdat[,-1])

# MxNet expects features in rows and samples in columns, hence the transposes
X.array <- t(used_dat.x[,-1])
Y.array <- array(subdat[,1], dim = c(1, nrow(subdat)))

Section 1: Basic Introduction to MxNet (4)

my_iterator_core = function (batch_size) {
  
  batch = 0
  batch_per_epoch = length(Y.array)/batch_size
  
  # Reset the batch counter at the start of every epoch
  reset = function() {batch <<- 0}
  
  # Advance the counter; FALSE signals that the epoch is finished
  iter.next = function() {
    batch <<- batch+1
    if (batch > batch_per_epoch) {return(FALSE)} else {return(TRUE)}
  }
  
  # Build one mini-batch; indices that run past the last sample are re-drawn at random
  value = function() {
    idx = 1:batch_size + (batch - 1) * batch_size
    idx[idx > ncol(X.array)] = sample(1:ncol(X.array), sum(idx > ncol(X.array)))
    data = mx.nd.array(X.array[,idx, drop=FALSE])
    label = mx.nd.array(Y.array[,idx, drop=FALSE])
    return(list(data = data, label = label))
  }
  
  return(list(reset = reset, iter.next = iter.next, value = value, batch_size = batch_size, batch = batch))
}
my_iterator_func <- setRefClass("Custom_Iter",
                                fields = c("iter", "batch_size"),
                                contains = "Rcpp_MXArrayDataIter",
                                methods = list(
                                  initialize = function(iter, batch_size = 100){
                                    .self$iter <- my_iterator_core(batch_size = batch_size)
                                    .self
                                  },
                                  value = function(){
                                    .self$iter$value()
                                  },
                                  iter.next = function(){
                                    .self$iter$iter.next()
                                  },
                                  reset = function(){
                                    .self$iter$reset()
                                  },
                                  finalize=function(){
                                  }
                                )
)

Section 1: Basic Introduction to MxNet (5)

my_iter = my_iterator_func(iter = NULL, batch_size = 20)
  1. Reset the batch counter (i.e., switch to the next epoch):
my_iter$reset()
  2. Advance to the next batch:
my_iter$iter.next()
## [1] TRUE
  3. Produce a mini-batch of samples:
my_iter$value()
## $data
##           [,1]     [,2]     [,3]     [,4]    [,5]     [,6]     [,7]     [,8]
## [1,]  50.95044 66.22767 67.46526 52.39062 38.6022 83.70883 62.12934 60.54764
## [2,] 112.00000 76.00000 65.00000 93.00000 63.0000 71.00000 62.00000 65.00000
##          [,9]    [,10]     [,11]    [,12]    [,13]    [,14]     [,15]    [,16]
## [1,] 62.38272 55.70469  82.29976 67.57248 36.93145 52.65045  64.91228 64.37901
## [2,] 85.00000 66.00000 109.00000 55.00000 67.00000 65.00000 103.00000 96.00000
##         [,17]     [,18]    [,19]    [,20]
## [1,] 88.57705  83.13152 31.84526 81.80301
## [2,] 97.00000 153.00000 78.00000 60.00000
## 
## $label
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,]  2.2  3.7  2.7  3.8  5.7  5.7    6    3  3.2   3.5   2.1   2.8   3.4   2.7
##      [,15] [,16] [,17] [,18] [,19] [,20]
## [1,]   6.5     3   5.9   2.9   2.1   4.2

Section 1: Basic Introduction to MxNet (6)

– The function mx.symbol.Variable creates a new starting point for the model; its name usually matches one of the outputs of the Iterator, and it can sometimes also be used to name a weight.

– The function mx.symbol.FullyConnected implements a fully connected layer, equivalent to the linear function \(L\) in our earlier MLP; its parameter num.hidden specifies the output dimension.

data = mx.symbol.Variable(name = 'data')
label = mx.symbol.Variable(name = 'label')
weight = mx.symbol.Variable(name = 'weight')
bias = mx.symbol.Variable(name = 'bias')
pred_y = mx.symbol.FullyConnected(data = data, weight = weight, bias = bias,
                                  num.hidden = 1, no.bias = FALSE, name = 'pred_y')
mx.symbol.infer.shape(pred_y, data = c(2, 20))
## $arg.shapes
## $arg.shapes$data
## [1]  2 20
## 
## $arg.shapes$weight
## [1] 2 1
## 
## $arg.shapes$bias
## [1] 1
## 
## 
## $out.shapes
## $out.shapes$pred_y_output
## [1]  1 20
## 
## 
## $aux.shapes
## named list()
residual = mx.symbol.broadcast_minus(lhs = label, rhs = pred_y) 
square_residual = mx.symbol.square(data = residual)
mean_square_residual = mx.symbol.mean(data = square_residual, axis = 0, keepdims = FALSE)
mse_loss = mx.symbol.MakeLoss(data = mean_square_residual, name = 'rmse')
mx.symbol.infer.shape(mse_loss, data = c(2, 20), label = c(1, 20))$out.shapes
## $rmse_output
## [1] 1

Section 1: Basic Introduction to MxNet (7)

my_optimizer = mx.opt.create(name = "adam", learning.rate = 0.001, beta1 = 0.9, beta2 = 0.999,
  epsilon = 1e-08, wd = 0)
my_optimizer = mx.opt.create(name = "sgd", learning.rate = 1e-4, momentum = 0.9, wd = 0)

Section 1: Basic Introduction to MxNet (8)

– Note that num.round here refers to the number of epochs, not batches.

mx.set.seed(0)

lr_model = mx.model.FeedForward.create(symbol = mse_loss, X = my_iter, optimizer = my_optimizer,
                                       array.batch.size = 20, ctx = mx.cpu(), num.round = 200)
lr_model$arg.params
## $weight
##            [,1]
## [1,] 0.05455168
## [2,] 0.01002080
## 
## $bias
## [1] 2.180768
lm(K ~ ., data = subdat)
## 
## Call:
## lm(formula = K ~ ., data = subdat)
## 
## Coefficients:
## (Intercept)          AGE         Rate  
##    2.461510     0.017666     0.002416

Section 1: Basic Introduction to MxNet (9)

my_optimizer = mx.opt.create(name = "sgd", learning.rate = 0.05, momentum = 0.9, wd = 0)

mx.set.seed(0)

lr_model = mx.model.FeedForward.create(symbol = mse_loss, X = my_iter, optimizer = my_optimizer,
                                       array.batch.size = 20, ctx = mx.cpu(), num.round = 10)

lr_model$arg.params
## $weight
##      [,1]
## [1,]  NaN
## [2,]  NaN
## 
## $bias
## [1] NaN

– The gradient formulas explain the divergence: the update for \(b_1\) is scaled by \(x_i\), so with unstandardized inputs taking large raw values, a learning rate of 0.05 makes each step overshoot further than the last:

\[ \begin{align} \frac{\partial}{\partial b_0}loss & = \frac{1}{n} \sum \limits_{i=1}^{n}\left( \hat{y_{i}} - y_{i} \right) \\ \frac{\partial}{\partial b_1}loss & = \frac{1}{n} \sum \limits_{i=1}^{n}\left( \hat{y_{i}} - y_{i} \right) \cdot x_{i} \end{align} \]
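
To see the explosion directly, here is a minimal plain-R sketch (simulated toy data on a raw scale similar to Rate, not the course dataset): the update for \(b_1\) is amplified by roughly the mean of \(x_i^2\) at every step, so the estimates grow geometrically and, run long enough, overflow to Inf/NaN just as the MxNet run above did:

set.seed(0)
x <- rnorm(100, mean = 90, sd = 20)          # raw scale, similar to Rate
y <- 2.5 + 0.002 * x + rnorm(100, sd = 0.5)

b0 <- 0; b1 <- 0; lr <- 0.05

for (i in 1:10) {
  y_hat <- b0 + b1 * x
  grad_b0 <- mean(y_hat - y)                 # gradient formulas from above
  grad_b1 <- mean((y_hat - y) * x)
  b0 <- b0 - lr * grad_b0
  b1 <- b1 - lr * grad_b1
  cat(sprintf("step %2d: b0 = %.3e, b1 = %.3e\n", i, b0, b1))
}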

Section 1: Basic Introduction to MxNet (10)

# Standardize each feature; t() drops the "scaled:" attributes, so extract them before transposing back
scaled_X <- scale(t(X.array))
mean.X <- attr(scaled_X, "scaled:center")
sd.X <- attr(scaled_X, "scaled:scale")
X.array <- t(scaled_X)

my_optimizer = mx.opt.create(name = "sgd", learning.rate = 0.05, momentum = 0.9, wd = 0)

mx.set.seed(0)

lr_model = mx.model.FeedForward.create(symbol = mse_loss, X = my_iter, optimizer = my_optimizer,
                                       array.batch.size = 20, ctx = mx.cpu(), num.round = 20)

lr_model$arg.params
## $weight
##           [,1]
## [1,] 0.7609081
## [2,] 0.1595915
## 
## $bias
## [1] 3.993352
subdat[,'AGE'] <- (subdat[,'AGE'] - mean(subdat[,'AGE'])) / sd(subdat[,'AGE'])
subdat[,'Rate'] <- (subdat[,'Rate'] - mean(subdat[,'Rate'])) / sd(subdat[,'Rate'])

lm(K ~ ., data = subdat)
## 
## Call:
## lm(formula = K ~ ., data = subdat)
## 
## Coefficients:
## (Intercept)          AGE         Rate  
##      3.7810       0.3304       0.0571

Exercise 1: Try implementing logistic regression with MxNet

– Let us standardize right away, to make the results easier to compare.

subdat <- dat[!dat[,'LVD'] %in% NA & !dat[,'AGE'] %in% NA & !dat[,'Rate'] %in% NA, c('LVD', 'AGE', 'Rate')]
subdat[,-1] <- scale(subdat[,-1])

used_dat.x <- model.matrix(~ ., data = subdat[,-1])

X.array <- t(used_dat.x[,-1])
Y.array <- array(subdat[,1], dim = c(1, nrow(subdat)))

– For reference, this is the result from the glm function:

glm(LVD ~ ., data = subdat, family = 'binomial')
## 
## Call:  glm(formula = LVD ~ ., family = "binomial", data = subdat)
## 
## Coefficients:
## (Intercept)          AGE         Rate  
##    -0.20658      0.02904      0.55717  
## 
## Degrees of Freedom: 2111 Total (i.e. Null);  2109 Residual
## Null Deviance:       2907 
## Residual Deviance: 2763  AIC: 2769

\[ \begin{align} log(\frac{p}{1-p}) & = b_{0} + b_{1}x \\ p & = \frac{1}{1+e^{-b_{0} - b_{1}x}} \\ loss & = CE(y, p) = -\frac{1}{n}\sum \limits_{i=1}^{n} \left(y_{i} \cdot log(p_{i}) + (1-y_{i}) \cdot log(1-p_{i})\right) \end{align} \]
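
As a quick numeric check of the cross-entropy formula (toy labels and probabilities, not model output):

y <- c(1, 0, 1)
p <- c(0.9, 0.2, 0.6)
-mean(y * log(p) + (1 - y) * log(1 - p))   # about 0.280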

Exercise 1 Answer (1)

my_iterator_core = function(batch_size) {
  
  batch = 0
  batch_per_epoch = length(Y.array)/batch_size
  
  reset = function() {batch <<- 0}
  
  iter.next = function() {
    batch <<- batch+1
    if (batch > batch_per_epoch) {return(FALSE)} else {return(TRUE)}
  }
  
  value = function() {
    idx = 1:batch_size + (batch - 1) * batch_size
    idx[idx > ncol(X.array)] = sample(1:ncol(X.array), sum(idx > ncol(X.array)))
    data = mx.nd.array(X.array[,idx, drop=FALSE])
    label = mx.nd.array(Y.array[,idx, drop=FALSE])
    return(list(data = data, label = label))
  }
  
  return(list(reset = reset, iter.next = iter.next, value = value, batch_size = batch_size, batch = batch))
}

my_iterator_func <- setRefClass("Custom_Iter",
                                fields = c("iter", "batch_size"),
                                contains = "Rcpp_MXArrayDataIter",
                                methods = list(
                                  initialize = function(iter, batch_size = 100){
                                    .self$iter <- my_iterator_core(batch_size = batch_size)
                                    .self
                                  },
                                  value = function(){
                                    .self$iter$value()
                                  },
                                  iter.next = function(){
                                    .self$iter$iter.next()
                                  },
                                  reset = function(){
                                    .self$iter$reset()
                                  },
                                  finalize=function(){
                                  }
                                )
)

my_iter = my_iterator_func(iter = NULL, batch_size = 100)

Exercise 1 Answer (2)

data = mx.symbol.Variable(name = 'data')
label = mx.symbol.Variable(name = 'label')
linear_pred = mx.symbol.FullyConnected(data = data,
                                       num.hidden = 1, no.bias = FALSE, name = 'linear_pred')
logistic_pred = mx.symbol.sigmoid(data = linear_pred, name = 'logistic_pred')

eps = 1e-8
ce_loss_pos =  mx.symbol.broadcast_mul(mx.symbol.log(logistic_pred + eps), label)
ce_loss_neg =  mx.symbol.broadcast_mul(mx.symbol.log(1 - logistic_pred + eps), 1 - label)
ce_loss_mean = 0 - mx.symbol.mean(ce_loss_pos + ce_loss_neg)
ce_loss = mx.symbol.MakeLoss(ce_loss_mean, name = 'ce_loss')
my_optimizer = mx.opt.create(name = "sgd", learning.rate = 0.05, momentum = 0.9, wd = 0)
mx.set.seed(0)

logistic_model = mx.model.FeedForward.create(symbol = ce_loss, X = my_iter, optimizer = my_optimizer,
                                             array.batch.size = 20, ctx = mx.cpu(), num.round = 20)

logistic_model$arg.params
## $linear_pred_weight
##            [,1]
## [1,] 0.01686738
## [2,] 0.55081975
## 
## $linear_pred_bias
## [1] -0.1971277

Section 2: MxNet's Low-level Executor (1)

– In the exercise above we already wrote the three parts of logistic regression: the Iterator, the Model architecture, and the Optimizer. Now let us write the Executor that ties the three together.

#1. Build an executor to train the model

my_executor = mx.simple.bind(symbol = ce_loss,
                             data = c(2, 100), label = c(1, 100),
                             ctx = mx.cpu(), grad.req = "write")

#2. Set the initial parameters
mx.set.seed(0)

new_arg = mxnet:::mx.model.init.params(symbol = ce_loss,
                                       input.shape = list(data = c(2, 100), label = c(1, 100)),
                                       output.shape = NULL,
                                       initializer = mxnet:::mx.init.uniform(0.01),
                                       ctx = mx.cpu())
mx.exec.update.arg.arrays(my_executor, new_arg$arg.params, match.name = TRUE)
mx.exec.update.aux.arrays(my_executor, new_arg$aux.params, match.name = TRUE)

#3. Define the updater
my_updater = mx.opt.get.updater(optimizer = my_optimizer, weights = my_executor$ref.arg.arrays)

Section 2: MxNet's Low-level Executor (2)

– my_iter$iter.next() keeps returning TRUE until the current epoch is finished, and at the start of each epoch my_iter$reset() must be called so the batch counter starts over. One epoch of training therefore looks like this:

my_iter$reset()

while (my_iter$iter.next()) {
  
  my_values <- my_iter$value()
  mx.exec.update.arg.arrays(my_executor, arg.arrays = my_values, match.name = TRUE)
  mx.exec.forward(my_executor, is.train = TRUE)
  mx.exec.backward(my_executor)
  update_args = my_updater(weight = my_executor$ref.arg.arrays, grad = my_executor$ref.grad.arrays)
  mx.exec.update.arg.arrays(my_executor, update_args, skip.null = TRUE)
  
}
– Wrapping this single-epoch loop in an outer for loop gives the complete training procedure:

set.seed(0)

for (i in 1:20) {
  
  my_iter$reset()
  batch_loss = NULL
  
  while (my_iter$iter.next()) {
    
    my_values <- my_iter$value()
    mx.exec.update.arg.arrays(my_executor, arg.arrays = my_values, match.name = TRUE)
    mx.exec.forward(my_executor, is.train = TRUE)
    mx.exec.backward(my_executor)
    update_args = my_updater(weight = my_executor$ref.arg.arrays, grad = my_executor$ref.grad.arrays)
    mx.exec.update.arg.arrays(my_executor, update_args, skip.null = TRUE)
    batch_loss = c(batch_loss, as.array(my_executor$ref.outputs$ce_loss_output))
    
  }
  
  if (i %% 10 == 0 | i < 10) {
    cat(paste0("epoch = ", i,
               ": Cross-Entropy (CE) = ", formatC(mean(batch_loss), format = "f", 4),
               "; weight = ", paste(formatC(as.array(my_executor$ref.arg.arrays$linear_pred_weight), format = "f", 3), collapse = ", "),
               "; bias = ", formatC(as.array(my_executor$ref.arg.arrays$linear_pred_bias), format = "f", 3), "\n"))
  }
  
}
## epoch = 1: Cross-Entropy (CE) = 0.6556; weight = 0.013, 0.619; bias = -0.219
## epoch = 2: Cross-Entropy (CE) = 0.6544; weight = 0.017, 0.541; bias = -0.191
## epoch = 3: Cross-Entropy (CE) = 0.6548; weight = 0.017, 0.545; bias = -0.197
## epoch = 4: Cross-Entropy (CE) = 0.6547; weight = 0.017, 0.553; bias = -0.198
## epoch = 5: Cross-Entropy (CE) = 0.6547; weight = 0.017, 0.551; bias = -0.197
## epoch = 6: Cross-Entropy (CE) = 0.6547; weight = 0.017, 0.551; bias = -0.197
## epoch = 7: Cross-Entropy (CE) = 0.6547; weight = 0.017, 0.551; bias = -0.197
## epoch = 8: Cross-Entropy (CE) = 0.6547; weight = 0.017, 0.551; bias = -0.197
## epoch = 9: Cross-Entropy (CE) = 0.6547; weight = 0.017, 0.551; bias = -0.197
## epoch = 10: Cross-Entropy (CE) = 0.6547; weight = 0.017, 0.551; bias = -0.197
## epoch = 20: Cross-Entropy (CE) = 0.6547; weight = 0.017, 0.551; bias = -0.197

Section 2: MxNet's Low-level Executor (3)

logistic_model <- mxnet:::mx.model.extract.model(symbol = logistic_pred,
                                                 train.execs = list(my_executor))
mx.model.save(logistic_model, "logistic_regression", iteration = 0)
logistic_model = mx.model.load("logistic_regression", iteration = 0)
predict_Y = predict(logistic_model, X.array, array.layout = "colmajor")
confusion_table = table(predict_Y > 0.5, Y.array)
print(confusion_table)
##        Y.array
##           0   1
##   FALSE 883 545
##   TRUE  279 405

Section 2: MxNet's Low-level Executor (4)

– Since MxNet's low-level executor can run the Forward/Backward procedure, it can of course also run the Forward pass alone.

– Be aware that if your validation set is very large, you may need to feed the samples through a loop in smaller pieces for inference, otherwise you will exhaust memory (a chunked version is sketched at the end of this subsection).

all_layers = logistic_model$symbol$get.internals()

linear_pred_output = all_layers$get.output(which(all_layers$outputs == 'linear_pred_output'))

symbol_group = mx.symbol.Group(c(linear_pred_output, logistic_model$symbol))
Pred_executor = mx.simple.bind(symbol = symbol_group, data = c(2, ncol(X.array)), ctx = mx.cpu())

– With the executor written, the remaining procedure is much the same as before, with only minor differences in the details:

mx.exec.update.arg.arrays(Pred_executor, logistic_model$arg.params, match.name = TRUE)
mx.exec.update.aux.arrays(Pred_executor, logistic_model$aux.params, match.name = TRUE)
mx.exec.update.arg.arrays(Pred_executor, list(data = mx.nd.array(X.array)), match.name = TRUE)
mx.exec.forward(Pred_executor, is.train = FALSE)

PRED_linear_pred = as.array(Pred_executor$ref.outputs$linear_pred_output)
PRED_logistic_pred = as.array(Pred_executor$ref.outputs$logistic_pred_output)
confusion_table = table(PRED_logistic_pred > 0.5, Y.array)
print(confusion_table)
##        Y.array
##           0   1
##   FALSE 883 545
##   TRUE  279 405
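
– If the validation set were too large to bind in one piece, inference can be run in fixed-size chunks instead. This is a minimal sketch reusing symbol_group and logistic_model from above; chunk_size and the padding of the last chunk are illustrative choices:

chunk_size <- 100
n_sample <- ncol(X.array)

# Bind a small fixed-shape executor instead of one covering every sample
Chunk_executor = mx.simple.bind(symbol = symbol_group, data = c(2, chunk_size), ctx = mx.cpu())
mx.exec.update.arg.arrays(Chunk_executor, logistic_model$arg.params, match.name = TRUE)
mx.exec.update.aux.arrays(Chunk_executor, logistic_model$aux.params, match.name = TRUE)

chunk_pred <- numeric(n_sample)

for (start_pos in seq(1, n_sample, by = chunk_size)) {
  idx <- start_pos:min(start_pos + chunk_size - 1, n_sample)
  sub_X <- X.array[, idx, drop = FALSE]
  n_pad <- chunk_size - length(idx)
  # Pad the final chunk so its shape matches the bound executor
  if (n_pad > 0) {sub_X <- cbind(sub_X, sub_X[, rep(1, n_pad), drop = FALSE])}
  mx.exec.update.arg.arrays(Chunk_executor, list(data = mx.nd.array(sub_X)), match.name = TRUE)
  mx.exec.forward(Chunk_executor, is.train = FALSE)
  out <- as.array(Chunk_executor$ref.outputs$logistic_pred_output)
  chunk_pred[idx] <- out[1, seq_along(idx)]
}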

Exercise 2: Use the low-level executor for a multi-class task

– Note in particular that Y needs the special (one-hot) encoding we have already practiced many times; please get familiar with it again (a toy illustration follows the code block below):

subdat <- dat[!dat[,'AMI'] %in% NA & !dat[,'AGE'] %in% NA & !dat[,'Rate'] %in% NA, c('AMI', 'AGE', 'Rate')]
subdat[,-1] <- scale(subdat[,-1])

used_dat.x <- model.matrix(~ ., data = subdat[,-1])

set.seed(0)
all_idx <- 1:nrow(subdat)

train_idx <- sample(all_idx, nrow(subdat) * 0.6)
valid_idx <- sample(all_idx[!all_idx %in% train_idx], nrow(subdat) * 0.2)
test_idx <- all_idx[!all_idx %in% c(train_idx, valid_idx)]

X_mat <- t(used_dat.x[,-1])
Y_mat <- array(t(model.matrix(~ -1 + subdat[,1])), dim = c(3, nrow(subdat)))

train_X <- X_mat[,train_idx]
valid_X <- X_mat[,valid_idx]
test_X <- X_mat[,test_idx]

train_Y <- Y_mat[,train_idx]
valid_Y <- Y_mat[,valid_idx]
test_Y <- Y_mat[,test_idx]
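
For reference, here is a toy illustration (hypothetical class labels) of the one-hot layout that model.matrix builds above; after transposing, classes become rows and samples become columns:

toy_y <- factor(c('A', 'B', 'C', 'A'))
t(model.matrix(~ -1 + toy_y))   # 3 rows (classes) x 4 columns (samples)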

\[ \begin{align} mlogloss(y, p) & = - \sum \limits_{j=1}^{m} y_{j} \cdot log(p_{j}) \end{align} \]
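
And a quick numeric check of this formula (a toy one-hot label with a toy softmax output):

y <- c(0, 1, 0)
p <- c(0.2, 0.7, 0.1)
-sum(y * log(p))   # equals -log(0.7), about 0.357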

Exercise 2 Answer (1)

– For the Iterator, note in particular that it must now draw from train_X:

my_iterator_core = function(batch_size) {
  
  batch = 0
  batch_per_epoch = ncol(train_X)/batch_size
  
  reset = function() {batch <<- 0}
  
  iter.next = function() {
    batch <<- batch+1
    if (batch > batch_per_epoch) {return(FALSE)} else {return(TRUE)}
  }
  
  value = function() {
    idx = 1:batch_size + (batch - 1) * batch_size
    idx[idx > ncol(train_Y)] = sample(1:ncol(train_Y), sum(idx > ncol(train_Y)))
    data = mx.nd.array(train_X[,idx, drop=FALSE])
    label = mx.nd.array(train_Y[,idx, drop=FALSE])
    return(list(data = data, label = label))
  }
  
  return(list(reset = reset, iter.next = iter.next, value = value, batch_size = batch_size, batch = batch))
}

my_iterator_func <- setRefClass("Custom_Iter",
                                fields = c("iter", "batch_size"),
                                contains = "Rcpp_MXArrayDataIter",
                                methods = list(
                                  initialize = function(iter, batch_size = 100){
                                    .self$iter <- my_iterator_core(batch_size = batch_size)
                                    .self
                                  },
                                  value = function(){
                                    .self$iter$value()
                                  },
                                  iter.next = function(){
                                    .self$iter$iter.next()
                                  },
                                  reset = function(){
                                    .self$iter$reset()
                                  },
                                  finalize=function(){
                                  }
                                )
)

my_iter = my_iterator_func(iter = NULL, batch_size = 20)

Exercise 2 Answer (2)

data = mx.symbol.Variable(name = 'data')
label = mx.symbol.Variable(name = 'label')
linear_pred = mx.symbol.FullyConnected(data = data, num.hidden = 3, name = 'linear_pred')
softmax_layer = mx.symbol.softmax(data = linear_pred, axis = 1, name = 'softmax_layer')

eps = 1e-8
m_log = 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label))
m_logloss = mx.symbol.MakeLoss(m_log, name = 'm_logloss')
my_optimizer = mx.opt.create(name = "sgd", learning.rate = 0.05, momentum = 0.9, wd = 0)

Exercise 2 Answer (3)

#1. Build an executor to train the model

my_executor = mx.simple.bind(symbol = m_logloss,
                             data = c(2, 20), label = c(3, 20),
                             ctx = mx.cpu(), grad.req = "write")

#2. Set the initial parameters
mx.set.seed(0)

new_arg = mxnet:::mx.model.init.params(symbol = m_logloss,
                                       input.shape = list(data = c(2, 20), label = c(3, 20)),
                                       output.shape = NULL,
                                       initializer = mxnet:::mx.init.uniform(0.01),
                                       ctx = mx.cpu())
mx.exec.update.arg.arrays(my_executor, new_arg$arg.params, match.name = TRUE)
mx.exec.update.aux.arrays(my_executor, new_arg$aux.params, match.name = TRUE)

#3. Define the updater
my_updater = mx.opt.get.updater(optimizer = my_optimizer, weights = my_executor$ref.arg.arrays)
set.seed(0)

for (i in 1:20) {
  
  my_iter$reset()
  batch_loss = NULL
  
  while (my_iter$iter.next()) {
    
    my_values <- my_iter$value()
    mx.exec.update.arg.arrays(my_executor, arg.arrays = my_values, match.name = TRUE)
    mx.exec.forward(my_executor, is.train = TRUE)
    mx.exec.backward(my_executor)
    update_args = my_updater(weight = my_executor$ref.arg.arrays, grad = my_executor$ref.grad.arrays)
    mx.exec.update.arg.arrays(my_executor, update_args, skip.null = TRUE)
    batch_loss = c(batch_loss, as.array(my_executor$ref.outputs$m_logloss_output))
    
  }
  
  if (i %% 10 == 0 | i <= 5) {
    cat(paste0("epoch = ", i, ": m-logloss = ", formatC(mean(batch_loss), format = "f", 4), "\n"))
  }
  
}
## epoch = 1: m-logloss = 0.2864
## epoch = 2: m-logloss = 0.2531
## epoch = 3: m-logloss = 0.2528
## epoch = 4: m-logloss = 0.2527
## epoch = 5: m-logloss = 0.2527
## epoch = 10: m-logloss = 0.2526
## epoch = 20: m-logloss = 0.2526

Exercise 2 Answer (4)

softmax_model <- mxnet:::mx.model.extract.model(symbol = softmax_layer,
                                                train.execs = list(my_executor))

predict_Y = predict(softmax_model, valid_X, array.layout = "colmajor")
confusion_table = table(max.col(t(predict_Y)), max.col(t(valid_Y)))
print(confusion_table)
##    
##       1   2   3
##   1 207  32  57

– Only the first row appears because the model predicts class 1 for every sample; the classes are heavily imbalanced, which Section 3 will address with oversampling.
library(pROC)

par(mfrow = c(1, 2))

roc_valid <- roc((max.col(t(valid_Y)) == 2) ~ predict_Y[2,])
plot(roc_valid, col = 'blue')

roc_valid <- roc((max.col(t(valid_Y)) == 1) ~ predict_Y[1,])
plot(roc_valid, col = 'blue')

Section 3: Advanced Applications of MxNet (1)

– To perform oversampling, we must modify the Iterator so that each mini-batch draws roughly equal numbers from each class:

my_iterator_core = function(batch_size) {
  
  batch = 0
  batch_per_epoch = ncol(train_X)/batch_size
  
  reset = function() {batch <<- 0}
  
  iter.next = function() {
    batch <<- batch+1
    if (batch > batch_per_epoch) {return(FALSE)} else {return(TRUE)}
  }
  
  value = function() {
    # Draw roughly equal numbers of samples from each of the three classes
    idx.1 <- sample(which(max.col(t(train_Y)) == 1), floor(batch_size / 3)) 
    idx.2 <- sample(which(max.col(t(train_Y)) == 2), floor(batch_size / 3)) 
    idx.3 <- sample(which(max.col(t(train_Y)) == 3), batch_size - length(idx.1) - length(idx.2)) 
    idx = c(idx.1, idx.2, idx.3)
    data = mx.nd.array(train_X[,idx, drop=FALSE])
    label = mx.nd.array(train_Y[,idx, drop=FALSE])
    return(list(data = data, label = label))
  }
  
  return(list(reset = reset, iter.next = iter.next, value = value, batch_size = batch_size, batch = batch))
}

my_iterator_func <- setRefClass("Custom_Iter",
                                fields = c("iter", "batch_size"),
                                contains = "Rcpp_MXArrayDataIter",
                                methods = list(
                                  initialize = function(iter, batch_size = 100){
                                    .self$iter <- my_iterator_core(batch_size = batch_size)
                                    .self
                                  },
                                  value = function(){
                                    .self$iter$value()
                                  },
                                  iter.next = function(){
                                    .self$iter$iter.next()
                                  },
                                  reset = function(){
                                    .self$iter$reset()
                                  },
                                  finalize=function(){
                                  }
                                )
)

my_iter = my_iterator_func(iter = NULL, batch_size = 20)

Section 3: Advanced Applications of MxNet (2)

#1. Build an executor to train the model

my_executor = mx.simple.bind(symbol = m_logloss,
                             data = c(2, 20), label = c(3, 20),
                             ctx = mx.cpu(), grad.req = "write")

#2. Set the initial parameters
mx.set.seed(0)

new_arg = mxnet:::mx.model.init.params(symbol = m_logloss,
                                       input.shape = list(data = c(2, 20), label = c(3, 20)),
                                       output.shape = NULL,
                                       initializer = mxnet:::mx.init.uniform(0.01),
                                       ctx = mx.cpu())
mx.exec.update.arg.arrays(my_executor, new_arg$arg.params, match.name = TRUE)
mx.exec.update.aux.arrays(my_executor, new_arg$aux.params, match.name = TRUE)

#3. Define the updater
my_updater = mx.opt.get.updater(optimizer = my_optimizer, weights = my_executor$ref.arg.arrays)

#4. Forward/Backward
set.seed(0)

for (i in 1:20) {
  
  my_iter$reset()
  batch_loss = NULL
  
  while (my_iter$iter.next()) {
    
    my_values <- my_iter$value()
    mx.exec.update.arg.arrays(my_executor, arg.arrays = my_values, match.name = TRUE)
    mx.exec.forward(my_executor, is.train = TRUE)
    mx.exec.backward(my_executor)
    update_args = my_updater(weight = my_executor$ref.arg.arrays, grad = my_executor$ref.grad.arrays)
    mx.exec.update.arg.arrays(my_executor, update_args, skip.null = TRUE)
    batch_loss = c(batch_loss, as.array(my_executor$ref.outputs$m_logloss_output))
    
  }
  
  if (i %% 10 == 0 | i <= 5) {
    cat(paste0("epoch = ", i, ": m-logloss = ", formatC(mean(batch_loss), format = "f", 4), "\n"))
  }
  
}
## epoch = 1: m-logloss = 0.3618
## epoch = 2: m-logloss = 0.3575
## epoch = 3: m-logloss = 0.3593
## epoch = 4: m-logloss = 0.3611
## epoch = 5: m-logloss = 0.3600
## epoch = 10: m-logloss = 0.3569
## epoch = 20: m-logloss = 0.3579
softmax_model <- mxnet:::mx.model.extract.model(symbol = softmax_layer,
                                                train.execs = list(my_executor))

predict_Y = predict(softmax_model, valid_X, array.layout = "colmajor")
confusion_table = table(max.col(t(predict_Y)), max.col(t(valid_Y)))
print(confusion_table)
##    
##       1   2   3
##   1  52   3   4
##   2  24   4   6
##   3 131  25  47

Section 3: Advanced Applications of MxNet (3)

– Let us try turning the softmax regression into a multilayer perceptron with one hidden layer. Only the Model architecture needs to be revised; nothing else has to change:

data = mx.symbol.Variable(name = 'data')
label = mx.symbol.Variable(name = 'label')
fc1 = mx.symbol.FullyConnected(data = data, num.hidden = 10, name = 'fc1')
relu1 = mx.symbol.Activation(data = fc1, act.type = 'relu', name = 'relu1')
fc2 = mx.symbol.FullyConnected(data = relu1, num.hidden = 3, name = 'fc2')
softmax_layer = mx.symbol.softmax(data = fc2, axis = 1, name = 'softmax_layer')

eps = 1e-8
m_log = 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label))
m_logloss = mx.symbol.MakeLoss(m_log, name = 'm_logloss')

Section 3: Advanced Applications of MxNet (4)

#1. Build an executor to train the model

my_executor = mx.simple.bind(symbol = m_logloss,
                             data = c(2, 20), label = c(3, 20),
                             ctx = mx.cpu(), grad.req = "write")

#2. Set the initial parameters
mx.set.seed(0)

new_arg = mxnet:::mx.model.init.params(symbol = m_logloss,
                                       input.shape = list(data = c(2, 20), label = c(3, 20)),
                                       output.shape = NULL,
                                       initializer = mxnet:::mx.init.uniform(0.01),
                                       ctx = mx.cpu())
mx.exec.update.arg.arrays(my_executor, new_arg$arg.params, match.name = TRUE)
mx.exec.update.aux.arrays(my_executor, new_arg$aux.params, match.name = TRUE)

#3. Define the updater
my_updater = mx.opt.get.updater(optimizer = my_optimizer, weights = my_executor$ref.arg.arrays)

#4. Forward/Backward
set.seed(0)

for (i in 1:20) {
  
  my_iter$reset()
  batch_loss = NULL
  
  while (my_iter$iter.next()) {
    
    my_values <- my_iter$value()
    mx.exec.update.arg.arrays(my_executor, arg.arrays = my_values, match.name = TRUE)
    mx.exec.forward(my_executor, is.train = TRUE)
    mx.exec.backward(my_executor)
    update_args = my_updater(weight = my_executor$ref.arg.arrays, grad = my_executor$ref.grad.arrays)
    mx.exec.update.arg.arrays(my_executor, update_args, skip.null = TRUE)
    batch_loss = c(batch_loss, as.array(my_executor$ref.outputs$m_logloss_output))
    
  }
  
  if (i %% 10 == 0 | i <= 5) {
    cat(paste0("epoch = ", i, ": m-logloss = ", formatC(mean(batch_loss), format = "f", 4), "\n"))
  }
  
}
## epoch = 1: m-logloss = 0.3639
## epoch = 2: m-logloss = 0.3630
## epoch = 3: m-logloss = 0.3630
## epoch = 4: m-logloss = 0.3629
## epoch = 5: m-logloss = 0.3629
## epoch = 10: m-logloss = 0.3605
## epoch = 20: m-logloss = 0.3454
softmax_model <- mxnet:::mx.model.extract.model(symbol = softmax_layer,
                                                train.execs = list(my_executor))

predict_Y = predict(softmax_model, valid_X, array.layout = "colmajor")
confusion_table = table(max.col(t(predict_Y)), max.col(t(valid_Y)))
print(confusion_table)
##    
##       1   2   3
##   1  88   7   8
##   2   2   0   2
##   3 117  25  47

Section 4: Building a Neural Network for Survival Analysis (1)

subdat <- dat[!dat[,'time'] %in% NA & !dat[,'death'] %in% NA & !dat[,'AGE'] %in% NA & !dat[,'Rate'] %in% NA, c('time', 'death', 'AGE', 'Rate')]
subdat[,-1:-2] <- scale(subdat[,-1:-2])

used_dat.x <- model.matrix(~ ., data = subdat[,-1:-2])

set.seed(0)
all_idx <- 1:nrow(subdat)

train_idx <- sample(all_idx, nrow(subdat) * 0.6)
valid_idx <- sample(all_idx[!all_idx %in% train_idx], nrow(subdat) * 0.2)
test_idx <- all_idx[!all_idx %in% c(train_idx, valid_idx)]

X_mat <- t(used_dat.x[,-1])

train_X <- X_mat[,train_idx]
valid_X <- X_mat[,valid_idx]
test_X <- X_mat[,test_idx]

train_Y <- subdat[train_idx,1:2]
valid_Y <- subdat[valid_idx,1:2]
test_Y <- subdat[test_idx,1:2]
library(survival)

coxph(Surv(time, death) ~ ., data = subdat[train_idx,])
## Call:
## coxph(formula = Surv(time, death) ~ ., data = subdat[train_idx, 
##     ])
## 
##         coef exp(coef) se(coef)     z       p
## AGE  0.68026   1.97439  0.07475 9.101 < 2e-16
## Rate 0.34593   1.41330  0.04715 7.336 2.2e-13
## 
## Likelihood ratio test=144.2  on 2 df, p=< 2.2e-16
## n= 2800, number of events= 267

Section 4: Building a Neural Network for Survival Analysis (2)

survival_encode <- function (event, time) {
  
  sub_dat <- data.frame(id = 1:length(event), event = event, time = time, stringsAsFactors = FALSE)
  
  # Sort subjects by follow-up time, placing events before censored records at ties
  sub_dat <- sub_dat[order(sub_dat[,'event'], decreasing = TRUE),]
  sub_dat <- sub_dat[order(sub_dat[,'time']),]
  
  label <- array(0L, dim = rep(nrow(sub_dat), 2))
  mask <- array(0L, dim = rep(nrow(sub_dat), 2))
  
  # Row m of 'label' marks the subject whose record defines time point m
  diag(label) <- 1L
  
  for (m in 1:nrow(sub_dat)) {
    
    # Subjects from position m onward are still at risk at time point m
    mask[m,m:nrow(sub_dat)] <- 1L
    
    if (sub_dat[m,'event'] == 0) {
      
      # Censored records contribute no risk set of their own
      mask[m,] <- 0L
      
    } else if (m > 1) {
      
      # For tied times, drop subject m from the risk sets of earlier events at the same time
      zero_pos <- which(sub_dat[1:(m-1),'time'] == sub_dat[m,'time'])
      
      if (length(zero_pos) > 0) {
        
        mask[zero_pos,m] <- 0L
        
      }
      
    }
    
  }
  
  # Restore the original (subject) column order
  original_pos <- order(sub_dat[,'id'])
  
  return(list(label = label[,original_pos,drop = FALSE], mask = mask[,original_pos,drop = FALSE]))
  
}
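
As a quick sanity check, here is a small illustrative call (toy event and time vectors, not the course data). Rows of both matrices follow the time ordering: label marks which subject defines each time point, mask marks the subjects still at risk at each event time, and the rows of censored records are entirely zero:

toy <- survival_encode(event = c(1, 0, 1), time = c(5, 3, 7))
toy$label   # each subject's own position in the time ordering
toy$mask    # risk sets; the censored subject's row is all zero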

Section 4: Building a Neural Network for Survival Analysis (3)

my_iterator_core = function(batch_size) {
  
  batch = 0
  batch_per_epoch = ncol(train_X)/batch_size
  
  reset = function() {batch <<- 0}
  
  iter.next = function() {
    batch <<- batch+1
    if (batch > batch_per_epoch) {return(FALSE)} else {return(TRUE)}
  }
  
  value = function() {
    idx = 1:batch_size + (batch - 1) * batch_size
    idx[idx > ncol(train_X)] = sample(1:ncol(train_X), sum(idx > ncol(train_X)))
    data = mx.nd.array(train_X[,idx, drop=FALSE])
    label_list = survival_encode(event = train_Y[idx,'death'], time = train_Y[idx,'time'])
    return(list(data = data, label = mx.nd.array(label_list$label), mask = mx.nd.array(label_list$mask)))
  }
  
  return(list(reset = reset, iter.next = iter.next, value = value, batch_size = batch_size, batch = batch))
}

my_iterator_func <- setRefClass("Custom_Iter",
                                fields = c("iter", "batch_size"),
                                contains = "Rcpp_MXArrayDataIter",
                                methods = list(
                                  initialize = function(iter, batch_size = 100){
                                    .self$iter <- my_iterator_core(batch_size = batch_size)
                                    .self
                                  },
                                  value = function(){
                                    .self$iter$value()
                                  },
                                  iter.next = function(){
                                    .self$iter$iter.next()
                                  },
                                  reset = function(){
                                    .self$iter$reset()
                                  },
                                  finalize=function(){
                                  }
                                )
)

my_iter = my_iterator_func(iter = NULL, batch_size = 500)

Section 4: Building a Neural Network for Survival Analysis (4)

– The loss below is the negative Cox partial log-likelihood, where \(H\) is the network output (the linear predictor), \(Y\) is the label matrix, \(M\) is the mask matrix, and \(\otimes\) denotes element-wise multiplication (the code below additionally averages over the number of events):

\[ \begin{align} loss & = \sum \limits_{\text{by column}} log( \sum \limits_{\text{by row}} M \otimes exp(H) ) - \sum \limits_{\text{by column}} log( \sum \limits_{\text{by row}} M \otimes Y \otimes exp(H) ) \end{align} \]

data = mx.symbol.Variable(name = 'data')
final_pred = mx.symbol.FullyConnected(data = data, num.hidden = 1, no.bias = TRUE, name = 'final_pred')

COX_LOSS <- function (indata, inlabel, inmask, eps = 1e-8, make_loss = TRUE) {
  
  exp_data <- mx.symbol.exp(data = indata, name = 'exp_data')
  
  part_event_y <-  mx.symbol.broadcast_mul(lhs = inlabel, rhs = inmask, name = 'part_event_y')
  
  part_event <-  mx.symbol.broadcast_mul(lhs = part_event_y, rhs = exp_data, name = 'part_event')
  
  part_at_risk <-  mx.symbol.broadcast_mul(lhs = inmask, rhs = exp_data, name = 'part_at_risk')
  
  sum_part_event <- mx.symbol.sum(data = part_event, axis = 0, keepdims = TRUE, name = 'sum_part_event')
  
  sum_part_at_risk <- mx.symbol.sum(data = part_at_risk, axis = 0, keepdims = TRUE, name = 'sum_part_at_risk')
  
  log_part_event <- mx.symbol.log(data = sum_part_event + eps, name = 'log_part_event')
  
  log_part_at_risk <- mx.symbol.log(data = sum_part_at_risk + eps, name = 'log_part_at_risk')
  
  diff_at_risk_event <- mx.symbol.broadcast_minus(lhs = log_part_at_risk, rhs = log_part_event, name = 'diff_at_risk_event')
  
  important_time <- mx.symbol.sum(data = part_event_y, axis = 0, keepdims = TRUE, name = 'important_time')
  
  number_time <- mx.symbol.sum(data = important_time, axis = 1, name = 'number_time')
  
  event_loss <-  mx.symbol.broadcast_mul(lhs = diff_at_risk_event, rhs = important_time, name = 'event_loss')
  
  sum_event_loss <- mx.symbol.sum(data = event_loss, axis = 1, name = 'sum_event_loss')
  
  final_loss <- mx.symbol.broadcast_div(lhs = sum_event_loss, rhs = number_time, name = 'final_loss')
  
  if (make_loss) {final_loss <- mx.symbol.MakeLoss(data = final_loss, name = 'COX_loss')}
  
  return(final_loss)
  
}

label_sym <- mx.symbol.Variable('label')
mask_sym <- mx.symbol.Variable('mask')

loss_symbol <- COX_LOSS(indata = final_pred, inlabel = label_sym, inmask = mask_sym)

Section 4: Building a Neural Network for Survival Analysis (5)

#1. Build an executor to train the model

my_executor = mx.simple.bind(symbol = loss_symbol,
                             data = c(2, 500), label = c(500, 500), mask = c(500, 500),
                             ctx = mx.cpu(), grad.req = "write")

#2. Set the initial parameters
mx.set.seed(0)

new_arg = mxnet:::mx.model.init.params(symbol = loss_symbol,
                                       input.shape = list(data = c(2, 500), label = c(500, 500), mask = c(500, 500)),
                                       output.shape = NULL,
                                       initializer = mxnet:::mx.init.uniform(0.01),
                                       ctx = mx.cpu())
mx.exec.update.arg.arrays(my_executor, new_arg$arg.params, match.name = TRUE)
mx.exec.update.aux.arrays(my_executor, new_arg$aux.params, match.name = TRUE)

#3. Define the updater
my_optimizer = mx.opt.create(name = "sgd", learning.rate = 0.05, momentum = 0.9, wd = 0)
my_updater = mx.opt.get.updater(optimizer = my_optimizer, weights = my_executor$ref.arg.arrays)

#4. Forward/Backward
set.seed(0)

for (i in 1:20) {
  
  my_iter$reset()
  batch_loss = NULL
  
  while (my_iter$iter.next()) {
    
    my_values <- my_iter$value()
    mx.exec.update.arg.arrays(my_executor, arg.arrays = my_values, match.name = TRUE)
    mx.exec.forward(my_executor, is.train = TRUE)
    mx.exec.backward(my_executor)
    update_args = my_updater(weight = my_executor$ref.arg.arrays, grad = my_executor$ref.grad.arrays)
    mx.exec.update.arg.arrays(my_executor, update_args, skip.null = TRUE)
    batch_loss = c(batch_loss, as.array(my_executor$ref.outputs$COX_loss_output))
    
  }
  
  if (i %% 10 == 0 | i <= 5) {
    cat(paste0("epoch = ", i, ": loss = ", formatC(mean(batch_loss), format = "f", 4), "\n"))
  }
  
}
## epoch = 1: loss = 5.5123
## epoch = 2: loss = 5.3611
## epoch = 3: loss = 5.3609
## epoch = 4: loss = 5.3483
## epoch = 5: loss = 5.3471
## epoch = 10: loss = 5.3277
## epoch = 20: loss = 5.3253
my_executor$ref.arg.arrays$final_pred_weight
##           [,1]
## [1,] 0.6448541
## [2,] 0.3448658

– These weights are close to the coxph coefficients fitted earlier (0.680 for AGE and 0.346 for Rate).

Exercise 3: A Survival-Analysis Neural Network

– This time we use a smaller batch size, which speeds things up considerably (the label and mask matrices grow with the square of the batch size):

my_iter = my_iterator_func(iter = NULL, batch_size = 50)

Exercise 3 Answer (1)

data = mx.symbol.Variable(name = 'data')
fc1 = mx.symbol.FullyConnected(data = data, num.hidden = 10, name = 'fc1')
relu1 = mx.symbol.Activation(data = fc1, act.type = 'relu', name = 'relu1')
final_pred = mx.symbol.FullyConnected(data = relu1, num.hidden = 1, no.bias = TRUE, name = 'final_pred')

label_sym <- mx.symbol.Variable('label')
mask_sym <- mx.symbol.Variable('mask')

loss_symbol <- COX_LOSS(indata = final_pred, inlabel = label_sym, inmask = mask_sym)
#1. Build an executor to train the model

my_executor = mx.simple.bind(symbol = loss_symbol,
                             data = c(2, 50), label = c(50, 50), mask = c(50, 50),
                             ctx = mx.cpu(), grad.req = "write")

#2. Set the initial parameters
mx.set.seed(0)

new_arg = mxnet:::mx.model.init.params(symbol = loss_symbol,
                                       input.shape = list(data = c(2, 50), label = c(50, 50), mask = c(50, 50)),
                                       output.shape = NULL,
                                       initializer = mxnet:::mx.init.uniform(0.01),
                                       ctx = mx.cpu())
mx.exec.update.arg.arrays(my_executor, new_arg$arg.params, match.name = TRUE)
mx.exec.update.aux.arrays(my_executor, new_arg$aux.params, match.name = TRUE)

#3. Define the updater
my_optimizer = mx.opt.create(name = "sgd", learning.rate = 0.05, momentum = 0.9, wd = 0)
my_updater = mx.opt.get.updater(optimizer = my_optimizer, weights = my_executor$ref.arg.arrays)

#4. Forward/Backward
set.seed(0)

for (i in 1:20) {
  
  my_iter$reset()
  batch_loss = NULL
  
  while (my_iter$iter.next()) {
    
    my_values <- my_iter$value()
    mx.exec.update.arg.arrays(my_executor, arg.arrays = my_values, match.name = TRUE)
    mx.exec.forward(my_executor, is.train = TRUE)
    mx.exec.backward(my_executor)
    update_args = my_updater(weight = my_executor$ref.arg.arrays, grad = my_executor$ref.grad.arrays)
    mx.exec.update.arg.arrays(my_executor, update_args, skip.null = TRUE)
    batch_loss = c(batch_loss, as.array(my_executor$ref.outputs$COX_loss_output))
    
  }
  
  if (i %% 10 == 0 | i <= 5) {
    cat(paste0("epoch = ", i, ": loss = ", formatC(mean(batch_loss), format = "f", 4), "\n"))
  }
  
}
## epoch = 1: loss = 3.2296
## epoch = 2: loss = 3.1440
## epoch = 3: loss = 3.2010
## epoch = 4: loss = 3.1345
## epoch = 5: loss = 3.1355
## epoch = 10: loss = 3.1025
## epoch = 20: loss = 3.1012

Exercise 3 Answer (2)

– Note that we must drop the mask parameter so that prediction can proceed:

COX_model <- mxnet:::mx.model.extract.model(symbol = final_pred,
                                            train.execs = list(my_executor))

COX_model$arg.params <- COX_model$arg.params[!names(COX_model$arg.params) %in% 'mask']

predict_Y <- predict(COX_model, valid_X, array.layout = "colmajor")

valid_Y[,'score'] <- predict_Y[1,]
concordance(coxph(Surv(time, death) ~ score, data = valid_Y))[['concordance']]
## [1] 0.7538015
cox_fit <- coxph(Surv(time, death) ~ ., data = subdat[train_idx,])
valid_Y[,'score'] <- predict(cox_fit, subdat[valid_idx,])
concordance(coxph(Surv(time, death) ~ score, data = valid_Y))[['concordance']]
## [1] 0.7478008

Closing Remarks

– For example, the iterator allows us to apply all kinds of preprocessing before the data are handed to gradient descent.

– The low-level executor, driven by loops we write ourselves, lets us perform very fine-grained operations.

– Moreover, we can finally invoke the GPU for computation.
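
For instance, provided the GPU build of MxNet is installed and device 0 exists (an assumption), moving the computation onto the GPU only requires changing the context when binding the executor; a minimal sketch:

# Identical to the executor above, but bound to GPU device 0 instead of the CPU
my_executor = mx.simple.bind(symbol = loss_symbol,
                             data = c(2, 50), label = c(50, 50), mask = c(50, 50),
                             ctx = mx.gpu(0), grad.req = "write")

The same ctx argument also applies to mx.model.FeedForward.create.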