深度學習理論與實務

林嶔 (Lin, Chin)

Lesson 5 解決梯度消失問題

第一節:深度神經網路訓練(1)

– 透過MxNet,讓我們可以簡單的寫出非常複雜的架構,並且看到裡面每一層的梯度。

library(mxnet)

# Train `loss_symbol` with a custom iterator and plot per-layer gradient size.
#
# Arguments:
#   Iterator    - object exposing reset(), iter.next() and value(). value() is
#                 used both to infer input shapes (via dim()) and to feed the
#                 executor by name; the last dimension of its first element is
#                 taken as the batch size.
#   loss_symbol - symbol bound to the executor and optimised.
#   pred_symbol - symbol stored in the returned model.
#   Optimizer   - optimizer object handed to mx.opt.get.updater().
#   num_round   - number of passes (epochs) over the iterator.
#
# Returns the object produced by mxnet:::mx.model.extract.model().
#
# Side effects: sets the R and MXNet seeds to 0, emits progress messages, and
# (when at least one epoch ran) draws a log-scale plot of the mean absolute
# gradient of every parameter whose name contains "weight".
my.model.FeedForward.create <- function (Iterator, 
                                         loss_symbol, pred_symbol,
                                         Optimizer, num_round = 100) {
  
  # library() stops immediately when abind is missing; require() would only
  # return FALSE and let the failure surface later at the abind() call.
  library(abind)
  
  #0. Check data shape
  Iterator$reset()
  Iterator$iter.next()
  first_batch <- Iterator$value()
  input_shape <- lapply(first_batch, dim)
  batch_size <- tail(input_shape[[1]], 1)
  
  #1. Build an executor to train model
  exec_list <- c(list(symbol = loss_symbol, ctx = mx.cpu(), grad.req = "write"),
                 input_shape)
  my_executor <- do.call(mx.simple.bind, exec_list)
  
  #2. Set the initial parameters
  mx.set.seed(0)
  new_arg <- mxnet:::mx.model.init.params(symbol = loss_symbol,
                                          input.shape = input_shape,
                                          output.shape = NULL,
                                          initializer = mxnet:::mx.init.uniform(0.01),
                                          ctx = mx.cpu())
  mx.exec.update.arg.arrays(my_executor, new_arg$arg.params, match.name = TRUE)
  mx.exec.update.aux.arrays(my_executor, new_arg$aux.params, match.name = TRUE)
  
  #3. Define the updater
  my_updater <- mx.opt.get.updater(optimizer = Optimizer, weights = my_executor$ref.arg.arrays)
  
  #4. Forward/Backward
  message('Start training:')
  
  set.seed(0)
  # One slot per epoch; bound together once after training instead of growing
  # a matrix with rbind() on every iteration.
  epoch_grad_list <- vector("list", num_round)
  
  # seq_len() runs zero iterations for num_round = 0 (1:0 would run twice).
  for (i in seq_len(num_round)) {
    
    Iterator$reset()
    batch_loss <- list()
    batch_grad <- list()
    batch_seq <- 0
    t0 <- Sys.time()
    
    while (Iterator$iter.next()) {
      
      my_values <- Iterator$value()
      mx.exec.update.arg.arrays(my_executor, arg.arrays = my_values, match.name = TRUE)
      mx.exec.forward(my_executor, is.train = TRUE)
      mx.exec.backward(my_executor)
      update_args <- my_updater(weight = my_executor$ref.arg.arrays, grad = my_executor$ref.grad.arrays)
      mx.exec.update.arg.arrays(my_executor, update_args, skip.null = TRUE)
      batch_loss[[length(batch_loss) + 1]] <- as.array(my_executor$ref.outputs[[1]])
      # lapply() always yields a list (sapply() switches type with its input);
      # entries without a gradient stay NULL and are dropped by unlist().
      grad_list <- lapply(my_executor$ref.grad.arrays, function (x) {
        if (!is.null(x)) {mean(abs(as.array(x)))}
      })
      grad_list <- unlist(grad_list[grepl('weight', names(grad_list), fixed = TRUE)])
      batch_grad[[length(batch_grad) + 1]] <- grad_list
      batch_seq <- batch_seq + 1
      
    }
    
    if (length(batch_grad) == 0) {
      stop("Iterator produced no batches in epoch ", i, call. = FALSE)
    }
    
    # `||` because the condition is a scalar.
    if (i %% 10 == 0 || i <= 5) {
      elapsed <- as.numeric(Sys.time() - t0, units = 'secs')
      # `digits` is named explicitly: the second positional argument of
      # formatC() is `width`, so a bare 4 / 2 would not set the precision.
      message(paste0("epoch = ", i,
                     ": loss = ", formatC(mean(unlist(batch_loss)), format = "f", digits = 4),
                     " (Speed: ", formatC(batch_seq * batch_size / elapsed, format = "f", digits = 2), " sample/secs)"))
    }
    
    # Mean over the batches of this epoch, one value per weight parameter.
    epoch_grad_list[[i]] <- apply(abind(batch_grad, along = 2), 1, mean)

  }
  
  epoch_grad <- do.call(rbind, epoch_grad_list)
  
  if (!is.null(epoch_grad)) {
    
    # Floor tiny values so the log-scale axis stays finite.
    epoch_grad[epoch_grad < 1e-8] <- 1e-8
    
    n_layer <- ncol(epoch_grad)
    COL <- rainbow(n_layer)
    # Random multiplicative jitter so overlapping curves remain distinguishable.
    random_pos <- 2^runif(n_layer, -0.5, 0.5)
    
    plot(epoch_grad[,1] * random_pos[1], type = 'l', col = COL[1],
         xlab = 'epoch', ylab = 'mean of abs(grad)', log = 'y',
         ylim = range(epoch_grad))
    
    # seq_len(n)[-1] is empty when there is a single weight layer, whereas
    # 2:1 would index a non-existent second column.
    for (j in seq_len(n_layer)[-1]) {
      lines(seq_len(nrow(epoch_grad)), epoch_grad[,j] * random_pos[j], col = COL[j])
    }
    
    # Label curves with the real parameter names; fall back to the generic
    # fcN_weight labels only when no names are available.
    layer_names <- colnames(epoch_grad)
    if (is.null(layer_names)) {
      layer_names <- paste0('fc', seq_len(n_layer), '_weight')
    }
    legend('bottomright', layer_names, col = COL, lwd = 1)
    
  }
  
  #5. Get model
  my_model <- mxnet:::mx.model.extract.model(symbol = pred_symbol,
                                             train.execs = list(my_executor))
  
  return(my_model)
  
}

第一節:深度神經網路訓練(2)

F01

# Load iris and lay it out feature-by-sample: one column per flower.
data(iris)

# 4 x 150 feature array and 3 x 150 one-hot species array (dimnames dropped).
X.array <- array(t(as.matrix(iris[, 1:4])), dim = c(4, 150))
Y.array <- array(t(model.matrix(~ 0 + Species, data = iris)), dim = c(3, 150))

# Reproducible 100 / 50 split into training and test columns.
set.seed(0)
TRAIN.seq <- sample(seq_len(150), size = 100)

TRAIN.X.array <- X.array[, TRAIN.seq]
TEST.X.array <- X.array[, -TRAIN.seq]
TRAIN.Y.array <- Y.array[, TRAIN.seq]
TEST.Y.array <- Y.array[, -TRAIN.seq]

第一節:深度神經網路訓練(3)

# Adam optimizer object passed as `Optimizer` to the training calls below.
my_optimizer <- mx.opt.create(
  name = "adam",
  learning.rate = 0.001,
  beta1 = 0.9,
  beta2 = 0.999,
  epsilon = 1e-08,
  wd = 0
)
# Closure-based mini-batch iterator over the global TRAIN.X.array and
# TRAIN.Y.array (samples are columns).
#
# batch_size: number of samples per batch; a single number >= 1.
#
# Returns a list with:
#   reset()     - rewinds to before the first batch
#   iter.next() - advances one batch; FALSE once the epoch is exhausted
#   value()     - list(data =, label =) built with mx.nd.array() for the
#                 current batch
#   batch_size  - the batch size supplied
#   batch       - the counter value at construction time (always 0; it is a
#                 copy and does not follow later iter.next() calls)
my_iterator_core <- function(batch_size) {
  
  # batch_size = 0 would make the epoch never end; reject it up front.
  stopifnot(is.numeric(batch_size), length(batch_size) == 1, batch_size >= 1)
  
  batch <- 0
  # ceiling() keeps the final partial batch when batch_size does not divide
  # the sample count; value() pads that batch with randomly drawn samples.
  # Without it the remainder is dropped and the padding code never runs.
  batch_per_epoch <- ceiling(ncol(TRAIN.Y.array) / batch_size)
  
  reset <- function() {batch <<- 0}
  
  iter.next <- function() {
    batch <<- batch + 1
    batch <= batch_per_epoch
  }
  
  value <- function() {
    n_sample <- ncol(TRAIN.Y.array)
    idx <- seq_len(batch_size) + (batch - 1) * batch_size
    overflow <- idx > n_sample
    # Positions past the last sample are filled with random training samples;
    # sampling with replacement is only needed when more slots than samples
    # have to be filled.
    idx[overflow] <- sample(seq_len(n_sample), sum(overflow),
                            replace = sum(overflow) > n_sample)
    data <- mx.nd.array(TRAIN.X.array[, idx, drop = FALSE])
    label <- mx.nd.array(TRAIN.Y.array[, idx, drop = FALSE])
    list(data = data, label = label)
  }
  
  list(reset = reset, iter.next = iter.next, value = value, batch_size = batch_size, batch = batch)
}

# Reference class "Custom_Iter": wraps the list returned by my_iterator_core()
# in a class that extends "Rcpp_MXArrayDataIter", and forwards value(),
# iter.next() and reset() to it.
# Fields: `iter` holds the core iterator; `batch_size` is declared but
# initialize() below never assigns it.
my_iterator_func <- setRefClass("Custom_Iter",
                                fields = c("iter", "batch_size"),
                                contains = "Rcpp_MXArrayDataIter",
                                methods = list(
                                  # The `iter` argument is accepted but not used: the field is
                                  # always rebuilt from my_iterator_core(batch_size).
                                  initialize = function(iter, batch_size = 100){
                                    .self$iter <- my_iterator_core(batch_size = batch_size)
                                    .self
                                  },
                                  # Current batch as returned by the core iterator's value().
                                  value = function(){
                                    .self$iter$value()
                                  },
                                  # Advance to the next batch; returns the core iterator's result.
                                  iter.next = function(){
                                    .self$iter$iter.next()
                                  },
                                  # Rewind the core iterator.
                                  reset = function(){
                                    .self$iter$reset()
                                  },
                                  # Empty finalizer.
                                  # NOTE(review): presumably overrides the parent's finalizer — confirm.
                                  finalize=function(){
                                  }
                                )
)

# Iterator instance used by the training calls below: 20 samples per batch.
my_iter = my_iterator_func(iter = NULL, batch_size = 20)

第一節:深度神經網路訓練(4)

# Input placeholders: features and one-hot labels.
data <- mx.symbol.Variable(name = 'data')
label <- mx.symbol.Variable(name = 'label')

# Three hidden blocks (FullyConnected with 10 units -> ReLU).
fc1 <- mx.symbol.FullyConnected(data = data, num.hidden = 10, name = 'fc1')
relu1 <- mx.symbol.Activation(data = fc1, act.type = 'relu', name = 'relu1')

fc2 <- mx.symbol.FullyConnected(data = relu1, num.hidden = 10, name = 'fc2')
relu2 <- mx.symbol.Activation(data = fc2, act.type = 'relu', name = 'relu2')

fc3 <- mx.symbol.FullyConnected(data = relu2, num.hidden = 10, name = 'fc3')
relu3 <- mx.symbol.Activation(data = fc3, act.type = 'relu', name = 'relu3')

# Output layer: 3 units followed by softmax.
fc4 <- mx.symbol.FullyConnected(data = relu3, num.hidden = 3, name = 'fc4')
softmax_layer <- mx.symbol.softmax(data = fc4, axis = 1, name = 'softmax_layer')

# Loss: negative mean of label * log(prediction + eps); eps keeps log() finite.
eps <- 1e-8
log_pred <- mx.symbol.log(softmax_layer + eps)
labelled_log_pred <- mx.symbol.broadcast_mul(log_pred, label)
m_log <- 0 - mx.symbol.mean(labelled_log_pred)
m_logloss <- mx.symbol.MakeLoss(m_log, name = 'm_logloss')

# Train for 100 epochs; the call also draws the gradient plot.
model <- my.model.FeedForward.create(
  Iterator = my_iter,
  loss_symbol = m_logloss,
  pred_symbol = softmax_layer,
  Optimizer = my_optimizer,
  num_round = 100
)

– 透過這種方式來得到預測結果:

# Predicted class probabilities for the test samples.
predict_Y <- predict(model, TEST.X.array, array.layout = "colmajor")

# Confusion table: rows are predicted classes, columns are true classes.
# deparse.level = 0 keeps the dimension headers blank.
predicted_class <- max.col(t(predict_Y))
actual_class <- max.col(t(TEST.Y.array))
confusion_table <- table(predicted_class, actual_class, deparse.level = 0)
print(confusion_table)
##    
##      1  2  3
##   1 18  0  0
##   2  0  1  0
##   3  0 14 17

第一節:深度神經網路訓練(5)

# Input placeholders: features and one-hot labels.
data <- mx.symbol.Variable(name = 'data')
label <- mx.symbol.Variable(name = 'label')

# Seven hidden blocks (FullyConnected with 10 units -> ReLU).
fc1 <- mx.symbol.FullyConnected(data = data, num.hidden = 10, name = 'fc1')
relu1 <- mx.symbol.Activation(data = fc1, act.type = 'relu', name = 'relu1')

fc2 <- mx.symbol.FullyConnected(data = relu1, num.hidden = 10, name = 'fc2')
relu2 <- mx.symbol.Activation(data = fc2, act.type = 'relu', name = 'relu2')

fc3 <- mx.symbol.FullyConnected(data = relu2, num.hidden = 10, name = 'fc3')
relu3 <- mx.symbol.Activation(data = fc3, act.type = 'relu', name = 'relu3')

fc4 <- mx.symbol.FullyConnected(data = relu3, num.hidden = 10, name = 'fc4')
relu4 <- mx.symbol.Activation(data = fc4, act.type = 'relu', name = 'relu4')

fc5 <- mx.symbol.FullyConnected(data = relu4, num.hidden = 10, name = 'fc5')
relu5 <- mx.symbol.Activation(data = fc5, act.type = 'relu', name = 'relu5')

fc6 <- mx.symbol.FullyConnected(data = relu5, num.hidden = 10, name = 'fc6')
relu6 <- mx.symbol.Activation(data = fc6, act.type = 'relu', name = 'relu6')

fc7 <- mx.symbol.FullyConnected(data = relu6, num.hidden = 10, name = 'fc7')
relu7 <- mx.symbol.Activation(data = fc7, act.type = 'relu', name = 'relu7')

# Output layer: 3 units followed by softmax.
fc8 <- mx.symbol.FullyConnected(data = relu7, num.hidden = 3, name = 'fc8')
softmax_layer <- mx.symbol.softmax(data = fc8, axis = 1, name = 'softmax_layer')

# Loss: negative mean of label * log(prediction + eps); eps keeps log() finite.
eps <- 1e-8
log_pred <- mx.symbol.log(softmax_layer + eps)
labelled_log_pred <- mx.symbol.broadcast_mul(log_pred, label)
m_log <- 0 - mx.symbol.mean(labelled_log_pred)
m_logloss <- mx.symbol.MakeLoss(m_log, name = 'm_logloss')

# Train for 100 epochs; the call also draws the gradient plot.
model <- my.model.FeedForward.create(
  Iterator = my_iter,
  loss_symbol = m_logloss,
  pred_symbol = softmax_layer,
  Optimizer = my_optimizer,
  num_round = 100
)

# Confusion table on the test set: rows are predicted classes, columns are
# true classes. deparse.level = 0 keeps the dimension headers blank.
predict_Y <- predict(model, TEST.X.array, array.layout = "colmajor")
predicted_class <- max.col(t(predict_Y))
actual_class <- max.col(t(TEST.Y.array))
confusion_table <- table(predicted_class, actual_class, deparse.level = 0)
print(confusion_table)
##    
##      1  2  3
##   2 18 15 17

第二節:數據分布問題(1)

– 理論上,反向傳播的過程隨著離輸出層越來越遠,梯度也將越來越小,讓我們再看看當初多層感知機的梯度公式吧:

\[ \begin{align} grad.o & = \frac{\partial}{\partial o}loss = \frac{o-y}{o(1-o)} \\ grad.l_2 & = \frac{\partial}{\partial l_2}loss = grad.o \otimes \frac{\partial}{\partial l_2}o= o-y \\ grad.W^2_1 & = \frac{\partial}{\partial W^2_1}loss = grad.l_2 \otimes \frac{\partial}{\partial W^2_1}l_2 = \frac{{1}}{n} \otimes (h_1^E)^T \bullet grad.l_2\\ grad.h_1^E & = \frac{\partial}{\partial h_1^E}loss = grad.l_2 \otimes \frac{\partial}{\partial h_1^E}l_2 = grad.l_2 \bullet (W^2_1)^T \\ grad.l_1 & = \frac{\partial}{\partial l_1}loss = grad.h_1 \otimes \frac{\partial}{\partial l_1}h_1 = grad.h_1 \otimes \frac{\partial}{\partial l_1}ReLU(l_1) \\ grad.W^1_d & = \frac{\partial}{\partial W^1_d}loss = grad.l_1 \otimes \frac{\partial}{\partial W^1_d}l_1 = \frac{{1}}{n} \otimes (x^E)^T \bullet grad.l_1 \end{align} \]

第二節:數據分布問題(2)

– 我們試想一下,目前我們隨機決定的權重大多是介於0的附近,因此輸入的值如果變異非常大,那就會造成梯度的波動。

– 這也是我們上一節課最開始的時候,為什麼要對輸入數據進行標準化的原因。

– 既然輸入層需要標準化,那麼每一個隱藏層的輸入同樣也可以在訓練過程中進行標準化。這個做法叫做「批量標準化」(Batch normalization),是兩位Google的研究員Sergey Ioffe以及Christian Szegedy在2015年所發表的研究《Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift》中第一次提出的想法。

F02

第二節:數據分布問題(3)

\[ \begin{align} \hat{x_i} & = \frac{x_i - \bar{x}}{\sqrt{\sigma^2_{x} + \epsilon}} \\ y_i = BatchNorm(x_i) & = \hat{x_i} \times \gamma \ + \beta \\\\ \bar{x} & = \frac{1}{n} \sum\limits_{i=1}^{n} x_i \\ \sigma^2_{x} & = \frac{1}{n} \sum\limits_{i=1}^{n} (x_i - \bar{x})^2 \end{align} \]

– 這裡的\(\epsilon\)代表一個很小的數字(避免除以0),\(\bar{x}\)與\(\sigma^2_{x}\)則分別是\(x\)的平均值以及變異數,\(\gamma\)以及\(\beta\)則是兩個線性轉換項,這使批量標準化成為一個可還原的過程(當\(\gamma = \sqrt{\sigma^2_{x} + \epsilon}\)且\(\beta = \bar{x}\)時,\(y_i = x_i\))。

# Manual batch normalization of the four iris features, one feature (row) at
# a time: subtract the mean, divide by sqrt(variance + eps), then apply the
# linear transform gamma * x_hat + beta.
data(iris)

demo_X.array <- array(t(as.matrix(iris[,-5])), dim = c(4, 150))
bn_X.array <- demo_X.array

# These constants are the same for every feature, so they are set once.
eps <- 1e-3
gamma <- 1
beta <- 0

for (i in seq_len(nrow(demo_X.array))) {
  bn_mean <- mean(demo_X.array[i,])
  # Variance with a 1/n denominator, matching the batch-norm definition;
  # var() would divide by n - 1 instead.
  bn_var <- mean((demo_X.array[i,] - bn_mean)^2)
  bn_X.array[i,] <- (demo_X.array[i,] - bn_mean) / sqrt(bn_var + eps) * gamma + beta
}

第二節:數據分布問題(4)

– 我們假設在反向傳播到\(BatchNorm\)時已經存在一個\(grad.y\),並以這個開始往下推導(過程略):

\[ \begin{align} \frac{\partial y}{\partial \beta} & = \frac{1}{n} \sum\limits_{i=1}^{n} grad.y_i \\ \frac{\partial y}{\partial \gamma} & = \frac{1}{n} \sum\limits_{i=1}^{n} grad.y_i \times \hat{x_i} \\\\ \frac{\partial y}{\partial \hat{x}} & = grad.y \otimes \gamma \\ \frac{\partial y}{\partial \sigma^2_{x}} & = - \frac{1} {2} \sum\limits_{i=1}^{n} \gamma (x_i - \bar{x}) (\sigma^2_{x} + \epsilon)^{-1.5} grad.y_i \\ \frac{\partial y}{\partial \bar{x}} & = \sum\limits_{i=1}^{n} \frac {- grad.y_i \times \gamma} {\sqrt{\sigma^2_{x} + \epsilon}} + \frac{\partial y}{\partial \sigma^2_{x}} \times \frac {-2 \sum\limits_{i=1}^{n} (x_i - \bar{x}) } {n} \\\\ \frac{\partial y}{\partial x} & = \frac{\partial y}{\partial \hat{x}} \otimes \frac {1} {\sqrt{\sigma^2_{x} + \epsilon}} \oplus \frac{\partial y}{\partial \sigma^2_{x}} \otimes \frac {2(x_i - \bar{x})} {n} \oplus \frac{\partial y}{\partial \bar{x}} \otimes \frac {1} {n} \end{align} \]

第二節:數據分布問題(5)

– 在MxNet的輔助下,要實現批量標準化其實非常簡單!

# Input placeholders: features and one-hot labels.
data <- mx.symbol.Variable(name = 'data')
label <- mx.symbol.Variable(name = 'label')

# Seven hidden blocks (FullyConnected with 10 units -> BatchNorm -> ReLU).
fc1 <- mx.symbol.FullyConnected(data = data, num.hidden = 10, name = 'fc1')
bn1 <- mx.symbol.BatchNorm(data = fc1, axis = 1, eps = 1e-3, fix.gamma = TRUE, name = 'bn1')
relu1 <- mx.symbol.Activation(data = bn1, act.type = 'relu', name = 'relu1')

fc2 <- mx.symbol.FullyConnected(data = relu1, num.hidden = 10, name = 'fc2')
bn2 <- mx.symbol.BatchNorm(data = fc2, axis = 1, eps = 1e-3, fix.gamma = TRUE, name = 'bn2')
relu2 <- mx.symbol.Activation(data = bn2, act.type = 'relu', name = 'relu2')

fc3 <- mx.symbol.FullyConnected(data = relu2, num.hidden = 10, name = 'fc3')
bn3 <- mx.symbol.BatchNorm(data = fc3, axis = 1, eps = 1e-3, fix.gamma = TRUE, name = 'bn3')
relu3 <- mx.symbol.Activation(data = bn3, act.type = 'relu', name = 'relu3')

fc4 <- mx.symbol.FullyConnected(data = relu3, num.hidden = 10, name = 'fc4')
bn4 <- mx.symbol.BatchNorm(data = fc4, axis = 1, eps = 1e-3, fix.gamma = TRUE, name = 'bn4')
relu4 <- mx.symbol.Activation(data = bn4, act.type = 'relu', name = 'relu4')

fc5 <- mx.symbol.FullyConnected(data = relu4, num.hidden = 10, name = 'fc5')
bn5 <- mx.symbol.BatchNorm(data = fc5, axis = 1, eps = 1e-3, fix.gamma = TRUE, name = 'bn5')
relu5 <- mx.symbol.Activation(data = bn5, act.type = 'relu', name = 'relu5')

fc6 <- mx.symbol.FullyConnected(data = relu5, num.hidden = 10, name = 'fc6')
bn6 <- mx.symbol.BatchNorm(data = fc6, axis = 1, eps = 1e-3, fix.gamma = TRUE, name = 'bn6')
relu6 <- mx.symbol.Activation(data = bn6, act.type = 'relu', name = 'relu6')

fc7 <- mx.symbol.FullyConnected(data = relu6, num.hidden = 10, name = 'fc7')
bn7 <- mx.symbol.BatchNorm(data = fc7, axis = 1, eps = 1e-3, fix.gamma = TRUE, name = 'bn7')
relu7 <- mx.symbol.Activation(data = bn7, act.type = 'relu', name = 'relu7')

# Output layer: 3 units followed by softmax (no BatchNorm here).
fc8 <- mx.symbol.FullyConnected(data = relu7, num.hidden = 3, name = 'fc8')
softmax_layer <- mx.symbol.softmax(data = fc8, axis = 1, name = 'softmax_layer')

# Loss: negative mean of label * log(prediction + eps); eps keeps log() finite.
eps <- 1e-8
log_pred <- mx.symbol.log(softmax_layer + eps)
labelled_log_pred <- mx.symbol.broadcast_mul(log_pred, label)
m_log <- 0 - mx.symbol.mean(labelled_log_pred)
m_logloss <- mx.symbol.MakeLoss(m_log, name = 'm_logloss')

# Train for 100 epochs; the call also draws the gradient plot.
model <- my.model.FeedForward.create(
  Iterator = my_iter,
  loss_symbol = m_logloss,
  pred_symbol = softmax_layer,
  Optimizer = my_optimizer,
  num_round = 100
)

# Confusion table on the test set: rows are predicted classes, columns are
# true classes. deparse.level = 0 keeps the dimension headers blank.
predict_Y <- predict(model, TEST.X.array, array.layout = "colmajor")
predicted_class <- max.col(t(predict_Y))
actual_class <- max.col(t(TEST.Y.array))
confusion_table <- table(predicted_class, actual_class, deparse.level = 0)
print(confusion_table)
##    
##      1  2  3
##   1 18  0  0
##   2  0 14  0
##   3  0  1 17

練習1:重現使用BN的MLP之推理過程

# Trained parameters (weights, biases, BN gamma/beta) and the auxiliary
# moving statistics kept by the BatchNorm layers.
PARAMS <- model$arg.params
MEANS <- model$aux.params
ls(PARAMS)
##  [1] "bn1_beta"   "bn1_gamma"  "bn2_beta"   "bn2_gamma"  "bn3_beta"  
##  [6] "bn3_gamma"  "bn4_beta"   "bn4_gamma"  "bn5_beta"   "bn5_gamma" 
## [11] "bn6_beta"   "bn6_gamma"  "bn7_beta"   "bn7_gamma"  "fc1_bias"  
## [16] "fc1_weight" "fc2_bias"   "fc2_weight" "fc3_bias"   "fc3_weight"
## [21] "fc4_bias"   "fc4_weight" "fc5_bias"   "fc5_weight" "fc6_bias"  
## [26] "fc6_weight" "fc7_bias"   "fc7_weight" "fc8_bias"   "fc8_weight"
ls(MEANS)
##  [1] "bn1_moving_mean" "bn1_moving_var"  "bn2_moving_mean" "bn2_moving_var" 
##  [5] "bn3_moving_mean" "bn3_moving_var"  "bn4_moving_mean" "bn4_moving_var" 
##  [9] "bn5_moving_mean" "bn5_moving_var"  "bn6_moving_mean" "bn6_moving_var" 
## [13] "bn7_moving_mean" "bn7_moving_var"
# First test sample as a 4 x 1 column, and the model's own prediction for it
# (the reference the manual computation has to reproduce).
Input <- array(TEST.X.array[, 1], dim = c(4, 1))
preds <- predict(model, Input, array.layout = "colmajor")
print(preds)
##            [,1]
## [1,] 0.91460079
## [2,] 0.03534730
## [3,] 0.05005192

練習1答案

# Manual forward pass through the 7-hidden-layer BatchNorm network for one
# test sample, using the stored moving mean / variance in every BN layer.
PARAMS <- model$arg.params
MEANS <- model$aux.params

Input <- array(TEST.X.array[, 1], dim = c(4, 1))

bn_eps <- 1e-3

# Fully connected layer `layer` applied to the row vector `x`.
fc_infer <- function(x, layer) {
  x %*% as.array(PARAMS[[paste0(layer, "_weight")]]) + as.array(PARAMS[[paste0(layer, "_bias")]])
}

# BatchNorm layer `layer`, normalising with its stored moving statistics.
bn_infer <- function(x, layer) {
  moving_mean <- as.array(MEANS[[paste0(layer, "_moving_mean")]])
  moving_var <- as.array(MEANS[[paste0(layer, "_moving_var")]])
  bn_gamma <- as.array(PARAMS[[paste0(layer, "_gamma")]])
  bn_beta <- as.array(PARAMS[[paste0(layer, "_beta")]])
  (x - moving_mean) / sqrt(moving_var + bn_eps) * bn_gamma + bn_beta
}

# ReLU: negative entries are replaced by 0.
relu_infer <- function(x) {
  x[x < 0] <- 0
  x
}

fc1_out <- fc_infer(t(Input), "fc1")
bn1_out <- bn_infer(fc1_out, "bn1")
relu1_out <- relu_infer(bn1_out)

fc2_out <- fc_infer(relu1_out, "fc2")
bn2_out <- bn_infer(fc2_out, "bn2")
relu2_out <- relu_infer(bn2_out)

fc3_out <- fc_infer(relu2_out, "fc3")
bn3_out <- bn_infer(fc3_out, "bn3")
relu3_out <- relu_infer(bn3_out)

fc4_out <- fc_infer(relu3_out, "fc4")
bn4_out <- bn_infer(fc4_out, "bn4")
relu4_out <- relu_infer(bn4_out)

fc5_out <- fc_infer(relu4_out, "fc5")
bn5_out <- bn_infer(fc5_out, "bn5")
relu5_out <- relu_infer(bn5_out)

fc6_out <- fc_infer(relu5_out, "fc6")
bn6_out <- bn_infer(fc6_out, "bn6")
relu6_out <- relu_infer(bn6_out)

fc7_out <- fc_infer(relu6_out, "fc7")
bn7_out <- bn_infer(fc7_out, "bn7")
relu7_out <- relu_infer(bn7_out)

# Output layer has no BatchNorm / ReLU.
fc8_out <- fc_infer(relu7_out, "fc8")

# Softmax, shown side by side with the prediction computed earlier (`preds`).
Softmax_out <- exp(fc8_out) / sum(exp(fc8_out))
cbind(t(Softmax_out), preds)
##            [,1]       [,2]
## [1,] 0.91460075 0.91460079
## [2,] 0.03534731 0.03534730
## [3,] 0.05005193 0.05005192

第三節:更直接的梯度傳遞法(1)

# Input placeholders: features and one-hot labels.
data <- mx.symbol.Variable(name = 'data')
label <- mx.symbol.Variable(name = 'label')

# 25 hidden blocks (FullyConnected with 10 units -> BatchNorm -> ReLU).
# `relu` always holds the output of the previous block; it starts as the raw
# input so the first block needs no special case.
relu <- data
for (i in seq_len(25)) {
  fc <- mx.symbol.FullyConnected(data = relu, num.hidden = 10, name = paste0('fc', i))
  bn <- mx.symbol.BatchNorm(data = fc, axis = 1, name = paste0('bn', i))
  relu <- mx.symbol.Activation(data = bn, act.type = 'relu', name = paste0('relu', i))
}

# Output layer: 3 units followed by softmax.
fc_final <- mx.symbol.FullyConnected(data = relu, num.hidden = 3, name = 'fc_final')
softmax_layer <- mx.symbol.softmax(data = fc_final, axis = 1, name = 'softmax_layer')

# Loss: negative mean of label * log(prediction + eps); eps keeps log() finite.
eps <- 1e-8
log_pred <- mx.symbol.log(softmax_layer + eps)
labelled_log_pred <- mx.symbol.broadcast_mul(log_pred, label)
m_log <- 0 - mx.symbol.mean(labelled_log_pred)
m_logloss <- mx.symbol.MakeLoss(m_log, name = 'm_logloss')

# Train for 100 epochs; the call also draws the gradient plot.
model <- my.model.FeedForward.create(
  Iterator = my_iter,
  loss_symbol = m_logloss,
  pred_symbol = softmax_layer,
  Optimizer = my_optimizer,
  num_round = 100
)

# Confusion table on the test set: rows are predicted classes, columns are
# true classes. deparse.level = 0 keeps the dimension headers blank.
predict_Y <- predict(model, TEST.X.array, array.layout = "colmajor")
predicted_class <- max.col(t(predict_Y))
actual_class <- max.col(t(TEST.Y.array))
confusion_table <- table(predicted_class, actual_class, deparse.level = 0)
print(confusion_table)
##    
##      1  2  3
##   1 18  0 15
##   2  0 15  2

第三節:更直接的梯度傳遞法(2)

– 因此,梯度消失問題並沒有辦法這麼簡單的被解決掉,我們仍然需要其他手段來解決這個問題!

– 事實上一個更關鍵的突破在2015年的ILSVRC競賽出現,這個突破可以說是至今為止深度學習在理論上最重要的突破,獲勝團隊是由微軟亞洲研究院何愷明所領軍的團隊,他們發展出的ResNet將錯誤率降低至3.57%,大幅超越了人類平均的5.0%。

– 更值得一提的是,在所有人都被梯度消失問題所困擾的時刻,何愷明的團隊在2015年的ILSVRC中所提出的ResNet是一個1000層的網路,同一個時間幾乎沒有團隊有能力訓練超過50層的神經網路。

– 想當然耳,這個爆炸級的研究:Deep Residual Learning for Image Recognition在2016年的CVPR上發表後,理所當然的獲得了該研討會的最佳會議論文獎: