林嶔 (Lin, Chin)
Lesson 5: Solving the Vanishing Gradient Problem
– With MxNet we can easily write very complex architectures and inspect the gradient of every layer inside them.
library(mxnet)
my.model.FeedForward.create = function (Iterator,
                                        loss_symbol, pred_symbol,
                                        Optimizer, num_round = 100) {
  
  require(abind)
  
  #0. Check data shape
  Iterator$reset()
  Iterator$iter.next()
  my_values <- Iterator$value()
  input_shape <- lapply(my_values, dim)
  batch_size <- tail(input_shape[[1]], 1)
  
  #1. Build an executor to train the model
  exec_list = list(symbol = loss_symbol, ctx = mx.cpu(), grad.req = "write")
  exec_list = append(exec_list, input_shape)
  my_executor = do.call(mx.simple.bind, exec_list)
  
  #2. Set the initial parameters
  mx.set.seed(0)
  new_arg = mxnet:::mx.model.init.params(symbol = loss_symbol,
                                         input.shape = input_shape,
                                         output.shape = NULL,
                                         initializer = mxnet:::mx.init.uniform(0.01),
                                         ctx = mx.cpu())
  mx.exec.update.arg.arrays(my_executor, new_arg$arg.params, match.name = TRUE)
  mx.exec.update.aux.arrays(my_executor, new_arg$aux.params, match.name = TRUE)
  
  #3. Define the updater
  my_updater = mx.opt.get.updater(optimizer = Optimizer, weights = my_executor$ref.arg.arrays)
  
  #4. Forward/Backward
  message('Start training:')
  set.seed(0)
  epoch_grad = NULL
  
  for (i in 1:num_round) {
    
    Iterator$reset()
    batch_loss = list()
    batch_grad = list()
    batch_seq = 0
    t0 = Sys.time()
    
    while (Iterator$iter.next()) {
      my_values <- Iterator$value()
      mx.exec.update.arg.arrays(my_executor, arg.arrays = my_values, match.name = TRUE)
      mx.exec.forward(my_executor, is.train = TRUE)
      mx.exec.backward(my_executor)
      update_args = my_updater(weight = my_executor$ref.arg.arrays, grad = my_executor$ref.grad.arrays)
      mx.exec.update.arg.arrays(my_executor, update_args, skip.null = TRUE)
      batch_loss[[length(batch_loss) + 1]] = as.array(my_executor$ref.outputs[[1]])
      grad_list = sapply(my_executor$ref.grad.arrays, function (x) {if (!is.null(x)) {mean(abs(as.array(x)))}})
      grad_list = unlist(grad_list[grepl('weight', names(grad_list), fixed = TRUE)])
      batch_grad[[length(batch_grad) + 1]] = grad_list
      batch_seq = batch_seq + 1
    }
    
    if (i %% 10 == 0 | i <= 5) {
      message(paste0("epoch = ", i,
                     ": loss = ", formatC(mean(unlist(batch_loss)), format = "f", digits = 4),
                     " (Speed: ", formatC(batch_seq * batch_size/as.numeric(Sys.time() - t0, units = 'secs'), format = "f", digits = 2), " sample/secs)"))
    }
    
    epoch_grad = rbind(epoch_grad, apply(abind(batch_grad, along = 2), 1, mean))
    
  }
  
  # Plot the mean absolute gradient of each fully connected weight layer across epochs
  epoch_grad[epoch_grad < 1e-8] = 1e-8
  COL = rainbow(ncol(epoch_grad))
  random_pos = 2^runif(ncol(epoch_grad), -0.5, 0.5)
  
  plot(epoch_grad[,1] * random_pos[1], type = 'l', col = COL[1],
       xlab = 'epoch', ylab = 'mean of abs(grad)', log = 'y',
       ylim = range(epoch_grad))
  
  for (i in 2:ncol(epoch_grad)) {lines(1:nrow(epoch_grad), epoch_grad[,i] * random_pos[i], col = COL[i])}
  
  legend('bottomright', paste0('fc', 1:ncol(epoch_grad), '_weight'), col = COL, lwd = 1)
  
  #5. Get model
  my_model <- mxnet:::mx.model.extract.model(symbol = pred_symbol,
                                             train.execs = list(my_executor))
  
  return(my_model)
  
}
data(iris)
X.array = array(t(as.matrix(iris[,-5])), dim = c(4, 150))
Y.array = array(t(model.matrix(~ -1 + iris[,5])), dim = c(3, 150))
set.seed(0)
TRAIN.seq = sample(1:150, 100)
TRAIN.X.array = X.array[,TRAIN.seq]
TRAIN.Y.array = Y.array[,TRAIN.seq]
TEST.X.array = X.array[,-TRAIN.seq]
TEST.Y.array = Y.array[,-TRAIN.seq]
my_optimizer = mx.opt.create(name = "adam", learning.rate = 0.001, beta1 = 0.9, beta2 = 0.999,
epsilon = 1e-08, wd = 0)
my_iterator_core = function(batch_size) {
  
  batch = 0
  batch_per_epoch = ncol(TRAIN.Y.array)/batch_size
  
  reset = function() {batch <<- 0}
  
  iter.next = function() {
    batch <<- batch + 1
    if (batch > batch_per_epoch) {return(FALSE)} else {return(TRUE)}
  }
  
  value = function() {
    idx = 1:batch_size + (batch - 1) * batch_size
    idx[idx > ncol(TRAIN.Y.array)] = sample(1:ncol(TRAIN.Y.array), sum(idx > ncol(TRAIN.Y.array)))
    data = mx.nd.array(TRAIN.X.array[,idx, drop=FALSE])
    label = mx.nd.array(TRAIN.Y.array[,idx, drop=FALSE])
    return(list(data = data, label = label))
  }
  
  return(list(reset = reset, iter.next = iter.next, value = value, batch_size = batch_size, batch = batch))
  
}
my_iterator_func <- setRefClass("Custom_Iter",
                                fields = c("iter", "batch_size"),
                                contains = "Rcpp_MXArrayDataIter",
                                methods = list(
                                  initialize = function(iter, batch_size = 100){
                                    .self$iter <- my_iterator_core(batch_size = batch_size)
                                    .self
                                  },
                                  value = function(){
                                    .self$iter$value()
                                  },
                                  iter.next = function(){
                                    .self$iter$iter.next()
                                  },
                                  reset = function(){
                                    .self$iter$reset()
                                  },
                                  finalize = function(){
                                  }
                                )
)
my_iter = my_iterator_func(iter = NULL, batch_size = 20)
data = mx.symbol.Variable(name = 'data')
label = mx.symbol.Variable(name = 'label')
fc1 = mx.symbol.FullyConnected(data = data, num.hidden = 10, name = 'fc1')
relu1 = mx.symbol.Activation(data = fc1, act.type = 'relu', name = 'relu1')
fc2 = mx.symbol.FullyConnected(data = relu1, num.hidden = 10, name = 'fc2')
relu2 = mx.symbol.Activation(data = fc2, act.type = 'relu', name = 'relu2')
fc3 = mx.symbol.FullyConnected(data = relu2, num.hidden = 10, name = 'fc3')
relu3 = mx.symbol.Activation(data = fc3, act.type = 'relu', name = 'relu3')
fc4 = mx.symbol.FullyConnected(data = relu3, num.hidden = 3, name = 'fc4')
softmax_layer = mx.symbol.softmax(data = fc4, axis = 1, name = 'softmax_layer')
eps = 1e-8
m_log = 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label))
m_logloss = mx.symbol.MakeLoss(m_log, name = 'm_logloss')
model = my.model.FeedForward.create(Iterator = my_iter,
loss_symbol = m_logloss, pred_symbol = softmax_layer,
Optimizer = my_optimizer, num_round = 100)
– The predictions are obtained like this:
predict_Y = predict(model, TEST.X.array, array.layout = "colmajor")
confusion_table = table(max.col(t(predict_Y)), max.col(t(TEST.Y.array)))
print(confusion_table)
##
## 1 2 3
## 1 18 0 0
## 2 0 1 0
## 3 0 14 17
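– For reference, the overall test accuracy can be read directly off the confusion table (rows are the predicted classes, columns are the true classes):
sum(diag(confusion_table)) / sum(confusion_table)   # here 36 correct out of 50, i.e. 0.72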
– Now let's deepen the network to seven hidden layers and see what happens:
data = mx.symbol.Variable(name = 'data')
label = mx.symbol.Variable(name = 'label')
fc1 = mx.symbol.FullyConnected(data = data, num.hidden = 10, name = 'fc1')
relu1 = mx.symbol.Activation(data = fc1, act.type = 'relu', name = 'relu1')
fc2 = mx.symbol.FullyConnected(data = relu1, num.hidden = 10, name = 'fc2')
relu2 = mx.symbol.Activation(data = fc2, act.type = 'relu', name = 'relu2')
fc3 = mx.symbol.FullyConnected(data = relu2, num.hidden = 10, name = 'fc3')
relu3 = mx.symbol.Activation(data = fc3, act.type = 'relu', name = 'relu3')
fc4 = mx.symbol.FullyConnected(data = relu3, num.hidden = 10, name = 'fc4')
relu4 = mx.symbol.Activation(data = fc4, act.type = 'relu', name = 'relu4')
fc5 = mx.symbol.FullyConnected(data = relu4, num.hidden = 10, name = 'fc5')
relu5 = mx.symbol.Activation(data = fc5, act.type = 'relu', name = 'relu5')
fc6 = mx.symbol.FullyConnected(data = relu5, num.hidden = 10, name = 'fc6')
relu6 = mx.symbol.Activation(data = fc6, act.type = 'relu', name = 'relu6')
fc7 = mx.symbol.FullyConnected(data = relu6, num.hidden = 10, name = 'fc7')
relu7 = mx.symbol.Activation(data = fc7, act.type = 'relu', name = 'relu7')
fc8 = mx.symbol.FullyConnected(data = relu7, num.hidden = 3, name = 'fc8')
softmax_layer = mx.symbol.softmax(data = fc8, axis = 1, name = 'softmax_layer')
eps = 1e-8
m_log = 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label))
m_logloss = mx.symbol.MakeLoss(m_log, name = 'm_logloss')
model = my.model.FeedForward.create(Iterator = my_iter,
loss_symbol = m_logloss, pred_symbol = softmax_layer,
Optimizer = my_optimizer, num_round = 100)
predict_Y = predict(model, TEST.X.array, array.layout = "colmajor")
confusion_table = table(max.col(t(predict_Y)), max.col(t(TEST.Y.array)))
print(confusion_table)
##
## 1 2 3
## 2 18 15 17
– In theory, the gradient shrinks during backpropagation as we move farther away from the output layer. Let's revisit the gradient formulas of the multilayer perceptron:
\[ \begin{align} grad.o & = \frac{\partial}{\partial o}loss = \frac{o-y}{o(1-o)} \\ grad.l_2 & = \frac{\partial}{\partial l_2}loss = grad.o \otimes \frac{\partial}{\partial l_2}o= o-y \\ grad.W^2_1 & = \frac{\partial}{\partial W^2_1}loss = grad.l_2 \otimes \frac{\partial}{\partial W^2_1}l_2 = \frac{{1}}{n} \otimes (h_1^E)^T \bullet grad.l_2\\ grad.h_1^E & = \frac{\partial}{\partial h_1^E}loss = grad.l_2 \otimes \frac{\partial}{\partial h_1^E}l_2 = grad.l_2 \bullet (W^2_1)^T \\ grad.l_1 & = \frac{\partial}{\partial l_1}loss = grad.h_1 \otimes \frac{\partial}{\partial l_1}h_1 = grad.h_1 \otimes \frac{\partial}{\partial l_1}ReLU(l_1) \\ grad.W^1_d & = \frac{\partial}{\partial W^1_d}loss = grad.l_1 \otimes \frac{\partial}{\partial W^1_d}l_1 = \frac{{1}}{n} \otimes (x^E)^T \bullet grad.l_1 \end{align} \]
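– Before going further, a small base-R sketch makes this concrete (the depth, width, and weight range are illustrative assumptions, chosen to mirror the uniform(0.01) initialization used above): the mean absolute gradient shrinks roughly geometrically as it is back-propagated through a stack of fully connected + ReLU layers.
set.seed(0)
n_layer = 8
width = 10
W = lapply(1:n_layer, function(i) {matrix(runif(width * width, -0.01, 0.01), width, width)})
x = rnorm(width)
# Forward pass: keep the ReLU masks needed for the backward pass
h = x
masks = list()
for (i in 1:n_layer) {
  l = as.vector(h %*% W[[i]])
  masks[[i]] = l > 0
  h = pmax(l, 0)
}
# Backward pass: chain rule through each ReLU and linear map, from the top layer down
grad = rep(1, width)
for (i in n_layer:1) {
  grad = as.vector((grad * masks[[i]]) %*% t(W[[i]]))
  message('layer ', i, ': mean |grad| = ', formatC(mean(abs(grad)), format = 'e', digits = 2))
}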
– Think about it: the randomly initialized weights are mostly close to 0, so if the input values vary widely, the gradients will fluctuate accordingly.
– This is also why we standardized the input data at the very beginning of the previous lesson.
– Extending this standardization to the inputs of each layer inside the network is what we call Batch Normalization, an idea first proposed by two Google researchers, Sergey Ioffe and Christian Szegedy, in their 2015 paper: Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift.
\[ \begin{align} \hat{x_i} & = \frac{x_i - \bar{x}}{\sqrt{\sigma^2_{x} + \epsilon}} \\ y_i = BatchNorm(x_i) & = \hat{x_i} \times \gamma \ + \beta \\\\ \bar{x} & = \frac{1}{n} \sum\limits_{i=1}^{n} x_i \\ \sigma^2_{x} & = \frac{1}{n} \sum\limits_{i=1}^{n} (x_i - \bar{x})^2 \end{align} \]
– Here \(\epsilon\) is a small number (to avoid division by zero), \(\bar{x}\) and \(\sigma^2_{x}\) are the mean and variance of \(x\), and \(\gamma\) and \(\beta\) are two linear-transformation terms that make batch normalization a reversible process (just take \(\gamma = \sqrt{\sigma^2_{x} + \epsilon}\) and \(\beta = \bar{x}\)).
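– A small numeric illustration of this (the values of x below are arbitrary): with \(\gamma = \sqrt{\sigma^2_{x} + \epsilon}\) and \(\beta = \bar{x}\), the transform returns the original input exactly.
x = c(5.1, 4.9, 6.3, 5.8, 7.1)
eps = 1e-3
x_bar = mean(x)
x_var = mean((x - x_bar)^2)                               # population variance, as in the formula
x_hat = (x - x_bar) / sqrt(x_var + eps)
BatchNorm = function(gamma, beta) {x_hat * gamma + beta}
BatchNorm(gamma = 1, beta = 0)                            # standardized values
BatchNorm(gamma = sqrt(x_var + eps), beta = x_bar)        # recovers the original x exactly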
– Assume that a gradient \(grad.y\) has already reached the \(BatchNorm\) node during backpropagation; starting from it, we can derive the following (derivation omitted):
\[ \begin{align} \frac{\partial y}{\partial \beta} & = \frac{1}{n} \sum\limits_{i=1}^{n} grad.y_i \\ \frac{\partial y}{\partial \gamma} & = \frac{1}{n} \sum\limits_{i=1}^{n} grad.y_i \times \hat{x_i} \\\\ \frac{\partial y}{\partial \hat{x}} & = grad.y \otimes \gamma \\ \frac{\partial y}{\partial \sigma^2_{x}} & = - \frac{1} {2} \sum\limits_{i=1}^{n} \gamma (x_i - \bar{x}) (\sigma^2_{x} + \epsilon)^{-1.5} grad.y_i \\ \frac{\partial y}{\partial \bar{x}} & = \sum\limits_{i=1}^{n} \frac {- grad.y_i \times \gamma} {\sqrt{\sigma^2_{x} + \epsilon}} + \frac{\partial y}{\partial \sigma^2_{x}} \times \frac {-2 \sum\limits_{i=1}^{n} (x_i - \bar{x}) } {n} \\\\ \frac{\partial y}{\partial x} & = \frac{\partial y}{\partial \hat{x}} \otimes \frac {1} {\sqrt{\sigma^2_{x} + \epsilon}} \oplus \frac{\partial y}{\partial \sigma^2_{x}} \otimes \frac {2(x_i - \bar{x})} {n} \oplus \frac{\partial y}{\partial \bar{x}} \otimes \frac {1} {n} \end{align} \]
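– The last formula can be checked numerically with a few lines of base R (the batch values, the upstream weights w, and the toy scalar loss are purely illustrative assumptions); the analytic \(\frac{\partial y}{\partial x}\) agrees with a finite-difference gradient.
set.seed(0)
n = 8
eps = 1e-3
x = rnorm(n, mean = 5, sd = 2)
gamma = 1.5
beta = -0.3
w = rnorm(n)                                  # plays the role of the upstream gradient grad.y
BN = function(x) {
  mu = mean(x)
  v = mean((x - mu)^2)
  (x - mu) / sqrt(v + eps) * gamma + beta
}
LOSS = function(x) {sum(w * BN(x))}           # toy scalar loss, so that grad.y = w
# Analytic gradient with respect to x, following the formulas above
mu = mean(x)
v = mean((x - mu)^2)
grad.y = w
grad.xhat = grad.y * gamma
grad.var = sum(grad.xhat * (x - mu)) * (-0.5) * (v + eps)^(-1.5)
grad.mu = -sum(grad.xhat) / sqrt(v + eps) + grad.var * mean(-2 * (x - mu))
grad.x = grad.xhat / sqrt(v + eps) + grad.var * 2 * (x - mu) / n + grad.mu / n
# Finite-difference check
num.grad.x = sapply(1:n, function(i) {
  h = 1e-5
  e = rep(0, n)
  e[i] = h
  (LOSS(x + e) - LOSS(x - e)) / (2 * h)
})
max(abs(grad.x - num.grad.x))                 # agreement to roughly 1e-8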
– With MxNet's help, implementing batch normalization is very simple!
data = mx.symbol.Variable(name = 'data')
label = mx.symbol.Variable(name = 'label')
fc1 = mx.symbol.FullyConnected(data = data, num.hidden = 10, name = 'fc1')
bn1 = mx.symbol.BatchNorm(data = fc1, axis = 1, eps = 1e-3, fix.gamma = TRUE, name = 'bn1')
relu1 = mx.symbol.Activation(data = bn1, act.type = 'relu', name = 'relu1')
fc2 = mx.symbol.FullyConnected(data = relu1, num.hidden = 10, name = 'fc2')
bn2 = mx.symbol.BatchNorm(data = fc2, axis = 1, eps = 1e-3, fix.gamma = TRUE, name = 'bn2')
relu2 = mx.symbol.Activation(data = bn2, act.type = 'relu', name = 'relu2')
fc3 = mx.symbol.FullyConnected(data = relu2, num.hidden = 10, name = 'fc3')
bn3 = mx.symbol.BatchNorm(data = fc3, axis = 1, eps = 1e-3, fix.gamma = TRUE, name = 'bn3')
relu3 = mx.symbol.Activation(data = bn3, act.type = 'relu', name = 'relu3')
fc4 = mx.symbol.FullyConnected(data = relu3, num.hidden = 10, name = 'fc4')
bn4 = mx.symbol.BatchNorm(data = fc4, axis = 1, eps = 1e-3, fix.gamma = TRUE, name = 'bn4')
relu4 = mx.symbol.Activation(data = bn4, act.type = 'relu', name = 'relu4')
fc5 = mx.symbol.FullyConnected(data = relu4, num.hidden = 10, name = 'fc5')
bn5 = mx.symbol.BatchNorm(data = fc5, axis = 1, eps = 1e-3, fix.gamma = TRUE, name = 'bn5')
relu5 = mx.symbol.Activation(data = bn5, act.type = 'relu', name = 'relu5')
fc6 = mx.symbol.FullyConnected(data = relu5, num.hidden = 10, name = 'fc6')
bn6 = mx.symbol.BatchNorm(data = fc6, axis = 1, eps = 1e-3, fix.gamma = TRUE, name = 'bn6')
relu6 = mx.symbol.Activation(data = bn6, act.type = 'relu', name = 'relu6')
fc7 = mx.symbol.FullyConnected(data = relu6, num.hidden = 10, name = 'fc7')
bn7 = mx.symbol.BatchNorm(data = fc7, axis = 1, eps = 1e-3, fix.gamma = TRUE, name = 'bn7')
relu7 = mx.symbol.Activation(data = bn7, act.type = 'relu', name = 'relu7')
fc8 = mx.symbol.FullyConnected(data = relu7, num.hidden = 3, name = 'fc8')
softmax_layer = mx.symbol.softmax(data = fc8, axis = 1, name = 'softmax_layer')
eps = 1e-8
m_log = 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label))
m_logloss = mx.symbol.MakeLoss(m_log, name = 'm_logloss')
model = my.model.FeedForward.create(Iterator = my_iter,
loss_symbol = m_logloss, pred_symbol = softmax_layer,
Optimizer = my_optimizer, num_round = 100)
predict_Y = predict(model, TEST.X.array, array.layout = "colmajor")
confusion_table = table(max.col(t(predict_Y)), max.col(t(TEST.Y.array)))
print(confusion_table)
##
## 1 2 3
## 1 18 0 0
## 2 0 14 0
## 3 0 1 17
## [1] "bn1_beta" "bn1_gamma" "bn2_beta" "bn2_gamma" "bn3_beta"
## [6] "bn3_gamma" "bn4_beta" "bn4_gamma" "bn5_beta" "bn5_gamma"
## [11] "bn6_beta" "bn6_gamma" "bn7_beta" "bn7_gamma" "fc1_bias"
## [16] "fc1_weight" "fc2_bias" "fc2_weight" "fc3_bias" "fc3_weight"
## [21] "fc4_bias" "fc4_weight" "fc5_bias" "fc5_weight" "fc6_bias"
## [26] "fc6_weight" "fc7_bias" "fc7_weight" "fc8_bias" "fc8_weight"
## [1] "bn1_moving_mean" "bn1_moving_var" "bn2_moving_mean" "bn2_moving_var"
## [5] "bn3_moving_mean" "bn3_moving_var" "bn4_moving_mean" "bn4_moving_var"
## [9] "bn5_moving_mean" "bn5_moving_var" "bn6_moving_mean" "bn6_moving_var"
## [13] "bn7_moving_mean" "bn7_moving_var"
Input = TEST.X.array[,1]
dim(Input) = c(4, 1)
preds = predict(model, Input, array.layout = "colmajor")
print(preds)
## [,1]
## [1,] 0.91460079
## [2,] 0.03534730
## [3,] 0.05005192
PARAMS = model$arg.params
MEANS = model$aux.params
Input = TEST.X.array[,1]
dim(Input) = c(4, 1)
bn_eps = 1e-3
fc1_out = t(Input) %*% as.array(PARAMS$fc1_weight) + as.array(PARAMS$fc1_bias)
bn1_out = (fc1_out - as.array(MEANS$bn1_moving_mean)) / sqrt(as.array(MEANS$bn1_moving_var) + bn_eps) * as.array(PARAMS$bn1_gamma) + as.array(PARAMS$bn1_beta)
relu1_out = bn1_out
relu1_out[relu1_out < 0] = 0
fc2_out = relu1_out %*% as.array(PARAMS$fc2_weight) + as.array(PARAMS$fc2_bias)
bn2_out = (fc2_out - as.array(MEANS$bn2_moving_mean)) / sqrt(as.array(MEANS$bn2_moving_var) + bn_eps) * as.array(PARAMS$bn2_gamma) + as.array(PARAMS$bn2_beta)
relu2_out = bn2_out
relu2_out[relu2_out < 0] = 0
fc3_out = relu2_out %*% as.array(PARAMS$fc3_weight) + as.array(PARAMS$fc3_bias)
bn3_out = (fc3_out - as.array(MEANS$bn3_moving_mean)) / sqrt(as.array(MEANS$bn3_moving_var) + bn_eps) * as.array(PARAMS$bn3_gamma) + as.array(PARAMS$bn3_beta)
relu3_out = bn3_out
relu3_out[relu3_out < 0] = 0
fc4_out = relu3_out %*% as.array(PARAMS$fc4_weight) + as.array(PARAMS$fc4_bias)
bn4_out = (fc4_out - as.array(MEANS$bn4_moving_mean)) / sqrt(as.array(MEANS$bn4_moving_var) + bn_eps) * as.array(PARAMS$bn4_gamma) + as.array(PARAMS$bn4_beta)
relu4_out = bn4_out
relu4_out[relu4_out < 0] = 0
fc5_out = relu4_out %*% as.array(PARAMS$fc5_weight) + as.array(PARAMS$fc5_bias)
bn5_out = (fc5_out - as.array(MEANS$bn5_moving_mean)) / sqrt(as.array(MEANS$bn5_moving_var) + bn_eps) * as.array(PARAMS$bn5_gamma) + as.array(PARAMS$bn5_beta)
relu5_out = bn5_out
relu5_out[relu5_out < 0] = 0
fc6_out = relu5_out %*% as.array(PARAMS$fc6_weight) + as.array(PARAMS$fc6_bias)
bn6_out = (fc6_out - as.array(MEANS$bn6_moving_mean)) / sqrt(as.array(MEANS$bn6_moving_var) + bn_eps) * as.array(PARAMS$bn6_gamma) + as.array(PARAMS$bn6_beta)
relu6_out = bn6_out
relu6_out[relu6_out < 0] = 0
fc7_out = relu6_out %*% as.array(PARAMS$fc7_weight) + as.array(PARAMS$fc7_bias)
bn7_out = (fc7_out - as.array(MEANS$bn7_moving_mean)) / sqrt(as.array(MEANS$bn7_moving_var) + bn_eps) * as.array(PARAMS$bn7_gamma) + as.array(PARAMS$bn7_beta)
relu7_out = bn7_out
relu7_out[relu7_out < 0] = 0
fc8_out = relu7_out %*% as.array(PARAMS$fc8_weight) + as.array(PARAMS$fc8_bias)
Softmax_out = exp(fc8_out)/sum(exp(fc8_out))
cbind(t(Softmax_out), preds)
## [,1] [,2]
## [1,] 0.91460075 0.91460079
## [2,] 0.03534731 0.03534730
## [3,] 0.05005193 0.05005192
– Batch normalization clearly helps here, but what if the network is much deeper? Let's try 25 hidden layers:
data = mx.symbol.Variable(name = 'data')
label = mx.symbol.Variable(name = 'label')
for (i in 1:25) {
  if (i == 1) {
    fc = mx.symbol.FullyConnected(data = data, num.hidden = 10, name = paste0('fc', i))
  } else {
    fc = mx.symbol.FullyConnected(data = relu, num.hidden = 10, name = paste0('fc', i))
  }
  bn = mx.symbol.BatchNorm(data = fc, axis = 1, name = paste0('bn', i))
  relu = mx.symbol.Activation(data = bn, act.type = 'relu', name = paste0('relu', i))
}
fc_final = mx.symbol.FullyConnected(data = relu, num.hidden = 3, name = 'fc_final')
softmax_layer = mx.symbol.softmax(data = fc_final, axis = 1, name = 'softmax_layer')
eps = 1e-8
m_log = 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label))
m_logloss = mx.symbol.MakeLoss(m_log, name = 'm_logloss')
model = my.model.FeedForward.create(Iterator = my_iter,
loss_symbol = m_logloss, pred_symbol = softmax_layer,
Optimizer = my_optimizer, num_round = 100)
predict_Y = predict(model, TEST.X.array, array.layout = "colmajor")
confusion_table = table(max.col(t(predict_Y)), max.col(t(TEST.Y.array)))
print(confusion_table)
##
## 1 2 3
## 1 18 0 15
## 2 0 15 2
– So the vanishing gradient problem cannot be solved that easily; we still need other techniques!
– In fact, an even more crucial breakthrough appeared at the 2015 ILSVRC competition, arguably the most important theoretical breakthrough in deep learning to date. The winning team, led by Kaiming He at Microsoft Research Asia, developed ResNet, which brought the error rate down to 3.57%, well below the commonly cited human level of about 5.0%.
– Even more remarkably, at a time when everyone was struggling with vanishing gradients, Kaiming He's team submitted a 152-layer ResNet to ILSVRC 2015 (and demonstrated variants over 1,000 layers deep in the paper), while almost no other team at the time could train a network deeper than about 50 layers.
– Unsurprisingly, this blockbuster paper, Deep Residual Learning for Image Recognition, won the Best Paper Award when it was presented at CVPR 2016:
– Let's describe the idea mathematically. Suppose we have an MLP with two hidden layers; after adding residual connections, the prediction equations become:
\[ \begin{align} l_1 & = L(x,W^1) \\ h_1 & = ReLU(l_1) \\ r_1 & = h_1 + x \\\\ l_2 & = L(r_1,W^2) \\ h_2 & = ReLU(l_2) \\ r_2 & = h_2 + r_1 \\\\ l_3 & = L(r_2,W^3) \\ o & = S(l_3) \\ loss & = CE(y, o) = -\left(y \cdot log(o) + (1-y) \cdot log(1-o)\right) \end{align} \]
– Note that whenever the dimensionality changes, we must give up this kind of connection.
\[ \begin{align} grad.o & = \frac{\partial}{\partial o}loss = \frac{o-y}{o(1-o)} \\ grad.l_3 & = \frac{\partial}{\partial l_3}loss = grad.o \otimes \frac{\partial}{\partial l_3}o= o-y \\ grad.W^3 & = \frac{\partial}{\partial W^3}loss = grad.l_3 \otimes \frac{\partial}{\partial W^3}l_3 = \frac{{1}}{n} \otimes (r_2)^T \bullet grad.l_3\\ grad.r_2 & = \frac{\partial}{\partial r_2}loss = grad.l_3 \otimes \frac{\partial}{\partial r_2}l_3 = grad.l_3 \bullet (W^3)^T \\\\ grad.h_2 & = \frac{\partial}{\partial h_2}loss = grad.r_2 \otimes \frac{\partial}{\partial h_2}r_2 = grad.r_2 \\ grad.l_2 & = \frac{\partial}{\partial l_2}loss = grad.h_2 \otimes \frac{\partial}{\partial l_2}h_2 = grad.h_2 \otimes \frac{\partial}{\partial l_2}ReLU(l_2) \\ grad.W^2 & = \frac{\partial}{\partial W^2}loss = grad.l_2 \otimes \frac{\partial}{\partial W^2}l_2 = \frac{{1}}{n} \otimes (r_1)^T \bullet grad.l_2\\ grad.r_1 & = \frac{\partial}{\partial r_1}loss = grad.l_2 \otimes \frac{\partial}{\partial r_1}l_2 + grad.r_2 \otimes \frac{\partial}{\partial r_1} r_2 \\ & = grad.l_2 \bullet (W^2)^T + grad.r_2 \\\\ grad.h_1 & = \frac{\partial}{\partial h_2}loss = grad.r_1 \otimes \frac{\partial}{\partial h_1}r_1 = grad.r_1 \\ grad.l_1 & = \frac{\partial}{\partial l_1}loss = grad.h_1 \otimes \frac{\partial}{\partial l_1}h_1 = grad.h_1 \otimes \frac{\partial}{\partial l_1}ReLU(l_1) \\ grad.W^1 & = \frac{\partial}{\partial W^1}loss = grad.l_1 \otimes \frac{\partial}{\partial W^1}l_1 = \frac{{1}}{n} \otimes (x)^T \bullet grad.l_1 \\ grad.x & = \frac{\partial}{\partial x}loss = grad.l_1 \otimes \frac{\partial}{\partial x}l_1 + grad.r_1 \otimes \frac{\partial}{\partial x} r_1 = grad.l_1 \bullet (W^1)^T + grad.r_1 \\ & = grad.l_1 \bullet (W^1)^T + grad.l_2 \bullet (W^2)^T + grad.r_2 \end{align} \]
– Because the gradient of every \(r\) contains the term coming straight from the top layer, the vanishing gradient problem is solved, and training a 1000-layer network without vanishing gradients becomes feasible.
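– The same toy sketch as before (again with illustrative width, depth, and weight range), but now with an identity skip connection around every block, makes this concrete: the back-propagated gradient no longer decays as it travels down the stack.
set.seed(0)
n_layer = 8
width = 10
W = lapply(1:n_layer, function(i) {matrix(runif(width * width, -0.01, 0.01), width, width)})
# Forward pass with identity skip connections: r_i = ReLU(l_i) + r_{i-1}
r = rnorm(width)
masks = list()
for (i in 1:n_layer) {
  l = as.vector(r %*% W[[i]])
  masks[[i]] = l > 0
  r = pmax(l, 0) + r
}
# Backward pass: the skip path passes the upstream gradient through unchanged
grad.r = rep(1, width)
for (i in n_layer:1) {
  grad.l = grad.r * masks[[i]]
  grad.r = as.vector(grad.l %*% t(W[[i]])) + grad.r
  message('block ', i, ': mean |grad| = ', formatC(mean(abs(grad.r)), format = 'e', digits = 2))
}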
– Think about it for a moment: such a 1000-layer network seems to have lost its biological analogy; will a model like this still predict well?
– Let's expand the prediction equations and see what they actually look like:
\[ \begin{align} l_1 & = L(x,W^1) = xW^1\\ h_1 & = ReLU(l_1) \\ r_1 & = h_1 + x \\\\ l_2 & = L(r_1,W^2) = r_1W^2 = (h_1 + x)W^2 \\ h_2 & = ReLU(l_2) \\ r_2 & = h_2 + r_1 \\\\ l_3 & = L(r_2,W^3) = r_2W^3 = (h_2 + h_1 + x)W^3 \\ o & = S(l_3) \\ loss & = CE(y, o) = -\left(y \cdot log(o) + (1-y) \cdot log(1-o)\right) \end{align} \]
\[ \begin{align} l_3 & = (h_2 + h_1 + x)W^3 \\ & = (ReLU(l_2) + ReLU(l_1) + x)W^3 \\ & = (ReLU((h_1 + x)W^2) + ReLU(xW^1) + x)W^3 \\ & = (ReLU((ReLU(xW^1) + x)W^2) + ReLU(xW^1) + x)W^3 \end{align} \]
– Let's directly try the six-hidden-layer network that plain SGD previously had no hope of optimizing; even without batch normalization, this technique succeeds:
# Optimizer
my_optimizer = mx.opt.create(name = "sgd", learning.rate = 0.05, momentum = 0.9, wd = 0)
#Model Architecture
data = mx.symbol.Variable(name = 'data')
label = mx.symbol.Variable(name = 'label')
fc1 = mx.symbol.FullyConnected(data = data, num.hidden = 3, name = 'fc1')
relu1 = mx.symbol.Activation(data = fc1, act.type = 'relu', name = 'relu1')
fc2 = mx.symbol.FullyConnected(data = relu1, num.hidden = 3, name = 'fc2')
relu2 = mx.symbol.Activation(data = fc2, act.type = 'relu', name = 'relu2')
plus2 = mx.symbol.broadcast_plus(lhs = relu2, rhs = relu1, name = 'plus2')
fc3 = mx.symbol.FullyConnected(data = plus2, num.hidden = 3, name = 'fc3')
relu3 = mx.symbol.Activation(data = fc3, act.type = 'relu', name = 'relu3')
plus3 = mx.symbol.broadcast_plus(lhs = relu3, rhs = plus2, name = 'plus3')
fc4 = mx.symbol.FullyConnected(data = plus3, num.hidden = 3, name = 'fc4')
relu4 = mx.symbol.Activation(data = fc4, act.type = 'relu', name = 'relu4')
plus4 = mx.symbol.broadcast_plus(lhs = relu4, rhs = plus3, name = 'plus4')
fc5 = mx.symbol.FullyConnected(data = plus4, num.hidden = 3, name = 'fc5')
relu5 = mx.symbol.Activation(data = fc5, act.type = 'relu', name = 'relu5')
plus5 = mx.symbol.broadcast_plus(lhs = relu5, rhs = plus4, name = 'plus5')
fc6 = mx.symbol.FullyConnected(data = plus5, num.hidden = 3, name = 'fc6')
relu6 = mx.symbol.Activation(data = fc6, act.type = 'relu', name = 'relu6')
plus6 = mx.symbol.broadcast_plus(lhs = relu6, rhs = plus5, name = 'plus6')
fc7 = mx.symbol.FullyConnected(data = plus6, num.hidden = 3, name = 'fc7')
softmax_layer = mx.symbol.softmax(data = fc7, axis = 1, name = 'softmax_layer')
eps = 1e-8
m_log = 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label))
m_logloss = mx.symbol.MakeLoss(m_log, name = 'm_logloss')
# Training
model = my.model.FeedForward.create(Iterator = my_iter,
loss_symbol = m_logloss, pred_symbol = softmax_layer,
Optimizer = my_optimizer, num_round = 100)
# Predicting
predict_Y = predict(model, TEST.X.array, array.layout = "colmajor")
confusion_table = table(max.col(t(predict_Y)), max.col(t(TEST.Y.array)))
print(confusion_table)
##
## 1 2 3
## 1 18 0 0
## 2 0 13 0
## 3 0 2 17
## [1] "fc1_bias" "fc1_weight" "fc2_bias" "fc2_weight" "fc3_bias"
## [6] "fc3_weight" "fc4_bias" "fc4_weight" "fc5_bias" "fc5_weight"
## [11] "fc6_bias" "fc6_weight" "fc7_bias" "fc7_weight"
Input = TEST.X.array[,1]
dim(Input) = c(4, 1)
preds = predict(model, Input, array.layout = "colmajor")
print(preds)
## [,1]
## [1,] 0.936310470
## [2,] 0.061589610
## [3,] 0.002099835
PARAMS = model$arg.params
Input = TEST.X.array[,1]
dim(Input) = c(4, 1)
fc1_out = t(Input) %*% as.array(PARAMS$fc1_weight) + as.array(PARAMS$fc1_bias)
relu1_out = fc1_out
relu1_out[relu1_out < 0] = 0
fc2_out = relu1_out %*% as.array(PARAMS$fc2_weight) + as.array(PARAMS$fc2_bias)
relu2_out = fc2_out
relu2_out[relu2_out < 0] = 0
plus2_out = relu2_out + relu1_out
fc3_out = plus2_out %*% as.array(PARAMS$fc3_weight) + as.array(PARAMS$fc3_bias)
relu3_out = fc3_out
relu3_out[relu3_out < 0] = 0
plus3_out = relu3_out + plus2_out
fc4_out = plus3_out %*% as.array(PARAMS$fc4_weight) + as.array(PARAMS$fc4_bias)
relu4_out = fc4_out
relu4_out[relu4_out < 0] = 0
plus4_out = relu4_out + plus3_out
fc5_out = plus4_out %*% as.array(PARAMS$fc5_weight) + as.array(PARAMS$fc5_bias)
relu5_out = fc5_out
relu5_out[relu5_out < 0] = 0
plus5_out = relu5_out + plus4_out
fc6_out = plus5_out %*% as.array(PARAMS$fc6_weight) + as.array(PARAMS$fc6_bias)
relu6_out = fc6_out
relu6_out[relu6_out < 0] = 0
plus6_out = relu6_out + plus5_out
fc7_out = plus6_out %*% as.array(PARAMS$fc7_weight) + as.array(PARAMS$fc7_bias)
Softmax_out = exp(fc7_out)/sum(exp(fc7_out))
cbind(t(Softmax_out), preds)
## [,1] [,2]
## [1,] 0.936310548 0.936310470
## [2,] 0.061589617 0.061589610
## [3,] 0.002099835 0.002099835
– No fancy tricks are needed here: neither batch normalization nor Adam is required; residual learning really is extremely effective:
# Optimizer
my_optimizer = mx.opt.create(name = "sgd", learning.rate = 0.05, momentum = 0.9, wd = 0)
#Model Architecture
data = mx.symbol.Variable(name = 'data')
label = mx.symbol.Variable(name = 'label')
fc1 = mx.symbol.FullyConnected(data = data, num.hidden = 10, name = 'fc1')
relu1 = mx.symbol.Activation(data = fc1, act.type = 'relu', name = 'relu1')
fc2 = mx.symbol.FullyConnected(data = relu1, num.hidden = 10, name = 'fc2')
relu2 = mx.symbol.Activation(data = fc2, act.type = 'relu', name = 'relu2')
plus = mx.symbol.broadcast_plus(lhs = relu2, rhs = relu1, name = 'plus2')
for (i in 3:25) {
  fc = mx.symbol.FullyConnected(data = plus, num.hidden = 10, name = paste0('fc', i))
  relu = mx.symbol.Activation(data = fc, act.type = 'relu', name = paste0('relu', i))
  plus = mx.symbol.broadcast_plus(lhs = relu, rhs = plus, name = paste0('plus', i))
}
fc_final = mx.symbol.FullyConnected(data = plus, num.hidden = 3, name = paste0('fc', i + 1))
softmax_layer = mx.symbol.softmax(data = fc_final, axis = 1, name = 'softmax_layer')
eps = 1e-8
m_log = 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label))
m_logloss = mx.symbol.MakeLoss(m_log, name = 'm_logloss')
# Training
model = my.model.FeedForward.create(Iterator = my_iter,
loss_symbol = m_logloss, pred_symbol = softmax_layer,
Optimizer = my_optimizer, num_round = 100)
# Predicting
predict_Y = predict(model, TEST.X.array, array.layout = "colmajor")
confusion_table = table(max.col(t(predict_Y)), max.col(t(TEST.Y.array)))
print(confusion_table)
##
## 1 2 3
## 1 18 0 0
## 2 0 13 0
## 3 0 2 17
– 100 or even 1000 layers are theoretically feasible, but note that the problem you will now face is the "exploding gradient", so you may need to tune the SGD learning rate or switch to Adam!
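– Two possible mitigations, sketched below (the clip_gradient argument is an assumption about mx.opt.create; if your version does not accept it, simply lower the learning rate):
# Option 1: a smaller SGD learning rate plus gradient clipping (clip_gradient assumed available)
my_optimizer = mx.opt.create(name = "sgd", learning.rate = 0.01, momentum = 0.9,
                             wd = 0, clip_gradient = 1)
# Option 2: switch back to Adam, as used earlier in this lesson
my_optimizer = mx.opt.create(name = "adam", learning.rate = 0.001, beta1 = 0.9, beta2 = 0.999,
                             epsilon = 1e-08, wd = 0)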
However, residual learning still has two drawbacks:
It can no longer be used whenever the feature dimensionality changes
The network ends up with a rather wasteful number of parameters to optimize
– This work was published by Gao Huang, a postdoctoral researcher at Cornell University; Zhuang Liu, a student at Tsinghua University; Laurens van der Maaten of Facebook AI Research; and Kilian Q. Weinberger, a professor of computer science at Cornell University. The paper is titled: Densely Connected Convolutional Networks
– After being presented at CVPR 2017 (residual learning was presented at CVPR 2016), it too won that conference's Best Paper Award!
– The symbol \(||\) below denotes matrix concatenation (joining along the feature dimension):
\[ \begin{align} l_1 & = L(x,W^1) \\ h_1 & = ReLU(l_1) \\ r_1 & = h_1 || x \\\\ l_2 & = L(r_1,W^2) \\ h_2 & = ReLU(l_2) \\ r_2 & = h_2 || r_1 \\\\ l_3 & = L(r_2,W^3) \\ o & = S(l_3) \\ loss & = CE(y, o) = -\left(y \cdot log(o) + (1-y) \cdot log(1-o)\right) \end{align} \]
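– For intuition, \(h_1 || x\) is simply column-wise concatenation, which is exactly what cbind() does in the manual check at the end of this example (the values below are toy values):
h1 = matrix(c(0.2, 0.7, 0.0), nrow = 1)        # a 1 x 3 hidden-layer output
x  = matrix(c(5.1, 3.5, 1.4, 0.2), nrow = 1)   # a 1 x 4 input
r1 = cbind(h1, x)                              # r1 = h1 || x
dim(r1)                                        # 1 x 7: the feature dimension grows at every layer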
– The full gradient derivation here would require expanding everything (and involves a lot of notation you may not have seen), so we will skip it and simply let MxNet handle it for us.
#Model Architecture
data = mx.symbol.Variable(name = 'data')
label = mx.symbol.Variable(name = 'label')
fc1 = mx.symbol.FullyConnected(data = data, num.hidden = 3, name = 'fc1')
relu1 = mx.symbol.Activation(data = fc1, act.type = 'relu', name = 'relu1')
fc2 = mx.symbol.FullyConnected(data = relu1, num.hidden = 4, name = 'fc2')
relu2 = mx.symbol.Activation(data = fc2, act.type = 'relu', name = 'relu2')
concat2 = mx.symbol.concat(data = list(relu1, relu2), num.args = 2, dim = 1, name = 'concat2')
fc3 = mx.symbol.FullyConnected(data = concat2, num.hidden = 5, name = 'fc3')
relu3 = mx.symbol.Activation(data = fc3, act.type = 'relu', name = 'relu3')
concat3 = mx.symbol.concat(data = list(concat2, relu3), num.args = 2, dim = 1, name = 'concat3')
fc4 = mx.symbol.FullyConnected(data = concat3, num.hidden = 6, name = 'fc4')
relu4 = mx.symbol.Activation(data = fc4, act.type = 'relu', name = 'relu4')
concat4 = mx.symbol.concat(data = list(concat3, relu4), num.args = 2, dim = 1, name = 'concat4')
fc5 = mx.symbol.FullyConnected(data = concat4, num.hidden = 7, name = 'fc5')
relu5 = mx.symbol.Activation(data = fc5, act.type = 'relu', name = 'relu5')
concat5 = mx.symbol.concat(data = list(concat4, relu5), num.args = 2, dim = 1, name = 'concat5')
fc6 = mx.symbol.FullyConnected(data = concat5, num.hidden = 8, name = 'fc6')
relu6 = mx.symbol.Activation(data = fc6, act.type = 'relu', name = 'relu6')
concat6 = mx.symbol.concat(data = list(concat5, relu6), num.args = 2, dim = 1, name = 'concat6')
fc7 = mx.symbol.FullyConnected(data = concat6, num.hidden = 3, name = 'fc7')
softmax_layer = mx.symbol.softmax(data = fc7, axis = 1, name = 'softmax_layer')
eps = 1e-8
m_log = 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label))
m_logloss = mx.symbol.MakeLoss(m_log, name = 'm_logloss')
# Training
model = my.model.FeedForward.create(Iterator = my_iter,
loss_symbol = m_logloss, pred_symbol = softmax_layer,
Optimizer = my_optimizer, num_round = 100)
# Predicting
predict_Y = predict(model, TEST.X.array, array.layout = "colmajor")
confusion_table = table(max.col(t(predict_Y)), max.col(t(TEST.Y.array)))
print(confusion_table)
##
## 1 2 3
## 1 18 0 0
## 2 0 13 0
## 3 0 2 17
## [1] "fc1_bias" "fc1_weight" "fc2_bias" "fc2_weight" "fc3_bias"
## [6] "fc3_weight" "fc4_bias" "fc4_weight" "fc5_bias" "fc5_weight"
## [11] "fc6_bias" "fc6_weight" "fc7_bias" "fc7_weight"
Input = TEST.X.array[,1]
dim(Input) = c(4, 1)
preds = predict(model, Input, array.layout = "colmajor")
print(preds)
## [,1]
## [1,] 9.988134e-01
## [2,] 1.186645e-03
## [3,] 1.607887e-16
PARAMS = model$arg.params
Input = TEST.X.array[,1]
dim(Input) = c(4, 1)
fc1_out = t(Input) %*% as.array(PARAMS$fc1_weight) + as.array(PARAMS$fc1_bias)
relu1_out = fc1_out
relu1_out[relu1_out < 0] = 0
fc2_out = relu1_out %*% as.array(PARAMS$fc2_weight) + as.array(PARAMS$fc2_bias)
relu2_out = fc2_out
relu2_out[relu2_out < 0] = 0
plus2_out = cbind(relu1_out, relu2_out)
fc3_out = plus2_out %*% as.array(PARAMS$fc3_weight) + as.array(PARAMS$fc3_bias)
relu3_out = fc3_out
relu3_out[relu3_out < 0] = 0
plus3_out = cbind(plus2_out, relu3_out)
fc4_out = plus3_out %*% as.array(PARAMS$fc4_weight) + as.array(PARAMS$fc4_bias)
relu4_out = fc4_out
relu4_out[relu4_out < 0] = 0
plus4_out = cbind(plus3_out, relu4_out)
fc5_out = plus4_out %*% as.array(PARAMS$fc5_weight) + as.array(PARAMS$fc5_bias)
relu5_out = fc5_out
relu5_out[relu5_out < 0] = 0
plus5_out = cbind(plus4_out, relu5_out)
fc6_out = plus5_out %*% as.array(PARAMS$fc6_weight) + as.array(PARAMS$fc6_bias)
relu6_out = fc6_out
relu6_out[relu6_out < 0] = 0
plus6_out = cbind(plus5_out, relu6_out)
fc7_out = plus6_out %*% as.array(PARAMS$fc7_weight) + as.array(PARAMS$fc7_bias)
Softmax_out = exp(fc7_out)/sum(exp(fc7_out))
cbind(t(Softmax_out), preds)
## [,1] [,2]
## [1,] 9.988134e-01 9.988134e-01
## [2,] 1.186645e-03 1.186645e-03
## [3,] 1.607889e-16 1.607887e-16
– A commonly used nonlinear activation function is LeakyReLU, defined as follows:
\[ LeakyReLU(x, \alpha) = \left\{ \begin{array}{ll} x & \mbox{if } x > 0 \\ \alpha x & \mbox{otherwise} \end{array} \right. \]
\[ \frac{\partial}{\partial x}LeakyReLU(x, \alpha) = \left\{ \begin{array}{ll} 1 & \mbox{if } x > 0 \\ \alpha & \mbox{otherwise} \end{array} \right. \]
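– A minimal base-R sketch of these two formulas (alpha = 0.25 here simply matches the slope used in the MxNet code below):
leaky_relu = function(x, alpha = 0.25) {ifelse(x > 0, x, alpha * x)}
d_leaky_relu = function(x, alpha = 0.25) {ifelse(x > 0, 1, alpha)}
x = c(-2, -0.5, 0.3, 1.7)
leaky_relu(x)      # -0.500 -0.125  0.300  1.700
d_leaky_relu(x)    #  0.25   0.25   1.00   1.00
– In MxNet itself, the same activation is provided by mx.symbol.LeakyReLU: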
# Optimizer
my_optimizer = mx.opt.create(name = "sgd", learning.rate = 0.05, momentum = 0.9, wd = 0)
# Model Architecture
data = mx.symbol.Variable(name = 'data')
label = mx.symbol.Variable(name = 'label')
fc1 = mx.symbol.FullyConnected(data = data, num.hidden = 10, name = 'fc1')
relu1 = mx.symbol.LeakyReLU(data = fc1, act.type = 'leaky', slope = 0.25, name = 'relu1')
fc2 = mx.symbol.FullyConnected(data = relu1, num.hidden = 10, name = 'fc2')
relu2 = mx.symbol.LeakyReLU(data = fc2, act.type = 'leaky', slope = 0.25, name = 'relu2')
fc3 = mx.symbol.FullyConnected(data = relu2, num.hidden = 3, name = 'fc3')
softmax_layer = mx.symbol.softmax(data = fc3, axis = 1, name = 'softmax_layer')
eps = 1e-8
m_log = 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label))
m_logloss = mx.symbol.MakeLoss(m_log, name = 'm_logloss')
# Training
model = my.model.FeedForward.create(Iterator = my_iter,
loss_symbol = m_logloss, pred_symbol = softmax_layer,
Optimizer = my_optimizer, num_round = 100)
– However, it can serve as an auxiliary means of helping gradients propagate, and it is useful in situations where residual learning and dense connections cannot be applied.
– Let's briefly summarize the ways of tackling the vanishing gradient problem:
Rewriting the loss function, e.g. moving from the residual sum of squares to cross entropy, or adding pass-through paths
Working on the optimizer, e.g. replacing SGD with Adam
Changing the nonlinear activation function, e.g. ReLU and LeakyReLU
Standardizing the data, e.g. Batch Normalization
Changing the network structure, e.g. residual learning and dense connections
– We should be amazed at how fast research in deep learning moves, and at how recently these foundational breakthroughs appeared; this is also why the third wave of the artificial intelligence revolution is still going strong. Ever since residual learning made 1000-layer networks feasible, artificial intelligence (neural networks) has once again become mainstream. In the coming lessons we will start from AlexNet (2012) and work through several classic studies in deep learning to learn more of their secrets!