林嶔 (Lin, Chin)
Lesson 5 解決梯度消失問題
– 透過MxNet,讓我們可以簡單的寫出非常複雜的架構,並且看到裡面每一層的梯度。
library(mxnet)
# Train an MXNet symbol with a hand-written forward/backward loop and plot,
# on a log scale, the mean absolute gradient of every '*weight*' argument
# per epoch.
#
# Arguments:
#   Iterator    Object exposing reset(), iter.next() and value(); value() must
#               return a named list of NDArrays whose names match the input
#               variables of loss_symbol.
#   loss_symbol Symbol bound to the training executor (the loss).
#   pred_symbol Symbol used to extract the returned prediction model.
#   Optimizer   Optimizer object passed to mx.opt.get.updater().
#   num_round   Number of epochs (passes over Iterator).
#
# Returns the model produced by mxnet:::mx.model.extract.model().
#
# Side effects: resets the R and MXNet RNG seeds to 0, emits progress
# messages, and draws a plot on the active graphics device.
my.model.FeedForward.create <- function (Iterator,
                                         loss_symbol, pred_symbol,
                                         Optimizer, num_round = 100) {

  #0. Check data shape (peek at the first batch)

  Iterator$reset()
  Iterator$iter.next()
  my_values <- Iterator$value()
  input_shape <- lapply(my_values, dim)
  batch_size <- tail(input_shape[[1]], 1)

  #1. Build an executor to train model

  exec_list <- c(list(symbol = loss_symbol, ctx = mx.cpu(), grad.req = "write"),
                 input_shape)
  my_executor <- do.call(mx.simple.bind, exec_list)

  #2. Set the initial parameters

  mx.set.seed(0)
  new_arg <- mxnet:::mx.model.init.params(symbol = loss_symbol,
                                          input.shape = input_shape,
                                          output.shape = NULL,
                                          initializer = mxnet:::mx.init.uniform(0.01),
                                          ctx = mx.cpu())
  mx.exec.update.arg.arrays(my_executor, new_arg$arg.params, match.name = TRUE)
  mx.exec.update.aux.arrays(my_executor, new_arg$aux.params, match.name = TRUE)

  #3. Define the updater

  my_updater <- mx.opt.get.updater(optimizer = Optimizer,
                                   weights = my_executor$ref.arg.arrays)

  #4. Forward/Backward

  message('Start training:')
  set.seed(0)

  # One slot per epoch, combined once after the loop instead of rbind()-ing
  # inside it.
  epoch_grad <- vector("list", num_round)

  # seq_len() so that num_round = 0 runs zero epochs (1:0 would run two).
  for (i in seq_len(num_round)) {

    Iterator$reset()
    batch_loss <- list()
    batch_grad <- list()
    batch_seq <- 0
    t0 <- Sys.time()

    while (Iterator$iter.next()) {

      my_values <- Iterator$value()
      mx.exec.update.arg.arrays(my_executor, arg.arrays = my_values, match.name = TRUE)
      mx.exec.forward(my_executor, is.train = TRUE)
      mx.exec.backward(my_executor)
      update_args <- my_updater(weight = my_executor$ref.arg.arrays,
                                grad = my_executor$ref.grad.arrays)
      mx.exec.update.arg.arrays(my_executor, update_args, skip.null = TRUE)
      batch_loss[[length(batch_loss) + 1]] <- as.array(my_executor$ref.outputs[[1]])

      # Mean |gradient| per argument; arguments without a gradient yield NULL,
      # so lapply() (always a list) is used rather than sapply().
      grad_list <- lapply(my_executor$ref.grad.arrays, function (x) {
        if (!is.null(x)) {mean(abs(as.array(x)))}
      })
      is_weight <- grepl('weight', names(grad_list), fixed = TRUE)
      batch_grad[[length(batch_grad) + 1]] <- unlist(grad_list[is_weight])
      batch_seq <- batch_seq + 1

    }

    if (batch_seq == 0) {
      stop("Iterator produced no batches; nothing to train on.", call. = FALSE)
    }

    if (i %% 10 == 0 || i <= 5) {
      elapsed_secs <- as.numeric(Sys.time() - t0, units = 'secs')
      # 'digits' must be named: the second positional argument of formatC()
      # is 'width', so an unnamed 4 / 2 does not set the precision.
      message(paste0("epoch = ", i,
                     ": loss = ", formatC(mean(unlist(batch_loss)), format = "f", digits = 4),
                     " (Speed: ", formatC(batch_seq * batch_size / elapsed_secs, format = "f", digits = 2),
                     " sample/secs)"))
    }

    # Rows = tracked weights, columns = batches; average over the batches.
    epoch_grad[[i]] <- rowMeans(do.call(cbind, batch_grad))

  }

  # Matrix with one row per epoch and one column per tracked weight
  # (NULL when no epoch was run).
  epoch_grad <- do.call(rbind, epoch_grad)

  if (!is.null(epoch_grad)) {

    # Floor tiny values so they can be drawn on a log axis.
    epoch_grad[epoch_grad < 1e-8] <- 1e-8

    n_curve <- ncol(epoch_grad)
    COL <- rainbow(n_curve)
    # Small multiplicative jitter so overlapping curves remain distinguishable.
    random_pos <- 2^runif(n_curve, -0.5, 0.5)

    plot(epoch_grad[,1] * random_pos[1], type = 'l', col = COL[1],
         xlab = 'epoch', ylab = 'mean of abs(grad)', log = 'y',
         ylim = range(epoch_grad))

    # seq_len(n)[-1] is empty when there is a single curve (2:1 is not).
    for (j in seq_len(n_curve)[-1]) {
      lines(seq_len(nrow(epoch_grad)), epoch_grad[,j] * random_pos[j], col = COL[j])
    }

    # Label each curve with the real argument name; fall back to the generic
    # labels only when the gradient arrays carried no names.
    curve_names <- colnames(epoch_grad)
    if (is.null(curve_names)) {
      curve_names <- paste0('fc', seq_len(n_curve), '_weight')
    }
    legend('bottomright', curve_names, col = COL, lwd = 1)

  }

  #5. Get model

  my_model <- mxnet:::mx.model.extract.model(symbol = pred_symbol,
                                             train.execs = list(my_executor))

  my_model

}
# Load the iris data and store it feature-major: one column per sample.
data(iris)

# 4 x 150 matrix of the numeric measurements.
X.array <- array(t(as.matrix(iris[, 1:4])), dim = c(4, 150))

# 3 x 150 one-hot encoding of the species (no intercept column).
Y.array <- array(t(model.matrix(~ 0 + iris[, 5])), dim = c(3, 150))

# Reproducible split: 100 training samples, the remaining 50 for testing.
set.seed(0)
TRAIN.seq <- sample(150, 100)

TRAIN.X.array <- X.array[, TRAIN.seq]
TEST.X.array <- X.array[, -TRAIN.seq]

TRAIN.Y.array <- Y.array[, TRAIN.seq]
TEST.Y.array <- Y.array[, -TRAIN.seq]
# Adam optimizer shared by every training run below (no weight decay).
my_optimizer <- mx.opt.create(
  name = "adam",
  learning.rate = 0.001,
  beta1 = 0.9,
  beta2 = 0.999,
  epsilon = 1e-08,
  wd = 0
)
# Build a closure-based mini-batch iterator over the global training arrays
# TRAIN.X.array / TRAIN.Y.array (one column per sample).
#
# Arguments:
#   batch_size  Number of samples (columns) per batch.
#
# Returns a list with:
#   reset()      rewinds to the start of the epoch;
#   iter.next()  advances one batch, TRUE while a batch is available;
#   value()      list(data =, label =) of NDArrays for the current batch;
#   batch_size   the requested batch size;
#   batch        the counter's value at construction time (always 0; a copy,
#                not a live view of the internal counter).
my_iterator_core <- function(batch_size) {

  batch <- 0

  # ceiling(): when batch_size does not divide the number of samples, the
  # trailing partial batch is still served (value() pads it) instead of being
  # dropped for the whole epoch.
  batch_per_epoch <- ceiling(ncol(TRAIN.Y.array) / batch_size)

  # `<<-` is intentional: the counter lives in this closure's environment.
  reset <- function() {batch <<- 0}

  iter.next <- function() {
    batch <<- batch + 1
    batch <= batch_per_epoch
  }

  value <- function() {
    n_sample <- ncol(TRAIN.Y.array)
    idx <- seq_len(batch_size) + (batch - 1) * batch_size
    # Positions past the last sample are filled with randomly drawn samples so
    # every batch has exactly batch_size columns.
    overflow <- idx > n_sample
    n_overflow <- sum(overflow)
    # Replacement is only used when more fillers are needed than samples exist.
    idx[overflow] <- sample(seq_len(n_sample), n_overflow,
                            replace = n_overflow > n_sample)
    data <- mx.nd.array(TRAIN.X.array[, idx, drop = FALSE])
    label <- mx.nd.array(TRAIN.Y.array[, idx, drop = FALSE])
    list(data = data, label = label)
  }

  list(reset = reset, iter.next = iter.next, value = value,
       batch_size = batch_size, batch = batch)
}
# Reference-class wrapper that exposes the closure iterator built by
# my_iterator_core() through the value() / iter.next() / reset() methods
# of an MXNet data iterator.
#
# initialize(iter, batch_size = 100):
#   iter        ignored; the wrapped iterator is always created internally.
#   batch_size  samples per batch, forwarded to my_iterator_core().
my_iterator_func <- setRefClass(
  "Custom_Iter",
  fields = c("iter", "batch_size"),
  contains = "Rcpp_MXArrayDataIter",
  methods = list(
    initialize = function(iter, batch_size = 100) {
      .self$iter <- my_iterator_core(batch_size = batch_size)
      # Store the declared field so it is not left uninitialised.
      .self$batch_size <- batch_size
      .self
    },
    # Current batch as list(data =, label =).
    value = function() {
      .self$iter$value()
    },
    # Advance to the next batch; FALSE once the epoch is exhausted.
    iter.next = function() {
      .self$iter$iter.next()
    },
    # Rewind to the beginning of the epoch.
    reset = function() {
      .self$iter$reset()
    },
    # Nothing to release.
    finalize = function() {
    }
  )
)
# Mini-batch iterator: 20 training samples per batch.
my_iter <- my_iterator_func(iter = NULL, batch_size = 20)

# Network inputs.
data <- mx.symbol.Variable(name = "data")
label <- mx.symbol.Variable(name = "label")

# Three hidden layers (10 units each, ReLU) followed by a 3-unit output layer.
fc1 <- mx.symbol.FullyConnected(data = data, num.hidden = 10, name = "fc1")
relu1 <- mx.symbol.Activation(data = fc1, act.type = "relu", name = "relu1")

fc2 <- mx.symbol.FullyConnected(data = relu1, num.hidden = 10, name = "fc2")
relu2 <- mx.symbol.Activation(data = fc2, act.type = "relu", name = "relu2")

fc3 <- mx.symbol.FullyConnected(data = relu2, num.hidden = 10, name = "fc3")
relu3 <- mx.symbol.Activation(data = fc3, act.type = "relu", name = "relu3")

fc4 <- mx.symbol.FullyConnected(data = relu3, num.hidden = 3, name = "fc4")
softmax_layer <- mx.symbol.softmax(data = fc4, axis = 1, name = "softmax_layer")

# Cross-entropy loss; eps keeps log() away from zero.
eps <- 1e-8
m_log <- 0 - mx.symbol.mean(
  mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label)
)
m_logloss <- mx.symbol.MakeLoss(m_log, name = "m_logloss")

# Train for 100 epochs and plot the per-layer gradient magnitudes.
model <- my.model.FeedForward.create(
  Iterator = my_iter,
  loss_symbol = m_logloss,
  pred_symbol = softmax_layer,
  Optimizer = my_optimizer,
  num_round = 100
)
– 透過這種方式來得到預測結果:
# Class probabilities for the held-out samples (one column per sample).
predict_Y <- predict(model, TEST.X.array, array.layout = "colmajor")

# Rows: predicted class index; columns: true class index.
confusion_table <- table(
  max.col(t(predict_Y)),
  max.col(t(TEST.Y.array))
)
print(confusion_table)
##
## 1 2 3
## 1 18 0 0
## 2 0 1 0
## 3 0 14 17
# Network inputs.
data <- mx.symbol.Variable(name = "data")
label <- mx.symbol.Variable(name = "label")

# Seven hidden layers (10 units each, ReLU) followed by a 3-unit output layer.
fc1 <- mx.symbol.FullyConnected(data = data, num.hidden = 10, name = "fc1")
relu1 <- mx.symbol.Activation(data = fc1, act.type = "relu", name = "relu1")

fc2 <- mx.symbol.FullyConnected(data = relu1, num.hidden = 10, name = "fc2")
relu2 <- mx.symbol.Activation(data = fc2, act.type = "relu", name = "relu2")

fc3 <- mx.symbol.FullyConnected(data = relu2, num.hidden = 10, name = "fc3")
relu3 <- mx.symbol.Activation(data = fc3, act.type = "relu", name = "relu3")

fc4 <- mx.symbol.FullyConnected(data = relu3, num.hidden = 10, name = "fc4")
relu4 <- mx.symbol.Activation(data = fc4, act.type = "relu", name = "relu4")

fc5 <- mx.symbol.FullyConnected(data = relu4, num.hidden = 10, name = "fc5")
relu5 <- mx.symbol.Activation(data = fc5, act.type = "relu", name = "relu5")

fc6 <- mx.symbol.FullyConnected(data = relu5, num.hidden = 10, name = "fc6")
relu6 <- mx.symbol.Activation(data = fc6, act.type = "relu", name = "relu6")

fc7 <- mx.symbol.FullyConnected(data = relu6, num.hidden = 10, name = "fc7")
relu7 <- mx.symbol.Activation(data = fc7, act.type = "relu", name = "relu7")

fc8 <- mx.symbol.FullyConnected(data = relu7, num.hidden = 3, name = "fc8")
softmax_layer <- mx.symbol.softmax(data = fc8, axis = 1, name = "softmax_layer")

# Cross-entropy loss; eps keeps log() away from zero.
eps <- 1e-8
m_log <- 0 - mx.symbol.mean(
  mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label)
)
m_logloss <- mx.symbol.MakeLoss(m_log, name = "m_logloss")

# Train for 100 epochs and plot the per-layer gradient magnitudes.
model <- my.model.FeedForward.create(
  Iterator = my_iter,
  loss_symbol = m_logloss,
  pred_symbol = softmax_layer,
  Optimizer = my_optimizer,
  num_round = 100
)

# Evaluate on the held-out samples (rows: predicted class, columns: true class).
predict_Y <- predict(model, TEST.X.array, array.layout = "colmajor")
confusion_table <- table(
  max.col(t(predict_Y)),
  max.col(t(TEST.Y.array))
)
print(confusion_table)
##
## 1 2 3
## 2 18 15 17
– 理論上,反向傳播的過程隨著離輸出層越來越遠,梯度也將越來越小,讓我們再看看當初多層感知機的梯度公式吧:
\[ \begin{align} grad.o & = \frac{\partial}{\partial o}loss = \frac{o-y}{o(1-o)} \\ grad.l_2 & = \frac{\partial}{\partial l_2}loss = grad.o \otimes \frac{\partial}{\partial l_2}o= o-y \\ grad.W^2_1 & = \frac{\partial}{\partial W^2_1}loss = grad.l_2 \otimes \frac{\partial}{\partial W^2_1}l_2 = \frac{{1}}{n} \otimes (h_1^E)^T \bullet grad.l_2\\ grad.h_1^E & = \frac{\partial}{\partial h_1^E}loss = grad.l_2 \otimes \frac{\partial}{\partial h_1^E}l_2 = grad.l_2 \bullet (W^2_1)^T \\ grad.l_1 & = \frac{\partial}{\partial l_1}loss = grad.h_1 \otimes \frac{\partial}{\partial l_1}h_1 = grad.h_1 \otimes \frac{\partial}{\partial l_1}ReLU(l_1) \\ grad.W^1_d & = \frac{\partial}{\partial W^1_d}loss = grad.l_1 \otimes \frac{\partial}{\partial W^1_d}l_1 = \frac{{1}}{n} \otimes (x^E)^T \bullet grad.l_1 \end{align} \]
– 我們試想一下,目前我們隨機決定的權重大多是介於0的附近,因此輸入的值如果變異非常大,那就會造成梯度的波動。
– 這也是我們上一節課最開始的時候,為什麼要對輸入數據進行標準化的原因。
– 同樣的道理,我們也可以對每一個隱藏層的輸出再做一次標準化,這個做法叫做「批量標準化」(Batch normalization)。兩位Google的研究員Sergey Ioffe以及Christian Szegedy在2015年所發表的研究:Batch Normalization: Accelerating Deep Network Training by Reducing Internal Covariate Shift第一次提到了這個想法。
\[ \begin{align} \hat{x_i} & = \frac{x_i - \bar{x}}{\sqrt{\sigma^2_{x} + \epsilon}} \\ y_i = BatchNorm(x_i) & = \hat{x_i} \times \gamma \ + \beta \\\\ \bar{x} & = \frac{1}{n} \sum\limits_{i=1}^{n} x_i \\ \sigma^2_{x} & = \frac{1}{n} \sum\limits_{i=1}^{n} (x_i - \bar{x})^2 \end{align} \]
– 這裡的\(\epsilon\)代表一個很小的數字(避免除以0),\(\bar{x}\)與\(\sigma^2_{x}\)則分別是\(x\)的平均值以及變異數,\(\gamma\)以及\(\beta\)則是兩個線性轉換項,這使得批量標準化成為一個可還原的過程(只要令\(\gamma = \sqrt{\sigma^2_{x} + \epsilon}\)且\(\beta = \bar{x}\),輸出就會還原為原始的\(x\))。
– 我們假設在反向傳播到\(BatchNorm\)時已經存在一個\(grad.y\),並以這個開始往下推導(過程略):
\[ \begin{align} \frac{\partial loss}{\partial \beta} & = \frac{1}{n} \sum\limits_{i=1}^{n} grad.y_i \\ \frac{\partial loss}{\partial \gamma} & = \frac{1}{n} \sum\limits_{i=1}^{n} grad.y_i \times \hat{x_i} \\\\ \frac{\partial loss}{\partial \hat{x}} & = grad.y \otimes \gamma \\ \frac{\partial loss}{\partial \sigma^2_{x}} & = - \frac{1} {2} \sum\limits_{i=1}^{n} \gamma (x_i - \bar{x}) (\sigma^2_{x} + \epsilon)^{-1.5} grad.y_i \\ \frac{\partial loss}{\partial \bar{x}} & = \sum\limits_{i=1}^{n} \frac {- grad.y_i \times \gamma} {\sqrt{\sigma^2_{x} + \epsilon}} + \frac{\partial loss}{\partial \sigma^2_{x}} \times \frac {-2 \sum\limits_{i=1}^{n} (x_i - \bar{x}) } {n} \\\\ \frac{\partial loss}{\partial x} & = \frac{\partial loss}{\partial \hat{x}} \otimes \frac {1} {\sqrt{\sigma^2_{x} + \epsilon}} \oplus \frac{\partial loss}{\partial \sigma^2_{x}} \otimes \frac {2(x_i - \bar{x})} {n} \oplus \frac{\partial loss}{\partial \bar{x}} \otimes \frac {1} {n} \end{align} \]
– 在MxNet的輔助下,要實現批量標準化其實非常簡單!
# Network inputs.
data <- mx.symbol.Variable(name = "data")
label <- mx.symbol.Variable(name = "label")

# Seven hidden blocks of FullyConnected(10) -> BatchNorm -> ReLU,
# followed by a 3-unit output layer.
fc1 <- mx.symbol.FullyConnected(data = data, num.hidden = 10, name = "fc1")
bn1 <- mx.symbol.BatchNorm(data = fc1, axis = 1, eps = 1e-3,
                           fix.gamma = TRUE, name = "bn1")
relu1 <- mx.symbol.Activation(data = bn1, act.type = "relu", name = "relu1")

fc2 <- mx.symbol.FullyConnected(data = relu1, num.hidden = 10, name = "fc2")
bn2 <- mx.symbol.BatchNorm(data = fc2, axis = 1, eps = 1e-3,
                           fix.gamma = TRUE, name = "bn2")
relu2 <- mx.symbol.Activation(data = bn2, act.type = "relu", name = "relu2")

fc3 <- mx.symbol.FullyConnected(data = relu2, num.hidden = 10, name = "fc3")
bn3 <- mx.symbol.BatchNorm(data = fc3, axis = 1, eps = 1e-3,
                           fix.gamma = TRUE, name = "bn3")
relu3 <- mx.symbol.Activation(data = bn3, act.type = "relu", name = "relu3")

fc4 <- mx.symbol.FullyConnected(data = relu3, num.hidden = 10, name = "fc4")
bn4 <- mx.symbol.BatchNorm(data = fc4, axis = 1, eps = 1e-3,
                           fix.gamma = TRUE, name = "bn4")
relu4 <- mx.symbol.Activation(data = bn4, act.type = "relu", name = "relu4")

fc5 <- mx.symbol.FullyConnected(data = relu4, num.hidden = 10, name = "fc5")
bn5 <- mx.symbol.BatchNorm(data = fc5, axis = 1, eps = 1e-3,
                           fix.gamma = TRUE, name = "bn5")
relu5 <- mx.symbol.Activation(data = bn5, act.type = "relu", name = "relu5")

fc6 <- mx.symbol.FullyConnected(data = relu5, num.hidden = 10, name = "fc6")
bn6 <- mx.symbol.BatchNorm(data = fc6, axis = 1, eps = 1e-3,
                           fix.gamma = TRUE, name = "bn6")
relu6 <- mx.symbol.Activation(data = bn6, act.type = "relu", name = "relu6")

fc7 <- mx.symbol.FullyConnected(data = relu6, num.hidden = 10, name = "fc7")
bn7 <- mx.symbol.BatchNorm(data = fc7, axis = 1, eps = 1e-3,
                           fix.gamma = TRUE, name = "bn7")
relu7 <- mx.symbol.Activation(data = bn7, act.type = "relu", name = "relu7")

fc8 <- mx.symbol.FullyConnected(data = relu7, num.hidden = 3, name = "fc8")
softmax_layer <- mx.symbol.softmax(data = fc8, axis = 1, name = "softmax_layer")

# Cross-entropy loss; eps keeps log() away from zero.
eps <- 1e-8
m_log <- 0 - mx.symbol.mean(
  mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label)
)
m_logloss <- mx.symbol.MakeLoss(m_log, name = "m_logloss")

# Train for 100 epochs and plot the per-layer gradient magnitudes.
model <- my.model.FeedForward.create(
  Iterator = my_iter,
  loss_symbol = m_logloss,
  pred_symbol = softmax_layer,
  Optimizer = my_optimizer,
  num_round = 100
)

# Evaluate on the held-out samples (rows: predicted class, columns: true class).
predict_Y <- predict(model, TEST.X.array, array.layout = "colmajor")
confusion_table <- table(
  max.col(t(predict_Y)),
  max.col(t(TEST.Y.array))
)
print(confusion_table)
##
## 1 2 3
## 1 18 0 0
## 2 0 14 0
## 3 0 1 17
## [1] "bn1_beta" "bn1_gamma" "bn2_beta" "bn2_gamma" "bn3_beta"
## [6] "bn3_gamma" "bn4_beta" "bn4_gamma" "bn5_beta" "bn5_gamma"
## [11] "bn6_beta" "bn6_gamma" "bn7_beta" "bn7_gamma" "fc1_bias"
## [16] "fc1_weight" "fc2_bias" "fc2_weight" "fc3_bias" "fc3_weight"
## [21] "fc4_bias" "fc4_weight" "fc5_bias" "fc5_weight" "fc6_bias"
## [26] "fc6_weight" "fc7_bias" "fc7_weight" "fc8_bias" "fc8_weight"
## [1] "bn1_moving_mean" "bn1_moving_var" "bn2_moving_mean" "bn2_moving_var"
## [5] "bn3_moving_mean" "bn3_moving_var" "bn4_moving_mean" "bn4_moving_var"
## [9] "bn5_moving_mean" "bn5_moving_var" "bn6_moving_mean" "bn6_moving_var"
## [13] "bn7_moving_mean" "bn7_moving_var"
# First held-out sample as a 4 x 1 column (features x samples).
Input <- array(TEST.X.array[, 1], dim = c(4, 1))

# Reference prediction from the trained model.
preds <- predict(model, Input, array.layout = "colmajor")
print(preds)
## [,1]
## [1,] 0.91460079
## [2,] 0.03534730
## [3,] 0.05005192
# Reproduce predict() by hand from the stored parameters: seven blocks of
# FullyConnected -> BatchNorm (using the moving statistics) -> ReLU, then the
# output layer and a softmax.
PARAMS <- model$arg.params
MEANS <- model$aux.params

# First held-out sample as a 4 x 1 column.
Input <- array(TEST.X.array[, 1], dim = c(4, 1))

bn_eps <- 1e-3

# x %*% W + b for the fully connected layer called `layer`.
dense_forward <- function(x, layer) {
  x %*% as.array(PARAMS[[paste0(layer, "_weight")]]) +
    as.array(PARAMS[[paste0(layer, "_bias")]])
}

# Inference-time batch normalisation for the BatchNorm layer called `layer`.
batch_norm_forward <- function(x, layer) {
  moving_mean <- as.array(MEANS[[paste0(layer, "_moving_mean")]])
  moving_var <- as.array(MEANS[[paste0(layer, "_moving_var")]])
  bn_scale <- as.array(PARAMS[[paste0(layer, "_gamma")]])
  bn_shift <- as.array(PARAMS[[paste0(layer, "_beta")]])
  (x - moving_mean) / sqrt(moving_var + bn_eps) * bn_scale + bn_shift
}

# ReLU: clamp negative entries to zero.
relu_forward <- function(x) {
  x[x < 0] <- 0
  x
}

fc1_out <- dense_forward(t(Input), "fc1")
bn1_out <- batch_norm_forward(fc1_out, "bn1")
relu1_out <- relu_forward(bn1_out)

fc2_out <- dense_forward(relu1_out, "fc2")
bn2_out <- batch_norm_forward(fc2_out, "bn2")
relu2_out <- relu_forward(bn2_out)

fc3_out <- dense_forward(relu2_out, "fc3")
bn3_out <- batch_norm_forward(fc3_out, "bn3")
relu3_out <- relu_forward(bn3_out)

fc4_out <- dense_forward(relu3_out, "fc4")
bn4_out <- batch_norm_forward(fc4_out, "bn4")
relu4_out <- relu_forward(bn4_out)

fc5_out <- dense_forward(relu4_out, "fc5")
bn5_out <- batch_norm_forward(fc5_out, "bn5")
relu5_out <- relu_forward(bn5_out)

fc6_out <- dense_forward(relu5_out, "fc6")
bn6_out <- batch_norm_forward(fc6_out, "bn6")
relu6_out <- relu_forward(bn6_out)

fc7_out <- dense_forward(relu6_out, "fc7")
bn7_out <- batch_norm_forward(fc7_out, "bn7")
relu7_out <- relu_forward(bn7_out)

fc8_out <- dense_forward(relu7_out, "fc8")

Softmax_out <- exp(fc8_out) / sum(exp(fc8_out))

# Hand-computed probabilities next to the model's own prediction.
cbind(t(Softmax_out), preds)
## [,1] [,2]
## [1,] 0.91460075 0.91460079
## [2,] 0.03534731 0.03534730
## [3,] 0.05005193 0.05005192
# Network inputs.
data <- mx.symbol.Variable(name = "data")
label <- mx.symbol.Variable(name = "label")

# Stack 25 blocks of FullyConnected(10) -> BatchNorm -> ReLU; each block
# consumes the output of the previous one, the first one consumes `data`.
layer_input <- data
for (i in 1:25) {
  fc <- mx.symbol.FullyConnected(data = layer_input, num.hidden = 10,
                                 name = paste0("fc", i))
  bn <- mx.symbol.BatchNorm(data = fc, axis = 1, name = paste0("bn", i))
  relu <- mx.symbol.Activation(data = bn, act.type = "relu",
                               name = paste0("relu", i))
  layer_input <- relu
}

# 3-unit output layer and softmax.
fc_final <- mx.symbol.FullyConnected(data = relu, num.hidden = 3, name = "fc_final")
softmax_layer <- mx.symbol.softmax(data = fc_final, axis = 1, name = "softmax_layer")

# Cross-entropy loss; eps keeps log() away from zero.
eps <- 1e-8
m_log <- 0 - mx.symbol.mean(
  mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label)
)
m_logloss <- mx.symbol.MakeLoss(m_log, name = "m_logloss")

# Train for 100 epochs and plot the per-layer gradient magnitudes.
model <- my.model.FeedForward.create(
  Iterator = my_iter,
  loss_symbol = m_logloss,
  pred_symbol = softmax_layer,
  Optimizer = my_optimizer,
  num_round = 100
)

# Evaluate on the held-out samples (rows: predicted class, columns: true class).
predict_Y <- predict(model, TEST.X.array, array.layout = "colmajor")
confusion_table <- table(
  max.col(t(predict_Y)),
  max.col(t(TEST.Y.array))
)
print(confusion_table)
##
## 1 2 3
## 1 18 0 15
## 2 0 15 2
– 因此,梯度消失問題並沒有辦法這麼簡單的被解決掉,我們仍然需要其他手段來解決這個問題!
– 事實上一個更關鍵的突破在2015年的ILSVRC競賽出現,這個突破可以說是至今為止深度學習在理論上最重要的突破,獲勝團隊是由微軟亞洲研究院何愷明所領軍的團隊,他們發展出的ResNet將錯誤率降低至3.57%,大幅超越了人類平均的5.0%。
– 更值得一提的是,在所有人都被梯度消失問題所困擾的時刻,何愷明的團隊在2015年的ILSVRC中所提出的ResNet是一個1000層的網路,同一個時間幾乎沒有團隊有能力訓練超過50層的神經網路。
– 這個爆炸級的研究:Deep Residual Learning for Image Recognition在2016年的CVPR上發表後,理所當然地獲得了該研討會的最佳論文獎: