林嶔 (Lin, Chin)
Lesson 3
As before, let's start with a simple problem: we want to find the extrema of \(y = 2x^2 +3x^3 +x^4\). But first, let's look at the graph of the function:
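A minimal sketch of how such a plot could be drawn (the original figure code is not reproduced here; the plotting range and reference lines are illustrative choices):

obj.fun = function(x) {
  return(2*x^2 + 3*x^3 + x^4)
}
# draw the curve over a range that covers both local minima
curve(obj.fun, from = -2.5, to = 1, xlab = 'x', ylab = 'y')
abline(v = c(-1.64, 0), lty = 2)  # approximate locations of the two local minima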
differential.fun = function(x) {
return(4*x + 9*x^2 + 4*x^3)
}
start.value = 0.5
num.iteration = 30
learning.rate = 0.05
result.x = rep(NA, num.iteration)
for (i in 1:num.iteration) {
if (i == 1) {
result.x[1] = start.value
} else {
# gradient descent update: x <- x - lr * f'(x)
result.x[i] = result.x[i-1] - learning.rate * differential.fun(result.x[i-1])
}
}
tail(result.x, 1)
## [1] 0.000275395
start.value = -3
num.iteration = 30
learning.rate = 0.05
result.x = rep(NA, num.iteration)
for (i in 1:num.iteration) {
if (i == 1) {
result.x[1] = start.value
} else {
result.x[i] = result.x[i-1] - learning.rate * differential.fun(result.x[i-1])
}
}
tail(result.x, 1)
## [1] -1.640338
– You can try different starting values yourself in linear regression / logistic regression, and you will find that neither of them has this problem. That is because their loss surfaces look more like convex functions such as \(y = x^2\).
– A function like \(y = 2x^2 +3x^3 +x^4\), by contrast, is non-convex.
So we have learned that gradient descent, being an approximate solution method, usually only finds a local minimum on a non-convex function, and which local minimum it finds depends heavily on the starting value.
From the perspective of predicting unknown outcomes, the equation found at a local optimum, even though it is not the one with the best predictive power, still has some predictive ability, so it remains useful in practice.
– Of course, we would still like to avoid local extrema as much as possible. Statisticians and computer scientists have put a great deal of effort into this, and we will now introduce some variants of gradient descent that try to keep our optimizer from getting stuck in a local extremum.
\[x_{\left(epoch:t\right)} = x_{\left(epoch:t - 1\right)} - lr \cdot \frac{\partial}{\partial x}f(x_{\left(epoch:t - 1\right)}) + \rho \cdot \left[x_{\left(epoch:t - 1\right)}-x_{\left(epoch:t - 2\right)}\right]\]
differential.fun = function(x) {
return(4*x + 9*x^2 + 4*x^3)
}
start.value = 0.5
num.iteration = 30
learning.rate = 0.1
momentum = 0.9
result.x = rep(NA, num.iteration)
for (i in 1:num.iteration) {
if (i == 1) {
result.x[1] = start.value
} else if (i == 2) {
result.x[i] = result.x[i-1] - learning.rate * differential.fun(result.x[i-1])
} else {
# momentum update: add rho * (previous step) to the plain gradient step
result.x[i] = result.x[i-1] - learning.rate * differential.fun(result.x[i-1]) + momentum * (result.x[i-1] - result.x[i-2])
}
}
tail(result.x, 1)
## [1] -1.602265
– This example finds the minimum of \(y=x^2\), but the learning rate has mistakenly been set to 1.02. Under normal circumstances this cannot converge, yet with momentum it can.
differential.fun = function(x) {
return(2*x)
}
start.value = 0.5
num.iteration = 30
learning.rate = 1.02
momentum = 0.9
result.x = rep(NA, num.iteration)
for (i in 1:num.iteration) {
if (i == 1) {
result.x[1] = start.value
} else if (i == 2) {
result.x[i] = result.x[i-1] - learning.rate * differential.fun(result.x[i-1])
} else {
result.x[i] = result.x[i-1] - learning.rate * differential.fun(result.x[i-1]) + momentum * (result.x[i-1] - result.x[i-2])
}
}
tail(result.x, 1)
## [1] -0.03117517
\[log(\frac{{p}}{1-p}) = sin(b_{1}x_1x_2+b_2x_2)\]
set.seed(0)
x1 = runif(1000, min = -3, max = 3)
x2 = runif(1000, min = -3, max = 3)
lr = sin(0.7*x1*x2+0.7*x2) + rnorm(1000, sd = 0.3)
y = (lr > 0) + 0L
\[ \begin{align} L(x_1,x_2) & = sin(b_{1}x_1x_2+b_2x_2) \\ S(x) & = \frac{{1}}{1+e^{-x}} \\ CE(y, p) & = -\left(y_{i} \cdot log(p_{i}) + (1-y_{i}) \cdot log(1-p_{i})\right) \\ loss & = CE(y, S(L(x_1,x_2))) \end{align} \]
– Partial derivative with respect to \(b_1\) (derivation omitted):
\[ \begin{align} \frac{\partial}{\partial b_1}loss_i & = (p_i - y_i) \cdot cos(b_{1}x_1x_2+b_2x_2) \cdot x_{1i}x_{2i} \end{align} \]
– Partial derivative with respect to \(b_2\) (derivation omitted):
\[ \begin{align} \frac{\partial}{\partial b_2}loss_i & = (p_i - y_i) \cdot cos(b_{1}x_1x_2+b_2x_2) \cdot x_{2i} \end{align} \]
pred.fun = function(b1, b2, x1 = x1, x2 = x2) {
lr = sin(b1*x1*x2+b2*x2)
p = 1 / (1 + exp(-lr))
return(p)
}
loss.fun = function(p, y = y, eps = 1e-8) {
loss = -mean(y * log(p + eps) + (1 - y) * log(1 - p + eps))
return(loss)
}
differential.fun.b1 = function(p, b1, b2, x1 = x1, x2 = x2, y = y) {
return(mean((p-y)*cos(b1*x1*x2+b2*x2)*x1*x2))
}
differential.fun.b2 = function(p, b1, b2, x1 = x1, x2 = x2, y = y) {
return(mean((p-y)*cos(b1*x1*x2+b2*x2)*x2))
}
LOGISTIC_VIS = function (x1, x2, y, num.iteration = 100, lr = 0.1, momentum = 0.9, start_b1 = 0, start_b2 = 0) {
ans_b1 = rep(start_b1, num.iteration)
ans_b2 = rep(start_b2, num.iteration)
ans_loss = rep(0, num.iteration)
pred.fun = function(b1, b2, x1 = x1, x2 = x2) {
lr = sin(b1*x1*x2+b2*x2)
p = 1 / (1 + exp(-lr))
return(p)
}
loss.fun = function(p, y = y, eps = 1e-8) {
loss = -mean(y * log(p + eps) + (1 - y) * log(1 - p + eps))
return(loss)
}
differential.fun.b1 = function(p, b1, b2, x1 = x1, x2 = x2, y = y) {
return(mean((p-y)*cos(b1*x1*x2+b2*x2)*x1*x2))
}
differential.fun.b2 = function(p, b1, b2, x1 = x1, x2 = x2, y = y) {
return(mean((p-y)*cos(b1*x1*x2+b2*x2)*x2))
}
for (i in 1:num.iteration) {
if (i >= 2) {
grad_b1 = differential.fun.b1(p = current_p, b1 = ans_b1[i-1], b2 = ans_b2[i-1], x1 = x1, x2 = x2, y = y)
grad_b2 = differential.fun.b2(p = current_p, b1 = ans_b1[i-1], b2 = ans_b2[i-1], x1 = x1, x2 = x2, y = y)
if (i == 2) {
momentum_b1 = 0
momentum_b2 = 0
} else {
momentum_b1 = momentum * (ans_b1[i-1] - ans_b1[i-2])
momentum_b2 = momentum * (ans_b2[i-1] - ans_b2[i-2])
}
ans_b1[i] = ans_b1[i-1] - lr * grad_b1 + momentum_b1
ans_b2[i] = ans_b2[i-1] - lr * grad_b2 + momentum_b2
}
current_p = pred.fun(b1 = ans_b1[i], b2 = ans_b2[i], x1 = x1, x2 = x2)
ans_loss[i] = loss.fun(p = current_p, y = y)
}
par(mfrow = c(1, 2))
require(scales)
require(plot3D)
x1_seq = seq(min(x1), max(x1), length.out = 300)
x2_seq = seq(min(x2), max(x2), length.out = 300)
z_matrix = sapply(x2_seq, function(x) {pred.fun(b1 = ans_b1[num.iteration], b2 = ans_b2[num.iteration], x1 = x1_seq, x2 = x)})
image2D(z = z_matrix, main = paste0('b1=', formatC(ans_b1[num.iteration], format = 'f', 1), ';b2=', formatC(ans_b2[num.iteration], format = 'f', 1)),
x = x1_seq, xlab = 'x1',
y = x2_seq, ylab = 'x2',
shade = 0.2, rasterImage = TRUE,
col = colorRampPalette(c("#FFA0A0", "#FFFFFF", "#A0A0FF"))(100))
points(x1, x2, col = (y + 1)*2, pch = 19, cex = 0.5)
require(grDevices) # for trans3d
b1_seq = seq(-0.5, 2, length.out = 50)
b2_seq = seq(-0.5, 2, length.out = 50)
LOSS <- b1_seq %*% t(b2_seq)
for (i in 1:length(b1_seq)) {
for (j in 1:length(b2_seq)) {
LOSS[i,j] = loss.fun(p = pred.fun(b1 = b1_seq[i], b2 = b2_seq[j], x1 = x1, x2 = x2), y = y, eps = 1e-8)
}
}
persp(b1_seq, b2_seq, LOSS, theta = 120, phi = 50, expand = 0.8, col = "lightblue",
ltheta = 120, shade = 0.75, ticktype = "detailed",
xlab = "b1", ylab = "b2", zlab = "loss", main = paste0('loss=', formatC(ans_loss[num.iteration], 4, format = 'f'))) -> res
lines(trans3d(ans_b1, ans_b2, ans_loss, pmat = res), col = '#AA0000', lwd = 1.5)
points(trans3d(ans_b1[1], ans_b2[1], ans_loss[1], pmat = res), col = '#AA0000', cex = 1, pch = 16)
points(trans3d(ans_b1[num.iteration], ans_b2[num.iteration], ans_loss[num.iteration], pmat = res), col = '#FF0000', cex = 1, pch = 16)
}
LOGISTIC_VIS(x1 = x1, x2 = x2, y = y, num.iteration = 100, lr = 0.1, momentum = 0, start_b1 = 0, start_b2 = 0)
LOGISTIC_VIS(x1 = x1, x2 = x2, y = y, num.iteration = 100, lr = 0.1, momentum = 0.9, start_b1 = 0, start_b2 = 0)
In the experiment above we saw that momentum can help us break out of a local extremum, but what if the loss surface is even more complicated? In fact, the case \(b_1 = 0.7\) and \(b_2 = 0.7\) in the previous example is a carefully designed special case. Let's try generating the data with \(b_1 = 2\) and \(b_2 = 3\) instead and look at the result.
This is the result with momentum set to 0.9:
set.seed(0)
x1 = runif(1000, min = -3, max = 3)
x2 = runif(1000, min = -3, max = 3)
lr = sin(2*x1*x2+3*x2) + rnorm(1000, sd = 0.3)
y = (lr > 0) + 0L
LOGISTIC_VIS(x1 = x1, x2 = x2, y = y, num.iteration = 300, lr = 0.1, momentum = 0.9, start_b1 = 0, start_b2 = 0)
In our previous examples, every weight update used all of the samples. However, real-world training data are nowadays extremely large, so the standard practice is to use only a small subset of the samples in each update.
In theory, as long as our sampling has no particular bias, the expected gradient from a small random sample should be the same as from the full sample, so the results should be identical.
– In practice, however, the variation introduced by each round of sampling gives the current gradient a certain amount of variability, which makes the results of "mini-batch stochastic gradient descent (SGD)" differ from those of "full-batch gradient descent". The question is: which of the two do you think gives better results? And if we use mini-batch gradient descent, what is the relationship between the batch size and the quality of the optimization?
– Below are loss surfaces drawn for the problem in Exercise 1. The 6 figures show, for batch sizes from small to large, the variability of the final loss; observe them and discuss this question:
– Of course, a batch size that is too small does not seem ideal either; it also poses a risk for the global extremum, since it may likewise cause the optimizer to miss it. Overall, though, smaller batches appear to be advantageous!
– This example uses data generated with \(b_1 = 0.7\) and \(b_2 = 0.7\); in Exercise 1, with momentum set to 0, the optimizer could not escape the local extremum. Let's try again with a batch size of 30:
– The example below uses data generated with \(b_1 = 2\) and \(b_2 = 3\); in Exercise 1, even with a momentum of 0.9, the optimizer could not find the answer, but after adjusting the batch size (batch size = 30) we will be able to find the global extremum (a code sketch follows this list):
– It was ultimately found that, across several common algorithms and data sets, test-set accuracy is higher when the batch size does not exceed 32.
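– The batch-size experiments above were run with a mini-batch variant of the optimization whose code is not reproduced here. Below is a minimal sketch of the idea, assuming the pred.fun, differential.fun.b1 and differential.fun.b2 defined earlier and the generated x1, x2, y are still in scope (batch size 30 as in the text; no momentum):

batch_size = 30
num.iteration = 300
lr = 0.1
b1 = 0
b2 = 0
for (i in 1:num.iteration) {
  # draw a random mini-batch and compute the gradient on it only
  idx = sample(seq_along(x1), batch_size)
  p = pred.fun(b1 = b1, b2 = b2, x1 = x1[idx], x2 = x2[idx])
  grad_b1 = differential.fun.b1(p = p, b1 = b1, b2 = b2, x1 = x1[idx], x2 = x2[idx], y = y[idx])
  grad_b2 = differential.fun.b2(p = p, b1 = b1, b2 = b2, x1 = x1[idx], x2 = x2[idx], y = y[idx])
  b1 = b1 - lr * grad_b1
  b2 = b2 - lr * grad_b2
}
c(b1, b2)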
set.seed(0)
x1 = rnorm(100, sd = 1)
x2 = rnorm(100, sd = 1)
lr1 = - 1.5 + x1^2 + x2^2 + rnorm(100)
y = (lr1 > 0) + 0L
test_x1 = rnorm(100, sd = 1)
test_x2 = rnorm(100, sd = 1)
lr1 = - 1.5 + test_x1^2 + test_x2^2 + rnorm(100)
test_y = (lr1 > 0) + 0L
MLP_Trainer = function (num.iteration = 500, num.hidden = 30, lr = 0.1, x1 = x1, x2 = x2, y = y, test_x1 = NULL, test_x2 = NULL, test_y = NULL) {
#Functions
#Forward
S.fun = function (x, eps = 1e-8) {
S = 1/(1 + exp(-x))
S[S < eps] = eps
S[S > 1 - eps] = 1 - eps
return(S)
}
ReLU.fun = function (x) {
x[x < 0] <- 0
return(x)
}
L.fun = function (X, W) {
X.E = cbind(1, X)
L = X.E %*% W
return(L)
}
CE.fun = function (o, y, eps = 1e-8) {
loss = -1/length(y) * sum(y * log(o + eps) + (1 - y) * log(1 - o + eps))
return(loss)
}
#Backward
grad_o.fun = function (o, y) {
return((o - y)/(o*(1-o)))
}
grad_l2.fun = function (grad_o, o) {
return(grad_o*(o*(1-o)))
}
grad_W2.fun = function (grad_l2, h1) {
h1.E = cbind(1, h1)
return(t(h1.E) %*% grad_l2/nrow(h1))
}
grad_h1.fun = function (grad_l2, W2) {
return(grad_l2 %*% t(W2[-1,]))
}
grad_l1.fun = function (grad_h1, l1) {
de_l1 = l1
de_l1[de_l1<0] = 0
de_l1[de_l1>0] = 1
return(grad_h1*de_l1)
}
grad_W1.fun = function (grad_l1, x) {
x.E = cbind(1, x)
return(t(x.E) %*% grad_l1/nrow(x))
}
#Calculating
X_matrix = cbind(x1, x2)
W1_list = list()
W2_list = list()
loss_seq = rep(0, num.iteration)
#Start random values
W1_list[[1]] = matrix(rnorm(3*num.hidden, sd = 1), nrow = 3, ncol = num.hidden)
W2_list[[1]] = matrix(rnorm(num.hidden + 1, sd = 1), nrow = num.hidden + 1, ncol = 1)
for (i in 2:(num.iteration+1)) {
#Forward
current_l1 = L.fun(X = X_matrix, W = W1_list[[i - 1]])
current_h1 = ReLU.fun(x = current_l1)
current_l2 = L.fun(X = current_h1, W = W2_list[[i - 1]])
current_o = S.fun(x = current_l2)
loss_seq[i-1] = CE.fun(o = current_o, y = y, eps = 1e-8)
#Backward
current_grad_o = grad_o.fun(o = current_o, y = y)
current_grad_l2 = grad_l2.fun(grad_o = current_grad_o, o = current_o)
current_grad_W2 = grad_W2.fun(grad_l2 = current_grad_l2, h1 = current_h1)
current_grad_h1 = grad_h1.fun(grad_l2 = current_grad_l2, W2 = W2_list[[i - 1]])
current_grad_l1 = grad_l1.fun(grad_h1 = current_grad_h1, l1 = current_l1)
current_grad_W1 = grad_W1.fun(grad_l1 = current_grad_l1, x = X_matrix)
W2_list[[i]] = W2_list[[i-1]] - lr * current_grad_W2
W1_list[[i]] = W1_list[[i-1]] - lr * current_grad_W1
}
require(scales)
require(plot3D)
x1_seq = seq(min(x1), max(x1), length.out = 100)
x2_seq = seq(min(x2), max(x2), length.out = 100)
pre_func = function (x1, x2, W1 = W1_list[[length(W1_list)]], W2 = W2_list[[length(W2_list)]]) {
new_X = cbind(x1, x2)
O = S.fun(x = L.fun(X = ReLU.fun(x = L.fun(X = new_X, W = W1)), W = W2))
return(O)
}
pred_y = pre_func(x1 = x1, x2 = x2)
MAIN_TXT = paste0('Train-Acc:', formatC(mean((pred_y > 0.5) == y), 2, format = 'f'))
if (!is.null(test_x1)) {
pred_test_y = pre_func(x1 = test_x1, x2 = test_x2)
MAIN_TXT = paste0(MAIN_TXT, '; Test-Acc:', formatC(mean((pred_test_y > 0.5) == test_y), 2, format = 'f'))
}
z_matrix = sapply(x2_seq, function(x) {pre_func(x1 = x1_seq, x2 = x)})
par(mfrow = c(1, 2))
image2D(z = z_matrix, main = MAIN_TXT,
x = x1_seq, xlab = 'x1',
y = x2_seq, ylab = 'x2',
shade = 0.2, rasterImage = TRUE,
col = colorRampPalette(c("#FFA0A0", "#FFFFFF", "#A0A0FF"))(100))
points(x1, x2, col = (y + 1)*2, pch = 19, cex = 0.5)
if (!is.null(test_x1)) {
points(test_x1, test_x2, col = 'black', bg = c('#C00000', '#0000C0')[(test_y + 1)], pch = 21)
}
plot(loss_seq, type = 'l', main = 'loss', xlab = 'iter.', ylab = 'CE loss')
}
MLP_Trainer = function (num.iteration = 500, num.hidden = 30, batch_size = 30, lr = 0.1, momentum = 0, x1 = x1, x2 = x2, y = y, test_x1 = NULL, test_x2 = NULL, test_y = NULL) {
#Functions
#Forward
S.fun = function (x, eps = 1e-8) {
S = 1/(1 + exp(-x))
S[S < eps] = eps
S[S > 1 - eps] = 1 - eps
return(S)
}
ReLU.fun = function (x) {
x[x < 0] <- 0
return(x)
}
L.fun = function (X, W) {
X.E = cbind(1, X)
L = X.E %*% W
return(L)
}
CE.fun = function (o, y, eps = 1e-8) {
loss = -1/length(y) * sum(y * log(o + eps) + (1 - y) * log(1 - o + eps))
return(loss)
}
#Backward
grad_o.fun = function (o, y) {
return((o - y)/(o*(1-o)))
}
grad_l2.fun = function (grad_o, o) {
return(grad_o*(o*(1-o)))
}
grad_W2.fun = function (grad_l2, h1) {
h1.E = cbind(1, h1)
return(t(h1.E) %*% grad_l2/nrow(h1))
}
grad_h1.fun = function (grad_l2, W2) {
return(grad_l2 %*% t(W2[-1,]))
}
grad_l1.fun = function (grad_h1, l1) {
de_l1 = l1
de_l1[de_l1<0] = 0
de_l1[de_l1>0] = 1
return(grad_h1*de_l1)
}
grad_W1.fun = function (grad_l1, x) {
x.E = cbind(1, x)
return(t(x.E) %*% grad_l1/nrow(x))
}
#Calculating
X_matrix = cbind(x1, x2)
W1_list = list()
W2_list = list()
loss_seq = rep(0, num.iteration)
#Start random values
W1_list[[1]] = matrix(rnorm(3*num.hidden, sd = 1), nrow = 3, ncol = num.hidden)
W2_list[[1]] = matrix(rnorm(num.hidden + 1, sd = 1), nrow = num.hidden + 1, ncol = 1)
for (i in 2:(num.iteration+1)) {
idx <- sample(1:length(x1), batch_size)
#Forward
current_l1 = L.fun(X = X_matrix[idx,], W = W1_list[[i - 1]])
current_h1 = ReLU.fun(x = current_l1)
current_l2 = L.fun(X = current_h1, W = W2_list[[i - 1]])
current_o = S.fun(x = current_l2)
loss_seq[i-1] = CE.fun(o = current_o, y = y[idx], eps = 1e-8)
#Backward
current_grad_o = grad_o.fun(o = current_o, y = y[idx])
current_grad_l2 = grad_l2.fun(grad_o = current_grad_o, o = current_o)
current_grad_W2 = grad_W2.fun(grad_l2 = current_grad_l2, h1 = current_h1)
current_grad_h1 = grad_h1.fun(grad_l2 = current_grad_l2, W2 = W2_list[[i - 1]])
current_grad_l1 = grad_l1.fun(grad_h1 = current_grad_h1, l1 = current_l1)
current_grad_W1 = grad_W1.fun(grad_l1 = current_grad_l1, x = X_matrix[idx,])
if (i == 2) {
W2_list[[i]] = W2_list[[i-1]] - lr * current_grad_W2
W1_list[[i]] = W1_list[[i-1]] - lr * current_grad_W1
} else {
W2_list[[i]] = W2_list[[i-1]] - lr * current_grad_W2 + momentum * (W2_list[[i-1]] - W2_list[[i-2]])
W1_list[[i]] = W1_list[[i-1]] - lr * current_grad_W1 + momentum * (W1_list[[i-1]] - W1_list[[i-2]])
}
}
require(scales)
require(plot3D)
x1_seq = seq(min(x1), max(x1), length.out = 100)
x2_seq = seq(min(x2), max(x2), length.out = 100)
pre_func = function (x1, x2, W1 = W1_list[[length(W1_list)]], W2 = W2_list[[length(W2_list)]]) {
new_X = cbind(x1, x2)
O = S.fun(x = L.fun(X = ReLU.fun(x = L.fun(X = new_X, W = W1)), W = W2))
return(O)
}
pred_y = pre_func(x1 = x1, x2 = x2)
MAIN_TXT = paste0('Train-Acc:', formatC(mean((pred_y > 0.5) == y), 2, format = 'f'))
if (!is.null(test_x1)) {
pred_test_y = pre_func(x1 = test_x1, x2 = test_x2)
MAIN_TXT = paste0(MAIN_TXT, '; Test-Acc:', formatC(mean((pred_test_y > 0.5) == test_y), 2, format = 'f'))
}
z_matrix = sapply(x2_seq, function(x) {pre_func(x1 = x1_seq, x2 = x)})
par(mfrow = c(1, 2))
image2D(z = z_matrix, main = MAIN_TXT,
x = x1_seq, xlab = 'x1',
y = x2_seq, ylab = 'x2',
shade = 0.2, rasterImage = TRUE,
col = colorRampPalette(c("#FFA0A0", "#FFFFFF", "#A0A0FF"))(100))
points(x1, x2, col = (y + 1)*2, pch = 19, cex = 0.5)
if (!is.null(test_x1)) {
points(test_x1, test_x2, col = 'black', bg = c('#C00000', '#0000C0')[(test_y + 1)], pch = 21)
}
plot(loss_seq, type = 'l', main = 'loss', xlab = 'iter.', ylab = 'CE loss')
}
MLP_Trainer(num.iteration = 5000, num.hidden = 30, batch_size = 100, lr = 0.1, momentum = 0.9, x1 = x1, x2 = x2, y = y, test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
MLP_Trainer(num.iteration = 5000, num.hidden = 30, batch_size = 20, lr = 0.1, momentum = 0.9, x1 = x1, x2 = x2, y = y, test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
– A reasonable approach is to adjust the learning rate: use a larger learning rate early in the iterations and a smaller one later.
– In addition, so far all parameters have "shared" a single learning rate for their updates. Could we design a separate learning rate for each parameter and thereby make the optimization better?
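– A minimal sketch of the first idea, a decaying learning rate, applied to the earlier \(y = x^2\) example where a fixed learning rate of 1.02 diverged (the exponential decay schedule below is just one illustrative choice):

differential.fun = function(x) {
  return(2*x)
}
start.value = 0.5
num.iteration = 30
initial.lr = 1.02
decay = 0.9   # shrink the learning rate by 10% per iteration (illustrative)
result.x = rep(NA, num.iteration)
for (i in 1:num.iteration) {
  if (i == 1) {
    result.x[1] = start.value
  } else {
    current.lr = initial.lr * decay^(i - 2)   # large steps early, small steps later
    result.x[i] = result.x[i-1] - current.lr * differential.fun(result.x[i-1])
  }
}
tail(result.x, 1)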
\[ \begin{align} grad.x_t & = \frac{\partial}{\partial x}f(x_{t - 1}) \\ n_t & = n_{t-1} + grad.x_t^2 \\ x_{t} & = x_{t - 1} - \frac{lr}{\sqrt{n_t+\delta}}\cdot grad.x_t \end{align} \]
– The implementation in R is as follows; we take finding the minimum of \(y=x^2\) as an example:
differential.fun = function(x) {
return(2*x)
}
start.value = 0.5
num.iteration = 30
learning.rate = 0.1
eps = 1e-8
result.x = rep(NA, num.iteration)
for (i in 1:num.iteration) {
if (i == 1) {
n = 0
result.x[1] = start.value
} else {
grad.x = differential.fun(result.x[i-1])
# accumulate the squared gradients; the effective learning rate shrinks over time
n = n + grad.x^2
result.x[i] = result.x[i-1] - learning.rate/sqrt(n + eps) * grad.x
}
}
tail(result.x, 1)
## [1] 0.01459057
– Here we can make a simple extension. The main idea, taken from Adadelta, is to apply the momentum concept to the accumulated sum of squared gradients, letting the accumulated value decay over time so that the effective learning rate does not shrink toward zero and training does not stop prematurely.
\[ \begin{align} grad.x_t & = \frac{\partial}{\partial x}f(x_{t - 1}) \\ n_t & = \beta_2 \cdot n_{t-1} + (1 - \beta_2) \cdot grad.x_t^2 \\ x_{t} & = x_{t - 1} - \frac{lr}{\sqrt{n_t+\delta}}\cdot grad.x_t \end{align} \]
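– A minimal sketch of this decaying accumulator, again on \(y=x^2\) and based on the Adagrad code above (\(\beta_2 = 0.9\) here is just an illustrative value):

differential.fun = function(x) {
  return(2*x)
}
start.value = 0.5
num.iteration = 30
learning.rate = 0.1
beta2 = 0.9
eps = 1e-8
result.x = rep(NA, num.iteration)
for (i in 1:num.iteration) {
  if (i == 1) {
    n = 0
    result.x[1] = start.value
  } else {
    grad.x = differential.fun(result.x[i-1])
    # decaying average of squared gradients instead of an ever-growing sum
    n = beta2 * n + (1 - beta2) * grad.x^2
    result.x[i] = result.x[i-1] - learning.rate/sqrt(n + eps) * grad.x
  }
}
tail(result.x, 1)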
\[ \begin{align} grad.x_t & = \frac{\partial}{\partial x}f(x_{t - 1}) \\ m_t & = \beta_1 \cdot m_{t-1} + (1 - \beta_1) \cdot grad.x_t \\ n_t & = \beta_2 \cdot n_{t-1} + (1 - \beta_2) \cdot grad.x_t^2 \\ x_{t} & = x_{t - 1} - \frac{lr}{\sqrt{n_t+\delta}}\cdot m_t \end{align} \]
– Extending this further, we consider one last issue: since \(m_t\) and \(n_t\) are accumulated over time, they will be very small at the beginning of training, so we compute \(\hat{m_t}\) and \(\hat{n_t}\) to use for the update; their denominators approach 1 as time goes on:
\[ \begin{align} grad.x_t & = \frac{\partial}{\partial x}f(x_{t - 1}) \\ m_t & = \beta_1 \cdot m_{t-1} + (1 - \beta_1) \cdot grad.x_t \\ n_t & = \beta_2 \cdot n_{t-1} + (1 - \beta_2) \cdot grad.x_t^2 \\ \hat{m_t} & = \frac{m_t}{1 - \beta_1^t} \\ \hat{n_t} & = \frac{n_t}{1 - \beta_2^t} \\ x_{t} & = x_{t - 1} - \frac{lr}{\sqrt{\hat{n_t}+\delta}}\cdot \hat{m_t} \end{align} \]
differential.fun = function(x) {
return(2*x)
}
start.value = 0.5
num.iteration = 30
learning.rate = 0.1 # note: in practice Adam is usually run with a learning rate of 0.001
beta1 = 0.9
beta2 = 0.999
eps = 1e-8
result.x = rep(NA, num.iteration + 1)
for (i in 0:num.iteration) {
if (i == 0) {
m = 0
n = 0
result.x[i+1] = start.value
} else {
grad.x = differential.fun(result.x[i])
# exponentially weighted first and second moments of the gradient
m = beta1 * m + (1 - beta1) * grad.x
n = beta2 * n + (1 - beta2) * grad.x^2
# bias correction: the denominators approach 1 as i grows
m.hat = m/(1 - beta1^i)
n.hat = n/(1 - beta2^i)
result.x[i+1] = result.x[i] - learning.rate/sqrt(n.hat + eps) * m.hat
}
}
tail(result.x, 1)
## [1] 0.02099521
– Result with SGD (\(lr = 0.1\), \(\rho = 0.9\)):
– Result with Adam (\(lr = 0.1\), \(\beta_1 = 0.9\), \(\beta_2 = 0.999\)):
– For this reason, some research uses Adam for fast optimization early in training and switches to SGD to finish (Nitish Shirish Keskar & Richard Socher, 2017). In fact, optimization has long been one of the hardest problems in mathematics; after centuries of progress, no single method clearly beats the others on every kind of problem. We can only say that the current mainstream choices are SGD with momentum and Adam, the two most commonly used methods, and it is foreseeable that further research may well reshape this landscape.
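– A minimal sketch of this "Adam first, then SGD" idea on the one-dimensional \(y=x^2\) example (the switch point and learning rates below are arbitrary illustrative choices, not the procedure of the cited paper):

differential.fun = function(x) {
  return(2*x)
}
num.iteration = 30
switch.iter = 15        # switch from Adam to plain gradient descent after this iteration (arbitrary)
lr.adam = 0.1
lr.sgd = 0.1
beta1 = 0.9
beta2 = 0.999
eps = 1e-8
m = 0
n = 0
x = 0.5
for (i in 1:num.iteration) {
  grad.x = differential.fun(x)
  if (i <= switch.iter) {
    # Adam phase: fast progress early in training
    m = beta1 * m + (1 - beta1) * grad.x
    n = beta2 * n + (1 - beta2) * grad.x^2
    m.hat = m/(1 - beta1^i)
    n.hat = n/(1 - beta2^i)
    x = x - lr.adam/sqrt(n.hat + eps) * m.hat
  } else {
    # plain gradient descent phase: finish the optimization
    x = x - lr.sgd * grad.x
  }
}
x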
– For the starter code, please refer to the answer to Exercise 2 and modify it from there.
– The samples used are as follows:
set.seed(0)
x1 = rnorm(100, sd = 1)
x2 = rnorm(100, sd = 1)
lr1 = - 1.5 + x1^2 + x2^2 + rnorm(100)
y = (lr1 > 0) + 0L
test_x1 = rnorm(100, sd = 1)
test_x2 = rnorm(100, sd = 1)
lr1 = - 1.5 + test_x1^2 + test_x2^2 + rnorm(100)
test_y = (lr1 > 0) + 0L
MLP_Trainer = function (num.iteration = 500, num.hidden = 2, batch_size = 30,
lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam',
x1 = x1, x2 = x2, y = y,
test_x1 = NULL, test_x2 = NULL, test_y = NULL) {
#Functions
#Forward
S.fun = function (x, eps = 1e-8) {
S = 1/(1 + exp(-x))
S[S < eps] = eps
S[S > 1 - eps] = 1 - eps
return(S)
}
ReLU.fun = function (x) {
x[x < 0] <- 0
return(x)
}
L.fun = function (X, W) {
X.E = cbind(1, X)
L = X.E %*% W
return(L)
}
CE.fun = function (o, y, eps = 1e-8) {
loss = -1/length(y) * sum(y * log(o + eps) + (1 - y) * log(1 - o + eps))
return(loss)
}
#Backward
grad_o.fun = function (o, y) {
return((o - y)/(o*(1-o)))
}
grad_l2.fun = function (grad_o, o) {
return(grad_o*(o*(1-o)))
}
grad_W2.fun = function (grad_l2, h1) {
h1.E = cbind(1, h1)
return(t(h1.E) %*% grad_l2/nrow(h1))
}
grad_h1.fun = function (grad_l2, W2) {
return(grad_l2 %*% t(W2[-1,]))
}
grad_l1.fun = function (grad_h1, l1) {
de_l1 = l1
de_l1[de_l1<0] = 0
de_l1[de_l1>0] = 1
return(grad_h1*de_l1)
}
grad_W1.fun = function (grad_l1, x) {
x.E = cbind(1, x)
return(t(x.E) %*% grad_l1/nrow(x))
}
#Calculating
X_matrix = cbind(x1, x2)
W1_list = list()
W2_list = list()
loss_seq = rep(0, num.iteration)
#Start random values
W1_list[[1]] = matrix(rnorm(3*num.hidden, sd = 1), nrow = 3, ncol = num.hidden)
W2_list[[1]] = matrix(rnorm(num.hidden + 1, sd = 1), nrow = num.hidden + 1, ncol = 1)
for (i in 2:(num.iteration+1)) {
idx <- sample(1:length(x1), batch_size)
#Forward
current_l1 = L.fun(X = X_matrix[idx,], W = W1_list[[i - 1]])
current_h1 = ReLU.fun(x = current_l1)
current_l2 = L.fun(X = current_h1, W = W2_list[[i - 1]])
current_o = S.fun(x = current_l2)
loss_seq[i-1] = CE.fun(o = current_o, y = y[idx], eps = 1e-8)
#Backward
current_grad_o = grad_o.fun(o = current_o, y = y[idx])
current_grad_l2 = grad_l2.fun(grad_o = current_grad_o, o = current_o)
current_grad_W2 = grad_W2.fun(grad_l2 = current_grad_l2, h1 = current_h1)
current_grad_h1 = grad_h1.fun(grad_l2 = current_grad_l2, W2 = W2_list[[i - 1]])
current_grad_l1 = grad_l1.fun(grad_h1 = current_grad_h1, l1 = current_l1)
current_grad_W1 = grad_W1.fun(grad_l1 = current_grad_l1, x = X_matrix[idx,])
if (optimizer == 'adam') {
if (i == 2) {
cum_m.W1 = 0
cum_n.W1 = 0
cum_m.W2 = 0
cum_n.W2 = 0
}
cum_m.W1 = beta1 * cum_m.W1 + (1 - beta1) * current_grad_W1
cum_n.W1 = beta2 * cum_n.W1 + (1 - beta2) * current_grad_W1^2
cum_m.W1.hat = cum_m.W1/(1 - beta1^(i-1))
cum_n.W1.hat = cum_n.W1/(1 - beta2^(i-1))
cum_m.W2 = beta1 * cum_m.W2 + (1 - beta1) * current_grad_W2
cum_n.W2 = beta2 * cum_n.W2 + (1 - beta2) * current_grad_W2^2
cum_m.W2.hat = cum_m.W2/(1 - beta1^(i-1))
cum_n.W2.hat = cum_n.W2/(1 - beta2^(i-1))
W2_list[[i]] = W2_list[[i-1]] - lr * cum_m.W2.hat / sqrt(cum_n.W2.hat + 1e-8)
W1_list[[i]] = W1_list[[i-1]] - lr * cum_m.W1.hat / sqrt(cum_n.W1.hat + 1e-8)
} else if (optimizer == 'sgd') {
if (i == 2) {
W2_list[[i]] = W2_list[[i-1]] - lr * current_grad_W2
W1_list[[i]] = W1_list[[i-1]] - lr * current_grad_W1
} else {
W2_list[[i]] = W2_list[[i-1]] - lr * current_grad_W2 + beta1 * (W2_list[[i-1]] - W2_list[[i-2]])
W1_list[[i]] = W1_list[[i-1]] - lr * current_grad_W1 + beta1 * (W1_list[[i-1]] - W1_list[[i-2]])
}
} else {
stop('optimizer must be selected from "sgd" or "adam".')
}
}
require(scales)
require(plot3D)
x1_seq = seq(min(x1), max(x1), length.out = 100)
x2_seq = seq(min(x2), max(x2), length.out = 100)
pre_func = function (x1, x2, W1 = W1_list[[length(W1_list)]], W2 = W2_list[[length(W2_list)]]) {
new_X = cbind(x1, x2)
O = S.fun(x = L.fun(X = ReLU.fun(x = L.fun(X = new_X, W = W1)), W = W2))
return(O)
}
pred_y = pre_func(x1 = x1, x2 = x2)
MAIN_TXT = paste0('Train-Acc:', formatC(mean((pred_y > 0.5) == y), 2, format = 'f'))
if (!is.null(test_x1)) {
pred_test_y = pre_func(x1 = test_x1, x2 = test_x2)
MAIN_TXT = paste0(MAIN_TXT, '; Test-Acc:', formatC(mean((pred_test_y > 0.5) == test_y), 2, format = 'f'))
}
z_matrix = sapply(x2_seq, function(x) {pre_func(x1 = x1_seq, x2 = x)})
par(mfrow = c(1, 2))
image2D(z = z_matrix, main = MAIN_TXT,
x = x1_seq, xlab = 'x1',
y = x2_seq, ylab = 'x2',
shade = 0.2, rasterImage = TRUE,
col = colorRampPalette(c("#FFA0A0", "#FFFFFF", "#A0A0FF"))(100))
points(x1, x2, col = (y + 1)*2, pch = 19, cex = 0.5)
if (!is.null(test_x1)) {
points(test_x1, test_x2, col = 'black', bg = c('#C00000', '#0000C0')[(test_y + 1)], pch = 21)
}
plot(loss_seq, type = 'l', main = 'loss', xlab = 'iter.', ylab = 'CE loss')
}
MLP_Trainer(num.iteration = 2000, num.hidden = 100, batch_size = 20,
lr = 0.1, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam',
x1 = x1, x2 = x2, y = y,
test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
MLP_Trainer(num.iteration = 2000, num.hidden = 100, batch_size = 20,
lr = 0.1, beta1 = 0.9, beta2 = 0.999, optimizer = 'sgd',
x1 = x1, x2 = x2, y = y,
test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
MLP_Trainer(num.iteration = 100, num.hidden = 100, batch_size = 20,
lr = 0.1, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam',
x1 = x1, x2 = x2, y = y,
test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
MLP_Trainer(num.iteration = 100, num.hidden = 100, batch_size = 20,
lr = 0.1, beta1 = 0.9, beta2 = 0.999, optimizer = 'sgd',
x1 = x1, x2 = x2, y = y,
test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
– In Lesson 2 we already switched to ReLU as the nonlinear activation function. Although this reduces the problems of back-propagating gradients, very deep networks still need additional help.
– Here we compare how SGD and Adam perform when the network is very deep. Since the implementation is fairly involved, we provide ready-made code for the tests:
DEEP_MLP_Trainer = function (num.iteration = 500, num.hidden = c(10, 10, 10), batch_size = 30,
lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', eps = 1e-8,
x1 = x1, x2 = x2, y = y,
test_x1 = NULL, test_x2 = NULL, test_y = NULL) {
#Functions
#Forward
S.fun = function (x, eps = eps) {
S = 1/(1 + exp(-x))
S[S < eps] = eps
S[S > 1 - eps] = 1 - eps
return(S)
}
ReLU.fun = function (x) {
x[x < 0] <- 0
return(x)
}
L.fun = function (X, W) {
X.E = cbind(1, X)
L = X.E %*% W
return(L)
}
CE.fun = function (o, y, eps = eps) {
loss = -1/length(y) * sum(y * log(o + eps) + (1 - y) * log(1 - o + eps))
return(loss)
}
#Backward
grad_o.fun = function (o, y) {
return((o - y)/(o*(1-o)))
}
grad_s.fun = function (grad_o, o) {
return(grad_o*(o*(1-o)))
}
grad_W.fun = function (grad_l, h) {
h.E = cbind(1, h)
return(t(h.E) %*% grad_l/nrow(h))
}
grad_h.fun = function (grad_l, W) {
return(grad_l %*% t(W[-1,]))
}
grad_l.fun = function (grad_h, l) {
de_l = l
de_l[de_l<0] = 0
de_l[de_l>0] = 1
return(grad_h*de_l)
}
#initialization
X_matrix = cbind(x1, x2)
Y_matrix = t(t(y))
W_list = list()
M_list = list()
N_list = list()
len_h = length(num.hidden)
for (w_seq in 1:(len_h+1)) {
if (w_seq == 1) {
NROW_W = ncol(X_matrix) + 1
NCOL_W = num.hidden[w_seq]
} else if (w_seq == len_h+1) {
NROW_W = num.hidden[w_seq - 1] + 1
NCOL_W = ncol(Y_matrix)
} else {
NROW_W = num.hidden[w_seq - 1] + 1
NCOL_W = num.hidden[w_seq]
}
W_list[[w_seq]] = matrix(rnorm(NROW_W*NCOL_W, sd = 1), nrow = NROW_W, ncol = NCOL_W)
M_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
N_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
}
loss_seq = rep(0, num.iteration)
#Calculating
for (i in 1:num.iteration) {
idx = sample(1:length(x1), batch_size)
#Forward
current_l_list = list()
current_h_list = list()
for (j in 1:len_h) {
if (j == 1) {
current_l_list[[j]] = L.fun(X = X_matrix[idx,], W = W_list[[j]])
} else {
current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = W_list[[j]])
}
current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
}
current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = W_list[[len_h+1]])
current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
loss_seq[i] = CE.fun(o = current_o, y = y[idx], eps = eps)
#Backward
current_grad_l_list = list()
current_grad_W_list = list()
current_grad_h_list = list()
current_grad_o = grad_o.fun(o = current_o, y = y[idx])
current_grad_l_list[[len_h+1]] = grad_s.fun(grad_o = current_grad_o, o = current_o)
current_grad_W_list[[len_h+1]] = grad_W.fun(grad_l = current_grad_l_list[[len_h+1]], h = current_h_list[[len_h]])
for (j in len_h:1) {
current_grad_h_list[[j]] = grad_h.fun(grad_l = current_grad_l_list[[j+1]], W = W_list[[j+1]])
current_grad_l_list[[j]] = grad_l.fun(grad_h = current_grad_h_list[[j]], l = current_l_list[[j]])
if (j != 1) {
current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = current_h_list[[j - 1]])
} else {
current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = X_matrix[idx,])
}
}
if (optimizer == 'adam') {
for (j in 1:(len_h+1)) {
M_list[[j]] = beta1 * M_list[[j]] + (1 - beta1) * current_grad_W_list[[j]]
N_list[[j]] = beta2 * N_list[[j]] + (1 - beta2) * current_grad_W_list[[j]]^2
M.hat = M_list[[j]]/(1 - beta1^i)
N.hat = N_list[[j]]/(1 - beta2^i)
W_list[[j]] = W_list[[j]] - lr*M.hat/sqrt(N.hat+eps)
}
} else if (optimizer == 'sgd') {
for (j in 1:(len_h+1)) {
M_list[[j]] = beta1 * M_list[[j]] + lr * current_grad_W_list[[j]]
W_list[[j]] = W_list[[j]] - M_list[[j]]
}
} else {
stop('optimizer must be selected from "sgd" or "adam".')
}
}
require(scales)
require(plot3D)
x1_seq = seq(min(x1), max(x1), length.out = 100)
x2_seq = seq(min(x2), max(x2), length.out = 100)
pre_func = function (x1, x2) {
new_X = cbind(x1, x2)
current_l_list = list()
current_h_list = list()
for (j in 1:len_h) {
if (j == 1) {
current_l_list[[j]] = L.fun(X = new_X, W = W_list[[j]])
} else {
current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = W_list[[j]])
}
current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
}
current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = W_list[[len_h+1]])
current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
return(current_o)
}
pred_y = pre_func(x1 = x1, x2 = x2)
MAIN_TXT = paste0('Train-Acc:', formatC(mean((pred_y > 0.5) == y), 2, format = 'f'))
if (!is.null(test_x1)) {
pred_test_y = pre_func(x1 = test_x1, x2 = test_x2)
MAIN_TXT = paste0(MAIN_TXT, '; Test-Acc:', formatC(mean((pred_test_y > 0.5) == test_y), 2, format = 'f'))
}
z_matrix = sapply(x2_seq, function(x) {pre_func(x1 = x1_seq, x2 = x)})
par(mfrow = c(1, 2))
image2D(z = z_matrix, main = MAIN_TXT,
x = x1_seq, xlab = 'x1',
y = x2_seq, ylab = 'x2',
shade = 0.2, rasterImage = TRUE,
col = colorRampPalette(c("#FFA0A0", "#FFFFFF", "#A0A0FF"))(100))
points(x1, x2, col = (y + 1)*2, pch = 19, cex = 0.5)
if (!is.null(test_x1)) {
points(test_x1, test_x2, col = 'black', bg = c('#C00000', '#0000C0')[(test_y + 1)], pch = 21)
}
plot(loss_seq, type = 'l', main = 'loss', xlab = 'iter.', ylab = 'CE loss')
}
set.seed(0)
x1 = rnorm(100, sd = 1)
x2 = rnorm(100, sd = 1)
lr1 = - 1.5 + x1^2 + x2^2 + rnorm(100)
y = (lr1 > 0) + 0L
test_x1 = rnorm(100, sd = 1)
test_x2 = rnorm(100, sd = 1)
lr1 = - 1.5 + test_x1^2 + test_x2^2 + rnorm(100)
test_y = (lr1 > 0) + 0L
– This is the result with Adam (here we use the common default parameter \(lr=0.001\))
DEEP_MLP_Trainer(num.iteration = 2000, num.hidden = c(10, 10, 10, 10, 10), batch_size = 20,
lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam',
x1 = x1, x2 = x2, y = y,
test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
– This is the result with SGD (a common default is \(lr=0.05\))
DEEP_MLP_Trainer(num.iteration = 2000, num.hidden = c(10, 10, 10, 10, 10), batch_size = 20,
lr = 0.05, beta1 = 0.9, beta2 = 0.999, optimizer = 'sgd',
x1 = x1, x2 = x2, y = y,
test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
– \(lr = 0.01\)
DEEP_MLP_Trainer(num.iteration = 2000, num.hidden = c(10, 10, 10, 10, 10), batch_size = 20,
lr = 0.01, beta1 = 0.9, beta2 = 0.999, optimizer = 'sgd',
x1 = x1, x2 = x2, y = y,
test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
– \(lr = 0.001\)
DEEP_MLP_Trainer(num.iteration = 2000, num.hidden = c(10, 10, 10, 10, 10), batch_size = 20,
lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'sgd',
x1 = x1, x2 = x2, y = y,
test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
– \(lr = 0.0001\)
DEEP_MLP_Trainer(num.iteration = 2000, num.hidden = c(10, 10, 10, 10, 10), batch_size = 20,
lr = 0.0001, beta1 = 0.9, beta2 = 0.999, optimizer = 'sgd',
x1 = x1, x2 = x2, y = y,
test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
– \(lr = 0.01\)
DEEP_MLP_Trainer(num.iteration = 2000, num.hidden = c(10, 10, 10, 10, 10), batch_size = 20,
lr = 0.01, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam',
x1 = x1, x2 = x2, y = y,
test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
– \(lr = 0.001\)
DEEP_MLP_Trainer(num.iteration = 2000, num.hidden = c(10, 10, 10, 10, 10), batch_size = 20,
lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam',
x1 = x1, x2 = x2, y = y,
test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
– \(lr = 0.0001\)
DEEP_MLP_Trainer(num.iteration = 2000, num.hidden = c(10, 10, 10, 10, 10), batch_size = 20,
lr = 0.0001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam',
x1 = x1, x2 = x2, y = y,
test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
– You may need to take apart the code above to understand where Adam's advantage lies for deep networks.
DEEP_MLP_Trainer(num.iteration = 10000, num.hidden = rep(10, 9), batch_size = 20,
lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'sgd',
x1 = x1, x2 = x2, y = y,
test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
DEEP_MLP_Trainer(num.iteration = 10000, num.hidden = rep(10, 9), batch_size = 20,
lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam',
x1 = x1, x2 = x2, y = y,
test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
– This situation makes the gradient distributions differ from layer to layer, so SGD's single learning rate shared by all layers may hurt the result. Let's try to visualize the mean absolute gradient of layers 1 through 10 during training:
DEEP_MLP_Trainer(num.iteration = 10000, num.hidden = rep(10, 9), batch_size = 20,
lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'sgd',
x1 = x1, x2 = x2, y = y,
test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
DEEP_MLP_Trainer(num.iteration = 10000, num.hidden = rep(10, 9), batch_size = 20,
lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam',
x1 = x1, x2 = x2, y = y,
test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)
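– DEEP_MLP_Trainer does not itself return per-layer gradients, so as a rough illustration of the claim above, here is a self-contained sketch (hypothetical helper names, not part of DEEP_MLP_Trainer) that computes the mean absolute gradient of each weight matrix for a 10-layer network at a random initialization, using the x1, x2, y generated above:

set.seed(0)
num.hidden = rep(10, 9)            # 9 hidden layers of 10 units -> 10 weight matrices in total
X = cbind(x1, x2)
ReLU = function(x) {x[x < 0] = 0; x}
Lin = function(X, W) {cbind(1, X) %*% W}
len_h = length(num.hidden)
W_list = list()
for (j in 1:(len_h + 1)) {
  nrow_w = if (j == 1) ncol(X) + 1 else num.hidden[j - 1] + 1
  ncol_w = if (j == len_h + 1) 1 else num.hidden[j]
  W_list[[j]] = matrix(rnorm(nrow_w * ncol_w), nrow = nrow_w, ncol = ncol_w)
}
# forward pass
l_list = list()
h_list = list()
for (j in 1:len_h) {
  inp = if (j == 1) X else h_list[[j - 1]]
  l_list[[j]] = Lin(inp, W_list[[j]])
  h_list[[j]] = ReLU(l_list[[j]])
}
o = 1/(1 + exp(-Lin(h_list[[len_h]], W_list[[len_h + 1]])))
# backward pass: with sigmoid + cross-entropy the gradient at the last linear layer is (o - y)
grad_l = o - y
mean_abs_grad = rep(NA, len_h + 1)
for (j in (len_h + 1):1) {
  inp = if (j == 1) X else h_list[[j - 1]]
  grad_W = t(cbind(1, inp)) %*% grad_l / nrow(inp)
  mean_abs_grad[j] = mean(abs(grad_W))
  if (j > 1) {
    grad_h = grad_l %*% t(W_list[[j]][-1, , drop = FALSE])
    grad_l = grad_h * (l_list[[j - 1]] > 0)   # ReLU derivative
  }
}
barplot(mean_abs_grad, names.arg = paste0('W', 1:(len_h + 1)),
        ylab = 'mean |gradient|', main = 'Gradient magnitude by layer (random init)')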
– Also, have you noticed that deeper networks do not necessarily achieve higher accuracy? The situation is much like the one in Lesson 2 when we casually used several hundred neurons: we have run into the "overfitting problem", which arises because the number of weight parameters to be estimated far exceeds the number of samples.
– In addition, have you noticed (or do you remember) another problem with network training: we always start the optimization from random initial values, and on a non-convex surface the starting value is crucial to the optimization result. Therefore the "weight initialization problem" is also a major focus of related research.