局部極值與優化技術

林嶔 (Lin, Chin)

Lesson 3

局部極值問題(1)

我們一樣先從簡單的問題開始,我們希望求\(y = 2x^2 +3x^3 +x^4\)的極值,但我們先看看函數圖形:

F3_1

# First derivative of the objective y = 2x^2 + 3x^3 + x^4.
differential.fun <- function(x) {
  4*x + 9*x^2 + 4*x^3
}

# Plain gradient descent started from a positive value.
start.value <- 0.5
num.iteration <- 30
learning.rate <- 0.05

# Trajectory of x across iterations; slot 1 holds the start value.
result.x <- rep(NA, num.iteration)
result.x[1] <- start.value

for (i in 2:num.iteration) {
  # Standard update: step against the gradient.
  result.x[i] <- result.x[i-1] - learning.rate * differential.fun(result.x[i-1])
}

tail(result.x, 1)
## [1] 0.000275395
# Re-run the same gradient descent from a negative start value; it settles at
# a different stationary point, showing sensitivity to the initial guess.
start.value <- -3
num.iteration <- 30
learning.rate <- 0.05

result.x <- rep(NA, num.iteration)
result.x[1] <- start.value

for (i in 2:num.iteration) {
  result.x[i] <- result.x[i-1] - learning.rate * differential.fun(result.x[i-1])
}

tail(result.x, 1)
## [1] -1.640338

局部極值問題(2)

– 你可以親自在線性迴歸/邏輯斯迴歸中設定不同的起始值,你會發現線性迴歸、邏輯斯迴歸都沒有這個問題,這是因為他們的函數圖形都比較像\(y = x^2\)這種凸函數

– 而\(y = 2x^2 +3x^3 +x^4\)這種叫做凹凸函數

– 當然,我們仍然希望盡可能的避免局部極值,統計/電腦學家在這方面下了非常多的努力,而我們就即將介紹一些梯度下降法的變種試圖避免我們的優化器陷入局部極值。

慣性動量(1)

F3_2

\[x_{\left(epoch:t\right)} = x_{\left(epoch:t - 1\right)} - lr \cdot \frac{\partial}{\partial x}f(x_{\left(epoch:t - 1\right)}) + \rho \cdot \left[x_{\left(epoch:t - 1\right)}-x_{\left(epoch:t - 2\right)}\right]\]

慣性動量(2)

# First derivative of y = 2x^2 + 3x^3 + x^4.
differential.fun <- function(x) {
  4*x + 9*x^2 + 4*x^3
}

# Gradient descent with classical momentum ("inertia").
start.value <- 0.5
num.iteration <- 30
learning.rate <- 0.1
momentum <- 0.9

result.x <- rep(NA, num.iteration)
result.x[1] <- start.value

for (i in 2:num.iteration) {
  step <- learning.rate * differential.fun(result.x[i-1])
  if (i == 2) {
    # No previous displacement yet, so the first move is plain gradient descent.
    result.x[i] <- result.x[i-1] - step
  } else {
    # Add a fraction of the previous displacement (the momentum term).
    result.x[i] <- result.x[i-1] - step + momentum * (result.x[i-1] - result.x[i-2])
  }
}

tail(result.x, 1)
## [1] -1.602265

F3_3

慣性動量(3)

– 這個例子是求\(y=x^2\)的極值,但學習率錯誤訂成了1.02,在正常狀況下是無法收斂的,但具有慣性動量將可以收斂。

# First derivative of y = x^2.
differential.fun <- function(x) {
  2*x
}

# With lr = 1.02 plain gradient descent on y = x^2 would diverge; the
# momentum term nevertheless lets the iterates settle near the minimum.
start.value <- 0.5
num.iteration <- 30
learning.rate <- 1.02
momentum <- 0.9

result.x <- rep(NA, num.iteration)
result.x[1] <- start.value

for (i in 2:num.iteration) {
  step <- learning.rate * differential.fun(result.x[i-1])
  if (i == 2) {
    result.x[i] <- result.x[i-1] - step
  } else {
    result.x[i] <- result.x[i-1] - step + momentum * (result.x[i-1] - result.x[i-2])
  }
}

tail(result.x, 1)
## [1] -0.03117517

F3_4

練習1:變種邏輯斯回歸的優化

\[log(\frac{{p}}{1-p}) = sin(b_{1}x_1x_2+b_2x_2)\]

# Simulated data for the "warped" logistic model
# log(p / (1 - p)) = sin(b1*x1*x2 + b2*x2) with b1 = b2 = 0.7.
set.seed(0)
x1 = runif(1000, min = -3, max = 3) 
x2 = runif(1000, min = -3, max = 3) 
lr = sin(0.7*x1*x2+0.7*x2) + rnorm(1000, sd = 0.3)
# Parentheses are required: `+` binds tighter than `>`, so the original
# `lr > 0 + 0L` evaluated as `lr > (0 + 0L)` and left y as a logical vector
# instead of the intended 0/1 integers.
y = (lr > 0) + 0L

\[ \begin{align} L(x_1,x_2) & = sin(b_{1}x_1x_2+b_2x_2) \\ S(x) & = \frac{{1}}{1+e^{-x}} \\ CE(y, p) & = -\left(y_{i} \cdot log(p_{i}) + (1-y_{i}) \cdot log(1-p_{i})\right) \\ loss & = CE(y, S(L(x_1,x_2))) \end{align} \]

\(b_1\)的偏導函數(過程略):

\[ \begin{align} \frac{\partial}{\partial b_1}loss_i & = (p_i - y_i) \cdot cos(b_{1}x_1x_2+b_2x_2) \cdot x_{1i}x_{2i} \end{align} \]

\(b_2\)的偏導函數(過程略):

\[ \begin{align} \frac{\partial}{\partial b_2}loss_i & = (p_i - y_i) \cdot cos(b_{1}x_1x_2+b_2x_2) \cdot x_{2i} \end{align} \]

# Predicted probability under the model p = sigmoid(sin(b1*x1*x2 + b2*x2)).
# NOTE(review): the defaults `x1 = x1`, `x2 = x2` are self-referential and
# error if relied upon — callers must pass x1/x2 explicitly.
pred.fun = function(b1, b2, x1 = x1, x2 = x2) {
  1 / (1 + exp(-sin(b1*x1*x2 + b2*x2)))
}

# Mean binary cross-entropy; eps keeps the logs finite at p = 0 or p = 1.
# NOTE(review): the default `y = y` is self-referential — pass y explicitly.
loss.fun = function(p, y = y, eps = 1e-8) {
  ce = y * log(p + eps) + (1 - y) * log(1 - p + eps)
  -mean(ce)
}

# Mean partial derivative of the cross-entropy loss with respect to b1,
# given precomputed probabilities p.
differential.fun.b1 = function(p, b1, b2, x1 = x1, x2 = x2, y = y) {
  mean((p - y) * cos(b1*x1*x2 + b2*x2) * x1 * x2)
}

# Mean partial derivative of the cross-entropy loss with respect to b2,
# given precomputed probabilities p.
differential.fun.b2 = function(p, b1, b2, x1 = x1, x2 = x2, y = y) {
  mean((p - y) * cos(b1*x1*x2 + b2*x2) * x2)
}

練習1答案(程式碼)

# Fit log(p/(1-p)) = sin(b1*x1*x2 + b2*x2) by (momentum) gradient descent and
# visualise the result on the current graphics device:
#   left panel  — predicted probability surface over (x1, x2) plus the data;
#   right panel — loss surface over (b1, b2) with the optimisation path (red).
# Args:
#   x1, x2, y     : training data (y is 0/1 or logical).
#   num.iteration : number of optimisation steps.
#   lr            : learning rate.
#   momentum      : inertia coefficient (0 disables momentum).
#   start_b1/b2   : initial coefficient values.
# Side effects: sets par(mfrow = c(1, 2)) and require()s scales/plot3D.
LOGISTIC_VIS = function (x1, x2, y, num.iteration = 100, lr = 0.1, momentum = 0.9, start_b1 = 0, start_b2 = 0) {

  # Coefficient/loss trajectories; initialised to the start point so that
  # ans_b1[1]/ans_b2[1] are iteration 1.
  ans_b1 = rep(start_b1, num.iteration)
  ans_b2 = rep(start_b2, num.iteration)
  ans_loss = rep(0, num.iteration)
  
  # Local copies of the model helpers (shadow any globals of the same name).
  # NOTE(review): the defaults `x1 = x1` etc. are self-referential and would
  # error if relied upon; every internal call passes the arguments explicitly.
  pred.fun = function(b1, b2, x1 = x1, x2 = x2) {
    lr = sin(b1*x1*x2+b2*x2)
    p = 1 / (1 + exp(-lr))
    return(p)
  }
  
  # Mean binary cross-entropy; eps keeps log() finite.
  loss.fun = function(p, y = y, eps = 1e-8) {
    loss = -mean(y * log(p + eps) + (1 - y) * log(1 - p + eps))
    return(loss)
  }
  
  # Mean partial derivative of the loss w.r.t. b1 (p is precomputed).
  differential.fun.b1 = function(p, b1, b2, x1 = x1, x2 = x2, y = y) {
    return(mean((p-y)*cos(b1*x1*x2+b2*x2)*x1*x2))
  }
  
  # Mean partial derivative of the loss w.r.t. b2.
  differential.fun.b2 = function(p, b1, b2, x1 = x1, x2 = x2, y = y) {
    return(mean((p-y)*cos(b1*x1*x2+b2*x2)*x2))
  }
  
  # Iteration 1 only evaluates the loss at the start point; from iteration 2
  # onwards the gradient is taken at the previous coefficients, reusing the
  # probabilities (current_p) computed during the previous pass.
  for (i in 1:num.iteration) {
    if (i >= 2) {
      grad_b1 = differential.fun.b1(p = current_p, b1 = ans_b1[i-1], b2 = ans_b2[i-1], x1 = x1, x2 = x2, y = y)
      grad_b2 = differential.fun.b2(p = current_p, b1 = ans_b1[i-1], b2 = ans_b2[i-1], x1 = x1, x2 = x2, y = y)
      if (i == 2) {
        # No previous displacement yet: first step is plain gradient descent.
        momentum_b1 = 0
        momentum_b2 = 0
      } else {
        momentum_b1 = momentum * (ans_b1[i-1] - ans_b1[i-2])
        momentum_b2 = momentum * (ans_b2[i-1] - ans_b2[i-2]) 
      }
      ans_b1[i] = ans_b1[i-1] - lr * grad_b1 + momentum_b1
      ans_b2[i] = ans_b2[i-1] - lr * grad_b2 + momentum_b2
    }
    current_p = pred.fun(b1 = ans_b1[i], b2 = ans_b2[i], x1 = x1, x2 = x2)
    ans_loss[i] = loss.fun(p = current_p, y = y)
  }
  
  par(mfrow = c(1, 2))
  
  require(scales)
  require(plot3D)
  
  # Left panel: predicted probability over a 300x300 grid of (x1, x2).
  x1_seq = seq(min(x1), max(x1), length.out = 300)
  x2_seq = seq(min(x2), max(x2), length.out = 300)
  
  z_matrix = sapply(x2_seq, function(x) {pred.fun(b1 = ans_b1[num.iteration], b2 = ans_b2[num.iteration], x1 = x1_seq, x2 = x)})
  
  image2D(z = z_matrix, main = paste0('b1=', formatC(ans_b1[num.iteration], format = 'f', 1), ';b2=', formatC(ans_b2[num.iteration], format = 'f', 1)),
          x = x1_seq, xlab = 'x1',
          y = x2_seq, ylab = 'x2',
          shade = 0.2, rasterImage = TRUE,
          col = colorRampPalette(c("#FFA0A0", "#FFFFFF", "#A0A0FF"))(100))
  
  points(x1, x2, col = (y + 1)*2, pch = 19, cex = 0.5)
  
  require(grDevices) # for trans3d
  
  # Right panel: loss surface over a 50x50 grid of (b1, b2).
  b1_seq = seq(-0.5, 2, length.out = 50)
  b2_seq = seq(-0.5, 2, length.out = 50)
  # The outer product merely allocates a matrix of the right shape; every
  # cell is overwritten in the double loop below.
  LOSS <- b1_seq %*% t(b2_seq)
  for (i in 1:length(b1_seq)) {
    for (j in 1:length(b2_seq)) {
      LOSS[i,j] = loss.fun(p = pred.fun(b1 = b1_seq[i], b2 = b2_seq[j], x1 = x1, x2 = x2), y = y, eps = 1e-8)
    }
  }
  
  persp(b1_seq, b2_seq, LOSS, theta = 120, phi = 50, expand = 0.8, col = "lightblue",
        ltheta = 120, shade = 0.75, ticktype = "detailed",
        xlab = "b1", ylab = "b2", zlab = "loss", main = paste0('loss=', formatC(ans_loss[num.iteration], 4, format = 'f'))) -> res
  
  # Overlay the optimisation path, its start (dark red) and end (bright red).
  lines(trans3d(ans_b1, ans_b2, ans_loss, pmat = res), col = '#AA0000', lwd = 1.5)
  points(trans3d(ans_b1[1], ans_b2[1], ans_loss[1], pmat = res), col = '#AA0000', cex = 1, pch = 16)
  points(trans3d(ans_b1[num.iteration], ans_b2[num.iteration], ans_loss[num.iteration], pmat = res), col = '#FF0000', cex = 1, pch = 16)
  
}

練習1答案(實驗結果)

# Momentum disabled (0): plain gradient descent, for comparison.
LOGISTIC_VIS(x1 = x1, x2 = x2, y = y, num.iteration = 100, lr = 0.1, momentum = 0, start_b1 = 0, start_b2 = 0)

# Same settings with momentum 0.9.
LOGISTIC_VIS(x1 = x1, x2 = x2, y = y, num.iteration = 100, lr = 0.1, momentum = 0.9, start_b1 = 0, start_b2 = 0)

練習1之引申問題

# Harder variant of exercise 1: data generated with b1 = 2, b2 = 3.
set.seed(0)
x1 = runif(1000, min = -3, max = 3) 
x2 = runif(1000, min = -3, max = 3) 
lr = sin(2*x1*x2+3*x2) + rnorm(1000, sd = 0.3)
# Parentheses are required: `+` binds tighter than `>`, so the original
# `lr > 0 + 0L` evaluated as `lr > (0 + 0L)` and left y as a logical vector
# instead of the intended 0/1 integers.
y = (lr > 0) + 0L
LOGISTIC_VIS(x1 = x1, x2 = x2, y = y, num.iteration = 300, lr = 0.1, momentum = 0.9, start_b1 = 0, start_b2 = 0)

小批量梯度下降法(1)

– 然而『實際上』,由於每次抽樣造成的變異,這會使得當前梯度因為抽樣的關係存在一定的變異性,這將會導致使用「小批量梯度下降法(Mini-batch Stochastic Gradient Descent, SGD)」與「全樣本梯度下降法」的結果有所不同。但問題在於,你認為「小批量梯度下降法」與「全樣本梯度下降法」哪一種能得到較好的結果?而如果使用「小批量梯度下降法」,那批量「樣本大小」與優化結果的關係又是什麼呢?

– 下面是利用練習1的問題所描繪的損失函數,這6張圖分別是批量大小從小到大其最終損失函數的變異情形,試著觀察並討論此問題:

F3_5

小批量梯度下降法(2)

– 當然,過小的批量樣本似乎也不是非常理想,這同時對全局極值也存在著危害,同樣有可能讓優化器錯過全局極值。但整體來說,似乎批量樣本較小較為有利!

– 這個例子是\(b_1 = 0.7\)、\(b_2 = 0.7\)所產生的數據,這在練習1中若將動量設為0,則無法離開局部極值,而我們改用批量大小為30再次試試:

F3_6

– 下面這個例子是\(b_1 = 2\)、\(b_2 = 3\)所產生的數據,這在練習1中即使我們使用了動量為0.9的優化過程仍然沒辦法找到答案,但是透過調整批量大小(批量大小 = 30)後,我們將能找到全局極值:

F3_7

小批量梯度下降法(3)

F3_8

– 最終發現在幾個常見的演算法及資料集中,「批量大小」不超過32時「測試集準確度」較高。

F3_9

練習2:調整批量樣本

# Simulated two-class data: the class boundary is the circle
# x1^2 + x2^2 = 1.5 (plus noise); 100 training and 100 test points.
set.seed(0)
x1 = rnorm(100, sd = 1) 
x2 = rnorm(100, sd = 1) 
lr1 = - 1.5 + x1^2 + x2^2 + rnorm(100)
# Parentheses are required: `+` binds tighter than `>`, so `lr1 > 0 + 0L`
# evaluated as `lr1 > (0 + 0L)` and left a logical vector; `(lr1 > 0) + 0L`
# yields the intended 0/1 integers.
y = (lr1 > 0) + 0L

test_x1 = rnorm(100, sd = 1) 
test_x2 = rnorm(100, sd = 1) 
lr1 = - 1.5 + test_x1^2 + test_x2^2 + rnorm(100)
test_y = (lr1 > 0) + 0L
# Train a one-hidden-layer MLP (2 inputs -> num.hidden ReLU units -> sigmoid
# output) by full-batch gradient descent, then draw two panels:
#   left  — decision surface with training (solid) and test (outlined) points;
#   right — cross-entropy loss per iteration.
# Args mirror the lesson's other trainers; test_* are optional.
# Side effects: sets par(mfrow = c(1, 2)) and require()s scales/plot3D.
MLP_Trainer = function (num.iteration = 500, num.hidden = 30, lr = 0.1, x1 = x1, x2 = x2, y = y, test_x1 = NULL, test_x2 = NULL, test_y = NULL) {
  
  #Functions
  
  #Forward
  
  # Sigmoid clipped to [eps, 1 - eps] so the cross-entropy stays finite.
  S.fun = function (x, eps = 1e-8) {
    S = 1/(1 + exp(-x))
    S[S < eps] = eps
    S[S > 1 - eps] = 1 - eps
    return(S)
  }
  
  # ReLU activation.
  ReLU.fun = function (x) {
    x[x < 0] <- 0
    return(x)
  }
  
  # Linear layer: prepend a bias column of 1s, then multiply by W.
  L.fun = function (X, W) {
    X.E = cbind(1, X)
    L = X.E %*% W
    return(L)
  }
  
  # Mean binary cross-entropy.
  CE.fun = function (o, y, eps = 1e-8) {
    loss = -1/length(y) * sum(y * log(o + eps) + (1 - y) * log(1 - o + eps))
    return(loss)
  }
  
  #Backward
  
  # d(loss)/d(o) of the cross-entropy.
  grad_o.fun = function (o, y) {
    return((o - y)/(o*(1-o)))
  }
  
  # Chain through the sigmoid; combined with grad_o this reduces to (o - y).
  grad_l2.fun = function (grad_o, o) {
    return(grad_o*(o*(1-o)))
  }
  
  # Output-layer weight gradient (bias row included), averaged over rows.
  grad_W2.fun = function (grad_l2, h1) {
    h1.E = cbind(1, h1)
    return(t(h1.E) %*% grad_l2/nrow(h1))
  }
  
  # Back-propagate into the hidden activations (drop the bias row of W2).
  grad_h1.fun = function (grad_l2, W2) {
    return(grad_l2 %*% t(W2[-1,]))
  }
  
  # Chain through ReLU: derivative is 1 where the pre-activation is > 0.
  grad_l1.fun = function (grad_h1, l1) {
    de_l1 = l1
    de_l1[de_l1<0] = 0
    de_l1[de_l1>0] = 1
    return(grad_h1*de_l1)
  }
  
  # First-layer weight gradient (bias row included).
  grad_W1.fun = function (grad_l1, x) {
    x.E = cbind(1, x)
    return(t(x.E) %*% grad_l1/nrow(x))
  }
  
  #Training
  
  X_matrix = cbind(x1, x2)
  
  W1_list = list()
  W2_list = list()
  loss_seq = rep(0, num.iteration)
  
  # Random N(0, 1) starting weights (+1 row for the bias terms).
  
  W1_list[[1]] = matrix(rnorm(3*num.hidden, sd = 1), nrow = 3, ncol = num.hidden)
  W2_list[[1]] = matrix(rnorm(num.hidden + 1, sd = 1), nrow = num.hidden + 1, ncol = 1)
  
  for (i in 2:(num.iteration+1)) {
    
    #Forward
    
    current_l1 = L.fun(X = X_matrix, W = W1_list[[i - 1]])
    current_h1 = ReLU.fun(x = current_l1)
    current_l2 = L.fun(X = current_h1, W = W2_list[[i - 1]])
    current_o = S.fun(x = current_l2)
    loss_seq[i-1] = CE.fun(o = current_o, y = y, eps = 1e-8)
    
    #Backward
    
    current_grad_o = grad_o.fun(o = current_o, y = y)
    current_grad_l2 = grad_l2.fun(grad_o = current_grad_o, o = current_o)
    current_grad_W2 = grad_W2.fun(grad_l2 = current_grad_l2, h1 = current_h1)
    current_grad_h1 = grad_h1.fun(grad_l2 = current_grad_l2, W2 = W2_list[[i - 1]])
    current_grad_l1 = grad_l1.fun(grad_h1 = current_grad_h1, l1 = current_l1)
    current_grad_W1 = grad_W1.fun(grad_l1 = current_grad_l1, x = X_matrix)
    # Plain gradient-descent update.
    W2_list[[i]] = W2_list[[i-1]] - lr * current_grad_W2
    W1_list[[i]] = W1_list[[i-1]] - lr * current_grad_W1
    
  }
  
  require(scales)
  require(plot3D)
  
  # Decision-surface grid over the observed range of (x1, x2).
  x1_seq = seq(min(x1), max(x1), length.out = 100)
  x2_seq = seq(min(x2), max(x2), length.out = 100)
  
  # Forward pass using the final weights.
  pre_func = function (x1, x2, W1 = W1_list[[length(W1_list)]], W2 = W2_list[[length(W2_list)]]) {
    new_X = cbind(x1, x2)
    O = S.fun(x = L.fun(X = ReLU.fun(x = L.fun(X = new_X, W = W1)), W = W2))
    return(O)
  }
  
  pred_y = pre_func(x1 = x1, x2 = x2)
  MAIN_TXT = paste0('Train-Acc:', formatC(mean((pred_y > 0.5) == y), 2, format = 'f'))
  if (!is.null(test_x1)) {
    pred_test_y = pre_func(x1 = test_x1, x2 = test_x2)
    MAIN_TXT = paste0(MAIN_TXT, '; Test-Acc:', formatC(mean((pred_test_y > 0.5) == test_y), 2, format = 'f'))
  }
  
  z_matrix = sapply(x2_seq, function(x) {pre_func(x1 = x1_seq, x2 = x)})
  
  par(mfrow = c(1, 2))
  
  image2D(z = z_matrix, main = MAIN_TXT,
          x = x1_seq, xlab = 'x1',
          y = x2_seq, ylab = 'x2',
          shade = 0.2, rasterImage = TRUE,
          col = colorRampPalette(c("#FFA0A0", "#FFFFFF", "#A0A0FF"))(100))
  
  points(x1, x2, col = (y + 1)*2, pch = 19, cex = 0.5)
  if (!is.null(test_x1)) {
    points(test_x1, test_x2, col = 'black', bg = c('#C00000', '#0000C0')[(test_y + 1)], pch = 21)
  }
  
  plot(loss_seq, type = 'l', main = 'loss', xlab = 'iter.', ylab = 'CE loss')
  
}

練習2答案(程式碼)

# Mini-batch variant of MLP_Trainer: each iteration updates the weights on a
# random sample of batch_size points, optionally with classical momentum.
# Interface additions vs the full-batch version: batch_size and momentum.
# Note: loss_seq records the loss of each mini-batch, not of the full sample.
MLP_Trainer = function (num.iteration = 500, num.hidden = 30, batch_size = 30, lr = 0.1, momentum = 0, x1 = x1, x2 = x2, y = y, test_x1 = NULL, test_x2 = NULL, test_y = NULL) {
  
  #Functions
  
  #Forward
  
  # Sigmoid clipped to [eps, 1 - eps] so the cross-entropy stays finite.
  S.fun = function (x, eps = 1e-8) {
    S = 1/(1 + exp(-x))
    S[S < eps] = eps
    S[S > 1 - eps] = 1 - eps
    return(S)
  }
  
  # ReLU activation.
  ReLU.fun = function (x) {
    x[x < 0] <- 0
    return(x)
  }
  
  # Linear layer: prepend a bias column of 1s, then multiply by W.
  L.fun = function (X, W) {
    X.E = cbind(1, X)
    L = X.E %*% W
    return(L)
  }
  
  # Mean binary cross-entropy.
  CE.fun = function (o, y, eps = 1e-8) {
    loss = -1/length(y) * sum(y * log(o + eps) + (1 - y) * log(1 - o + eps))
    return(loss)
  }
  
  #Backward
  
  # d(loss)/d(o) of the cross-entropy.
  grad_o.fun = function (o, y) {
    return((o - y)/(o*(1-o)))
  }
  
  # Chain through the sigmoid; combined with grad_o this reduces to (o - y).
  grad_l2.fun = function (grad_o, o) {
    return(grad_o*(o*(1-o)))
  }
  
  # Output-layer weight gradient (bias row included), averaged over rows.
  grad_W2.fun = function (grad_l2, h1) {
    h1.E = cbind(1, h1)
    return(t(h1.E) %*% grad_l2/nrow(h1))
  }
  
  # Back-propagate into the hidden activations (drop the bias row of W2).
  grad_h1.fun = function (grad_l2, W2) {
    return(grad_l2 %*% t(W2[-1,]))
  }
  
  # Chain through ReLU: derivative is 1 where the pre-activation is > 0.
  grad_l1.fun = function (grad_h1, l1) {
    de_l1 = l1
    de_l1[de_l1<0] = 0
    de_l1[de_l1>0] = 1
    return(grad_h1*de_l1)
  }
  
  # First-layer weight gradient (bias row included).
  grad_W1.fun = function (grad_l1, x) {
    x.E = cbind(1, x)
    return(t(x.E) %*% grad_l1/nrow(x))
  }
  
  #Training
  
  X_matrix = cbind(x1, x2)
  
  W1_list = list()
  W2_list = list()
  loss_seq = rep(0, num.iteration)
  
  # Random N(0, 1) starting weights (+1 row for the bias terms).
  
  W1_list[[1]] = matrix(rnorm(3*num.hidden, sd = 1), nrow = 3, ncol = num.hidden)
  W2_list[[1]] = matrix(rnorm(num.hidden + 1, sd = 1), nrow = num.hidden + 1, ncol = 1)
  
  for (i in 2:(num.iteration+1)) {
    
    # Draw this iteration's mini-batch (sampling without replacement).
    idx <- sample(1:length(x1), batch_size)
    
    #Forward
    
    current_l1 = L.fun(X = X_matrix[idx,], W = W1_list[[i - 1]])
    current_h1 = ReLU.fun(x = current_l1)
    current_l2 = L.fun(X = current_h1, W = W2_list[[i - 1]])
    current_o = S.fun(x = current_l2)
    loss_seq[i-1] = CE.fun(o = current_o, y = y[idx], eps = 1e-8)
    
    #Backward
    
    current_grad_o = grad_o.fun(o = current_o, y = y[idx])
    current_grad_l2 = grad_l2.fun(grad_o = current_grad_o, o = current_o)
    current_grad_W2 = grad_W2.fun(grad_l2 = current_grad_l2, h1 = current_h1)
    current_grad_h1 = grad_h1.fun(grad_l2 = current_grad_l2, W2 = W2_list[[i - 1]])
    current_grad_l1 = grad_l1.fun(grad_h1 = current_grad_h1, l1 = current_l1)
    current_grad_W1 = grad_W1.fun(grad_l1 = current_grad_l1, x = X_matrix[idx,])
    
    # First update has no previous displacement; later ones add momentum.
    if (i == 2) {
      W2_list[[i]] = W2_list[[i-1]] - lr * current_grad_W2
      W1_list[[i]] = W1_list[[i-1]] - lr * current_grad_W1
    } else {
      W2_list[[i]] = W2_list[[i-1]] - lr * current_grad_W2 + momentum * (W2_list[[i-1]] - W2_list[[i-2]])
      W1_list[[i]] = W1_list[[i-1]] - lr * current_grad_W1 + momentum * (W1_list[[i-1]] - W1_list[[i-2]])
    }
    
  }
  
  require(scales)
  require(plot3D)
  
  # Decision-surface grid over the observed range of (x1, x2).
  x1_seq = seq(min(x1), max(x1), length.out = 100)
  x2_seq = seq(min(x2), max(x2), length.out = 100)
  
  # Forward pass using the final weights.
  pre_func = function (x1, x2, W1 = W1_list[[length(W1_list)]], W2 = W2_list[[length(W2_list)]]) {
    new_X = cbind(x1, x2)
    O = S.fun(x = L.fun(X = ReLU.fun(x = L.fun(X = new_X, W = W1)), W = W2))
    return(O)
  }
  
  pred_y = pre_func(x1 = x1, x2 = x2)
  MAIN_TXT = paste0('Train-Acc:', formatC(mean((pred_y > 0.5) == y), 2, format = 'f'))
  if (!is.null(test_x1)) {
    pred_test_y = pre_func(x1 = test_x1, x2 = test_x2)
    MAIN_TXT = paste0(MAIN_TXT, '; Test-Acc:', formatC(mean((pred_test_y > 0.5) == test_y), 2, format = 'f'))
  }
  
  z_matrix = sapply(x2_seq, function(x) {pre_func(x1 = x1_seq, x2 = x)})
  
  par(mfrow = c(1, 2))
  
  image2D(z = z_matrix, main = MAIN_TXT,
          x = x1_seq, xlab = 'x1',
          y = x2_seq, ylab = 'x2',
          shade = 0.2, rasterImage = TRUE,
          col = colorRampPalette(c("#FFA0A0", "#FFFFFF", "#A0A0FF"))(100))
  
  points(x1, x2, col = (y + 1)*2, pch = 19, cex = 0.5)
  if (!is.null(test_x1)) {
    points(test_x1, test_x2, col = 'black', bg = c('#C00000', '#0000C0')[(test_y + 1)], pch = 21)
  }
  
  plot(loss_seq, type = 'l', main = 'loss', xlab = 'iter.', ylab = 'CE loss')
  
}

練習2答案(實驗結果)

# Mini-batch comparison: batch_size 100 vs 20, both with momentum 0.9.
MLP_Trainer(num.iteration = 5000, num.hidden = 30, batch_size = 100, lr = 0.1, momentum = 0.9, x1 = x1, x2 = x2, y = y, test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)

MLP_Trainer(num.iteration = 5000, num.hidden = 30, batch_size = 20, lr = 0.1, momentum = 0.9, x1 = x1, x2 = x2, y = y, test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)

自適應學習率優化演算法(1)

– 一個合理的方法就是透過調整學習率,在疊代初期給一個較大的學習率,而在後期給一個較小的學習率。

– 另外目前為止,所有參數的更新「共享」了同一個學習率,有沒有可能針對每一個參數設計自己的學習率,從而導致優化過程更好?

\[ \begin{align} grad.x_t & = \frac{\partial}{\partial x}f(x_{t - 1}) \\ n_t & = n_{t-1} + grad.x_t^2 \\ x_{t} & = x_{t - 1} - \frac{lr}{\sqrt{n_t+\delta}}\cdot grad.x_t \end{align} \]

– 在R語言中的實現如下,我們以求\(y=x^2\)的極值為例實現下列程式碼:

# First derivative of y = x^2.
differential.fun <- function(x) {
  2*x
}

# AdaGrad-style update on y = x^2: each step is scaled by the inverse square
# root of the running sum of squared gradients.
start.value <- 0.5
num.iteration <- 30
learning.rate <- 0.1
eps <- 1e-8

result.x <- rep(NA, num.iteration)
result.x[1] <- start.value
n <- 0  # accumulated squared gradients

for (i in 2:num.iteration) {
  grad.x <- differential.fun(result.x[i-1])
  n <- n + grad.x^2
  result.x[i] <- result.x[i-1] - learning.rate/sqrt(n + eps) * grad.x
}

tail(result.x, 1)
## [1] 0.01459057

自適應學習率優化演算法(2)

– 這裡我們可以對其做一些簡單的擴展,主要的概念是來自於Adadelta,我們可以在累積梯度平方和的部分利用了動量的概念,讓當前累積值會隨著時間做衰減,從而保證訓練不會提前結束

\[ \begin{align} grad.x_t & = \frac{\partial}{\partial x}f(x_{t - 1}) \\ n_t & = \beta_2 \cdot n_{t-1} + (1 - \beta_2) \cdot grad.x_t^2 \\ x_{t} & = x_{t - 1} - \frac{lr}{\sqrt{n_t+\delta}}\cdot grad.x_t \end{align} \]

\[ \begin{align} grad.x_t & = \frac{\partial}{\partial x}f(x_{t - 1}) \\ m_t & = \beta_1 \cdot m_{t-1} + (1 - \beta_1) \cdot grad.x_t \\ n_t & = \beta_2 \cdot n_{t-1} + (1 - \beta_2) \cdot grad.x_t^2 \\ x_{t} & = x_{t - 1} - \frac{lr}{\sqrt{n_t+\delta}}\cdot m_t \end{align} \]

– 擴展自此,我們考慮最後一個問題,那就是由於\(m_t\)與\(n_t\)是隨著時間累積的,在訓練剛開始的時候這個數字將會非常小,所以我們將計算\(\hat{m_t}\)與\(\hat{n_t}\)作為更新梯度使用,而它們的分母項將會隨著時間越來越接近1:

\[ \begin{align} grad.x_t & = \frac{\partial}{\partial x}f(x_{t - 1}) \\ m_t & = \beta_1 \cdot m_{t-1} + (1 - \beta_1) \cdot grad.x_t \\ n_t & = \beta_2 \cdot n_{t-1} + (1 - \beta_2) \cdot grad.x_t^2 \\ \hat{m_t} & = \frac{m_t}{1 - \beta_1^t} \\ \hat{n_t} & = \frac{n_t}{1 - \beta_2^t} \\ x_{t} & = x_{t - 1} - \frac{lr}{\sqrt{\hat{n_t}+\delta}}\cdot \hat{m_t} \end{align} \]

自適應學習率優化演算法(3)

F3_10

# First derivative of y = x^2.
differential.fun <- function(x) {
  2*x
}

# Adam on y = x^2 with bias-corrected first/second moment estimates.
start.value <- 0.5
num.iteration <- 30
learning.rate <- 0.1 # note: Adam is usually run with lr = 0.001
beta1 <- 0.9
beta2 <- 0.999
eps <- 1e-8

# result.x[1] is the start value; entry t+1 holds the iterate after step t.
result.x <- rep(NA, num.iteration + 1)
result.x[1] <- start.value
m <- 0
n <- 0

for (t in 1:num.iteration) {
  grad.x <- differential.fun(result.x[t])
  m <- beta1 * m + (1 - beta1) * grad.x
  n <- beta2 * n + (1 - beta2) * grad.x^2
  # Bias correction: the denominators approach 1 as t grows.
  m.hat <- m/(1 - beta1^t)
  n.hat <- n/(1 - beta2^t)
  result.x[t+1] <- result.x[t] - learning.rate/sqrt(n.hat + eps) * m.hat
}

tail(result.x, 1)
## [1] 0.02099521

自適應學習率優化演算法(4)

– SGD的結果(\(lr = 0.1\), \(\rho = 0.9\)):

F3_7

– Adam的結果(\(lr = 0.1\), \(\beta_1 = 0.9\), \(\beta_2 = 0.999\)):

F3_11

– 因此,也有研究在訓練初期使用Adam做快速的優化,而在後期使用SGD做收尾(Nitish Shirish Keskar & Richard Socher, 2017)。事實上「最佳化問題」上一直以來都是數學上的千古難題,幾百年的進展下來並沒有一種方法可以在各種問題上都相較於其他有明顯優勢,我們只能說目前的主流是帶有慣性量的SGD與Adam是最常用的兩個方法,但可以預見未來進一步的研究也許會改寫這個格局。

練習3:使用Adam優化MLP

– 初始程式碼的部分請你參考練習2的答案,用這個程式碼開始改。

– 使用的樣本如下:

# Same circular-boundary data as exercise 2: positives lie (noisily) outside
# x1^2 + x2^2 = 1.5; 100 training and 100 test points.
set.seed(0)
x1 = rnorm(100, sd = 1) 
x2 = rnorm(100, sd = 1) 
lr1 = - 1.5 + x1^2 + x2^2 + rnorm(100)
# Parentheses are required: `+` binds tighter than `>`, so `lr1 > 0 + 0L`
# evaluated as `lr1 > (0 + 0L)` and left a logical vector; `(lr1 > 0) + 0L`
# yields the intended 0/1 integers.
y = (lr1 > 0) + 0L

test_x1 = rnorm(100, sd = 1) 
test_x2 = rnorm(100, sd = 1) 
lr1 = - 1.5 + test_x1^2 + test_x2^2 + rnorm(100)
test_y = (lr1 > 0) + 0L

練習3答案(程式碼)

# MLP trainer with a choice of optimiser: 'adam' (bias-corrected moment
# estimates) or 'sgd' (classical momentum, with beta1 reused as the momentum
# coefficient; beta2 is then unused). Otherwise identical to the mini-batch
# trainer: one hidden ReLU layer, sigmoid output, and the two-panel plot.
MLP_Trainer = function (num.iteration = 500, num.hidden = 2, batch_size = 30,
                        lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam',
                        x1 = x1, x2 = x2, y = y,
                        test_x1 = NULL, test_x2 = NULL, test_y = NULL) {
  
  #Functions
  
  #Forward
  
  # Sigmoid clipped to [eps, 1 - eps] so the cross-entropy stays finite.
  S.fun = function (x, eps = 1e-8) {
    S = 1/(1 + exp(-x))
    S[S < eps] = eps
    S[S > 1 - eps] = 1 - eps
    return(S)
  }
  
  # ReLU activation.
  ReLU.fun = function (x) {
    x[x < 0] <- 0
    return(x)
  }
  
  # Linear layer: prepend a bias column of 1s, then multiply by W.
  L.fun = function (X, W) {
    X.E = cbind(1, X)
    L = X.E %*% W
    return(L)
  }
  
  # Mean binary cross-entropy.
  CE.fun = function (o, y, eps = 1e-8) {
    loss = -1/length(y) * sum(y * log(o + eps) + (1 - y) * log(1 - o + eps))
    return(loss)
  }
  
  #Backward
  
  # d(loss)/d(o) of the cross-entropy.
  grad_o.fun = function (o, y) {
    return((o - y)/(o*(1-o)))
  }
  
  # Chain through the sigmoid; combined with grad_o this reduces to (o - y).
  grad_l2.fun = function (grad_o, o) {
    return(grad_o*(o*(1-o)))
  }
  
  # Output-layer weight gradient (bias row included), averaged over rows.
  grad_W2.fun = function (grad_l2, h1) {
    h1.E = cbind(1, h1)
    return(t(h1.E) %*% grad_l2/nrow(h1))
  }
  
  # Back-propagate into the hidden activations (drop the bias row of W2).
  grad_h1.fun = function (grad_l2, W2) {
    return(grad_l2 %*% t(W2[-1,]))
  }
  
  # Chain through ReLU: derivative is 1 where the pre-activation is > 0.
  grad_l1.fun = function (grad_h1, l1) {
    de_l1 = l1
    de_l1[de_l1<0] = 0
    de_l1[de_l1>0] = 1
    return(grad_h1*de_l1)
  }
  
  # First-layer weight gradient (bias row included).
  grad_W1.fun = function (grad_l1, x) {
    x.E = cbind(1, x)
    return(t(x.E) %*% grad_l1/nrow(x))
  }
  
  #Training
  
  X_matrix = cbind(x1, x2)
  
  W1_list = list()
  W2_list = list()
  loss_seq = rep(0, num.iteration)
  
  # Random N(0, 1) starting weights (+1 row for the bias terms).
  
  W1_list[[1]] = matrix(rnorm(3*num.hidden, sd = 1), nrow = 3, ncol = num.hidden)
  W2_list[[1]] = matrix(rnorm(num.hidden + 1, sd = 1), nrow = num.hidden + 1, ncol = 1)
  
  for (i in 2:(num.iteration+1)) {
    
    # Draw this iteration's mini-batch (sampling without replacement).
    idx <- sample(1:length(x1), batch_size)
    
    #Forward
    
    current_l1 = L.fun(X = X_matrix[idx,], W = W1_list[[i - 1]])
    current_h1 = ReLU.fun(x = current_l1)
    current_l2 = L.fun(X = current_h1, W = W2_list[[i - 1]])
    current_o = S.fun(x = current_l2)
    loss_seq[i-1] = CE.fun(o = current_o, y = y[idx], eps = 1e-8)
    
    #Backward
    
    current_grad_o = grad_o.fun(o = current_o, y = y[idx])
    current_grad_l2 = grad_l2.fun(grad_o = current_grad_o, o = current_o)
    current_grad_W2 = grad_W2.fun(grad_l2 = current_grad_l2, h1 = current_h1)
    current_grad_h1 = grad_h1.fun(grad_l2 = current_grad_l2, W2 = W2_list[[i - 1]])
    current_grad_l1 = grad_l1.fun(grad_h1 = current_grad_h1, l1 = current_l1)
    current_grad_W1 = grad_W1.fun(grad_l1 = current_grad_l1, x = X_matrix[idx,])
    
    if (optimizer == 'adam') {
      
      # Initialise the moment accumulators on the first update.
      if (i == 2) {
        cum_m.W1 = 0
        cum_n.W1 = 0
        cum_m.W2 = 0
        cum_n.W2 = 0
      }
      
      # Exponential moving averages of the gradient (m) and its square (n);
      # the exponent (i-1) is the step count, giving the bias correction.
      cum_m.W1 = beta1 * cum_m.W1 + (1 - beta1) * current_grad_W1
      cum_n.W1 = beta2 * cum_n.W1 + (1 - beta2) * current_grad_W1^2
      cum_m.W1.hat = cum_m.W1/(1 - beta1^(i-1))
      cum_n.W1.hat = cum_n.W1/(1 - beta2^(i-1))
      
      cum_m.W2 = beta1 * cum_m.W2 + (1 - beta1) * current_grad_W2
      cum_n.W2 = beta2 * cum_n.W2 + (1 - beta2) * current_grad_W2^2
      cum_m.W2.hat = cum_m.W2/(1 - beta1^(i-1))
      cum_n.W2.hat = cum_n.W2/(1 - beta2^(i-1))
      
      W2_list[[i]] = W2_list[[i-1]] - lr * cum_m.W2.hat / sqrt(cum_n.W2.hat + 1e-8)
      W1_list[[i]] = W1_list[[i-1]] - lr * cum_m.W1.hat / sqrt(cum_n.W1.hat + 1e-8)
      
    } else if (optimizer == 'sgd') {
      # Classical momentum, with beta1 as the momentum coefficient.
      if (i == 2) {
        W2_list[[i]] = W2_list[[i-1]] - lr * current_grad_W2
        W1_list[[i]] = W1_list[[i-1]] - lr * current_grad_W1
      } else {
        W2_list[[i]] = W2_list[[i-1]] - lr * current_grad_W2 + beta1 * (W2_list[[i-1]] - W2_list[[i-2]])
        W1_list[[i]] = W1_list[[i-1]] - lr * current_grad_W1 + beta1 * (W1_list[[i-1]] - W1_list[[i-2]])
      }
    } else {
      stop('optimizer must be selected from "sgd" or "adam".')
    }
    
  }
  
  require(scales)
  require(plot3D)
  
  # Decision-surface grid over the observed range of (x1, x2).
  x1_seq = seq(min(x1), max(x1), length.out = 100)
  x2_seq = seq(min(x2), max(x2), length.out = 100)
  
  # Forward pass using the final weights.
  pre_func = function (x1, x2, W1 = W1_list[[length(W1_list)]], W2 = W2_list[[length(W2_list)]]) {
    new_X = cbind(x1, x2)
    O = S.fun(x = L.fun(X = ReLU.fun(x = L.fun(X = new_X, W = W1)), W = W2))
    return(O)
  }
  
  pred_y = pre_func(x1 = x1, x2 = x2)
  MAIN_TXT = paste0('Train-Acc:', formatC(mean((pred_y > 0.5) == y), 2, format = 'f'))
  if (!is.null(test_x1)) {
    pred_test_y = pre_func(x1 = test_x1, x2 = test_x2)
    MAIN_TXT = paste0(MAIN_TXT, '; Test-Acc:', formatC(mean((pred_test_y > 0.5) == test_y), 2, format = 'f'))
  }
  
  z_matrix = sapply(x2_seq, function(x) {pre_func(x1 = x1_seq, x2 = x)})
  
  par(mfrow = c(1, 2))
  
  image2D(z = z_matrix, main = MAIN_TXT,
          x = x1_seq, xlab = 'x1',
          y = x2_seq, ylab = 'x2',
          shade = 0.2, rasterImage = TRUE,
          col = colorRampPalette(c("#FFA0A0", "#FFFFFF", "#A0A0FF"))(100))
  
  points(x1, x2, col = (y + 1)*2, pch = 19, cex = 0.5)
  if (!is.null(test_x1)) {
    points(test_x1, test_x2, col = 'black', bg = c('#C00000', '#0000C0')[(test_y + 1)], pch = 21)
  }
  
  plot(loss_seq, type = 'l', main = 'loss', xlab = 'iter.', ylab = 'CE loss')
  
}

練習3答案(實驗結果)

# Adam vs SGD-with-momentum comparison at 2000 iterations.
MLP_Trainer(num.iteration = 2000, num.hidden = 100, batch_size = 20,
            lr = 0.1, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam',
            x1 = x1, x2 = x2, y = y,
            test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)

MLP_Trainer(num.iteration = 2000, num.hidden = 100, batch_size = 20,
            lr = 0.1, beta1 = 0.9, beta2 = 0.999, optimizer = 'sgd',
            x1 = x1, x2 = x2, y = y,
            test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)

# Same comparison at only 100 iterations to contrast early progress.
MLP_Trainer(num.iteration = 100, num.hidden = 100, batch_size = 20,
            lr = 0.1, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam',
            x1 = x1, x2 = x2, y = y,
            test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)

MLP_Trainer(num.iteration = 100, num.hidden = 100, batch_size = 20,
            lr = 0.1, beta1 = 0.9, beta2 = 0.999, optimizer = 'sgd',
            x1 = x1, x2 = x2, y = y,
            test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)

極深網路的優化求解(1)

– 在第二課我們已經改用ReLU作為非線性轉換函數,此舉雖然已經減少了梯度反向傳播的問題,但面對極深的網路時仍然需要其他幫助。

– 在這裡我們試著比較SGD以及Adam在網路極深時的表現,由於實現的程式碼較為複雜,我們直接提供現成的程式碼進行測試:

# Arbitrary-depth MLP trainer: num.hidden is a vector giving the width of
# each hidden ReLU layer. Optimiser is 'adam' or 'sgd'; in 'sgd' mode,
# M_list serves as a heavy-ball velocity (M = beta1 * M + lr * grad) and
# beta2/N_list are unused. Draws the same two-panel plot as the other
# trainers (decision surface + mini-batch loss curve).
DEEP_MLP_Trainer = function (num.iteration = 500, num.hidden = c(10, 10, 10), batch_size = 30,
                             lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', eps = 1e-8,
                             x1 = x1, x2 = x2, y = y,
                             test_x1 = NULL, test_x2 = NULL, test_y = NULL) {
  
  #Functions
  
  #Forward
  
  # Clipped sigmoid. NOTE(review): the default `eps = eps` is self-referential
  # and would error if S.fun were called without eps; every call site below
  # passes eps explicitly, so this is never triggered.
  S.fun = function (x, eps = eps) {
    S = 1/(1 + exp(-x))
    S[S < eps] = eps
    S[S > 1 - eps] = 1 - eps
    return(S)
  }
  
  # ReLU activation.
  ReLU.fun = function (x) {
    x[x < 0] <- 0
    return(x)
  }
  
  # Linear layer: prepend a bias column of 1s, then multiply by W.
  L.fun = function (X, W) {
    X.E = cbind(1, X)
    L = X.E %*% W
    return(L)
  }
  
  # Mean binary cross-entropy. NOTE(review): `eps = eps` is self-referential
  # here as well; callers always supply eps.
  CE.fun = function (o, y, eps = eps) {
    loss = -1/length(y) * sum(y * log(o + eps) + (1 - y) * log(1 - o + eps))
    return(loss)
  }
  
  #Backward
  
  # d(loss)/d(o) of the cross-entropy.
  grad_o.fun = function (o, y) {
    return((o - y)/(o*(1-o)))
  }
  
  # Chain through the sigmoid; combined with grad_o this reduces to (o - y).
  grad_s.fun = function (grad_o, o) {
    return(grad_o*(o*(1-o)))
  }
  
  # Weight gradient of a linear layer (bias row included), averaged over rows.
  grad_W.fun = function (grad_l, h) {
    h.E = cbind(1, h)
    return(t(h.E) %*% grad_l/nrow(h))
  }
  
  # Back-propagate into a layer's activations (drop the bias row of W).
  grad_h.fun = function (grad_l, W) {
    return(grad_l %*% t(W[-1,]))
  }
  
  # Chain through ReLU: derivative is 1 where the pre-activation is > 0.
  grad_l.fun = function (grad_h, l) {
    de_l = l
    de_l[de_l<0] = 0
    de_l[de_l>0] = 1
    return(grad_h*de_l)
  }
  
  #initialization
  
  X_matrix = cbind(x1, x2)
  Y_matrix = t(t(y))
  
  # W_list: weights per layer; M_list/N_list: per-layer optimiser state
  # (first/second moment for adam, velocity for sgd).
  W_list = list()
  M_list = list()
  N_list = list()
  
  len_h = length(num.hidden)
  
  # Shape each weight matrix: +1 row for the bias; widths follow num.hidden,
  # the first layer takes the 2 inputs and the last maps to 1 output.
  for (w_seq in 1:(len_h+1)) {
    if (w_seq == 1) {
      NROW_W = ncol(X_matrix) + 1
      NCOL_W = num.hidden[w_seq]
    } else if (w_seq == len_h+1) {
      NROW_W = num.hidden[w_seq - 1] + 1
      NCOL_W = ncol(Y_matrix)
    } else {
      NROW_W = num.hidden[w_seq - 1] + 1
      NCOL_W = num.hidden[w_seq]
    }
    W_list[[w_seq]] = matrix(rnorm(NROW_W*NCOL_W, sd = 1), nrow = NROW_W, ncol = NCOL_W)
    M_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
    N_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
  }
  
  loss_seq = rep(0, num.iteration)
  
  #Training
  
  for (i in 1:num.iteration) {
    
    # Draw this iteration's mini-batch (sampling without replacement).
    idx = sample(1:length(x1), batch_size)
    
    #Forward
    
    current_l_list = list()
    current_h_list = list()
    
    # Hidden layers: linear transform followed by ReLU.
    for (j in 1:len_h) {
      if (j == 1) {
        current_l_list[[j]] = L.fun(X = X_matrix[idx,], W = W_list[[j]])
      } else {
        current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = W_list[[j]])
      }
      current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
    }
    # Output layer: linear transform followed by the sigmoid.
    current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = W_list[[len_h+1]])
    current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
    loss_seq[i] = CE.fun(o = current_o, y = y[idx], eps = eps)
    
    #Backward
    
    current_grad_l_list = list()
    current_grad_W_list = list()
    current_grad_h_list = list()
    
    # Output layer first, then walk back through the hidden layers.
    current_grad_o = grad_o.fun(o = current_o, y = y[idx])
    current_grad_l_list[[len_h+1]] = grad_s.fun(grad_o = current_grad_o, o = current_o)
    current_grad_W_list[[len_h+1]] = grad_W.fun(grad_l = current_grad_l_list[[len_h+1]], h = current_h_list[[len_h]])
    
    for (j in len_h:1) {
      current_grad_h_list[[j]] = grad_h.fun(grad_l = current_grad_l_list[[j+1]], W = W_list[[j+1]])
      current_grad_l_list[[j]] = grad_l.fun(grad_h = current_grad_h_list[[j]], l = current_l_list[[j]])
      if (j != 1) {
        current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = current_h_list[[j - 1]])
      } else {
        current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = X_matrix[idx,])
      }
    }
    
    if (optimizer == 'adam') {
      
      # Per-layer Adam update with bias-corrected moment estimates.
      for (j in 1:(len_h+1)) {
        M_list[[j]] = beta1 * M_list[[j]] + (1 - beta1) * current_grad_W_list[[j]]
        N_list[[j]] = beta2 * N_list[[j]] + (1 - beta2) * current_grad_W_list[[j]]^2
        M.hat = M_list[[j]]/(1 - beta1^i)
        N.hat = N_list[[j]]/(1 - beta2^i)
        W_list[[j]] = W_list[[j]] - lr*M.hat/sqrt(N.hat+eps)
      }
      
    } else if (optimizer == 'sgd') {
      
      # Heavy-ball SGD: M_list accumulates the velocity.
      for (j in 1:(len_h+1)) {
        M_list[[j]] = beta1 * M_list[[j]] + lr * current_grad_W_list[[j]]
        W_list[[j]] = W_list[[j]] - M_list[[j]]
      }
      
    } else {
      stop('optimizer must be selected from "sgd" or "adam".')
    }
    
  }
  
  require(scales)
  require(plot3D)
  
  # Decision-surface grid over the observed range of (x1, x2).
  x1_seq = seq(min(x1), max(x1), length.out = 100)
  x2_seq = seq(min(x2), max(x2), length.out = 100)
  
  # Full forward pass with the final weights.
  pre_func = function (x1, x2) {
    new_X = cbind(x1, x2)
    
    current_l_list = list()
    current_h_list = list()
    
    for (j in 1:len_h) {
      if (j == 1) {
        current_l_list[[j]] = L.fun(X = new_X, W = W_list[[j]])
      } else {
        current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = W_list[[j]])
      }
      current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
    }
    
    current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = W_list[[len_h+1]])
    current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
    
    return(current_o)
  }
  
  pred_y = pre_func(x1 = x1, x2 = x2)
  MAIN_TXT = paste0('Train-Acc:', formatC(mean((pred_y > 0.5) == y), 2, format = 'f'))
  if (!is.null(test_x1)) {
    pred_test_y = pre_func(x1 = test_x1, x2 = test_x2)
    MAIN_TXT = paste0(MAIN_TXT, '; Test-Acc:', formatC(mean((pred_test_y > 0.5) == test_y), 2, format = 'f'))
  }
  
  z_matrix = sapply(x2_seq, function(x) {pre_func(x1 = x1_seq, x2 = x)})
  
  par(mfrow = c(1, 2))
  
  image2D(z = z_matrix, main = MAIN_TXT,
          x = x1_seq, xlab = 'x1',
          y = x2_seq, ylab = 'x2',
          shade = 0.2, rasterImage = TRUE,
          col = colorRampPalette(c("#FFA0A0", "#FFFFFF", "#A0A0FF"))(100))
  
  points(x1, x2, col = (y + 1)*2, pch = 19, cex = 0.5)
  if (!is.null(test_x1)) {
    points(test_x1, test_x2, col = 'black', bg = c('#C00000', '#0000C0')[(test_y + 1)], pch = 21)
  }
  
  plot(loss_seq, type = 'l', main = 'loss', xlab = 'iter.', ylab = 'CE loss')
  
}

極深網路的優化求解(2)

# Same circular-boundary data as the earlier exercises: positives lie
# (noisily) outside x1^2 + x2^2 = 1.5; 100 training and 100 test points.
set.seed(0)
x1 = rnorm(100, sd = 1) 
x2 = rnorm(100, sd = 1) 
lr1 = - 1.5 + x1^2 + x2^2 + rnorm(100)
# Parentheses are required: `+` binds tighter than `>`, so `lr1 > 0 + 0L`
# evaluated as `lr1 > (0 + 0L)` and left a logical vector; `(lr1 > 0) + 0L`
# yields the intended 0/1 integers.
y = (lr1 > 0) + 0L

test_x1 = rnorm(100, sd = 1) 
test_x2 = rnorm(100, sd = 1) 
lr1 = - 1.5 + test_x1^2 + test_x2^2 + rnorm(100)
test_y = (lr1 > 0) + 0L

– 這是Adam的結果(這裡我們都使用一般常規默認參數\(lr=0.001\))

# 5 hidden layers of 10 units, trained with Adam at its conventional
# default learning rate (lr = 0.001).
DEEP_MLP_Trainer(num.iteration = 2000, num.hidden = c(10, 10, 10, 10, 10), batch_size = 20,
                 lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam',
                 x1 = x1, x2 = x2, y = y,
                 test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)

– 這是sgd的結果(一般默認\(lr=0.05\))

# Same architecture trained with plain SGD at its usual default (lr = 0.05);
# beta1/beta2 are ignored by the 'sgd' branch of the trainer.
DEEP_MLP_Trainer(num.iteration = 2000, num.hidden = c(10, 10, 10, 10, 10), batch_size = 20,
                 lr = 0.05, beta1 = 0.9, beta2 = 0.999, optimizer = 'sgd',
                 x1 = x1, x2 = x2, y = y,
                 test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)

極深網路的優化求解(3)

\(lr = 0.01\)

# SGD learning-rate sweep, step 1 of 3: lr = 0.01.
DEEP_MLP_Trainer(num.iteration = 2000, num.hidden = c(10, 10, 10, 10, 10), batch_size = 20,
                 lr = 0.01, beta1 = 0.9, beta2 = 0.999, optimizer = 'sgd',
                 x1 = x1, x2 = x2, y = y,
                 test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)

\(lr = 0.001\)

# SGD learning-rate sweep, step 2 of 3: lr = 0.001.
DEEP_MLP_Trainer(num.iteration = 2000, num.hidden = c(10, 10, 10, 10, 10), batch_size = 20,
                 lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'sgd',
                 x1 = x1, x2 = x2, y = y,
                 test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)

\(lr = 0.0001\)

# SGD learning-rate sweep, step 3 of 3: lr = 0.0001.
DEEP_MLP_Trainer(num.iteration = 2000, num.hidden = c(10, 10, 10, 10, 10), batch_size = 20,
                 lr = 0.0001, beta1 = 0.9, beta2 = 0.999, optimizer = 'sgd',
                 x1 = x1, x2 = x2, y = y,
                 test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)

極深網路的優化求解(4)

\(lr = 0.01\)

# Adam learning-rate sweep, step 1 of 3: lr = 0.01.
DEEP_MLP_Trainer(num.iteration = 2000, num.hidden = c(10, 10, 10, 10, 10), batch_size = 20,
                 lr = 0.01, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam',
                 x1 = x1, x2 = x2, y = y,
                 test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)

\(lr = 0.001\)

# Adam learning-rate sweep, step 2 of 3: lr = 0.001.
DEEP_MLP_Trainer(num.iteration = 2000, num.hidden = c(10, 10, 10, 10, 10), batch_size = 20,
                 lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam',
                 x1 = x1, x2 = x2, y = y,
                 test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)

\(lr = 0.0001\)

# Adam learning-rate sweep, step 3 of 3: lr = 0.0001.
DEEP_MLP_Trainer(num.iteration = 2000, num.hidden = c(10, 10, 10, 10, 10), batch_size = 20,
                 lr = 0.0001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam',
                 x1 = x1, x2 = x2, y = y,
                 test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)

練習4:嘗試解釋Adam在這種情況下優於SGD的原因

– 你可能需要拆解剛剛的code,以了解Adam在深層網路時的優勢在哪。

# Exercise 4: a much deeper network (9 hidden layers of 10 units) trained
# for 10000 iterations — SGD vs Adam at the same lr = 0.001, to expose why
# Adam copes better with the layer-wise gradient-scale mismatch in deep nets.
DEEP_MLP_Trainer(num.iteration = 10000, num.hidden = rep(10, 9), batch_size = 20,
                 lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'sgd',
                 x1 = x1, x2 = x2, y = y,
                 test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)

DEEP_MLP_Trainer(num.iteration = 10000, num.hidden = rep(10, 9), batch_size = 20,
                 lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam',
                 x1 = x1, x2 = x2, y = y,
                 test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)

練習4答案

– 這樣的狀況會讓各層的梯度分布不一致,因此SGD對於各層統一學習率就對結果可能造成不好的影響,我們試著視覺化呈現訓練中第1層到第10層的平均絕對值梯度的值:

# Exercise 4 answer: rerun the same deep SGD-vs-Adam comparison while the
# surrounding text discusses visualizing each layer's mean absolute gradient.
DEEP_MLP_Trainer(num.iteration = 10000, num.hidden = rep(10, 9), batch_size = 20,
                 lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'sgd',
                 x1 = x1, x2 = x2, y = y,
                 test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)

DEEP_MLP_Trainer(num.iteration = 10000, num.hidden = rep(10, 9), batch_size = 20,
                 lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam',
                 x1 = x1, x2 = x2, y = y,
                 test_x1 = test_x1, test_x2 = test_x2, test_y = test_y)

結語

– 另外,你是否注意到了越深的網路準確度並非越高,他的狀況其實跟第二節課時我們隨意給出幾百個神經元的問題類似,我們遇到了「過度擬合問題」,這個問題其實就是待解的權重參數數量遠遠超過樣本數所造成的。

– 另外,你是否還有發現(或記得)網路訓練的另一問題,那就是我們總是使用隨機起始值作為優化開始的起點,而在凹凸函數上起始值對於優化結果的影響又非常關鍵,因此「權重初始化問題」亦是相關研究的一大重點。