Deep Learning: Theory and Practice

林嶔 (Lin, Chin)

Lesson 2: Local Extrema and Optimization Techniques

Section 1: The Local Extrema Problem (1)

– When the graph of the function is especially complicated, gradient descent may not lead us to the best answer.

F01

# Derivative of the objective f(x) = 2*x^2 + 3*x^3 + x^4
differential.fun = function(x) {
  return(4*x + 9*x^2 + 4*x^3)
}

start.value = 0.5 
num.iteration = 30
learning.rate = 0.05

result.x = rep(NA, num.iteration)

for (i in 1:num.iteration) {
  if (i == 1) {
    result.x[1] = start.value
  } else {
    result.x[i] = result.x[i-1] - learning.rate * differential.fun(result.x[i-1])
  }
}

tail(result.x, 1)
## [1] 0.000275395
start.value = -3 
num.iteration = 30
learning.rate = 0.05

result.x = rep(NA, num.iteration)

for (i in 1:num.iteration) {
  if (i == 1) {
    result.x[1] = start.value
  } else {
    result.x[i] = result.x[i-1] - learning.rate * differential.fun(result.x[i-1])
  }
}

tail(result.x, 1)
## [1] -1.640338

Section 1: The Local Extrema Problem (2)

– You can set different starting values yourself in linear regression / logistic regression; you will find that neither has this problem, because their objective functions look more like the convex function \(y = x^2\).

– A function such as \(y = 2x^2 +3x^3 +x^4\), on the other hand, is non-convex.

– Of course, we still want to avoid local extrema as much as possible. Statisticians and computer scientists have put a great deal of effort into this, and we are about to introduce some variants of gradient descent that try to keep the optimizer from getting stuck in a local extremum.
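– To see concretely why the two runs above stop at different answers, the small check below (added for illustration; it assumes the objective is \(f(x) = 2x^2 +3x^3 +x^4\), whose derivative is the differential.fun defined above) evaluates the objective at both converged points. The run started from -3 reaches the lower, global minimum near \(x \approx -1.64\), while the run started from 0.5 only reaches the local minimum at \(x = 0\).

# Illustrative check: evaluate f(x) = 2*x^2 + 3*x^3 + x^4 at the two converged points
objective.fun = function(x) {
  return(2*x^2 + 3*x^3 + x^4)
}

objective.fun(0.000275395)   # starting from 0.5: the local minimum, f(x) is close to 0
objective.fun(-1.640338)     # starting from -3: the global minimum, f(x) is negative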

Section 2: Momentum (1)

F02

\[x_{\left(epoch:t\right)} = x_{\left(epoch:t - 1\right)} - lr \cdot \frac{\partial}{\partial x}f(x_{\left(epoch:t - 1\right)}) + \rho \cdot \left[x_{\left(epoch:t - 1\right)}-x_{\left(epoch:t - 2\right)}\right]\]

Section 2: Momentum (2)

differential.fun = function(x) {
  return(4*x + 9*x^2 + 4*x^3)
}

start.value = 0.5 
num.iteration = 30
learning.rate = 0.1
momentum = 0.9

result.x = rep(NA, num.iteration)

for (i in 1:num.iteration) {
  if (i == 1) {
    result.x[1] = start.value
  } else if (i == 2) {
    result.x[i] = result.x[i-1] - learning.rate * differential.fun(result.x[i-1])
  } else {
    result.x[i] = result.x[i-1] - learning.rate * differential.fun(result.x[i-1]) + momentum * (result.x[i-1] - result.x[i-2])
  }
}

tail(result.x, 1)
## [1] -1.602265

F03

Section 2: Momentum (3)

– This example finds the extremum of \(y=x^2\), but the learning rate has mistakenly been set to 1.02. Under normal circumstances this would not converge, yet with momentum it does.

differential.fun = function(x) {
  return(2*x)
}

start.value = 0.5 
num.iteration = 30
learning.rate = 1.02
momentum = 0.9

result.x = rep(NA, num.iteration)

for (i in 1:num.iteration) {
  if (i == 1) {
    result.x[1] = start.value
  } else if (i == 2) {
    result.x[i] = result.x[i-1] - learning.rate * differential.fun(result.x[i-1])
  } else {
    result.x[i] = result.x[i-1] - learning.rate * differential.fun(result.x[i-1]) + momentum * (result.x[i-1] - result.x[i-2])
  }
}

tail(result.x, 1)
## [1] -0.03117517

F04

Exercise 1: Optimizing a Special Prediction Function

\[log(\frac{{p}}{1-p}) = sin(b_{1}x_1x_2+b_2x_2)\]

set.seed(0)
x1 = runif(1000, min = -3, max = 3) 
x2 = runif(1000, min = -3, max = 3) 
lr = sin(0.7*x1*x2+0.7*x2) + rnorm(1000, sd = 0.3)
y = (lr > 0) + 0L

\[ \begin{align} L(x_1,x_2) & = sin(b_{1}x_1x_2+b_2x_2) \\ S(x) & = \frac{{1}}{1+e^{-x}} \\ CE(y, p) & = -\left(y_{i} \cdot log(p_{i}) + (1-y_{i}) \cdot log(1-p_{i})\right) \\ loss & = CE(y, S(L(x_1,x_2))) \end{align} \]

Partial derivative with respect to \(b_1\) (derivation omitted):

\[ \begin{align} \frac{\partial}{\partial b_1}loss_i & = (p_i - y_i) \cdot cos(b_{1}x_{1i}x_{2i}+b_2x_{2i}) \cdot x_{1i}x_{2i} \end{align} \]

Partial derivative with respect to \(b_2\) (derivation omitted):

\[ \begin{align} \frac{\partial}{\partial b_2}loss_i & = (p_i - y_i) \cdot cos(b_{1}x_{1i}x_{2i}+b_2x_{2i}) \cdot x_{2i} \end{align} \]

pred.fun = function(b1, b2, x1 = x1, x2 = x2) {
  lr = sin(b1*x1*x2+b2*x2)
  p = 1 / (1 + exp(-lr))
  return(p)
}

loss.fun = function(p, y = y, eps = 1e-8) {
  loss = -mean(y * log(p + eps) + (1 - y) * log(1 - p + eps))
  return(loss)
}

differential.fun.b1 = function(p, b1, b2, x1 = x1, x2 = x2, y = y) {
  return(mean((p-y)*cos(b1*x1*x2+b2*x2)*x1*x2))
}

differential.fun.b2 = function(p, b1, b2, x1 = x1, x2 = x2, y = y) {
  return(mean((p-y)*cos(b1*x1*x2+b2*x2)*x2))
}
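– Before plugging these gradients into an optimizer, it is worth checking them numerically. The sketch below (the helper num.grad.b1 and the test point \(b_1 = b_2 = 0.3\) are ours, added for illustration) compares the analytic gradient for \(b_1\) with a central-difference approximation of the loss; the two values should agree closely.

# Finite-difference check of the analytic b1 gradient (illustrative helper)
num.grad.b1 = function(b1, b2, x1, x2, y, h = 1e-6) {
  loss.plus  = loss.fun(pred.fun(b1 + h, b2, x1 = x1, x2 = x2), y = y)
  loss.minus = loss.fun(pred.fun(b1 - h, b2, x1 = x1, x2 = x2), y = y)
  return((loss.plus - loss.minus) / (2 * h))
}

p = pred.fun(b1 = 0.3, b2 = 0.3, x1 = x1, x2 = x2)
differential.fun.b1(p = p, b1 = 0.3, b2 = 0.3, x1 = x1, x2 = x2, y = y)
num.grad.b1(b1 = 0.3, b2 = 0.3, x1 = x1, x2 = x2, y = y)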

Exercise 1 Answer (1)

LOGISTIC_VIS = function (x1, x2, y, num.iteration = 100, lr = 0.1, momentum = 0.9, start_b1 = 0, start_b2 = 0) {

  ans_b1 = rep(start_b1, num.iteration)
  ans_b2 = rep(start_b2, num.iteration)
  ans_loss = rep(0, num.iteration)
  
  pred.fun = function(b1, b2, x1 = x1, x2 = x2) {
    lr = sin(b1*x1*x2+b2*x2)
    p = 1 / (1 + exp(-lr))
    return(p)
  }
  
  loss.fun = function(p, y = y, eps = 1e-8) {
    loss = -mean(y * log(p + eps) + (1 - y) * log(1 - p + eps))
    return(loss)
  }
  
  differential.fun.b1 = function(p, b1, b2, x1 = x1, x2 = x2, y = y) {
    return(mean((p-y)*cos(b1*x1*x2+b2*x2)*x1*x2))
  }
  
  differential.fun.b2 = function(p, b1, b2, x1 = x1, x2 = x2, y = y) {
    return(mean((p-y)*cos(b1*x1*x2+b2*x2)*x2))
  }
  
  for (i in 1:num.iteration) {
    if (i >= 2) {
      grad_b1 = differential.fun.b1(p = current_p, b1 = ans_b1[i-1], b2 = ans_b2[i-1], x1 = x1, x2 = x2, y = y)
      grad_b2 = differential.fun.b2(p = current_p, b1 = ans_b1[i-1], b2 = ans_b2[i-1], x1 = x1, x2 = x2, y = y)
      if (i == 2) {
        momentum_b1 = 0
        momentum_b2 = 0
      } else {
        momentum_b1 = momentum * (ans_b1[i-1] - ans_b1[i-2])
        momentum_b2 = momentum * (ans_b2[i-1] - ans_b2[i-2]) 
      }
      ans_b1[i] = ans_b1[i-1] - lr * grad_b1 + momentum_b1
      ans_b2[i] = ans_b2[i-1] - lr * grad_b2 + momentum_b2
    }
    current_p = pred.fun(b1 = ans_b1[i], b2 = ans_b2[i], x1 = x1, x2 = x2)
    ans_loss[i] = loss.fun(p = current_p, y = y)
  }
  
  par(mfrow = c(1, 2))
  
  require(scales)
  require(plot3D)
  
  x1_seq = seq(min(x1), max(x1), length.out = 300)
  x2_seq = seq(min(x2), max(x2), length.out = 300)
  
  z_matrix = sapply(x2_seq, function(x) {pred.fun(b1 = ans_b1[num.iteration], b2 = ans_b2[num.iteration], x1 = x1_seq, x2 = x)})
  
  image2D(z = z_matrix, main = paste0('b1=', formatC(ans_b1[num.iteration], format = 'f', digits = 1), ';b2=', formatC(ans_b2[num.iteration], format = 'f', digits = 1)),
          x = x1_seq, xlab = 'x1',
          y = x2_seq, ylab = 'x2',
          shade = 0.2, rasterImage = TRUE,
          col = colorRampPalette(c("#FFA0A0", "#FFFFFF", "#A0A0FF"))(100))
  
  points(x1, x2, col = (y + 1)*2, pch = 19, cex = 0.5)
  
  require(grDevices) # for trans3d
  
  b1_seq = seq(-0.5, 2, length.out = 50)
  b2_seq = seq(-0.5, 2, length.out = 50)
  LOSS <- matrix(0, nrow = length(b1_seq), ncol = length(b2_seq))
  for (i in 1:length(b1_seq)) {
    for (j in 1:length(b2_seq)) {
      LOSS[i,j] = loss.fun(p = pred.fun(b1 = b1_seq[i], b2 = b2_seq[j], x1 = x1, x2 = x2), y = y, eps = 1e-8)
    }
  }
  
  persp(b1_seq, b2_seq, LOSS, theta = 120, phi = 50, expand = 0.8, col = "lightblue",
        ltheta = 120, shade = 0.75, ticktype = "detailed",
        xlab = "b1", ylab = "b2", zlab = "loss", main = paste0('loss=', formatC(ans_loss[num.iteration], 4, format = 'f'))) -> res
  
  lines(trans3d(ans_b1, ans_b2, ans_loss, pmat = res), col = '#AA0000', lwd = 1.5)
  points(trans3d(ans_b1[1], ans_b2[1], ans_loss[1], pmat = res), col = '#AA0000', cex = 1, pch = 16)
  points(trans3d(ans_b1[num.iteration], ans_b2[num.iteration], ans_loss[num.iteration], pmat = res), col = '#FF0000', cex = 1, pch = 16)
  
}

Exercise 1 Answer (2)

LOGISTIC_VIS(x1 = x1, x2 = x2, y = y, num.iteration = 100, lr = 0.1, momentum = 0, start_b1 = 0, start_b2 = 0)

LOGISTIC_VIS(x1 = x1, x2 = x2, y = y, num.iteration = 100, lr = 0.1, momentum = 0.9, start_b1 = 0, start_b2 = 0)

Exercise 1 Extension

set.seed(0)
x1 = runif(1000, min = -3, max = 3) 
x2 = runif(1000, min = -3, max = 3) 
lr = sin(2*x1*x2+3*x2) + rnorm(1000, sd = 0.3)
y = (lr > 0) + 0L
LOGISTIC_VIS(x1 = x1, x2 = x2, y = y, num.iteration = 300, lr = 0.1, momentum = 0.9, start_b1 = 0, start_b2 = 0)

Section 3: The Rectified Linear Unit (1)

– Last class we already discovered the problem with the sigmoid (S-shaped) activation function; now let us try replacing it with the Rectified Linear Unit (ReLU).

– In 2012, a ReLU-based network (AlexNet) took the ILSVRC competition title outright, far outperforming traditional methods and bringing the error rate down from 25% to below 15%.

F12

Section 3: The Rectified Linear Unit (2)

\[ ReLU(x) = \left\{ \begin{array}{ll} x & \mbox{ if x > 0} \\ 0 & \mbox{ otherwise} \end{array} \right. \]

\[ \frac{\partial}{\partial x}ReLU(x) = \left\{ \begin{array}{ll} 1 & \mbox{ if x > 0} \\ 0 & \mbox{ otherwise} \end{array} \right. \]
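– In R, these two definitions are one-liners. A minimal sketch (the same ReLU.fun reappears in the Exercise 2 answer below; dReLU.fun is an illustrative helper name, since the answer folds the derivative directly into grad_l.fun):

ReLU.fun = function (x) {
  x[x < 0] <- 0         # keep positive entries, zero out the rest
  return(x)
}

dReLU.fun = function (x) {
  return((x > 0) + 0)   # 1 where x > 0, 0 otherwise
}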

Section 3: The Rectified Linear Unit (3)

\[ \begin{align} l_1 & = L^1_d(x^E,W^1_d) \\ h_1 & = ReLU(l_1) \\ l_2 & = L^2_1(h_1^E,W^2_1) \\ o & = S(l_2) \\ loss & = CE(y, o) = -\left(y \cdot log(o) + (1-y) \cdot log(1-o)\right) \end{align} \]

– The derivatives of each component:

\[ \begin{align} grad.o & = \frac{\partial}{\partial o}loss = \frac{o-y}{o(1-o)} \\ grad.l_2 & = \frac{\partial}{\partial l_2}loss = grad.o \otimes \frac{\partial}{\partial l_2}o= o-y \\ grad.W^2_1 & = \frac{\partial}{\partial W^2_1}loss = grad.l_2 \otimes \frac{\partial}{\partial W^2_1}l_2 = \frac{{1}}{n} \otimes (h_1^E)^T \bullet grad.l_2\\ grad.h_1^E & = \frac{\partial}{\partial h_1^E}loss = grad.l_2 \otimes \frac{\partial}{\partial h_1^E}l_2 = grad.l_2 \bullet (W^2_1)^T \\ grad.l_1 & = \frac{\partial}{\partial l_1}loss = grad.h_1 \otimes \frac{\partial}{\partial l_1}h_1 = grad.h_1 \otimes \frac{\partial}{\partial l_1}ReLU(l_1) \\ grad.W^1_d & = \frac{\partial}{\partial W^1_d}loss = grad.l_1 \otimes \frac{\partial}{\partial W^1_d}l_1 = \frac{{1}}{n} \otimes (x^E)^T \bullet grad.l_1 \end{align} \]
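– To make these formulas concrete, here is a minimal one-hidden-layer forward and backward pass written directly from them (a sketch with made-up shapes: 4 samples, 2 inputs, 3 hidden units; the toy_ names are illustrative and independent of the trainer functions below).

set.seed(1)
toy_X = matrix(rnorm(8), nrow = 4, ncol = 2)    # n = 4 samples, 2 inputs
toy_y = c(0, 1, 1, 0)
W1 = matrix(rnorm(9), nrow = 3, ncol = 3)       # (2 + 1) x 3: hidden layer weights
W2 = matrix(rnorm(4), nrow = 4, ncol = 1)       # (3 + 1) x 1: output layer weights

# Forward pass
l1 = cbind(1, toy_X) %*% W1                     # l1 = L(x^E, W1)
h1 = pmax(l1, 0)                                # h1 = ReLU(l1)
l2 = cbind(1, h1) %*% W2                        # l2 = L(h1^E, W2)
o  = 1 / (1 + exp(-l2))                         # o = S(l2)

# Backward pass, following the gradient formulas above
grad_l2 = o - toy_y                             # grad.o combined with the sigmoid derivative
grad_W2 = t(cbind(1, h1)) %*% grad_l2 / nrow(toy_X)
grad_h1 = grad_l2 %*% t(W2[-1, , drop = FALSE])
grad_l1 = grad_h1 * (l1 > 0)                    # ReLU derivative: 1 where l1 > 0
grad_W1 = t(cbind(1, toy_X)) %*% grad_l1 / nrow(toy_X)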

Exercise 2: Building an MLP Function with ReLU and Momentum

set.seed(0)
x1 <- rnorm(300, sd = 2) 
x2 <- rnorm(300, sd = 2) 
lp <- x1^2 +x2^2
y <- (lp > 9 | (lp < 4 & lp > 1)) + 0L

– This is the function from the end of last class:

DEEP_MLP_Trainer = function (num.iteration = 500, num.hidden = c(10, 10, 10), lr = 0.05,
                             x1 = x1, x2 = x2, y = y, eps = 1e-8) {
  
  #Functions
  
  #Forward
  
  S.fun = function (x, eps = 1e-5) {
    S = 1/(1 + exp(-x))
    S[S < eps] = eps
    S[S > 1 - eps] = 1 - eps
    return(S)
  }
  
  L.fun = function (X, W) {
    X.E = cbind(1, X)
    L = X.E %*% W
    return(L)
  }
  
  CE.fun = function (o, y, eps = 1e-5) {
    loss = -1/length(y) * sum(y * log(o + eps) + (1 - y) * log(1 - o + eps))
    return(loss)
  }
  
  #Backward
  
  grad_o.fun = function (o, y) {
    return((o - y)/(o*(1-o)))
  }
  
  grad_s.fun = function (grad_o, o) {
    return(grad_o*(o*(1-o)))
  }
  
  grad_W.fun = function (grad_l, h) {
    h.E = cbind(1, h)
    return(t(h.E) %*% grad_l/nrow(h))
  }
  
  grad_h.fun = function (grad_l, W) {
    return(grad_l %*% t(W[-1,]))
  }
  
  grad_l.fun = function (grad_h, h) {
    return(grad_h * h * (1 - h))
  }
  
  #initialization
  
  X_matrix = cbind(x1, x2)
  Y_matrix = t(t(y))
  
  W_list = list()
  
  len_h = length(num.hidden)
  
  for (w_seq in 1:(len_h+1)) {
    if (w_seq == 1) {
      NROW_W = ncol(X_matrix) + 1
      NCOL_W = num.hidden[w_seq]
    } else if (w_seq == len_h+1) {
      NROW_W = num.hidden[w_seq - 1] + 1
      NCOL_W = ncol(Y_matrix)
    } else {
      NROW_W = num.hidden[w_seq - 1] + 1
      NCOL_W = num.hidden[w_seq]
    }
    W_list[[w_seq]] = matrix(rnorm(NROW_W*NCOL_W, sd = 1), nrow = NROW_W, ncol = NCOL_W)
  }
  
  loss_seq = rep(0, num.iteration)
  
  #Calculating
  
  for (i in 1:num.iteration) {
    
    #Forward
    
    current_l_list = list()
    current_h_list = list()
    
    for (j in 1:len_h) {
      if (j == 1) {
        current_l_list[[j]] = L.fun(X = X_matrix, W = W_list[[j]])
      } else {
        current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = W_list[[j]])
      }
      current_h_list[[j]] = S.fun(x = current_l_list[[j]])
    }
    current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = W_list[[len_h+1]])
    current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
    loss_seq[i] = CE.fun(o = current_o, y = y, eps = eps)
    
    #Backward
    
    current_grad_l_list = list()
    current_grad_W_list = list()
    current_grad_h_list = list()
    
    current_grad_o = grad_o.fun(o = current_o, y = y)
    current_grad_l_list[[len_h+1]] = grad_s.fun(grad_o = current_grad_o, o = current_o)
    current_grad_W_list[[len_h+1]] = grad_W.fun(grad_l = current_grad_l_list[[len_h+1]], h = current_h_list[[len_h]])
    
    for (j in len_h:1) {
      current_grad_h_list[[j]] = grad_h.fun(grad_l = current_grad_l_list[[j+1]], W = W_list[[j+1]])
      current_grad_l_list[[j]] = grad_l.fun(grad_h = current_grad_h_list[[j]], h = current_h_list[[j]])
      if (j != 1) {
        current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = current_h_list[[j - 1]])
      } else {
        current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = X_matrix)
      }
    }
    
    for (j in 1:(len_h+1)) {
      
      W_list[[j]] = W_list[[j]] - lr * current_grad_W_list[[j]]
      
    }
    
  }
  
  require(scales)
  require(plot3D)
  
  x1_seq = seq(min(x1), max(x1), length.out = 100)
  x2_seq = seq(min(x2), max(x2), length.out = 100)
  
  pre_func = function (x1, x2) {
    new_X = cbind(x1, x2)
    
    current_l_list = list()
    current_h_list = list()
    
    for (j in 1:len_h) {
      if (j == 1) {
        current_l_list[[j]] = L.fun(X = new_X, W = W_list[[j]])
      } else {
        current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = W_list[[j]])
      }
      current_h_list[[j]] = S.fun(x = current_l_list[[j]])
    }
    
    current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = W_list[[len_h+1]])
    current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
    
    return(current_o)
  }
  
  pred_y = pre_func(x1 = x1, x2 = x2)
  z_matrix = sapply(x2_seq, function(x) {pre_func(x1 = x1_seq, x2 = x)})
  
  par(mfrow = c(1, 2))
  
  image2D(z = z_matrix,
          x = x1_seq, xlab = 'x1',
          y = x2_seq, ylab = 'x2',
          shade = 0.2, rasterImage = TRUE,
          col = colorRampPalette(c("#FFA0A0", "#FFFFFF", "#A0A0FF"))(100))
  
  points(x1, x2, col = (y + 1)*2, pch = 19, cex = 0.5)
  plot(loss_seq, type = 'l', main = 'loss', xlab = 'iter.', ylab = 'CE loss')
  
}

Exercise 2 Answer (1)

DEEP_MLP_Trainer = function (num.iteration = 500, num.hidden = c(10, 10, 10),
                             lr = 0.05, momentum = 0.9, 
                             x1 = x1, x2 = x2, y = y, eps = 1e-8) {
  
  #Functions
  
  #Forward
  
  S.fun = function (x, eps = eps) {
    S = 1/(1 + exp(-x))
    S[S < eps] = eps
    S[S > 1 - eps] = 1 - eps
    return(S)
  }
  
  ReLU.fun = function (x) {
    x[x < 0] <- 0
    return(x)
  }
  
  L.fun = function (X, W) {
    X.E = cbind(1, X)
    L = X.E %*% W
    return(L)
  }
  
  CE.fun = function (o, y, eps = eps) {
    loss = -1/length(y) * sum(y * log(o + eps) + (1 - y) * log(1 - o + eps))
    return(loss)
  }
  
  #Backward
  
  grad_o.fun = function (o, y) {
    return((o - y)/(o*(1-o)))
  }
  
  grad_s.fun = function (grad_o, o) {
    return(grad_o*(o*(1-o)))
  }
  
  grad_W.fun = function (grad_l, h) {
    h.E = cbind(1, h)
    return(t(h.E) %*% grad_l/nrow(h))
  }
  
  grad_h.fun = function (grad_l, W) {
    return(grad_l %*% t(W[-1,]))
  }
  
  grad_l.fun = function (grad_h, l) {
    de_l = l
    de_l[de_l<0] = 0
    de_l[de_l>0] = 1
    return(grad_h*de_l)
  }
  
  #initialization
  
  X_matrix = cbind(x1, x2)
  Y_matrix = t(t(y))
  
  W_list = list()
  M_list = list()
  
  len_h = length(num.hidden)
  
  for (w_seq in 1:(len_h+1)) {
    if (w_seq == 1) {
      NROW_W = ncol(X_matrix) + 1
      NCOL_W = num.hidden[w_seq]
    } else if (w_seq == len_h+1) {
      NROW_W = num.hidden[w_seq - 1] + 1
      NCOL_W = ncol(Y_matrix)
    } else {
      NROW_W = num.hidden[w_seq - 1] + 1
      NCOL_W = num.hidden[w_seq]
    }
    W_list[[w_seq]] = matrix(rnorm(NROW_W*NCOL_W, sd = 1), nrow = NROW_W, ncol = NCOL_W)
    M_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
  }
  
  loss_seq = rep(0, num.iteration)
  
  #Calculating
  
  for (i in 1:num.iteration) {
    
    #Forward
    
    current_l_list = list()
    current_h_list = list()
    
    for (j in 1:len_h) {
      if (j == 1) {
        current_l_list[[j]] = L.fun(X = X_matrix, W = W_list[[j]])
      } else {
        current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = W_list[[j]])
      }
      current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
    }
    current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = W_list[[len_h+1]])
    current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
    loss_seq[i] = CE.fun(o = current_o, y = y, eps = eps)
    
    #Backward
    
    current_grad_l_list = list()
    current_grad_W_list = list()
    current_grad_h_list = list()
    
    current_grad_o = grad_o.fun(o = current_o, y = y)
    current_grad_l_list[[len_h+1]] = grad_s.fun(grad_o = current_grad_o, o = current_o)
    current_grad_W_list[[len_h+1]] = grad_W.fun(grad_l = current_grad_l_list[[len_h+1]], h = current_h_list[[len_h]])
    
    for (j in len_h:1) {
      current_grad_h_list[[j]] = grad_h.fun(grad_l = current_grad_l_list[[j+1]], W = W_list[[j+1]])
      current_grad_l_list[[j]] = grad_l.fun(grad_h = current_grad_h_list[[j]], l = current_l_list[[j]])
      if (j != 1) {
        current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = current_h_list[[j - 1]])
      } else {
        current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = X_matrix)
      }
    }
    
    for (j in 1:(len_h+1)) {
      M_list[[j]] = momentum * M_list[[j]] + lr * current_grad_W_list[[j]]
      W_list[[j]] = W_list[[j]] - M_list[[j]]
    }
    
  }
  
  require(scales)
  require(plot3D)
  
  x1_seq = seq(min(x1), max(x1), length.out = 100)
  x2_seq = seq(min(x2), max(x2), length.out = 100)
  
  pre_func = function (x1, x2) {
    new_X = cbind(x1, x2)
    
    current_l_list = list()
    current_h_list = list()
    
    for (j in 1:len_h) {
      if (j == 1) {
        current_l_list[[j]] = L.fun(X = new_X, W = W_list[[j]])
      } else {
        current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = W_list[[j]])
      }
      current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
    }
    
    current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = W_list[[len_h+1]])
    current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
    
    return(current_o)
  }
  
  pred_y = pre_func(x1 = x1, x2 = x2)
  z_matrix = sapply(x2_seq, function(x) {pre_func(x1 = x1_seq, x2 = x)})
  
  par(mfrow = c(1, 2))
  
  image2D(z = z_matrix,
          x = x1_seq, xlab = 'x1',
          y = x2_seq, ylab = 'x2',
          shade = 0.2, rasterImage = TRUE,
          col = colorRampPalette(c("#FFA0A0", "#FFFFFF", "#A0A0FF"))(100))
  
  points(x1, x2, col = (y + 1)*2, pch = 19, cex = 0.5)
  plot(loss_seq, type = 'l', main = 'loss', xlab = 'iter.', ylab = 'CE loss')
  
}

Exercise 2 Answer (2)

DEEP_MLP_Trainer(num.iteration = 10000, num.hidden = c(20, 20), lr = 0.1, x1 = x1, x2 = x2, y = y)

Section 4: Mini-Batch Gradient Descent (1)

– In practice, however, the sampling done at each iteration introduces variability into the current gradient, so mini-batch stochastic gradient descent (mini-batch SGD) and full-batch gradient descent will give different results. The question is: which of the two do you think produces the better result? And if we use mini-batch gradient descent, what is the relationship between the batch size and the quality of the optimization?
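– The effect of this sampling variability is easy to see directly. The sketch below (assuming the pred.fun and differential.fun.b1 from Exercise 1 and the data x1, x2, y are in the workspace; the point \(b_1 = b_2 = 0.3\) and the batch size of 30 are illustrative) compares the full-sample gradient for \(b_1\) with the gradient computed on a few random mini-batches; each mini-batch gives a noisy estimate of the full-sample value.

set.seed(1)

# Full-sample gradient for b1 at an arbitrary point
p.all = pred.fun(b1 = 0.3, b2 = 0.3, x1 = x1, x2 = x2)
differential.fun.b1(p = p.all, b1 = 0.3, b2 = 0.3, x1 = x1, x2 = x2, y = y)

# The same gradient estimated on five random mini-batches of size 30
batch.grads = sapply(1:5, function(k) {
  idx = sample(1:length(x1), 30)
  p.batch = pred.fun(b1 = 0.3, b2 = 0.3, x1 = x1[idx], x2 = x2[idx])
  differential.fun.b1(p = p.batch, b1 = 0.3, b2 = 0.3, x1 = x1[idx], x2 = x2[idx], y = y[idx])
})
batch.grads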

– Below are loss surfaces drawn for the Exercise 1 problem. The six figures show, for batch sizes from small to large, how the final loss varies in each case. Observe the plots and discuss:

F05

Section 4: Mini-Batch Gradient Descent (2)

– Of course, an excessively small batch does not seem ideal either: it also endangers the global extremum, and may likewise cause the optimizer to miss it. Overall, however, smaller batches seem to be the more favorable choice!

– This example uses data generated with \(b_1 = 0.7\) and \(b_2 = 0.7\). In Exercise 1, with the momentum set to 0, the optimizer could not escape the local extremum; here we try again with a batch size of 30:

F06

– The next example uses data generated with \(b_1 = 2\) and \(b_2 = 3\). In Exercise 1 we could not find the answer even with momentum 0.9, but after adjusting the batch size (batch size = 30) we are able to find the global extremum:

F07

Section 4: Mini-Batch Gradient Descent (3)

F08

– The conclusion was that, across several common algorithms and datasets, test-set accuracy was higher when the batch size did not exceed 32.

F09

Exercise 3: Adjusting the Batch Size

set.seed(0)
x1 = rnorm(100, sd = 1) 
x2 = rnorm(100, sd = 1) 
lr1 = - 1.5 + x1^2 + x2^2 + rnorm(100)
y = (lr1 > 0) + 0L

Exercise 3 Answer (1)

DEEP_MLP_Trainer = function (num.iteration = 500, num.hidden = c(10, 10, 10),
                             lr = 0.05, momentum = 0.9, batch_size = 20,
                             x1 = x1, x2 = x2, y = y, eps = 1e-8) {
  
  #Functions
  
  #Forward
  
  S.fun = function (x, eps = eps) {
    S = 1/(1 + exp(-x))
    S[S < eps] = eps
    S[S > 1 - eps] = 1 - eps
    return(S)
  }
  
  ReLU.fun = function (x) {
    x[x < 0] <- 0
    return(x)
  }
  
  L.fun = function (X, W) {
    X.E = cbind(1, X)
    L = X.E %*% W
    return(L)
  }
  
  CE.fun = function (o, y, eps = eps) {
    loss = -1/length(y) * sum(y * log(o + eps) + (1 - y) * log(1 - o + eps))
    return(loss)
  }
  
  #Backward
  
  grad_o.fun = function (o, y) {
    return((o - y)/(o*(1-o)))
  }
  
  grad_s.fun = function (grad_o, o) {
    return(grad_o*(o*(1-o)))
  }
  
  grad_W.fun = function (grad_l, h) {
    h.E = cbind(1, h)
    return(t(h.E) %*% grad_l/nrow(h))
  }
  
  grad_h.fun = function (grad_l, W) {
    return(grad_l %*% t(W[-1,]))
  }
  
  grad_l.fun = function (grad_h, l) {
    de_l = l
    de_l[de_l<0] = 0
    de_l[de_l>0] = 1
    return(grad_h*de_l)
  }
  
  #initialization
  
  X_matrix = cbind(x1, x2)
  Y_matrix = t(t(y))
  
  W_list = list()
  M_list = list()
  N_list = list()
  
  len_h = length(num.hidden)
  
  for (w_seq in 1:(len_h+1)) {
    if (w_seq == 1) {
      NROW_W = ncol(X_matrix) + 1
      NCOL_W = num.hidden[w_seq]
    } else if (w_seq == len_h+1) {
      NROW_W = num.hidden[w_seq - 1] + 1
      NCOL_W = ncol(Y_matrix)
    } else {
      NROW_W = num.hidden[w_seq - 1] + 1
      NCOL_W = num.hidden[w_seq]
    }
    W_list[[w_seq]] = matrix(rnorm(NROW_W*NCOL_W, sd = 1), nrow = NROW_W, ncol = NCOL_W)
    M_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
  }
  
  loss_seq = rep(0, num.iteration)
  
  #Calculating
  
  for (i in 1:num.iteration) {
    
    idx = sample(1:length(x1), batch_size)
    
    #Forward
    
    current_l_list = list()
    current_h_list = list()
    
    for (j in 1:len_h) {
      if (j == 1) {
        current_l_list[[j]] = L.fun(X = X_matrix[idx,], W = W_list[[j]])
      } else {
        current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = W_list[[j]])
      }
      current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
    }
    current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = W_list[[len_h+1]])
    current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
    loss_seq[i] = CE.fun(o = current_o, y = y[idx], eps = eps)
    
    #Backward
    
    current_grad_l_list = list()
    current_grad_W_list = list()
    current_grad_h_list = list()
    
    current_grad_o = grad_o.fun(o = current_o, y = y[idx])
    current_grad_l_list[[len_h+1]] = grad_s.fun(grad_o = current_grad_o, o = current_o)
    current_grad_W_list[[len_h+1]] = grad_W.fun(grad_l = current_grad_l_list[[len_h+1]], h = current_h_list[[len_h]])
    
    for (j in len_h:1) {
      current_grad_h_list[[j]] = grad_h.fun(grad_l = current_grad_l_list[[j+1]], W = W_list[[j+1]])
      current_grad_l_list[[j]] = grad_l.fun(grad_h = current_grad_h_list[[j]], l = current_l_list[[j]])
      if (j != 1) {
        current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = current_h_list[[j - 1]])
      } else {
        current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = X_matrix[idx,])
      }
    }
    
    for (j in 1:(len_h+1)) {
      M_list[[j]] = momentum * M_list[[j]] + lr * current_grad_W_list[[j]]
      W_list[[j]] = W_list[[j]] - M_list[[j]]
    }
    
  }
  
  require(scales)
  require(plot3D)
  
  x1_seq = seq(min(x1), max(x1), length.out = 100)
  x2_seq = seq(min(x2), max(x2), length.out = 100)
  
  pre_func = function (x1, x2) {
    new_X = cbind(x1, x2)
    
    current_l_list = list()
    current_h_list = list()
    
    for (j in 1:len_h) {
      if (j == 1) {
        current_l_list[[j]] = L.fun(X = new_X, W = W_list[[j]])
      } else {
        current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = W_list[[j]])
      }
      current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
    }
    
    current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = W_list[[len_h+1]])
    current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
    
    return(current_o)
  }
  
  pred_y = pre_func(x1 = x1, x2 = x2)
  z_matrix = sapply(x2_seq, function(x) {pre_func(x1 = x1_seq, x2 = x)})
  
  par(mfrow = c(1, 2))
  
  image2D(z = z_matrix,
          x = x1_seq, xlab = 'x1',
          y = x2_seq, ylab = 'x2',
          shade = 0.2, rasterImage = TRUE,
          col = colorRampPalette(c("#FFA0A0", "#FFFFFF", "#A0A0FF"))(100))
  
  points(x1, x2, col = (y + 1)*2, pch = 19, cex = 0.5)
  plot(loss_seq, type = 'l', main = 'loss', xlab = 'iter.', ylab = 'CE loss')
  
}

Exercise 3 Answer (2)

DEEP_MLP_Trainer(num.iteration = 5000, num.hidden = 30, batch_size = 100, lr = 0.1, momentum = 0.9, x1 = x1, x2 = x2, y = y)

DEEP_MLP_Trainer(num.iteration = 5000, num.hidden = 30, batch_size = 20, lr = 0.1, momentum = 0.9, x1 = x1, x2 = x2, y = y)

Section 5: Adaptive Learning-Rate Optimization Algorithms (1)

– A reasonable approach is to adjust the learning rate over time: use a larger learning rate early in the iterations and a smaller one later.

– In addition, so far all parameters have shared the same learning rate when being updated. Could we design a separate learning rate for each parameter, and thereby make the optimization better?

\[ \begin{align} grad.x_t & = \frac{\partial}{\partial x}f(x_{t - 1}) \\ n_t & = n_{t-1} + grad.x_t^2 \\ x_{t} & = x_{t - 1} - \frac{lr}{\sqrt{n_t+\delta}}\cdot grad.x_t \end{align} \]

– The R implementation is as follows; we again use finding the extremum of \(y=x^2\) as the example:

differential.fun = function(x) {
  return(2*x)
}

start.value = 0.5 
num.iteration = 30
learning.rate = 0.1
eps = 1e-8

result.x = rep(NA, num.iteration)

for (i in 1:num.iteration) {
  if (i == 1) {
    n = 0
    result.x[1] = start.value
  } else {
    grad.x = differential.fun(result.x[i-1])
    n = n + grad.x^2
    result.x[i] = result.x[i-1] - learning.rate/sqrt(n + eps) * grad.x
  }
}

tail(result.x, 1)
## [1] 0.01459057

Section 5: Adaptive Learning-Rate Optimization Algorithms (2)

– Here we can extend this a little. The main idea, which comes from Adadelta, is to apply the momentum concept to the accumulation of squared gradients, letting the accumulated value decay over time so that training does not effectively stop too early:

\[ \begin{align} grad.x_t & = \frac{\partial}{\partial x}f(x_{t - 1}) \\ n_t & = \beta_2 \cdot n_{t-1} + (1 - \beta_2) \cdot grad.x_t^2 \\ x_{t} & = x_{t - 1} - \frac{lr}{\sqrt{n_t+\delta}}\cdot grad.x_t \end{align} \]
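– A minimal sketch of this decayed-accumulator update on the same \(y=x^2\) example (the value \(\beta_2 = 0.9\) here is chosen only for illustration); compared with the code above, the only change is how n is accumulated:

differential.fun = function(x) {
  return(2*x)
}

start.value = 0.5 
num.iteration = 30
learning.rate = 0.1
beta2 = 0.9
eps = 1e-8

result.x = rep(NA, num.iteration)

for (i in 1:num.iteration) {
  if (i == 1) {
    n = 0
    result.x[1] = start.value
  } else {
    grad.x = differential.fun(result.x[i-1])
    n = beta2 * n + (1 - beta2) * grad.x^2    # decaying average instead of a running sum
    result.x[i] = result.x[i-1] - learning.rate/sqrt(n + eps) * grad.x
  }
}

tail(result.x, 1)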

– Applying the same decaying-average idea to the gradient itself, we also keep a first-moment term \(m_t\) and use it in place of the raw gradient in the update:

\[ \begin{align} grad.x_t & = \frac{\partial}{\partial x}f(x_{t - 1}) \\ m_t & = \beta_1 \cdot m_{t-1} + (1 - \beta_1) \cdot grad.x_t \\ n_t & = \beta_2 \cdot n_{t-1} + (1 - \beta_2) \cdot grad.x_t^2 \\ x_{t} & = x_{t - 1} - \frac{lr}{\sqrt{n_t+\delta}}\cdot m_t \end{align} \]

– Extending from here, we consider one last problem: because \(m_t\) and \(n_t\) are accumulated over time, they are very small at the beginning of training, so we compute \(\hat{m_t}\) and \(\hat{n_t}\) and use them for the update; their denominator terms approach 1 as time goes on:

\[ \begin{align} grad.x_{t} & = \frac{\partial}{\partial x}f(x_{t - 1}) \\ m_t & = \beta_1 \cdot m_{t-1} + (1 - \beta_1) \cdot grad.x_{t} \\ n_t & = \beta_2 \cdot n_{t-1} + (1 - \beta_2) \cdot grad.x_{t}^2 \\ \hat{m_t} & = \frac{m_t}{1 - \beta_1^t} \\ \hat{n_t} & = \frac{n_t}{1 - \beta_2^t} \\ x_{t} & = x_{t - 1} - \frac{lr}{\sqrt{\hat{n_t}+\delta}}\cdot \hat{m_t} \end{align} \]

Section 5: Adaptive Learning-Rate Optimization Algorithms (3)

F10

differential.fun = function(x) {
  return(2*x)
}

start.value = 0.5 
num.iteration = 30
learning.rate = 0.1 # note: Adam is usually run with a learning rate of about 0.001
beta1 = 0.9
beta2 = 0.999
eps = 1e-8

result.x = rep(NA, num.iteration + 1)

for (i in 0:num.iteration) {
  if (i == 0) {
    m = 0
    n = 0
    result.x[i+1] = start.value
  } else {
    grad.x = differential.fun(result.x[i])
    m = beta1 * m + (1 - beta1) * grad.x
    n = beta2 * n + (1 - beta2) * grad.x^2
    m.hat = m/(1 - beta1^i)
    n.hat = n/(1 - beta2^i)
    result.x[i+1] = result.x[i] - learning.rate/sqrt(n.hat + eps) * m.hat
  }
}

tail(result.x, 1)
## [1] 0.02099521

Section 5: Adaptive Learning-Rate Optimization Algorithms (4)

– The SGD result (\(lr = 0.1\), \(\rho = 0.9\)):

F07

– The Adam result (\(lr = 0.1\), \(\beta_1 = 0.9\), \(\beta_2 = 0.999\)):

F11

– For this reason, some research uses Adam for fast optimization in the early phase of training and then switches to SGD to finish in the later phase (Nitish Shirish Keskar & Richard Socher, 2017). In fact, optimization has always been one of the hard, long-standing problems of mathematics, and after centuries of progress no single method has a clear advantage over all others on every kind of problem. We can only say that the current mainstream choices, SGD with momentum and Adam, are the two most commonly used methods, and it is foreseeable that further research may well rewrite this picture.
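– As a toy illustration of this "Adam first, SGD later" idea on the simple \(y=x^2\) example (the switch point at iteration 30 and every hyper-parameter below are ours, for illustration only, and are not taken from the cited paper):

differential.fun = function(x) {
  return(2*x)
}

x.current = 0.5
m = 0; n = 0; v = 0
beta1 = 0.9; beta2 = 0.999; eps = 1e-8

for (i in 1:60) {
  grad.x = differential.fun(x.current)
  if (i <= 30) {
    # Phase 1: Adam with bias correction
    m = beta1 * m + (1 - beta1) * grad.x
    n = beta2 * n + (1 - beta2) * grad.x^2
    m.hat = m / (1 - beta1^i)
    n.hat = n / (1 - beta2^i)
    x.current = x.current - 0.1 / sqrt(n.hat + eps) * m.hat
  } else {
    # Phase 2: SGD with momentum for the final fine-tuning
    v = 0.9 * v + 0.05 * grad.x
    x.current = x.current - v
  }
}

x.current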

Exercise 4: Optimizing an MLP with Adam

– For the starting code, please refer to the Exercise 3 answer and modify it from there.

– The sample data are as follows:

set.seed(0)
x1 <- rnorm(300, sd = 2) 
x2 <- rnorm(300, sd = 2) 
lp <- x1^2 +x2^2
y <- (lp > 9 | (lp < 4 & lp > 1)) + 0L

Exercise 4 Answer (1)

DEEP_MLP_Trainer = function (num.iteration = 500, num.hidden = c(10, 10, 10),
                             lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', batch_size = 20,
                             x1 = x1, x2 = x2, y = y, eps = 1e-8) {
  
  #Functions
  
  #Forward
  
  S.fun = function (x, eps = eps) {
    S = 1/(1 + exp(-x))
    S[S < eps] = eps
    S[S > 1 - eps] = 1 - eps
    return(S)
  }
  
  ReLU.fun = function (x) {
    x[x < 0] <- 0
    return(x)
  }
  
  L.fun = function (X, W) {
    X.E = cbind(1, X)
    L = X.E %*% W
    return(L)
  }
  
  CE.fun = function (o, y, eps = eps) {
    loss = -1/length(y) * sum(y * log(o + eps) + (1 - y) * log(1 - o + eps))
    return(loss)
  }
  
  #Backward
  
  grad_o.fun = function (o, y) {
    return((o - y)/(o*(1-o)))
  }
  
  grad_s.fun = function (grad_o, o) {
    return(grad_o*(o*(1-o)))
  }
  
  grad_W.fun = function (grad_l, h) {
    h.E = cbind(1, h)
    return(t(h.E) %*% grad_l/nrow(h))
  }
  
  grad_h.fun = function (grad_l, W) {
    return(grad_l %*% t(W[-1,]))
  }
  
  grad_l.fun = function (grad_h, l) {
    de_l = l
    de_l[de_l<0] = 0
    de_l[de_l>0] = 1
    return(grad_h*de_l)
  }
  
  #initialization
  
  X_matrix = cbind(x1, x2)
  Y_matrix = t(t(y))
  
  W_list = list()
  M_list = list()
  N_list = list()
  
  len_h = length(num.hidden)
  
  for (w_seq in 1:(len_h+1)) {
    if (w_seq == 1) {
      NROW_W = ncol(X_matrix) + 1
      NCOL_W = num.hidden[w_seq]
    } else if (w_seq == len_h+1) {
      NROW_W = num.hidden[w_seq - 1] + 1
      NCOL_W = ncol(Y_matrix)
    } else {
      NROW_W = num.hidden[w_seq - 1] + 1
      NCOL_W = num.hidden[w_seq]
    }
    W_list[[w_seq]] = matrix(rnorm(NROW_W*NCOL_W, sd = 1), nrow = NROW_W, ncol = NCOL_W)
    M_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
    N_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
  }
  
  loss_seq = rep(0, num.iteration)
  
  #Calculating
  
  for (i in 1:num.iteration) {
    
    idx = sample(1:length(x1), batch_size)
    
    #Forward
    
    current_l_list = list()
    current_h_list = list()
    
    for (j in 1:len_h) {
      if (j == 1) {
        current_l_list[[j]] = L.fun(X = X_matrix[idx,], W = W_list[[j]])
      } else {
        current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = W_list[[j]])
      }
      current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
    }
    current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = W_list[[len_h+1]])
    current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
    loss_seq[i] = CE.fun(o = current_o, y = y[idx], eps = eps)
    
    #Backward
    
    current_grad_l_list = list()
    current_grad_W_list = list()
    current_grad_h_list = list()
    
    current_grad_o = grad_o.fun(o = current_o, y = y[idx])
    current_grad_l_list[[len_h+1]] = grad_s.fun(grad_o = current_grad_o, o = current_o)
    current_grad_W_list[[len_h+1]] = grad_W.fun(grad_l = current_grad_l_list[[len_h+1]], h = current_h_list[[len_h]])
    
    for (j in len_h:1) {
      current_grad_h_list[[j]] = grad_h.fun(grad_l = current_grad_l_list[[j+1]], W = W_list[[j+1]])
      current_grad_l_list[[j]] = grad_l.fun(grad_h = current_grad_h_list[[j]], l = current_l_list[[j]])
      if (j != 1) {
        current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = current_h_list[[j - 1]])
      } else {
        current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = X_matrix[idx,])
      }
    }
    
    if (optimizer == 'adam') {
      
      for (j in 1:(len_h+1)) {
        M_list[[j]] = beta1 * M_list[[j]] + (1 - beta1) * current_grad_W_list[[j]]
        N_list[[j]] = beta2 * N_list[[j]] + (1 - beta2) * current_grad_W_list[[j]]^2
        M.hat = M_list[[j]]/(1 - beta1^i)
        N.hat = N_list[[j]]/(1 - beta2^i)
        W_list[[j]] = W_list[[j]] - lr*M.hat/sqrt(N.hat+eps)
      }
      
    } else if (optimizer == 'sgd') {
      
      for (j in 1:(len_h+1)) {
        M_list[[j]] = beta1 * M_list[[j]] + lr * current_grad_W_list[[j]]
        W_list[[j]] = W_list[[j]] - M_list[[j]]
      }
      
    } else {
      stop('optimizer must be selected from "sgd" or "adam".')
    }
    
  }
  
  require(scales)
  require(plot3D)
  
  x1_seq = seq(min(x1), max(x1), length.out = 100)
  x2_seq = seq(min(x2), max(x2), length.out = 100)
  
  pre_func = function (x1, x2) {
    new_X = cbind(x1, x2)
    
    current_l_list = list()
    current_h_list = list()
    
    for (j in 1:len_h) {
      if (j == 1) {
        current_l_list[[j]] = L.fun(X = new_X, W = W_list[[j]])
      } else {
        current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = W_list[[j]])
      }
      current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
    }
    
    current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = W_list[[len_h+1]])
    current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
    
    return(current_o)
  }
  
  pred_y = pre_func(x1 = x1, x2 = x2)
  z_matrix = sapply(x2_seq, function(x) {pre_func(x1 = x1_seq, x2 = x)})
  
  par(mfrow = c(1, 2))
  
  image2D(z = z_matrix,
          x = x1_seq, xlab = 'x1',
          y = x2_seq, ylab = 'x2',
          shade = 0.2, rasterImage = TRUE,
          col = colorRampPalette(c("#FFA0A0", "#FFFFFF", "#A0A0FF"))(100))
  
  points(x1, x2, col = (y + 1)*2, pch = 19, cex = 0.5)
  plot(loss_seq, type = 'l', main = 'loss', xlab = 'iter.', ylab = 'CE loss')
  
}

Exercise 4 Answer (2)

DEEP_MLP_Trainer(num.iteration = 10000, num.hidden = c(20, 20),
                 lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', batch_size = 20,
                 x1 = x1, x2 = x2, y = y)

DEEP_MLP_Trainer(num.iteration = 10000, num.hidden = c(20, 20),
                 lr = 0.05, beta1 = 0.9, optimizer = 'sgd', batch_size = 20,
                 x1 = x1, x2 = x2, y = y)

Exercise 4 Extension

DEEP_MLP_Trainer(num.iteration = 10000, num.hidden = c(20, 20, 20, 20, 20, 20, 20, 20),
                 lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', batch_size = 20,
                 x1 = x1, x2 = x2, y = y)

DEEP_MLP_Trainer(num.iteration = 10000, num.hidden = c(20, 20, 20, 20, 20, 20, 20, 20),
                 lr = 0.05, beta1 = 0.9, optimizer = 'sgd', batch_size = 20,
                 x1 = x1, x2 = x2, y = y)

Closing Remarks

– In addition, did you notice that although deeper networks can handle all sorts of strange relationships, their decision boundaries also become strange? This hints at the risk of overfitting, a problem that arises when the number of weight parameters to be estimated far exceeds the number of samples.

– Did you also notice (or remember) another problem with network training: we always use random initial values as the starting point of the optimization, and on a non-convex function the starting value has a crucial influence on the result. The weight-initialization problem is therefore another major focus of research in this area.