Local Extrema and Optimization Techniques

林嶔 (Lin, Chin)

Lesson 3

The Local Extrema Problem (1)

As before, we start with a simple problem: we want to find the extrema of \(y = 2x^2 + 3x^3 + x^4\). But first, let's look at the graph of the function:

[Figure F3_1: graph of \(y = 2x^2 + 3x^3 + x^4\)]

# Derivative of y = 2x^2 + 3x^3 + x^4
differential.fun = function(x) {
  return(4*x + 9*x^2 + 4*x^3)
}

start.value = 0.5 
num.iteration = 30
learning.rate = 0.05

result.x = rep(NA, num.iteration)

# Plain gradient descent: x <- x - lr * f'(x)
for (i in 1:num.iteration) {
  if (i == 1) {
    result.x[1] = start.value
  } else {
    result.x[i] = result.x[i-1] - learning.rate * differential.fun(result.x[i-1])
  }
}

tail(result.x, 1)
## [1] 0.000275395
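
To visualize where these iterates travel, a minimal sketch such as the following overlays the descent path on the objective (obj.fun is a helper introduced here just to re-declare \(y = 2x^2 + 3x^3 + x^4\)):

# Plot the objective and mark each visited x along the descent path
obj.fun = function(x) {2*x^2 + 3*x^3 + x^4}
curve(obj.fun, from = -3, to = 1, ylab = 'y')
points(result.x, obj.fun(result.x), col = 'red', pch = 19, cex = 0.7)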
# Same descent, but starting from x = -3 instead of 0.5
start.value = -3 
num.iteration = 30
learning.rate = 0.05

result.x = rep(NA, num.iteration)

for (i in 1:num.iteration) {
  if (i == 1) {
    result.x[1] = start.value
  } else {
    result.x[i] = result.x[i-1] - learning.rate * differential.fun(result.x[i-1])
  }
}

tail(result.x, 1)
## [1] -1.640338
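
The two runs stop at different points because the derivative factors as \(4x + 9x^2 + 4x^3 = x(4 + 9x + 4x^2)\) and therefore has three roots. Base R's polyroot() can check the quadratic factor (coefficients are supplied in increasing order of degree):

# Roots of the quadratic factor 4 + 9x + 4x^2; together with x = 0 these
# are the three critical points of y = 2x^2 + 3x^3 + x^4
polyroot(c(4, 9, 4))

The roots are approximately \(-0.61\) (a local maximum) and \(-1.64\) (the global minimum, reached from \(-3\)); \(x = 0\) itself is a local minimum. Which extremum gradient descent finds therefore depends entirely on the starting value.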

The Local Extrema Problem (2)

– You can try different starting values in the linear regression / logistic regression exercises yourself: you will find that neither suffers from this problem, because their loss functions look more like the convex function \(y = x^2\) (see the sketch after this list).

– A function such as \(y = 2x^2 + 3x^3 + x^4\), by contrast, is non-convex.

– Of course, we would still like to avoid local extrema whenever possible. Statisticians and computer scientists have devoted a great deal of effort to this problem, and we will now introduce some variants of gradient descent that try to keep the optimizer from getting trapped in a local extremum.
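
To see why convexity removes the problem, here is a minimal sketch that reruns the same loop on \(y = x^2\) (derivative \(2x\); convex.differential.fun and x.current are helper names introduced only for this demonstration). Both starting values used above converge to the same minimum at \(x = 0\):

# Gradient descent on the convex function y = x^2: any starting value
# converges to the unique minimum at x = 0
convex.differential.fun = function(x) {
  return(2*x)
}

for (start.value in c(0.5, -3)) {
  x.current = start.value
  for (i in 1:30) {
    x.current = x.current - 0.05 * convex.differential.fun(x.current)
  }
  print(x.current)  # both runs end near 0
}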

Momentum (1)

[Figure F3_2: illustration of the momentum update]

\[x_{\left(epoch:t\right)} = x_{\left(epoch:t - 1\right)} - lr \cdot \frac{\partial}{\partial x}f(x_{\left(epoch:t - 1\right)}) + \rho \cdot \left[x_{\left(epoch:t - 1\right)}-x_{\left(epoch:t - 2\right)}\right]\]

Momentum (2)

differential.fun = function(x) {
  return(4*x + 9*x^2 + 4*x^3)
}

start.value = 0.5 
num.iteration = 30
learning.rate = 0.1
momentum = 0.9

result.x = rep(NA, num.iteration)

# Gradient descent with momentum: from the third step on, the update adds
# momentum * (x_(t-1) - x_(t-2)) to the plain gradient step
for (i in 1:num.iteration) {
  if (i == 1) {
    result.x[1] = start.value
  } else if (i == 2) {
    result.x[i] = result.x[i-1] - learning.rate * differential.fun(result.x[i-1])
  } else {
    result.x[i] = result.x[i-1] - learning.rate * differential.fun(result.x[i-1]) + momentum * (result.x[i-1] - result.x[i-2])
  }
}

tail(result.x, 1)
## [1] -1.602265

[Figure F3_3: descent path with momentum on \(y = 2x^2 + 3x^3 + x^4\)]

Momentum (3)

– This example seeks the extremum of \(y = x^2\), but the learning rate has mistakenly been set to 1.02. Ordinarily this cannot converge, yet with momentum it can.

# Derivative of y = x^2
differential.fun = function(x) {
  return(2*x)
}

start.value = 0.5 
num.iteration = 30
learning.rate = 1.02
momentum = 0.9

result.x = rep(NA, num.iteration)

for (i in 1:num.iteration) {
  if (i == 1) {
    result.x[1] = start.value
  } else if (i == 2) {
    result.x[i] = result.x[i-1] - learning.rate * differential.fun(result.x[i-1])
  } else {
    result.x[i] = result.x[i-1] - learning.rate * differential.fun(result.x[i-1]) + momentum * (result.x[i-1] - result.x[i-2])
  }
}

tail(result.x, 1)
## [1] -0.03117517
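
For contrast, a minimal sketch of the same run without the momentum term, reusing differential.fun, learning.rate, and num.iteration from above: each plain update multiplies \(x\) by \(1 - 2 \times 1.02 = -1.04\), so the iterates oscillate in sign while growing in magnitude:

# Plain gradient descent with lr = 1.02 on y = x^2: every step computes
# x - 1.02 * 2x = -1.04 * x, so |x| grows by 4% per iteration
x.current = start.value
for (i in 1:num.iteration) {
  x.current = x.current - learning.rate * differential.fun(x.current)
}
x.current  # farther from 0 than the starting value, and still diverging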

[Figure F3_4: descent path with momentum and learning rate 1.02 on \(y = x^2\)]

Exercise 1: Optimizing a Variant of Logistic Regression

\[log(\frac{{p}}{1-p}) = sin(b_{1}x_1x_2+b_2x_2)\]

set.seed(0)
x1 = runif(1000, min = -3, max = 3) 
x2 = runif(1000, min = -3, max = 3) 
lr = sin(0.7*x1*x2+0.7*x2) + rnorm(1000, sd = 0.3)  # true values: b1 = b2 = 0.7
y = (lr > 0) + 0L  # parentheses are required: `+` binds tighter than `>` in R
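
A quick sanity check on the simulated labels (nothing here beyond base R's table(), which counts class membership):

# The logit is centered around 0, so the two classes should be roughly balanced
table(y)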

\[ \begin{align} L(x_1,x_2) & = sin(b_{1}x_1x_2+b_2x_2) \\ S(x) & = \frac{{1}}{1+e^{-x}} \\ CE(y, p) & = -\left(y_{i} \cdot log(p_{i}) + (1-y_{i}) \cdot log(1-p_{i})\right) \\ loss & = CE(y, S(L(x_1,x_2))) \end{align} \]

The partial derivative with respect to \(b_1\) (derivation omitted):

\[ \begin{align} \frac{\partial}{\partial b_1}loss_i & = (p_i - y_i) \cdot cos(b_{1}x_{1i}x_{2i}+b_{2}x_{2i}) \cdot x_{1i}x_{2i} \end{align} \]

The partial derivative with respect to \(b_2\) (derivation omitted):

\[ \begin{align} \frac{\partial}{\partial b_2}loss_i & = (p_i - y_i) \cdot cos(b_{1}x_{1i}x_{2i}+b_{2}x_{2i}) \cdot x_{2i} \end{align} \]

# Predicted probability: p = S(L(x1, x2))
pred.fun = function(b1, b2, x1 = x1, x2 = x2) {
  lr = sin(b1*x1*x2+b2*x2)
  p = 1 / (1 + exp(-lr))
  return(p)
}

# Mean cross-entropy loss; eps guards against log(0)
loss.fun = function(p, y = y, eps = 1e-8) {
  loss = -mean(y * log(p + eps) + (1 - y) * log(1 - p + eps))
  return(loss)
}

# Analytic gradients. Note that defaults of the form x1 = x1 are
# self-referential in R, so x1/x2/y must always be passed explicitly,
# as every call in this lesson does.
differential.fun.b1 = function(p, b1, b2, x1 = x1, x2 = x2, y = y) {
  return(mean((p-y)*cos(b1*x1*x2+b2*x2)*x1*x2))
}

differential.fun.b2 = function(p, b1, b2, x1 = x1, x2 = x2, y = y) {
  return(mean((p-y)*cos(b1*x1*x2+b2*x2)*x2))
}
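
Before optimizing, it is worth confirming the analytic derivative against a central-difference approximation (a minimal sketch: b1.test, b2.test, h, and the loss.at helper are introduced here purely for this check):

# Compare the analytic gradient in b1 with a numerical estimate
b1.test = 0.3
b2.test = 0.5
h = 1e-6

loss.at = function(b1, b2) {
  loss.fun(p = pred.fun(b1 = b1, b2 = b2, x1 = x1, x2 = x2), y = y)
}

p.test = pred.fun(b1 = b1.test, b2 = b2.test, x1 = x1, x2 = x2)
analytic.grad = differential.fun.b1(p = p.test, b1 = b1.test, b2 = b2.test, x1 = x1, x2 = x2, y = y)
numeric.grad = (loss.at(b1.test + h, b2.test) - loss.at(b1.test - h, b2.test)) / (2 * h)

analytic.grad - numeric.grad  # should be very close to 0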

Exercise 1 Answer (Code)

# Fit the sine-logistic model by (momentum) gradient descent and visualize the
# result: the left panel shows the fitted probability surface, the right panel
# the loss surface together with the optimization path
LOGISTIC_VIS = function (x1, x2, y, num.iteration = 100, lr = 0.1, momentum = 0.9, start_b1 = 0, start_b2 = 0) {

  ans_b1 = rep(start_b1, num.iteration)
  ans_b2 = rep(start_b2, num.iteration)
  ans_loss = rep(0, num.iteration)
  
  pred.fun = function(b1, b2, x1 = x1, x2 = x2) {
    lr = sin(b1*x1*x2+b2*x2)
    p = 1 / (1 + exp(-lr))
    return(p)
  }
  
  loss.fun = function(p, y = y, eps = 1e-8) {
    loss = -mean(y * log(p + eps) + (1 - y) * log(1 - p + eps))
    return(loss)
  }
  
  differential.fun.b1 = function(p, b1, b2, x1 = x1, x2 = x2, y = y) {
    return(mean((p-y)*cos(b1*x1*x2+b2*x2)*x1*x2))
  }
  
  differential.fun.b2 = function(p, b1, b2, x1 = x1, x2 = x2, y = y) {
    return(mean((p-y)*cos(b1*x1*x2+b2*x2)*x2))
  }
  
  # Momentum gradient descent on (b1, b2); current_p always holds the
  # predictions under the previous iteration's parameters
  for (i in 1:num.iteration) {
    if (i >= 2) {
      grad_b1 = differential.fun.b1(p = current_p, b1 = ans_b1[i-1], b2 = ans_b2[i-1], x1 = x1, x2 = x2, y = y)
      grad_b2 = differential.fun.b2(p = current_p, b1 = ans_b1[i-1], b2 = ans_b2[i-1], x1 = x1, x2 = x2, y = y)
      if (i == 2) {
        momentum_b1 = 0
        momentum_b2 = 0
      } else {
        momentum_b1 = momentum * (ans_b1[i-1] - ans_b1[i-2])
        momentum_b2 = momentum * (ans_b2[i-1] - ans_b2[i-2]) 
      }
      ans_b1[i] = ans_b1[i-1] - lr * grad_b1 + momentum_b1
      ans_b2[i] = ans_b2[i-1] - lr * grad_b2 + momentum_b2
    }
    current_p = pred.fun(b1 = ans_b1[i], b2 = ans_b2[i], x1 = x1, x2 = x2)
    ans_loss[i] = loss.fun(p = current_p, y = y)
  }
  
  par(mfrow = c(1, 2))
  
  require(scales)
  require(plot3D)
  
  # Left panel: fitted probability surface at the final (b1, b2), with the
  # observed data overlaid
  x1_seq = seq(min(x1), max(x1), length.out = 300)
  x2_seq = seq(min(x2), max(x2), length.out = 300)
  
  z_matrix = sapply(x2_seq, function(x) {pred.fun(b1 = ans_b1[num.iteration], b2 = ans_b2[num.iteration], x1 = x1_seq, x2 = x)})
  
  image2D(z = z_matrix, main = paste0('b1=', formatC(ans_b1[num.iteration], format = 'f', digits = 1), ';b2=', formatC(ans_b2[num.iteration], format = 'f', digits = 1)),
          x = x1_seq, xlab = 'x1',
          y = x2_seq, ylab = 'x2',
          shade = 0.2, rasterImage = TRUE,
          col = colorRampPalette(c("#FFA0A0", "#FFFFFF", "#A0A0FF"))(100))
  
  points(x1, x2, col = (y + 1)*2, pch = 19, cex = 0.5)
  
  require(grDevices) # for trans3d
  
  # Right panel: evaluate the loss surface over a grid of (b1, b2)
  b1_seq = seq(-0.5, 2, length.out = 50)
  b2_seq = seq(-0.5, 2, length.out = 50)
  LOSS = matrix(0, nrow = length(b1_seq), ncol = length(b2_seq))
  for (i in 1:length(b1_seq)) {
    for (j in 1:length(b2_seq)) {
      LOSS[i,j] = loss.fun(p = pred.fun(b1 = b1_seq[i], b2 = b2_seq[j], x1 = x1, x2 = x2), y = y, eps = 1e-8)
    }
  }
  
  persp(b1_seq, b2_seq, LOSS, theta = 120, phi = 50, expand = 0.8, col = "lightblue",
        ltheta = 120, shade = 0.75, ticktype = "detailed",
        xlab = "b1", ylab = "b2", zlab = "loss", main = paste0('loss=', formatC(ans_loss[num.iteration], format = 'f', digits = 4))) -> res
  
  # Overlay the optimization path: dark-red start point, bright-red end point
  lines(trans3d(ans_b1, ans_b2, ans_loss, pmat = res), col = '#AA0000', lwd = 1.5)
  points(trans3d(ans_b1[1], ans_b2[1], ans_loss[1], pmat = res), col = '#AA0000', cex = 1, pch = 16)
  points(trans3d(ans_b1[num.iteration], ans_b2[num.iteration], ans_loss[num.iteration], pmat = res), col = '#FF0000', cex = 1, pch = 16)
  
}

Exercise 1 Answer (Experimental Results)

LOGISTIC_VIS(x1 = x1, x2 = x2, y = y, num.iteration = 100, lr = 0.1, momentum = 0, start_b1 = 0, start_b2 = 0)
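
The call above uses momentum = 0, i.e. plain gradient descent. To see the effect of the momentum term on the same loss surface, you might rerun the visualization with momentum enabled and compare the two trajectories:

LOGISTIC_VIS(x1 = x1, x2 = x2, y = y, num.iteration = 100, lr = 0.1, momentum = 0.9, start_b1 = 0, start_b2 = 0)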