Deep Learning: Theory and Practice

林嶔 (Lin, Chin)

Lesson 3: Solutions to Overfitting

Introduction

– In the last lesson we introduced overfitting, and we found, surprisingly, that a complex neural network performed worse on the validation set than a simple elastic net.

– The full experimental workflow is to build models on the training set, use the validation set to select the best model, and then run that model once on the test set to obtain the final result.

– Our goal in developing a prediction function is not to predict as accurately as possible on the training set, but to be sufficiently accurate on the validation set.

Data Preprocessing

– Preprocessing is the same as in earlier lessons: we use various ECG parameters to predict LVD, and our objective is to push the AUC on the validation set as high as possible.

– For students taking the hands-on medical AI course, this is a subset of the data for your first assignment, the left ventricular dysfunction classification challenge; you can apply today's techniques directly to it.

library(mice)

dat <- read.csv("ECG_train.csv", header = TRUE, fileEncoding = 'CP950', stringsAsFactors = FALSE, na.strings = "")
subdat <- dat[!is.na(dat[,"LVD"]), c(-1, -2, -4, -5)]

subdat[,'GENDER'] <- as.factor(subdat[,'GENDER'])
for (i in 1:31) {subdat[,paste0('rhythm.', i)] <- as.factor(subdat[,paste0('rhythm.', i)])}

used_dat.x <- subdat[,-1]
mice_dat <- mice(used_dat.x, m = 1, maxit = 5, meth = 'cart', seed = 123, printFlag = FALSE)
impute_dat.x <- mice::complete(mice_dat, action = 1)

set.seed(0)
all_idx <- 1:nrow(subdat)

train_idx <- sample(all_idx, nrow(subdat) * 0.6)
valid_idx <- sample(all_idx[!all_idx %in% train_idx], nrow(subdat) * 0.2)
test_idx <- all_idx[!all_idx %in% c(train_idx, valid_idx)]

train_X <- impute_dat.x[train_idx,]
valid_X <- impute_dat.x[valid_idx,]
test_X <- impute_dat.x[test_idx,]

train_Y <- subdat[train_idx,"LVD"]
valid_Y <- subdat[valid_idx,"LVD"]
test_Y <- subdat[test_idx,"LVD"]
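
Before training, a quick sanity check on the split can be helpful (optional, not part of the original lesson): confirm the size and case rate of each set.

# Size and LVD case rate of the training, validation, and test sets
sapply(list(train = train_Y, valid = valid_Y, test = test_Y),
       function (y) c(n = length(y), case_rate = mean(y)))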

Section 1: Data Augmentation (1)

– It seems, then, that adding variation is quite important, and in our experiments we also found that doing so yields smoother decision boundaries.

– These techniques are collectively known as data augmentation; their methods are so varied that they could fill an entire semester. Here we only get a first taste, and many later lessons will return to augmentation and keep adding to this toolbox!
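
To make the idea concrete, here is a minimal sketch of the noise-injection augmentation used inside the trainer below; the matrix X, the noise level, and the per-column SD scaling are illustrative assumptions, not part of the original lesson.

# Jitter each numeric column by Gaussian noise proportional to its SD
set.seed(1)
noise <- 0.3                                  # hypothetical noise level
X <- matrix(rnorm(200), nrow = 50, ncol = 4)  # toy numeric feature matrix
sd.vec <- apply(X, 2, sd) * noise             # per-column noise SD
X_aug <- X + t(matrix(rnorm(length(X), sd = sd.vec), nrow = ncol(X)))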

Section 1: Data Augmentation (2)

– Because the number of input variables now exceeds two dimensions, we drop the visualization, but we add a "validation" option.

DEEP_MLP_Trainer = function (train_X, train_Y, valid_X = NULL, valid_Y = NULL, noise = 0,
                             num.iteration = 500, num.hidden = c(10, 10, 10), eps = 1e-8,
                             lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', batch_size = 20) {
  
  #Functions
  
  #Forward
  
  S.fun = function (x, eps = eps) {
    S = 1/(1 + exp(-x))
    S[S < eps] = eps
    S[S > 1 - eps] = 1 - eps
    return(S)
  }
  
  ReLU.fun = function (x) {
    x[x < 0] <- 0
    return(x)
  }
  
  L.fun = function (X, W) {
    X.E = cbind(1, X)
    L = X.E %*% W
    return(L)
  }
  
  CE.fun = function (o, y, eps = eps) {
    loss = -1/length(y) * sum(y * log(o + eps) + (1 - y) * log(1 - o + eps))
    return(loss)
  }
  
  #Backward
  
  grad_o.fun = function (o, y) {
    return((o - y)/(o*(1-o)))
  }
  
  grad_s.fun = function (grad_o, o) {
    return(grad_o*(o*(1-o)))
  }
  
  grad_W.fun = function (grad_l, h) {
    h.E = cbind(1, h)
    return(t(h.E) %*% grad_l/nrow(h))
  }
  
  grad_h.fun = function (grad_l, W) {
    return(grad_l %*% t(W[-1,]))
  }
  
  grad_l.fun = function (grad_h, l) {
    de_l = l
    de_l[de_l<0] = 0
    de_l[de_l>0] = 1
    return(grad_h*de_l)
  }
  
  # Noise
  
  sd.vec <- NULL
  
  for (k in 1:ncol(train_X)) {
    
    if (class(train_X[,k])[1] %in% c('numeric', 'integer')) {
      
      sd.val <- sd(train_X[,k])
      sd.vec <- c(sd.vec, sd.val * noise)
      
    } else {
      
      sd.vec <- c(sd.vec, 0L)
      
    }
    
  }
  
  #initialization
  
  train_X_mat <- model.matrix(~ ., data = train_X)
  train_X_mat <- train_X_mat[,-1]
  train_Y_mat <- t(t(train_Y))
  
  W_list = list()
  M_list = list()
  N_list = list()
  
  len_h = length(num.hidden)
  
  for (w_seq in 1:(len_h+1)) {
    if (w_seq == 1) {
      NROW_W = ncol(train_X_mat) + 1
      NCOL_W = num.hidden[w_seq]
    } else if (w_seq == len_h+1) {
      NROW_W = num.hidden[w_seq - 1] + 1
      NCOL_W = ncol(train_Y_mat)
    } else {
      NROW_W = num.hidden[w_seq - 1] + 1
      NCOL_W = num.hidden[w_seq]
    }
    W_list[[w_seq]] = matrix(rnorm(NROW_W*NCOL_W, sd = 1), nrow = NROW_W, ncol = NCOL_W)
    M_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
    N_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
  }
  
  loss_seq = rep(0, num.iteration)
  
  #Calculating
  
  for (i in 1:num.iteration) {
    
    idx = sample(1:nrow(train_X_mat), batch_size)
    noise_mat = t(matrix(rnorm(batch_size * length(sd.vec), sd = sd.vec), nrow = length(sd.vec)))
    sub_X_mat = train_X_mat[idx,] + noise_mat
    
    #Forward
    
    current_l_list = list()
    current_h_list = list()
    
    for (j in 1:len_h) {
      if (j == 1) {
        current_l_list[[j]] = L.fun(X = sub_X_mat, W = W_list[[j]])
      } else {
        current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = W_list[[j]])
      }
      current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
    }
    current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = W_list[[len_h+1]])
    current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
    loss_seq[i] = CE.fun(o = current_o, y = train_Y_mat[idx,], eps = eps)
    
    #Backward
    
    current_grad_l_list = list()
    current_grad_W_list = list()
    current_grad_h_list = list()
    
    current_grad_o = grad_o.fun(o = current_o, y = train_Y_mat[idx,])
    current_grad_l_list[[len_h+1]] = grad_s.fun(grad_o = current_grad_o, o = current_o)
    current_grad_W_list[[len_h+1]] = grad_W.fun(grad_l = current_grad_l_list[[len_h+1]], h = current_h_list[[len_h]])
    
    for (j in len_h:1) {
      current_grad_h_list[[j]] = grad_h.fun(grad_l = current_grad_l_list[[j+1]], W = W_list[[j+1]])
      current_grad_l_list[[j]] = grad_l.fun(grad_h = current_grad_h_list[[j]], l = current_l_list[[j]])
      if (j != 1) {
        current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = current_h_list[[j - 1]])
      } else {
        current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = sub_X_mat)
      }
    }
    
    if (optimizer == 'adam') {
      
      for (j in 1:(len_h+1)) {
        M_list[[j]] = beta1 * M_list[[j]] + (1 - beta1) * current_grad_W_list[[j]]
        N_list[[j]] = beta2 * N_list[[j]] + (1 - beta2) * current_grad_W_list[[j]]^2
        M.hat = M_list[[j]]/(1 - beta1^i)
        N.hat = N_list[[j]]/(1 - beta2^i)
        W_list[[j]] = W_list[[j]] - lr*M.hat/sqrt(N.hat+eps)
      }
      
    } else if (optimizer == 'sgd') {
      
      for (j in 1:(len_h+1)) {
        M_list[[j]] = beta1 * M_list[[j]] + lr * current_grad_W_list[[j]]
        W_list[[j]] = W_list[[j]] - M_list[[j]]
      }
      
    } else {
      stop('optimizer must be selected from "sgd" or "adam".')
    }
    
  }
  
  pre_func = function (new_X, w_list = W_list) {
    
    current_l_list = list()
    current_h_list = list()
    
    for (j in 1:len_h) {
      if (j == 1) {
        current_l_list[[j]] = L.fun(X = new_X, W = w_list[[j]])
      } else {
        current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = w_list[[j]])
      }
      current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
    }
    
    current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = w_list[[len_h+1]])
    current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
    
    return(current_o)
    
  }
  
  require(pROC)
  
  pred_y = pre_func(new_X = train_X_mat)
    
  roc_train <- roc(train_Y ~ pred_y)
  plot(roc_train, col = 'red')
  text(0.5, 0.5, paste0('AUC = ', formatC(roc_train[['auc']], 4, format = 'f')), col = 'red')
  
  if (!is.null(valid_X)) {
    
    valid_X_mat <- model.matrix(~ ., data = valid_X)
    valid_X_mat <- valid_X_mat[,-1]
    
    pred_y = pre_func(new_X = valid_X_mat)
    
    roc_valid <- roc(valid_Y ~ pred_y)
    plot(roc_valid, col = 'blue', add = TRUE)
    text(0.5, 0.4, paste0('AUC = ', formatC(roc_valid[['auc']], 4, format = 'f')), col = 'blue')
    legend('bottomright', c('train', 'valid'), col = c('red', 'blue'), lwd = 1)
    
  }
  
  return(list(pre_func = pre_func, W_list = W_list))
  
}

Section 1: Data Augmentation (3)

DNN_list <- DEEP_MLP_Trainer(train_X = train_X, train_Y = train_Y, valid_X = valid_X, valid_Y = valid_Y, noise = 0, 
                             num.iteration = 10000, num.hidden = 30, eps = 1e-8,
                             lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', batch_size = 100)

DNN_list <- DEEP_MLP_Trainer(train_X = train_X, train_Y = train_Y, valid_X = valid_X, valid_Y = valid_Y, noise = 0, 
                             num.iteration = 10000, num.hidden = 300, eps = 1e-8,
                             lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', batch_size = 100)

Section 1: Data Augmentation (4)

DNN_list <- DEEP_MLP_Trainer(train_X = train_X, train_Y = train_Y, valid_X = valid_X, valid_Y = valid_Y, noise = 0.3, 
                             num.iteration = 10000, num.hidden = 30, eps = 1e-8,
                             lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', batch_size = 100)

DNN_list <- DEEP_MLP_Trainer(train_X = train_X, train_Y = train_Y, valid_X = valid_X, valid_Y = valid_Y, noise = 0.3, 
                             num.iteration = 10000, num.hidden = 300, eps = 1e-8,
                             lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', batch_size = 100)

Section 1: Data Augmentation (5)

– For example, if the best Rate cut-point for separating LVD from non-LVD in the original data is 70, augmentation makes 69 and 71 far less different from each other, which smooths the decision boundary.

– And a random forest builds each new tree on a random subsample drawn from the full sample; isn't that exactly the mini-batch idea we covered last lesson?
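
As a point of comparison, here is a minimal random-forest baseline on the same split (not part of the original lesson; the randomForest package and the ntree and sampsize values are illustrative assumptions):

library(randomForest)
library(pROC)

# Each tree is grown on a random subsample of 100 observations,
# loosely mirroring the mini-batch idea
rf_dat <- data.frame(LVD = as.factor(train_Y), train_X)
rf_model <- randomForest(LVD ~ ., data = rf_dat, ntree = 500, sampsize = 100)

# Predicted probability of LVD on the validation set
rf_pred <- predict(rf_model, newdata = valid_X, type = 'prob')[,2]
roc(valid_Y ~ rf_pred)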

DNN_list <- DEEP_MLP_Trainer(train_X = train_X, train_Y = train_Y, valid_X = valid_X, valid_Y = valid_Y, noise = 0, 
                             num.iteration = 10000, num.hidden = 300, eps = 1e-8,
                             lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', batch_size = 20)

(Figure F01)

Exercise 1: The Class Imbalance Problem

– Of course, since we have chosen to evaluate with ROC curves, the metric itself is not a problem; but as the batch size shrinks, isn't it possible that an entire batch contains no cases at all? (The sketch after this list quantifies that risk.)

– Therefore, to ensure that every batch contains the same number of cases and controls, we usually adopt an "oversampling" strategy; try implementing it yourself.
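
A quick back-of-the-envelope check of the risk raised above (illustrative; it treats the draws as independent, while sample() draws without replacement, so the binomial value is only an approximation):

# Approximate probability that a batch of 20 contains zero cases
p_case <- mean(train_Y)
dbinom(0, size = 20, prob = p_case)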

Exercise 1 Answer (1)

DEEP_MLP_Trainer = function (train_X, train_Y, valid_X = NULL, valid_Y = NULL, noise = 0, oversampling = TRUE,
                             num.iteration = 500, num.hidden = c(10, 10, 10), eps = 1e-8,
                             lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', batch_size = 20) {
  
  #Functions
  
  #Forward
  
  S.fun = function (x, eps = eps) {
    S = 1/(1 + exp(-x))
    S[S < eps] = eps
    S[S > 1 - eps] = 1 - eps
    return(S)
  }
  
  ReLU.fun = function (x) {
    x[x < 0] <- 0
    return(x)
  }
  
  L.fun = function (X, W) {
    X.E = cbind(1, X)
    L = X.E %*% W
    return(L)
  }
  
  CE.fun = function (o, y, eps = eps) {
    loss = -1/length(y) * sum(y * log(o + eps) + (1 - y) * log(1 - o + eps))
    return(loss)
  }
  
  #Backward
  
  grad_o.fun = function (o, y) {
    return((o - y)/(o*(1-o)))
  }
  
  grad_s.fun = function (grad_o, o) {
    return(grad_o*(o*(1-o)))
  }
  
  grad_W.fun = function (grad_l, h) {
    h.E = cbind(1, h)
    return(t(h.E) %*% grad_l/nrow(h))
  }
  
  grad_h.fun = function (grad_l, W) {
    return(grad_l %*% t(W[-1,]))
  }
  
  grad_l.fun = function (grad_h, l) {
    de_l = l
    de_l[de_l<0] = 0
    de_l[de_l>0] = 1
    return(grad_h*de_l)
  }
  
  # Noise
  
  sd.vec <- NULL
  
  for (k in 1:ncol(train_X)) {
    
    if (class(train_X[,k])[1] %in% c('numeric', 'integer')) {
      
      sd.val <- sd(train_X[,k])
      sd.vec <- c(sd.vec, sd.val * noise)
      
    } else {
      
      sd.vec <- c(sd.vec, 0L)
      
    }
    
  }
  
  #initialization
  
  train_X_mat <- model.matrix(~ ., data = train_X)
  train_X_mat <- train_X_mat[,-1]
  train_Y_mat <- t(t(train_Y))
  
  W_list = list()
  M_list = list()
  N_list = list()
  
  len_h = length(num.hidden)
  
  for (w_seq in 1:(len_h+1)) {
    if (w_seq == 1) {
      NROW_W = ncol(train_X_mat) + 1
      NCOL_W = num.hidden[w_seq]
    } else if (w_seq == len_h+1) {
      NROW_W = num.hidden[w_seq - 1] + 1
      NCOL_W = ncol(train_Y_mat)
    } else {
      NROW_W = num.hidden[w_seq - 1] + 1
      NCOL_W = num.hidden[w_seq]
    }
    W_list[[w_seq]] = matrix(rnorm(NROW_W*NCOL_W, sd = 1), nrow = NROW_W, ncol = NCOL_W)
    M_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
    N_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
  }
  
  loss_seq = rep(0, num.iteration)
  
  #Calculating
  
  for (i in 1:num.iteration) {
    
    if (oversampling) {
      
      # Balanced mini-batch: draw half the batch from cases and half from controls
      idx.pos = sample(which(train_Y == 1), batch_size / 2)
      idx.neg = sample(which(train_Y == 0), batch_size / 2)
      idx = c(idx.pos, idx.neg)
      
    } else {
      
      idx = sample(1:nrow(train_X_mat), batch_size)
      
    }
    
    noise_mat = t(matrix(rnorm(batch_size * length(sd.vec), sd = sd.vec), nrow = length(sd.vec)))
    sub_X_mat = train_X_mat[idx,] + noise_mat
    
    #Forward
    
    current_l_list = list()
    current_h_list = list()
    
    for (j in 1:len_h) {
      if (j == 1) {
        current_l_list[[j]] = L.fun(X = sub_X_mat, W = W_list[[j]])
      } else {
        current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = W_list[[j]])
      }
      current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
    }
    current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = W_list[[len_h+1]])
    current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
    loss_seq[i] = CE.fun(o = current_o, y = train_Y_mat[idx,], eps = eps)
    
    #Backward
    
    current_grad_l_list = list()
    current_grad_W_list = list()
    current_grad_h_list = list()
    
    current_grad_o = grad_o.fun(o = current_o, y = train_Y_mat[idx,])
    current_grad_l_list[[len_h+1]] = grad_s.fun(grad_o = current_grad_o, o = current_o)
    current_grad_W_list[[len_h+1]] = grad_W.fun(grad_l = current_grad_l_list[[len_h+1]], h = current_h_list[[len_h]])
    
    for (j in len_h:1) {
      current_grad_h_list[[j]] = grad_h.fun(grad_l = current_grad_l_list[[j+1]], W = W_list[[j+1]])
      current_grad_l_list[[j]] = grad_l.fun(grad_h = current_grad_h_list[[j]], l = current_l_list[[j]])
      if (j != 1) {
        current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = current_h_list[[j - 1]])
      } else {
        current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = sub_X_mat)
      }
    }
    
    if (optimizer == 'adam') {
      
      for (j in 1:(len_h+1)) {
        M_list[[j]] = beta1 * M_list[[j]] + (1 - beta1) * current_grad_W_list[[j]]
        N_list[[j]] = beta2 * N_list[[j]] + (1 - beta2) * current_grad_W_list[[j]]^2
        M.hat = M_list[[j]]/(1 - beta1^i)
        N.hat = N_list[[j]]/(1 - beta2^i)
        W_list[[j]] = W_list[[j]] - lr*M.hat/sqrt(N.hat+eps)
      }
      
    } else if (optimizer == 'sgd') {
      
      for (j in 1:(len_h+1)) {
        M_list[[j]] = beta1 * M_list[[j]] + lr * current_grad_W_list[[j]]
        W_list[[j]] = W_list[[j]] - M_list[[j]]
      }
      
    } else {
      stop('optimizer must be selected from "sgd" or "adam".')
    }
    
  }
  
  pre_func = function (new_X, w_list = W_list) {
    
    current_l_list = list()
    current_h_list = list()
    
    for (j in 1:len_h) {
      if (j == 1) {
        current_l_list[[j]] = L.fun(X = new_X, W = w_list[[j]])
      } else {
        current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = w_list[[j]])
      }
      current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
    }
    
    current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = w_list[[len_h+1]])
    current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
    
    return(current_o)
    
  }
  
  require(pROC)
  
  pred_y = pre_func(new_X = train_X_mat)
    
  roc_train <- roc(train_Y ~ pred_y)
  plot(roc_train, col = 'red')
  text(0.5, 0.5, paste0('AUC = ', formatC(roc_train[['auc']], 4, format = 'f')), col = 'red')
  
  if (!is.null(valid_X)) {
    
    valid_X_mat <- model.matrix(~ ., data = valid_X)
    valid_X_mat <- valid_X_mat[,-1]
    
    pred_y = pre_func(new_X = valid_X_mat)
    
    roc_valid <- roc(valid_Y ~ pred_y)
    plot(roc_valid, col = 'blue', add = TRUE)
    text(0.5, 0.4, paste0('AUC = ', formatC(roc_valid[['auc']], 4, format = 'f')), col = 'blue')
    legend('bottomright', c('train', 'valid'), col = c('red', 'blue'), lwd = 1)
    
  }
  
  return(list(pre_func = pre_func, W_list = W_list))
  
}

Exercise 1 Answer (2)

DNN_list <- DEEP_MLP_Trainer(train_X = train_X, train_Y = train_Y, valid_X = valid_X, valid_Y = valid_Y,
                             noise = 0.3, oversampling = TRUE, 
                             num.iteration = 10000, num.hidden = 300, eps = 1e-8,
                             lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', batch_size = 20)

Section 2: Regularization (1)

– Regularization is a powerful weapon against overfitting, and it is very easy to use: we simply add to the original loss an extra loss term that penalizes the magnitude of the weights.

– Click here if you would like a review.

  1. L1 Regularization - representative technique: LASSO regression

\[ \begin{align} \hat{loss_1} & = loss + \lambda \times ||W|| \end{align} \]

  2. L2 Regularization - representative technique: ridge regression

\[ \begin{align} \hat{loss_2} & = loss + \frac{\lambda}{2} \times||W||^2 \end{align} \]

Section 2: Regularization (2)

\[ \begin{align} \frac{\partial}{\partial W} \hat{loss_1} & = grad.W + \lambda \frac{W}{|W|} \\ \frac{\partial}{\partial W} \hat{loss_2} & = grad.W + \lambda W \end{align} \]

– Clearly, when \(\lambda\) is large, every weight will find it hard to move away from 0.
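
To make the two penalties and their gradients concrete, here is a minimal R sketch (illustrative only; W and lambda are toy values, not from the lesson):

set.seed(1)
lambda <- 0.001
W <- matrix(rnorm(6), nrow = 3)

L1_penalty <- lambda * sum(abs(W))   # lambda * ||W||
L2_penalty <- lambda / 2 * sum(W^2)  # (lambda / 2) * ||W||^2

grad_L1 <- lambda * sign(W)          # lambda * W / |W|
grad_L2 <- lambda * W                # lambda * W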

Exercise 2: Introducing Regularization into the Deep Neural Network

– Here we use the notation \(||W||^2\) for the squared norm of the vector (or matrix) \(W\), i.e., the sum of its squared elements.

\[ \begin{align} l_1 & = L^1_d(x^E,W^1_d) \\ h_1 & = ReLU(l_1) \\ l_2 & = L^2_1(h_1^E,W^2_1) \\ o & = S(l_2) \\\\ loss & = CE(y, o) + L2Reg(W) \\ & = -\left(y \cdot log(o) + (1-y) \cdot log(1-o)\right) + \frac{\lambda}{2} ||W^2_1||^2 + \frac{\lambda}{2} ||W^1_d||^2 \end{align} \]

\[ \begin{align} grad.o & = \frac{\partial}{\partial o}loss = \frac{o-y}{o(1-o)} \\ grad.l_2 & = \frac{\partial}{\partial l_2}loss = grad.o \otimes \frac{\partial}{\partial l_2}o= o-y \\ grad.W^2_1 & = \frac{\partial}{\partial W^2_1}loss = grad.l_2 \otimes \frac{\partial}{\partial W^2_1}l_2 + \frac{\partial}{\partial W^2_1}\frac{\lambda}{2}||W^2_1||^2 = \frac{{1}}{n} \otimes (h_1^E)^T \bullet grad.l_2 + \lambda W^2_1\\ grad.h_1^E & = \frac{\partial}{\partial h_1^E}loss = grad.l_2 \otimes \frac{\partial}{\partial h_1^E}l_2 = grad.l_2 \bullet (W^2_1)^T \\ grad.l_1 & = \frac{\partial}{\partial l_1}loss = grad.h_1 \otimes \frac{\partial}{\partial l_1}h_1 = grad.h_1 \otimes \frac{\partial}{\partial l_1}ReLU(l_1) \\ grad.W^1_d & = \frac{\partial}{\partial W^1_d}loss = grad.l_1 \otimes \frac{\partial}{\partial W^1_d}l_1 + \frac{\partial}{\partial W^1_d}\frac{\lambda}{2}||W^1_d||^2 = \frac{{1}}{n} \otimes (x^E)^T \bullet grad.l_1 + \lambda W^1_d \end{align} \]

– Note that since the computed gradient is usually multiplied by a learning rate \(lr\), the effective decay rate would become \(lr \times \lambda\). To avoid this inconvenience in practice, we usually separate this term out so that it is not multiplied by the learning rate, keeping its influence fixed.
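
A minimal sketch of the difference (toy values; grad_W stands in for the backpropagated gradient and is not from the lesson):

lr <- 0.001; lambda <- 0.001
W <- matrix(rnorm(6), nrow = 3)       # toy weight matrix
grad_W <- matrix(rnorm(6), nrow = 3)  # stand-in for the backprop gradient

W_coupled   <- W - lr * (grad_W + lambda * W)  # decay strength becomes lr * lambda
W_decoupled <- W - lr * grad_W - lambda * W    # decay strength stays lambda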

Exercise 2 Answer (1)

DEEP_MLP_Trainer = function (train_X, train_Y, valid_X = NULL, valid_Y = NULL,
                             noise = 0, oversampling = TRUE, lambda = 0.001,
                             num.iteration = 500, num.hidden = c(10, 10, 10), eps = 1e-8,
                             lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', batch_size = 20) {
  
  #Functions
  
  #Forward
  
  S.fun = function (x, eps = eps) {
    S = 1/(1 + exp(-x))
    S[S < eps] = eps
    S[S > 1 - eps] = 1 - eps
    return(S)
  }
  
  ReLU.fun = function (x) {
    x[x < 0] <- 0
    return(x)
  }
  
  L.fun = function (X, W) {
    X.E = cbind(1, X)
    L = X.E %*% W
    return(L)
  }
  
  CE.fun = function (o, y, eps = eps) {
    loss = -1/length(y) * sum(y * log(o + eps) + (1 - y) * log(1 - o + eps))
    return(loss)
  }
  
  #Backward
  
  grad_o.fun = function (o, y) {
    return((o - y)/(o*(1-o)))
  }
  
  grad_s.fun = function (grad_o, o) {
    return(grad_o*(o*(1-o)))
  }
  
  grad_W.fun = function (grad_l, h) {
    h.E = cbind(1, h)
    return(t(h.E) %*% grad_l/nrow(h))
  }
  
  grad_h.fun = function (grad_l, W) {
    return(grad_l %*% t(W[-1,]))
  }
  
  grad_l.fun = function (grad_h, l) {
    de_l = l
    de_l[de_l<0] = 0
    de_l[de_l>0] = 1
    return(grad_h*de_l)
  }
  
  # Noise
  
  sd.vec <- NULL
  
  for (k in 1:ncol(train_X)) {
    
    if (class(train_X[,k])[1] %in% c('numeric', 'integer')) {
      
      sd.val <- sd(train_X[,k])
      sd.vec <- c(sd.vec, sd.val * noise)
      
    } else {
      
      sd.vec <- c(sd.vec, 0L)
      
    }
    
  }
  
  #initialization
  
  train_X_mat <- model.matrix(~ ., data = train_X)
  train_X_mat <- train_X_mat[,-1]
  train_Y_mat <- t(t(train_Y))
  
  W_list = list()
  M_list = list()
  N_list = list()
  
  len_h = length(num.hidden)
  
  for (w_seq in 1:(len_h+1)) {
    if (w_seq == 1) {
      NROW_W = ncol(train_X_mat) + 1
      NCOL_W = num.hidden[w_seq]
    } else if (w_seq == len_h+1) {
      NROW_W = num.hidden[w_seq - 1] + 1
      NCOL_W = ncol(train_Y_mat)
    } else {
      NROW_W = num.hidden[w_seq - 1] + 1
      NCOL_W = num.hidden[w_seq]
    }
    W_list[[w_seq]] = matrix(rnorm(NROW_W*NCOL_W, sd = 1), nrow = NROW_W, ncol = NCOL_W)
    M_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
    N_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
  }
  
  loss_seq = rep(0, num.iteration)
  
  #Calculating
  
  for (i in 1:num.iteration) {
    
    if (oversampling) {
      
      # Balanced mini-batch: draw half the batch from cases and half from controls
      idx.pos = sample(which(train_Y == 1), batch_size / 2)
      idx.neg = sample(which(train_Y == 0), batch_size / 2)
      idx = c(idx.pos, idx.neg)
      
    } else {
      
      idx = sample(1:nrow(train_X_mat), batch_size)
      
    }
    
    noise_mat = t(matrix(rnorm(batch_size * length(sd.vec), sd = sd.vec), nrow = length(sd.vec)))
    sub_X_mat = train_X_mat[idx,] + noise_mat
    
    #Forward
    
    current_l_list = list()
    current_h_list = list()
    
    for (j in 1:len_h) {
      if (j == 1) {
        current_l_list[[j]] = L.fun(X = sub_X_mat, W = W_list[[j]])
      } else {
        current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = W_list[[j]])
      }
      current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
    }
    current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = W_list[[len_h+1]])
    current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
    loss_seq[i] = CE.fun(o = current_o, y = train_Y_mat[idx,], eps = eps)
    
    #Backward
    
    current_grad_l_list = list()
    current_grad_W_list = list()
    current_grad_h_list = list()
    
    current_grad_o = grad_o.fun(o = current_o, y = train_Y_mat[idx,])
    current_grad_l_list[[len_h+1]] = grad_s.fun(grad_o = current_grad_o, o = current_o)
    current_grad_W_list[[len_h+1]] = grad_W.fun(grad_l = current_grad_l_list[[len_h+1]], h = current_h_list[[len_h]])
    
    for (j in len_h:1) {
      current_grad_h_list[[j]] = grad_h.fun(grad_l = current_grad_l_list[[j+1]], W = W_list[[j+1]])
      current_grad_l_list[[j]] = grad_l.fun(grad_h = current_grad_h_list[[j]], l = current_l_list[[j]])
      if (j != 1) {
        current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = current_h_list[[j - 1]])
      } else {
        current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = sub_X_mat)
      }
    }
    
    if (optimizer == 'adam') {
      
      for (j in 1:(len_h+1)) {
        M_list[[j]] = beta1 * M_list[[j]] + (1 - beta1) * current_grad_W_list[[j]]
        N_list[[j]] = beta2 * N_list[[j]] + (1 - beta2) * current_grad_W_list[[j]]^2
        M.hat = M_list[[j]]/(1 - beta1^i)
        N.hat = N_list[[j]]/(1 - beta2^i)
        # Decoupled weight decay: the lambda term is applied directly, not scaled by lr
        W_list[[j]] = W_list[[j]] - lr * M.hat / sqrt(N.hat+eps) - lambda * W_list[[j]]
      }
      
    } else if (optimizer == 'sgd') {
      
      for (j in 1:(len_h+1)) {
        M_list[[j]] = beta1 * M_list[[j]] + lr * current_grad_W_list[[j]]
        W_list[[j]] = W_list[[j]] - M_list[[j]] - lambda * W_list[[j]]
      }
      
    } else {
      stop('optimizer must be selected from "sgd" or "adam".')
    }
    
  }
  
  pre_func = function (new_X, w_list = W_list) {
    
    current_l_list = list()
    current_h_list = list()
    
    for (j in 1:len_h) {
      if (j == 1) {
        current_l_list[[j]] = L.fun(X = new_X, W = w_list[[j]])
      } else {
        current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = w_list[[j]])
      }
      current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
    }
    
    current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = w_list[[len_h+1]])
    current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
    
    return(current_o)
    
  }
  
  require(pROC)
  
  pred_y = pre_func(new_X = train_X_mat)
    
  roc_train <- roc(train_Y ~ pred_y)
  plot(roc_train, col = 'red')
  text(0.5, 0.5, paste0('AUC = ', formatC(roc_train[['auc']], 4, format = 'f')), col = 'red')
  
  if (!is.null(valid_X)) {
    
    valid_X_mat <- model.matrix(~ ., data = valid_X)
    valid_X_mat <- valid_X_mat[,-1]
    
    pred_y = pre_func(new_X = valid_X_mat)
    
    roc_valid <- roc(valid_Y ~ pred_y)
    plot(roc_valid, col = 'blue', add = TRUE)
    text(0.5, 0.4, paste0('AUC = ', formatC(roc_valid[['auc']], 4, format = 'f')), col = 'blue')
    legend('bottomright', c('train', 'valid'), col = c('red', 'blue'), lwd = 1)
    
  }
  
  return(list(pre_func = pre_func, W_list = W_list))
  
}

Exercise 2 Answer (2)

DNN_list <- DEEP_MLP_Trainer(train_X = train_X, train_Y = train_Y, valid_X = valid_X, valid_Y = valid_Y,
                             noise = 0.3, oversampling = TRUE, lambda = 0.001,
                             num.iteration = 10000, num.hidden = 300, eps = 1e-8,
                             lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', batch_size = 20)

Section 3: Dropout (1)

– Let us return once more to 2012, when Alex Krizhevsky, Ilya Sutskever, and Geoffrey Hinton of the University of Toronto took the championship with the neural network AlexNet. Was everyone else in the world simply too foolish to think of this approach? In fact, a breakthrough of that magnitude could only come from solving several long-standing problems at the same time!