林嶔 (Lin, Chin)
Lesson 3 過度擬合的解決方案
– 當時我們就提到了過度擬合(overfitting),而我們當時發現複雜的神經網路在「驗證組」的表現居然沒有簡單的彈性網路來的好。
– 整個實驗的流程應該是利用「訓練組」建立模型,透過「驗證組」選擇最佳模型後,在「測試組」驗證一次獲得最終結果。
– 我們發展預測函數的目標並不是在「訓練組」上預測的多準,而是希望能在「驗證組」上足夠準確。
– 前處理的過程與之前的課程相同,讓我們用不同的心電圖參數來預測「LVD」。我們的目標是希望在「驗證組」上盡可能提升它的AUC。
– 對於正在修習醫療人工智慧實作課程的同學來說,這就是你的第一個任務——「左心室功能障礙分類挑戰」的子資料,你可以將今天學到的技巧用到上面去。
# Load the ECG training data (Big5 / CP950 encoded), keep only rows with an
# observed LVD outcome, drop ID/unused columns, convert categorical columns to
# factors, impute missing predictors with CART-based MICE, and split the rows
# 60/20/20 into training / validation / test sets.
library(mice)
dat <- read.csv("ECG_train.csv", header = TRUE, fileEncoding = 'CP950', stringsAsFactors = FALSE, na.strings = "")
# Keep rows where the outcome "LVD" is observed (fix: use is.na() instead of
# the unidiomatic `%in% NA`); drop columns 1, 2, 4, 5 (identifiers / unused).
subdat <- dat[!is.na(dat[,"LVD"]), c(-1, -2, -4, -5)]
subdat[,'GENDER'] <- as.factor(subdat[,'GENDER'])
# The 31 rhythm indicator columns are categorical, not numeric.
for (i in 1:31) {subdat[,paste0('rhythm.', i)] <- as.factor(subdat[,paste0('rhythm.', i)])}
# Predictors only: column 1 of subdat is the outcome LVD.
used_dat.x <- subdat[,-1]
# Single imputation (m = 1) via CART; fixed seed for reproducibility.
mice_dat <- mice(used_dat.x, m = 1, maxit = 5, meth = 'cart', seed = 123, printFlag = FALSE)
# Fix: complete() is exported from mice, so use `::` rather than the
# internal-access operator `:::`.
impute_dat.x <- mice::complete(mice_dat, action = 1)
# 60/20/20 split; sample() truncates the fractional sizes toward zero.
set.seed(0)
all_idx <- 1:nrow(subdat)
train_idx <- sample(all_idx, nrow(subdat) * 0.6)
valid_idx <- sample(all_idx[!all_idx %in% train_idx], nrow(subdat) * 0.2)
test_idx <- all_idx[!all_idx %in% c(train_idx, valid_idx)]
train_X <- impute_dat.x[train_idx,]
valid_X <- impute_dat.x[valid_idx,]
test_X <- impute_dat.x[test_idx,]
train_Y <- subdat[train_idx,"LVD"]
valid_Y <- subdat[valid_idx,"LVD"]
test_Y <- subdat[test_idx,"LVD"]
– 這樣看起來,增加變異是一件相當重要的事情,並且在實驗中我們還發現這麼做可以給出較為平滑的預測邊界。
– 這個手法統稱為資料擴增(data augmentation),其方法及種類的多樣性足夠上一整學期的課。我們這裡先小試身手,之後的課程還有許多部份同樣涉及資料擴增的部分會再持續補充!
– 由於變數數目超過了2維,我們取消了視覺化的呈現,但加上了「validation」的選項
# Train a fully-connected MLP (ReLU hidden layers, sigmoid output) for binary
# classification with mini-batch gradient descent. Optionally perturbs each
# mini-batch with Gaussian noise (data augmentation) and plots ROC / AUC for
# the training set and, if supplied, the validation set.
#
# train_X, train_Y : predictor data.frame and 0/1 outcome vector.
# valid_X, valid_Y : optional validation predictors / outcome.
# noise            : per-column noise SD = noise * sd(column) for numeric
#                    columns; factor columns get SD 0 (no noise on dummies).
# num.iteration    : number of mini-batch updates.
# num.hidden       : vector of hidden-layer widths.
# eps              : numerical-stability constant (sigmoid clipping, CE, Adam).
# lr, beta1, beta2 : learning rate; Adam moment decays (sgd uses beta1 as the
#                    momentum coefficient).
# optimizer        : 'adam' or 'sgd' (momentum SGD).
# batch_size       : rows per mini-batch.
# Returns list(pre_func = prediction closure, W_list = trained weights).
DEEP_MLP_Trainer = function (train_X, train_Y, valid_X = NULL, valid_Y = NULL, noise = 0,
                             num.iteration = 500, num.hidden = c(10, 10, 10), eps = 1e-8,
                             lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', batch_size = 20) {
  # Functions
  # Forward
  # Sigmoid clipped to [eps, 1 - eps] so log(o) / log(1 - o) stay finite.
  # (Fix: the original default `eps = eps` was self-referential and would
  # raise "promise already under evaluation" if the argument were omitted;
  # eps is now a required argument — every internal call passes it.)
  S.fun = function (x, eps) {
    S = 1/(1 + exp(-x))
    S[S < eps] = eps
    S[S > 1 - eps] = 1 - eps
    return(S)
  }
  ReLU.fun = function (x) {
    x[x < 0] <- 0
    return(x)
  }
  # Affine layer: prepend an intercept column, then multiply by the weights.
  L.fun = function (X, W) {
    X.E = cbind(1, X)
    L = X.E %*% W
    return(L)
  }
  # Mean binary cross-entropy. (Same self-referential-default fix as S.fun.)
  CE.fun = function (o, y, eps) {
    loss = -1/length(y) * sum(y * log(o + eps) + (1 - y) * log(1 - o + eps))
    return(loss)
  }
  # Backward
  # d(loss)/d(o); combined with grad_s.fun below this reduces to o - y.
  grad_o.fun = function (o, y) {
    return((o - y)/(o*(1-o)))
  }
  grad_s.fun = function (grad_o, o) {
    return(grad_o*(o*(1-o)))
  }
  # Weight gradient averaged over the mini-batch (h gains an intercept column).
  grad_W.fun = function (grad_l, h) {
    h.E = cbind(1, h)
    return(t(h.E) %*% grad_l/nrow(h))
  }
  # Gradient w.r.t. the layer input; the intercept row of W is dropped.
  grad_h.fun = function (grad_l, W) {
    return(grad_l %*% t(W[-1,]))
  }
  # ReLU derivative: 1 where the pre-activation was positive, 0 otherwise.
  grad_l.fun = function (grad_h, l) {
    de_l = l
    de_l[de_l<0] = 0
    de_l[de_l>0] = 1
    return(grad_h*de_l)
  }
  # Noise
  # Per-column noise SDs, aligned with the columns of train_X.
  # NOTE(review): the noise matrix below is added to model.matrix() output,
  # so this alignment only holds when every factor expands to exactly one
  # dummy column — verify if a factor with > 2 levels is ever used.
  sd.vec <- NULL
  for (k in 1:ncol(train_X)) {
    if (class(train_X[,k])[1] %in% c('numeric', 'integer')) {
      sd.val <- sd(train_X[,k])
      sd.vec <- c(sd.vec, sd.val * noise)
    } else {
      sd.vec <- c(sd.vec, 0L)
    }
  }
  #initialization
  # Design matrix without the intercept column (L.fun adds its own).
  train_X_mat <- model.matrix(~ ., data = train_X)
  train_X_mat <- train_X_mat[,-1]
  # t(t(y)) turns the outcome vector into a one-column matrix.
  train_Y_mat <- t(t(train_Y))
  W_list = list()
  M_list = list()   # first-moment (Adam) / momentum (sgd) accumulators
  N_list = list()   # second-moment accumulators (Adam only)
  len_h = length(num.hidden)
  # One weight matrix per layer; +1 row everywhere for the intercept.
  for (w_seq in 1:(len_h+1)) {
    if (w_seq == 1) {
      NROW_W = ncol(train_X_mat) + 1
      NCOL_W = num.hidden[w_seq]
    } else if (w_seq == len_h+1) {
      NROW_W = num.hidden[w_seq - 1] + 1
      NCOL_W = ncol(train_Y_mat)
    } else {
      NROW_W = num.hidden[w_seq - 1] + 1
      NCOL_W = num.hidden[w_seq]
    }
    W_list[[w_seq]] = matrix(rnorm(NROW_W*NCOL_W, sd = 1), nrow = NROW_W, ncol = NCOL_W)
    M_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
    N_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
  }
  # Per-iteration training loss (kept for inspection; not returned).
  loss_seq = rep(0, num.iteration)
  #Calculating
  for (i in 1:num.iteration) {
    # Draw a mini-batch and add the (possibly zero) augmentation noise.
    idx = sample(1:nrow(train_X_mat), batch_size)
    noise_mat = t(matrix(rnorm(batch_size * length(sd.vec), sd = sd.vec), nrow = length(sd.vec)))
    sub_X_mat = train_X_mat[idx,] + noise_mat
    #Forward
    current_l_list = list()
    current_h_list = list()
    for (j in 1:len_h) {
      if (j == 1) {
        current_l_list[[j]] = L.fun(X = sub_X_mat, W = W_list[[j]])
      } else {
        current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = W_list[[j]])
      }
      current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
    }
    current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = W_list[[len_h+1]])
    current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
    loss_seq[i] = CE.fun(o = current_o, y = train_Y_mat[idx,], eps = eps)
    #Backward
    current_grad_l_list = list()
    current_grad_W_list = list()
    current_grad_h_list = list()
    current_grad_o = grad_o.fun(o = current_o, y = train_Y_mat[idx,])
    current_grad_l_list[[len_h+1]] = grad_s.fun(grad_o = current_grad_o, o = current_o)
    current_grad_W_list[[len_h+1]] = grad_W.fun(grad_l = current_grad_l_list[[len_h+1]], h = current_h_list[[len_h]])
    # Backpropagate from the last hidden layer down to the input layer.
    for (j in len_h:1) {
      current_grad_h_list[[j]] = grad_h.fun(grad_l = current_grad_l_list[[j+1]], W = W_list[[j+1]])
      current_grad_l_list[[j]] = grad_l.fun(grad_h = current_grad_h_list[[j]], l = current_l_list[[j]])
      if (j != 1) {
        current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = current_h_list[[j - 1]])
      } else {
        current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = sub_X_mat)
      }
    }
    if (optimizer == 'adam') {
      # Adam: bias-corrected first/second moments, per Kingma & Ba.
      for (j in 1:(len_h+1)) {
        M_list[[j]] = beta1 * M_list[[j]] + (1 - beta1) * current_grad_W_list[[j]]
        N_list[[j]] = beta2 * N_list[[j]] + (1 - beta2) * current_grad_W_list[[j]]^2
        M.hat = M_list[[j]]/(1 - beta1^i)
        N.hat = N_list[[j]]/(1 - beta2^i)
        W_list[[j]] = W_list[[j]] - lr*M.hat/sqrt(N.hat+eps)
      }
    } else if (optimizer == 'sgd') {
      # Momentum SGD: beta1 is the momentum coefficient.
      for (j in 1:(len_h+1)) {
        M_list[[j]] = beta1 * M_list[[j]] + lr * current_grad_W_list[[j]]
        W_list[[j]] = W_list[[j]] - M_list[[j]]
      }
    } else {
      stop('optimizer must be selected from "sgd" or "adam".')
    }
  }
  # Prediction closure over the trained weights; new_X must already be a
  # model.matrix-style numeric matrix without the intercept column.
  pre_func = function (new_X, w_list = W_list) {
    current_l_list = list()
    current_h_list = list()
    for (j in 1:len_h) {
      if (j == 1) {
        current_l_list[[j]] = L.fun(X = new_X, W = w_list[[j]])
      } else {
        current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = w_list[[j]])
      }
      current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
    }
    current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = w_list[[len_h+1]])
    current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
    return(current_o)
  }
  # ROC / AUC report: training set in red, validation (if given) in blue.
  require(pROC)
  pred_y = pre_func(new_X = train_X_mat)
  roc_train <- roc(train_Y ~ pred_y)
  plot(roc_train, col = 'red')
  text(0.5, 0.5, paste0('AUC = ', formatC(roc_train[['auc']], 4, format = 'f')), col = 'red')
  if (!is.null(valid_X)) {
    valid_X_mat <- model.matrix(~ ., data = valid_X)
    valid_X_mat <- valid_X_mat[,-1]
    pred_y = pre_func(new_X = valid_X_mat)
    roc_valid <- roc(valid_Y ~ pred_y)
    plot(roc_valid, col = 'blue', add = TRUE)
    text(0.5, 0.4, paste0('AUC = ', formatC(roc_valid[['auc']], 4, format = 'f')), col = 'blue')
    legend('bottomright', c('train', 'valid'), col = c('red', 'blue'), lwd = 1)
  }
  return(list(pre_func = pre_func, W_list = W_list))
}
# Compare a narrow (30) vs. wide (300) single hidden layer, each with and
# without input-noise augmentation (noise = 0 vs 0.3); 10000 Adam iterations,
# batch size 100.
DNN_list <- DEEP_MLP_Trainer(train_X = train_X, train_Y = train_Y, valid_X = valid_X, valid_Y = valid_Y, noise = 0,
num.iteration = 10000, num.hidden = 30, eps = 1e-8,
lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', batch_size = 100)
DNN_list <- DEEP_MLP_Trainer(train_X = train_X, train_Y = train_Y, valid_X = valid_X, valid_Y = valid_Y, noise = 0,
num.iteration = 10000, num.hidden = 300, eps = 1e-8,
lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', batch_size = 100)
DNN_list <- DEEP_MLP_Trainer(train_X = train_X, train_Y = train_Y, valid_X = valid_X, valid_Y = valid_Y, noise = 0.3,
num.iteration = 10000, num.hidden = 30, eps = 1e-8,
lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', batch_size = 100)
DNN_list <- DEEP_MLP_Trainer(train_X = train_X, train_Y = train_Y, valid_X = valid_X, valid_Y = valid_Y, noise = 0.3,
num.iteration = 10000, num.hidden = 300, eps = 1e-8,
lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', batch_size = 100)
– 舉例來說,如果原先的data在區分LVD與non-LVD時,最佳的Rate切點是70,那資料擴增就會讓69和71沒有差距這麼大,從而導致邊界平滑化。
– 而隨機森林是從「全樣本」中隨機抽取「子樣本」建立新的樹,這不是就跟上節課教到的「小批量」概念完全一樣嗎?
# Same wide network but with a much smaller mini-batch (20 instead of 100),
# illustrating the regularizing effect of small-batch training.
DNN_list <- DEEP_MLP_Trainer(train_X = train_X, train_Y = train_Y, valid_X = valid_X, valid_Y = valid_Y, noise = 0,
num.iteration = 10000, num.hidden = 300, eps = 1e-8,
lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', batch_size = 20)
– 當然,我們已經選擇了使用「ROC曲線」進行分析,因此在評估指標上是沒有問題的,但事實上隨著批量越來越小,是不是有可能整個批量都沒有出現任何一個case?
– 因此,為了確保每個批量都能抽出同樣數量的case與control,我們通常會採取一個「過採樣」的策略,請你試著實現看看。
# MLP trainer, version 2: adds class-balanced mini-batches ("oversampling").
# When oversampling = TRUE each mini-batch draws batch_size/2 cases
# (train_Y == 1) and batch_size/2 controls (train_Y == 0), so small batches
# can never be all-control. Otherwise identical to version 1: ReLU hidden
# layers, sigmoid output, optional Gaussian input noise, Adam / momentum SGD,
# and ROC / AUC reporting. Requires batch_size to be even and each class to
# contain at least batch_size/2 training rows when oversampling is on.
# Returns list(pre_func = prediction closure, W_list = trained weights).
DEEP_MLP_Trainer = function (train_X, train_Y, valid_X = NULL, valid_Y = NULL, noise = 0, oversampling = TRUE,
                             num.iteration = 500, num.hidden = c(10, 10, 10), eps = 1e-8,
                             lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', batch_size = 20) {
  #Functions
  #Forward
  # Sigmoid clipped to [eps, 1 - eps]. (Fix: the original default `eps = eps`
  # was self-referential; eps is now a required argument.)
  S.fun = function (x, eps) {
    S = 1/(1 + exp(-x))
    S[S < eps] = eps
    S[S > 1 - eps] = 1 - eps
    return(S)
  }
  ReLU.fun = function (x) {
    x[x < 0] <- 0
    return(x)
  }
  # Affine layer with an intercept column prepended.
  L.fun = function (X, W) {
    X.E = cbind(1, X)
    L = X.E %*% W
    return(L)
  }
  # Mean binary cross-entropy. (Same self-referential-default fix.)
  CE.fun = function (o, y, eps) {
    loss = -1/length(y) * sum(y * log(o + eps) + (1 - y) * log(1 - o + eps))
    return(loss)
  }
  #Backward
  # d(loss)/d(o); with grad_s.fun this reduces to o - y.
  grad_o.fun = function (o, y) {
    return((o - y)/(o*(1-o)))
  }
  grad_s.fun = function (grad_o, o) {
    return(grad_o*(o*(1-o)))
  }
  # Batch-averaged weight gradient (h gains an intercept column).
  grad_W.fun = function (grad_l, h) {
    h.E = cbind(1, h)
    return(t(h.E) %*% grad_l/nrow(h))
  }
  # Gradient w.r.t. the layer input; intercept row of W dropped.
  grad_h.fun = function (grad_l, W) {
    return(grad_l %*% t(W[-1,]))
  }
  # ReLU derivative: 1 where the pre-activation was positive, else 0.
  grad_l.fun = function (grad_h, l) {
    de_l = l
    de_l[de_l<0] = 0
    de_l[de_l>0] = 1
    return(grad_h*de_l)
  }
  # Noise
  # Per-column noise SDs (0 for factor columns); see v1 NOTE about alignment
  # with model.matrix() dummy expansion.
  sd.vec <- NULL
  for (k in 1:ncol(train_X)) {
    if (class(train_X[,k])[1] %in% c('numeric', 'integer')) {
      sd.val <- sd(train_X[,k])
      sd.vec <- c(sd.vec, sd.val * noise)
    } else {
      sd.vec <- c(sd.vec, 0L)
    }
  }
  #initialization
  train_X_mat <- model.matrix(~ ., data = train_X)
  train_X_mat <- train_X_mat[,-1]
  train_Y_mat <- t(t(train_Y))
  W_list = list()
  M_list = list()   # first-moment / momentum accumulators
  N_list = list()   # second-moment accumulators (Adam)
  len_h = length(num.hidden)
  for (w_seq in 1:(len_h+1)) {
    if (w_seq == 1) {
      NROW_W = ncol(train_X_mat) + 1
      NCOL_W = num.hidden[w_seq]
    } else if (w_seq == len_h+1) {
      NROW_W = num.hidden[w_seq - 1] + 1
      NCOL_W = ncol(train_Y_mat)
    } else {
      NROW_W = num.hidden[w_seq - 1] + 1
      NCOL_W = num.hidden[w_seq]
    }
    W_list[[w_seq]] = matrix(rnorm(NROW_W*NCOL_W, sd = 1), nrow = NROW_W, ncol = NCOL_W)
    M_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
    N_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
  }
  loss_seq = rep(0, num.iteration)
  #Calculating
  for (i in 1:num.iteration) {
    if (oversampling) {
      # Balanced batch: half cases, half controls.
      idx.pos = sample(which(train_Y == 1), batch_size / 2)
      idx.neg = sample(which(train_Y == 0), batch_size / 2)
      idx = c(idx.pos, idx.neg)
    } else {
      idx = sample(1:nrow(train_X_mat), batch_size)
    }
    noise_mat = t(matrix(rnorm(batch_size * length(sd.vec), sd = sd.vec), nrow = length(sd.vec)))
    sub_X_mat = train_X_mat[idx,] + noise_mat
    #Forward
    current_l_list = list()
    current_h_list = list()
    for (j in 1:len_h) {
      if (j == 1) {
        current_l_list[[j]] = L.fun(X = sub_X_mat, W = W_list[[j]])
      } else {
        current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = W_list[[j]])
      }
      current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
    }
    current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = W_list[[len_h+1]])
    current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
    loss_seq[i] = CE.fun(o = current_o, y = train_Y_mat[idx,], eps = eps)
    #Backward
    current_grad_l_list = list()
    current_grad_W_list = list()
    current_grad_h_list = list()
    current_grad_o = grad_o.fun(o = current_o, y = train_Y_mat[idx,])
    current_grad_l_list[[len_h+1]] = grad_s.fun(grad_o = current_grad_o, o = current_o)
    current_grad_W_list[[len_h+1]] = grad_W.fun(grad_l = current_grad_l_list[[len_h+1]], h = current_h_list[[len_h]])
    for (j in len_h:1) {
      current_grad_h_list[[j]] = grad_h.fun(grad_l = current_grad_l_list[[j+1]], W = W_list[[j+1]])
      current_grad_l_list[[j]] = grad_l.fun(grad_h = current_grad_h_list[[j]], l = current_l_list[[j]])
      if (j != 1) {
        current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = current_h_list[[j - 1]])
      } else {
        current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = sub_X_mat)
      }
    }
    if (optimizer == 'adam') {
      # Adam with bias-corrected moments.
      for (j in 1:(len_h+1)) {
        M_list[[j]] = beta1 * M_list[[j]] + (1 - beta1) * current_grad_W_list[[j]]
        N_list[[j]] = beta2 * N_list[[j]] + (1 - beta2) * current_grad_W_list[[j]]^2
        M.hat = M_list[[j]]/(1 - beta1^i)
        N.hat = N_list[[j]]/(1 - beta2^i)
        W_list[[j]] = W_list[[j]] - lr*M.hat/sqrt(N.hat+eps)
      }
    } else if (optimizer == 'sgd') {
      # Momentum SGD.
      for (j in 1:(len_h+1)) {
        M_list[[j]] = beta1 * M_list[[j]] + lr * current_grad_W_list[[j]]
        W_list[[j]] = W_list[[j]] - M_list[[j]]
      }
    } else {
      stop('optimizer must be selected from "sgd" or "adam".')
    }
  }
  # Prediction closure; new_X must be an intercept-free model matrix.
  pre_func = function (new_X, w_list = W_list) {
    current_l_list = list()
    current_h_list = list()
    for (j in 1:len_h) {
      if (j == 1) {
        current_l_list[[j]] = L.fun(X = new_X, W = w_list[[j]])
      } else {
        current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = w_list[[j]])
      }
      current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
    }
    current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = w_list[[len_h+1]])
    current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
    return(current_o)
  }
  # ROC / AUC report: train in red, validation in blue.
  require(pROC)
  pred_y = pre_func(new_X = train_X_mat)
  roc_train <- roc(train_Y ~ pred_y)
  plot(roc_train, col = 'red')
  text(0.5, 0.5, paste0('AUC = ', formatC(roc_train[['auc']], 4, format = 'f')), col = 'red')
  if (!is.null(valid_X)) {
    valid_X_mat <- model.matrix(~ ., data = valid_X)
    valid_X_mat <- valid_X_mat[,-1]
    pred_y = pre_func(new_X = valid_X_mat)
    roc_valid <- roc(valid_Y ~ pred_y)
    plot(roc_valid, col = 'blue', add = TRUE)
    text(0.5, 0.4, paste0('AUC = ', formatC(roc_valid[['auc']], 4, format = 'f')), col = 'blue')
    legend('bottomright', c('train', 'valid'), col = c('red', 'blue'), lwd = 1)
  }
  return(list(pre_func = pre_func, W_list = W_list))
}
# Wide network with noise augmentation AND class-balanced mini-batches.
DNN_list <- DEEP_MLP_Trainer(train_X = train_X, train_Y = train_Y, valid_X = valid_X, valid_Y = valid_Y,
noise = 0.3, oversampling = TRUE,
num.iteration = 10000, num.hidden = 300, eps = 1e-8,
lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', batch_size = 20)
– 這是一個可以防止過度擬合的強大武器,而且非常容易使用。就是在原先的loss之上下了一個額外懲罰權重大小的loss
– 想複習可以點這裡。
\[ \begin{align} \hat{loss_1} & = loss + \lambda \times ||W|| \end{align} \]
\[ \begin{align} \hat{loss_2} & = loss + \frac{\lambda}{2} \times||W||^2 \end{align} \]
\[ \begin{align} \frac{\partial}{\partial W} \hat{loss_1} & = grad.W + \lambda \frac{W}{|W|} \\ \frac{\partial}{\partial W} \hat{loss_2} & = grad.W + \lambda W \end{align} \]
– 很明顯的,當\(\lambda\)很大的時候,那所有權重都將會很難離開0。
– 這裡我們引入符號\(||W||^2\)代表向量(矩陣)\(W\)的距離值(也就是平方和)
\[ \begin{align} l_1 & = L^1_d(x^E,W^1_d) \\ h_1 & = ReLU(l_1) \\ l_2 & = L^2_1(h_1^E,W^2_1) \\ o & = S(l_2) \\\\ loss & = CE(y, o) + L2Reg(W) \\ & = -\left(y \cdot log(o) + (1-y) \cdot log(1-o)\right) + \frac{\lambda}{2} ||W^2_1||^2 + \frac{\lambda}{2} ||W^1_d||^2 \end{align} \]
\[ \begin{align} grad.o & = \frac{\partial}{\partial o}loss = \frac{o-y}{o(1-o)} \\ grad.l_2 & = \frac{\partial}{\partial l_2}loss = grad.o \otimes \frac{\partial}{\partial l_2}o= o-y \\ grad.W^2_1 & = \frac{\partial}{\partial W^2_1}loss = grad.l_2 \otimes \frac{\partial}{\partial W^2_1}l_2 + \frac{\partial}{\partial W^2_1}\frac{\lambda}{2}||W^2_1||^2 = \frac{{1}}{n} \otimes (h_1^E)^T \bullet grad.l_2 + \lambda W^2_1\\ grad.h_1^E & = \frac{\partial}{\partial h_1^E}loss = grad.l_2 \otimes \frac{\partial}{\partial h_1^E}l_2 = grad.l_2 \bullet (W^2_1)^T \\ grad.l_1 & = \frac{\partial}{\partial l_1}loss = grad.h_1 \otimes \frac{\partial}{\partial l_1}h_1 = grad.h_1 \otimes \frac{\partial}{\partial l_1}ReLU(l_1) \\ grad.W^1_d & = \frac{\partial}{\partial W^1_d}loss = grad.l_1 \otimes \frac{\partial}{\partial W^1_d}l_1 + \frac{\partial}{\partial W^1_d}\frac{\lambda}{2}||W^1_d||^2 = \frac{{1}}{n} \otimes (x^E)^T \bullet grad.l_1 + \lambda W^1_d \end{align} \]
– 需要注意的是,由於梯度計算完成後我們通常會再乘上一個學習率\(lr\),所以最後的衰減率會是\(lr \times \lambda\)。為了避免應用上的麻煩,我們通常會把這一項獨立出來不讓他乘以學習率,從而固定其影響力。
# MLP trainer, version 3: adds L2 regularization (weight decay) with strength
# lambda on top of version 2 (noise augmentation + balanced mini-batches).
# As discussed in the notes, the decay term is decoupled from the learning
# rate: each update subtracts lambda * W directly rather than lr * lambda * W.
# Returns list(pre_func = prediction closure, W_list = trained weights).
DEEP_MLP_Trainer = function (train_X, train_Y, valid_X = NULL, valid_Y = NULL,
                             noise = 0, oversampling = TRUE, lambda = 0.001,
                             num.iteration = 500, num.hidden = c(10, 10, 10), eps = 1e-8,
                             lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', batch_size = 20) {
  #Functions
  #Forward
  # Sigmoid clipped to [eps, 1 - eps]. (Fix: the original default `eps = eps`
  # was self-referential; eps is now a required argument.)
  S.fun = function (x, eps) {
    S = 1/(1 + exp(-x))
    S[S < eps] = eps
    S[S > 1 - eps] = 1 - eps
    return(S)
  }
  ReLU.fun = function (x) {
    x[x < 0] <- 0
    return(x)
  }
  # Affine layer with an intercept column prepended.
  L.fun = function (X, W) {
    X.E = cbind(1, X)
    L = X.E %*% W
    return(L)
  }
  # Mean binary cross-entropy. (Same self-referential-default fix.)
  CE.fun = function (o, y, eps) {
    loss = -1/length(y) * sum(y * log(o + eps) + (1 - y) * log(1 - o + eps))
    return(loss)
  }
  #Backward
  grad_o.fun = function (o, y) {
    return((o - y)/(o*(1-o)))
  }
  grad_s.fun = function (grad_o, o) {
    return(grad_o*(o*(1-o)))
  }
  # Batch-averaged weight gradient.
  grad_W.fun = function (grad_l, h) {
    h.E = cbind(1, h)
    return(t(h.E) %*% grad_l/nrow(h))
  }
  grad_h.fun = function (grad_l, W) {
    return(grad_l %*% t(W[-1,]))
  }
  # ReLU derivative.
  grad_l.fun = function (grad_h, l) {
    de_l = l
    de_l[de_l<0] = 0
    de_l[de_l>0] = 1
    return(grad_h*de_l)
  }
  # Noise
  # Per-column noise SDs (0 for factors); see v1 NOTE about dummy expansion.
  sd.vec <- NULL
  for (k in 1:ncol(train_X)) {
    if (class(train_X[,k])[1] %in% c('numeric', 'integer')) {
      sd.val <- sd(train_X[,k])
      sd.vec <- c(sd.vec, sd.val * noise)
    } else {
      sd.vec <- c(sd.vec, 0L)
    }
  }
  #initialization
  train_X_mat <- model.matrix(~ ., data = train_X)
  train_X_mat <- train_X_mat[,-1]
  train_Y_mat <- t(t(train_Y))
  W_list = list()
  M_list = list()
  N_list = list()
  len_h = length(num.hidden)
  for (w_seq in 1:(len_h+1)) {
    if (w_seq == 1) {
      NROW_W = ncol(train_X_mat) + 1
      NCOL_W = num.hidden[w_seq]
    } else if (w_seq == len_h+1) {
      NROW_W = num.hidden[w_seq - 1] + 1
      NCOL_W = ncol(train_Y_mat)
    } else {
      NROW_W = num.hidden[w_seq - 1] + 1
      NCOL_W = num.hidden[w_seq]
    }
    W_list[[w_seq]] = matrix(rnorm(NROW_W*NCOL_W, sd = 1), nrow = NROW_W, ncol = NCOL_W)
    M_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
    N_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
  }
  loss_seq = rep(0, num.iteration)
  #Calculating
  for (i in 1:num.iteration) {
    if (oversampling) {
      # Balanced batch: half cases, half controls.
      idx.pos = sample(which(train_Y == 1), batch_size / 2)
      idx.neg = sample(which(train_Y == 0), batch_size / 2)
      idx = c(idx.pos, idx.neg)
    } else {
      idx = sample(1:nrow(train_X_mat), batch_size)
    }
    noise_mat = t(matrix(rnorm(batch_size * length(sd.vec), sd = sd.vec), nrow = length(sd.vec)))
    sub_X_mat = train_X_mat[idx,] + noise_mat
    #Forward
    current_l_list = list()
    current_h_list = list()
    for (j in 1:len_h) {
      if (j == 1) {
        current_l_list[[j]] = L.fun(X = sub_X_mat, W = W_list[[j]])
      } else {
        current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = W_list[[j]])
      }
      current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
    }
    current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = W_list[[len_h+1]])
    current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
    loss_seq[i] = CE.fun(o = current_o, y = train_Y_mat[idx,], eps = eps)
    #Backward
    current_grad_l_list = list()
    current_grad_W_list = list()
    current_grad_h_list = list()
    current_grad_o = grad_o.fun(o = current_o, y = train_Y_mat[idx,])
    current_grad_l_list[[len_h+1]] = grad_s.fun(grad_o = current_grad_o, o = current_o)
    current_grad_W_list[[len_h+1]] = grad_W.fun(grad_l = current_grad_l_list[[len_h+1]], h = current_h_list[[len_h]])
    for (j in len_h:1) {
      current_grad_h_list[[j]] = grad_h.fun(grad_l = current_grad_l_list[[j+1]], W = W_list[[j+1]])
      current_grad_l_list[[j]] = grad_l.fun(grad_h = current_grad_h_list[[j]], l = current_l_list[[j]])
      if (j != 1) {
        current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = current_h_list[[j - 1]])
      } else {
        current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = sub_X_mat)
      }
    }
    if (optimizer == 'adam') {
      # Adam update plus decoupled weight decay (- lambda * W).
      for (j in 1:(len_h+1)) {
        M_list[[j]] = beta1 * M_list[[j]] + (1 - beta1) * current_grad_W_list[[j]]
        N_list[[j]] = beta2 * N_list[[j]] + (1 - beta2) * current_grad_W_list[[j]]^2
        M.hat = M_list[[j]]/(1 - beta1^i)
        N.hat = N_list[[j]]/(1 - beta2^i)
        W_list[[j]] = W_list[[j]] - lr * M.hat / sqrt(N.hat+eps) - lambda * W_list[[j]]
      }
    } else if (optimizer == 'sgd') {
      # Momentum SGD plus decoupled weight decay.
      for (j in 1:(len_h+1)) {
        M_list[[j]] = beta1 * M_list[[j]] + lr * current_grad_W_list[[j]]
        W_list[[j]] = W_list[[j]] - M_list[[j]] - lambda * W_list[[j]]
      }
    } else {
      stop('optimizer must be selected from "sgd" or "adam".')
    }
  }
  # Prediction closure; new_X must be an intercept-free model matrix.
  pre_func = function (new_X, w_list = W_list) {
    current_l_list = list()
    current_h_list = list()
    for (j in 1:len_h) {
      if (j == 1) {
        current_l_list[[j]] = L.fun(X = new_X, W = w_list[[j]])
      } else {
        current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = w_list[[j]])
      }
      current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
    }
    current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = w_list[[len_h+1]])
    current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
    return(current_o)
  }
  # ROC / AUC report: train in red, validation in blue.
  require(pROC)
  pred_y = pre_func(new_X = train_X_mat)
  roc_train <- roc(train_Y ~ pred_y)
  plot(roc_train, col = 'red')
  text(0.5, 0.5, paste0('AUC = ', formatC(roc_train[['auc']], 4, format = 'f')), col = 'red')
  if (!is.null(valid_X)) {
    valid_X_mat <- model.matrix(~ ., data = valid_X)
    valid_X_mat <- valid_X_mat[,-1]
    pred_y = pre_func(new_X = valid_X_mat)
    roc_valid <- roc(valid_Y ~ pred_y)
    plot(roc_valid, col = 'blue', add = TRUE)
    text(0.5, 0.4, paste0('AUC = ', formatC(roc_valid[['auc']], 4, format = 'f')), col = 'blue')
    legend('bottomright', c('train', 'valid'), col = c('red', 'blue'), lwd = 1)
  }
  return(list(pre_func = pre_func, W_list = W_list))
}
# Wide network with noise augmentation, balanced batches, and L2 weight decay.
DNN_list <- DEEP_MLP_Trainer(train_X = train_X, train_Y = train_Y, valid_X = valid_X, valid_Y = valid_Y,
noise = 0.3, oversampling = TRUE, lambda = 0.001,
num.iteration = 10000, num.hidden = 300, eps = 1e-8,
lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', batch_size = 20)
– 我們要再次回到2012年,當多倫多大學的Alex Krizhevsky、Ilya Sutskever以及Geoffrey Hinton重新使用神經網路AlexNet成功奪冠時,難道世界上其他人都犯蠢了,都沒想到這個方法嗎?事實上,能夠取得如此重大的突破,絕對是同時解決了許多世紀難題才有的成果!
– 這邊帶大家稍微了解一下時代背景,我們先不考慮計算能力只關注在統計問題。整個AlexNet待解的參數超過6200萬個,而當時ImageNet資料庫所能提供的也只有120萬筆的數據量,因此過擬合問題非常嚴重,即使運用了資料擴增、正則化也並不足以解決這個問題。
– 這時候如果加入了Dropout機制,也就是在每次考試時隨機的抽走一半的學生,讓剩下的學生進行考試,這是不是會更刺激每個學生學習?
\[ dropout(x) = \left\{ \begin{array}{ll} \frac{x}{1-dp} & \mbox{ otherwise} \\ 0 & \mbox{ if sampled (rate = dp)} \end{array} \right. \]
– 這裡你會發現,由於\(dropout(x)\)與\(ReLU(x)\)非常非常的像,所以他們的偏導函數也非常類似:
\[ \frac{\partial}{\partial x}dropout(x) = \left\{ \begin{array}{ll} \frac{1}{1-dp} & \mbox{ otherwise} \\ 0 & \mbox{ if sampled (rate = dp)} \end{array} \right. \]
\[ \begin{align} l_1 & = L^1_d(x^E,W^1_d) \\ dp_1 & = dropout(l_1) \\ h_1 & = ReLU(dp_1) \\ l_2 & = L^2_1(h_1^E,W^2_1) \\ dp_2 & = dropout(l_2) \\ o & = S(dp_2) \\ loss & = CE(y, o) = -\left(y \cdot log(o) + (1-y) \cdot log(1-o)\right) \end{align} \]
\[ \begin{align} grad.o & = \frac{\partial}{\partial o}loss = \frac{o-y}{o(1-o)} \\ grad.dp_2 & = \frac{\partial}{\partial dp_2}loss = grad.o \otimes \frac{\partial}{\partial dp_2}o= o-y \\ grad.l_2 & = \frac{\partial}{\partial l_2}loss = grad.dp_2 \otimes \frac{\partial}{\partial l_2}dp_2 = (o-y) \otimes \frac{\partial}{\partial l_2}dropout(l_2) \\ grad.W^2_1 & = \frac{\partial}{\partial W^2_1}loss = grad.l_2 \otimes \frac{\partial}{\partial W^2_1}l_2 = \frac{{1}}{n} \otimes (h_1^E)^T \bullet grad.l_2\\ grad.h_1^E & = \frac{\partial}{\partial h_1^E}loss = grad.l_2 \otimes \frac{\partial}{\partial h_1^E}l_2 = grad.l_2 \bullet (W^2_1)^T \\ grad.dp_1 & = \frac{\partial}{\partial dp_1}loss = grad.h_1 \otimes \frac{\partial}{\partial dp_1}h_1 = grad.h_1 \otimes \frac{\partial}{\partial dp_1}ReLU(dp_1) \\ grad.l_1 & = \frac{\partial}{\partial l_1}loss = grad.dp_1 \otimes \frac{\partial}{\partial l_1}dp_1 = grad.dp_1 \otimes \frac{\partial}{\partial l_1}dropout(l_1) \\ grad.W^1_d & = \frac{\partial}{\partial W^1_d}loss = grad.l_1 \otimes \frac{\partial}{\partial W^1_d}l_1 = \frac{{1}}{n} \otimes (x^E)^T \bullet grad.l_1 \end{align} \]
# MLP trainer, version 4: adds inverted dropout (rate dp) on every layer's
# pre-activation, on top of version 3 (noise augmentation, balanced batches,
# L2 weight decay). Kept units are scaled by 1/(1 - dp) during training, so
# pre_func performs plain forward passes at prediction time with no rescaling.
# NOTE(review): dropout is also applied to the output layer's pre-activation
# (before the sigmoid), which is unusual — confirm this is intended.
# Returns list(pre_func = prediction closure, W_list = trained weights).
DEEP_MLP_Trainer = function (train_X, train_Y, valid_X = NULL, valid_Y = NULL,
                             noise = 0, oversampling = TRUE, lambda = 0.001, dp = 0.5,
                             num.iteration = 500, num.hidden = c(10, 10, 10), eps = 1e-8,
                             lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', batch_size = 20) {
  #Functions
  #Forward
  # Sigmoid clipped to [eps, 1 - eps]. (Fix: the original default `eps = eps`
  # was self-referential; eps is now a required argument.)
  S.fun = function (x, eps) {
    S = 1/(1 + exp(-x))
    S[S < eps] = eps
    S[S > 1 - eps] = 1 - eps
    return(S)
  }
  ReLU.fun = function (x) {
    x[x < 0] <- 0
    return(x)
  }
  #---------------------------------------------#
  # Inverted dropout: zero a random dp fraction of the entries, scale the
  # rest by 1/(1 - dp). (Fix: the default `dp = dp` was self-referential;
  # dp is now a required argument — every call passes it explicitly.)
  dropout.fun = function (x, dp) {
    len_x = length(x)
    x[sample(1:len_x, len_x * dp)] <- 0
    return(x/(1-dp))
  }
  #---------------------------------------------#
  # Affine layer with an intercept column prepended.
  L.fun = function (X, W) {
    X.E = cbind(1, X)
    L = X.E %*% W
    return(L)
  }
  # Mean binary cross-entropy. (Same self-referential-default fix.)
  CE.fun = function (o, y, eps) {
    loss = -1/length(y) * sum(y * log(o + eps) + (1 - y) * log(1 - o + eps))
    return(loss)
  }
  #Backward
  grad_o.fun = function (o, y) {
    return((o - y)/(o*(1-o)))
  }
  grad_s.fun = function (grad_o, o) {
    return(grad_o*(o*(1-o)))
  }
  # Batch-averaged weight gradient.
  grad_W.fun = function (grad_l, h) {
    h.E = cbind(1, h)
    return(t(h.E) %*% grad_l/nrow(h))
  }
  grad_h.fun = function (grad_l, W) {
    return(grad_l %*% t(W[-1,]))
  }
  #---------------------------------------------#
  # Backprop through ReLU applied to the dropout output DP: derivative is 1
  # where DP > 0, else 0. (dp is accepted for a uniform call signature but
  # unused; its self-referential `dp = dp` default is also removed.)
  grad_dp.fun = function (grad_h, DP, dp) {
    de_DP = DP
    de_DP[de_DP<0] = 0
    de_DP[de_DP>0] = 1
    return(grad_h*de_DP)
  }
  # Backprop through dropout: 1/(1 - dp) on kept entries, 0 on dropped ones
  # (a surviving entry that equals exactly 0 also gets gradient 0, which is
  # harmless because ReLU's gradient there is 0 as well).
  grad_l.fun = function (grad_dp, DP, dp) {
    de_DP = DP
    de_DP[de_DP == 0] = 0
    de_DP[de_DP != 0] = 1/(1-dp)
    return(grad_dp*de_DP)
  }
  #---------------------------------------------#
  # Noise
  # Per-column noise SDs (0 for factors); see v1 NOTE about dummy expansion.
  sd.vec <- NULL
  for (k in 1:ncol(train_X)) {
    if (class(train_X[,k])[1] %in% c('numeric', 'integer')) {
      sd.val <- sd(train_X[,k])
      sd.vec <- c(sd.vec, sd.val * noise)
    } else {
      sd.vec <- c(sd.vec, 0L)
    }
  }
  #initialization
  train_X_mat <- model.matrix(~ ., data = train_X)
  train_X_mat <- train_X_mat[,-1]
  train_Y_mat <- t(t(train_Y))
  W_list = list()
  M_list = list()
  N_list = list()
  len_h = length(num.hidden)
  for (w_seq in 1:(len_h+1)) {
    if (w_seq == 1) {
      NROW_W = ncol(train_X_mat) + 1
      NCOL_W = num.hidden[w_seq]
    } else if (w_seq == len_h+1) {
      NROW_W = num.hidden[w_seq - 1] + 1
      NCOL_W = ncol(train_Y_mat)
    } else {
      NROW_W = num.hidden[w_seq - 1] + 1
      NCOL_W = num.hidden[w_seq]
    }
    W_list[[w_seq]] = matrix(rnorm(NROW_W*NCOL_W, sd = 1), nrow = NROW_W, ncol = NCOL_W)
    M_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
    N_list[[w_seq]] = matrix(0, nrow = NROW_W, ncol = NCOL_W)
  }
  loss_seq = rep(0, num.iteration)
  #Calculating
  for (i in 1:num.iteration) {
    if (oversampling) {
      # Balanced batch: half cases, half controls.
      idx.pos = sample(which(train_Y == 1), batch_size / 2)
      idx.neg = sample(which(train_Y == 0), batch_size / 2)
      idx = c(idx.pos, idx.neg)
    } else {
      idx = sample(1:nrow(train_X_mat), batch_size)
    }
    noise_mat = t(matrix(rnorm(batch_size * length(sd.vec), sd = sd.vec), nrow = length(sd.vec)))
    sub_X_mat = train_X_mat[idx,] + noise_mat
    #Forward
    current_l_list = list()
    #---------------------------------------------#
    current_dp_list = list()
    #---------------------------------------------#
    current_h_list = list()
    # Each layer: l = affine, dp = dropout(l), h = ReLU(dp).
    for (j in 1:len_h) {
      if (j == 1) {
        current_l_list[[j]] = L.fun(X = sub_X_mat, W = W_list[[j]])
      } else {
        current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = W_list[[j]])
      }
      #---------------------------------------------#
      current_dp_list[[j]] = dropout.fun(x = current_l_list[[j]], dp = dp)
      current_h_list[[j]] = ReLU.fun(x = current_dp_list[[j]])
      #---------------------------------------------#
    }
    current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = W_list[[len_h+1]])
    #---------------------------------------------#
    current_dp_list[[len_h+1]] = dropout.fun(x = current_l_list[[len_h+1]], dp = dp)
    current_o = S.fun(x = current_dp_list[[len_h+1]], eps = eps)
    #---------------------------------------------#
    loss_seq[i] = CE.fun(o = current_o, y = train_Y_mat[idx,], eps = eps)
    #Backward
    current_grad_l_list = list()
    #---------------------------------------------#
    current_grad_dp_list = list()
    #---------------------------------------------#
    current_grad_W_list = list()
    current_grad_h_list = list()
    current_grad_o = grad_o.fun(o = current_o, y = train_Y_mat[idx,])
    #---------------------------------------------#
    # Output layer: back through sigmoid, then through its dropout mask.
    current_grad_dp_list[[len_h+1]] = grad_s.fun(grad_o = current_grad_o, o = current_o)
    current_grad_l_list[[len_h+1]] = grad_l.fun(grad_dp = current_grad_dp_list[[len_h+1]], DP = current_dp_list[[len_h+1]], dp = dp)
    #---------------------------------------------#
    current_grad_W_list[[len_h+1]] = grad_W.fun(grad_l = current_grad_l_list[[len_h+1]], h = current_h_list[[len_h]])
    for (j in len_h:1) {
      current_grad_h_list[[j]] = grad_h.fun(grad_l = current_grad_l_list[[j+1]], W = W_list[[j+1]])
      #---------------------------------------------#
      # Back through ReLU (grad_dp.fun) and then the dropout mask (grad_l.fun).
      current_grad_dp_list[[j]] = grad_dp.fun(grad_h = current_grad_h_list[[j]], DP = current_dp_list[[j]], dp = dp)
      current_grad_l_list[[j]] = grad_l.fun(grad_dp = current_grad_dp_list[[j]], DP = current_dp_list[[j]], dp = dp)
      #---------------------------------------------#
      if (j != 1) {
        current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = current_h_list[[j - 1]])
      } else {
        current_grad_W_list[[j]] = grad_W.fun(grad_l = current_grad_l_list[[j]], h = sub_X_mat)
      }
    }
    if (optimizer == 'adam') {
      # Adam update plus decoupled weight decay (- lambda * W).
      for (j in 1:(len_h+1)) {
        M_list[[j]] = beta1 * M_list[[j]] + (1 - beta1) * current_grad_W_list[[j]]
        N_list[[j]] = beta2 * N_list[[j]] + (1 - beta2) * current_grad_W_list[[j]]^2
        M.hat = M_list[[j]]/(1 - beta1^i)
        N.hat = N_list[[j]]/(1 - beta2^i)
        W_list[[j]] = W_list[[j]] - lr * M.hat / sqrt(N.hat+eps) - lambda * W_list[[j]]
      }
    } else if (optimizer == 'sgd') {
      # Momentum SGD plus decoupled weight decay.
      for (j in 1:(len_h+1)) {
        M_list[[j]] = beta1 * M_list[[j]] + lr * current_grad_W_list[[j]]
        W_list[[j]] = W_list[[j]] - M_list[[j]] - lambda * W_list[[j]]
      }
    } else {
      stop('optimizer must be selected from "sgd" or "adam".')
    }
  }
  # Prediction closure: no dropout at inference (inverted dropout already
  # rescaled activations during training); new_X must be an intercept-free
  # model matrix.
  pre_func = function (new_X, w_list = W_list) {
    current_l_list = list()
    current_h_list = list()
    for (j in 1:len_h) {
      if (j == 1) {
        current_l_list[[j]] = L.fun(X = new_X, W = w_list[[j]])
      } else {
        current_l_list[[j]] = L.fun(X = current_h_list[[j-1]], W = w_list[[j]])
      }
      current_h_list[[j]] = ReLU.fun(x = current_l_list[[j]])
    }
    current_l_list[[len_h+1]] = L.fun(X = current_h_list[[len_h]], W = w_list[[len_h+1]])
    current_o = S.fun(x = current_l_list[[len_h+1]], eps = eps)
    return(current_o)
  }
  # ROC / AUC report: train in red, validation in blue.
  require(pROC)
  pred_y = pre_func(new_X = train_X_mat)
  roc_train <- roc(train_Y ~ pred_y)
  plot(roc_train, col = 'red')
  text(0.5, 0.5, paste0('AUC = ', formatC(roc_train[['auc']], 4, format = 'f')), col = 'red')
  if (!is.null(valid_X)) {
    valid_X_mat <- model.matrix(~ ., data = valid_X)
    valid_X_mat <- valid_X_mat[,-1]
    pred_y = pre_func(new_X = valid_X_mat)
    roc_valid <- roc(valid_Y ~ pred_y)
    plot(roc_valid, col = 'blue', add = TRUE)
    text(0.5, 0.4, paste0('AUC = ', formatC(roc_valid[['auc']], 4, format = 'f')), col = 'blue')
    legend('bottomright', c('train', 'valid'), col = c('red', 'blue'), lwd = 1)
  }
  return(list(pre_func = pre_func, W_list = W_list))
}
# Full regularization stack: noise augmentation, balanced batches, L2 decay,
# and 50% dropout.
DNN_list <- DEEP_MLP_Trainer(train_X = train_X, train_Y = train_Y, valid_X = valid_X, valid_Y = valid_Y,
noise = 0.3, oversampling = TRUE, lambda = 0.001, dp = 0.5,
num.iteration = 10000, num.hidden = 300, eps = 1e-8,
lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', batch_size = 20)
– 這是在第5000代的時候停下來:
# Early-stopping demonstration: identical setup stopped at 5000 iterations
# (same seed so the runs are comparable).
set.seed(0)
DNN_list <- DEEP_MLP_Trainer(train_X = train_X, train_Y = train_Y, valid_X = valid_X, valid_Y = valid_Y,
noise = 0, oversampling = TRUE, lambda = 0, dp = 0,
num.iteration = 5000, num.hidden = 30, eps = 1e-8,
lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', batch_size = 100)
– 這是在第10000代的時候停下來:
# Early-stopping demonstration: same setup stopped at 10000 iterations.
set.seed(0)
DNN_list <- DEEP_MLP_Trainer(train_X = train_X, train_Y = train_Y, valid_X = valid_X, valid_Y = valid_Y,
noise = 0, oversampling = TRUE, lambda = 0, dp = 0,
num.iteration = 10000, num.hidden = 30, eps = 1e-8,
lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', batch_size = 100)
– 這是在第20000代的時候停下來:
# Early-stopping demonstration: same setup stopped at 20000 iterations.
set.seed(0)
DNN_list <- DEEP_MLP_Trainer(train_X = train_X, train_Y = train_Y, valid_X = valid_X, valid_Y = valid_Y,
noise = 0, oversampling = TRUE, lambda = 0, dp = 0,
num.iteration = 20000, num.hidden = 30, eps = 1e-8,
lr = 0.001, beta1 = 0.9, beta2 = 0.999, optimizer = 'adam', batch_size = 100)
– 要怎樣解決這個問題呢?我們必須每訓練到一個階段,就確認一下目前在「驗證組」的準確度是否繼續上升。
– 這是不是有點像「極限梯度提升機」在運算時所做的事情?
– … We trained the networks with minibatches of size 8 and used an initial learning rate of 0.0001 that was decayed by a factor of 10 each time the loss on the tuning set plateaued after an epoch (a full pass over the training set). In order to prevent the networks from overfitting, early stopping was performed by saving the network after every epoch and choosing the saved network with the lowest loss on the tuning set. …
– 如果你很想知道怎樣做,醫療人工智慧實作- 胸部X光左心室功能障礙分類挑戰中有範例語法,你可以參考
– 這意思是說原始樣本假設是1000,批量大小為20,那一個「epoch」就是50個「batch」;若批量大小為25,那一個「epoch」就是40個「batch」。
– 剛剛我們引用的那篇paper非常的經典,他提到了非常多的「訓練技巧」,我們通常會依據「驗證組」選擇一個最佳模型,最後再放到「測試組」上做最終裁決。
深度學習領域中存在三大理論問題:過度擬合問題、梯度消失問題、權重初始化問題,而我們已經在過度擬合問題上跟上了2012年的時代,並且發現了網路要訓練的準,那鐵定是需要大量的「參數實驗」,從而找到一組最佳參數讓自己的模型在測試集中準確性較高。
隨著連續3節課的推導,你是否發現其實只要你熟練使用連鎖律,那其實我們可以疊加任意多「可微分」的層,從而建構越來越複雜的神經網路。
– 因此,目前對於深度學習開發的主流都是透過「框架/平台」,由你指定一個由「眾多可微分層累加的預測函數」,而「框架/平台」負責利用連鎖律幫你解微分,從而獲得梯度優化整個網路取得權重參數,因此之後的課程我們將開始使用「框架/平台」進行開發。
– 到時候假使我們遇到了一個新的函數,那也只要解出該函數的偏導函數你就應該有能力將其應用於自己的模型中。