林嶔 (Lin, Chin)
Lesson 4: Introduction to Programming with the MxNet Framework
In the lessons before today we emphasized theoretical derivations; from here on we will put more weight on practical development, using plenty of experiments to help explain some of the harder mathematical principles.
Let's use a package to develop artificial neural networks. This lets us quickly experiment with how different network architectures affect classification results.
– From the derivations in the previous lessons, we should realize that a framework like this is not actually hard to implement: no matter how deep or complex an artificial neural network is, it can always be defined as a composition of functions, and by differentiating each part and applying the chain rule we can obtain the gradient of any parameter, which is all we need to train the network with gradient descent (see the short numeric sketch after the installation commands below).
– Note, however, that MxNet can only be installed on 64-bit operating systems.
– Its installation procedure is somewhat unusual, and there is also a way to install a GPU version; below is how to install the CPU version on Windows:
install.packages("https://s3.ca-central-1.amazonaws.com/jeremiedb/share/mxnet/CPU/3.6/mxnet.zip", repos = NULL)
– Alternatively, use this approach:
cran <- getOption("repos")
cran["dmlc"] <- "https://apache-mxnet.s3-accelerate.dualstack.amazonaws.com/R/CRAN/"
options(repos = cran)
install.packages("mxnet")
– Data structure: Iterator
– Prediction function (network structure): Model architecture
– Optimization method: Optimizer
– This time we will demonstrate analysis syntax for several kinds of dependent variables, so we will not do any special preprocessing up front.
dat <- read.csv("ECG_train.csv", header = TRUE, fileEncoding = 'CP950', stringsAsFactors = FALSE, na.strings = "")
subdat <- dat[!dat[,'K'] %in% NA & !dat[,'AGE'] %in% NA & !dat[,'Rate'] %in% NA, c('K', 'AGE', 'Rate')]
used_dat.x <- model.matrix(~ ., data = subdat[,-1])
X.array <- t(used_dat.x[,-1])
Y.array <- array(subdat[,1], dim = c(1, nrow(subdat)))
my_iterator_core = function (batch_size) {
batch = 0
batch_per_epoch = length(Y.array)/batch_size
reset = function() {batch <<- 0}
iter.next = function() {
batch <<- batch+1
if (batch > batch_per_epoch) {return(FALSE)} else {return(TRUE)}
}
value = function() {
idx = 1:batch_size + (batch - 1) * batch_size
idx[idx > ncol(X.array)] = sample(1:ncol(X.array), sum(idx > ncol(X.array)))
data = mx.nd.array(X.array[,idx, drop=FALSE])
label = mx.nd.array(Y.array[,idx, drop=FALSE])
return(list(data = data, label = label))
}
return(list(reset = reset, iter.next = iter.next, value = value, batch_size = batch_size, batch = batch))
}
my_iterator_func <- setRefClass("Custom_Iter",
fields = c("iter", "batch_size"),
contains = "Rcpp_MXArrayDataIter",
methods = list(
initialize = function(iter, batch_size = 100){
.self$iter <- my_iterator_core(batch_size = batch_size)
.self
},
value = function(){
.self$iter$value()
},
iter.next = function(){
.self$iter$iter.next()
},
reset = function(){
.self$iter$reset()
},
finalize=function(){
}
)
)
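– The output below presumably comes from instantiating the iterator and pulling one batch to check it; a minimal sketch of that call (assuming batch_size = 20, matching the 20 columns shown):
my_iter = my_iterator_func(iter = NULL, batch_size = 20)
my_iter$reset()
my_iter$iter.next()
my_iter$value()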
## [1] TRUE
## $data
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8]
## [1,] 50.95044 66.22767 67.46526 52.39062 38.6022 83.70883 62.12934 60.54764
## [2,] 112.00000 76.00000 65.00000 93.00000 63.0000 71.00000 62.00000 65.00000
## [,9] [,10] [,11] [,12] [,13] [,14] [,15] [,16]
## [1,] 62.38272 55.70469 82.29976 67.57248 36.93145 52.65045 64.91228 64.37901
## [2,] 85.00000 66.00000 109.00000 55.00000 67.00000 65.00000 103.00000 96.00000
## [,17] [,18] [,19] [,20]
## [1,] 88.57705 83.13152 31.84526 81.80301
## [2,] 97.00000 153.00000 78.00000 60.00000
##
## $label
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,] 2.2 3.7 2.7 3.8 5.7 5.7 6 3 3.2 3.5 2.1 2.8 3.4 2.7
## [,15] [,16] [,17] [,18] [,19] [,20]
## [1,] 6.5 3 5.9 2.9 2.1 4.2
– The function mx.symbol.Variable creates a new starting point for the model. It is usually named after an output of the Iterator, and it can also be used to name weights.
– The function mx.symbol.FullyConnected is the fully connected layer, equivalent to the linear function \(L\) in our earlier MLP; the parameter num.hidden specifies the output dimension.
data = mx.symbol.Variable(name = 'data')
label = mx.symbol.Variable(name = 'label')
weight = mx.symbol.Variable(name = 'weight')
bias = mx.symbol.Variable(name = 'bias')
pred_y = mx.symbol.FullyConnected(data = data, weight = weight, bias = bias,
num.hidden = 1, no.bias = FALSE, name = 'pred_y')
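– The shape summary below was presumably produced with mx.symbol.infer.shape (the same function is used explicitly a little further on); a sketch of the call:
mx.symbol.infer.shape(pred_y, data = c(2, 20))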
## $arg.shapes
## $arg.shapes$data
## [1] 2 20
##
## $arg.shapes$weight
## [1] 2 1
##
## $arg.shapes$bias
## [1] 1
##
##
## $out.shapes
## $out.shapes$pred_y_output
## [1] 1 20
##
##
## $aux.shapes
## named list()
residual = mx.symbol.broadcast_minus(lhs = label, rhs = pred_y)
square_residual = mx.symbol.square(data = residual)
mean_square_residual = mx.symbol.mean(data = square_residual, axis = 0, keepdims = FALSE)
mse_loss = mx.symbol.MakeLoss(data = mean_square_residual, name = 'rmse')
mx.symbol.infer.shape(mse_loss, data = c(2, 20), label = c(1, 20))$out.shapes
## $rmse_output
## [1] 1
my_optimizer = mx.opt.create(name = "adam", learning.rate = 0.001, beta1 = 0.9, beta2 = 0.999,
epsilon = 1e-08, wd = 0)
– Note that num.round here means the number of epochs, not the number of batches.
mx.set.seed(0)
lr_model = mx.model.FeedForward.create(symbol = mse_loss, X = my_iter, optimizer = my_optimizer,
array.batch.size = 20, ctx = mx.cpu(), num.round = 200)
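– The output below presumably comes from printing the fitted parameters and comparing them with lm (the lm call matches the Call line shown):
lr_model$arg.params
lm(K ~ ., data = subdat)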
## $weight
## [,1]
## [1,] 0.05455168
## [2,] 0.01002080
##
## $bias
## [1] 2.180768
##
## Call:
## lm(formula = K ~ ., data = subdat)
##
## Coefficients:
## (Intercept) AGE Rate
## 2.461510 0.017666 0.002416
The results are about the same; the small discrepancy is mainly because we used stochastic gradient descent rather than the full sample, so a little random error remains.
But we are not really satisfied with this result: the parameters of our Optimizer look rather odd, and the number of epochs is a bit large. Let's make them more normal:
my_optimizer = mx.opt.create(name = "sgd", learning.rate = 0.05, momentum = 0.9, wd = 0)
mx.set.seed(0)
lr_model = mx.model.FeedForward.create(symbol = mse_loss, X = my_iter, optimizer = my_optimizer,
array.batch.size = 20, ctx = mx.cpu(), num.round = 10)
lr_model$arg.params
## $weight
## [,1]
## [1,] NaN
## [2,] NaN
##
## $bias
## [1] NaN
The gradients of the loss explain what happened:
\[ \begin{align} \frac{\partial}{\partial b_0}loss & = \frac{1}{n} \sum \limits_{i=1}^{n}\left( \hat{y_{i}} - y_{i} \right) \\ \frac{\partial}{\partial b_1}loss & = \frac{1}{n} \sum \limits_{i=1}^{n}\left( \hat{y_{i}} - y_{i} \right) \cdot x_{i} \end{align} \]
Because the gradient with respect to \(b_1\) scales with \(x_{i}\), the unstandardized inputs (Rate takes values of roughly 60 to 150) produce very large updates at this learning rate, and the parameters diverge to NaN. Standardizing the inputs fixes this:
X.array <- t(scale(t(X.array)))
mean.X <- attr(X.array, "scaled:center")
sd.X <- attr(X.array, "scaled:scale")
my_optimizer = mx.opt.create(name = "sgd", learning.rate = 0.05, momentum = 0.9, wd = 0)
mx.set.seed(0)
lr_model = mx.model.FeedForward.create(symbol = mse_loss, X = my_iter, optimizer = my_optimizer,
array.batch.size = 20, ctx = mx.cpu(), num.round = 20)
lr_model$arg.params
## $weight
## [,1]
## [1,] 0.7609081
## [2,] 0.1595915
##
## $bias
## [1] 3.993352
subdat[,'AGE'] <- (subdat[,'AGE'] - mean(subdat[,'AGE'])) / sd(subdat[,'AGE'])
subdat[,'Rate'] <- (subdat[,'Rate'] - mean(subdat[,'Rate'])) / sd(subdat[,'Rate'])
lm(K ~ ., data = subdat)
##
## Call:
## lm(formula = K ~ ., data = subdat)
##
## Coefficients:
## (Intercept) AGE Rate
## 3.7810 0.3304 0.0571
– Let's standardize directly, to make the results easier to compare.
subdat <- dat[!dat[,'LVD'] %in% NA & !dat[,'AGE'] %in% NA & !dat[,'Rate'] %in% NA, c('LVD', 'AGE', 'Rate')]
subdat[,-1] <- scale(subdat[,-1])
used_dat.x <- model.matrix(~ ., data = subdat[,-1])
X.array <- t(used_dat.x[,-1])
Y.array <- array(subdat[,1], dim = c(1, nrow(subdat)))
– Here is the result of the glm function:
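– A sketch of the call (matching the Call line in the output):
glm(LVD ~ ., family = "binomial", data = subdat)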
##
## Call: glm(formula = LVD ~ ., family = "binomial", data = subdat)
##
## Coefficients:
## (Intercept) AGE Rate
## -0.20658 0.02904 0.55717
##
## Degrees of Freedom: 2111 Total (i.e. Null); 2109 Residual
## Null Deviance: 2907
## Residual Deviance: 2763 AIC: 2769
\[ \begin{align} log(\frac{{p}}{1-p}) & = b_{0} + b_{1}x \\ p & = \frac{{1}}{1+e^{-b_{0} - b_{1}x}} \\ loss & = CE(y, p) = -\frac{{1}}{n}\sum \limits_{i=1}^{n} \left(y_{i} \cdot log(p_{i}) + (1-y_{i}) \cdot log(1-p_{i})\right) \end{align} \]
my_iterator_core = function(batch_size) {
batch = 0
batch_per_epoch = length(Y.array)/batch_size
reset = function() {batch <<- 0}
iter.next = function() {
batch <<- batch+1
if (batch > batch_per_epoch) {return(FALSE)} else {return(TRUE)}
}
value = function() {
idx = 1:batch_size + (batch - 1) * batch_size
idx[idx > ncol(X.array)] = sample(1:ncol(X.array), sum(idx > ncol(X.array)))
data = mx.nd.array(X.array[,idx, drop=FALSE])
label = mx.nd.array(Y.array[,idx, drop=FALSE])
return(list(data = data, label = label))
}
return(list(reset = reset, iter.next = iter.next, value = value, batch_size = batch_size, batch = batch))
}
my_iterator_func <- setRefClass("Custom_Iter",
fields = c("iter", "batch_size"),
contains = "Rcpp_MXArrayDataIter",
methods = list(
initialize = function(iter, batch_size = 100){
.self$iter <- my_iterator_core(batch_size = batch_size)
.self
},
value = function(){
.self$iter$value()
},
iter.next = function(){
.self$iter$iter.next()
},
reset = function(){
.self$iter$reset()
},
finalize=function(){
}
)
)
my_iter = my_iterator_func(iter = NULL, batch_size = 100)
data = mx.symbol.Variable(name = 'data')
label = mx.symbol.Variable(name = 'label')
linear_pred = mx.symbol.FullyConnected(data = data,
num.hidden = 1, no.bias = FALSE, name = 'linear_pred')
logistic_pred = mx.symbol.sigmoid(data = linear_pred, name = 'logistic_pred')
eps = 1e-8
ce_loss_pos = mx.symbol.broadcast_mul(mx.symbol.log(logistic_pred + eps), label)
ce_loss_neg = mx.symbol.broadcast_mul(mx.symbol.log(1 - logistic_pred + eps), 1 - label)
ce_loss_mean = 0 - mx.symbol.mean(ce_loss_pos + ce_loss_neg)
ce_loss = mx.symbol.MakeLoss(ce_loss_mean, name = 'ce_loss')
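– As with the linear model, we can sanity-check the symbol's shapes with mx.symbol.infer.shape before training; a sketch assuming the batch size of 100 used by the iterator:
mx.symbol.infer.shape(ce_loss, data = c(2, 100), label = c(1, 100))$out.shapes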
mx.set.seed(0)
logistic_model = mx.model.FeedForward.create(symbol = ce_loss, X = my_iter, optimizer = my_optimizer,
array.batch.size = 20, ctx = mx.cpu(), num.round = 20)
logistic_model$arg.params
## $linear_pred_weight
## [,1]
## [1,] 0.01686738
## [2,] 0.55081975
##
## $linear_pred_bias
## [1] -0.1971277
– In the previous exercises we already wrote the Iterator, Model architecture, and Optimizer for logistic regression; now let's write the Executor that combines these three parts.
my_executor = mx.simple.bind(symbol = ce_loss,
data = c(2, 100), label = c(1, 100),
ctx = mx.cpu(), grad.req = "write")
mx.set.seed(0)
new_arg = mxnet:::mx.model.init.params(symbol = ce_loss,
input.shape = list(data = c(2, 100), label = c(1, 100)),
output.shape = NULL,
initializer = mxnet:::mx.init.uniform(0.01),
ctx = mx.cpu())
mx.exec.update.arg.arrays(my_executor, new_arg$arg.params, match.name = TRUE)
mx.exec.update.aux.arrays(my_executor, new_arg$aux.params, match.name = TRUE)
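– The training loop below also needs an updater built from the Optimizer; this step is shown explicitly in the later sections, and a sketch of it is:
my_updater = mx.opt.get.updater(optimizer = my_optimizer, weights = my_executor$ref.arg.arrays)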
– my_iter$iter.next() keeps returning TRUE until one epoch is finished, and my_iter$reset() is called at the start of each epoch to reset the batch counter.
my_iter$reset()
while (my_iter$iter.next()) {
my_values <- my_iter$value()
mx.exec.update.arg.arrays(my_executor, arg.arrays = my_values, match.name = TRUE)
mx.exec.forward(my_executor, is.train = TRUE)
mx.exec.backward(my_executor)
update_args = my_updater(weight = my_executor$ref.arg.arrays, grad = my_executor$ref.grad.arrays)
mx.exec.update.arg.arrays(my_executor, update_args, skip.null = TRUE)
}
set.seed(0)
for (i in 1:20) {
my_iter$reset()
batch_loss = NULL
while (my_iter$iter.next()) {
my_values <- my_iter$value()
mx.exec.update.arg.arrays(my_executor, arg.arrays = my_values, match.name = TRUE)
mx.exec.forward(my_executor, is.train = TRUE)
mx.exec.backward(my_executor)
update_args = my_updater(weight = my_executor$ref.arg.arrays, grad = my_executor$ref.grad.arrays)
mx.exec.update.arg.arrays(my_executor, update_args, skip.null = TRUE)
batch_loss = c(batch_loss, as.array(my_executor$ref.outputs$ce_loss_output))
}
if (i %% 10 == 0 | i < 10) {
cat(paste0("epoch = ", i,
": Cross-Entropy (CE) = ", formatC(mean(batch_loss), format = "f", 4),
"; weight = ", paste(formatC(as.array(my_executor$ref.arg.arrays$linear_pred_weight), format = "f", 3), collapse = ", "),
"; bias = ", formatC(as.array(my_executor$ref.arg.arrays$linear_pred_bias), format = "f", 3), "\n"))
}
}
## epoch = 1: Cross-Entropy (CE) = 0.6556; weight = 0.013, 0.619; bias = -0.219
## epoch = 2: Cross-Entropy (CE) = 0.6544; weight = 0.017, 0.541; bias = -0.191
## epoch = 3: Cross-Entropy (CE) = 0.6548; weight = 0.017, 0.545; bias = -0.197
## epoch = 4: Cross-Entropy (CE) = 0.6547; weight = 0.017, 0.553; bias = -0.198
## epoch = 5: Cross-Entropy (CE) = 0.6547; weight = 0.017, 0.551; bias = -0.197
## epoch = 6: Cross-Entropy (CE) = 0.6547; weight = 0.017, 0.551; bias = -0.197
## epoch = 7: Cross-Entropy (CE) = 0.6547; weight = 0.017, 0.551; bias = -0.197
## epoch = 8: Cross-Entropy (CE) = 0.6547; weight = 0.017, 0.551; bias = -0.197
## epoch = 9: Cross-Entropy (CE) = 0.6547; weight = 0.017, 0.551; bias = -0.197
## epoch = 10: Cross-Entropy (CE) = 0.6547; weight = 0.017, 0.551; bias = -0.197
## epoch = 20: Cross-Entropy (CE) = 0.6547; weight = 0.017, 0.551; bias = -0.197
logistic_model <- mxnet:::mx.model.extract.model(symbol = logistic_pred,
train.execs = list(my_executor))
mx.model.save(logistic_model, "logistic_regression", iteration = 0)
logistic_model = mx.model.load("logistic_regression", iteration = 0)
predict_Y = predict(logistic_model, X.array, array.layout = "colmajor")
confusion_table = table(predict_Y > 0.5, Y.array)
print(confusion_table)
## Y.array
## 0 1
## FALSE 883 545
## TRUE 279 405
– Since MxNet's low-level executor can run the Forward/Backward procedure, it can of course also run the Forward pass alone.
– Note that if your validation set is very large, you may need to feed the samples in gradually with a loop for inference, otherwise you will exhaust the memory.
all_layers = logistic_model$symbol$get.internals()
linear_pred_output = all_layers$get.output(which(all_layers$outputs == 'linear_pred_output'))
symbol_group = mx.symbol.Group(c(linear_pred_output, logistic_model$symbol))
Pred_executor = mx.simple.bind(symbol = symbol_group, data = c(2, ncol(X.array)), ctx = mx.cpu())
– The executor is done; the remaining steps are much the same, with only a few differences in detail:
mx.exec.update.arg.arrays(Pred_executor, logistic_model$arg.params, match.name = TRUE)
mx.exec.update.aux.arrays(Pred_executor, logistic_model$aux.params, match.name = TRUE)
mx.exec.update.arg.arrays(Pred_executor, list(data = mx.nd.array(X.array)), match.name = TRUE)
mx.exec.forward(Pred_executor, is.train = FALSE)
PRED_linear_pred = as.array(Pred_executor$ref.outputs$linear_pred_output)
PRED_logistic_pred = as.array(Pred_executor$ref.outputs$logistic_pred_output)
confusion_table = table(PRED_logistic_pred > 0.5, Y.array)
print(confusion_table)
## Y.array
## 0 1
## FALSE 883 545
## TRUE 279 405
– Note in particular that Y needs a special encoding. We have already practiced this many times, so take another look to refresh your memory:
subdat <- dat[!dat[,'AMI'] %in% NA & !dat[,'AGE'] %in% NA & !dat[,'Rate'] %in% NA, c('AMI', 'AGE', 'Rate')]
subdat[,-1] <- scale(subdat[,-1])
used_dat.x <- model.matrix(~ ., data = subdat[,-1])
set.seed(0)
all_idx <- 1:nrow(subdat)
train_idx <- sample(all_idx, nrow(subdat) * 0.6)
valid_idx <- sample(all_idx[!all_idx %in% train_idx], nrow(subdat) * 0.2)
test_idx <- all_idx[!all_idx %in% c(train_idx, valid_idx)]
X_mat <- t(used_dat.x[,-1])
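# The next line one-hot encodes the 3-level outcome: one row per class, one column per sample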
Y_mat <- array(t(model.matrix(~ -1 + subdat[,1])), dim = c(3, nrow(subdat)))
train_X <- X_mat[,train_idx]
valid_X <- X_mat[,valid_idx]
test_X <- X_mat[,test_idx]
train_Y <- Y_mat[,train_idx]
valid_Y <- Y_mat[,valid_idx]
test_Y <- Y_mat[,test_idx]
\[ \begin{align} mlogloss(y_{i}, p_{i}) & = - \sum \limits_{j=1}^{m} y_{ij} \cdot log(p_{ij}) \end{align} \]
Here \(m\) is the number of classes, and \(y_{ij}\) and \(p_{ij}\) are the label and predicted probability of class \(j\) for sample \(i\).
– In the Iterator, note in particular that we must now switch to train_X:
my_iterator_core = function(batch_size) {
batch = 0
batch_per_epoch = ncol(train_X)/batch_size
reset = function() {batch <<- 0}
iter.next = function() {
batch <<- batch+1
if (batch > batch_per_epoch) {return(FALSE)} else {return(TRUE)}
}
value = function() {
idx = 1:batch_size + (batch - 1) * batch_size
idx[idx > ncol(train_Y)] = sample(1:ncol(train_Y), sum(idx > ncol(train_Y)))
data = mx.nd.array(train_X[,idx, drop=FALSE])
label = mx.nd.array(train_Y[,idx, drop=FALSE])
return(list(data = data, label = label))
}
return(list(reset = reset, iter.next = iter.next, value = value, batch_size = batch_size, batch = batch))
}
my_iterator_func <- setRefClass("Custom_Iter",
fields = c("iter", "batch_size"),
contains = "Rcpp_MXArrayDataIter",
methods = list(
initialize = function(iter, batch_size = 100){
.self$iter <- my_iterator_core(batch_size = batch_size)
.self
},
value = function(){
.self$iter$value()
},
iter.next = function(){
.self$iter$iter.next()
},
reset = function(){
.self$iter$reset()
},
finalize=function(){
}
)
)
my_iter = my_iterator_func(iter = NULL, batch_size = 20)
data = mx.symbol.Variable(name = 'data')
label = mx.symbol.Variable(name = 'label')
linear_pred = mx.symbol.FullyConnected(data = data, num.hidden = 3, name = 'linear_pred')
softmax_layer = mx.symbol.softmax(data = linear_pred, axis = 1, name = 'softmax_layer')
eps = 1e-8
m_log = 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label))
m_logloss = mx.symbol.MakeLoss(m_log, name = 'm_logloss')
#1. Build an executor to train model
my_executor = mx.simple.bind(symbol = m_logloss,
data = c(2, 20), label = c(3, 20),
ctx = mx.cpu(), grad.req = "write")
#2. Set the initial parameters
mx.set.seed(0)
new_arg = mxnet:::mx.model.init.params(symbol = m_logloss,
input.shape = list(data = c(2, 20), label = c(3, 20)),
output.shape = NULL,
initializer = mxnet:::mx.init.uniform(0.01),
ctx = mx.cpu())
mx.exec.update.arg.arrays(my_executor, new_arg$arg.params, match.name = TRUE)
mx.exec.update.aux.arrays(my_executor, new_arg$aux.params, match.name = TRUE)
#3. Define the updater
my_updater = mx.opt.get.updater(optimizer = my_optimizer, weights = my_executor$ref.arg.arrays)
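#4. Forward/Backward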
set.seed(0)
for (i in 1:20) {
my_iter$reset()
batch_loss = NULL
while (my_iter$iter.next()) {
my_values <- my_iter$value()
mx.exec.update.arg.arrays(my_executor, arg.arrays = my_values, match.name = TRUE)
mx.exec.forward(my_executor, is.train = TRUE)
mx.exec.backward(my_executor)
update_args = my_updater(weight = my_executor$ref.arg.arrays, grad = my_executor$ref.grad.arrays)
mx.exec.update.arg.arrays(my_executor, update_args, skip.null = TRUE)
batch_loss = c(batch_loss, as.array(my_executor$ref.outputs$m_logloss_output))
}
if (i %% 10 == 0 | i <= 5) {
cat(paste0("epoch = ", i, ": m-logloss = ", formatC(mean(batch_loss), format = "f", 4), "\n"))
}
}
## epoch = 1: m-logloss = 0.2864
## epoch = 2: m-logloss = 0.2531
## epoch = 3: m-logloss = 0.2528
## epoch = 4: m-logloss = 0.2527
## epoch = 5: m-logloss = 0.2527
## epoch = 10: m-logloss = 0.2526
## epoch = 20: m-logloss = 0.2526
softmax_model <- mxnet:::mx.model.extract.model(symbol = softmax_layer,
train.execs = list(my_executor))
predict_Y = predict(softmax_model, valid_X, array.layout = "colmajor")
confusion_table = table(max.col(t(predict_Y)), max.col(t(valid_Y)))
print(confusion_table)
##
## 1 2 3
## 1 207 32 57
library(pROC)
par(mfrow = c(1, 2))
roc_valid <- roc((max.col(t(valid_Y)) == 2) ~ predict_Y[2,])
plot(roc_valid, col = 'blue')
roc_valid <- roc((max.col(t(valid_Y)) == 1) ~ predict_Y[1,])
plot(roc_valid, col = 'blue')
– To perform oversampling we have to modify the Iterator:
my_iterator_core = function(batch_size) {
batch = 0
batch_per_epoch = ncol(train_X)/batch_size
reset = function() {batch <<- 0}
iter.next = function() {
batch <<- batch+1
if (batch > batch_per_epoch) {return(FALSE)} else {return(TRUE)}
}
value = function() {
idx.1 <- sample(which(max.col(t(train_Y)) == 1), floor(batch_size / 3))
idx.2 <- sample(which(max.col(t(train_Y)) == 2), floor(batch_size / 3))
idx.3 <- sample(which(max.col(t(train_Y)) == 3), batch_size - length(idx.1) - length(idx.2))
idx = c(idx.1, idx.2, idx.3)
data = mx.nd.array(train_X[,idx, drop=FALSE])
label = mx.nd.array(train_Y[,idx, drop=FALSE])
return(list(data = data, label = label))
}
return(list(reset = reset, iter.next = iter.next, value = value, batch_size = batch_size, batch = batch))
}
my_iterator_func <- setRefClass("Custom_Iter",
fields = c("iter", "batch_size"),
contains = "Rcpp_MXArrayDataIter",
methods = list(
initialize = function(iter, batch_size = 100){
.self$iter <- my_iterator_core(batch_size = batch_size)
.self
},
value = function(){
.self$iter$value()
},
iter.next = function(){
.self$iter$iter.next()
},
reset = function(){
.self$iter$reset()
},
finalize=function(){
}
)
)
my_iter = my_iterator_func(iter = NULL, batch_size = 20)
#1. Build an executor to train model
my_executor = mx.simple.bind(symbol = m_logloss,
data = c(2, 20), label = c(3, 20),
ctx = mx.cpu(), grad.req = "write")
#2. Set the initial parameters
mx.set.seed(0)
new_arg = mxnet:::mx.model.init.params(symbol = m_logloss,
input.shape = list(data = c(2, 20), label = c(3, 20)),
output.shape = NULL,
initializer = mxnet:::mx.init.uniform(0.01),
ctx = mx.cpu())
mx.exec.update.arg.arrays(my_executor, new_arg$arg.params, match.name = TRUE)
mx.exec.update.aux.arrays(my_executor, new_arg$aux.params, match.name = TRUE)
#3. Define the updater
my_updater = mx.opt.get.updater(optimizer = my_optimizer, weights = my_executor$ref.arg.arrays)
#4. Forward/Backward
set.seed(0)
for (i in 1:20) {
my_iter$reset()
batch_loss = NULL
while (my_iter$iter.next()) {
my_values <- my_iter$value()
mx.exec.update.arg.arrays(my_executor, arg.arrays = my_values, match.name = TRUE)
mx.exec.forward(my_executor, is.train = TRUE)
mx.exec.backward(my_executor)
update_args = my_updater(weight = my_executor$ref.arg.arrays, grad = my_executor$ref.grad.arrays)
mx.exec.update.arg.arrays(my_executor, update_args, skip.null = TRUE)
batch_loss = c(batch_loss, as.array(my_executor$ref.outputs$m_logloss_output))
}
if (i %% 10 == 0 | i <= 5) {
cat(paste0("epoch = ", i, ": m-logloss = ", formatC(mean(batch_loss), format = "f", 4), "\n"))
}
}
## epoch = 1: m-logloss = 0.3618
## epoch = 2: m-logloss = 0.3575
## epoch = 3: m-logloss = 0.3593
## epoch = 4: m-logloss = 0.3611
## epoch = 5: m-logloss = 0.3600
## epoch = 10: m-logloss = 0.3569
## epoch = 20: m-logloss = 0.3579
softmax_model <- mxnet:::mx.model.extract.model(symbol = softmax_layer,
train.execs = list(my_executor))
predict_Y = predict(softmax_model, valid_X, array.layout = "colmajor")
confusion_table = table(max.col(t(predict_Y)), max.col(t(valid_Y)))
print(confusion_table)
##
## 1 2 3
## 1 52 3 4
## 2 24 4 6
## 3 131 25 47
– Let's try turning the softmax regression into a multilayer perceptron with one hidden layer. We only need to modify the Model architecture part; nothing else needs to change:
data = mx.symbol.Variable(name = 'data')
label = mx.symbol.Variable(name = 'label')
fc1 = mx.symbol.FullyConnected(data = data, num.hidden = 10, name = 'fc1')
relu1 = mx.symbol.Activation(data = fc1, act.type = 'relu', name = 'relu1')
fc2 = mx.symbol.FullyConnected(data = relu1, num.hidden = 3, name = 'fc2')
softmax_layer = mx.symbol.softmax(data = fc2, axis = 1, name = 'softmax_layer')
eps = 1e-8
m_log = 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label))
m_logloss = mx.symbol.MakeLoss(m_log, name = 'm_logloss')
#1. Build an executor to train model
my_executor = mx.simple.bind(symbol = m_logloss,
data = c(2, 20), label = c(3, 20),
ctx = mx.cpu(), grad.req = "write")
#2. Set the initial parameters
mx.set.seed(0)
new_arg = mxnet:::mx.model.init.params(symbol = m_logloss,
input.shape = list(data = c(2, 20), label = c(3, 20)),
output.shape = NULL,
initializer = mxnet:::mx.init.uniform(0.01),
ctx = mx.cpu())
mx.exec.update.arg.arrays(my_executor, new_arg$arg.params, match.name = TRUE)
mx.exec.update.aux.arrays(my_executor, new_arg$aux.params, match.name = TRUE)
#3. Define the updater
my_updater = mx.opt.get.updater(optimizer = my_optimizer, weights = my_executor$ref.arg.arrays)
#4. Forward/Backward
set.seed(0)
for (i in 1:20) {
my_iter$reset()
batch_loss = NULL
while (my_iter$iter.next()) {
my_values <- my_iter$value()
mx.exec.update.arg.arrays(my_executor, arg.arrays = my_values, match.name = TRUE)
mx.exec.forward(my_executor, is.train = TRUE)
mx.exec.backward(my_executor)
update_args = my_updater(weight = my_executor$ref.arg.arrays, grad = my_executor$ref.grad.arrays)
mx.exec.update.arg.arrays(my_executor, update_args, skip.null = TRUE)
batch_loss = c(batch_loss, as.array(my_executor$ref.outputs$m_logloss_output))
}
if (i %% 10 == 0 | i <= 5) {
cat(paste0("epoch = ", i, ": m-logloss = ", formatC(mean(batch_loss), format = "f", 4), "\n"))
}
}
## epoch = 1: m-logloss = 0.3639
## epoch = 2: m-logloss = 0.3630
## epoch = 3: m-logloss = 0.3630
## epoch = 4: m-logloss = 0.3629
## epoch = 5: m-logloss = 0.3629
## epoch = 10: m-logloss = 0.3605
## epoch = 20: m-logloss = 0.3454
softmax_model <- mxnet:::mx.model.extract.model(symbol = softmax_layer,
train.execs = list(my_executor))
predict_Y = predict(softmax_model, valid_X, array.layout = "colmajor")
confusion_table = table(max.col(t(predict_Y)), max.col(t(valid_Y)))
print(confusion_table)
##
## 1 2 3
## 1 88 7 8
## 2 2 0 2
## 3 117 25 47
subdat <- dat[!dat[,'time'] %in% NA & !dat[,'death'] %in% NA & !dat[,'AGE'] %in% NA & !dat[,'Rate'] %in% NA, c('time', 'death', 'AGE', 'Rate')]
subdat[,-1:-2] <- scale(subdat[,-1:-2])
used_dat.x <- model.matrix(~ ., data = subdat[,-1:-2])
set.seed(0)
all_idx <- 1:nrow(subdat)
train_idx <- sample(all_idx, nrow(subdat) * 0.6)
valid_idx <- sample(all_idx[!all_idx %in% train_idx], nrow(subdat) * 0.2)
test_idx <- all_idx[!all_idx %in% c(train_idx, valid_idx)]
X_mat <- t(used_dat.x[,-1])
train_X <- X_mat[,train_idx]
valid_X <- X_mat[,valid_idx]
test_X <- X_mat[,test_idx]
train_Y <- subdat[train_idx,1:2]
valid_Y <- subdat[valid_idx,1:2]
test_Y <- subdat[test_idx,1:2]
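– For comparison, a Cox regression is fitted on the training set (the call matches the Call line below; this assumes the survival package is loaded):
library(survival)
coxph(Surv(time, death) ~ ., data = subdat[train_idx,])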
## Call:
## coxph(formula = Surv(time, death) ~ ., data = subdat[train_idx,
## ])
##
## coef exp(coef) se(coef) z p
## AGE 0.68026 1.97439 0.07475 9.101 < 2e-16
## Rate 0.34593 1.41330 0.04715 7.336 2.2e-13
##
## Likelihood ratio test=144.2 on 2 df, p=< 2.2e-16
## n= 2800, number of events= 267
survival_encode <- function (event, time) {
sub_dat <- data.frame(id = 1:length(event), event = event, time = time, stringsAsFactors = FALSE)
sub_dat <- sub_dat[order(sub_dat[,'event'], decreasing = TRUE),]
sub_dat <- sub_dat[order(sub_dat[,'time']),]
label <- array(0L, dim = rep(nrow(sub_dat), 2))
mask <- array(0L, dim = rep(nrow(sub_dat), 2))
diag(label) <- 1L
for (m in 1:nrow(sub_dat)) {
mask[m,m:nrow(sub_dat)] <- 1L
if (sub_dat[m,'event'] == 0) {
mask[m,] <- 0L
} else if (m > 1) {
zero_pos <- which(sub_dat[1:(m-1),'time'] == sub_dat[m,'time'])
if (length(zero_pos) > 0) {
mask[zero_pos,m] <- 0L
}
}
}
original_pos <- order(sub_dat[,'id'])
return(list(label = label[,original_pos,drop = FALSE], mask = mask[,original_pos,drop = FALSE]))
}
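– To see what this encoder produces, here is a small illustrative call on hypothetical data (four subjects, the second one censored):
toy <- survival_encode(event = c(1, 0, 1, 1), time = c(5, 6, 2, 5))
toy$label  # marks each subject's position in the time-ordered table
toy$mask   # each non-zero row is the risk set at one (uncensored) event time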
my_iterator_core = function(batch_size) {
batch = 0
batch_per_epoch = ncol(train_X)/batch_size
reset = function() {batch <<- 0}
iter.next = function() {
batch <<- batch+1
if (batch > batch_per_epoch) {return(FALSE)} else {return(TRUE)}
}
value = function() {
idx = 1:batch_size + (batch - 1) * batch_size
idx[idx > ncol(train_X)] = sample(1:ncol(train_X), sum(idx > ncol(train_X)))
data = mx.nd.array(train_X[,idx, drop=FALSE])
label_list = survival_encode(event = train_Y[idx,'death'], time = train_Y[idx,'time'])
return(list(data = data, label = mx.nd.array(label_list$label), mask = mx.nd.array(label_list$mask)))
}
return(list(reset = reset, iter.next = iter.next, value = value, batch_size = batch_size, batch = batch))
}
my_iterator_func <- setRefClass("Custom_Iter",
fields = c("iter", "batch_size"),
contains = "Rcpp_MXArrayDataIter",
methods = list(
initialize = function(iter, batch_size = 100){
.self$iter <- my_iterator_core(batch_size = batch_size)
.self
},
value = function(){
.self$iter$value()
},
iter.next = function(){
.self$iter$iter.next()
},
reset = function(){
.self$iter$reset()
},
finalize=function(){
}
)
)
my_iter = my_iterator_func(iter = NULL, batch_size = 500)
\[ \begin{align} loss & = \sum \limits_{\text{by column}} log( \sum \limits_{\text{by row}} M \otimes exp(H) ) - \sum \limits_{\text{by column}} log( \sum \limits_{\text{by row}} M \otimes Y \otimes exp(H) ) \end{align} \]
Here \(H\) is the network output (the linear predictor), \(Y\) is the label matrix produced by survival_encode, \(M\) is its at-risk mask, and \(\otimes\) denotes element-wise multiplication.
data = mx.symbol.Variable(name = 'data')
final_pred = mx.symbol.FullyConnected(data = data, num.hidden = 1, no.bias = TRUE, name = 'final_pred')
COX_LOSS <- function (indata, inlabel, inmask, eps = 1e-8, make_loss = TRUE) {
exp_data <- mx.symbol.exp(data = indata, name = 'exp_data')
part_event_y <- mx.symbol.broadcast_mul(lhs = inlabel, rhs = inmask, name = 'part_event_y')
part_event <- mx.symbol.broadcast_mul(lhs = part_event_y, rhs = exp_data, name = 'part_event')
part_at_risk <- mx.symbol.broadcast_mul(lhs = inmask, rhs = exp_data, name = 'part_at_risk')
sum_part_event <- mx.symbol.sum(data = part_event, axis = 0, keepdims = TRUE, name = 'part_event')
sum_part_at_risk <- mx.symbol.sum(data = part_at_risk, axis = 0, keepdims = TRUE, name = 'sum_part_at_risk')
log_part_event <- mx.symbol.log(data = sum_part_event + eps, name = 'log_part_event')
log_part_at_risk <- mx.symbol.log(data = sum_part_at_risk + eps, name = 'log_part_at_risk')
diff_at_risk_event <- mx.symbol.broadcast_minus(lhs = log_part_at_risk, rhs = log_part_event, name = 'diff_at_risk_event')
important_time <- mx.symbol.sum(data = part_event_y, axis = 0, keepdims = TRUE, name = 'important_time')
number_time <- mx.symbol.sum(data = important_time, axis = 1, name = 'number_time')
event_loss <- mx.symbol.broadcast_mul(lhs = diff_at_risk_event, rhs = important_time, name = 'event_loss')
sum_event_loss <- mx.symbol.sum(data = event_loss, axis = 1, name = 'sum_event_loss')
final_loss <- mx.symbol.broadcast_div(lhs = sum_event_loss, rhs = number_time, name = 'final_loss')
if (make_loss) {final_loss <- mx.symbol.MakeLoss(data = final_loss, name = 'COX_loss')}
return(final_loss)
}
label_sym <- mx.symbol.Variable('label')
mask_sym <- mx.symbol.Variable('mask')
loss_symbol <- COX_LOSS(indata = final_pred, inlabel = label_sym, inmask = mask_sym)
#1. Build an executor to train model
my_executor = mx.simple.bind(symbol = loss_symbol,
data = c(2, 500), label = c(500, 500), mask = c(500, 500),
ctx = mx.cpu(), grad.req = "write")
#2. Set the initial parameters
mx.set.seed(0)
new_arg = mxnet:::mx.model.init.params(symbol = loss_symbol,
input.shape = list(data = c(2, 500), label = c(500, 500), mask = c(500, 500)),
output.shape = NULL,
initializer = mxnet:::mx.init.uniform(0.01),
ctx = mx.cpu())
mx.exec.update.arg.arrays(my_executor, new_arg$arg.params, match.name = TRUE)
mx.exec.update.aux.arrays(my_executor, new_arg$aux.params, match.name = TRUE)
#3. Define the updater
my_optimizer = mx.opt.create(name = "sgd", learning.rate = 0.05, momentum = 0.9, wd = 0)
my_updater = mx.opt.get.updater(optimizer = my_optimizer, weights = my_executor$ref.arg.arrays)
#4. Forward/Backward
set.seed(0)
for (i in 1:20) {
my_iter$reset()
batch_loss = NULL
while (my_iter$iter.next()) {
my_values <- my_iter$value()
mx.exec.update.arg.arrays(my_executor, arg.arrays = my_values, match.name = TRUE)
mx.exec.forward(my_executor, is.train = TRUE)
mx.exec.backward(my_executor)
update_args = my_updater(weight = my_executor$ref.arg.arrays, grad = my_executor$ref.grad.arrays)
mx.exec.update.arg.arrays(my_executor, update_args, skip.null = TRUE)
batch_loss = c(batch_loss, as.array(my_executor$ref.outputs$COX_loss_output))
}
if (i %% 10 == 0 | i <= 5) {
cat(paste0("epoch = ", i, ": loss = ", formatC(mean(batch_loss), format = "f", 4), "\n"))
}
}
## epoch = 1: loss = 5.5123
## epoch = 2: loss = 5.3611
## epoch = 3: loss = 5.3609
## epoch = 4: loss = 5.3483
## epoch = 5: loss = 5.3471
## epoch = 10: loss = 5.3277
## epoch = 20: loss = 5.3253
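– The coefficients below were presumably read from the trained executor; compare them with the coxph coefficients above. A sketch of the call:
as.array(my_executor$ref.arg.arrays$final_pred_weight)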
## [,1]
## [1,] 0.6448541
## [2,] 0.3448658
– This time we use a smaller batch size, which speeds things up a lot:
data = mx.symbol.Variable(name = 'data')
fc1 = mx.symbol.FullyConnected(data = data, num.hidden = 10, name = 'fc1')
relu1 = mx.symbol.Activation(data = fc1, act.type = 'relu', name = 'relu1')
final_pred = mx.symbol.FullyConnected(data = relu1, num.hidden = 1, no.bias = TRUE, name = 'final_pred')
label_sym <- mx.symbol.Variable('label')
mask_sym <- mx.symbol.Variable('mask')
loss_symbol <- COX_LOSS(indata = final_pred, inlabel = label_sym, inmask = mask_sym)
#1. Build an executor to train model
my_executor = mx.simple.bind(symbol = loss_symbol,
data = c(2, 50), label = c(50, 50), mask = c(50, 50),
ctx = mx.cpu(), grad.req = "write")
#2. Set the initial parameters
mx.set.seed(0)
new_arg = mxnet:::mx.model.init.params(symbol = loss_symbol,
input.shape = list(data = c(2, 50), label = c(50, 50), mask = c(50, 50)),
output.shape = NULL,
initializer = mxnet:::mx.init.uniform(0.01),
ctx = mx.cpu())
mx.exec.update.arg.arrays(my_executor, new_arg$arg.params, match.name = TRUE)
mx.exec.update.aux.arrays(my_executor, new_arg$aux.params, match.name = TRUE)
#3. Define the updater
my_optimizer = mx.opt.create(name = "sgd", learning.rate = 0.05, momentum = 0.9, wd = 0)
my_updater = mx.opt.get.updater(optimizer = my_optimizer, weights = my_executor$ref.arg.arrays)
#4. Forward/Backward
set.seed(0)
for (i in 1:20) {
my_iter$reset()
batch_loss = NULL
while (my_iter$iter.next()) {
my_values <- my_iter$value()
mx.exec.update.arg.arrays(my_executor, arg.arrays = my_values, match.name = TRUE)
mx.exec.forward(my_executor, is.train = TRUE)
mx.exec.backward(my_executor)
update_args = my_updater(weight = my_executor$ref.arg.arrays, grad = my_executor$ref.grad.arrays)
mx.exec.update.arg.arrays(my_executor, update_args, skip.null = TRUE)
batch_loss = c(batch_loss, as.array(my_executor$ref.outputs$COX_loss_output))
}
if (i %% 10 == 0 | i <= 5) {
cat(paste0("epoch = ", i, ": loss = ", formatC(mean(batch_loss), format = "f", 4), "\n"))
}
}
## epoch = 1: loss = 3.2296
## epoch = 2: loss = 3.1440
## epoch = 3: loss = 3.2010
## epoch = 4: loss = 3.1345
## epoch = 5: loss = 3.1355
## epoch = 10: loss = 3.1025
## epoch = 20: loss = 3.1012
– Note that we must remove the mask before we can make predictions:
COX_model <- mxnet:::mx.model.extract.model(symbol = final_pred,
train.execs = list(my_executor))
COX_model$arg.params <- COX_model$arg.params[!names(COX_model$arg.params) %in% 'mask']
predict_Y <- predict(COX_model, valid_X, array.layout = "colmajor")
valid_Y[,'score'] <- predict_Y[1,]
concordance(coxph(Surv(time, death) ~ score, data = valid_Y))[['concordance']]
## [1] 0.7538015
cox_fit <- coxph(Surv(time, death) ~ ., data = subdat[train_idx,])
valid_Y[,'score'] <- predict(cox_fit, subdat[valid_idx,])
concordance(coxph(Surv(time, death) ~ score, data = valid_Y))[['concordance']]
## [1] 0.7478008
– For example, the Iterator lets us apply all kinds of preprocessing before the data are fed into gradient descent.
– Running the low-level Executor with a loop lets us perform fine-grained operations during the analysis.
– And we can finally call on the GPU for computation.