林嶔 (Lin, Chin)
Lesson 8: Deconvolution Layers and Autoencoders
– But so far there are still too few settings in which it can be applied, so we now start to show you where these techniques can actually be put to use.
– An autoencoder is a data-compression algorithm in which the compression and decompression functions are data-specific, lossy, and learned automatically from the samples. In practice the two functions are trained jointly so that the decompressed output reproduces the original input as closely as possible (here we will minimize a mean-squared reconstruction error).
library(data.table)
DAT = fread("data/MNIST.csv", data.table = FALSE)
DAT = data.matrix(DAT)
#Split data
set.seed(0)
Train.sample = sample(1:nrow(DAT), nrow(DAT)*0.6, replace = FALSE)
Train.X = DAT[Train.sample,-1]
Train.Y = DAT[Train.sample,1]
Test.X = DAT[-Train.sample,-1]
Test.Y = DAT[-Train.sample,1]
#Display
library(OpenImageR)
imageShow(t(matrix(as.numeric(Train.X[1,]), nrow = 28, byrow = TRUE)))
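Note that the iterator defined next reads from 'data/train_data.csv', and the test code further below reads 'data/test_data.csv'. Neither file is created in the code shown here; a plausible way to write them from the split above (label in the first column, as the iterator expects) is:
fwrite(x = data.table(cbind(Train.Y, Train.X)),
       file = 'data/train_data.csv',
       col.names = FALSE, row.names = FALSE)
fwrite(x = data.table(cbind(Test.Y, Test.X)),
       file = 'data/test_data.csv',
       col.names = FALSE, row.names = FALSE)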
library(mxnet)
my_iterator_func <- setRefClass("Custom_Iter1",
fields = c("iter", "data.csv", "data.shape", "batch.size"),
contains = "Rcpp_MXArrayDataIter",
methods = list(
initialize = function(iter, data.csv, data.shape, batch.size){
csv_iter <- mx.io.CSVIter(data.csv = data.csv, data.shape = data.shape, batch.size = batch.size)
.self$iter <- csv_iter
.self
},
value = function(){
val <- as.array(.self$iter$value()$data)
val.x <- val[-1,]
dim(val.x) <- c(28, 28, 1, ncol(val.x))
val.x <- val.x/255
val.x <- mx.nd.array(val.x)
val.y <- val.x
list(data=val.x, label=val.y)
},
iter.next = function(){
.self$iter$iter.next()
},
reset = function(){
.self$iter$reset()
},
finalize=function(){
}
)
)
my_iter1 = my_iterator_func(iter = NULL, data.csv = 'data/train_data.csv', data.shape = 785, batch.size = 20)
– Let's look once more at how this Iterator is used:
my_iter1$iter.next()
## [1] TRUE
my_value = my_iter1$value()
library(OpenImageR)
imageShow(t(matrix(as.numeric(as.array(my_value$data)[,,,1]), nrow = 28, byrow = TRUE)))
– Note in particular that, for the Encoder to genuinely compress the data, the total number of values in each layer must be smaller than in the previous layer!
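For the network defined below this indeed holds: each 784-pixel image is squeezed through a 128-unit hidden layer down to a 32-dimensional code. A quick check of the compression factors:
# Encoder sizes used below: 784 (28 x 28 pixels) -> 128 -> 32
784 / 128  # about 6.1x after the first layer
784 / 32   # 24.5x at the bottleneck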
# Encoder
data <- mx.symbol.Variable('data')
fc1 <- mx.symbol.FullyConnected(data = data, num.hidden = 128, name = 'fc1')
relu1 <- mx.symbol.Activation(data = fc1, act_type = "relu", name = 'relu1')
encoder <- mx.symbol.FullyConnected(data = relu1, num.hidden = 32, name = 'encoder')
# Decoder
fc3 <- mx.symbol.FullyConnected(data = encoder, num.hidden = 128, name = 'fc3')
relu3 <- mx.symbol.Activation(data = fc3, act_type = "relu", name = 'relu3')
fc4 <- mx.symbol.FullyConnected(data = relu3, num.hidden = 784, name = 'fc4')
decoder <- mx.symbol.reshape(data = fc4, shape = c(28, 28, 1, -1), name = 'decoder')
# MSE loss
label <- mx.symbol.Variable(name = 'label')
residual <- mx.symbol.broadcast_minus(lhs = label, rhs = decoder)
square_residual <- mx.symbol.square(data = residual)
mean_square_residual <- mx.symbol.mean(data = square_residual, axis = 0:3, keepdims = FALSE)
mse_loss <- mx.symbol.MakeLoss(data = mean_square_residual, name = 'mse')
my_optimizer <- mx.opt.create(name = "adam", learning.rate = 0.001, beta1 = 0.9, beta2 = 0.999, wd = 1e-4)
my.eval.metric.loss <- mx.metric.custom(
name = "mlog-loss",
function(real, pred) {
return(as.array(pred))
}
)
mx.set.seed(0)
model <- mx.model.FeedForward.create(symbol = mse_loss, X = my_iter1, optimizer = my_optimizer,
eval.metric = my.eval.metric.loss,
array.batch.size = 20, ctx = mx.gpu(), num.round = 20)
model$symbol <- decoder
Test.DAT = fread("data/test_data.csv", data.table = FALSE)
Test.X = t(Test.DAT[,-1])
dim(Test.X) = c(28, 28, 1, ncol(Test.X))
Test.X = Test.X/255
Test.Y = Test.DAT[,1]
unzip_pred <- predict(model, Test.X)
unzip_pred[unzip_pred > 1] <- 1
unzip_pred[unzip_pred < 0] <- 0
library(imager)
par(mar=rep(0,4), mfcol = c(4, 5))
for (i in 1:10) {
plot(NA, xlim = c(0.04, 0.96), ylim = c(0.04, 0.96), xaxt = "n", yaxt = "n", bty = "n")
rasterImage((Test.X[,,,i]), 0, 0, 1, 1, interpolate=FALSE)
plot(NA, xlim = c(0.04, 0.96), ylim = c(0.04, 0.96), xaxt = "n", yaxt = "n", bty = "n")
rasterImage((unzip_pred[,,,i]), 0, 0, 1, 1, interpolate=FALSE)
}
– Beyond this, you can also try feeding in an arbitrary string of 32 numbers and see what the decompression model decodes it into!
– Separating out the compression model is not hard; we use an approach similar to the one we used earlier for transfer feature learning:
library(magrittr)  # for the %>% pipe used below (if not already attached)
all_layers <- model$symbol$get.internals()
encoder_output <- which(all_layers$outputs == 'encoder_output') %>% all_layers$get.output()
encoder_model <- model
encoder_model$symbol <- encoder_output
encoder_model$arg.params <- encoder_model$arg.params[names(encoder_model$arg.params) %in% names(mx.symbol.infer.shape(encoder_output, data = c(28, 28, 1, 7))$arg.shapes)]
encoder_model$aux.params <- encoder_model$aux.params[names(encoder_model$aux.params) %in% names(mx.symbol.infer.shape(encoder_output, data = c(28, 28, 1, 7))$aux.shapes)]
zip_code <- predict(encoder_model, Test.X)
dim(zip_code)
## [1] 32 16800
# Decoder
data <- mx.symbol.Variable('data')
fc3 <- mx.symbol.FullyConnected(data = data, num.hidden = 128, name = 'fc3')
relu3 <- mx.symbol.Activation(data = fc3, act_type = "relu", name = 'relu3')
fc4 <- mx.symbol.FullyConnected(data = relu3, num.hidden = 784, name = 'fc4')
decoder_output <- mx.symbol.reshape(data = fc4, shape = c(28, 28, 1, -1), name = 'decoder')
decoder_model <- model
decoder_model$symbol <- decoder_output
decoder_model$arg.params <- decoder_model$arg.params[names(decoder_model$arg.params) %in% names(mx.symbol.infer.shape(decoder_output, data = c(32, 7))$arg.shapes)]
decoder_model$aux.params <- decoder_model$aux.params[names(decoder_model$aux.params) %in% names(mx.symbol.infer.shape(decoder_output, data = c(32, 7))$aux.shapes)]
unzip_pred <- predict(decoder_model, zip_code, array.layout = 'colmajor')
unzip_pred[unzip_pred > 1] <- 1
unzip_pred[unzip_pred < 0] <- 0
library(imager)
par(mar=rep(0,4), mfcol = c(4, 5))
for (i in 1:20) {
plot(NA, xlim = c(0.04, 0.96), ylim = c(0.04, 0.96), xaxt = "n", yaxt = "n", bty = "n")
rasterImage((unzip_pred[,,,i]), 0, 0, 1, 1, interpolate=FALSE)
}
random_zip_code <- array(rnorm(320, sd = 3), dim = c(32, 10))
unzip_pred <- predict(decoder_model, random_zip_code, array.layout = 'colmajor')
unzip_pred[unzip_pred > 1] <- 1
unzip_pred[unzip_pred < 0] <- 0
library(imager)
par(mar=rep(0,4), mfcol = c(2, 5))
for (i in 1:10) {
plot(NA, xlim = c(0.04, 0.96), ylim = c(0.04, 0.96), xaxt = "n", yaxt = "n", bty = "n")
rasterImage(t(unzip_pred[,,,i]), 0, 0, 1, 1, interpolate=FALSE)
}
test_array <- Test.X
test_array <- test_array + rnorm(prod(dim(test_array)), sd = 0.3)
test_array[test_array > 1] <- 1
test_array[test_array < 0] <- 0
unzip_pred <- predict(model, test_array)
unzip_pred[unzip_pred > 1] <- 1
unzip_pred[unzip_pred < 0] <- 0
library(imager)
par(mar=rep(0,4), mfcol = c(4, 5))
for (i in 1:10) {
plot(NA, xlim = c(0.04, 0.96), ylim = c(0.04, 0.96), xaxt = "n", yaxt = "n", bty = "n")
rasterImage((test_array[,,,i]), 0, 0, 1, 1, interpolate=FALSE)
plot(NA, xlim = c(0.04, 0.96), ylim = c(0.04, 0.96), xaxt = "n", yaxt = "n", bty = "n")
rasterImage((unzip_pred[,,,i]), 0, 0, 1, 1, interpolate=FALSE)
}
– Until today, every convolution layer we have used could only shrink the feature map (down-sampling). For image classification this is not much of a problem, but for other tasks it is quite limiting.
The operation itself is not hard to carry out either; let's work through the process with a simple example.
Suppose X is a 3-dimensional array and Filter is a standard 2x2 deconvolution filter:
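The definitions of X and Filter are not shown in the text; values consistent with the outputs below are:
X <- array(1:9, dim = c(3, 3, 1))
Filter <- array(c(-1, 0, 0, 1), dim = c(2, 2, 1, 1))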
– This is the case with a stride of 1:
Filter_size <- dim(Filter)[1]
Stride <- 1
out <- array(0, dim = c(4, 4, 1))
for (l in 1:dim(X)[3]) {
for (k in 1:dim(Filter)[3]) {
for (j in 1:dim(X)[2]) {
for (i in 1:dim(X)[1]) {
row_seq <- ((i-1) * Stride + 1):((i-1) * Stride + Filter_size)
col_seq <- ((j-1) * Stride + 1):((j-1) * Stride + Filter_size)
out[row_seq,col_seq,k] <- out[row_seq,col_seq,k] + X[i,j,l] * Filter[,,k,l]
}
}
}
}
out
## , , 1
##
## [,1] [,2] [,3] [,4]
## [1,] -1 -4 -7 0
## [2,] -2 -4 -4 7
## [3,] -3 -4 -4 8
## [4,] 0 3 6 9
– This is the case with a stride of 2:
Filter_size <- dim(Filter)[1]
Stride <- 2
out <- array(0, dim = c(6, 6, 1))
for (l in 1:dim(X)[3]) {
for (k in 1:dim(Filter)[3]) {
for (j in 1:dim(X)[2]) {
for (i in 1:dim(X)[1]) {
row_seq <- ((i-1) * Stride + 1):((i-1) * Stride + Filter_size)
col_seq <- ((j-1) * Stride + 1):((j-1) * Stride + Filter_size)
out[row_seq,col_seq,k] <- out[row_seq,col_seq,k] + X[i,j,l] * Filter[,,k,l]
}
}
}
}
out
## , , 1
##
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] -1 0 -4 0 -7 0
## [2,] 0 1 0 4 0 7
## [3,] -2 0 -5 0 -8 0
## [4,] 0 2 0 5 0 8
## [5,] -3 0 -6 0 -9 0
## [6,] 0 3 0 6 0 9
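Note how the feature map grows: with no padding, a deconvolution with kernel size k and stride s turns an n x n input into an output of size (n - 1) * s + k, which matches the 4 x 4 and 6 x 6 results above:
# Deconvolution output size (no padding): (n - 1) * stride + kernel
(3 - 1) * 1 + 2  # stride 1: 4
(3 - 1) * 2 + 2  # stride 2: 6
Next we run the same loops with a two-channel X and a Filter that maps the two input channels onto a single output channel: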
X <- array(1:18, dim = c(3, 3, 2))
Filter <- array(c(-1, 0, 0, 1, 0, 1, -1, 0), dim = c(2, 2, 1, 2))
– This is the case with a stride of 1:
Filter_size <- dim(Filter)[1]
Stride <- 1
out <- array(0, dim = c(4, 4, 1))
for (l in 1:dim(X)[3]) {
for (k in 1:dim(Filter)[3]) {
for (j in 1:dim(X)[2]) {
for (i in 1:dim(X)[1]) {
row_seq <- ((i-1) * Stride + 1):((i-1) * Stride + Filter_size)
col_seq <- ((j-1) * Stride + 1):((j-1) * Stride + Filter_size)
out[row_seq,col_seq,k] <- out[row_seq,col_seq,k] + X[i,j,l] * Filter[,,k,l]
}
}
}
}
out
## , , 1
##
## [,1] [,2] [,3] [,4]
## [1,] -1 -14 -20 -16
## [2,] 8 -2 -2 -10
## [3,] 8 -2 -2 -10
## [4,] 12 18 24 9
– This is the case with a stride of 2:
Filter_size <- dim(Filter)[1]
Stride <- 2
out <- array(0, dim = c(6, 6, 1))
for (l in 1:dim(X)[3]) {
for (k in 1:dim(Filter)[3]) {
for (j in 1:dim(X)[2]) {
for (i in 1:dim(X)[1]) {
row_seq <- ((i-1) * Stride + 1):((i-1) * Stride + Filter_size)
col_seq <- ((j-1) * Stride + 1):((j-1) * Stride + Filter_size)
out[row_seq,col_seq,k] <- out[row_seq,col_seq,k] + X[i,j,l] * Filter[,,k,l]
}
}
}
}
out
## , , 1
##
## [,1] [,2] [,3] [,4] [,5] [,6]
## [1,] -1 -10 -4 -13 -7 -16
## [2,] 10 1 13 4 16 7
## [3,] -2 -11 -5 -14 -8 -17
## [4,] 11 2 14 5 17 8
## [5,] -3 -12 -6 -15 -9 -18
## [6,] 12 3 15 6 18 9
# Encoder
data <- mx.symbol.Variable('data')
conv1 <- mx.symbol.Convolution(data = data, kernel = c(7, 7), stride = c(7, 7), num_filter = 8, name = 'conv1')
relu1 <- mx.symbol.Activation(data = conv1, act_type = "relu", name = 'relu1')
conv2 <- mx.symbol.Convolution(data = relu1, kernel = c(2, 2), stride = c(2, 2), num_filter = 16, name = 'conv2')
relu2 <- mx.symbol.Activation(data = conv2, act_type = "relu", name = 'relu2')
encoder <- mx.symbol.Convolution(data = relu2, kernel = c(2, 2), stride = c(2, 2), num_filter = 32, name = 'encoder')
# Decoder
deconv3 <- mx.symbol.Deconvolution(data = encoder, kernel = c(2, 2), stride = c(2, 2), num_filter = 16, name = 'deconv3')
relu3 <- mx.symbol.Activation(data = deconv3, act_type = "relu", name = 'relu3')
deconv4 <- mx.symbol.Deconvolution(data = relu3, kernel = c(2, 2), stride = c(2, 2), num_filter = 8, name = 'deconv4')
relu4 <- mx.symbol.Activation(data = deconv4, act_type = "relu", name = 'relu4')
decoder <- mx.symbol.Deconvolution(data = relu4, kernel = c(7, 7), stride = c(7, 7), num_filter = 1, name = 'decoder')
# MSE loss
label <- mx.symbol.Variable(name = 'label')
residual <- mx.symbol.broadcast_minus(lhs = label, rhs = decoder)
square_residual <- mx.symbol.square(data = residual)
mean_square_residual <- mx.symbol.mean(data = square_residual, axis = 0:3, keepdims = FALSE)
mse_loss <- mx.symbol.MakeLoss(data = mean_square_residual, name = 'mse')
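The call that trains this convolutional autoencoder is omitted above; a sketch assuming the same iterator, optimizer, evaluation metric and number of rounds as for the fully connected version:
mx.set.seed(0)
model <- mx.model.FeedForward.create(symbol = mse_loss, X = my_iter1, optimizer = my_optimizer,
                                     eval.metric = my.eval.metric.loss,
                                     array.batch.size = 20, ctx = mx.gpu(), num.round = 20)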
model$symbol <- decoder
unzip_pred <- predict(model, Test.X)
unzip_pred[unzip_pred > 1] <- 1
unzip_pred[unzip_pred < 0] <- 0
library(imager)
par(mar=rep(0,4), mfcol = c(4, 5))
for (i in 1:10) {
plot(NA, xlim = c(0.04, 0.96), ylim = c(0.04, 0.96), xaxt = "n", yaxt = "n", bty = "n")
rasterImage((Test.X[,,,i]), 0, 0, 1, 1, interpolate=FALSE)
plot(NA, xlim = c(0.04, 0.96), ylim = c(0.04, 0.96), xaxt = "n", yaxt = "n", bty = "n")
rasterImage((unzip_pred[,,,i]), 0, 0, 1, 1, interpolate=FALSE)
}
– This is the compression (encoder) model:
all_layers <- model$symbol$get.internals()
encoder_output <- which(all_layers$outputs == 'encoder_output') %>% all_layers$get.output()
encoder_model <- model
encoder_model$symbol <- encoder_output
encoder_model$arg.params <- encoder_model$arg.params[names(encoder_model$arg.params) %in% names(mx.symbol.infer.shape(encoder_output, data = c(28, 28, 1, 7))$arg.shapes)]
encoder_model$aux.params <- encoder_model$aux.params[names(encoder_model$aux.params) %in% names(mx.symbol.infer.shape(encoder_output, data = c(28, 28, 1, 7))$aux.shapes)]
– This is the decompression (decoder) model:
data <- mx.symbol.Variable('data')
deconv3 <- mx.symbol.Deconvolution(data = data, kernel = c(2, 2), stride = c(2, 2), num_filter = 16, name = 'deconv3')
relu3 <- mx.symbol.Activation(data = deconv3, act_type = "relu", name = 'relu3')
deconv4 <- mx.symbol.Deconvolution(data = relu3, kernel = c(2, 2), stride = c(2, 2), num_filter = 8, name = 'deconv4')
relu4 <- mx.symbol.Activation(data = deconv4, act_type = "relu", name = 'relu4')
decoder_output <- mx.symbol.Deconvolution(data = relu4, kernel = c(7, 7), stride = c(7, 7), num_filter = 1, name = 'decoder')
decoder_model <- model
decoder_model$symbol <- decoder_output
decoder_model$arg.params <- decoder_model$arg.params[names(decoder_model$arg.params) %in% names(mx.symbol.infer.shape(decoder_output, data = c(1, 1, 32, 1))$arg.shapes)]
decoder_model$aux.params <- decoder_model$aux.params[names(decoder_model$aux.params) %in% names(mx.symbol.infer.shape(decoder_output, data = c(1, 1, 32, 1))$aux.shapes)]
img_input <- Test.X[,,,1]
dim(img_input) <- c(28, 28, 1, 1)
Input <- predict(encoder_model, img_input)
dim(Input)
## [1] 1 1 32 1
Output <- predict(decoder_model, Input)
dim(Output)
## [1] 28 28 1 1
DECONV_func <- function (X, WEIGHT, STRIDE) {
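  # Note: this hand-rolled version assumes the kernel size equals the stride
  # (true for every Deconvolution layer used above), so each input value adds a
  # scaled copy of the filter onto its own non-overlapping STRIDE x STRIDE block.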
original_size <- dim(X)[1]
out <- array(0, dim = c(original_size * STRIDE, original_size * STRIDE, dim(WEIGHT)[3], dim(X)[4]))
for (m in 1:dim(X)[4]) {
for (l in 1:dim(X)[3]) {
for (k in 1:dim(WEIGHT)[3]) {
for (j in 1:dim(X)[2]) {
for (i in 1:dim(X)[1]) {
row_seq <- ((i-1) * STRIDE + 1):((i-1) * STRIDE + STRIDE)
col_seq <- ((j-1) * STRIDE + 1):((j-1) * STRIDE + STRIDE)
out[row_seq,col_seq,k,m] <- out[row_seq,col_seq,k,m] + X[i,j,l,m] * WEIGHT[,,k,l]
}
}
}
}
}
return(out)
}
deconv3_out <- DECONV_func(X = Input, WEIGHT = as.array(decoder_model$arg.params$deconv3_weight), STRIDE = 2)
relu3_out <- deconv3_out
relu3_out[relu3_out < 0] <- 0
deconv4_out <- DECONV_func(X = relu3_out, WEIGHT = as.array(decoder_model$arg.params$deconv4_weight), STRIDE = 2)
relu4_out <- deconv4_out
relu4_out[relu4_out < 0] <- 0
My_Output <- DECONV_func(X = relu4_out, WEIGHT = as.array(decoder_model$arg.params$decoder_weight), STRIDE = 7)
library(imager)
par(mar=rep(0,4), mfcol = c(1, 2))
plot(NA, xlim = c(0.04, 0.96), ylim = c(0.04, 0.96), xaxt = "n", yaxt = "n", bty = "n")
Output[Output > 1] <- 1
Output[Output < 0] <- 0
rasterImage((Output[,,,1]), 0, 0, 1, 1, interpolate = FALSE)
plot(NA, xlim = c(0.04, 0.96), ylim = c(0.04, 0.96), xaxt = "n", yaxt = "n", bty = "n")
My_Output[My_Output > 1] <- 1
My_Output[My_Output < 0] <- 0
rasterImage((My_Output[,,,1]), 0, 0, 1, 1, interpolate = FALSE)
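As an extra check that is not part of the original lesson, we can also quantify how lossy the compression is by averaging the squared reconstruction error over the whole test set (using unzip_pred from the convolutional autoencoder above):
# mean per-pixel squared reconstruction error on the test images
mean((Test.X - unzip_pred)^2)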
– We have also seen that an Encoder trained this way really does compress / reduce the dimensionality of the data, and that the reduced representation can be decoded back into the original image. So although we cannot read any meaning into the Encoder's output ourselves, it must carry some meaning.
– The figure below shows the relative positions of the vectors obtained by passing different digits through the Encoder; we find that the different digits form clusters (to project the data from 32 dimensions down to 2 we use PCA here):
zip_code <- predict(encoder_model, Test.X)
dim(zip_code) <- dim(zip_code)[3:4]
zip_code <- t(zip_code)
PCA_result <- princomp(zip_code, cor = TRUE)
plot(PCA_result$scores[,1], PCA_result$scores[,2],
xlab = 'Comp.1', ylab = 'Comp.2',
pch = 19, cex = 0.5, col = rainbow(10)[Test.Y + 1])
legend('topright', legend = 0:9, pch = 19, col = rainbow(10))
sub_Train.DAT <- data.table(cbind(Train.Y, Train.X))[1:500,]
fwrite(x = sub_Train.DAT,
file = 'data/sub_train_data.csv',
col.names = FALSE, row.names = FALSE)
my_iterator_func2 <- setRefClass("Custom_Iter2",
fields = c("iter", "data.csv", "data.shape", "batch.size"),
contains = "Rcpp_MXArrayDataIter",
methods = list(
initialize = function(iter, data.csv, data.shape, batch.size){
csv_iter <- mx.io.CSVIter(data.csv = data.csv, data.shape = data.shape, batch.size = batch.size)
.self$iter <- csv_iter
.self
},
value = function(){
val <- as.array(.self$iter$value()$data)
val.x <- val[-1,]
dim(val.x) <- c(28, 28, 1, ncol(val.x))
val.x <- val.x/255
val.x <- mx.nd.array(val.x)
val.y <- t(model.matrix(~ -1 + factor(val[1,], levels = 0:9)))
val.y <- array(val.y, dim = c(10, dim(val.x)[4]))
val.y <- mx.nd.array(val.y)
list(data=val.x, label=val.y)
},
iter.next = function(){
.self$iter$iter.next()
},
reset = function(){
.self$iter$reset()
},
finalize=function(){
}
)
)
my_iter2 = my_iterator_func2(iter = NULL, data.csv = 'data/sub_train_data.csv', data.shape = 785, batch.size = 20)
data <- mx.symbol.Variable('data')
conv1 <- mx.symbol.Convolution(data = data, kernel = c(7, 7), stride = c(7, 7), num_filter = 8, name = 'conv1')
relu1 <- mx.symbol.Activation(data = conv1, act_type = "relu", name = 'relu1')
conv2 <- mx.symbol.Convolution(data = relu1, kernel = c(2, 2), stride = c(2, 2), num_filter = 16, name = 'conv2')
relu2 <- mx.symbol.Activation(data = conv2, act_type = "relu", name = 'relu2')
conv3 <- mx.symbol.Convolution(data = relu2, kernel = c(2, 2), stride = c(2, 2), num_filter = 32, name = 'conv3')
fc1 <- mx.symbol.FullyConnected(data = conv3, num.hidden = 10, name = 'fc1')
softmax <- mx.symbol.softmax(data = fc1, axis = 1, name = 'softmax')
label <- mx.symbol.Variable(name = 'label')
eps <- 1e-8
m_log <- 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax + eps), label))
m_logloss <- mx.symbol.MakeLoss(m_log, name = 'm_logloss')
my_optimizer <- mx.opt.create(name = "adam", learning.rate = 0.001, beta1 = 0.9, beta2 = 0.999, wd = 1e-4)
my.eval.metric.loss <- mx.metric.custom(
name = "mlog-loss",
function(real, pred) {
return(as.array(pred))
}
)
mx.set.seed(0)
model.1 <- mx.model.FeedForward.create(symbol = m_logloss, X = my_iter2, optimizer = my_optimizer,
eval.metric = my.eval.metric.loss,
array.batch.size = 20, ctx = mx.gpu(), num.round = 100)
model.1$symbol <- softmax
predict_Y <- predict(model.1, Test.X)
confusion_table <- table(max.col(t(predict_Y)), Test.Y)
cat("Testing accuracy rate =", sum(diag(confusion_table))/sum(confusion_table))
## Testing accuracy rate = 0.7714881
confusion_table
## Test.Y
## 0 1 2 3 4 5 6 7 8 9
## 1 1439 4 23 46 3 76 62 22 21 7
## 2 1 1666 17 6 7 14 3 20 29 4
## 3 12 18 1255 154 21 33 109 43 51 2
## 4 27 3 63 1315 2 120 2 100 44 26
## 5 1 1 27 0 1050 25 83 7 12 99
## 6 141 2 15 113 62 937 35 10 107 45
## 7 34 0 114 3 47 33 1315 1 8 3
## 8 0 59 9 38 17 21 1 1409 6 123
## 9 3 67 106 46 79 232 40 14 1306 64
## 10 5 31 27 21 318 60 11 127 91 1269
mx.set.seed(0)
new_arg <- mxnet:::mx.model.init.params(symbol = m_logloss,
input.shape = list(data = c(28, 28, 1, 7), label = c(10, 7)),
output.shape = NULL,
initializer = mxnet:::mx.init.uniform(0.01),
ctx = mx.gpu())
for (k in 1:6) {
new_arg$arg.params[[k]] <- encoder_model$arg.params[[k]]
}
model.2 <- mx.model.FeedForward.create(symbol = m_logloss, X = my_iter2, optimizer = my_optimizer,
eval.metric = my.eval.metric.loss,
arg.params = new_arg$arg.params,
array.batch.size = 20, ctx = mx.gpu(), num.round = 100)
model.2$symbol <- softmax
predict_Y <- predict(model.2, Test.X)
confusion_table <- table(max.col(t(predict_Y)), Test.Y)
cat("Testing accuracy rate =", sum(diag(confusion_table))/sum(confusion_table))
## Testing accuracy rate = 0.8385119
confusion_table
## Test.Y
## 0 1 2 3 4 5 6 7 8 9
## 1 1481 0 15 15 8 36 61 4 25 13
## 2 0 1758 21 2 14 12 10 15 21 10
## 3 1 9 1347 99 10 9 27 39 28 0
## 4 10 18 27 1400 12 70 5 35 75 54
## 5 3 2 45 13 1291 36 40 20 5 121
## 6 54 10 30 109 10 1198 51 2 107 10
## 7 52 5 31 23 15 40 1396 0 5 1
## 8 6 8 52 22 9 15 6 1561 31 87
## 9 49 38 76 44 57 119 39 7 1338 29
## 10 7 3 12 15 180 16 26 70 40 1317
– But an ordinary convolutional network is usually much larger, so its encoder no longer has any compressing effect on the data. Does extending the autoencoder idea to an ordinary convolutional network still bring the same advantage?
– Here we again experiment with the small sample, building a new network with a larger number of convolutional filters and training it:
data <- mx.symbol.Variable('data')
# first conv
conv1 <- mx.symbol.Convolution(data = data, kernel = c(5, 5), num_filter = 16, name = 'conv1')
relu1 <- mx.symbol.Activation(data = conv1, act_type = "relu", name = 'relu1')
pool1 <- mx.symbol.Pooling(data = relu1, pool_type = "max", kernel = c(2, 2), stride = c(2, 2), name = 'pool1')
# second conv
conv2 <- mx.symbol.Convolution(data = pool1, kernel = c(5, 5), num_filter = 32, name = 'conv2')
relu2 <- mx.symbol.Activation(data = conv2, act_type = "relu", name = 'relu2')
pool2 <- mx.symbol.Pooling(data = relu2, pool_type = "max", kernel = c(2, 2), stride = c(2, 2), name = 'pool2')
# third conv
conv3 <- mx.symbol.Convolution(data = pool2, kernel = c(4, 4), num_filter = 128, name = 'conv3')
relu3 <- mx.symbol.Activation(data = conv3, act_type = "relu", name = 'relu3')
# Softmax
fc1 <- mx.symbol.FullyConnected(data = relu3, num.hidden = 10, name = 'fc1')
softmax <- mx.symbol.softmax(data = fc1, axis = 1, name = 'softmax')
label <- mx.symbol.Variable(name = 'label')
eps <- 1e-8
m_log <- 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax + eps), label))
m_logloss <- mx.symbol.MakeLoss(m_log, name = 'm_logloss')
model.3 <- mx.model.FeedForward.create(symbol = m_logloss, X = my_iter2, optimizer = my_optimizer,
eval.metric = my.eval.metric.loss,
array.batch.size = 20, ctx = mx.gpu(), num.round = 100)
model.3$symbol <- softmax
predict_Y <- predict(model.3, Test.X)
confusion_table <- table(max.col(t(predict_Y)), Test.Y)
cat("Testing accuracy rate =", sum(diag(confusion_table))/sum(confusion_table))
## Testing accuracy rate = 0.8936905
confusion_table
## Test.Y
## 0 1 2 3 4 5 6 7 8 9
## 1 1477 0 12 8 9 24 36 13 2 13
## 2 0 1789 6 1 5 4 6 3 5 3
## 3 33 8 1432 44 0 2 5 48 19 1
## 4 0 11 35 1534 3 30 0 29 25 19
## 5 1 3 17 1 1352 17 19 2 1 47
## 6 27 1 15 75 4 1412 26 24 78 50
## 7 81 2 29 0 19 21 1514 0 9 0
## 8 8 12 45 19 12 2 0 1576 18 48
## 9 11 23 49 47 12 34 34 3 1476 9
## 10 25 2 16 13 190 5 21 55 42 1452
– Now, first train an autoencoder from scratch, then use the parameters of its encoder part for transfer feature learning, and see how well it works!
# Encoder
data <- mx.symbol.Variable('data')
conv1 <- mx.symbol.Convolution(data = data, kernel = c(5, 5), num_filter = 16, name = 'conv1')
relu1 <- mx.symbol.Activation(data = conv1, act_type = "relu", name = 'relu1')
pool1 <- mx.symbol.Pooling(data = relu1, pool_type = "max", kernel = c(2, 2), stride = c(2, 2), name = 'pool1')
conv2 <- mx.symbol.Convolution(data = pool1, kernel = c(5, 5), num_filter = 32, name = 'conv2')
relu2 <- mx.symbol.Activation(data = conv2, act_type = "relu", name = 'relu2')
pool2 <- mx.symbol.Pooling(data = relu2, pool_type = "max", kernel = c(2, 2), stride = c(2, 2), name = 'pool2')
conv3 <- mx.symbol.Convolution(data = pool2, kernel = c(4, 4), num_filter = 128, name = 'conv3')
encoder <- mx.symbol.Activation(data = conv3, act_type = "relu", name = 'encoder')
# Decoder
deconv4 <- mx.symbol.Deconvolution(data = encoder, kernel = c(2, 2), stride = c(2, 2), num_filter = 32, name = 'deconv4')
relu4 <- mx.symbol.Activation(data = deconv4, act_type = "relu", name = 'relu4')
deconv5 <- mx.symbol.Deconvolution(data = relu4, kernel = c(2, 2), stride = c(2, 2), num_filter = 16, name = 'deconv5')
relu5 <- mx.symbol.Activation(data = deconv5, act_type = "relu", name = 'relu5')
decoder <- mx.symbol.Deconvolution(data = relu5, kernel = c(7, 7), stride = c(7, 7), num_filter = 1, name = 'decoder')
# MSE loss
label <- mx.symbol.Variable(name = 'label')
residual <- mx.symbol.broadcast_minus(lhs = label, rhs = decoder)
square_residual <- mx.symbol.square(data = residual)
mean_square_residual <- mx.symbol.mean(data = square_residual, axis = 0:3, keepdims = FALSE)
mse_loss <- mx.symbol.MakeLoss(data = mean_square_residual, name = 'mse')
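The call that trains this new autoencoder is not shown; presumably it follows the same recipe as before, pre-training on the full (unlabeled) training iterator my_iter1. The number of rounds here is an assumption:
mx.set.seed(0)
model <- mx.model.FeedForward.create(symbol = mse_loss, X = my_iter1, optimizer = my_optimizer,
                                     eval.metric = my.eval.metric.loss,
                                     array.batch.size = 20, ctx = mx.gpu(), num.round = 20)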
data <- mx.symbol.Variable('data')
# first conv
conv1 <- mx.symbol.Convolution(data = data, kernel = c(5, 5), num_filter = 16, name = 'conv1')
relu1 <- mx.symbol.Activation(data = conv1, act_type = "relu", name = 'relu1')
pool1 <- mx.symbol.Pooling(data = relu1, pool_type = "max", kernel = c(2, 2), stride = c(2, 2), name = 'pool1')
# second conv
conv2 <- mx.symbol.Convolution(data = pool1, kernel = c(5, 5), num_filter = 32, name = 'conv2')
relu2 <- mx.symbol.Activation(data = conv2, act_type = "relu", name = 'relu2')
pool2 <- mx.symbol.Pooling(data = relu2, pool_type = "max", kernel = c(2, 2), stride = c(2, 2), name = 'pool2')
# third conv
conv3 <- mx.symbol.Convolution(data = pool2, kernel = c(4, 4), num_filter = 128, name = 'conv3')
relu3 <- mx.symbol.Activation(data = conv3, act_type = "relu", name = 'relu3')
# Softmax
fc1 <- mx.symbol.FullyConnected(data = relu3, num.hidden = 10, name = 'fc1')
softmax <- mx.symbol.softmax(data = fc1, axis = 1, name = 'softmax')
label <- mx.symbol.Variable(name = 'label')
eps <- 1e-8
m_log <- 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax + eps), label))
m_logloss <- mx.symbol.MakeLoss(m_log, name = 'm_logloss')
mx.set.seed(0)
new_arg <- mxnet:::mx.model.init.params(symbol = m_logloss,
input.shape = list(data = c(28, 28, 1, 7), label = c(10, 7)),
output.shape = NULL,
initializer = mxnet:::mx.init.uniform(0.01),
ctx = mx.gpu())
for (k in 1:6) {
new_arg$arg.params[[k]] <- model$arg.params[[k]]
}
model.4 <- mx.model.FeedForward.create(symbol = m_logloss, X = my_iter2, optimizer = my_optimizer,
eval.metric = my.eval.metric.loss,
arg.params = new_arg$arg.params,
array.batch.size = 20, ctx = mx.gpu(), num.round = 100)
model.4$symbol <- softmax
predict_Y <- predict(model.4, Test.X)
confusion_table <- table(max.col(t(predict_Y)), Test.Y)
cat("Testing accuracy rate =", sum(diag(confusion_table))/sum(confusion_table))
## Testing accuracy rate = 0.9230357
confusion_table
## Test.Y
## 0 1 2 3 4 5 6 7 8 9
## 1 1573 0 14 14 2 17 12 0 9 11
## 2 0 1770 4 0 5 0 3 5 5 1
## 3 2 11 1495 18 10 1 1 26 16 1
## 4 2 4 29 1598 0 37 0 27 37 30
## 5 0 2 12 0 1418 4 8 10 0 39
## 6 19 0 5 44 0 1443 51 1 45 4
## 7 21 9 32 9 13 12 1566 0 6 0
## 8 35 13 33 20 6 11 0 1642 17 43
## 9 7 40 26 27 3 15 18 4 1502 13
## 10 4 2 6 12 149 11 2 38 38 1500
By training autoencoders in this way, the network can in theory be made arbitrarily deep (although each compression step usually loses some information, so an overly deep network can no longer reconstruct the input exactly).
Hinton proposed this usage back in 2006: first train the layers one at a time, then stack them together, and finally fine-tune the parameters:
– Of course, as the field has moved on we now have plenty of tools for dealing with the vanishing-gradient problem, and this whole procedure is very laborious, so almost nobody trains networks this way anymore. Nevertheless, transfer feature learning assisted by an autoencoder remains an important application: it "may" improve the accuracy of the final model!
– There are in fact many other kinds of autoencoders, such as the "denoising autoencoder" (add some noise to the input image while keeping the target output clean) and the "sparse autoencoder" (constrain the Encoder outputs so that most of them are close to 0; this is easy to implement by adding a penalty on the Encoder output to the loss function). In your experiments you can try the different autoencoders for transfer feature learning to tackle the weight-initialization problem; a sketch of the denoising variant is given after this list.
– Removing mosaics (de-pixelating an image) is also an autoencoder-style model; can you picture how to do it now?
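As mentioned in the bullet above, a denoising autoencoder only changes how the training pairs are built. Below is a minimal sketch (my own illustration, not code from the lesson) that modifies the custom iterator so that the network input is a corrupted image while the label remains the clean one, mirroring the noise test we ran earlier:
my_iterator_func_dn <- setRefClass("Custom_Iter_DN",
                                   fields = c("iter", "data.csv", "data.shape", "batch.size"),
                                   contains = "Rcpp_MXArrayDataIter",
                                   methods = list(
                                     initialize = function(iter, data.csv, data.shape, batch.size){
                                       csv_iter <- mx.io.CSVIter(data.csv = data.csv, data.shape = data.shape, batch.size = batch.size)
                                       .self$iter <- csv_iter
                                       .self
                                     },
                                     value = function(){
                                       val <- as.array(.self$iter$value()$data)
                                       val.x <- val[-1,]
                                       dim(val.x) <- c(28, 28, 1, ncol(val.x))
                                       val.x <- val.x/255
                                       # the clean image is the reconstruction target
                                       val.y <- mx.nd.array(val.x)
                                       # the input is the same image corrupted with Gaussian noise
                                       noisy.x <- val.x + rnorm(length(val.x), sd = 0.3)
                                       noisy.x[noisy.x > 1] <- 1
                                       noisy.x[noisy.x < 0] <- 0
                                       list(data = mx.nd.array(noisy.x), label = val.y)
                                     },
                                     iter.next = function(){
                                       .self$iter$iter.next()
                                     },
                                     reset = function(){
                                       .self$iter$reset()
                                     },
                                     finalize = function(){
                                     }
                                   )
)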
In addition, through learning about autoencoders we have picked up a new network component: the deconvolution layer. From the next lesson onwards we will work on image segmentation and image recognition, which make heavy use of deconvolution layers, so please make sure you are comfortable with how to use them.
Finally, we leave you with a question: do you think the loss function we have defined for the autoencoder so far is appropriate? If not, can you come up with something better?