Deep Learning: Theory and Practice

林嶔 (Lin, Chin)

Lesson 15: An Introduction to Recurrent Neural Networks and Applications of the Attention Mechanism

Section 1: Long Short-Term Memory Units (1)

– Our earlier analyses used convolutional neural networks, in which the convolution kernel runs independently at each scanning position. At the end of the previous lesson we noted the need for a "convolution-like kernel" equipped with a "memory structure", and that is where today's lesson begins.

F01

\[ \begin{align} q_{t-1} & = h_{t-2} || x_{t-1} \\ p_{t-1} & = L(q_{t-1}, W) \\ h_{t-1} & = \tanh(p_{t-1}) \\ q_{t} & = h_{t-1} || x_{t} \\ p_{t} & = L(q_{t}, W) \\ h_{t} & = \tanh(p_{t}) \\ & \dots \end{align} \]
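– A minimal base-R sketch of this recurrence (the sizes, weights, and the toy 5-step sequence below are hypothetical, purely to make the update rule concrete):

set.seed(0)

n.input <- 2; n.hidden <- 3                                           # hypothetical sizes
W <- matrix(rnorm(n.hidden * (n.hidden + n.input)), nrow = n.hidden)  # the shared weight W
b <- rnorm(n.hidden)                                                  # the shared bias

x <- matrix(rnorm(n.input * 5), nrow = n.input)                       # a toy sequence of 5 inputs
h <- rep(0, n.hidden)                                                 # initial hidden state

for (t in 1:5) {
  q <- c(h, x[, t])                                                   # q_t = h_{t-1} || x_t
  p <- W %*% q + b                                                    # p_t = L(q_t, W)
  h <- tanh(p)                                                        # h_t = tanh(p_t)
}

– The key point is that the same W is reused at every time step, so this behaves like a convolution kernel that additionally carries a hidden state from one position to the next.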

Section 1: Long Short-Term Memory Units (2)

– This structure therefore suffers from long-term information loss. Suppose \(h_1\) consists of 100% \(x_1\), while \(h_2\) consists of 50% \(x_2\) and 50% \(h_1\), and the same mixing ratio is kept at every subsequent step; then the further we move along the sequence, the smaller the overall share of \(x_1\) becomes.
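– A quick back-of-the-envelope check of this dilution (assuming the fixed 50/50 mixing described above): the share of \(x_1\) inside \(h_t\) is \(0.5^{t-1}\), so by \(h_{10}\) less than 0.2% of it remains.

0.5 ^ (0:9)   # share of x_1 in h_1 ... h_10: 1, 0.5, 0.25, ..., 0.5^9 ≈ 0.00195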

F02

Section 1: Long Short-Term Memory Units (3)

F03

\[ \begin{align} q_{t} & = h_{t-1} || x_{t} \end{align} \]

\[ \begin{align} l_t^1& = L(q_{t}, W^1) \end{align} \]

\[ \begin{align} a_t^1 & = \sigma(l_t^1) \\ \tilde{c_t} & = c_{t-1} \otimes a_t^1 \end{align} \]

\[ \begin{align} l_t^2& = L(q_{t}, W^2) \\ a_t^2 & = \sigma(l_t^2) \\ l_t^3& = L(q_{t}, W^3) \\ a_t^3 & = \tanh(l_t^3) \\ s & = a_t^2 \otimes a_t^3 \\ c_{t} & = \tilde{c_t} + s \end{align} \]

\[ \begin{align} l_t^4& = L(q_{t}, W^4) \\ a_t^4 & = \sigma(l_t^4) \\ o & = \tanh (c_{t}) \\ h_{t} & = o \otimes a_t^4 \end{align} \]
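– Before turning to the implementation, here is a base-R sketch that runs these equations once for a single time step (all sizes and random weights are hypothetical; \(\sigma\) is the logistic sigmoid, and the bias of each \(L(\cdot, W^k)\) is folded into \(W^k\)):

sigmoid <- function(z) {1 / (1 + exp(-z))}

set.seed(1)
n.input <- 3; n.hidden <- 4                                      # hypothetical sizes
L <- function(q, W) {as.vector(W %*% c(q, 1))}                   # affine map with the bias folded into W
rand.W <- function() {matrix(rnorm(n.hidden * (n.hidden + n.input + 1)), nrow = n.hidden)}

W1 <- rand.W(); W2 <- rand.W(); W3 <- rand.W(); W4 <- rand.W()   # forget / input / candidate / output weights

h.prev <- rep(0, n.hidden); c.prev <- rep(0, n.hidden)           # previous short-term and long-term states
x.t <- rnorm(n.input)                                            # current input

q.t <- c(h.prev, x.t)                                            # q_t = h_{t-1} || x_t
a1 <- sigmoid(L(q.t, W1)); c.tilde <- c.prev * a1                # forget gate erases part of c_{t-1}
a2 <- sigmoid(L(q.t, W2)); a3 <- tanh(L(q.t, W3))                # input gate and candidate content
c.t <- c.tilde + a2 * a3                                         # new cell state (long-term memory)
a4 <- sigmoid(L(q.t, W4))                                        # output gate
h.t <- tanh(c.t) * a4                                            # new hidden state (short-term memory)

– Note that \(c_t\) is only ever rescaled and added to, never pushed through an extra weight matrix, which is what helps information survive over long distances.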

Section 1: Long Short-Term Memory Units (4)

library(mxnet)

lstm.cell <- function(num.hidden, hiddendata, prev.state = NULL, param, seqidx, layeridx, first = FALSE, dropout = 0) {
  
  if (dropout > 0) {hiddendata <- mx.symbol.Dropout(data = hiddendata, p = dropout)}
  
  i2h <- mx.symbol.FullyConnected(data = hiddendata, weight = param$i2h.weight, bias = param$i2h.bias, num.hidden = num.hidden * 4, name = paste0("t", seqidx, ".l", layeridx, ".i2h"))
  
  if (first) {
    gates <- i2h
  } else {
    h2h <- mx.symbol.FullyConnected(data = prev.state$h, weight = param$h2h.weight, bias = param$h2h.bias, num.hidden = num.hidden * 4, name = paste0("t", seqidx, ".l", layeridx, ".h2h"))
    gates <- i2h + h2h
  }
  
  slice.gates <- mx.symbol.SliceChannel(gates, num.outputs = 4, name = paste0("t", seqidx, ".l", layeridx, ".slice"))
  
  in.gate <- mx.symbol.Activation(slice.gates[[1]], act.type = "sigmoid", name = paste0("t", seqidx, ".l", layeridx, ".in.gate"))
  in.transform <- mx.symbol.Activation(slice.gates[[2]], act.type = "tanh", name = paste0("t", seqidx, ".l", layeridx, ".in.transform"))
  forget.gate <- mx.symbol.Activation(slice.gates[[3]], act.type = "sigmoid", name = paste0("t", seqidx, ".l", layeridx, ".forget.gate"))
  out.gate <- mx.symbol.Activation(slice.gates[[4]], act.type = "sigmoid", name = paste0("t", seqidx, ".l", layeridx, ".out.gate"))
  
  if (first) {
    next.c <- (in.gate * in.transform)
  } else {
    next.c <- (forget.gate * prev.state$c) + (in.gate * in.transform)
  }
  
  next.h <- out.gate * mx.symbol.Activation(next.c, act.type="tanh")
  
  return (list(c=next.c, h=next.h))
  
}

Section 1: Long Short-Term Memory Units (5)

– In addition, we also give it the option of a forward-only or a bidirectional recurrent structure:

F04

lstm.layer <- function (indata, num.hidden, seq.len, layeridx = 1, dropout = 0, inverse = TRUE) {
  
  wordvec <- mx.symbol.SliceChannel(data = indata, num_outputs = seq.len, axis = 2)
  
  Cis.param.cells <- list(i2h.weight = mx.symbol.Variable(paste0("Cis.l", layeridx, ".i2h.weight")),
                          i2h.bias = mx.symbol.Variable(paste0("Cis.l", layeridx, ".i2h.bias")),
                          h2h.weight = mx.symbol.Variable(paste0("Cis.l", layeridx, ".h2h.weight")),
                          h2h.bias = mx.symbol.Variable(paste0("Cis.l", layeridx, ".h2h.bias")))
  
  Trans.param.cells <- list(i2h.weight = mx.symbol.Variable(paste0("Trans.l", layeridx, ".i2h.weight")),
                            i2h.bias = mx.symbol.Variable(paste0("Trans.l", layeridx, ".i2h.bias")),
                            h2h.weight = mx.symbol.Variable(paste0("Trans.l", layeridx, ".h2h.weight")),
                            h2h.bias = mx.symbol.Variable(paste0("Trans.l", layeridx, ".h2h.bias")))
  
  last.hidden_1 <- list()
  
  #Cis
  
  for (seqidx in 1:seq.len) {
    hidden <- wordvec[[seqidx]]
    if (seqidx == 1) {
      next.state <- lstm.cell(num.hidden = num.hidden, hiddendata = hidden, first = TRUE, param = Cis.param.cells, seqidx = seqidx, layeridx = layeridx, dropout = dropout)
    } else {
      next.state <- lstm.cell(num.hidden = num.hidden, hiddendata = hidden, prev.state = last.states, param = Cis.param.cells, seqidx = seqidx, layeridx = layeridx, dropout = dropout)
    }
    hidden <- next.state$h
    last.states <- next.state
    if (dropout > 0) {hidden <- mx.symbol.Dropout(data = hidden, p = dropout)}
    last.hidden_1 <- c(last.hidden_1, mx.symbol.reshape(data = hidden, shape = c(1, 1, num.hidden, -1)))
  }
  concat_full_1 <- mx.symbol.concat(last.hidden_1, num.args = seq.len, dim = 2, name = paste0("Cis.l", layeridx, ".concat"))

  if (inverse) {
    
    last.hidden_2 <- list()
    
    for (seqidx in (seq.len + 1):(2 * seq.len)) {
      hidden <- wordvec[[seq.len*2 - seqidx + 1]]
      if (seqidx == (seq.len+1)) {
        next.state <- lstm.cell(num.hidden = num.hidden, hiddendata = hidden, first = TRUE, param = Trans.param.cells, seqidx = seqidx, layeridx = layeridx, dropout = dropout)
      } else {
        next.state <- lstm.cell(num.hidden = num.hidden, hiddendata = hidden, prev.state = last.states, param = Trans.param.cells, seqidx = seqidx, layeridx = layeridx, dropout = dropout)
      }
      hidden <- next.state$h
      last.states <- next.state
      if (dropout > 0) {hidden <- mx.symbol.Dropout(data = hidden, p = dropout)}
      last.hidden_2 <- c(last.hidden_2, mx.symbol.reshape(data = hidden, shape = c(1, 1, num.hidden, -1)))
    }
    concat_full_2 <- mx.symbol.concat(last.hidden_2, num.args = seq.len, dim = 2, name = paste0("Trans.l", layeridx, ".concat"))
    concat_full_3 <- list(concat_full_1, concat_full_2)
    concat_full <- mx.symbol.concat(concat_full_3, num.args = 2, dim = 1, name = paste0("l", layeridx, ".concat"))
  } else {
    concat_full_3 <- list(concat_full_1)
    concat_full <- mx.symbol.concat(concat_full_3, num.args = 1, dim = 1, name = paste0("l", layeridx, ".concat"))
  }
  
  return(concat_full)
  
}
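– In the backward ("Trans") loop above, the index seq.len*2 - seqidx + 1 simply walks the same sequence from back to front. A quick check with a hypothetical toy length of 5:

toy.len <- 5
seqidx <- (toy.len + 1):(2 * toy.len)
toy.len * 2 - seqidx + 1           # 5 4 3 2 1: the words are visited in reverse order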

Section 1: Long Short-Term Memory Units (6)

– To avoid excessively long computation times, we experiment only with the discharge notes whose total word count is no more than 200:

library(mxnet)
library(magrittr)
library(dplyr)
library(plyr)
library(data.table)

load('data/Discharge_Notes.RData')

word.data <- fread('data/wiki.txt', header = FALSE, showProgress = FALSE)

words.ref <- word.data %>% select(V1) %>% setDF %>% .[,1] %>% as.character
words.ref <- c("", words.ref)
words.index <- 1:length(words.ref)
names(words.index) <- words.ref

words.matrix <- word.data %>% select(-V1) %>% setDF %>% as.matrix
words.matrix <- rbind(rep(0, 50), words.matrix)

max.length <- 200

seq_array <- array(0, dim = c(max.length, length(Processed_Notes)))
short_pos <- which(sapply(Processed_Notes, length) <= max.length)

pb <- txtProgressBar(max = length(short_pos), style = 3)

for (i in 1:length(short_pos)) {
  
  words_pos <- words.index[Processed_Notes[[short_pos[i]]]]
  words_pos[is.na(words_pos)] <- 1
  seq_array[1:length(words_pos),i] <- words_pos - 1
  setTxtProgressBar(pb, i)
  
}

close(pb)

Cancer_code <- Cancer_code[short_pos]

Train.seq_array <- seq_array[,1:3000]
Train.Y <- Cancer_code[1:3000]

Test.seq_array <- seq_array[,3001:length(short_pos)]
dim(Test.seq_array) <- c(max.length, 1, dim(Test.seq_array)[2])
Test.Y <- Cancer_code[3001:length(Cancer_code)]

my_iterator_core <- function (batch_size) {
  
  batch = 0
  batch_per_epoch = length(Train.Y)/batch_size
  
  reset = function() {batch <<- 0}
  
  iter.next = function() {
    batch <<- batch+1
    if (batch > batch_per_epoch) {return(FALSE)} else {return(TRUE)}
  }
  
  value = function() {
    idx = 1:batch_size + (batch - 1) * batch_size
    idx[idx > length(Train.Y)] = sample(1:length(Train.Y), sum(idx > length(Train.Y)))
    data = mx.nd.array(array(Train.seq_array[,idx], dim = c(dim(Train.seq_array)[1], 1, batch_size)))
    label = mx.nd.array(array(Train.Y[idx], dim = c(1, batch_size)))
    return(list(data = data, label = label))
  }
  
  return(list(reset = reset, iter.next = iter.next, value = value, batch_size = batch_size, batch = batch))
}

my_iterator_func <- setRefClass("Custom_Iter",
                                fields = c("iter", "batch_size"),
                                contains = "Rcpp_MXArrayDataIter",
                                methods = list(
                                  initialize = function(iter, batch_size = 100){
                                    .self$iter <- my_iterator_core(batch_size = batch_size)
                                    .self
                                  },
                                  value = function(){
                                    .self$iter$value()
                                  },
                                  iter.next = function(){
                                    .self$iter$iter.next()
                                  },
                                  reset = function(){
                                    .self$iter$reset()
                                  },
                                  finalize=function(){
                                  }
                                )
)

my_iter <- my_iterator_func(iter = NULL, batch_size = 20)
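– An optional sanity check on the iterator (we reset it again afterwards so that training starts from batch 0; the expected shapes follow from the value() function above):

my_iter$reset()
my_iter$iter.next()
batch <- my_iter$value()
dim(as.array(batch$data))          # expected: 200 x 1 x 20 (sequence length, channel, batch size)
dim(as.array(batch$label))         # expected: 1 x 20
my_iter$reset()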

Section 1: Long Short-Term Memory Units (7)

data <- mx.symbol.Variable(name = 'data')
label <- mx.symbol.Variable(name = 'label')

embed1 <- mx.symbol.Embedding(data = data, input.dim = length(words.ref), output.dim = 50, name = 'embed1')
lstm1 <- lstm.layer(indata = embed1, num.hidden = 50, seq.len = max.length, layeridx = 1, dropout = 0, inverse = TRUE)
pool1 <- mx.symbol.max(data = lstm1, axis = 2:3, keepdims = FALSE, name = 'pool1')

fc1 <- mx.symbol.FullyConnected(data = pool1, num.hidden = 1, name = 'fc1')
logistic_pred <- mx.symbol.sigmoid(data = fc1, name = 'logistic_pred')

eps <- 1e-8
ce_loss_pos <-  mx.symbol.broadcast_mul(mx.symbol.log(logistic_pred + eps), label)
ce_loss_neg <-  mx.symbol.broadcast_mul(mx.symbol.log(1 - logistic_pred + eps), 1 - label)
ce_loss_mean <- 0 - mx.symbol.mean(ce_loss_pos + ce_loss_neg)
ce_loss <- mx.symbol.MakeLoss(ce_loss_mean, name = 'ce_loss')
my_optimizer <- mx.opt.create(name = "sgd", learning.rate = 0.05, momentum = 0.9, wd = 1e-4)
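– The loss defined above is simply the binary cross-entropy written out by hand; a quick base-R check on a few hypothetical predictions and labels:

toy.pred <- c(0.9, 0.2, 0.7); toy.label <- c(1, 0, 0)   # hypothetical predictions and labels
0 - mean(toy.label * log(toy.pred + eps) + (1 - toy.label) * log(1 - toy.pred + eps))   # approximately 0.511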

Section 1: Long Short-Term Memory Units (8)

new_arg <- mxnet:::mx.model.init.params(symbol = logistic_pred, 
                                        input.shape = list(data = c(max.length, 1, 20)), 
                                        output.shape = NULL,
                                        initializer = mxnet:::mx.init.Xavier(rnd_type = "uniform", magnitude = 2.24), 
                                        ctx = mx.gpu())

new_arg$arg.params$embed1_weight <- mx.nd.array(t(words.matrix))

my.eval.metric.loss <- mx.metric.custom(
  name = "cross-entropy",
  function(label, pred) {
    return(as.array(pred))
  }
)

my_model <- mx.model.FeedForward.create(symbol = ce_loss, X = my_iter, optimizer = my_optimizer,
                                        array.batch.size = 20, ctx = mx.gpu(), num.round = 20,
                                        eval.metric = my.eval.metric.loss,
                                        arg.params = new_arg$arg.params, fixed.param = 'embed1_weight',
                                        batch.end.callback = mx.callback.log.speedometer(frequency = 50, batch.size = 20))
my_model$symbol <- logistic_pred
my_model$arg.params <- append(my_model$arg.params, new_arg$arg.params['embed1_weight'])
pred_y <- predict(my_model, Test.seq_array)
library(pROC)

roc_result <- roc(Test.Y, pred_y)
plot(roc_result, col = "red", main = paste0('AUC = ', formatC(roc_result$auc, 3, format = 'f')))

Exercise 1: Reproducing the forward computation of the LSTM cell

– If you have not yet finished training the model, you can download lstm_v1-0000.params and lstm_v1-symbol.json from here.

– Next, try to reproduce the LSTM cell's forward computation yourself:

my_model <- mx.model.load('model/lstm_v1', 0)

all_layers <- my_model$symbol$get.internals()
my_model$symbol <- which(all_layers$outputs == 'l1.concat_output') %>% all_layers$get.output()
my_model$arg.params <- my_model$arg.params[!grepl('fc1', names(my_model$arg.params))]

Input <- Test.seq_array[,,5]
dim(Input) <- c(200, 1, 1)
Output <- predict(my_model, Input)

– We will only run the forward direction for the first 5 tokens; try to produce the corresponding outputs:

sub_Input <- Input[1:5,,]
sub_Output <- Output[,1:5,1:50,]

Exercise 1 Answer (1)

Embedding_Output <- as.array(my_model$arg.params$embed1_weight)[,sub_Input + 1]
i = 1

input_vec <- Embedding_Output[,i]
i2h <- input_vec %*% as.array(my_model$arg.params$Cis.l1.i2h.weight) + as.array(my_model$arg.params$Cis.l1.i2h.bias)
gates <- i2h

in.gate <- 1/(1+exp(-gates[1:50]))
in.transform <- tanh(gates[51:100])
forget.gate <- 1/(1+exp(-gates[101:150]))
out.gate <- 1/(1+exp(-gates[151:200]))

next.c <- in.gate * in.transform
next.h <- out.gate * tanh(next.c)
next.h
##  [1]  0.086967833 -0.063449661  0.036880010  0.010068880 -0.149757446
##  [6] -0.045954626 -0.007239696 -0.029001972 -0.036172191  0.014461363
## [11] -0.143614092  0.104743384  0.077178065 -0.018109491 -0.053787901
## [16]  0.033238218  0.073090688  0.219209935 -0.139997959 -0.040038392
## [21] -0.080872811 -0.060081231 -0.020082376 -0.033003086  0.071313954
## [26] -0.264252035  0.035592710 -0.021121420 -0.132410585 -0.033140717
## [31] -0.024339752  0.218807462 -0.020077547 -0.001792587 -0.015403937
## [36]  0.019053839 -0.097208227 -0.121762368  0.021987457  0.049170363
## [41]  0.019188609 -0.017815512  0.076241088 -0.145931257 -0.092379922
## [46]  0.021569875 -0.020209405 -0.020605694  0.270575293  0.009769096
sub_Output[1,]
##  [1]  0.086967848 -0.063449658  0.036880013  0.010068879 -0.149757430
##  [6] -0.045954626 -0.007239695 -0.029001968 -0.036172193  0.014461363
## [11] -0.143614084  0.104743376  0.077178061 -0.018109491 -0.053787898
## [16]  0.033238214  0.073090687  0.219209924 -0.139997944 -0.040038396
## [21] -0.080872811 -0.060081221 -0.020082375 -0.033003084  0.071313955
## [26] -0.264252037  0.035592712 -0.021121416 -0.132410571 -0.033140719
## [31] -0.024339752  0.218807474 -0.020077543 -0.001792587 -0.015403938
## [36]  0.019053839 -0.097208224 -0.121762365  0.021987455  0.049170367
## [41]  0.019188609 -0.017815515  0.076241083 -0.145931259 -0.092379928
## [46]  0.021569878 -0.020209404 -0.020605695  0.270575315  0.009769096
i = 2

input_vec <- Embedding_Output[,i]
i2h <- input_vec %*% as.array(my_model$arg.params$Cis.l1.i2h.weight) + as.array(my_model$arg.params$Cis.l1.i2h.bias)
h2h <- next.h %*% as.array(my_model$arg.params$Cis.l1.h2h.weight) + as.array(my_model$arg.params$Cis.l1.h2h.bias)
gates <- i2h + h2h

in.gate <- 1/(1+exp(-gates[,1:50]))
in.transform <- tanh(gates[,51:100])
forget.gate <- 1/(1+exp(-gates[,101:150]))
out.gate <- 1/(1+exp(-gates[,151:200]))

next.c <- forget.gate * next.c + in.gate * in.transform
next.h <- out.gate * tanh(next.c)
next.h
##  [1]  0.172714273 -0.134861879  0.084825992  0.026005947 -0.292853043
##  [6] -0.143597484 -0.054681028 -0.069802900 -0.134559245  0.001540124
## [11] -0.307148649  0.208848144  0.314672849 -0.058207505 -0.153540190
## [16]  0.075622727  0.163340499  0.578598825 -0.251814185 -0.086394009
## [21] -0.219408604 -0.151962526 -0.038312221 -0.078217000  0.185670722
## [26] -0.527193472  0.082940466 -0.087716673 -0.319968570 -0.094814267
## [31] -0.098546791  0.587162975 -0.057633908 -0.034965075 -0.033302097
## [36]  0.061189049 -0.235737058 -0.228580143  0.068899261  0.153120268
## [41]  0.044878595 -0.065051461  0.171124248 -0.306323643 -0.283827253
## [46]  0.047662889 -0.072233065 -0.035651133  0.658952325  0.038527077
sub_Output[2,]
##  [1]  0.172714263 -0.134861887  0.084825985  0.026005954 -0.292853057
##  [6] -0.143597454 -0.054681029 -0.069802895 -0.134559244  0.001540123
## [11] -0.307148635  0.208848119  0.314672858 -0.058207501 -0.153540149
## [16]  0.075622715  0.163340479  0.578598797 -0.251814187 -0.086393997
## [21] -0.219408602 -0.151962548 -0.038312223 -0.078216985  0.185670748
## [26] -0.527193427  0.082940474 -0.087716669 -0.319968581 -0.094814271
## [31] -0.098546803  0.587162912 -0.057633899 -0.034965076 -0.033302099
## [36]  0.061189052 -0.235737026 -0.228580177  0.068899252  0.153120279
## [41]  0.044878587 -0.065051466  0.171124265 -0.306323618 -0.283827186
## [46]  0.047662888 -0.072233059 -0.035651125  0.658952296  0.038527071

Exercise 1 Answer (2)

next.c <- 0
h_list <- list()

for (i in 1:5) {
  
  input_vec <- Embedding_Output[,i]
  i2h <- input_vec %*% as.array(my_model$arg.params$Cis.l1.i2h.weight) + as.array(my_model$arg.params$Cis.l1.i2h.bias)
  if (i != 1) {
    h2h <- next.h %*% as.array(my_model$arg.params$Cis.l1.h2h.weight) + as.array(my_model$arg.params$Cis.l1.h2h.bias)
    gates <- i2h + h2h
  } else {
    gates <- i2h
  }
  
  in.gate <- 1/(1+exp(-gates[,1:50]))
  in.transform <- tanh(gates[,51:100])
  forget.gate <- 1/(1+exp(-gates[,101:150]))
  out.gate <- 1/(1+exp(-gates[,151:200]))
  
  next.c <- forget.gate * next.c + in.gate * in.transform
  next.h <- out.gate * tanh(next.c)
  h_list[[i]] <- next.h
  
}

my_Output <- do.call('rbind', h_list)

all.equal(my_Output, sub_Output)
## [1] "Mean relative difference: 6.695539e-08"

Section 2: Gated Recurrent Unit (1)

next.c <- forget.gate * next.c + in.gate * in.transform

– The line above is the LSTM's long-term memory (cell state) update. This raises a question: why can't the long-term and the short-term memory be merged into a single state?

F05

Section 2: Gated Recurrent Unit (2)

F06

\[ \begin{align} q_{t} & = h_{t-1} || x_{t} \end{align} \]

\[ \begin{align} l_t^1& = L(q_{t}, W^1) \\ a_t^1 & = \sigma(l_t^1) \\ \tilde{h_t} & = h_{t-1} \otimes a_t^1 \end{align} \]

\[ \begin{align} l_t^2& = L(q_{t}, W^2) \\ a_t^2 & = \sigma(l_t^2) \end{align} \]

\[ \begin{align} \tilde{q_{t}} & = \tilde{h_t} || x_{t} \\ l_t^3& = L(\tilde{q_{t}}, W^3) \\ a_t^3 & = \tanh(l_t^3) \end{align} \]

\[ \begin{align} h_t & = a_t^2 \otimes a_t^3 + (1-a_t^2) \otimes h_{t-1} \end{align} \]
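– As with the LSTM, we can run these equations once in base R for a single time step (hypothetical sizes and random weights; note that the candidate is computed from the reset state \(\tilde{h_t}\), and the update gate \(a_t^2\) blends it with \(h_{t-1}\)):

sigmoid <- function(z) {1 / (1 + exp(-z))}

set.seed(2)
n.input <- 3; n.hidden <- 4                                      # hypothetical sizes
L <- function(q, W) {as.vector(W %*% c(q, 1))}                   # affine map with the bias folded into W
rand.W <- function() {matrix(rnorm(n.hidden * (n.hidden + n.input + 1)), nrow = n.hidden)}

W1 <- rand.W(); W2 <- rand.W(); W3 <- rand.W()                   # reset gate / update gate / candidate weights

h.prev <- rep(0, n.hidden)                                       # previous hidden state
x.t <- rnorm(n.input)                                            # current input

q.t <- c(h.prev, x.t)                                            # q_t = h_{t-1} || x_t
a1 <- sigmoid(L(q.t, W1)); h.tilde <- h.prev * a1                # reset gate erases part of h_{t-1}
a2 <- sigmoid(L(q.t, W2))                                        # update gate
q.tilde <- c(h.tilde, x.t)                                       # reset state concatenated with x_t
a3 <- tanh(L(q.tilde, W3))                                       # candidate hidden state
h.t <- a2 * a3 + (1 - a2) * h.prev                               # one state now carries both memories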

Section 2: Gated Recurrent Unit (3)

library(mxnet)

gru.cell <- function(num.hidden, hiddendata, prev.state = NULL, param, seqidx, layeridx, first = FALSE, dropout = 0) {
  
  if (dropout > 0) {hiddendata <- mx.symbol.Dropout(data = hiddendata, p = dropout)}
  
  i2h <- mx.symbol.FullyConnected(data = hiddendata, weight = param$gates.i2h.weight, bias = param$gates.i2h.bias, num.hidden = num.hidden * 2, name = paste0("t", seqidx, ".l", layeridx, ".gates.i2h"))
  
  if (first) {
    gates <- i2h
  } else {
    h2h <- mx.symbol.FullyConnected(data = prev.state$h, weight = param$gates.h2h.weight, bias = param$gates.h2h.bias, num.hidden = num.hidden * 2, name = paste0("t", seqidx, ".l", layeridx, ".gates.h2h"))
    gates <- i2h + h2h
  }
  
  slice.gates <- mx.symbol.SliceChannel(gates, num.outputs = 2, name = paste0("t", seqidx, ".l", layeridx, ".slice"))
  
  update.gate <- mx.symbol.Activation(slice.gates[[1]], act.type = "sigmoid", name = paste0("t", seqidx, ".l", layeridx, ".update.gate"))
  reset.gate <- mx.symbol.Activation(slice.gates[[2]], act.type = "sigmoid", name = paste0("t", seqidx, ".l", layeridx, ".reset.gate"))
  
  htrans.i2h <- mx.symbol.FullyConnected(data = hiddendata, weight = param$trans.i2h.weight, bias = param$trans.i2h.bias, num.hidden = num.hidden, name = paste0("t", seqidx, ".l", layeridx, ".trans.i2h"))
  
  if (first) {
    h.trans <- htrans.i2h
    h.trans.active <- mx.symbol.Activation(h.trans, act.type="tanh", name = paste0("t", seqidx, ".l", layeridx, ".h.trans.active"))
    next.h <- h.trans.active
  } else {
    h.after.reset <- prev.state$h * reset.gate
    htrans.h2h <- mx.symbol.FullyConnected(data = h.after.reset, weight = param$trans.h2h.weight, bias = param$trans.h2h.bias, num.hidden = num.hidden, name = paste0("t", seqidx, ".l", layeridx, ".trans.h2h"))
    h.trans <- htrans.i2h + htrans.h2h
    h.trans.active <- mx.symbol.Activation(h.trans, act.type="tanh", name = paste0("t", seqidx, ".l", layeridx, ".h.trans.active"))
    next.h <- update.gate * h.trans.active + (1 - update.gate) * prev.state$h
  }
  
  return (list(h=next.h))
  
}

gru.layer <- function (indata, num.hidden, seq.len, layeridx = 1, dropout = 0, inverse = TRUE) {
  
  wordvec <- mx.symbol.SliceChannel(data = indata, num_outputs = seq.len, axis = 2)
  
  Cis.param.cells <- list(gates.i2h.weight = mx.symbol.Variable(paste0("Cis.l", layeridx, ".gates.i2h.weight")),
                          gates.i2h.bias = mx.symbol.Variable(paste0("Cis.l", layeridx, ".gates.i2h.bias")),
                          gates.h2h.weight = mx.symbol.Variable(paste0("Cis.l", layeridx, ".gates.h2h.weight")),
                          gates.h2h.bias = mx.symbol.Variable(paste0("Cis.l", layeridx, ".gates.h2h.bias")),
                          trans.i2h.weight = mx.symbol.Variable(paste0("Cis.l", layeridx, ".trans.i2h.weight")),
                          trans.i2h.bias = mx.symbol.Variable(paste0("Cis.l", layeridx, ".trans.i2h.bias")),
                          trans.h2h.weight = mx.symbol.Variable(paste0("Cis.l", layeridx, ".trans.h2h.weight")),
                          trans.h2h.bias = mx.symbol.Variable(paste0("Cis.l", layeridx, ".trans.h2h.bias")))
  
  Trans.param.cells <- list(gates.i2h.weight = mx.symbol.Variable(paste0("Trans.l", layeridx, ".gates.i2h.weight")),
                            gates.i2h.bias = mx.symbol.Variable(paste0("Trans.l", layeridx, ".gates.i2h.bias")),
                            gates.h2h.weight = mx.symbol.Variable(paste0("Trans.l", layeridx, ".gates.h2h.weight")),
                            gates.h2h.bias = mx.symbol.Variable(paste0("Trans.l", layeridx, ".gates.h2h.bias")),
                            trans.i2h.weight = mx.symbol.Variable(paste0("Trans.l", layeridx, ".trans.i2h.weight")),
                            trans.i2h.bias = mx.symbol.Variable(paste0("Trans.l", layeridx, ".trans.i2h.bias")),
                            trans.h2h.weight = mx.symbol.Variable(paste0("Trans.l", layeridx, ".trans.h2h.weight")),
                            trans.h2h.bias = mx.symbol.Variable(paste0("Trans.l", layeridx, ".trans.h2h.bias")))
  
  last.hidden_1 <- list()
  
  #Cis
  
  for (seqidx in 1:seq.len) {
    hidden <- wordvec[[seqidx]]
    if (seqidx == 1) {
      next.state <- gru.cell(num.hidden = num.hidden, hiddendata = hidden, first = TRUE, param = Cis.param.cells, seqidx = seqidx, layeridx = layeridx, dropout = dropout)
    } else {
      next.state <- gru.cell(num.hidden = num.hidden, hiddendata = hidden, prev.state = last.states, param = Cis.param.cells, seqidx = seqidx, layeridx = layeridx, dropout = dropout)
    }
    hidden <- next.state$h
    last.states <- next.state
    if (dropout > 0) {hidden <- mx.symbol.Dropout(data = hidden, p = dropout)}
    last.hidden_1 <- c(last.hidden_1, mx.symbol.reshape(data = hidden, shape = c(1, 1, num.hidden, -1)))
  }
  concat_full_1 <- mx.symbol.concat(last.hidden_1, num.args = seq.len, dim = 2, name = paste0("Cis.l", layeridx, ".concat"))
  
  if (inverse) {
    
    last.hidden_2 <- list()
    
    for (seqidx in (seq.len + 1):(2 * seq.len)) {
      hidden <- wordvec[[seq.len*2 - seqidx + 1]]
      if (seqidx == (seq.len+1)) {
        next.state <- gru.cell(num.hidden = num.hidden, hiddendata = hidden, first = TRUE, param = Trans.param.cells, seqidx = seqidx, layeridx = layeridx, dropout = dropout)
      } else {
        next.state <- gru.cell(num.hidden = num.hidden, hiddendata = hidden, prev.state = last.states, param = Trans.param.cells, seqidx = seqidx, layeridx = layeridx, dropout = dropout)
      }
      hidden <- next.state$h
      last.states <- next.state
      if (dropout > 0) {hidden <- mx.symbol.Dropout(data = hidden, p = dropout)}
      last.hidden_2 <- c(last.hidden_2, mx.symbol.reshape(data = hidden, shape = c(1, 1, num.hidden, -1)))
    }
    concat_full_2 <- mx.symbol.concat(last.hidden_2, num.args = seq.len, dim = 2, name = paste0("Trans.l", layeridx, ".concat"))
    concat_full_3 <- list(concat_full_1, concat_full_2)
    concat_full <- mx.symbol.concat(concat_full_3, num.args = 2, dim = 1, name = paste0("l", layeridx, ".concat"))
  } else {
    concat_full_3 <- list(concat_full_1)
    concat_full <- mx.symbol.concat(concat_full_3, num.args = 1, dim = 1, name = paste0("l", layeridx, ".concat"))
  }
  
  return(concat_full)
  
}

Section 2: Gated Recurrent Unit (4)

data <- mx.symbol.Variable(name = 'data')
label <- mx.symbol.Variable(name = 'label')

embed1 <- mx.symbol.Embedding(data = data, input.dim = length(words.ref), output.dim = 50, name = 'embed1')
gru1 <- gru.layer(indata = embed1, num.hidden = 50, seq.len = max.length, layeridx = 1, dropout = 0, inverse = TRUE)
pool1 <- mx.symbol.max(data = gru1, axis = 2:3, keepdims = FALSE, name = 'pool1')

fc1 <- mx.symbol.FullyConnected(data = pool1, num.hidden = 1, name = 'fc1')
logistic_pred <- mx.symbol.sigmoid(data = fc1, name = 'logistic_pred')

eps <- 1e-8
ce_loss_pos <-  mx.symbol.broadcast_mul(mx.symbol.log(logistic_pred + eps), label)
ce_loss_neg <-  mx.symbol.broadcast_mul(mx.symbol.log(1 - logistic_pred + eps), 1 - label)
ce_loss_mean <- 0 - mx.symbol.mean(ce_loss_pos + ce_loss_neg)
ce_loss <- mx.symbol.MakeLoss(ce_loss_mean, name = 'ce_loss')

new_arg <- mxnet:::mx.model.init.params(symbol = logistic_pred, 
                                        input.shape = list(data = c(max.length, 1, 20)), 
                                        output.shape = NULL,
                                        initializer = mxnet:::mx.init.Xavier(rnd_type = "uniform", magnitude = 2.24), 
                                        ctx = mx.gpu())

new_arg$arg.params$embed1_weight <- mx.nd.array(t(words.matrix))

my.eval.metric.loss <- mx.metric.custom(
  name = "cross-entropy",
  function(label, pred) {
    return(as.array(pred))
  }
)

my_model <- mx.model.FeedForward.create(symbol = ce_loss, X = my_iter, optimizer = my_optimizer,
                                        array.batch.size = 20, ctx = mx.gpu(), num.round = 20,
                                        eval.metric = my.eval.metric.loss,
                                        arg.params = new_arg$arg.params, fixed.param = 'embed1_weight',
                                        batch.end.callback = mx.callback.log.speedometer(frequency = 50, batch.size = 20))
my_model$symbol <- logistic_pred
my_model$arg.params <- append(my_model$arg.params, new_arg$arg.params['embed1_weight'])
pred_y <- predict(my_model, Test.seq_array)
roc_result <- roc(Test.Y, pred_y)
plot(roc_result, col = "red", main = paste0('AUC = ', formatC(roc_result$auc, 3, format = 'f')))

A Discussion of Convolution Kernels versus Memory Cells in Current Language Tasks

– Consequently, our models are still essentially tools for recognizing key words and phrases; it is just that, as the feature-extraction techniques improve (for example, by using memory cells), the extracted features are now more likely to take the surrounding context into account.

– For example, the Facebook team's 2017 paper Convolutional Sequence to Sequence Learning used stacked convolution kernels to achieve excellent results on translation tasks.

– Yet the same Facebook team's recent, remarkable breakthrough in translation, Unsupervised Machine Translation Using Monolingual Corpora Only, relied on LSTMs instead.

Section 3: The Attention Mechanism (1)

– A better idea, therefore, is to combine the information from the whole sentence; but simply averaging everything does not seem like a very good approach.
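– A tiny base-R illustration of the alternative we will build in this section (a hypothetical 4-word sentence with 3-dimensional features per word): instead of weighting every word equally, a softmax over relevance scores decides how much each word contributes to the pooled vector.

set.seed(3)
H <- matrix(rnorm(3 * 4), nrow = 3)          # hypothetical feature vectors, one column per word
scores <- c(2, -1, 0.5, -2)                  # hypothetical relevance scores, one per word
alpha <- exp(scores) / sum(exp(scores))      # softmax: non-negative weights that sum to 1

rowMeans(H)                                  # plain averaging: every word counts equally
as.vector(H %*% alpha)                       # attention pooling: dominated by the 1st and 3rd words

– In the real model these scores are of course not hand-picked; they are produced by small 1x1 convolutions on top of the GRU features, as shown in Section 3 (3).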

– What about positional information? It is actually not that important; as long as the word order is not scrambled too badly, your brain can usually put it back together:

F08

Section 3: The Attention Mechanism (2)

F09

– For example, our model could be designed like this (the one below is a relatively complex case), taken from the classic paper Feed-Forward Networks with Attention Can Solve Some Long-Term Memory Problems:

F10

F12

Section 3: The Attention Mechanism (3)

– A network with an attention mechanism is actually not complicated; below is the revised model architecture:

data <- mx.symbol.Variable(name = 'data')
label <- mx.symbol.Variable(name = 'label')

embed1 <- mx.symbol.Embedding(data = data, input.dim = length(words.ref), output.dim = 50, name = 'embed1')
gru1 <- gru.layer(indata = embed1, num.hidden = 50, seq.len = max.length, layeridx = 1, dropout = 0, inverse = TRUE)

conv1 <- mx.symbol.Convolution(data = gru1, kernel = c(1, 1), num_filter = 30, name = 'conv1')
relu1 <- mx.symbol.Activation(data = conv1, act_type = "relu", name = 'relu1')
drop1 <- mx.symbol.Dropout(data = relu1, p = 0.2, name = 'drop1')
conv2 <- mx.symbol.Convolution(data = drop1, kernel = c(1, 1), num_filter = 30, name = 'conv2')
relu2 <- mx.symbol.Activation(data = conv2, act_type = "relu", name = 'relu2')
drop2 <- mx.symbol.Dropout(data = relu2, p = 0.2, name = 'drop2')
conv3 <- mx.symbol.Convolution(data = drop2, kernel = c(1, 1), num_filter = 1, name = 'conv3')
Attention_score <- mx.symbol.softmax(data = conv3, axis = 2, name = 'Attention_score')

weighted_gru <- mx.symbol.broadcast_mul(lhs = gru1, rhs = Attention_score, name = 'weighted_gru')
pool1 <- mx.symbol.sum(data = weighted_gru, axis = 2:3, keepdims = FALSE, name = 'pool1')

fc1 <- mx.symbol.FullyConnected(data = pool1, num.hidden = 1, name = 'fc1')
logistic_pred <- mx.symbol.sigmoid(data = fc1, name = 'logistic_pred')

eps <- 1e-8
ce_loss_pos <-  mx.symbol.broadcast_mul(mx.symbol.log(logistic_pred + eps), label)
ce_loss_neg <-  mx.symbol.broadcast_mul(mx.symbol.log(1 - logistic_pred + eps), 1 - label)
ce_loss_mean <- 0 - mx.symbol.mean(ce_loss_pos + ce_loss_neg)
ce_loss <- mx.symbol.MakeLoss(ce_loss_mean, name = 'ce_loss')

new_arg <- mxnet:::mx.model.init.params(symbol = logistic_pred, 
                                        input.shape = list(data = c(max.length, 1, 20)), 
                                        output.shape = NULL,
                                        initializer = mxnet:::mx.init.Xavier(rnd_type = "uniform", magnitude = 2.24), 
                                        ctx = mx.gpu())

new_arg$arg.params$embed1_weight <- mx.nd.array(t(words.matrix))

my.eval.metric.loss <- mx.metric.custom(
  name = "cross-entropy",
  function(label, pred) {
    return(as.array(pred))
  }
)

my_model <- mx.model.FeedForward.create(symbol = ce_loss, X = my_iter, optimizer = my_optimizer,
                                        array.batch.size = 20, ctx = mx.gpu(), num.round = 20,
                                        eval.metric = my.eval.metric.loss,
                                        arg.params = new_arg$arg.params, fixed.param = 'embed1_weight',
                                        batch.end.callback = mx.callback.log.speedometer(frequency = 50, batch.size = 20))
my_model$symbol <- logistic_pred
my_model$arg.params <- append(my_model$arg.params, new_arg$arg.params['embed1_weight'])
pred_y <- predict(my_model, Test.seq_array)
roc_result <- roc(Test.Y, pred_y)
plot(roc_result, col = "red", main = paste0('AUC = ', formatC(roc_result$auc, 3, format = 'f')))

Exercise 2: Understanding the model's decision logic

F11

– If you have not yet finished training the model, you can download gru_v2-0000.params and gru_v2-symbol.json from here.

– But before using this function, we first need to download the two dictionary files my_DIC.aff and my_DIC.dic, which will be used for spelling correction:

library(hunspell)
library(magrittr)

Sys.setenv(DICPATH = "data")

wrong2right <- function (text, spell.check = TRUE) {
  
  RECODE = tolower(text)
  RECODE = RECODE %>% gsub("\n", "<S><S><S><S><S>", ., fixed = TRUE) %>% gsub("\r", "<S><S><S><S><S>", ., fixed = TRUE)

  RECODE = RECODE %>% gsub("[ :,;-]", "<S>", .) %>% gsub("(", "<S>", ., fixed = TRUE) %>% gsub(")", "<S>", ., fixed = TRUE)%>% gsub("/", "<S>", ., fixed = TRUE) 
  RECODE = strsplit(RECODE, split = ".",