循環神經網絡介紹與注意力機制的應用

林嶔 (Lin, Chin)

Lesson 14

長短期記憶單元(1)

– 我們之前分析時是採用卷積神經網路,而卷積核在每次掃描時是獨立運行的,我們上節課的最後已經提到了需要一種帶有「記憶結構」的「類卷積核」,而今天我們主要就先從這裡開始。

F14_1

\[ \begin{align} q_{t-1} & = h_{t-2} || x_{t-1} \\ p_{t-1} & = L(q_{t-1}, W) \\ h_{t-1} & = \tanh(p_{t-1}) \\ q_{t} & = h_{t-1} || x_{t} \\ p_{t} & = L(q_{t}, W) \\ h_{t} & = \tanh(p_{t}) \\ & \dots \end{align} \]

長短期記憶單元(2)

– 因此我們會面臨長期訊息損失的問題,假定\(h_1\)是由100%的\(x_1\)組成,而\(h_2\)是由50%的\(x_2\)和50%的\(h_1\)組成,之後就以這個比例,那隨著距離越走遠,\(x_1\)所佔整體的比例就會慢慢下滑。

F14_2

長短期記憶單元(3)

F14_3

\[ \begin{align} q_{t} & = h_{t-1} || x_{t} \end{align} \]

\[ \begin{align} l_t^1& = L(q_{t}, W^1) \end{align} \]

\[ \begin{align} a_t^1 & = \sigma(l_t^1) \\ \tilde{c_t} & = c_{t-1} \otimes a_t^1 \end{align} \]

\[ \begin{align} l_t^2& = L(q_{t}, W^2) \\ a_t^2 & = \sigma(l_t^2) \\ l_t^3& = L(q_{t}, W^3) \\ a_t^3 & = \tanh(l_t^3) \\ s & = a_t^2 \otimes a_t^3 \\ c_{t} & = \tilde{c_t} + s \end{align} \]

\[ \begin{align} l_t^4& = L(q_{t}, W^4) \\ a_t^4 & = \sigma(l_t^4) \\ o & = \tanh (c_{t}) \\ h_{t} & = o \otimes a_t^4 \end{align} \]

長短期記憶單元(4)

library(mxnet)

lstm.cell <- function(num.hidden, hiddendata, prev.state = NULL, param, seqidx, layeridx, first = FALSE, dropout = 0) {
  
  if (dropout > 0) {hiddendata <- mx.symbol.Dropout(data = hiddendata, p = dropout)}
  
  i2h <- mx.symbol.FullyConnected(data = hiddendata, weight = param$i2h.weight, bias = param$i2h.bias, num.hidden = num.hidden * 4, name = paste0("t", seqidx, ".l", layeridx, ".i2h"))
  
  if (first) {
    gates <- i2h
  } else {
    h2h <- mx.symbol.FullyConnected(data = prev.state$h, weight = param$h2h.weight, bias = param$h2h.bias, num.hidden = num.hidden * 4, name = paste0("t", seqidx, ".l", layeridx, ".h2h"))
    gates <- i2h + h2h
  }
  
  slice.gates <- mx.symbol.SliceChannel(gates, num.outputs = 4, name = paste0("t", seqidx, ".l", layeridx, ".slice"))
  
  in.gate <- mx.symbol.Activation(slice.gates[[1]], act.type = "sigmoid", name = paste0("t", seqidx, ".l", layeridx, ".in.gate"))
  in.transform <- mx.symbol.Activation(slice.gates[[2]], act.type = "tanh", name = paste0("t", seqidx, ".l", layeridx, ".in.transform"))
  forget.gate <- mx.symbol.Activation(slice.gates[[3]], act.type = "sigmoid", name = paste0("t", seqidx, ".l", layeridx, ".forget.gate"))
  out.gate <- mx.symbol.Activation(slice.gates[[4]], act.type = "sigmoid", name = paste0("t", seqidx, ".l", layeridx, ".out.gate"))
  
  if (first) {
    next.c <- (in.gate * in.transform)
  } else {
    next.c <- (forget.gate * prev.state$c) + (in.gate * in.transform)
  }
  
  next.h <- out.gate * mx.symbol.Activation(next.c, act.type="tanh")
  
  return (list(c=next.c, h=next.h))
  
}

長短期記憶單元(5)

– 除此之外,我們還為它加上了正向/雙向的循環結構選項:

F14_4

lstm.layer <- function (indata, num.hidden, seq.len, layeridx = 1, dropout = 0, inverse = TRUE) {
  
  wordvec <- mx.symbol.SliceChannel(data = indata, num_outputs = seq.len, axis = 2)
  
  Cis.param.cells <- list(i2h.weight = mx.symbol.Variable(paste0("Cis.l", layeridx, ".i2h.weight")),
                          i2h.bias = mx.symbol.Variable(paste0("Cis.l", layeridx, ".i2h.bias")),
                          h2h.weight = mx.symbol.Variable(paste0("Cis.l", layeridx, ".h2h.weight")),
                          h2h.bias = mx.symbol.Variable(paste0("Cis.l", layeridx, ".h2h.bias")))
  
  Trans.param.cells <- list(i2h.weight = mx.symbol.Variable(paste0("Trans.l", layeridx, ".i2h.weight")),
                            i2h.bias = mx.symbol.Variable(paste0("Trans.l", layeridx, ".i2h.bias")),
                            h2h.weight = mx.symbol.Variable(paste0("Trans.l", layeridx, ".h2h.weight")),
                            h2h.bias = mx.symbol.Variable(paste0("Trans.l", layeridx, ".h2h.bias")))
  
  last.hidden_1 <- list()
  
  #Cis
  
  for (seqidx in 1:seq.len) {
    hidden <- wordvec[[seqidx]]
    if (seqidx == 1) {
      next.state <- lstm.cell(num.hidden = num.hidden, hiddendata = hidden, first = TRUE, param = Cis.param.cells, seqidx = seqidx, layeridx = layeridx, dropout = dropout)
    } else {
      next.state <- lstm.cell(num.hidden = num.hidden, hiddendata = hidden, prev.state = last.states, param = Cis.param.cells, seqidx = seqidx, layeridx = layeridx, dropout = dropout)
    }
    hidden <- next.state$h
    last.states <- next.state
    if (dropout > 0) {hidden <- mx.symbol.Dropout(data = hidden, p = dropout)}
    last.hidden_1 <- c(last.hidden_1, mx.symbol.reshape(data = hidden, shape = c(1, 1, num.hidden, -1)))
  }
  concat_full_1 <- mx.symbol.concat(last.hidden_1, num.args = seq.len, dim = 2, name = paste0("Cis.l", layeridx, ".concat"))

  if (inverse) {
    
    last.hidden_2 <- list()
    
    for (seqidx in seq.len+1:seq.len) {
      hidden <- wordvec[[seq.len*2 - seqidx + 1]]
      if (seqidx == (seq.len+1)) {
        next.state <- lstm.cell(num.hidden = num.hidden, hiddendata = hidden, first = TRUE, param = Trans.param.cells, seqidx = seqidx, layeridx = layeridx, dropout = dropout)
      } else {
        next.state <- lstm.cell(num.hidden = num.hidden, hiddendata = hidden, prev.state = last.states, param = Trans.param.cells, seqidx = seqidx, layeridx = layeridx, dropout = dropout)
      }
      hidden <- next.state$h
      last.states <- next.state
      if (dropout > 0) {hidden <- mx.symbol.Dropout(data = hidden, p = dropout)}
      last.hidden_2 <- c(last.hidden_2, mx.symbol.reshape(data = hidden, shape = c(1, 1, num.hidden, -1)))
    }
    concat_full_2 <- mx.symbol.concat(last.hidden_2, num.args = seq.len, dim = 2, name = paste0("Trans.l", layeridx, ".concat"))
    concat_full_3 <- list(concat_full_1, concat_full_2)
    concat_full <- mx.symbol.concat(concat_full_3, num.args = 2, dim = 1, name = paste0("l", layeridx, ".concat"))
  } else {
    concat_full_3 <- list(concat_full_1)
    concat_full <- mx.symbol.concat(concat_full_3, num.args = 1, dim = 1, name = paste0("l", layeridx, ".concat"))
  }
  
  return(concat_full)
  
}

長短期記憶單元(6)

– 為了避免運算時間過長,我們僅使用總字數小於200以內的病歷進行實驗:

library(mxnet)
library(magrittr)
library(dplyr)
library(plyr)
library(data.table)

load('data/Discharge_Notes.RData')

word.data <- fread('data/wiki.txt', header = FALSE, showProgress = FALSE)

words.ref <- word.data %>% select(V1) %>% setDF %>% .[,1] %>% as.character
words.ref <- c("", words.ref)
words.index <- 1:length(words.ref)
names(words.index) <- words.ref

words.matrix <- word.data %>% select(-V1) %>% setDF %>% as.matrix
words.matrix <- rbind(rep(0, 50), words.matrix)

max.length <- 200

seq_array <- array(0, dim = c(max.length, length(Processed_Notes)))
short_pos <- which(sapply(Processed_Notes, length) <= max.length)

pb <- txtProgressBar(max = length(short_pos), style = 3)

for (i in 1:length(short_pos)) {
  
  words_pos <- words.index[Processed_Notes[[short_pos[i]]]]
  words_pos[is.na(words_pos)] <- 1
  seq_array[1:length(words_pos),i] <- words_pos - 1
  setTxtProgressBar(pb, i)
  
}

close(pb)

Cancer_code <- Cancer_code[short_pos]

Train.seq_array <- seq_array[,1:3000]
Train.Y <- Cancer_code[1:3000]

Test.seq_array <- seq_array[,3001:length(short_pos)]
dim(Test.seq_array) <- c(max.length, 1, dim(Test.seq_array)[2])
Test.Y <- Cancer_code[3001:length(Cancer_code)]
my_iterator_core <- function (batch_size) {
  
  batch = 0
  batch_per_epoch = length(Train.Y)/batch_size
  
  reset = function() {batch <<- 0}
  
  iter.next = function() {
    batch <<- batch+1
    if (batch > batch_per_epoch) {return(FALSE)} else {return(TRUE)}
  }
  
  value = function() {
    idx = 1:batch_size + (batch - 1) * batch_size
    idx[idx > length(Train.Y)] = sample(1:length(Train.Y), sum(idx > length(Train.Y)))
    data = mx.nd.array(array(Train.seq_array[,idx], dim = c(dim(Train.seq_array)[1], 1, batch_size)))
    label = mx.nd.array(array(Train.Y[idx], dim = c(1, batch_size)))
    return(list(data = data, label = label))
  }
  
  return(list(reset = reset, iter.next = iter.next, value = value, batch_size = batch_size, batch = batch))
}

my_iterator_func <- setRefClass("Custom_Iter",
                                fields = c("iter", "batch_size"),
                                contains = "Rcpp_MXArrayDataIter",
                                methods = list(
                                  initialize = function(iter, batch_size = 100){
                                    .self$iter <- my_iterator_core(batch_size = batch_size)
                                    .self
                                  },
                                  value = function(){
                                    .self$iter$value()
                                  },
                                  iter.next = function(){
                                    .self$iter$iter.next()
                                  },
                                  reset = function(){
                                    .self$iter$reset()
                                  },
                                  finalize=function(){
                                  }
                                )
)

my_iter <- my_iterator_func(iter = NULL, batch_size = 20)

長短期記憶單元(7)

data <- mx.symbol.Variable(name = 'data')
label <- mx.symbol.Variable(name = 'label')

embed1 <- mx.symbol.Embedding(data = data, input.dim = length(words.ref), output.dim = 50, name = 'embed1')
lstm1 <- lstm.layer(indata = embed1, num.hidden = 50, seq.len = max.length, layeridx = 1, dropout = 0, inverse = TRUE)
pool1 <- mx.symbol.max(data = lstm1, axis = 2:3, keepdims = FALSE, name = 'pool1')

fc1 <- mx.symbol.FullyConnected(data = pool1, num.hidden = 1, name = 'fc1')
logistic_pred <- mx.symbol.sigmoid(data = fc1, name = 'logistic_pred')

eps <- 1e-8
ce_loss_pos <-  mx.symbol.broadcast_mul(mx.symbol.log(logistic_pred + eps), label)
ce_loss_neg <-  mx.symbol.broadcast_mul(mx.symbol.log(1 - logistic_pred + eps), 1 - label)
ce_loss_mean <- 0 - mx.symbol.mean(ce_loss_pos + ce_loss_neg)
ce_loss <- mx.symbol.MakeLoss(ce_loss_mean, name = 'ce_loss')
my_optimizer <- mx.opt.create(name = "sgd", learning.rate = 0.05, momentum = 0.9, wd = 1e-4)

長短期記憶單元(8)

new_arg <- mxnet:::mx.model.init.params(symbol = logistic_pred, 
                                        input.shape = list(data = c(max.length, 1, 20)), 
                                        output.shape = NULL,
                                        initializer = mxnet:::mx.init.Xavier(rnd_type = "uniform", magnitude = 2.24), 
                                        ctx = mx.cpu())

new_arg$arg.params$embed1_weight <- mx.nd.array(t(words.matrix))

my.eval.metric.loss <- mx.metric.custom(
  name = "cross-entropy",
  function(label, pred) {
    return(as.array(pred))
  }
)

my_model <- mx.model.FeedForward.create(symbol = ce_loss, X = my_iter, optimizer = my_optimizer,
                                        array.batch.size = 20, ctx = mx.cpu(), num.round = 30,
                                        eval.metric = my.eval.metric.loss,
                                        arg.params = new_arg$arg.params, fixed.param = 'embed1_weight',
                                        batch.end.callback = mx.callback.log.speedometer(frequency = 50, batch.size = 20))
my_model$symbol <- logistic_pred
my_model$arg.params <- append(my_model$arg.params, new_arg$arg.params['embed1_weight'])
pred_y <- predict(my_model, Test.seq_array)
library(pROC)

roc_result <- roc(Test.Y, pred_y)
plot(roc_result, col = "red", main = paste0('AUC = ', formatC(roc_result$auc, 3, format = 'f')))

練習1:重現LSTM單元的推理過程

– 如果你還沒訓練完模型,你可以從這裡下載lstm_v1-0000.params以及lstm_v1-symbol.json

– 接著,請你試著重現LSTM單元的推理過程:

my_model <- mx.model.load('model/lstm_v1', 0)

all_layers <- my_model$symbol$get.internals()
my_model$symbol <- which(all_layers$outputs == 'l1.concat_output') %>% all_layers$get.output()
my_model$arg.params <- my_model$arg.params[!grepl('fc1', names(my_model$arg.params))]

Input <- Test.seq_array[,,5]
dim(Input) <- c(200, 1, 1)
Output <- predict(my_model, Input)

– 我們只做單向的前5個字,請試著做出相對應的預測:

sub_Input <- Input[1:5,,]
sub_Output <- Output[,1:5,1:50,]

練習1答案(1)

Embedding_Output <- as.array(my_model$arg.params$embed1_weight)[,sub_Input + 1]
i = 1

input_vec <- Embedding_Output[,i]
i2h <- input_vec %*% as.array(my_model$arg.params$Cis.l1.i2h.weight) + as.array(my_model$arg.params$Cis.l1.i2h.bias)
gates <- i2h

in.gate <- 1/(1+exp(-gates[1:50]))
in.transform <- tanh(gates[51:100])
forget.gate <- 1/(1+exp(-gates[101:150]))
out.gate <- 1/(1+exp(-gates[151:200]))

next.c <- in.gate * in.transform
next.h <- out.gate * tanh(next.c)
next.h
##  [1]  0.086967833 -0.063449661  0.036880010  0.010068880 -0.149757446
##  [6] -0.045954626 -0.007239696 -0.029001972 -0.036172191  0.014461363
## [11] -0.143614092  0.104743384  0.077178065 -0.018109491 -0.053787901
## [16]  0.033238218  0.073090688  0.219209935 -0.139997959 -0.040038392
## [21] -0.080872811 -0.060081231 -0.020082376 -0.033003086  0.071313954
## [26] -0.264252035  0.035592710 -0.021121420 -0.132410585 -0.033140717
## [31] -0.024339752  0.218807462 -0.020077547 -0.001792587 -0.015403937
## [36]  0.019053839 -0.097208227 -0.121762368  0.021987457  0.049170363
## [41]  0.019188609 -0.017815512  0.076241088 -0.145931257 -0.092379922
## [46]  0.021569875 -0.020209405 -0.020605694  0.270575293  0.009769096
sub_Output[1,]
##  [1]  0.086967841 -0.063449658  0.036880009  0.010068880 -0.149757430
##  [6] -0.045954626 -0.007239695 -0.029001970 -0.036172196  0.014461363
## [11] -0.143614084  0.104743384  0.077178054 -0.018109491 -0.053787902
## [16]  0.033238214  0.073090695  0.219209924 -0.139997959 -0.040038396
## [21] -0.080872811 -0.060081221 -0.020082375 -0.033003084  0.071313955
## [26] -0.264252037  0.035592709 -0.021121420 -0.132410586 -0.033140719
## [31] -0.024339752  0.218807474 -0.020077543 -0.001792587 -0.015403938
## [36]  0.019053841 -0.097208224 -0.121762373  0.021987455  0.049170367
## [41]  0.019188609 -0.017815515  0.076241091 -0.145931244 -0.092379920
## [46]  0.021569878 -0.020209406 -0.020605695  0.270575285  0.009769096
i = 2

input_vec <- Embedding_Output[,i]
i2h <- input_vec %*% as.array(my_model$arg.params$Cis.l1.i2h.weight) + as.array(my_model$arg.params$Cis.l1.i2h.bias)
h2h <- next.h %*% as.array(my_model$arg.params$Cis.l1.h2h.weight) + as.array(my_model$arg.params$Cis.l1.h2h.bias)
gates <- i2h + h2h

in.gate <- 1/(1+exp(-gates[,1:50]))
in.transform <- tanh(gates[,51:100])
forget.gate <- 1/(1+exp(-gates[,101:150]))
out.gate <- 1/(1+exp(-gates[,151:200]))

next.c <- forget.gate * next.c + in.gate * in.transform
next.h <- out.gate * tanh(next.c)
next.h
##  [1]  0.172714273 -0.134861879  0.084825992  0.026005947 -0.292853043
##  [6] -0.143597484 -0.054681028 -0.069802900 -0.134559245  0.001540124
## [11] -0.307148649  0.208848144  0.314672849 -0.058207505 -0.153540190
## [16]  0.075622727  0.163340499  0.578598825 -0.251814185 -0.086394009
## [21] -0.219408604 -0.151962526 -0.038312221 -0.078217000  0.185670722
## [26] -0.527193472  0.082940466 -0.087716673 -0.319968570 -0.094814267
## [31] -0.098546791  0.587162975 -0.057633908 -0.034965075 -0.033302097
## [36]  0.061189049 -0.235737058 -0.228580143  0.068899261  0.153120268
## [41]  0.044878595 -0.065051461  0.171124248 -0.306323643 -0.283827253
## [46]  0.047662889 -0.072233065 -0.035651133  0.658952325  0.038527077
sub_Output[2,]
##  [1]  0.172714278 -0.134861872  0.084825985  0.026005950 -0.292853028
##  [6] -0.143597469 -0.054681018 -0.069802903 -0.134559259  0.001540127
## [11] -0.307148665  0.208848134  0.314672887 -0.058207501 -0.153540164
## [16]  0.075622715  0.163340494  0.578598797 -0.251814157 -0.086394012
## [21] -0.219408616 -0.151962534 -0.038312223 -0.078216992  0.185670748
## [26] -0.527193487  0.082940467 -0.087716669 -0.319968551 -0.094814263
## [31] -0.098546788  0.587162971 -0.057633907 -0.034965076 -0.033302099
## [36]  0.061189052 -0.235737056 -0.228580162  0.068899259  0.153120279
## [41]  0.044878591 -0.065051466  0.171124235 -0.306323618 -0.283827245
## [46]  0.047662891 -0.072233059 -0.035651125  0.658952355  0.038527071

練習1答案(2)

next.c <- 0
h_list <- list()

for (i in 1:5) {
  
  input_vec <- Embedding_Output[,i]
  i2h <- input_vec %*% as.array(my_model$arg.params$Cis.l1.i2h.weight) + as.array(my_model$arg.params$Cis.l1.i2h.bias)
  if (i != 1) {
    h2h <- next.h %*% as.array(my_model$arg.params$Cis.l1.h2h.weight) + as.array(my_model$arg.params$Cis.l1.h2h.bias)
    gates <- i2h + h2h
  } else {
    gates <- i2h
  }
  
  in.gate <- 1/(1+exp(-gates[,1:50]))
  in.transform <- tanh(gates[,51:100])
  forget.gate <- 1/(1+exp(-gates[,101:150]))
  out.gate <- 1/(1+exp(-gates[,151:200]))
  
  next.c <- forget.gate * next.c + in.gate * in.transform
  next.h <- out.gate * tanh(next.c)
  h_list[[i]] <- next.h
  
}

my_Output <- do.call('rbind', h_list)

all.equal(my_Output, sub_Output)
## [1] "Mean relative difference: 5.544029e-08"

Gated Recurrent Unit(1)

next.c <- forget.gate * next.c + in.gate * in.transform

– 另外,長期記憶和短期記憶為什麼不能整合起來呢?

F14_5

Gated Recurrent Unit(2)

F14_6

\[ \begin{align} q_{t} & = h_{t-1} || x_{t} \end{align} \]

\[ \begin{align} l_t^1& = L(q_{t}, W^1) \\ a_t^1 & = \sigma(l_t^1) \\ \tilde{h_t} & = h_{t-1} \otimes a_t^1 \end{align} \]

\[ \begin{align} l_t^2& = L(q_{t}, W^2) \\ a_t^2 & = \sigma(l_t^2) \end{align} \]

\[ \begin{align} \tilde{q_{t}} & = \tilde{h_t} || x_{t} \\ l_t^3& = L(\tilde{q_{t}}, W^3) \\ a_t^3 & = \tanh(l_t^3) \end{align} \]

\[ \begin{align} h_t & = a_t^2 \otimes a_t^3 + (1-a_t^2) \otimes h_{t-1} \end{align} \]

Gated Recurrent Unit(3)

library(mxnet)

gru.cell <- function(num.hidden, hiddendata, prev.state = NULL, param, seqidx, layeridx, first = FALSE, dropout = 0) {
  
  if (dropout > 0) {hiddendata <- mx.symbol.Dropout(data = hiddendata, p = dropout)}
  
  i2h <- mx.symbol.FullyConnected(data = hiddendata, weight = param$gates.i2h.weight, bias = param$gates.i2h.bias, num.hidden = num.hidden * 2, name = paste0("t", seqidx, ".l", layeridx, ".gates.i2h"))
  
  if (first) {
    gates <- i2h
  } else {
    h2h <- mx.symbol.FullyConnected(data = prev.state$h, weight = param$gates.h2h.weight, bias = param$gates.h2h.bias, num.hidden = num.hidden * 2, name = paste0("t", seqidx, ".l", layeridx, ".gates.h2h"))
    gates <- i2h + h2h
  }
  
  slice.gates <- mx.symbol.SliceChannel(gates, num.outputs = 2, name = paste0("t", seqidx, ".l", layeridx, ".slice"))
  
  update.gate <- mx.symbol.Activation(slice.gates[[1]], act.type = "sigmoid", name = paste0("t", seqidx, ".l", layeridx, ".update.gate"))
  reset.gate <- mx.symbol.Activation(slice.gates[[2]], act.type = "sigmoid", name = paste0("t", seqidx, ".l", layeridx, ".reset.gate"))
  
  htrans.i2h <- mx.symbol.FullyConnected(data = hiddendata, weight = param$trans.i2h.weight, bias = param$trans.i2h.bias, num.hidden = num.hidden, name = paste0("t", seqidx, ".l", layeridx, ".trans.i2h"))
  
  if (first) {
    h.trans <- htrans.i2h
    h.trans.active <- mx.symbol.Activation(h.trans, act.type="tanh", name = paste0("t", seqidx, ".l", layeridx, ".h.trans.active"))
    next.h <- h.trans.active
  } else {
    h.after.reset <- prev.state$h * reset.gate
    htrans.h2h <- mx.symbol.FullyConnected(data = h.after.reset, weight = param$trans.h2h.weight, bias = param$trans.h2h.bias, num.hidden = num.hidden, name = paste0("t", seqidx, ".l", layeridx, ".trans.h2h"))
    h.trans <- htrans.i2h + htrans.h2h
    h.trans.active <- mx.symbol.Activation(h.trans, act.type="tanh", name = paste0("t", seqidx, ".l", layeridx, ".h.trans.active"))
    next.h <- update.gate * h.trans.active + (1 - update.gate) * prev.state$h
  }
  
  return (list(h=next.h))
  
}
gru.layer <- function (indata, num.hidden, seq.len, layeridx = 1, dropout = 0, inverse = TRUE) {
  
  wordvec <- mx.symbol.SliceChannel(data = indata, num_outputs = seq.len, axis = 2)
  
  Cis.param.cells <- list(gates.i2h.weight = mx.symbol.Variable(paste0("Cis.l", layeridx, ".gates.i2h.weight")),
                          gates.i2h.bias = mx.symbol.Variable(paste0("Cis.l", layeridx, ".gates.i2h.bias")),
                          gates.h2h.weight = mx.symbol.Variable(paste0("Cis.l", layeridx, ".gates.h2h.weight")),
                          gates.h2h.bias = mx.symbol.Variable(paste0("Cis.l", layeridx, ".gates.h2h.bias")),
                          trans.i2h.weight = mx.symbol.Variable(paste0("Cis.l", layeridx, ".trans.i2h.weight")),
                          trans.i2h.bias = mx.symbol.Variable(paste0("Cis.l", layeridx, ".trans.i2h.bias")),
                          trans.h2h.weight = mx.symbol.Variable(paste0("Cis.l", layeridx, ".trans.h2h.weight")),
                          trans.h2h.bias = mx.symbol.Variable(paste0("Cis.l", layeridx, ".trans.h2h.bias")))
  
  Trans.param.cells <- list(gates.i2h.weight = mx.symbol.Variable(paste0("Trans.l", layeridx, ".gates.i2h.weight")),
                            gates.i2h.bias = mx.symbol.Variable(paste0("Trans.l", layeridx, ".gates.i2h.bias")),
                            gates.h2h.weight = mx.symbol.Variable(paste0("Trans.l", layeridx, ".gates.h2h.weight")),
                            gates.h2h.bias = mx.symbol.Variable(paste0("Trans.l", layeridx, ".gates.h2h.bias")),
                            trans.i2h.weight = mx.symbol.Variable(paste0("Trans.l", layeridx, ".trans.i2h.weight")),
                            trans.i2h.bias = mx.symbol.Variable(paste0("Trans.l", layeridx, ".trans.i2h.bias")),
                            trans.h2h.weight = mx.symbol.Variable(paste0("Trans.l", layeridx, ".trans.h2h.weight")),
                            trans.h2h.bias = mx.symbol.Variable(paste0("Trans.l", layeridx, ".trans.h2h.bias")))
  
  last.hidden_1 <- list()
  
  #Cis
  
  for (seqidx in 1:seq.len) {
    hidden <- wordvec[[seqidx]]
    if (seqidx == 1) {
      next.state <- gru.cell(num.hidden = num.hidden, hiddendata = hidden, first = TRUE, param = Cis.param.cells, seqidx = seqidx, layeridx = layeridx, dropout = dropout)
    } else {
      next.state <- gru.cell(num.hidden = num.hidden, hiddendata = hidden, prev.state = last.states, param = Cis.param.cells, seqidx = seqidx, layeridx = layeridx, dropout = dropout)
    }
    hidden <- next.state$h
    last.states <- next.state
    if (dropout > 0) {hidden <- mx.symbol.Dropout(data = hidden, p = dropout)}
    last.hidden_1 <- c(last.hidden_1, mx.symbol.reshape(data = hidden, shape = c(1, 1, num.hidden, -1)))
  }
  concat_full_1 <- mx.symbol.concat(last.hidden_1, num.args = seq.len, dim = 2, name = paste0("Cis.l", layeridx, ".concat"))
  
  if (inverse) {
    
    last.hidden_2 <- list()
    
    for (seqidx in seq.len+1:seq.len) {
      hidden <- wordvec[[seq.len*2 - seqidx + 1]]
      if (seqidx == (seq.len+1)) {
        next.state <- gru.cell(num.hidden = num.hidden, hiddendata = hidden, first = TRUE, param = Trans.param.cells, seqidx = seqidx, layeridx = layeridx, dropout = dropout)
      } else {
        next.state <- gru.cell(num.hidden = num.hidden, hiddendata = hidden, prev.state = last.states, param = Trans.param.cells, seqidx = seqidx, layeridx = layeridx, dropout = dropout)
      }
      hidden <- next.state$h
      last.states <- next.state
      if (dropout > 0) {hidden <- mx.symbol.Dropout(data = hidden, p = dropout)}
      last.hidden_2 <- c(last.hidden_2, mx.symbol.reshape(data = hidden, shape = c(1, 1, num.hidden, -1)))
    }
    concat_full_2 <- mx.symbol.concat(last.hidden_2, num.args = seq.len, dim = 2, name = paste0("Trans.l", layeridx, ".concat"))
    concat_full_3 <- list(concat_full_1, concat_full_2)
    concat_full <- mx.symbol.concat(concat_full_3, num.args = 2, dim = 1, name = paste0("l", layeridx, ".concat"))
  } else {
    concat_full_3 <- list(concat_full_1)
    concat_full <- mx.symbol.concat(concat_full_3, num.args = 1, dim = 1, name = paste0("l", layeridx, ".concat"))
  }
  
  return(concat_full)
  
}

Gated Recurrent Unit(4)

data <- mx.symbol.Variable(name = 'data')
label <- mx.symbol.Variable(name = 'label')

embed1 <- mx.symbol.Embedding(data = data, input.dim = length(words.ref), output.dim = 50, name = 'embed1')
gru1 <- gru.layer(indata = embed1, num.hidden = 50, seq.len = max.length, layeridx = 1, dropout = 0, inverse = TRUE)
pool1 <- mx.symbol.max(data = gru1, axis = 2:3, keepdims = FALSE, name = 'pool1')

fc1 <- mx.symbol.FullyConnected(data = pool1, num.hidden = 1, name = 'fc1')
logistic_pred <- mx.symbol.sigmoid(data = fc1, name = 'logistic_pred')

eps <- 1e-8
ce_loss_pos <-  mx.symbol.broadcast_mul(mx.symbol.log(logistic_pred + eps), label)
ce_loss_neg <-  mx.symbol.broadcast_mul(mx.symbol.log(1 - logistic_pred + eps), 1 - label)
ce_loss_mean <- 0 - mx.symbol.mean(ce_loss_pos + ce_loss_neg)
ce_loss <- mx.symbol.MakeLoss(ce_loss_mean, name = 'ce_loss')
new_arg <- mxnet:::mx.model.init.params(symbol = logistic_pred, 
                                        input.shape = list(data = c(max.length, 1, 20)), 
                                        output.shape = NULL,
                                        initializer = mxnet:::mx.init.Xavier(rnd_type = "uniform", magnitude = 2.24), 
                                        ctx = mx.cpu())

new_arg$arg.params$embed1_weight <- mx.nd.array(t(words.matrix))

my.eval.metric.loss <- mx.metric.custom(
  name = "cross-entropy",
  function(label, pred) {
    return(as.array(pred))
  }
)

my_model <- mx.model.FeedForward.create(symbol = ce_loss, X = my_iter, optimizer = my_optimizer,
                                        array.batch.size = 20, ctx = mx.cpu(), num.round = 30,
                                        eval.metric = my.eval.metric.loss,
                                        arg.params = new_arg$arg.params, fixed.param = 'embed1_weight',
                                        batch.end.callback = mx.callback.log.speedometer(frequency = 50, batch.size = 20))
my_model$symbol <- logistic_pred
my_model$arg.params <- append(my_model$arg.params, new_arg$arg.params['embed1_weight'])
pred_y <- predict(my_model, Test.seq_array)
roc_result <- roc(Test.Y, pred_y)
plot(roc_result, col = "red", main = paste0('AUC = ', formatC(roc_result$auc, 3, format = 'f')))

關於目前卷積核與記憶單元在語言任務中的討論

– 因此,我們的模型基本上還是一種辨識關鍵字/片語的工具,只是隨著抽取特徵的技術越來越好(像是使用了記憶單元),現在抽取的特徵比較有可能考慮到了上下文關係。

– 像是Facebook團隊在2017年發表的Convolutional Sequence to Sequence Learning,就是用堆疊卷積核在翻譯任務上取得非常好的成績。

– 但同樣也是Facebook團隊近期在翻譯任務上取得的不可思議突破:Unsupervised Machine Translation Using Monolingual Corpora Only,所使用的卻是LSTM

注意力機制(1)

– 因此比較好的想法就是要把整個句子的資訊都加在一起,但直接取平均似乎不是太好的方法。

– 至於位置資訊呢?其實並不是這麼重要,在不是特別離譜的狀況下你的大腦通常是能修正回來的:

F14_8

注意力機制(2)

F14_9

– 舉例來說,我們的模型其實可以設計成這樣(下面是一個比較複雜的案例),他出自這篇經典的Paper:Feed-Forward Networks with Attention Can Solve Some Long-Term Memory Problems

F14_10

F14_12

注意力機制(3)

– 其實有注意力機制的網路並不複雜,而下面是修正之後的Model architecture:

data <- mx.symbol.Variable(name = 'data')
label <- mx.symbol.Variable(name = 'label')

embed1 <- mx.symbol.Embedding(data = data, input.dim = length(words.ref), output.dim = 50, name = 'embed1')
gru1 <- gru.layer(indata = embed1, num.hidden = 50, seq.len = max.length, layeridx = 1, dropout = 0, inverse = TRUE)

conv1 <- mx.symbol.Convolution(data = gru1, kernel = c(1, 1), num_filter = 30, name = 'conv1')
relu1 <- mx.symbol.Activation(data = conv1, act_type = "relu", name = 'relu1')
drop1 <- mx.symbol.Dropout(data = relu1, p = 0.2, name = 'drop1')
conv2 <- mx.symbol.Convolution(data = drop1, kernel = c(1, 1), num_filter = 30, name = 'conv2')
relu2 <- mx.symbol.Activation(data = conv2, act_type = "relu", name = 'relu2')
drop2 <- mx.symbol.Dropout(data = relu2, p = 0.2, name = 'drop1')
conv3 <- mx.symbol.Convolution(data = drop2, kernel = c(1, 1), num_filter = 1, name = 'conv3')
Attention_score <- mx.symbol.softmax(data = conv3, axis = 2, name = 'Attention_score')

weighted_gru <- mx.symbol.broadcast_mul(lhs = gru1, rhs = Attention_score, name = 'weighted_gru')
pool1 <- mx.symbol.sum(data = weighted_gru, axis = 2:3, keepdims = FALSE, name = 'pool1')

fc1 <- mx.symbol.FullyConnected(data = pool1, num.hidden = 1, name = 'fc1')
logistic_pred <- mx.symbol.sigmoid(data = fc1, name = 'logistic_pred')

eps <- 1e-8
ce_loss_pos <-  mx.symbol.broadcast_mul(mx.symbol.log(logistic_pred + eps), label)
ce_loss_neg <-  mx.symbol.broadcast_mul(mx.symbol.log(1 - logistic_pred + eps), 1 - label)
ce_loss_mean <- 0 - mx.symbol.mean(ce_loss_pos + ce_loss_neg)
ce_loss <- mx.symbol.MakeLoss(ce_loss_mean, name = 'ce_loss')
new_arg <- mxnet:::mx.model.init.params(symbol = logistic_pred, 
                                        input.shape = list(data = c(max.length, 1, 20)), 
                                        output.shape = NULL,
                                        initializer = mxnet:::mx.init.Xavier(rnd_type = "uniform", magnitude = 2.24), 
                                        ctx = mx.cpu())

new_arg$arg.params$embed1_weight <- mx.nd.array(t(words.matrix))

my.eval.metric.loss <- mx.metric.custom(
  name = "cross-entropy",
  function(label, pred) {
    return(as.array(pred))
  }
)

my_model <- mx.model.FeedForward.create(symbol = ce_loss, X = my_iter, optimizer = my_optimizer,
                                        array.batch.size = 20, ctx = mx.cpu(), num.round = 30,
                                        eval.metric = my.eval.metric.loss,
                                        arg.params = new_arg$arg.params, fixed.param = 'embed1_weight',
                                        batch.end.callback = mx.callback.log.speedometer(frequency = 50, batch.size = 20))
my_model$symbol <- logistic_pred
my_model$arg.params <- append(my_model$arg.params, new_arg$arg.params['embed1_weight'])
pred_y <- predict(my_model, Test.seq_array)
roc_result <- roc(Test.Y, pred_y)
plot(roc_result, col = "red", main = paste0('AUC = ', formatC(roc_result$auc, 3, format = 'f')))

練習2:了解模型的判斷邏輯

F14_11

– 如果你還沒訓練完模型,你可以從這裡下載gru_v2-0000.params以及gru_v2-symbol.json

– 但在使用這個函數之前,我們需要先下載my_DIC.aff以及my_DIC.dic兩個字典檔案作為我們拼寫校正之用:

library(hunspell)
library(magrittr)

Sys.setenv(DICPATH = "data")

wrong2right <- function (text, spell.check = TRUE) {
  
  RECODE = tolower(text)
  RECODE = RECODE %>% gsub("\n", "<S><S><S><S><S>", ., fixed = TRUE) %>% gsub("\r", "<S><S><S><S><S>", ., fixed = TRUE)

  RECODE = RECODE %>% gsub("[ :,;-]", "<S>", .) %>% gsub("(", "<S>", ., fixed = TRUE) %>% gsub(")", "<S>", ., fixed = TRUE)%>% gsub("/", "<S>", ., fixed = TRUE) 
  RECODE = strsplit(RECODE, split = ".", fixed = TRUE)[[1]]
  RECODE = paste(RECODE, collapse = "<S><S><S><S><S>")
  RECODE = gsub("(<S>){5, }", "<S><S><S><S><S>", RECODE)
  RECODE = gsub("(<S>)+$", "", RECODE)
  RECODE = strsplit(RECODE, split = "<S>", fixed = TRUE)[[1]]

  pos1 = rep(NA, length(RECODE))
  pos2 = rep(NA, length(RECODE))
  
  start.pos = 1
  sub.text = text
  
  for (i in 1:length(RECODE)) {
    if (RECODE[i]!="") {
      result = regexpr(RECODE[i], tolower(sub.text))
      if (result[1] != -1) {
        pos1[i] = start.pos + result[1] - 1
        pos2[i] = start.pos + result[1] + attr(result, "match.length") - 2
        start.pos = pos2[i] + 1
        sub.text = substr(sub.text, result[1] + attr(result, "match.length"), nchar(sub.text))
      }
    }
  }
  
  RECODE = RECODE %>% gsub("[^A-Z0-9a-z .:,;-]", "", .) 
  RECODE[!(!grepl("[^a-z]", RECODE)|!grepl("[^0-9]", RECODE))] = RECODE[!(!grepl("[^a-z]", RECODE)|!grepl("[^0-9]", RECODE))] %>% gsub("[0-9]", "", .)
  RECODE = RECODE %>% gsub("[0-9]", "", .)
  
  if (spell.check) {
    SPELL_check = which(!hunspell:::hunspell_check(RECODE, dict = hunspell:::dictionary("my_DIC", cache = FALSE)))
    if (length(SPELL_check) > 0) {
      hist.num.space = 0
      for (k in 1:length(SPELL_check)) {
        RECODE[SPELL_check[k]] = hunspell:::hunspell_suggest(RECODE[SPELL_check[k]], dict = hunspell:::dictionary("my_DIC", cache = FALSE))[[1]][1]
        if (grepl(" ", RECODE[SPELL_check[k]], fixed = TRUE)) {
          num.space = length(gregexpr(" ", RECODE[SPELL_check[k]], fixed = TRUE)[[1]])
          len.pos1 = length(pos1)
          pos1 = c(pos1[1:(SPELL_check[k]+hist.num.space)], rep(NA, num.space - 1), pos1[(SPELL_check[k]+hist.num.space):len.pos1])
          pos2 = c(pos2[1:(SPELL_check[k]+hist.num.space)], rep(NA, num.space - 1), pos2[(SPELL_check[k]+hist.num.space):len.pos1])
          pos1[SPELL_check[k]+hist.num.space+1:num.space] = NA
          pos2[SPELL_check[k]+hist.num.space+0:(num.space-1)] = NA
          hist.num.space = hist.num.space + num.space
        }
      }
    }
    RECODE = RECODE %>% gsub(" ", "<S>", .)
    RECODE[nchar(RECODE) == 0] = "<A>"
    RECODE = strsplit(RECODE, split = "<S>", fixed = TRUE) %>% unlist
    RECODE[RECODE %in% "<A>"] <- ""
    RECODE = tolower(RECODE)
  }
  
  return(list(new.text = RECODE, original.text = text, pos1 = pos1, pos2 = pos2))
  
}

– 採用的病歷一樣使用之前的例子:

  1. Adenocarcinoma of stomach with peritoneal carcinomatosis and massive ascite, stage IV under bidirection chemotherapy (neoadjuvant intraperitoneal-systemic chemotherapy) with intraperitoneal paclitaxel 120mg (20151126, 20151201) and systemic with Oxalip (20151127) and oral XELOX.

  2. Chronic kidney disease, stage V with pulmonary edema underwent emergent hemodialysis, status post arteriovenous graft creation with maintenance hemodialysis.

練習2答案(1)

my_predict <- function (model, doc, ctx = mx.cpu()) {
  
  require(magrittr)
  
  all_layers <- model$symbol$get.internals()
  
  Attention_score <- which(all_layers$outputs == 'Attention_score_output') %>% all_layers$get.output()
  logistic_pred <- which(all_layers$outputs == 'logistic_pred_output') %>% all_layers$get.output()
  
  out <- mx.symbol.Group(c(Attention_score, logistic_pred))
  executor <- mx.simple.bind(symbol = out, data = dim(doc), ctx = ctx)
  
  mx.exec.update.arg.arrays(executor, model$arg.params, match.name = TRUE)
  mx.exec.update.aux.arrays(executor, model$aux.params, match.name = TRUE)
  if (class(doc)!='MXNDArray') {doc <- mx.nd.array(doc)}
  mx.exec.update.arg.arrays(executor, list(data = doc), match.name = TRUE)
  mx.exec.forward(executor, is.train = FALSE)
  
  pred_list <- list()
  
  pred_list[[1]] <- as.array(executor$ref.outputs$Attention_score_output)
  pred_list[[2]] <- as.array(executor$ref.outputs$logistic_pred_output)
  
  return(pred_list)
  
}
my_discharge_note <- 'Adenocarcinoma of stomach with peritoneal carcinomatosis and massive ascite, stage IV under bidirection chemotherapy (neoadjuvant intraperitoneal-systemic chemotherapy) with intraperitoneal paclitaxel 120mg (20151126, 20151201) and systemic with Oxalip (20151127) and oral XELOX.'

processed_discharge_note <- wrong2right(my_discharge_note)

raw_len <- length(processed_discharge_note$new.text)
discharge_note_seq <- rep(0, 200)
words_pos <- words.index[processed_discharge_note$new.text]
words_pos[is.na(words_pos)] <- 1
discharge_note_seq[1:raw_len] <- words_pos - 1
dim(discharge_note_seq) <- c(200, 1, 1)

pred_list <- my_predict(model = my_model, doc = discharge_note_seq)

rbind(round(pred_list[[1]][,1:raw_len,,], 3), processed_discharge_note$new.text)
##      [,1]             [,2] [,3]      [,4]   [,5]         [,6]        
## [1,] "0.932"          "0"  "0"       "0"    "0"          "0.068"     
## [2,] "adenocarcinoma" "of" "stomach" "with" "peritoneal" "carcinomas"
##      [,7]  [,8]      [,9]    [,10] [,11]   [,12] [,13]   [,14] [,15]      
## [1,] "0"   "0"       "0"     "0"   "0"     "0"   "0"     "0"   "0"        
## [2,] "and" "massive" "ascie" ""    "stage" "iv"  "under" "bi"  "direction"
##      [,16]          [,17] [,18] [,19]      [,20]             [,21]     
## [1,] "0"            "0"   "0"   "0"        "0"               "0"       
## [2,] "chemotherapy" ""    "neo" "adjuvant" "intraperitoneal" "systemic"
##      [,22]          [,23] [,24]  [,25]             [,26]        [,27]
## [1,] "0"            "0"   "0"    "0"               "0"          "0"  
## [2,] "chemotherapy" ""    "with" "intraperitoneal" "paclitaxel" "mg" 
##      [,28] [,29] [,30] [,31] [,32] [,33] [,34]      [,35]  [,36]    [,37]
## [1,] "0"   "0"   "0"   "0"   "0"   "0"   "0"        "0"    "0"      "0"  
## [2,] ""    ""    ""    ""    ""    "and" "systemic" "with" "oxalis" ""   
##      [,38] [,39] [,40] [,41]  [,42]  
## [1,] "0"   "0"   "0"   "0"    "0"    
## [2,] ""    ""    "and" "oral" "xerox"
my_discharge_note <- 'Chronic kidney disease, stage V with pulmonary edema underwent emergent hemodialysis, status post arteriovenous graft creation with maintenance hemodialysis.'

processed_discharge_note <- wrong2right(my_discharge_note)

raw_len <- length(processed_discharge_note$new.text)
discharge_note_seq <- rep(0, 200)
words_pos <- words.index[processed_discharge_note$new.text]
words_pos[is.na(words_pos)] <- 1
discharge_note_seq[1:raw_len] <- words_pos - 1
dim(discharge_note_seq) <- c(200, 1, 1)

pred_list <- my_predict(model = my_model, doc = discharge_note_seq)

rbind(round(pred_list[[1]][,1:raw_len,,], 3), processed_discharge_note$new.text)
##      [,1]      [,2]     [,3]      [,4]    [,5]    [,6]    [,7]   
## [1,] "0.007"   "0.003"  "0.004"   "0.002" "0.002" "0.002" "0.002"
## [2,] "chronic" "kidney" "disease" ""      "stage" "v"     "with" 
##      [,8]        [,9]    [,10]       [,11]      [,12]          [,13]  
## [1,] "0.003"     "0.051" "0.505"     "0.016"    "0.008"        "0.002"
## [2,] "pulmonary" "edema" "underwent" "emergent" "hemodialysis" ""     
##      [,14]    [,15]   [,16]           [,17]   [,18]      [,19]  
## [1,] "0.002"  "0.002" "0.001"         "0.003" "0.002"    "0.002"
## [2,] "status" "post"  "arteriovenous" "graft" "creation" "with" 
##      [,20]         [,21]         
## [1,] "0.002"       "0.001"       
## [2,] "maintenance" "hemodialysis"

– 第二份病歷似乎連空字元都有0.2%的權重,並且具有最高權重的單字是underwent以及edema,這兩個很可能是反向字,思考一下模型的合理性。

練習2答案(2)

Highlight_fun <- function (text.list, highlight.seq) {
  
  nrom_col_index <- round(highlight.seq / max(highlight.seq) * 99) + 1
  col_list <- rgb(seq(0, 1, length.out = 100), 0, 0)
  col_index <- col_list[nrom_col_index]
  
  if (text.list$pos1[1] > 1) {
    html_output <- substr(text.list$original.text, 1, text.list$pos1[1] - 1)
  } else {
    html_output <- ""
  }
  
  last.end <- 0
  
  for (j in 1:length(highlight.seq)) {
    if (!is.na(text.list$pos1[j]) & !is.na(text.list$pos2[j]) & text.list$new.text[j] != "") {
      if (last.end != 0|text.list$pos1[j] > 1) {
        html_output <- paste0(html_output, substr(text.list$original.text, last.end + 1, text.list$pos1[j] - 1))
      }
      html_output <- paste0(html_output, '<font color=\"', col_index[j], '\">', substr(text.list$original.text, text.list$pos1[j], text.list$pos2[j]), '</font>')
      last.end <- text.list$pos2[j]
    }
  }
  
  html_output <- paste0(html_output, substr(text.list$original.text, last.end + 1, nchar(text.list$original.text)))
  
  return(html_output)
  
}
html_output <- Highlight_fun(text.list = processed_discharge_note, highlight.seq = pred_list[[1]][,1:raw_len,,])

cat(html_output, file = 'test.html')

Adenocarcinoma of stomach with peritoneal carcinomatosis and massive ascite, stage IV under bidirection chemotherapy (neoadjuvant intraperitoneal-systemic chemotherapy) with intraperitoneal paclitaxel 120mg (20151126, 20151201) and systemic with Oxalip (20151127) and oral XELOX.

Chronic kidney disease, stage V with pulmonary edema underwent emergent hemodialysis, status post arteriovenous graft creation with maintenance hemodialysis.

循環神經網絡(1)

– 在之前的任務中,我們是把輸入長度固定為某個數(以今天的例子是200),而不足的部分填入空白,這在Input的部分可能還行,那Output呢?因此我們需要新的結構來處理語言任務。