Deep Learning: Theory and Practice

林嶔 (Lin, Chin)

Lesson 14: Word Embedding Models and Their Deep Learning Applications

Section 1: The Bag-of-Words Model (1)

– Traditional language-processing algorithms therefore mostly rely on the bag-of-words model for feature extraction, an approach that has also achieved very good results in tasks such as spam classification:

F01

– This way of vectorizing is very similar to how categorical variables are handled, and it goes by another name: one-hot encoding.
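
– As a minimal sketch (the vocabulary and sentence below are illustrative, not taken from the dataset used later), counting how often each vocabulary word occurs turns a sentence into a fixed-length numeric vector:

vocab <- c("cancer", "disease", "kidney", "stage", "stomach")
sentence <- c("chronic", "kidney", "disease", "stage", "v")
bow_vector <- sapply(vocab, function (w) {sum(sentence == w)})
bow_vector # cancer 0, disease 1, kidney 1, stage 1, stomach 0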

Section 1: The Bag-of-Words Model (2)

– First, download from here all discharge summaries of the Tri-Service General Hospital from January 1 to January 31, 2016; these notes have already been given a preliminary label indicating whether they are "cancer" cases. Let's take a look at the data:

load('data/Discharge_Notes.RData')

cat(Discharge_Notes[10])
## Axis I : 
##  1. Other specified depressive disorder
##  2. Suspect adjustment disorder with depressed mood.
## Axis II:   deffered
## Axis III:  deferred
## Axis IV: Military stress
## Axis V:Global Assessment Functioning scale: 41-50
Cancer_code[10]
## [1] 0

Section 1: The Bag-of-Words Model (3)

– If you would like to learn how to process text, you can refer to the text-processing introduction in R語言程式設計導論 (Introduction to R Programming).

– Before using the function below, however, we need to download the two dictionary files my_DIC.aff and my_DIC.dic for spell correction:

library(hunspell)
library(magrittr)

Sys.setenv(DICPATH = "data")
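
– As a quick sanity check (a small sketch; "deffered" is the misspelling visible in note 10 above), hunspell can flag a word against the custom dictionary and propose a correction:

my_dict <- dictionary("my_DIC", cache = FALSE)
hunspell_check("deffered", dict = my_dict) # FALSE: not in the dictionary
hunspell_suggest("deffered", dict = my_dict)[[1]][1] # first suggestion ("differed" in this lesson's run)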

wrong2right <- function (text, spell.check = TRUE) {
  
  # Lower-case everything and replace line breaks with a run of five <S> separators
  RECODE = tolower(text)
  RECODE = RECODE %>% gsub("\n", "<S><S><S><S><S>", ., fixed = TRUE) %>% gsub("\r", "<S><S><S><S><S>", ., fixed = TRUE)

  # Replace punctuation, brackets, and slashes with <S>; treat periods as sentence breaks
  RECODE = RECODE %>% gsub("[ :,;-]", "<S>", .) %>% gsub("(", "<S>", ., fixed = TRUE) %>% gsub(")", "<S>", ., fixed = TRUE) %>% gsub("/", "<S>", ., fixed = TRUE)
  RECODE = strsplit(RECODE, split = ".", fixed = TRUE)[[1]]
  RECODE = paste(RECODE, collapse = "<S><S><S><S><S>")
  RECODE = gsub("(<S>){5,}", "<S><S><S><S><S>", RECODE) # collapse runs of five or more separators
  RECODE = gsub("(<S>)+$", "", RECODE) # drop trailing separators
  RECODE = strsplit(RECODE, split = "<S>", fixed = TRUE)[[1]]

  # Track each token's start/end character positions in the original text
  pos1 = rep(NA, length(RECODE))
  pos2 = rep(NA, length(RECODE))
  
  start.pos = 1
  sub.text = text
  
  for (i in 1:length(RECODE)) {
    if (RECODE[i]!="") {
      result = regexpr(RECODE[i], tolower(sub.text))
      if (result[1] != -1) {
        pos1[i] = start.pos + result[1] - 1
        pos2[i] = start.pos + result[1] + attr(result, "match.length") - 2
        start.pos = pos2[i] + 1
        sub.text = substr(sub.text, result[1] + attr(result, "match.length"), nchar(sub.text))
      }
    }
  }
  
  # Keep only letters, digits, and basic punctuation, then strip all digits
  # (mixed alphanumeric tokens lose their digits; pure-digit tokens become empty)
  RECODE = RECODE %>% gsub("[^A-Z0-9a-z .:,;-]", "", .) 
  RECODE[!(!grepl("[^a-z]", RECODE)|!grepl("[^0-9]", RECODE))] = RECODE[!(!grepl("[^a-z]", RECODE)|!grepl("[^0-9]", RECODE))] %>% gsub("[0-9]", "", .)
  RECODE = RECODE %>% gsub("[0-9]", "", .)
  
  if (spell.check) {
    SPELL_check = which(!hunspell::hunspell_check(RECODE, dict = hunspell::dictionary("my_DIC", cache = FALSE)))
    if (length(SPELL_check) > 0) {
      hist.num.space = 0
      for (k in 1:length(SPELL_check)) {
        RECODE[SPELL_check[k]] = hunspell::hunspell_suggest(RECODE[SPELL_check[k]], dict = hunspell::dictionary("my_DIC", cache = FALSE))[[1]][1]
        if (grepl(" ", RECODE[SPELL_check[k]], fixed = TRUE)) {
          num.space = length(gregexpr(" ", RECODE[SPELL_check[k]], fixed = TRUE)[[1]])
          len.pos1 = length(pos1)
          pos1 = c(pos1[1:(SPELL_check[k]+hist.num.space)], rep(NA, num.space - 1), pos1[(SPELL_check[k]+hist.num.space):len.pos1])
          pos2 = c(pos2[1:(SPELL_check[k]+hist.num.space)], rep(NA, num.space - 1), pos2[(SPELL_check[k]+hist.num.space):len.pos1])
          pos1[SPELL_check[k]+hist.num.space+1:num.space] = NA
          pos2[SPELL_check[k]+hist.num.space+0:(num.space-1)] = NA
          hist.num.space = hist.num.space + num.space
        }
      }
    }
    RECODE = RECODE %>% gsub(" ", "<S>", .)
    RECODE[nchar(RECODE) == 0] = "<A>"
    RECODE = strsplit(RECODE, split = "<S>", fixed = TRUE) %>% unlist
    RECODE[RECODE %in% "<A>"] <- ""
    RECODE = tolower(RECODE)
  }
  
  return(list(new.text = RECODE, original.text = text, pos1 = pos1, pos2 = pos2))
  
}
Result <- wrong2right(Discharge_Notes[10])
Result$new.text
##  [1] "axis"        "i"           ""            ""            ""           
##  [6] ""            ""            ""            ""            ""           
## [11] ""            ""            "other"       "specified"   "depressive" 
## [16] "disorder"    ""            ""            ""            ""           
## [21] ""            ""            ""            ""            ""           
## [26] "suspect"     "adjustment"  "disorder"    "with"        "depressed"  
## [31] "mood"        ""            ""            ""            ""           
## [36] "axis"        "ii"          ""            ""            "differed"   
## [41] ""            ""            ""            ""            "axis"       
## [46] "iii"         ""            "deferred"    ""            ""           
## [51] ""            ""            "axis"        "iv"          "military"   
## [56] "stress"      ""            ""            ""            ""           
## [61] "axis"        "global"      "assessment"  "functioning" "scale"      
## [66] ""            ""            ""

Section 1: The Bag-of-Words Model (4)

– The RData file loaded earlier already contains the result of applying this preprocessing to every note, stored as Processed_Notes:

Processed_Notes[[10]]
##  [1] "axis"        "i"           ""            ""            ""           
##  [6] ""            "other"       "specified"   "depressive"  "disorder"   
## [11] ""            ""            ""            ""            "suspect"    
## [16] "adjustment"  "disorder"    "with"        "depressed"   "mood"       
## [21] ""            ""            ""            ""            "axis"       
## [26] "ii"          ""            ""            ""            "differed"   
## [31] ""            ""            ""            ""            "axis"       
## [36] "iii"         ""            ""            "deferred"    ""           
## [41] ""            ""            ""            "axis"        "iv"         
## [46] ""            "military"    "stress"      ""            ""           
## [51] ""            ""            "axis"        "v"           "global"     
## [56] "assessment"  "functioning" "scale"
lvl.word <- unlist(Processed_Notes) %>% as.factor %>% levels
lvl.word <- lvl.word[-1] # drop the empty-string token
length(lvl.word)
## [1] 5874
tdm_array <- array(0L, dim = c(length(lvl.word), length(Processed_Notes))) # rows = terms, columns = documents

for (i in 1:length(Processed_Notes)) {
  term_table <- table(Processed_Notes[[i]])
  term_table <- term_table[names(term_table) %in% lvl.word]
  term_pos <- which(lvl.word %in% names(term_table)) # both vectors are alphabetically sorted, so positions align
  tdm_array[term_pos,i] <- term_table
}

dim(tdm_array)
## [1] 5874 4535
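
– A side note (a sketch, assuming the Matrix package is installed): this dense matrix is almost entirely zeros, so a sparse representation stores the same information far more compactly.

library(Matrix)

tdm_sparse <- Matrix(tdm_array, sparse = TRUE)
format(object.size(tdm_array), units = 'MB')  # dense storage
format(object.size(tdm_sparse), units = 'MB') # typically a small fraction of the dense size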

Exercise 1: Classifying Cancer Discharge Notes with a Bag-of-Words Model and a Multilayer Perceptron

– Now let's try applying a multilayer perceptron to discharge-note classification. Please train the model on the training-set data and evaluate its accuracy on the test-set data.

Train.X <- tdm_array[,1:3000]
Train.Y <- Cancer_code[1:3000]

Test.X <- tdm_array[,3001:length(Cancer_code)]
Test.Y <- Cancer_code[3001:length(Cancer_code)]

– In addition, to practice putting everything together, please also classify the following two raw text strings:

  1. Adenocarcinoma of stomach with peritoneal carcinomatosis and massive ascite, stage IV under bidirection chemotherapy (neoadjuvant intraperitoneal-systemic chemotherapy) with intraperitoneal paclitaxel 120mg (20151126, 20151201) and systemic with Oxalip (20151127) and oral XELOX.

  2. Chronic kidney disease, stage V with pulmonary edema underwent emergent hemodialysis, status post arteriovenous graft creation with maintenance hemodialysis.

Exercise 1 Answer (1)

library(mxnet)

# Custom iterator: serves one mini-batch (columns of Train.X) at a time
my_iterator_core <- function (batch_size) {
  
  batch = 0
  batch_per_epoch = length(Train.Y)/batch_size
  
  reset = function() {batch <<- 0}
  
  iter.next = function() {
    batch <<- batch+1
    if (batch > batch_per_epoch) {return(FALSE)} else {return(TRUE)}
  }
  
  value = function() {
    idx = 1:batch_size + (batch - 1) * batch_size
    idx[idx > ncol(Train.X)] = sample(1:ncol(Train.X), sum(idx > ncol(Train.X))) # pad the final incomplete batch with random samples
    data = mx.nd.array(array(Train.X[,idx], dim = c(nrow(Train.X), batch_size)))
    label = mx.nd.array(array(Train.Y[idx], dim = c(1, batch_size)))
    return(list(data = data, label = label))
  }
  
  return(list(reset = reset, iter.next = iter.next, value = value, batch_size = batch_size, batch = batch))
}

my_iterator_func <- setRefClass("Custom_Iter",
                                fields = c("iter", "batch_size"),
                                contains = "Rcpp_MXArrayDataIter",
                                methods = list(
                                  initialize = function(iter, batch_size = 100){
                                    .self$iter <- my_iterator_core(batch_size = batch_size)
                                    .self
                                  },
                                  value = function(){
                                    .self$iter$value()
                                  },
                                  iter.next = function(){
                                    .self$iter$iter.next()
                                  },
                                  reset = function(){
                                    .self$iter$reset()
                                  },
                                  finalize=function(){
                                  }
                                )
)

my_iter <- my_iterator_func(iter = NULL, batch_size = 20)
data <- mx.symbol.Variable(name = 'data')
label <- mx.symbol.Variable(name = 'label')

fc1 <- mx.symbol.FullyConnected(data = data, num.hidden = 50, name = 'fc1')
relu1 <- mx.symbol.Activation(data = fc1, act.type = 'relu', name = 'relu1')
fc2 <- mx.symbol.FullyConnected(data = relu1, num.hidden = 1, name = 'fc2')
logistic_pred <- mx.symbol.sigmoid(data = fc2, name = 'logistic_pred')

eps <- 1e-8
ce_loss_pos <-  mx.symbol.broadcast_mul(mx.symbol.log(logistic_pred + eps), label)
ce_loss_neg <-  mx.symbol.broadcast_mul(mx.symbol.log(1 - logistic_pred + eps), 1 - label)
ce_loss_mean <- 0 - mx.symbol.mean(ce_loss_pos + ce_loss_neg)
ce_loss <- mx.symbol.MakeLoss(ce_loss_mean, name = 'ce_loss')
my_optimizer <- mx.opt.create(name = "sgd", learning.rate = 0.05, momentum = 0.9, wd = 1e-4)

Exercise 1 Answer (2)

my.eval.metric.loss <- mx.metric.custom(
  name = "cross-entropy",
  function(label, pred) {
    return(as.array(pred))
  }
)

my_model <- mx.model.FeedForward.create(symbol = ce_loss, X = my_iter, optimizer = my_optimizer,
                                        array.batch.size = 20, ctx = mx.gpu(), num.round = 30,
                                        eval.metric = my.eval.metric.loss,
                                        batch.end.callback = mx.callback.log.speedometer(frequency = 50, batch.size = 20))
my_model$symbol <- logistic_pred
pred_y <- predict(my_model, Test.X)
library(pROC)

roc_result <- roc(Test.Y, pred_y)
plot(roc_result, col = "red", main = paste0('AUC = ', formatC(roc_result$auc, 3, format = 'f')))

Exercise 1 Answer (3)

my_discharge_note <- 'Adenocarcinoma of stomach with peritoneal carcinomatosis and massive ascite, stage IV under bidirection chemotherapy (neoadjuvant intraperitoneal-systemic chemotherapy) with intraperitoneal paclitaxel 120mg (20151126, 20151201) and systemic with Oxalip (20151127) and oral XELOX.'

processed_discharge_note <- wrong2right(my_discharge_note)$new.text

my_tdm_array <- array(0L, dim = c(length(lvl.word), 1))
term_table <- table(processed_discharge_note)
term_table <- term_table[names(term_table) %in% lvl.word]
term_pos <- which(lvl.word %in% names(term_table))
my_tdm_array[term_pos,] <- term_table
dim(my_tdm_array) <- c(length(lvl.word), 1)

predict(my_model, my_tdm_array, array.layout = 'colmajor')
##      [,1]
## [1,]    1
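
– The second note from the exercise can be classified through exactly the same pipeline (for a non-cancer note such as this one, the predicted probability should come out close to 0):

my_discharge_note_2 <- 'Chronic kidney disease, stage V with pulmonary edema underwent emergent hemodialysis, status post arteriovenous graft creation with maintenance hemodialysis.'

processed_discharge_note_2 <- wrong2right(my_discharge_note_2)$new.text

my_tdm_array_2 <- array(0L, dim = c(length(lvl.word), 1))
term_table_2 <- table(processed_discharge_note_2)
term_table_2 <- term_table_2[names(term_table_2) %in% lvl.word]
my_tdm_array_2[which(lvl.word %in% names(term_table_2)),] <- term_table_2

predict(my_model, my_tdm_array_2, array.layout = 'colmajor')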

Section 2: Word Embedding Models (1)

  1. Word order carries meaning, but think for a moment: do we have any way to preserve word order?

F02

  1. We could never build a dictionary complete enough to merge all synonyms and collect all phrases. You can then imagine that with the bag-of-words model we must produce an extremely long vector for every document, which is both wasteful and prone to overfitting.

  2. If a word that never appeared in the training samples shows up in a future application, the classifier will have no way to use that information.

– If we want to apply this breakthrough to other domains, we will have to find some way to link images with those domains, and for now the bag-of-words model does not look suitable for that.

– Although convolutional networks were originally designed for image recognition, as long as we can turn a "text description" into an "image" format, we can use a convolutional neural network for classification all the same.

Section 2: Word Embedding Models (2)

– One idea is to first convert words into vectors while making the vectors of similar words as close together as possible; this approach is called word embedding.

F03

Section 2: Word Embedding Models (3)

– So if we build a model that predicts the middle word from its context, then when you feed in "chronic" and "disease" and ask the network for a prediction, it will be unable to decide between "kidney" and "renal", and from this we can conclude that the two words have similar meanings.

F04

Section 2: Word Embedding Models (4)

library(magrittr)

load('data/Discharge_Notes.RData')

lvl.word <- unlist(Processed_Notes) %>% as.factor %>% levels
lvl.word <- lvl.word[-1]
word.index <- 1:length(lvl.word)
names(word.index) <- lvl.word

Input_list <- list()
Output_list <- list()

for (i in 1:length(Processed_Notes)) {
  
  Continuous_word <- Processed_Notes[[i]][grep('[a-z]', Processed_Notes[[i]])] # keep only real word tokens (drop the empty separators)
  
  if (length(Continuous_word) > 6) {
    
    Input_list[[i]] <- array(0L, dim = c(6, length(Continuous_word) - 6))
    Output_list[[i]] <- rep(0L, length(Continuous_word) - 6)
    
    for (k in 1:ncol(Input_list[[i]])) {
      
      Input_list[[i]][,k] <- word.index[Continuous_word[k:(5+k)]] # the six preceding words as context
      Output_list[[i]][k] <- word.index[Continuous_word[6+k]] # the following word as the prediction target
      
    }
    
  }
  
}

Input <- do.call('cbind', Input_list)
Output <- do.call('c', Output_list)
Input[,1:10]
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1613 2834  812 5816 1124 2342 1934 3658 1961  2851
## [2,] 2834  812 5816 1124 2342 1934 3658 1961 2851  5008
## [3,]  812 5816 1124 2342 1934 3658 1961 2851 5008  4166
## [4,] 5816 1124 2342 1934 3658 1961 2851 5008 4166  3764
## [5,] 1124 2342 1934 3658 1961 2851 5008 4166 3764  2342
## [6,] 2342 1934 3658 1961 2851 5008 4166 3764 2342  5435
Output[1:10]
##  [1] 1934 3658 1961 2851 5008 4166 3764 2342 5435 1613
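
– As a sanity check (a small sketch), the indices of the first training pair can be mapped back to words:

lvl.word[Input[,1]] # the six context words
lvl.word[Output[1]] # the word to be predicted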

Section 2: Word Embedding Models (5)

– The full one-hot matrices would be far too large to keep in memory, so we write a special iterator that generates the complete matrices only at run time:

library(mxnet)

my_iterator_core <- function (batch_size) {
  
  batch = 0
  batch_per_epoch = length(Output)/batch_size
  
  reset = function() {batch <<- 0}
  
  iter.next = function() {
    batch <<- batch+1
    if (batch > batch_per_epoch) {return(FALSE)} else {return(TRUE)}
  }
  
  value = function() {
    idx = 1:batch_size + (batch - 1) * batch_size
    idx[idx > ncol(Input)] = sample(1:ncol(Input), sum(idx > ncol(Input)))
    data = array(0L, dim = c(length(lvl.word), batch_size))  # multi-hot encoding of the six context words
    label = array(0L, dim = c(length(lvl.word), batch_size)) # one-hot encoding of the target word
    for (i in 1:batch_size) {
      data[Input[,idx[i]],i] <- 1L
      label[Output[idx[i]],i] <- 1L
    }
    data = mx.nd.array(data)
    label = mx.nd.array(label)
    return(list(data = data, label = label))
  }
  
  return(list(reset = reset, iter.next = iter.next, value = value, batch_size = batch_size, batch = batch))
}

my_iterator_func <- setRefClass("Custom_Iter",
                                fields = c("iter", "batch_size"),
                                contains = "Rcpp_MXArrayDataIter",
                                methods = list(
                                  initialize = function(iter, batch_size = 100){
                                    .self$iter <- my_iterator_core(batch_size = batch_size)
                                    .self
                                  },
                                  value = function(){
                                    .self$iter$value()
                                  },
                                  iter.next = function(){
                                    .self$iter$iter.next()
                                  },
                                  reset = function(){
                                    .self$iter$reset()
                                  },
                                  finalize=function(){
                                  }
                                )
)

my_iter <- my_iterator_func(iter = NULL, batch_size = 100)

Section 2: Word Embedding Models (6)

data <- mx.symbol.Variable(name = 'data')
label <- mx.symbol.Variable(name = 'label')

fc1 <- mx.symbol.FullyConnected(data = data, num.hidden = 50, no.bias = TRUE, name = 'fc1')
fc2 <- mx.symbol.FullyConnected(data = fc1, num.hidden = length(lvl.word), no.bias = TRUE, name = 'fc2')
softmax_layer <- mx.symbol.SoftmaxOutput(data = fc2, label = label, name = 'softmax_layer')

eps <- 1e-8
m_log <- 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label))
m_logloss <- mx.symbol.MakeLoss(m_log, name = 'm_logloss')
my_optimizer <- mx.opt.create(name = "sgd", learning.rate = 5e-3, momentum = 0, wd = 0)
my.eval.metric.loss <- mx.metric.custom(
  name = "m-log-loss",
  function(label, pred) {
    return(as.array(pred))
  }
)

word2vec_model <- mx.model.FeedForward.create(symbol = m_logloss, X = my_iter, optimizer = my_optimizer,
                                              array.batch.size = 100, ctx = mx.gpu(), num.round = 5,
                                              eval.metric = my.eval.metric.loss,
                                              batch.end.callback = mx.callback.log.speedometer(frequency = 200, batch.size = 100))

Section 2: Word Embedding Models (7)

word2vec_matrix <- t(as.array(word2vec_model$arg.params$fc2_weight)) # each row is now the learned 50-dimensional vector of one word

– We can measure how similar two word vectors are by the cosine of the angle between them:

\[\cos \theta = \frac{\vec{\mbox{vec}_1} \cdot \vec{\mbox{vec}_2}}{|\vec{\mbox{vec}_1}| \cdot |\vec{\mbox{vec}_2}|} \]

– The R implementation is shown below; you will find that the words closest to "cancer" match expectations remarkably well!

interested_pos <- word.index['cancer']
cos_similarity <- rep(0, length(lvl.word))
names(cos_similarity) <- lvl.word

for (i in 1:length(lvl.word)) {
  
  cos_similarity[i] <- word2vec_matrix[interested_pos,] %*% word2vec_matrix[i,] / sqrt(sum(word2vec_matrix[interested_pos,]^2)) / sqrt(sum(word2vec_matrix[i,]^2))
  
}

sort(cos_similarity, decreasing = TRUE) %>% head
##         cancer        adenoma         lesion adenocarcinoma        tubular 
##      1.0000000      0.9146292      0.8586472      0.8548079      0.8435445 
##          large 
##      0.8340912
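
– The same ranking can be computed without a loop (a sketch: normalize every row once, then a single matrix product yields all cosine similarities at the same time):

norm_matrix <- word2vec_matrix / sqrt(rowSums(word2vec_matrix^2))
cos_similarity <- as.vector(norm_matrix %*% norm_matrix[interested_pos,])
names(cos_similarity) <- lvl.word
sort(cos_similarity, decreasing = TRUE) %>% head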

Section 2: Word Embedding Models (8)

– This embedding matrix will ultimately be able to capture more reliable relationships between words, and it also allows a much larger vocabulary to be accommodated.

Section 3: Analyzing Text as Images (1)

– To get through the rest of the lesson faster, a word-embedding matrix trained on all articles of the English Wikipedia has already been prepared; click here to download it!

library(magrittr)
library(dplyr)
library(plyr)
library(data.table)

word.data <- fread('data/wiki.txt', header = FALSE, showProgress = FALSE)

words.ref <- word.data %>% select(V1) %>% setDF %>% .[,1] %>% as.character
words.ref <- c("", words.ref)
words.index <- 1:length(words.ref)
names(words.index) <- words.ref

words.matrix <- word.data %>% select(-V1) %>% setDF %>% as.matrix
words.matrix <- rbind(rep(0, 50), words.matrix) # prepend a zero vector as the blank token
load('data/Discharge_Notes.RData')

words_pos <- words.index[Processed_Notes[[1]]]
words_pos[is.na(words_pos)] <- 1
text_img.array <- t(words.matrix[words_pos,])
dim(text_img.array)
## [1]  50 301
library(imager)

par(mai = rep(0, 4))
img <- text_img.array
img[img > 2] <- 2   # clip values to [-2, 2] for display
img[img < -2] <- -2
plot(as.cimg(t(img)))

Section 3: Analyzing Text as Images (2)

– What about notes of unequal length? There is no particularly good solution: we simply take the longest note as the standard and pad the rest with blanks (zero vectors):

max.length <- sapply(Processed_Notes, length) %>% max()
img_array <- array(0, dim = c(50, max.length, length(Processed_Notes)))

pb <- txtProgressBar(max = length(Processed_Notes), style = 3)

for (i in 1:length(Processed_Notes)) {
  
  words_pos <- words.index[Processed_Notes[[i]]]
  words_pos[is.na(words_pos)] <- 1
  img_array[,1:length(words_pos),i] <- t(words.matrix[words_pos,])
  setTxtProgressBar(pb, i)
  
}

close(pb)
dim(img_array)
## [1]   50  494 4535

Section 3: Analyzing Text as Images (3)

Train.img_array <- img_array[,,1:3000]
Train.Y <- Cancer_code[1:3000]

Test.img_array <- img_array[,,3001:length(Cancer_code)]
dim(Test.img_array) <- c(dim(Test.img_array)[1:2], 1, dim(Test.img_array)[3])
Test.Y <- Cancer_code[3001:length(Cancer_code)]
library(mxnet)

my_iterator_core <- function (batch_size) {
  
  batch = 0
  batch_per_epoch = length(Train.Y)/batch_size
  
  reset = function() {batch <<- 0}
  
  iter.next = function() {
    batch <<- batch+1
    if (batch > batch_per_epoch) {return(FALSE)} else {return(TRUE)}
  }
  
  value = function() {
    idx = 1:batch_size + (batch - 1) * batch_size
    idx[idx > length(Train.Y)] = sample(1:length(Train.Y), sum(idx > length(Train.Y)))
    data = mx.nd.array(array(Train.img_array[,,idx], dim = c(dim(Train.img_array)[1:2], 1, batch_size)))
    label = mx.nd.array(array(Train.Y[idx], dim = c(1, batch_size)))
    return(list(data = data, label = label))
  }
  
  return(list(reset = reset, iter.next = iter.next, value = value, batch_size = batch_size, batch = batch))
}

my_iterator_func <- setRefClass("Custom_Iter",
                                fields = c("iter", "batch_size"),
                                contains = "Rcpp_MXArrayDataIter",
                                methods = list(
                                  initialize = function(iter, batch_size = 100){
                                    .self$iter <- my_iterator_core(batch_size = batch_size)
                                    .self
                                  },
                                  value = function(){
                                    .self$iter$value()
                                  },
                                  iter.next = function(){
                                    .self$iter$iter.next()
                                  },
                                  reset = function(){
                                    .self$iter$reset()
                                  },
                                  finalize=function(){
                                  }
                                )
)

my_iter <- my_iterator_func(iter = NULL, batch_size = 20)
my_iter$reset()
my_iter$iter.next()
## [1] TRUE
test_data <- my_iter$value()
dim(test_data$data)
## [1]  50 494   1  20

Section 3: Analyzing Text as Images (4)

– Let's see how to write the model architecture:

data <- mx.symbol.Variable(name = 'data')
label <- mx.symbol.Variable(name = 'label')

conv1 <- mx.symbol.Convolution(data = data, kernel = c(50, 1), num_filter = 50, name = 'conv1')
relu1 <- mx.symbol.Activation(data = conv1, act.type = 'relu', name = 'relu1')
pool1 <- mx.symbol.max(data = relu1, axis = 2:3, keepdims = FALSE, name = 'pool1')

fc1 <- mx.symbol.FullyConnected(data = pool1, num.hidden = 1, name = 'fc1')
logistic_pred <- mx.symbol.sigmoid(data = fc1, name = 'logistic_pred')

eps <- 1e-8
ce_loss_pos <-  mx.symbol.broadcast_mul(mx.symbol.log(logistic_pred + eps), label)
ce_loss_neg <-  mx.symbol.broadcast_mul(mx.symbol.log(1 - logistic_pred + eps), 1 - label)
ce_loss_mean <- 0 - mx.symbol.mean(ce_loss_pos + ce_loss_neg)
ce_loss <- mx.symbol.MakeLoss(ce_loss_mean, name = 'ce_loss')
my_optimizer <- mx.opt.create(name = "sgd", learning.rate = 0.05, momentum = 0.9, wd = 1e-4)
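
– As a sanity check (a sketch, using the mx.symbol.infer.shape tool that reappears in Section 4), we can confirm that the c(50, 1) kernel spans the whole embedding axis and that the max-pooling leaves one 50-dimensional feature vector per note:

mx.symbol.infer.shape(pool1, data = c(50, 494, 1, 20))$out.shapes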

Section 3: Analyzing Text as Images (5)

my.eval.metric.loss <- mx.metric.custom(
  name = "cross-entropy",
  function(label, pred) {
    return(as.array(pred))
  }
)

my_model <- mx.model.FeedForward.create(symbol = ce_loss, X = my_iter, optimizer = my_optimizer,
                                        array.batch.size = 20, ctx = mx.gpu(), num.round = 30,
                                        eval.metric = my.eval.metric.loss,
                                        batch.end.callback = mx.callback.log.speedometer(frequency = 50, batch.size = 20))
my_model$symbol <- logistic_pred
pred_y <- predict(my_model, Test.img_array)
library(pROC)

roc_result <- roc(Test.Y, pred_y)
plot(roc_result, col = "red", main = paste0('AUC = ', formatC(roc_result$auc, 3, format = 'f')))

– Moreover, such a model has far fewer parameters, so it is also less likely to suffer from overfitting!
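
– To make this concrete, a sketch that sums the shapes of every learnable argument of the convolutional classifier above (for comparison, the perceptron in Exercise 1 holds a 5874 × 50 weight matrix in its first layer alone):

arg_shapes <- mx.symbol.infer.shape(logistic_pred, data = c(50, 494, 1, 20))$arg.shapes
sum(sapply(arg_shapes[names(arg_shapes) != 'data'], prod))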

Exercise 2: Predicting with Phrases and Short Sentences

– Let's look at Chin Lin et al.'s paper "Artificial Intelligence Learning Semantics via External Resources for Classifying Diagnosis Codes in Discharge Notes" to see how the model architecture was designed:

F05

Exercise 2 Answer

data <- mx.symbol.Variable(name = 'data')
label <- mx.symbol.Variable(name = 'label')

concat_lst <- NULL
filter_sizes <- 1:5 # filter widths: from single words up to 5-word phrases
num_filter <- c(40, 30, 15, 10, 5) # number of filters per width
  
for (i in 1:length(filter_sizes)) {
  
  convi <- mx.symbol.Convolution(data = data, kernel = c(50, filter_sizes[i]), num_filter = num_filter[i], name = paste0('conv', i))
  relui <- mx.symbol.Activation(data = convi, act_type = "relu", name = paste0('relu', i))
  pooli <- mx.symbol.max(data = relui, axis = 2:3, keepdims = FALSE, name = paste0('pool', i))
  concat_lst <- append(concat_lst, pooli)
  
}
  
h_pool <- mx.symbol.concat(data = concat_lst, num.args = length(filter_sizes), dim = 1, name = 'h_pool')
h_drop <- mx.symbol.Dropout(data = h_pool, p = 0.5, name = 'h_drop')
fc1 <- mx.symbol.FullyConnected(data = h_drop, num.hidden = 1, name = 'fc1')
logistic_pred <- mx.symbol.sigmoid(data = fc1, name = 'logistic_pred')

eps <- 1e-8
ce_loss_pos <-  mx.symbol.broadcast_mul(mx.symbol.log(logistic_pred + eps), label)
ce_loss_neg <-  mx.symbol.broadcast_mul(mx.symbol.log(1 - logistic_pred + eps), 1 - label)
ce_loss_mean <- 0 - mx.symbol.mean(ce_loss_pos + ce_loss_neg)
ce_loss <- mx.symbol.MakeLoss(ce_loss_mean, name = 'ce_loss')
my.eval.metric.loss <- mx.metric.custom(
  name = "cross-entropy",
  function(label, pred) {
    return(as.array(pred))
  }
)

my_model <- mx.model.FeedForward.create(symbol = ce_loss, X = my_iter, optimizer = my_optimizer,
                                        array.batch.size = 20, ctx = mx.gpu(), num.round = 30,
                                        eval.metric = my.eval.metric.loss,
                                        batch.end.callback = mx.callback.log.speedometer(frequency = 50, batch.size = 20))
my_model$symbol <- logistic_pred
pred_y <- predict(my_model, Test.img_array)
roc_result <- roc(Test.Y, pred_y)
plot(roc_result, col = "red", main = paste0('AUC = ', formatC(roc_result$auc, 3, format = 'f')))

Section 4: Embedding Words inside the Model (1)

– First, as before, read in the wiki embedding matrix and prepend a row of zeros as the blank token:

library(magrittr)
library(dplyr)
library(plyr)
library(data.table)

load('data/Discharge_Notes.RData')

word.data <- fread('data/wiki.txt', header = FALSE, showProgress = FALSE)

words.ref <- word.data %>% select(V1) %>% setDF %>% .[,1] %>% as.character
words.ref <- c("", words.ref)
words.index <- 1:length(words.ref)
names(words.index) <- words.ref

words.matrix <- word.data %>% select(-V1) %>% setDF %>% as.matrix
words.matrix <- rbind(rep(0, 50), words.matrix)

– Next we assemble the data, this time storing only the index positions (we will let MXNet look them up in the embedding table itself). Be very careful: MXNet indexes from 0, not from 1 as R does, so we must subtract 1 from every index to meet MXNet's requirements:

max.length <- sapply(Processed_Notes, length) %>% max()
seq_array <- array(0, dim = c(max.length, length(Processed_Notes)))

pb <- txtProgressBar(max = length(Processed_Notes), style = 3)

for (i in 1:length(Processed_Notes)) {
  
  words_pos <- words.index[Processed_Notes[[i]]]
  words_pos[is.na(words_pos)] <- 1
  seq_array[1:length(words_pos),i] <- words_pos - 1 # very important
  setTxtProgressBar(pb, i)
  
}

close(pb)

Section 4: Embedding Words inside the Model (2)

Train.seq_array <- seq_array[,1:3000]
Train.Y <- Cancer_code[1:3000]

Test.seq_array <- seq_array[,3001:length(Cancer_code)]
dim(Test.seq_array) <- c(max.length, 1, dim(Test.seq_array)[2])
Test.Y <- Cancer_code[3001:length(Cancer_code)]
library(mxnet)

my_iterator_core <- function (batch_size) {
  
  batch = 0
  batch_per_epoch = length(Train.Y)/batch_size
  
  reset = function() {batch <<- 0}
  
  iter.next = function() {
    batch <<- batch+1
    if (batch > batch_per_epoch) {return(FALSE)} else {return(TRUE)}
  }
  
  value = function() {
    idx = 1:batch_size + (batch - 1) * batch_size
    idx[idx > length(Train.Y)] = sample(1:length(Train.Y), sum(idx > length(Train.Y)))
    data = mx.nd.array(array(Train.seq_array[,idx], dim = c(dim(Train.seq_array)[1], 1, batch_size)))
    label = mx.nd.array(array(Train.Y[idx], dim = c(1, batch_size)))
    return(list(data = data, label = label))
  }
  
  return(list(reset = reset, iter.next = iter.next, value = value, batch_size = batch_size, batch = batch))
}

my_iterator_func <- setRefClass("Custom_Iter",
                                fields = c("iter", "batch_size"),
                                contains = "Rcpp_MXArrayDataIter",
                                methods = list(
                                  initialize = function(iter, batch_size = 100){
                                    .self$iter <- my_iterator_core(batch_size = batch_size)
                                    .self
                                  },
                                  value = function(){
                                    .self$iter$value()
                                  },
                                  iter.next = function(){
                                    .self$iter$iter.next()
                                  },
                                  reset = function(){
                                    .self$iter$reset()
                                  },
                                  finalize=function(){
                                  }
                                )
)

my_iter <- my_iterator_func(iter = NULL, batch_size = 20)

Section 4: Embedding Words inside the Model (3)

data <- mx.symbol.Variable(name = 'data')
label <- mx.symbol.Variable(name = 'label')

embed1 <- mx.symbol.Embedding(data = data, input.dim = length(words.ref), output.dim = 50, name = 'embed1')

concat_lst <- NULL
filter_sizes <- 1:5
num_filter <- c(40, 30, 15, 10, 5)

for (i in 1:length(filter_sizes)) {
  
  convi <- mx.symbol.Convolution(data = embed1, kernel = c(50, filter_sizes[i]), num_filter = num_filter[i], name = paste0('conv', i))
  relui <- mx.symbol.Activation(data = convi, act_type = "relu", name = paste0('relu', i))
  pooli <- mx.symbol.max(data = relui, axis = 2:3, keepdims = FALSE, name = paste0('pool', i))
  concat_lst <- append(concat_lst, pooli)
  
}

h_pool <- mx.symbol.concat(data = concat_lst, num.args = length(filter_sizes), dim = 1, name = 'h_pool')
h_drop <- mx.symbol.Dropout(data = h_pool, p = 0.5, name = 'h_drop')
fc1 <- mx.symbol.FullyConnected(data = h_drop, num.hidden = 1, name = 'fc1')
logistic_pred <- mx.symbol.sigmoid(data = fc1, name = 'logistic_pred')

eps <- 1e-8
ce_loss_pos <-  mx.symbol.broadcast_mul(mx.symbol.log(logistic_pred + eps), label)
ce_loss_neg <-  mx.symbol.broadcast_mul(mx.symbol.log(1 - logistic_pred + eps), 1 - label)
ce_loss_mean <- 0 - mx.symbol.mean(ce_loss_pos + ce_loss_neg)
ce_loss <- mx.symbol.MakeLoss(ce_loss_mean, name = 'ce_loss')

– Check it with mx.symbol.infer.shape:

mx.symbol.infer.shape(embed1, data = c(494, 1, 20))$out.shapes
## $embed1_output
## [1]  50 494   1  20
my_optimizer <- mx.opt.create(name = "sgd", learning.rate = 0.05, momentum = 0.9, wd = 1e-4)

Section 4: Embedding Words inside the Model (4)

new_arg <- mxnet:::mx.model.init.params(symbol = logistic_pred, 
                                        input.shape = list(data = c(494, 1, 20)), 
                                        output.shape = NULL, initializer = mx.init.Xavier(rnd_type = "uniform", magnitude = 2.24), 
                                        ctx = mx.gpu())

new_arg$arg.params$embed1_weight <- mx.nd.array(t(words.matrix)) # load the pre-trained wiki embeddings into the Embedding layer

my.eval.metric.loss <- mx.metric.custom(
  name = "cross-entropy",
  function(label, pred) {
    return(as.array(pred))
  }
)

my_model <- mx.model.FeedForward.create(symbol = ce_loss, X = my_iter, optimizer = my_optimizer,
                                        array.batch.size = 20, ctx = mx.gpu(), num.round = 30,
                                        eval.metric = my.eval.metric.loss,
                                        arg.params = new_arg$arg.params, fixed.param = 'embed1_weight',
                                        batch.end.callback = mx.callback.log.speedometer(frequency = 50, batch.size = 20))
my_model$symbol <- logistic_pred
my_model$arg.params <- append(my_model$arg.params, new_arg$arg.params['embed1_weight'])
pred_y <- predict(my_model, Test.seq_array)
library(pROC)

roc_result <- roc(Test.Y, pred_y)
plot(roc_result, col = "red", main = paste0('AUC = ', formatC(roc_result$auc, 3, format = 'f')))

Section 4: Embedding Words inside the Model (5)

– But the wiki vocabulary contains more than 340,000 words, while this month's notes contain only about 5,000. This means that even if we allow the model to update the embedding weights during training, only the weights of those 5,000-odd words will change.

– Worse still, suppose the notes contain the word H5N1 but not H1N1. Their vectors start out very close; might they drift apart after some training?

F06

data <- mx.symbol.Variable(name = 'data')
label <- mx.symbol.Variable(name = 'label')

embed1 <- mx.symbol.Embedding(data = data, input.dim = length(words.ref), output.dim = 50, name = 'embed1')
embed2 <- mx.symbol.Convolution(data = embed1, kernel = c(50, 1), num_filter = 50, no.bias = TRUE, name = 'embed2')

concat_lst <- NULL
filter_sizes <- 1:5
num_filter <- c(40, 30, 15, 10, 5)

for (i in 1:length(filter_sizes)) {
  
  convi <- mx.symbol.Convolution(data = embed2, kernel = c(1, filter_sizes[i]), num_filter = num_filter[i], name = paste0('conv', i))
  relui <- mx.symbol.Activation(data = convi, act_type = "relu", name = paste0('relu', i))
  pooli <- mx.symbol.max(data = relui, axis = 2:3, keepdims = FALSE, name = paste0('pool', i))
  concat_lst <- append(concat_lst, pooli)
  
}

h_pool <- mx.symbol.concat(data = concat_lst, num.args = length(filter_sizes), dim = 1, name = 'h_pool')
h_drop <- mx.symbol.Dropout(data = h_pool, p = 0.5, name = 'h_drop')
fc1 <- mx.symbol.FullyConnected(data = h_drop, num.hidden = 1, name = 'fc1')
logistic_pred <- mx.symbol.sigmoid(data = fc1, name = 'logistic_pred')

eps <- 1e-8
ce_loss_pos <-  mx.symbol.broadcast_mul(mx.symbol.log(logistic_pred + eps), label)
ce_loss_neg <-  mx.symbol.broadcast_mul(mx.symbol.log(1 - logistic_pred + eps), 1 - label)
ce_loss_mean <- 0 - mx.symbol.mean(ce_loss_pos + ce_loss_neg)
ce_loss <- mx.symbol.MakeLoss(ce_loss_mean, name = 'ce_loss')

– Note the change in dimensions:

mx.symbol.infer.shape(embed1, data = c(494, 1, 20))$out.shapes
## $embed1_output
## [1]  50 494   1  20
mx.symbol.infer.shape(embed2, data = c(494, 1, 20))$out.shapes
## $embed2_output
## [1]   1 494  50  20

Closing Remarks

– In more complex tasks, however, we cannot keep using one-hot encoding if we want to preserve word order. Word embedding has therefore become the mainstream approach to processing text, and the convolutional architecture built on top of it solves the synonym and phrase problems very elegantly, delivering large performance gains over the traditional bag-of-words model on many complex tasks.

– Note also that word embeddings admit many variations. Whether to allow the embedding weights to be updated depends on your task; the corpus used to train the embeddings (wiki vs. PubMed) also affects the results; and we can even build a model that composes each word's vector from its letters. These variations are one of the hot topics of current research!

– In natural-language tasks we keep stressing that word order is meaningful, so we need a structure that can capture features while recording contextual information. Next week we will introduce this structure with memory and apply it to our task to see whether performance improves.