詞嵌入模型與其深度學習應用

林嶔 (Lin, Chin)

Lesson 13

詞袋模型(1)

– 因此,傳統的語言模型演算法大多透過詞袋模型(Bag-of-words model)的方法進行特徵萃取,而這個方法在像是垃圾郵件分類等任務中也取得非常好的成績:

F13_1

– 這種向量化的方式非常類似於類別變項的處理,也有另外一個稱呼叫做獨熱編碼(one hot encoding)。

詞袋模型(2)

– 我們先到這裡下載三軍總醫院從105年1月1日至105年1月31日所有的出院病歷摘要,這些病例已經初步被標記是否為「癌症」病例,我們看看Data的樣子:

# Load the discharge notes (Discharge_Notes, a character vector of raw
# note texts) and their binary cancer labels (Cancer_code) for this lesson.
load('data/Discharge_Notes.RData')

# Inspect one raw note and its label.
cat(Discharge_Notes[10])
## Axis I : 
##  1. Other specified depressive disorder
##  2. Suspect adjustment disorder with depressed mood.
## Axis II:   deffered
## Axis III:  deferred
## Axis IV: Military stress
## Axis V:Global Assessment Functioning scale: 41-50
Cancer_code[10]
## [1] 0

詞袋模型(3)

– 如果你想學習如何處理文字,你可以參考一下R語言程式設計導論:文字處理簡介的部分

– 但在使用這個函數之前,我們需要先下載my_DIC.aff以及my_DIC.dic兩個字典檔案作為我們拼寫校正之用:

library(hunspell)
library(magrittr)

# hunspell resolves custom dictionaries (my_DIC.aff / my_DIC.dic) via the
# DICPATH environment variable; point it at the data directory.
Sys.setenv(DICPATH = "data")

# Tokenize one English clinical note and optionally spell-correct it.
#
# Pipeline: lower-case the text, turn line breaks / punctuation into the
# sentinel separator "<S>", split into tokens, record each token's
# character span in the original text, strip non-alphanumeric noise, and
# (if spell.check) replace every token rejected by the custom hunspell
# dictionary "my_DIC" with hunspell's first suggestion.
#
# Args:
#   text        - a single character string (one discharge note).
#   spell.check - run hunspell correction on the tokens? (default TRUE)
#
# Returns a list:
#   new.text      - character vector of lower-cased tokens; "" marks
#                   empty separator slots.
#   original.text - the unmodified input.
#   pos1, pos2    - start/end character positions of each token in the
#                   original text (NA where a token could not be located,
#                   or for tokens created by splitting a multi-word
#                   spelling suggestion).
wrong2right <- function (text, spell.check = TRUE) {
  
  RECODE = tolower(text)
  # Line breaks count as "hard" separators: five <S> in a row.
  RECODE = RECODE %>% gsub("\n", "<S><S><S><S><S>", ., fixed = TRUE) %>% gsub("\r", "<S><S><S><S><S>", ., fixed = TRUE)

  # Ordinary punctuation becomes a single <S>; sentence-ending periods
  # become hard (5x) separators via split-and-rejoin.
  RECODE = RECODE %>% gsub("[ :,;-]", "<S>", .) %>% gsub("(", "<S>", ., fixed = TRUE) %>% gsub(")", "<S>", ., fixed = TRUE)%>% gsub("/", "<S>", ., fixed = TRUE) 
  RECODE = strsplit(RECODE, split = ".", fixed = TRUE)[[1]]
  RECODE = paste(RECODE, collapse = "<S><S><S><S><S>")
  # Collapse runs of five-or-more <S> to exactly five, then trim trailing
  # separators.
  # NOTE(review): the space inside "{5, }" may keep the interval
  # quantifier from working as intended in base R's regex engine - verify.
  RECODE = gsub("(<S>){5, }", "<S><S><S><S><S>", RECODE)
  RECODE = gsub("(<S>)+$", "", RECODE)
  RECODE = strsplit(RECODE, split = "<S>", fixed = TRUE)[[1]]

  # Locate each token's character span in the original text by a single
  # left-to-right scan; sub.text shrinks as tokens are consumed so earlier
  # occurrences are not re-matched.
  pos1 = rep(NA, length(RECODE))
  pos2 = rep(NA, length(RECODE))
  
  start.pos = 1
  sub.text = text
  
  for (i in 1:length(RECODE)) {
    if (RECODE[i]!="") {
      # NOTE(review): the token is used as a regex pattern here; tokens
      # containing regex metacharacters could mis-match - confirm the
      # upstream cleaning makes this safe.
      result = regexpr(RECODE[i], tolower(sub.text))
      if (result[1] != -1) {
        pos1[i] = start.pos + result[1] - 1
        pos2[i] = start.pos + result[1] + attr(result, "match.length") - 2
        start.pos = pos2[i] + 1
        sub.text = substr(sub.text, result[1] + attr(result, "match.length"), nchar(sub.text))
      }
    }
  }
  
  # Drop characters outside [A-Za-z0-9 .:,;-]; strip digits from tokens
  # that mix letters and digits; finally remove all remaining digits, so
  # pure-number tokens become "".
  RECODE = RECODE %>% gsub("[^A-Z0-9a-z .:,;-]", "", .) 
  RECODE[!(!grepl("[^a-z]", RECODE)|!grepl("[^0-9]", RECODE))] = RECODE[!(!grepl("[^a-z]", RECODE)|!grepl("[^0-9]", RECODE))] %>% gsub("[0-9]", "", .)
  RECODE = RECODE %>% gsub("[0-9]", "", .)
  
  if (spell.check) {
    # Tokens the dictionary rejects get replaced by hunspell's first
    # suggestion. (hunspell::: reaches exported functions through the
    # internal namespace; hunspell:: would suffice.)
    SPELL_check = which(!hunspell:::hunspell_check(RECODE, dict = hunspell:::dictionary("my_DIC", cache = FALSE)))
    if (length(SPELL_check) > 0) {
      hist.num.space = 0
      for (k in 1:length(SPELL_check)) {
        RECODE[SPELL_check[k]] = hunspell:::hunspell_suggest(RECODE[SPELL_check[k]], dict = hunspell:::dictionary("my_DIC", cache = FALSE))[[1]][1]
        # A multi-word suggestion turns one token into several; widen
        # pos1/pos2 with NA slots so indices keep lining up with tokens.
        # hist.num.space tracks the cumulative shift from earlier splits.
        if (grepl(" ", RECODE[SPELL_check[k]], fixed = TRUE)) {
          num.space = length(gregexpr(" ", RECODE[SPELL_check[k]], fixed = TRUE)[[1]])
          len.pos1 = length(pos1)
          pos1 = c(pos1[1:(SPELL_check[k]+hist.num.space)], rep(NA, num.space - 1), pos1[(SPELL_check[k]+hist.num.space):len.pos1])
          pos2 = c(pos2[1:(SPELL_check[k]+hist.num.space)], rep(NA, num.space - 1), pos2[(SPELL_check[k]+hist.num.space):len.pos1])
          pos1[SPELL_check[k]+hist.num.space+1:num.space] = NA
          pos2[SPELL_check[k]+hist.num.space+0:(num.space-1)] = NA
          hist.num.space = hist.num.space + num.space
        }
      }
    }
    # Split multi-word suggestions into separate tokens, protecting
    # genuinely empty slots with the placeholder <A> so they survive the
    # split, then restoring them to "".
    RECODE = RECODE %>% gsub(" ", "<S>", .)
    RECODE[nchar(RECODE) == 0] = "<A>"
    RECODE = strsplit(RECODE, split = "<S>", fixed = TRUE) %>% unlist
    RECODE[RECODE %in% "<A>"] <- ""
    RECODE = tolower(RECODE)
  }
  
  return(list(new.text = RECODE, original.text = text, pos1 = pos1, pos2 = pos2))
  
}
# Run the tokenizer end-to-end on one note and inspect the token stream
# ("" entries are separator slots).
Result <- wrong2right(Discharge_Notes[10])
Result$new.text
##  [1] "axis"        "i"           ""            ""            ""           
##  [6] ""            ""            ""            ""            ""           
## [11] ""            ""            "other"       "specified"   "depressive" 
## [16] "disorder"    ""            ""            ""            ""           
## [21] ""            ""            ""            ""            ""           
## [26] "suspect"     "adjustment"  "disorder"    "with"        "depressed"  
## [31] "mood"        ""            ""            ""            ""           
## [36] "axis"        "ii"          ""            ""            "differed"   
## [41] ""            ""            ""            ""            "axis"       
## [46] "iii"         ""            "deferred"    ""            ""           
## [51] ""            ""            "axis"        "iv"          "military"   
## [56] "stress"      ""            ""            ""            ""           
## [61] "axis"        "global"      "assessment"  "functioning" "scale"      
## [66] ""            ""            ""

詞袋模型(4)

# The pre-processed token stream for the same note as shipped in the
# RData file (slightly different separator handling than the live run).
Processed_Notes[[10]]
##  [1] "axis"        "i"           ""            ""            ""           
##  [6] ""            "other"       "specified"   "depressive"  "disorder"   
## [11] ""            ""            ""            ""            "suspect"    
## [16] "adjustment"  "disorder"    "with"        "depressed"   "mood"       
## [21] ""            ""            ""            ""            "axis"       
## [26] "ii"          ""            ""            ""            "differed"   
## [31] ""            ""            ""            ""            "axis"       
## [36] "iii"         ""            ""            "deferred"    ""           
## [41] ""            ""            ""            "axis"        "iv"         
## [46] ""            "military"    "stress"      ""            ""           
## [51] ""            ""            "axis"        "v"           "global"     
## [56] "assessment"  "functioning" "scale"
# Vocabulary: the distinct tokens across every document.  Factor levels
# are sorted, and the first level is the empty-string separator, so it
# is dropped.
lvl.word <- unlist(Processed_Notes) %>% as.factor %>% levels
lvl.word <- lvl.word[-1]
length(lvl.word)
## [1] 5874

# Term-document matrix (bag of words): rows = vocabulary words,
# columns = documents, entries = raw term counts.
tdm_array <- array(0L, dim = c(length(lvl.word), length(Processed_Notes)))

for (doc in seq_along(Processed_Notes)) {
  counts <- table(Processed_Notes[[doc]])
  rows <- match(names(counts), lvl.word)  # vocabulary row of each counted token
  keep <- !is.na(rows)                    # discard tokens outside the vocabulary (e.g. "")
  tdm_array[rows[keep], doc] <- counts[keep]
}

dim(tdm_array)
## [1] 5874 4535

練習1:利用詞袋模型與多層感知器進行癌症病歷分類

– 現在,讓我們試試用多層感知機來運用在病歷分類上,請你試著利用訓練組的資料做模型的訓練,並且用測試組的資料評估模型準確度。

# Hold-out split: the first 3000 documents train the classifier, the
# remaining documents evaluate it.
Train.X <- tdm_array[, 1:3000]
Train.Y <- Cancer_code[1:3000]

Test.X <- tdm_array[, seq(3001, length(Cancer_code))]
Test.Y <- Cancer_code[seq(3001, length(Cancer_code))]

– 另外,為了訓練你整合的能力,請你也對下列這兩串原始的文字進行病歷分類:

  1. Adenocarcinoma of stomach with peritoneal carcinomatosis and massive ascite, stage IV under bidirection chemotherapy (neoadjuvant intraperitoneal-systemic chemotherapy) with intraperitoneal paclitaxel 120mg (20151126, 20151201) and systemic with Oxalip (20151127) and oral XELOX.

  2. Chronic kidney disease, stage V with pulmonary edema underwent emergent hemodialysis, status post arteriovenous graft creation with maintenance hemodialysis.

練習1答案(1)

library(mxnet)

# Closure-based mini-batch iterator over the global Train.X (term counts,
# words x documents) and Train.Y (labels).  Returns the three callbacks
# (reset / iter.next / value) that the mxnet RefClass wrapper expects.
my_iterator_core <- function (batch_size) {
  
  # Mutable closure state: number of batches already served this epoch.
  batch <- 0
  batch_per_epoch <- length(Train.Y) / batch_size
  
  # Rewind to the start of a new epoch.
  reset <- function() {
    batch <<- 0
  }
  
  # Advance one batch; FALSE signals that the epoch is exhausted.
  iter.next <- function() {
    batch <<- batch + 1
    batch <= batch_per_epoch
  }
  
  # Materialize the current mini-batch as mxnet NDArrays.
  value <- function() {
    idx <- (batch - 1) * batch_size + seq_len(batch_size)
    past_end <- idx > ncol(Train.X)
    # Pad a ragged final batch with randomly drawn columns.
    idx[past_end] <- sample(seq_len(ncol(Train.X)), sum(past_end))
    feats <- array(Train.X[, idx], dim = c(nrow(Train.X), batch_size))
    labs <- array(Train.Y[idx], dim = c(1, batch_size))
    list(data = mx.nd.array(feats), label = mx.nd.array(labs))
  }
  
  return(list(reset = reset, iter.next = iter.next, value = value, batch_size = batch_size, batch = batch))
}

# mxnet-compatible wrapper: a reference class inheriting from
# Rcpp_MXArrayDataIter that forwards value / iter.next / reset to the
# closure iterator created by my_iterator_core.
my_iterator_func <- setRefClass("Custom_Iter",
                                fields = c("iter", "batch_size"),
                                contains = "Rcpp_MXArrayDataIter",
                                methods = list(
                                  initialize = function(iter, batch_size = 100){
                                    .self$iter <- my_iterator_core(batch_size = batch_size)
                                    .self
                                  },
                                  value = function(){
                                    .self$iter$value()
                                  },
                                  iter.next = function(){
                                    .self$iter$iter.next()
                                  },
                                  reset = function(){
                                    .self$iter$reset()
                                  },
                                  finalize=function(){
                                  }
                                )
)

# Iterator instance serving mini-batches of 20 documents.
my_iter <- my_iterator_func(iter = NULL, batch_size = 20)
data <- mx.symbol.Variable(name = 'data')
label <- mx.symbol.Variable(name = 'label')

# Multilayer perceptron: one 50-unit ReLU hidden layer, then a single
# sigmoid output for binary classification.
fc1 <- mx.symbol.FullyConnected(data = data, num.hidden = 50, name = 'fc1')
relu1 <- mx.symbol.Activation(data = fc1, act.type = 'relu', name = 'relu1')
fc2 <- mx.symbol.FullyConnected(data = relu1, num.hidden = 1, name = 'fc2')
logistic_pred <- mx.symbol.sigmoid(data = fc2, name = 'logistic_pred')

# Hand-built binary cross-entropy; eps keeps log() away from zero.
eps <- 1e-8
ce_loss_pos <-  mx.symbol.broadcast_mul(mx.symbol.log(logistic_pred + eps), label)
ce_loss_neg <-  mx.symbol.broadcast_mul(mx.symbol.log(1 - logistic_pred + eps), 1 - label)
ce_loss_mean <- 0 - mx.symbol.mean(ce_loss_pos + ce_loss_neg)
ce_loss <- mx.symbol.MakeLoss(ce_loss_mean, name = 'ce_loss')
my_optimizer <- mx.opt.create(name = "sgd", learning.rate = 0.05, momentum = 0.9, wd = 1e-4)

練習1答案(2)

# Custom "metric": because MakeLoss makes the network output the loss
# itself, the incoming pred already is the batch cross-entropy; the
# label argument is unused.
my.eval.metric.loss <- mx.metric.custom(
  name = "cross-entropy",
  function(label, pred) {
    return(as.array(pred))
  }
)

# Train for 30 epochs on CPU using the custom iterator.
my_model <- mx.model.FeedForward.create(symbol = ce_loss, X = my_iter, optimizer = my_optimizer,
                                        array.batch.size = 20, ctx = mx.cpu(), num.round = 30,
                                        eval.metric = my.eval.metric.loss,
                                        batch.end.callback = mx.callback.log.speedometer(frequency = 50, batch.size = 20))
# Swap in the sigmoid head so predict() yields probabilities, not losses.
my_model$symbol <- logistic_pred
pred_y <- predict(my_model, Test.X)
library(pROC)

# Discrimination on the held-out documents.
roc_result <- roc(Test.Y, pred_y)
plot(roc_result, col = "red", main = paste0('AUC = ', formatC(roc_result$auc, 3, format = 'f')))

練習1答案(3)

# Classify a brand-new note: tokenize it with the same pipeline, then
# project the tokens onto the training vocabulary as a 1-column
# term-document matrix.
my_discharge_note <- 'Adenocarcinoma of stomach with peritoneal carcinomatosis and massive ascite, stage IV under bidirection chemotherapy (neoadjuvant intraperitoneal-systemic chemotherapy) with intraperitoneal paclitaxel 120mg (20151126, 20151201) and systemic with Oxalip (20151127) and oral XELOX.'

processed_discharge_note <- wrong2right(my_discharge_note)$new.text

my_tdm_array <- array(0L, dim = c(length(lvl.word), 1))
term_table <- table(processed_discharge_note)
# Words never seen in training are simply dropped.
term_table <- term_table[names(term_table) %in% lvl.word]
term_pos <- which(lvl.word %in% names(term_table))
my_tdm_array[term_pos,] <- term_table
# NOTE(review): my_tdm_array already has these dimensions, so this line
# is a redundant no-op kept for safety.
dim(my_tdm_array) <- c(length(lvl.word), 1)

predict(my_model, my_tdm_array, array.layout = 'colmajor')
##      [,1]
## [1,]    1

詞嵌入模型(1)

  1. 文字的順序是有意義的,但你思考一下我們有沒有可能保留文字順序?

F13_2

  1. 我們不可能有足夠完整的字典做同義字的整合及片語的蒐集。這樣你應該能想像,當我們使用詞袋模型時,必須對每篇文章都產生極長的向量,這不但浪費空間,並且容易過度擬合。

  2. 如果在未來應用時存在訓練樣本中未曾出現的單字,那分類器將「無法」利用這個資訊。

– 如果我們想要把這個突破應用到其他領域上,那勢必要想個辦法連結圖片與該領域的關係,而目前看起來詞袋模型似乎並不合用。

– 雖然卷積網路最初是設計來識別影像的,但只要我們能把「文字描述」轉為「圖片」格式,我們就一樣能利用卷積神經網路進行分類。

詞嵌入模型(2)

– 有一種想法是先把文字轉為向量,並且盡可能讓相似詞的向量足夠相近,這個方法叫做詞嵌入(word embedding)

F13_3

詞嵌入模型(3)

– 因此,只要我們建構一個用上下文預測中間的字的模型,那當你輸入「chronic」以及「disease」並要求網路做出預測時,他會沒有辦法判斷該輸出「kidney」或者是「renal」,我們就能由此判斷這兩個字擁有相似的字意。

F13_4

詞嵌入模型(4)

library(magrittr)

# Reload the corpus and rebuild the vocabulary (same recipe as before:
# sorted factor levels of all tokens, minus the empty-string separator).
load('data/Discharge_Notes.RData')

lvl.word <- unlist(Processed_Notes) %>% as.factor %>% levels
lvl.word <- lvl.word[-1]
# Lookup table: word -> integer id (its position in lvl.word).
word.index <- 1:length(lvl.word)
names(word.index) <- lvl.word

# Sliding-window training pairs: 6 consecutive words predict the 7th.
Input_list <- list()
Output_list <- list()

for (i in 1:length(Processed_Notes)) {
  
  # Keep only real word tokens (drop the "" separator slots).
  Continuous_word <- Processed_Notes[[i]][grep('[a-z]', Processed_Notes[[i]])]
  
  # Documents with fewer than 7 words yield no pairs; their NULL slots
  # are silently dropped by do.call below.
  if (length(Continuous_word) > 6) {
    
    Input_list[[i]] <- array(0L, dim = c(6, length(Continuous_word) - 6))
    Output_list[[i]] <- rep(0L, length(Continuous_word) - 6)
    
    for (k in 1:ncol(Input_list[[i]])) {
      
      Input_list[[i]][,k] <- word.index[Continuous_word[k:(5+k)]]
      Output_list[[i]][k] <- word.index[Continuous_word[6+k]]
      
    }
    
  }
  
}

# Input: 6 x N matrix of context word ids; Output: the N target ids.
Input <- do.call('cbind', Input_list)
Output <- do.call('c', Output_list)
Input[,1:10]
##      [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1613 2834  812 5816 1124 2342 1934 3658 1961  2851
## [2,] 2834  812 5816 1124 2342 1934 3658 1961 2851  5008
## [3,]  812 5816 1124 2342 1934 3658 1961 2851 5008  4166
## [4,] 5816 1124 2342 1934 3658 1961 2851 5008 4166  3764
## [5,] 1124 2342 1934 3658 1961 2851 5008 4166 3764  2342
## [6,] 2342 1934 3658 1961 2851 5008 4166 3764 2342  5435
Output[1:10]
##  [1] 1934 3658 1961 2851 5008 4166 3764 2342 5435 1613

詞嵌入模型(5)

– 因此我們會編寫特殊的Iterator,讓他能在執行時再產生完整的矩陣:

library(mxnet)

# Mini-batch iterator for the word2vec task: expands the compact id
# matrices (globals Input / Output) into vocabulary-length indicator
# vectors on the fly, so the full one-hot matrix never has to exist in
# memory at once.
my_iterator_core <- function (batch_size) {
  
  # Mutable closure state: number of batches already served this epoch.
  batch <- 0
  batch_per_epoch <- length(Output) / batch_size
  
  # Rewind to the start of a new epoch.
  reset <- function() {
    batch <<- 0
  }
  
  # Advance one batch; FALSE signals that the epoch is exhausted.
  iter.next <- function() {
    batch <<- batch + 1
    batch <= batch_per_epoch
  }
  
  # Build the current batch: one column per training pair, containing
  # the multi-hot context window and the one-hot target word.
  value <- function() {
    idx <- (batch - 1) * batch_size + seq_len(batch_size)
    past_end <- idx > ncol(Input)
    # Pad a ragged final batch with randomly drawn pairs.
    idx[past_end] <- sample(seq_len(ncol(Input)), sum(past_end))
    context <- array(0L, dim = c(length(lvl.word), batch_size))
    target <- array(0L, dim = c(length(lvl.word), batch_size))
    for (j in seq_len(batch_size)) {
      context[Input[, idx[j]], j] <- 1L
      target[Output[idx[j]], j] <- 1L
    }
    list(data = mx.nd.array(context), label = mx.nd.array(target))
  }
  
  return(list(reset = reset, iter.next = iter.next, value = value, batch_size = batch_size, batch = batch))
}

# mxnet-compatible wrapper: a reference class inheriting from
# Rcpp_MXArrayDataIter that forwards value / iter.next / reset to the
# closure iterator created by my_iterator_core.
my_iterator_func <- setRefClass("Custom_Iter",
                                fields = c("iter", "batch_size"),
                                contains = "Rcpp_MXArrayDataIter",
                                methods = list(
                                  initialize = function(iter, batch_size = 100){
                                    .self$iter <- my_iterator_core(batch_size = batch_size)
                                    .self
                                  },
                                  value = function(){
                                    .self$iter$value()
                                  },
                                  iter.next = function(){
                                    .self$iter$iter.next()
                                  },
                                  reset = function(){
                                    .self$iter$reset()
                                  },
                                  finalize=function(){
                                  }
                                )
)

# Iterator instance serving mini-batches of 100 training pairs.
my_iter <- my_iterator_func(iter = NULL, batch_size = 100)

詞嵌入模型(6)

data <- mx.symbol.Variable(name = 'data')
label <- mx.symbol.Variable(name = 'label')

# fc1 (no bias): 50-d hidden layer; with indicator-vector inputs this
# weight matrix plays the role of the word-embedding table.
fc1 <- mx.symbol.FullyConnected(data = data, num.hidden = 50, no.bias = TRUE, name = 'fc1')
# fc2 (no bias): project back to vocabulary size for softmax prediction.
fc2 <- mx.symbol.FullyConnected(data = fc1, num.hidden = length(lvl.word), no.bias = TRUE, name = 'fc2')
# (the 'sofmax' spelling is just the layer's name string)
softmax_layer <- mx.symbol.SoftmaxOutput(data = fc2, label = label, name = 'sofmax_layer')

# Hand-built multi-class cross-entropy; eps keeps log() away from zero.
eps <- 1e-8
m_log <- 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label))
m_logloss <- mx.symbol.MakeLoss(m_log, name = 'm_logloss')
my_optimizer <- mx.opt.create(name = "sgd", learning.rate = 5e-3, momentum = 0, wd = 0)
# The custom metric just reports the MakeLoss output; label is unused.
my.eval.metric.loss <- mx.metric.custom(
  name = "m-log-loss",
  function(label, pred) {
    return(as.array(pred))
  }
)

# Train the word2vec-style model for 5 epochs on CPU.
word2vec_model <- mx.model.FeedForward.create(symbol = m_logloss, X = my_iter, optimizer = my_optimizer,
                                              array.batch.size = 100, ctx = mx.cpu(), num.round = 5,
                                              eval.metric = my.eval.metric.loss,
                                              batch.end.callback = mx.callback.log.speedometer(frequency = 200, batch.size = 100))

詞嵌入模型(7)

# Embedding matrix taken from the trained fc2 weights; presumably one
# row per vocabulary word (indexed by word.index) after the transpose -
# confirm the orientation before reusing elsewhere.
word2vec_matrix <- t(as.array(word2vec_model$arg.params$fc2_weight))

\[\cos \theta = \frac{\vec{\mbox{vec}_1} \cdot \vec{\mbox{vec}_2}}{|\vec{\mbox{vec}_1}| \cdot |\vec{\mbox{vec}_2}|} \]

– 在R裡面的實現過程如下,你會發現與「cancer」最接近的字居然如此符合預期!

# Cosine similarity between the embedding of "cancer" and every word in
# the vocabulary.  The original loop recomputed the reference vector's
# norm on every iteration; the same result follows in one shot from
# matrix algebra (hoisting the invariant and vectorizing the dot
# products).
interested_pos <- word.index['cancer']

ref_vec <- word2vec_matrix[interested_pos, ]             # embedding of "cancer"
dot_products <- as.vector(word2vec_matrix %*% ref_vec)   # numerators vec_i . ref
vec_norms <- sqrt(rowSums(word2vec_matrix^2))            # |vec_i| for every word
cos_similarity <- dot_products / (vec_norms * sqrt(sum(ref_vec^2)))
names(cos_similarity) <- lvl.word

# The nearest neighbours of "cancer" are reassuringly oncological.
sort(cos_similarity, decreasing = TRUE) %>% head
##         cancer        adenoma         lesion adenocarcinoma          large 
##      1.0000000      0.9130057      0.8659733      0.8543842      0.8380871 
##        tubular 
##      0.8326128

詞嵌入模型(8)

– 而這個詞嵌入矩陣最終將有辦法獲得更可靠的文字間的關係,並且能夠允許收納更多的單字。

將文字變成圖像進行分析(1)

– 為了較快實現後面的課程,這裡已經準備好了一個由英文Wiki上所有的文章中所訓練而成的詞嵌入矩陣,請點擊這裡下載!

library(magrittr)
library(dplyr)
# NOTE(review): plyr is attached after dplyr, so plyr masks several
# dplyr functions - confirm this load order is intended.
library(plyr)
library(data.table)

# Pre-trained embeddings from English Wikipedia: column V1 holds the
# word, the remaining 50 columns its vector components.
word.data <- fread('data/wiki.txt', header = FALSE, showProgress = FALSE)

# Lookup table word -> row index; prepend "" (row 1) mapping to a zero
# vector so separator slots get a neutral embedding.
words.ref <- word.data %>% select(V1) %>% setDF %>% .[,1] %>% as.character
words.ref <- c("", words.ref)
words.index <- 1:length(words.ref)
names(words.index) <- words.ref

words.matrix <- word.data %>% select(-V1) %>% setDF %>% as.matrix
words.matrix <- rbind(rep(0, 50), words.matrix)
load('data/Discharge_Notes.RData')

# Turn document 1 into a 50 x n_tokens "image": one embedding column
# per token; unknown tokens fall back to the zero vector (row 1).
words_pos <- words.index[Processed_Notes[[1]]]
words_pos[is.na(words_pos)] <- 1
text_img.array <- t(words.matrix[words_pos,])
dim(text_img.array)
## [1]  50 301
library(imager)

# Render the embedding "image": clip component values to [-2, 2] so
# extreme values do not dominate the colour scale.
par(mai = rep(0, 4))
img <- text_img.array
img[img > 2] <- 2
# BUG FIX: the original `img[img<-2] <- -2` parses as the assignment
# `img <- 2` inside the subscript (no space before the minus sign),
# overwriting the whole matrix with the scalar 2.  The space restores
# the intended comparison `img < -2`.
img[img < -2] <- -2
plot(as.cimg(t(img)))