林嶔 (Lin, Chin)
Lesson 14: Word Embedding Models and Their Applications in Deep Learning
– Therefore, traditional language-processing algorithms mostly performed feature extraction with the bag-of-words model, an approach that has also achieved very good results in tasks such as spam classification:
– This style of vectorization is very similar to the handling of categorical variables, and it also goes by another name: one-hot encoding.
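As a toy illustration of this idea (our own example, not part of the course data), a bag-of-words matrix for two tiny "documents" can be built in a few lines of R:
# Each document becomes a column of term counts over a shared vocabulary;
# each individual word corresponds to a one-hot column before the counts are summed.
docs <- c('free offer free prize', 'meeting agenda for monday')
tokens <- strsplit(docs, ' ', fixed = TRUE)
vocab <- sort(unique(unlist(tokens)))
sapply(tokens, function(w) table(factor(w, levels = vocab)))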
– Let's first download from here all discharge summaries of the Tri-Service General Hospital from January 1 to January 31 of ROC year 105 (2016). These notes have already been given a preliminary label for whether they are "cancer" cases. Let's look at the data:
## Axis I :
## 1. Other specified depressive disorder
## 2. Suspect adjustment disorder with depressed mood.
## Axis II: deffered
## Axis III: deferred
## Axis IV: Military stress
## Axis V:Global Assessment Functioning scale: 41-50
## [1] 0
– If you want to learn how to handle text, you can refer to the introduction-to-text-processing part of R語言程式設計導論.
– Before using this function, however, we must first download the two dictionary files my_DIC.aff and my_DIC.dic for spell correction:
library(hunspell)
library(magrittr)
Sys.setenv(DICPATH = "data")
wrong2right <- function (text, spell.check = TRUE) {
RECODE = tolower(text)
RECODE = RECODE %>% gsub("\n", "<S><S><S><S><S>", ., fixed = TRUE) %>% gsub("\r", "<S><S><S><S><S>", ., fixed = TRUE)
RECODE = RECODE %>% gsub("[ :,;-]", "<S>", .) %>% gsub("(", "<S>", ., fixed = TRUE) %>% gsub(")", "<S>", ., fixed = TRUE)%>% gsub("/", "<S>", ., fixed = TRUE)
RECODE = strsplit(RECODE, split = ".", fixed = TRUE)[[1]]
RECODE = paste(RECODE, collapse = "<S><S><S><S><S>")
RECODE = gsub("(<S>){5, }", "<S><S><S><S><S>", RECODE)
RECODE = gsub("(<S>)+$", "", RECODE)
RECODE = strsplit(RECODE, split = "<S>", fixed = TRUE)[[1]]
pos1 = rep(NA, length(RECODE))
pos2 = rep(NA, length(RECODE))
start.pos = 1
sub.text = text
for (i in 1:length(RECODE)) {
if (RECODE[i]!="") {
result = regexpr(RECODE[i], tolower(sub.text))
if (result[1] != -1) {
pos1[i] = start.pos + result[1] - 1
pos2[i] = start.pos + result[1] + attr(result, "match.length") - 2
start.pos = pos2[i] + 1
sub.text = substr(sub.text, result[1] + attr(result, "match.length"), nchar(sub.text))
}
}
}
RECODE = RECODE %>% gsub("[^A-Z0-9a-z .:,;-]", "", .)
mixed = grepl("[^a-z]", RECODE) & grepl("[^0-9]", RECODE) # tokens mixing letters/digits
RECODE[mixed] = RECODE[mixed] %>% gsub("[0-9]", "", .)
RECODE = RECODE %>% gsub("[0-9]", "", .)
if (spell.check) {
SPELL_check = which(!hunspell::hunspell_check(RECODE, dict = hunspell::dictionary("my_DIC", cache = FALSE)))
if (length(SPELL_check) > 0) {
hist.num.space = 0
for (k in 1:length(SPELL_check)) {
RECODE[SPELL_check[k]] = hunspell::hunspell_suggest(RECODE[SPELL_check[k]], dict = hunspell::dictionary("my_DIC", cache = FALSE))[[1]][1]
if (grepl(" ", RECODE[SPELL_check[k]], fixed = TRUE)) {
num.space = length(gregexpr(" ", RECODE[SPELL_check[k]], fixed = TRUE)[[1]])
len.pos1 = length(pos1)
pos1 = c(pos1[1:(SPELL_check[k]+hist.num.space)], rep(NA, num.space - 1), pos1[(SPELL_check[k]+hist.num.space):len.pos1])
pos2 = c(pos2[1:(SPELL_check[k]+hist.num.space)], rep(NA, num.space - 1), pos2[(SPELL_check[k]+hist.num.space):len.pos1])
pos1[SPELL_check[k]+hist.num.space+1:num.space] = NA
pos2[SPELL_check[k]+hist.num.space+0:(num.space-1)] = NA
hist.num.space = hist.num.space + num.space
}
}
}
RECODE = RECODE %>% gsub(" ", "<S>", .)
RECODE[nchar(RECODE) == 0] = "<A>"
RECODE = strsplit(RECODE, split = "<S>", fixed = TRUE) %>% unlist
RECODE[RECODE %in% "<A>"] <- ""
RECODE = tolower(RECODE)
}
return(list(new.text = RECODE, original.text = text, pos1 = pos1, pos2 = pos2))
}
## [1] "axis" "i" "" "" ""
## [6] "" "" "" "" ""
## [11] "" "" "other" "specified" "depressive"
## [16] "disorder" "" "" "" ""
## [21] "" "" "" "" ""
## [26] "suspect" "adjustment" "disorder" "with" "depressed"
## [31] "mood" "" "" "" ""
## [36] "axis" "ii" "" "" "differed"
## [41] "" "" "" "" "axis"
## [46] "iii" "" "deferred" "" ""
## [51] "" "" "axis" "iv" "military"
## [56] "stress" "" "" "" ""
## [61] "axis" "global" "assessment" "functioning" "scale"
## [66] "" "" ""
## [1] "axis" "i" "" "" ""
## [6] "" "other" "specified" "depressive" "disorder"
## [11] "" "" "" "" "suspect"
## [16] "adjustment" "disorder" "with" "depressed" "mood"
## [21] "" "" "" "" "axis"
## [26] "ii" "" "" "" "differed"
## [31] "" "" "" "" "axis"
## [36] "iii" "" "" "deferred" ""
## [41] "" "" "" "axis" "iv"
## [46] "" "military" "stress" "" ""
## [51] "" "" "axis" "v" "global"
## [56] "assessment" "functioning" "scale"
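The Processed_Notes list used below is loaded later from data/Discharge_Notes.RData; conceptually, it is just wrong2right applied to every raw discharge summary. A minimal sketch, assuming the raw texts sit in a character vector named Discharge_Notes (a hypothetical name):
# Hypothetical reconstruction: tokenize and spell-correct every note.
Processed_Notes <- lapply(Discharge_Notes, function(txt) wrong2right(txt)$new.text)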
lvl.word <- unlist(Processed_Notes) %>% as.factor %>% levels
lvl.word <- lvl.word[-1]
length(lvl.word)
## [1] 5874
tdm_array <- array(0L, dim = c(length(lvl.word), length(Processed_Notes)))
for (i in 1:length(Processed_Notes)) {
term_table <- table(Processed_Notes[[i]])
term_table <- term_table[names(term_table) %in% lvl.word]
term_pos <- which(lvl.word %in% names(term_table))
tdm_array[term_pos,i] <- term_table # term_table is already restricted to lvl.word
}
dim(tdm_array)
## [1] 5874 4535
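As a quick sanity check of our own (not in the original slides), we can list the most frequent terms across the corpus:
# The ten most common words over all 4535 notes.
term_freq <- rowSums(tdm_array)
names(term_freq) <- lvl.word
head(sort(term_freq, decreasing = TRUE), 10)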
– Now let's try applying a multilayer perceptron to discharge-note classification. Please train the model on the training set and evaluate its accuracy on the test set.
Train.X <- tdm_array[,1:3000]
Train.Y <- Cancer_code[1:3000]
Test.X <- tdm_array[,3001:length(Cancer_code)]
Test.Y <- Cancer_code[3001:length(Cancer_code)]
– In addition, to train your ability to integrate everything, please also classify the following two raw texts:
Adenocarcinoma of stomach with peritoneal carcinomatosis and massive ascite, stage IV under bidirection chemotherapy (neoadjuvant intraperitoneal-systemic chemotherapy) with intraperitoneal paclitaxel 120mg (20151126, 20151201) and systemic with Oxalip (20151127) and oral XELOX.
Chronic kidney disease, stage V with pulmonary edema underwent emergent hemodialysis, status post arteriovenous graft creation with maintenance hemodialysis.
library(mxnet)
my_iterator_core <- function (batch_size) {
batch = 0
batch_per_epoch = length(Train.Y)/batch_size
reset = function() {batch <<- 0}
iter.next = function() {
batch <<- batch+1
if (batch > batch_per_epoch) {return(FALSE)} else {return(TRUE)}
}
value = function() {
idx = 1:batch_size + (batch - 1) * batch_size
idx[idx > ncol(Train.X)] = sample(1:ncol(Train.X), sum(idx > ncol(Train.X)))
data = mx.nd.array(array(Train.X[,idx], dim = c(nrow(Train.X), batch_size)))
label = mx.nd.array(array(Train.Y[idx], dim = c(1, batch_size)))
return(list(data = data, label = label))
}
return(list(reset = reset, iter.next = iter.next, value = value, batch_size = batch_size, batch = batch))
}
my_iterator_func <- setRefClass("Custom_Iter",
fields = c("iter", "batch_size"),
contains = "Rcpp_MXArrayDataIter",
methods = list(
initialize = function(iter, batch_size = 100){
.self$iter <- my_iterator_core(batch_size = batch_size)
.self
},
value = function(){
.self$iter$value()
},
iter.next = function(){
.self$iter$iter.next()
},
reset = function(){
.self$iter$reset()
},
finalize=function(){
}
)
)
my_iter <- my_iterator_func(iter = NULL, batch_size = 20)
data <- mx.symbol.Variable(name = 'data')
label <- mx.symbol.Variable(name = 'label')
fc1 <- mx.symbol.FullyConnected(data = data, num.hidden = 50, name = 'fc1')
relu1 <- mx.symbol.Activation(data = fc1, act.type = 'relu', name = 'relu1')
fc2 <- mx.symbol.FullyConnected(data = relu1, num.hidden = 1, name = 'fc2')
logistic_pred <- mx.symbol.sigmoid(data = fc2, name = 'logistic_pred')
eps <- 1e-8
ce_loss_pos <- mx.symbol.broadcast_mul(mx.symbol.log(logistic_pred + eps), label)
ce_loss_neg <- mx.symbol.broadcast_mul(mx.symbol.log(1 - logistic_pred + eps), 1 - label)
ce_loss_mean <- 0 - mx.symbol.mean(ce_loss_pos + ce_loss_neg)
ce_loss <- mx.symbol.MakeLoss(ce_loss_mean, name = 'ce_loss')
my.eval.metric.loss <- mx.metric.custom(
name = "cross-entropy",
function(label, pred) {
return(as.array(pred))
}
)
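The training call below references my_optimizer, which is not defined in this excerpt; here is a minimal stand-in (the choice of Adam and these hyperparameters is our assumption), reused by every training run in this lesson:
# Assumed optimizer: Adam with a typical learning rate.
my_optimizer <- mx.opt.create(name = 'adam', learning.rate = 1e-3,
                              beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8)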
my_model <- mx.model.FeedForward.create(symbol = ce_loss, X = my_iter, optimizer = my_optimizer,
array.batch.size = 20, ctx = mx.gpu(), num.round = 30,
eval.metric = my.eval.metric.loss,
batch.end.callback = mx.callback.log.speedometer(frequency = 50, batch.size = 20))
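The step that produces pred_y is not shown in this excerpt; the sketch below follows the same pattern used explicitly later in this lesson (swap in the prediction symbol, then score the test set):
# Predict probabilities for the test articles instead of computing the loss.
my_model$symbol <- logistic_pred
pred_y <- predict(my_model, Test.X, array.layout = 'colmajor')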
library(pROC)
roc_result <- roc(Test.Y, pred_y)
plot(roc_result, col = "red", main = paste0('AUC = ', formatC(roc_result$auc, 3, format = 'f')))
my_discharge_note <- 'Adenocarcinoma of stomach with peritoneal carcinomatosis and massive ascite, stage IV under bidirection chemotherapy (neoadjuvant intraperitoneal-systemic chemotherapy) with intraperitoneal paclitaxel 120mg (20151126, 20151201) and systemic with Oxalip (20151127) and oral XELOX.'
processed_discharge_note <- wrong2right(my_discharge_note)$new.text
my_tdm_array <- array(0L, dim = c(length(lvl.word), 1))
term_table <- table(processed_discharge_note)
term_table <- term_table[names(term_table) %in% lvl.word]
term_pos <- which(lvl.word %in% names(term_table))
my_tdm_array[term_pos,] <- term_table
dim(my_tdm_array) <- c(length(lvl.word), 1)
predict(my_model, my_tdm_array, array.layout = 'colmajor')
## [,1]
## [1,] 1
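For completeness, the second exercise note can be pushed through the identical pipeline (our own sketch; no output is claimed here):
# Same preprocessing and term-document encoding as for the first note.
note2 <- 'Chronic kidney disease, stage V with pulmonary edema underwent emergent hemodialysis, status post arteriovenous graft creation with maintenance hemodialysis.'
processed_note2 <- wrong2right(note2)$new.text
tdm2 <- array(0L, dim = c(length(lvl.word), 1))
tab2 <- table(processed_note2)
tab2 <- tab2[names(tab2) %in% lvl.word]
tdm2[which(lvl.word %in% names(tab2)), 1] <- tab2
predict(my_model, tdm2, array.layout = 'colmajor')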
We could never build a dictionary complete enough to merge synonyms and collect phrases. You can therefore imagine that with the bag-of-words model we must produce an extremely long vector for every document, which is both wasteful and prone to overfitting.
If a word that never appeared in the training samples shows up in a future application, the classifier will be unable to make any use of that information.
– If we want to carry this breakthrough over to other domains, we must find some way to relate those domains to images, and the bag-of-words model does not look suitable for the job.
– Although convolutional networks were originally designed for image recognition, as long as we can convert a "text description" into an "image" format, we can use a convolutional neural network for classification just the same.
– One idea is to first convert words into vectors while making the vectors of similar words as close as possible; this method is called word embedding.
– Hence, if we build a model that predicts the middle word from its surrounding context, then when we feed in "chronic" and "disease" and ask the network for a prediction, it will be unable to decide whether to output "kidney" or "renal", and from this we can conclude that the two words have similar meanings.
library(magrittr)
load('data/Discharge_Notes.RData')
lvl.word <- unlist(Processed_Notes) %>% as.factor %>% levels
lvl.word <- lvl.word[-1]
word.index <- 1:length(lvl.word)
names(word.index) <- lvl.word
Input_list <- list()
Output_list <- list()
for (i in 1:length(Processed_Notes)) {
Continuous_word <- Processed_Notes[[i]][grep('[a-z]', Processed_Notes[[i]])]
if (length(Continuous_word) > 6) {
Input_list[[i]] <- array(0L, dim = c(6, length(Continuous_word) - 6))
Output_list[[i]] <- rep(0L, length(Continuous_word) - 6)
for (k in 1:ncol(Input_list[[i]])) {
Input_list[[i]][,k] <- word.index[Continuous_word[k:(5+k)]]
Output_list[[i]][k] <- word.index[Continuous_word[6+k]]
}
}
}
Input <- do.call('cbind', Input_list)
Output <- do.call('c', Output_list)
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10]
## [1,] 1613 2834 812 5816 1124 2342 1934 3658 1961 2851
## [2,] 2834 812 5816 1124 2342 1934 3658 1961 2851 5008
## [3,] 812 5816 1124 2342 1934 3658 1961 2851 5008 4166
## [4,] 5816 1124 2342 1934 3658 1961 2851 5008 4166 3764
## [5,] 1124 2342 1934 3658 1961 2851 5008 4166 3764 2342
## [6,] 2342 1934 3658 1961 2851 5008 4166 3764 2342 5435
## [1] 1934 3658 1961 2851 5008 4166 3764 2342 5435 1613
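To see what one training pair looks like, we can map the indices back to words (a check of our own):
# The six context words of the first example, and the word they should predict.
lvl.word[Input[, 1]]
lvl.word[Output[1]]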
– We therefore write a special iterator that generates the full matrices only at execution time:
library(mxnet)
my_iterator_core <- function (batch_size) {
batch = 0
batch_per_epoch = length(Output)/batch_size
reset = function() {batch <<- 0}
iter.next = function() {
batch <<- batch+1
if (batch > batch_per_epoch) {return(FALSE)} else {return(TRUE)}
}
value = function() {
idx = 1:batch_size + (batch - 1) * batch_size
idx[idx > ncol(Input)] = sample(1:ncol(Input), sum(idx > ncol(Input)))
data = array(0L, dim = c(length(lvl.word), batch_size))
label = array(0L, dim = c(length(lvl.word), batch_size))
for (i in 1:batch_size) {
data[Input[,idx[i]],i] <- 1L
label[Output[idx[i]],i] <- 1L
}
data = mx.nd.array(data)
label = mx.nd.array(label)
return(list(data = data, label = label))
}
return(list(reset = reset, iter.next = iter.next, value = value, batch_size = batch_size, batch = batch))
}
my_iterator_func <- setRefClass("Custom_Iter",
fields = c("iter", "batch_size"),
contains = "Rcpp_MXArrayDataIter",
methods = list(
initialize = function(iter, batch_size = 100){
.self$iter <- my_iterator_core(batch_size = batch_size)
.self
},
value = function(){
.self$iter$value()
},
iter.next = function(){
.self$iter$iter.next()
},
reset = function(){
.self$iter$reset()
},
finalize=function(){
}
)
)
my_iter <- my_iterator_func(iter = NULL, batch_size = 100)
data <- mx.symbol.Variable(name = 'data')
label <- mx.symbol.Variable(name = 'label')
fc1 <- mx.symbol.FullyConnected(data = data, num.hidden = 50, no.bias = TRUE, name = 'fc1')
fc2 <- mx.symbol.FullyConnected(data = fc1, num.hidden = length(lvl.word), no.bias = TRUE, name = 'fc2')
softmax_layer <- mx.symbol.SoftmaxOutput(data = fc2, label = label, name = 'softmax_layer')
eps <- 1e-8
m_log <- 0 - mx.symbol.mean(mx.symbol.broadcast_mul(mx.symbol.log(softmax_layer + eps), label))
m_logloss <- mx.symbol.MakeLoss(m_log, name = 'm_logloss')
my.eval.metric.loss <- mx.metric.custom(
name = "m-log-loss",
function(label, pred) {
return(as.array(pred))
}
)
word2vec_model <- mx.model.FeedForward.create(symbol = m_logloss, X = my_iter, optimizer = my_optimizer,
array.batch.size = 100, ctx = mx.gpu(), num.round = 5,
eval.metric = my.eval.metric.loss,
batch.end.callback = mx.callback.log.speedometer(frequency = 200, batch.size = 100))
\[\cos \theta = \frac{\vec{v}_1 \cdot \vec{v}_2}{|\vec{v}_1| \, |\vec{v}_2|}\]
– The implementation in R is as follows; you will find that the words closest to "cancer" fit expectations remarkably well!
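The embedding matrix itself is not extracted in this excerpt. A sketch, under the assumption that the 50-dimensional word vectors live in the fc1 weights of the trained model:
# Seen from R, the fc1 weight has dim c(5874, 50): one row per word in
# lvl.word, one column per embedding dimension.
word2vec_matrix <- as.array(word2vec_model$arg.params$fc1_weight)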
interested_pos <- word.index['cancer']
cos_similarity <- rep(0, length(lvl.word))
names(cos_similarity) <- lvl.word
for (i in 1:length(lvl.word)) {
cos_similarity[i] <- word2vec_matrix[interested_pos,] %*% word2vec_matrix[i,] / sqrt(sum(word2vec_matrix[interested_pos,]^2)) / sqrt(sum(word2vec_matrix[i,]^2))
}
sort(cos_similarity, decreasing = TRUE) %>% head
## cancer adenoma lesion adenocarcinoma tubular
## 1.0000000 0.9146292 0.8586472 0.8548079 0.8435445
## large
## 0.8340912
– This embedding matrix will ultimately be able to capture more reliable relationships between words, while also allowing a much larger vocabulary to be accommodated.
Although we implemented a small word2vec in MXNet without much trouble, scaling it up to a real Wikipedia-sized training task is not so simple: the growth in computation severely limits model training. Practical word2vec implementations therefore rely on a great many algorithmic accelerations (hierarchical softmax and negative sampling, in particular), which we skip here because of their mathematical difficulty.
Fortunately, word2vec itself varies little from task to task, so using an existing package is an acceptable choice. Here we recommend training with bmschmidt/wordVectors: you only need to prepare the processed text file, and a tutorial for training the model can be found in the author's introduction; a minimal sketch of the workflow follows.
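A minimal sketch of that workflow (the file names here are placeholders, and the arguments follow the package's documented interface):
# Hypothetical wordVectors usage: train on a prepared plain-text corpus,
# then query the nearest neighbours of a word.
library(wordVectors)
w2v <- train_word2vec('data/prepared_corpus.txt', 'data/corpus_vectors.bin',
                      vectors = 50, window = 5, threads = 4, force = TRUE)
closest_to(w2v, 'cancer', n = 6)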
If the training process of the word-embedding model is still unclear to you, take a look at wevi: word embedding visual inspector, where you can set up different word relationships and observe the resulting embeddings:
– To get through the rest of the lesson faster, we have prepared a word-embedding matrix trained on all articles of the English Wikipedia; click here to download it!
library(magrittr)
library(dplyr)
library(plyr)
library(data.table)
word.data <- fread('data/wiki.txt', header = FALSE, showProgress = FALSE)
words.ref <- word.data %>% select(V1) %>% setDF %>% .[,1] %>% as.character
words.ref <- c("", words.ref)
words.index <- 1:length(words.ref)
names(words.index) <- words.ref
words.matrix <- word.data %>% select(-V1) %>% setDF %>% as.matrix
words.matrix <- rbind(rep(0, 50), words.matrix)
load('data/Discharge_Notes.RData')
words_pos <- words.index[Processed_Notes[[1]]]
words_pos[is.na(words_pos)] <- 1
text_img.array <- t(words.matrix[words_pos,])
dim(text_img.array)
## [1] 50 301
library(imager)
par(mai = rep(0, 4))
img <- text_img.array
img[img > 2] <- 2
img[img < -2] <- -2 # note the space: 'img[img<-2]' would be parsed as the assignment 'img <- 2'
plot(as.cimg(t(img)))
– As for documents of unequal length, there is no great solution: just take the longest one as the standard and zero-pad the rest:
max.length <- sapply(Processed_Notes, length) %>% max()
img_array <- array(0, dim = c(50, max.length, length(Processed_Notes)))
pb <- txtProgressBar(max = length(Processed_Notes), style = 3)
for (i in 1:length(Processed_Notes)) {
words_pos <- words.index[Processed_Notes[[i]]]
words_pos[is.na(words_pos)] <- 1
img_array[,1:length(words_pos),i] <- t(words.matrix[words_pos,])
setTxtProgressBar(pb, i)
}
close(pb)
dim(img_array)
## [1] 50 494 4535
Train.img_array <- img_array[,,1:3000]
Train.Y <- Cancer_code[1:3000]
Test.img_array <- img_array[,,3001:length(Cancer_code)]
dim(Test.img_array) <- c(dim(Test.img_array)[1:2], 1, dim(Test.img_array)[3])
Test.Y <- Cancer_code[3001:length(Cancer_code)]
library(mxnet)
my_iterator_core <- function (batch_size) {
batch = 0
batch_per_epoch = length(Train.Y)/batch_size
reset = function() {batch <<- 0}
iter.next = function() {
batch <<- batch+1
if (batch > batch_per_epoch) {return(FALSE)} else {return(TRUE)}
}
value = function() {
idx = 1:batch_size + (batch - 1) * batch_size
idx[idx > length(Train.Y)] = sample(1:length(Train.Y), sum(idx > length(Train.Y)))
data = mx.nd.array(array(Train.img_array[,,idx], dim = c(dim(Train.img_array)[1:2], 1, batch_size)))
label = mx.nd.array(array(Train.Y[idx], dim = c(1, batch_size)))
return(list(data = data, label = label))
}
return(list(reset = reset, iter.next = iter.next, value = value, batch_size = batch_size, batch = batch))
}
my_iterator_func <- setRefClass("Custom_Iter",
fields = c("iter", "batch_size"),
contains = "Rcpp_MXArrayDataIter",
methods = list(
initialize = function(iter, batch_size = 100){
.self$iter <- my_iterator_core(batch_size = batch_size)
.self
},
value = function(){
.self$iter$value()
},
iter.next = function(){
.self$iter$iter.next()
},
reset = function(){
.self$iter$reset()
},
finalize=function(){
}
)
)
my_iter <- my_iterator_func(iter = NULL, batch_size = 20)
my_iter$iter.next()
## [1] TRUE
dim(my_iter$value()$data)
## [1] 50 494 1 20
– Let's see how to write the model architecture:
data <- mx.symbol.Variable(name = 'data')
label <- mx.symbol.Variable(name = 'label')
conv1 <- mx.symbol.Convolution(data = data, kernel = c(50, 1), num_filter = 50, name = 'conv1')
relu1 <- mx.symbol.Activation(data = conv1, act.type = 'relu', name = 'relu1')
pool1 <- mx.symbol.max(data = relu1, axis = 2:3, keepdims = FALSE, name = 'pool1')
fc1 <- mx.symbol.FullyConnected(data = pool1, num.hidden = 1, name = 'fc1')
logistic_pred <- mx.symbol.sigmoid(data = fc1, name = 'logistic_pred')
eps <- 1e-8
ce_loss_pos <- mx.symbol.broadcast_mul(mx.symbol.log(logistic_pred + eps), label)
ce_loss_neg <- mx.symbol.broadcast_mul(mx.symbol.log(1 - logistic_pred + eps), 1 - label)
ce_loss_mean <- 0 - mx.symbol.mean(ce_loss_pos + ce_loss_neg)
ce_loss <- mx.symbol.MakeLoss(ce_loss_mean, name = 'ce_loss')
my.eval.metric.loss <- mx.metric.custom(
name = "cross-entropy",
function(label, pred) {
return(as.array(pred))
}
)
my_model <- mx.model.FeedForward.create(symbol = ce_loss, X = my_iter, optimizer = my_optimizer,
array.batch.size = 20, ctx = mx.gpu(), num.round = 30,
eval.metric = my.eval.metric.loss,
batch.end.callback = mx.callback.log.speedometer(frequency = 50, batch.size = 20))
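Again pred_y must be produced before the ROC analysis; the two lines below mirror the pattern used explicitly in the next example:
# Swap in the prediction symbol and score the test articles.
my_model$symbol <- logistic_pred
pred_y <- predict(my_model, Test.img_array)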
library(pROC)
roc_result <- roc(Test.Y, pred_y)
plot(roc_result, col = "red", main = paste0('AUC = ', formatC(roc_result$auc, 3, format = 'f')))
– In addition, this model has far fewer parameters, so it is also less likely to overfit!
– Let's look at how the model architecture was designed in Chin Lin et al.'s paper, Artificial Intelligence Learning Semantics via External Resources for Classifying Diagnosis Codes in Discharge Notes:
data <- mx.symbol.Variable(name = 'data')
label <- mx.symbol.Variable(name = 'label')
concat_lst <- NULL
filter_sizes <- 1:5
num_filter <- c(40, 30, 15, 10, 5)
for (i in 1:length(filter_sizes)) {
convi <- mx.symbol.Convolution(data = data, kernel = c(50, filter_sizes[i]), num_filter = num_filter[i], name = paste0('conv', i))
relui <- mx.symbol.Activation(data = convi, act_type = "relu", name = paste0('relu', i))
pooli <- mx.symbol.max(data = relui, axis = 2:3, keepdims = FALSE, name = paste0('pool', i))
concat_lst <- append(concat_lst, pooli)
}
h_pool <- mx.symbol.concat(data = concat_lst, num.args = length(filter_sizes), dim = 1, name = 'h_pool')
h_drop <- mx.symbol.Dropout(data = h_pool, p = 0.5, name = 'h_drop')
fc1 <- mx.symbol.FullyConnected(data = h_drop, num.hidden = 1, name = 'fc1')
logistic_pred <- mx.symbol.sigmoid(data = fc1, name = 'logistic_pred')
eps <- 1e-8
ce_loss_pos <- mx.symbol.broadcast_mul(mx.symbol.log(logistic_pred + eps), label)
ce_loss_neg <- mx.symbol.broadcast_mul(mx.symbol.log(1 - logistic_pred + eps), 1 - label)
ce_loss_mean <- 0 - mx.symbol.mean(ce_loss_pos + ce_loss_neg)
ce_loss <- mx.symbol.MakeLoss(ce_loss_mean, name = 'ce_loss')
my.eval.metric.loss <- mx.metric.custom(
name = "cross-entropy",
function(label, pred) {
return(as.array(pred))
}
)
my_model <- mx.model.FeedForward.create(symbol = ce_loss, X = my_iter, optimizer = my_optimizer,
array.batch.size = 20, ctx = mx.gpu(), num.round = 30,
eval.metric = my.eval.metric.loss,
batch.end.callback = mx.callback.log.speedometer(frequency = 50, batch.size = 20))
my_model$symbol <- logistic_pred
pred_y <- predict(my_model, Test.img_array)
roc_result <- roc(Test.Y, pred_y)
plot(roc_result, col = "red", main = paste0('AUC = ', formatC(roc_result$auc, 3, format = 'f')))
– First, read in the wiki embedding matrix as before, and prepend a row of zeros to its first row to serve as the blank symbol:
library(magrittr)
library(dplyr)
library(plyr)
library(data.table)
load('data/Discharge_Notes.RData')
word.data <- fread('data/wiki.txt', header = FALSE, showProgress = FALSE)
words.ref <- word.data %>% select(V1) %>% setDF %>% .[,1] %>% as.character
words.ref <- c("", words.ref)
words.index <- 1:length(words.ref)
names(words.index) <- words.ref
words.matrix <- word.data %>% select(-V1) %>% setDF %>% as.matrix
words.matrix <- rbind(rep(0, 50), words.matrix)
– Next we assemble the data. This time we store only the index positions (we will let MXNet look up the embedding table itself). Be very careful: MXNet indexes from 0, not from 1 as R does, so we must subtract 1 from every index to satisfy MXNet:
max.length <- sapply(Processed_Notes, length) %>% max()
seq_array <- array(0, dim = c(max.length, length(Processed_Notes)))
pb <- txtProgressBar(max = length(Processed_Notes), style = 3)
for (i in 1:length(Processed_Notes)) {
words_pos <- words.index[Processed_Notes[[i]]]
words_pos[is.na(words_pos)] <- 1
seq_array[1:length(words_pos),i] <- words_pos - 1 # very important: MXNet indexes from 0, R from 1
setTxtProgressBar(pb, i)
}
close(pb)
Train.seq_array <- seq_array[,1:3000]
Train.Y <- Cancer_code[1:3000]
Test.seq_array <- seq_array[,3001:length(Cancer_code)]
dim(Test.seq_array) <- c(max.length, 1, dim(Test.seq_array)[2])
Test.Y <- Cancer_code[3001:length(Cancer_code)]
library(mxnet)
my_iterator_core <- function (batch_size) {
batch = 0
batch_per_epoch = length(Train.Y)/batch_size
reset = function() {batch <<- 0}
iter.next = function() {
batch <<- batch+1
if (batch > batch_per_epoch) {return(FALSE)} else {return(TRUE)}
}
value = function() {
idx = 1:batch_size + (batch - 1) * batch_size
idx[idx > length(Train.Y)] = sample(1:length(Train.Y), sum(idx > length(Train.Y)))
data = mx.nd.array(array(Train.seq_array[,idx], dim = c(dim(Train.seq_array)[1], 1, batch_size)))
label = mx.nd.array(array(Train.Y[idx], dim = c(1, batch_size)))
return(list(data = data, label = label))
}
return(list(reset = reset, iter.next = iter.next, value = value, batch_size = batch_size, batch = batch))
}
my_iterator_func <- setRefClass("Custom_Iter",
fields = c("iter", "batch_size"),
contains = "Rcpp_MXArrayDataIter",
methods = list(
initialize = function(iter, batch_size = 100){
.self$iter <- my_iterator_core(batch_size = batch_size)
.self
},
value = function(){
.self$iter$value()
},
iter.next = function(){
.self$iter$iter.next()
},
reset = function(){
.self$iter$reset()
},
finalize=function(){
}
)
)
my_iter <- my_iterator_func(iter = NULL, batch_size = 20)
data <- mx.symbol.Variable(name = 'data')
label <- mx.symbol.Variable(name = 'label')
embed1 <- mx.symbol.Embedding(data = data, input.dim = length(words.ref), output.dim = 50, name = 'embed1')
concat_lst <- NULL
filter_sizes <- 1:5
num_filter <- c(40, 30, 15, 10, 5)
for (i in 1:length(filter_sizes)) {
convi <- mx.symbol.Convolution(data = embed1, kernel = c(50, filter_sizes[i]), num_filter = num_filter[i], name = paste0('conv', i))
relui <- mx.symbol.Activation(data = convi, act_type = "relu", name = paste0('relu', i))
pooli <- mx.symbol.max(data = relui, axis = 2:3, keepdims = FALSE, name = paste0('pool', i))
concat_lst <- append(concat_lst, pooli)
}
h_pool <- mx.symbol.concat(data = concat_lst, num.args = length(filter_sizes), dim = 1, name = 'h_pool')
h_drop <- mx.symbol.Dropout(data = h_pool, p = 0.5, name = 'h_drop')
fc1 <- mx.symbol.FullyConnected(data = h_drop, num.hidden = 1, name = 'fc1')
logistic_pred <- mx.symbol.sigmoid(data = fc1, name = 'logistic_pred')
eps <- 1e-8
ce_loss_pos <- mx.symbol.broadcast_mul(mx.symbol.log(logistic_pred + eps), label)
ce_loss_neg <- mx.symbol.broadcast_mul(mx.symbol.log(1 - logistic_pred + eps), 1 - label)
ce_loss_mean <- 0 - mx.symbol.mean(ce_loss_pos + ce_loss_neg)
ce_loss <- mx.symbol.MakeLoss(ce_loss_mean, name = 'ce_loss')
– Check with mx.symbol.infer.shape:
mx.symbol.infer.shape(embed1, data = c(494, 1, 20))$out.shapes
## $embed1_output
## [1] 50 494 1 20
new_arg <- mxnet:::mx.model.init.params(symbol = logistic_pred,
input.shape = list(data = c(494, 1, 20)),
output.shape = NULL, initializer = mxnet:::mx.init.Xavier(rnd_type = "uniform", magnitude = 2.24),
ctx = mx.gpu())
new_arg$arg.params$embed1_weight <- mx.nd.array(t(words.matrix)) # seen from R, the Embedding weight has dim c(50, vocab), hence the transpose
my.eval.metric.loss <- mx.metric.custom(
name = "cross-entropy",
function(label, pred) {
return(as.array(pred))
}
)
my_model <- mx.model.FeedForward.create(symbol = ce_loss, X = my_iter, optimizer = my_optimizer,
array.batch.size = 20, ctx = mx.gpu(), num.round = 30,
eval.metric = my.eval.metric.loss,
arg.params = new_arg$arg.params, fixed.param = 'embed1_weight',
batch.end.callback = mx.callback.log.speedometer(frequency = 50, batch.size = 20))
my_model$symbol <- logistic_pred
my_model$arg.params <- append(my_model$arg.params, new_arg$arg.params['embed1_weight'])
pred_y <- predict(my_model, Test.seq_array)
library(pROC)
roc_result <- roc(Test.Y, pred_y)
plot(roc_result, col = "red", main = paste0('AUC = ', formatC(roc_result$auc, 3, format = 'f')))
– However, the wiki vocabulary contains over 340,000 words, while this month's notes use only about 5,000 of them. So even if we allowed the model to update the embedding weights during training, only those 5,000-odd words' weights would ever change.
– Worse still, suppose the notes contain the word H5N1 but not H1N1. Their vectors start out very close together; might they drift apart after some training? To avoid this, the architecture below freezes the embedding layer (embed1) and instead learns, through the convolution embed2, a single linear transformation that is applied identically to every word vector.
data <- mx.symbol.Variable(name = 'data')
label <- mx.symbol.Variable(name = 'label')
embed1 <- mx.symbol.Embedding(data = data, input.dim = length(words.ref), output.dim = 50, name = 'embed1')
embed2 <- mx.symbol.Convolution(data = embed1, kernel = c(50, 1), num_filter = 50, no.bias = TRUE, name = 'embed2')
concat_lst <- NULL
filter_sizes <- 1:5
num_filter <- c(40, 30, 15, 10, 5)
for (i in 1:length(filter_sizes)) {
convi <- mx.symbol.Convolution(data = embed2, kernel = c(1, filter_sizes[i]), num_filter = num_filter[i], name = paste0('conv', i))
relui <- mx.symbol.Activation(data = convi, act_type = "relu", name = paste0('relu', i))
pooli <- mx.symbol.max(data = relui, axis = 2:3, keepdims = FALSE, name = paste0('pool', i))
concat_lst <- append(concat_lst, pooli)
}
h_pool <- mx.symbol.concat(data = concat_lst, num.args = length(filter_sizes), dim = 1, name = 'h_pool')
h_drop <- mx.symbol.Dropout(data = h_pool, p = 0.5, name = 'h_drop')
fc1 <- mx.symbol.FullyConnected(data = h_drop, num.hidden = 1, name = 'fc1')
logistic_pred <- mx.symbol.sigmoid(data = fc1, name = 'logistic_pred')
eps <- 1e-8
ce_loss_pos <- mx.symbol.broadcast_mul(mx.symbol.log(logistic_pred + eps), label)
ce_loss_neg <- mx.symbol.broadcast_mul(mx.symbol.log(1 - logistic_pred + eps), 1 - label)
ce_loss_mean <- 0 - mx.symbol.mean(ce_loss_pos + ce_loss_neg)
ce_loss <- mx.symbol.MakeLoss(ce_loss_mean, name = 'ce_loss')
– Note the change in dimensions:
mx.symbol.infer.shape(embed1, data = c(494, 1, 20))$out.shapes
## $embed1_output
## [1] 50 494 1 20
mx.symbol.infer.shape(embed2, data = c(494, 1, 20))$out.shapes
## $embed2_output
## [1] 1 494 50 20
Language-related tasks are usually grouped under the name natural language processing (NLP); because the rules of language are far more complex than those of images, NLP is a highly challenging area of artificial intelligence.
The bag-of-words model, an intuitive and simple method of analysis, was one of the earliest approaches put to practical use in this field. After our series of experiments you will find that it actually performs better than you might expect, so services such as spam filtering may well still use it today: its efficiency and low computational cost make it well suited to simple tasks.
– In more complex tasks, however, we must preserve word order, so we can no longer numericalize text with one-hot encoding. Word-embedding models are therefore today's mainstream for text processing, and the convolutional network built on top of them solves the synonym and phrase problems very elegantly, with large performance gains over the traditional bag-of-words model on many complex tasks.
– Note also that word embeddings can be applied with many variations: whether to allow the embedding weights to be updated depends on your task, the corpus used to train the embeddings (wiki vs. PubMed) also affects performance, and we can even build a model that composes a word's vector from its letters. These many variations are among the hot topics of current research!
– In NLP tasks we keep stressing that word order is meaningful, so we need a structure that can record contextual information while extracting features. We will cover this memory-equipped structure next week and apply it to our task to see whether performance improves.