Lesson 6 異常醫學影像預測挑戰

下載資料

請你先到資料科學競賽網站選擇第六個任務，登入後下載4份資料，包含之前任務沒有的「supplement.zip」，裡面放著很多影像資料。

– 與之前所有任務相同，這個任務的資料下載是需要經過申請的，請你找助教申請帳號。

讀取資料

最開始我們還是需要先將「train.csv」、「test.csv」以及「sample_submission.csv」讀進來：

library(data.table)

train_dat <- fread('train.csv', data.table = FALSE)
test_dat <- fread('test.csv', data.table = FALSE)
submit_dat <- fread('sample_submission.csv', data.table = FALSE)

如果你是使用伺服器版的Rstudio，那直接上傳上傳「supplement.zip」就能得到資料夾「image」了。或是你可以先解壓縮放到你的project的資料夾下

二元分類任務

這樣的任務最直覺的想法就是建立一個二元分類神經網路，新加坡團隊所發表的Development and Validation of a Deep Learning System for Diabetic Retinopathy and Related Eye Diseases Using Retinal Images From Multiethnic Populations With Diabetes就有用類似這種方式進行：

– 「Additionally, convolutional networks were trained to reject images for insufficient image quality, as well as for being invalid input (i.e. not being a retinal image). For the latter model, a broad variety of natural images was used as the negative class, in training. Images rejected by either of these models are considered as being recommended for further referral, for the purposes of computing the experimental results. Once an image is analyzed, a report will be generated for the users.」-出自Supplementary page 7。

既然要用卷積神經網路，我們在這裡提供了一個已經預訓練好的模型列表

– 這個範例使用「resnet-50」進行，這是一個50層深的ResNet，非常標準的深度神經網路。我們要建構一個新的神經網路將最後一層的FC從分1000類轉變成分2類：

library(magrittr)
library(mxnet)

res_model <- mx.model.load(prefix = "resnet-50", iteration = 0)
all_layers <- res_model$symbol$get.internals()
flatten0_output <- which(all_layers$outputs == 'flatten0_output') %>% all_layers$get.output()

fc1 <- mx.symbol.FullyConnected(data = flatten0_output, num_hidden = 2, name = 'fc1')
softmax <- mx.symbol.SoftmaxOutput(data = fc1, name = 'softmax')

產生了新的深度神經網路後，我們將他們合併成一個大陣列：

library(OpenImageR)

image_dir <- 'image/'

y_array <- model.matrix(~ -1 + factor(train_dat[,'anomaly'])) %>% t()
img_array <-  array(0, dim = c(256, 256, 3, nrow(train_dat)))

for (i in 1:nrow(train_dat)) {
  
  if (i == 1) {t0 <- Sys.time()}
  
  current_prc_path <- paste0(image_dir, train_dat[i, 'UID'], '.jpeg')
  img_array[,,,i] <- readImage(current_prc_path)
  
  if (i %% 100 == 0) {
    
    batch_time <- as.double(difftime(Sys.time(), t0, units = "secs"))
    batch_speed <- i / batch_time
    estimated_mins <- (nrow(train_dat) - i) / batch_speed / 60
    
    message(i, "/", nrow(train_dat), ": Speed: ", formatC(batch_speed, 2, format = "f"),
            " samples/sec | Remaining time: ", formatC(estimated_mins, 1, format = "f"), " mins")
    
  }
  
}

由於在這個任務中必須給出確切的分類，我們一定是需要驗證組來幫忙的，讓我們隨機切出20%的資料：

set.seed(0)

train_ids <- sample(1:nrow(train_dat), nrow(train_dat) * 0.8)

train_y_array <- y_array[,train_ids]
train_img_array <- img_array[,,,train_ids]

valid_y_array <- y_array[,-train_ids]
valid_img_array <- img_array[,,,-train_ids]

接著，我們在開始訓練之前需要取得模型權重參數，我們可以將最後一層以外的部分填入resnet-50的參數，並以這為基礎開始訓練任務：

new_arg <- mxnet:::mx.model.init.params(symbol = softmax,
                                        input.shape = list(data = c(256, 256, 3, 32)),
                                        output.shape = NULL,
                                        initializer = mxnet:::mx.init.uniform(0.01),
                                        ctx = mx.cpu())

for (i in 1:length(new_arg$arg.params)) {
  pos <- which(names(res_model$arg.params) == names(new_arg$arg.params)[i])
  if (length(pos) == 1) {
    if (all.equal(dim(res_model$arg.params[[pos]]), dim(new_arg$arg.params[[i]])) == TRUE) {
      new_arg$arg.params[[i]] <- res_model$arg.params[[pos]]
    }
  }
}

for (i in 1:length(new_arg$aux.params)) {
  pos <- which(names(res_model$aux.params) == names(new_arg$aux.params)[i])
  if (length(pos) == 1) {
    if (all.equal(dim(res_model$aux.params[[pos]]), dim(new_arg$aux.params[[i]])) == TRUE) {
      new_arg$aux.params[[i]] <- res_model$aux.params[[pos]]
    }
  }
}

可以開始訓練了，如果你的電腦沒有GPU，請你將「mx.gpu()」改成「mx.cpu()」

my.eval.metric.mlogloss <- mx.metric.custom(
  name = "m-logloss", 
  function(real, pred) {
    real1 = as.numeric(as.array(real))
    pred1 = as.numeric(as.array(pred))
    pred1[pred1 <= 1e-6] = 1e-6
    pred1[pred1 >= 1 - 1e-6] = 1 - 1e-6
    return(-mean(real1 * log(pred1), na.rm = TRUE))
  }
)

mx.set.seed(0)

my_model <- mx.model.FeedForward.create(symbol = softmax,
                                       X = train_img_array, y = train_y_array,
                                       optimizer = "sgd", learning.rate = 0.005, momentum = 0.9,
                                       array.batch.size = 16, num.round = 10,
                                       arg.params = new_arg$arg.params, aux.params = new_arg$aux.params,
                                       ctx = mx.gpu(0),
                                       eval.metric = my.eval.metric.mlogloss)

我們先在驗證組上進行驗證，效果肯定是非常好的，可以做出1.0000的AUC，而切點大約也會是在0.5上下：

library(pROC)

pred_valid <-  predict(model = my_model, X = valid_img_array, ctx = mx.gpu(0))
roc_valid <- roc(response = valid_y_array[2,], predictor = pred_valid[2,])

best_pos <- which.max(roc_valid$sensitivities + roc_valid$specificities)
best_cut <- roc_valid$thresholds[best_pos]

現在，我們能直接用這個網路來預測，首先先產生大陣列：

image_dir <- 'image/'
test_img_array <-  array(0, dim = c(256, 256, 3, nrow(test_dat)))

for (i in 1:nrow(test_dat)) {
  
  if (i == 1) {t0 <- Sys.time()}
  
  current_prc_path <- paste0(image_dir, test_dat[i, 'UID'], '.jpeg')
  test_img_array[,,,i] <- readImage(current_prc_path)
  
  if (i %% 100 == 0) {
    
    batch_time <- as.double(difftime(Sys.time(), t0, units = "secs"))
    batch_speed <- i / batch_time
    estimated_mins <- (nrow(test_dat) - i) / batch_speed / 60
    
    message(i, "/", nrow(test_dat), ": Speed: ", formatC(batch_speed, 2, format = "f"),
            " samples/sec | Remaining time: ", formatC(estimated_mins, 1, format = "f"), " mins")
    
  }
  
}

讓我們直接進行預測，最簡單的神經網路能幫助取得的成績實在是不太好(balance accuracy = 0.7172)

pred_test <-  predict(model = my_model, X = test_img_array, ctx = mx.gpu(0))
submit_dat[,'anomaly'] <- (pred_test[2,] > best_cut)
fwrite(submit_dat, file = 'my_submission.csv', na = '', row.names = FALSE, quote = FALSE)

無監督學習

在深度學習理論與實務-變分自編碼器與異常檢測中，我們有提到可以用變分自編碼器進行異常檢測。

– 其實用自編碼器也可以，重點是中間的隱編碼的數目要小。

– 先讓我們建構一個Model architecture：

# Encoder

data <- mx.symbol.Variable('data')

conv1 <- mx.symbol.Convolution(data = data, kernel = c(8, 8), stride = c(8, 8), num_filter = 8, name = 'conv1')
relu1 <- mx.symbol.Activation(data = conv1, act_type = "relu", name = 'relu1')

conv2 <- mx.symbol.Convolution(data = relu1, kernel = c(4, 4), stride = c(4, 4), num_filter = 32, name = 'conv2')
relu2 <- mx.symbol.Activation(data = conv2, act_type = "relu", name = 'relu2')

conv3 <- mx.symbol.Convolution(data = relu2, kernel = c(4, 4), stride = c(4, 4), num_filter = 128, name = 'conv3')
relu3 <- mx.symbol.Activation(data = conv3, act_type = "relu", name = 'relu3')

encoder <- mx.symbol.Convolution(data = relu3, kernel = c(2, 2), stride = c(2, 2), num_filter = 256, name = 'encoder')

# Decoder

deconv4 <- mx.symbol.Deconvolution(data = encoder, kernel = c(2, 2), stride = c(2, 2), num_filter = 128, name = 'deconv4')
relu4 <- mx.symbol.Activation(data = deconv4, act_type = "relu", name = 'relu4')

deconv5 <- mx.symbol.Deconvolution(data = relu4, kernel = c(4, 4), stride = c(4, 4), num_filter = 32, name = 'deconv5')
relu5 <- mx.symbol.Activation(data = deconv5, act_type = "relu", name = 'relu5')

deconv6 <- mx.symbol.Deconvolution(data = relu5, kernel = c(4, 4), stride = c(4, 4), num_filter = 8, name = 'deconv6')
relu6 <- mx.symbol.Activation(data = deconv6, act_type = "relu", name = 'relu6')

decoder <- mx.symbol.Deconvolution(data = relu6, kernel = c(8, 8), stride = c(8, 8), num_filter = 3, name = 'decoder')

# MSE loss

label <- mx.symbol.Variable(name = 'label')

residual <- mx.symbol.broadcast_minus(lhs = label, rhs = decoder) 
square_residual <- mx.symbol.square(data = residual)
mean_square_residual <- mx.symbol.mean(data = square_residual, axis = 0:3, keepdims = FALSE)
mse_loss <- mx.symbol.MakeLoss(data = mean_square_residual, name = 'mse')

在訓練這個模型時，我們只要使用所有的胸部x光就好，我們馬上就能進行訓練了。

nomaly_array <- train_img_array[,,,which(train_y_array[2,] == 0)]

my_optimizer <- mx.opt.create(name = "adam", learning.rate = 1e-3, beta1 = 0.9, beta2 = 0.999, epsilon = 1e-08, wd = 0)

my.eval.metric.loss <- mx.metric.custom(
  name = "my-loss", 
  function(real, pred) {
    return(as.array(pred))
  }
)

mx.set.seed(0)

my_model <- mx.model.FeedForward.create(symbol = mse_loss, optimizer = my_optimizer,
                                        X = nomaly_array, y = nomaly_array,
                                        array.batch.size = 16, num.round = 50,
                                        ctx = mx.gpu(0),
                                        eval.metric = my.eval.metric.loss)

my_model$symbol <- decoder

同樣我們需要用驗證組資料來找最佳切點，讓我們試試看。如果以AUC來說，他一定會比本來的差，但我們來看看他神奇的效果。

library(pROC)

pred_img_array <-  predict(model = my_model, X = valid_img_array, ctx = mx.gpu(0))
diff_img_array <- (pred_img_array - valid_img_array)^2
pred_valid <- apply(diff_img_array, 4, mean)
  
roc_valid <- roc(response = valid_y_array[2,], predictor = pred_valid)

best_pos <- which.max(roc_valid$sensitivities + roc_valid$specificities)
best_cut <- roc_valid$thresholds[best_pos]

現在，我們能直接用這個網路來預測，首先先產生大陣列：

image_dir <- 'image/'
test_img_array <-  array(0, dim = c(256, 256, 3, nrow(test_dat)))

for (i in 1:nrow(test_dat)) {
  
  if (i == 1) {t0 <- Sys.time()}
  
  current_prc_path <- paste0(image_dir, test_dat[i, 'UID'], '.jpeg')
  test_img_array[,,,i] <- readImage(current_prc_path)
  
  if (i %% 100 == 0) {
    
    batch_time <- as.double(difftime(Sys.time(), t0, units = "secs"))
    batch_speed <- i / batch_time
    estimated_mins <- (nrow(test_dat) - i) / batch_speed / 60
    
    message(i, "/", nrow(test_dat), ": Speed: ", formatC(batch_speed, 2, format = "f"),
            " samples/sec | Remaining time: ", formatC(estimated_mins, 1, format = "f"), " mins")
    
  }
  
}

讓我們直接進行預測，將預測前與預測後相減，得到的平均誤差就可以拿來作為判斷之用，最終我們能獲得0.7238的balance accuracy。

pred_img_array <-  predict(model = my_model, X = test_img_array, ctx = mx.gpu(0))
diff_img_array <- (pred_img_array - test_img_array)^2
pred_test <- apply(diff_img_array, 4, mean)

submit_dat[,'anomaly'] <- (pred_test > best_cut)
fwrite(submit_dat, file = 'my_submission.csv', na = '', row.names = FALSE, quote = FALSE)

你應該有注意到，關鍵其實是切點的選擇，如果我們修正切點改成認為可以接受10%的胸部X光被誤判，最終我們能獲得0.7327的balance accuracy。

best_cut <- quantile(pred_valid[valid_y_array[2,] == 0], 0.9)

submit_dat[,'anomaly'] <- (pred_test > best_cut)
fwrite(submit_dat, file = 'my_submission.csv', na = '', row.names = FALSE, quote = FALSE)

要不要再試試看更好的訓練方法呀？可以參考GANomaly: Semi-Supervised Anomaly Detection via Adversarial Training的做法。
你應該感覺得出來，進步空間還非常的大，請你好好嘗試一下！