物件識別模型實驗

林嶔 (Lin, Chin)

Lesson 12

訓練一個物件識別模型(1)

– 對於圖像分類模型,大多是透過Pooling把這個特徵圖縮減成1×1×n的特徵,並對其做Softmax regression的輸出

– 對於YOLO model,他是對每一個Grid都要有一系列輸出,因此他就是不做Pooling直接再用1×1的卷積核進行運算,從而輸出7×7×m的輸出,最終我們再對7×7×m的部分做解碼(Decode)。

F12_1

訓練一個物件識別模型(2)

F12_2

– 所以假設最終的特徵圖大小為7×7×n,那在皮卡丘識別任務中YOLO結構的輸出將會是7×7×6。

  1. 可信度:這是一個必須介於0至1的數值,所以需要經過Sigmoid轉換後方能輸出

  2. y座標(row)的「相對」位置:這也是一個必須介於0至1的數值

  3. x座標(column)的「相對」位置:這也是一個必須介於0至1的數值

  4. 寬度(x軸):這是一個必須大於0的數值,經過指數轉換可以把任意數轉換成符合需求,但常規的做法是把原始值經過對數轉換,而輸出值是不做任何處理的

  5. 高度(y軸):這個部分與寬度相同

  6. 類別1的可能性:在YOLO v1中,是將類別1至類別N的可能性一起做Softmax,但在YOLO v3中將這個部分全部改成Sigmoid輸出,以允許多重標籤的物件

  1. 為什麼要使用「相對」位置而非「絕對」位置?

  2. 為什麼在高度/寬度的輸出不是使用ReLU或是指數轉換,而是將原始值做對數處理後而輸出值保持原樣?

訓練一個物件識別模型(3)

– 讓我們先從這裡下載一個做圖像識別的MobileNet v2模型,我們先試試它的圖像分類效果:

library(mxnet)
library(imager)
library(jpeg)
library(OpenImageR)
library(magrittr)

# Load a pre-trained MobileNet v2 model (symbol + parameters, checkpoint 0)
# and its ImageNet class labels. NOTE(review): the original comment said
# "residual network", but the file loaded here is MobileNet v2.

mobile_model <- mx.model.load("model/mobilev2", 0)
label_names <- readLines("model/synset.txt", encoding = "UTF-8")

#Define image processing functions

preproc.image <- function(im, width = 224, height = 224, method = 'bilinear') {
  # Resize an image, rescale it to [0, 255], subtract the per-channel
  # ImageNet means (R, G, B order), and append a batch dimension so the
  # result matches mxnet's (width, height, channel, num) layout.
  # Assumes `im` holds values in [0, 1] as returned by readJPEG -- TODO confirm.
  channel_means <- c(123.68, 116.78, 103.94)
  arr <- resizeImage(image = im, width = width, height = height, method = method)
  arr <- as.array(arr) * 255
  for (ch in seq_along(channel_means)) {
    arr[, , ch] <- arr[, , ch] - channel_means[ch]
  }
  dim(arr) <- c(width, height, 3, 1)
  arr
}

# Read the test image (JPEG decoded to a numeric array in [0, 1])

img <- readJPEG("image/4.jpg")

# Pre-processing: resize to 224x224, subtract channel means, add batch dim

normed <- preproc.image(img)

# Display the original image (no margins, no axes)

par(mar = rep(0, 4))
plot(NA, xlim = c(0.04, 0.96), ylim = c(0.04, 0.96), xaxt = "n", yaxt = "n", bty = "n")
rasterImage(img, 0, 0, 1, 1, interpolate = FALSE)

# Predict class probabilities on CPU and print the top-1 label

prob <- predict(mobile_model, X = normed, ctx = mx.cpu())
cat(paste0(label_names[which.max(prob)], ': ', formatC(max(prob), 4, format = 'f'), '\n'))
## n02497673 Madagascar cat, ring-tailed lemur, Lemur catta: 1.0000

訓練一個物件識別模型(4)

– 這裡的函數「DWCONV_function」以及「CONV_function」都只是在原先的基礎上再增加卷積層,關鍵是函數「YOLO_map_function」的部分。

– 根據剛剛的定義你會發現除了高度/寬度的輸出(第4項與第5項)不需要經過Sigmoid轉換之外,剩下都需要,所以我們先用函數「mx.symbol.SliceChannel」把他們拆開,最後再各自處理過後再用函數「mx.symbol.concat」合併。

# Libraries

library(mxnet)
library(magrittr)

## Define the model architecture
## Use pre-trained model and fine tuning

# Load MobileNet v2 (pre-trained symbol + parameters, checkpoint 0)

Pre_Trained_model <- mx.model.load('model/mobilev2', 0)

# Get the internal output: enumerate every node of the symbol graph and
# pick an intermediate feature map to serve as the detection backbone

Mobile_symbol <- Pre_Trained_model$symbol

Mobile_All_layer <- Mobile_symbol$get.internals()

basic_out <- which(Mobile_All_layer$outputs == 'conv6_3_linear_bn_output') %>% Mobile_All_layer$get.output()

# mx.symbol.infer.shape(basic_out, data = c(256, 256, 3, 7))$out.shapes
# conv6_3_linear_bn_output out shape = 8 8 320 n (if input shape = 256 256 3 n)

# Convolution layer for specific mission and training new parameters

# 1. Additional some architecture for better learning

DWCONV_function <- function (indata, num_filters = 256, Inverse_coef = 6, residual = TRUE, name = 'lvl1', stage = 1) {
  # Inverted-residual depthwise-separable block (MobileNet v2 style):
  # 1x1 expansion -> 3x3 depthwise -> 1x1 projection, each followed by
  # BatchNorm, with leaky ReLU after the first two stages. When `residual`
  # is TRUE the projection output is added back onto the input; otherwise a
  # final leaky ReLU is applied instead.
  expanded <- num_filters * Inverse_coef
  tag <- paste0(name, '_', stage)
  
  # 1x1 expansion
  x <- mx.symbol.Convolution(data = indata, kernel = c(1, 1), stride = c(1, 1), pad = c(0, 0),
                             no.bias = TRUE, num.filter = expanded,
                             name = paste0(tag, '_expend'))
  x <- mx.symbol.BatchNorm(data = x, fix_gamma = FALSE, name = paste0(tag, '_expend_bn'))
  x <- mx.symbol.LeakyReLU(data = x, act.type = 'leaky', slope = 0.1, name = paste0(tag, '_expend_relu'))
  
  # 3x3 depthwise convolution (num.group == num.filter)
  x <- mx.symbol.Convolution(data = x, kernel = c(3, 3), stride = c(1, 1), pad = c(1, 1),
                             no.bias = TRUE, num.filter = expanded, num.group = expanded,
                             name = paste0(tag, '_dwise'))
  x <- mx.symbol.BatchNorm(data = x, fix_gamma = FALSE, name = paste0(tag, '_dwise_bn'))
  x <- mx.symbol.LeakyReLU(data = x, act.type = 'leaky', slope = 0.1, name = paste0(tag, '_dwise_relu'))
  
  # 1x1 linear projection back to num_filters channels
  x <- mx.symbol.Convolution(data = x, kernel = c(1, 1), stride = c(1, 1), pad = c(0, 0),
                             no.bias = TRUE, num.filter = num_filters,
                             name = paste0(tag, '_restore'))
  x <- mx.symbol.BatchNorm(data = x, fix_gamma = FALSE, name = paste0(tag, '_restore_bn'))
  
  if (residual) {
    return(mx.symbol.broadcast_plus(lhs = indata, rhs = x, name = paste0(tag, '_block')))
  }
  
  mx.symbol.LeakyReLU(data = x, act.type = 'leaky', slope = 0.1, name = paste0(tag, '_restore_relu'))
}

CONV_function <- function (indata, num_filters = 256, name = 'lvl1', stage = 1) {
  # Plain 1x1 convolution -> BatchNorm -> ReLU, used as a channel mixer
  # ahead of the YOLO output head.
  prefix <- paste0(name, '_', stage)
  
  out <- mx.symbol.Convolution(data = indata, kernel = c(1, 1), stride = c(1, 1), pad = c(0, 0),
                               no.bias = TRUE, num.filter = num_filters,
                               name = paste0(prefix, '_conv'))
  out <- mx.symbol.BatchNorm(data = out, fix_gamma = FALSE, name = paste0(prefix, '_bn'))
  mx.symbol.Activation(data = out, act.type = 'relu', name = paste0(prefix, '_relu'))
}

YOLO_map_function <- function (indata, final_map = 6, num_box = 1, drop = 0.2, name = 'lvl1') {
  # Build the YOLO output head: dropout -> 1x1 linear conv to `final_map`
  # channels -> per-channel activation -> re-concatenation.
  # Per-box channel layout (see lecture text): 1 = confidence, 2 = y offset,
  # 3 = x offset, 4 = width, 5 = height, 6+ = class probabilities.
  # Channels 4-5 stay linear (log-space width/height targets); every other
  # channel is squashed through a sigmoid.
  
  num_feature <- final_map / num_box  # channels per anchor box; assumed integral
  
  dp <- mx.symbol.Dropout(data = indata, p = drop, name = paste0(name, '_drop'))
  
  conv <- mx.symbol.Convolution(data = dp, kernel = c(1, 1), stride = c(1, 1), pad = c(0, 0),
                                no.bias = FALSE, num.filter = final_map, name = paste0(name, '_linearmap'))
  
  inter_split <- mx.symbol.SliceChannel(data = conv, num_outputs = final_map,
                                        axis = 1, squeeze_axis = FALSE, name = paste0(name, "_inter_split"))
  
  new_list <- list()
  
  for (k in 1:final_map) {
    # BUG FIX: the modulo must be taken against the channels-per-box count
    # (final_map / num_box), not num_box. With the default num_box = 1 the
    # old expression (k %% num_box) was always 0, so the sigmoid was applied
    # to EVERY channel -- including width/height (4 and 5), which the model
    # spec requires to remain linear. This now mirrors the per-box feature
    # indexing used by YOLO_loss_function.
    if (!(k %% num_feature) %in% c(4:5)) {
      new_list[[k]] <- mx.symbol.Activation(inter_split[[k]], act.type = 'sigmoid', name = paste0(name, "_yolomap_", k))
    } else {
      new_list[[k]] <- inter_split[[k]]
    }
  }
  
  yolomap <- mx.symbol.concat(data = new_list, num.args = final_map, dim = 1, name = paste0(name, "_yolomap"))
  
  return(yolomap)
  
}

# Stack two inverted-residual blocks and a 1x1 conv mixer on top of the
# backbone feature map, then attach the 6-channel YOLO output head
yolo_conv_1 <- DWCONV_function(indata = basic_out, num_filters = 320, Inverse_coef = 3, residual = TRUE, name = 'yolo', stage = 1)
yolo_conv_2 <- DWCONV_function(indata = yolo_conv_1, num_filters = 320, Inverse_coef = 3, residual = TRUE, name = 'yolo', stage = 2)
yolo_conv_3 <- CONV_function(indata = yolo_conv_2, num_filters = 320, name = 'yolo', stage = 3)

yolomap <- YOLO_map_function(indata = yolo_conv_3, final_map = 6, drop = 0.2, name = 'final')

訓練一個物件識別模型(5)

F12_3

  1. 第一個部分是對於y座標與x座標的損失

  2. 第二個部分是對於寬度與高度的損失

  3. 第三個部分是可信度該找出而答錯的損失

  4. 第四個部分是可信度該略過而答錯的損失

  5. 第五個部分是類別n的可能性的損失

– 另外,他還有\(\lambda_{coord}\)以及\(\lambda_{noobj}\)兩個參數,根據YOLO v1 paper的建議分別被設定為5以及0.5,這是因為物件識別是一個極度類別不平衡的任務,所以給予正向樣本較高的權重。

訓練一個物件識別模型(6)

– 當然我們對y座標與x座標的部分是沒有辦法做修正的。

# 2. Custom loss function

MSE_loss_function <- function (indata, inlabel, obj, lambda) {
  # Masked mean-squared-error loss: the squared prediction error is kept
  # only where the objectness mask `obj` is non-zero, averaged over all
  # four axes, then scaled by `lambda`.
  err <- mx.symbol.broadcast_minus(lhs = indata, rhs = inlabel)
  sq_err <- mx.symbol.square(data = err)
  masked <- mx.symbol.broadcast_mul(lhs = obj, rhs = sq_err)
  mean_err <- mx.symbol.mean(data = masked, axis = 0:3, keepdims = FALSE)
  mean_err * lambda
}

CE_loss_function <- function (indata, inlabel, obj, lambda, eps = 1e-4) {
  # Masked binary cross-entropy loss. `eps` guards the logs against
  # predictions of exactly 0 or 1. The log-likelihood is weighted by the
  # mask `obj`, averaged over all four axes, negated, and scaled by
  # `lambda`.
  log_p <- mx.symbol.log(data = indata + eps)
  log_not_p <- mx.symbol.log(data = 1 - indata + eps)
  pos_term <- mx.symbol.broadcast_mul(lhs = log_p, rhs = inlabel)
  neg_term <- mx.symbol.broadcast_mul(lhs = log_not_p, rhs = 1 - inlabel)
  masked_ll <- mx.symbol.broadcast_mul(lhs = obj, rhs = pos_term + neg_term)
  mean_ll <- mx.symbol.mean(data = masked_ll, axis = 0:3, keepdims = FALSE)
  0 - mean_ll * lambda
}

YOLO_loss_function <- function (indata, inlabel, final_map = 6, num_box = 1, lambda = 10, weight_classification = 0.2, name = 'yolo') {
  # Total YOLO loss: slices prediction and label maps channel-by-channel and
  # sums, per box, (a) CE on the confidence channel for object cells (scaled
  # by lambda) and for no-object cells (scaled by 1), (b) masked MSE on the
  # coordinate/size channels 2-5 (scaled by lambda), and (c) CE on the class
  # channels 6+ (scaled by lambda * weight_classification). The objectness
  # label (channel 1 of each box) is used as the mask throughout.
  
  num_feature <- final_map/num_box
  
  my_loss <- 0
  
  # BUG FIX: these two names used paste() (default sep = " "), producing
  # symbol names containing a space, e.g. "yolo _yolomap_split". Switched to
  # paste0() to match the naming convention used everywhere else in this
  # file. SliceChannel layers hold no trainable parameters, so renaming them
  # does not affect checkpoint compatibility.
  yolomap_split <- mx.symbol.SliceChannel(data = indata, num_outputs = final_map, 
                                          axis = 1, squeeze_axis = FALSE, name = paste0(name, '_yolomap_split'))
  
  label_split <- mx.symbol.SliceChannel(data = inlabel, num_outputs = final_map, 
                                        axis = 1, squeeze_axis = FALSE, name = paste0(name, '_label_split'))
  
  for (j in 1:num_box) {
    for (k in 1:num_feature) {
      # Class channels (k > 5) get a reduced weight; box channels get 1
      if (k %in% 1:5) {weight <- 1} else {weight <- weight_classification}
      if (!k %in% c(2:5)) {
        if (k == 1) {
          # Confidence channel: CE on object cells (mask = objectness label,
          # weight lambda) ...
          my_loss <- my_loss + CE_loss_function(indata = yolomap_split[[(j-1)*num_feature+k]],
                                                inlabel = label_split[[(j-1)*num_feature+k]],
                                                obj = label_split[[(j-1)*num_feature+1]],
                                                lambda = lambda * weight,
                                                eps = 1e-4)
          # ... plus CE on no-object cells (mask = 1 - objectness, weight 1)
          my_loss <- my_loss + CE_loss_function(indata = yolomap_split[[(j-1)*num_feature+k]],
                                                inlabel = label_split[[(j-1)*num_feature+k]],
                                                obj = 1 - label_split[[(j-1)*num_feature+1]],
                                                lambda = 1,
                                                eps = 1e-4)
        } else {
          # Class channels: CE on object cells only
          my_loss <- my_loss + CE_loss_function(indata = yolomap_split[[(j-1)*num_feature+k]],
                                                inlabel = label_split[[(j-1)*num_feature+k]],
                                                obj = label_split[[(j-1)*num_feature+1]],
                                                lambda = lambda * weight,
                                                eps = 1e-4)
        }
      } else {
        # Coordinate / size channels (2-5): masked MSE on object cells
        my_loss <- my_loss + MSE_loss_function(indata = yolomap_split[[(j-1)*num_feature+k]],
                                               inlabel = label_split[[(j-1)*num_feature+k]],
                                               obj = label_split[[(j-1)*num_feature+1]],
                                               lambda = lambda * weight)
      }
    }
  }
  
  return(my_loss)
  
}

# Label placeholder supplied at training time (same channel layout as yolomap)
label <- mx.symbol.Variable(name = "label")

# Assemble the total loss and wrap it so mxnet treats it as the training head
yolo_loss <- YOLO_loss_function(indata = yolomap, inlabel = label, final_map = 6, num_box = 1, lambda = 10, weight_classification = 0.2, name = 'yolo')

final_yolo_loss <- mx.symbol.MakeLoss(data = yolo_loss)

訓練一個物件識別模型(7)

– 先讓我們從這裡下載所需要的檔案

– 如果你想弄懂怎樣從JPG檔案變成我們現在需要的格式,請你參考MxNetR-YOLO/pikachu/code/1. Processing data的過程

# Libraries

library(OpenImageR)
library(jpeg)
library(mxnet)
library(imager)

# Load data (Training set): a list of raw JPEG byte vectors and a data frame
# of bounding-box annotations with relative ([0, 1]) coordinates

load('data/train_img_list.RData')
load('data/train_box_info.RData')

# Box columns: obj_name, col_left, col_right, row_bot, row_top, prob, img_id
head(train_box_info)
##   obj_name  col_left col_right   row_bot   row_top prob img_id
## 1  pikachu 0.6267570 0.7256063 0.4658268 0.3013253    1      1
## 2  pikachu 0.5070340 0.5993253 0.4963081 0.3682864    1      2
## 3  pikachu 0.5904536 0.6917713 0.5608004 0.3917792    1      3
## 4  pikachu 0.5722729 0.6571676 0.5396996 0.4144326    1      4
## 5  pikachu 0.3893552 0.5016431 0.4850163 0.3470082    1      5
## 6  pikachu 0.3819232 0.4916472 0.5595707 0.4213461    1      6
# First bytes of image 1: the JPEG magic number (ff d8 ff ...)
head(train_img_list[[1]], 20)
##  [1] ff d8 ff e0 00 10 4a 46 49 46 00 01 01 00 00 01 00 01 00 00
Show_img <- function (img, box_info = NULL, show_prob = FALSE, col_bbox = '#FFFFFF00', col_label = '#FF0000FF',
                      show_grid = FALSE, n.grid = 8, col_grid = '#0000FFFF') {
  # Display an image with optional bounding boxes / labels and an optional
  # n.grid x n.grid overlay. box_info columns (relative [0, 1] coordinates):
  # obj_name, col_left, col_right, row_bot, row_top, prob; an optional `col`
  # column overrides col_label per box. Note the y-axis is flipped so row 0
  # is at the top.
  
  require(imager)
  
  par(mar = rep(0, 4))
  plot(NA, xlim = c(0.04, 0.96), ylim = c(0.96, 0.04), xaxt = "n", yaxt = "n", bty = "n")
  img <- (img - min(img))/(max(img) - min(img))  # normalize to [0, 1] for as.raster
  img <- as.raster(img)
  rasterImage(img, 0, 1, 1, 0, interpolate = FALSE)
  
  if (!is.null(box_info)) {
    
    # BUG FIX: this clamping originally ran BEFORE the is.null() check, so
    # calling Show_img() with the default box_info = NULL raised an
    # "incorrect number of dimensions" error. Clamp box edges to the [0, 1]
    # frame only when box_info is actually supplied.
    box_info[box_info[,2] < 0, 2] <- 0
    box_info[box_info[,3] > 1, 3] <- 1
    box_info[box_info[,4] > 1, 4] <- 1
    box_info[box_info[,5] < 0, 5] <- 0
    
    for (i in 1:nrow(box_info)) {
      if (is.null(box_info$col[i])) {COL_LABEL <- col_label} else {COL_LABEL <- box_info$col[i]}
      if (show_prob) {
        TEXT <- paste0(box_info[i,1], ' (', formatC(box_info[i,6]*100, 0, format = 'f'), '%)')
      } else {
        TEXT <- box_info[i,1]
      }
      # Scale label/line sizes with the box width (floored at 0.05)
      size <- max(box_info[i,3] - box_info[i,2], 0.05)
      # Filled tag behind the label text, anchored at the box's top-left
      rect(xleft = box_info[i,2], xright = box_info[i,2] + 0.04*sqrt(size)*nchar(TEXT),
           ybottom = box_info[i,5] + 0.08*sqrt(size), ytop = box_info[i,5],
           col = COL_LABEL, border = COL_LABEL, lwd = 0)
      text(x = box_info[i,2] + 0.02*sqrt(size) * nchar(TEXT),
           y = box_info[i,5] + 0.04*sqrt(size),
           labels = TEXT,
           col = 'white', cex = 1.5*sqrt(size), font = 2)
      # The bounding box itself
      rect(xleft = box_info[i,2], xright = box_info[i,3],
           ybottom = box_info[i,4], ytop = box_info[i,5],
           col = col_bbox, border = COL_LABEL, lwd = 5*sqrt(size))
    }
  }
  
  if (show_grid) {
    for (i in 1:n.grid) {
      if (i != n.grid) {
        # Interior grid lines only (skip the outer frame)
        abline(a = i/n.grid, b = 0, col = col_grid, lwd = 12/n.grid)
        abline(v = i/n.grid, col = col_grid, lwd = 12/n.grid)
      }
      for (j in 1:n.grid) {
        # Cell coordinates shown as (row, column)
        text((i-0.5)/n.grid, (j-0.5)/n.grid, paste0('(', j, ', ', i, ')'), col = col_grid, cex = 8/n.grid)
      }
    }
  }
  
}

# Demo: decode one training image from its raw JPEG bytes and draw its boxes
img_id <- 1

resized_img <- readJPEG(train_img_list[[img_id]])
sub_BOX_INFOS <- train_box_info[train_box_info$img_id %in% img_id,]

Show_img(img = resized_img, box_info = sub_BOX_INFOS, show_grid = FALSE)