Machine Learning and Algorithms

林嶔 (Lin, Chin)

Lesson 13: Introduction to Machine Learning 4 (Decision Trees and Random Forests)

Section 1: Decision Trees (1)

– The decision tree is one of the simplest supervised machine learning methods: it involves very little mathematics and relies mainly on having the computer perform a large amount of computation to carry out the classification.

F01

Section 1: Decision Trees (2)

F02

F03

Section 1: Decision Trees (3)

data(iris)

#Split data
set.seed(1)

Train.sample <- sample(1:150, 100, replace = FALSE)

Train.data <- iris[Train.sample,]
Test.data <- iris[-Train.sample,]

#Find optimal cut-points

optimal.cut_points <- data.frame(var = colnames(Train.data)[1:4],
                                 cut = numeric(4),
                                 pval = numeric(4))
for (i in 1:4) {
  unique.values = unique(sort(Train.data[,i]))
  chisq.list = numeric(length(unique.values)-1)
  for (j in 1:length(chisq.list)) {
    # Candidate cut-point: the midpoint between two adjacent observed values
    potential.cut_points = (unique.values[j] + unique.values[j+1])/2
    names(chisq.list)[j] = potential.cut_points
    X = Train.data[,i] > potential.cut_points
    Y = Train.data[,5]
    # Test the association between the dichotomized predictor and the species labels
    chisq.list[j] = fisher.test(X, Y)$p.value
  }
  # Keep the cut-point with the smallest p-value for this variable
  optimal.cut_points[i,2] = names(chisq.list)[which.min(chisq.list)]
  optimal.cut_points[i,3] = min(chisq.list)
}

print(optimal.cut_points)
##            var  cut         pval
## 1 Sepal.Length 5.55 1.053168e-15
## 2  Sepal.Width 2.95 2.195292e-07
## 3 Petal.Length 2.45 1.398479e-26
## 4  Petal.Width  0.8 1.398479e-26
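
– As a quick sanity check (an extra step, not part of the original cut-point search), we can cross-tabulate the best cut found above against the species labels to see how cleanly it separates the training data:

table(Train.data[,'Petal.Length'] > 2.45, Train.data[,'Species'])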

Section 1: Decision Trees (4)

library(party)

tree.model <- ctree(formula = Species ~ ., data = Train.data)
tree.model
## 
##   Conditional inference tree with 3 terminal nodes
## 
## Response:  Species 
## Inputs:  Sepal.Length, Sepal.Width, Petal.Length, Petal.Width 
## Number of observations:  100 
## 
## 1) Petal.Length <= 1.9; criterion = 1, statistic = 92.735
##   2)*  weights = 32 
## 1) Petal.Length > 1.9
##   3) Petal.Width <= 1.6; criterion = 1, statistic = 48.709
##     4)*  weights = 35 
##   3) Petal.Width > 1.6
##     5)*  weights = 33
– Sorting the Petal.Length values in the training data shows why this split is so clean: there is a gap between 1.9 and 3.0, so any threshold in between (the midpoint 2.45 found earlier, or the 1.9 reported by ctree) separates exactly the same observations.

sort(Train.data[,3])
##   [1] 1.1 1.2 1.3 1.3 1.3 1.3 1.3 1.4 1.4 1.4 1.4 1.4 1.4 1.4 1.4 1.4 1.4 1.5
##  [19] 1.5 1.5 1.5 1.5 1.5 1.5 1.5 1.6 1.6 1.6 1.6 1.6 1.7 1.9 3.0 3.3 3.5 3.5
##  [37] 3.7 3.8 3.9 3.9 3.9 4.0 4.0 4.1 4.2 4.2 4.2 4.3 4.3 4.4 4.4 4.4 4.5 4.5
##  [55] 4.5 4.5 4.5 4.5 4.5 4.6 4.7 4.7 4.7 4.7 4.8 4.8 4.8 4.8 4.9 4.9 5.0 5.0
##  [73] 5.1 5.1 5.1 5.1 5.1 5.1 5.2 5.2 5.3 5.4 5.4 5.5 5.6 5.6 5.6 5.7 5.7 5.8
##  [91] 5.8 5.9 6.0 6.1 6.1 6.1 6.3 6.4 6.7 6.9

Section 1: Decision Trees (5)

pred.y <- predict(tree.model, Test.data[,1:4])
table(pred.y, Test.data[,5])
##             
## pred.y       setosa versicolor virginica
##   setosa         18          0         0
##   versicolor      0         14         3
##   virginica       0          0        15
plot(tree.model)
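
– The overall test accuracy can also be read directly off the confusion matrix above; in code (a small addition, here (18 + 14 + 15) / 50 = 0.94):

# Proportion of correctly classified test samples
sum(diag(table(pred.y, Test.data[,5]))) / nrow(Test.data)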

Section 1: Decision Trees (6)

dat <- read.csv("ECG_train.csv", header = TRUE, fileEncoding = 'CP950', stringsAsFactors = FALSE, na.strings = "")

– Here is some example syntax showing how to use ctree() with dependent variables of different types.

  1. An example with a continuous dependent variable:
subdat <- dat[!(dat[,'K'] %in% NA) & !(dat[,'Rate'] %in% NA) & !(dat[,'AGE'] %in% NA), c('K', 'Rate', 'AGE')]

tree.model <- ctree(formula = K ~ ., data = subdat)
plot(tree.model)

  2. An example of binary classification:
subdat <- dat[!(dat[,'LVD'] %in% NA) & !(dat[,'GENDER'] %in% NA) & !(dat[,'Rate'] %in% NA), c('LVD', 'GENDER', 'Rate')]
subdat[,'GENDER'] <- as.factor(subdat[,'GENDER'])
subdat[,'LVD'] <- as.factor(subdat[,'LVD'])

tree.model <- ctree(formula = LVD ~ ., data = subdat)
plot(tree.model)

  3. An example of a multi-class task:
subdat <- dat[!(dat[,'AMI'] %in% NA) & !(dat[,'GENDER'] %in% NA) & !(dat[,'AGE'] %in% NA), c('AMI', 'GENDER', 'AGE')]
subdat[,'GENDER'] <- as.factor(subdat[,'GENDER'])
subdat[,'AMI'] <- as.factor(subdat[,'AMI'])

tree.model <- ctree(formula = AMI ~ ., data = subdat)
plot(tree.model)

  4. An example of survival analysis:
library(survival)  # Surv() is provided by the survival package
subdat <- dat[!(dat[,'time'] %in% NA) & !(dat[,'death'] %in% NA) & !(dat[,'Rate'] %in% NA) & !(dat[,'AGE'] %in% NA), c('time', 'death', 'Rate', 'AGE')]

tree.model <- ctree(formula = Surv(time, death) ~ ., data = subdat)
plot(tree.model)

Section 1: Decision Trees (7)

  1. The parameter 【mincriterion】 sets the significance requirement for splitting: a node is split only when 1 - p-value of the association test exceeds this value (0.95 corresponds to a significance level of 0.05).

  2. The parameter 【maxdepth】 is the maximum depth of the tree.

  3. The parameter 【minsplit】 is the minimum number of samples a node must contain before a further split is considered.

  4. The parameter 【minbucket】 is the minimum number of samples required in each resulting node.

subdat <- dat[!(dat[,'LVD'] %in% NA) & !(dat[,'GENDER'] %in% NA) & !(dat[,'Rate'] %in% NA), c('LVD', 'GENDER', 'Rate')]
subdat[,'GENDER'] <- as.factor(subdat[,'GENDER'])
subdat[,'LVD'] <- as.factor(subdat[,'LVD'])

tree.model <- ctree(formula = LVD ~ ., data = subdat,
                    controls = ctree_control(mincriterion = 0.95, maxdepth = 2, minsplit = 20, minbucket = 7))
plot(tree.model)

– If you want the predicted probabilities, you can obtain them like this:

prob_list <- predict(tree.model, subdat, type = 'prob')
# Each element of prob_list is a vector of class probabilities ordered by factor level,
# so column 1 corresponds to the first level of LVD
prob_y <- do.call('rbind', prob_list)[,1]

Exercise 1: A Complete Data Science Experiment

– Try adjusting the parameters: you need to find a combination of parameters that gives the highest accuracy on the validation set.

library(mice)

subdat <- dat[!(dat[,"LVD"] %in% NA), c(-1, -2, -4, -5)]

subdat[,'LVD'] <- as.factor(subdat[,'LVD'])
subdat[,'GENDER'] <- as.factor(subdat[,'GENDER'])
for (i in 1:31) {subdat[,paste0('rhythm.', i)] <- as.factor(subdat[,paste0('rhythm.', i)])}

used_dat.x <- subdat[,-1]
mice_dat <- mice(used_dat.x, m = 1, maxit = 5, meth = 'cart', seed = 123, printFlag = FALSE)
impute_dat.x <- mice::complete(mice_dat, action = 1)

set.seed(0)
all_idx <- 1:nrow(subdat)

train_idx <- sample(all_idx, nrow(subdat) * 0.6)
valid_idx <- sample(all_idx[!all_idx %in% train_idx], nrow(subdat) * 0.2)
test_idx <- all_idx[!all_idx %in% c(train_idx, valid_idx)]

train_X <- impute_dat.x[train_idx,]
valid_X <- impute_dat.x[valid_idx,]
test_X <- impute_dat.x[test_idx,]

train_Y <- subdat[train_idx,"LVD"]
valid_Y <- subdat[valid_idx,"LVD"]
test_Y <- subdat[test_idx,"LVD"]

Exercise 1: Answer

library(pROC)

result <- data.frame(mincriterion = rep(c(0.95, 0.99), each = 3), maxdepth = 4:6, valid_auc = NA)

for (i in 1:nrow(result)) {
  
  tree.model <- ctree(formula = train_Y ~ ., data = train_X,
                      controls = ctree_control(mincriterion = result[i,'mincriterion'], maxdepth = result[i,'maxdepth']))
  
  prob_list <- predict(tree.model, valid_X, type = 'prob')
  prob_y <- do.call('rbind', prob_list)[,1]
  roc_valid <- roc(valid_Y ~ prob_y)
  result[i,'valid_auc'] <- roc_valid[['auc']]
  
}

result
##   mincriterion maxdepth valid_auc
## 1         0.95        4 0.7446314
## 2         0.95        5 0.7362981
## 3         0.95        6 0.7362981
## 4         0.99        4 0.7446314
## 5         0.99        5 0.7446314
## 6         0.99        6 0.7446314
best_pos <- which.max(result[,'valid_auc'])

best.tree.model <- ctree(formula = train_Y ~ ., data = train_X,
                      controls = ctree_control(mincriterion = result[best_pos,'mincriterion'], maxdepth = result[best_pos,'maxdepth']))

prob_list <- predict(best.tree.model, test_X, type = 'prob')
prob_y <- do.call('rbind', prob_list)[,1]
  
roc_curve <- roc(test_Y ~ prob_y)
plot(roc_curve)
text(0.5, 0.5, paste0('AUC = ', formatC(roc_curve[['auc']], 4, format = 'f')), col = 'red')

Exercise 1: Extension

– Let's now try the Classification & Regression Trees (CART) implementation provided by the 'rpart' package:

library(rpart)
library(rpart.plot)

cart.model <- rpart(formula = train_Y ~ ., data = train_X)
prp(cart.model, faclen = 0, fallen.leaves = TRUE, shadow.col = "gray", extra = 2)  
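
– To compare with the conditional inference trees above, one could also evaluate this CART model on the test set. A minimal sketch, reusing test_X, test_Y, and the pROC workflow from Exercise 1; note that predict() with type = 'prob' returns one column of class probabilities per factor level, so column 1 again corresponds to the first level of LVD:

library(pROC)

cart_prob <- predict(cart.model, test_X, type = 'prob')[,1]  # probability of the first level of LVD

cart_roc <- roc(test_Y ~ cart_prob)
plot(cart_roc)
text(0.5, 0.5, paste0('AUC = ', formatC(cart_roc[['auc']], 4, format = 'f')), col = 'red')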

Section 2: Random Forests (1)

– However, decision trees have an important weakness: our training sample is the result of a single draw, and sampling error inevitably gives it characteristics that may not match the population. Could these spurious characteristics cause the decision tree to fall into a "local optimum"?

– Next, suppose we manage to grow 100 small trees. At prediction time we pass a sample through all 100 trees and let them vote on the predicted class for that sample.

– This way of drawing random samples from the original sample is called "Bagging" in machine learning and "Bootstrapping" in statistics; it is a completely random sampling scheme.

– A contrasting approach to "Bagging" / "Bootstrapping" is "Boosting", in which each round of resampling depends on the results of the previous rounds; we will introduce it at the end of this course.

F04
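
– Before building our own version, here is a minimal sketch of the voting idea using the iris data from Section 1 (it assumes Train.data and Test.data are still in the workspace; the names boot.trees and rf.votes are made up for this illustration). Each tree is grown on a bootstrap resample drawn with replacement, and the final prediction is the majority vote; the code that follows in this section draws a 70% subsample without replacement instead, but the voting logic is the same.

library(party)

set.seed(123)

n.trees <- 25
boot.trees <- list()

# Grow each tree on a bootstrap resample (drawn with replacement) of the training data
for (b in 1:n.trees) {
  boot_idx <- sample(1:nrow(Train.data), nrow(Train.data), replace = TRUE)
  boot.trees[[b]] <- ctree(Species ~ ., data = Train.data[boot_idx,])
}

# Each tree casts one vote per test sample; the class with the most votes wins
vote.matrix <- sapply(boot.trees, function (tree) {as.character(predict(tree, Test.data[,1:4]))})
rf.votes <- apply(vote.matrix, 1, function (v) {names(which.max(table(v)))})

table(rf.votes, Test.data[,5])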

Section 2: Random Forests (2)

– The best parameters we just found were 【mincriterion = 0.95】 and 【maxdepth = 4】, so let's try randomly drawing 70% of the training samples each time and growing 10 trees.

library(pROC)

set.seed(0)

tree.model_list <- list()

for (i in 1:10) {
  
  new_train_idx <- sample(1:nrow(train_X), 0.7 * nrow(train_X))
  sub_train_X <- train_X[new_train_idx,]
  sub_train_Y <- train_Y[new_train_idx]
  
  tree.model_list[[i]] <- ctree(formula = sub_train_Y ~ ., data = sub_train_X,
                                controls = ctree_control(mincriterion = 0.95, maxdepth = 4))

}

– First, let's take a look at 3 of these trees:

plot(tree.model_list[[1]], main = 'Tree 1')