Machine Learning and Algorithms

林嶔 (Lin, Chin)

Lesson 13: Introduction to Machine Learning 4 (Decision Trees and Random Forests)

Section 1: Decision Trees (1)

– One of the simplest supervised machine learning methods is the decision tree. It involves very little mathematics: the computer simply performs a large amount of computation, searching over candidate splits, to carry out the classification.

F01
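– Before turning to any package, here is a minimal sketch of this idea on the built-in iris data. The two cut-points (2.45 and 1.75) are illustrative values chosen for this sketch, not values produced by the code later in this lesson:

# A decision tree is just a sequence of if-else rules on single variables;
# the computer's job is to search for good cut-points.
data(iris)
manual.rule <- ifelse(iris$Petal.Length <= 2.45, 'setosa',
                      ifelse(iris$Petal.Width <= 1.75, 'versicolor', 'virginica'))
table(manual.rule, iris$Species)   # how well do the hand-written rules classify?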

Section 1: Decision Trees (2)

F02

F03

Section 1: Decision Trees (3)

data(iris)

#Split data
set.seed(1)

Train.sample <- sample(1:150, 100, replace = FALSE)

Train.data <- iris[Train.sample,]
Test.data <- iris[-Train.sample,]

# Find the optimal cut-point for each predictor by exhaustive search

optimal.cut_points <- data.frame(var = colnames(Train.data)[1:4],
                                 cut = numeric(4),
                                 pval = numeric(4))
for (i in 1:4) {
  unique.values = unique(sort(Train.data[,i]))
  pval.list = numeric(length(unique.values)-1)
  for (j in 1:length(pval.list)) {
    # Candidate cut-point: the midpoint between two adjacent observed values
    potential.cut_points = (unique.values[j] + unique.values[j+1])/2
    names(pval.list)[j] = potential.cut_points
    X = Train.data[,i] > potential.cut_points
    Y = Train.data[,5]
    pval.list[j] = fisher.test(X, Y)$p.value
  }
  # Keep the cut-point whose split is most strongly associated with Species
  optimal.cut_points[i,2] = names(pval.list)[which.min(pval.list)]
  optimal.cut_points[i,3] = min(pval.list)
}

print(optimal.cut_points)
##            var  cut         pval
## 1 Sepal.Length 5.55 1.053168e-15
## 2  Sepal.Width 2.95 2.195292e-07
## 3 Petal.Length 2.45 1.398479e-26
## 4  Petal.Width  0.8 1.398479e-26

Section 1: Decision Trees (4)

library(party)

tree.model <- ctree(formula = Species ~ ., data = Train.data)
tree.model
## 
##   Conditional inference tree with 3 terminal nodes
## 
## Response:  Species 
## Inputs:  Sepal.Length, Sepal.Width, Petal.Length, Petal.Width 
## Number of observations:  100 
## 
## 1) Petal.Length <= 1.9; criterion = 1, statistic = 92.735
##   2)*  weights = 32 
## 1) Petal.Length > 1.9
##   3) Petal.Width <= 1.6; criterion = 1, statistic = 48.709
##     4)*  weights = 35 
##   3) Petal.Width > 1.6
##     5)*  weights = 33
sort(Train.data[,3])   # note the large gap between 1.9 and 3.0, right where the tree splits
##   [1] 1.1 1.2 1.3 1.3 1.3 1.3 1.3 1.4 1.4 1.4 1.4 1.4 1.4 1.4 1.4 1.4 1.4 1.5
##  [19] 1.5 1.5 1.5 1.5 1.5 1.5 1.5 1.6 1.6 1.6 1.6 1.6 1.7 1.9 3.0 3.3 3.5 3.5
##  [37] 3.7 3.8 3.9 3.9 3.9 4.0 4.0 4.1 4.2 4.2 4.2 4.3 4.3 4.4 4.4 4.4 4.5 4.5
##  [55] 4.5 4.5 4.5 4.5 4.5 4.6 4.7 4.7 4.7 4.7 4.8 4.8 4.8 4.8 4.9 4.9 5.0 5.0
##  [73] 5.1 5.1 5.1 5.1 5.1 5.1 5.2 5.2 5.3 5.4 5.4 5.5 5.6 5.6 5.6 5.7 5.7 5.8
##  [91] 5.8 5.9 6.0 6.1 6.1 6.1 6.3 6.4 6.7 6.9

Section 1: Decision Trees (5)

pred.y <- predict(tree.model, Test.data[,1:4])
table(pred.y, Test.data[,5])
##             
## pred.y       setosa versicolor virginica
##   setosa         18          0         0
##   versicolor      0         14         3
##   virginica       0          0        15
plot(tree.model)

Section 1: Decision Trees (6)

dat <- read.csv("ECG_train.csv", header = TRUE, fileEncoding = 'CP950', stringsAsFactors = FALSE, na.strings = "")

– Here are some example snippets showing how to apply ctree to outcome (dependent) variables of different types.

  1. Example with a continuous outcome:
subdat <- dat[!(dat[,'K'] %in% NA) & !(dat[,'Rate'] %in% NA) & !(dat[,'AGE'] %in% NA), c('K', 'Rate', 'AGE')]

tree.model <- ctree(formula = K ~ ., data = subdat)
plot(tree.model)

  2. Example with a binary outcome:
subdat <- dat[!(dat[,'LVD'] %in% NA) & !(dat[,'GENDER'] %in% NA) & !(dat[,'Rate'] %in% NA), c('LVD', 'GENDER', 'Rate')]
subdat[,'GENDER'] <- as.factor(subdat[,'GENDER'])
subdat[,'LVD'] <- as.factor(subdat[,'LVD'])

tree.model <- ctree(formula = LVD ~ ., data = subdat)
plot(tree.model)

  3. Example with a multi-class outcome:
subdat <- dat[!(dat[,'AMI'] %in% NA) & !(dat[,'GENDER'] %in% NA) & !(dat[,'AGE'] %in% NA), c('AMI', 'GENDER', 'AGE')]
subdat[,'GENDER'] <- as.factor(subdat[,'GENDER'])
subdat[,'AMI'] <- as.factor(subdat[,'AMI'])

tree.model <- ctree(formula = AMI ~ ., data = subdat)
plot(tree.model)

  4. Example with a survival (time-to-event) outcome (the survival package is loaded for Surv):
library(survival)
subdat <- dat[!(dat[,'time'] %in% NA) & !(dat[,'death'] %in% NA) & !(dat[,'Rate'] %in% NA) & !(dat[,'AGE'] %in% NA), c('time', 'death', 'Rate', 'AGE')]

tree.model <- ctree(formula = Surv(time, death) ~ ., data = subdat)
plot(tree.model)

Section 1: Decision Trees (7)

  1. The argument 【mincriterion】 is the significance criterion: the value of 1 − p-value that must be exceeded before a split is made (0.95 corresponds to a significance level of 0.05)

  2. The argument 【maxdepth】 is the maximum depth of the tree

  3. The argument 【minsplit】 is the minimum number of samples a node must contain before a further split is considered

  4. The argument 【minbucket】 is the minimum number of samples required in each terminal node (leaf)

subdat <- dat[!(dat[,'LVD'] %in% NA) & !(dat[,'GENDER'] %in% NA) & !(dat[,'Rate'] %in% NA), c('LVD', 'GENDER', 'Rate')]
subdat[,'GENDER'] <- as.factor(subdat[,'GENDER'])
subdat[,'LVD'] <- as.factor(subdat[,'LVD'])

tree.model <- ctree(formula = LVD ~ ., data = subdat,
                    controls = ctree_control(mincriterion = 0.95, maxdepth = 2, minsplit = 20, minbucket = 7))
plot(tree.model)

– If you want the predicted probabilities, you can obtain them like this:

prob_list <- predict(tree.model, subdat, type = 'prob')   # a list with one probability vector per observation
prob_y <- do.call('rbind', prob_list)[,1]                 # probability of the first class

Exercise 1: A Complete Data-Science Experiment

– Try adjusting the parameters: you must find a combination that gives the best performance on the validation set (measured here by AUC).

library(mice)

subdat <- dat[!(dat[,"LVD"] %in% NA), c(-1, -2, -4, -5)]   # keep rows with a non-missing outcome; drop unused columns

subdat[,'LVD'] <- as.factor(subdat[,'LVD'])
subdat[,'GENDER'] <- as.factor(subdat[,'GENDER'])
for (i in 1:31) {subdat[,paste0('rhythm.', i)] <- as.factor(subdat[,paste0('rhythm.', i)])}

used_dat.x <- subdat[,-1]
# Impute missing predictors with a single CART-based imputation
mice_dat <- mice(used_dat.x, m = 1, maxit = 5, meth = 'cart', seed = 123, printFlag = FALSE)
impute_dat.x <- complete(mice_dat, action = 1)

set.seed(0)
all_idx <- 1:nrow(subdat)

# 60% training, 20% validation, 20% test
train_idx <- sample(all_idx, nrow(subdat) * 0.6)
valid_idx <- sample(all_idx[!all_idx %in% train_idx], nrow(subdat) * 0.2)
test_idx <- all_idx[!all_idx %in% c(train_idx, valid_idx)]

train_X <- impute_dat.x[train_idx,]
valid_X <- impute_dat.x[valid_idx,]
test_X <- impute_dat.x[test_idx,]

train_Y <- subdat[train_idx,"LVD"]
valid_Y <- subdat[valid_idx,"LVD"]
test_Y <- subdat[test_idx,"LVD"]

Exercise 1: Solution

library(pROC)

result <- data.frame(mincriterion = rep(c(0.95, 0.99), each = 3), maxdepth = 4:6, valid_auc = NA)

for (i in 1:nrow(result)) {
  
  tree.model <- ctree(formula = train_Y ~ ., data = train_X,
                      controls = ctree_control(mincriterion = result[i,'mincriterion'], maxdepth = result[i,'maxdepth']))
  
  prob_list <- predict(tree.model, valid_X, type = 'prob')
  prob_y <- do.call('rbind', prob_list)[,1]
  roc_valid <- roc(valid_Y ~ prob_y)
  result[i,'valid_auc'] <- roc_valid[['auc']]
  
}

result
##   mincriterion maxdepth valid_auc
## 1         0.95        4 0.7446314
## 2         0.95        5 0.7362981
## 3         0.95        6 0.7362981
## 4         0.99        4 0.7446314
## 5         0.99        5 0.7446314
## 6         0.99        6 0.7446314
best_pos <- which.max(result[,'valid_auc'])

best.tree.model <- ctree(formula = train_Y ~ ., data = train_X,
                      controls = ctree_control(mincriterion = result[best_pos,'mincriterion'], maxdepth = result[best_pos,'maxdepth']))

prob_list <- predict(best.tree.model, test_X, type = 'prob')
prob_y <- do.call('rbind', prob_list)[,1]
  
roc_curve <- roc(test_Y ~ prob_y)
plot(roc_curve)
text(0.5, 0.5, paste0('AUC = ', formatC(roc_curve[['auc']], 4, format = 'f')), col = 'red')

Exercise 1: Extension

– Let's try the Classification & Regression Trees (CART) implementation provided by the package 「rpart」:

library(rpart)
library(rpart.plot)

cart.model <- rpart(formula = train_Y ~ ., data = train_X)
prp(cart.model, faclen = 0, fallen.leaves = TRUE, shadow.col = "gray", extra = 2)  

Section 2: Random Forests (1)

– However, the decision tree has an important weakness: our training sample is the result of a single draw, so sampling error gives it characteristics that do not necessarily match the population. Could these spurious characteristics cause the decision tree to get stuck in a 「local optimum」?

– A natural remedy is to resample the training data many times and grow one small tree on each resample. Suppose we have successfully grown 100 such trees; at prediction time we pass the new sample through all 100 trees and let them vote on its predicted class.

– This kind of random resampling from the original sample is called 「Bagging」 (bootstrap aggregating) in machine learning and 「Bootstrapping」 in statistics; it is a completely random sampling scheme.

– A contrasting approach to 「Bagging」/「Bootstrapping」 is 「Boosting」, in which each new resample depends on the results of the previous models; we will introduce it at the end of this course.
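– As a minimal sketch of the bagging-plus-voting idea, consider the following toy example. It uses the built-in iris data and only 5 trees; these choices are assumptions made for illustration, not the procedure applied to the ECG data below:

library(party)

data(iris)
set.seed(1)

n_tree <- 5
bagged.trees <- list()

# Bagging: grow each tree on a bootstrap resample (sampling WITH replacement)
for (i in 1:n_tree) {
  boot_idx <- sample(1:nrow(iris), nrow(iris), replace = TRUE)
  bagged.trees[[i]] <- ctree(Species ~ ., data = iris[boot_idx,])
}

# Voting: every tree predicts a class and the majority wins
votes <- sapply(bagged.trees, function(m) {as.character(predict(m, newdata = iris))})
vote.pred <- apply(votes, 1, function(x) {names(which.max(table(x)))})
table(vote.pred, iris$Species)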

F04

Section 2: Random Forests (2)

– The best parameters we just found were 【mincriterion = 0.95】 and 【maxdepth = 4】; let's try randomly drawing 70% of the training samples each time and growing 10 trees.

library(pROC)

set.seed(0)

tree.model_list <- list()

for (i in 1:10) {
  
  # Draw a 70% subsample of the training data (without replacement) for this tree
  new_train_idx <- sample(1:nrow(train_X), 0.7 * nrow(train_X))
  sub_train_X <- train_X[new_train_idx,]
  sub_train_Y <- train_Y[new_train_idx]
  
  tree.model_list[[i]] <- ctree(formula = sub_train_Y ~ ., data = sub_train_X,
                                controls = ctree_control(mincriterion = 0.95, maxdepth = 4))

}

– Let's first take a look at three of these trees:

plot(tree.model_list[[1]], main = 'Tree 1')

plot(tree.model_list[[2]], main = 'Tree 2')

plot(tree.model_list[[3]], main = 'Tree 3')

Section 2: Random Forests (3)

– In fact, many of the predictors are highly correlated with one another, so letting different trees draw on different pieces of information is beneficial.

– In addition, the cut-points should not be fixed so rigidly; combining multiple trees effectively resolves this problem.

final_pred <- 0

# Average each tree's predicted probability of the first class
for (i in 1:length(tree.model_list)) {
  
  prob_list <- predict(tree.model_list[[i]], test_X, type = 'prob')
  prob_y <- do.call('rbind', prob_list)[,1]
  final_pred <- final_pred + prob_y / length(tree.model_list)
  
}

roc_curve <- roc(test_Y ~ final_pred)
plot(roc_curve)
text(0.5, 0.5, paste0('AUC = ', formatC(roc_curve[['auc']], 4, format = 'f')), col = 'red')

Section 2: Random Forests (4)

library(randomForest)

rf.model <- randomForest(formula = train_Y ~ ., data = train_X, ntree = 50)

pred.y <- predict(rf.model, test_X, type = 'prob')[,2]

roc_curve <- roc(test_Y ~ pred.y)
plot(roc_curve)
text(0.5, 0.5, paste0('AUC = ', formatC(roc_curve[['auc']], 4, format = 'f')), col = 'red')

Exercise 2: A Complete Data-Science Experiment

– Try adjusting the parameters: you must find a combination that gives the best performance on the validation set (measured here by AUC).

– The candidate parameters include 【replace = c(TRUE, FALSE)】, 【sampsize = c(1000, 1400, 1800)】, and 【nodesize = c(10, 20, 50)】

rf.model <- randomForest(formula = train_Y ~ ., data = train_X, ntree = 50,
                         replace = TRUE, sampsize = 1000, nodesize = 10)

pred.y <- predict(rf.model, test_X, type = 'prob')[,2]

roc_curve <- roc(test_Y ~ pred.y)
print(roc_curve)
## 
## Call:
## roc.formula(formula = test_Y ~ pred.y)
## 
## Data: pred.y in 229 controls (test_Y 0) < 194 cases (test_Y 1).
## Area under the curve: 0.7709

Exercise 2: Solution

library(pROC)

result <- data.frame(replace = rep(c(TRUE, FALSE), each = 9),
                     sampsize = rep(c(600, 800, 1000), each = 3),
                     nodesize = c(10, 20, 50),
                     valid_auc = NA)

for (i in 1:nrow(result)) {
  
  rf.model <- randomForest(formula = train_Y ~ ., data = train_X, ntree = 50,
                           replace = result[i,'replace'],
                           sampsize = result[i,'sampsize'],
                           nodesize =  result[i,'nodesize'])
  
  pred.y <- predict(rf.model, valid_X, type = 'prob')[,2]
  roc_valid <- roc(valid_Y ~ pred.y)
  result[i,'valid_auc'] <- roc_valid[['auc']]
  
}

result
##    replace sampsize nodesize valid_auc
## 1     TRUE      600       10 0.7856914
## 2     TRUE      600       20 0.7819712
## 3     TRUE      600       50 0.7903732
## 4     TRUE      800       10 0.7815476
## 5     TRUE      800       20 0.7768658
## 6     TRUE      800       50 0.7840316
## 7     TRUE     1000       10 0.7901328
## 8     TRUE     1000       20 0.7941163
## 9     TRUE     1000       50 0.7773008
## 10   FALSE      600       10 0.7801168
## 11   FALSE      600       20 0.7771864
## 12   FALSE      600       50 0.7853365
## 13   FALSE      800       10 0.7852106
## 14   FALSE      800       20 0.7765224
## 15   FALSE      800       50 0.7881525
## 16   FALSE     1000       10 0.7846955
## 17   FALSE     1000       20 0.7794643
## 18   FALSE     1000       50 0.7860806
best_pos <- which.max(result[,'valid_auc'])

best.rf.model <- randomForest(formula = train_Y ~ ., data = train_X, ntree = 50,
                              replace = result[best_pos,'replace'],
                              sampsize = result[best_pos,'sampsize'],
                              nodesize =  result[best_pos,'nodesize'])

pred.y <- predict(best.rf.model, test_X, type = 'prob')[,2]

roc_curve <- roc(test_Y ~ pred.y)
plot(roc_curve)
text(0.5, 0.5, paste0('AUC = ', formatC(roc_curve[['auc']], 4, format = 'f')), col = 'red')

Section 3: Variable Importance Analysis (1)

– Because a random forest is composed of a large number of randomly grown trees, it can produce a ranking of variable importance.

used_count <- varUsed(best.rf.model)   # how many times each variable was used for splitting in the forest
names(used_count) <- colnames(train_X)
sort(used_count, decreasing = TRUE)
##    Axes_T      Rate      QRSd       QTc        PR  Axes_QRS       AGE        QT 
##       321       297       294       270       260       255       251       250 
##    Axes_P rhythm.17 rhythm.14    GENDER rhythm.26 rhythm.29 rhythm.11 rhythm.18 
##       248        95        69        64        62        62        60        51 
## rhythm.27  rhythm.2  rhythm.8  rhythm.1 rhythm.25  rhythm.4 rhythm.21  rhythm.7 
##        50        46        42        41        38        34        34        32 
## rhythm.15  rhythm.6 rhythm.20 rhythm.13  rhythm.3 rhythm.28 rhythm.19  rhythm.9 
##        31        30        28        22        21        19        18        17 
## rhythm.16 rhythm.10 rhythm.30 rhythm.31  rhythm.5 rhythm.12 rhythm.22 rhythm.24 
##        15        14        14        12        11         9         9         7 
## rhythm.23 
##         4

– In the same way we can also obtain the usage counts for each individual tree, which can be used to derive information such as the variability of usage across trees.

count_table <- varUsed(best.rf.model, by.tree = TRUE)
rownames(count_table) <- colnames(train_X)
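– For instance, here is a minimal sketch of summarizing how usage varies across trees, assuming the count_table object created above (the chosen summary statistics are just one possibility):

# Per-variable mean and standard deviation of usage counts across the 50 trees
usage.summary <- data.frame(mean_count = apply(count_table, 1, mean),
                            sd_count = apply(count_table, 1, sd))
head(usage.summary[order(usage.summary$mean_count, decreasing = TRUE),], 10)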

Section 3: Variable Importance Analysis (2)

importance <- importance(best.rf.model)
varImpPlot(best.rf.model)

Section 3: Variable Importance Analysis (3)

– The function 「partialPlot」 lets us examine the relationship between a predictor and the outcome, which helps illustrate the advantage of non-linear models.

– Note in particular that it is not recommended to draw these plots on the training sample, because the forest's cut-points were determined from the training sample.

partialPlot(best.rf.model, pred.data = valid_X, x.var = AGE, which.class = 1)

partialPlot(best.rf.model, pred.data = valid_X, x.var = Axes_T, which.class = 1)

partialPlot(best.rf.model, pred.data = valid_X, x.var = Rate, which.class = 1)

Exercise 3: Variable Selection (1)

– However, screening variables with a linear model is not ideal: some variables may not be unrelated to the outcome at all, but merely related in a non-linear way.

– There is a further problem: suppose two variables together can predict the outcome, yet neither shows any relationship when examined alone, and the relationship appears only when they are considered jointly. Given the logic by which a decision tree operates, wouldn't it be unable to handle this case?

– In other words, the two variables interact with each other.

set.seed(0)

x1 <- rep(0:1, 50) + rnorm(100, sd = 0.5)
x2 <- rep(0:1, each = 50) + rnorm(100, sd = 0.5)
y <- rep(0:1, 50) != rep(0:1, each = 50)   # y is the XOR of the two underlying group indicators
y <- as.factor(y)

plot(x1, x2, col = as.integer(y) + 1, pch = 19)

library(party)
tree.model <- ctree(y ~ x1 + x2)
tree.model
## 
##   Conditional inference tree with 1 terminal nodes
## 
## Response:  y 
## Inputs:  x1, x2 
## Number of observations:  100 
## 
## 1)*  weights = 100
glm(y ~ x1 + x2, family = 'binomial')
## 
## Call:  glm(formula = y ~ x1 + x2, family = "binomial")
## 
## Coefficients:
## (Intercept)           x1           x2  
##    -0.01014     -0.10878      0.13794  
## 
## Degrees of Freedom: 99 Total (i.e. Null);  97 Residual
## Null Deviance:       138.6 
## Residual Deviance: 138.3     AIC: 144.3
glm(y ~ x1 + x2 + x1:x2, family = 'binomial')
## 
## Call:  glm(formula = y ~ x1 + x2 + x1:x2, family = "binomial")
## 
## Coefficients:
## (Intercept)           x1           x2        x1:x2  
##      -1.926        3.898        3.703       -6.961  
## 
## Degrees of Freedom: 99 Total (i.e. Null);  96 Residual
## Null Deviance:       138.6 
## Residual Deviance: 82.87     AIC: 90.87

Exercise 3: Variable Selection (2)

– We must first rank the variables by importance and check whether the top N variables alone are enough for accurate prediction (this is evaluated on the validation set).

– We then analyze these variables with a multilayer perceptron to obtain the final model.

Exercise 3: Solution (1)

rf.model <- randomForest(formula = train_Y ~ ., data = train_X, ntree = 50)
importance <- importance(best.rf.model)
importance <- importance[order(importance[,'MeanDecreaseGini'], decreasing = TRUE),]   # drops to a named vector sorted by importance

result <- data.frame(n_var = 1:length(importance), valid_auc = NA)

for (i in 1:nrow(result)) {
  
  # Refit a forest using only the top n_var most important variables
  rf.model <- randomForest(formula = train_Y ~ ., data = train_X[,names(importance)[1:result[i,'n_var']],drop = FALSE], ntree = 50)
  pred.y <- predict(rf.model, valid_X, type = 'prob')[,2]
  roc_valid <- roc(valid_Y ~ pred.y)
  result[i,'valid_auc'] <- roc_valid[['auc']]

}

plot(valid_auc ~ n_var, data = result, type = 'l', ylim = c(0.5, 1))

# Penalize model size slightly (0.001 AUC per extra variable) so that a smaller model is preferred
best_n.var <- which.max(result[,'valid_auc'] - result[,'n_var'] * 0.001)
best_n.var
## [1] 22

Exercise 3: Solution (2)

library(nnet)

train_Y_val <- as.integer(train_Y) - 1
fit_nnet <- nnet(train_Y_val ~ ., data = train_X[,names(importance)[1:best_n.var],drop = FALSE],
                 size = 20, maxit = 100, entropy = TRUE, decay = 0.001, trace = FALSE, MaxNWts = 1e5)

pred.y <- predict(fit_nnet, test_X[,names(importance)[1:best_n.var]])

roc_curve <- roc(test_Y ~ pred.y)
plot(roc_curve)
text(0.5, 0.5, paste0('AUC = ', formatC(roc_curve[['auc']], 4, format = 'f')), col = 'red')

Lesson Summary

– Even though we do not know the true relationship between X and Y, these two machine learning methods still helped us make effective predictions.

– In fact, the random forest is not the final form of tree-based models: the Gradient Boosting Machine, developed in the early 2000s, is an even stronger classifier than the random forest, and we will introduce it later in this course.

– They can also be used to screen variables, so that complex non-linear models (the multilayer perceptron, the SVM we will teach later, and so on) can be fitted with a small number of variables.

– This takes us one step further along our complete data-science workflow!