林嶔 (Lin, Chin)
Lesson 13 Introduction to Machine Learning 4 (Decision Trees and Random Forests)
– The simplest supervised machine learning method is the decision tree. It involves very little mathematics and relies mainly on large amounts of computation to perform classification.
data(iris)
#Split data
set.seed(1)
Train.sample <- sample(1:150, 100, replace = FALSE)
Train.data <- iris[Train.sample,]
Test.data <- iris[-Train.sample,]
#Find optimal cut-points
optimal.cut_points <- data.frame(var = colnames(Train.data)[1:4],
                                 cut = numeric(4),
                                 pval = numeric(4))
for (i in 1:4) {
  unique.values = unique(sort(Train.data[,i]))
  chisq.list = numeric(length(unique.values)-1)  # p-value for each candidate cut point
  for (j in 1:length(chisq.list)) {
    potential.cut_points = (unique.values[j] + unique.values[j+1])/2
    names(chisq.list)[j] = potential.cut_points
    X = Train.data[,i] > potential.cut_points
    Y = Train.data[,5]
    chisq.list[j] = fisher.test(X, Y)$p.value    # Fisher's exact test of the split vs. Species
  }
  optimal.cut_points[i,2] = as.numeric(names(chisq.list)[which.min(chisq.list)])
  optimal.cut_points[i,3] = min(chisq.list)
}
print(optimal.cut_points)
## var cut pval
## 1 Sepal.Length 5.55 1.053168e-15
## 2 Sepal.Width 2.95 2.195292e-07
## 3 Petal.Length 2.45 1.398479e-26
## 4 Petal.Width 0.8 1.398479e-26
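– The conditional inference tree printed below was presumably fitted with the party package's ctree() on the training split; the exact call is not shown in this excerpt, but a minimal sketch that reproduces this kind of output is:
library(party)

# Conditional inference tree predicting Species from all four measurements
tree.model <- ctree(formula = Species ~ ., data = Train.data)
print(tree.model)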
##
## Conditional inference tree with 3 terminal nodes
##
## Response: Species
## Inputs: Sepal.Length, Sepal.Width, Petal.Length, Petal.Width
## Number of observations: 100
##
## 1) Petal.Length <= 1.9; criterion = 1, statistic = 92.735
## 2)* weights = 32
## 1) Petal.Length > 1.9
## 3) Petal.Width <= 1.6; criterion = 1, statistic = 48.709
## 4)* weights = 35
## 3) Petal.Width > 1.6
## 5)* weights = 33
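– The sorted values listed next appear to be the Petal.Length values of the training split (they show that the optimal cut point 2.45 falls in the gap between 1.9 and 3.0); assuming that is what was printed, they can be reproduced with:
sort(Train.data[,'Petal.Length'])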
## [1] 1.1 1.2 1.3 1.3 1.3 1.3 1.3 1.4 1.4 1.4 1.4 1.4 1.4 1.4 1.4 1.4 1.4 1.5
## [19] 1.5 1.5 1.5 1.5 1.5 1.5 1.5 1.6 1.6 1.6 1.6 1.6 1.7 1.9 3.0 3.3 3.5 3.5
## [37] 3.7 3.8 3.9 3.9 3.9 4.0 4.0 4.1 4.2 4.2 4.2 4.3 4.3 4.4 4.4 4.4 4.5 4.5
## [55] 4.5 4.5 4.5 4.5 4.5 4.6 4.7 4.7 4.7 4.7 4.8 4.8 4.8 4.8 4.9 4.9 5.0 5.0
## [73] 5.1 5.1 5.1 5.1 5.1 5.1 5.2 5.2 5.3 5.4 5.4 5.5 5.6 5.6 5.6 5.7 5.7 5.8
## [91] 5.8 5.9 6.0 6.1 6.1 6.1 6.3 6.4 6.7 6.9
##
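– The confusion matrix below compares the tree's predictions on the test split with the true species; the code is not shown in this excerpt, but a sketch along these lines would produce it:
# Predict the held-out test samples with the fitted tree and cross-tabulate
pred.y <- predict(tree.model, newdata = Test.data)
table(pred.y, Test.data[,'Species'])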
## pred.y setosa versicolor virginica
## setosa 18 0 0
## versicolor 0 14 3
## virginica 0 0 15
dat <- read.csv("ECG_train.csv", header = TRUE, fileEncoding = 'CP950', stringsAsFactors = FALSE, na.strings = "")
– Here are a few example calls so you can see how to use ctree with dependent variables of different types.
library(party)

# Continuous outcome (K): regression tree
subdat <- dat[!(dat[,'K'] %in% NA) & !(dat[,'Rate'] %in% NA) & !(dat[,'AGE'] %in% NA), c('K', 'Rate', 'AGE')]
tree.model <- ctree(formula = K ~ ., data = subdat)
plot(tree.model)
# Binary outcome (LVD): classification tree (the outcome must be a factor)
subdat <- dat[!(dat[,'LVD'] %in% NA) & !(dat[,'GENDER'] %in% NA) & !(dat[,'Rate'] %in% NA), c('LVD', 'GENDER', 'Rate')]
subdat[,'GENDER'] <- as.factor(subdat[,'GENDER'])
subdat[,'LVD'] <- as.factor(subdat[,'LVD'])
tree.model <- ctree(formula = LVD ~ ., data = subdat)
plot(tree.model)
# Binary outcome (AMI): classification tree
subdat <- dat[!(dat[,'AMI'] %in% NA) & !(dat[,'GENDER'] %in% NA) & !(dat[,'AGE'] %in% NA), c('AMI', 'GENDER', 'AGE')]
subdat[,'GENDER'] <- as.factor(subdat[,'GENDER'])
subdat[,'AMI'] <- as.factor(subdat[,'AMI'])
tree.model <- ctree(formula = AMI ~ ., data = subdat)
plot(tree.model)
# Survival outcome: Surv() comes from the survival package
library(survival)

subdat <- dat[!(dat[,'time'] %in% NA) & !(dat[,'death'] %in% NA) & !(dat[,'Rate'] %in% NA) & !(dat[,'AGE'] %in% NA), c('time', 'death', 'Rate', 'AGE')]
tree.model <- ctree(formula = Surv(time, death) ~ ., data = subdat)
plot(tree.model)
The parameter 【mincriterion】 is the confidence level (1 - p-value) a split must exceed; 0.95 corresponds to a significance level of 0.05
The parameter 【maxdepth】 is the maximum depth of the tree
The parameter 【minsplit】 is the minimum number of observations a node must contain before a further split is considered
The parameter 【minbucket】 is the minimum number of observations required in each terminal node
subdat <- dat[!(dat[,'LVD'] %in% NA) & !(dat[,'GENDER'] %in% NA) & !(dat[,'Rate'] %in% NA), c('LVD', 'GENDER', 'Rate')]
subdat[,'GENDER'] <- as.factor(subdat[,'GENDER'])
subdat[,'LVD'] <- as.factor(subdat[,'LVD'])
tree.model <- ctree(formula = LVD ~ ., data = subdat,
                    controls = ctree_control(mincriterion = 0.95, maxdepth = 2, minsplit = 20, minbucket = 7))
plot(tree.model)
– If you want the predicted probabilities, you can obtain them as shown below:
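– The original snippet is not included in this excerpt; a minimal sketch using the LVD tree fitted above (type = 'prob' returns one probability vector per observation):
prob_list <- predict(tree.model, subdat, type = 'prob')
prob_matrix <- do.call('rbind', prob_list)   # bind the list into an n-by-2 matrix of class probabilities
head(prob_matrix)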
– Exercise: try tuning the parameters; you need to find the combination that gives the highest accuracy in the validation set.
library(mice)
subdat <- dat[!(dat[,"LVD"] %in% NA), c(-1, -2, -4, -5)]
subdat[,'LVD'] <- as.factor(subdat[,'LVD'])
subdat[,'GENDER'] <- as.factor(subdat[,'GENDER'])
for (i in 1:31) {subdat[,paste0('rhythm.', i)] <- as.factor(subdat[,paste0('rhythm.', i)])}
used_dat.x <- subdat[,-1]
mice_dat <- mice(used_dat.x, m = 1, maxit = 5, meth = 'cart', seed = 123, printFlag = FALSE)
impute_dat.x <- mice::complete(mice_dat, action = 1)
set.seed(0)
all_idx <- 1:nrow(subdat)
train_idx <- sample(all_idx, nrow(subdat) * 0.6)
valid_idx <- sample(all_idx[!all_idx %in% train_idx], nrow(subdat) * 0.2)
test_idx <- all_idx[!all_idx %in% c(train_idx, valid_idx)]
train_X <- impute_dat.x[train_idx,]
valid_X <- impute_dat.x[valid_idx,]
test_X <- impute_dat.x[test_idx,]
train_Y <- subdat[train_idx,"LVD"]
valid_Y <- subdat[valid_idx,"LVD"]
test_Y <- subdat[test_idx,"LVD"]
library(pROC)
result <- data.frame(mincriterion = rep(c(0.95, 0.99), each = 3), maxdepth = 4:6, valid_auc = NA)
for (i in 1:nrow(result)) {
  tree.model <- ctree(formula = train_Y ~ ., data = train_X,
                      controls = ctree_control(mincriterion = result[i,'mincriterion'], maxdepth = result[i,'maxdepth']))
  prob_list <- predict(tree.model, valid_X, type = 'prob')
  prob_y <- do.call('rbind', prob_list)[,1]
  roc_valid <- roc(valid_Y ~ prob_y)
  result[i,'valid_auc'] <- roc_valid[['auc']]
}
result
## mincriterion maxdepth valid_auc
## 1 0.95 4 0.7446314
## 2 0.95 5 0.7362981
## 3 0.95 6 0.7362981
## 4 0.99 4 0.7446314
## 5 0.99 5 0.7446314
## 6 0.99 6 0.7446314
best_pos <- which.max(result[,'valid_auc'])
best.tree.model <- ctree(formula = train_Y ~ ., data = train_X,
                         controls = ctree_control(mincriterion = result[best_pos,'mincriterion'], maxdepth = result[best_pos,'maxdepth']))
prob_list <- predict(best.tree.model, test_X, type = 'prob')
prob_y <- do.call('rbind', prob_list)[,1]
roc_curve <- roc(test_Y ~ prob_y)
plot(roc_curve)
text(0.5, 0.5, paste0('AUC = ', formatC(roc_curve[['auc']], 4, format = 'f')), col = 'red')
– Let's try the Classification & Regression Trees (CART) implementation provided by the rpart package:
library(rpart)
library(rpart.plot)
cart.model <- rpart(formula = train_Y ~ ., data = train_X)
prp(cart.model, faclen = 0, fallen.leaves = TRUE, shadow.col = "gray", extra = 2)
– However, a single decision tree has an important weakness: our training sample is the result of one random draw, and sampling error inevitably gives it some features that do not match the population. Could these spurious features cause the decision tree to fall into a kind of "local optimum"?
– Next, suppose we have successfully grown 100 small trees; at prediction time we pass a sample through all 100 trees and let them vote on the predicted result.
– This way of randomly resampling from the original sample is called "bagging" (bootstrap aggregating) in machine learning and "bootstrapping" in statistics; it is a completely random sampling method.
– A contrasting method to bagging/bootstrapping is "boosting", which resamples according to the previous results; we will introduce it at the end of this course.
– The best parameters just now were 【mincriterion = 0.95】 and 【maxdepth = 4】, so let's try drawing a random 70% of the samples each time and growing 10 trees.
library(pROC)
set.seed(0)
tree.model_list <- list()
for (i in 1:10) {
  new_train_idx <- sample(1:nrow(train_X), 0.7 * nrow(train_X))
  sub_train_X <- train_X[new_train_idx,]
  sub_train_Y <- train_Y[new_train_idx]
  tree.model_list[[i]] <- ctree(formula = sub_train_Y ~ ., data = sub_train_X,
                                controls = ctree_control(mincriterion = 0.95, maxdepth = 4))
}
– First, let's take a look at any three of the trees:
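– The plotting code is not shown in this excerpt; a minimal sketch would simply plot three of the fitted trees, each grown on a different 70% subsample:
plot(tree.model_list[[1]])
plot(tree.model_list[[2]])
plot(tree.model_list[[3]])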
– In fact many of the variables are highly correlated with one another, so letting each tree draw on somewhat different information is beneficial.
– Moreover, cut points should not be fixed so rigidly; using many trees effectively resolves this problem.
final_pred <- 0
for (i in 1:length(tree.model_list)) {
  prob_list <- predict(tree.model_list[[i]], test_X, type = 'prob')
  prob_y <- do.call('rbind', prob_list)[,1]
  final_pred <- final_pred + prob_y / length(tree.model_list)
}
roc_curve <- roc(test_Y ~ final_pred)
plot(roc_curve)
text(0.5, 0.5, paste0('AUC = ', formatC(roc_curve[['auc']], 4, format = 'f')), col = 'red')
library(randomForest)
rf.model <- randomForest(formula = train_Y ~ ., data = train_X, ntree = 50)
pred.y <- predict(rf.model, test_X, type = 'prob')[,2]
roc_curve <- roc(test_Y ~ pred.y)
plot(roc_curve)
text(0.5, 0.5, paste0('AUC = ', formatC(roc_curve[['auc']], 4, format = 'f')), col = 'red')
– Exercise: try tuning the parameters; you need to find the combination that gives the highest accuracy in the validation set.
– Candidate parameters include 【replace = c(TRUE, FALSE)】, 【sampsize = c(1000, 1400, 1800)】, and 【nodesize = c(10, 20, 50)】.
rf.model <- randomForest(formula = train_Y ~ ., data = train_X, ntree = 50,
                         replace = TRUE, sampsize = 1000, nodesize = 10)
pred.y <- predict(rf.model, test_X, type = 'prob')[,2]
roc_curve <- roc(test_Y ~ pred.y)
print(roc_curve)
##
## Call:
## roc.formula(formula = test_Y ~ pred.y)
##
## Data: pred.y in 229 controls (test_Y 0) < 194 cases (test_Y 1).
## Area under the curve: 0.7709
library(pROC)
result <- data.frame(replace = rep(c(TRUE, FALSE), each = 9),
                     sampsize = rep(c(600, 800, 1000), each = 3),
                     nodesize = c(10, 20, 50),
                     valid_auc = NA)
for (i in 1:nrow(result)) {
  rf.model <- randomForest(formula = train_Y ~ ., data = train_X, ntree = 50,
                           replace = result[i,'replace'],
                           sampsize = result[i,'sampsize'],
                           nodesize = result[i,'nodesize'])
  pred.y <- predict(rf.model, valid_X, type = 'prob')[,2]
  roc_valid <- roc(valid_Y ~ pred.y)
  result[i,'valid_auc'] <- roc_valid[['auc']]
}
result
## replace sampsize nodesize valid_auc
## 1 TRUE 600 10 0.7856914
## 2 TRUE 600 20 0.7819712
## 3 TRUE 600 50 0.7903732
## 4 TRUE 800 10 0.7815476
## 5 TRUE 800 20 0.7768658
## 6 TRUE 800 50 0.7840316
## 7 TRUE 1000 10 0.7901328
## 8 TRUE 1000 20 0.7941163
## 9 TRUE 1000 50 0.7773008
## 10 FALSE 600 10 0.7801168
## 11 FALSE 600 20 0.7771864
## 12 FALSE 600 50 0.7853365
## 13 FALSE 800 10 0.7852106
## 14 FALSE 800 20 0.7765224
## 15 FALSE 800 50 0.7881525
## 16 FALSE 1000 10 0.7846955
## 17 FALSE 1000 20 0.7794643
## 18 FALSE 1000 50 0.7860806
best_pos <- which.max(result[,'valid_auc'])
best.rf.model <- randomForest(formula = train_Y ~ ., data = train_X, ntree = 50,
                              replace = result[best_pos,'replace'],
                              sampsize = result[best_pos,'sampsize'],
                              nodesize = result[best_pos,'nodesize'])
pred.y <- predict(best.rf.model, test_X, type = 'prob')[,2]
roc_curve <- roc(test_Y ~ pred.y)
plot(roc_curve)
text(0.5, 0.5, paste0('AUC = ', formatC(roc_curve[['auc']], 4, format = 'f')), col = 'red')
– Because a random forest randomly assembles a very large number of trees, it can produce a ranking of variable importance.
used_count <- varUsed(best.rf.model)
names(used_count) <- colnames(train_X)
sort(used_count, decreasing = TRUE)
## Axes_T Rate QRSd QTc PR Axes_QRS AGE QT
## 321 297 294 270 260 255 251 250
## Axes_P rhythm.17 rhythm.14 GENDER rhythm.26 rhythm.29 rhythm.11 rhythm.18
## 248 95 69 64 62 62 60 51
## rhythm.27 rhythm.2 rhythm.8 rhythm.1 rhythm.25 rhythm.4 rhythm.21 rhythm.7
## 50 46 42 41 38 34 34 32
## rhythm.15 rhythm.6 rhythm.20 rhythm.13 rhythm.3 rhythm.28 rhythm.19 rhythm.9
## 31 30 28 22 21 19 18 17
## rhythm.16 rhythm.10 rhythm.30 rhythm.31 rhythm.5 rhythm.12 rhythm.22 rhythm.24
## 15 14 14 12 11 9 9 7
## rhythm.23
## 4
– In this way we can also see how each tree uses the variables, which can be used to derive information such as variable importance.
– The function importance() can help us examine how the predictors relate to the outcome, and thereby appreciate the advantage of non-linear models.
– Note that it is not recommended to use the training sample for these plots, because the random forest's cut points were determined from the training sample.
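– One way to visualize such a relationship on the test split is a partial dependence plot; this is only a sketch, and the choice of randomForest::partialPlot, the variable Rate, and the class "1" are illustrative assumptions rather than the original notes' method:
library(randomForest)

# Partial dependence of the forest's prediction for class '1' on Rate, computed on the test split
partialPlot(best.rf.model, pred.data = test_X, x.var = "Rate", which.class = "1")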
– However, screening variables with a linear model does not seem ideal: some variables are not unrelated to the outcome, they are simply related in a non-linear way.
– For instance, suppose two variables are predictive of the outcome together, but each one looked at separately shows no association at all; the relationship only appears when they are considered jointly. Given the logic by which a decision tree operates, wouldn't it be unable to handle this?
– In other words, these two variables have an interaction.
set.seed(0)
# Simulate an XOR-like pattern: y depends only on the interaction of x1 and x2
x1 <- rep(0:1, 50) + rnorm(100, sd = 0.5)
x2 <- rep(0:1, each = 50) + rnorm(100, sd = 0.5)
y <- rep(0:1, 50) != rep(0:1, each = 50)
y <- as.factor(y)
plot(x1, x2, col = as.integer(y) + 1, pch = 19)
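– The three outputs below (a conditional inference tree that finds no valid split, and logistic regressions without and with the interaction term) come from code not shown in this excerpt; a sketch that reproduces them (the data-frame name xor_dat is my own) is:
library(party)

xor_dat <- data.frame(y = y, x1 = x1, x2 = x2)
print(ctree(y ~ x1 + x2, data = xor_dat))        # no single-variable split is significant
glm(y ~ x1 + x2, family = "binomial")            # main effects only: essentially no association
glm(y ~ x1 + x2 + x1:x2, family = "binomial")    # the interaction term captures the pattern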
##
## Conditional inference tree with 1 terminal nodes
##
## Response: y
## Inputs: x1, x2
## Number of observations: 100
##
## 1)* weights = 100
##
## Call: glm(formula = y ~ x1 + x2, family = "binomial")
##
## Coefficients:
## (Intercept) x1 x2
## -0.01014 -0.10878 0.13794
##
## Degrees of Freedom: 99 Total (i.e. Null); 97 Residual
## Null Deviance: 138.6
## Residual Deviance: 138.3 AIC: 144.3
##
## Call: glm(formula = y ~ x1 + x2 + x1:x2, family = "binomial")
##
## Coefficients:
## (Intercept) x1 x2 x1:x2
## -1.926 3.898 3.703 -6.961
##
## Degrees of Freedom: 99 Total (i.e. Null); 96 Residual
## Null Deviance: 138.6
## Residual Deviance: 82.87 AIC: 90.87
– We must first rank the variables by importance and check (in the validation set) whether the top N variables alone already predict accurately.
– Afterwards we analyse these variables with a multilayer perceptron to obtain the final model.
rf.model <- randomForest(formula = train_Y ~ ., data = train_X, ntree = 50)
importance <- importance(best.rf.model)
importance <- importance[order(importance[,'MeanDecreaseGini'], decreasing = TRUE),]
result <- data.frame(n_var = 1:length(importance), valid_auc = NA)
for (i in 1:nrow(result)) {
  rf.model <- randomForest(formula = train_Y ~ ., data = train_X[,names(importance)[1:result[i,'n_var']],drop = FALSE], ntree = 50)
  pred.y <- predict(rf.model, valid_X, type = 'prob')[,2]
  roc_valid <- roc(valid_Y ~ pred.y)
  result[i,'valid_auc'] <- roc_valid[['auc']]
}
plot(valid_auc ~ n_var, data = result, type = 'l', ylim = c(0.5, 1))
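– The object best_n.var used in the nnet code below is not defined in this excerpt (the printed value is 22); one plausible definition, chosen from the validation curve, would be:
best_n.var <- which.max(result[,'valid_auc'])   # number of top-ranked variables with the highest validation AUC (an assumption)
best_n.var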
## [1] 22
library(nnet)
train_Y_val <- as.integer(train_Y) - 1
fit_nnet <- nnet(train_Y_val ~ ., data = train_X[,names(importance)[1:best_n.var],drop = FALSE],
                 size = 20, maxit = 100, entropy = TRUE, decay = 0.001, trace = FALSE, MaxNWts = 1e5)
pred.y <- predict(fit_nnet, test_X[,names(importance)[1:best_n.var]])
roc_curve <- roc(test_Y ~ pred.y)
plot(roc_curve)
text(0.5, 0.5, paste0('AUC = ', formatC(roc_curve[['auc']], 4, format = 'f')), col = 'red')
– Even though we do not know the true relationship between X and Y, these two machine learning methods still effectively help us predict the quantitative outcome.
– In fact, the random forest is not the final form of tree-based models: the Gradient Boosting Machine, which appeared after 2003, is an even stronger classifier than the random forest, and we will introduce it in a later lesson.
– It can also be used to screen variables, so that complex non-linear models (the multilayer perceptron, the SVM we will teach later, etc.) can be analysed with only a small number of variables.
– This takes us another step toward a complete data science workflow!