Machine Learning and Algorithms

林嶔 (Lin, Chin)

Lesson 13: Introduction to Machine Learning 4 (Decision Trees and Random Forests)

Section 1: Decision Trees (1)

– One of the simplest supervised machine learning methods is the decision tree. It involves very little mathematics: the computer simply performs a large amount of computation, searching over candidate splits, to carry out the classification.

F01
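– Before turning to any package, here is a minimal sketch of this idea on the built-in iris data. The two cut-points (2.45 and 1.75) are illustrative values chosen for this sketch, not values produced by the code later in this lesson:

# A decision tree is just a sequence of if-else rules on single variables;
# the computer's job is to search for good cut-points.
data(iris)
manual.rule <- ifelse(iris$Petal.Length <= 2.45, 'setosa',
                      ifelse(iris$Petal.Width <= 1.75, 'versicolor', 'virginica'))
table(manual.rule, iris$Species)   # how well do the hand-written rules classify?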

Section 1: Decision Trees (2)

F02

F03

Section 1: Decision Trees (3)

data(iris)

#Split data
set.seed(1)

Train.sample <- sample(1:150, 100, replace = FALSE)

Train.data <- iris[Train.sample,]
Test.data <- iris[-Train.sample,]

# Find the optimal cut-point for each predictor by exhaustive search

optimal.cut_points <- data.frame(var = colnames(Train.data)[1:4],
                                 cut = numeric(4),
                                 pval = numeric(4))
for (i in 1:4) {
  unique.values = unique(sort(Train.data[,i]))
  pval.list = numeric(length(unique.values)-1)
  for (j in 1:length(pval.list)) {
    # Candidate cut-point: the midpoint between two adjacent observed values
    potential.cut_points = (unique.values[j] + unique.values[j+1])/2
    names(pval.list)[j] = potential.cut_points
    X = Train.data[,i] > potential.cut_points
    Y = Train.data[,5]
    pval.list[j] = fisher.test(X, Y)$p.value
  }
  # Keep the cut-point whose split is most strongly associated with Species
  optimal.cut_points[i,2] = names(pval.list)[which.min(pval.list)]
  optimal.cut_points[i,3] = min(pval.list)
}

print(optimal.cut_points)
##            var  cut         pval
## 1 Sepal.Length 5.55 1.053168e-15
## 2  Sepal.Width 2.95 2.195292e-07
## 3 Petal.Length 2.45 1.398479e-26
## 4  Petal.Width  0.8 1.398479e-26

Section 1: Decision Trees (4)

library(party)

tree.model <- ctree(formula = Species ~ ., data = Train.data)
tree.model
## 
##   Conditional inference tree with 3 terminal nodes
## 
## Response:  Species 
## Inputs:  Sepal.Length, Sepal.Width, Petal.Length, Petal.Width 
## Number of observations:  100 
## 
## 1) Petal.Length <= 1.9; criterion = 1, statistic = 92.735
##   2)*  weights = 32 
## 1) Petal.Length > 1.9
##   3) Petal.Width <= 1.6; criterion = 1, statistic = 48.709
##     4)*  weights = 35 
##   3) Petal.Width > 1.6
##     5)*  weights = 33
sort(Train.data[,3])   # note the large gap between 1.9 and 3.0, right where the tree splits
##   [1] 1.1 1.2 1.3 1.3 1.3 1.3 1.3 1.4 1.4 1.4 1.4 1.4 1.4 1.4 1.4 1.4 1.4 1.5
##  [19] 1.5 1.5 1.5 1.5 1.5 1.5 1.5 1.6 1.6 1.6 1.6 1.6 1.7 1.9 3.0 3.3 3.5 3.5
##  [37] 3.7 3.8 3.9 3.9 3.9 4.0 4.0 4.1 4.2 4.2 4.2 4.3 4.3 4.4 4.4 4.4 4.5 4.5
##  [55] 4.5 4.5 4.5 4.5 4.5 4.6 4.7 4.7 4.7 4.7 4.8 4.8 4.8 4.8 4.9 4.9 5.0 5.0
##  [73] 5.1 5.1 5.1 5.1 5.1 5.1 5.2 5.2 5.3 5.4 5.4 5.5 5.6 5.6 5.6 5.7 5.7 5.8
##  [91] 5.8 5.9 6.0 6.1 6.1 6.1 6.3 6.4 6.7 6.9

Section 1: Decision Trees (5)

pred.y <- predict(tree.model, Test.data[,1:4])
table(pred.y, Test.data[,5])
##             
## pred.y       setosa versicolor virginica
##   setosa         18          0         0
##   versicolor      0         14         3
##   virginica       0          0        15
plot(tree.model)

Section 1: Decision Trees (6)

dat <- read.csv("ECG_train.csv", header = TRUE, fileEncoding = 'CP950', stringsAsFactors = FALSE, na.strings = "")

– Here are some example snippets showing how to apply ctree to outcome (dependent) variables of different types.

  1. Example with a continuous outcome:
subdat <- dat[!(dat[,'K'] %in% NA) & !(dat[,'Rate'] %in% NA) & !(dat[,'AGE'] %in% NA), c('K', 'Rate', 'AGE')]

tree.model <- ctree(formula = K ~ ., data = subdat)
plot(tree.model)

  2. Example with a binary outcome:
subdat <- dat[!(dat[,'LVD'] %in% NA) & !(dat[,'GENDER'] %in% NA) & !(dat[,'Rate'] %in% NA), c('LVD', 'GENDER', 'Rate')]
subdat[,'GENDER'] <- as.factor(subdat[,'GENDER'])
subdat[,'LVD'] <- as.factor(subdat[,'LVD'])

tree.model <- ctree(formula = LVD ~ ., data = subdat)
plot(tree.model)

  3. Example with a multi-class outcome:
subdat <- dat[!(dat[,'AMI'] %in% NA) & !(dat[,'GENDER'] %in% NA) & !(dat[,'AGE'] %in% NA), c('AMI', 'GENDER', 'AGE')]
subdat[,'GENDER'] <- as.factor(subdat[,'GENDER'])
subdat[,'AMI'] <- as.factor(subdat[,'AMI'])

tree.model <- ctree(formula = AMI ~ ., data = subdat)
plot(tree.model)

  4. Example with a survival (time-to-event) outcome (the survival package is loaded for Surv):
library(survival)
subdat <- dat[!(dat[,'time'] %in% NA) & !(dat[,'death'] %in% NA) & !(dat[,'Rate'] %in% NA) & !(dat[,'AGE'] %in% NA), c('time', 'death', 'Rate', 'AGE')]

tree.model <- ctree(formula = Surv(time, death) ~ ., data = subdat)
plot(tree.model)

Section 1: Decision Trees (7)

  1. The argument 【mincriterion】 is the significance criterion: the value of 1 − p-value that must be exceeded before a split is made (0.95 corresponds to a significance level of 0.05)

  2. The argument 【maxdepth】 is the maximum depth of the tree

  3. The argument 【minsplit】 is the minimum number of samples a node must contain before a further split is considered

  4. The argument 【minbucket】 is the minimum number of samples required in each terminal node (leaf)

subdat <- dat[!(dat[,'LVD'] %in% NA) & !(dat[,'GENDER'] %in% NA) & !(dat[,'Rate'] %in% NA), c('LVD', 'GENDER', 'Rate')]
subdat[,'GENDER'] <- as.factor(subdat[,'GENDER'])
subdat[,'LVD'] <- as.factor(subdat[,'LVD'])

tree.model <- ctree(formula = LVD ~ ., data = subdat,
                    controls = ctree_control(mincriterion = 0.95, maxdepth = 2, minsplit = 20, minbucket = 7))
plot(tree.model)

– If you want the predicted probabilities, you can obtain them like this:

prob_list <- predict(tree.model, subdat, type = 'prob')   # a list with one probability vector per observation
prob_y <- do.call('rbind', prob_list)[,1]                 # probability of the first class

Exercise 1: A Complete Data-Science Experiment

– Try adjusting the parameters: you must find a combination that gives the best performance on the validation set (measured here by AUC).

library(mice)

subdat <- dat[!(dat[,"LVD"] %in% NA), c(-1, -2, -4, -5)]   # keep rows with a non-missing outcome; drop unused columns

subdat[,'LVD'] <- as.factor(subdat[,'LVD'])
subdat[,'GENDER'] <- as.factor(subdat[,'GENDER'])
for (i in 1:31) {subdat[,paste0('rhythm.', i)] <- as.factor(subdat[,paste0('rhythm.', i)])}

used_dat.x <- subdat[,-1]
# Impute missing predictors with a single CART-based imputation
mice_dat <- mice(used_dat.x, m = 1, maxit = 5, meth = 'cart', seed = 123, printFlag = FALSE)
impute_dat.x <- complete(mice_dat, action = 1)

set.seed(0)
all_idx <- 1:nrow(subdat)

# 60% training, 20% validation, 20% test
train_idx <- sample(all_idx, nrow(subdat) * 0.6)
valid_idx <- sample(all_idx[!all_idx %in% train_idx], nrow(subdat) * 0.2)
test_idx <- all_idx[!all_idx %in% c(train_idx, valid_idx)]

train_X <- impute_dat.x[train_idx,]
valid_X <- impute_dat.x[valid_idx,]
test_X <- impute_dat.x[test_idx,]

train_Y <- subdat[train_idx,"LVD"]
valid_Y <- subdat[valid_idx,"LVD"]
test_Y <- subdat[test_idx,"LVD"]

Exercise 1: Solution

library(pROC)

result <- data.frame(mincriterion = rep(c(0.95, 0.99), each = 3), maxdepth = 4:6, valid_auc = NA)

for (i in 1:nrow(result)) {
  
  tree.model <- ctree(formula = train_Y ~ ., data = train_X,
                      controls = ctree_control(mincriterion = result[i,'mincriterion'], maxdepth = result[i,'maxdepth']))
  
  prob_list <- predict(tree.model, valid_X, type = 'prob')
  prob_y <- do.call('rbind', prob_list)[,1]
  roc_valid <- roc(valid_Y ~ prob_y)
  result[i,'valid_auc'] <- roc_valid[['auc']]
  
}

result
##   mincriterion maxdepth valid_auc
## 1         0.95        4 0.7446314
## 2         0.95        5 0.7362981
## 3         0.95        6 0.7362981
## 4         0.99        4 0.7446314
## 5         0.99        5 0.7446314
## 6         0.99        6 0.7446314
best_pos <- which.max(result[,'valid_auc'])

best.tree.model <- ctree(formula = train_Y ~ ., data = train_X,
                      controls = ctree_control(mincriterion = result[best_pos,'mincriterion'], maxdepth = result[best_pos,'maxdepth']))

prob_list <- predict(best.tree.model, test_X, type = 'prob')
prob_y <- do.call('rbind', prob_list)[,1]
  
roc_curve <- roc(test_Y ~ prob_y)
plot(roc_curve)
text(0.5, 0.5, paste0('AUC = ', formatC(roc_curve[['auc']], 4, format = 'f')), col = 'red')

Exercise 1: Extension

– Let's try the Classification & Regression Trees (CART) implementation provided by the package 「rpart」:

library(rpart)
library(rpart.plot)

cart.model <- rpart(formula = train_Y ~ ., data = train_X)
prp(cart.model, faclen = 0, fallen.leaves = TRUE, shadow.col = "gray", extra = 2)  

Section 2: Random Forests (1)

– However, the decision tree has an important weakness: our training sample is the result of a single draw, so sampling error gives it characteristics that do not necessarily match the population. Could these spurious characteristics cause the decision tree to get stuck in a 「local optimum」?

– A natural remedy is to resample the training data many times and grow one small tree on each resample. Suppose we have successfully grown 100 such trees; at prediction time we pass the new sample through all 100 trees and let them vote on its predicted class.

– This kind of random resampling from the original sample is called 「Bagging」 (bootstrap aggregating) in machine learning and 「Bootstrapping」 in statistics; it is a completely random sampling scheme.

– A contrasting approach to 「Bagging」/「Bootstrapping」 is 「Boosting」, in which each new resample depends on the results of the previous models; we will introduce it at the end of this course.
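– As a minimal sketch of the bagging-plus-voting idea, consider the following toy example. It uses the built-in iris data and only 5 trees; these choices are assumptions made for illustration, not the procedure applied to the ECG data below:

library(party)

data(iris)
set.seed(1)

n_tree <- 5
bagged.trees <- list()

# Bagging: grow each tree on a bootstrap resample (sampling WITH replacement)
for (i in 1:n_tree) {
  boot_idx <- sample(1:nrow(iris), nrow(iris), replace = TRUE)
  bagged.trees[[i]] <- ctree(Species ~ ., data = iris[boot_idx,])
}

# Voting: every tree predicts a class and the majority wins
votes <- sapply(bagged.trees, function(m) {as.character(predict(m, newdata = iris))})
vote.pred <- apply(votes, 1, function(x) {names(which.max(table(x)))})
table(vote.pred, iris$Species)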

F04

Section 2: Random Forests (2)

– The best parameters we just found were 【mincriterion = 0.95】 and 【maxdepth = 4】; let's try randomly drawing 70% of the training samples each time and growing 10 trees.

library(pROC)

set.seed(0)

tree.model_list <- list()

for (i in 1:10) {
  
  # Draw a 70% subsample of the training data (without replacement) for this tree
  new_train_idx <- sample(1:nrow(train_X), 0.7 * nrow(train_X))
  sub_train_X <- train_X[new_train_idx,]
  sub_train_Y <- train_Y[new_train_idx]
  
  tree.model_list[[i]] <- ctree(formula = sub_train_Y ~ ., data = sub_train_X,
                                controls = ctree_control(mincriterion = 0.95, maxdepth = 4))

}

– Let's first take a look at three of these trees:

plot(tree.model_list[[1]], main = 'Tree 1')

plot(tree.model_list[[2]], main = 'Tree 2')

plot(tree.model_list[[3]], main = 'Tree 3')

Section 2: Random Forests (3)

– In fact, many of the predictors are highly correlated with one another, so letting different trees draw on different pieces of information is beneficial.

– In addition, the cut-points should not be fixed so rigidly; combining multiple trees effectively resolves this problem.

final_pred <- 0

# Average each tree's predicted probability of the first class
for (i in 1:length(tree.model_list)) {
  
  prob_list <- predict(tree.model_list[[i]], test_X, type = 'prob')
  prob_y <- do.call('rbind', prob_list)[,1]
  final_pred <- final_pred + prob_y / length(tree.model_list)
  
}

roc_curve <- roc(test_Y ~ final_pred)
plot(roc_curve)
text(0.5, 0.5, paste0('AUC = ', formatC(roc_curve[['auc']], 4, format = 'f')), col = 'red')

Section 2: Random Forests (4)

library(randomForest)

rf.model <- randomForest(formula = train_Y ~ ., data = train_X, ntree = 50)

pred.y <- predict(rf.model, test_X, type = 'prob')[,2]

roc_curve <- roc(test_Y ~ pred.y)
plot(roc_curve)
text(0.5, 0.5, paste0('AUC = ', formatC(roc_curve[['auc']], 4, format = 'f')), col = 'red')

Exercise 2: A Complete Data-Science Experiment

– Try adjusting the parameters: you must find a combination that gives the best performance on the validation set (measured here by AUC).

– The candidate parameters include 【replace = c(TRUE, FALSE)】, 【sampsize = c(1000, 1400, 1800)】, and 【nodesize = c(10, 20, 50)】

rf.model <- randomForest(formula = train_Y ~ ., data = train_X, ntree = 50,
                         replace = TRUE, sampsize = 1000, nodesize = 10)

pred.y <- predict(rf.model, test_X, type = 'prob')[,2]

roc_curve <- roc(test_Y ~ pred.y)
print(roc_curve)
## 
## Call:
## roc.formula(formula = test_Y ~ pred.y)
## 
## Data: pred.y in 229 controls (test_Y 0) < 194 cases (test_Y 1).
## Area under the curve: 0.7709

Exercise 2: Solution

library(pROC)

result <- data.frame(replace = rep(c(TRUE, FALSE), each = 9),
                     sampsize = rep(c(600, 800, 1000), each = 3),
                     nodesize = c(10, 20, 50),
                     valid_auc = NA)

for (i in 1:nrow(result)) {
  
  rf.model <- randomForest(formula = train_Y ~ ., data = train_X, ntree = 50,
                           replace = result[i,'replace'],
                           sampsize = result[i,'sampsize'],
                           nodesize =  result[i,'nodesize'])
  
  pred.y <- predict(rf.model, valid_X, type = 'prob')[,2]
  roc_valid <- roc(valid_Y ~ pred.y)
  result[i,'valid_auc'] <- roc_valid[['auc']]
  
}

result
##    replace sampsize nodesize valid_auc
## 1     TRUE      600       10 0.7856914
## 2     TRUE      600       20 0.7819712
## 3     TRUE      600       50 0.7903732
## 4     TRUE      800       10 0.7815476
## 5     TRUE      800       20 0.7768658
## 6     TRUE      800       50 0.7840316
## 7     TRUE     1000       10 0.7901328
## 8     TRUE     1000       20 0.7941163
## 9     TRUE     1000       50 0.7773008
## 10   FALSE      600       10 0.7801168
## 11   FALSE      600       20 0.7771864
## 12   FALSE      600       50 0.7853365
## 13   FALSE      800       10 0.7852106
## 14   FALSE      800       20 0.7765224
## 15   FALSE      800       50 0.7881525
## 16   FALSE     1000       10 0.7846955
## 17   FALSE     1000       20 0.7794643
## 18   FALSE     1000       50 0.7860806
best_pos <- which.max(result[,'valid_auc'])

best.rf.model <- randomForest(formula = train_Y ~ ., data = train_X, ntree = 50,
                              replace = result[best_pos,'replace'],
                              sampsize = result[best_pos,'sampsize'],
                              nodesize =  result[best_pos,'nodesize'])

pred.y <- predict(best.rf.model, test_X, type = 'prob')[,2]

roc_curve <- roc(test_Y ~ pred.y)
plot(roc_curve)
text(0.5, 0.5, paste0('AUC = ', formatC(roc_curve[['auc']], 4, format = 'f')), col = 'red')

Section 3: Variable Importance Analysis (1)

– Because a random forest is composed of a large number of randomly grown trees, it can produce a ranking of variable importance.

used_count <- varUsed(best.rf.model)   # how many times each variable was used for splitting in the forest
names(used_count) <- colnames(train_X)
sort(used_count, decreasing = TRUE)
##    Axes_T      Rate      QRSd       QTc        PR  Axes_QRS       AGE        QT 
##       321       297       294       270       260       255       251       250 
##    Axes_P rhythm.17 rhythm.14    GENDER rhythm.26 rhythm.29 rhythm.11 rhythm.18 
##       248        95        69        64        62        62        60        51 
## rhythm.27  rhythm.2  rhythm.8  rhythm.1 rhythm.25  rhythm.4 rhythm.21  rhythm.7 
##        50        46        42        41        38        34        34        32 
## rhythm.15  rhythm.6 rhythm.20 rhythm.13  rhythm.3 rhythm.28 rhythm.19  rhythm.9 
##        31        30        28        22        21        19        18        17 
## rhythm.16 rhythm.10 rhythm.30 rhythm.31  rhythm.5 rhythm.12 rhythm.22 rhythm.24 
##        15        14        14        12        11         9         9         7 
## rhythm.23 
##         4

– In the same way we can also obtain the usage counts for each individual tree, which can be used to derive information such as the variability of usage across trees.

count_table <- varUsed(best.rf.model, by.tree = TRUE)
rownames(count_table) <- colnames(train_X)
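– For instance, here is a minimal sketch of summarizing how usage varies across trees, assuming the count_table object created above (the chosen summary statistics are just one possibility):

# Per-variable mean and standard deviation of usage counts across the 50 trees
usage.summary <- data.frame(mean_count = apply(count_table, 1, mean),
                            sd_count = apply(count_table, 1, sd))
head(usage.summary[order(usage.summary$mean_count, decreasing = TRUE),], 10)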

Section 3: Variable Importance Analysis (2)

importance <- importance(best.rf.model)
varImpPlot(best.rf.model)

Section 3: Variable Importance Analysis (3)

– The function 「partialPlot」 lets us examine the relationship between a predictor and the outcome, which helps illustrate the advantage of non-linear models.

– Note in particular that it is not recommended to draw these plots on the training sample, because the forest's cut-points were determined from the training sample.

partialPlot(best.rf.model, pred.data = valid_X, x.var = AGE, which.class = 1)

partialPlot(best.rf.model, pred.data = valid_X, x.var = Axes_T, which.class = 1)

partialPlot(best.rf.model, pred.data = valid_X, x.var = Rate, which.class = 1)

Exercise 3: Variable Selection (1)

– However, screening variables with a linear model is not ideal: some variables may not be unrelated to the outcome at all, but merely related in a non-linear way.

– There is a further problem: suppose two variables together can predict the outcome, yet neither shows any relationship when examined alone, and the relationship appears only when they are considered jointly. Given the logic by which a decision tree operates, wouldn't it be unable to handle this case?

– In other words, the two variables interact with each other.

set.seed(0)

x1 <- rep(0:1, 50) + rnorm(100, sd = 0.5)
x2 <- rep(0:1, each = 50) + rnorm(100, sd = 0.5)
y <- rep(0:1, 50) != rep(0:1, each = 50)   # y is the XOR of the two underlying group indicators
y <- as.factor(y)

plot(x1, x2, col = as.integer(y) + 1, pch = 19)

library(party)
tree.model <- ctree(y ~ x1 + x2)
tree.model
## 
##   Conditional inference tree with 1 terminal nodes
## 
## Response:  y 
## Inputs:  x1, x2 
## Number of observations:  100 
## 
## 1)*  weights = 100
glm(y ~ x1 + x2, family = 'binomial')
## 
## Call:  glm(formula = y ~ x1 + x2, family = "binomial")
## 
## Coefficients:
## (Intercept)           x1           x2  
##    -0.01014     -0.10878      0.13794  
## 
## Degrees of Freedom: 99 Total (i.e. Null);  97 Residual
## Null Deviance:       138.6 
## Residual Deviance: 138.3     AIC: 144.3
glm(y ~ x1 + x2 + x1:x2, family = 'binomial')
## 
## Call:  glm(formula = y ~ x1 + x2 + x1:x2, family = "binomial")
## 
## Coefficients:
## (Intercept)           x1           x2        x1:x2  
##      -1.926        3.898        3.703       -6.961  
## 
## Degrees of Freedom: 99 Total (i.e. Null);  96 Residual
## Null Deviance:       138.6 
## Residual Deviance: 82.87     AIC: 90.87

Exercise 3: Variable Selection (2)

– We must first rank the variables by importance and check whether the top N variables alone are enough for accurate prediction (this is evaluated on the validation set).

– We then analyze these variables with a multilayer perceptron to obtain the final model.

Exercise 3: Solution (1)

rf.model <- randomForest(formula = train_Y ~ ., data = train_X, ntree = 50)
importance <- importance(best.rf.model)
importance <- importance[order(importance[,'MeanDecreaseGini'], decreasing = TRUE),]   # drops to a named vector sorted by importance

result <- data.frame(n_var = 1:length(importance), valid_auc = NA)

for (i in 1:nrow(result)) {
  
  # Refit a forest using only the top n_var most important variables
  rf.model <- randomForest(formula = train_Y ~ ., data = train_X[,names(importance)[1:result[i,'n_var']],drop = FALSE], ntree = 50)
  pred.y <- predict(rf.model, valid_X, type = 'prob')[,2]
  roc_valid <- roc(valid_Y ~ pred.y)
  result[i,'valid_auc'] <- roc_valid[['auc']]

}

plot(valid_auc ~ n_var, data = result, type = 'l', ylim = c(0.5, 1))

# Penalize model size slightly (0.001 AUC per extra variable) so that a smaller model is preferred
best_n.var <- which.max(result[,'valid_auc'] - result[,'n_var'] * 0.001)
best_n.var
## [1] 22

Exercise 3: Solution (2)

library(nnet)

train_Y_val <- as.integer(train_Y) - 1
fit_nnet <- nnet(train_Y_val ~ ., data = train_X[,names(importance)[1:best_n.var],drop = FALSE],
                 size = 20, maxit = 100, entropy = TRUE, decay = 0.001, trace = FALSE, MaxNWts = 1e5)

pred.y <- predict(fit_nnet, test_X[,names(importance)[1:best_n.var]])

roc_curve <- roc(test_Y ~ pred.y)
plot(roc_curve)
text(0.5, 0.5, paste0('AUC = ', formatC(roc_curve[['auc']], 4, format = 'f')), col = 'red')

Lesson Summary

– Even though we do not know the true relationship between X and Y, these two machine learning methods still helped us make effective predictions.

– In fact, the random forest is not the final form of tree-based models: the Gradient Boosting Machine, developed in the early 2000s, is an even stronger classifier than the random forest, and we will introduce it later in this course.

– They can also be used to screen variables, so that complex non-linear models (the multilayer perceptron, the SVM we will teach later, and so on) can be fitted with a small number of variables.

– This takes us one step further along our complete data-science workflow!