Machine Learning and Algorithms

林嶔 (Lin, Chin)

Lesson 10: Introduction to Machine Learning 1 (Linear Discriminant Analysis, Naive Bayes Classification, and the k-Nearest Neighbor Algorithm)

Introduction to Machine Learning: Overview

– Later on you will find that a "data science experiment" for a specific task may involve more than 10 algorithms, each compared across at least 100 parameter settings, in order to identify the best model on the validation set and then apply it to the test set.
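
– As a minimal, hypothetical sketch of this train/validation/test workflow (simulated data; glm with a few candidate formulas stands in for "many algorithms with many parameter settings"; the names sim_dat and candidate_formulas are invented for illustration):

library(pROC)

set.seed(0)
n <- 300
sim_dat <- data.frame(x1 = rnorm(n), x2 = rnorm(n))
sim_dat$y <- rbinom(n, 1, plogis(sim_dat$x1 - sim_dat$x2))

train_idx <- 1:180
valid_idx <- 181:240
test_idx <- 241:300

# Candidate "models"; in a real experiment these would be many algorithms and settings
candidate_formulas <- list(y ~ x1, y ~ x2, y ~ x1 + x2)

valid_auc <- sapply(candidate_formulas, function (f) {
  fit <- glm(f, data = sim_dat[train_idx,], family = 'binomial')
  pred <- predict(fit, sim_dat[valid_idx,])
  as.numeric(roc(sim_dat$y[valid_idx], pred)$auc)   # performance on the validation set
})

# Keep the model that did best on the validation set ...
best_fit <- glm(candidate_formulas[[which.max(valid_auc)]], data = sim_dat[train_idx,], family = 'binomial')

# ... and evaluate it once on the test set
as.numeric(roc(sim_dat$y[test_idx], predict(best_fit, sim_dat[test_idx,]))$auc)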

Section 1: Linear Discriminant Analysis (1)

[Figure F01]

– Please download the example data here.

dat <- read.csv("ECG_train.csv", header = TRUE, fileEncoding = 'CP950', stringsAsFactors = FALSE, na.strings = "")

Section 1: Linear Discriminant Analysis (2)

– Given a set of data points, the "rotation matrix \(R\)" below lets us rotate two-dimensional coordinate points by an angle \(\theta\):

\[R = \begin{pmatrix} \cos(\theta) & -\sin(\theta) \\ \sin(\theta) & \cos(\theta) \end{pmatrix}\]

X_mat <- cbind(x = c(1.7, 2.5, -2.1), y = c(3.1, -2.8, -0.7))

rotate_degree <- 0.5  # rotation angle theta, in radians
# matrix() fills column-wise, so R_mat is the transpose of R above; right-multiplying the
# row-vector points by this transpose rotates them by theta
R_mat <- matrix(c(cos(rotate_degree), -sin(rotate_degree), sin(rotate_degree), cos(rotate_degree)), nrow = 2)
XR_mat <- X_mat %*% R_mat

par(mfrow = c(1, 2))
plot(X_mat[,1], X_mat[,2], xlim = c(-5, 5), ylim = c(-5, 5), pch = 19, cex = 2, col = 2:4, main = 'before rotate')
plot(XR_mat[,1], XR_mat[,2], xlim = c(-5, 5), ylim = c(-5, 5), pch = 19, cex = 2, col = 2:4, main = 'after rotate')

– Try changing rotate_degree to see the effect.

Section 1: Linear Discriminant Analysis (3)

– Our goal is to find a matrix \(R\) that lets us predict "AMI" from "Rate" and "QTc":

library(MASS)

dat[,'AMI'] <- factor(dat[,'AMI'])
lda_fit <- lda(AMI ~ Rate + QTc, data = dat)
lda_fit$scaling
##              LD1         LD2
## Rate -0.02371907  0.03725259
## QTc  -0.01476256 -0.01480724

– Once we have this matrix, let us visualize the effect of linear discriminant analysis:

X_mat <- cbind(dat[,'Rate'], dat[,'QTc'])
R_mat <- lda_fit$scaling
XR_mat <- X_mat %*% R_mat
col_idx <- as.integer(dat[,'AMI'])
col_list <- c('#FF000030', '#00A00030', '#0000FF30')

par(mfrow = c(1, 2))
plot(X_mat[,1], X_mat[,2], pch = 19, cex = 0.5, col = col_list[col_idx], main = 'before rotate')
plot(XR_mat[,1], XR_mat[,2], pch = 19, cex = 0.5, col = col_list[col_idx], main = 'after rotate')

Section 1: Linear Discriminant Analysis (4)

result_list <- predict(lda_fit, dat)
table(dat[,'AMI'], result_list$class)
##          
##           not-AMI NSTEMI STEMI
##   not-AMI    1055      0     1
##   NSTEMI      168      0     0
##   STEMI       255      0     0

– These are the posterior probabilities for each class, which are derived from the distances to the class means:

head(result_list$posterior)
##     not-AMI     NSTEMI     STEMI
## 1 0.5799250 0.14117064 0.2789043
## 2 0.8068244 0.07610796 0.1170676
## 3 0.6695866 0.13224111 0.1981723
## 4 0.6559637 0.14644156 0.1975948
## 5 0.7454229 0.09747472 0.1571024
## 6 0.7566320 0.09973263 0.1436353
library(pROC)

roc_curve <- roc(response = (dat[,'AMI'] == 'STEMI'), predictor = result_list$posterior[,3])
plot(roc_curve)

Exercise 1: Comparison with Regression Analysis

subdat <- dat[!(dat[,"LVD"] %in% NA) & !(dat[,"Rate"] %in% NA) & !(dat[,"QTc"] %in% NA),]
X <- subdat[,c('Rate', 'QTc')]
Y <- subdat[,"LVD"]

set.seed(0)
all_idx <- 1:nrow(subdat)

train_idx <- sample(all_idx, nrow(subdat) * 0.6)
valid_idx <- sample(all_idx[!all_idx %in% train_idx], nrow(subdat) * 0.2)
test_idx <- all_idx[!all_idx %in% c(train_idx, valid_idx)]

train_X <- X[train_idx,]
valid_X <- X[valid_idx,]
test_X <- X[test_idx,]

train_Y <- Y[train_idx]
valid_Y <- Y[valid_idx]
test_Y <- Y[test_idx]

Exercise 1 Answer (1)

glm_fit <- glm(train_Y ~ ., data = train_X, family = 'binomial')

valid_pred <- predict(glm_fit, valid_X)
test_pred <- predict(glm_fit, test_X)

library(pROC)

roc_valid <- roc(valid_Y ~ valid_pred)
best_pos <- which.max(roc_valid$sensitivities + roc_valid$specificities)
best_cut <- roc_valid$thresholds[best_pos]

tab_test <- table(test_pred >= best_cut, test_Y)
sens <- tab_test[2,2] / sum(tab_test[,2])
spec <- tab_test[1,1] / sum(tab_test[,1])

roc_test <- roc(test_Y ~ test_pred)
plot(roc_test)

points(spec, sens, pch = 19)
text(0.5, 0.5, paste0('AUC(valid) = ', formatC(roc_valid$auc, digits = 3, format = 'f'),
                      '\nSens = ', formatC(sens, digits = 3, format = 'f'),
                      '\nSpec = ', formatC(spec, digits = 3, format = 'f'),
                      '\nAUC(test) = ', formatC(roc_test$auc, digits = 3, format = 'f')), col = 'red')

Exercise 1 Answer (2)

lda_fit <- lda(train_Y ~ ., data = train_X)

valid_list <- predict(lda_fit, valid_X)
test_list <- predict(lda_fit, test_X)

valid_pred <- valid_list$posterior[,2]
test_pred <- test_list$posterior[,2]

library(pROC)

roc_valid <- roc(valid_Y ~ valid_pred)
best_pos <- which.max(roc_valid$sensitivities + roc_valid$specificities)
best_cut <- roc_valid$thresholds[best_pos]

tab_test <- table(test_pred >= best_cut, test_Y)
sens <- tab_test[2,2] / sum(tab_test[,2])
spec <- tab_test[1,1] / sum(tab_test[,1])

roc_test <- roc(test_Y ~ test_pred)
plot(roc_test)

points(spec, sens, pch = 19)
text(0.5, 0.5, paste0('AUC(valid) = ', formatC(roc_valid$auc, digits = 3, format = 'f'),
                      '\nSens = ', formatC(sens, digits = 3, format = 'f'),
                      '\nSpec = ', formatC(spec, digits = 3, format = 'f'),
                      '\nAUC(test) = ', formatC(roc_test$auc, digits = 3, format = 'f')), col = 'red')

Section 2: Naive Bayes Classifier (1)

– Suppose a sample \(X\) has features \(X = \begin{pmatrix} x_1 & x_2 & x_3 \end{pmatrix}\) and we want to know the likelihood \(L(Y|X)\) that the event \(Y\) occurs. Assuming the features are conditionally independent given the class, we obtain:

\[ \begin{align} L(Y|X) & = P(Y)P(x_1|Y)P(x_2|Y)P(x_3|Y) \end{align} \]

– Similarly, we can compute the likelihood \(L(Y'|X)\) of the event not occurring (\(Y'\)):

\[ \begin{align} L(Y'|X) & = P(Y')P(x_1|Y')P(x_2|Y')P(x_3|Y') \end{align} \]

– What do we gain from the independence assumption? The benefit is that we usually have enough samples to estimate each \(P(x_i|Y)\), whereas with enough features there may be no sample that simultaneously satisfies \(X = \begin{pmatrix} x_1 & x_2 & x_3 \end{pmatrix}\), making an accurate joint estimate impossible.

– However, the drawback of the independence assumption is also obvious: it is again a linear prediction tool, so we cannot use the naive Bayes classifier to uncover interactions among features.
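
– As a small illustration of this limitation (a hedged sketch with simulated data; the variables x1, x2 and the xor construction are made up for this demo), a naive Bayes classifier stays near chance-level accuracy when the outcome depends only on the interaction between two features:

library(e1071)

set.seed(0)
n <- 2000
x1 <- rnorm(n)
x2 <- rnorm(n)
dat_xor <- data.frame(x1 = x1, x2 = x2, y = factor(as.integer(xor(x1 > 0, x2 > 0))))  # pure interaction, no main effects

fit_nb <- naiveBayes(y ~ x1 + x2, data = dat_xor)
mean(predict(fit_nb, dat_xor) == dat_xor$y)  # close to 0.5: the interaction is invisible to naive Bayes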

Section 2: Naive Bayes Classifier (2)

[Figure F02: example data of 10 customers used in the calculation below]

\[ \begin{align} L(\text{Apply: yes}|X) & = P(\text{Apply: yes})P(\text{Gender: female}|\text{Apply: yes})P(\text{Age: 31-40}|\text{Apply: yes})P(\text{Occupation: office worker}|\text{Apply: yes})P(\text{Income: medium}|\text{Apply: yes}) \\ & = \frac{6}{10} \times \frac{4}{6} \times \frac{2}{6} \times \frac{3}{6} \times \frac{1}{6} = 0.0111 \\\\ L(\text{Apply: no}|X) & = P(\text{Apply: no})P(\text{Gender: female}|\text{Apply: no})P(\text{Age: 31-40}|\text{Apply: no})P(\text{Occupation: office worker}|\text{Apply: no})P(\text{Income: medium}|\text{Apply: no}) \\ & = \frac{4}{10} \times \frac{1}{4} \times \frac{3}{4} \times \frac{2}{4} \times \frac{1}{4} = 0.0094 \end{align} \]
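
– For reference, the two products can be reproduced directly in R (the fractions are copied verbatim from the formula above; the underlying 10-customer table is shown in Figure F02):

L_yes <- (6/10) * (4/6) * (2/6) * (3/6) * (1/6)
L_no  <- (4/10) * (1/4) * (3/4) * (2/4) * (1/4)
c(apply_yes = L_yes, apply_no = L_no)                    # the two likelihoods
c(apply_yes = L_yes, apply_no = L_no) / (L_yes + L_no)   # normalized into posterior probabilities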

Section 2: Naive Bayes Classifier (3)

– Let us use "GENDER" and "Rate" to predict "AMI":

library(e1071)

dat[,'GENDER'] <- factor(dat[,'GENDER'])
dat[,'AMI'] <- factor(dat[,'AMI'])

fit_Bayes <- naiveBayes(AMI ~ GENDER + Rate, data = dat)
fit_Bayes
## 
## Naive Bayes Classifier for Discrete Predictors
## 
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
## 
## A-priori probabilities:
## Y
##   not-AMI    NSTEMI     STEMI 
## 0.7147674 0.1132839 0.1719488 
## 
## Conditional probabilities:
##          GENDER
## Y            female      male
##   not-AMI 0.4915094 0.5084906
##   NSTEMI  0.1904762 0.8095238
##   STEMI   0.1411765 0.8588235
## 
##          Rate
## Y             [,1]     [,2]
##   not-AMI 90.12830 23.72952
##   NSTEMI  80.92262 22.54098
##   STEMI   83.04706 22.14905

Section 2: Naive Bayes Classifier (4)

\[ \begin{align} L(\text{not-AMI}|\text{Rate = 100 & GENDER = male}) & = P(\text{not-AMI})P(\text{Rate = 100}|\text{not-AMI})P(\text{GENDER = male}|\text{not-AMI}) \\ L(\text{NSTEMI}|\text{Rate = 100 & GENDER = male}) & = P(\text{NSTEMI})P(\text{Rate = 100}|\text{NSTEMI})P(\text{GENDER = male}|\text{NSTEMI}) \\ L(\text{STEMI}|\text{Rate = 100 & GENDER = male}) & = P(\text{STEMI})P(\text{Rate = 100}|\text{STEMI})P(\text{GENDER = male}|\text{STEMI}) \end{align} \]

# Reproduce each likelihood by hand: prior probability x Normal density of Rate x P(male | class)
L1 <- prop.table(fit_Bayes$apriori)[1] * dnorm(100, mean = fit_Bayes$tables$Rate[1,1], sd = fit_Bayes$tables$Rate[1,2]) * fit_Bayes$tables$GENDER[1,2]
L2 <- prop.table(fit_Bayes$apriori)[2] * dnorm(100, mean = fit_Bayes$tables$Rate[2,1], sd = fit_Bayes$tables$Rate[2,2]) * fit_Bayes$tables$GENDER[2,2]
L3 <- prop.table(fit_Bayes$apriori)[3] * dnorm(100, mean = fit_Bayes$tables$Rate[3,1], sd = fit_Bayes$tables$Rate[3,2]) * fit_Bayes$tables$GENDER[3,2]
L_vec <- c(L1, L2, L3)
L_vec / sum(L_vec)
##   not-AMI    NSTEMI     STEMI 
## 0.6424388 0.1300580 0.2275032
predict(fit_Bayes, newdata = data.frame(Rate = 100, GENDER = factor('male', levels = c('female', 'male'))), type = 'raw')
##        not-AMI   NSTEMI     STEMI
## [1,] 0.6424388 0.130058 0.2275032

Exercise 2: Comparing the Naive Bayes Classifier with Logistic Regression

subdat <- dat[!(dat[,"LVD"] %in% NA) & !(dat[,"Rate"] %in% NA) & !(dat[,"QTc"] %in% NA),]
X <- subdat[,c('Rate', 'QTc')]
Y <- subdat[,"LVD"]

set.seed(0)
all_idx <- 1:nrow(subdat)

train_idx <- sample(all_idx, nrow(subdat) * 0.6)
valid_idx <- sample(all_idx[!all_idx %in% train_idx], nrow(subdat) * 0.2)
test_idx <- all_idx[!all_idx %in% c(train_idx, valid_idx)]

train_X <- X[train_idx,]
valid_X <- X[valid_idx,]
test_X <- X[test_idx,]

train_Y <- Y[train_idx]
valid_Y <- Y[valid_idx]
test_Y <- Y[test_idx]

Exercise 2 Answer

fit_Bayes <- naiveBayes(train_Y ~ ., data = train_X)  # naiveBayes() has no 'family' argument; the class distribution comes from train_Y

valid_pred <- predict(fit_Bayes, valid_X, type = 'raw')[,2]
test_pred <- predict(fit_Bayes, test_X, type = 'raw')[,2]

library(pROC)

roc_valid <- roc(valid_Y ~ valid_pred)
best_pos <- which.max(roc_valid$sensitivities + roc_valid$specificities)
best_cut <- roc_valid$thresholds[best_pos]

tab_test <- table(test_pred >= best_cut, test_Y)
sens <- tab_test[2,2] / sum(tab_test[,2])
spec <- tab_test[1,1] / sum(tab_test[,1])

roc_test <- roc(test_Y ~ test_pred)
plot(roc_test)

points(spec, sens, pch = 19)
text(0.5, 0.5, paste0('AUC(valid) = ', formatC(roc_valid$auc, digits = 3, format = 'f'),
                      '\nSens = ', formatC(sens, digits = 3, format = 'f'),
                      '\nSpec = ', formatC(spec, digits = 3, format = 'f'),
                      '\nAUC(test) = ', formatC(roc_test$auc, digits = 3, format = 'f')), col = 'red')

– But that is not the main point. The main point is that the results are finally different! Reporting both a naive Bayes classifier and logistic regression in the same paper is acceptable.

Section 3: k-Nearest Neighbor Algorithm (1)

– Let us now introduce an even simpler method: the k-nearest neighbor algorithm.

subdat <- dat[!(dat[,"AMI"] %in% NA) & !(dat[,"Rate"] %in% NA) & !(dat[,"QTc"] %in% NA),]
X_mat <- cbind(subdat[,'Rate'], subdat[,'QTc'])
col_idx <- as.integer(subdat[,'AMI'])
col_list <- c('#FF000030', '#00A00030', '#0000FF30')

plot(X_mat[,1], X_mat[,2], pch = 19, cex = 0.5, col = col_list[col_idx], main = 'All data')
points(150, 600, pch = 19, cex = 1)

Section 3: k-Nearest Neighbor Algorithm (2)

new_vec <- c(150, 600)
new_X <- matrix(new_vec, nrow = nrow(X_mat), ncol = 2, byrow = TRUE)
distance <- apply((X_mat - new_X)^2, 1, sum)  # squared Euclidean distance to the new point
k_param <- 20
limit_val <- sort(distance)[k_param]          # distance of the k-th nearest neighbor
used_id <- which(distance <= limit_val)       # indices of the k nearest neighbors

par(mfrow = c(1, 2))

plot(X_mat[-used_id,1], X_mat[-used_id,2], pch = 19, cex = 0.5, col = '#00000020', main = 'Original scale')
points(X_mat[used_id,1], X_mat[used_id,2], col = col_list[col_idx[used_id]], pch = 19, cex = 1)
points(150, 600, pch = 15, cex = 1)

plot(X_mat[-used_id,1], X_mat[-used_id,2], xlim = c(100, 200), ylim = c(500, 700), pch = 19, cex = 0.5, col = '#00000020', main = 'Interested zoom')
points(X_mat[used_id,1], X_mat[used_id,2], col = col_list[col_idx[used_id]], pch = 19, cex = 1)
points(150, 600, pch = 15, cex = 1)

table(subdat[used_id,'AMI'])
## 
## not-AMI  NSTEMI   STEMI 
##      16       1       3

Section 3: k-Nearest Neighbor Algorithm (3)

norm_X_mat <- scale(X_mat)  # standardize each column to mean 0 and SD 1
new_vec <- c(150, 600)
norm_vec <- (new_vec - attr(norm_X_mat, 'scaled:center')) / attr(norm_X_mat, 'scaled:scale')
new_norm_X <- matrix(norm_vec, nrow = nrow(X_mat), ncol = 2, byrow = TRUE)
distance <- apply((norm_X_mat - new_norm_X)^2, 1, sum)  # squared distance on the standardized scale
k_param <- 20
limit_val <- sort(distance)[k_param]
used_id <- which(distance <= limit_val)

par(mfrow = c(1, 2))

plot(X_mat[-used_id,1], X_mat[-used_id,2], pch = 19, cex = 0.5, col = '#00000020', main = 'Original scale')
points(X_mat[used_id,1], X_mat[used_id,2], col = col_list[col_idx[used_id]], pch = 19, cex = 1)
points(150, 600, pch = 15, cex = 1)

plot(X_mat[-used_id,1], X_mat[-used_id,2], xlim = c(100, 200), ylim = c(500, 700), pch = 19, cex = 0.5, col = '#00000020', main = 'Interested zoom')
points(X_mat[used_id,1], X_mat[used_id,2], col = col_list[col_idx[used_id]], pch = 19, cex = 1)
points(150, 600, pch = 15, cex = 1)

table(subdat[used_id,'AMI'])
## 
## not-AMI  NSTEMI   STEMI 
##      15       1       4

Section 3: k-Nearest Neighbor Algorithm (4)

– Note that the k-nearest neighbor algorithm has to compare every new sample against the original data, so it cannot be condensed into a compact summary model: each prediction requires the full training data (see the sketch below).
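
– To make this concrete, here is a minimal hand-rolled sketch of a single-sample k-NN prediction (the function knn_predict_one and its arguments are invented for illustration; the analyses below use knn() from the class package):

# The whole training matrix must be kept and scanned for every new prediction
knn_predict_one <- function (new_x, train_X, train_Y, k = 20) {
  d2 <- rowSums(sweep(as.matrix(train_X), 2, unlist(new_x))^2)  # squared distances to new_x
  votes <- table(train_Y[order(d2)[1:k]])                       # classes of the k nearest points
  names(votes)[which.max(votes)]                                # majority vote
}

# Example with a built-in data set
knn_predict_one(iris[1, 1:4], iris[, 1:4], iris$Species, k = 5)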

– We now split the samples directly into a "training set", a "validation set", and a "test set".

subdat <- dat[!(dat[,"AMI"] %in% NA) & !(dat[,"Rate"] %in% NA) & !(dat[,"QTc"] %in% NA),]
X <- subdat[,c('Rate', 'QTc')]
Y <- subdat[,"AMI"]

set.seed(0)
all_idx <- 1:nrow(subdat)

train_idx <- sample(all_idx, nrow(subdat) * 0.6)
valid_idx <- sample(all_idx[!all_idx %in% train_idx], nrow(subdat) * 0.2)
test_idx <- all_idx[!all_idx %in% c(train_idx, valid_idx)]

train_X <- X[train_idx,]
valid_X <- X[valid_idx,]
test_X <- X[test_idx,]

train_Y <- Y[train_idx]
valid_Y <- Y[valid_idx]
test_Y <- Y[test_idx]
library(class)

pred_valid.1 <- knn(train = train_X, test = valid_X, cl = train_Y, k = 20, prob = TRUE)
# Note: scale() here standardizes each set with its own mean and SD; strictly speaking,
# the validation set should be rescaled with the training set's center and scale
pred_valid.2 <- knn(train = scale(train_X), test = scale(valid_X), cl = train_Y, k = 20, prob = TRUE)

table(valid_Y, pred_valid.1)
##          pred_valid.1
## valid_Y   not-AMI NSTEMI STEMI
##   not-AMI     212      2     6
##   NSTEMI       24      0     2
##   STEMI        48      1     0
table(valid_Y, pred_valid.2)
##          pred_valid.2
## valid_Y   not-AMI NSTEMI STEMI
##   not-AMI     209      2     9
##   NSTEMI       24      0     2
##   STEMI        48      1     0

Exercise 3: Analysis Using the k-Nearest Neighbor Algorithm

subdat <- dat[!(dat[,"LVD"] %in% NA) & !(dat[,"Rate"] %in% NA) & !(dat[,"QTc"] %in% NA),]
X <- subdat[,c('Rate', 'QTc')]
Y <- subdat[,"LVD"]

set.seed(0)
all_idx <- 1:nrow(subdat)

train_idx <- sample(all_idx, nrow(subdat) * 0.6)
valid_idx <- sample(all_idx[!all_idx %in% train_idx], nrow(subdat) * 0.2)
test_idx <- all_idx[!all_idx %in% c(train_idx, valid_idx)]

train_X <- X[train_idx,]
valid_X <- X[valid_idx,]
test_X <- X[test_idx,]

train_Y <- Y[train_idx]
valid_Y <- Y[valid_idx]
test_Y <- Y[test_idx]

Exercise 3 Answer

valid_pred <- knn(train = scale(train_X), test = scale(valid_X), cl = train_Y, k = 20, prob = TRUE)
test_pred <- knn(train = scale(train_X), test = scale(test_X), cl = train_Y, k = 20, prob = TRUE)

# attr(, 'prob') is the vote share of the *predicted* class, so convert it into the
# probability of the second class level (the same class that posterior[,2] refers to above)
pos_class <- levels(valid_pred)[2]
valid_prob <- ifelse(valid_pred == pos_class, attr(valid_pred, 'prob'), 1 - attr(valid_pred, 'prob'))
test_prob <- ifelse(test_pred == pos_class, attr(test_pred, 'prob'), 1 - attr(test_pred, 'prob'))

library(pROC)

roc_valid <- roc(valid_Y ~ valid_prob)
best_pos <- which.max(roc_valid$sensitivities + roc_valid$specificities)
best_cut <- roc_valid$thresholds[best_pos]

tab_test <- table(test_prob >= best_cut, test_Y)
sens <- tab_test[2,2] / sum(tab_test[,2])
spec <- tab_test[1,1] / sum(tab_test[,1])

roc_test <- roc(test_Y ~ test_prob)
plot(roc_test)

text(0.5, 0.5, paste0('AUC(valid) = ', formatC(roc_valid$auc, digits = 3, format = 'f'),
                      '\nSens = ', formatC(sens, digits = 3, format = 'f'),
                      '\nSpec = ', formatC(spec, digits = 3, format = 'f'),
                      '\nAUC(test) = ', formatC(roc_test$auc, digits = 3, format = 'f')), col = 'red')

Lesson Summary

– You should be starting to feel that machine learning is not that difficult: in the past three lessons we taught you three statistical methods, one per lesson, while today you have learned three machine learning methods in one go. Feeling confident?

– From all these experiments you should have noticed that linear prediction is not necessarily bad; in fact, most of the time linear predictors do not perform noticeably worse.