Machine Learning 2: Support Vector Machines

林嶔 (Lin, Chin)

Lesson 20

Feature Dependency (1)

– That is: suppose two variables both have predictive power for the outcome, but each looks completely unrelated to it when examined alone, and the relationship only appears when the two are considered together. Under the way a decision tree operates, wouldn't this be impossible to handle?

– Put another way, the two variables have an interaction.

x1 = rep(0:1, 500)              # x1 alternates 0, 1, 0, 1, ...
x2 = rep(0:1, rep(500, 2))      # x2 is 500 zeros followed by 500 ones
y = x1 + x2 - 2*x1*x2           # y is the exclusive-or (XOR) of x1 and x2
y = as.factor(y)

table(x1, y)
##    y
## x1    0   1
##   0 250 250
##   1 250 250
table(x2, y)
##    y
## x2    0   1
##   0 250 250
##   1 250 250
table(x1, y, x2)
## , , x2 = 0
## 
##    y
## x1    0   1
##   0 250   0
##   1   0 250
## 
## , , x2 = 1
## 
##    y
## x1    0   1
##   0   0 250
##   1 250   0
library(party)
tree.model = ctree(y~x1+x2)
plot(tree.model)

Feature Dependency (2)

set.seed(0)
x1 = rnorm(300) 
x2 = rnorm(300) 
lr1 = x1^2 + x2^2
y = lr1 > mean(lr1)
plot(x1, x2, col = (y + 1)*2, pch = 19)

library(rgl)
plot3d(x = x1,
       y = x2,
       z = x1*x2,
       col = (y + 1)*2, size = 3, main="3D Scatterplot")


plot(x1^2, x2^2, col = (y + 1)*2, pch = 19)

Feature Dependency (3)

F20_1

Introduction to the Linear Support Vector Machine (1)

– In a 2-dimensional plane, the linear SVM tries to find a line that perfectly separates the red points from the blue points

x1 = c(0, 2, 2, 3)
x2 = c(0, 2, 0, 0)
y = c(1, 1, -1, -1)
plot(x1, x2, col = y + 3, pch = 19, cex = 3)

– Of course, at a glance we can see that many lines easily separate these four points perfectly; for example, x2 = -0.5 + 0.5 × x1 splits the red and blue points with no trouble

plot(x1, x2, col =  y + 3, pch = 19, cex = 3)
abline(a = -0.5, b = 0.5, lwd = 2, lty = 1)

– But that is not enough. Even if we cannot yet say why, you probably feel that the line x2 = -1 + 1 × x1 makes a better cut!

plot(x1, x2, col =  y + 3, pch = 19, cex = 3)
abline(a = -1, b = 1, lwd = 2, lty = 1)

Introduction to the Linear Support Vector Machine (2)

plot(x1, x2, col =  y + 3, pch = 19, cex = 3)
abline(a = -1, b = 1, lwd = 2, lty = 1)
abline(a = 0, b = 1, lwd = 2, lty = 2)
abline(a = -2, b = 1, lwd = 2, lty = 2)

  1. We want to find a line that perfectly separates these points (the constraint)

  2. Among such lines, the distance to the closest point should be as large as possible (the quantity to maximize)

– For this line, w0 = 1, w1 = -1, and w2 = 1, which is exactly the solution we want in this example

# Distance from the point (x1, x2) to the line w0 + w1*x1 + w2*x2 = 0
distance.func = function (x1, x2, w0, w1, w2) {
  dist = 1/sqrt(w1^2 + w2^2) * abs(w0 + w1 * x1 + w2 * x2)
  return(dist)
}

distance.func(0, 0, w0 = 1, w1 = -1, w2 = 1)
## [1] 0.7071068
distance.func(2, 2, w0 = 1, w1 = -1, w2 = 1)
## [1] 0.7071068
distance.func(2, 0, w0 = 1, w1 = -1, w2 = 1)
## [1] 0.7071068
distance.func(3, 0, w0 = 1, w1 = -1, w2 = 1)
## [1] 1.414214
# Signed version: multiplying by the label y removes the absolute value,
# so the result stays positive only when the point falls on its correct side
distance.func = function (x1, x2, y, w0, w1, w2) {
  dist = 1/sqrt(w1^2 + w2^2) * y * (w0 + w1 * x1 + w2 * x2)
  return(dist)
}

distance.func(0, 0, 1, w0 = 1, w1 = -1, w2 = 1)
## [1] 0.7071068
distance.func(2, 2, 1, w0 = 1, w1 = -1, w2 = 1)
## [1] 0.7071068
distance.func(2, 0, -1, w0 = 1, w1 = -1, w2 = 1)
## [1] 0.7071068
distance.func(3, 0, -1, w0 = 1, w1 = -1, w2 = 1)
## [1] 1.414214

F20_2

Introduction to the Linear Support Vector Machine (3)

– Since we set the two dashed lines to be w0 + w1 × x1 + w2 × x2 = 1 and w0 + w1 × x1 + w2 × x2 = -1, the distance from every point to the solid line must at least satisfy the following inequality

F20_3
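– (The figure is not reproduced here; in standard hard-margin notation, with points indexed by i, the constraint it expresses amounts to the following.)

$$\frac{y_i \, (w_0 + w_1 x_{1,i} + w_2 x_{2,i})}{\sqrt{w_1^2 + w_2^2}} \;\ge\; \frac{1}{\sqrt{w_1^2 + w_2^2}}, \qquad \text{equivalently} \qquad y_i \, (w_0 + w_1 x_{1,i} + w_2 x_{2,i}) \ge 1$$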

– Since that quantity is already greater than or equal to 1 for every point, it drops out of the distance we want to maximize; all we need to maximize is...

F20_4
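– (In standard notation, the quantity left to maximize is the margin, i.e. the common lower bound on all of those distances.)

$$\max_{w_0, w_1, w_2} \; \frac{1}{\sqrt{w_1^2 + w_2^2}}$$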

– Rewriting the expression a little, the problem becomes simpler

F20_5
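– (Maximizing 1/sqrt(w1^2 + w2^2) is the same as minimizing w1^2 + w2^2, so the standard primal form of the problem is the one below; the factor 1/2 is just a convention.)

$$\min_{w_0, w_1, w_2} \; \frac{1}{2}\left(w_1^2 + w_2^2\right) \qquad \text{subject to} \qquad y_i \, (w_0 + w_1 x_{1,i} + w_2 x_{2,i}) \ge 1 \;\; \text{for all } i$$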

Introduction to the Linear Support Vector Machine (4)

– We have to switch to quadratic programming to solve this. Here we will let a package get us through the hard part, but we first need to recast our problem:

F20_5

F20_6
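– (For reference: quadprog's solve.QP() solves quadratic programs of the standard form below. In the code that follows, the unknown vector b is (w0, w1, w2), dvec is zero, Dmat carries the quadratic part of the objective padded with tiny values so it stays positive definite, and each column of Amat encodes one constraint y_i (w0 + w1 x1_i + w2 x2_i) >= 1.)

$$\min_{b} \; \left( -d^{T} b + \tfrac{1}{2}\, b^{T} D\, b \right) \qquad \text{subject to} \qquad A^{T} b \ge b_0$$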

library(quadprog)
n.sample = 4
n.weight = 3 
small.value = 5e-6
# solve.QP requires a positive-definite Dmat, so a matrix of tiny values is used here
Q.matrix = matrix(small.value, nrow = n.weight, ncol = n.weight)
Q.matrix = Q.matrix + small.value*diag(n.weight)
# Each column of A.matrix is one constraint y_i*(w0 + w1*x1_i + w2*x2_i) >= 1
A.matrix = rbind(rep(1, n.sample)*y, x1*y, x2*y)
fit = solve.QP(Dmat = Q.matrix, dvec = rep(0, n.weight), Amat = A.matrix, bvec = rep(1, n.sample))
fit$solution
## [1]  1 -1  1

– Once the answer is solved, we can use this solution to draw the lines

COEF = fit$solution
A0 = -COEF[1]/COEF[3]
A1 = A0 + 1/COEF[3]
A2 = A0 - 1/COEF[3]
B = -COEF[2]/COEF[3]

plot(x1, x2, col =  y + 3, pch = 19, cex = 3)
abline(a = A0, b = B, lwd = 2, lty = 1)
abline(a = A1, b = B, lwd = 2, lty = 2)
abline(a = A2, b = B, lwd = 2, lty = 2)

– Let's wrap this into a complete solver function

mysvm = function (x1, x2, y) {
  
  require(quadprog)
  
  n.sample = length(x1)
  n.weight = 3
  small.value = 5e-6
  # Quadratic term of the objective: penalize w1^2 and w2^2 (diagonal = 1),
  # while w0 gets only a tiny weight; the small values keep Dmat positive definite
  Q.matrix = matrix(small.value, nrow = n.weight, ncol = n.weight)
  diag(Q.matrix) = 1
  Q.matrix[1,1] = small.value
  A.matrix = rbind(rep(1, n.sample)*y, x1*y, x2*y)
  fit = solve.QP(Dmat = Q.matrix, dvec = rep(0, n.weight), Amat = A.matrix, bvec = rep(1, n.sample))
  
  COEF = fit$solution
  A0 = -COEF[1]/COEF[3]
  A1 = A0 + 1/COEF[3]
  A2 = A0 - 1/COEF[3]
  B = -COEF[2]/COEF[3]
  
  plot(x1, x2, col =  y + 3, pch = 19)
  abline(a = A0, b = B, lwd = 2, lty = 1)
  abline(a = A1, b = B, lwd = 2, lty = 2)
  abline(a = A2, b = B, lwd = 2, lty = 2)
  
}

mysvm(x1 = c(0, 2, 2, 3),
      x2 = c(0, 2, 0, 0),
      y = c(1, 1, -1, -1))

mysvm(x1 = c(0, 2, 3, 4, 5),
      x2 = c(0, 2, 0, 0, 3),
      y = c(1, 1, -1, -1, -1))

Exercise 1

data(iris)
sub.iris = iris[1:100,]
X1 = sub.iris[,1]
X2 = sub.iris[,2]
Y = as.integer(sub.iris[,5]) * 2 - 3

mysvm(x1 = X1, x2 = X2, y = Y)

glm.model = glm((Y>0)~X1+X2, family = "binomial")
COEF = glm.model$coefficients
A = -COEF[1]/COEF[3]
B = -COEF[2]/COEF[3]

plot(X1, X2, col = Y + 3, pch = 19)
abline(a = A, b = B, col = "darkgreen")

Building an SVM with a Package (1)

F20_5

– Before moving on to the next part, we first transform the "constrained" optimization problem above: by adding Lagrange multipliers to the objective, the problem becomes an "unconstrained" optimization problem.

F20_7
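– (In standard notation, introducing one multiplier α_i ≥ 0 per constraint gives the Lagrangian below.)

$$L(w_0, w_1, w_2, \alpha) = \frac{1}{2}\left(w_1^2 + w_2^2\right) - \sum_{i} \alpha_i \left[ y_i \, (w_0 + w_1 x_{1,i} + w_2 x_{2,i}) - 1 \right]$$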

– Next, we revise our optimization objective to:

F20_8

– Then we can transform the problem into:

F20_9
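– (In standard notation, the revised objective is the min-max problem over the Lagrangian, and exchanging the order of optimization, which is the dual problem, gives the max-min form.)

$$\min_{w_0, w_1, w_2} \; \max_{\alpha \ge 0} \; L(w_0, w_1, w_2, \alpha) \quad \longrightarrow \quad \max_{\alpha \ge 0} \; \min_{w_0, w_1, w_2} \; L(w_0, w_1, w_2, \alpha)$$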

– Given a special set of conditions, the KKT conditions, we can eliminate W and b:

F20_10
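– (Setting the partial derivatives of L with respect to w1, w2, and w0 to zero gives the standard relations below; they let W and the intercept be expressed through the multipliers alone.)

$$w_j = \sum_{i} \alpha_i \, y_i \, x_{j,i} \;\; (j = 1, 2), \qquad \sum_{i} \alpha_i \, y_i = 0$$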

Building an SVM with a Package (2)

F20_11
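– (Substituting those relations back into L removes w0, w1, and w2 and leaves the standard dual problem in α alone, which is what the solve.QP() call below is set up to solve.)

$$\max_{\alpha} \; \sum_{i} \alpha_i - \frac{1}{2} \sum_{i} \sum_{j} \alpha_i \alpha_j \, y_i y_j \, (x_i \cdot x_j) \qquad \text{subject to} \qquad \alpha_i \ge 0, \;\; \sum_{i} \alpha_i y_i = 0$$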

#set.seed(0)
x1 = c(0, 2, 2, 3)
x2 = c(0, 2, 0, 0)
y = c(1, 1, -1, -1)

library(quadprog)

n.sample = 4
small.value = 5e-6

X = cbind(x1, x2)

# Dmat for the dual: (y y^T) elementwise times the Gram matrix (X X^T),
# plus a tiny ridge to keep it positive definite
Q.matrix = (y%*%t(y))*(X%*%t(X))
Q.matrix = Q.matrix + small.value*diag(n.sample)
# First constraint (made an equality via meq = 1): sum(alpha_i * y_i) = 0;
# the remaining n.sample constraints enforce alpha_i >= 0
A.matrix = t(rbind(matrix(y, ncol = n.sample), diag(n.sample)))

fit = solve.QP(Dmat = Q.matrix, dvec = rep(1, n.sample), Amat = A.matrix, bvec = rep(0, n.sample + 1), meq = 1, factorized = FALSE)
qpsol <- fit$solution
print(qpsol)
## [1] 0.4999981 0.4999981 0.9999963 0.0000000

– Don't forget the KKT conditions we imposed when solving for the extremum; let's review the relationships

F20_10

F20_14
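– (For any support vector s the constraint holds with equality, y_s (w0 + w1 x1_s + w2 x2_s) = 1; since y_s squared is 1, the coefficients can be recovered as below, which is what findCoefs() computes, averaging w0 over all support vectors.)

$$w_j = \sum_{i} \alpha_i \, y_i \, x_{j,i} \;\; (j = 1, 2), \qquad w_0 = y_s - (w_1 x_{1,s} + w_2 x_{2,s})$$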

– One thing to note here: if a Lagrange multiplier equals 0, that point cannot be used when computing b, and it also has no influence at all on the W vector.

– The remaining points, whose Lagrange multipliers are greater than 0, are called "support vectors". They are exactly the points lying closest to the wide separating boundary, and now we finally see why this method is called a "support vector machine".

# Recover (w0, w1, w2) from the dual solution: only points with alpha > 0
# (the support vectors) contribute to W; w0 is averaged over them via y_i - x_i %*% W
findCoefs <- function(a, y, X){
  nonzero <-  abs(a) > 5e-6
  W <- rowSums(sapply(which(nonzero), function(i) a[i]*y[i]*X[i,]))
  b <- mean(sapply(which(nonzero), function(i) y[i]-X[i,]%*%W))
  result <- c(b, W)
  names(result) = c("w0", "w1", "w2")
  return(result)
}

coefs = findCoefs(qpsol, y, X)
print(coefs)
##         w0         w1         w2 
##  0.9999975 -0.9999963  0.9999963
A = -coefs[1]/coefs[3]
B = -coefs[2]/coefs[3]

plot(x1, x2, col =  y + 3, pch = 19)
abline(a = A, b = B, lwd = 2, lty = 1)

Building an SVM with a Package (3)

library(e1071)

x1 = c(0, 2, 2, 3)
x2 = c(0, 2, 0, 0)
y = c(1, 1, -1, -1)

svm.model = svm(y ~ x1 + x2, kernel = "linear", scale = FALSE, type = "C-classification", cost = 1e5)

– These are the "support vectors"

svm.model$SV
##   x1 x2
## 1  0  0
## 2  2  2
## 3  2  0

– This is each support vector's Lagrange multiplier multiplied by its label

svm.model$coefs
##      [,1]
## [1,]  0.5
## [2,]  0.5
## [3,] -1.0

– This is the negative of the intercept

svm.model$rho
## [1] -1
# svm.model$coefs stores alpha_i * y_i for each support vector, so W = sum(coefs_i * SV_i)
W.vector = rowSums(sapply(1:length(svm.model$coefs), function(i) svm.model$coefs[i]*svm.model$SV[i,]))
w0 = -svm.model$rho   # rho is the negative of the intercept, so w0 = -rho
w1 = W.vector[1]
w2 = W.vector[2]
A0 = -w0/w2
A1 = A0 + 1/w2
A2 = A0 - 1/w2
B = -w1/w2

plot(x1, x2, col =  y + 3, pch = 19, cex = 3)
abline(a = A0, b = B, lwd = 2, lty = 1)
abline(a = A1, b = B, lwd = 2, lty = 2)
abline(a = A2, b = B, lwd = 2, lty = 2)

svm.model$decision.values
##   1/-1
## 1    1
## 2    1
## 3   -1
## 4   -2
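– As a quick sanity check (this line is not in the original slides), the decision values above are just the linear score w0 + w1 × x1 + w2 × x2 evaluated at each of the four points with the coefficients recovered above:

w0 + w1 * x1 + w2 * x2   # should come out to approximately 1, 1, -1, -2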

Building an SVM with a Package (4)

data(iris)
sub.iris = iris[1:100,c(1, 2, 5)]
sub.iris[,3] = as.factor(as.character(sub.iris[,3]))
X1 = sub.iris[,1]
X2 = sub.iris[,2]
Y = as.integer(sub.iris[,3])

svm.model = svm(Species ~ Sepal.Length + Sepal.Width, data = sub.iris, kernel = "linear", scale = FALSE, type = "C-classification", cost = 1e5)

W.vector = rowSums(sapply(1:length(svm.model$coefs), function(i) svm.model$coefs[i]*svm.model$SV[i,]))
w0 = -svm.model$rho
w1 = W.vector[1]
w2 = W.vector[2]

A0 = -w0/w2
A1 = A0 + 1/w2
A2 = A0 - 1/w2
B = -w1/w2

plot(X1, X2, col =  Y * 2 + 2, pch = 19)
abline(a = A0, b = B, lwd = 2, lty = 1)
abline(a = A1, b = B, lwd = 2, lty = 2)
abline(a = A2, b = B, lwd = 2, lty = 2)

predict(svm.model, data.frame(Sepal.Length = 5, Sepal.Width = 4))
##      1 
## setosa 
## Levels: setosa versicolor
predict(svm.model, data.frame(Sepal.Length = 6, Sepal.Width = 3))
##          1 
## versicolor 
## Levels: setosa versicolor
plot(svm.model, sub.iris)