林嶔 (Lin, Chin)
Lesson 4: Foundations of Artificial Intelligence 1 (Linear Regression)
– Please download the sample data here
dat <- read.csv("ECG_train.csv", header = TRUE, fileEncoding = 'CP950', stringsAsFactors = FALSE, na.strings = "")
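– The three outputs below come from fitting the same simple regression in three equivalent ways, as their Call lines show. A sketch of the calls, assuming dat was loaded as above:
X <- dat[,"AGE"]
Y <- dat[,"Rate"]
lm(Y ~ X)                                    # plain vectors
lm(Rate ~ AGE, data = dat)                   # formula interface with a data frame
lm(Rate ~ ., data = dat[,c("Rate", "AGE")])  # "." means every other column in the data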
##
## Call:
## lm(formula = Y ~ X)
##
## Coefficients:
## (Intercept) X
## 78.9313 0.1021
##
## Call:
## lm(formula = Rate ~ AGE, data = dat)
##
## Coefficients:
## (Intercept) AGE
## 78.9313 0.1021
##
## Call:
## lm(formula = Rate ~ ., data = dat[, c("Rate", "AGE")])
##
## Coefficients:
## (Intercept) AGE
## 78.9313 0.1021
\[\hat{y} = f(x) = b_{0} + b_{1}x\]
\[\hat{Rate} = 78.9313 + 0.1021 \times AGE\]
\[\hat{Rate} = 85.0573 = 78.9313 + 0.1021 \times 60\]
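– We can check the worked prediction for a 60-year-old directly in R. A sketch, assuming the simple fit is stored in an object named model:
model <- lm(Rate ~ AGE, data = dat)
predict(model, newdata = data.frame(AGE = 60))  # should return roughly 85.06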
model <- lm(Rate ~ AGE + PR, data = dat)                    # explicit formula
model <- lm(Rate ~ ., data = dat[,c('Rate', 'AGE', 'PR')])  # equivalent: "." uses every other column
model
##
## Call:
## lm(formula = Rate ~ ., data = dat[, c("Rate", "AGE", "PR")])
##
## Coefficients:
## (Intercept) AGE PR
## 99.0890 0.1203 -0.1373
\[\hat{y} = f(x) = b_{0} + b_{1}x_{1} + b_{2}x_{2}\]
\[\hat{Rate} = 99.0890 + 0.1203 \times AGE - 0.1373 \times PR\]
X <- dat[,"AGE"]
Y <- dat[,"Rate"]
model <- lm(Y~X)
x <- c(10, 150)            # two AGE values spanning the axis
y <- 78.9313 + 0.1021 * x  # predicted Rate at those ages
plot(X, Y, ylab = "Rate", xlab = "AGE", main = "Scatter plot of AGE and Rate", pch = 19, col = "#00000030")
lines(x, y, col = "red", lwd = 2)
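– As an alternative to hard-coding the coefficients, abline() can draw the fitted line straight from the model object:
abline(model, col = "red", lwd = 2)  # same red line, read from lm(Y ~ X)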
– For the \(i\)-th person, the error looks like this:
\[\delta_i = y_{i} - \hat{y_{i}} = y_{i} - (b_{0} + b_{1}x_i)\]
– The "goal" of linear regression analysis is to find the pair \(b_{0}\) and \(b_{1}\) that minimizes the "residual sum of squares", which we call the \(loss\):
\[loss = \sum \limits_{i=1}^{n} \delta_{i}^{2} = \sum \limits_{i=1}^{n} \left(y_{i} - \hat{y_{i}}\right)^{2}\]
– Writing \(\hat{y_{i}}\) out in full gives:
\[loss = \sum \limits_{i=1}^{n} \left(y_{i} - \left(b_{0} + b_{1}x_{i}\right)\right)^{2}\]
– Since we want the \(loss\) to be as small as possible, we can recast this as an "optimization problem" (finding the extremum of a function):
\[min(loss)\]
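– To make the optimization view concrete, here is a sketch that evaluates the loss at candidate coefficients (X and Y as defined above; rows with missing values are dropped first):
cc <- complete.cases(X, Y)  # keep only the observations where both values are present
loss_fun <- function(b0, b1) sum((Y[cc] - (b0 + b1 * X[cc]))^2)
loss_fun(78.9313, 0.1021)  # loss at the fitted coefficients
loss_fun(78.9313, 0.2000)  # any other pair should give a larger loss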
– Consider a simple example:
\[f(x) = x^{2} + 2x + 1\]
– Next, we differentiate this function and look for where the derivative equals 0, which tells us the location of its extremum:
\[\frac{\partial}{\partial x} f(x) = 2x + 2 = 0\]
\[x = -1\]
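– Base R can confirm this numerically: optimize() searches an interval for the minimum of a function.
f <- function(x) x^2 + 2 * x + 1
optimize(f, interval = c(-10, 10))  # the minimum should be at x = -1, where f(-1) = 0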
– First, a brief introduction to partial differentiation, which is needed whenever we differentiate a function of two or more variables. We illustrate with a simple example:
\[f(a, b) = 4a^{2}- 4a + b^{2} - 2b + 5\]
– Completing the square shows that the minimum value is 3, attained at \(a = \frac{1}{2}\) and \(b = 1\):
\[ \begin{align} f(a, b) & = (4a^{2} - 4a + 1) + (b^{2} - 2b + 1) + 3 \\ & = (2a - 1)^{2} + (b - 1)^{2} + 3 \end{align} \]
– Now let's derive the same answer using partial derivatives:
\[ \begin{align} \frac{\partial}{\partial a} f(a, b) & = 8a - 4 \\ \frac{\partial}{\partial b} f(a, b) & = 2b - 2 \end{align} \]
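– Setting both partial derivatives to zero recovers the minimum that the completed-square form shows directly:
\[ \begin{align} 8a - 4 = 0 & \Rightarrow a = \frac{1}{2} \\ 2b - 2 = 0 & \Rightarrow b = 1 \end{align} \]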
– Back to the regression problem. Recall the loss:
\[loss = \sum \limits_{i=1}^{n} \left(y_{i} - \left(b_{0} + b_{1}x_{i}\right)\right)^{2}\]
– You may need to review the chain rule
\[\frac{\partial}{\partial x}h(x) = \frac{\partial}{\partial x}f(g(x)) = \frac{\partial}{\partial g(x)}f(g(x)) \cdot\frac{\partial}{\partial x}g(x)\]
– The partial derivatives with respect to \(b_0\) and \(b_1\) are as follows:
\[ \begin{align} \frac{\partial}{\partial b_0}loss & = \sum \limits_{i=1}^{n} \frac{\partial}{\partial \hat{y_{i}}} \left(y_{i} - \hat{y_{i}}\right)^{2} \cdot \frac{\partial}{\partial b_0} \hat{y_{i}}\\ & = 2 \sum \limits_{i=1}^{n}\left( \hat{y_{i}} - y_{i} \right) \cdot \frac{\partial}{\partial b_0} (b_{0} + b_{1}x_{i}) \\ & = 2 \sum \limits_{i=1}^{n}\left( \hat{y_{i}} - y_{i} \right) \\\\ \frac{\partial}{\partial b_1}loss & = 2 \sum \limits_{i=1}^{n}\left( \hat{y_{i}} - y_{i} \right) \cdot x_{i} \end{align} \]
– Setting both partial derivatives to zero:
\[ \begin{align} \frac{\partial}{\partial b_0}loss & = 2 \sum \limits_{i=1}^{n}\left( \hat{y_{i}} - y_{i} \right) = 0\\ \frac{\partial}{\partial b_1}loss & = 2 \sum \limits_{i=1}^{n}\left( \hat{y_{i}} - y_{i} \right) \cdot x_{i} = 0 \end{align} \]
– Dividing both equations by 2 and splitting the sums:
\[ \begin{align} \sum \limits_{i=1}^{n} \hat{y_{i}} - \sum \limits_{i=1}^{n} y_{i} = 0\\ \sum \limits_{i=1}^{n} \hat{y_{i}} \cdot x_{i} - \sum \limits_{i=1}^{n} y_{i} \cdot x_{i} = 0 \end{align} \]
\[ \begin{align} \sum \limits_{i=1}^{n} \left(b_{0} + b_{1}x_{i}\right) - \sum \limits_{i=1}^{n} y_{i} = 0\\ \sum \limits_{i=1}^{n} \left(b_{0} + b_{1}x_{i}\right) \cdot x_{i} - \sum \limits_{i=1}^{n} y_{i} \cdot x_{i} = 0 \end{align} \]
\[ \begin{align} \sum \limits_{i=1}^{n} b_{0} + \sum \limits_{i=1}^{n} b_{1}x_{i} - \sum \limits_{i=1}^{n} y_{i} = 0\\ \sum \limits_{i=1}^{n} b_{0}x_{i} + \sum \limits_{i=1}^{n} b_{1}x_{i}x_{i} - \sum \limits_{i=1}^{n} y_{i}x_{i} = 0 \end{align} \]
\[ \begin{align} b_{0} n + b_{1} \sum \limits_{i=1}^{n} x_{i} - \sum \limits_{i=1}^{n} y_{i} = 0\\ b_{0} \sum \limits_{i=1}^{n} x_{i} + b_{1} \sum \limits_{i=1}^{n} x_{i}x_{i} - \sum \limits_{i=1}^{n} y_{i}x_{i} = 0 \end{align} \]
– Multiplying the first equation by \(\frac{1}{n} \sum \limits_{i=1}^{n} x_{i}\) so that its \(b_{0}\) term matches the one in the second equation:
\[ \begin{align} b_{0} \sum \limits_{i=1}^{n} x_{i} + \frac{1}{n} \cdot b_{1} \sum \limits_{i=1}^{n} x_{i} \cdot \sum \limits_{i=1}^{n} x_{i} - \frac{1}{n} \cdot \sum \limits_{i=1}^{n} y_{i} \cdot \sum \limits_{i=1}^{n} x_{i} = 0\\ b_{0} \sum \limits_{i=1}^{n} x_{i} + b_{1} \sum \limits_{i=1}^{n} x_{i}x_{i} - \sum \limits_{i=1}^{n} y_{i}x_{i} = 0 \end{align} \]
\[ \begin{align} \frac{1}{n} \cdot b_{1} \sum \limits_{i=1}^{n} x_{i} \cdot \sum \limits_{i=1}^{n} x_{i} - b_{1} \sum \limits_{i=1}^{n} x_{i}x_{i} + \sum \limits_{i=1}^{n} y_{i}x_{i} - \frac{1}{n} \cdot \sum \limits_{i=1}^{n} y_{i} \cdot \sum \limits_{i=1}^{n} x_{i} & = 0\\ \frac{1}{n} \cdot b_{1} \sum \limits_{i=1}^{n} x_{i} \cdot \sum \limits_{i=1}^{n} x_{i} - b_{1} \sum \limits_{i=1}^{n} x_{i}x_{i} & = \frac{1}{n} \cdot \sum \limits_{i=1}^{n} y_{i} \cdot \sum \limits_{i=1}^{n} x_{i} - \sum \limits_{i=1}^{n} y_{i}x_{i}\\ b_{1} \cdot \left( \frac{1}{n} \cdot \sum \limits_{i=1}^{n} x_{i} \cdot \sum \limits_{i=1}^{n} x_{i} - \sum \limits_{i=1}^{n} x_{i}x_{i} \right) & = \frac{1}{n} \cdot \sum \limits_{i=1}^{n} y_{i} \cdot \sum \limits_{i=1}^{n} x_{i} - \sum \limits_{i=1}^{n} y_{i}x_{i}\\ b_{1} & = \frac{\frac{1}{n} \cdot \sum \limits_{i=1}^{n} y_{i} \cdot \sum \limits_{i=1}^{n} x_{i} - \sum \limits_{i=1}^{n} y_{i}x_{i}}{\frac{1}{n} \cdot \sum \limits_{i=1}^{n} x_{i} \cdot \sum \limits_{i=1}^{n} x_{i} - \sum \limits_{i=1}^{n} x_{i}x_{i}} \end{align} \]
– Compare with the result from lm():
##
## Call:
## lm(formula = Y ~ X)
##
## Coefficients:
## (Intercept) X
## 78.9313 0.1021
\[ \begin{align} b_{1} & = \frac{\frac{1}{n} \cdot \sum \limits_{i=1}^{n} y_{i} \cdot \sum \limits_{i=1}^{n} x_{i} - \sum \limits_{i=1}^{n} y_{i}x_{i}}{\frac{1}{n} \cdot \sum \limits_{i=1}^{n} x_{i} \cdot \sum \limits_{i=1}^{n} x_{i} - \sum \limits_{i=1}^{n} x_{i}x_{i}} \\\\ b_{0} n + b_{1} \sum \limits_{i=1}^{n} x_{i} - \sum \limits_{i=1}^{n} y_{i} &= 0\\ b_{0} &= \frac{1}{n} \sum \limits_{i=1}^{n} y_{i} - b_{1} \cdot \frac{1}{n} \sum \limits_{i=1}^{n} x_{i} \end{align} \]
– Note that you must first keep only the observations where both X and Y are non-missing before doing the calculation:
subdat <- dat[!(dat[,"Rate"] %in% NA) & !(dat[,"AGE"] %in% NA),]
X <- subdat[,"AGE"]
Y <- subdat[,"Rate"]
sum_X <- sum(X)
sum_Y <- sum(Y)
sum_XY <- sum(X * Y)
sum_XX <- sum(X * X)
n <- length(X)
b1 <- (sum_Y * sum_X / n - sum_XY) / (sum_X * sum_X / n - sum_XX)  # the closed-form slope derived above
b1
## [1] 0.1021352
– You may have noticed that \(\frac{1}{n} \sum \limits_{i=1}^{n} x_{i}\) and \(\frac{1}{n} \sum \limits_{i=1}^{n} y_{i}\) are simply the means of \(x\) and \(y\):
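– The value below comes from plugging those means into the formula for \(b_0\); the computation would look like this:
b0 <- mean(Y) - b1 * mean(X)
b0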
## [1] 78.93125
– Now, if we want to extend this to more x variables, the system of simultaneous equations becomes much messier. Is there a better way?
– Here is our earlier example:
\[ \begin{align} \hat{y_i} & = b_{0} + b_{1}x_i \\ y_i & = b_{0} + b_{1}x_i + \delta_i \end{align} \]
– We can view this equation as the following system:
\[ \begin{align} i = 1, y_1 & = b_{0} + b_{1}x_1 + \delta_1 \\ i = 2, y_2 & = b_{0} + b_{1}x_2 + \delta_2 \\ i = 3, y_3 & = b_{0} + b_{1}x_3 + \delta_3 \\ \vdots \\ i = n, y_n & = b_{0} + b_{1}x_n + \delta_n \end{align} \]
\[ \begin{align} Y & = XB + E \\\\ Y & = \begin{pmatrix} y_1 \\ y_2 \\ y_3 \\ \vdots \\ y_n \end{pmatrix} X = \begin{pmatrix} 1 & x_1 \\ 1 & x_2 \\ 1 & x_3 \\ \vdots & \vdots \\ 1 & x_n \end{pmatrix} B = \begin{pmatrix} b_0 \\ b_1 \end{pmatrix} E = \begin{pmatrix} \delta_1 \\ \delta_2 \\ \delta_3 \\ \vdots \\ \delta_n \end{pmatrix} \end{align} \]
\[loss = E^TE = (Y-XB)^T(Y-XB)\]
– Our goal is still to make the \(loss\) as small as possible:
\[min(loss)\]
\[ \begin{align} \frac{\partial}{\partial B}loss & = \frac{\partial}{\partial B} (Y-XB)^T(Y-XB) \\ & = \frac{\partial}{\partial B} (Y^T-B^TX^T)(Y-XB) \\ & = \frac{\partial}{\partial B} (Y^TY-B^TX^TY - Y^TXB + B^TX^TXB) \\ & = \frac{\partial}{\partial B} (Y^TY - 2B^TX^TY + B^TX^TXB) \\ & = - 2X^TY + 2X^TXB = 0 \end{align} \]
\[ \begin{align} X^TXB & = X^TY \\ B & = (X^TX)^{-1}X^TY \end{align} \]
The function matrix() creates a matrix
The function t() returns the transpose of a matrix
The function solve() returns the inverse of a matrix
Matrix multiplication is performed with the operator %*%
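– The outputs below were presumably produced by calls like these (a sketch; the matrix names A and B are assumptions):
A <- matrix(1:4, nrow = 2)   # filled column by column
A
t(A)         # transpose
solve(A)     # inverse
B <- matrix(2:5, nrow = 2)
B
A %*% B      # matrix multiplication
A * B        # element-wise multiplication, for contrast
rbind(A, B)  # bind rows
rbind(B, A)
cbind(A, B)  # bind columns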
## [,1] [,2]
## [1,] 1 3
## [2,] 2 4
## [,1] [,2]
## [1,] 1 2
## [2,] 3 4
## [,1] [,2]
## [1,] -2 1.5
## [2,] 1 -0.5
## [,1] [,2]
## [1,] 2 4
## [2,] 3 5
## [,1] [,2]
## [1,] 11 19
## [2,] 16 28
## [,1] [,2]
## [1,] 2 12
## [2,] 6 20
## [,1] [,2]
## [1,] 1 3
## [2,] 2 4
## [3,] 2 4
## [4,] 3 5
## [,1] [,2]
## [1,] 2 4
## [2,] 3 5
## [3,] 1 3
## [4,] 2 4
## [,1] [,2] [,3] [,4]
## [1,] 1 3 2 4
## [2,] 2 4 3 5
– We can then apply the formula we just derived to obtain our result:
\[ \begin{align} B & = (X^TX)^{-1}X^TY \end{align} \]
subdat <- dat[!(dat[,"Rate"] %in% NA) & !(dat[,"AGE"] %in% NA),]
X <- subdat[,"AGE"]
Y <- subdat[,"Rate"]
X_mat <- cbind(1, as.matrix(X))
Y_mat <- matrix(Y, ncol = 1)
B_mat <- solve(t(X_mat) %*% X_mat) %*% t(X_mat) %*% Y_mat
B_mat
## [,1]
## [1,] 78.9312550
## [2,] 0.1021352
– Compare with the result from lm():
##
## Call:
## lm(formula = Rate ~ ., data = dat[, c("Rate", "AGE")])
##
## Coefficients:
## (Intercept) AGE
## 78.9313 0.1021
– Please reproduce the following analysis result using matrix operations:
##
## Call:
## lm(formula = Rate ~ ., data = dat[, c("Rate", "AGE", "PR")])
##
## Coefficients:
## (Intercept) AGE PR
## 99.0890 0.1203 -0.1373
subdat <- dat[!(dat[,"Rate"] %in% NA) & !(dat[,"AGE"] %in% NA) & !(dat[,"PR"] %in% NA),]
X <- subdat[,c("AGE", "PR")]
Y <- subdat[,"Rate"]
X_mat <- cbind(1, as.matrix(X))
Y_mat <- matrix(Y, ncol = 1)
B_mat <- solve(t(X_mat) %*% X_mat) %*% t(X_mat) %*% Y_mat
B_mat
## [,1]
## 99.0890198
## AGE 0.1202761
## PR -0.1372964
– The answer is yes: a variable already coded as 0/1 can go straight into a linear regression, but a variable with more than two categories cannot be entered directly.
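– The two fits below were produced by calls like these, as their Call lines confirm; note that lm() expands a character variable such as AMI into dummy variables automatically:
lm(Rate ~ ., data = dat[,c("Rate", "AGE", "LVD")])  # LVD is already coded 0/1
lm(Rate ~ ., data = dat[,c("Rate", "AGE", "AMI")])  # AMI has three levels; lm() dummy-codes it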
##
## Call:
## lm(formula = Rate ~ ., data = dat[, c("Rate", "AGE", "LVD")])
##
## Coefficients:
## (Intercept) AGE LVD
## 84.332737 -0.005113 12.423469
##
## Call:
## lm(formula = Rate ~ ., data = dat[, c("Rate", "AGE", "AMI")])
##
## Coefficients:
## (Intercept) AGE AMINSTEMI AMISTEMI
## 86.80025 0.05229 -9.15157 -6.95232
– For equation 1:
\[\hat{Rate} = 84.332737 - 0.005113 \times AGE + 12.423469 \times LVD\]
– For equation 2:
\[\hat{Rate} = 86.80025 + 0.05229 \times AGE - 9.15157 \times (AMI = NSTEMI) - 6.95232 \times (AMI = STEMI)\]
dat[,'AMI1'] <- dat[,'AMI'] == 'NSTEMI'  # dummy variable: TRUE when AMI is NSTEMI
dat[,'AMI2'] <- dat[,'AMI'] == 'STEMI'   # dummy variable: TRUE when AMI is STEMI
model <- lm(Rate ~ ., data = dat[,c('Rate', 'AGE', 'AMI1', 'AMI2')])
model
##
## Call:
## lm(formula = Rate ~ ., data = dat[, c("Rate", "AGE", "AMI1",
## "AMI2")])
##
## Coefficients:
## (Intercept) AGE AMI1TRUE AMI2TRUE
## 86.80025 0.05229 -9.15157 -6.95232
subdat <- dat[!(dat[,"Rate"] %in% NA) & !(dat[,"AGE"] %in% NA) & !(dat[,"AMI1"] %in% NA) & !(dat[,"AMI2"] %in% NA),]
X <- subdat[,c("AGE", "AMI1", "AMI2")]
Y <- subdat[,"Rate"]
X_mat <- cbind(1, as.matrix(X))
Y_mat <- matrix(Y, ncol = 1)
B_mat <- solve(t(X_mat) %*% X_mat) %*% t(X_mat) %*% Y_mat
B_mat
## [,1]
## 86.80025338
## AGE 0.05228693
## AMI1 -9.15156608
## AMI2 -6.95232042
– Are you getting scared about the lessons ahead? Don't be too nervous: matrix differentiation is arguably the hardest part of this semester.
– Besides, even if you don't follow the mathematical theory, you can still obtain exactly the same results with the built-in functions.
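– As a final check (a sketch reusing model and B_mat from above), the built-in fit and the matrix solution should agree to numerical precision:
all.equal(unname(coef(model)), as.numeric(B_mat))  # should be TRUE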