林嶔 (Lin, Chin)
Lesson 2 統計分析實作1(描述性統計及描述統計圖表)
– 請至這裡下載範例資料
dat <- read.csv("ECG_train.csv", header = TRUE, fileEncoding = 'CP950', stringsAsFactors = FALSE, na.strings = "")
## [1] 61.14584
## [1] 18.45581
## [1] 340.6169
## [1] 61.96178
## 0% 25% 50% 75% 100%
## 20.03530 48.18833 61.96178 75.82827 102.63361
## 50%
## 61.96178
## 95%
## 89.31906
## [1] 20.0353
## [1] 102.6336
##
## female male
## 2172 2828
##
## 0 1
## female 459 247
## male 703 703
##
## female male
## 0.4344 0.5656
##
## 0 1
## female 0.2173295 0.1169508
## male 0.3328598 0.3328598
##
## 0 1
## female 0.6501416 0.3498584
## male 0.5000000 0.5000000
##
## 0 1
## female 0.3950086 0.2600000
## male 0.6049914 0.7400000
– 在R裡面,我們能夠畫出任何統計圖!
– 在R裡面的顏色可以在Colors in R裡查看
– 另外,這裡教一個新函數「par()」,他可以指定繪圖環境。其中最常見的應用為把4張圖放在同一張畫布內:
par(mfrow = c(2, 2))
hist(dat[,"AGE"], col = "red")
boxplot(dat[,"AGE"], col = "blue")
pie(table(dat[,"AMI"]), col = c("blue", "red", "green"))
barplot(table(dat[,"AMI"]), col = c("gray90", "gray50", "gray10"))
## count spray
## 1 10 A
## 2 7 A
## 3 20 A
## 4 14 A
## 5 14 A
## 6 12 A
plot(dat[,"AGE"], dat[,"Rate"], ylab = "Heart rate", xlab = "AGE", main = "Scatter plot of AGE and Heart rate")
plot(dat[,"AGE"], dat[,"Rate"], ylab = "Heart rate", xlab = "AGE", main = "Scatter plot of AGE and Heart rate", pch = 19)
– 函數「lines()」的效果是按照順序把幾個點連起來,舉例來說…
– 註:函數「plot.new()」及函數「plot.window()」是拿來開一張新畫布用的!
x <- c(1, 4, 7)
y <- c(2, 9, 6)
plot.new()
plot.window(xlim = c(0, 10), ylim = c(0, 10))
lines(x, y)
– 預測線的方程式,需要先學會線性回歸,在R語言裡面線性回歸是這樣建立的:
##
## Call:
## lm(formula = Y ~ X)
##
## Coefficients:
## (Intercept) X
## 78.9313 0.1021
x <- c(10, 150)
y <- 78.9313 + 0.1021 * x
plot(dat[,"AGE"], dat[,"Rate"], ylab = "Heart rate", xlab = "AGE", main = "Scatter plot of AGE and Heart rate", pch = 19)
lines(x, y, col = "red", lwd = 2)
## [1] "lm"
– 函數「ls()」可以協助我們看看物件中有哪些東西
– 函數「names()」也可以做到一樣的事情
## [1] "assign" "call" "coefficients" "df.residual"
## [5] "effects" "fitted.values" "model" "qr"
## [9] "rank" "residuals" "terms" "xlevels"
## (Intercept) X
## 78.9312550 0.1021352
x = c(1, 0, -1, 0)
y = c(0, 1, 0, -1)
t = c("A", "B", "C", "D")
plot.new()
plot.window(xlim = c(-1, 1), ylim = c(-1, 1))
text(x, y, t)
x = c(1, 0, -1, 0)
y = c(0, 1, 0, -1)
plot.new()
plot.window(xlim = c(-1, 1), ylim = c(-1, 1))
points(x, y, pch = 1:4)
plot.new()
plot.window(xlim = c(-1, 1), ylim = c(-1, 1))
legend("topleft", c("Female", "Male"), col = c("red", "blue"), pch = c(15, 19), bg = "gray90")
legend(0, 0, c("estimates", "95% CI"), lty = c(1, 2), lwd = 2, col = "black")
##
## female male
## 2172 2828
– 你應該已經會為圖片加料了,試著畫出下面這張圖:
pie(table(dat[,"GENDER"]), col = c("red", "blue"), labels = c('Female', 'Male'), main = 'Distribution of gender')
pie(table(dat[,"GENDER"]), col = c("red", "blue"), labels = c('Female', 'Male'), main = 'Distribution of gender')
text(0, 0, 'test-1')
text(0, 0.3, 'test-2')
## [1] 66.36768
## [1] 65.91322
– 下面這樣的語法與上面有些不同,但結果卻一致。還記得他們兩者的差別嗎?那為什麼結果又會一致呢?
## [1] 66.36768
## [1] 65.91322
paste(mean(dat[dat[,"LVD"] %in% 1,]$AGE, na.rm = TRUE), "±", sd(dat[dat[,"LVD"] %in% 1,]$AGE, na.rm = TRUE), sep = "")
## [1] "66.3676804579562±15.9551239076992"
paste0(mean(dat[dat[,"LVD"] %in% 1,]$AGE, na.rm = TRUE), "±", sd(dat[dat[,"LVD"] %in% 1,]$AGE, na.rm = TRUE))
## [1] "66.3676804579562±15.9551239076992"
m = mean(dat[dat[,"LVD"] %in% 1,]$AGE, na.rm = TRUE)
s = sd(dat[dat[,"LVD"] %in% 1,]$AGE, na.rm = TRUE)
formatC(m, digits = 3, format = "f")
## [1] "66.368"
## [1] "15.955"
## [1] "66.368±15.955"
m0 = mean(dat[dat[,"LVD"] %in% 0,]$AGE, na.rm = TRUE)
s0 = sd(dat[dat[,"LVD"] %in% 0,]$AGE, na.rm = TRUE)
txt0 = paste0(formatC(m0, digits = 3, format = "f"), "±", formatC(s0, digits = 3, format = "f"))
txt0
## [1] "65.913±17.341"
m1 = mean(dat[dat[,"LVD"] %in% 1,]$AGE, na.rm = TRUE)
s1 = sd(dat[dat[,"LVD"] %in% 1,]$AGE, na.rm = TRUE)
txt1 = paste0(formatC(m1, digits = 3, format = "f"), "±", formatC(s1, digits = 3, format = "f"))
txt1
## [1] "66.368±15.955"
x = c(m0, m1)
barplot(x, col = c("blue", "red"), xlab = "LVD", ylab = "AGE", ylim = c(0, 120))
lines(c(1.9, 1.9), c(m1 - s1, m1 + s1), lwd = 3)
lines(c(1.75, 2.05), c(m1 + s1, m1 + s1), lwd = 3)
lines(c(1.75, 2.05), c(m1 - s1, m1 - s1), lwd = 3)
lines(c(0.7, 0.7), c(m0 - s0, m0 + s0), lwd = 3)
lines(c(0.55, 0.85), c(m0 + s0, m0 + s0), lwd = 3)
lines(c(0.55, 0.85), c(m0 - s0, m0 - s0), lwd = 3)
text(1.9, 15, txt1, col = 'white', font = 2)
text(0.7, 15, txt0, col = 'white', font = 2)
legend("topright", c("Control", "Case"), fill = c("blue", "red"))
– 資料量多的時候經常會遇到這樣的問題,這時候我們可能需要告訴使用者不同區域點的密度。
– 舉例來說,不透明的紅色的色碼為『#FF0000』或『#FF0000FF』
– 透明度50%的紅色色碼為『#FF000080』
– 透明度50%的黑色色碼為『#00000080』
x = c(1, 0, -1, 0)
y = c(0, 1, 0, -1)
plot.new()
plot.window(xlim = c(-1, 1), ylim = c(-1, 1))
points(x, y, pch = 19, cex = 2, col = c("#FF0000", "#FF0000FF", "#FF000080", "#00000080"))
## [1] "#FF000080"
## [1] "#B3804DB3"
– 現在,假設你對單一色階的散布圖仍然不滿意,想要精益求精,google給了你一條明路,請參考R Scatter Plot: symbol color represents number of overlapping points
## Data in a data.frame
x1 <- rnorm(n=1E3, sd=2)
x2 <- x1*1.2 + rnorm(n=1E3, sd=2)
df <- data.frame(x1,x2)
## Use densCols() output to get density at each point
x <- densCols(x1,x2, colramp=colorRampPalette(c("black", "white")))
df$dens <- col2rgb(x)[1,] + 1L
## Map densities to colors
cols <- colorRampPalette(c("#000099", "#00FEFF", "#45FE4F",
"#FCFF00", "#FF9400", "#FF3100"))(256)
df$col <- cols[df$dens]
## Plot it, reordering rows so that densest points are plotted on top
plot(x2~x1, data=df[order(df$dens),], pch=20, col=col, cex=2)
x1 <- dat[,"AGE"] # 關鍵在這
x2 <- dat[,"Rate"] # 關鍵在這
df <- data.frame(x1,x2)
## Use densCols() output to get density at each point
x <- densCols(x1,x2, colramp=colorRampPalette(c("black", "white")))
df$dens <- col2rgb(x)[1,] + 1L
## Map densities to colors
cols <- colorRampPalette(c("#000099", "#00FEFF", "#45FE4F",
"#FCFF00", "#FF9400", "#FF3100"))(256)
df$col <- cols[df$dens]
## Plot it, reordering rows so that densest points are plotted on top
plot(x2~x1, data=df[order(df$dens),], col = col, ylab = "Heart rate", xlab = "AGE", main = "Scatter plot of AGE and Heart rate", pch = 19) # 關鍵在這
– 透過4個練習題,你應該能發現要畫出paper上的圖其實並不難,只要你願意花時間可以一點一點畫出來。
– 除此之外,儲存成pdf格式的圖片由於是向量圖,他具有無限大的解析度,這點也非常重要。