資料視覺化2

林嶔 (Lin, Chin)

Lesson 9

第一節:ROC曲線(1)

– 請至這裡下載本週的範例資料

dat = read.csv("Example data.csv", header = TRUE)
head(dat)
##       eGFR Disease Survival.time Death Diabetes Cancer      SBP      DBP
## 1 34.65379       1     0.4771037     0        0      1 121.2353 121.3079
## 2 37.21183       1     3.0704424     0        1      1 122.2000 122.6283
## 3 32.60074       1     0.2607117     1        0      0 118.9136 121.7621
## 4 29.68481       1            NA    NA        0      0 118.2212 112.7043
## 5 28.35726       0     0.1681673     1        0      0 116.7469 115.7705
## 6 33.95012       1     1.2238556     0        0      0 119.9936 116.3872
##   Education Income
## 1         2      0
## 2         2      0
## 3         0      0
## 4         1      0
## 5         0      0
## 6         1      0
library(pROC)

第一節:ROC曲線(2)

ROC1 = roc(dat[,"Disease"], dat[,"eGFR"])
plot(ROC1, col = "red")

– 函數「which.max()」可以找尋一串數列的最大值

pos = which.max(ROC1$sensitivities + ROC1$specificities)
pos
## [1] 407

– 利用索引函數看看切點是什麼

cut = ROC1$thresholds[pos]
cut
## [1] 33.40866

第一節:ROC曲線(2)

plot(ROC1, col = "red")
points(ROC1$specificities[pos], ROC1$sensitivities[pos], pch = 19, cex = 2)

description = paste0("cut of point: ", formatC(cut, 2, format = "f"),
                     " (Sens = ", formatC(ROC1$sensitivities[pos], 3, format = "f"),
                     " ;Spec = ", formatC(ROC1$specificities[pos], 3, format = "f"), ")")

text(ROC1$specificities[pos], ROC1$sensitivities[pos], description, pos = 1)

第一節:ROC曲線(3)

– 在R裡面,我們可以這樣做…

ROC1 = roc(dat[,"Disease"], dat[,"eGFR"])
ROC2 = roc(dat[,"Disease"], dat[,"SBP"])
ROC3 = roc(dat[,"Disease"], dat[,"DBP"])
plot(ROC1, col = "red")
plot(ROC2, col = "darkgreen", add = TRUE)
plot(ROC3, col = "blue", add = TRUE)
legend("bottomright", c("eGFR", "SBP", "DBP"), lty = 1, lwd = 2, col = c("red", "darkgreen", "blue"))

roc.test(ROC2, ROC1)
## 
##  DeLong's test for two correlated ROC curves
## 
## data:  ROC2 and ROC1
## Z = 2.6874, p-value = 0.007202
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2 
##   0.5664361   0.5504805
roc.test(ROC2, ROC3)
## 
##  DeLong's test for two correlated ROC curves
## 
## data:  ROC2 and ROC3
## Z = 2.2119, p-value = 0.02698
## alternative hypothesis: true difference in AUC is not equal to 0
## sample estimates:
## AUC of roc1 AUC of roc2 
##   0.5640136   0.5372651

第一節:ROC曲線(4)

– 函數「glm()」使用後,也會產生預測機率用來評估個案陽性的機率,而這個預測機率的準確度可以透過ROC曲線進行評估

model1 = glm(dat[,"Disease"]~dat[,"SBP"], family = "binomial")
model2 = glm(dat[,"Disease"]~dat[,"SBP"] + factor(dat[,"Diabetes"]), family = "binomial")
model3 = glm(dat[,"Disease"]~dat[,"SBP"] + factor(dat[,"Diabetes"]) + factor(dat[,"Education"]) + factor(dat[,"Income"]), family = "binomial")

ROC1 = roc(model1$y, model1$fitted.values)
ROC2 = roc(model2$y, model2$fitted.values)
ROC3 = roc(model3$y, model3$fitted.values)
plot(ROC1, col = "red")
plot(ROC2, col = "darkgreen", add = TRUE)
plot(ROC3, col = "blue", add = TRUE)
legend("bottomright", c("Model 1", "Model 2", "Model 3"), lty = 1, lwd = 2, col = c("red", "darkgreen", "blue"))

練習1:手刻ROC curve

– 請你依照下列程序畫出ROC曲線

  1. 開一個空畫布

  2. 利用函數「lines()」畫出3條ROC曲線

  3. 自行繪製座標軸

練習1答案

model1 = glm(dat[,"Disease"]~dat[,"SBP"], family = "binomial")
model2 = glm(dat[,"Disease"]~dat[,"SBP"] + factor(dat[,"Diabetes"]), family = "binomial")
model3 = glm(dat[,"Disease"]~dat[,"SBP"] + factor(dat[,"Diabetes"]) + factor(dat[,"Education"]) + factor(dat[,"Income"]), family = "binomial")

ROC1 = roc(model1$y, model1$fitted.values)
ROC2 = roc(model2$y, model2$fitted.values)
ROC3 = roc(model3$y, model3$fitted.values)

plot.new()
plot.window(xlim = c(0, 1), ylim = c(0, 1))

lines(ROC1$specificities, ROC1$sensitivities, col = "red")
lines(ROC2$specificities, ROC2$sensitivities, col = "darkgreen")
lines(ROC3$specificities, ROC3$sensitivities, col = "blue")

axis(1, at = c(0, 0.2, 0.4, 0.6, 0.8, 1), labels = c("0%", "20%", "40%", "60%", "80%", "100%"), pos = 0)
axis(2, at = c(0, 0.2, 0.4, 0.6, 0.8, 1), labels = c("0%", "20%", "40%", "60%", "80%", "100%"), pos = 0, las = 2)

mtext("Specificity", side = 1, line = 3, cex.lab = 1, col = "black")
mtext("Sensitivity", side = 2, line = 3, cex.lab = 1, col = "black")

第二節:色彩透明度與函數(1)

– 資料量多的時候經常會遇到這樣的問題,這時候我們可能需要告訴使用者不同區域點的密度。

plot(dat[,"SBP"], dat[,"DBP"], ylab = "DBP", xlab = "SBP", main = "Scatter plot of SBP and DBP", cex = 2)

plot(dat[,"SBP"], dat[,"DBP"], ylab = "DBP", xlab = "SBP", main = "Scatter plot of SBP and DBP", pch = 19, cex = 2)

第二節:色彩透明度與函數(2)

– 舉例來說,不透明的紅色的色碼為『#FF0000』或『#FF0000FF』

– 透明度50%的紅色色碼為『#FF000080』

– 透明度50%的黑色色碼為『#00000080』

x = c(1, 0, -1, 0)
y = c(0, 1, 0, -1)
plot.new()
plot.window(xlim = c(-1, 1), ylim = c(-1, 1))
points(x, y, pch = 19, cex = 2, col = c("#FF0000", "#FF0000FF", "#FF000080", "#00000080"))

rgb(1, 0, 0, 0.5)
## [1] "#FF000080"
rgb(0.7, 0.5, 0.3, 0.7)
## [1] "#B3804DB3"
plot(dat[,"SBP"], dat[,"DBP"], ylab = "DBP", xlab = "SBP", main = "Scatter plot of SBP and DBP", pch = 19, cex = 2, col = "#00000030")

第二節:色彩透明度與函數(3)

smoothScatter(dat[,"SBP"], dat[,"DBP"], nrpoints = 0, ylab = "DBP", xlab = "SBP", main = "Scatter plot of SBP and DBP")