機器學習及演算法

林嶔 (Lin, Chin)

Lesson 3 統計分析實作2(單變項統計分析)

本節課程目標介紹

– 請至這裡下載範例資料

dat <- read.csv("ECG_train.csv", header = TRUE, fileEncoding = 'CP950', stringsAsFactors = FALSE, na.strings = "")

– 我們今天要教的檢定方式如下,這大約可以涵蓋90%以上的常見狀況:

F01

第一節:列聯表統計(1)

– 這時候我們需要使用函數「chisq.test()」及「fisher.test()」

TABLE <- table(dat[,"GENDER"], dat[,"AMI"])
TABLE
##         
##          not-AMI NSTEMI STEMI
##   female     521     32    36
##   male       539    136   219
Result1 <- chisq.test(TABLE)
Result1
## 
##  Pearson's Chi-squared test
## 
## data:  TABLE
## X-squared = 139.18, df = 2, p-value < 2.2e-16
Result2 <- fisher.test(TABLE)
Result2
## 
##  Fisher's Exact Test for Count Data
## 
## data:  TABLE
## p-value < 2.2e-16
## alternative hypothesis: two.sided

第一節:列聯表統計(2)

ls(Result1)
## [1] "data.name" "expected"  "method"    "observed"  "parameter" "p.value"  
## [7] "residuals" "statistic" "stdres"
ls(Result2)
## [1] "alternative" "data.name"   "method"      "p.value"
Result1$p.value
## [1] 6.004532e-31
Result2$p.value
## [1] 1.192588e-33

第一節:列聯表統計(3)

– 如果你有仔細注意的話,你會發現物件『Result1』裡面似乎有一個叫做「expected」的東西,而這個其實就是『有超過80%的格子期望值大於5』中,所謂的期望值:

Result1$expected
##         
##          not-AMI    NSTEMI    STEMI
##   female 420.998  66.72421 101.2778
##   male   639.002 101.27579 153.7222
Result1$expected > 5
##         
##          not-AMI NSTEMI STEMI
##   female    TRUE   TRUE  TRUE
##   male      TRUE   TRUE  TRUE
mean(Result1$expected > 5)
## [1] 1

第一節:列聯表統計(4)

subdat <- dat[dat$rhythm.3 %in% 1,]
TABLE <- table(subdat[,"GENDER"], subdat[,"AMI"])
TABLE
##         
##          not-AMI NSTEMI STEMI
##   female       6      0     0
##   male         7      2     3
Result1 <- chisq.test(TABLE)
Result1$expected
##         
##           not-AMI    NSTEMI STEMI
##   female 4.333333 0.6666667     1
##   male   8.666667 1.3333333     2
mean(Result1$expected > 5)
## [1] 0.1666667
Result2 <- fisher.test(TABLE)
Result2
## 
##  Fisher's Exact Test for Count Data
## 
## data:  TABLE
## p-value = 0.3067
## alternative hypothesis: two.sided

練習1:計算卡方檢定的期望值

Result1$expected
##         
##           not-AMI    NSTEMI STEMI
##   female 4.333333 0.6666667     1
##   male   8.666667 1.3333333     2

F02

TABLE
##         
##          not-AMI NSTEMI STEMI
##   female       6      0     0
##   male         7      2     3

練習1答案(1)

row_m <- c(sum(TABLE[1,]), sum(TABLE[2,]))
row_m
## [1]  6 12
col_m <- c(sum(TABLE[,1]), sum(TABLE[,2]), sum(TABLE[,3]))
col_m
## [1] 13  2  3
size <- sum(row_m)
size
## [1] 18
row_m[1] * col_m[1] / size
## [1] 4.333333

練習1答案(2)

row_m <- apply(TABLE, MARGIN = 1, FUN = sum)
row_m
## female   male 
##      6     12
col_m <- apply(TABLE, MARGIN = 2, FUN = sum)
col_m
## not-AMI  NSTEMI   STEMI 
##      13       2       3
t(col_m)
##      not-AMI NSTEMI STEMI
## [1,]      13      2     3
row_m %*% t(col_m) / size
##       not-AMI    NSTEMI STEMI
## [1,] 4.333333 0.6666667     1
## [2,] 8.666667 1.3333333     2

第二節:獨立t檢定與M-W U檢定(1)

– 其中M-W U檢定有另一個名字,叫做Wilcoxon rank sum test

– 因此他們相對應的函數是「t.test()」及「wilcox.test()」

result1 <- t.test(dat[,"K"]~dat[,"GENDER"], var.equal = FALSE)
result1
## 
##  Welch Two Sample t-test
## 
## data:  dat[, "K"] by dat[, "GENDER"]
## t = -4.0971, df = 3147.5, p-value = 4.289e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.28235100 -0.09957477
## sample estimates:
## mean in group female   mean in group male 
##             3.680688             3.871651
result2 <- wilcox.test(dat[,"K"]~dat[,"GENDER"])
result2
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  dat[, "K"] by dat[, "GENDER"]
## W = 1108000, p-value = 1.459e-09
## alternative hypothesis: true location shift is not equal to 0

第二節:獨立t檢定與M-W U檢定(2)

class(result1)
## [1] "htest"
class(result2)
## [1] "htest"
ls(result1)
## [1] "alternative" "conf.int"    "data.name"   "estimate"    "method"     
## [6] "null.value"  "parameter"   "p.value"     "statistic"
ls(result2)
## [1] "alternative" "data.name"   "method"      "null.value"  "parameter"  
## [6] "p.value"     "statistic"

第二節:獨立t檢定與M-W U檢定(3)

var.result <- var.test(dat[,"K"]~dat[,"GENDER"])
var.result
## 
##  F test to compare two variances
## 
## data:  dat[, "K"] by dat[, "GENDER"]
## F = 1.0081, num df = 1511, denom df = 1671, p-value = 0.8718
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.9136993 1.1125070
## sample estimates:
## ratio of variances 
##            1.00809
result3 <- t.test(dat[,"K"]~dat[,"GENDER"], var.equal = TRUE)
result3
## 
##  Two Sample t-test
## 
## data:  dat[, "K"] by dat[, "GENDER"]
## t = -4.0979, df = 3182, p-value = 4.273e-05
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.28233211 -0.09959366
## sample estimates:
## mean in group female   mean in group male 
##             3.680688             3.871651

第三節:ANOVA與K-W檢定(1)

– 我們使用AMI做分組,檢定他們AGE的差異,分別使用ANOVA and Kruskal-Wallis test

  1. 我們需要使用函數「aov()」及「anova()」執行ANOVA
Variance.table <- aov(dat[,"AGE"]~as.factor(dat[,"AMI"]))
ANOVA.table <- anova(Variance.table)
ANOVA.table
## Analysis of Variance Table
## 
## Response: dat[, "AGE"]
##                           Df Sum Sq Mean Sq F value Pr(>F)
## as.factor(dat[, "AMI"])    2   1296  647.93  2.0921 0.1238
## Residuals               1480 458360  309.70
  1. 函數「kruskal.test()」可以幫助我們做Kruskal-Wallis test
KW.result <- kruskal.test(dat[,"AGE"]~as.factor(dat[,"AMI"]))
KW.result
## 
##  Kruskal-Wallis rank sum test
## 
## data:  dat[, "AGE"] by as.factor(dat[, "AMI"])
## Kruskal-Wallis chi-squared = 8.7746, df = 2, p-value = 0.01243

練習2:如何分辨何時要使用哪個檢定呢

– 對於X而言,你必須有能力判斷他的類別數目

– 對於Y而言,你必須有能力判斷他在各組別的樣本數,如果有任一組樣本數<25那就必須使用相對應的無母數統計

練習2答案

length(levels(as.factor(dat$GENDER)))
## [1] 2
length(levels(as.factor(dat$AMI)))
## [1] 3
table(dat$GENDER)
## 
## female   male 
##   2172   2828
table(dat$AMI)
## 
## not-AMI  NSTEMI   STEMI 
##    1060     168     255
subdat <- dat[!dat$K %in% NA,]
table(subdat$GENDER)
## 
## female   male 
##   1512   1672
table(subdat$AMI)
## 
## not-AMI  NSTEMI   STEMI 
##     908      64     155

練習3:ANOVA的事後檢定

  1. 「AMI = ‘STEMI’」比上「AMI = ‘NSTEMI’」

  2. 「AMI = ‘STEMI’」比上「AMI = ‘not-AMI’」

  3. 「AMI = ‘NSTEMI’」比上「AMI = ‘not-AMI’」

練習3答案

subdat <- dat[dat$AMI %in% c('STEMI', 'NSTEMI'),]
var.result <- var.test(subdat[,"AGE"]~subdat[,"AMI"])
var.result
## 
##  F test to compare two variances
## 
## data:  subdat[, "AGE"] by subdat[, "AMI"]
## F = 1.0207, num df = 167, denom df = 254, p-value = 0.8768
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.7768977 1.3518136
## sample estimates:
## ratio of variances 
##           1.020698
t.result <- t.test(subdat[,"AGE"]~subdat[,"AMI"], var.equal = TRUE)
t.result
## 
##  Two Sample t-test
## 
## data:  subdat[, "AGE"] by subdat[, "AMI"]
## t = 1.0627, df = 421, p-value = 0.2885
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.215674  4.077034
## sample estimates:
## mean in group NSTEMI  mean in group STEMI 
##             62.61472             61.18404

– 他的邏輯是,如果檢定n次,那顯著水準就應該從本來的0.05改成0.05/n

– 換言之就是假定要保持顯著水準固定為0.05,那我們就必須將t檢定的結果乘上n

– 這是最終的p-value:

t.result$p.value * 3
## [1] 0.8656402

第四節:相關性檢定(1)

– 函數「cor.test()」可以用來計算相關係數

Result3 <- cor.test(dat[,"PR"], dat[,"K"], method = "pearson")
Result3
## 
##  Pearson's product-moment correlation
## 
## data:  dat[, "PR"] and dat[, "K"]
## t = 1.3747, df = 2820, p-value = 0.1693
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.01102990  0.06271683
## sample estimates:
##        cor 
## 0.02587868
Result4 <- cor.test(dat[,"PR"], dat[,"K"], method = "spearman")
Result4
## 
##  Spearman's rank correlation rho
## 
## data:  dat[, "PR"] and dat[, "K"]
## S = 3734100000, p-value = 0.8706
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##         rho 
## 0.003068111
sum(!(dat[,"PR"] %in% NA) & !(dat[,"K"] %in% NA))
## [1] 2822

第五節:相關係數(1)

– 對於兩個連續變項的「相關係數」,那剛剛的皮爾森相關其實已經提供了很好的結果

– 非連續變項的相關係數最常使用的是『Polychoric correlation』

F03

第五節:相關係數(2)

– 我們可以透過下列方法安裝這個套件

F04

– 或是使用函數『install.packages()』安裝指定套件

install.packages("polycor")
library("polycor")

第五節:相關係數(3)

第五節:相關係數(4)

– 如果你看不懂每一行的指令在寫甚麼,請一行一行看,並了解該函數的input的格式是什麼。

if(require(mvtnorm)){
set.seed(12345)
data <- rmvnorm(1000, c(0, 0), matrix(c(1, .5, .5, 1), 2, 2))
x <- data[,1]
y <- data[,2]
cor(x, y) # sample correlation
}
## [1] 0.5263698
if(require(mvtnorm)){
x <- cut(x, c(-Inf, .75, Inf))
y <- cut(y, c(-Inf, -1, .5, 1.5, Inf))
polychor(x, y) # 2-step estimate
}
## [1] 0.5230474
if(require(mvtnorm)){
set.seed(12345)
polychor(x, y, ML=TRUE, std.err=TRUE) # ML estimate
}
## 
## Polychoric Correlation, ML est. = 0.5231 (0.03819)
## Test of bivariate normality: Chisquare = 2.739, df = 2, p = 0.2543
## 
##   Row Threshold
##   Threshold Std.Err.
##      0.7537  0.04403
## 
## 
##   Column Thresholds
##   Threshold Std.Err.
## 1   -0.9842  0.04746
## 2    0.4841  0.04127
## 3    1.5010  0.06118

第五節:相關係數(5)

F05

– 但是可能因為臨床意義等原因(如空腹血糖126以上被定義為糖尿病),使這兩個變項被置換成類別/序位變項。

– 因此Polychoric correlation的想法是,想辦法將兩個類別變項還原成連續變項,再進行相關係數檢定。

– 所以在範例的程式碼,第一個部分是真的製造兩個連續變項,並做Pearson correlation;第二個部分則是強硬的把他切割成數份,並計算Polychoric correlation看答案是不是類似

if(require(mvtnorm)){
set.seed(12345)
data <- rmvnorm(1000, c(0, 0), matrix(c(1, .5, .5, 1), 2, 2))
x <- data[,1]
y <- data[,2]
cor(x, y) # sample correlation
}
## [1] 0.5263698
if(require(mvtnorm)){
x <- cut(x, c(-Inf, .75, Inf))
y <- cut(y, c(-Inf, -1, .5, 1.5, Inf))
polychor(x, y) # 2-step estimate
}
## [1] 0.5230474

第五節:相關係數(6)

Result5 = polychor(dat[,"GENDER"], dat[,"LVD"])
Result5
## [1] 0.2317155
Result6 = polyserial(dat[,"K"], dat[,"LVD"])
Result6
## [1] 0.06453313

課程小結

– 這節課教的方法裡面大約能面對95%的簡易分析上,之後的統計方法開始會帶有「預測」的功能,我們會將其歸類為「人工智慧」的範疇

– 除此之外還有一些檢定可以用於相依樣本上,像是paired t test及Mcnemar test,你是否能透過google出來呢?