R語言程式設計導論

林嶔 (Lin, Chin)

Lesson 4 自訂函數與資料整理

第一節:自訂函數(1)

– 自訂函數是一個組合複雜算式的好方法,舉例來說我們經常想要對學生的成績做加分,最常用的手段是「開根號乘以10」,我們可以把函數寫成這樣:

# Curve a raw score: take its square root, then scale the result by 10.
# Works element-wise, so `x` may be a single score or a numeric vector.
add_score <- function(x) {
  rooted <- sqrt(x)
  rooted * 10
}

add_score(36)
## [1] 60
add_score(58)
## [1] 76.15773

第一節:自訂函數(2)

# Build the first 20 Fibonacci numbers, starting from the seed pair (1, 1).
x <- rep(1, 2)

for (i in 3:20) {
  # Every new term is the sum of the two terms that precede it.
  x[i] <- x[i - 2] + x[i - 1]
}

x
##  [1]    1    1    2    3    5    8   13   21   34   55   89  144  233  377  610
## [16]  987 1597 2584 4181 6765
last.seq = 20
# Return the first `last.seq` Fibonacci numbers, seeded with (1, 1).
#
# last.seq: how many terms to produce. The loop runs over `3:last.seq`,
#           which counts downward when `last.seq` is below 3, so this
#           version is only meant to be called with last.seq >= 3.
Fibonacci <- function(last.seq) {
  series <- rep(1, 2)

  for (pos in 3:last.seq) {
    series[pos] <- series[pos - 2] + series[pos - 1]
  }

  series
}

Fibonacci(10)
##  [1]  1  1  2  3  5  8 13 21 34 55
Fibonacci(20)
##  [1]    1    1    2    3    5    8   13   21   34   55   89  144  233  377  610
## [16]  987 1597 2584 4181 6765

第一節:自訂函數(3)

# Return a Fibonacci-style sequence of `last.seq` terms whose first two
# terms are `a` and `b`; every later term is the sum of the previous two.
#
# a, b:     the two seed values.
# last.seq: how many terms to produce. The loop runs over `3:last.seq`,
#           which counts downward when `last.seq` is below 3, so this
#           version is only meant to be called with last.seq >= 3.
Fibonacci <- function(a, b, last.seq) {
  series <- c(a, b)

  for (pos in 3:last.seq) {
    series[pos] <- series[pos - 2] + series[pos - 1]
  }

  series
}

Fibonacci(1, 1, 20)
##  [1]    1    1    2    3    5    8   13   21   34   55   89  144  233  377  610
## [16]  987 1597 2584 4181 6765
Fibonacci(-3, 2, 20)
##  [1]  -3   2  -1   1   0   1   1   2   3   5   8  13  21  34  55  89 144 233 377
## [20] 610

第一節:自訂函數(4)

Fibonacci(2, 4, 2)

– 不要忘記函數「cat()」能與使用者溝通

# Return a Fibonacci-style sequence of `last.seq` terms seeded with `a`, `b`.
#
# a, b:     the two seed values.
# last.seq: how many terms to produce; must be at least 3.
#
# When `last.seq` is below 3 nothing is computed: a notice is written with
# cat() and the (invisible) NULL that cat() returns is handed back.
Fibonacci <- function(a, b, last.seq) {
  # Guard clause: bail out before the loop can run backwards over 3:last.seq.
  if (last.seq < 3) {
    return(cat("last.seq必須大於等於3。"))
  }

  series <- c(a, b)

  for (pos in 3:last.seq) {
    series[pos] <- series[pos - 2] + series[pos - 1]
  }

  series
}
Fibonacci(2, 4, 2)
## last.seq必須大於等於3。

– 你也可以將函數「cat()」換成函數「message()」或是函數「stop()」試試看

練習1:質數找尋函數

– 記住,某些狀況會產生error,請你預先防止error的產生

– 完整程式碼如下:

# Find every prime in 2..max.x by trial division.
max.x <- 100

# Candidates, and a parallel flag vector: answer.x[k] stays TRUE while x[k]
# is still believed to be prime.
x <- 2:max.x
answer.x <- rep(TRUE, max.x - 1)

# x[1] (= 2) is prime by definition, so testing starts at the second candidate.
for (i in 2:(max.x - 1)) {
  n <- x[i]
  # Only the first floor(sqrt(n)) candidates are tried as divisors.
  chech_n <- as.integer(sqrt(n))
  for (j in 1:chech_n) {
    # Divide only by candidates still flagged prime; one hit is enough.
    if (answer.x[j] && n %% x[j] == 0) {
      answer.x[i] <- FALSE
      break
    }
  }
}

x[answer.x]
##  [1]  2  3  5  7 11 13 17 19 23 29 31 37 41 43 47 53 59 61 67 71 73 79 83 89 97

練習1答案

# Return every prime number in 2..max.x, found by trial division.
#
# max.x: upper bound of the search (truncated to an integer). Must be >= 2.
#
# Returns an integer vector of primes. When `max.x` is below 2 a message is
# emitted instead and the invisible NULL from message() is returned.
find_prime_number <- function(max.x) {
  if (max.x < 2) {
    message("max.x必須大於等於2。")
  } else {
    max.x <- as.integer(max.x)
    x <- 2:max.x
    # answer.x[k] stays TRUE while x[k] is still believed to be prime.
    answer.x <- rep(TRUE, max.x - 1)
    # x[1] (= 2) is prime by definition, so testing starts at the second
    # candidate. seq_along(x)[-1] is empty when max.x == 2, whereas
    # 2:(max.x - 1) would count down (2, 1), read past the end of x and
    # fail on 1:NA.
    for (i in seq_along(x)[-1]) {
      n <- x[i]
      # Only the first floor(sqrt(n)) candidates are tried as divisors.
      check_n <- as.integer(sqrt(n))
      for (j in seq_len(check_n)) {
        # Divide only by candidates still flagged prime; one hit is enough.
        if (answer.x[j] && n %% x[j] == 0L) {
          answer.x[i] <- FALSE
          break
        }
      }
    }
    x[answer.x]
  }
}

find_prime_number(100)
##  [1]  2  3  5  7 11 13 17 19 23 29 31 37 41 43 47 53 59 61 67 71 73 79 83 89 97

第二節:合併資料(1)

– 這是那些測站在早上5點半的時候所測得的汙染物濃度

# Load the first monitoring snapshot (CSV stored in the CP950 encoding) and
# keep only the first occurrence of each fully duplicated row.
dat1 <- read.csv("monitoring_1.csv", header = TRUE, fileEncoding = "CP950")
dat1.clean <- dat1[!duplicated(dat1), ]
dat1.clean
##                   time    device_id s_d0  s_t0 s_h0      lat      lon
## 1  2016-04-25 04:42:16 28C2DDDD4505   17 29.75   77 24.99643 121.5483
## 2  2016-04-25 05:25:11 28C2DDDD450C   27 27.50   87 25.05221 121.5637
## 3  2016-04-25 05:23:57 28C2DDDD4534   27 30.00   65 25.08374 121.5070
## 4  2016-04-25 05:25:13 28C2DDDD47C6   30 26.12   93 25.01992 121.5305
## 5  2016-04-25 05:24:48 28C2DDDD4234   30 28.00   83 25.00244 121.5519
## 7  2016-04-25 05:24:04 28C2DDDD455E   30 28.12   96 25.02298 121.4993
## 8  2016-04-25 05:24:47 28C2DDDD4414   32 26.50   95 25.04024 121.6195
## 9  2016-04-25 05:25:16 28C2DDDD4588   33 27.75   88 25.03579 121.5135
## 10 2016-04-25 05:24:42 28C2DDDD41C0   33 27.87  100 25.02058 121.5288
## 11 2016-04-25 05:23:54 28C2DDDD4357   33 28.12   92 24.98758 121.5500
## 12 2016-04-25 05:25:51 28C2DDDD459A   34 27.87   84 25.09869 121.5296
## 13 2016-04-25 05:23:45 28C2DDDD4246   34 29.37   86 25.12696 121.5077
## 14 2016-04-25 05:25:12 28C2DDDD45A1   35 28.50   89 25.10584 121.4826
## 15 2016-04-25 05:24:16 28C2DDDD400A   36 28.62   83 25.04922 121.5486
## 16 2016-04-25 05:24:40 28C2DDDD4372   36 29.12   84 25.06035 121.5906
## 17 2016-04-25 05:23:46 28C2DDDD423E   37 27.25   89 25.04317 121.5036
## 18 2016-04-25 05:23:46 28C2DDDD456A   37 27.87   89 25.03577 121.5524
## 19 2016-04-25 05:24:44 28C2DDDD4790   38 27.00  100 25.08577 121.5601
## 20 2016-04-25 05:25:44 28C2DDDD41B2   38 28.00   86 25.07329 121.5364
## 21 2016-04-25 05:26:20 28C2DDDD458F   38 28.62   87 25.05472 121.5296
## 22 2016-04-25 05:26:30 28C2DDDD434F   39 29.37   75 25.04524 121.5418
## 23 2016-04-25 05:25:38 28C2DDDD45A9   41 28.00  151 25.06528 121.5162
## 24 2016-04-25 05:25:32 28C2DDDD436D   42 27.62   89 25.07913 121.5802
## 25 2016-04-25 05:24:00 28C2DDDD41FA   42 27.75   88 25.05652 121.5522
## 26 2016-04-25 05:26:34 28C2DDDD41EB   42 27.75   88 25.11949 121.5050
## 28 2016-04-25 05:25:48 28C2DDDD434D   43 28.50   88 25.06870 121.6120
##                      school          time2
## 1                  景興國小 2016/4/25 5:30
## 2                  西松國小 2016/4/25 5:30
## 3                  葫蘆國小 2016/4/25 5:30
## 4                  溫州國宅 2016/4/25 5:30
## 5                  興隆國小 2016/4/25 5:30
## 7                  萬大國小 2016/4/25 5:30
## 8                  舊莊國小 2016/4/25 5:30
## 9  台北市立大學附設實驗國小 2016/4/25 5:30
## 10                 古亭國小 2016/4/25 5:30
## 11                 永建國小 2016/4/25 5:30
## 12                 福林國小 2016/4/25 5:30
## 13                 清江國小 2016/4/25 5:30
## 14                 富安國小 2016/4/25 5:30
## 15                 敦化國小 2016/4/25 5:30
## 16                 潭美國小 2016/4/25 5:30
## 17                 西門國小 2016/4/25 5:30
## 18                 仁愛國小 2016/4/25 5:30
## 19                 文湖國小 2016/4/25 5:30
## 20                 大佳國小 2016/4/25 5:30
## 21                 吉林國小 2016/4/25 5:30
## 22                 懷生國小 2016/4/25 5:30
## 23                 大同國小 2016/4/25 5:30
## 24                 內湖國小 2016/4/25 5:30
## 25                 民生國小 2016/4/25 5:30
## 26                 立農國小 2016/4/25 5:30
## 28                 南湖國小 2016/4/25 5:30

第二節:合併資料(2)

# Read a CP950-encoded CSV file (with a header row) and drop rows that are
# exact duplicates of an earlier row.
#
# file.path: path of the CSV file to read.
#
# Returns the de-duplicated data frame; surviving rows keep their original
# row names.
data_pipeline <- function(file.path) {
  raw <- read.csv(file.path, header = TRUE, fileEncoding = "CP950")
  is.first.seen <- !duplicated(raw)
  raw[is.first.seen, ]
}

data1_clean <- data_pipeline("monitoring_1.csv")
head(data1_clean)
##                  time    device_id s_d0  s_t0 s_h0      lat      lon   school
## 1 2016-04-25 04:42:16 28C2DDDD4505   17 29.75   77 24.99643 121.5483 景興國小
## 2 2016-04-25 05:25:11 28C2DDDD450C   27 27.50   87 25.05221 121.5637 西松國小
## 3 2016-04-25 05:23:57 28C2DDDD4534   27 30.00   65 25.08374 121.5070 葫蘆國小
## 4 2016-04-25 05:25:13 28C2DDDD47C6   30 26.12   93 25.01992 121.5305 溫州國宅
## 5 2016-04-25 05:24:48 28C2DDDD4234   30 28.00   83 25.00244 121.5519 興隆國小
## 7 2016-04-25 05:24:04 28C2DDDD455E   30 28.12   96 25.02298 121.4993 萬大國小
##            time2
## 1 2016/4/25 5:30
## 2 2016/4/25 5:30
## 3 2016/4/25 5:30
## 4 2016/4/25 5:30
## 5 2016/4/25 5:30
## 7 2016/4/25 5:30

第二節:合併資料(3)

– 這是那些測站在早上7點的時候所測得的汙染物濃度

data2_clean <- data_pipeline("monitoring_2.csv")
head(data2_clean)
##                  time    device_id s_d0  s_t0 s_h0      lat      lon
## 1 2016-04-25 06:55:23 28C2DDDD4591   33 26.75   91 25.00070 121.5754
## 2 2016-04-25 06:55:28 28C2DDDD41EB   41 27.87   87 25.11949 121.5050
## 3 2016-04-25 06:55:36 28C2DDDD4598   41 28.50   88 25.06126 121.5111
## 4 2016-04-25 06:55:59 28C2DDDD452E   42 26.62   88 25.03944 121.5462
## 5 2016-04-25 06:56:04 28C2DDDD4372   42 28.75   85 25.06035 121.5906
## 6 2016-04-25 06:56:18 28C2DDDD4338   38 26.62   89 25.14918 121.5242
##             school          time2
## 1         博嘉國小 2016/4/25 7:00
## 2         立農國小 2016/4/25 7:00
## 3         永樂國小 2016/4/25 7:00
## 4 私立復興實驗中學 2016/4/25 7:00
## 5         潭美國小 2016/4/25 7:00
## 6         泉源國小 2016/4/25 7:00

第二節:合併資料(4)

– 下面是從路徑到合併檔案的全過程

# Whole workflow from file paths to the merged table: clean both snapshots,
# then join them on the school name. all = TRUE keeps schools that appear in
# only one snapshot, filling the other side with NA.
data1_clean <- data_pipeline("monitoring_1.csv")
data2_clean <- data_pipeline("monitoring_2.csv")
merge.dat <- merge(
  x = data1_clean,
  y = data2_clean,
  by = "school",
  all = TRUE
)
head(merge.dat)
##                     school              time.x  device_id.x s_d0.x s_t0.x
## 1                 仁愛國小 2016-04-25 05:23:46 28C2DDDD456A     37  27.87
## 2                 內湖國小 2016-04-25 05:25:32 28C2DDDD436D     42  27.62
## 3                 南湖國小 2016-04-25 05:25:48 28C2DDDD434D     43  28.50
## 4                 古亭國小 2016-04-25 05:24:42 28C2DDDD41C0     33  27.87
## 5 台北市立大學附設實驗國小 2016-04-25 05:25:16 28C2DDDD4588     33  27.75
## 6                 吉林國小 2016-04-25 05:26:20 28C2DDDD458F     38  28.62
##   s_h0.x    lat.x    lon.x        time2.x              time.y  device_id.y
## 1     89 25.03577 121.5524 2016/4/25 5:30 2016-04-25 06:58:42 28C2DDDD456A
## 2     89 25.07913 121.5802 2016/4/25 5:30 2016-04-25 06:56:22 28C2DDDD436D
## 3     88 25.06870 121.6120 2016/4/25 5:30                <NA>         <NA>
## 4    100 25.02058 121.5288 2016/4/25 5:30                <NA>         <NA>
## 5     88 25.03579 121.5135 2016/4/25 5:30                <NA>         <NA>
## 6     87 25.05472 121.5296 2016/4/25 5:30                <NA>         <NA>
##   s_d0.y s_t0.y s_h0.y    lat.y    lon.y        time2.y
## 1     39  28.00     87 25.03577 121.5524 2016/4/25 7:00
## 2     40  27.12     90 25.07913 121.5802 2016/4/25 7:00
## 3     NA     NA     NA       NA       NA           <NA>
## 4     NA     NA     NA       NA       NA           <NA>
## 5     NA     NA     NA       NA       NA           <NA>
## 6     NA     NA     NA       NA       NA           <NA>

練習2:僅合併有興趣的資料

– 我們其實可以在檔案合併前,利用索引函數將data1_clean及data2_clean兩個檔案變成一個比較乾淨的檔案,然後再進行合併

– 這是範例

##                     school s_d0.x s_d0.y
## 1                 仁愛國小     37     39
## 2                 內湖國小     42     40
## 3                 南湖國小     43     NA
## 4                 古亭國小     33     NA
## 5 台北市立大學附設實驗國小     33     NA
## 6                 吉林國小     38     NA

練習2答案

– 下面是從路徑到合併檔案的全過程

# Clean both snapshots, keep only the reading of interest (s_d0) together
# with the join key (school), and only then merge the two slim tables.
data1_clean <- data_pipeline("monitoring_1.csv")
data2_clean <- data_pipeline("monitoring_2.csv")
data1.simple <- data1_clean[, c("s_d0", "school")]
data2.simple <- data2_clean[, c("s_d0", "school")]
simple.merge.dat <- merge(
  x = data1.simple,
  y = data2.simple,
  by = "school",
  all = TRUE
)
head(simple.merge.dat)
##                     school s_d0.x s_d0.y
## 1                 仁愛國小     37     39
## 2                 內湖國小     42     40
## 3                 南湖國小     43     NA
## 4                 古亭國小     33     NA
## 5 台北市立大學附設實驗國小     33     NA
## 6                 吉林國小     38     NA
# Replace merge()'s automatic .x/.y suffixes with numbered column names.
colnames(simple.merge.dat) <- c("school", paste0("s_d0.", 1:2))

第三節:初級資料轉換(1)

– 這份資料是描述每個人疾病狀況的檔案,我們希望將這份直式資料轉為橫式資料

# Read the long-format comorbidity table (one row per ID/Disease pair).
# stringsAsFactors = TRUE is requested explicitly: since R 4.0.0 read.csv()
# keeps text columns as character by default, while the following steps use
# levels(), which only works on factors.
dat <- read.csv(
  "comorbidity_1.csv",
  header = TRUE,
  fileEncoding = "CP950",
  stringsAsFactors = TRUE
)
head(dat, 10)
##    ID Disease
## 1   n     CKD
## 2   f      DM
## 3   m     HTN
## 4   w      DM
## 5   u      DM
## 6   j     HTN
## 7   d     CKD
## 8   l     CKD
## 9   y     HTN
## 10  w     HTN
##   ID   CKD Depression    DM   HTN
## 1  a  TRUE       TRUE  TRUE  TRUE
## 2  b  TRUE       TRUE  TRUE FALSE
## 3  c FALSE       TRUE  TRUE FALSE
## 4  d  TRUE      FALSE FALSE  TRUE
## 5  e  TRUE       TRUE FALSE FALSE
## 6  f  TRUE      FALSE  TRUE  TRUE

第三節:初級資料轉換(2)

– 請各位先回想在前面幾節課學過哪些函數、功能?我保證只使用已經學會的功能就足以應付這個問題了。

  1. 函數「length()」

  2. 函數「levels()」

  3. 各式索引函數

  4. 迴圈函數「for」

第三節:初級資料轉換(3)

– 而函數「levels()」僅能用在因子向量上,所以我們要先確認ID及Disease的屬性

class(dat[,1])
## [1] "factor"
class(dat[,2])
## [1] "factor"

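– 很幸運的,在檔案讀進來的時候兩者就都已經是因子向量了;如果不是的話(例如在 R 4.0.0 之後的版本,函數「read.csv()」預設 stringsAsFactors = FALSE,文字欄位會被讀成字元向量,此時函數「class()」會回傳 "character"),記得要使用函數「as.factor()」進行轉換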

# Make sure both columns are factors so that levels() can list their
# distinct values; as.factor() leaves a column that already is one unchanged.
dat[1:2] <- lapply(dat[1:2], as.factor)
levels.sample <- levels(dat[, 1])
levels.sample
##  [1] "a" "b" "c" "d" "e" "f" "g" "h" "i" "j" "k" "l" "m" "n" "p" "r" "s" "t" "u"
## [20] "v" "w" "x" "y" "z"
n.sample = length(levels.sample)
n.sample
## [1] 24
levels.disease = levels(dat[,2])
levels.disease
## [1] "CKD"        "Depression" "DM"         "HTN"
n.disease = length(levels.disease)
n.disease
## [1] 4

第三節:初級資料轉換(4)

– 這裡我們會用到函數「matrix」做一個空的矩陣,在最開始的時候我們可以先在矩陣內都填上0(填什麼並不重要,因為等等都會覆蓋掉)。要注意矩陣內的所有元素必須是同一種屬性,所以當第一欄填入ID(文字)之後,整個矩陣都會被轉換成字元,這就是為什麼下面印出來的0都帶有引號

# Placeholder result table: one row per ID, one column for the ID itself
# plus one per disease. Writing the IDs (text) into the first column turns
# the whole matrix into a character matrix.
new.dat <- matrix(
  0,
  nrow = n.sample,
  ncol = n.disease + 1,
  dimnames = list(NULL, c("ID", levels.disease))
)
new.dat[, "ID"] <- levels.sample
new.dat
##       ID  CKD Depression DM  HTN
##  [1,] "a" "0" "0"        "0" "0"
##  [2,] "b" "0" "0"        "0" "0"
##  [3,] "c" "0" "0"        "0" "0"
##  [4,] "d" "0" "0"        "0" "0"
##  [5,] "e" "0" "0"        "0" "0"
##  [6,] "f" "0" "0"        "0" "0"
##  [7,] "g" "0" "0"        "0" "0"
##  [8,] "h" "0" "0"        "0" "0"
##  [9,] "i" "0" "0"        "0" "0"
## [10,] "j" "0" "0"        "0" "0"
## [11,] "k" "0" "0"        "0" "0"
## [12,] "l" "0" "0"        "0" "0"
## [13,] "m" "0" "0"        "0" "0"
## [14,] "n" "0" "0"        "0" "0"
## [15,] "p" "0" "0"        "0" "0"
## [16,] "r" "0" "0"        "0" "0"
## [17,] "s" "0" "0"        "0" "0"
## [18,] "t" "0" "0"        "0" "0"
## [19,] "u" "0" "0"        "0" "0"
## [20,] "v" "0" "0"        "0" "0"
## [21,] "w" "0" "0"        "0" "0"
## [22,] "x" "0" "0"        "0" "0"
## [23,] "y" "0" "0"        "0" "0"
## [24,] "z" "0" "0"        "0" "0"

第三節:初級資料轉換(5)

i = 1
dat[dat[,1]==levels.sample[i],]
##    ID    Disease
## 11  a         DM
## 18  a        CKD
## 45  a Depression
## 49  a        HTN
dat[dat[,1]==levels.sample[i],2]
## [1] DM         CKD        Depression HTN       
## Levels: CKD Depression DM HTN
levels.disease %in% dat[dat[,1]==levels.sample[i],2]
## [1] TRUE TRUE TRUE TRUE
new.dat[i,-1] = levels.disease %in% dat[dat[,1]==levels.sample[i],2]
new.dat
##       ID  CKD    Depression DM     HTN   
##  [1,] "a" "TRUE" "TRUE"     "TRUE" "TRUE"
##  [2,] "b" "0"    "0"        "0"    "0"   
##  [3,] "c" "0"    "0"        "0"    "0"   
##  [4,] "d" "0"    "0"        "0"    "0"   
##  [5,] "e" "0"    "0"        "0"    "0"   
##  [6,] "f" "0"    "0"        "0"    "0"   
##  [7,] "g" "0"    "0"        "0"    "0"   
##  [8,] "h" "0"    "0"        "0"    "0"   
##  [9,] "i" "0"    "0"        "0"    "0"   
## [10,] "j" "0"    "0"        "0"    "0"   
## [11,] "k" "0"    "0"        "0"    "0"   
## [12,] "l" "0"    "0"        "0"    "0"   
## [13,] "m" "0"    "0"        "0"    "0"   
## [14,] "n" "0"    "0"        "0"    "0"   
## [15,] "p" "0"    "0"        "0"    "0"   
## [16,] "r" "0"    "0"        "0"    "0"   
## [17,] "s" "0"    "0"        "0"    "0"   
## [18,] "t" "0"    "0"        "0"    "0"   
## [19,] "u" "0"    "0"        "0"    "0"   
## [20,] "v" "0"    "0"        "0"    "0"   
## [21,] "w" "0"    "0"        "0"    "0"   
## [22,] "x" "0"    "0"        "0"    "0"   
## [23,] "y" "0"    "0"        "0"    "0"   
## [24,] "z" "0"    "0"        "0"    "0"

練習3:完成這項工作

##    ID   CKD Depression    DM   HTN
## 1   a  TRUE       TRUE  TRUE  TRUE
## 2   b  TRUE       TRUE  TRUE FALSE
## 3   c FALSE       TRUE  TRUE FALSE
## 4   d  TRUE      FALSE FALSE  TRUE
## 5   e  TRUE       TRUE FALSE FALSE
## 6   f  TRUE      FALSE  TRUE  TRUE
## 7   g  TRUE       TRUE  TRUE FALSE
## 8   h  TRUE       TRUE FALSE  TRUE
## 9   i FALSE       TRUE  TRUE  TRUE
## 10  j FALSE      FALSE FALSE  TRUE
## 11  k  TRUE      FALSE FALSE FALSE
## 12  l  TRUE      FALSE FALSE FALSE
## 13  m  TRUE       TRUE FALSE  TRUE
## 14  n  TRUE       TRUE FALSE  TRUE
## 15  p  TRUE       TRUE FALSE  TRUE
## 16  r FALSE      FALSE  TRUE  TRUE
## 17  s  TRUE      FALSE  TRUE FALSE
## 18  t  TRUE       TRUE FALSE FALSE
## 19  u  TRUE       TRUE  TRUE FALSE
## 20  v FALSE      FALSE  TRUE  TRUE
## 21  w FALSE      FALSE  TRUE  TRUE
## 22  x  TRUE      FALSE FALSE  TRUE
## 23  y  TRUE      FALSE  TRUE  TRUE
## 24  z  TRUE       TRUE FALSE FALSE

練習3答案

# Reshape the long ID/Disease table into one row per ID with a TRUE/FALSE
# column for every disease.

# as.factor() guards against character columns (the read.csv() default since
# R 4.0.0), for which levels() would return NULL.
levels.disease <- levels(as.factor(dat[, 2]))
levels.sample <- levels(as.factor(dat[, 1]))

new.dat <- matrix(0, nrow = length(levels.sample), ncol = length(levels.disease) + 1)
colnames(new.dat) <- c("ID", levels.disease)
new.dat[, 1] <- levels.sample

# seq_along() yields an empty loop when there are no IDs, unlike 1:length().
for (i in seq_along(levels.sample)) {
  # Flag each disease that appears among the rows belonging to this ID.
  new.dat[i, -1] <- levels.disease %in% dat[dat[, 1] == levels.sample[i], 2]
}

new.dat <- data.frame(new.dat, stringsAsFactors = FALSE)
# The character matrix stored the flags as the text "TRUE"/"FALSE"; turn
# them back into real logical columns so they can be used in conditions.
new.dat[-1] <- lapply(new.dat[-1], as.logical)
new.dat

小結

– 本節課開始並沒有額外的給大家許多新的函數,但我們開始利用迴圈組合作出一些比較複雜的事情,請同學試著學習如何思考處理流程,而這個流程是資料科學家的基礎核心工作。