R語言程式設計導論

林嶔 (Lin, Chin)

Lesson 11 HTML剖析及網頁爬蟲實作

第一節:HTML基本語法介紹(1)

– 所謂的網頁,背後都是由一種叫做「HTML」的語法所寫成的,而你的IE、Chrome等瀏覽器其實就是負責解析「HTML」語法,並把它呈現成你想看到的樣子。

– 你可以參考全國重度級急救責任醫院急診即時訊息總覽找到更多醫院

F01

F02

第一節:HTML基本語法介紹(2)

# Address of the NTUH emergency-room real-time information board.
URL = "https://reg.ntuh.gov.tw/EmgInfoBoard/NTUHEmgInfo.aspx"

# Download the page source as a character vector of whitespace-separated
# tokens, read as UTF-8 and without the "Read N items" message.
txt = scan(file = URL, what = character(), encoding = "UTF-8", quiet = TRUE)

# Peek at the first 15 tokens.
head(txt, n = 15)
##  [1] "<!DOCTYPE"                              
##  [2] "html>"                                  
##  [3] "<html"                                  
##  [4] "xmlns=\"http://www.w3.org/1999/xhtml\">"
##  [5] "<head><title>"                          
##  [6] "國立臺灣大學醫學院附設醫院"             
##  [7] "急診即時資訊"                           
##  [8] "</title>"                               
##  [9] "<style"                                 
## [10] "type=\"text/css\">"                     
## [11] "table,"                                 
## [12] "div"                                    
## [13] "{"                                      
## [14] "font-family:"                           
## [15] "verdana,"
# Glue all tokens back into one long string, separated by single spaces.
txt_new = paste(txt, collapse = " ")

第一節:HTML基本語法介紹(3)

# Find where "<title> ... </title>" sits inside the collapsed page text.
TITLE.pos = gregexpr(pattern = "<title>.*</title>", text = txt_new)

# First match: its start offset, and its end offset derived from the length.
start.TITLE.pos = TITLE.pos[[1]][1]
end.TITLE.pos = start.TITLE.pos + attr(TITLE.pos[[1]], "match.length")[1] - 1

# Cut the matched span out of the page.
TITLE.word = substr(txt_new, start = start.TITLE.pos, stop = end.TITLE.pos)

TITLE.word
## [1] "<title> 國立臺灣大學醫學院附設醫院 急診即時資訊 </title>"
# Drop the opening tag first, then the closing tag, leaving only the text.
TITLE.word = gsub("</title>", "", gsub("<title>", "", TITLE.word))
TITLE.word
## [1] " 國立臺灣大學醫學院附設醫院 急診即時資訊 "

第一節:HTML基本語法介紹(4)

# Offsets of every opening and every closing table-row tag in the page text.
start.pos = gregexpr(pattern = "<tr>", text = txt_new)
end.pos = gregexpr(pattern = "</tr>", text = txt_new)

# Row i runs from the first character of its "<tr>" to the last character
# of its "</tr>" (start of the closing tag + its length - 1).
i = 1
sub.start.pos = start.pos[[1]][i]
sub.end.pos = end.pos[[1]][i] + attr(end.pos[[1]], "match.length")[i] - 1

sub_txt = substr(txt_new, start = sub.start.pos, stop = sub.end.pos)
sub_txt
## [1] "<tr> <td>等候掛號人數:</td> <td> 0人</td> </tr>"
# Remove, in this order: the label, the row tags, the cell tags, the spaces.
sub_txt = Reduce(function(remaining, pattern) gsub(pattern, '', remaining),
                 c('等候掛號人數:', '</?tr>', '</?td>', ' '),
                 sub_txt)
sub_txt
## [1] "0人"

練習1:寫出一個函數讓我能隨時知道臺大醫院的急診即時訊息

– 這個過程叫做「網路爬蟲」,他的過程與一般手工操作是完全一樣的,因此他是合法的,但是你要注意他會對伺服器產生一定程度的負擔,請不要讓你的程式不斷的擷取資訊。

– 如果你很快就完成了台大醫院的部分,可以再試試看全國重度級急救責任醫院急診即時訊息總覽的其他醫院!

練習1答案

# Report the first five waiting counts shown on the NTUH emergency-room board.
#
# Args:
#   URL:  address of the information board page. Only used when `html` is NULL.
#   html: optional character vector holding the page source. When supplied,
#         nothing is downloaded (handy offline or for testing).
#
# Returns:
#   A data.frame with two character columns: `item` (fixed labels) and `info`
#   (the text left in the matching <tr> row once the label, the tags and the
#   whitespace are removed, e.g. "0人"). `info` stays NA for rows that are not
#   found in the page.
NTU_info = function (URL = "https://reg.ntuh.gov.tw/EmgInfoBoard/NTUHEmgInfo.aspx",
                     html = NULL) {
  
  result = data.frame(item = c('等候掛號人數', '等候看診人數', '等候住院人數', '等候ICU人數', '等候推床人數'),
                      info = NA_character_,
                      stringsAsFactors = FALSE)
  
  if (is.null(html)) {
    html = scan(URL, what = "character", encoding = "UTF-8", quiet = TRUE)
  }
  txt_new = paste(html, collapse = " ")
  
  start.pos = gregexpr("<tr>", txt_new)[[1]]
  end.pos = gregexpr("</tr>", txt_new)[[1]]
  
  # gregexpr() reports "no match" as -1; in that case there is nothing to parse.
  if (start.pos[1] == -1 || end.pos[1] == -1) {
    return(result)
  }
  
  # Never index past the rows that actually exist in the page.
  n.row = min(nrow(result), length(start.pos), length(end.pos))
  
  for (i in seq_len(n.row)) {
    
    sub.start.pos = start.pos[i]
    sub.end.pos = end.pos[i] + attr(end.pos, "match.length")[i] - 1
    
    sub_txt = substr(txt_new, sub.start.pos, sub.end.pos)
    sub_txt = gsub('</?t[rd]>', '', sub_txt)
    # Remove the label only up to and including its full-width colon.
    # A bare '等.*' is greedy and would also swallow the count after it.
    sub_txt = gsub('等.*:', '', sub_txt)
    result[i, 'info'] = gsub('[[:space:]]', '', sub_txt)
    
  }

  result
  
}

# Call the function and print the table it returns (downloads the page).
NTU_info()
##           item info
## 1 等候掛號人數  0人
## 2 等候看診人數  0人
## 3 等候住院人數 53人
## 4  等候ICU人數  0人
## 5 等候推床人數  0人

第二節:利用套件執行任務(1)

– 套件「rvest」能協助我們做這件事情,套件內的函數「read_html」能協助我們讀取網頁,而函數「html_nodes」能幫助我們把某種標籤的文字萃取出來,最後「html_text」能幫助我們把標籤通通去掉:

library(rvest)

# Page to scrape: the NTUH emergency-room information board.
URL = "https://reg.ntuh.gov.tw/EmgInfoBoard/NTUHEmgInfo.aspx"

# Download and parse the page into an HTML document object.
website = read_html(URL)

# Select every <tr> node, then keep only the text inside each one.
needed_txt = website %>%
  html_nodes("tr") %>%
  html_text()
needed_txt
## [1] "等候掛號人數:\r\n                        \r\n                            0人\r\n                    "    
## [2] "等候看診人數:\r\n                        \r\n                            0人\r\n                    "    
## [3] "等候住院人數:\r\n                        \r\n                            53人\r\n                    "   
## [4] "等候ICU人數:\r\n                        \r\n                            0人\r\n                    "     
## [5] "等候推床人數:\r\n                        \r\n                            0人\r\n                    "    
## [6] "兒科等候看診人數:\r\n                        \r\n                            0人\r\n                    "
## [7] "兒科等候住院人數:\r\n                        \r\n                            0人\r\n                    "
## [8] "兒科等候ICU人數:\r\n                        \r\n                            0人\r\n                    " 
## [9] "資料擷取時間:2021/7/9 上午 08:35:10\r\n                    "

第二節:利用套件執行任務(2)

# One index page of the PTT AllTogether board.
URL = "https://www.ptt.cc/bbs/AllTogether/index3245.html"
website = read_html(URL)

# Every <a> (link) node on the page, tags and attributes included.
needed_html = html_nodes(website, "a")
needed_html
## {xml_nodeset (67)}
##  [1] <a id="logo" href="/bbs/">批踢踢實業坊</a>
##  [2] <a class="board" href="/bbs/AllTogether/index.html"><span class="board-l ...
##  [3] <a class="right small" href="/about.html">關於我們</a>
##  [4] <a class="right small" href="/contact.html">聯絡資訊</a>
##  [5] <a class="btn selected" href="/bbs/AllTogether/index.html">看板</a>
##  [6] <a class="btn" href="/man/AllTogether/index.html">精華區</a>
##  [7] <a class="btn wide" href="/bbs/AllTogether/index1.html">最舊</a>
##  [8] <a class="btn wide" href="/bbs/AllTogether/index3244.html"><U+2039> 上頁</a>
##  [9] <a class="btn wide" href="/bbs/AllTogether/index3246.html">下頁 <U+203A></a>
## [10] <a class="btn wide" href="/bbs/AllTogether/index.html">最新</a>
## [11] <a href="/bbs/AllTogether/M.1600670546.A.B1C.html">[徵女] 台北吃燒肉 牛排</a>
## [12] <a href="/bbs/AllTogether/search?q=thread%3A%5B%E5%BE%B5%E5%A5%B3%5D+%E5 ...
## [13] <a href="/bbs/AllTogether/search?q=author%3Aj2046g">搜尋看板內 j2046g 的文章</a>
## [14] <a href="/bbs/AllTogether/M.1600680270.A.B43.html">[徵男] 高雄今天一起喝一杯吧</a>
## [15] <a href="/bbs/AllTogether/search?q=thread%3A%5B%E5%BE%B5%E7%94%B7%5D+%E9 ...
## [16] <a href="/bbs/AllTogether/search?q=author%3Ababyqoo390">搜尋看板內 babyqoo390 ...
## [17] <a href="/bbs/AllTogether/M.1600690355.A.667.html">[徵女] 有玩過桌球的女生</a>
## [18] <a href="/bbs/AllTogether/search?q=thread%3A%5B%E5%BE%B5%E5%A5%B3%5D+%E6 ...
## [19] <a href="/bbs/AllTogether/search?q=author%3Aandy19930905">搜尋看板內 andy1993 ...
## [20] <a href="/bbs/AllTogether/M.1600694448.A.12F.html">[聯誼] 徵人一起參加聯誼派對</a>
## ...
# Visible text of each link, in the same order as `needed_html`.
needed_txt = html_text(needed_html)
needed_txt
##  [1] "批踢踢實業坊"                               
##  [2] "看板 AllTogether"                           
##  [3] "關於我們"                                   
##  [4] "聯絡資訊"                                   
##  [5] "看板"                                       
##  [6] "精華區"                                     
##  [7] "最舊"                                       
##  [8] "<U+2039> 上頁"                              
##  [9] "下頁 <U+203A>"                              
## [10] "最新"                                       
## [11] "[徵女] 台北吃燒肉 牛排"                     
## [12] "搜尋同標題文章"                             
## [13] "搜尋看板內 j2046g 的文章"                   
## [14] "[徵男] 高雄今天一起喝一杯吧"                
## [15] "搜尋同標題文章"                             
## [16] "搜尋看板內 babyqoo390 的文章"               
## [17] "[徵女] 有玩過桌球的女生"                    
## [18] "搜尋同標題文章"                             
## [19] "搜尋看板內 andy19930905 的文章"             
## [20] "[聯誼] 徵人一起參加聯誼派對"                
## [21] "搜尋同標題文章"                             
## [22] "搜尋看板內 honey99 的文章"                  
## [23] "[徵女] 台中天氣晴"                          
## [24] "搜尋同標題文章"                             
## [25] "搜尋看板內 Lethe1314 的文章"                
## [26] "[徵女] 台北男生徵友"                        
## [27] "搜尋同標題文章"                             
## [28] "搜尋看板內 nanostick 的文章"                
## [29] "[大心] 我們結婚啦!!!"                       
## [30] "搜尋同標題文章"                             
## [31] "搜尋看板內 Birdmaster 的文章"               
## [32] "[徵女] 認真找女朋友"                        
## [33] "搜尋同標題文章"                             
## [34] "搜尋看板內 yiyuyi 的文章"                   
## [35] "[聯誼] 9/26(六)台北免費歡樂桌遊聯誼交友派對"
## [36] "搜尋同標題文章"                             
## [37] "搜尋看板內 tianxingren 的文章"              
## [38] "[徵女] 一起為人生塗點顏色"                  
## [39] "搜尋同標題文章"                             
## [40] "搜尋看板內 jkeuy 的文章"                    
## [41] "[徵女] 關鍵字: 健談 幽默 個性好 人品好"     
## [42] "搜尋同標題文章"                             
## [43] "搜尋看板內 o500521 的文章"                  
## [44] "[徵女] 9/23板橋音樂會"                      
## [45] "搜尋同標題文章"                             
## [46] "搜尋看板內 playing808 的文章"               
## [47] "[徵男] 台北週二中午看天能"                  
## [48] "搜尋同標題文章"                             
## [49] "搜尋看板內 bhaniy0407 的文章"               
## [50] "[徵女] 竹北30蠍男徵友"                      
## [51] "搜尋同標題文章"                             
## [52] "搜尋看板內 SellCall 的文章"                 
## [53] "[徵女] 身長一米六五"                        
## [54] "搜尋同標題文章"                             
## [55] "搜尋看板內 junzi 的文章"                    
## [56] "[徵女] 隨意鳥吃飯"                          
## [57] "搜尋同標題文章"                             
## [58] "搜尋看板內 smallwoei 的文章"                
## [59] "[徵女] 珍惜每個緣分"                        
## [60] "搜尋同標題文章"                             
## [61] "搜尋看板內 pig810627 的文章"                
## [62] "[徵女] 台北下午茶兜風"                      
## [63] "搜尋同標題文章"                             
## [64] "搜尋看板內 Karp 的文章"                     
## [65] "[徵女] 北部 下個月陪我過生日"               
## [66] "搜尋同標題文章"                             
## [67] "搜尋看板內 shoggoth 的文章"
# Indices of links whose text contains the literal string "[徵男]"
# (fixed = TRUE, so the brackets are not treated as a regex character class).
intrested_pos = which(grepl("[徵男]", needed_txt, fixed = TRUE))
needed_txt[intrested_pos]
## [1] "[徵男] 高雄今天一起喝一杯吧" "[徵男] 台北週二中午看天能"
# The href attribute of those same links (site-relative paths).
needed_link = html_attr(needed_html[intrested_pos], "href")

第二節:利用套件執行任務(3)

# Build the absolute address of the i-th matching article and read it.
i = 1
sub_link = paste0("https://www.ptt.cc", needed_link[i])
sub_website = read_html(sub_link)

# Nodes carrying the CSS class "article-meta-value".
article_info = html_nodes(sub_website, ".article-meta-value")
article_info
## {xml_nodeset (4)}
## [1] <span class="article-meta-value">babyqoo390 (ann)</span>
## [2] <span class="article-meta-value">AllTogether</span>
## [3] <span class="article-meta-value">[徵男] 高雄今天一起喝一杯吧</span>
## [4] <span class="article-meta-value">Mon Sep 21 17:24:28 2020</span>

練習2:請你寫出一個程式找出最近的徵男文

  1. 最新的頁面在https://www.ptt.cc/bbs/AllTogether/index.html,你需要透過下面的方式找出上一頁的連結:
# Newest index page of the board.
URL = "https://www.ptt.cc/bbs/AllTogether/index.html"
website = read_html(URL)

# Take the 8th link on the page and show the address it points to.
html_attr(html_nodes(website, "a")[8], "href")
## [1] "/bbs/AllTogether/index3714.html"
  2. 接著,從最新的頁面開始抓取徵男文的標題與連結,直到抓到10篇為止!

  3. 抓滿10篇之後,進去連結內去看看發文者ID以及時間,並把他填入表格之內

##       Title                           
##  [1,] "[徵男] 微解禁後的你"           
##  [2,] "[徵男] 內文沒有照片"           
##  [3,] "[徵男] 胎記"                   
##  [4,] "[徵男]總是要試試看"            
##  [5,] "[徵男] 尋找人生夥伴"           
##  [6,] "[徵男] (代徵)-後疫情世代的你"
##  [7,] "[徵男] 我不想錯過你"           
##  [8,] "[徵男] (代徵)來幫閨蜜徵友"   
##  [9,] "[徵男] 尋找人生好隊友"         
## [10,] "[徵男] 尋找一個隊友"           
##       url                                        ID                    
##  [1,] "/bbs/AllTogether/M.1625783750.A.EAA.html" "aiba1229 (Astrid)"   
##  [2,] "/bbs/AllTogether/M.1625739636.A.C8C.html" "assilem (亞斯藍布魯)"
##  [3,] "/bbs/AllTogether/M.1625742577.A.46B.html" "hunter6126 (Mineee)" 
##  [4,] "/bbs/AllTogether/M.1625751706.A.965.html" "chenxine (老公是GD)" 
##  [5,] "/bbs/AllTogether/M.1625753462.A.574.html" "kaputt ()"           
##  [6,] "/bbs/AllTogether/M.1625650256.A.86E.html" "costco5 (清風)"      
##  [7,] "/bbs/AllTogether/M.1625704579.A.167.html" "syuan116 (咩咩)"     
##  [8,] "/bbs/AllTogether/M.1625722813.A.C8D.html" "racocopink (可可粉)" 
##  [9,] "/bbs/AllTogether/M.1625577513.A.02D.html" "emilyz (全新生活)"   
## [10,] "/bbs/AllTogether/M.1625591022.A.86C.html" "hihitina (噹噹)"     
##       time                      
##  [1,] "Fri Jul  9 06:35:48 2021"
##  [2,] "Thu Jul  8 18:20:34 2021"
##  [3,] "Thu Jul  8 19:09:35 2021"
##  [4,] "Thu Jul  8 21:41:44 2021"
##  [5,] "Thu Jul  8 22:11:00 2021"
##  [6,] "Wed Jul  7 17:30:54 2021"
##  [7,] "Thu Jul  8 08:36:17 2021"
##  [8,] "Thu Jul  8 13:40:11 2021"
##  [9,] "Tue Jul  6 21:18:30 2021"
## [10,] "Wed Jul  7 01:03:40 2021"

練習2答案

# Collect up to 10 "[徵男]" posts from the PTT AllTogether board, starting at
# the newest index page and walking back page by page, then visit each post
# to record its author ID and posting time.
my_table = matrix("", nrow = 10, ncol = 4)
colnames(my_table) = c("Title", "url", "ID", "time")

URL = "https://www.ptt.cc/bbs/AllTogether/index.html"
current_id = 1

# Step 1: titles and links. Look at no more than 10 index pages.
# NOTE(review): rows are stored in page order; judging from the sample output
# a page seems to list older posts first, so the table is not strictly sorted
# by time - confirm if ordering matters.
for (i in 1:10) {
  
  website = read_html(URL)
  needed_html = website %>% html_nodes("a")
  needed_txt = needed_html %>% html_text()
  intrested_pos = grep("[徵男]", needed_txt, fixed = TRUE)
  
  for (j in intrested_pos) {
    
    # The table is full: ignore the remaining matches on this page.
    if (current_id > nrow(my_table)) {
      break
    }
    
    my_table[current_id, "Title"] = needed_txt[j]
    my_table[current_id, "url"] = needed_html[j] %>% html_attr("href")
    current_id = current_id + 1
    
  }
  
  if (current_id > nrow(my_table)) {
    break
  }
  
  # Locate the "previous page" link by its label rather than by assuming it is
  # always the 8th link; stop when it is missing or carries no address.
  prev_pos = grep("上頁", needed_txt, fixed = TRUE)
  if (length(prev_pos) == 0) {
    break
  }
  next_page = needed_html[prev_pos[1]] %>% html_attr("href")
  if (is.na(next_page)) {
    break
  }
  URL = paste0("https://www.ptt.cc", next_page)
  
}

# Step 2: author ID and posting time, read from each article page.
for (i in seq_len(nrow(my_table))) {
  
  # Rows left empty (fewer than 10 posts found) have no article to visit.
  if (is.na(my_table[i, "url"]) || my_table[i, "url"] == "") {
    next
  }
  
  sub_URL = paste0("https://www.ptt.cc", my_table[i, "url"])
  sub_website = read_html(sub_URL)
  article_info = sub_website %>% html_nodes(".article-meta-value") %>% html_text()
  # In the sample article, meta value 1 is the author and 4 is the time stamp.
  my_table[i, "ID"] = article_info[1]
  my_table[i, "time"] = article_info[4]
  
}

my_table
##       Title                           
##  [1,] "[徵男] 微解禁後的你"           
##  [2,] "[徵男] 內文沒有照片"           
##  [3,] "[徵男] 胎記"                   
##  [4,] "[徵男]總是要試試看"            
##  [5,] "[徵男] 尋找人生夥伴"           
##  [6,] "[徵男] (代徵)-後疫情世代的你"
##  [7,] "[徵男] 我不想錯過你"           
##  [8,] "[徵男] (代徵)來幫閨蜜徵友"   
##  [9,] "[徵男] 尋找人生好隊友"         
## [10,] "[徵男] 尋找一個隊友"           
##       url                                        ID                    
##  [1,] "/bbs/AllTogether/M.1625783750.A.EAA.html" "aiba1229 (Astrid)"   
##  [2,] "/bbs/AllTogether/M.1625739636.A.C8C.html" "assilem (亞斯藍布魯)"
##  [3,] "/bbs/AllTogether/M.1625742577.A.46B.html" "hunter6126 (Mineee)" 
##  [4,] "/bbs/AllTogether/M.1625751706.A.965.html" "chenxine (老公是GD)" 
##  [5,] "/bbs/AllTogether/M.1625753462.A.574.html" "kaputt ()"           
##  [6,] "/bbs/AllTogether/M.1625650256.A.86E.html" "costco5 (清風)"      
##  [7,] "/bbs/AllTogether/M.1625704579.A.167.html" "syuan116 (咩咩)"     
##  [8,] "/bbs/AllTogether/M.1625722813.A.C8D.html" "racocopink (可可粉)" 
##  [9,] "/bbs/AllTogether/M.1625577513.A.02D.html" "emilyz (全新生活)"   
## [10,] "/bbs/AllTogether/M.1625591022.A.86C.html" "hihitina (噹噹)"     
##       time                      
##  [1,] "Fri Jul  9 06:35:48 2021"
##  [2,] "Thu Jul  8 18:20:34 2021"
##  [3,] "Thu Jul  8 19:09:35 2021"
##  [4,] "Thu Jul  8 21:41:44 2021"
##  [5,] "Thu Jul  8 22:11:00 2021"
##  [6,] "Wed Jul  7 17:30:54 2021"
##  [7,] "Thu Jul  8 08:36:17 2021"
##  [8,] "Thu Jul  8 13:40:11 2021"
##  [9,] "Tue Jul  6 21:18:30 2021"
## [10,] "Wed Jul  7 01:03:40 2021"

第三節:使用cookie(1)

# Read the Gossiping board index with a plain request (no cookie sent)
# and print what comes back.
URL = "https://www.ptt.cc/bbs/Gossiping/index.html"

website = read_html(x = URL)
print(website)
## {html_document}
## <html>
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8 ...
## [2] <body>\n\t\t\n<div class="bbs-screen bbs-content">\n    <div class="over1 ...

F05

第三節:使用cookie(2)

– 你可以透過下面這個方式找到電腦目前的cookie共有哪些:

F06

F07

第三節:使用cookie(3)

library(RCurl)

# Gossiping board index, this time requested with a cookie.
URL = "https://www.ptt.cc/bbs/Gossiping/index.html"

# Curl handle configured to send the cookie "over18=1" and to follow redirects.
curl = getCurlHandle()
curlSetOpt(cookie = "over18=1", followlocation = TRUE, curl = curl)
## An object of class "CURLHandle"
## Slot "ref":
## <pointer: 0x0000000039eb0030>
# Download the page source as text through that handle.
html_character = getURL(URL, curl = curl)

# Parse the downloaded text, then list the text of every link on the page.
website = read_html(html_character)
needed_html = html_nodes(website, "a")
needed_txt = html_text(needed_html)
needed_txt
##  [1] "批踢踢實業坊"                                    
##  [2] "看板 Gossiping"                                  
##  [3] "關於我們"                                        
##  [4] "聯絡資訊"                                        
##  [5] "看板"                                            
##  [6] "精華區"                                          
##  [7] "最舊"                                            
##  [8] "<U+2039> 上頁"                                   
##  [9] "下頁 <U+203A>"                                   
## [10] "最新"                                            
## [11] "[問卦] 所以現在有哪些認知作戰的具體行動?"       
## [12] "搜尋同標題文章"                                  
## [13] "搜尋看板內 dean1990 的文章"                      
## [14] "Re: [問卦] 科技業報薪水為什麼都喜歡說N"          
## [15] "搜尋同標題文章"                                  
## [16] "搜尋看板內 GA389434 的文章"                      
## [17] "[新聞] 壓倒性通過!歐洲議會決議 籲歐盟外交杯"    
## [18] "搜尋同標題文章"                                  
## [19] "搜尋看板內 shareya 的文章"                       
## [20] "Re: [新聞] 六都火化數同期暴增2812人!"           
## [21] "搜尋同標題文章"                                  
## [22] "搜尋看板內 chinaciv 的文章"                      
## [23] "[問卦] 把自己不喜歡的東西送人是美德嗎?"         
## [24] "搜尋同標題文章"                                  
## [25] "搜尋看板內 Aurora5566 的文章"                    
## [26] "[新聞] 琉球鄉民爭取 納離島優先打疫苗"            
## [27] "搜尋同標題文章"                                  
## [28] "搜尋看板內 YOYOsister 的文章"                    
## [29] "[問卦] 這隻蟑螂怎麼那麼大"                       
## [30] "搜尋同標題文章"                                  
## [31] "搜尋看板內 Freeven 的文章"                       
## [32] "[問卦] 女生就不能搬貨嗎?"                       
## [33] "搜尋同標題文章"                                  
## [34] "搜尋看板內 jackymars5 的文章"                    
## [35] "Re: [問卦] 不認識的妹子突然加Line的八卦"         
## [36] "搜尋同標題文章"                                  
## [37] "搜尋看板內 assassinzero 的文章"                  
## [38] "[問卦] 夢到被鬼壓床該怎麼辦?"                   
## [39] "搜尋同標題文章"                                  
## [40] "搜尋看板內 EliEli 的文章"                        
## [41] "Re: [新聞] AZ致死333人 國民黨:蔡英文7月催打令變"
## [42] "搜尋同標題文章"                                  
## [43] "搜尋看板內 adion 的文章"                         
## [44] "[問卦] 月薪族的時薪低於基本時薪"                 
## [45] "搜尋同標題文章"                                  
## [46] "搜尋看板內 nicky1245 的文章"                     
## [47] "[問卦] 2021了還一堆人怕不沾鍋的八卦?"           
## [48] "搜尋同標題文章"                                  
## [49] "搜尋看板內 s2678132 的文章"                      
## [50] "[公告] 八卦板板規(2021.05.11)"                   
## [51] "搜尋同標題文章"                                  
## [52] "搜尋看板內 arsonlolita 的文章"                   
## [53] "[協尋] 2021/06/30 朋友的公公走失,請台南朋友"    
## [54] "搜尋同標題文章"                                  
## [55] "搜尋看板內 vagrantlin 的文章"                    
## [56] "Fw: [公告] GossipPicket 取消匿名功能初步討論"    
## [57] "搜尋同標題文章"                                  
## [58] "搜尋看板內 q347 的文章"                          
## [59] "[協尋] 徵求7/2板橋文化路ㄧ段車禍行車記"          
## [60] "搜尋同標題文章"                                  
## [61] "搜尋看板內 joechangla 的文章"                    
## [62] "[協尋] 徵求新竹園區一路往光復路行車紀錄"         
## [63] "搜尋同標題文章"                                  
## [64] "搜尋看板內 jason401310 的文章"

第三節:使用cookie(4)

library(RCurl)
library(rvest)

# Collect up to 10 "[新聞]" posts (replies excluded) from the PTT Gossiping
# board, starting at the newest index page and walking back page by page.
my_table = matrix("", nrow = 10, ncol = 2)
colnames(my_table) = c("Title", "url")

URL = 'https://www.ptt.cc/bbs/Gossiping/index.html'
# Curl handle that sends the cookie "over18=1" and follows redirects.
curl = getCurlHandle()
curlSetOpt(cookie = "over18=1", followlocation = TRUE, curl = curl)
## An object of class "CURLHandle"
## Slot "ref":
## <pointer: 0x0000000039bdb170>
current_id = 1

# Look at no more than 10 index pages.
for (i in 1:10) {
  
  html_character = getURL(URL, curl = curl)
  website = read_html(html_character)
  
  needed_html = website %>% html_nodes("a")
  needed_txt = needed_html %>% html_text()
  # Keep links whose text contains "[新聞]" but not "Re: " (both literal).
  intrested_pos = which(grepl("[新聞]", needed_txt, fixed = TRUE) & !grepl("Re: ", needed_txt, fixed = TRUE))
  
  for (j in intrested_pos) {
    
    # The table is full: ignore the remaining matches on this page.
    if (current_id > nrow(my_table)) {
      break
    }
    
    my_table[current_id, "Title"] = needed_txt[j]
    my_table[current_id, "url"] = needed_html[j] %>% html_attr("href")
    current_id = current_id + 1
    
  }
  
  if (current_id > nrow(my_table)) {
    break
  }
  
  # Locate the "previous page" link by its label rather than by assuming it is
  # always the 8th link; stop when it is missing or carries no address.
  prev_pos = grep("上頁", needed_txt, fixed = TRUE)
  if (length(prev_pos) == 0) {
    break
  }
  next_page = needed_html[prev_pos[1]] %>% html_attr("href")
  if (is.na(next_page)) {
    break
  }
  URL = paste0("https://www.ptt.cc", next_page)
  
}

my_table
##       Title                                                          
##  [1,] "[新聞] 壓倒性通過!歐洲議會決議 籲歐盟外交杯"                 
##  [2,] "[新聞] 琉球鄉民爭取 納離島優先打疫苗"                         
##  [3,] "[新聞] 復育有成 櫻花鉤吻鮭年增2000尾"                         
##  [4,] "[新聞] 獨家》北市確診幾乎天天有愛滋病患?市"                  
##  [5,] "[新聞] 柯文哲「設局說」目的已達成!學者列8大謊言:證據已浮出" 
##  [6,] "[新聞] 約周玉蔻拍SWAG!童仲彥火辣告白「60路"                  
##  [7,] "[新聞] 柯文哲昨嗆「公布總統府疫苗」北市府今稱"                
##  [8,] "[新聞] 蔡英文:民進黨幾個縣市施打疫苗效率很"                  
##  [9,] "[新聞] 獨家》北市確診幾乎天天有愛滋病患?市府、聯醫:疫調還在"
## [10,] "[新聞] 推文現台灣國旗 白宮:團隊無心之過"                     
##       url                                     
##  [1,] "/bbs/Gossiping/M.1625790852.A.C3A.html"
##  [2,] "/bbs/Gossiping/M.1625791001.A.2C4.html"
##  [3,] "/bbs/Gossiping/M.1625789656.A.5AA.html"
##  [4,] "/bbs/Gossiping/M.1625789740.A.FF4.html"
##  [5,] "/bbs/Gossiping/M.1625790736.A.B33.html"
##  [6,] "/bbs/Gossiping/M.1625788907.A.7AA.html"
##  [7,] "/bbs/Gossiping/M.1625786298.A.97B.html"
##  [8,] "/bbs/Gossiping/M.1625786900.A.F0F.html"
##  [9,] "/bbs/Gossiping/M.1625783262.A.0A7.html"
## [10,] "/bbs/Gossiping/M.1625783522.A.269.html"

小結

– 當然,上課所示範的範例僅僅是網頁爬蟲的冰山一角,有許多更特別的case我們沒有辦法一個一個都示範,等你遇到的時候記得上網找教學,並試著從教學中找出該怎麼做!