R語言程式設計導論

林嶔 (Lin, Chin)

Lesson 10 文字處理簡介

第一節:基本文字操作(1)

– 在R的字串處理中要小心注意character、factor、numeric這三種類別的誤轉換和混用,尤其是factor是一種很討厭的格式,因為它在轉成數字和字串的時候,常常會變成跟原本不一樣的東西

1-1. 字串黏合(不同物件)

# paste() joins its arguments; sep = "" puts nothing between them.
paste("A", "B", sep = "")
## [1] "AB"

1-2. 字串黏合(同物件)

# collapse = "" merges the elements of one vector into a single string.
paste(c("A", "B"), collapse = "")
## [1] "AB"
  2. 字串分割
# fixed = TRUE treats "." as a literal dot rather than the regex "any character".
strsplit("A.B", split = "." , fixed = TRUE)
## [[1]]
## [1] "A" "B"
  3. 部份吻合
# grepl() returns one logical per element of x: TRUE when the pattern occurs
# anywhere inside that element.
x = c("AB", "AA")
grepl("B", x)
## [1]  TRUE FALSE
grepl("A", x)
## [1] TRUE TRUE
  4. 回傳出現位置
# gregexpr() returns a list with one entry per element of x. Each entry holds
# the start positions of every match, and the "match.length" attribute holds
# the length of each match.
x = c("ABABC", "CCAAE")
gregexpr("A", x)
## [[1]]
## [1] 1 3
## attr(,"match.length")
## [1] 1 1
## attr(,"useBytes")
## [1] TRUE
## 
## [[2]]
## [1] 3 4
## attr(,"match.length")
## [1] 1 1
## attr(,"useBytes")
## [1] TRUE
  5. 子字串
# Extract characters 1 through 4 of the string.
substr("ndmc1234", 1, 4)
## [1] "ndmc"
  6. 字元取代
# gsub() replaces every occurrence of the pattern ("A") with the replacement ("C").
x = "AABB"
gsub("A", "C", x)
## [1] "CCBB"
  7. 計算字串長度
# nchar() is vectorised: it returns the number of characters of each element.
x = c("A","AAA","AAAAA")
nchar(x)
## [1] 1 3 5
  8. 大小寫切換
# Convert every letter to lower case, then to upper case.
tolower("aBcDe")
## [1] "abcde"
toupper("aBcDe")
## [1] "ABCDE"

第一節:基本文字操作(2)

– 這是第一種作法,使用函數「strsplit」做字串分割:

emails = c("xup6fup@mail.ndmctsgh.edu.tw", "xup6fup0629@gmail.com")

n.account = length(emails)

# Cut every address at "@"; fixed = TRUE treats "@" as a literal string.
splited_emails = strsplit(emails, split = "@" , fixed = TRUE)

# The account name is the first piece of each split address.
accounts = vapply(splited_emails, function(pieces) pieces[1], character(1))

accounts
## [1] "xup6fup"     "xup6fup0629"

– 這是第二種作法,先使用函數「gregexpr」回傳「@」出現位置,再使用函數「substr」分離字串:

emails = c("xup6fup@mail.ndmctsgh.edu.tw", "xup6fup0629@gmail.com")

n.account = length(emails)

# Positions of "@" in every address (fixed = TRUE: literal match).
symbol_pos = gregexpr("@" , emails, fixed = TRUE)

# Only the first "@" of each address marks the end of the account name.
first_at = vapply(symbol_pos, function(pos) pos[1], integer(1))

# substr() is vectorised, so all addresses are cut in a single call.
accounts = substr(emails, 1, first_at - 1)

accounts
## [1] "xup6fup"     "xup6fup0629"

練習1:處理半格式化的文字資料

– 我們這裡要用到函數「readLines」來進行讀檔:

# Read the file into a character vector with one element per line;
# head() then shows the first six lines.
dat = readLines("pirate.txt", encoding = "UTF-8")
head(dat)
## [1] "2013年9月份海盜案件紀要(東南亞地區)" "資料來源:馬來西亞海盜報案中心(PRC) "
## [3] "1.\t日期:2013年9月3日"              "時間:世界時間1410"                 
## [5] "經緯度:北緯10度13分、東經107度02分" "地點:越南"

練習1答案

# Exercise 1 answer: turn every "經緯度:" line of the report into signed
# decimal degrees (south latitude and west longitude become negative).
dat = readLines("pirate.txt", encoding = "UTF-8")

# Keep only the coordinate lines, drop the label, then split each line into
# its latitude part and its longitude part at the "、" separator.
dat1 = dat[grepl("經緯度:", dat)]
dat2 = gsub('經緯度:', "", dat1)
dat3 = strsplit(dat2, '、')

# Convert one coordinate text such as "北緯10度13分" to decimal degrees.
#   coord_text    : a single string, or NA when that part of the line is missing
#   negative_mark : character whose presence makes the result negative
# Returns NA when the part is missing or degrees/minutes cannot be read.
parse_coord = function(coord_text, negative_mark) {
  if (is.na(coord_text)) {return(NA_real_)}
  if (grepl(negative_mark, coord_text)) {direction = -1} else {direction = 1}
  # Split into "degrees" and "minutes", then strip the non-numeric labels.
  pieces = strsplit(coord_text, '度')[[1]]
  pieces = gsub('[北南]緯|[東西]經|分', '', pieces)
  pieces = as.numeric(pieces)
  (pieces[1] + pieces[2] / 60) * direction
}

# vapply() copes with zero matching lines (1:length(dat3) would iterate over
# c(1, 0) and fail), and parts[2] is NA instead of an error when a line has
# no "、" separator.
lat = vapply(dat3, function(parts) parse_coord(parts[1], '南'), numeric(1))
lon = vapply(dat3, function(parts) parse_coord(parts[2], '西'), numeric(1))

第二節:正則表達式(1)

– 這時候我們可以利用「正則表達式」來描述某些pattern的位置,像是這樣:

x = c("abc123", "abcd12!34")

# "[^0-9]" matches any single character that is not a digit; replacing every
# match with "" leaves only the digits.
gsub("[^0-9]", "", x)
## [1] "123"  "1234"

– 下面是所有中括號的用法:

## [1] "[Aa]     :: A 或 a"
## [1] "[^1-9]   :: not 1:9"
## [1] "[1-9]    :: 1:9"
## [1] "[a-z]    :: a b c ... z"
## [1] "[A-Z]    :: A B C ... Z"
## [1] "[a-zA-Z] :: 所有英文字母"
## [1] "[W-z]    :: WXYZabc....z (也包含ASCII中介於Z與a之間的符號,例如 _ 和 ^)"
## [1] "[w-Z]    :: 不可使用!"

第二節:正則表達式(2)

# " {1,}" matches a run of one or more spaces; each run becomes a single space.
gsub(" {1,}", " ", "nice to  meet   you.")
## [1] "nice to meet you."
## [1] "*        :: {0, }   至少出現 0次, 最多無限多次"
## [1] "+        :: {1, }   至少出現 1次, 最多無限多次"
## [1] "?        :: {0,1}   至少出現 0次, 最多出現 1次"
# " +" is the shorthand for " {1,}", so the result is the same.
gsub(" +", " ", "nice to  meet   you.")
## [1] "nice to meet you."

第二節:正則表達式(3)

x = c("medicine", "medical")

# (ine|al) is an alternation: "medic" followed by either "ine" or "al".
grepl("medic(ine|al)", x)
## [1] TRUE TRUE
## [1] "$        :: 字尾限定"
## [1] "^        :: 字首限定"
## [1] "|        :: \"ABC|DEF\" --> grep(\"ABC\"or\"DEF\",x)"
## [1] ".        :: 任意字元"
x = c("how are you", "hi Jack", "nice to meet you")

# "you$" : the string ends with "you".
grepl("you$", x)
## [1]  TRUE FALSE  TRUE
# "^h" : the string starts with "h".
grepl("^h", x)
## [1]  TRUE  TRUE FALSE
# "h.*a" : an "h" followed, after any number of characters, by an "a".
grepl("h.*a", x)
## [1]  TRUE  TRUE FALSE

第二節:正則表達式(4)

# Extract the latitude and longitude text from the "經緯度" lines with regular
# expressions, then convert them to signed decimal degrees.
dat = readLines("pirate.txt", encoding = "UTF-8")

area.dat = grep("經緯度", dat, value = TRUE)
lat.pos = gregexpr("[北南]緯[0-9]+度[0-9]+\\.*[0-9]*分", area.dat)
lon.pos = gregexpr("[東西]經[0-9]+度[0-9]+\\.*[0-9]*分", area.dat)

n.area = length(area.dat)

# First matched substring of a line, or "" when the line has no match.
first_match = function(found) {if (length(found) > 0) found[1] else ""}

# regmatches() reads the positions and match lengths stored by gregexpr(), so
# no manual substr() arithmetic is needed. vapply() also works when there are
# zero lines, where a loop over 1:n.area would run with i = 1 and i = 0.
lat.char = vapply(regmatches(area.dat, lat.pos), first_match, character(1))
lon.char = vapply(regmatches(area.dat, lon.pos), first_match, character(1))

lat.char
## [1] "北緯10度13分"  "北緯1度9.18分" "南緯7度9.9分"  "南緯0度16.4分"
## [5] "北緯1度7分"    "北緯4度52分"   "北緯3度58分"
lon.char
## [1] "東經107度02分"    "東經103度34.44分" "東經112度40.2分"  "東經117度41.7分" 
## [5] "東經103度37分"    "東經104度5分"     "東經98度45分"

# Convert one coordinate text such as "北緯10度13分" to decimal degrees.
#   coord           : a single string ("" gives NA)
#   negative_prefix : prefix ("南緯" or "西經") that makes the result negative
to_decimal_degrees = function(coord, negative_prefix) {
  # Split into degrees and minutes, keeping only digits and the decimal point.
  parts = strsplit(coord, "度")[[1]]
  parts = as.numeric(gsub("[^0-9\\.]", "", parts))
  value = parts[1] + parts[2] / 60
  if (grepl(negative_prefix, coord)) {value = -value}
  value
}

lat = vapply(lat.char, to_decimal_degrees, numeric(1),
             negative_prefix = "南緯", USE.NAMES = FALSE)
lon = vapply(lon.char, to_decimal_degrees, numeric(1),
             negative_prefix = "西經", USE.NAMES = FALSE)

lat
## [1] 10.2166667  1.1530000 -7.1650000 -0.2733333  1.1166667  4.8666667  3.9666667
lon
## [1] 107.0333 103.5740 112.6700 117.6950 103.6167 104.0833  98.7500

練習2:利用正則表達式有效率的找出問題

# Candidate passwords to be checked against the rules of this exercise.
x = c("KUJYW3A", "6yp7cfsxd", "fopKOLb", "0Ti5j9CVo", "YbdFfTy", "pqVb5RSO",
      "VqGAlm3WNw", "MBsnhKwAjT", "6qZDKlh", "2otWgVA", "q0bnD38", "PhEiD41",
      "MLf9naK", "Nn64XcER5", "MEGBXUL1", "bAeoV5wt8T", "qvjJfYMX", "vWcJRygs",
      "xuvDp2aB", "GuiraLx", "oNGYFgq4p", "KZoHvBO", "fzti6jekwd", "dLISnCv", 
      "LTHXGow", "sz4QhX5", "JBELeHr", "mgG6VAOD", "zaeNCRWwb", "UTVLBDxrth",
      "3LXAcJHzk", "7Tc0RJD", "xQvBtmOR", "vD3gjl6Z", "gZ2tShAVY", "9DQwZop",
      "YNZh6EaODH", "UjTMAne7D", "lyGTxu4", "7Iy1G4gPK", "rPZ0tlqM", "goLsQMc2",
      "bcvemkjftd", "Z6yrYx3q", "ReTYMxqlt", "Sd6BEHVCN", "Th8ZYWigq", "tqgr0oud", 
      "rAsRu4dw", "93w1nxe")
  1. 密碼長度需要8碼以上(含8碼)

  2. 密碼需要同時包含「大寫」、「小寫」、「數字」,只要缺少任一項即為有問題

## [1]  4 14 16 34 40 44

練習2答案

# Indices of the passwords in x that have at least 8 characters and match all
# three patterns. Each pattern has the form "<class>.*<class>", so it only
# matches when the password holds two characters of that class (upper case,
# lower case, digit).
# NOTE(review): the rules listed for this exercise only ask for the presence
# of each class -- confirm whether one or two characters are intended.
which(Reduce(`&`, list(
  grepl("[A-Z].*[A-Z]", x),
  grepl("[a-z].*[a-z]", x),
  grepl("[0-9].*[0-9]", x),
  nchar(x) >= 8
)))
## [1]  4 14 16 34 40 44

小結

– 另外熟悉文句的拆解及還原,並學習如何做拼寫校正,這有助於我們之後對文字資料做進一步的應用。

– 下週的課程我們會進一步的作文字分析,從網頁上擷取文字資訊並將其轉換為可用的資訊,如此能夠讓我們的程式更加自動化!