
Getting and Cleaning Data

Preface: these notes are based primarily on Course 3, Getting and Cleaning Data, of the Johns Hopkins University Data Science specialization on Coursera.


Tips

  • Record every step of your data processing
  • Give variables detailed, descriptive names; avoid abbreviations
  • Make good use of ?? when searching the help system (see the sketch after this list)
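For example, ?? runs a full-text search over all installed help pages (it is shorthand for help.search()), while ? opens the page for a name you already know:

??"regular expression" # find every help page mentioning regular expressions
?read.csv              # open the help page for read.csv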

Data collection

set path

setwd("C:/Users/msgsxj/Desktop/coursera/Getting and Cleaning Data")

create directories

if (!file.exists("data")) {
  dir.create("data")
}

get data

from csv

  • Functions in the utils package:
  • download.file(): for https URLs, macOS needs method = "curl" while Windows does not; plain http URLs need no extra argument on either platform
setwd("C:/Users/msgsxj/Desktop/coursera/Getting and Cleaning Data")
fileurl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"
download.file(fileurl, destfile = "./data/cameras.csv", method = "curl") # macOS needs method = "curl"; Windows does not
list.files("./data")
  • Functions in the utils package:
  • read.table()
  • read.csv()
cameraData <- read.table("./data/cameras.csv", sep = ",", header = TRUE)
cameraData <- read.csv("./data/cameras.csv")
  • Reading the csv 1,000 times with each function takes almost exactly the same time:
  • read.table(): 1.862808 secs
  • read.csv(): 1.861519 secs
t1 <- Sys.time()
for (i in 1:1000) {
  cameraData <- read.table("./data/cameras.csv", sep = ",", header = TRUE)
}
Sys.time() - t1 # 1.862808 secs

t2 <- Sys.time()
for (i in 1:1000) {
  cameraData <- read.csv("./data/cameras.csv")
}
Sys.time() - t2 # 1.861519 secs

from xlsx

  • First save cameras.csv as cameras.xlsx manually in Excel
  • Functions in the readxl package:
  • read_excel()
setwd("C:/Users/msgsxj/Desktop/coursera/Getting and Cleaning Data")
install.packages("tidyverse")
install.packages("readxl")
library(readxl)
cameraData <- read_excel("./data/cameras.xlsx")

from xml

  • Functions in the XML package:
  • xmlTreeParse()
  • xmlRoot()
  • xmlName()
  • xmlSApply()
  • xpathSApply()
install.packages("XML")
library(XML)
fileurl_1 <- "http://www.w3school.com.cn/example/xmle/note.xml"
doc_1 <- xmlTreeParse(fileurl_1, useInternal = TRUE)
doc_1
rootNode <- xmlRoot(doc_1)
rootNode
xmlName(rootNode)
rootNode[[1]]
rootNode[[1]][[1]]
xmlSApply(rootNode, xmlValue) # get the text of every node
xpathSApply(rootNode, "//to", xmlValue) # text of every node whose tag is <to>

  • Another example
fileurl_2 <- "http://espn.go.com/nfl/team/_/name/bal/baltimore-ravens"
doc_2 <- htmlTreeParse(fileurl_2, useInternal = TRUE)
time <- xpathSApply(doc_2, "//div[@class='game-meta']", xmlValue)
time
team <- xpathSApply(doc_2, "//div[@class='game-info']", xmlValue)
team

from JSON

  • JSON is structured somewhat like XML; it is widely used and is the most common format for data returned by APIs
  • Functions in the jsonlite package:
  • fromJSON()
  • toJSON()
library(jsonlite)
jsonData <- fromJSON("https://api.github.com/users/jtleek/repos")
names(jsonData)
names(jsonData$owner)
jsonData$owner$login
myjson <- toJSON(iris, pretty=TRUE)
iris2 <- fromJSON(myjson)
head(iris2)

from MySQL

  • Functions in the DBI package:
  • dbConnect()
  • dbGetQuery()
  • Functions in the RMySQL package:
  • MySQL()
library(DBI)
install.packages("RMySQL")
library(RMySQL)
ucscDb <- dbConnect(MySQL(), user = "genome", host = "genome-mysql.cse.ucsc.edu")
result <- dbGetQuery(ucscDb, "show databases;");dbDisconnect(ucscDb); # dbDisconnect() returns TRUE on success
  • An example database: hg19
  • Functions in the DBI package:
  • dbListTables()
  • dbReadTable()
  • dbSendQuery()
  • fetch()
  • dbClearResult()
  • dbDisconnect()
hg19 <- dbConnect(MySQL(), user = "genome", db = "hg19", host = "genome-mysql.cse.ucsc.edu")
allTables <- dbListTables(hg19)
length(allTables)
allTables[1:5]
dbListFields(hg19, "affyU133Plus2")
dbGetQuery(hg19, "select count(*) from affyU133Plus2") # count all rows in the table: 58463
affyData <- dbReadTable(hg19, "affyU133Plus2")
affyData[1,]
query <- dbSendQuery(hg19, "select * from affyU133Plus2 where misMatches between 1 and 3")
affyMis <- fetch(query)
quantile(affyMis$misMatches)
affyMisSmall <- fetch(query, n = 10)
dbClearResult(query) # returns TRUE once the result set is cleared
dim(affyMisSmall)
dbDisconnect(hg19)

from HDF5

  • Official site: https://support.hdfgroup.org/HDF5/
  • HDF5 stands for Hierarchical Data Format
  • Functions in the rhdf5 package:
  • h5createFile()
  • h5createGroup()
  • h5write()
  • h5ls()
  • h5read()
setwd("C:/Users/msgsxj/Desktop/coursera/Getting and Cleaning Data")
source("http://bioconductor.org/biocLite.R")
biocLite("rhdf5")
library(rhdf5)
created = h5createFile("./data/example.h5")
created
created = h5createGroup("./data/example.h5", "foo")
created = h5createGroup("./data/example.h5", "baa")
created = h5createGroup("./data/example.h5", "foo/foobaa")
A = matrix(1:10, nr = 2, nc = 5)
h5write(A, "./data/example.h5", "foo/A")
B = array(seq(0.1, 2.0, by = 0.1), dim = c(5,2,2))
attr(B, "scale") <- "liter"
h5write(B, "./data/example.h5", "foo/foobaa/B")
h5ls("./data/example.h5")
df = data.frame(1L:5L, seq(0,1,length.out=5), c("a","b","c","d","e"), stringsAsFactors = FALSE)
h5write(df, "./data/example.h5", "df") # write df directly into the top-level group
h5ls("./data/example.h5")
readA = h5read("./data/example.h5", "foo/A")
readB = h5read("./data/example.h5", "foo/foobaa/B")
readdf = h5read("./data/example.h5", "df")
readA
h5write(c(12,13,14), "./data/example.h5", "foo/A", index = list(1,1:3))
h5read("./data/example.h5", "foo/A")

from the Web

  • Functions in the base package:
  • url()
  • readLines()
con = url("http://www.msgsxj.cn/about/")
htmlCode = readLines(con)
close(con)
htmlCode[378:380]
  • Functions in the XML package:
  • htmlTreeParse()
  • htmlParse()
  • xpathSApply()
  • Functions in the httr package:
  • GET()
  • content()
library(XML)
url <- "http://xueshu.baidu.com/scholarID/CN-BP75S7TJ" # Zhou Zhihua
html <- htmlTreeParse(url, useInternalNodes = T)
xpathSApply(html, "//div[@class='res_info']", xmlValue)
# the equivalent using the httr package
library(httr)
html2 <- GET(url)
content2 <- content(html2, as = "text")
parsedHtml <- htmlParse(content2, asText = TRUE)
xpathSApply(parsedHtml, "//div[@class='res_info']", xmlValue)

  • Functions in the httr package:
  • GET(): for websites that require passwords
  • handle()
pg1 = GET("https://httpbin.org/basic-auth/user/passwd")
pg1
names(pg1)
pg2 = GET("https://httpbin.org/basic-auth/user/passwd", authenticate("user","passwd"))
pg2
names(pg2)
baidu = handle("https://baidu.com")
pg3 = GET(handle = baidu, path = "/")
pg3

from API

library(httr)
# 1. Find OAuth settings for github:
# http://developer.github.com/v3/oauth/
oauth_endpoints("github")

# 2. To make your own application, register at
# https://github.com/settings/developers. Use any URL for the homepage URL
# (http://github.com is fine) and http://localhost:1410 as the callback url
#
# Replace your key and secret below.
myapp <- oauth_app("github",
  key = "56b637a5baffac62cad9",
  secret = "8e107541ae1791259e9987d544ca568633da2ebf")

# 3. Get OAuth credentials
github_token <- oauth2.0_token(oauth_endpoints("github"), myapp)

# 4. Use API
gtoken <- config(token = github_token)
req <- GET("https://api.github.com/rate_limit", gtoken)
stop_for_status(req)
content(req)
# OR:
req <- with_config(gtoken, GET("https://api.github.com/rate_limit"))
stop_for_status(req)
content(req)

from other sources

  • Consult the documentation for these functions and packages (a small gzfile() sketch follows this list):
  • file()
  • url()
  • gzfile()
  • bzfile()
  • read.arff() (Weka)
  • read.dta() (Stata)
  • read.mtp() (Minitab)
  • read.octave() (Octave)
  • read.spss() (SPSS)
  • read.xport() (SAS)
  • read_fwf() (fixed-width .for files)
  • RPostgreSQL package (provides a DBI-compliant database connection)
  • RODBC package (provides interfaces to PostgreSQL, MySQL, Microsoft Access, SQLite)
  • RMongo and rmongodb packages (MongoDB)
  • jpeg, readbitmap, png, EBImage packages (reading image data)
  • rgdal, rgeos, raster packages (reading GIS (Geographic Information System) data)
  • tuneR, seewave packages (reading MP3 audio)
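For instance, the connection functions handle compressed files transparently. A minimal sketch that round-trips a gzip-compressed csv through gzfile() (the file path is illustrative):

con <- gzfile("./data/iris.csv.gz", "w") # open a gzip connection for writing
write.csv(iris, con, row.names = FALSE)
close(con)
irisGz <- read.csv(gzfile("./data/iris.csv.gz")) # read.csv() accepts a connection
head(irisGz)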

Some Data Resources

  • United Nations: http://data.un.org/
  • U.S.: http://www.data.gov/
  • United Kingdom: http://data.gov.uk/
  • France: http://www.data.gouv.fr/
  • Ghana: http://data.gov.gh/
  • Australia: http://data.gov.au/
  • Germany: http://www.govdata.de/
  • Hong Kong: http://www.gov.hk/en/theme/psi/datasets/
  • Japan: http://www.data.go.jp/
  • Many more: http://www.data.gov/opendatasites
  • http://www.gapminder.org/
  • http://www.asdfree.com/
  • http://www.kaggle.com/
  • Hilary Mason: http://bitly.com/bundles/hmason/1
  • Jeff Hammerbacher: http://www.quora.com/Jeff-Hammerbacher/Introduction-to-Data-Science-Data-Sets

Making data tidy

Subsetting and Sorting

Subsetting

  • X[]
set.seed(13435)
X <- data.frame("var1"=sample(1:5),"var2"=sample(6:10),"var3"=sample(11:15))
X <- X[sample(1:5),]
X$var2[c(1,3)]=NA
X
X[,1] # extract the first column
X[,"var1"] # extract the var1 column
X[1:2,"var2"]
X[(X$var1 <= 3 & X$var3 > 11),]
X[(X$var1 <= 3 | X$var3 > 15),]
X[which(X$var2 > 8),]
X[(X$var2 > 8),] # rows where var2 is NA come back as NA rows (unlike which())
X[]

Sorting

  • Functions in the base package:
  • sort()
  • order()
set.seed(13435)
X <- data.frame("var1"=sample(1:5),"var2"=sample(6:10),"var3"=sample(11:15))
X <- X[sample(1:5),]
X$var2[c(1,3)]=NA
sort(X$var1)
sort(X$var1, decreasing = TRUE)
sort(X$var2, na.last = TRUE)
X[order(X$var1),]
X[order(X$var1,X$var3),]

  • Functions in the plyr package:
  • arrange()
library(plyr)
set.seed(13435)
X <- data.frame("var1"=sample(1:5),"var2"=sample(6:10),"var3"=sample(11:15))
X <- X[sample(1:5),]
X$var2[c(1,3)]=NA
arrange(X, var1)
arrange(X, desc(var1))
X$var4 <- rnorm(5)
X
Y <- cbind(X, rnorm(5))
Y

![](/pictures/R/R_pylr.png)

Summarizing Data

  • Download the data
setwd("C:/Users/msgsxj/Desktop/coursera/Getting and Cleaning Data")
fileurl <- "https://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD"
download.file(fileurl, destfile = "./data/restaurants.csv", method = "curl")
restData <- read.csv("./data/restaurants.csv")
  • Functions in the utils package:
  • head()
  • tail()
  • str()
  • Functions in the base package:
  • summary()
  • table()
  • Functions in the stats package:
  • quantile()
head(restData, n = 1)
tail(restData, n = 1)
summary(restData)
str(restData) # often more informative than summary()
quantile(restData$councilDistrict, na.rm = TRUE)
quantile(restData$councilDistrict, probs = c(0.5,0.75,0.9))
table(restData$zipCode, useNA = "ifany") # useNA = "ifany" also counts the NAs
table(restData$councilDistrict, restData$zipCode)

  • Functions in the base package:
  • is.na()
sum(is.na(restData$councilDistrict))
any(is.na(restData$councilDistrict))
all(restData$zipCode > 0)
colSums(is.na(restData))
all(colSums(is.na(restData)) == 0)
table(restData$zipCode %in% c("21212"))
table(restData$zipCode %in% c("21212","21213"))
restData[(restData$zipCode %in% c("21212","21213")),]

  • Functions in the base package:
  • as.data.frame()
  • Functions in the stats package:
  • xtabs()
  • ftable()
data(UCBAdmissions)
DF = as.data.frame(UCBAdmissions)
summary(DF)
xt <- xtabs(Freq ~ Gender + Admit, data = DF)
xt
warpbreaks$replicate <- rep(1:9, len = 54)
xt <- xtabs(breaks ~., data = warpbreaks)
ftable(xt)

  • Functions in the utils package:
  • object.size()
fakeData = rnorm(1e5)
object.size(fakeData)
print(object.size(fakeData), units = "Mb") # 0.8 Mb

Creating New Variables

  • Download the data
setwd("C:/Users/msgsxj/Desktop/coursera/Getting and Cleaning Data")
fileurl <- "https://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD"
download.file(fileurl, destfile = "./data/restaurants.csv", method = "curl")
restData <- read.csv("./data/restaurants.csv")

Create sequences

  • Functions in the base package:
  • seq()
seq(1, 10, by = 2) # 1 3 5 7 9
seq(1, 10, length = 3) # 1.0 5.5 10.0
x <- c(1, 3, 8, 25, 100) # 1 3 8 25 100
seq(along = x) # 1 2 3 4 5

Create binary variables

  • Functions in the base package:
  • ifelse()
restData$zipWrong <- ifelse(restData$zipCode < 0, T, F)
class(restData$zipWrong)

Create categorical variables

  • Functions in the base package:
  • cut()
  • Functions in the Hmisc package:
  • cut2()
  • Functions in the plyr package:
  • mutate()
install.packages("Hmisc")
restData$zipGroups <- cut(restData$zipCode, breaks = quantile(restData$zipCode))
table(restData$zipGroups)
class(restData$zipGroups)
library(Hmisc)
restData$zipGroups <- cut2(restData$zipCode, g = 4)
table(restData$zipGroups)
class(restData$zipGroups)
library(plyr)
restData2 <- mutate(restData, zipGroups = cut2(zipCode, g = 4)) # add the new variable to a copy of the data frame
table(restData2$zipGroups)

  • Functions in the base package:
  • factor()
  • Functions in the stats package:
  • relevel()
restData$zcf <- factor(restData$zipCode)
restData$zcf[1:10]
class(restData$zcf)
yesno <- sample(c("yes","no"), size = 10, replace = TRUE)
class(yesno)
yesnofac <- factor(yesno, levels = c("yes","no"))
class(yesnofac)
relevel(yesnofac, ref = "yes") # make "yes" the reference (first) level
as.numeric(yesnofac)

Common transforms

  • abs()
  • sqrt()
  • ceiling()
  • floor()
  • round(x, digits = n) # n digits after the decimal point (see the sketch after this list)
  • signif(x, digits = n) # n significant digits in total
  • cos()
  • log()
  • log2()
  • log10()
  • exp()
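A quick check of the difference between round() and signif():

round(123.456, digits = 2)  # 123.46 -- two digits after the decimal point
signif(123.456, digits = 2) # 120    -- two significant digits in total
ceiling(3.2)                # 4
floor(3.8)                  # 3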

Reshaping Data

  • Functions in the reshape2 package:
  • melt()
  • dcast()
library(reshape2)
dim(mtcars)
mtcars$carname <- rownames(mtcars)
carMelt <- melt(mtcars, id = c("carname","gear","cyl"), measure.vars = c("mpg","hp")) # the row count doubles: one row per measured variable
dim(carMelt)
head(carMelt)
tail(carMelt)
class(carMelt$variable)
cylData <- dcast(carMelt, cyl ~ variable, mean)
cylData # the mean of each measured variable from melt(), grouped by cyl

  • Functions in the base package:
  • tapply()
  • split()
  • lapply()
  • unlist()
  • sapply()
tapply(InsectSprays$count, InsectSprays$spray, sum) # sum the counts by spray type
spIns <- split(InsectSprays$count, InsectSprays$spray);spIns
sprCount <- lapply(spIns, sum);sprCount # sum by spray type
unlist(sprCount)
sapply(spIns, sum) # sapply() is lapply() with simplified output

  • Functions in the plyr package:
  • ddply()
library(plyr)
ddply(InsectSprays, .(spray), summarise, sum=sum(count))
spraySums <- ddply(InsectSprays, .(spray), summarise, sum=ave(count))
dim(spraySums)
head(spraySums)

Editing Text Variables

  • Download the data
setwd("C:/Users/msgsxj/Desktop/coursera/Getting and Cleaning Data")
fileurl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"
download.file(fileurl, destfile = "./data/camera2.csv", method = "curl")
camera2Data <- read.csv("./data/camera2.csv")
names(camera2Data)
  • Functions in the base package:
  • tolower()
  • strsplit()
  • sapply()
names(camera2Data) <- tolower(names(camera2Data)) # lower-case all variable names
names(camera2Data)
splitNames <- strsplit(names(camera2Data),"\\.") # the period must be escaped
splitNames[[6]]
splitNames[[6]][1]
firstElement <- function(x){x[1]}
names(camera2Data) <- sapply(splitNames,firstElement) # keep only the part before the first period
names(camera2Data)

  • A similar effect to the strsplit() + sapply() steps above: replace "." with ""
  • sub()
  • gsub(): replaces all matches
camera2Data <- read.csv("./data/camera2.csv") # reload the data
names(camera2Data)
sub("\\.", "", names(camera2Data)) # the period must be escaped
mytest <- "this_is_a_test"
sub("_", "", mytest) # no escaping needed
gsub("_", "", mytest)

  • Functions in the base package:
  • grep()
  • grepl(): returns a logical vector
  • See Wikipedia for more regular-expression syntax
grep("Alameda", camera2Data$intersection) # 4 5 36
table(grepl("Alameda", camera2Data$intersection))
camera2Data_noAlameda <- camera2Data[!grepl("Alameda", camera2Data$intersection),]
grep("Alameda", camera2Data$intersection, value = T) # 直接返回值
length(grep("JeffStreet", camera2Data$intersection)) # 检查是否出现
length(grep('^[12]/2/2007', '22/2/2007', value = T))
length(grep('^[12]/2/2007', '2/2/2007', value = T))

  • Functions in the base package:
  • nchar()
  • substr()
  • paste()
  • paste0()
  • Functions in the stringr package:
  • str_trim()
library(stringr)
nchar("Jeffery Leek")
substr("Jeffery Leek",1,7)
paste("Jeffery","Leek") #自动加上了空格
paste0("Jeffery","Leek") #不加空格
str_trim("Jeff ") #去掉空格

Date

  • Functions in the base package:
  • date()
  • Sys.Date()
d1 <- date()
class(d1)
d2 <- Sys.Date()
class(d2)
format(d2, "%a %b %d")
julian(d2)

  • Functions in the lubridate package:
  • ymd()
  • mdy()
  • ymd_hms()
  • dmy(): returns a Date
  • wday(): returns the day of the week
library(lubridate)
ymd("20140108")
mdy("08/04/2013")
ymd_hms("2011-08-03 10:15:03")
ymd_hms("2011-08-03 10:15:03", tz = "Pacific/Auckland")
x = dmy(c("1jan2013","2jan2013","31mar2013","30jul2013"))
x[1]
wday(x[1])
wday(x[1],label=T)


Some Powerful Tricks

%>%

  • %>%: the pipe operator; it passes the left-hand value as the first argument of the right-hand call
  • The two lines below are equivalent:
library(dplyr) # provides %>%, mutate(), and n()
anscombe_tidy <- anscombe %>% mutate(observation = seq_len(n()))
anscombe_tidy <- mutate(anscombe, observation = seq_len(n()))
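Pipes pay off when several steps are chained. A small sketch using the dplyr verbs filter() and summarise() (any pipe-aware functions would do):

library(dplyr)
mtcars %>%
  filter(cyl == 4) %>%            # keep the four-cylinder cars
  summarise(mean_mpg = mean(mpg)) # average their fuel economy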

parse_number()

  • Functions in the readr package:
  • parse_number(): extracts the number from a string
library(readr)
parse_number("$1000") # 1000
parse_number("1,234,567.78") # 1234568
parse_number("class5") # 5

print(10)

k <- {10; 5; 5; 88; 7}  # a braced block evaluates to its last expression, so k is 7
k
k <- {print(10); 5}
k
k <- {print(10)}
k

data.table package

  • data.table inherits from data.frame
library(data.table)
DT <- data.table(x = rnorm(9), r = rep(c("a","b","c"),each=3), z = rnorm(9))
head(DT, 3)
DT[c(2,3)] # rows 2 and 3
DT[1:2,c(2,3)] # columns 2 and 3 of rows 1 and 2
DT[,list(mean(x),sum(z))] # the mean of x and the sum of z
DT[,table(r)] # frequency table of r
DT[,w:={tmp <- (x+z); log2(tmp+5)}] # add a column w
DT[,a:=x>0] # add a logical column a
DT[,b:=mean(x+w),by=a] # add a column b, averaged within groups of a
DT <- data.table(x = sample(letters[1:3], 1E5, TRUE))
DT[, .N, by=x]
DT <- data.table(x=rep(c("a","b","c"),each=100),y=rnorm(300))
setkey(DT, x)
head(DT['a'])
DT1 <- data.table(x=c('a','a','b','dt1'),y=1:4)
DT2 <- data.table(x=c('a','b','dt2'),z=5:7)
setkey(DT1,x);setkey(DT2,x) # keys need not be unique
DT1
DT2
merge(DT1,DT2) # merge on the shared key x

  • fread(): reads files very quickly
library(data.table)
big_df <- data.frame(x=rnorm(1E7),y=rnorm(1E7))
file <- tempfile()
write.table(big_df,file=file,row.names=FALSE,col.names=TRUE,sep="\t",quote=FALSE)
system.time(fread(file))
system.time(read.table(file,header= TRUE,sep="\t"))

lapply, sapply, apply, tapply, mapply

  • lapply: loop over a list and evaluate a function on each element
  • sapply: same as lapply but try to simplify the result
  • apply: apply a function over the margins of an array
  • tapply: apply a function over subsets of a vector
  • mapply: multivariate version of lapply
x <- 1:4
lapply(x, runif)
x <- 1:4
sapply(x, runif)
x <- matrix(rnorm(200), 20, 10)
apply(x, 2, mean)
x <- c(rnorm(10), runif(10), rnorm(10,1))
f <- gl(3, 10)
tapply(x, f, mean)
mapply(rep, 1:4, 1:4)

now()

now("America/New_York")