
Getting and Cleaning Data

Preface: these notes are based primarily on Course 3, Getting and Cleaning Data, of the Johns Hopkins University Data Science specialization on Coursera.


Tips

  • Record every step of your data processing
  • Give variables detailed, descriptive names; avoid abbreviations
  • Make good use of ?? when searching the help system (see the sketch after this list)
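For example, ?? runs a full-text search over all installed help pages (it is shorthand for help.search()), while ? opens the page for a name you already know:

??"regular expression" # find every help page mentioning regular expressions
?read.csv              # open the help page for read.csv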

Data collection

set path

setwd("C:/Users/msgsxj/Desktop/coursera/Getting and Cleaning Data")

create directories

if (!file.exists("data")) {
  dir.create("data")
}

get data

from csv

  • Functions in the utils package:
  • download.file(): for https URLs, macOS needs method = "curl" while Windows does not; plain http URLs need no extra argument on either platform
setwd("C:/Users/msgsxj/Desktop/coursera/Getting and Cleaning Data")
fileurl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"
download.file(fileurl, destfile = "./data/cameras.csv", method = "curl") # macOS needs method = "curl"; Windows does not
list.files("./data")
  • Functions in the utils package:
  • read.table()
  • read.csv()
cameraData <- read.table("./data/cameras.csv", sep = ",", header = TRUE)
cameraData <- read.csv("./data/cameras.csv")
  • Reading the csv 1,000 times with each function takes almost exactly the same time:
  • read.table(): 1.862808 secs
  • read.csv(): 1.861519 secs
t1 <- Sys.time()
for (i in 1:1000) {
  cameraData <- read.table("./data/cameras.csv", sep = ",", header = TRUE)
}
Sys.time() - t1 # 1.862808 secs

t2 <- Sys.time()
for (i in 1:1000) {
  cameraData <- read.csv("./data/cameras.csv")
}
Sys.time() - t2 # 1.861519 secs

from xlsx

  • First save cameras.csv as cameras.xlsx manually in Excel
  • Functions in the readxl package:
  • read_excel()
setwd("C:/Users/msgsxj/Desktop/coursera/Getting and Cleaning Data")
install.packages("tidyverse")
install.packages("readxl")
library(readxl)
cameraData <- read_excel("./data/cameras.xlsx")

from xml

  • Functions in the XML package:
  • xmlTreeParse()
  • xmlRoot()
  • xmlName()
  • xmlSApply()
  • xpathSApply()
install.packages("XML")
library(XML)
fileurl_1 <- "http://www.w3school.com.cn/example/xmle/note.xml"
doc_1 <- xmlTreeParse(fileurl_1, useInternal = TRUE)
doc_1
rootNode <- xmlRoot(doc_1)
rootNode
xmlName(rootNode)
rootNode[[1]]
rootNode[[1]][[1]]
xmlSApply(rootNode, xmlValue) # get the text of every node
xpathSApply(rootNode, "//to", xmlValue) # text of every node whose tag is <to>

  • Another example
fileurl_2 <- "http://espn.go.com/nfl/team/_/name/bal/baltimore-ravens"
doc_2 <- htmlTreeParse(fileurl_2, useInternal = TRUE)
time <- xpathSApply(doc_2, "//div[@class='game-meta']", xmlValue)
time
team <- xpathSApply(doc_2, "//div[@class='game-info']", xmlValue)
team

from JSON

  • JSON is structured somewhat like XML; it is widely used and is the most common format for data returned by APIs
  • Functions in the jsonlite package:
  • fromJSON()
  • toJSON()
library(jsonlite)
jsonData <- fromJSON("https://api.github.com/users/jtleek/repos")
names(jsonData)
names(jsonData$owner)
jsonData$owner$login
myjson <- toJSON(iris, pretty=TRUE)
iris2 <- fromJSON(myjson)
head(iris2)

from MySQL

  • Functions in the DBI package:
  • dbConnect()
  • dbGetQuery()
  • Functions in the RMySQL package:
  • MySQL()
library(DBI)
install.packages("RMySQL")
library(RMySQL)
ucscDb <- dbConnect(MySQL(), user = "genome", host = "genome-mysql.cse.ucsc.edu")
result <- dbGetQuery(ucscDb, "show databases;");dbDisconnect(ucscDb); # dbDisconnect() returns TRUE on success
  • An example database: hg19
  • Functions in the DBI package:
  • dbListTables()
  • dbReadTable()
  • dbSendQuery()
  • fetch()
  • dbClearResult()
  • dbDisconnect()
hg19 <- dbConnect(MySQL(), user = "genome", db = "hg19", host = "genome-mysql.cse.ucsc.edu")
allTables <- dbListTables(hg19)
length(allTables)
allTables[1:5]
dbListFields(hg19, "affyU133Plus2")
dbGetQuery(hg19, "select count(*) from affyU133Plus2") # count all rows in the table: 58463
affyData <- dbReadTable(hg19, "affyU133Plus2")
affyData[1,]
query <- dbSendQuery(hg19, "select * from affyU133Plus2 where misMatches between 1 and 3")
affyMis <- fetch(query)
quantile(affyMis$misMatches)
affyMisSmall <- fetch(query, n = 10)
dbClearResult(query) # returns TRUE once the result set is cleared
dim(affyMisSmall)
dbDisconnect(hg19)

from HDF5

  • Official site: https://support.hdfgroup.org/HDF5/
  • HDF5 stands for Hierarchical Data Format
  • Functions in the rhdf5 package:
  • h5createFile()
  • h5createGroup()
  • h5write()
  • h5ls()
  • h5read()
setwd("C:/Users/msgsxj/Desktop/coursera/Getting and Cleaning Data")
source("http://bioconductor.org/biocLite.R")
biocLite("rhdf5")
library(rhdf5)
created = h5createFile("./data/example.h5")
created
created = h5createGroup("./data/example.h5", "foo")
created = h5createGroup("./data/example.h5", "baa")
created = h5createGroup("./data/example.h5", "foo/foobaa")
A = matrix(1:10, nr = 2, nc = 5)
h5write(A, "./data/example.h5", "foo/A")
B = array(seq(0.1, 2.0, by = 0.1), dim = c(5,2,2))
attr(B, "scale") <- "liter"
h5write(B, "./data/example.h5", "foo/foobaa/B")
h5ls("./data/example.h5")
df = data.frame(1L:5L, seq(0,1,length.out=5), c("a","b","c","d","e"), stringsAsFactors = FALSE)
h5write(df, "./data/example.h5", "df") # write df directly into the top-level group
h5ls("./data/example.h5")
readA = h5read("./data/example.h5", "foo/A")
readB = h5read("./data/example.h5", "foo/foobaa/B")
readdf = h5read("./data/example.h5", "df")
readA
h5write(c(12,13,14), "./data/example.h5", "foo/A", index = list(1,1:3))
h5read("./data/example.h5", "foo/A")

from the Web

  • Functions in the base package:
  • url()
  • readLines()
con = url("http://www.msgsxj.cn/about/")
htmlCode = readLines(con)
close(con)
htmlCode[378:380]
  • Functions in the XML package:
  • htmlTreeParse()
  • htmlParse()
  • xpathSApply()
  • Functions in the httr package:
  • GET()
  • content()
library(XML)
url <- "http://xueshu.baidu.com/scholarID/CN-BP75S7TJ" # Zhou Zhihua
html <- htmlTreeParse(url, useInternalNodes = T)
xpathSApply(html, "//div[@class='res_info']", xmlValue)
# the equivalent using the httr package
library(httr)
html2 <- GET(url)
content2 <- content(html2, as = "text")
parsedHtml <- htmlParse(content2, asText = TRUE)
xpathSApply(parsedHtml, "//div[@class='res_info']", xmlValue)

  • Functions in the httr package:
  • GET(): for websites that require passwords
  • handle()
pg1 = GET("https://httpbin.org/basic-auth/user/passwd")
pg1
names(pg1)
pg2 = GET("https://httpbin.org/basic-auth/user/passwd", authenticate("user","passwd"))
pg2
names(pg2)
baidu = handle("https://baidu.com")
pg3 = GET(handle = baidu, path = "/")
pg3

from API

library(httr)
# 1. Find OAuth settings for github:
# http://developer.github.com/v3/oauth/
oauth_endpoints("github")

# 2. To make your own application, register at
# https://github.com/settings/developers. Use any URL for the homepage URL
# (http://github.com is fine) and http://localhost:1410 as the callback url
#
# Replace your key and secret below.
myapp <- oauth_app("github",
  key = "56b637a5baffac62cad9",
  secret = "8e107541ae1791259e9987d544ca568633da2ebf")

# 3. Get OAuth credentials
github_token <- oauth2.0_token(oauth_endpoints("github"), myapp)

# 4. Use API
gtoken <- config(token = github_token)
req <- GET("https://api.github.com/rate_limit", gtoken)
stop_for_status(req)
content(req)
# OR:
req <- with_config(gtoken, GET("https://api.github.com/rate_limit"))
stop_for_status(req)
content(req)

from other sources

  • Consult the documentation for these functions and packages (a small gzfile() sketch follows this list):
  • file()
  • url()
  • gzfile()
  • bzfile()
  • read.arff() (Weka)
  • read.dta() (Stata)
  • read.mtp() (Minitab)
  • read.octave() (Octave)
  • read.spss() (SPSS)
  • read.xport() (SAS)
  • read_fwf() (fixed-width .for files)
  • RPostgreSQL package (provides a DBI-compliant database connection)
  • RODBC package (provides interfaces to PostgreSQL, MySQL, Microsoft Access, SQLite)
  • RMongo and rmongodb packages (MongoDB)
  • jpeg, readbitmap, png, EBImage packages (reading image data)
  • rgdal, rgeos, raster packages (reading GIS (Geographic Information System) data)
  • tuneR, seewave packages (reading MP3 audio)
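For instance, the connection functions handle compressed files transparently. A minimal sketch that round-trips a gzip-compressed csv through gzfile() (the file path is illustrative):

con <- gzfile("./data/iris.csv.gz", "w") # open a gzip connection for writing
write.csv(iris, con, row.names = FALSE)
close(con)
irisGz <- read.csv(gzfile("./data/iris.csv.gz")) # read.csv() accepts a connection
head(irisGz)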

Some Data Resources

  • United Nations: http://data.un.org/
  • U.S.: http://www.data.gov/
  • United Kingdom: http://data.gov.uk/
  • France: http://www.data.gouv.fr/
  • Ghana: http://data.gov.gh/
  • Australia: http://data.gov.au/
  • Germany: http://www.govdata.de/
  • Hong Kong: http://www.gov.hk/en/theme/psi/datasets/
  • Japan: http://www.data.go.jp/
  • Many more: http://www.data.gov/opendatasites
  • http://www.gapminder.org/
  • http://www.asdfree.com/
  • http://www.kaggle.com/
  • Hilary Mason: http://bitly.com/bundles/hmason/1
  • Jeff Hammerbacher: http://www.quora.com/Jeff-Hammerbacher/Introduction-to-Data-Science-Data-Sets

Making data tidy

Subsetting and Sorting

Subsetting

  • X[]
set.seed(13435)
X <- data.frame("var1"=sample(1:5),"var2"=sample(6:10),"var3"=sample(11:15))
X <- X[sample(1:5),]
X$var2[c(1,3)]=NA
X
X[,1] # extract the first column
X[,"var1"] # extract the var1 column
X[1:2,"var2"]
X[(X$var1 <= 3 & X$var3 > 11),]
X[(X$var1 <= 3 | X$var3 > 15),]
X[which(X$var2 > 8),]
X[(X$var2 > 8),] # rows where var2 is NA come back as NA rows (unlike which())
X[]

Sorting

  • Functions in the base package:
  • sort()
  • order()
set.seed(13435)
X <- data.frame("var1"=sample(1:5),"var2"=sample(6:10),"var3"=sample(11:15))
X <- X[sample(1:5),]
X$var2[c(1,3)]=NA
sort(X$var1)
sort(X$var1, decreasing = TRUE)
sort(X$var2, na.last = TRUE)
X[order(X$var1),]
X[order(X$var1,X$var3),]

  • Functions in the plyr package:
  • arrange()
library(plyr)
set.seed(13435)
X <- data.frame("var1"=sample(1:5),"var2"=sample(6:10),"var3"=sample(11:15))
X <- X[sample(1:5),]
X$var2[c(1,3)]=NA
arrange(X, var1)
arrange(X, desc(var1))
X$var4 <- rnorm(5)
X
Y <- cbind(X, rnorm(5))
Y

![](/pictures/R/R_pylr.png)

Summarizing Data

  • Download the data
setwd("C:/Users/msgsxj/Desktop/coursera/Getting and Cleaning Data")
fileurl <- "https://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD"
download.file(fileurl, destfile = "./data/restaurants.csv", method = "curl")
restData <- read.csv("./data/restaurants.csv")
  • Functions in the utils package:
  • head()
  • tail()
  • str()
  • Functions in the base package:
  • summary()
  • table()
  • Functions in the stats package:
  • quantile()
head(restData, n = 1)
tail(restData, n = 1)
summary(restData)
str(restData) # often more informative than summary()
quantile(restData$councilDistrict, na.rm = TRUE)
quantile(restData$councilDistrict, probs = c(0.5,0.75,0.9))
table(restData$zipCode, useNA = "ifany") # useNA = "ifany" also counts the NAs
table(restData$councilDistrict, restData$zipCode)

  • Functions in the base package:
  • is.na()
sum(is.na(restData$councilDistrict))
any(is.na(restData$councilDistrict))
all(restData$zipCode > 0)
colSums(is.na(restData))
all(colSums(is.na(restData)) == 0)
table(restData$zipCode %in% c("21212"))
table(restData$zipCode %in% c("21212","21213"))
restData[(restData$zipCode %in% c("21212","21213")),]

  • Functions in the base package:
  • as.data.frame()
  • Functions in the stats package:
  • xtabs()
  • ftable()
data(UCBAdmissions)
DF = as.data.frame(UCBAdmissions)
summary(DF)
xt <- xtabs(Freq ~ Gender + Admit, data = DF)
xt
warpbreaks$replicate <- rep(1:9, len = 54)
xt <- xtabs(breaks ~., data = warpbreaks)
ftable(xt)

  • Functions in the utils package:
  • object.size()
fakeData = rnorm(1e5)
object.size(fakeData)
print(object.size(fakeData), units = "Mb") # 0.8 Mb

Creating New Variables

  • Download the data
setwd("C:/Users/msgsxj/Desktop/coursera/Getting and Cleaning Data")
fileurl <- "https://data.baltimorecity.gov/api/views/k5ry-ef3g/rows.csv?accessType=DOWNLOAD"
download.file(fileurl, destfile = "./data/restaurants.csv", method = "curl")
restData <- read.csv("./data/restaurants.csv")

Create sequences

  • Functions in the base package:
  • seq()
seq(1, 10, by = 2) # 1 3 5 7 9
seq(1, 10, length = 3) # 1.0 5.5 10.0
x <- c(1, 3, 8, 25, 100) # 1 3 8 25 100
seq(along = x) # 1 2 3 4 5

Create binary variables

  • Functions in the base package:
  • ifelse()
restData$zipWrong <- ifelse(restData$zipCode < 0, T, F)
class(restData$zipWrong)

Create categorical variables

  • Functions in the base package:
  • cut()
  • Functions in the Hmisc package:
  • cut2()
  • Functions in the plyr package:
  • mutate()
install.packages("Hmisc")
restData$zipGroups <- cut(restData$zipCode, breaks = quantile(restData$zipCode))
table(restData$zipGroups)
class(restData$zipGroups)
library(Hmisc)
restData$zipGroups <- cut2(restData$zipCode, g = 4)
table(restData$zipGroups)
class(restData$zipGroups)
library(plyr)
restData2 <- mutate(restData, zipGroups = cut2(zipCode, g = 4)) # add the new variable to a copy of the data frame
table(restData2$zipGroups)

  • Functions in the base package:
  • factor()
  • Functions in the stats package:
  • relevel()
restData$zcf <- factor(restData$zipCode)
restData$zcf[1:10]
class(restData$zcf)
yesno <- sample(c("yes","no"), size = 10, replace = TRUE)
class(yesno)
yesnofac <- factor(yesno, levels = c("yes","no"))
class(yesnofac)
relevel(yesnofac, ref = "yes") # make "yes" the reference (first) level
as.numeric(yesnofac)

Common transforms

  • abs()
  • sqrt()
  • ceiling()
  • floor()
  • round(x, digits = n) # n digits after the decimal point (see the sketch after this list)
  • signif(x, digits = n) # n significant digits in total
  • cos()
  • log()
  • log2()
  • log10()
  • exp()
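A quick check of the difference between round() and signif():

round(123.456, digits = 2)  # 123.46 -- two digits after the decimal point
signif(123.456, digits = 2) # 120    -- two significant digits in total
ceiling(3.2)                # 4
floor(3.8)                  # 3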

Reshaping Data

  • Functions in the reshape2 package:
  • melt()
  • dcast()
library(reshape2)
dim(mtcars)
mtcars$carname <- rownames(mtcars)
carMelt <- melt(mtcars, id = c("carname","gear","cyl"), measure.vars = c("mpg","hp")) # the row count doubles: one row per measured variable
dim(carMelt)
head(carMelt)
tail(carMelt)
class(carMelt$variable)
cylData <- dcast(carMelt, cyl ~ variable, mean)
cylData # the mean of each measured variable from melt(), grouped by cyl

  • Functions in the base package:
  • tapply()
  • split()
  • lapply()
  • unlist()
  • sapply()
tapply(InsectSprays$count, InsectSprays$spray, sum) # sum the counts by spray type
spIns <- split(InsectSprays$count, InsectSprays$spray);spIns
sprCount <- lapply(spIns, sum);sprCount # sum by spray type
unlist(sprCount)
sapply(spIns, sum) # sapply() is lapply() with simplified output

  • Functions in the plyr package:
  • ddply()
library(plyr)
ddply(InsectSprays, .(spray), summarise, sum=sum(count))
spraySums <- ddply(InsectSprays, .(spray), summarise, sum=ave(count))
dim(spraySums)
head(spraySums)

Editing Text Variables

  • Download the data
setwd("C:/Users/msgsxj/Desktop/coursera/Getting and Cleaning Data")
fileurl <- "https://data.baltimorecity.gov/api/views/dz54-2aru/rows.csv?accessType=DOWNLOAD"
download.file(fileurl, destfile = "./data/camera2.csv", method = "curl")
camera2Data <- read.csv("./data/camera2.csv")
names(camera2Data)
  • Functions in the base package:
  • tolower()
  • strsplit()
  • sapply()
names(camera2Data) <- tolower(names(camera2Data)) # lower-case all variable names
names(camera2Data)
splitNames <- strsplit(names(camera2Data),"\\.") # the period must be escaped
splitNames[[6]]
splitNames[[6]][1]
firstElement <- function(x){x[1]}
names(camera2Data) <- sapply(splitNames,firstElement) # keep only the part before the first period
names(camera2Data)

  • A similar effect to the strsplit() + sapply() steps above: replace "." with ""
  • sub()
  • gsub(): replaces all matches
camera2Data <- read.csv("./data/camera2.csv") # reload the data
names(camera2Data)
sub("\\.", "", names(camera2Data)) # the period must be escaped
mytest <- "this_is_a_test"
sub("_", "", mytest) # no escaping needed
gsub("_", "", mytest)

  • Functions in the base package:
  • grep()
  • grepl(): returns a logical vector
  • See Wikipedia for more regular-expression syntax
grep("Alameda", camera2Data$intersection) # 4 5 36
table(grepl("Alameda", camera2Data$intersection))
camera2Data_noAlameda <- camera2Data[!grepl("Alameda", camera2Data$intersection),]
grep("Alameda", camera2Data$intersection, value = T) # 直接返回值
length(grep("JeffStreet", camera2Data$intersection)) # 检查是否出现
length(grep('^[12]/2/2007', '22/2/2007', value = T))
length(grep('^[12]/2/2007', '2/2/2007', value = T))

  • Functions in the base package:
  • nchar()
  • substr()
  • paste()
  • paste0()
  • Functions in the stringr package:
  • str_trim()
library(stringr)
nchar("Jeffery Leek")
substr("Jeffery Leek",1,7)
paste("Jeffery","Leek") #自动加上了空格
paste0("Jeffery","Leek") #不加空格
str_trim("Jeff ") #去掉空格

Date

  • Functions in the base package:
  • date()
  • Sys.Date()
d1 <- date()
class(d1)
d2 <- Sys.Date()
class(d2)
format(d2, "%a %b %d")
julian(d2)

  • Functions in the lubridate package:
  • ymd()
  • mdy()
  • ymd_hms()
  • dmy(): returns a Date
  • wday(): returns the day of the week
library(lubridate)
ymd("20140108")
mdy("08/04/2013")
ymd_hms("2011-08-03 10:15:03")
ymd_hms("2011-08-03 10:15:03", tz = "Pacific/Auckland")
x = dmy(c("1jan2013","2jan2013","31mar2013","30jul2013"))
x[1]
wday(x[1])
wday(x[1],label=T)


Some Powerful Tricks

%>%

  • %>%: the pipe operator; it passes the left-hand value as the first argument of the right-hand call
  • The two lines below are equivalent:
library(dplyr) # provides %>%, mutate(), and n()
anscombe_tidy <- anscombe %>% mutate(observation = seq_len(n()))
anscombe_tidy <- mutate(anscombe, observation = seq_len(n()))
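Pipes pay off when several steps are chained. A small sketch using the dplyr verbs filter() and summarise() (any pipe-aware functions would do):

library(dplyr)
mtcars %>%
  filter(cyl == 4) %>%            # keep the four-cylinder cars
  summarise(mean_mpg = mean(mpg)) # average their fuel economy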

parse_number()

  • Functions in the readr package:
  • parse_number(): extracts the number from a string
library(readr)
parse_number("$1000") # 1000
parse_number("1,234,567.78") # 1234568
parse_number("class5") # 5

print(10)

k <- {10; 5; 5; 88; 7}  # a braced block evaluates to its last expression, so k is 7
k
k <- {print(10); 5}
k
k <- {print(10)}
k

data.table package

  • data.table inherits from data.frame
library(data.table)
DT <- data.table(x = rnorm(9), r = rep(c("a","b","c"),each=3), z = rnorm(9))
head(DT, 3)
DT[c(2,3)] # rows 2 and 3
DT[1:2,c(2,3)] # columns 2 and 3 of rows 1 and 2
DT[,list(mean(x),sum(z))] # the mean of x and the sum of z
DT[,table(r)] # frequency table of r
DT[,w:={tmp <- (x+z); log2(tmp+5)}] # add a column w
DT[,a:=x>0] # add a logical column a
DT[,b:=mean(x+w),by=a] # add a column b, averaged within groups of a
DT <- data.table(x = sample(letters[1:3], 1E5, TRUE))
DT[, .N, by=x]
DT <- data.table(x=rep(c("a","b","c"),each=100),y=rnorm(300))
setkey(DT, x)
head(DT['a'])
DT1 <- data.table(x=c('a','a','b','dt1'),y=1:4)
DT2 <- data.table(x=c('a','b','dt2'),z=5:7)
setkey(DT1,x);setkey(DT2,x) # keys need not be unique
DT1
DT2
merge(DT1,DT2) # merge on the shared key x

  • fread(): reads files very quickly
library(data.table)
big_df <- data.frame(x=rnorm(1E7),y=rnorm(1E7))
file <- tempfile()
write.table(big_df,file=file,row.names=FALSE,col.names=TRUE,sep="\t",quote=FALSE)
system.time(fread(file))
system.time(read.table(file,header= TRUE,sep="\t"))

lapply, sapply, apply, tapply, mapply

  • lapply: loop over a list and evaluate a function on each element
  • sapply: same as lapply but try to simplify the result
  • apply: apply a function over the margins of an array
  • tapply: apply a function over subsets of a vector
  • mapply: multivariate version of lapply
x <- 1:4
lapply(x, runif)
x <- 1:4
sapply(x, runif)
x <- matrix(rnorm(200), 20, 10)
apply(x, 2, mean)
x <- c(rnorm(10), runif(10), rnorm(10,1))
f <- gl(3, 10)
tapply(x, f, mean)
mapply(rep, 1:4, 1:4)

now()

now("America/New_York")