katie leap and stephen lauer
7/19/16
All slides and workshop materials can be found at: https://github.com/gridclub/r-summer-workshops-2016
read.csv() will read in CSV files
library(Hmisc)
demographics <- sasxport.get("http://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/DEMO_H.XPT")
taste.smell <- sasxport.get("http://wwwn.cdc.gov/Nchs/Nhanes/2013-2014/CSX_H.XPT")
dplyr works like magic
dplyr verbs:
select()
rename()
filter()
mutate()
group_by()
summarize()
arrange()
magrittr package
library(magrittr) # or dplyr
## confusing ##
mean(seq_len(sum(seq(1:10))))
[1] 28
## clear ##
seq(1:10) %>%
sum %>%
seq_len %>%
mean
[1] 28
library(plyr)
library(dplyr)
demo <- demographics %>%
select(id=seqn,gender=riagendr,age=ridageyr)
rename()
contains()
ends_with()
everything()
matches()
num_range("x", 1:5): columns named x1, x2, x3, x4, x5.
one_of()
starts_with()
summary() provides summary statistics
dplyr
is, data management relies on your brain
mutate() allows us to add new columns
mutate_each(): multiple columns
demo <- demographics %>%
select(id=seqn,gender=riagendr,age=ridageyr) %>%
mutate(gender=revalue(as.factor(gender), c("1"="male","2"="female")))
summary(demo)
id gender age
Min. :73557 male :5003 Min. : 0.00
1st Qu.:76100 female:5172 1st Qu.:10.00
Median :78644 Median :26.00
Mean :78644 Mean :31.48
3rd Qu.:81188 3rd Qu.:52.00
Max. :83731 Max. :80.00
filter() allows you to pick rows that have a specific value inside a column
women <- demo %>%
filter(gender=="female")
filter()
rows
old.farts <- demo %>%
filter(age>75)
summarize()
mean()
demo %>%
summarize(mean.age = mean(age))
mean.age
1 31.48413
summarize_each(): multiple columns
group <- demo %>%
group_by(gender)
head(group)
Source: local data frame [6 x 3]
Groups: gender [2]
id gender age
(int) (fctr) (int)
1 73557 male 69
2 73558 male 54
3 73559 male 72
4 73560 male 9
5 73561 female 73
6 73562 male 56
# Sort demo by gender and keep the result. The result is stored under the name
# `arrange`, the same name as the dplyr verb called on the next line; later
# calls such as arrange(...) still resolve to the function because R skips
# non-function objects when looking up a name in call position.
arrange <- demo %>%
arrange(gender)
# Show the first rows of the sorted data frame.
head(arrange)
id gender age
1 73557 male 69
2 73558 male 54
3 73559 male 72
4 73560 male 9
5 73562 male 56
6 73563 male 0
demo %>%
group_by(gender) %>%
summarize(mean.age = mean(age))
Source: local data frame [2 x 2]
gender mean.age
(fctr) (dbl)
1 male 30.69159
2 female 32.25077
dplyr specific:
first: First value of a vector.
last: Last value of a vector.
nth: Nth value of a vector.
n: # of values in a vector.
n_distinct: # of distinct values in a vector.
IQR
min, max
mean, median
var, sd
library(tidyr)
# Keep every row of demo and attach the matching columns of ts, matched on id.
# NOTE(review): ts is not created anywhere in the visible code (only
# taste.smell is loaded) — presumably a subset of taste.smell with seqn renamed
# to id; confirm. left_join() comes from dplyr, not tidyr.
full.dat <- left_join(demo, ts, by = "id")
by statement
by variables as well
by = c("id","age")
dplyr functions can be found on the data wrangling cheat sheet
XML
rvest
Scraping an HTML table from www.basketball-reference.com/teams/BOS/2016.html
library(XML)
site <- "http://www.basketball-reference.com/teams/BOS/2016.html"
celtics_2016 <- readHTMLTable(site)
celtics_players <- celtics_2016$per_game
Player Age G PTS
1 Marcus Smart 21 61 9.1
2 Amir Johnson 28 79 7.3
3 David Lee 32 30 7.1
4 Tyler Zeller 26 60 6.1
5 Jonas Jerebko 28 78 4.4
6 Isaiah Thomas 26 82 22.2
# Scrape the per-game table for every Celtics season from 1980 through 2013
# and stack the seasons into one data frame, tagging each row with its year.
years <- 1980:2013

# Each season is downloaded into its own list element and the list is bound
# once at the end; growing the data frame with bind_rows() inside a loop
# re-copies every previously collected row on each iteration.
all_celtics <- bind_rows(lapply(years, function(year) {
  site <- paste0(
    "http://www.basketball-reference.com/teams/BOS/", year, ".html"
  )
  one_year_players <- readHTMLTable(site)$per_game
  # Record which season these rows came from.
  one_year_players$Year <- year
  one_year_players
}))
all_celtics %>%
select(Player, Year, PTS) %>%
arrange(desc(as.numeric(PTS)))
Source: local data frame [544 x 3]
Player Year PTS
(chr) (int) (chr)
1 Larry Bird 1988 29.9
2 Larry Bird 1985 28.7
3 Larry Bird 1987 28.1
4 Paul Pierce 2006 26.8
5 Kevin McHale 1987 26.1
6 Paul Pierce 2002 26.1
7 Paul Pierce 2003 25.9
8 Larry Bird 1986 25.8
9 Paul Pierce 2001 25.3
10 Paul Pierce 2007 25.0
.. ... ... ...
# Per player, across all scraped seasons: total games played and points per
# game, weighting each season's average by its games played; highest PPG first.
all_celtics %>%
  mutate(games = as.numeric(G), points = as.numeric(PTS)) %>%
  group_by(Player) %>%
  summarize(Games = sum(games), PPG = weighted.mean(points, games)) %>%
  arrange(desc(PPG))
Source: local data frame [242 x 3]
Player Games PPG
(chr) (dbl) (dbl)
1 Larry Bird 897 24.30234
2 Paul Pierce 1102 21.81053
3 Antoine Walker 552 20.62083
4 Kevin McHale 971 17.84449
5 Dominique Wilkins 77 17.80000
6 Reggie Lewis 450 17.57533
7 Ray Allen 358 16.71844
8 Dino Radja 224 16.68437
9 Robert Parish 1106 16.48300
10 Ricky Davis 181 16.26022
.. ... ... ...
imdb <- read_html("http://www.imdb.com/search/title?count=100&keywords=robot&num_votes=3000,&title_type=feature&ref_=gnr_kw_ro")
descriptions <- imdb %>%
html_nodes(".outline") %>%
html_text()
descriptions[[1]]
[1] "After the re-emergence of the world's first mutant, world-destroyer Apocalypse, the X-Men must unite to defeat his extinction level plan."
rating <- imdb %>%
html_nodes(".value") %>%
html_text() %>%
as.numeric
head(rating)
[1] 7.4 8.2 8.2 6.6 8.6 7.5
# Year shown for each search result: pull the ".year_type" text, strip both
# parentheses in a single pass with a character class, then convert to numeric.
year <- imdb %>%
  html_nodes(".year_type") %>%
  html_text() %>%
  gsub(pattern = "[()]", replacement = "") %>%
  as.numeric
head(year)
[1] 2016 2016 2015 2015 2014 2015
# Raw text of every node with class "title" on the search page.
title <- imdb %>%
html_nodes(".title") %>%
html_text()
# Split each raw string on the whitespace run and keep the even-numbered pieces.
# NOTE(review): the index sequence stops at length(title), not at the length of
# the unlisted split result — if every string splits into two pieces this keeps
# only half of them; confirm the result lines up with year and rating.
title2 <- unlist(strsplit(title,"\n \n\n\n\n "))[seq(from=2,to=length(title),by=2)]
# Split again at the newline before "(" and keep the odd-numbered pieces,
# presumably the bare movie title that precedes the year — confirm.
title3 <- unlist(strsplit(title2,"\n \\("))[seq(from=1,to=length(title),by=2)]
head(title3)
[1] "X-Men: Apocalypse" "Captain America: Civil War"
[3] "Star Wars: The Force Awakens" "Terminator Genisys"
[5] "Interstellar" "Avengers: Age of Ultron"
# Assemble one row per movie. data.frame() silently recycles a shorter vector
# when the lengths divide evenly, which would pair titles with the wrong
# year/rating, so insist that the three scraped vectors line up first.
stopifnot(
  length(title3) == length(year),
  length(title3) == length(rating)
)
robot.movies <- data.frame(title = title3, year, rating)
# Four highest-rated movies.
head(arrange(robot.movies, desc(rating)), 4)
title year rating
1 Star Wars: Episode V - The Empire Strikes Back 1980 8.8
2 The Matrix 1999 8.7
3 Star Wars: Episode IV - A New Hope 1977 8.7
4 Interstellar 2014 8.6
Use grep to find which descriptions contain robot words (“robot”, “android”, “AI”, etc.) to filter out fake robot movies (X-Men: Apocalypse, really?)