Last active
April 9, 2017 08:32
-
-
Save valentinitnelav/27abd6f9cf7128e4b7241d3ce7e74f3c to your computer and use it in GitHub Desktop.
Read <table> HTML tag with {rvest} using CSS selectors
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read <table> HTML tag with {rvest} using CSS selectors
# ====================================================
# Load library
library(rvest)
# =======================
# Read the web page [accessed 08-Apr-2017]
# =======================
link <- "http://www.theplantlist.org/1.1/statistics/"
# NOTE: it is good etiquette to store the page locally and not request it
# more often than necessary, so as not to overload their server.
# The page is therefore downloaded only once per R session (into the session's
# temporary directory); re-running the script parses the stored copy instead.
# Point `page.file` to a permanent path to keep the copy across sessions.
page.file <- file.path(tempdir(), "theplantlist_statistics.html")
if (!file.exists(page.file)) {
  status <- tryCatch(
    # mode = "wb" writes the bytes exactly as received
    download.file(url = link, destfile = page.file, mode = "wb", quiet = TRUE),
    error = function(e) {
      unlink(page.file) # never keep a partial copy that would be reused later
      stop(e)
    }
  )
  # download.file() reports success with a return code of 0
  if (status != 0) {
    unlink(page.file)
    stop("Could not download ", link, call. = FALSE)
  }
}
# Parse the stored copy; `link.scrap` is the document used by all steps below
link.scrap <- read_html(page.file)
# =======================
# Grab every table on the page with the generic "table" selector,
# then keep only the one of interest
# =======================
# -----------------------
# Without piping:
# -----------------------
# Step 1: collect all <table> nodes; step 2: parse the collected nodes
tbl.nodes <- html_nodes(link.scrap, "table")
tbls.lst <- html_table(x = tbl.nodes)
# The parsed result is a list of tables, so take its first element with [[1]]
# and discard that table's unwanted first column in the same step
my.tbl <- tbls.lst[[1]][, -1]
# give the third column a proper name
names(my.tbl)[3] <- "Total_prc"
my.tbl
##       Status   Total Total_prc
## 1   Accepted 350,699     33.0%
## 2    Synonym 470,624     44.2%
## 3   Unplaced     243      0.0%
## 4 Unassessed 242,469     22.8%
# Be aware: every column comes back as character!
str(my.tbl)
## 'data.frame': 4 obs. of 3 variables:
##  $ Status   : chr "Accepted" "Synonym" "Unplaced" "Unassessed"
##  $ Total    : chr "350,699" "470,624" "243" "242,469"
##  $ Total_prc: chr "33.0%" "44.2%" "0.0%" "22.8%"
# -----------------------
# With piping:
# -----------------------
# The same sequence of steps, chained from the parsed page onwards
my.tbl.2 <- link.scrap %>%
  html_nodes("table") %>%
  html_table() %>%
  .[[1]] %>% # keep the first table only
  .[, -1] # discard the unwanted first column
# give the third column a proper name
names(my.tbl.2)[3] <- "Total_prc"
my.tbl.2
# =======================
# Target one particular table on the page
# by passing that table's own CSS selector
# =======================
my.tbl.3 <- link.scrap %>%
  html_nodes("#columns > section > div:nth-child(5) > table") %>%
  html_table() %>%
  .[[1]] %>% # the result is still a list, so pull out its single element
  .[, -1] # discard the unwanted first column
# give the third column a proper name
names(my.tbl.3)[3] <- "Total_prc"
my.tbl.3
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment