Scrape Google Scholar search
#' Scrape Google Scholar search
#'
#' @description Scrape pages of Google Scholar search results for a given term.
#'
#' @param term String giving the Google Scholar search term. May include Boolean search operators (e.g., "AND"), word search operators (e.g., "intext:") and symbol search operators (e.g., quotation marks " ").
#' @param pages Numeric vector of integers giving the Google Scholar result-page numbers to scrape.
#' @param crawl_delay Number of seconds for the base crawl delay (default 1). Delaying requests helps avoid being blocked with HTTP error 429 or served a CAPTCHA.
#' @param ... Any additional httr config to use throughout the session. Using a proxy is advised so your IP is not flagged and blocked (see the last example below).
#'
#' @return Data frame with the scraped results of the Google Scholar search for term.
#' @export
#'
#' @examples
#' # Run a single query
#' test1 <- scrape_gs(term = 'intext:"psychotherapy" AND "PTSD"', pages = 1:5) # use Boolean operators and exact phrasing
#' test2 <- scrape_gs(term = 'intitle:"psychotherapy" AND "PTSD"', pages = 1:5)
#' all.equal(test1, test2) # different results based on intext/intitle search operators
#'
#' # Run multiple queries using a list
#' queries <- list('"psychotherapy" AND "PTSD"', '"psychotherapy" AND "trauma"')
#' pages <- list(1:2, 2:4)
#' test_list1 <- Map(list, term = queries, pages = pages)
#' # Equivalent definition for multiple queries using a list
#' first <- list(term = '"psychotherapy" AND "PTSD"', pages = 1:2)
#' second <- list(term = '"psychotherapy" AND "trauma"', pages = 2:4)
#' test_list2 <- list(first, second)
#' identical(test_list1, test_list2) # leads to the same structure
#'
#' test3 <- do.call(rbind, lapply(test_list2, function(x) do.call(scrape_gs, x)))
#'
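#' # Pass httr configs through `...` (a sketch: the proxy URL, port and
#' # credentials below are placeholders, not a real endpoint)
#' ua <- httr::user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36")
#' proxy <- httr::use_proxy(url = "proxy.example.com", port = 8080, username = "user", password = "pass", auth = "basic")
#' test4 <- scrape_gs(term = '"psychotherapy" AND "PTSD"', pages = 1:2, crawl_delay = 2, ua, proxy)
#'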
#' @version "0.1"
#'
#' @author Claudiu C. Papasteri
#'
#' @license MIT License
scrape_gs <- function(term, pages, crawl_delay = 1, ...) {
  # All calls below are namespaced, so rvest (and httr, for configs passed
  # through `...`) only needs to be installed, not attached
  stopifnot(requireNamespace("rvest", quietly = TRUE))

  gs_url_base <- "https://scholar.google.com/scholar"

  # Define httr configs outside of the function and pass them in through `...`; e.g.:
  # useragent <- httr::user_agent("Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36") # spoof user agent
  # proxy <- httr::use_proxy(url = "proxy.com", port = 8080, username = "dave", password = "pass", auth = "basic")
  result_list <- list()
  i <- 1
  for (n_page in (pages - 1) * 10) { # GS page indexing starts at 0; there are 10 articles per page, see the "?start=" param
    gs_url <- paste0(gs_url_base, "?start=", n_page, "&q=", gsub("\\s+", "+", trimws(term)))

    t0 <- Sys.time()
    session <- rvest::session(gs_url, ...) # session$config$options$useragent
    t1 <- Sys.time()
    response_delay <- as.numeric(difftime(t1, t0, units = "secs")) # server response time, used for backing off

    wbpage <- rvest::read_html(session)

    # Avoid HTTP error 429 due to too many requests - use crawl delay & back off
    Sys.sleep(crawl_delay + 3 * response_delay + runif(n = 1, min = 0.5, max = 1))
    if ((i %% 10) == 0) { # take a longer break every 10 pages
      message("taking a break")
      Sys.sleep(10 + 10 * response_delay + runif(n = 1, min = 0, max = 1))
    }
    i <- i + 1
    # Raw data
    titles <- rvest::html_text(rvest::html_elements(wbpage, ".gs_rt"))
    authors_years <- rvest::html_text(rvest::html_elements(wbpage, ".gs_a"))
    part_abstracts <- rvest::html_text(rvest::html_elements(wbpage, ".gs_rs"))
    bottom_row_nodes <- rvest::html_elements(wbpage, ".gs_fl")
    bottom_row_nodes <- bottom_row_nodes[!grepl("gs_ggs gs_fl", as.character(bottom_row_nodes), fixed = TRUE)] # exclude nodes with this class, they are download links
    bottom_row <- rvest::html_text(bottom_row_nodes)

    # Processed data
    authors <- gsub("^(.*?)\\W+-\\W+.*", "\\1", authors_years, perl = TRUE)
    years <- gsub("^.*(\\d{4}).*", "\\1", authors_years, perl = TRUE)
    # Match the citation-count link text directly (assumes the English "Cited by n" label); rows without it become NA
    n_citations <- suppressWarnings(as.numeric(sub(".*Cited by (\\d+).*", "\\1", bottom_row)))
    # Store in list
    result_list <- append(
      result_list,
      list(
        list(
          page = n_page / 10 + 1,
          term = term,
          title = titles,
          authors = authors,
          year = years,
          n_citations = n_citations,
          abstract = part_abstracts
        )
      )
    )
  }

  # Return as data frame
  result_df <- do.call(rbind, lapply(result_list, as.data.frame))
  result_df
}
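
# Usage sketch (commented out so sourcing this file has no side effects;
# the output path is a placeholder):
# res <- scrape_gs(term = 'intext:"psychotherapy" AND "PTSD"', pages = 1:3, crawl_delay = 2)
# res <- res[!duplicated(res$title), ] # drop duplicate hits returned across pages
# utils::write.csv(res, "gs_results.csv", row.names = FALSE)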