@timriffe
Created March 29, 2025 09:44
A third script to see how many citations the citations of ego have, this time with OpenAlex data.
# note: the citation lookup needs an openalex paper id,
# but you probably only have the doi. This function does the translation.
# The API returns a full url, where the id is the last part.
get_openalex_id_from_doi <- function(doi, email) {
  base_url    <- "https://api.openalex.org/works/doi:"
  doi_encoded <- URLencode(doi, reserved = TRUE)
  url         <- paste0(base_url, doi_encoded, "?mailto=", email)
  res         <- httr::GET(url, httr::user_agent(email))
  httr::stop_for_status(res)
  openalex_id <- jsonlite::fromJSON(httr::content(res,
                                                  as = "text",
                                                  encoding = "UTF-8"))$id
  # strip the full url down to just the id
  id_stripped <- gsub("^.*/", "", openalex_id)
  return(id_stripped)
}
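
# Quick sanity check (a sketch; "[email protected]" is a placeholder, use your own
# address, and the DOI is the CoverageDB paper tested at the bottom of this
# script). The return value is a bare work id, i.e. a string starting with "W".
# get_openalex_id_from_doi("10.1093/ije/dyab027", "[email protected]")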
# This takes a doi, and you need to pass an email address too; OpenAlex uses it
# to identify polite API users. Use my email if you don't feel good about that.
get_second_order_citations_openalex <- function(doi, email, per_page = 200, max_pages = 50) {
  # 1. convert the doi to an openalex id
  openalex_id <- get_openalex_id_from_doi(doi = doi, email = email)
  # 2. setup
  base_url    <- "https://api.openalex.org/works"
  cursor      <- "*"
  all_results <- list()
  page_count  <- 0
  # 3. begin loop, in case there are more results than fit on one page!
  repeat {
    # 4. compose url, moving on to the next page of results if necessary (cursor)
    url <- paste0(base_url,
                  "?filter=cites:openalex:", openalex_id,
                  "&per-page=", per_page,
                  "&cursor=", URLencode(cursor, reserved = TRUE),
                  "&mailto=", email)
    # 5. make the request
    result <- httr::GET(url, httr::user_agent(email))
    httr::stop_for_status(result)
    # 6. parse the text blob; this has lots of fields, of which we only need a couple
    content <- jsonlite::fromJSON(httr::content(result, as = "text", encoding = "UTF-8"))
    if (length(content$results) == 0) break
    # 7. keep just the two variables we need
    out <- tibble::tibble(
      doi   = content$results$doi,
      cites = content$results$cited_by_count
    )
    # 8. increment the page count, also used for indexing the output list
    page_count <- page_count + 1
    all_results[[page_count]] <- out
    cursor <- content$meta$next_cursor
    if (is.null(cursor) || page_count >= max_pages) break
    # pause briefly between pages so as not to abuse the api
    Sys.sleep(1)
  }
  final_df <- dplyr::bind_rows(all_results)
  return(final_df)
}
# coveragedb paper (fast)
# test <- get_second_order_citations_openalex("10.1093/ije/dyab027", "[email protected]")
# nrow(test)       # first-order citations
# sum(test$cites)  # second-order citations
# Lee-Carter paper (takes > 1 min)
# test <- get_second_order_citations_openalex("10.1080/01621459.1992.10475265", "[email protected]")
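
# Optional follow-up (a sketch, same assumptions as the tests above): once
# `test` exists, rank ego's citers by their own citation counts to see which
# citing papers drive the second-order total.
# test |>
#   dplyr::arrange(dplyr::desc(cites)) |>
#   head(10)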