A third script to see how many citations the citations of ego have, this time with OpenAlex data.
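# Packages used below via :: namespacing, so they only need to be installed,
# not attached; install once if needed:
# install.packages(c("httr", "jsonlite", "tibble", "dplyr"))
# (URLencode comes from base R's utils package)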
# Note: the citation lookup needs an OpenAlex work id, but you probably only
# have the DOI. This function does the translation. The API returns a full
# URL; the id is its last path component.
get_openalex_id_from_doi <- function(doi, email) {
  base_url    <- "https://api.openalex.org/works/doi:"
  doi_encoded <- URLencode(doi, reserved = TRUE)
  url         <- paste0(base_url, doi_encoded, "?mailto=", email)
  res         <- httr::GET(url, httr::user_agent(email))
  httr::stop_for_status(res)
  openalex_id <- jsonlite::fromJSON(httr::content(res,
                                                  as = "text",
                                                  encoding = "UTF-8"))$id
  # strip the URL prefix, keeping just the id
  id_stripped <- gsub("^.*/", "", openalex_id)
  return(id_stripped)
}
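# Quick sanity check (a sketch; assumes network access, and reuses the
# COVerAGE-DB DOI from the examples at the bottom of this script):
# get_openalex_id_from_doi("10.1093/ije/dyab027", email = "[email protected]")
# # should return a W-prefixed OpenAlex work id, e.g. "W..."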
# This takes a DOI, and you need to pass an email address too. Use my email
# if you'd rather not give yours.
get_second_order_citations_openalex <- function(doi, email, per_page = 200, max_pages = 50) {
  # 1. convert the DOI to an OpenAlex work id
  openalex_id <- get_openalex_id_from_doi(doi = doi, email = email)
  # 2. setup
  base_url    <- "https://api.openalex.org/works"
  cursor      <- "*"
  all_results <- list()
  page_count  <- 0
  # 3. begin loop in case there are more results than fit on a page!
  repeat {
    # 4. compose url, moving up to the next page of results if necessary (cursor)
    url <- paste0(base_url,
                  "?filter=cites:openalex:", openalex_id,
                  "&per-page=", per_page,
                  "&cursor=", URLencode(cursor, reserved = TRUE),
                  "&mailto=", email)
    # 5. make the request
    result <- httr::GET(url, httr::user_agent(email))
    httr::stop_for_status(result)
    # 6. parse the JSON text blob; this has lots of metadata we don't need
    content <- jsonlite::fromJSON(httr::content(result, as = "text", encoding = "UTF-8"))
    if (length(content$results) == 0) break
    # 7. keep just the two variables we need
    out <- tibble::tibble(
      doi   = content$results$doi,
      cites = content$results$cited_by_count
    )
    # 8. increment the page count, also used for indexing the output list
    page_count <- page_count + 1
    all_results[[page_count]] <- out
    cursor <- content$meta$next_cursor
    if (is.null(cursor) || page_count >= max_pages) break
    # pause briefly so as not to abuse the API
    Sys.sleep(1)
  }
  final_df <- dplyr::bind_rows(all_results)
  return(final_df)
}
# COVerAGE-DB paper (fast)
# test <- get_second_order_citations_openalex("10.1093/ije/dyab027", "[email protected]")
# nrow(test)       # first-order cites
# sum(test$cites)  # second-order cites
# Lee-Carter paper (> 1 min)
# test <- get_second_order_citations_openalex("10.1080/01621459.1992.10475265", "[email protected]")
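# Optional follow-up sketch (assumes the test object above was created):
# the doi column can be NA for citing works without a registered DOI.
# sum(is.na(test$doi))          # how many citing works lack a DOI
# sum(test$cites, na.rm = TRUE) # second-order total, robust to any NAs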