@timriffe
Created March 29, 2025 09:44
A third script to see how many citations the citations of ego have, this time with OpenAlex data.
# note: the citation lookup needs an openalex paper id,
# but you probably only have the doi. This function does the translation.
# The API returns a full url, where the id is the last part.
get_openalex_id_from_doi <- function(doi, email) {
  base_url    <- "https://api.openalex.org/works/doi:"
  doi_encoded <- URLencode(doi, reserved = TRUE)
  url         <- paste0(base_url, doi_encoded, "?mailto=", email)
  res         <- httr::GET(url, httr::user_agent(email))
  httr::stop_for_status(res)
  openalex_id <- jsonlite::fromJSON(httr::content(res,
                                                  as = "text",
                                                  encoding = "UTF-8"))$id
  # strip the full url down to just the id
  id_stripped <- gsub("^.*/", "", openalex_id)
  return(id_stripped)
}
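
# Quick sanity check (a sketch; "[email protected]" is a placeholder, use your own
# address, and the DOI is the CoverageDB paper tested at the bottom of this
# script). The return value is a bare work id, i.e. a string starting with "W".
# get_openalex_id_from_doi("10.1093/ije/dyab027", "[email protected]")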
# This takes a doi, and you need to pass an email address too; OpenAlex uses it
# to identify polite API users. Use my email if you don't feel good about that.
get_second_order_citations_openalex <- function(doi, email, per_page = 200, max_pages = 50) {
  # 1. convert the doi to an openalex id
  openalex_id <- get_openalex_id_from_doi(doi = doi, email = email)
  # 2. setup
  base_url    <- "https://api.openalex.org/works"
  cursor      <- "*"
  all_results <- list()
  page_count  <- 0
  # 3. begin loop, in case there are more results than fit on one page!
  repeat {
    # 4. compose url, moving on to the next page of results if necessary (cursor)
    url <- paste0(base_url,
                  "?filter=cites:openalex:", openalex_id,
                  "&per-page=", per_page,
                  "&cursor=", URLencode(cursor, reserved = TRUE),
                  "&mailto=", email)
    # 5. make the request
    result <- httr::GET(url, httr::user_agent(email))
    httr::stop_for_status(result)
    # 6. parse the text blob; this has lots of fields, of which we only need a couple
    content <- jsonlite::fromJSON(httr::content(result, as = "text", encoding = "UTF-8"))
    if (length(content$results) == 0) break
    # 7. keep just the two variables we need
    out <- tibble::tibble(
      doi   = content$results$doi,
      cites = content$results$cited_by_count
    )
    # 8. increment the page count, also used for indexing the output list
    page_count <- page_count + 1
    all_results[[page_count]] <- out
    cursor <- content$meta$next_cursor
    if (is.null(cursor) || page_count >= max_pages) break
    # pause briefly between pages so as not to abuse the api
    Sys.sleep(1)
  }
  final_df <- dplyr::bind_rows(all_results)
  return(final_df)
}
# coveragedb paper (fast)
# test <- get_second_order_citations_openalex("10.1093/ije/dyab027", "[email protected]")
# nrow(test)       # first-order citations
# sum(test$cites)  # second-order citations
# Lee-Carter paper (takes > 1 min)
# test <- get_second_order_citations_openalex("10.1080/01621459.1992.10475265", "[email protected]")
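
# Optional follow-up (a sketch, same assumptions as the tests above): once
# `test` exists, rank ego's citers by their own citation counts to see which
# citing papers drive the second-order total.
# test |>
#   dplyr::arrange(dplyr::desc(cites)) |>
#   head(10)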