Last active
September 21, 2025 18:26
-
-
Save janxkoci/f6538194706103447b9826b653e6d7db to your computer and use it in GitHub Desktop.
Getting BibTeX bibliography from PubMed with R, rentrez, and glue
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "id": "1ef7a411-a3b4-49e0-a62f-8bd5397cc13a", | |
| "metadata": {}, | |
| "source": [ | |
| "# Getting BibTeX bibliography from PubMed with R, rentrez, and glue" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "62749280-ff0d-4c09-9de7-879621f78a99", | |
| "metadata": { | |
| "scrolled": true | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "── \u001b[1mAttaching core tidyverse packages\u001b[22m ──────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──\n", | |
| "\u001b[32m✔\u001b[39m \u001b[34mdplyr \u001b[39m 1.1.4 \u001b[32m✔\u001b[39m \u001b[34mreadr \u001b[39m 2.1.5\n", | |
| "\u001b[32m✔\u001b[39m \u001b[34mforcats \u001b[39m 1.0.0 \u001b[32m✔\u001b[39m \u001b[34mstringr \u001b[39m 1.5.1\n", | |
| "\u001b[32m✔\u001b[39m \u001b[34mggplot2 \u001b[39m 3.5.2 \u001b[32m✔\u001b[39m \u001b[34mtibble \u001b[39m 3.3.0\n", | |
| "\u001b[32m✔\u001b[39m \u001b[34mlubridate\u001b[39m 1.9.4 \u001b[32m✔\u001b[39m \u001b[34mtidyr \u001b[39m 1.3.1\n", | |
| "\u001b[32m✔\u001b[39m \u001b[34mpurrr \u001b[39m 1.1.0 \n", | |
| "── \u001b[1mConflicts\u001b[22m ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──\n", | |
| "\u001b[31m✖\u001b[39m \u001b[34mdplyr\u001b[39m::\u001b[32mfilter()\u001b[39m masks \u001b[34mstats\u001b[39m::filter()\n", | |
| "\u001b[31m✖\u001b[39m \u001b[34mdplyr\u001b[39m::\u001b[32mlag()\u001b[39m masks \u001b[34mstats\u001b[39m::lag()\n", | |
| "\u001b[36mℹ\u001b[39m Use the conflicted package (\u001b[3m\u001b[34m<http://conflicted.r-lib.org/>\u001b[39m\u001b[23m) to force all conflicts to become errors\n", | |
| "\n", | |
| "Attaching package: ‘magrittr’\n", | |
| "\n", | |
| "\n", | |
| "The following object is masked from ‘package:purrr’:\n", | |
| "\n", | |
| " set_names\n", | |
| "\n", | |
| "\n", | |
| "The following object is masked from ‘package:tidyr’:\n", | |
| "\n", | |
| " extract\n", | |
| "\n", | |
| "\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "library(rentrez)\n", | |
| "library(tidyverse)\n", | |
| "library(magrittr)\n", | |
| "library(glue)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "5f8faf1e-4475-4e40-a729-1d9f3d7d3392", | |
| "metadata": {}, | |
| "source": [ | |
| "Search term:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "8d602f7d-4808-4c9b-b50e-77ea9cc33060", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "## search term\n", | |
| "search_term = \"\\\"flegontov p\\\"[au] OR \\\"flegontov pn\\\"[au]\"" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "98583ca8-95b0-4dda-ab6b-304ca70353db", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "## number of publications\n", | |
| "npubs = entrez_search(db = \"pubmed\", term = search_term, retmax = 100)$count\n", | |
| "## get ncbi uids\n", | |
| "ncbi_ids = entrez_search(db = \"pubmed\", term = search_term, retmax = npubs)$ids #%>% str\n", | |
| "## get all publications as json, save into list object\n", | |
| "esummary = entrez_summary(db = \"pubmed\", id = ncbi_ids, retmode = \"json\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "72ba3531-bf00-462d-a7c8-d21bec536ac2", | |
| "metadata": {}, | |
| "source": [ | |
| "See names of items in the resulting list:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "id": "a46c42da-12b4-406b-98cd-19f355acf483", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| " [1] \"uid\" \"pubdate\" \"epubdate\" \n", | |
| " [4] \"source\" \"authors\" \"lastauthor\" \n", | |
| " [7] \"title\" \"sorttitle\" \"volume\" \n", | |
| "[10] \"issue\" \"pages\" \"lang\" \n", | |
| "[13] \"nlmuniqueid\" \"issn\" \"essn\" \n", | |
| "[16] \"pubtype\" \"recordstatus\" \"pubstatus\" \n", | |
| "[19] \"articleids\" \"history\" \"references\" \n", | |
| "[22] \"attributes\" \"pmcrefcount\" \"fulljournalname\" \n", | |
| "[25] \"elocationid\" \"doctype\" \"srccontriblist\" \n", | |
| "[28] \"booktitle\" \"medium\" \"edition\" \n", | |
| "[31] \"publisherlocation\" \"publishername\" \"srcdate\" \n", | |
| "[34] \"reportnumber\" \"availablefromurl\" \"locationlabel\" \n", | |
| "[37] \"doccontriblist\" \"docdate\" \"bookname\" \n", | |
| "[40] \"chapter\" \"sortpubdate\" \"sortfirstauthor\" \n", | |
| "[43] \"vernaculartitle\" \n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "esummary[[1]] %>% names %>% print" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "6e269e2a-e722-47d9-83c7-2c49f12ecfca", | |
| "metadata": {}, | |
| "source": [ | |
| "## pick items\n", | |
| "- authors (needs `paste(collapse)`)\n", | |
| "- pubdate or sortpubdate (extract year)\n", | |
| "- title\n", | |
| "- fulljournalname (needs `gsub(\":.*\", \"\")`)\n", | |
| " - source may be better\n", | |
| "- issue\n", | |
| "- volume\n", | |
| "- pages\n", | |
| "- articleids (needs filter to doi)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "id": "647580b1-82d9-441b-9830-c08d34eb3fd5", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Rows: 57\n", | |
| "Columns: 13\n", | |
| "$ uid \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"40604287\"\u001b[90m, \u001b[39m\"40454862\"\u001b[90m, \u001b[39m\"40169722\"\u001b[90m, \u001b[39m\"39979458\"\u001b[90m, \u001b[39m\"39910…\n", | |
| "$ title \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"Ancient DNA reveals the prehistory of the Uralic and …\n", | |
| "$ fulljournalname \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"Nature\"\u001b[90m, \u001b[39m\"Molecular ecology\"\u001b[90m, \u001b[39m\"Genetics\"\u001b[90m, \u001b[39m\"Nature\"\u001b[90m, \u001b[39m\"…\n", | |
| "$ source \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"Nature\"\u001b[90m, \u001b[39m\"Mol Ecol\"\u001b[90m, \u001b[39m\"Genetics\"\u001b[90m, \u001b[39m\"Nature\"\u001b[90m, \u001b[39m\"Nature\"\u001b[90m, \u001b[39m…\n", | |
| "$ pubdate \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"2025 Aug\"\u001b[90m, \u001b[39m\"2025 Jun 2\"\u001b[90m, \u001b[39m\"2025 May 8\"\u001b[90m, \u001b[39m\"2025 Mar\"\u001b[90m, \u001b[39m\"2…\n", | |
| "$ sortpubdate \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"2025/08/01 00:00\"\u001b[90m, \u001b[39m\"2025/06/02 00:00\"\u001b[90m, \u001b[39m\"2025/05/08 00…\n", | |
| "$ issue \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"8075\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"1\"\u001b[90m, \u001b[39m\"8054\"\u001b[90m, \u001b[39m\"8053\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"1\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m…\n", | |
| "$ volume \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"644\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"230\"\u001b[90m, \u001b[39m\"639\"\u001b[90m, \u001b[39m\"639\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"228\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"\"\u001b[90m,\u001b[39m…\n", | |
| "$ pages \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"122--132\"\u001b[90m, \u001b[39m\"e17796\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"E14\"\u001b[90m, \u001b[39m\"132--142\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"…\n", | |
| "$ journalname \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"Nature\"\u001b[90m, \u001b[39m\"Molecular ecology\"\u001b[90m, \u001b[39m\"Genetics\"\u001b[90m, \u001b[39m\"Nature\"\u001b[90m, \u001b[39m\"…\n", | |
| "$ year \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"2025\"\u001b[90m, \u001b[39m\"2025\"\u001b[90m, \u001b[39m\"2025\"\u001b[90m, \u001b[39m\"2025\"\u001b[90m, \u001b[39m\"2025\"\u001b[90m, \u001b[39m\"2024\"\u001b[90m, \u001b[39m\"2024\"…\n", | |
| "$ doi \u001b[3m\u001b[90m<named list>\u001b[39m\u001b[23m \"10.1038/s41586-025-09189-3\"\u001b[90m, \u001b[39m\"10.1111/mec.1779…\n", | |
| "$ authors \u001b[3m\u001b[90m<named list>\u001b[39m\u001b[23m \"Zeng TC, Vyazov LA, Kim A, Flegontov P, Sirak …\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "## extract items into table\n", | |
| "papers <- esummary %>% \n", | |
| " lapply(extract, c(\"uid\", \"title\", \"fulljournalname\", \"source\",\"pubdate\", \"sortpubdate\", \"issue\", \"volume\", \"pages\")) %>% \n", | |
| " bind_rows %>% \n", | |
| " mutate(\n", | |
| " journalname = str_remove(fulljournalname, \":.*\"),\n", | |
| " year = str_extract(sortpubdate, \"\\\\d{4}\"),\n", | |
| " pages = str_replace(pages, \"-\",\"--\")\n", | |
| " )\n", | |
| "\n", | |
| "papers$doi <- esummary %>% \n", | |
| " lapply(function(x) use_series(x, articleids) %>% subset(idtype == \"doi\", select = value) %>% pluck(1))\n", | |
| "\n", | |
| "papers$authors <- esummary %>% \n", | |
| " lapply(function(x) use_series(x, authors) %>% pull(name) %>% paste(collapse = \", \"))\n", | |
| "\n", | |
| "papers %>% glimpse" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "88e7a943-77a5-4e55-8ead-0a8ac3189b16", | |
| "metadata": {}, | |
| "source": [ | |
| "## format as bibtex using glue" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "id": "592073ca-d76e-4513-a3c0-f9c3cf11bd45", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "bibtex <- papers %>% \n", | |
| " glue_data(\"\n", | |
| "@article{{{uid},\n", | |
| " author={{{authors}}},\n", | |
| " year={{{year}}},\n", | |
| " title={{{title}}},\n", | |
| " journal={{{source}}},\n", | |
| " number={{{issue}}},\n", | |
| " volume={{{volume}}},\n", | |
| " pages={{{pages}}},\n", | |
| " doi={{{doi}}}\n", | |
| "}}\n", | |
| "\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "id": "519be050-f503-4882-87e0-4474d33a5c7e", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "## write the bibtex string into a file\n", | |
| "writeLines(bibtex, \"papers.bib\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "id": "a437f691-9938-48b1-be57-ca77d3788164", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "## remove empty fields\n", | |
| "system(\"sed -i '/{}/d' papers.bib\")" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "R", | |
| "language": "R", | |
| "name": "ir" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": "r", | |
| "file_extension": ".r", | |
| "mimetype": "text/x-r-source", | |
| "name": "R", | |
| "pygments_lexer": "r", | |
| "version": "4.4.3" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
## Getting BibTeX bibliography from PubMed with R, rentrez, and glue
##
## Searches PubMed for `search_term`, downloads the document summaries of all
## hits, reshapes them into one row per paper, and writes the result as BibTeX
## entries (keyed by PubMed uid) into "papers.bib" in the working directory.

library(rentrez)
library(tidyverse)
library(magrittr)
library(glue)

## search term
search_term <- "\"flegontov p\"[au] OR \"flegontov pn\"[au]"

## number of publications (retmax = 0: only the hit count is needed here)
npubs <- entrez_search(db = "pubmed", term = search_term, retmax = 0)$count
if (npubs == 0) {
  stop("No PubMed records match: ", search_term, call. = FALSE)
}

## get ncbi uids
ncbi_ids <- entrez_search(
  db = "pubmed", term = search_term, retmax = npubs
)$ids

## get all publications as json, save into list object
## always_return_list = TRUE keeps the result a list of records even when
## there is exactly one hit, so the lapply()/vapply() calls below stay valid.
esummary <- entrez_summary(
  db = "pubmed", id = ncbi_ids, retmode = "json", always_return_list = TRUE
)

## extract items into table
summary_fields <- c(
  "uid", "title", "fulljournalname", "source", "pubdate",
  "sortpubdate", "issue", "volume", "pages"
)

papers <- esummary %>%
  lapply(extract, summary_fields) %>%
  bind_rows() %>%
  mutate(
    journalname = str_remove(fulljournalname, ":.*"),
    year = str_extract(sortpubdate, "\\d{4}"),
    # BibTeX page ranges use an en dash, written "--"
    pages = str_replace(pages, "-", "--")
  )

# Return the DOI of one esummary record, or "" when the record has none.
# Always yields exactly one string, so the column is a plain character vector
# and a missing DOI renders as an empty field instead of "character(0)".
get_doi <- function(record) {
  ids <- record$articleids
  doi <- ids$value[which(ids$idtype == "doi")]
  if (length(doi) == 0) "" else doi[[1]]
}

# Return all author names of one esummary record as a single BibTeX author
# string. BibTeX separates individual names with " and "; a comma-separated
# list is parsed as one malformed name.
# NOTE(review): names stay in PubMed's "Surname Initials" form; convert to
# "Surname, Initials" if the bibliography style needs to tell them apart.
get_authors <- function(record) {
  paste(record$authors$name, collapse = " and ")
}

papers$doi <- vapply(esummary, get_doi, character(1), USE.NAMES = FALSE)
papers$authors <- vapply(
  esummary, get_authors, character(1),
  USE.NAMES = FALSE
)

# papers %>% glimpse()

## format as bibtex using glue ("{{" / "}}" are literal braces)
bibtex <- papers %>%
  glue_data("
@article{{{uid},
  author={{{authors}}},
  year={{{year}}},
  title={{{title}}},
  journal={{{source}}},
  number={{{issue}}},
  volume={{{volume}}},
  pages={{{pages}}},
  doi={{{doi}}}
}}
")

## remove empty fields
## Done in R rather than with `sed -i`, whose syntax differs between GNU and
## BSD/macOS and which is not available on Windows. Only lines that consist of
## an empty `field={}` are dropped.
bibtex_lines <- unlist(strsplit(as.character(bibtex), "\n", fixed = TRUE))
is_empty_field <- grepl("^\\s*\\w+=\\{\\},?\\s*$", bibtex_lines)

## write the bibtex string into a file
writeLines(bibtex_lines[!is_empty_field], "papers.bib")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| name: pubmed2bibtex | |
| channels: | |
| - conda-forge | |
| dependencies: | |
| - r-base=4 | |
| - r-tidyverse # includes magrittr & glue | |
| - r-rentrez | |
| ## optional: uncomment an IDE (jupyterlab also needs r-irkernel to run R) | |
| # - rstudio-desktop | |
| # - jupyterlab | |
| # - r-irkernel |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment