janxkoci · September 21, 2025 18:26
diff --git a/pubmed2bibtex.ipynb b/pubmed2bibtex.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "id": "1ef7a411-a3b4-49e0-a62f-8bd5397cc13a",
   "metadata": {},
   "source": [
    "# Getting BibTeX bibliography from PubMed with R, rentrez, and glue"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "62749280-ff0d-4c09-9de7-879621f78a99",
   "metadata": {
    "scrolled": true
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "── \u001b[1mAttaching core tidyverse packages\u001b[22m ──────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──\n",
      "\u001b[32m✔\u001b[39m \u001b[34mdplyr    \u001b[39m 1.1.4     \u001b[32m✔\u001b[39m \u001b[34mreadr    \u001b[39m 2.1.5\n",
      "\u001b[32m✔\u001b[39m \u001b[34mforcats  \u001b[39m 1.0.0     \u001b[32m✔\u001b[39m \u001b[34mstringr  \u001b[39m 1.5.1\n",
      "\u001b[32m✔\u001b[39m \u001b[34mggplot2  \u001b[39m 3.5.2     \u001b[32m✔\u001b[39m \u001b[34mtibble   \u001b[39m 3.3.0\n",
      "\u001b[32m✔\u001b[39m \u001b[34mlubridate\u001b[39m 1.9.4     \u001b[32m✔\u001b[39m \u001b[34mtidyr    \u001b[39m 1.3.1\n",
      "\u001b[32m✔\u001b[39m \u001b[34mpurrr    \u001b[39m 1.1.0     \n",
      "── \u001b[1mConflicts\u001b[22m ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──\n",
      "\u001b[31m✖\u001b[39m \u001b[34mdplyr\u001b[39m::\u001b[32mfilter()\u001b[39m masks \u001b[34mstats\u001b[39m::filter()\n",
      "\u001b[31m✖\u001b[39m \u001b[34mdplyr\u001b[39m::\u001b[32mlag()\u001b[39m    masks \u001b[34mstats\u001b[39m::lag()\n",
      "\u001b[36mℹ\u001b[39m Use the conflicted package (\u001b[3m\u001b[34m<http://conflicted.r-lib.org/>\u001b[39m\u001b[23m) to force all conflicts to become errors\n",
      "\n",
      "Attaching package: ‘magrittr’\n",
      "\n",
      "\n",
      "The following object is masked from ‘package:purrr’:\n",
      "\n",
      "    set_names\n",
      "\n",
      "\n",
      "The following object is masked from ‘package:tidyr’:\n",
      "\n",
      "    extract\n",
      "\n",
      "\n"
     ]
    }
   ],
   "source": [
    "library(rentrez)\n",
    "library(tidyverse)\n",
    "library(magrittr)\n",
    "library(glue)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5f8faf1e-4475-4e40-a729-1d9f3d7d3392",
   "metadata": {},
   "source": [
    "Search term:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "8d602f7d-4808-4c9b-b50e-77ea9cc33060",
   "metadata": {},
   "outputs": [],
   "source": [
    "## search term\n",
    "search_term = \"\\\"flegontov p\\\"[au] OR \\\"flegontov pn\\\"[au]\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "98583ca8-95b0-4dda-ab6b-304ca70353db",
   "metadata": {},
   "outputs": [],
   "source": [
    "## number of publications\n",
    "npubs = entrez_search(db = \"pubmed\", term = search_term, retmax = 100)$count\n",
    "## get ncbi uids\n",
    "ncbi_ids = entrez_search(db = \"pubmed\", term = search_term, retmax = npubs)$ids #%>% str\n",
    "## get all publications as json, save into list object\n",
    "esummary = entrez_summary(db = \"pubmed\", id = ncbi_ids, retmode = \"json\")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "72ba3531-bf00-462d-a7c8-d21bec536ac2",
   "metadata": {},
   "source": [
    "See names of items in the resulting list:"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "a46c42da-12b4-406b-98cd-19f355acf483",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      " [1] \"uid\"               \"pubdate\"           \"epubdate\"         \n",
      " [4] \"source\"            \"authors\"           \"lastauthor\"       \n",
      " [7] \"title\"             \"sorttitle\"         \"volume\"           \n",
      "[10] \"issue\"             \"pages\"             \"lang\"             \n",
      "[13] \"nlmuniqueid\"       \"issn\"              \"essn\"             \n",
      "[16] \"pubtype\"           \"recordstatus\"      \"pubstatus\"        \n",
      "[19] \"articleids\"        \"history\"           \"references\"       \n",
      "[22] \"attributes\"        \"pmcrefcount\"       \"fulljournalname\"  \n",
      "[25] \"elocationid\"       \"doctype\"           \"srccontriblist\"   \n",
      "[28] \"booktitle\"         \"medium\"            \"edition\"          \n",
      "[31] \"publisherlocation\" \"publishername\"     \"srcdate\"          \n",
      "[34] \"reportnumber\"      \"availablefromurl\"  \"locationlabel\"    \n",
      "[37] \"doccontriblist\"    \"docdate\"           \"bookname\"         \n",
      "[40] \"chapter\"           \"sortpubdate\"       \"sortfirstauthor\"  \n",
      "[43] \"vernaculartitle\"  \n"
     ]
    }
   ],
   "source": [
    "esummary[[1]] %>% names %>% print"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "6e269e2a-e722-47d9-83c7-2c49f12ecfca",
   "metadata": {},
   "source": [
    "## pick items\n",
    "- authors (needs `paste(collapse)`)\n",
    "- pubdate or sortpubdate (extract year)\n",
    "- title\n",
    "- fulljournalname (needs `gsub(\":.*\", \"\")`)\n",
    "  - source may be better\n",
    "- issue\n",
    "- volume\n",
    "- pages\n",
    "- articleids (needs filter to doi)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "647580b1-82d9-441b-9830-c08d34eb3fd5",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Rows: 57\n",
      "Columns: 13\n",
      "$ uid             \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"40604287\"\u001b[90m, \u001b[39m\"40454862\"\u001b[90m, \u001b[39m\"40169722\"\u001b[90m, \u001b[39m\"39979458\"\u001b[90m, \u001b[39m\"39910…\n",
      "$ title           \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"Ancient DNA reveals the prehistory of the Uralic and …\n",
      "$ fulljournalname \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"Nature\"\u001b[90m, \u001b[39m\"Molecular ecology\"\u001b[90m, \u001b[39m\"Genetics\"\u001b[90m, \u001b[39m\"Nature\"\u001b[90m, \u001b[39m\"…\n",
      "$ source          \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"Nature\"\u001b[90m, \u001b[39m\"Mol Ecol\"\u001b[90m, \u001b[39m\"Genetics\"\u001b[90m, \u001b[39m\"Nature\"\u001b[90m, \u001b[39m\"Nature\"\u001b[90m, \u001b[39m…\n",
      "$ pubdate         \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"2025 Aug\"\u001b[90m, \u001b[39m\"2025 Jun 2\"\u001b[90m, \u001b[39m\"2025 May 8\"\u001b[90m, \u001b[39m\"2025 Mar\"\u001b[90m, \u001b[39m\"2…\n",
      "$ sortpubdate     \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"2025/08/01 00:00\"\u001b[90m, \u001b[39m\"2025/06/02 00:00\"\u001b[90m, \u001b[39m\"2025/05/08 00…\n",
      "$ issue           \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"8075\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"1\"\u001b[90m, \u001b[39m\"8054\"\u001b[90m, \u001b[39m\"8053\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"1\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m…\n",
      "$ volume          \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"644\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"230\"\u001b[90m, \u001b[39m\"639\"\u001b[90m, \u001b[39m\"639\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"228\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"\"\u001b[90m,\u001b[39m…\n",
      "$ pages           \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"122--132\"\u001b[90m, \u001b[39m\"e17796\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"E14\"\u001b[90m, \u001b[39m\"132--142\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"…\n",
      "$ journalname     \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"Nature\"\u001b[90m, \u001b[39m\"Molecular ecology\"\u001b[90m, \u001b[39m\"Genetics\"\u001b[90m, \u001b[39m\"Nature\"\u001b[90m, \u001b[39m\"…\n",
      "$ year            \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"2025\"\u001b[90m, \u001b[39m\"2025\"\u001b[90m, \u001b[39m\"2025\"\u001b[90m, \u001b[39m\"2025\"\u001b[90m, \u001b[39m\"2025\"\u001b[90m, \u001b[39m\"2024\"\u001b[90m, \u001b[39m\"2024\"…\n",
      "$ doi             \u001b[3m\u001b[90m<named list>\u001b[39m\u001b[23m \"10.1038/s41586-025-09189-3\"\u001b[90m, \u001b[39m\"10.1111/mec.1779…\n",
      "$ authors         \u001b[3m\u001b[90m<named list>\u001b[39m\u001b[23m \"Zeng TC, Vyazov LA, Kim A, Flegontov P, Sirak …\n"
     ]
    }
   ],
   "source": [
    "## extract items into table\n",
    "papers <- esummary %>% \n",
    "    lapply(extract, c(\"uid\", \"title\", \"fulljournalname\", \"source\",\"pubdate\", \"sortpubdate\", \"issue\", \"volume\", \"pages\")) %>% \n",
    "    bind_rows %>% \n",
    "    mutate(\n",
    "        journalname = str_remove(fulljournalname, \":.*\"),\n",
    "        year = str_extract(sortpubdate, \"\\\\d{4}\"),\n",
    "        pages = str_replace(pages, \"-\",\"--\")\n",
    "    )\n",
    "\n",
    "papers$doi <- esummary %>% \n",
    "    lapply(function(x) use_series(x, articleids) %>% subset(idtype == \"doi\", select = value) %>% pluck(1))\n",
    "\n",
    "papers$authors <- esummary %>% \n",
    "    lapply(function(x) use_series(x, authors) %>% pull(name) %>% paste(collapse = \", \"))\n",
    "\n",
    "papers %>% glimpse"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "88e7a943-77a5-4e55-8ead-0a8ac3189b16",
   "metadata": {},
   "source": [
    "## format as bibtex using glue"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "592073ca-d76e-4513-a3c0-f9c3cf11bd45",
   "metadata": {},
   "outputs": [],
   "source": [
    "bibtex <- papers %>% \n",
    "    glue_data(\"\n",
    "@article{{{uid},\n",
    "    author={{{authors}}},\n",
    "    year={{{year}}},\n",
    "    title={{{title}}},\n",
    "    journal={{{source}}},\n",
    "    number={{{issue}}},\n",
    "    volume={{{volume}}},\n",
    "    pages={{{pages}}},\n",
    "    doi={{{doi}}}\n",
    "}}\n",
    "\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "519be050-f503-4882-87e0-4474d33a5c7e",
   "metadata": {},
   "outputs": [],
   "source": [
    "## write the bibtex string into a file\n",
    "writeLines(bibtex, \"papers.bib\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "a437f691-9938-48b1-be57-ca77d3788164",
   "metadata": {},
   "outputs": [],
   "source": [
    "## remove empty fields\n",
    "system(\"sed -i '/{}/d' papers.bib\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "R",
   "language": "R",
   "name": "ir"
  },
  "language_info": {
   "codemirror_mode": "r",
   "file_extension": ".r",
   "mimetype": "text/x-r-source",
   "name": "R",
   "pygments_lexer": "r",
   "version": "4.4.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
 }
diff --git a/rentrez2bibtex.r b/rentrez2bibtex.r
 ## Getting BibTeX bibliography from PubMed with R, rentrez, and glue
 library(rentrez)
 library(tidyverse)
 library(magrittr)
 library(glue)

 ## search term
 search_term = "\"flegontov p\"[au] OR \"flegontov pn\"[au]"

 ## number of publications (retmax = 0: only the hit count is needed here, no ids)
 npubs = entrez_search(db = "pubmed", term = search_term, retmax = 0)$count
 ## stop early on an empty result instead of failing further down with an unrelated error
 if (npubs == 0) stop("PubMed search returned no records for: ", search_term)
 ## get ncbi uids
 ncbi_ids = entrez_search(db = "pubmed", term = search_term, retmax = npubs)$ids
 ## get all publications as json, save into list object
 ## always_return_list = TRUE keeps a single hit as a one-element list,
 ## so the per-record loops below always iterate over records
 esummary = entrez_summary(db = "pubmed", id = ncbi_ids, retmode = "json", always_return_list = TRUE)

 ## extract items into table
 ## magrittr::extract is spelled out because tidyr exports an unrelated extract();
 ## which of the two a bare `extract` resolves to depends on the library() order
 papers <- esummary %>%
    lapply(magrittr::extract, c("uid", "title", "fulljournalname", "source","pubdate", "sortpubdate", "issue", "volume", "pages")) %>%
    bind_rows %>%
    mutate(
        journalname = str_remove(fulljournalname, ":.*"),
        year = str_extract(sortpubdate, "\\d{4}"),
        ## BibTeX page ranges use an en-dash, written "--"
        pages = str_replace(pages, "-","--")
    )

 ## DOI per paper as a plain character column;
 ## records without a DOI get "" so the empty field is dropped when the file is written
 papers$doi <- esummary %>%
    vapply(function(x) {
        doi <- use_series(x, articleids) %>% subset(idtype == "doi", select = value) %>% pluck(1)
        if (length(doi) == 0) "" else doi[[1]]
    }, character(1), USE.NAMES = FALSE)

 ## author list per paper; BibTeX separates names with " and ", not commas
 ## records without an author table get "" (dropped like any other empty field)
 papers$authors <- esummary %>%
    vapply(function(x) {
        author_table <- use_series(x, authors)
        if (is.data.frame(author_table)) paste(author_table$name, collapse = " and ") else ""
    }, character(1), USE.NAMES = FALSE)

 #papers %>% glimpse

 ## one BibTeX entry per row; doubled braces are literal braces for glue
 bibtex <- papers %>%
    glue_data("
 @article{{{uid},
    author={{{authors}}},
    year={{{year}}},
    title={{{title}}},
    journal={{{source}}},
    number={{{issue}}},
    volume={{{volume}}},
    pages={{{pages}}},
    doi={{{doi}}}
 }}
 ")

 ## remove empty fields (lines containing "{}") in R itself rather than with `sed -i`,
 ## whose in-place flag differs between GNU and BSD sed and which is missing on Windows
 bib_lines <- bibtex %>% str_split("\n") %>% unlist
 bib_lines <- bib_lines[!str_detect(bib_lines, fixed("{}"))]

 ## write the bibtex entries into a file
 writeLines(bib_lines, "papers.bib")
diff --git a/rentrez2bibtex.yml b/rentrez2bibtex.yml
 name: pubmed2bibtex
 channels:
 - conda-forge
 dependencies:
 - r-base=4
 - r-tidyverse # includes magrittr & glue
 - r-rentrez
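 ## pick an IDE: uncomment rstudio-desktop, or both jupyterlab and r-irkernel (the notebook runs on the "ir" kernel)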
 # - rstudio-desktop
 # - jupyterlab
 # - r-irkernel
	{
	"cells": [
	{
	"cell_type": "markdown",
	"id": "1ef7a411-a3b4-49e0-a62f-8bd5397cc13a",
	"metadata": {},
	"source": [
	"# Getting BibTeX bibliography from PubMed with R, rentrez, and glue"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 1,
	"id": "62749280-ff0d-4c09-9de7-879621f78a99",
	"metadata": {
	"scrolled": true
	},
	"outputs": [
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"── \u001b[1mAttaching core tidyverse packages\u001b[22m ──────────────────────────────────────────────────────────────── tidyverse 2.0.0 ──\n",
	"\u001b[32m✔\u001b[39m \u001b[34mdplyr \u001b[39m 1.1.4 \u001b[32m✔\u001b[39m \u001b[34mreadr \u001b[39m 2.1.5\n",
	"\u001b[32m✔\u001b[39m \u001b[34mforcats \u001b[39m 1.0.0 \u001b[32m✔\u001b[39m \u001b[34mstringr \u001b[39m 1.5.1\n",
	"\u001b[32m✔\u001b[39m \u001b[34mggplot2 \u001b[39m 3.5.2 \u001b[32m✔\u001b[39m \u001b[34mtibble \u001b[39m 3.3.0\n",
	"\u001b[32m✔\u001b[39m \u001b[34mlubridate\u001b[39m 1.9.4 \u001b[32m✔\u001b[39m \u001b[34mtidyr \u001b[39m 1.3.1\n",
	"\u001b[32m✔\u001b[39m \u001b[34mpurrr \u001b[39m 1.1.0 \n",
	"── \u001b[1mConflicts\u001b[22m ────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──\n",
	"\u001b[31m✖\u001b[39m \u001b[34mdplyr\u001b[39m::\u001b[32mfilter()\u001b[39m masks \u001b[34mstats\u001b[39m::filter()\n",
	"\u001b[31m✖\u001b[39m \u001b[34mdplyr\u001b[39m::\u001b[32mlag()\u001b[39m masks \u001b[34mstats\u001b[39m::lag()\n",
	"\u001b[36mℹ\u001b[39m Use the conflicted package (\u001b[3m\u001b[34m<http://conflicted.r-lib.org/>\u001b[39m\u001b[23m) to force all conflicts to become errors\n",
	"\n",
	"Attaching package: ‘magrittr’\n",
	"\n",
	"\n",
	"The following object is masked from ‘package:purrr’:\n",
	"\n",
	" set_names\n",
	"\n",
	"\n",
	"The following object is masked from ‘package:tidyr’:\n",
	"\n",
	" extract\n",
	"\n",
	"\n"
	]
	}
	],
	"source": [
	"library(rentrez)\n",
	"library(tidyverse)\n",
	"library(magrittr)\n",
	"library(glue)"
	]
	},
	{
	"cell_type": "markdown",
	"id": "5f8faf1e-4475-4e40-a729-1d9f3d7d3392",
	"metadata": {},
	"source": [
	"Search term:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"id": "8d602f7d-4808-4c9b-b50e-77ea9cc33060",
	"metadata": {},
	"outputs": [],
	"source": [
	"## search term\n",
	"search_term = \"\\\"flegontov p\\\"[au] OR \\\"flegontov pn\\\"[au]\""
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"id": "98583ca8-95b0-4dda-ab6b-304ca70353db",
	"metadata": {},
	"outputs": [],
	"source": [
	"## number of publications\n",
	"npubs = entrez_search(db = \"pubmed\", term = search_term, retmax = 100)$count\n",
	"## get ncbi uids\n",
	"ncbi_ids = entrez_search(db = \"pubmed\", term = search_term, retmax = npubs)$ids #%>% str\n",
	"## get all publications as json, save into list object\n",
	"esummary = entrez_summary(db = \"pubmed\", id = ncbi_ids, retmode = \"json\")"
	]
	},
	{
	"cell_type": "markdown",
	"id": "72ba3531-bf00-462d-a7c8-d21bec536ac2",
	"metadata": {},
	"source": [
	"See names of items in the resulting list:"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"id": "a46c42da-12b4-406b-98cd-19f355acf483",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	" [1] \"uid\" \"pubdate\" \"epubdate\" \n",
	" [4] \"source\" \"authors\" \"lastauthor\" \n",
	" [7] \"title\" \"sorttitle\" \"volume\" \n",
	"[10] \"issue\" \"pages\" \"lang\" \n",
	"[13] \"nlmuniqueid\" \"issn\" \"essn\" \n",
	"[16] \"pubtype\" \"recordstatus\" \"pubstatus\" \n",
	"[19] \"articleids\" \"history\" \"references\" \n",
	"[22] \"attributes\" \"pmcrefcount\" \"fulljournalname\" \n",
	"[25] \"elocationid\" \"doctype\" \"srccontriblist\" \n",
	"[28] \"booktitle\" \"medium\" \"edition\" \n",
	"[31] \"publisherlocation\" \"publishername\" \"srcdate\" \n",
	"[34] \"reportnumber\" \"availablefromurl\" \"locationlabel\" \n",
	"[37] \"doccontriblist\" \"docdate\" \"bookname\" \n",
	"[40] \"chapter\" \"sortpubdate\" \"sortfirstauthor\" \n",
	"[43] \"vernaculartitle\" \n"
	]
	}
	],
	"source": [
	"esummary[[1]] %>% names %>% print"
	]
	},
	{
	"cell_type": "markdown",
	"id": "6e269e2a-e722-47d9-83c7-2c49f12ecfca",
	"metadata": {},
	"source": [
	"## pick items\n",
	"- authors (needs `paste(collapse)`)\n",
	"- pubdate or sortpubdate (extract year)\n",
	"- title\n",
	"- fulljournalname (needs `gsub(\":.*\", \"\")`)\n",
	" - source may be better\n",
	"- issue\n",
	"- volume\n",
	"- pages\n",
	"- articleids (needs filter to doi)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"id": "647580b1-82d9-441b-9830-c08d34eb3fd5",
	"metadata": {},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"Rows: 57\n",
	"Columns: 13\n",
	"$ uid \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"40604287\"\u001b[90m, \u001b[39m\"40454862\"\u001b[90m, \u001b[39m\"40169722\"\u001b[90m, \u001b[39m\"39979458\"\u001b[90m, \u001b[39m\"39910…\n",
	"$ title \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"Ancient DNA reveals the prehistory of the Uralic and …\n",
	"$ fulljournalname \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"Nature\"\u001b[90m, \u001b[39m\"Molecular ecology\"\u001b[90m, \u001b[39m\"Genetics\"\u001b[90m, \u001b[39m\"Nature\"\u001b[90m, \u001b[39m\"…\n",
	"$ source \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"Nature\"\u001b[90m, \u001b[39m\"Mol Ecol\"\u001b[90m, \u001b[39m\"Genetics\"\u001b[90m, \u001b[39m\"Nature\"\u001b[90m, \u001b[39m\"Nature\"\u001b[90m, \u001b[39m…\n",
	"$ pubdate \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"2025 Aug\"\u001b[90m, \u001b[39m\"2025 Jun 2\"\u001b[90m, \u001b[39m\"2025 May 8\"\u001b[90m, \u001b[39m\"2025 Mar\"\u001b[90m, \u001b[39m\"2…\n",
	"$ sortpubdate \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"2025/08/01 00:00\"\u001b[90m, \u001b[39m\"2025/06/02 00:00\"\u001b[90m, \u001b[39m\"2025/05/08 00…\n",
	"$ issue \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"8075\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"1\"\u001b[90m, \u001b[39m\"8054\"\u001b[90m, \u001b[39m\"8053\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"1\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m…\n",
	"$ volume \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"644\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"230\"\u001b[90m, \u001b[39m\"639\"\u001b[90m, \u001b[39m\"639\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"228\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"\"\u001b[90m,\u001b[39m…\n",
	"$ pages \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"122--132\"\u001b[90m, \u001b[39m\"e17796\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"E14\"\u001b[90m, \u001b[39m\"132--142\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"\"\u001b[90m, \u001b[39m\"…\n",
	"$ journalname \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"Nature\"\u001b[90m, \u001b[39m\"Molecular ecology\"\u001b[90m, \u001b[39m\"Genetics\"\u001b[90m, \u001b[39m\"Nature\"\u001b[90m, \u001b[39m\"…\n",
	"$ year \u001b[3m\u001b[90m<chr>\u001b[39m\u001b[23m \"2025\"\u001b[90m, \u001b[39m\"2025\"\u001b[90m, \u001b[39m\"2025\"\u001b[90m, \u001b[39m\"2025\"\u001b[90m, \u001b[39m\"2025\"\u001b[90m, \u001b[39m\"2024\"\u001b[90m, \u001b[39m\"2024\"…\n",
	"$ doi \u001b[3m\u001b[90m<named list>\u001b[39m\u001b[23m \"10.1038/s41586-025-09189-3\"\u001b[90m, \u001b[39m\"10.1111/mec.1779…\n",
	"$ authors \u001b[3m\u001b[90m<named list>\u001b[39m\u001b[23m \"Zeng TC, Vyazov LA, Kim A, Flegontov P, Sirak …\n"
	]
	}
	],
	"source": [
	"## extract items into table\n",
	"papers <- esummary %>% \n",
	" lapply(extract, c(\"uid\", \"title\", \"fulljournalname\", \"source\",\"pubdate\", \"sortpubdate\", \"issue\", \"volume\", \"pages\")) %>% \n",
	" bind_rows %>% \n",
	" mutate(\n",
	" journalname = str_remove(fulljournalname, \":.*\"),\n",
	" year = str_extract(sortpubdate, \"\\\\d{4}\"),\n",
	" pages = str_replace(pages, \"-\",\"--\")\n",
	" )\n",
	"\n",
	"papers$doi <- esummary %>% \n",
	" lapply(function(x) use_series(x, articleids) %>% subset(idtype == \"doi\", select = value) %>% pluck(1))\n",
	"\n",
	"papers$authors <- esummary %>% \n",
	" lapply(function(x) use_series(x, authors) %>% pull(name) %>% paste(collapse = \", \"))\n",
	"\n",
	"papers %>% glimpse"
	]
	},
	{
	"cell_type": "markdown",
	"id": "88e7a943-77a5-4e55-8ead-0a8ac3189b16",
	"metadata": {},
	"source": [
	"## format as bibtex using glue"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"id": "592073ca-d76e-4513-a3c0-f9c3cf11bd45",
	"metadata": {},
	"outputs": [],
	"source": [
	"bibtex <- papers %>% \n",
	" glue_data(\"\n",
	"@article{{{uid},\n",
	" author={{{authors}}},\n",
	" year={{{year}}},\n",
	" title={{{title}}},\n",
	" journal={{{source}}},\n",
	" number={{{issue}}},\n",
	" volume={{{volume}}},\n",
	" pages={{{pages}}},\n",
	" doi={{{doi}}}\n",
	"}}\n",
	"\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"id": "519be050-f503-4882-87e0-4474d33a5c7e",
	"metadata": {},
	"outputs": [],
	"source": [
	"## write the bibtex string into a file\n",
	"writeLines(bibtex, \"papers.bib\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 8,
	"id": "a437f691-9938-48b1-be57-ca77d3788164",
	"metadata": {},
	"outputs": [],
	"source": [
	"## remove empty fields\n",
	"system(\"sed -i '/{}/d' papers.bib\")"
	]
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "R",
	"language": "R",
	"name": "ir"
	},
	"language_info": {
	"codemirror_mode": "r",
	"file_extension": ".r",
	"mimetype": "text/x-r-source",
	"name": "R",
	"pygments_lexer": "r",
	"version": "4.4.3"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 5
	}
	## Getting BibTeX bibliography from PubMed with R, rentrez, and glue
	library(rentrez)
	library(tidyverse)
	library(magrittr)
	library(glue)

	## search term
	search_term = "\"flegontov p\"[au] OR \"flegontov pn\"[au]"

	## number of publications (retmax = 0: only the hit count is needed here, no ids)
	npubs = entrez_search(db = "pubmed", term = search_term, retmax = 0)$count
	## stop early on an empty result instead of failing further down with an unrelated error
	if (npubs == 0) stop("PubMed search returned no records for: ", search_term)
	## get ncbi uids
	ncbi_ids = entrez_search(db = "pubmed", term = search_term, retmax = npubs)$ids
	## get all publications as json, save into list object
	## always_return_list = TRUE keeps a single hit as a one-element list,
	## so the per-record loops below always iterate over records
	esummary = entrez_summary(db = "pubmed", id = ncbi_ids, retmode = "json", always_return_list = TRUE)

	## extract items into table
	## magrittr::extract is spelled out because tidyr exports an unrelated extract();
	## which of the two a bare `extract` resolves to depends on the library() order
	papers <- esummary %>%
	    lapply(magrittr::extract, c("uid", "title", "fulljournalname", "source","pubdate", "sortpubdate", "issue", "volume", "pages")) %>%
	    bind_rows %>%
	    mutate(
	        journalname = str_remove(fulljournalname, ":.*"),
	        year = str_extract(sortpubdate, "\\d{4}"),
	        ## BibTeX page ranges use an en-dash, written "--"
	        pages = str_replace(pages, "-","--")
	    )

	## DOI per paper as a plain character column;
	## records without a DOI get "" so the empty field is dropped when the file is written
	papers$doi <- esummary %>%
	    vapply(function(x) {
	        doi <- use_series(x, articleids) %>% subset(idtype == "doi", select = value) %>% pluck(1)
	        if (length(doi) == 0) "" else doi[[1]]
	    }, character(1), USE.NAMES = FALSE)

	## author list per paper; BibTeX separates names with " and ", not commas
	## records without an author table get "" (dropped like any other empty field)
	papers$authors <- esummary %>%
	    vapply(function(x) {
	        author_table <- use_series(x, authors)
	        if (is.data.frame(author_table)) paste(author_table$name, collapse = " and ") else ""
	    }, character(1), USE.NAMES = FALSE)

	#papers %>% glimpse

	## one BibTeX entry per row; doubled braces are literal braces for glue
	bibtex <- papers %>%
	    glue_data("
	@article{{{uid},
	author={{{authors}}},
	year={{{year}}},
	title={{{title}}},
	journal={{{source}}},
	number={{{issue}}},
	volume={{{volume}}},
	pages={{{pages}}},
	doi={{{doi}}}
	}}
	")

	## remove empty fields (lines containing "{}") in R itself rather than with `sed -i`,
	## whose in-place flag differs between GNU and BSD sed and which is missing on Windows
	bib_lines <- bibtex %>% str_split("\n") %>% unlist
	bib_lines <- bib_lines[!str_detect(bib_lines, fixed("{}"))]

	## write the bibtex entries into a file
	writeLines(bib_lines, "papers.bib")
	name: pubmed2bibtex
	channels:
	- conda-forge
	dependencies:
	- r-base=4
	- r-tidyverse # includes magrittr & glue
	- r-rentrez
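	## pick an IDE: uncomment rstudio-desktop, or both jupyterlab and r-irkernel (the notebook runs on the "ir" kernel)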
	# - rstudio-desktop
	# - jupyterlab
	# - r-irkernel