Skip to content

Instantly share code, notes, and snippets.

@rafisics
Last active May 19, 2025 15:01
Show Gist options
  • Save rafisics/aa8d720991faee9e3157f420e9860639 to your computer and use it in GitHub Desktop.
Save rafisics/aa8d720991faee9e3157f420e9860639 to your computer and use it in GitHub Desktop.
arXiv script - Python tool to query arXiv papers and download files from terminal. Handles hep-th/9711200 and 2412.16795 formats.
#!/usr/bin/python
## arXiv script version 0.3
## Copyright 2015 Tom Brown
## This program is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 3 of the
## License, or (at your option) any later version.
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.
## See http://www.stringwiki.org/wiki/ArXiv_script for more usage
## instructions
'''arXiv script
Usage:
python arxiv.py reference [ -htabcjdps ] [ --help ]
"reference" must be a standard arXiv reference, e.g. hep-th/9711200, 0705.0303.
Options:
-h, --help
displays this help message
-t
displays the title
-a
displays the author(s)
-b
displays the aBstract
-c
displays the comments
-j
displays the journal reference
-d
downloads the PDF
-p
downloads the PS
-s
downloads the source file
'''
__version__ = "0.3"
__author__ = "Tom Brown"
__copyright__ = "Copyright 2015 Tom Brown, GNU GPL 3"
import sys
import os
import getopt
import re
import urllib.request
import urllib.parse
import urllib.error
import gzip
def findRefType(ref):
    """Classify an arXiv reference string.

    Accepts old-style (``hep-th/9711200``), bare 7-digit (assumed hep-th),
    and new-style (``2412.16795`` / ``2412.16795v2``) identifiers, with or
    without a leading "arXiv:" label.

    Returns a ``(type, ref)`` tuple where *type* is one of
    'old-style eprint', 'new-style eprint', or 'not arXiv', and *ref* is the
    normalized identifier (label stripped; bare 7-digit refs get a
    'hep-th/' prefix).
    """
    # Strip any "arXiv:" label case-insensitively (the original only handled
    # the exact spellings 'arxiv:' and 'arXiv:').
    ref = re.sub(r'arxiv:', '', ref, flags=re.IGNORECASE)

    # avoid shadowing the builtin `type`
    if re.search(r'^[a-zA-Z\-\.]+/\d{7}$', ref):
        ref_type = 'old-style eprint'
    elif re.search(r'^\d{7}$', ref):
        # Bare 7-digit number: assume the hep-th archive, matching the
        # script's historical behavior.
        ref_type = 'old-style eprint'
        ref = 'hep-th/' + ref
    elif re.search(r'^\d{4}\.\d{4,5}(v\d+)?$', ref):
        ref_type = 'new-style eprint'
    else:
        ref_type = 'not arXiv'

    return ref_type, ref
def downloadPDF(ref, type, downloadPath):
    """Download the PDF for *ref* from arXiv into *downloadPath*.

    The saved file is named after the reference with any '/' replaced by '-'
    (so old-style IDs like hep-th/9711200 become hep-th-9711200.pdf).
    Errors are reported on stdout rather than raised.
    """
    target_dir = os.path.expanduser(downloadPath)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # New-style IDs contain no '/', so the replace is a no-op for them.
    filename = ref.replace('/', '-') + '.pdf'
    try:
        urllib.request.urlretrieve(
            'https://arxiv.org/pdf/' + ref,
            os.path.join(target_dir, filename),
        )
    except Exception as e:
        print(f"Error downloading PDF: {e}")
def downloadPS(ref, type, downloadPath):
    """Download the gzipped PostScript for *ref* and unpack it to <name>.ps.

    The compressed download is removed after decompression. Errors are
    reported on stdout rather than raised.
    """
    target_dir = os.path.expanduser(downloadPath)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # New-style IDs contain no '/', so the replace is a no-op for them.
    base = os.path.join(target_dir, ref.replace('/', '-'))
    try:
        urllib.request.urlretrieve('https://arxiv.org/ps/' + ref, base)
        # arXiv serves PS gzip-compressed; decompress to a .ps file.
        with gzip.open(base, 'rb') as packed, open(base + ".ps", 'wb') as out:
            out.write(packed.read())
        os.remove(base)
    except Exception as e:
        print(f"Error downloading PS: {e}")
def downloadSource(ref, type, downloadPath):
    """Download the gzipped source for *ref* and unpack it in *downloadPath*.

    The e-print is fetched to a temporary ".dum" file, decompressed to the
    final name, and the temporary file is removed. Errors are reported on
    stdout rather than raised.
    """
    target_dir = os.path.expanduser(downloadPath)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # New-style IDs contain no '/', so the replace is a no-op for them.
    dest = os.path.join(target_dir, ref.replace('/', '-'))
    tmp = dest + ".dum"
    try:
        urllib.request.urlretrieve('https://arxiv.org/e-print/' + ref, tmp)
        with gzip.open(tmp, 'rb') as packed, open(dest, 'wb') as out:
            out.write(packed.read())
        os.remove(tmp)
    except Exception as e:
        print(f"Error downloading source: {e}")
def getTitle(html):
    """Extract the paper title from an arXiv abstract-page HTML string.

    Tries the current page layout (h1 with class "title mathjax") first and
    falls back to a ">Title:</span>" marker. HTML tags and a leading
    "Title:" label are stripped from the result.
    """
    # The original chained the two extractions with `or`, but
    # split(marker, 1)[-1] returns non-empty text even when the marker is
    # absent, so the fallback was unreachable. Check for the marker
    # explicitly instead.
    if 'class="title mathjax">' in html:
        section = html.split('class="title mathjax">', 1)[1].split('</h1>', 1)[0]
    else:
        section = html.split('>Title:</span>', 1)[-1].split('</h1>', 1)[0]
    title = re.sub(r'<[^>]*>', '', section)
    title = re.sub(r'^Title:\s*', '', title)
    return title.strip()
def getAuthors(html):
    """Extract the author list from an arXiv abstract-page HTML string.

    Returns the comma-separated author names with markup removed, or
    "Authors not found" when the authors marker is absent (the original
    sliced from find() == -1 and returned garbage in that case).
    """
    start = html.find(">Authors:</span>")
    if start == -1:
        return "Authors not found"
    authors = html[start:]
    # Skip past the first attribute-closing '">' to reach the name text.
    authors = authors[authors.find('">') + 2:]
    authors = authors[:authors.find("</div>")]
    authors = re.sub('<[^>]*>', '', authors)
    return authors.replace("\n", "").strip()
def getAbstract(html):
    """Extract the abstract from an arXiv abstract-page HTML string.

    Returns the abstract as a single line with tags stripped and HTML
    entities decoded, or "Abstract not found" when the marker is absent.
    """
    marker = "Abstract:</span>"
    idx = html.find(marker)
    if idx == -1:
        return "Abstract not found"
    from html import unescape
    body = html[idx + len(marker):]
    body = body[:body.find("</blockquote>")]
    text = unescape(re.sub(r'<[^>]+>', '', body))
    # Collapse the page's hard line breaks into spaces.
    return text.replace('\n', ' ').strip()
def getComments(html):
    """Extract the Comments field from an arXiv abstract-page HTML string.

    Tries several page layouts in order (span markup, citation meta tag,
    table cell, then a loose text match) and returns the first non-empty
    hit with tags stripped, or "no comments".
    """
    extractors = (
        r'Comments:</span>\s*<span[^>]*>(.*?)</span>',
        r'citation_arxiv_comment"[^>]*content="([^"]*)"',
        r'<td[^>]*>\s*Comments:\s*</td>\s*<td[^>]*>(.*?)</td>',
        r'Comments:\s*([^<>\n]{5,300}?)<',
    )
    for rx in extractors:
        hit = re.search(rx, html, re.DOTALL)
        if not hit:
            continue
        text = re.sub(r'<[^>]+>', '', hit.group(1)).strip()
        if text:
            return text
    return "no comments"
def getJref(html):
    """Extract the journal reference from an arXiv abstract-page HTML string.

    Returns the text of the jref table cell, or "no journal reference"
    when the page carries no jref marker.
    """
    if "jref" not in html:
        return "no journal reference"
    tail = html[html.find('jref">') + 6:]
    return tail[:tail.find("</td>")].strip()
if __name__ == "__main__":
    # Display flags (metadata sections to print).
    show_title = 0
    show_author = 0
    show_abstract = 0
    show_comments = 0
    show_jref = 0
    # Download flags.
    want_pdf = 0
    want_ps = 0
    want_source = 0

    try:
        options, arguments = getopt.gnu_getopt(sys.argv[1:],
                                               'hatbcjdpsv', ['help'])
    except getopt.error as err:
        print(f'error: {err}; try \'arxiv.py -h\' for more information')
        sys.exit(0)

    for opt, _val in options:
        if opt in ('-h', '--help'):
            print(__doc__)
            sys.exit(0)
        elif opt == '-a':
            show_author = 1
        elif opt == '-t':
            show_title = 1
        elif opt == '-b':
            show_abstract = 1
        elif opt == '-c':
            show_comments = 1
        elif opt == '-j':
            show_jref = 1
        elif opt == '-d':
            want_pdf = 1
        elif opt == '-p':
            want_ps = 1
        elif opt == '-s':
            want_source = 1

    # No options at all: print every metadata section (download nothing).
    if not options:
        show_author = 1
        show_title = 1
        show_abstract = 1
        show_comments = 1
        show_jref = 1

    if len(arguments) != 1:
        print('you didn\'t specify an arXiv reference; try \'arxiv.py -h\' for more information')
        sys.exit(0)

    ref_type, ref = findRefType(arguments[0])
    print(f"Reference {ref} is of type {ref_type}")

    if ref_type == "not arXiv":
        print("type not of arXiv form")
        sys.exit(0)

    # Fetch the abstract page only when some metadata was requested.
    if show_author + show_title + show_abstract + show_comments + show_jref > 0:
        try:
            with urllib.request.urlopen('https://arxiv.org/abs/' + ref) as page:
                html = page.read().decode('utf-8')
        except Exception as e:
            print(f"Error fetching arXiv page: {e}")
            sys.exit(1)

        if show_title:
            print("\nTitle:", getTitle(html))
        if show_author:
            print("\nAuthors:", getAuthors(html))
        if show_abstract:
            print("\nAbstract:", getAbstract(html))
        if show_comments:
            print("\nComments:", getComments(html))
        if show_jref:
            print("\nJournal reference:", getJref(html))

    if want_pdf:
        downloadPDF(ref, ref_type, "./")
    if want_ps:
        downloadPS(ref, ref_type, "./")
    if want_source:
        downloadSource(ref, ref_type, "./")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment