Last active
May 19, 2025 15:01
-
-
Save rafisics/aa8d720991faee9e3157f420e9860639 to your computer and use it in GitHub Desktop.
arXiv script - Python tool to query arXiv papers and download files from terminal. Handles hep-th/9711200 and 2412.16795 formats.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python | |
## arXiv script version 0.3 | |
## Copyright 2015 Tom Brown | |
## This program is free software; you can redistribute it and/or | |
## modify it under the terms of the GNU General Public License as | |
## published by the Free Software Foundation; either version 3 of the | |
## License, or (at your option) any later version. | |
## This program is distributed in the hope that it will be useful, | |
## but WITHOUT ANY WARRANTY; without even the implied warranty of | |
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
## GNU General Public License for more details. | |
## You should have received a copy of the GNU General Public License | |
## along with this program. If not, see <http://www.gnu.org/licenses/>. | |
## See http://www.stringwiki.org/wiki/ArXiv_script for more usage | |
## instructions | |
'''arXiv script | |
Usage: | |
python arxiv.py reference [ -htabcjdps ] [ --help ] | |
"reference" must be a standard arXiv reference, e.g. hep-th/9711200, 0705.0303. | |
Options: | |
-h, --help | |
displays this help message | |
-t | |
displays the title | |
-a | |
displays the author(s) | |
-b | |
displays the aBstract | |
-c | |
displays the comments | |
-j | |
displays the journal reference | |
-d | |
downloads the PDF | |
-p | |
downloads the PS | |
-s | |
downloads the source file | |
''' | |
__version__ = "0.3" | |
__author__ = "Tom Brown" | |
__copyright__ = "Copyright 2015 Tom Brown, GNU GPL 3" | |
import sys | |
import os | |
import getopt | |
import re | |
import urllib.request | |
import urllib.parse | |
import urllib.error | |
import gzip | |
def findRefType(ref):
    """Classify an arXiv reference string.

    Strips any leading 'arxiv:'/'arXiv:' prefix, then decides whether the
    reference is an old-style eprint (e.g. hep-th/9711200), a bare 7-digit
    number (assumed to belong to the historic hep-th archive), a new-style
    eprint (e.g. 2412.16795 or 2412.16795v2), or not an arXiv reference.

    Returns a (type, ref) tuple where type is one of 'old-style eprint',
    'new-style eprint' or 'not arXiv', and ref is the normalised reference.
    """
    cleaned = ref.replace('arxiv:', '').replace('arXiv:', '')
    if re.search(r'^[a-zA-Z\-\.]+/\d{7}$', cleaned):
        return 'old-style eprint', cleaned
    if re.search(r'^\d{7}$', cleaned):
        # Bare 7-digit number: assume the hep-th archive, as the original did.
        return 'old-style eprint', 'hep-th/' + cleaned
    if re.search(r'^\d{4}\.\d{4,5}(v\d+)?$', cleaned):
        return 'new-style eprint', cleaned
    return 'not arXiv', cleaned
def downloadPDF(ref, type, downloadPath):
    """Download the PDF for *ref* from arxiv.org into *downloadPath*.

    Old-style references contain a '/', which is flattened to '-' so the
    result is a single flat file name. Network errors are printed rather
    than raised.
    """
    target_dir = os.path.expanduser(downloadPath)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    # Old-style ids contain '/', which cannot appear in a file name.
    if type == 'old-style eprint':
        filename = ref.replace('/', '-') + '.pdf'
    else:
        filename = ref + '.pdf'
    try:
        urllib.request.urlretrieve('https://arxiv.org/pdf/' + ref,
                                   os.path.join(target_dir, filename))
    except Exception as e:
        print(f"Error downloading PDF: {e}")
def downloadPS(ref, type, downloadPath):
    """Download the gzipped PostScript for *ref*, decompress it to
    '<name>.ps' in *downloadPath*, and delete the intermediate download.

    Old-style references have their '/' flattened to '-'. Any failure is
    printed rather than raised.
    """
    target_dir = os.path.expanduser(downloadPath)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    filename = ref.replace('/', '-') if type == 'old-style eprint' else ref
    gz_path = os.path.join(target_dir, filename)
    ps_path = os.path.join(target_dir, filename + ".ps")
    try:
        urllib.request.urlretrieve('https://arxiv.org/ps/' + ref, gz_path)
        # arXiv serves the PS gzip-compressed; unpack then drop the temp file.
        with gzip.open(gz_path, 'rb') as packed:
            with open(ps_path, 'wb') as out:
                out.write(packed.read())
        os.remove(gz_path)
    except Exception as e:
        print(f"Error downloading PS: {e}")
def downloadSource(ref, type, downloadPath):
    """Download the gzipped source for *ref* into a temporary '.dum' file,
    decompress it next to it, and remove the temporary file.

    NOTE(review): multi-file submissions arrive as a gzipped tar, so the
    decompressed output may itself be a tar archive — this matches the
    original behaviour. Failures are printed rather than raised.
    """
    target_dir = os.path.expanduser(downloadPath)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    filename = ref.replace('/', '-') if type == 'old-style eprint' else ref
    tmp_path = os.path.join(target_dir, filename + ".dum")
    out_path = os.path.join(target_dir, filename)
    try:
        urllib.request.urlretrieve('https://arxiv.org/e-print/' + ref, tmp_path)
        with gzip.open(tmp_path, 'rb') as packed:
            with open(out_path, 'wb') as out:
                out.write(packed.read())
        os.remove(tmp_path)
    except Exception as e:
        print(f"Error downloading source: {e}")
def getTitle(html):
    """Extract the paper title from an arXiv abstract page.

    Tries the modern 'class="title mathjax">' heading first, then the older
    '>Title:</span>' form, and returns '' when neither marker is present.

    Fix: the original used `html.split(marker, 1)[-1] ... or fallback`;
    when the marker is absent, split() returns the whole page, so the
    fallback branch was dead code and a missing marker produced garbage
    text scraped from the top of the page.
    """
    for marker in ('class="title mathjax">', '>Title:</span>'):
        if marker in html:
            section = html.split(marker, 1)[1].split('</h1>', 1)[0]
            break
    else:
        # Neither layout matched — report an empty title instead of garbage.
        return ''
    title = re.sub(r'<[^>]*>', '', section)
    title = re.sub(r'^Title:\s*', '', title)
    return title.strip()
def getAuthors(html):
    """Extract the author list from an arXiv abstract page.

    Scans for the '>Authors:</span>' marker, then takes the text between the
    first '\">' after it and the closing '</div>', stripping HTML tags and
    newlines. Returns '' when the marker (or the opening '\">') is missing.

    Fix: the original sliced with `html.find(...)` without checking for -1,
    so a page without the marker produced garbage (a -1 index slices from
    the last character).
    """
    marker = ">Authors:</span>"
    start = html.find(marker)
    if start == -1:
        return ''
    chunk = html[start:]
    open_pos = chunk.find('">')
    if open_pos == -1:
        return ''
    chunk = chunk[open_pos + 2:]
    end = chunk.find("</div>")
    if end != -1:
        chunk = chunk[:end]
    chunk = re.sub('<[^>]*>', '', chunk)
    return chunk.replace("\n", "").strip()
def getAbstract(html):
    """Return the abstract text from an arXiv abstract page.

    Returns the literal string 'Abstract not found' when the
    'Abstract:</span>' marker is absent. HTML tags are removed, entities
    unescaped, and newlines collapsed to single spaces.
    """
    marker = "Abstract:</span>"
    pos = html.find(marker)
    if pos == -1:
        return "Abstract not found"
    body = html[pos + len(marker):]
    # Note: when '</blockquote>' is missing, find() is -1 and this slice
    # drops the final character — preserved from the original behaviour.
    body = body[:body.find("</blockquote>")]
    from html import unescape
    text = unescape(re.sub(r'<[^>]+>', '', body))
    return text.replace('\n', ' ').strip()
def getComments(html):
    """Return the Comments field from an arXiv abstract page.

    Tries several historical page layouts in turn (modern span pair, the
    citation_arxiv_comment meta tag, the old table layout, and a loose
    fallback); returns 'no comments' when none yields non-empty text.
    """
    layouts = (
        r'Comments:</span>\s*<span[^>]*>(.*?)</span>',
        r'citation_arxiv_comment"[^>]*content="([^"]*)"',
        r'<td[^>]*>\s*Comments:\s*</td>\s*<td[^>]*>(.*?)</td>',
        r'Comments:\s*([^<>\n]{5,300}?)<',
    )
    for layout in layouts:
        hit = re.search(layout, html, re.DOTALL)
        if not hit:
            continue
        text = re.sub(r'<[^>]+>', '', hit.group(1)).strip()
        if text:
            return text
    return "no comments"
def getJref(html):
    """Return the journal reference from an arXiv abstract page, or the
    literal string 'no journal reference' when it is absent.

    Fix: the original tested `"jref" in html` but then sliced at
    `html.find('jref">') + 6`; when the page contained the substring
    'jref' without the 'jref">' attribute ending, find() returned -1 and
    the function emitted garbage (`html[5:]`). Search for the exact marker
    instead.
    """
    marker = 'jref">'
    start = html.find(marker)
    if start == -1:
        return "no journal reference"
    jref = html[start + len(marker):]
    end = jref.find("</td>")
    if end != -1:
        jref = jref[:end]
    return jref.strip()
if __name__ == "__main__":
    # Which metadata fields to display and which artefacts to download.
    wanted = set()

    try:
        opts, args = getopt.gnu_getopt(sys.argv[1:],
                                       'hatbcjdpsv', ['help'])
    except getopt.error as err:
        print(f'error: {err}; try \'arxiv.py -h\' for more information')
        sys.exit(0)

    # Map each single-letter option to the action it enables.
    flag_map = {'-a': 'author', '-t': 'title', '-b': 'abstract',
                '-c': 'comments', '-j': 'jref',
                '-d': 'pdf', '-p': 'ps', '-s': 'source'}
    for opt, _ in opts:
        if opt in ('-h', '--help'):
            print(__doc__)
            sys.exit(0)
        if opt in flag_map:
            wanted.add(flag_map[opt])

    # No options at all: show every metadata field (but download nothing).
    if not opts:
        wanted |= {'author', 'title', 'abstract', 'comments', 'jref'}

    if len(args) != 1:
        print('you didn\'t specify an arXiv reference; try \'arxiv.py -h\' for more information')
        sys.exit(0)
    ref = args[0]

    ref_type, ref = findRefType(ref)
    print(f"Reference {ref} is of type {ref_type}")
    if ref_type == "not arXiv":
        print("type not of arXiv form")
        sys.exit(0)

    # Fetch the abstract page only when some metadata field was requested.
    if wanted & {'author', 'title', 'abstract', 'comments', 'jref'}:
        try:
            with urllib.request.urlopen('https://arxiv.org/abs/' + ref) as page:
                html = page.read().decode('utf-8')
        except Exception as e:
            print(f"Error fetching arXiv page: {e}")
            sys.exit(1)
        if 'title' in wanted:
            print("\nTitle:", getTitle(html))
        if 'author' in wanted:
            print("\nAuthors:", getAuthors(html))
        if 'abstract' in wanted:
            print("\nAbstract:", getAbstract(html))
        if 'comments' in wanted:
            print("\nComments:", getComments(html))
        if 'jref' in wanted:
            print("\nJournal reference:", getJref(html))

    if 'pdf' in wanted:
        downloadPDF(ref, ref_type, "./")
    if 'ps' in wanted:
        downloadPS(ref, ref_type, "./")
    if 'source' in wanted:
        downloadSource(ref, ref_type, "./")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment