Skip to content

Instantly share code, notes, and snippets.

@rafisics
Last active May 19, 2025 15:01
Show Gist options
  • Save rafisics/aa8d720991faee9e3157f420e9860639 to your computer and use it in GitHub Desktop.
Save rafisics/aa8d720991faee9e3157f420e9860639 to your computer and use it in GitHub Desktop.
arXiv script - Python tool to query arXiv papers and download files from terminal. Handles hep-th/9711200 and 2412.16795 formats.
#!/usr/bin/python
## arXiv script version 0.3
## Copyright 2015 Tom Brown
## This program is free software; you can redistribute it and/or
## modify it under the terms of the GNU General Public License as
## published by the Free Software Foundation; either version 3 of the
## License, or (at your option) any later version.
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.
## See http://www.stringwiki.org/wiki/ArXiv_script for more usage
## instructions
'''arXiv script
Usage:
python arxiv.py reference [ -htabcjdps ] [ --help ]
"reference" must be a standard arXiv reference, e.g. hep-th/9711200, 0705.0303.
Options:
-h, --help
displays this help message
-t
displays the title
-a
displays the author(s)
-b
displays the aBstract
-c
displays the comments
-j
displays the journal reference
-d
downloads the PDF
-p
downloads the PS
-s
downloads the source file
'''
__version__ = "0.3"
__author__ = "Tom Brown"
__copyright__ = "Copyright 2015 Tom Brown, GNU GPL 3"
import sys
import os
import getopt
import re
import urllib.request
import urllib.parse
import urllib.error
import gzip
def findRefType(ref):
    """Classify an arXiv reference string.

    Accepts old-style (``hep-th/9711200``), bare 7-digit (assumed hep-th),
    and new-style (``2412.16795`` / ``2412.16795v2``) identifiers, with or
    without a leading "arXiv:" label.

    Returns a ``(type, ref)`` tuple where *type* is one of
    'old-style eprint', 'new-style eprint', or 'not arXiv', and *ref* is the
    normalized identifier (label stripped; bare 7-digit refs get a
    'hep-th/' prefix).
    """
    # Strip any "arXiv:" label case-insensitively (the original only handled
    # the exact spellings 'arxiv:' and 'arXiv:').
    ref = re.sub(r'arxiv:', '', ref, flags=re.IGNORECASE)

    # avoid shadowing the builtin `type`
    if re.search(r'^[a-zA-Z\-\.]+/\d{7}$', ref):
        ref_type = 'old-style eprint'
    elif re.search(r'^\d{7}$', ref):
        # Bare 7-digit number: assume the hep-th archive, matching the
        # script's historical behavior.
        ref_type = 'old-style eprint'
        ref = 'hep-th/' + ref
    elif re.search(r'^\d{4}\.\d{4,5}(v\d+)?$', ref):
        ref_type = 'new-style eprint'
    else:
        ref_type = 'not arXiv'

    return ref_type, ref
def downloadPDF(ref, type, downloadPath):
    """Download the PDF for *ref* from arXiv into *downloadPath*.

    The saved file is named after the reference with any '/' replaced by '-'
    (so old-style IDs like hep-th/9711200 become hep-th-9711200.pdf).
    Errors are reported on stdout rather than raised.
    """
    target_dir = os.path.expanduser(downloadPath)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # New-style IDs contain no '/', so the replace is a no-op for them.
    filename = ref.replace('/', '-') + '.pdf'
    try:
        urllib.request.urlretrieve(
            'https://arxiv.org/pdf/' + ref,
            os.path.join(target_dir, filename),
        )
    except Exception as e:
        print(f"Error downloading PDF: {e}")
def downloadPS(ref, type, downloadPath):
    """Download the gzipped PostScript for *ref* and unpack it to <name>.ps.

    The compressed download is removed after decompression. Errors are
    reported on stdout rather than raised.
    """
    target_dir = os.path.expanduser(downloadPath)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # New-style IDs contain no '/', so the replace is a no-op for them.
    base = os.path.join(target_dir, ref.replace('/', '-'))
    try:
        urllib.request.urlretrieve('https://arxiv.org/ps/' + ref, base)
        # arXiv serves PS gzip-compressed; decompress to a .ps file.
        with gzip.open(base, 'rb') as packed, open(base + ".ps", 'wb') as out:
            out.write(packed.read())
        os.remove(base)
    except Exception as e:
        print(f"Error downloading PS: {e}")
def downloadSource(ref, type, downloadPath):
    """Download the gzipped source for *ref* and unpack it in *downloadPath*.

    The e-print is fetched to a temporary ".dum" file, decompressed to the
    final name, and the temporary file is removed. Errors are reported on
    stdout rather than raised.
    """
    target_dir = os.path.expanduser(downloadPath)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # New-style IDs contain no '/', so the replace is a no-op for them.
    dest = os.path.join(target_dir, ref.replace('/', '-'))
    tmp = dest + ".dum"
    try:
        urllib.request.urlretrieve('https://arxiv.org/e-print/' + ref, tmp)
        with gzip.open(tmp, 'rb') as packed, open(dest, 'wb') as out:
            out.write(packed.read())
        os.remove(tmp)
    except Exception as e:
        print(f"Error downloading source: {e}")
def getTitle(html):
    """Extract the paper title from an arXiv abstract-page HTML string.

    Tries the current page layout (h1 with class "title mathjax") first and
    falls back to a ">Title:</span>" marker. HTML tags and a leading
    "Title:" label are stripped from the result.
    """
    # The original chained the two extractions with `or`, but
    # split(marker, 1)[-1] returns non-empty text even when the marker is
    # absent, so the fallback was unreachable. Check for the marker
    # explicitly instead.
    if 'class="title mathjax">' in html:
        section = html.split('class="title mathjax">', 1)[1].split('</h1>', 1)[0]
    else:
        section = html.split('>Title:</span>', 1)[-1].split('</h1>', 1)[0]
    title = re.sub(r'<[^>]*>', '', section)
    title = re.sub(r'^Title:\s*', '', title)
    return title.strip()
def getAuthors(html):
    """Extract the author list from an arXiv abstract-page HTML string.

    Returns the comma-separated author names with markup removed, or
    "Authors not found" when the authors marker is absent (the original
    sliced from find() == -1 and returned garbage in that case).
    """
    start = html.find(">Authors:</span>")
    if start == -1:
        return "Authors not found"
    authors = html[start:]
    # Skip past the first attribute-closing '">' to reach the name text.
    authors = authors[authors.find('">') + 2:]
    authors = authors[:authors.find("</div>")]
    authors = re.sub('<[^>]*>', '', authors)
    return authors.replace("\n", "").strip()
def getAbstract(html):
    """Extract the abstract from an arXiv abstract-page HTML string.

    Returns the abstract as a single line with tags stripped and HTML
    entities decoded, or "Abstract not found" when the marker is absent.
    """
    marker = "Abstract:</span>"
    idx = html.find(marker)
    if idx == -1:
        return "Abstract not found"
    from html import unescape
    body = html[idx + len(marker):]
    body = body[:body.find("</blockquote>")]
    text = unescape(re.sub(r'<[^>]+>', '', body))
    # Collapse the page's hard line breaks into spaces.
    return text.replace('\n', ' ').strip()
def getComments(html):
    """Extract the Comments field from an arXiv abstract-page HTML string.

    Tries several page layouts in order (span markup, citation meta tag,
    table cell, then a loose text match) and returns the first non-empty
    hit with tags stripped, or "no comments".
    """
    extractors = (
        r'Comments:</span>\s*<span[^>]*>(.*?)</span>',
        r'citation_arxiv_comment"[^>]*content="([^"]*)"',
        r'<td[^>]*>\s*Comments:\s*</td>\s*<td[^>]*>(.*?)</td>',
        r'Comments:\s*([^<>\n]{5,300}?)<',
    )
    for rx in extractors:
        hit = re.search(rx, html, re.DOTALL)
        if not hit:
            continue
        text = re.sub(r'<[^>]+>', '', hit.group(1)).strip()
        if text:
            return text
    return "no comments"
def getJref(html):
    """Extract the journal reference from an arXiv abstract-page HTML string.

    Returns the text of the jref table cell, or "no journal reference"
    when the page carries no jref marker.
    """
    if "jref" not in html:
        return "no journal reference"
    tail = html[html.find('jref">') + 6:]
    return tail[:tail.find("</td>")].strip()
if __name__ == "__main__":
    # Display flags (metadata sections to print).
    show_title = 0
    show_author = 0
    show_abstract = 0
    show_comments = 0
    show_jref = 0
    # Download flags.
    want_pdf = 0
    want_ps = 0
    want_source = 0

    try:
        options, arguments = getopt.gnu_getopt(sys.argv[1:],
                                               'hatbcjdpsv', ['help'])
    except getopt.error as err:
        print(f'error: {err}; try \'arxiv.py -h\' for more information')
        sys.exit(0)

    for opt, _val in options:
        if opt in ('-h', '--help'):
            print(__doc__)
            sys.exit(0)
        elif opt == '-a':
            show_author = 1
        elif opt == '-t':
            show_title = 1
        elif opt == '-b':
            show_abstract = 1
        elif opt == '-c':
            show_comments = 1
        elif opt == '-j':
            show_jref = 1
        elif opt == '-d':
            want_pdf = 1
        elif opt == '-p':
            want_ps = 1
        elif opt == '-s':
            want_source = 1

    # No options at all: print every metadata section (download nothing).
    if not options:
        show_author = 1
        show_title = 1
        show_abstract = 1
        show_comments = 1
        show_jref = 1

    if len(arguments) != 1:
        print('you didn\'t specify an arXiv reference; try \'arxiv.py -h\' for more information')
        sys.exit(0)

    ref_type, ref = findRefType(arguments[0])
    print(f"Reference {ref} is of type {ref_type}")

    if ref_type == "not arXiv":
        print("type not of arXiv form")
        sys.exit(0)

    # Fetch the abstract page only when some metadata was requested.
    if show_author + show_title + show_abstract + show_comments + show_jref > 0:
        try:
            with urllib.request.urlopen('https://arxiv.org/abs/' + ref) as page:
                html = page.read().decode('utf-8')
        except Exception as e:
            print(f"Error fetching arXiv page: {e}")
            sys.exit(1)

        if show_title:
            print("\nTitle:", getTitle(html))
        if show_author:
            print("\nAuthors:", getAuthors(html))
        if show_abstract:
            print("\nAbstract:", getAbstract(html))
        if show_comments:
            print("\nComments:", getComments(html))
        if show_jref:
            print("\nJournal reference:", getJref(html))

    if want_pdf:
        downloadPDF(ref, ref_type, "./")
    if want_ps:
        downloadPS(ref, ref_type, "./")
    if want_source:
        downloadSource(ref, ref_type, "./")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment