Created
November 28, 2017 15:01
-
-
Save yingziwu/db721e156f9486e0d3a99bcb0e9257b8 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
import requests | |
from six.moves.urllib.parse import urljoin | |
def capture(
    target_url,
    user_agent="savepagenow (https://github.com/pastpages/savepagenow)",
    accept_cache=False,
    proxies=None
):
    """
    Archive the provided URL using archive.org's Wayback Machine.

    Parameters
    ----------
    target_url : str
        The URL to archive.
    user_agent : str
        User-Agent header sent with the capture request.
    accept_cache : bool
        If True, silently accept a cached snapshot instead of raising
        ``CachedPage``.
    proxies : dict or None
        Optional requests-style proxies mapping.

    Returns
    -------
    str
        The archive.org URL where the capture is stored.

    Raises
    ------
    CachedPage
        If archive.org declines to conduct a new capture and returns a
        previous snapshot instead (and ``accept_cache`` is False).
    BlockedByRobots
        If archive.org reports the target site blocks it via robots.txt.
    """
    # Put together the URL that will save our request. The target URL is
    # appended verbatim (not urljoin'ed) so its own path/query survive.
    domain = "https://web.archive.org"
    save_url = urljoin(domain, "/save/")
    request_url = save_url + target_url

    # Send the capture request to archive.org.
    # NOTE: the original wrapped this call in a try/except that merely
    # re-raised the exception; requests errors now propagate directly.
    headers = {
        'User-Agent': user_agent,
    }
    response = requests.get(request_url, headers=headers, proxies=proxies)

    if response.status_code == 403:
        # archive.org signals robots.txt blocking via a custom header.
        if response.headers.get('X-Archive-Wayback-Runtime-Error') == 'RobotAccessControlException: Blocked By Robots':
            raise BlockedByRobots("archive.org returned blocked by robots.txt error")

    # The snapshot's location is reported in the Content-Location header.
    archive_id = response.headers['Content-Location']
    archive_url = urljoin(domain, archive_id)

    # X-Page-Cache: HIT means archive.org served a recent snapshot instead
    # of making a fresh capture. Use .get() because the header may be
    # absent entirely (the original indexed it directly and could KeyError).
    cached = response.headers.get('X-Page-Cache') == 'HIT'
    if cached and not accept_cache:
        raise CachedPage("archive.org returned a cached version of this page: {}".format(
            archive_url
        ))

    return archive_url
def capture_or_cache(
    target_url,
    user_agent="savepagenow (https://github.com/pastpages/savepagenow)"
):
    """
    Archive *target_url* via the Wayback Machine, tolerating cached results.

    Returns a ``(archive_url, fresh)`` tuple: *fresh* is True when a brand
    new capture was conducted, and False when archive.org served a recently
    cached snapshot (likely taken within the previous minutes) instead.
    """
    try:
        # First attempt: demand a fresh capture.
        archive_url = capture(target_url, user_agent=user_agent, accept_cache=False)
    except CachedPage:
        # archive.org refused a new capture; accept the cached snapshot.
        return capture(target_url, user_agent=user_agent, accept_cache=True), False
    return archive_url, True
class CachedPage(Exception):
    """
    Raised when archive.org declines to conduct a new capture and serves
    the cached version of the most recent archive instead.
    """
class BlockedByRobots(Exception):
    """
    Raised when archive.org reports that the target site's robots.txt
    access-control instructions block archiving.
    """
def Topic_status_test(topic_id,
                      user_agent = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.109 Safari/537.36",
                      proxies=None):
    """
    Test the v2ex topic status.

    Parameters
    ----------
    topic_id : int or str
        The numeric id of the v2ex topic.
    user_agent : str
        User-Agent header sent with the request.
    proxies : dict or None
        Optional requests-style proxies mapping.

    Returns
    -------
    int
        403: Forbidden
        404: Not Found
        302: Redirect to index page
        401: Unauthorized (redirected to sign-in)
        200: OK

    Raises
    ------
    requests.exceptions.RequestException
        If the HTTP request itself fails.
    """
    base_url = "https://www.v2ex.com/t/"
    request_url = urljoin(base_url, str(topic_id))
    headers = {
        'User-Agent': user_agent,
    }
    # NOTE: the original wrapped this call in a try/except that merely
    # re-raised the exception; requests errors now propagate directly.
    response = requests.get(request_url, headers=headers, proxies=proxies)

    if response.status_code == 403:
        return 403
    # v2ex serves a "404 Topic Not Found" page for deleted/missing topics.
    if response.status_code == 404 and '404 Topic Not Found' in response.text:
        return 404
    # requests follows redirects; response.url is the final URL, so a
    # redirect to the home page or the sign-in page shows up here.
    if response.url == 'https://www.v2ex.com/':
        return 302
    if 'signin' in response.url:
        return 401
    return 200
class TopicStatusError(Exception):
    """
    Raised when a v2ex topic's status prevents it from being archived.
    """
def Save_topic(topic_id,
               proxies_archive=None,
               proxies_v2ex=None):
    """
    Save the v2ex topic to the Internet Archive (https://archive.org/).

    Parameters
    ----------
    topic_id : int or str
        The numeric id of the v2ex topic.
    proxies_archive : dict or None
        Optional proxies mapping used for the archive.org request.
    proxies_v2ex : dict or None
        Optional proxies mapping used for the www.v2ex.com request.

    Returns
    -------
    str
        The archive.org URL where the capture is stored. (The original
        implementation returned the CachedPage exception *object* when a
        cached snapshot was served; callers expecting a URL string —
        per the documented usage `print(archive_url)` — got an exception
        instance instead. Now the cached snapshot's URL is returned.)

    Raises
    ------
    TopicStatusError
        If the topic's status is anything other than 200.
    """
    base_url = "https://www.v2ex.com/t/"
    request_url = urljoin(base_url, str(topic_id))

    # Only publicly readable (status 200) topics can be archived.
    topic_status = Topic_status_test(topic_id, proxies=proxies_v2ex)
    if topic_status != 200:
        raise TopicStatusError("The status of topic %s is %d. This topic can't be archived." % (str(topic_id), topic_status))

    try:
        return capture(request_url, proxies=proxies_archive)
    except CachedPage:
        # archive.org refused a fresh capture; retry accepting the cache
        # so the caller still receives a valid archive URL string.
        return capture(request_url, proxies=proxies_archive, accept_cache=True)
if __name__ == '__main__':
    # Usage:
    # 1. Command line
    #    If https://archive.org/ is directly reachable, run:
    #        python3 save_v2ex_topic_to_internet_archive.py [topic_id]
    #    e.g. python3 save_v2ex_topic_to_internet_archive.py 410301
    #
    #    If https://archive.org/ is NOT directly reachable, set a proxy
    #    with the "--proxies_archive" / "--proxies_v2ex" options, e.g.:
    #        python3 save_v2ex_topic_to_internet_archive.py 410224 --proxies_archive "socks5://127.0.0.1:1080"
    #
    # 2. As a module
    #    import save_v2ex_topic_to_internet_archive as v2ex_archive
    #    archive_url = v2ex_archive.Save_topic(topic_id,proxies_archive,proxies_v2ex)
    #    print(archive_url)
    import click
    @click.command()
    @click.argument("topic_id")
    @click.option("--proxies_archive",help="proxy for the archive.org request, eg. socks5://127.0.0.1:1080")
    @click.option("--proxies_v2ex",help="proxy for the www.v2ex.com request, eg. socks5://127.0.0.1:1080")
    def cli(topic_id, proxies_archive, proxies_v2ex):
        # Expand each single proxy URL into the requests-style mapping
        # covering both http and https.
        kwargs = {}
        if proxies_archive:
            proxies_archive = {"http":proxies_archive,"https":proxies_archive}
            kwargs['proxies_archive'] = proxies_archive
        if proxies_v2ex:
            proxies_v2ex = {"http":proxies_v2ex,"https":proxies_v2ex}
            kwargs['proxies_v2ex'] = proxies_v2ex
        archive_url = Save_topic(topic_id, **kwargs)
        click.echo(archive_url)
    cli()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment