Created
September 1, 2017 15:30
-
-
Save cmoscardi/0cb3cd94ecc17a622967a43a0371c70c to your computer and use it in GitHub Desktop.
Link Checking Code -- comments below.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Check every web link concurrently, retrying failures until each link has
# either answered with a 2xx status or used up self.TRIES attempts.
#
# web_links maps URL -> number of failed attempts so far. A link that
# succeeds is deleted from the mapping, so whatever is left once the loop
# exits is the set of broken links reported by the final assert.
#
# NOTE: this fragment is Python 2 code (``print >>``, ``dict.iteritems``,
# ``Thread._Thread__stop``).
session = FuturesSession(max_workers=8)
# NOTE(review): transport-level retries are mounted for http:// only;
# https:// links go through the session's default adapter -- confirm intended.
session.mount("http://", requests.adapters.HTTPAdapter(max_retries=5))
msg = "%s: testing %d web and %d local links" % (nbname, len(web_links), len(local_links))
print >> sys.stderr, msg
# Testing web links
while True:
    # Start one request per link that still has attempts left.
    futures = {link: session.get(link, verify=False, stream=True,
                                 headers={'User-Agent': self.USER_AGENT}, timeout=10.0)
               for link, count in web_links.items()
               if count < self.TRIES}
    if not futures:
        # Every link has either succeeded (and been deleted) or run out of tries.
        break
    for url, future in futures.iteritems():
        try:
            r = future.result(timeout=20)
        except requests.RequestException:
            # RequestException is the base of ConnectionError *and* of read
            # timeouts, redirect loops, invalid URLs, etc. Catching only
            # ConnectionError let a single slow server (ReadTimeout from
            # timeout=10.0) abort the whole check instead of counting as a
            # failed attempt.
            web_links[url] += 1
        except TimeoutError:
            # The future itself did not finish within 20 seconds.
            # NOTE(review): presumably concurrent.futures.TimeoutError imported
            # elsewhere in the file -- it is not a Python 2 builtin.
            future.cancel()
            web_links[url] += 1
        else:
            succeeded = 200 <= r.status_code < 300  # 2xx
            # With stream=True the connection stays checked out until the
            # body is consumed or the response is closed; only the status
            # code is needed, so release it right away.
            r.close()
            if succeeded:
                del web_links[url]
            else:
                web_links[url] += 1
# some weird DNS error causes threads to
# freak out and not die no matter what
# so we kill any unstopped threads manually
# (relies on private Python 2 internals of the executor and of Thread)
for thread in session.executor._threads:
    thread._Thread__stop()
msg = "Web links failed {nb}: {links}".format(nb=nbname, links=", ".join(web_links))
assert web_links == {}, msg
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is messy, but is the worst part of the whole process.
- `web_links` - set it up as a `defaultdict(int)` with the URLs as keys. So it'd look like this: `{"http://example.com": 0}` (each URL mapped to its count of failed attempts so far).
- `FuturesSession` comes from requests-futures.