Created
March 7, 2018 22:04
-
-
Save warborn/be67af3438e328c30c233583bd3ff237 to your computer and use it in GitHub Desktop.
Web crawler that repeatedly follows the first link in each article
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import time
import urllib
import urllib.parse

import requests
from bs4 import BeautifulSoup
# Article the crawl starts from (Wikipedia's Special:Random URL).
start_url = "https://en.wikipedia.org/wiki/Special:Random"
# Article whose URL ends the crawl successfully.
target_url = "https://en.wikipedia.org/wiki/Philosophy"
def continue_crawl(search_history, target_url, max_steps=25):
    """Decide whether the crawl should visit another article.

    Args:
        search_history: URLs visited so far, oldest first. The last entry is
            the article the crawl is currently on; a ``None`` entry marks an
            article in which no link could be found.
        target_url: URL that ends the crawl successfully when reached.
        max_steps: Longest history that is still allowed to continue.

    Returns:
        ``True`` to keep crawling, ``False`` once any stop condition is met.
        A short explanation is printed whenever ``False`` is returned.
    """
    # Nothing has been visited yet, so there is no current article to inspect.
    if not search_history:
        print("The search history is empty, there is nothing to crawl!")
        return False

    current_url = search_history[-1]

    # A missing URL means the previous article was a dead end; continuing
    # would only try to download a non-existent address.
    if current_url is None:
        print("We've arrived at an article with no links, aborting search!")
        return False

    if current_url == target_url:
        print("We've found the target article!")
        return False

    if len(search_history) > max_steps:
        print("The search has gone on suspiciously long, aborting search!")
        return False

    # Seeing the current article earlier in the chain means we are in a loop.
    if current_url in search_history[:-1]:
        print("We've arrived at an article we've already seen, aborting search!")
        print("The article was %s" % current_url)
        return False

    return True
def find_first_link(url):
    """Return the absolute URL of the first link in an article's paragraphs.

    The page at *url* is downloaded and parsed. Each ``<p>`` that is a direct
    child of the ``mw-parser-output`` element inside ``#mw-content-text`` is
    checked in order for an ``<a>`` tag that is a direct child of that
    paragraph; the search stops at the first paragraph that has one.

    Args:
        url: Address of the article to download.

    Returns:
        The link's ``href`` resolved against ``https://en.wikipedia.org/``,
        or ``None`` when the expected containers are missing, no paragraph
        has a direct link, or the link has no ``href``.
    """
    # A timeout keeps one unresponsive request from stalling the crawl forever.
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.text, 'html.parser')

    # Either lookup can come back empty on a page without the expected
    # layout; treat that the same as "no link found" instead of crashing.
    body = soup.find(id="mw-content-text")
    if body is None:
        return None
    content_div = body.find(class_="mw-parser-output")
    if content_div is None:
        return None

    article_link = None
    # recursive=False restricts both searches to direct children, so links
    # nested inside other tags within a paragraph are not considered.
    for paragraph in content_div.find_all('p', recursive=False):
        link = paragraph.find('a', recursive=False)
        if link is not None:
            article_link = link.get('href')
            break

    if not article_link:
        return None

    # hrefs may be relative (e.g. "/wiki/..."); resolve them to a full URL.
    return urllib.parse.urljoin('https://en.wikipedia.org/', article_link)
# Ordered list of every article URL visited so far, starting at start_url.
article_chain = [start_url]

while continue_crawl(article_chain, target_url):
    print(article_chain[-1])

    # Download the last article in the chain and find the first link in it.
    first_link = find_first_link(article_chain[-1])
    if not first_link:
        # Appending a missing link would make the next download fail, so a
        # dead-end article ends the crawl here instead.
        print("We've arrived at an article with no links, aborting search!")
        break

    # Add the first link to the chain so it becomes the next article visited.
    article_chain.append(first_link)

    # Delay for about two seconds so requests are not sent back to back.
    time.sleep(2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment