#http://www.diveintopython.net/html_processing/extracting_data.html
#https://docs.python.org/2/library/robotparser.html
import robotparser
import urllib
import csv
from urlparse import urlparse
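# Note: this script targets Python 2. In Python 3, urllib.urlopen, urlparse,
# robotparser and raw_input moved to urllib.request.urlopen, urllib.parse,
# urllib.robotparser and input respectively.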
def get_page(url):
    # Fetch the raw HTML for a URL. There is no error handling: a bad or
    # unreachable URL will raise an exception.
    sock = urllib.urlopen(url)
    htmlSource = sock.read()
    sock.close()
    return htmlSource
#https://www.udacity.com/course/viewer#!/c-cs101/l-48727569/e-48718374/m-48719196
def get_next_target(page):
    # Find the next href attribute and return its double-quoted URL along
    # with the position just past the closing quote.
    start_link = page.find('href=')
    if start_link == -1:
        return None, 0
    start_quote = page.find('"', start_link)
    end_quote = page.find('"', start_quote + 1)
    url = page[start_quote + 1:end_quote]
    return url, end_quote
def union(p, q):
    # In-place union: append to p any element of q not already in p.
    for e in q:
        if e not in p:
            p.append(e)
def get_all_links(page):
    # Collect every href target on the page, and match dictionary terms
    # against the full page text rather than only the tail that remains
    # after the last link has been parsed off.
    terms = extract_dictionary_terms(page)
    links = []
    while True:
        url, endpos = get_next_target(page)
        if url:
            links.append(url)
            page = page[endpos:]
        else:
            break
    return links, terms
def get_title(page):
    start_title = page.find('<title>')
    if start_title == -1:
        return 'None'
    end_title = page.find('</title>')
    title = page[start_title + 7:end_title]
    return title
def crawl_web(seed, max_depth, max_pages):
    tocrawl = [seed]
    crawled = []
    next_depth = []
    title = []
    depth = 0
    while tocrawl and depth <= max_depth:
        page_url = tocrawl.pop()
        url_split = urlparse(page_url)
        # robots.txt lives at the root of the host; keep the scheme so
        # RobotFileParser is given a complete URL.
        rp = robotparser.RobotFileParser()
        rp.set_url(url_split.scheme + "://" + url_split.netloc + "/robots.txt")
        rp.read()
        if (page_url not in crawled) and rp.can_fetch("*", page_url):
            links, terms = get_all_links(get_page(page_url))
            union(next_depth, links)
            while len(tocrawl) > max_pages:
                tocrawl.pop()
            #union(title, get_title(get_page(page_url)))
            write_to_file(page_url, terms)
            crawled.append(page_url)
        if not tocrawl:
            tocrawl, next_depth = next_depth, []
            depth = depth + 1
def extract_dictionary_terms(page):
    # Return the dictionary terms that appear anywhere in the page text.
    found = []
    for term in import_dictionary():
        if page.find(term) >= 0:
            found.append(term)
    return found
def import_dictionary():
    # One term per line; strip trailing newlines so matching works on bare terms.
    with open("dictionary.txt", "r") as dictionaryfile:
        dictionary = [line.strip() for line in dictionaryfile]
    return dictionary
def write_to_file(url, terms):
    # Append one CSV row per crawled page: the URL followed by its matched
    # terms. Append mode keeps earlier rows; the Python 2 csv module expects
    # the file to be opened in binary mode.
    with open("crawled_file.csv", "ab") as crawled_file:
        wr = csv.writer(crawled_file, quoting=csv.QUOTE_ALL)
        wr.writerow([url] + terms)
    return
#print "Hello, welcome to my web crawler." | |
seed = raw_input("Please enter the seed website URL: ") | |
#tocrawl, seed = | |
crawl_web(seed,500,500) | |
#while tocrawl > 0: | |
# crawl_web(seed,4,10) |
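
# Usage sketch (assumptions, not stated in the original gist): the crawler
# expects a plain-text dictionary.txt next to this script, one term per line,
# for example
#
#   python
#   crawler
#   robots
#
# and writes the matched terms for each URL to crawled_file.csv. The seed
# should be a full URL including its scheme, e.g. http://www.example.com/,
# so the robots.txt lookup can build a valid address from urlparse().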