Created
July 27, 2016 15:56
-
-
Save aron-bordin/97ca4233b5a304cd1466c5322b358cc6 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#! /usr/bin/env python | |
import json

# NOTE(review): the star import presumably supplies stdin, parse_json and
# write_line used below -- confirm against utils.
from utils import *
# Number of "category" requests that have been sent but not yet answered.
# Incremented in response_parse, decremented in response_category.
pending_requests = 0
# Extracted data: maps each site title to its URL (filled by response_category).
result = {}
def response_parse(response):
    """Send one "category" selector request per link found on the start page.

    ``response`` is the message answering the request with id ``"parse"``.
    ``response['selector']['url']`` is iterated as the collection of extracted
    links; each one is appended to ``http://www.dmoz.org`` and requested with
    the ``url`` and ``title`` CSS selectors. ``pending_requests`` is
    incremented once per request sent so that ``response_category`` can detect
    when the last answer has arrived.

    NOTE(review): if the link list is empty, no request is sent and nothing in
    this file ever emits the "close" message -- confirm whether that case can
    occur.
    """
    global pending_requests
    for url in response['selector']['url']:
        # Count the request before sending it.
        pending_requests += 1
        request = {
            "type": "selector_request",
            "id": "category",
            "url": "http://www.dmoz.org%s" % url,
            "selector": {
                "url": {
                    "type": "css",
                    "filter": "div.title-and-desc a::attr('href')",
                },
                "title": {
                    "type": "css",
                    "filter": "div.title-and-desc a > div.site-title::text",
                },
            },
        }
        # Serialise with json.dumps rather than string interpolation so that
        # quotes or backslashes in the extracted link cannot break the JSON.
        write_line(json.dumps(request))
def response_category(response):
    """Store the sites of one category page; finish when none are pending.

    ``response`` is the message answering a request with id ``"category"``.
    Its ``selector['url']`` and ``selector['title']`` entries are paired with
    ``zip`` and stored in the module-level ``result`` dict as
    ``result[title] = url``. When ``pending_requests`` drops to zero, the
    collected data is written as JSON to ``outputs/dmoz_data.json`` and a
    ``{"type": "close"}`` message is sent.
    """
    global pending_requests
    # This response is no longer pending.
    pending_requests -= 1

    selector = response['selector']
    for url, title in zip(selector['url'], selector['title']):
        result[title] = url

    # If all requests have finished, we can close the spider.
    if pending_requests == 0:
        # Serialize the extracted data; the context manager guarantees the
        # file is flushed and closed before the close message is sent.
        with open('outputs/dmoz_data.json', 'w') as output:
            output.write(json.dumps(result))
        write_line('{"type": "close"}')
def main():
    """Run the dmoz spider over the stdin/stdout message channel.

    Reads the initial status message and requires its ``status`` field to be
    ``'ready'``. It then declares a spider named ``dmoz`` with no start URLs,
    sends the first selector request (id ``"parse"``) for the Python category
    page, and dispatches incoming messages until the input stream ends.

    Raises:
        Exception: if the channel is not ready, or if a message of type
            ``'exception'`` or ``'error'`` is received.
    """
    status = parse_json(stdin.readline())
    # We start by checking that the channel is ready.
    if status['status'] != 'ready':
        raise Exception("There is problem in the communication channel")

    write_line(json.dumps({
        "type": "spider",
        "name": "dmoz",
        "start_urls": [],
    }))

    write_line(json.dumps({
        "type": "selector_request",
        "id": "parse",
        "url": "http://www.dmoz.org/Computers/Programming/Languages/Python/",
        "selector": {
            "url": {
                "type": "css",
                "filter": "#subcategories-div > section > div > div.cat-item > a::attr('href')",
            },
        },
    }))

    while True:
        line = stdin.readline()
        if not line:
            # readline() returns an empty string at end of input; stop here
            # instead of handing an empty line to parse_json.
            break
        msg = parse_json(line)

        # Check the message type.
        if msg['type'] in ('exception', 'error'):
            raise Exception("Something wrong... " + str(msg))
        elif msg['type'] == 'response_selector':
            # Check the id of the incoming response and call the function
            # that extracts the data from that kind of page.
            if msg['id'] == 'parse':
                response_parse(msg)
            elif msg['id'] == 'category':
                response_category(msg)
# Run the spider only when this file is executed as a script.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment