juanriaza · November 2, 2015 14:03
diff --git a/myspider.py b/myspider.py
 import json
 import scrapy
 import urllib


 class ExampleSpider(scrapy.Spider):
    """Spider that pairs each post from the start page with a downloaded image.

    Callback chain: ``parse`` (post links) -> ``habrapost`` (url and title)
    -> ``image_search`` (first search hit) -> ``image`` (save file, emit
    the post dict).  The post dict is carried between callbacks in
    ``Request.meta['post']``.
    """

    name = 'habrahabr.ru'
    start_urls = ['http://habrahabr.ru/']

    def parse(self, response):
        """Yield one request per post link found on the listing page."""
        hrefs = response.xpath('//h1[@class="title"]'
                               '/a[@class="post_title"]/@href').extract()
        for href in hrefs:
            # urljoin leaves absolute URLs untouched and resolves relative
            # ones, which scrapy.Request would otherwise reject.
            yield scrapy.Request(response.urljoin(href),
                                 callback=self.habrapost)

    def habrapost(self, response):
        """Read the post's title and request an image search for it.

        Yields nothing when the page has no matching title element.
        """
        title = response.xpath(
            '//h1/span[@class="post_title"]//text()').extract_first()
        if title is None:
            # extract_first() returns None when the XPath matches nothing;
            # calling .encode() on it would raise AttributeError.
            self.logger.warning('No post title found on %s', response.url)
            return

        post = {
            'url': response.url,
            'title': title,
        }

        # quote_plus lives in urllib on Python 2 and in urllib.parse on
        # Python 3; both accept the UTF-8 encoded title.
        try:
            from urllib.parse import quote_plus
        except ImportError:
            from urllib import quote_plus

        query = quote_plus(title.encode('utf-8'))
        # NOTE(review): this is the legacy Google AJAX Search endpoint -
        # confirm it is still reachable.
        search_url = ('https://ajax.googleapis.com/ajax/services/search/'
                      'images?v=1.0&q={}'.format(query))

        yield scrapy.Request(
            search_url,
            meta={'post': post},
            callback=self.image_search)

    def image_search(self, response):
        """Request the first image listed in the search response.

        Reads ``responseData.results[0].url`` from the JSON body.  Yields
        nothing when the response carries no results.
        """
        json_data = json.loads(response.body)
        # Guard against a missing/null responseData or an empty result list
        # instead of failing with TypeError/IndexError.
        results = (json_data.get('responseData') or {}).get('results') or []
        if not results:
            self.logger.warning('No image results in %s', response.url)
            return

        yield scrapy.Request(
            results[0]['url'],
            meta={'post': response.meta['post']},
            callback=self.image)

    def image(self, response):
        """Write the image body to ``images/<name>`` and yield the post.

        ``<name>`` is the last ``/``-separated segment of the response URL;
        the saved path is stored under the post's ``'image'`` key.
        """
        import os

        filename = response.url.split('/')[-1]
        if not filename:
            # A URL ending in '/' gives an empty name, i.e. the directory
            # itself, which cannot be opened for writing.
            self.logger.warning('No file name in image URL %s', response.url)
            return

        # open() does not create missing parent directories.
        if not os.path.isdir('images'):
            os.makedirs('images')

        path = 'images/%s' % filename
        with open(path, 'wb') as f:
            f.write(response.body)

        post = response.meta['post']
        post['image'] = path
        yield post
	import json
	import scrapy
	import urllib


	class ExampleSpider(scrapy.Spider):
	    """Spider that pairs each post from the start page with a downloaded image.

	    Callback chain: ``parse`` (post links) -> ``habrapost`` (url and title)
	    -> ``image_search`` (first search hit) -> ``image`` (save file, emit
	    the post dict).  The post dict is carried between callbacks in
	    ``Request.meta['post']``.
	    """

	    name = 'habrahabr.ru'
	    start_urls = ['http://habrahabr.ru/']

	    def parse(self, response):
	        """Yield one request per post link found on the listing page."""
	        hrefs = response.xpath('//h1[@class="title"]'
	                               '/a[@class="post_title"]/@href').extract()
	        for href in hrefs:
	            # urljoin leaves absolute URLs untouched and resolves relative
	            # ones, which scrapy.Request would otherwise reject.
	            yield scrapy.Request(response.urljoin(href),
	                                 callback=self.habrapost)

	    def habrapost(self, response):
	        """Read the post's title and request an image search for it.

	        Yields nothing when the page has no matching title element.
	        """
	        title = response.xpath(
	            '//h1/span[@class="post_title"]//text()').extract_first()
	        if title is None:
	            # extract_first() returns None when the XPath matches nothing;
	            # calling .encode() on it would raise AttributeError.
	            self.logger.warning('No post title found on %s', response.url)
	            return

	        post = {
	            'url': response.url,
	            'title': title,
	        }

	        # quote_plus lives in urllib on Python 2 and in urllib.parse on
	        # Python 3; both accept the UTF-8 encoded title.
	        try:
	            from urllib.parse import quote_plus
	        except ImportError:
	            from urllib import quote_plus

	        query = quote_plus(title.encode('utf-8'))
	        # NOTE(review): this is the legacy Google AJAX Search endpoint -
	        # confirm it is still reachable.
	        search_url = ('https://ajax.googleapis.com/ajax/services/search/'
	                      'images?v=1.0&q={}'.format(query))

	        yield scrapy.Request(
	            search_url,
	            meta={'post': post},
	            callback=self.image_search)

	    def image_search(self, response):
	        """Request the first image listed in the search response.

	        Reads ``responseData.results[0].url`` from the JSON body.  Yields
	        nothing when the response carries no results.
	        """
	        json_data = json.loads(response.body)
	        # Guard against a missing/null responseData or an empty result list
	        # instead of failing with TypeError/IndexError.
	        results = (json_data.get('responseData') or {}).get('results') or []
	        if not results:
	            self.logger.warning('No image results in %s', response.url)
	            return

	        yield scrapy.Request(
	            results[0]['url'],
	            meta={'post': response.meta['post']},
	            callback=self.image)

	    def image(self, response):
	        """Write the image body to ``images/<name>`` and yield the post.

	        ``<name>`` is the last ``/``-separated segment of the response URL;
	        the saved path is stored under the post's ``'image'`` key.
	        """
	        import os

	        filename = response.url.split('/')[-1]
	        if not filename:
	            # A URL ending in '/' gives an empty name, i.e. the directory
	            # itself, which cannot be opened for writing.
	            self.logger.warning('No file name in image URL %s', response.url)
	            return

	        # open() does not create missing parent directories.
	        if not os.path.isdir('images'):
	            os.makedirs('images')

	        path = 'images/%s' % filename
	        with open(path, 'wb') as f:
	            f.write(response.body)

	        post = response.meta['post']
	        post['image'] = path
	        yield post