Last active
February 15, 2024 17:00
-
-
Save rmax/086a8c330b250babc4224db7220a644e to your computer and use it in GitHub Desktop.
Using twisted deferreds in a scrapy spider!
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
$ scrapy runspider txspider.py | |
2016-07-05 23:11:39 [scrapy] INFO: Scrapy 1.1.0 started (bot: scrapybot) | |
2016-07-05 23:11:39 [scrapy] INFO: Overridden settings: {} | |
2016-07-05 23:11:40 [scrapy] INFO: Enabled extensions: | |
['scrapy.extensions.corestats.CoreStats', 'scrapy.extensions.logstats.LogStats'] | |
2016-07-05 23:11:40 [scrapy] INFO: Enabled downloader middlewares: | |
['scrapy.downloadermiddlewares.httpauth.HttpAuthMiddleware', | |
'scrapy.downloadermiddlewares.downloadtimeout.DownloadTimeoutMiddleware', | |
'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware', | |
'scrapy.downloadermiddlewares.retry.RetryMiddleware', | |
'scrapy.downloadermiddlewares.defaultheaders.DefaultHeadersMiddleware', | |
'scrapy.downloadermiddlewares.redirect.MetaRefreshMiddleware', | |
'scrapy.downloadermiddlewares.httpcompression.HttpCompressionMiddleware', | |
'scrapy.downloadermiddlewares.redirect.RedirectMiddleware', | |
'scrapy.downloadermiddlewares.cookies.CookiesMiddleware', | |
'scrapy.downloadermiddlewares.chunked.ChunkedTransferMiddleware', | |
'scrapy.downloadermiddlewares.stats.DownloaderStats'] | |
2016-07-05 23:11:40 [scrapy] INFO: Enabled spider middlewares: | |
['scrapy.spidermiddlewares.httperror.HttpErrorMiddleware', | |
'scrapy.spidermiddlewares.offsite.OffsiteMiddleware', | |
'scrapy.spidermiddlewares.referer.RefererMiddleware', | |
'scrapy.spidermiddlewares.urllength.UrlLengthMiddleware', | |
'scrapy.spidermiddlewares.depth.DepthMiddleware'] | |
2016-07-05 23:11:40 [scrapy] INFO: Enabled item pipelines: | |
[] | |
2016-07-05 23:11:40 [scrapy] INFO: Spider opened | |
2016-07-05 23:11:40 [scrapy] INFO: Crawled 0 pages (at 0 pages/min), scraped 0 items (at 0 items/min) | |
2016-07-05 23:11:40 [scrapy] DEBUG: Crawled (200) <GET http://httpbin.org/get> (referer: None) | |
2016-07-05 23:11:40 [txspider] DEBUG: Logging from a thread 0! ~~~ (: | |
2016-07-05 23:11:40 [txspider] DEBUG: Sleeping for 5 secs | |
2016-07-05 23:11:40 [txspider] DEBUG: Logging from a thread 1! ~~~ (: | |
2016-07-05 23:11:45 [scrapy] DEBUG: Scraped from <200 http://httpbin.org/get> | |
{'thread': 0} | |
2016-07-05 23:11:45 [scrapy] DEBUG: Scraped from <200 http://httpbin.org/get> | |
{'thread': 1} | |
2016-07-05 23:11:45 [scrapy] DEBUG: Scraped from <200 http://httpbin.org/get> | |
{'inline': None} | |
2016-07-05 23:11:45 [scrapy] INFO: Closing spider (finished) | |
2016-07-05 23:11:45 [scrapy] INFO: Dumping Scrapy stats: | |
{'downloader/request_bytes': 212, | |
'downloader/request_count': 1, | |
'downloader/request_method_count/GET': 1, | |
'downloader/response_bytes': 513, | |
'downloader/response_count': 1, | |
'downloader/response_status_count/200': 1, | |
'finish_reason': 'finished', | |
'finish_time': datetime.datetime(2016, 7, 6, 2, 11, 45, 697663), | |
'item_scraped_count': 3, | |
'log_count/DEBUG': 7, | |
'log_count/INFO': 7, | |
'response_received_count': 1, | |
'scheduler/dequeued': 1, | |
'scheduler/dequeued/memory': 1, | |
'scheduler/enqueued': 1, | |
'scheduler/enqueued/memory': 1, | |
'start_time': datetime.datetime(2016, 7, 6, 2, 11, 40, 159804)} | |
2016-07-05 23:11:45 [scrapy] INFO: Spider closed (finished) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import scrapy | |
from twisted.internet import defer, threads, reactor | |
class TXSpider(scrapy.Spider):
    """Demo spider showing that a Scrapy request callback may return a
    single Twisted deferred whose result complies with what Scrapy
    expects (an item, a request, or None).
    """

    name = 'txspider'
    start_urls = [
        'http://httpbin.org/get',
    ]

    def parse(self, response, n_threads=2):
        """Fan out work to ``n_threads`` reactor threads plus one
        inlineCallbacks-based coroutine, gathering every result.
        """
        pending = [
            threads.deferToThread(self._process, idx)
            for idx in range(n_threads)
        ]
        # This method is decorated with defer.inlineCallbacks!
        pending.append(self._inline_func(response))
        combined = defer.DeferredList(pending)
        combined.addCallback(self._make_output)
        # You must return a single deferred in a request callback, and
        # the output must comply with what scrapy expects, that is,
        # either an item, request or None.
        return combined

    def _process(self, n):
        # Runs in a reactor thread pool thread via deferToThread.
        self.logger.debug("Logging from a thread %s! ~~~ (:", n)
        return {
            'thread': n,
        }

    @defer.inlineCallbacks
    def _inline_func(self, response, sleep=5):
        # You can decorate a request callback with inlineCallbacks.
        self.logger.debug("Sleeping for %s secs", sleep)
        wakeup = defer.Deferred()
        # Fire the deferred after ``sleep`` seconds without blocking
        # the reactor.
        reactor.callLater(sleep, wakeup.callback, None)
        fired_value = yield wakeup
        defer.returnValue({
            'inline': fired_value,
        })

    def _make_output(self, results):
        # DeferredList fires with (success, value) pairs; keep only
        # truthy values from the deferreds that succeeded.
        return [value for succeeded, value in results if succeeded and value]
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment