Created
May 18, 2016 22:16
-
-
Save clemfromspace/74f322298c132ccfd1182a2608091245 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
from scrapy import Request | |
from scrapy.spider import CrawlSpider | |
from urlparse import urljoin | |
class TripAdvisorSpider(CrawlSpider):
    """Crawl the TripAdvisor France Paris restaurant listing and print reviews.

    Starting from ``start_urls``, ``parse`` follows pagination links and
    place links. ``parse_place`` prints each review whose ``<p class="entry">``
    element is present on the place page, and otherwise requests the
    expanded-review page, whose text is printed by ``parse_review``.

    NOTE(review): no ``rules`` are defined and ``parse`` is overridden, so
    this class is used like a plain spider. Scrapy's documentation warns that
    ``CrawlSpider`` relies on ``parse`` for its rule handling -- confirm the
    base class is intentional before adding rules.
    """

    name = 'tripadvisor'
    allowed_domains = ['www.tripadvisor.fr']
    start_urls = (
        'https://www.tripadvisor.fr/Restaurants-g187147-Paris_Ile_de_France.html',
    )

    # Captures the digits following "-d" in a URL. Compiled once at class
    # creation instead of on every call. A plain raw string is used because
    # the ``ur''`` prefix is a syntax error on Python 3.
    _PLACE_ID_RE = re.compile(r'-d(\d+)')

    # Template for the expanded-review endpoint; filled in with the id taken
    # from the place URL and the id of the review to expand.
    _EXPANDED_REVIEW_URL = (
        'https://www.tripadvisor.fr/ExpandedUserReviews-d%(request_id)s'
        '?target=%(review_id)s&reviews=%(review_id)s'
        '&servlet=Attraction_Review&expand=0'
    )

    def build_review_full_link(self, review_id, response):
        """Return the expanded-review URL for ``review_id``.

        Args:
            review_id: Review identifier, without the ``review_`` prefix.
            response: Response whose ``url`` contains a ``-d<digits>`` id.

        Returns:
            The URL of the expanded-review page as a string.

        Raises:
            ValueError: If ``response.url`` contains no ``-d<digits>`` id.
        """
        match = self._PLACE_ID_RE.search(response.url)
        if match is None:
            # Fail with a readable message rather than an AttributeError
            # raised from calling a method on None.
            raise ValueError(
                'Cannot find a "-d<digits>" id in URL: %s' % response.url
            )
        return self._EXPANDED_REVIEW_URL % {
            'review_id': review_id,
            'request_id': match.group(1),
        }

    def parse(self, response):
        """Yield requests for further listing pages and for each place page.

        Pagination links are fed back into ``parse``; place links are handled
        by ``parse_place``. Relative hrefs are resolved against the URL of
        the current response.
        """
        page_links = response.xpath(
            '//a[contains(@class, "pageNum")]/@href'
        ).extract()
        for page_link in page_links:
            yield Request(urljoin(response.url, page_link), self.parse)

        place_links = response.xpath('//h3[@class="title"]/a/@href').extract()
        for place_link in place_links:
            yield Request(urljoin(response.url, place_link), self.parse_place)

    def parse_place(self, response):
        """Print the reviews on a place page or request their expanded form.

        For every element whose class contains ``reviewSelector``: when a
        ``<p class="entry">`` descendant exists, its serialized markup is
        printed; otherwise a request for the expanded-review page is yielded
        with ``parse_review`` as callback.

        Raises:
            ValueError: Propagated from ``build_review_full_link`` when the
                place URL carries no ``-d<digits>`` id.
        """
        review_elems = response.xpath(
            '//div[contains(@class,"reviewSelector")]'
        )
        for place_elem in review_elems:
            element_ids = place_elem.xpath('./@id').extract()
            if not element_ids:
                # One malformed element must not abort the remaining reviews.
                self.logger.warning(
                    'Skipping review element without an id on %s',
                    response.url,
                )
                continue
            review_id = element_ids[0].replace('review_', '')

            entries = place_elem.xpath('.//p[@class="entry"]').extract()
            if entries:
                print(entries[0])
                continue

            yield Request(
                self.build_review_full_link(review_id, response),
                callback=self.parse_review,
            )

    def parse_review(self, response):
        """Print the first text node of the expanded review, if there is one."""
        texts = response.xpath('//div[@class="entry"]/p/text()').extract()
        if not texts:
            # The page did not have the expected structure; report it instead
            # of failing with an IndexError.
            self.logger.warning('No review text found on %s', response.url)
            return
        print(texts[0])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment