Last active
May 31, 2016 01:40
-
-
Save asluchevskiy/4e0c94675a6b95e95dc8297dd993e0f3 to your computer and use it in GitHub Desktop.
ostrovok.ru web scraper using python
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import logging | |
import spider | |
def main():
    """Run the two scraping stages one after the other with debug logging on.

    The catalogue spider runs first; the hotel-page spider is started only
    after the first one's ``run()`` has returned.
    """
    logging.basicConfig(level=logging.DEBUG)

    # Stage 1: walk the catalogue / listing pages.
    listing_spider = spider.OstrovokSpider(thread_number=64)
    listing_spider.run()

    # Stage 2: fetch the individual hotel pages.
    pages_spider = spider.OstrovokPagesSpider(thread_number=64)
    pages_spider.run()


if __name__ == '__main__':
    main()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import re | |
import pprint | |
import logging | |
import json | |
import string | |
import base64 | |
import pymongo | |
from datetime import datetime | |
from dateutil.parser import parse | |
from grab.spider import Spider, Task | |
from weblib.error import DataNotFound | |
# Module-level logger used by the spiders defined in this file.
logger = logging.getLogger('spider')
class OstrovokSpider(Spider):
    """Crawl ostrovok.ru catalogue (listing) pages.

    Starting from ``initial_urls`` the spider follows the sidebar links of
    every listing page. For each listing it stores one document in the
    ``list`` collection, and for each hotel found on a listing it stores a
    stub document (``meta.is_parsed = False``) in the ``hotel`` collection of
    the ``ostrovok`` MongoDB database.
    """

    initial_urls = [
        'https://ostrovok.ru/hotel/russia/',
        # Narrower entry points, handy for debugging runs:
        # 'https://ostrovok.ru/hotel/russia/st._petersburg/',
        # 'https://ostrovok.ru/hotel/russia/m/russian_far_east_amur_oblast_multi/',
        # 'https://ostrovok.ru/hotel/russia/blagoveshchensk/',
    ]

    # Sidebar links whose URL contains one of these parts are not followed.
    _SKIPPED_URL_PARTS = ('/a/', '/content/', '/events/', '/map/')  # , '/ryadom-s/'
    # Sidebar links whose URL ends with one of these suffixes are not followed.
    _SKIPPED_URL_SUFFIXES = ('/deals/', '/rating/')
    # Sidebar sections that are crawled but not recorded as child lists.
    _SECTION_POPULAR_DESTINATIONS = u'Популярные направления'
    _SECTION_POPULAR_CITIES_PREFIX = u'Популярные города'
    # Numeric hotel id embedded in a hotel page URL.
    _HOTEL_ID_RE = re.compile(r'/id(\d+)/')

    def prepare(self):
        """Enable the Grab cache, open MongoDB and reset the seen-URL set."""
        self.setup_cache(database='ostrovok')
        self.db = pymongo.MongoClient()['ostrovok']
        self.processed_urls = set()

    # def update_grab_instance(self, grab):
    #     grab.setup(log_dir='./logs')

    def task_initial(self, grab, task):
        """Process one page of a listing.

        Yields an ``initial`` task for every not-yet-seen sidebar link and,
        when the page shows a pagination block, one for the next page of the
        same listing. Responses with status 404 are ignored.

        Raises:
            DataNotFound: if a first page lacks the heading element, or a
                hotel card lacks its title link.
        """
        if grab.response.code == 404:
            return
        current_page = task.get('page', 1)
        base_url = task.get('base_url', task.url)
        is_first_page = current_page == 1

        # Listing-level metadata only exists on (and is only stored for) the
        # first page of a listing.
        list_meta = self._parse_list_meta(grab) if is_first_page else None

        child_lists = []
        for section_title, link_title, url in self._iter_sidebar_links(grab):
            # if section_title == u'Популярные направления':
            #     continue  # test: russia
            # if u'Популярные города' in section_title:
            #     continue
            if is_first_page and section_title != self._SECTION_POPULAR_DESTINATIONS and \
                    not section_title.startswith(self._SECTION_POPULAR_CITIES_PREFIX):
                child_lists.append(dict(title=link_title, url=url, section_title=section_title))
            if url not in self.processed_urls:
                # Mark the URL as seen before handing the task out, so the
                # set is already up to date while the generator is suspended.
                self.processed_urls.add(url)
                yield Task('initial', url, refferer_url=task.url)

        hotels_id = self._store_hotels(grab, task)

        if is_first_page and not self.db.list.find_one({'url': base_url}):
            # 'refferer_url' (sic) is the key already used in stored documents.
            self.db.list.insert_one(dict(url=base_url, refferer_url=task.get('refferer_url'),
                                         child_lists=child_lists, **list_meta))
        self.db.list.update_one({'url': base_url}, {'$addToSet': {'hotels_id': {'$each': hotels_id}}})

        # pagination
        # NOTE(review): the next page is requested whenever a pagination block
        # is present, even on the last page; termination appears to rely on
        # the 404 check above - confirm the site answers 404 past the end.
        if grab.doc.select('//ul[@class="pagination"]/li').exists():
            next_page = current_page + 1
            url = '%s?page=%d' % (base_url, next_page)
            yield Task('initial', url, page=next_page, base_url=base_url)

    def _parse_list_meta(self, grab):
        """Return title, breadcrumbs, article and booking flag of a listing page."""
        title = grab.doc.select('//div[@class="additional-heading"]/div[@class="additional-heading-inner"]/'
                                'h1[@class="additional-heading-title"]').text()
        breadcrumbs = [span.text() for span in
                       grab.doc.select('//div[@class="breadcrumbs"]//*[@itemprop="breadcrumb"]')]
        is_booking_page = grab.doc.select('//div[@class="datesform-wrapper-wrapper"]').exists()
        paragraphs = [p.text() for p in grab.doc.select('//div[@class="sidebar-article"]/p')]
        # An empty article is stored as None rather than as an empty string.
        article = '<br>\n'.join(paragraphs) or None
        return dict(title=title, breadcrumbs=breadcrumbs, article=article,
                    is_booking_page=is_booking_page)

    def _iter_sidebar_links(self, grab):
        """Yield ``(section_title, link_title, absolute_url)`` for followable sidebar links."""
        for div in grab.doc.select('//div[@class="b-catalog__sidebar"]/div[contains(@class, "links")]'):
            try:
                section_title = div.select('.//strong[@class="links-title"]').text()
            except DataNotFound:
                # Link groups without a title are skipped entirely.
                continue
            for el in div.select(
                    './/ul[@class="links-list"]/li[@class="links-list-item"]/*[contains(@class, "links-link")]'):
                link_title = el.text()
                try:
                    href = el.attr('href')
                except DataNotFound:
                    # No plain href: the target is base64-encoded in data-b64url.
                    href = self._decode_b64_url(el.attr('data-b64url'))
                url = grab.make_url_absolute(href)
                if self._is_skipped_url(url):
                    continue
                yield section_title, link_title, url

    @staticmethod
    def _decode_b64_url(encoded):
        """Decode a base64-encoded URL into a native text string."""
        decoded = base64.b64decode(encoded)
        if not isinstance(decoded, str):
            # Python 3 returns bytes; Python 2 already returned ``str``.
            decoded = decoded.decode('utf-8')
        return decoded

    def _is_skipped_url(self, url):
        """Tell whether ``url`` points to a page that must not be crawled."""
        return any(part in url for part in self._SKIPPED_URL_PARTS) or \
            url.endswith(self._SKIPPED_URL_SUFFIXES)

    def _store_hotels(self, grab, task):
        """Insert a stub for every hotel on the page; return the hotel ids found."""
        hotels_id = []
        for div in grab.doc.select('//div[@class="hotel-inner"]'):
            link = div.select('.//div[contains(@class, "hotel-title")]/a')
            hotel_title = link.text()
            url = grab.make_url_absolute(link.attr('href'))
            match = self._HOTEL_ID_RE.search(url)
            if match is None:
                # Without an id the hotel cannot be referenced; skip it
                # instead of failing the whole page.
                logger.warning('no hotel id in url %s (found on %s)', url, task.url)
                continue
            hotel = dict(title=hotel_title, url=url, hotel_id=int(match.group(1)),
                         meta=dict(is_parsed=False))
            try:
                self.db.hotel.insert_one(hotel)
            except pymongo.errors.DuplicateKeyError:
                # Hotel already stored. Presumably a unique index on the
                # collection is created outside this file - TODO confirm.
                pass
            hotels_id.append(hotel['hotel_id'])
        return hotels_id
class OstrovokPagesSpider(Spider):
    """Fetch details for every hotel stub that has not been parsed yet.

    For each document of the ``hotel`` collection with
    ``meta.is_parsed == False`` the spider loads the hotel page, then the
    hotel's JSON description, merges both into the document and writes it
    back with ``meta.is_parsed = True``.
    """

    # Numeric hotel id embedded in a hotel page URL.
    _HOTEL_ID_RE = re.compile(r'/id(\d+)/')
    # Identifier embedded in the hotel page source, needed for the JSON API.
    _OTAHOTEL_ID_RE = re.compile(r'"OTAHOTEL_ID": "(.+?)"')

    def prepare(self):
        """Open the ``ostrovok`` MongoDB database."""
        # self.setup_cache(database='ostrovok')
        self.db = pymongo.MongoClient()['ostrovok']

    def task_generator(self):
        """Yield one ``page`` task per unparsed hotel document."""
        cursor = self.db.hotel.find({'meta.is_parsed': False}, no_cursor_timeout=True)
        try:
            for item in cursor:
                yield Task('page', item['url'], item=item)
        finally:
            # A cursor opened with no_cursor_timeout is not reaped by the
            # server, so it has to be closed explicitly.
            cursor.close()
        # yield Task('page', 'https://ostrovok.ru/hotel/russia/moscow/id5556/cosmos/', item={})

    # def update_grab_instance(self, grab):
    #     grab.setup(log_dir='./logs')

    def task_page(self, grab, task):
        """Parse a hotel HTML page and schedule the matching JSON request.

        Adds ``near_places``, ``similar_hotels``, ``payment_notices`` and
        ``otahotel_id`` to ``task.item``. Pages answering with a non-200
        status, or lacking an OTAHOTEL_ID, are logged and skipped.
        """
        if grab.response.code != 200:
            logger.error('response code %d for url %s', grab.response.code, task.url)
            return

        near_places = [dict(name=a.text(), url=grab.make_url_absolute(a.attr('href'))) for a in
                       grab.doc.select(u'//div[./div[@class="hotelpage-links__header" and text()="Места рядом"]]'
                                       u'//li[@class="hotelpage-links__item"]/a')]

        similar_hotels = []
        for a in grab.doc.select('//a[contains(@class, "hotel-similar-title")]'):
            match = self._HOTEL_ID_RE.search(a.attr('href'))
            if match is not None:  # links without an id are ignored
                similar_hotels.append(int(match.group(1)))

        payment_notices = [div.text(smart=True) for div in
                           grab.doc.select('//div[@class="paymenttypenotice-message"]')]

        # NOTE(review): assumes grab.response.body is a text string (as on
        # Python 2); a bytes body would need decoding first - confirm.
        match = self._OTAHOTEL_ID_RE.search(grab.response.body)
        if match is None:
            logger.error('OTAHOTEL_ID not found for url %s', task.url)
            return
        otahotel_id = match.group(1)

        task.item.update(dict(near_places=near_places, similar_hotels=similar_hotels,
                              payment_notices=payment_notices, otahotel_id=otahotel_id))
        json_url = 'https://ostrovok.ru/hotel/api/hotcore/static/?otahotel_id=%s' % otahotel_id
        yield Task('json', json_url, item=task.item)

    def task_json(self, grab, task):
        """Merge the hotel's JSON description into its document and store it.

        The first element of the JSON response is used as the hotel record.
        Non-200 responses and empty payloads are logged and skipped, leaving
        the document flagged as unparsed.

        Raises:
            KeyError: if the record lacks one of the fields read with ``[]``.
        """
        if grab.response.code != 200:
            logger.error('response code %d for url %s', grab.response.code, task.url)
            return
        payload = json.loads(grab.response.body)
        if not payload:
            logger.error('empty hotel payload for url %s', task.url)
            return
        data = payload[0]
        item = task.item

        gps = (data['latitude'], data['longitude'])
        item['location'] = dict(address=data['address'], city=data['city_ru'], region=data['region_name_ru'],
                                country=data['country_ru'], country_code=data['country_code'], gps=gps)
        item['description'] = data['description']
        item['description_struct'] = data['description_struct']
        item['description_policy'] = data['policy_description']
        item['title_en'] = data['name_en']
        # The rating is divided by ten before storing; a falsy rating becomes None.
        item['stars'] = data['star_rating'] / 10.0 if data['star_rating'] else None
        item['contacts'] = dict(email=data.get('email'), phone=data.get('phone'))
        item['time_policy'] = dict(check_in_time=data['check_in_time'], check_out_time=data['check_out_time'])
        item['tags'] = [dict(title=gr['group_name'], tags=gr['amenities']) for gr in data['amenity_groups']]
        item['tags_main'] = [t['title'] for t in data['amenities_main_list']]
        item['payment_methods'] = data['payment_methods']
        item['price'] = data.get('low_rate')

        if data['rating']['exists']:
            item['rating'] = data['rating']
            for key in ('other_reviews_count', 'our_published_reviews_count', 'our_reviews_count', 'review_best'):
                item['rating'].pop(key, None)
        else:
            item['rating'] = None

        if 'tripadvisor_rating' in data:
            item['rating_tripadvisor'] = dict(total=data['tripadvisor_rating'],
                                              count=data['tripadvisor_reviewcnt'])
        else:
            item['rating_tripadvisor'] = None

        item['images'] = data['images']
        for image in item['images']:
            # Tolerate images without the key, like the other clean-ups here.
            image.pop('real_url', None)

        item['reviews'] = []
        for review in data['reviews']:
            # Store the creation time as a datetime instead of a string.
            review['created'] = parse(review['created'])
            for key in ('id', 'translated', 'is_best', 'is_published', 'is_showed', 'author_region', 'source'):
                review.pop(key, None)
            item['reviews'].append(review)

        item['metro'] = data['nearest_subways']
        item['meta']['is_parsed'] = True
        item['meta']['timestamp'] = datetime.now()
        # Equivalent of the deprecated ``save``: replace by _id, insert if absent.
        self.db.hotel.replace_one({'_id': item['_id']}, item, upsert=True)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment