Skip to content

Instantly share code, notes, and snippets.

@asluchevskiy
Last active May 31, 2016 01:40
Show Gist options
  • Save asluchevskiy/4e0c94675a6b95e95dc8297dd993e0f3 to your computer and use it in GitHub Desktop.
Save asluchevskiy/4e0c94675a6b95e95dc8297dd993e0f3 to your computer and use it in GitHub Desktop.
A web scraper for ostrovok.ru, written in Python
# -*- coding: utf-8 -*-
import logging
import spider
def main():
    """Run both crawl stages in order.

    ``OstrovokSpider`` records hotel stubs with ``meta.is_parsed`` set to
    ``False``; ``OstrovokPagesSpider`` then reads exactly those stubs, so it
    has to run second.
    """
    logging.basicConfig(level=logging.DEBUG)
    spider.OstrovokSpider(thread_number=64).run()
    spider.OstrovokPagesSpider(thread_number=64).run()


if __name__ == '__main__':
    main()
# -*- coding: utf-8 -*-
import re
import pprint
import logging
import json
import string
import base64
import pymongo
from datetime import datetime
from dateutil.parser import parse
from grab.spider import Spider, Task
from weblib.error import DataNotFound
logger = logging.getLogger('spider')
class OstrovokSpider(Spider):
    """Crawl ostrovok.ru catalogue listings and record listings and hotel stubs.

    Every catalogue page is handled by ``task_initial``.  The first page of a
    listing is stored in the ``list`` collection; every hotel card found on
    any page is stored in the ``hotel`` collection with ``meta.is_parsed``
    set to ``False``.
    """

    initial_urls = [
        'https://ostrovok.ru/hotel/russia/',
        # Narrower seeds kept for test runs:
        # 'https://ostrovok.ru/hotel/russia/st._petersburg/',
        # 'https://ostrovok.ru/hotel/russia/m/russian_far_east_amur_oblast_multi/',
        # 'https://ostrovok.ru/hotel/russia/blagoveshchensk/',
    ]

    # Hotel URLs carry the numeric id as a path segment, e.g. ".../id5556/...".
    _HOTEL_ID_RE = re.compile(r'/id(\d+)/')

    # Sidebar links that are never followed.  ('/ryadom-s/' was considered
    # for exclusion in the original code but left enabled.)
    _SKIPPED_URL_PARTS = ('/a/', '/content/', '/events/', '/map/')
    _SKIPPED_URL_SUFFIXES = ('/deals/', '/rating/')

    def prepare(self):
        """Set up the spider cache, the MongoDB handle and the seen-URL set."""
        self.setup_cache(database='ostrovok')
        self.db = pymongo.MongoClient()['ostrovok']
        # Child listing URLs that have already been scheduled in this run.
        self.processed_urls = set()

    # Optional debugging hook, disabled by default:
    # def update_grab_instance(self, grab):
    #     grab.setup(log_dir='./logs')

    @staticmethod
    def _listing_meta(grab):
        """Return title, breadcrumbs, article and booking flag of a listing.

        Raises DataNotFound when the listing heading is missing.
        """
        title = grab.doc.select(
            '//div[@class="additional-heading"]/div[@class="additional-heading-inner"]/'
            'h1[@class="additional-heading-title"]').text()
        breadcrumbs = [span.text() for span in
                       grab.doc.select('//div[@class="breadcrumbs"]//*[@itemprop="breadcrumb"]')]
        is_booking_page = grab.doc.select('//div[@class="datesform-wrapper-wrapper"]').exists()
        # str.join works on Python 2 and 3; string.join exists on Python 2 only.
        article = '<br>\n'.join(
            p.text() for p in grab.doc.select('//div[@class="sidebar-article"]/p'))
        return dict(title=title, breadcrumbs=breadcrumbs,
                    article=article or None, is_booking_page=is_booking_page)

    @staticmethod
    def _link_url(grab, el):
        """Return the absolute URL of a sidebar link, or None if it has none.

        The URL is read from ``href``; when that attribute is absent it is
        read from the base64-encoded ``data-b64url`` attribute instead.
        """
        try:
            return grab.make_url_absolute(el.attr('href'))
        except DataNotFound:
            pass
        try:
            encoded = el.attr('data-b64url')
        except DataNotFound:
            return None
        # b64decode replaces base64.decodestring, which was removed in
        # Python 3.9; decode so the URL is text on both Python 2 and 3.
        return grab.make_url_absolute(base64.b64decode(encoded).decode('utf-8'))

    @classmethod
    def _is_skipped(cls, url):
        """Tell whether a sidebar URL points to a section that is not crawled."""
        return (any(part in url for part in cls._SKIPPED_URL_PARTS)
                or url.endswith(cls._SKIPPED_URL_SUFFIXES))

    def _store_hotels(self, grab):
        """Insert a stub for every hotel card on the page; return their ids."""
        hotel_ids = []
        for card in grab.doc.select('//div[@class="hotel-inner"]'):
            link = card.select('.//div[contains(@class, "hotel-title")]/a')
            hotel_title = link.text()
            url = grab.make_url_absolute(link.attr('href'))
            match = self._HOTEL_ID_RE.search(url)
            if match is None:
                # Skip one malformed card instead of aborting the whole page.
                logger.warning('no hotel id in url %s', url)
                continue
            hotel_id = int(match.group(1))
            hotel = dict(title=hotel_title, url=url, hotel_id=hotel_id,
                         meta=dict(is_parsed=False))
            try:
                self.db.hotel.insert_one(hotel)
            except pymongo.errors.DuplicateKeyError:
                # Hotel already recorded.  NOTE(review): this presumably relies
                # on a unique index created outside this file - confirm.
                pass
            hotel_ids.append(hotel_id)
        return hotel_ids

    def task_initial(self, grab, task):
        """Process one catalogue page.

        Yields an ``initial`` task for every not-yet-seen child listing linked
        from the sidebar, and one for the next page when the page has a
        pagination block.  Pages answered with HTTP 404 are ignored.
        """
        if grab.response.code == 404:
            return
        current_page = task.get('page', 1)
        base_url = task.get('base_url', task.url)
        is_first_page = current_page == 1

        # Listing metadata is only read (and only stored) for the first page.
        meta = self._listing_meta(grab) if is_first_page else None

        child_lists = []
        for div in grab.doc.select('//div[@class="b-catalog__sidebar"]/div[contains(@class, "links")]'):
            try:
                section_title = div.select('.//strong[@class="links-title"]').text()
            except DataNotFound:
                continue
            # The two "popular" sections are crawled but not recorded as
            # children of this listing.
            is_child_section = (section_title != u'Популярные направления'
                                and not section_title.startswith(u'Популярные города'))
            for el in div.select(
                    './/ul[@class="links-list"]/li[@class="links-list-item"]/*[contains(@class, "links-link")]'):
                link_title = el.text()
                url = self._link_url(grab, el)
                if url is None or self._is_skipped(url):
                    continue
                if is_first_page and is_child_section:
                    child_lists.append(dict(title=link_title, url=url, section_title=section_title))
                if url not in self.processed_urls:
                    yield Task('initial', url, refferer_url=task.url)
                    self.processed_urls.add(url)

        hotels_id = self._store_hotels(grab)

        if is_first_page and not self.db.list.find_one({'url': base_url}):
            record = dict(url=base_url, refferer_url=task.get('refferer_url'),
                          child_lists=child_lists)
            record.update(meta)
            self.db.list.insert_one(record)
        self.db.list.update_one({'url': base_url},
                                {'$addToSet': {'hotels_id': {'$each': hotels_id}}})

        # pagination: the next page is requested whenever a pagination block
        # is present on the current one.
        if grab.doc.select('//ul[@class="pagination"]/li').exists():
            next_page = current_page + 1
            url = '%s?page=%d' % (base_url, next_page)
            yield Task('initial', url, page=next_page, base_url=base_url)
class OstrovokPagesSpider(Spider):
    """Fill in the details of every hotel stub that has not been parsed yet.

    For each document in the ``hotel`` collection with ``meta.is_parsed``
    equal to ``False`` the hotel page is fetched (``task_page``), then the
    hotel's static JSON description (``task_json``), and the completed
    document is written back to the collection.
    """

    # Hotel URLs carry the numeric id as a path segment, e.g. ".../id5556/...".
    _HOTEL_ID_RE = re.compile(r'/id(\d+)/')
    # The hotel page embeds the id needed for the JSON API request.
    _OTAHOTEL_ID_RE = re.compile(r'"OTAHOTEL_ID": "(.+?)"')

    def prepare(self):
        """Open the MongoDB database used for reading and writing hotels."""
        # self.setup_cache(database='ostrovok')
        self.db = pymongo.MongoClient()['ostrovok']

    def task_generator(self):
        """Yield one ``page`` task per hotel document that is not parsed yet."""
        cursor = self.db.hotel.find({'meta.is_parsed': False}, no_cursor_timeout=True)
        try:
            for item in cursor:
                yield Task('page', item['url'], item=item)
        finally:
            # A cursor opened with no_cursor_timeout is not reaped by the
            # server, so it has to be closed explicitly.
            cursor.close()
        # Single-page test task:
        # yield Task('page', 'https://ostrovok.ru/hotel/russia/moscow/id5556/cosmos/', item={})

    # Optional debugging hook, disabled by default:
    # def update_grab_instance(self, grab):
    #     grab.setup(log_dir='./logs')

    def task_page(self, grab, task):
        """Parse a hotel HTML page and schedule the matching ``json`` task.

        Adds ``near_places``, ``similar_hotels``, ``payment_notices`` and
        ``otahotel_id`` to ``task.item``.  Nothing is yielded when the
        response status is not 200 or the page carries no OTAHOTEL_ID.
        """
        if grab.response.code != 200:
            logger.error('response code %d for url %s', grab.response.code, task.url)
            return
        # The value is not used, but .text() raises DataNotFound when the
        # heading is missing, which rejects pages that are not hotel pages.
        grab.doc.select('//h1[@itemprop="name"]').text()

        near_places = [dict(name=a.text(), url=grab.make_url_absolute(a.attr('href'))) for a in
                       grab.doc.select(u'//div[./div[@class="hotelpage-links__header" and text()="Места рядом"]]'
                                       u'//li[@class="hotelpage-links__item"]/a')]

        similar_hotels = []
        for a in grab.doc.select('//a[contains(@class, "hotel-similar-title")]'):
            match = self._HOTEL_ID_RE.search(a.attr('href'))
            if match is not None:  # ignore links that carry no hotel id
                similar_hotels.append(int(match.group(1)))

        payment_notices = [div.text(smart=True) for div in
                           grab.doc.select('//div[@class="paymenttypenotice-message"]')]

        match = self._OTAHOTEL_ID_RE.search(grab.response.body)
        if match is None:
            logger.error('OTAHOTEL_ID not found for url %s', task.url)
            return
        otahotel_id = match.group(1)

        task.item.update(dict(near_places=near_places, similar_hotels=similar_hotels,
                              payment_notices=payment_notices, otahotel_id=otahotel_id))
        json_url = 'https://ostrovok.ru/hotel/api/hotcore/static/?otahotel_id=%s' % otahotel_id
        yield Task('json', json_url, item=task.item)

    def task_json(self, grab, task):
        """Merge the hotel's JSON description into the item and store it.

        The response is expected to be a JSON list whose first element
        describes the hotel.  On success the item is marked as parsed,
        timestamped and written to the ``hotel`` collection.
        """
        item = task.item
        if grab.response.code != 200:
            logger.error('response code %d for url %s', grab.response.code, task.url)
            return
        payload = json.loads(grab.response.body)
        if not payload:
            logger.error('empty hotel description for url %s', task.url)
            return
        data = payload[0]

        gps = (data['latitude'], data['longitude'])
        item['location'] = dict(address=data['address'], city=data['city_ru'], region=data['region_name_ru'],
                                country=data['country_ru'], country_code=data['country_code'], gps=gps)
        item['description'] = data['description']
        item['description_struct'] = data['description_struct']
        item['description_policy'] = data['policy_description']
        item['title_en'] = data['name_en']
        # NOTE(review): star_rating looks like tenths of a star - confirm.
        item['stars'] = data['star_rating'] / 10.0 if data['star_rating'] else None
        item['contacts'] = dict(email=data.get('email'), phone=data.get('phone'))
        item['time_policy'] = dict(check_in_time=data['check_in_time'], check_out_time=data['check_out_time'])
        item['tags'] = [dict(title=gr['group_name'], tags=gr['amenities']) for gr in data['amenity_groups']]
        item['tags_main'] = [t['title'] for t in data['amenities_main_list']]
        item['payment_methods'] = data['payment_methods']
        item['price'] = data.get('low_rate')

        if data['rating']['exists']:
            item['rating'] = data['rating']
            for key in ('other_reviews_count', 'our_published_reviews_count', 'our_reviews_count', 'review_best'):
                item['rating'].pop(key, None)
        else:
            item['rating'] = None
        item['rating_tripadvisor'] = dict(total=data['tripadvisor_rating'],
                                          count=data['tripadvisor_reviewcnt']) if 'tripadvisor_rating' in data else None

        item['images'] = data['images']
        for image in item['images']:
            # Tolerate a missing key, like every other pop() in this method.
            image.pop('real_url', None)

        item['reviews'] = []
        for review in data['reviews']:
            review['created'] = parse(review['created'])
            for key in ('id', 'translated', 'is_best', 'is_published', 'is_showed', 'author_region', 'source'):
                review.pop(key, None)
            item['reviews'].append(review)

        item['metro'] = data['nearest_subways']
        item['meta']['is_parsed'] = True
        item['meta']['timestamp'] = datetime.now()

        # Equivalent of the removed Collection.save(): replace the document
        # when it already has an _id, insert it otherwise.
        if '_id' in item:
            self.db.hotel.replace_one({'_id': item['_id']}, item, upsert=True)
        else:
            self.db.hotel.insert_one(item)
# todo: data saving
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment