A simple class to inherit from when writing one-time scrapers
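The pattern in a nutshell: subclass Scraper, set base_url and page_number, override parse() to fill self.results, then call run() and one of the output helpers. A minimal sketch of that flow, with a placeholder URL and selector (not a real target site):

    from lib.ots import *

    class MyScraper(Scraper):
        base_url = 'https://example.com/page/'  # placeholder, not a real target
        page_number = 3

        def parse(self, response):
            content = BeautifulSoup(response.text, 'lxml')
            # Placeholder selector: collect every <h2> heading as a dict row
            for title in content.findAll('h2'):
                self.results.append({'title': title.text})

    scraper = MyScraper()
    scraper.run()
    scraper.print_results()

The Scraper class itself follows.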
#
# Libraries
#
from bs4 import BeautifulSoup
from tabulate import *
import requests
import time
import json
import csv

# Scraper class to inherit from
class Scraper:
    # URLs to crawl
    urls = []

    # Base URL
    base_url = ''

    # Number of pages to be scraped
    page_number = 0

    # Results list
    results = []

    # Run scraper
    def run(self):
        # Loop over the range of pages to crawl
        for index in range(1, self.page_number + 1):
            # Populate the URLs list with the pages to crawl
            self.urls.append(self.base_url + str(index))

        # Loop over the URLs
        for url in self.urls:
            # Make an HTTP GET request
            response = requests.get(url)
            print('GET: %s | Status code: %s' % (url, response.status_code))

            # Call the parse method once the response is obtained
            self.parse(response)

            # 2 second delay to avoid hammering web sites
            time.sleep(2)

    # User's parse method to extract data (override in a subclass)
    def parse(self, response):
        pass

    # Pretty print results to the console
    def print_results(self):
        # Make sure results are available
        if len(self.results):
            # Results in dictionary format
            if type(self.results[0]) == dict:
                print(tabulate([row.values() for row in self.results], self.results[0].keys(), tablefmt='fancy_grid'))

            # Results in list format
            if type(self.results[0]) == list:
                print(tabulate(self.results, tablefmt='fancy_grid'))

    # Export results as a CSV file
    def export_csv(self, filename):
        # Create the file stream
        with open(filename, 'w', newline='') as csv_file:
            # Make sure results are available
            if len(self.results):
                # Results in dictionary format
                if type(self.results[0]) == dict:
                    # Create a dictionary writer
                    writer = csv.DictWriter(csv_file, fieldnames=self.results[0].keys())

                    # Write the column names
                    writer.writeheader()

                    # Loop over the results
                    for row in self.results:
                        writer.writerow(row)

                # Results in list format
                elif type(self.results[0]) == list:
                    # Create a plain writer
                    writer = csv.writer(csv_file)

                    # Write the results
                    writer.writerows(self.results)

                # Return on an unsupported results type
                else:
                    print('ERROR! Unsupported results type!')
                    return

            # Report if no results are available
            else:
                print('Failed to export "%s" - no results to store!' % filename)

    # Export results in JSON format
    def export_json(self, filename):
        # Create the file stream
        with open(filename, 'w') as json_file:
            # Write the data in JSON format
            json_file.write(json.dumps(self.results, indent=2))
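One design note on the class above: urls and results are mutable class attributes, so in Python every instance (and every subclass that does not redefine them) appends to the same shared lists. That is harmless for the one-time, single-instance scripts this gist targets, but if you reuse the class, a hedged hardening (not part of the original gist) is to give each instance its own lists:

    class IsolatedScraper(Scraper):
        # Not in the original gist: instance-level lists avoid state
        # leaking between scrapers, since mutable class attributes are
        # shared by every instance in Python
        def __init__(self):
            self.urls = []
            self.results = []

The next file is a blank template that imports the class above as lib.ots and stubs out parse().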
# Import Scraper class and dependencies
from lib.ots import *

# Create TemplateScraper class inherited from the Scraper class
class TemplateScraper(Scraper):
    # Define URLs to scrape data from
    urls = []

    # Parse the response for each page (fill self.results here)
    def parse(self, response):
        # Parse content
        content = BeautifulSoup(response.text, 'lxml')

# Create TemplateScraper instance
scraper = TemplateScraper()

# Run TemplateScraper
scraper.run()

# Pretty print results to the console
scraper.print_results()

# Export extracted data to a CSV file
scraper.export_csv('./data/template.csv')

# Export extracted data to a JSON file
scraper.export_json('./data/template.json')
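To see how the template gets filled in, here is a hedged sketch of the quotes scraper the original comments hint at. It assumes the quotes.toscrape.com scraping sandbox: the 'quote', 'text', and 'author' CSS classes and the /page/N pagination are assumptions about that site's markup, so verify them against the live pages:

    from lib.ots import *

    # Hedged example, not part of the original gist: a filled-in template
    # for the quotes.toscrape.com sandbox. The CSS classes and pagination
    # scheme below are assumptions about that site's markup.
    class QuotesScraper(Scraper):
        base_url = 'http://quotes.toscrape.com/page/'
        page_number = 2

        def parse(self, response):
            content = BeautifulSoup(response.text, 'lxml')
            # One dict per quote, so print_results/export_csv take the
            # dictionary branch and emit named columns
            for quote in content.findAll('div', class_='quote'):
                self.results.append({
                    'text': quote.find('span', class_='text').text,
                    'author': quote.find('small', class_='author').text
                })

    scraper = QuotesScraper()
    scraper.run()
    scraper.print_results()

The final file below is a working example that scrapes a table of Chrome user agent strings.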
from ots import *

class AgentScraper(Scraper):
    base_url = 'https://developers.whatismybrowser.com/useragents/explore/software_name/chrome/'
    page_number = 2

    def parse(self, response):
        content = BeautifulSoup(response.text, 'lxml')
        table = content.find('table')
        rows = table.findAll('tr')

        # Grab the column names from the header row of the first page only
        if response.url.split('/')[-1] == '1':
            self.columns = [header.text.strip('\n') for header in rows[0].findAll('th')]

        # Store every data row as a list of cell values, skipping header rows
        for row in rows:
            if len(row.findAll('td')):
                self.results.append([data.text for data in row.findAll('td')])

        print('rows', len(rows))

scraper = AgentScraper()
scraper.run()

# Prepend the column names so they print as the first table row
scraper.results.insert(0, scraper.columns)
scraper.print_results()
scraper.export_csv('./data/user_agents.csv')
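Because AgentScraper stores list rows, export_csv takes the plain writer branch and the prepended column row is written as just another data row. A hedged variant (not in the original gist) rebuilds the rows as dicts keyed by the scraped column names, so export_csv takes the DictWriter branch and writes a real CSV header; the output filename is a made-up example:

    # Assumes the script above has already run, i.e. scraper.results[0]
    # holds the column names inserted before printing
    header, *rows = scraper.results
    scraper.results = [dict(zip(header, row)) for row in rows]
    scraper.export_csv('./data/user_agents_with_header.csv')  # hypothetical filename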