-
-
Save wkomor/b46baf9d2bab967fd40c028b3c28aeb7 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
import sys | |
import requests | |
import re | |
import html5lib | |
import webbrowser | |
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer | |
from bs4 import BeautifulSoup, Comment | |
class Parser(object):
    """
    Class to proxy and parse site according to ivelum test quiz.

    Fetches a page from ``site``, appends a trademark sign to every
    six-character word found in the visible text, and rewrites links that
    point at ``site`` so they go through the local proxy on ``port``.
    """

    # Parent tag names whose text nodes are not visible page content.
    NO_CONTENT = ['head', 'title', 'script', 'style', '[document]']

    # Seconds to wait for the upstream site; without a timeout a stalled
    # connection would block the request handler forever.
    _TIMEOUT = 10

    # A whole word of exactly six word characters (Unicode-aware). Written
    # with escaped backslashes instead of a ``ur''`` literal so the source
    # is valid on both Python 2 and Python 3.
    _WORD_RE = re.compile(u'\\b(\\w{6})\\b', flags=re.U)

    # Replacement: the matched word followed by U+2122 (trade mark sign).
    _WORD_REPL = u'\\g<0>\u2122'

    def __init__(self, site='https://habrahabr.ru', port='8080'):
        """
        :param site: base url of the site to proxy
        :param port: local port the proxy listens on (kept as a string)

        Command-line arguments, when present, take precedence over the
        parameters: ``sys.argv[1]`` replaces ``site`` and ``sys.argv[2]``
        replaces ``port``.
        """
        self.site = site
        self.port = port
        try:
            # Assigned one at a time on purpose: when only the site is
            # given on the command line, the port keeps its value.
            self.site = sys.argv[1]
            self.port = sys.argv[2]
        except IndexError:
            pass

    def _open(self, url):
        """
        Open url
        :param url: url to fetch
        :return: response text if success or '' if fail (non-200 status,
                 network error or timeout)
        """
        try:
            response = requests.get(url, timeout=self._TIMEOUT)
        except requests.RequestException:
            return ''
        if response.status_code == 200:
            return response.text
        return ''

    @classmethod
    def _mark_words(cls, text):
        """
        Append the trademark sign to every six-character word in text
        :param text: unicode text to process
        :return: processed unicode text
        """
        return cls._WORD_RE.sub(cls._WORD_REPL, text)

    def parse_page(self, url):
        """
        Method to add the trademark sign after every 6-character word and
        to redirect links of the proxied site to the local proxy
        :param url: url to be open
        :return: html: html file to show user
        """
        page = self._open(url)
        soup = BeautifulSoup(page, "html5lib")

        # An explicit loop instead of map(): map() is lazy on Python 3, so
        # using it only for side effects would silently do nothing there.
        for node in soup.find_all(text=True):
            if isinstance(node, Comment):
                continue
            if node.parent.name in self.NO_CONTENT:
                continue
            marked = self._mark_words(node)
            # Leave untouched nodes in the tree as they are, so special
            # string node types are not downgraded to plain text.
            if marked != node:
                node.replace_with(marked)

        # Loop-invariant: address of the local proxy.
        proxy_root = "http://127.0.0.1:{}".format(self.port)
        for link in soup.find_all('a'):
            if link.get('href'):
                link['href'] = link['href'].replace(self.site, proxy_root)
        return str(soup)
class HttpProcessor(BaseHTTPRequestHandler):
    """
    Simple HTTP server request handler that serves the proxied pages
    """

    def do_GET(self):
        """
        Processing of GET requests

        Asks ``Parser`` for the processed version of the requested path on
        the proxied site and sends it to the client as UTF-8 HTML. Replies
        with 502 when the page could not be produced.
        :return: None
        """
        parser = Parser()
        # Build the page BEFORE any header is sent: once "200" is on the
        # wire a failure can no longer be reported to the client.
        try:
            body = parser.parse_page(parser.site + self.path)
        except Exception as error:
            # Request boundary: log the failure and answer with an error
            # status instead of an empty "successful" response.
            self.log_error("Failed to process %s: %s", self.path, error)
            self.send_error(502)
            return

        # wfile expects bytes; on Python 3 parse_page returns text.
        if not isinstance(body, bytes):
            body = body.encode('utf-8')

        self.send_response(200)
        # The charset is declared so the browser does not have to guess
        # the encoding (assumes a byte-string body is already UTF-8, which
        # is BeautifulSoup's default output encoding -- TODO confirm).
        self.send_header('content-type', 'text/html; charset=utf-8')
        self.end_headers()
        self.wfile.write(body)
def main():
    """
    Start the proxy server on 127.0.0.1 and open it in the web browser

    The port is read from the second command-line argument and defaults
    to 8080. The server runs until it is interrupted from the keyboard.
    :return: None
    """
    try:
        port = sys.argv[2]
    except IndexError:
        port = '8080'
    serv = HTTPServer(("127.0.0.1", int(port)), HttpProcessor)
    # The server socket is already bound and listening at this point, so
    # the browser can be opened before the serving loop starts.
    webbrowser.open('http://127.0.0.1:{}'.format(port))
    try:
        serv.serve_forever()
    except KeyboardInterrupt:
        print(" You have successfuly stoped the server!")
    finally:
        # Release the listening socket however the loop ended.
        serv.server_close()


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment