Skip to content

Instantly share code, notes, and snippets.

@wkomor
Forked from anonymous/habraproxy.py
Last active July 6, 2016 02:41
Show Gist options
  • Save wkomor/b46baf9d2bab967fd40c028b3c28aeb7 to your computer and use it in GitHub Desktop.
Save wkomor/b46baf9d2bab967fd40c028b3c28aeb7 to your computer and use it in GitHub Desktop.
# -*- coding: utf-8 -*-
import sys
import requests
import re
import html5lib
import webbrowser
from BaseHTTPServer import BaseHTTPRequestHandler, HTTPServer
from bs4 import BeautifulSoup, Comment
class Parser(object):
    """
    Fetch pages of a target site and rewrite them for the local proxy
    (ivelum test quiz).

    Every six-character word in the page text gets a trademark sign
    appended, and links that contain the target site address are pointed
    at the local proxy instead.
    """

    # Names of parent nodes whose text nodes are left untouched.
    NO_CONTENT = ['head', 'title', 'script', 'style', '[document]']
    # Seconds to wait for the target site; without a timeout a stalled
    # upstream connection would block the request forever.
    TIMEOUT = 10
    # Whole words made of exactly six word characters (Unicode aware).
    # Written with escaped backslashes instead of a ur'' literal so the
    # module parses on both Python 2 and Python 3.
    WORD_RE = re.compile(u'\\b(\\w{6})\\b', re.UNICODE)
    # The matched word followed by U+2122 (trade mark sign).
    WORD_REPLACEMENT = u'\\g<0>\u2122'

    def __init__(self, site='https://habrahabr.ru', port='8080'):
        """
        Store the target site and the local proxy port.

        Command-line arguments, when present, take precedence over the
        parameters: sys.argv[1] replaces site and sys.argv[2] replaces port.

        :param site: base url of the site to proxy
        :param port: port of the local proxy, used when rewriting links
        """
        self.site = site
        self.port = port
        cli_args = sys.argv[1:3]
        if len(cli_args) >= 1:
            self.site = cli_args[0]
        if len(cli_args) >= 2:
            self.port = cli_args[1]

    def _open(self, url):
        """
        Open url

        :param url: url to fetch
        :return: response text if success or '' if fail (non-200 status,
                 timeout or any other requests error)
        """
        try:
            response = requests.get(url, timeout=self.TIMEOUT)
        except requests.RequestException:
            return ''
        if response.status_code == 200:
            return response.text
        return ''

    @classmethod
    def _mark_words(cls, text):
        """
        Append a trademark sign to every six-character word in text.

        :param text: unicode text to process
        :return: text with the sign added after each matching word
        """
        return cls.WORD_RE.sub(cls.WORD_REPLACEMENT, text)

    def parse_page(self, url):
        """
        Fetch url, add a trademark sign after every 6-character word and
        redirect links to the target site to the local proxy.

        :param url: url to be open
        :return: html: html document to show user, as produced by str(soup)
        """
        page = self._open(url)
        soup = BeautifulSoup(page, "html5lib")
        # An explicit loop instead of map(): the replacement is a side
        # effect, and map() is lazy on Python 3 so it would never run.
        for node in soup.find_all(text=True):
            if isinstance(node, Comment):
                continue
            if node.parent.name in self.NO_CONTENT:
                continue
            marked = self._mark_words(node)
            if marked != node:
                node.replace_with(marked)
        local_address = "http://127.0.0.1:{}".format(self.port)
        for link in soup.find_all('a'):
            href = link.get('href')
            if href:
                link['href'] = href.replace(self.site, local_address)
        return str(soup)
class HttpProcessor(BaseHTTPRequestHandler):
    """
    Simple HTTP request handler that serves rewritten pages of the
    proxied site.
    """

    def do_GET(self):
        """
        Processing of GET requests: fetch the requested path from the
        target site through Parser and send the rewritten html back.

        Responds with 502 when fetching the upstream page raises a
        requests error.

        :return: None
        """
        parser = Parser()
        # Build the page before sending any header, so that a failure can
        # still be reported with a proper status code.
        try:
            body = parser.parse_page(parser.site + self.path)
        except requests.RequestException:
            self.send_error(502, 'Upstream request failed')
            return
        # wfile expects bytes; on Python 2 str(soup) already is a byte
        # string, so this only encodes when a text string comes back.
        if not isinstance(body, bytes):
            body = body.encode('utf-8')
        self.send_response(200)
        # NOTE(review): charset assumes the serialized page is UTF-8, which
        # is BeautifulSoup's default output encoding - confirm.
        self.send_header('Content-Type', 'text/html; charset=utf-8')
        self.send_header('Content-Length', str(len(body)))
        self.end_headers()
        self.wfile.write(body)
def main():
    """
    Start the proxy server on 127.0.0.1 and open it in the web browser.

    The port is read from sys.argv[2] and defaults to 8080. The server
    runs until it is interrupted from the keyboard (Ctrl+C); the listening
    socket is released on the way out.

    :return: None
    """
    port = sys.argv[2] if len(sys.argv) > 2 else '8080'
    serv = HTTPServer(("127.0.0.1", int(port)), HttpProcessor)
    webbrowser.open('http://127.0.0.1:{}'.format(port))
    try:
        serv.serve_forever()
    except KeyboardInterrupt:
        print(" You have successfuly stoped the server!")
    finally:
        # Release the listening socket however serve_forever() ended.
        serv.server_close()


# Start the proxy only when the file is executed as a script.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment