Skip to content

Instantly share code, notes, and snippets.

@fallenflint
Forked from anonymous/habraproxy.py
Last active June 1, 2016 10:40
Show Gist options
  • Save fallenflint/118886b3e61903a20d99d528d70f0b58 to your computer and use it in GitHub Desktop.
Save fallenflint/118886b3e61903a20d99d528d70f0b58 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import SocketServer
import SimpleHTTPServer
import urllib
import socket
import argparse
import webbrowser
import re
class Processor(object):
    """Rewrite proxied HTML while it is streamed to the client.

    Two transformations are applied:

    * ``href`` attributes inside ``<a>`` tags that point at the proxied
      site are redirected to the local proxy address;
    * every alphanumeric word of exactly ``WORD_LENGTH`` characters in the
      text of ``<body>`` (outside of tags and ``<script>`` elements) gets a
      trade mark sign appended.
    """

    TRADEMARK = u'\u2122'
    WORD_LENGTH = 6

    def __init__(self, host, port, site):
        """Remember the proxy address and compile the link-rewriting regex.

        :param host: host name the local proxy listens on
        :param port: port the local proxy listens on
        :param site: host name of the proxied site, e.g. ``habrahabr.ru``
        """
        # ``site`` is escaped so that dots only match literal dots, and the
        # scheme is written as ``https?:`` (the former ``http|https?:``
        # alternation also accepted a bare ``http`` without a colon).
        self.replace_re = re.compile(
            r'(href=[\'"])(?P<proto>https?:)?(/*)%s(/?.*?["\'])'
            % re.escape(site),
            re.U | re.I | re.S | re.M)
        self.host = host
        self.port = port
        self.site = site
        # The default HTTP port is left out of rewritten links.
        self.hostport = '{}:{}'.format(host, port) if port != 80 else host

    def replacer(self, matchobj):
        """Build the replacement for one ``href`` matched by ``replace_re``.

        The site name is swapped for the local proxy address.  Only when
        the link carried an explicit ``https:`` scheme is it downgraded to
        ``http:``; the rest of the URL (for example a query string that
        itself contains ``https://``) is left untouched.
        """
        prefix, proto, slashes, rest = matchobj.groups()
        if proto and proto.lower() == 'https:':
            proto = 'http:'
        head = ''.join(filter(None, (prefix, proto, slashes)))
        return head + self.hostport + rest

    @staticmethod
    def _tag_name(tag_chars):
        """Return the lower-cased element name of a complete ``<...>`` tag."""
        inner = ''.join(tag_chars[1:-1])
        words = inner.replace('/', ' ').lower().split()
        return words[0] if words else ''

    def process(self, stream):
        """Yield the transformed document piece by piece.

        :param stream: iterable of single characters (a unicode string)
        :returns: generator of text fragments; joined together they form
            the rewritten document
        """
        in_body = in_script = in_tag = tag_finished = False
        closing = False
        tag_chars = []  # characters since the most recent '<'
        pending = []    # markup held back until its closing '>' is seen
        run = 0         # length of the current alphanumeric run in body text
        for letter in stream:
            if tag_finished:  # the previous character closed a tag
                tag_finished = in_tag = False
            if run == self.WORD_LENGTH and not letter.isalnum():
                yield self.TRADEMARK
            if letter == '<':  # a tag starts here
                in_tag = True
                closing = False
                tag_chars = []
                run = 0
            if in_tag:
                pending.append(letter)
                tag_chars.append(letter)
                if tag_chars == ['<', '/']:
                    closing = True
                if letter == '>':  # the tag is complete
                    tag_finished = True
                    # Only the element name decides the state change, so
                    # words inside attribute values cannot toggle it.
                    name = self._tag_name(tag_chars)
                    if name == 'body':
                        in_body = not closing
                    elif name == 'script':
                        in_script = not closing
                    markup = ''.join(pending)
                    if name == 'a':
                        markup = self.replace_re.sub(self.replacer, markup)
                    # The final '>' is emitted by the generic yield below.
                    yield markup[:-1]
                    pending = []
            if in_body and not (in_tag or in_script) and letter.isalnum():
                run += 1
            else:
                run = 0
            if not pending:
                yield letter
        # End of input: mark a trailing word and flush an unterminated tag
        # instead of silently dropping it.
        if run == self.WORD_LENGTH:
            yield self.TRADEMARK
        if pending:
            yield ''.join(pending)
def _charset_from_content_type(content_type):
    """Return the ``charset`` parameter of a Content-Type value, or None."""
    for param in content_type.split(';')[1:]:
        key, _, value = param.partition('=')
        if key.strip().lower() == 'charset':
            # Drop optional quoting around the value.
            return value.strip().strip('"\'') or None
    return None


def get_handler(site, processor):
    """Create a request handler class that proxies GET requests to ``site``.

    :param site: proxied site, with or without an ``http(s)://`` prefix
    :param processor: object whose ``process(text)`` yields rewritten HTML
    :returns: a ``SimpleHTTPRequestHandler`` subclass
    """
    # A substring test ('http' in site) would treat hosts such as
    # ``httpbin.org`` as already carrying a scheme, so test the prefix.
    if site.lower().startswith(('http://', 'https://')):
        site_url = site
    else:
        site_url = 'http://' + site

    class Proxy(SimpleHTTPServer.SimpleHTTPRequestHandler):
        """Relay GET requests upstream and post-process HTML responses."""

        def do_GET(self):
            upstream = urllib.urlopen(site_url + self.path)
            try:
                payload = upstream.read()
                status = upstream.getcode() or 200
                # .get() avoids a KeyError when upstream sends no type.
                content_type = upstream.headers.get('content-type', '')
            finally:
                upstream.close()

            charset = _charset_from_content_type(content_type)
            if 'text/html' in content_type.lower() and charset:
                try:
                    text = unicode(payload, charset)
                    payload = u''.join(
                        processor.process(text)).encode(charset)
                except (LookupError, UnicodeError):
                    # Unknown or wrong charset: deliberately fall back to
                    # relaying the upstream bytes unmodified.
                    pass

            # Send a well-formed response; writing the bare body produces
            # a header-less reply that clients may refuse to render.
            self.send_response(status)
            if content_type:
                self.send_header('Content-Type', content_type)
            self.send_header('Content-Length', str(len(payload)))
            self.end_headers()
            self.wfile.write(payload)

    return Proxy
class ReusingServer(SocketServer.ForkingTCPServer):
    """TCP Server that doesn't fail if port is still in TIME_WAIT state"""

    # The base class's server_bind() sets SO_REUSEADDR when this flag is
    # true and also refreshes ``server_address`` after binding, which the
    # former hand-written server_bind() override skipped.
    allow_reuse_address = True
def main(host, port, site):
    """Run the proxy until interrupted.

    :param host: host name to listen on
    :param port: port to listen on
    :param site: site whose pages are proxied and rewritten
    """
    processor = Processor(host, port, site)
    httpd = ReusingServer((host, port), get_handler(site, processor))
    # Single-argument call form prints the same text as the print statement.
    print("serving at {0}:{1} -> {2}".format(host, port, site))
    try:
        # although it's quite dirty, it should work in most cases: the
        # socket is already listening, so the browser's first request just
        # waits until serve_forever() starts accepting.
        webbrowser.open("http://{}:{}".format(host, port))
        httpd.serve_forever(poll_interval=0.1)
    except KeyboardInterrupt:
        # Ctrl-C is the normal way to stop the proxy.
        pass
    finally:
        # Release the listening socket on every exit path, not only Ctrl-C.
        httpd.server_close()
if __name__ == '__main__':
    # Command-line entry point: each row is (option flags, argparse settings),
    # registered in the order it should appear in --help.
    cli_options = (
        (('-p', '--port'),
         dict(help="Local port used to listen to, defaults to 8000",
              default=8000, type=int)),
        (('--host',),
         dict(help="Hostname used to listen to, defaults to localhost",
              default='localhost')),
        (('-s', '--site'),
         dict(help="Site to use, defaults to habrahabr.ru",
              default='habrahabr.ru')),
    )
    cli = argparse.ArgumentParser()
    for flags, settings in cli_options:
        cli.add_argument(*flags, **settings)
    parsed = cli.parse_args()
    main(parsed.host, parsed.port, parsed.site)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment