Skip to content

Instantly share code, notes, and snippets.

@proegssilb
Last active May 13, 2017 03:15
Show Gist options
  • Save proegssilb/79bfa0e66922c43dd397bba34fa321cb to your computer and use it in GitHub Desktop.
Web scraping hack involving proxy checking
#!/usr/bin/env python2
from __future__ import print_function
import requests
from bs4 import BeautifulSoup
def checkHeaders(tr):
    """Verify that the proxy table's header row has the expected columns.

    tr -- a bs4 Tag for the <thead> <tr> row (as returned by getHeader()).
    Returns True when columns 0, 1, 4 and 6 carry the expected labels,
    False otherwise.

    Fixed: the original used `assert`, which raises AssertionError on a
    mismatch instead of returning False — so main()'s "Headers do not
    match" branch was unreachable — and which is silently stripped under
    `python -O`. It also shadowed the `tr` parameter with the cell list.
    """
    cells = tr.find_all('th')
    if len(cells) < 7:
        return False
    return (cells[0].text == u'IP Address'
            and cells[1].text == u'Port'
            and cells[4].text == u'Anonymity'
            and cells[6].text == u'Https')
def getProxyInfo(tr):
    """Build an ``http://host:port/`` proxy URL from one table row.

    tr -- a bs4 Tag whose first two children hold the IP address and
    port text. (Currently unused by main(), which formats host:port
    pairs itself in getRows().)
    """
    host = tr.contents[0].text
    port = tr.contents[1].text
    return u''.join([u'http://', host, u':', port, u'/'])
censoredUrl = ''
def checkProxy(proxyString):
    """Return True when `censoredUrl` is reachable *through* the proxy.

    proxyString -- a u'host:port' string as yielded by getRows().

    Fixed: the original built `proxyCfg` but never passed it to
    requests.get(), so every check went out over the direct connection
    and the proxy was never exercised. Also, a dead or slow proxy made
    requests raise (hanging or crashing the scan) instead of the proxy
    simply being reported as bad — treat any request failure as False,
    and bound the wait with a timeout.
    """
    proxyCfg = {'http': proxyString, 'https': proxyString}
    try:
        res = requests.get(censoredUrl, proxies=proxyCfg, timeout=10)
    except requests.RequestException:
        return False
    return res.status_code == 200
def getHeader(soup):
    """Return the first <tr> inside the table's <thead> section."""
    header_rows = soup.select('thead > tr')
    return header_rows[0]
def getRows(soup):
    """Yield a u'host:port' string for each usable row of the proxy table.

    A row is usable when its Anonymity column (index 4) reads either
    u'elite proxy' or u'anonymous'; all other rows are skipped.
    """
    wanted = (u'elite proxy', u'anonymous')
    for tr in soup.select('tbody > tr'):
        cells = tr.contents
        if cells[4].text not in wanted:
            continue
        yield cells[0].text + u':' + cells[1].text
def main(url):
    """Fetch the proxy-list page at `url` and print each proxy that works.

    Aborts with an error message if the table header no longer matches
    the layout this scraper expects.
    """
    page = requests.get(url)
    soup = BeautifulSoup(page.content)
    if not checkHeaders(getHeader(soup)):
        print('ERROR: Headers do not match expected.')
        return
    for candidate in getRows(soup):
        if checkProxy(candidate):
            print(candidate)
# Script entry point: scrape the us-proxy.org listing and print live proxies.
if __name__ == '__main__':
    main('http://us-proxy.org/')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment