Skip to content

Instantly share code, notes, and snippets.

@blooser
Created August 5, 2018 20:10
Show Gist options
  • Save blooser/12b2970ac7f189fe7b704f853ee576cf to your computer and use it in GitHub Desktop.
Web Crawler
from bs4 import BeautifulSoup
import re
import urllib3
# Silence urllib3's InsecureRequestWarning (emitted for HTTPS requests made
# without certificate verification).
# NOTE(review): this hides a genuine security signal; confirm that skipping
# certificate verification is intentional for this crawler.
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
class Spider:
    """Minimal crawler base: remembers a start URL and fetches pages via HTTP GET."""

    def __init__(self, url):
        # URL the crawl starts from.  Subclasses may pass None here and
        # supply the real start URL later.
        self.start_url = url
        # Connection pool is created lazily on first request so that merely
        # constructing a Spider needs no network machinery.
        self._http = None

    def open_url(self, url):
        """Fetch *url* with a GET request and return the raw urllib3 response.

        The PoolManager is built once on first use and reused for every
        subsequent request (the original created a fresh pool per call,
        defeating connection reuse).
        """
        if self._http is None:
            self._http = urllib3.PoolManager()
        return self._http.request('GET', url)
class SearchingSpider(Spider):
    """Breadth-first crawler that searches fetched pages for a text snippet."""

    def __init__(self):
        # Bug fix: the original called super().__init__(self), which stored
        # the instance itself as start_url.  The real start URL is supplied
        # to search(); until then there is none.
        super().__init__(None)
        # Frontier of URLs still to visit (FIFO -> breadth-first order).
        self.links = []

    def get_links(self, url):
        """Fetch *url* and return ``(soup, links)``.

        *soup* is the parsed page with every <script> element removed (so
        script source does not pollute text matching); *links* is the list
        of absolute http/https/ftp/ftps hrefs found on the page.
        """
        response = self.open_url(url)
        soup = BeautifulSoup(response.data, 'html.parser')
        for script in soup.find_all('script'):
            script.decompose()
        anchors = soup.find_all(
            'a', attrs={'href': re.compile("^(?:http|ftp)s?://")})
        return soup, [a['href'] for a in anchors]

    def search(self, start_url, content, max_pages=10, show=False):
        """Crawl breadth-first from *start_url* looking for the text *content*.

        Visits at most *max_pages* distinct pages, printing each URL where
        the text occurs; with ``show=True`` also prints a highlighted
        excerpt.  Returns True if the text was found on any visited page.
        """
        self.start_url = start_url
        self.links = [start_url]
        # Treat the needle as literal text, not as a regular expression
        # (the original fed raw user text to re.compile, so characters
        # like '(' or '*' would crash or mis-match).
        pattern = re.compile(re.escape(content))
        visited = set()
        page = 0
        found = False
        while page < max_pages and self.links:
            current_url = self.links.pop(0)
            # Skip already-fetched URLs so link cycles cannot exhaust the
            # page budget refetching the same page.
            if current_url in visited:
                continue
            visited.add(current_url)
            page += 1
            web_data, new_links = self.get_links(current_url)
            if content in web_data.text:
                found = True
                print("\033[92mFound in\033[0m", current_url)
                if show:
                    node = web_data.find(text=pattern)
                    # find() returns None when the needle spans several
                    # text nodes; guard instead of crashing on .split().
                    if node is not None:
                        excerpt = ' '.join(node.split()).replace(
                            content, '\033[91m' + content + '\033[0m')
                        print('\033[93m>\033[0m', excerpt)
            self.links.extend(new_links)
        if not found:
            print("\033[91mNot found\033[0m")
        return found
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment