blooser · August 5, 2018 20:10
diff --git a/spider.py b/spider.py
 from bs4 import BeautifulSoup
 import re
 import urllib3

 # Globally silence urllib3's InsecureRequestWarning, which urllib3 emits for
 # HTTPS requests made without certificate verification.
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


 class Spider:
    """Minimal base crawler: remembers a start URL and fetches pages."""

    def __init__(self, url):
        """Store *url* on the instance as ``start_url``."""
        self.start_url = url

    def open_url(self, url):
        """Send a GET request for *url* and return the urllib3 response.

        A fresh ``urllib3.PoolManager`` is created for every call.
        """
        return urllib3.PoolManager().request('GET', url)

 
 class SearchingSpider(Spider):
    """Breadth-first crawler that looks for a piece of text on linked pages.

    Pages are fetched in FIFO order starting from one URL; the absolute
    http(s)/ftp(s) links found on each fetched page are queued for later.
    """

    # Only absolute http://, https://, ftp:// and ftps:// targets are followed.
    _LINK_PATTERN = re.compile(r"^(?:http|ftp)s?://")

    def __init__(self, url=None):
        """Create a spider with an empty link queue.

        Args:
            url: optional start URL stored as ``start_url``. ``search`` takes
                its own start URL, so this may be omitted.
        """
        # Forward the URL, not the instance, so ``start_url`` never ends up
        # pointing at the spider itself.
        super().__init__(url)
        self.links = []

    def get_links(self, url):
        """Fetch *url* and return ``(soup, links)``.

        ``soup`` is the parsed page with every ``<script>`` element removed.
        ``links`` holds the ``href`` of each ``<a>`` whose target matches
        ``_LINK_PATTERN``, in document order.
        """
        response = self.open_url(url)
        soup = BeautifulSoup(response.data, 'html.parser')
        # Remove scripts so their source never counts as page text.
        for script in soup.find_all('script'):
            script.decompose()
        anchors = soup.find_all('a', attrs={'href': self._LINK_PATTERN})
        return soup, [anchor['href'] for anchor in anchors]

    def search(self, start_url, content, max_pages=10, show=False):
        """Crawl from *start_url* and report each page whose text has *content*.

        Args:
            start_url: URL of the first page to fetch.
            content: literal text to look for in each page's text.
            max_pages: maximum number of distinct pages to fetch.
            show: when true, also print the matching text node with
                *content* highlighted.

        Prints a "Found in" line for every matching page, or "Not found"
        when no fetched page contained *content*. Returns ``None``.
        """
        self.links = [start_url]
        visited = set()
        page = 0
        found = False
        # Escape so regex metacharacters in *content* are matched literally,
        # exactly like the ``in`` test below treats them.
        needle = re.compile(re.escape(content))

        while page < max_pages and self.links:
            current_url = self.links.pop(0)
            if current_url in visited:
                # Already fetched: do not spend the page budget on it again
                # (also stops link cycles from starving the crawl).
                continue
            visited.add(current_url)
            page += 1
            web_data, new_links = self.get_links(current_url)
            if content in web_data.text:
                found = True
                print("\033[92mFound in\033[0m", current_url)
                if show:
                    node = web_data.find(text=needle)
                    # ``.text`` concatenates all nodes, so a hit may span
                    # several nodes and no single node contains it.
                    if node is not None:
                        data = ' '.join(node.split())
                        data = data.replace(content, '\033[91m' + content + '\033[0m')
                        print('\033[93m>\033[0m', data)
            self.links.extend(new_links)

        if not found:
            print("\033[91mNot found\033[0m")
	from bs4 import BeautifulSoup
	import re
	import urllib3

	# Globally silence urllib3's InsecureRequestWarning, which urllib3 emits for
	# HTTPS requests made without certificate verification.
	urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)


	class Spider:
	    """Minimal base crawler: remembers a start URL and fetches pages."""

	    def __init__(self, url):
	        """Store *url* on the instance as ``start_url``."""
	        self.start_url = url

	    def open_url(self, url):
	        """Send a GET request for *url* and return the urllib3 response.

	        A fresh ``urllib3.PoolManager`` is created for every call.
	        """
	        http = urllib3.PoolManager()
	        response = http.request('GET', url)
	        return response


	class SearchingSpider(Spider):
	    """Breadth-first crawler that looks for a piece of text on linked pages.

	    Pages are fetched in FIFO order starting from one URL; the absolute
	    http(s)/ftp(s) links found on each fetched page are queued for later.
	    """

	    # Only absolute http://, https://, ftp:// and ftps:// targets are
	    # followed. The pipe is a regex alternation and must stay unescaped.
	    _LINK_PATTERN = re.compile(r"^(?:http|ftp)s?://")

	    def __init__(self, url=None):
	        """Create a spider with an empty link queue.

	        Args:
	            url: optional start URL stored as ``start_url``. ``search``
	                takes its own start URL, so this may be omitted.
	        """
	        # Forward the URL, not the instance, so ``start_url`` never ends up
	        # pointing at the spider itself.
	        super().__init__(url)
	        self.links = []

	    def get_links(self, url):
	        """Fetch *url* and return ``(soup, links)``.

	        ``soup`` is the parsed page with every ``<script>`` element removed.
	        ``links`` holds the ``href`` of each ``<a>`` whose target matches
	        ``_LINK_PATTERN``, in document order.
	        """
	        response = self.open_url(url)
	        soup = BeautifulSoup(response.data, 'html.parser')
	        # Remove scripts so their source never counts as page text.
	        for script in soup.find_all('script'):
	            script.decompose()
	        anchors = soup.find_all('a', attrs={'href': self._LINK_PATTERN})
	        return soup, [anchor['href'] for anchor in anchors]

	    def search(self, start_url, content, max_pages=10, show=False):
	        """Crawl from *start_url* and report each page whose text has *content*.

	        Args:
	            start_url: URL of the first page to fetch.
	            content: literal text to look for in each page's text.
	            max_pages: maximum number of distinct pages to fetch.
	            show: when true, also print the matching text node with
	                *content* highlighted.

	        Prints a "Found in" line for every matching page, or "Not found"
	        when no fetched page contained *content*. Returns ``None``.
	        """
	        self.links = [start_url]
	        visited = set()
	        page = 0
	        found = False
	        # Escape so regex metacharacters in *content* are matched literally,
	        # exactly like the ``in`` test below treats them.
	        needle = re.compile(re.escape(content))

	        while page < max_pages and self.links:
	            current_url = self.links.pop(0)
	            if current_url in visited:
	                # Already fetched: do not spend the page budget on it again
	                # (also stops link cycles from starving the crawl).
	                continue
	            visited.add(current_url)
	            page += 1
	            web_data, new_links = self.get_links(current_url)
	            if content in web_data.text:
	                found = True
	                print("\033[92mFound in\033[0m", current_url)
	                if show:
	                    node = web_data.find(text=needle)
	                    # ``.text`` concatenates all nodes, so a hit may span
	                    # several nodes and no single node contains it.
	                    if node is not None:
	                        data = ' '.join(node.split())
	                        data = data.replace(content, '\033[91m' + content + '\033[0m')
	                        print('\033[93m>\033[0m', data)
	            self.links.extend(new_links)

	        if not found:
	            print("\033[91mNot found\033[0m")