Search the text of sites in Google's results for a query
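# Pipeline: fetch Google result pages for a query, follow each organic result
# link, extract long Russian text blocks from the page, flag blocks containing
# any of the "deep" keywords, and tally domains. Output: result.csv (per-link
# summary), uniques.csv (domains without hits), targets.csv (domains with hits).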
import requests
from bs4 import BeautifulSoup
from langdetect import detect
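
# Fetch one page and split its <div> text into paragraphs, keeping only long
# Russian-language blocks. Blocks that contain any deep key go to `found`,
# everything else to `overall`.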
def page_to_text(url, deep_keys):
    try:
        page = requests.get(url)
    except requests.RequestException:
        print('failed to fetch ' + url)
        return [], []
    soup = BeautifulSoup(page.content, 'lxml')
    overall = []
    found = []
    for tag in soup.find_all("div"):
        tag_text = tag.get_text()
        if len(tag_text) > 300:
            for text in tag_text.split("\n"):
                if len(text) > 200 and detect(text) == 'ru':
                    text = text.lower()
                    if any(key in text for key in deep_keys):
                        found.append(text)
                    else:
                        overall.append(text)
    return overall, found
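
# Increment a per-domain counter held in a plain dict
# (collections.Counter would serve the same purpose).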
def rate_in(counter, key):
    counter[key] = counter.get(key, 0) + 1
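
# Scrape up to 30 pages of Google results for `query` and scan every linked
# page for the deep keys. The BNeawe div class and the /url?q= redirect wrapper
# match the no-JavaScript HTML Google served at the time this was written;
# both are internal details and may change at any time.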
def search(query, deep_keys):
    result_tokens = []
    result = []
    pos = 0
    uniques = {}
    targets = {}
    for start in range(0, 300, 10):  # 30 result pages, 10 results per page
        page = requests.get("https://www.google.com/search",
                            params={"source": "hp", "q": query, "start": start})
        soup = BeautifulSoup(page.content, 'lxml')
        for tag in soup.find_all("a"):
            a_tag = tag.find("div", class_="BNeawe")
            if a_tag:
                # Organic results wrap the target in a /url?q=<link>&sa=... redirect.
                href = tag.attrs['href'].split("/url?q=")
                if len(href) > 1:
                    link = href[1].split("&sa=")[0]
                    text, found = page_to_text(link, deep_keys)
                    is_negative = ""
                    domain = link.split("/")[2]
                    if len(found) > 0:
                        print(link)
                        is_negative = "toxic"
                        rate_in(targets, domain)
                    else:
                        print(a_tag.text)
                        rate_in(uniques, domain)
                    info = {
                        "pos": pos,
                        "negative": is_negative,
                        "link": link,
                        "title": a_tag.text
                    }
                    result_tokens.append({
                        "all": text,
                        "toxic": found,
                        "info": info
                    })
                    result.append(info)
                    pos = pos + 1
    write_csv_array(result, "result.csv")
    write_csv_dict(uniques, "uniques.csv")
    write_csv_dict(targets, "targets.csv")
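
# Minimal hand-rolled CSV writers: values are not quoted or escaped, so a
# comma inside a title or URL will shift columns. Python's csv module would
# handle quoting if that matters.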
def write_csv_dict(counter, filename):
    out = "key, value"
    for key in counter:
        out += "\n" + str(key) + ", " + str(counter[key])
    with open(filename, "w") as f:
        f.write(out)
def write_csv_array(array, filename):
    if len(array) == 0:
        return
    head = array[0].keys()
    out = ", ".join(str(v) for v in head)
    for o in array:
        out += "\n" + ", ".join(str(o[x]) for x in head)
    with open(filename, "w") as f:
        f.write(out)
search("основной запрос", ["фильтр слов", "поиска по контенту"]) |