Search the text of sites in Google's results for a query
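# Pipeline: fetch Google result pages for a query, follow each organic result
# link, extract long Russian text blocks from the page, flag blocks containing
# any of the "deep" keywords, and tally domains. Output: result.csv (per-link
# summary), uniques.csv (domains without hits), targets.csv (domains with hits).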
import requests
from bs4 import BeautifulSoup
from langdetect import detect
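
# Fetch one page and split its <div> text into paragraphs, keeping only long
# Russian-language blocks. Blocks that contain any deep key go to `found`,
# everything else to `overall`.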
def page_to_text(url, deep_keys):
    try:
        page = requests.get(url)
    except requests.RequestException:
        print('failed to fetch ' + url)
        return [], []
    soup = BeautifulSoup(page.content, 'lxml')
    overall = []
    found = []
    for tag in soup.find_all("div"):
        tag_text = tag.get_text()
        if len(tag_text) > 300:
            for text in tag_text.split("\n"):
                if len(text) > 200 and detect(text) == 'ru':
                    text = text.lower()
                    if any(key in text for key in deep_keys):
                        found.append(text)
                    else:
                        overall.append(text)
    return overall, found
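
# Increment a per-domain counter held in a plain dict
# (collections.Counter would serve the same purpose).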
def rate_in(counter, key):
    counter[key] = counter.get(key, 0) + 1
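
# Scrape up to 30 pages of Google results for `query` and scan every linked
# page for the deep keys. The BNeawe div class and the /url?q= redirect wrapper
# match the no-JavaScript HTML Google served at the time this was written;
# both are internal details and may change at any time.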
def search(query, deep_keys):
    result_tokens = []
    result = []
    pos = 0
    uniques = {}
    targets = {}
    for start in range(0, 300, 10):  # 30 result pages, 10 results per page
        page = requests.get("https://www.google.com/search",
                            params={"source": "hp", "q": query, "start": start})
        soup = BeautifulSoup(page.content, 'lxml')
        for tag in soup.find_all("a"):
            a_tag = tag.find("div", class_="BNeawe")
            if a_tag:
                # Organic results wrap the target in a /url?q=<link>&sa=... redirect.
                href = tag.attrs['href'].split("/url?q=")
                if len(href) > 1:
                    link = href[1].split("&sa=")[0]
                    text, found = page_to_text(link, deep_keys)
                    is_negative = ""
                    domain = link.split("/")[2]
                    if len(found) > 0:
                        print(link)
                        is_negative = "toxic"
                        rate_in(targets, domain)
                    else:
                        print(a_tag.text)
                        rate_in(uniques, domain)
                    info = {
                        "pos": pos,
                        "negative": is_negative,
                        "link": link,
                        "title": a_tag.text
                    }
                    result_tokens.append({
                        "all": text,
                        "toxic": found,
                        "info": info
                    })
                    result.append(info)
                    pos = pos + 1
    write_csv_array(result, "result.csv")
    write_csv_dict(uniques, "uniques.csv")
    write_csv_dict(targets, "targets.csv")
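
# Minimal hand-rolled CSV writers: values are not quoted or escaped, so a
# comma inside a title or URL will shift columns. Python's csv module would
# handle quoting if that matters.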
def write_csv_dict(counter, filename):
    out = "key, value"
    for key in counter:
        out += "\n" + str(key) + ", " + str(counter[key])
    with open(filename, "w") as f:
        f.write(out)
def write_csv_array(array, filename):
    if len(array) == 0:
        return
    head = array[0].keys()
    out = ", ".join(str(v) for v in head)
    for o in array:
        out += "\n" + ", ".join(str(o[x]) for x in head)
    with open(filename, "w") as f:
        f.write(out)
search("основной запрос", ["фильтр слов", "поиска по контенту"]) |