google news crawler
import re
import urllib.request, urllib.parse
from urllib.parse import urlparse
import bs4

# Uncomment to send requests through an HTTP proxy:
#proxies = {'http' : ''}
#proxy = urllib.request.ProxyHandler(proxies)
#opener = urllib.request.build_opener(proxy)
#urllib.request.install_opener(opener)
def getgoogleurl(search, siteurl=False):
    # Build a Google web-search URL, optionally restricted to one site.
    if not siteurl:
        return 'http://www.google.com/search?q=' + urllib.parse.quote(search)
    return 'http://www.google.com/search?q=site:' + urllib.parse.quote(siteurl) + '%20' + urllib.parse.quote(search)
def getgooglelinks(search, siteurl=False):
    # Google returns 403 without a User-Agent header.
    headers = {'User-agent': 'Mozilla/11.0'}
    # url = getgoogleurl(search, siteurl)
    # Google News (tbm=nws), Japanese locale, results from the past day (tbs=qdr:d).
    url = "http://www.google.co.jp/search?safe=off&channel=fs&hl=ja&tbm=nws&tbs=qdr:d&q=" + urllib.parse.quote(search) + "&oq=" + urllib.parse.quote(search)
    req = urllib.request.Request(url, None, headers)
    return bs4.BeautifulSoup(urllib.request.urlopen(req).read(), "html.parser")
google_url = "http://www.google.co.jp/search?safe=off&client=ubuntu&channel=fs&hl=ja&biw=1920&bih=984&tbm=nws&ei=M7wKW53oIsPJ0ASYxbpo&q=aws&oq=aws" | |
def main(html):
    # Collect result URLs from the /url?q=... redirect links, then fetch
    # each target page and print its title.
    a_list = html.find_all("a")
    url_list = []
    for val in a_list:
        try:
            text = val.get("href")
            if text.find('/url?q=') != -1:
                data = urlparse(text)
                q = urllib.parse.parse_qs(data.query)
                target_url = q['q'][0]
                # Skip Google's cached copies.
                if target_url.find('webcache.googleusercontent.com') != -1:
                    continue
                if target_url not in url_list:
                    url_list.append(target_url)
        except Exception:
            pass
    for val in url_list:
        try:
            s = bs4.BeautifulSoup(urllib.request.urlopen(val).read(), "html.parser")
            title = s.title.string
            if not title:
                continue
            print("\n--------------------")
            print(val)
            print(title)
        except Exception:
            pass
def main2(html):
    # Parse each result block (div.g) into a dict of URL, title,
    # publisher, date and snippet.
    d_list = html.find_all("div", class_="g")
    result = []
    for val in d_list:
        # URL: extract the real target from the /url?q=... redirect.
        url = val.find('a').get('href')
        data = urlparse(url)
        q = urllib.parse.parse_qs(data.query)
        target_url = q['q'][0]
        # Title (strip non-breaking spaces).
        text = val.find('a').text
        text = text.replace("\xa0", '')
        # Content snippet.
        contents = val.find('div', class_="st").text
        # Publication line: "publisher - date", with left-to-right marks removed.
        data = val.find("span", class_="f").text
        data = data.replace("\u200e", "")
        data = data.split("-")
        time = ''
        pub = ''
        if len(data) == 2:
            pub = data[0]
            time = data[1]
            time = time.replace(" ", "")
        news = {
            'url': target_url,
            'title': text,
            'publisher': pub,
            'date': time,
            'content': contents
        }
        result.append(news)
    return result
def show_title(word):
    text = ""
    text += "###############################################\n"
    text += "## " + word + "\n"
    text += "###############################################\n"
    print(text)
if __name__ == "__main__" : | |
keywords = [ | |
"linux", "Ubuntu","RedHat", | |
] | |
for i in keywords : | |
query = urllib.parse.quote(i) | |
html = getgooglelinks(query) | |
r = main2(html) | |
result = "" | |
for j in r : | |
if re.search('[あ-んア-ン一-鿐]', j['title']) is not None : | |
result += ("### [%s](%s)\n"%(j['title'], j['url'])) | |
# print('url : ' + j['url']) | |
# print('title : ' + j['title']) | |
# print('publisher : ' + j['publisher']) | |
# print('date : ' + j['date']) | |
# print('content : ' + j['content']) | |
if result != "" : | |
print("## %s"%i) | |
print(result) | |
print("--------") |