@s-yoshiki · Last active May 29, 2018
google news crawler
import re
import urllib.request, urllib.parse
from urllib.parse import urlparse
import bs4

# Optional proxy setup (disabled):
#proxies = {'http' : ''}
#proxy = urllib.request.ProxyHandler(proxies)
#opener = urllib.request.build_opener(proxy)
#urllib.request.install_opener(opener)
def getgoogleurl(search, siteurl=False):
    if siteurl == False:
        return 'http://www.google.com/search?q=' + urllib.parse.quote(search)
    else:
        return 'http://www.google.com/search?q=site:' + urllib.parse.quote(siteurl) + '%20' + urllib.parse.quote(search)
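# Examples of the URLs getgoogleurl() builds (hypothetical queries, for illustration only):
#   getgoogleurl("aws")                -> http://www.google.com/search?q=aws
#   getgoogleurl("aws", "example.com") -> http://www.google.com/search?q=site:example.com%20aws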
def getgooglelinks(search, siteurl=False):
    # google returns 403 without a user agent
    headers = {'User-agent': 'Mozilla/11.0'}
    # url = getgoogleurl(search,siteurl)
    # Hard-coded Google News query (tbm=nws), limited to the last day (tbs=qdr:d), Japanese UI (hl=ja)
    url = "http://www.google.co.jp/search?safe=off&channel=fs&hl=ja&tbm=nws&tbs=qdr:d&q=" + urllib.parse.quote(search) + "&oq=" + urllib.parse.quote(search)
    req = urllib.request.Request(url, None, headers)
    return bs4.BeautifulSoup(urllib.request.urlopen(req).read(), "html.parser")
# Sample result-page URL kept for reference (unused)
google_url = "http://www.google.co.jp/search?safe=off&client=ubuntu&channel=fs&hl=ja&biw=1920&bih=984&tbm=nws&ei=M7wKW53oIsPJ0ASYxbpo&q=aws&oq=aws"
def main(html):
    # Collect result links from the page, then fetch each one and print its <title>
    a_list = html.find_all("a")
    url_list = []
    for val in a_list:
        try:
            text = val.get("href")
            if text.find('/url?q=') != -1:
                data = urlparse(text)
                q = urllib.parse.parse_qs(data.query)
                target_url = q['q'][0]
                if target_url.find('webcache.googleusercontent.com') != -1:
                    continue
                if target_url not in url_list:
                    url_list.append(target_url)
        except Exception:
            pass
    for val in url_list:
        try:
            s = bs4.BeautifulSoup(urllib.request.urlopen(val).read(), "html.parser")
            title = s.title.string
            if title == "":
                continue
            print("\n--------------------")
            print(val)
            print(title)
        except Exception:
            pass
def main2(html):
    d_list = html.find_all("div", class_="g")
    result = []
    for val in d_list:
        # URL
        url = val.find('a').get('href')
        data = urlparse(url)
        q = urllib.parse.parse_qs(data.query)
        target_url = q['q'][0]
        # title
        text = val.find('a').text
        text = text.replace("\xa0", '')
        # content
        contents = val.find('div', class_="st").text
        # publication
        data = val.find("span", class_="f").text
        data = data.replace("\u200e", "")
        data = data.split("-")
        time = ''
        pub = ''
        if len(data) == 2:
            pub = data[0]
            time = data[1]
            time = time.replace(" ", "")
        news = {
            'url': target_url,
            'title': text,
            'publisher': pub,
            'date': time,
            'content': contents
        }
        result.append(news)
    return result
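# A minimal, self-contained sketch of the markup main2() expects. The class names
# ("g", "st", "f") and the /url?q= redirect format are assumptions about the old
# Google News result HTML and may no longer match the live page; this helper is
# only meant for exercising the parser offline.
def _demo_main2():
    sample = """
    <div class="g">
      <a href="/url?q=https://example.com/article&amp;sa=U">Example article title</a>
      <div class="st">Short snippet of the article body ...</div>
      <span class="f">Example Publisher - 3 hours ago</span>
    </div>
    """
    html = bs4.BeautifulSoup(sample, "html.parser")
    print(main2(html))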
def show_title(word):
    text = ""
    text += "###############################################\n"
    text += "## " + word + "\n"
    text += "###############################################\n"
    print(text)
if __name__ == "__main__":
    keywords = [
        "linux", "Ubuntu", "RedHat",
    ]
    for i in keywords:
        # getgooglelinks() already URL-encodes the keyword, so pass it as-is
        html = getgooglelinks(i)
        r = main2(html)
        result = ""
        for j in r:
            # keep only articles whose titles contain Japanese characters
            if re.search('[あ-んア-ン一-鿐]', j['title']) is not None:
                result += ("### [%s](%s)\n" % (j['title'], j['url']))
                # print('url : ' + j['url'])
                # print('title : ' + j['title'])
                # print('publisher : ' + j['publisher'])
                # print('date : ' + j['date'])
                # print('content : ' + j['content'])
        if result != "":
            print("## %s" % i)
            print(result)
            print("--------")