google news crawler
import re
import urllib.request, urllib.parse
from urllib.parse import urlparse
import bs4

# Uncomment to send requests through an HTTP proxy:
#proxies = {'http' : ''}
#proxy = urllib.request.ProxyHandler(proxies)
#opener = urllib.request.build_opener(proxy)
#urllib.request.install_opener(opener)
def getgoogleurl(search, siteurl=False):
    # Build a Google web-search URL, optionally restricted to one site.
    if not siteurl:
        return 'http://www.google.com/search?q=' + urllib.parse.quote(search)
    return 'http://www.google.com/search?q=site:' + urllib.parse.quote(siteurl) + '%20' + urllib.parse.quote(search)
def getgooglelinks(search, siteurl=False):
    # Google returns 403 without a User-Agent header.
    headers = {'User-agent': 'Mozilla/11.0'}
    # url = getgoogleurl(search, siteurl)
    # Google News (tbm=nws), Japanese locale, results from the past day (tbs=qdr:d).
    url = "http://www.google.co.jp/search?safe=off&channel=fs&hl=ja&tbm=nws&tbs=qdr:d&q=" + urllib.parse.quote(search) + "&oq=" + urllib.parse.quote(search)
    req = urllib.request.Request(url, None, headers)
    return bs4.BeautifulSoup(urllib.request.urlopen(req).read(), "html.parser")
google_url = "http://www.google.co.jp/search?safe=off&client=ubuntu&channel=fs&hl=ja&biw=1920&bih=984&tbm=nws&ei=M7wKW53oIsPJ0ASYxbpo&q=aws&oq=aws" | |
def main(html):
    # Collect result URLs from the /url?q=... redirect links, then fetch
    # each target page and print its title.
    a_list = html.find_all("a")
    url_list = []
    for val in a_list:
        try:
            text = val.get("href")
            if text.find('/url?q=') != -1:
                data = urlparse(text)
                q = urllib.parse.parse_qs(data.query)
                target_url = q['q'][0]
                # Skip Google's cached copies.
                if target_url.find('webcache.googleusercontent.com') != -1:
                    continue
                if target_url not in url_list:
                    url_list.append(target_url)
        except Exception:
            pass
    for val in url_list:
        try:
            s = bs4.BeautifulSoup(urllib.request.urlopen(val).read(), "html.parser")
            title = s.title.string
            if not title:
                continue
            print("\n--------------------")
            print(val)
            print(title)
        except Exception:
            pass
def main2(html):
    # Parse each result block (div.g) into a dict of URL, title,
    # publisher, date and snippet.
    d_list = html.find_all("div", class_="g")
    result = []
    for val in d_list:
        # URL: extract the real target from the /url?q=... redirect.
        url = val.find('a').get('href')
        data = urlparse(url)
        q = urllib.parse.parse_qs(data.query)
        target_url = q['q'][0]
        # Title (strip non-breaking spaces).
        text = val.find('a').text
        text = text.replace("\xa0", '')
        # Content snippet.
        contents = val.find('div', class_="st").text
        # Publication line: "publisher - date", with left-to-right marks removed.
        data = val.find("span", class_="f").text
        data = data.replace("\u200e", "")
        data = data.split("-")
        time = ''
        pub = ''
        if len(data) == 2:
            pub = data[0]
            time = data[1]
            time = time.replace(" ", "")
        news = {
            'url': target_url,
            'title': text,
            'publisher': pub,
            'date': time,
            'content': contents
        }
        result.append(news)
    return result
def show_title(word):
    text = ""
    text += "###############################################\n"
    text += "## " + word + "\n"
    text += "###############################################\n"
    print(text)
if __name__ == "__main__" : | |
keywords = [ | |
"linux", "Ubuntu","RedHat", | |
] | |
for i in keywords : | |
query = urllib.parse.quote(i) | |
html = getgooglelinks(query) | |
r = main2(html) | |
result = "" | |
for j in r : | |
if re.search('[あ-んア-ン一-鿐]', j['title']) is not None : | |
result += ("### [%s](%s)\n"%(j['title'], j['url'])) | |
# print('url : ' + j['url']) | |
# print('title : ' + j['title']) | |
# print('publisher : ' + j['publisher']) | |
# print('date : ' + j['date']) | |
# print('content : ' + j['content']) | |
if result != "" : | |
print("## %s"%i) | |
print(result) | |
print("--------") |