how to scrape
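
A small Python 2 script that fetches each target OKWave page with urllib2, parses it with BeautifulSoup 3, extracts the question title (div.title), the question body (div.q_desc), and every link on the page, and writes the result as JSON to a file named after the URL.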
# coding:utf-8
from BeautifulSoup import BeautifulSoup
import urllib2
import json


def crawl(tmp_url):
    # Fetch the page and parse it with BeautifulSoup 3.
    page = urllib2.urlopen(tmp_url)
    soup = BeautifulSoup(page.read())

    parsed_page = dict()

    # Question title: <div class="title">, or None if the page lacks one.
    title_element = soup.find("div", {"class": "title"})
    if title_element is None:
        parsed_page["title"] = None
    else:
        parsed_page["title"] = title_element.text

    # Question body: <div class="q_desc">, or None if the page lacks one.
    desc_element = soup.find("div", {"class": "q_desc"})
    if desc_element is None:
        parsed_page["q_desc"] = None
    else:
        parsed_page["q_desc"] = desc_element.text

    # Collect every outgoing link; href=True skips anchors that have no
    # href attribute, which would otherwise raise a KeyError.
    anchor_elements = soup.findAll("a", href=True)
    anchor_list = []
    for anc in anchor_elements:
        anchor_list.append(anc["href"])
    parsed_page["links"] = anchor_list

    return json.dumps(parsed_page)


if __name__ == '__main__':
    target_urls = list()
    target_urls.append('http://okwave.jp/qa/q8329615.html')
    target_urls.append('http://okwave.jp/oodakedo/2087.html')

    for target_url in target_urls:
        # Derive a filesystem-safe file name from the URL.
        file_name_url = target_url.replace(".", "_")
        file_name_url = file_name_url.replace("/", "")
        file_name_url = file_name_url.replace(":", "")
        fo = open("result_%s.json" % file_name_url, "wb")
        fo.write(crawl(target_url))
        fo.close()
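
The script above targets Python 2 (urllib2) and the long-unmaintained BeautifulSoup 3. Below is a minimal sketch of the same crawl on Python 3 with bs4 (BeautifulSoup 4); the beautifulsoup4 package and the html.parser backend are assumptions, and the title / q_desc selectors are kept as-is on the assumption that the OKWave markup has not changed:

# coding: utf-8
# Rough Python 3 / bs4 sketch of crawl(); not a drop-in replacement.
import json
import urllib.request

from bs4 import BeautifulSoup  # assumes: pip install beautifulsoup4


def crawl(url):
    with urllib.request.urlopen(url) as page:
        soup = BeautifulSoup(page.read(), "html.parser")
    parsed_page = {"title": None, "q_desc": None}
    title = soup.find("div", {"class": "title"})
    if title is not None:
        parsed_page["title"] = title.get_text()
    desc = soup.find("div", {"class": "q_desc"})
    if desc is not None:
        parsed_page["q_desc"] = desc.get_text()
    # href=True skips anchors with no href attribute
    parsed_page["links"] = [a["href"] for a in soup.find_all("a", href=True)]
    return json.dumps(parsed_page, ensure_ascii=False)

Either version writes one result_<mangled-url>.json file per target URL; for example, the first URL produces result_httpokwave_jpqaq8329615_html.json.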