Last active
August 29, 2015 14:21
-
-
Save tsudoko/69aaec15bd40b5b29e6f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
from bs4 import BeautifulSoup | |
import urllib.request | |
import os.path | |
import sys | |
def get_contents_plain(html):
    """Return the plain-text contents of a page given its HTML.

    Locates the first ``<div class="contents_body">``, drops the
    ``<div class="fc2_footer">`` nested in it (when there is one), replaces
    every ``<br>`` tag with a newline character, and returns the text that
    remains.

    Args:
        html: Page markup to parse; passed straight to ``BeautifulSoup``.

    Returns:
        The text of the contents container, with ``<br>`` tags rendered as
        newlines.

    Raises:
        ValueError: If the markup has no ``div`` with class
            ``contents_body``.
    """
    # Name the parser explicitly: without it bs4 picks whichever parser
    # happens to be installed (and warns), so the same page could parse
    # differently from one machine to another.
    soup = BeautifulSoup(html, "html.parser")

    contents_body = soup.find("div", class_="contents_body")
    if contents_body is None:
        raise ValueError('no <div class="contents_body"> found in document')

    # The footer is only stripped as cleanup; a page without one is still
    # perfectly usable, so its absence must not abort the extraction.
    footer = contents_body.find("div", class_="fc2_footer")
    if footer is not None:
        footer.decompose()

    # get_text() would otherwise glue lines separated only by <br> together.
    for br in contents_body.find_all("br"):
        br.replace_with("\n")

    return contents_body.get_text()
def main(argv):
    """Fetch the URL named on the command line and print its plain text.

    Args:
        argv: Full argument vector; ``argv[0]`` is the program name and
            ``argv[1]`` is the URL to download.

    Returns:
        Process exit status: 0 on success, 1 when the URL argument is
        missing (a usage line is written to stderr in that case).
    """
    if len(argv) < 2:
        print("usage: %s [site]" % os.path.basename(argv[0]), file=sys.stderr)
        return 1

    # Use the response as a context manager so the connection is closed
    # even if reading fails part-way through.
    with urllib.request.urlopen(argv[1]) as response:
        site = response.read()

    print(get_contents_plain(site))
    return 0


# Guarded so that importing this module (e.g. to reuse get_contents_plain)
# does not trigger a network request or touch sys.argv.
if __name__ == "__main__":
    sys.exit(main(sys.argv))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment