Expects a file `urls.csv` containing a list of URLs, one per line. For each URL it scrapes the article content, and every scraped article at least 300 characters long is saved to `gold_data.csv`.
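For illustration, `urls.csv` could look like the following (the URLs are hypothetical placeholders). Each line is read as a single-column CSV row, and no header line is expected:

```
https://example.com/articles/first-story
https://example.com/articles/second-story
```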
import csv

import requests
from newspaper import Article

INFILE_NAME = "urls.csv"
OUTFILE_NAME = "gold_data.csv"


def get_sample_from_url(url):
    # Check that the URL responds before handing it to newspaper.
    page = requests.get(url)
    if page.status_code != 200:
        print('Status code not okay, ignoring {}'.format(url))
        return None

    # Download and parse the article body.
    article = Article(url, language='en')
    article.download()
    article.parse()

    # Skip pages whose extracted text is too short to be a useful document.
    if len(article.text) < 300:
        print('Document too short, ignoring {}'.format(url))
        return None

    sample = {
        # Drop any non-ASCII characters from the scraped text and title.
        'content': article.text.encode('ascii', errors='ignore').decode('ascii'),
        'title': article.title.encode('ascii', errors='ignore').decode('ascii'),
        'source_id': 0,
        'model_id': 0,
        'text_type': 'document',
        'url': url,
        'label_gold': '',
        'label_gold_reason': '',
        '_golden': 'TRUE',
    }
    print('Scraped {}'.format(url))
    return sample


def write_document_samples_to_file(samples, outfile_name):
    fieldnames = ['url', 'source_id', 'model_id', 'text_type', 'title',
                  'content', 'label_gold', 'label_gold_reason', '_golden']
    # newline='' stops the csv module emitting blank rows on Windows.
    with open(outfile_name, 'w', newline='') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        for sample in samples:
            writer.writerow(sample)
    print('Written {} samples to {}'.format(len(samples), outfile_name))


def main():
    # urls.csv is a single-column CSV: one URL per row.
    with open(INFILE_NAME) as infile:
        reader = csv.reader(infile)
        urls = [row[0] for row in reader if row]

    samples = []
    for url in urls:
        # A failure on one URL should not abort the whole run.
        try:
            sample = get_sample_from_url(url)
            if sample:
                samples.append(sample)
        except Exception as exception:
            print('Error scraping {}: \n\t{}'.format(url, exception))
            continue

    write_document_samples_to_file(samples, OUTFILE_NAME)


if __name__ == "__main__":
    main()
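The script as written here targets Python 3; the `newspaper` import is provided by the `newspaper3k` package on PyPI (the Python 3 release of the library), so the dependencies install with `pip install requests newspaper3k`. The resulting `gold_data.csv` begins with the header row `url,source_id,model_id,text_type,title,content,label_gold,label_gold_reason,_golden`, followed by one row per successfully scraped URL.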