Skip to content

Instantly share code, notes, and snippets.

@jeshuamaxey
Last active March 10, 2016 16:03
Show Gist options
  • Save jeshuamaxey/9c4fb3bfcbba15bcc3bb to your computer and use it in GitHub Desktop.
Save jeshuamaxey/9c4fb3bfcbba15bcc3bb to your computer and use it in GitHub Desktop.
Expects a file `urls.csv` containing a list of URLs, one per line. For each URL it scrapes the article content and saves every scraped article that is at least 300 characters long to `gold_data.csv`.
import csv
from httplib import BadStatusLine
from newspaper import Article
import requests
# Input CSV read by main(); the first column of each row is taken as a URL.
INFILE_NAME = "urls.csv"
# Output CSV that main() passes to write_document_samples_to_file().
OUTFILE_NAME = "gold_data.csv"
def get_sample_from_url(url, timeout=30, min_length=300):
    """Scrape the article at *url* and build one gold-data sample row.

    Args:
        url: Address of the article to scrape.
        timeout: Seconds to wait for the initial status-check request
            before giving up. Without a timeout a single unresponsive
            host would stall the whole run.
        min_length: Minimum number of characters the extracted article
            text must have for the sample to be kept.

    Returns:
        A dict with the keys written to the output CSV (``url``,
        ``source_id``, ``model_id``, ``text_type``, ``title``, ``content``,
        ``label_gold``, ``label_gold_reason``, ``_golden``), or ``None``
        when the URL does not answer with HTTP 200 or the extracted text
        is shorter than *min_length*.

    Raises:
        Whatever ``requests`` or ``newspaper`` raise on network or parsing
        failures (including a timeout on the status check); the caller is
        expected to handle these per URL.
    """
    # Status check first: skip pages that do not answer 200 before asking
    # newspaper to download and parse them.
    page = requests.get(url, timeout=timeout)
    if page.status_code != 200:
        print('Status code not okay, ignoring {}'.format(url))
        return None

    article = Article(url, language='en')
    article.download()
    article.parse()

    if len(article.text) < min_length:
        print('Document too short, ignoring {}'.format(url))
        return None

    sample = {
        # Non-ASCII characters are dropped from the text fields.
        'content': article.text.encode('ascii', errors='ignore'),
        'title': article.title.encode('ascii', errors='ignore'),
        'source_id': 0,
        'model_id': 0,
        'text_type': 'document',
        'url': url,
        'label_gold': '',
        'label_gold_reason': '',
        '_golden': 'TRUE',
    }
    print('Scraped {}'.format(url))
    return sample
def write_document_samples_to_file(samples, outfile_name):
    """Write scraped samples to *outfile_name* as a CSV file with a header.

    Args:
        samples: Iterable of dicts, each holding exactly the keys listed in
            ``fieldnames`` below (a missing key is written as an empty
            cell; an unexpected key makes ``csv.DictWriter`` raise
            ``ValueError``).
        outfile_name: Path of the CSV file to create or overwrite.
    """
    import sys  # local import: only needed to pick the file mode below

    fieldnames = [
        'url', 'source_id', 'model_id', 'text_type', 'title', 'content',
        'label_gold', 'label_gold_reason', '_golden',
    ]

    # Materialise once so generators work and len() is valid for the
    # summary message.
    samples = list(samples)

    # The csv module must control line endings itself, otherwise fields
    # with embedded newlines (article bodies have many) can be corrupted
    # on platforms that translate line endings: binary mode on Python 2,
    # text mode with newline='' on Python 3.
    if sys.version_info[0] < 3:
        outfile = open(outfile_name, 'wb')
    else:
        outfile = open(outfile_name, 'w', newline='')

    with outfile:
        writer = csv.DictWriter(outfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(samples)

    print('Written {} samples to {}'.format(len(samples), outfile_name))
def main():
    """Scrape every URL listed in INFILE_NAME and write the results.

    Reads the first column of each row of INFILE_NAME as a URL, scrapes
    each one with get_sample_from_url(), and writes all successfully
    scraped samples to OUTFILE_NAME. A failure on one URL is reported and
    does not stop the run.
    """
    with open(INFILE_NAME) as infile:
        # csv.reader yields an empty list for a blank line; skip those
        # (and whitespace-only cells) instead of failing with IndexError.
        urls = [
            row[0].strip()
            for row in csv.reader(infile)
            if row and row[0].strip()
        ]

    samples = []
    for url in urls:
        # Best-effort per URL: network and parser errors vary widely, so
        # catch broadly here, report, and move on to the next URL.
        try:
            sample = get_sample_from_url(url)
        except Exception as exception:
            # Format the exception itself: `.message` is deprecated, is
            # often empty, and does not exist on Python 3.
            print('Error scraping {}: \n\t{}'.format(url, exception))
        else:
            if sample:
                samples.append(sample)

    write_document_samples_to_file(samples, OUTFILE_NAME)
# Run the scraper only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment