Script to generate dataset from DH Q&A archive (preliminary)
#!/usr/bin/env python
'''
Script to parse data from the DH Q&A archive.

Install python dependencies:

    pip install beautifulsoup4 feedparser requests

Clone the DH Q&A archive repository:
https://github.com/achorg/DH-Answers-Archive

Run this script in the top-level directory of that repository.
'''
import csv
import datetime
import glob
import os
import re

from bs4 import BeautifulSoup, Comment
import feedparser
import requests

baseurl = 'http://digitalhumanities.org/answers'
def get_post_info(div, topic_url, feed):
    '''Take a post container and return a dict of post info.
    Takes the bs4 div for the post, the base url for its topic,
    and an rss feedparser object (or None).'''
    info = {}
    # generate permalink from the li id, since in at least one
    # case the permalink isn't found
    info['url'] = '%s#%s' % (topic_url, div['id'])
    # first div id includes order information as position-#
    info['order'] = div.div['id'].split('-')[1]
    threadauthor = div.find('div', class_='threadauthor')
    author_url = threadauthor.a['href']
    # members have local profile urls
    if author_url.startswith('/'):
        author_url = '%s%s' % (baseurl, author_url)
    info['author url'] = author_url
    info['author'] = threadauthor.find('strong').get_text()
    # question is in the first threadpost
    threadpost = div.find('div', class_='threadpost')
    # remove 'tweet this question' block and related comments
    social = threadpost.find('div', class_='social-it')
    if social:
        social.extract()
    for comment in threadpost.find_all(
            text=lambda text: isinstance(text, Comment)):
        comment.extract()
    # get post content
    info['content'] = threadpost.div.prettify()
    # check if this is a reply to a specific post
    if threadpost.p and threadpost.p.get_text().startswith('Replying to'):
        # name could be a link, so get the last link in the reply p
        reply_to_post = threadpost.p.find_all('a')[-1]['href']
        info['reply to'] = '%s%s' % (baseurl, reply_to_post)
    # check if marked as a best answer
    info['best answer'] = bool(div.find('div', class_='best_answer'))
    # post date; the html only has a relative date ("Posted x years ago")
    poststuff = div.find('div', class_='poststuff')
    if poststuff:
        relative_post_date = poststuff.text.replace('Posted ', '') \
            .replace(' Permalink', '')
        info['relative date'] = relative_post_date.strip()
    else:
        print('poststuff div not found for %(url)s' % info)
    # find the rss entry for this post if possible; the feed is the
    # only source for a full timestamp
    if feed:
        entries = [e for e in feed.entries if e.link == info['url']]
        if entries:
            # convert the parsed time struct into isoformat
            info['date'] = datetime.datetime(
                *entries[0].published_parsed[:6]).isoformat()
        else:
            print('ERROR: not in feed %s' % info['url'])
    return info
def wayback_machine_timestamp(url):
    '''Get the timestamp of the most recent capture of a url from the
    Wayback Machine availability api.'''
    response = requests.get('http://archive.org/wayback/available',
                            params={'url': url})
    if response.status_code == requests.codes.ok:
        data = response.json()
        # if archived_snapshots is not empty, return the closest timestamp
        if data['archived_snapshots']:
            return data['archived_snapshots']['closest']['timestamp']
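# For reference, the availability api's JSON response is shaped roughly
# like this (an illustrative sketch, not captured output; the example
# url and timestamp are made up):
#
#   {"url": "http://digitalhumanities.org/answers/topic/some-topic",
#    "archived_snapshots": {
#        "closest": {"status": "200", "available": true,
#                    "url": "http://web.archive.org/web/20200504123456/...",
#                    "timestamp": "20200504123456"}}}
#
# "archived_snapshots" is an empty dict for urls with no captures, so
# wayback_machine_timestamp returns None in that case.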
dhqa_posts = []

post_fieldnames = [
    'url',
    'topic url',
    'question',
    'tags',
    'author',
    'author url',
    'content',
    'best answer',
    'date',
    'relative date',
    'snapshot date',
    'order',
    'reply to',
]
for path in glob.glob('topic/*/index.html'):
    # topic meta should include the url for the topic, but is not
    # completely reliable! generate it from the filename instead
    topic_url = '%s/%s' % (baseurl, os.path.dirname(path))
    capture_date = wayback_machine_timestamp(topic_url)
    topic_data = {
        'topic url': topic_url,
        'snapshot date': capture_date or ''
    }
    with open(path) as topicdoc:
        soup = BeautifulSoup(topicdoc, 'html.parser')
        # page title is the question (summary/brief)
        topic_data['question'] = soup.find('h2').get_text()
        tags = soup.find_all('a', rel='tag')
        topic_data['tags'] = ';'.join([t.get_text() for t in tags])
        # should tags apply to all posts or just the question?

        # the html doesn't have a proper date, but the rss feed should;
        # get the rss filename from the rss link
        rss = soup.find('a', class_='rss-link')['href'].lstrip('/')
        if os.path.exists(rss):
            feed = feedparser.parse(rss)
            if not feed.entries:
                print('ERROR: RSS file has no content: %s' % rss)
                feed = None
        else:
            print('ERROR: Missing RSS file: %s' % rss)
            feed = None

        posts = soup.find_all('li', id=re.compile(r'^post-\d+'))
        for post in posts:
            post_data = get_post_info(post, topic_url, feed)
            post_data.update(topic_data)
            dhqa_posts.append(post_data)

        # check for a second page (few cases; nothing has more than 2 pages)
        next_link = soup.find('a', class_='next')
        if next_link:
            page_two = '%s/index.html' % next_link['href'].lstrip('/')
            with open(page_two) as page_two_doc:
                soup2 = BeautifulSoup(page_two_doc, 'html.parser')
                posts = soup2.find_all('li', id=re.compile(r'^post-\d+'))
                for post in posts:
                    # unlikely these will be in the rss feed...
                    post_data = get_post_info(post, topic_url, feed)
                    post_data.update(topic_data)
                    dhqa_posts.append(post_data)

# NOTE: missing 11 topic rss feeds;
# may be able to get dates from the tag feeds

print('%d posts total' % len(dhqa_posts))

with open('dhqa_data.csv', 'w', newline='') as outfile:
    writer = csv.DictWriter(outfile, fieldnames=post_fieldnames)
    writer.writeheader()
    writer.writerows(dhqa_posts)
One small request is to have field headers with underscores rather than spaces. Otherwise I can't use tab completion with dot notation in pandas. Thanks 🙏

Good to know! I go back and forth on whether field names should be more dev-friendly or more readable; it sounds like I should use underscores in future.

That's a great question! Maybe something to clarify is how this data will be used? I could see a more dev-friendly version of the dataset versus a more data-analysis-friendly version, if you think having both might be worthwhile.
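For anyone else reading along, a minimal sketch of the difference (assuming the dhqa_data.csv written by the script above; the rename is a workaround, not something the script currently does):

import pandas as pd

df = pd.read_csv('dhqa_data.csv')

# a header with a space only works with bracket indexing
authors = df['author url']

# renaming to underscores enables dot notation (and tab completion)
df = df.rename(columns=lambda name: name.replace(' ', '_'))
authors = df.author_url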