from bs4 import BeautifulSoup
import requests
import csv

TERMS = ['2020FA', '2020JA', '2019SP']
BASE_URL = 'https://eduapps.mit.edu/ose-rpt/'
TERM_URL = BASE_URL + 'subjectEvaluationSearch.htm?termId=%s&departmentId=++15&subjectCode=&instructorName=&search=Search'
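# For illustration, TERM_URL % '2020FA' expands to:
# https://eduapps.mit.edu/ose-rpt/subjectEvaluationSearch.htm?termId=2020FA&departmentId=++15&subjectCode=&instructorName=&search=Search
# ('++15' is presumably the URL-encoded, space-padded code for Course 15).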

HEADERS = {
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document',
    'Referer': 'https://eduapps.mit.edu/ose-rpt/subjectEvaluationSearch.htm?termId=2020SP&departmentId=++15&subjectCode=&instructorName=&search=Search',
    'Accept-Language': 'en-US,en;q=0.9,ja;q=0.8,pt;q=0.7',
    'Cookie': 'XXXXXXXXXXXXXXXXXXXXXXX',
}
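# Note: the Cookie value above is a placeholder; the evaluation pages likely
# require a valid session cookie copied from a logged-in browser session.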

QUESTIONS = [
    "Overall rating of the subject",
    "Recommend Subject",
    "Average hours you spent per week on this subject in the classroom",
    "Average hours you spent per week on this subject outside of the classroom",
    "Subject expectations were clearly defined",
    "Subject's learning objectives were met",
    "Assignments contributed to my learning",
    "Grading thus far has been fair",
    "The pace of the class (content and assignments) was:",
]

QUESTIONS_NO_BAR = [
    "Average hours you spent per week on this subject in the classroom",
    "Average hours you spent per week on this subject outside of the classroom",
]

FIELDNAMES = [
    'course_id', 'course_title', 'term', 'link',
] + sum(
    [
        [question + ' (avg)', question + ' (responses)', question + ' (median)', question + ' (stdev)']
        for question in QUESTIONS
    ],
    []
)
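# Each question contributes four CSV columns; the first question, for example,
# expands to 'Overall rating of the subject (avg)', '... (responses)',
# '... (median)', and '... (stdev)'.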


def scrape_term(term):
    """Return the courses (id, title, report link) listed for one term."""
    page = requests.get(TERM_URL % term, headers=HEADERS)
    soup = BeautifulSoup(page.text, 'html.parser')
    a_s = soup.find_all('a')
    all_courses = []
    for a in a_s:
        # Skip anchors that do not point at an evaluation report.
        href = a.get('href', '')
        if 'subjectEvaluationReport.htm?' not in href:
            continue
        # The link text starts with the course id, followed by the title.
        course_id = a.text.split(' ')[0].strip()
        all_courses.append({
            'course_id': str(course_id),
            'course_title': a.text.replace(course_id, '').strip(),
            'link': href,
        })
    return all_courses
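# Example shape of a scrape_term result (values are hypothetical):
# [{'course_id': '15.001',
#   'course_title': 'Example Subject Title',
#   'link': 'subjectEvaluationReport.htm?...'}]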


def scrape_course(course):
    """Scrape one course's evaluation report into a dict of per-question stats."""
    course_link = BASE_URL + course['link']
    page = requests.get(course_link, headers=HEADERS)
    soup = BeautifulSoup(page.text, 'html.parser')
    trs = soup.find_all('tr')
    results = {}
    for tr in trs:
        # The question label is usually a link; some rows use a plain cell.
        question = tr.find('a')
        if not question:
            question = tr.find('td')
        if not question:
            continue
        question = question.text
        if question not in QUESTIONS:
            continue
        tds = tr.find_all('td')
        avg = float(tds[1].text)
        # The "hours per week" rows have no bar-chart cell, so their numeric
        # columns sit one position earlier than the other questions'.
        if question in QUESTIONS_NO_BAR:
            responses = float(tds[3].text)
            median = float(tds[4].text)
            stdev = float(tds[5].text)
        else:
            responses = float(tds[4].text)
            median = float(tds[5].text)
            stdev = float(tds[6].text)
        results.update({
            question + ' (avg)': avg,
            question + ' (responses)': responses,
            question + ' (median)': median,
            question + ' (stdev)': stdev,
        })
    return results
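# Example shape of a scrape_course result (values are hypothetical):
# {'Overall rating of the subject (avg)': 6.2,
#  'Overall rating of the subject (responses)': 35.0,
#  'Overall rating of the subject (median)': 6.0,
#  'Overall rating of the subject (stdev)': 0.8,
#  ...}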


if __name__ == '__main__':
    # newline='' avoids blank rows on Windows when writing with the csv module.
    with open('reviews.csv', 'w', newline='') as newf:
        writer = csv.DictWriter(newf, fieldnames=FIELDNAMES)
        writer.writeheader()
        for term in TERMS:
            courses = scrape_term(term)
            for course in courses:
                course['term'] = term
                scraped_course = scrape_course(course)
                course.update(scraped_course)
                writer.writerow(course)