Skip to content

Instantly share code, notes, and snippets.

@yoheioka
Created July 17, 2020 16:18
Show Gist options
  • Save yoheioka/f713c93808f58cb139359189ed9388c7 to your computer and use it in GitHub Desktop.
Save yoheioka/f713c93808f58cb139359189ed9388c7 to your computer and use it in GitHub Desktop.
from bs4 import BeautifulSoup
import requests
import csv
import itertools
# MIT term codes to scrape (FA = fall, JA = IAP/January, SP = spring).
TERMS = ['2020FA', '2020JA', '2019SP']

BASE_URL = 'https://eduapps.mit.edu/ose-rpt/'
# Search page for one term; departmentId '++15' selects Course 15 (Sloan).
TERM_URL = BASE_URL + 'subjectEvaluationSearch.htm?termId=%s&departmentId=++15&subjectCode=&instructorName=&search=Search'

# Browser-like request headers. The Cookie value is a placeholder and must be
# replaced with a valid authenticated session cookie for the site.
HEADERS = {
    'Connection': 'keep-alive',
    'Pragma': 'no-cache',
    'Cache-Control': 'no-cache',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36',
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
    'Sec-Fetch-Site': 'same-origin',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-User': '?1',
    'Sec-Fetch-Dest': 'document',
    # Fixed malformed scheme: original read 'https//...' (missing ':').
    'Referer': 'https://eduapps.mit.edu/ose-rpt/subjectEvaluationSearch.htm?termId=2020SP&departmentId=++15&subjectCode=&instructorName=&search=Search',
    'Accept-Language': 'en-US,en;q=0.9,ja;q=0.8,pt;q=0.7',
    'Cookie': 'XXXXXXXXXXXXXXXXXXXXXXX',
}

# Question labels exactly as they appear in the report's <tr> rows.
QUESTIONS = [
    "Overall rating of the subject",
    "Recommend Subject",
    "Average hours you spent per week on this subject in the classroom",
    "Average hours you spent per week on this subject outside of the classroom",
    "Subject expectations were clearly defined",
    "Subject's learning objectives were met",
    "Assignments contributed to my learning",
    "Grading thus far has been fair",
    "The pace of the class (content and assignments) was:",
]

# Questions whose table rows carry no bar-chart cell, shifting the stat
# columns one position to the left (see scrape_course).
QUESTIONS_NO_BAR = [
    "Average hours you spent per week on this subject in the classroom",
    "Average hours you spent per week on this subject outside of the classroom",
]

# CSV header: course metadata followed by four stat columns per question.
# (Flat double comprehension replaces the original sum(list-of-lists, [])
# flatten, which is quadratic; column order is unchanged.)
FIELDNAMES = ['course_id', 'course_title', 'term', 'link'] + [
    question + suffix
    for question in QUESTIONS
    for suffix in (' (avg)', ' (responses)', ' (median)', ' (stdev)')
]
def scrape_term(term):
    """Fetch the evaluation search results for *term* and list its courses.

    Parameters:
        term: MIT term code such as '2020FA', interpolated into TERM_URL.

    Returns:
        List of dicts with keys 'course_id', 'course_title', 'link'
        (link is the relative URL of the course's evaluation report).
    """
    page = requests.get(TERM_URL % term, headers=HEADERS)
    soup = BeautifulSoup(page.text, 'html.parser')
    all_courses = []
    for anchor in soup.find_all('a'):
        # Fix: not every <a> carries an href attribute; the original
        # anchor['href'] lookup raised KeyError on such anchors.
        href = anchor.get('href', '')
        if 'subjectEvaluationReport.htm?' not in href:
            continue
        # Anchor text looks like '<course_id> <course title...>'.
        course_id = anchor.text.split(' ')[0].strip()
        all_courses.append({
            'course_id': str(course_id),
            'course_title': anchor.text.replace(course_id, '').strip(),
            'link': href,
        })
    return all_courses
def scrape_course(course):
    """Fetch one course's evaluation report and extract per-question stats.

    Parameters:
        course: dict with a 'link' key holding the report's relative URL.

    Returns:
        Dict mapping '<question> (avg|responses|median|stdev)' to floats,
        one group per recognized question found on the page.
    """
    page = requests.get(BASE_URL + course['link'], headers=HEADERS)
    soup = BeautifulSoup(page.text, 'html.parser')
    stats = {}
    for row in soup.find_all('tr'):
        # The question label lives in an <a> on most rows; fall back to the
        # first <td> when no anchor is present.
        label_tag = row.find('a') or row.find('td')
        if not label_tag:
            continue
        label = label_tag.text
        if label not in QUESTIONS:
            continue
        cells = row.find_all('td')
        # Rows with a bar-chart cell have their stat columns shifted one
        # position to the right relative to the no-bar rows.
        shift = 0 if label in QUESTIONS_NO_BAR else 1
        stats[label + ' (avg)'] = float(cells[1].text)
        stats[label + ' (responses)'] = float(cells[3 + shift].text)
        stats[label + ' (median)'] = float(cells[4 + shift].text)
        stats[label + ' (stdev)'] = float(cells[5 + shift].text)
    return stats
if __name__ == '__main__':
    # newline='' is required by the csv module when writing; without it the
    # original produced blank rows between records on Windows.
    with open('reviews.csv', 'w', newline='', encoding='utf-8') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=FIELDNAMES)
        writer.writeheader()
        for term in TERMS:
            for course in scrape_term(term):
                course['term'] = term
                # Merge the per-question stats into the course metadata row.
                course.update(scrape_course(course))
                writer.writerow(course)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment