Created
February 13, 2023 20:56
-
-
Save sametz/e6b829f12f7905d52e9355c590f89474 to your computer and use it in GitHub Desktop.
A Jupyter notebook that combines all of your class rosters in UDSIS into a single .csv file
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
{ | |
"cells": [ | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Note: Execute cells one at a time, because manual login to UDSIS is required." | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Additional installs required:\n", | |
"- geckodriver\n", | |
"- lxml" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"import re\n", | |
"import time\n", | |
"\n", | |
"from bs4 import BeautifulSoup as bs\n", | |
"import pandas as pd\n", | |
"import requests\n", | |
"from selenium import webdriver\n", | |
"from selenium.webdriver.support.ui import WebDriverWait\n", | |
"from selenium.webdriver.support import expected_conditions as EC\n", | |
"from selenium.webdriver.common.by import By\n", | |
"from selenium.common.exceptions import TimeoutException" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Note: may need to install geckodriver for selenium to work.\n", | |
"# https://stackoverflow.com/questions/40208051/selenium-using-python-geckodriver-executable-needs-to-be-in-path\n", | |
"browser = webdriver.Firefox()\n", | |
"\n", | |
"\n", | |
"browser.get('https://www.udel.edu/udsis')" | |
] | |
}, | |
{ | |
"cell_type": "markdown", | |
"metadata": {}, | |
"source": [ | |
"Once the selenium browser opens, use it to manually log in to UDSIS, then continue executing cells." | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [], | |
"source": [ | |
"# Starting in Selenium 4.3, the syntax below no longer works\n", | |
"# faculty_center = browser.find_element_by_id('win0groupletPTNUI_LAND_REC_GROUPLET$0')\n", | |
"\n", | |
"# New syntax below:\n", | |
"faculty_center = browser.find_element(By.ID,'win0groupletPTNUI_LAND_REC_GROUPLET$0')\n", | |
"\n", | |
"faculty_center.click()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# In order to find elements on the page, need to switch to the iframe\n", | |
"iframe = browser.find_element(By.ID, 'main_target_win0')\n", | |
"\n", | |
"browser.switch_to.frame(iframe)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def view_all():\n", | |
" '''Clicks the View All button so all classes are listed.'''\n", | |
" try:\n", | |
" viewall_button = browser.find_element(By.LINK_TEXT, 'View All')\n", | |
" viewall_button.click()\n", | |
" except Exception as e:\n", | |
" print(e)\n", | |
"\n" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"view_all()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# lxml needs to be installed for this to work\n", | |
"def scrape_roster(html):\n", | |
" dfs = pd.read_html(browser.page_source)\n", | |
" audit_df =[df for df in dfs if \"Audit\" in df.columns]\n", | |
" if len(audit_df) != 1:\n", | |
" raise Exception(\"No one unique table containing 'Audit'\")\n", | |
"\n", | |
" return audit_df[0]" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def reset_to_rosters():\n", | |
" \"\"\"navigates from roster page to all-rosters page in view all mode.\"\"\"\n", | |
" change_roster = browser.find_element(By.ID, 'DERIVED_SSR_FC_SSS_CHG_CLS_LINK')\n", | |
" change_roster.click()\n", | |
" WebDriverWait(browser, timeout=10).until(EC.presence_of_element_located((By.ID, \"INSTR_CLASS_VW$hviewall$0\")))\n", | |
" view_all()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# edit the regex to suit your needs. \n", | |
"def parse_class_name(class_name):\n", | |
" print(\"testing \", class_name)\n", | |
" name_regex = re.compile(r'(CHEM\\d\\d\\d)-(\\d\\d\\dL*)\\((\\d+)\\)')\n", | |
" print(type(name_regex))\n", | |
" res = name_regex.search(class_name)\n", | |
" print(type(res))\n", | |
" chem_class = res.group(1)\n", | |
" section = res.group(2)\n", | |
" class_code = res.group(3)\n", | |
" return chem_class, section, class_code\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Edit code below if you want to test this function\n", | |
"# res = parse_class_name('CHEM325-021L(1234)')\n", | |
"# res" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def download_roster(roster_button):\n", | |
" # click on the class roster\n", | |
" print(\"processing \", roster_button.get_attribute('id'))\n", | |
" roster_button.click()\n", | |
" \n", | |
" # make sure table has loaded\n", | |
" WebDriverWait(browser, timeout=10).until(EC.presence_of_element_located((By.ID, \"trCLASS_ROSTER_VW$0_row1\")))\n", | |
"\n", | |
" # find course name\n", | |
" name_element = browser.find_element(By.CSS_SELECTOR, 'a[id$=\"CLASSNAME_LONG\"]')\n", | |
" class_name = name_element.get_attribute('text').replace(\" \", \"\")\n", | |
" print(\"class name: \", class_name)\n", | |
"\n", | |
" # scrape table\n", | |
" try:\n", | |
" df = scrape_roster(browser.page_source)\n", | |
" except Exception as e:\n", | |
" print(\"error with: \", class_name)\n", | |
" raise\n", | |
"\n", | |
" # add section info to table in all rows\n", | |
" chem_class, section, class_code = parse_class_name(class_name)\n", | |
" df['Class'] = chem_class\n", | |
" df['Section'] = section\n", | |
" df['Class Code'] = class_code\n", | |
" \n", | |
" # find then click to go back to all classes\n", | |
" WebDriverWait(browser, timeout=10).until(EC.presence_of_element_located((By.ID, \"DERIVED_SSR_FC_SSS_CHG_CLS_LINK\")))\n", | |
" change_roster = browser.find_element(By.ID, 'DERIVED_SSR_FC_SSS_CHG_CLS_LINK')\n", | |
" print(\"found change_roster: \", change_roster.get_attribute('id'))\n", | |
" WebDriverWait(browser, timeout=10).until(EC.element_to_be_clickable((By.ID, \"DERIVED_SSR_FC_SSS_CHG_CLS_LINK\")))\n", | |
" print(\"It should be clickable.\")\n", | |
" change_roster.click()\n", | |
" print(\"I clicked change roster\")\n", | |
" \n", | |
" # click view all button\n", | |
" print(\"waiting to see if I can find view all\")\n", | |
" WebDriverWait(browser, timeout=10).until(EC.presence_of_element_located((By.ID, \"INSTR_CLASS_VW$hviewall$0\")))\n", | |
" print(\"clicking view all\")\n", | |
" view_all()\n", | |
" print(\"exiting download roster\")\n", | |
" return df" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"def download_all_rosters(df_list):\n", | |
" \"\"\"rosters becomes stale after each loop iteration,\n", | |
" so using this klunky workaround.\n", | |
" \n", | |
" Argument:\n", | |
" - df_list: a list to save the roster dataframes in.\n", | |
" \"\"\"\n", | |
" print(\"getting initial roster list\")\n", | |
" rosters = browser.find_elements(By.CSS_SELECTOR, 'a[id^=\"CLASSROSTER\"]')\n", | |
" number_of_rosters = len(rosters)\n", | |
" \n", | |
" for i in range(number_of_rosters):\n", | |
" try:\n", | |
" roster = download_roster(rosters[i])\n", | |
" except Exception as e:\n", | |
" continue\n", | |
" df_list.append(roster)\n", | |
" \n", | |
" # Some explicit delay seems to be needed here; test to see if this can be shortened?\n", | |
" print(\"sleeping for 3 s\")\n", | |
" time.sleep(3)\n", | |
" print(\"waking up; looking for a class roster as proof that page loaded\")\n", | |
" WebDriverWait(browser, timeout=10).until(EC.presence_of_element_located((By.CSS_SELECTOR, 'a[id^=\"CLASSROSTER\"]')))\n", | |
" print(\"refreshing roster list\")\n", | |
" rosters = browser.find_elements(By.CSS_SELECTOR, 'a[id^=\"CLASSROSTER\"]')\n", | |
"\n", | |
"\n", | |
" " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# Create an empty list to hold rosters as pandas dataframe objects\n", | |
"dfs = []" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": { | |
"tags": [] | |
}, | |
"outputs": [], | |
"source": [ | |
"# Before running this, make sure you are on the all-rosters page and that \"View All\" was selected so all classes are displayed\n", | |
"download_all_rosters(dfs)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# check that the expected number of classes were found\n", | |
"len(dfs) " | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# if you need to re-run the scraping, reset to the all-rosters page\n", | |
"reset_to_rosters()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# combine all rosters into one dataframe\n", | |
"combined_classes = pd.concat(dfs, axis=0)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# check that the result looks right\n", | |
"combined_classes.head(40)" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# export result to .csv\n", | |
"combined_classes.to_csv(\"test.csv\")" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [ | |
"# close the browser\n", | |
"browser.quit()" | |
] | |
}, | |
{ | |
"cell_type": "code", | |
"execution_count": null, | |
"metadata": {}, | |
"outputs": [], | |
"source": [] | |
} | |
], | |
"metadata": { | |
"kernelspec": { | |
"display_name": "datasci", | |
"language": "python", | |
"name": "datasci" | |
}, | |
"language_info": { | |
"codemirror_mode": { | |
"name": "ipython", | |
"version": 3 | |
}, | |
"file_extension": ".py", | |
"mimetype": "text/x-python", | |
"name": "python", | |
"nbconvert_exporter": "python", | |
"pygments_lexer": "ipython3", | |
"version": "3.10.4" | |
} | |
}, | |
"nbformat": 4, | |
"nbformat_minor": 4 | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment