Skip to content

Instantly share code, notes, and snippets.

@sametz
Created February 13, 2023 20:56
Show Gist options
  • Save sametz/e6b829f12f7905d52e9355c590f89474 to your computer and use it in GitHub Desktop.
Save sametz/e6b829f12f7905d52e9355c590f89474 to your computer and use it in GitHub Desktop.
A Jupyter notebook that can combine all class rosters in UDSIS to one .csv
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Note: Execute cells one at a time, because manual login to UDSIS is required."
]
},
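{
"cell_type": "markdown",
"metadata": {},
"source": [
"Additional installs required:\n",
"\n",
"- geckodriver (must be on your PATH so that Selenium can drive Firefox)\n",
"- lxml (needed by `pandas.read_html` to parse the roster tables)"
]
},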
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import re\n",
"import time\n",
"\n",
"from bs4 import BeautifulSoup as bs\n",
"import pandas as pd\n",
"import requests\n",
"from selenium import webdriver\n",
"from selenium.webdriver.support.ui import WebDriverWait\n",
"from selenium.webdriver.support import expected_conditions as EC\n",
"from selenium.webdriver.common.by import By\n",
"from selenium.common.exceptions import TimeoutException"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Open a Selenium-controlled Firefox window. `browser` is the shared driver object\n",
"# used by the cells and functions below.\n",
"# Note: may need to install geckodriver for selenium to work.\n",
"# https://stackoverflow.com/questions/40208051/selenium-using-python-geckodriver-executable-needs-to-be-in-path\n",
"browser = webdriver.Firefox()\n",
"\n",
"\n",
"# Load the UDSIS page; logging in is done by hand in the opened window.\n",
"browser.get('https://www.udel.edu/udsis')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Once the Selenium-controlled Firefox window opens, use it to log in to UDSIS manually, then continue executing the cells below."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Starting in Selenium 4.3, the find_element_by_* syntax below no longer works:\n",
"# faculty_center = browser.find_element_by_id('win0groupletPTNUI_LAND_REC_GROUPLET$0')\n",
"\n",
"# New syntax: find_element(By.ID, ...). Locate the landing-page element by its id\n",
"# (presumably the Faculty Center tile -- confirm against the page) and click it.\n",
"faculty_center = browser.find_element(By.ID,'win0groupletPTNUI_LAND_REC_GROUPLET$0')\n",
"\n",
"faculty_center.click()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The page content lives inside an iframe, so the driver has to switch into it\n",
"# before elements on the page can be found.\n",
"iframe = browser.find_element(By.ID, 'main_target_win0')\n",
"\n",
"# From here on, find_element calls are made inside this frame.\n",
"browser.switch_to.frame(iframe)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def view_all():\n",
"    \"\"\"Click the 'View All' link so that every class is listed.\n",
"\n",
"    Best effort: if the link cannot be found or clicked, the exception is\n",
"    printed and otherwise ignored.\n",
"    \"\"\"\n",
"    try:\n",
"        browser.find_element(By.LINK_TEXT, 'View All').click()\n",
"    except Exception as err:\n",
"        print(err)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Expand the class list on the current page so that every class is shown.\n",
"view_all()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# lxml needs to be installed for pandas.read_html to work\n",
"def scrape_roster(html):\n",
"    \"\"\"Return the roster table found in the HTML of a class roster page.\n",
"\n",
"    Arguments:\n",
"    - html: the HTML source of the page, as a string.\n",
"\n",
"    Returns:\n",
"    - the one DataFrame, among all tables parsed from `html`, that has a\n",
"      column named 'Audit'.\n",
"\n",
"    Raises:\n",
"    - ValueError if the number of tables with an 'Audit' column is not exactly one\n",
"      (pandas itself also raises ValueError when `html` contains no tables at all).\n",
"    \"\"\"\n",
"    # Local import so this cell does not depend on an edit to the imports cell.\n",
"    from io import StringIO\n",
"\n",
"    # Parse the HTML that was passed in, not whatever page the global browser is\n",
"    # currently showing. It is wrapped in StringIO because pandas >= 2.1 deprecates\n",
"    # passing literal HTML strings to read_html.\n",
"    tables = pd.read_html(StringIO(html))\n",
"    audit_tables = [table for table in tables if \"Audit\" in table.columns]\n",
"    if len(audit_tables) != 1:\n",
"        raise ValueError(\"No one unique table containing 'Audit'\")\n",
"\n",
"    return audit_tables[0]"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def reset_to_rosters():\n",
"    \"\"\"Go from a single roster page back to the all-rosters page, in View All mode.\"\"\"\n",
"    # Click the link that leads back to the list of classes.\n",
"    browser.find_element(By.ID, 'DERIVED_SSR_FC_SSS_CHG_CLS_LINK').click()\n",
"\n",
"    # Wait (up to 10 s) for the list page's view-all control to be present, then expand the list.\n",
"    view_all_present = EC.presence_of_element_located((By.ID, \"INSTR_CLASS_VW$hviewall$0\"))\n",
"    WebDriverWait(browser, timeout=10).until(view_all_present)\n",
"    view_all()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Default pattern for names such as 'CHEM325-021L(1234)'.\n",
"# Edit the regex (or pass your own to parse_class_name) to suit your needs.\n",
"CLASS_NAME_REGEX = re.compile(r'(CHEM\\d\\d\\d)-(\\d\\d\\dL*)\\((\\d+)\\)')\n",
"\n",
"\n",
"def parse_class_name(class_name, name_regex=CLASS_NAME_REGEX):\n",
"    \"\"\"Split a class name into its class, section and class code.\n",
"\n",
"    Arguments:\n",
"    - class_name: the class name with spaces removed, e.g. 'CHEM325-021L(1234)'.\n",
"    - name_regex: a regex (string or compiled pattern) with three capture groups,\n",
"      in the order class, section, class code. Defaults to CLASS_NAME_REGEX.\n",
"\n",
"    Returns:\n",
"    - a (class, section, class code) tuple of strings,\n",
"      e.g. ('CHEM325', '021L', '1234').\n",
"\n",
"    Raises:\n",
"    - ValueError if class_name does not match the pattern.\n",
"    \"\"\"\n",
"    print(\"testing \", class_name)\n",
"    # re.compile accepts both pattern strings and already-compiled patterns.\n",
"    pattern = re.compile(name_regex)\n",
"    res = pattern.search(class_name)\n",
"    if res is None:\n",
"        # Fail with a readable message instead of an AttributeError on None.\n",
"        raise ValueError(f\"class name {class_name!r} does not match pattern {pattern.pattern!r}\")\n",
"    return res.group(1, 2, 3)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Edit code below if you want to test this function\n",
"# res = parse_class_name('CHEM325-021L(1234)')\n",
"# res"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def _return_to_roster_list(timeout=10):\n",
"    \"\"\"Go from a roster page back to the all-rosters page and click View All.\"\"\"\n",
"    change_link = (By.ID, \"DERIVED_SSR_FC_SSS_CHG_CLS_LINK\")\n",
"    # element_to_be_clickable hands back the element once it is visible and enabled,\n",
"    # so no separate presence wait / find_element is needed.\n",
"    change_roster = WebDriverWait(browser, timeout=timeout).until(EC.element_to_be_clickable(change_link))\n",
"    change_roster.click()\n",
"\n",
"    # wait for the list page's view-all control, then expand the list\n",
"    WebDriverWait(browser, timeout=timeout).until(\n",
"        EC.presence_of_element_located((By.ID, \"INSTR_CLASS_VW$hviewall$0\")))\n",
"    view_all()\n",
"\n",
"\n",
"def download_roster(roster_button, timeout=10):\n",
"    \"\"\"Open one class roster, scrape it, and return to the all-rosters page.\n",
"\n",
"    Arguments:\n",
"    - roster_button: the roster link element to click on the all-rosters page.\n",
"    - timeout: seconds to wait for each page element to appear.\n",
"\n",
"    Returns:\n",
"    - the roster as a DataFrame with 'Class', 'Section' and 'Class Code'\n",
"      columns added to every row.\n",
"\n",
"    Raises:\n",
"    - whatever the waits, scrape_roster or parse_class_name raise. Once the roster\n",
"      page has loaded, the browser is sent back to the all-rosters page even when\n",
"      scraping or parsing fails, so the next roster can still be processed.\n",
"    \"\"\"\n",
"    # click on the class roster\n",
"    print(\"processing \", roster_button.get_attribute('id'))\n",
"    roster_button.click()\n",
"\n",
"    # make sure table has loaded\n",
"    WebDriverWait(browser, timeout=timeout).until(\n",
"        EC.presence_of_element_located((By.ID, \"trCLASS_ROSTER_VW$0_row1\")))\n",
"\n",
"    try:\n",
"        # find course name\n",
"        name_element = browser.find_element(By.CSS_SELECTOR, 'a[id$=\"CLASSNAME_LONG\"]')\n",
"        class_name = name_element.get_attribute('text').replace(\" \", \"\")\n",
"        print(\"class name: \", class_name)\n",
"\n",
"        # scrape table\n",
"        try:\n",
"            df = scrape_roster(browser.page_source)\n",
"        except Exception:\n",
"            print(\"error with: \", class_name)\n",
"            raise\n",
"\n",
"        # add section info to table in all rows\n",
"        chem_class, section, class_code = parse_class_name(class_name)\n",
"        df['Class'] = chem_class\n",
"        df['Section'] = section\n",
"        df['Class Code'] = class_code\n",
"    finally:\n",
"        # Always leave the roster page, even after an error; otherwise the browser\n",
"        # is stranded there and every later roster fails too.\n",
"        _return_to_roster_list(timeout)\n",
"\n",
"    return df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def download_all_rosters(df_list, delay=3, timeout=10):\n",
"    \"\"\"Download every class roster listed on the all-rosters page.\n",
"\n",
"    The roster links become stale each time a roster is opened and the list page\n",
"    is reloaded, so they are looked up again before every download and addressed\n",
"    by position.\n",
"\n",
"    Arguments:\n",
"    - df_list: a list to save the roster dataframes in (appended to in place).\n",
"    - delay: seconds to pause after each roster before looking for the links again.\n",
"    - timeout: seconds to wait for the roster links to appear.\n",
"\n",
"    A roster that fails to download is reported and skipped.\n",
"    \"\"\"\n",
"    roster_links = (By.CSS_SELECTOR, 'a[id^=\"CLASSROSTER\"]')\n",
"\n",
"    print(\"getting initial roster list\")\n",
"    number_of_rosters = len(browser.find_elements(*roster_links))\n",
"\n",
"    for i in range(number_of_rosters):\n",
"        # A roster link being present is taken as proof that the list page loaded.\n",
"        WebDriverWait(browser, timeout=timeout).until(EC.presence_of_element_located(roster_links))\n",
"        rosters = browser.find_elements(*roster_links)\n",
"        if i >= len(rosters):\n",
"            # The list shrank (e.g. View All did not take effect); stop rather than hit an IndexError.\n",
"            print(f\"expected {number_of_rosters} rosters but only found {len(rosters)}; stopping\")\n",
"            break\n",
"\n",
"        try:\n",
"            df_list.append(download_roster(rosters[i]))\n",
"        except Exception as e:\n",
"            # Report the failure instead of skipping it silently.\n",
"            print(f\"skipping roster {i + 1} of {number_of_rosters}: {e!r}\")\n",
"            # If the browser is still on a roster page, go back to the list so the\n",
"            # remaining rosters can be processed.\n",
"            if browser.find_elements(By.ID, 'DERIVED_SSR_FC_SSS_CHG_CLS_LINK'):\n",
"                reset_to_rosters()\n",
"\n",
"        # Some explicit delay seems to be needed here; test to see if this can be shortened?\n",
"        print(f\"sleeping for {delay} s\")\n",
"        time.sleep(delay)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Create an empty list to hold rosters as pandas dataframe objects.\n",
"# download_all_rosters appends to this list, so re-run this cell before repeating\n",
"# the download; otherwise the rosters from both runs end up in the list.\n",
"dfs = []"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# Before running this, make sure the browser is on the all-rosters page and that\n",
"# \"View All\" was selected, so that all classes are displayed.\n",
"# The scraped rosters are appended to dfs.\n",
"download_all_rosters(dfs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check that the expected number of classes were found:\n",
"# there should be one dataframe per class roster.\n",
"len(dfs) "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# If you need to re-run the scraping, reset to the all-rosters page.\n",
"# Only run this while the browser is showing a single roster page.\n",
"reset_to_rosters()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Stack the per-class rosters row-wise (the default axis) into a single dataframe.\n",
"combined_classes = pd.concat(dfs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Check that the result looks right by displaying the first 40 rows.\n",
"combined_classes.head(40)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Export the combined rosters to a .csv file named test.csv.\n",
"# The path is relative, so the file lands in the current working directory.\n",
"combined_classes.to_csv(\"test.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Close the browser. After this, `browser` can no longer be used;\n",
"# re-run the cell that creates it to start over.\n",
"browser.quit()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "datasci",
"language": "python",
"name": "datasci"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.4"
}
},
"nbformat": 4,
"nbformat_minor": 4
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment