sametz · February 13, 2023 20:56
diff --git a/UDSIS_scrape_rosters.ipynb b/UDSIS_scrape_rosters.ipynb
 {
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Note: Execute cells one at a time, because manual login to UDSIS is required."
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Additional installs required:\n",
    "- geckodriver\n",
    "- lxml"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "import time\n",
    "\n",
    "from bs4 import BeautifulSoup as bs\n",
    "import pandas as pd\n",
    "import requests\n",
    "from selenium import webdriver\n",
    "from selenium.webdriver.support.ui import WebDriverWait\n",
    "from selenium.webdriver.support import expected_conditions as EC\n",
    "from selenium.webdriver.common.by import By\n",
    "from selenium.common.exceptions import TimeoutException"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# geckodriver must be installed and on PATH for selenium to drive Firefox; see\n",
    "# https://stackoverflow.com/questions/40208051/selenium-using-python-geckodriver-executable-needs-to-be-in-path\n",
    "browser = webdriver.Firefox()\n",
    "browser.get('https://www.udel.edu/udsis')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Once the Selenium-controlled browser window opens, log in to UDSIS manually in that window, then continue executing the cells below."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Starting in Selenium 4.3, find_element_by_id() no longer works; use find_element(By.ID, ...) locators.\n",
    "# Wait until the element is clickable, since the page may still be loading\n",
    "# right after the manual login.\n",
    "faculty_center = WebDriverWait(browser, timeout=10).until(\n",
    "    EC.element_to_be_clickable((By.ID, 'win0groupletPTNUI_LAND_REC_GROUPLET$0'))\n",
    ")\n",
    "\n",
    "faculty_center.click()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# In order to find elements on the page, need to switch to the iframe.\n",
    "# Wait for it first: it may not be present immediately after the previous click.\n",
    "iframe = WebDriverWait(browser, timeout=10).until(\n",
    "    EC.presence_of_element_located((By.ID, 'main_target_win0'))\n",
    ")\n",
    "\n",
    "browser.switch_to.frame(iframe)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def view_all():\n",
    "    '''Expand the class list by clicking the link whose text is 'View All'.\n",
    "\n",
    "    Best-effort: any exception raised while locating or clicking the link\n",
    "    is printed instead of being propagated.\n",
    "    '''\n",
    "    try:\n",
    "        browser.find_element(By.LINK_TEXT, 'View All').click()\n",
    "    except Exception as err:\n",
    "        print(err)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "view_all()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# lxml needs to be installed for this to work\n",
    "def scrape_roster(html):\n",
    "    \"\"\"Extract the roster table from the HTML of a class-roster page.\n",
    "\n",
    "    Argument:\n",
    "    - html: the page source (a string of HTML) of one class roster.\n",
    "\n",
    "    Returns the single table that has an 'Audit' column, as a dataframe.\n",
    "\n",
    "    Raises ValueError if the page does not contain exactly one such table\n",
    "    (pandas also raises ValueError when the page has no tables at all).\n",
    "    \"\"\"\n",
    "    from io import StringIO\n",
    "\n",
    "    # Parse the HTML that was passed in rather than re-reading the live browser.\n",
    "    # Wrapping it in StringIO avoids the pandas deprecation of passing literal\n",
    "    # HTML strings to read_html.\n",
    "    tables = pd.read_html(StringIO(html))\n",
    "    audit_tables = [table for table in tables if \"Audit\" in table.columns]\n",
    "    if len(audit_tables) != 1:\n",
    "        raise ValueError(\n",
    "            f\"No one unique table containing 'Audit' (found {len(audit_tables)})\"\n",
    "        )\n",
    "\n",
    "    return audit_tables[0]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def reset_to_rosters():\n",
    "    \"\"\"navigates from roster page to all-rosters page in view all mode.\n",
    "\n",
    "    Waits up to 10 s for the DERIVED_SSR_FC_SSS_CHG_CLS_LINK element to be\n",
    "    clickable before clicking it, then waits for the view-all element of the\n",
    "    class list and calls view_all(). Selenium raises TimeoutException if\n",
    "    either wait expires.\n",
    "    \"\"\"\n",
    "    wait = WebDriverWait(browser, timeout=10)\n",
    "    # Click the element returned by the wait, so the click only happens once\n",
    "    # the link is actually usable.\n",
    "    change_roster = wait.until(\n",
    "        EC.element_to_be_clickable((By.ID, 'DERIVED_SSR_FC_SSS_CHG_CLS_LINK'))\n",
    "    )\n",
    "    change_roster.click()\n",
    "    wait.until(EC.presence_of_element_located((By.ID, \"INSTR_CLASS_VW$hviewall$0\")))\n",
    "    view_all()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# edit the regex to suit your needs.\n",
    "def parse_class_name(class_name):\n",
    "    \"\"\"Split a class name such as 'CHEM325-021L(1234)' into its parts.\n",
    "\n",
    "    Argument:\n",
    "    - class_name: the class name; the regex expects no spaces between parts.\n",
    "\n",
    "    Returns a tuple of strings (chem_class, section, class_code),\n",
    "    e.g. ('CHEM325', '021L', '1234').\n",
    "\n",
    "    Raises ValueError if class_name does not match the regex.\n",
    "    \"\"\"\n",
    "    name_regex = re.compile(r'(CHEM\\d\\d\\d)-(\\d\\d\\dL*)\\((\\d+)\\)')\n",
    "    res = name_regex.search(class_name)\n",
    "    if res is None:\n",
    "        # Without this check a non-matching name fails with an opaque\n",
    "        # AttributeError on None.\n",
    "        raise ValueError(f\"class name {class_name!r} does not match {name_regex.pattern!r}\")\n",
    "    chem_class, section, class_code = res.groups()\n",
    "    return chem_class, section, class_code"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Edit code below if you want to test this function\n",
    "# res = parse_class_name('CHEM325-021L(1234)')\n",
    "# res"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def download_roster(roster_button):\n",
    "    \"\"\"Open one class roster, scrape it, and go back to the class list.\n",
    "\n",
    "    Argument:\n",
    "    - roster_button: the element that opens the roster when clicked.\n",
    "\n",
    "    Returns the roster table as a dataframe, with 'Class', 'Section' and\n",
    "    'Class Code' columns added to every row.\n",
    "\n",
    "    Errors from scraping or from parsing the class name are re-raised, but\n",
    "    only after the browser has been sent back to the class list so that the\n",
    "    next roster can still be processed.\n",
    "    \"\"\"\n",
    "    # click on the class roster\n",
    "    print(\"processing \", roster_button.get_attribute('id'))\n",
    "    roster_button.click()\n",
    "\n",
    "    wait = WebDriverWait(browser, timeout=10)\n",
    "    try:\n",
    "        # make sure table has loaded\n",
    "        wait.until(EC.presence_of_element_located((By.ID, \"trCLASS_ROSTER_VW$0_row1\")))\n",
    "\n",
    "        # find course name\n",
    "        name_element = browser.find_element(By.CSS_SELECTOR, 'a[id$=\"CLASSNAME_LONG\"]')\n",
    "        class_name = name_element.get_attribute('text').replace(\" \", \"\")\n",
    "        print(\"class name: \", class_name)\n",
    "\n",
    "        # scrape table and work out the section info\n",
    "        try:\n",
    "            df = scrape_roster(browser.page_source)\n",
    "            chem_class, section, class_code = parse_class_name(class_name)\n",
    "        except Exception:\n",
    "            print(\"error with: \", class_name)\n",
    "            raise\n",
    "\n",
    "        # add section info to table in all rows\n",
    "        df['Class'] = chem_class\n",
    "        df['Section'] = section\n",
    "        df['Class Code'] = class_code\n",
    "    finally:\n",
    "        # Go back to all classes even on failure; otherwise the browser stays on\n",
    "        # this roster page and every later roster link is stale.\n",
    "        # Click the element returned by the wait rather than an earlier reference.\n",
    "        change_roster = wait.until(EC.element_to_be_clickable((By.ID, \"DERIVED_SSR_FC_SSS_CHG_CLS_LINK\")))\n",
    "        change_roster.click()\n",
    "\n",
    "        # wait for the class list, then click view all button\n",
    "        wait.until(EC.presence_of_element_located((By.ID, \"INSTR_CLASS_VW$hviewall$0\")))\n",
    "        view_all()\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "def download_all_rosters(df_list):\n",
    "    \"\"\"Download every roster linked from the all-rosters page.\n",
    "\n",
    "    The roster links become stale after each page navigation, so they are\n",
    "    looked up again at the start of every iteration and addressed by position.\n",
    "\n",
    "    Argument:\n",
    "    - df_list: a list to save the roster dataframes in (appended to in place).\n",
    "\n",
    "    A roster that fails is reported and skipped. If the class list does not\n",
    "    come back within the timeout, the loop stops; rosters already saved in\n",
    "    df_list are kept.\n",
    "    \"\"\"\n",
    "    roster_links = 'a[id^=\"CLASSROSTER\"]'\n",
    "\n",
    "    print(\"getting initial roster list\")\n",
    "    number_of_rosters = len(browser.find_elements(By.CSS_SELECTOR, roster_links))\n",
    "\n",
    "    for i in range(number_of_rosters):\n",
    "        print(\"refreshing roster list\")\n",
    "        rosters = browser.find_elements(By.CSS_SELECTOR, roster_links)\n",
    "        if i >= len(rosters):\n",
    "            print(\"expected \", number_of_rosters, \" rosters but found only \", len(rosters), \"; stopping\")\n",
    "            break\n",
    "\n",
    "        try:\n",
    "            df_list.append(download_roster(rosters[i]))\n",
    "        except Exception as e:\n",
    "            # Report the failure instead of skipping silently, then carry on.\n",
    "            print(\"skipping roster \", i, \": \", repr(e))\n",
    "\n",
    "        # Some explicit delay seems to be needed here; test to see if this can be shortened?\n",
    "        print(\"sleeping for 3 s\")\n",
    "        time.sleep(3)\n",
    "        print(\"waking up; looking for a class roster as proof that page loaded\")\n",
    "        try:\n",
    "            WebDriverWait(browser, timeout=10).until(\n",
    "                EC.presence_of_element_located((By.CSS_SELECTOR, roster_links))\n",
    "            )\n",
    "        except TimeoutException:\n",
    "            print(\"class list did not reload; stopping after roster \", i)\n",
    "            break"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Create an empty list to hold rosters as pandas dataframe objects\n",
    "dfs = []"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "tags": []
   },
   "outputs": [],
   "source": [
    "# Before running this, make sure you are on the all-rosters page and that \"View All\" was selected so all classes are displayed\n",
    "download_all_rosters(dfs)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# check that the expected number of classes were found\n",
    "len(dfs)  "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# if you need to re-run the scraping, reset to the all-rosters page\n",
    "reset_to_rosters()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# combine all rosters into one dataframe; ignore_index gives every row a unique\n",
    "# label instead of repeating each roster's own row labels\n",
    "combined_classes = pd.concat(dfs, axis=0, ignore_index=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# check that the result looks right\n",
    "combined_classes.head(40)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# export result to .csv\n",
    "combined_classes.to_csv(\"test.csv\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# close the browser\n",
    "browser.quit()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "datasci",
   "language": "python",
   "name": "datasci"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
 }
	{
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Note: Execute cells one at a time, because manual login to UDSIS is required."
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Additional installs required:\n",
	"- geckodriver\n",
	"- lxml"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"import re\n",
	"import time\n",
	"\n",
	"from bs4 import BeautifulSoup as bs\n",
	"import pandas as pd\n",
	"import requests\n",
	"from selenium import webdriver\n",
	"from selenium.webdriver.support.ui import WebDriverWait\n",
	"from selenium.webdriver.support import expected_conditions as EC\n",
	"from selenium.webdriver.common.by import By\n",
	"from selenium.common.exceptions import TimeoutException"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Note: may need to install geckodriver for selenium to work.\n",
	"# https://stackoverflow.com/questions/40208051/selenium-using-python-geckodriver-executable-needs-to-be-in-path\n",
	"browser = webdriver.Firefox()\n",
	"\n",
	"\n",
	"browser.get('https://www.udel.edu/udsis')"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Once the selenium browser opens, use it to manually log in to UDSIS, then continue executing cells."
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"tags": []
	},
	"outputs": [],
	"source": [
	"# Starting in Selenium 4.3, the syntax below no longer works\n",
	"# faculty_center = browser.find_element_by_id('win0groupletPTNUI_LAND_REC_GROUPLET$0')\n",
	"\n",
	"# New syntax below:\n",
	"faculty_center = browser.find_element(By.ID,'win0groupletPTNUI_LAND_REC_GROUPLET$0')\n",
	"\n",
	"faculty_center.click()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# In order to find elements on the page, need to switch to the iframe\n",
	"iframe = browser.find_element(By.ID, 'main_target_win0')\n",
	"\n",
	"browser.switch_to.frame(iframe)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"def view_all():\n",
	"    '''Clicks the View All button so all classes are listed.\n",
	"\n",
	"    Best-effort: any exception raised while locating or clicking the link\n",
	"    is printed instead of being propagated.\n",
	"    '''\n",
	"    try:\n",
	"        viewall_button = browser.find_element(By.LINK_TEXT, 'View All')\n",
	"        viewall_button.click()\n",
	"    except Exception as e:\n",
	"        print(e)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"view_all()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# lxml needs to be installed for this to work\n",
	"def scrape_roster(html):\n",
	"    \"\"\"Extract the roster table from the HTML of a class-roster page.\n",
	"\n",
	"    Argument:\n",
	"    - html: the page source (a string of HTML) of one class roster.\n",
	"\n",
	"    Returns the single table that has an 'Audit' column, as a dataframe.\n",
	"\n",
	"    Raises ValueError if the page does not contain exactly one such table\n",
	"    (pandas also raises ValueError when the page has no tables at all).\n",
	"    \"\"\"\n",
	"    from io import StringIO\n",
	"\n",
	"    # Parse the HTML that was passed in rather than re-reading the live browser.\n",
	"    # Wrapping it in StringIO avoids the pandas deprecation of passing literal\n",
	"    # HTML strings to read_html.\n",
	"    tables = pd.read_html(StringIO(html))\n",
	"    audit_tables = [table for table in tables if \"Audit\" in table.columns]\n",
	"    if len(audit_tables) != 1:\n",
	"        raise ValueError(\n",
	"            f\"No one unique table containing 'Audit' (found {len(audit_tables)})\"\n",
	"        )\n",
	"\n",
	"    return audit_tables[0]"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"def reset_to_rosters():\n",
	"    \"\"\"navigates from roster page to all-rosters page in view all mode.\n",
	"\n",
	"    Waits up to 10 s for the DERIVED_SSR_FC_SSS_CHG_CLS_LINK element to be\n",
	"    clickable before clicking it, then waits for the view-all element of the\n",
	"    class list and calls view_all(). Selenium raises TimeoutException if\n",
	"    either wait expires.\n",
	"    \"\"\"\n",
	"    wait = WebDriverWait(browser, timeout=10)\n",
	"    # Click the element returned by the wait, so the click only happens once\n",
	"    # the link is actually usable.\n",
	"    change_roster = wait.until(\n",
	"        EC.element_to_be_clickable((By.ID, 'DERIVED_SSR_FC_SSS_CHG_CLS_LINK'))\n",
	"    )\n",
	"    change_roster.click()\n",
	"    wait.until(EC.presence_of_element_located((By.ID, \"INSTR_CLASS_VW$hviewall$0\")))\n",
	"    view_all()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# edit the regex to suit your needs.\n",
	"def parse_class_name(class_name):\n",
	"    \"\"\"Split a class name such as 'CHEM325-021L(1234)' into its parts.\n",
	"\n",
	"    Argument:\n",
	"    - class_name: the class name; the regex expects no spaces between parts.\n",
	"\n",
	"    Returns a tuple of strings (chem_class, section, class_code),\n",
	"    e.g. ('CHEM325', '021L', '1234').\n",
	"\n",
	"    Raises ValueError if class_name does not match the regex.\n",
	"    \"\"\"\n",
	"    name_regex = re.compile(r'(CHEM\\d\\d\\d)-(\\d\\d\\dL*)\\((\\d+)\\)')\n",
	"    res = name_regex.search(class_name)\n",
	"    if res is None:\n",
	"        # Without this check a non-matching name fails with an opaque\n",
	"        # AttributeError on None.\n",
	"        raise ValueError(f\"class name {class_name!r} does not match {name_regex.pattern!r}\")\n",
	"    chem_class, section, class_code = res.groups()\n",
	"    return chem_class, section, class_code"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Edit code below if you want to test this function\n",
	"# res = parse_class_name('CHEM325-021L(1234)')\n",
	"# res"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"def download_roster(roster_button):\n",
	"    \"\"\"Open one class roster, scrape it, and go back to the class list.\n",
	"\n",
	"    Argument:\n",
	"    - roster_button: the element that opens the roster when clicked.\n",
	"\n",
	"    Returns the roster table as a dataframe, with 'Class', 'Section' and\n",
	"    'Class Code' columns added to every row.\n",
	"\n",
	"    Errors from scraping or from parsing the class name are re-raised, but\n",
	"    only after the browser has been sent back to the class list so that the\n",
	"    next roster can still be processed.\n",
	"    \"\"\"\n",
	"    # click on the class roster\n",
	"    print(\"processing \", roster_button.get_attribute('id'))\n",
	"    roster_button.click()\n",
	"\n",
	"    wait = WebDriverWait(browser, timeout=10)\n",
	"    try:\n",
	"        # make sure table has loaded\n",
	"        wait.until(EC.presence_of_element_located((By.ID, \"trCLASS_ROSTER_VW$0_row1\")))\n",
	"\n",
	"        # find course name\n",
	"        name_element = browser.find_element(By.CSS_SELECTOR, 'a[id$=\"CLASSNAME_LONG\"]')\n",
	"        class_name = name_element.get_attribute('text').replace(\" \", \"\")\n",
	"        print(\"class name: \", class_name)\n",
	"\n",
	"        # scrape table and work out the section info\n",
	"        try:\n",
	"            df = scrape_roster(browser.page_source)\n",
	"            chem_class, section, class_code = parse_class_name(class_name)\n",
	"        except Exception:\n",
	"            print(\"error with: \", class_name)\n",
	"            raise\n",
	"\n",
	"        # add section info to table in all rows\n",
	"        df['Class'] = chem_class\n",
	"        df['Section'] = section\n",
	"        df['Class Code'] = class_code\n",
	"    finally:\n",
	"        # Go back to all classes even on failure; otherwise the browser stays on\n",
	"        # this roster page and every later roster link is stale.\n",
	"        # Click the element returned by the wait rather than an earlier reference.\n",
	"        change_roster = wait.until(EC.element_to_be_clickable((By.ID, \"DERIVED_SSR_FC_SSS_CHG_CLS_LINK\")))\n",
	"        change_roster.click()\n",
	"\n",
	"        # wait for the class list, then click view all button\n",
	"        wait.until(EC.presence_of_element_located((By.ID, \"INSTR_CLASS_VW$hviewall$0\")))\n",
	"        view_all()\n",
	"    return df"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"def download_all_rosters(df_list):\n",
	"    \"\"\"Download every roster linked from the all-rosters page.\n",
	"\n",
	"    The roster links become stale after each page navigation, so they are\n",
	"    looked up again at the start of every iteration and addressed by position.\n",
	"\n",
	"    Argument:\n",
	"    - df_list: a list to save the roster dataframes in (appended to in place).\n",
	"\n",
	"    A roster that fails is reported and skipped. If the class list does not\n",
	"    come back within the timeout, the loop stops; rosters already saved in\n",
	"    df_list are kept.\n",
	"    \"\"\"\n",
	"    roster_links = 'a[id^=\"CLASSROSTER\"]'\n",
	"\n",
	"    print(\"getting initial roster list\")\n",
	"    number_of_rosters = len(browser.find_elements(By.CSS_SELECTOR, roster_links))\n",
	"\n",
	"    for i in range(number_of_rosters):\n",
	"        print(\"refreshing roster list\")\n",
	"        rosters = browser.find_elements(By.CSS_SELECTOR, roster_links)\n",
	"        if i >= len(rosters):\n",
	"            print(\"expected \", number_of_rosters, \" rosters but found only \", len(rosters), \"; stopping\")\n",
	"            break\n",
	"\n",
	"        try:\n",
	"            df_list.append(download_roster(rosters[i]))\n",
	"        except Exception as e:\n",
	"            # Report the failure instead of skipping silently, then carry on.\n",
	"            print(\"skipping roster \", i, \": \", repr(e))\n",
	"\n",
	"        # Some explicit delay seems to be needed here; test to see if this can be shortened?\n",
	"        print(\"sleeping for 3 s\")\n",
	"        time.sleep(3)\n",
	"        print(\"waking up; looking for a class roster as proof that page loaded\")\n",
	"        try:\n",
	"            WebDriverWait(browser, timeout=10).until(\n",
	"                EC.presence_of_element_located((By.CSS_SELECTOR, roster_links))\n",
	"            )\n",
	"        except TimeoutException:\n",
	"            print(\"class list did not reload; stopping after roster \", i)\n",
	"            break"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# Create an empty list to hold rosters as pandas dataframe objects\n",
	"dfs = []"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"tags": []
	},
	"outputs": [],
	"source": [
	"# Before running this, make sure you are on the all-rosters page and that \"View All\" was selected so all classes are displayed\n",
	"download_all_rosters(dfs)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# check that the expected number of classes were found\n",
	"len(dfs) "
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# if you need to re-run the scraping, reset to the all-rosters page\n",
	"reset_to_rosters()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# combine all rosters into one dataframe\n",
	"combined_classes = pd.concat(dfs, axis=0)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# check that the result looks right\n",
	"combined_classes.head(40)"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# export result to .csv\n",
	"combined_classes.to_csv(\"test.csv\")"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": [
	"# close the browser\n",
	"browser.quit()"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "datasci",
	"language": "python",
	"name": "datasci"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 3
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython3",
	"version": "3.10.4"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 4
	}