Last active
February 15, 2023 04:46
-
-
Save Zeta611/e774e7e0c17824752ee9988708f94dec to your computer and use it in GitHub Desktop.
[Real World Web Scraping] I used this script to find and parse missing ISBNs to help a librarian. Fixed 700+ books, took 3 hours to write. #automation #demo
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import re | |
import requests | |
import webbrowser | |
registrationNumbers = [ | |
"HHA000010715", | |
"HHA000010711", | |
"HHA000001827", | |
"HHA000008672", | |
"HHA000008410", | |
"HHA000008403", | |
"HHA000005292", | |
"HHA000008701", | |
"HHA000009694", | |
"HHA000010286", | |
"HHA000010288", | |
"HHA000010290", | |
"HHA000010292", | |
"HHA000010294", | |
"HHA000010296", | |
"HHA000000172", | |
"HHA000001334", | |
"HHA000010018", | |
"HHA000001267", | |
"HHA000000562", | |
"HHA000002775", | |
"HHA000010699", | |
"HHA000010701", | |
"HHA000010703", | |
"HHA000010705", | |
"HHA000010707", | |
"HHA000010709", | |
"HHA000010745", | |
"HHA000010743", | |
"HHA000010741", | |
"HHA000010739", | |
"HHA000010737", | |
"HHA000010735", | |
"HHA000010731", | |
"HHA000010266", | |
"HHA000010268", | |
"HHA000010270", | |
"HHA000010272", | |
"HHA000010274", | |
"HHA000010276", | |
"HHA000010278", | |
"HHA000010280", | |
"HHA000010282", | |
"HHA000010284", | |
"HHA000009594", | |
"HHA000010729", | |
"HHA000001652", | |
"HHA000010733", | |
"HHA000011537", | |
"HHA000011535", | |
"HHA000011533", | |
"HHA000011531", | |
"HHA000011529", | |
"HHA000011527", | |
"HHA000006438", | |
"HHA000001637", | |
"HHA000002601", | |
"HHA000006441", | |
"HHA000011497", | |
"HHA000003616", | |
"HHA000000066", | |
"HHA000000560", | |
"HHA000008111", | |
"HHA000001389", | |
"HHA000008337", | |
"HHA000008336", | |
"HHA000008334", | |
"HHA000002631", | |
"HHA000008333", | |
"HHA000008331", | |
"HHA000008326", | |
"HHA000005213", | |
"HHA000011849", | |
"HHA000011850", | |
"HHA000011851", | |
"HHA000011852", | |
"HHA000011853", | |
"HHA000011854", | |
"HHA000011855", | |
"HHA000011856", | |
"HHA000001138", | |
"HHA000012794", | |
"HHA000010022", | |
"HHA000010024", | |
"HHA000000644", | |
"HHA000010028", | |
"HHA000010030", | |
"HHA000010032", | |
"HHA000010034", | |
"HHA000010036", | |
"HHA000005391", | |
"HHA000010042", | |
"HHA000010040", | |
"HHA000000434", | |
"HHA000010038", | |
"HHA000001660", | |
"HHA000010717", | |
"HHA000010719", | |
"HHA000010721", | |
"HHA000010723", | |
"HHA000010026", | |
"HHA000010727", | |
"HHA000006440", | |
"HHA000011495", | |
"HHA000011493", | |
"HHA000010020", | |
"HHA000001720", | |
"HHA000010016", | |
"HHA000010014", | |
"HHA000010012", | |
"HHA000000617", | |
"HHA000010010", | |
"HHA000010008", | |
"HHA000010006", | |
"HHA000010004", | |
"HHA000001305", | |
"HHA000001297", | |
"HHA000010795", | |
"HHA000001695", | |
"HHA000000564", | |
"HHA000012701", | |
"HHA000000131", | |
"HHA000002721", | |
"HHA000012726", | |
"HHA000009463", | |
"HHA000010725", | |
"HHA000010713", | |
"HHA000011472", | |
"HHA000011463", | |
"HHA000011460", | |
"HHA000011457", | |
"HHA000011454", | |
"HHA000011451", | |
"HHA000011433", | |
"HHA000011481", | |
"HHA000011484", | |
"HHA000011487", | |
"HHA000011490", | |
"HHA000011436", | |
"HHA000011466", | |
"HHA000010150", | |
"HHA000010147", | |
"HHA000012791", | |
"HHA000010141", | |
"HHA000010144", | |
"HHA000011469", | |
"HHA000010117", | |
"HHA000011507", | |
"HHA000011503", | |
"HHA000011515", | |
"HHA000011519", | |
"HHA000011523", | |
"HHA000010121", | |
"HHA000010125", | |
"HHA000010129", | |
"HHA000010133", | |
"HHA000010137", | |
"HHA000010113", | |
"HHA000011511", | |
"HHA000010109", | |
"HHA000010105", | |
"HHA000011499", | |
"HHA000010609", | |
"HHA000010614", | |
"HHA000010619", | |
"HHA000010624", | |
"HHA000010629", | |
"HHA000010634", | |
"HHA000012845", | |
"HHA000011553", | |
"HHA000009223", | |
"HHA000011547", | |
"HHA000011427", | |
"HHA000011421", | |
"HHA000011415", | |
"HHA000011409", | |
"HHA000011403", | |
"HHA000011391", | |
"HHA000009858", | |
"HHA000011379", | |
"HHA000011373", | |
"HHA000011367", | |
"HHA000011361", | |
"HHA000011349", | |
"HHA000011343", | |
"HHA000011337", | |
"HHA000010153", | |
"HHA000010159", | |
"HHA000011319", | |
"HHA000011313", | |
"HHA000011307", | |
"HHA000011301", | |
"HHA000010907", | |
"HHA000011289", | |
"HHA000011283", | |
"HHA000011277", | |
"HHA000011271", | |
"HHA000011265", | |
"HHA000011259", | |
"HHA000010955", | |
"HHA000010961", | |
"HHA000010967", | |
"HHA000010973", | |
"HHA000010979", | |
"HHA000010985", | |
"HHA000010991", | |
"HHA000010997", | |
"HHA000011003", | |
"HHA000011009", | |
"HHA000011015", | |
"HHA000011021", | |
"HHA000011151", | |
"HHA000011157", | |
"HHA000011163", | |
"HHA000011169", | |
"HHA000011175", | |
"HHA000011181", | |
"HHA000011187", | |
"HHA000011193", | |
"HHA000011199", | |
"HHA000011205", | |
"HHA000011211", | |
"HHA000011217", | |
"HHA000011223", | |
"HHA000011229", | |
"HHA000011235", | |
"HHA000011241", | |
"HHA000011247", | |
"HHA000011253", | |
"HHA000011295", | |
"HHA000011385", | |
"HHA000011331", | |
"HHA000011325", | |
"HHA000010948", | |
"HHA000010913", | |
"HHA000010920", | |
"HHA000010900", | |
"HHA000010893", | |
"HHA000010886", | |
"HHA000010934", | |
"HHA000010927", | |
"HHA000010941", | |
"HHA000009699", | |
"HHA000014201", | |
"HHA000013941", | |
"HHA000013971", | |
"HHA000013961", | |
"HHA000013951", | |
"HHA000014037", | |
"HHA000014027", | |
"HHA000009387", | |
"HHA000017006", | |
"HHA000016994", | |
"HHA000016982", | |
"HHA000016970", | |
"HHA000016958", | |
"HHA000016946", | |
"HHA000009859", | |
] | |
class ScrapeException(Exception):
    """Signals that a scraped search page did not hold exactly one result."""
class ISBNException(Exception):
    """Signals that an ISBN could not be extracted from a detail page."""
# Search
def searchURLForRegistrationNumber(registrationNumber):
    """Build the detailed-search URL that queries by registration number.

    The registration number is placed in ``searchKeyword6`` (whose key is
    ``REG_NO``); every other keyword field in the query string is left empty.

    Args:
        registrationNumber: Registration number to look up, e.g.
            ``"HHA000010715"``.

    Returns:
        The absolute ``plusSearchResultList.do`` URL as a string.
    """
    # Local import: the file's top-level imports do not include urllib.
    from urllib.parse import quote

    # Percent-encode the value so characters such as "&", "=", "#" or spaces
    # cannot spill into neighbouring query parameters. Plain alphanumeric
    # registration numbers are left unchanged by this.
    keyword = quote(str(registrationNumber), safe="")
    return (
        "https://www.l4d.or.kr/yelc/menu/10441/program/30011/plusSearchResultList.do"
        "?searchType=DETAIL&searchCategory=ALL"
        "&searchKey1=TITLE&searchKeyword1=&searchOperator1=AND"
        "&searchKey2=AUTHOR&searchKeyword2=&searchOperator2=AND"
        "&searchKey3=PUBLISHER&searchKeyword3=&searchOperator3=AND"
        "&searchKey4=KEYWORD&searchKeyword4=&searchOperator4=AND"
        f"&searchKey6=REG_NO&searchKeyword6={keyword}&searchOperator6=AND"
        "&searchKey5=ISBN&searchKeyword5=&searchOperator5=AND"
        "&searchPublishStartYear=&searchPublishEndYear="
        "&searchLibrary=MC&searchRoom=ALL"
        "&searchSort=KEY&searchOrder=DESC&searchRecordCount=10"
    )
# Detail URL
def detailURLForBook(record, book):
    """Build the detail-page URL for one search hit.

    Args:
        record: Value for the ``recKey`` query parameter.
        book: Value for the ``bookKey`` query parameter.

    Returns:
        The absolute ``plusSearchResultDetail.do`` URL as a string.
    """
    # Local import: the file's top-level imports do not include urllib.
    from urllib.parse import quote

    # Both keys are scraped from a remote page, so percent-encode them rather
    # than trusting them to be query-string safe.
    rec_key = quote(str(record), safe="")
    book_key = quote(str(book), safe="")
    return (
        "https://www.l4d.or.kr/yelc/menu/10441/program/30011/plusSearchResultDetail.do"
        "?searchType=DETAIL&searchMenuCollectionCategory=&searchCategory=ALL"
        "&searchKey=&searchKey1=TITLE&searchKey2=AUTHOR&searchKey3=PUBLISHER"
        "&searchKey4=KEYWORD&searchKey5=ISBN"
        "&searchKeyword=&searchKeyword1=&searchKeyword2=&searchKeyword3="
        "&searchKeyword4=&searchKeyword5="
        "&searchOperator1=AND&searchOperator2=AND&searchOperator3=AND"
        "&searchOperator4=AND&searchOperator5=AND"
        "&searchPublishStartYear=&searchPublishEndYear="
        "&searchLibrary=MC&searchLibraryArr=MC&searchRoom=ALL"
        "&searchKdc=&searchIsbn="
        "&searchSort=KEY&searchOrder=DESC&searchRecordCount=10"
        # Must be spelled "&currentPageNo": an HTML-entity pass turns
        # "&curren" into the currency sign and corrupts the parameter name.
        "&currentPageNo=1&viewStatus=IMAGE"
        "&preSearchKey=&preSearchKeyword=&reSearchYn=N"
        f"&recKey={rec_key}&bookKey={book_key}"
        "&publishFormCode=BO&searchSeparateShelfCode="
    )
# Find record / book
def getRecordAndBookIDs(searchURL, timeout=30):
    """Fetch a search-results page and return the keys of its single hit.

    Args:
        searchURL: URL of the search-results page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A tuple holding the first two ``^``-separated fields of the ``value``
        attribute of the hit's checkbox ``<input>``. The call site unpacks
        them as ``record, book``.

    Raises:
        ScrapeException: If the result list does not contain exactly one
            ``<li>`` element.
        AttributeError: If the result ``<ul>`` or the ``<span class="chk">``
            wrapper is missing (``find`` returned ``None``).
        TypeError: If the wrapper contains no ``<input>``.
        KeyError: If the ``<input>`` has no ``value`` attribute.
        requests.RequestException: If the request fails or times out.
    """
    # Without a timeout a single stalled connection would block the whole
    # batch run indefinitely.
    response = requests.get(searchURL, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    result_list = soup.find("ul", {"class": "resultList imageType"})
    hits = result_list.find_all("li")
    if len(hits) != 1:
        raise ScrapeException(
            f"Expected exactly 1 <li> element, found {(len(hits))}"
        )

    checkbox = hits[0].find("span", {"class": "chk"}).find("input")
    fields = checkbox["value"].split("^")
    return tuple(fields[:2])
def _rowText(soup, header):
    """Return the text of the <td> beside the row header *header*, or None."""
    th = soup.find("th", {"scope": "row"}, string=header)
    if th is None:
        return None
    td = th.find_next_sibling("td")
    return None if td is None else td.text


def parseDetail(detailURL, timeout=30):
    """Download a book detail page and extract the fields the report needs.

    Args:
        detailURL: URL of the detail page to download.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A tuple ``(title, detail, isbns, individuals)`` where

        * ``title`` is the text of the first ``<h4>`` with trailing ``/``
          characters stripped;
        * ``detail`` is the text of the cell beside the "주기사항" (notes) row
          header, or ``None`` when that row is absent;
        * ``isbns`` is a list of ``(isbn, extra)`` tuples parsed from the cell
          beside the "표준번호" (standard number) row header, empty when that
          row is absent; ``extra`` is ``None`` when nothing trails the digits;
        * ``individuals`` is a list with the ``.string`` of the fourth
          ``<td>`` of every body row of the first ``<table class="tbl">``.

    Raises:
        ISBNException: If an ``ISBN`` marker is not followed by any digits.
        AttributeError: If the ``<h4>``, the table or its ``<tbody>`` is
            missing.
        IndexError: If a table row has fewer than four cells.
        requests.RequestException: If the request fails or times out.
    """
    # Without a timeout a single stalled connection would block the whole
    # batch run indefinitely.
    response = requests.get(detailURL, timeout=timeout)
    soup = BeautifulSoup(response.content, "html.parser")

    title = soup.find("h4").text.rstrip("/")
    detail = _rowText(soup, "주기사항")

    isbn_regex = r"ISBN:?\s*(?P<isbn>\d*)(?P<extra>[^:\s]+)?"
    isbns = []
    isbns_text = _rowText(soup, "표준번호")
    if isbns_text is not None:
        for m in re.finditer(isbn_regex, isbns_text):
            # The "isbn" group is \d*, so it always takes part in the match
            # and is "" (never None) when no digits follow the marker.
            if not m["isbn"]:
                raise ISBNException(f"ISBN not found in {isbns_text}")
            isbns.append((m["isbn"], m["extra"]))

    # NOTE(review): the fourth column is presumably the per-copy registration
    # number (the caller writes it out as such) -- confirm against the page.
    table_body = soup.find("table", {"class": "tbl"}).find("tbody")
    individuals = [
        row.find_all("td")[3].string for row in table_body.find_all("tr")
    ]

    return (title, detail, isbns, individuals)
# Driver: look every registration number up, write one TSV row per individual
# copy to result.txt, and record everything that needs manual review in
# log.txt (the offending page is also opened in a browser tab).
# The files hold Korean text, so the encoding is fixed to UTF-8 instead of
# depending on the platform default.
with open("result.txt", "w", encoding="utf-8") as fresult, \
        open("log.txt", "w", encoding="utf-8") as flog:
    fresult.write("Title\tRegistration Number\tISBN\tExtra\tDetail\tURL\n")
    flog.write("Registration Number\tURL\tTitle\tReason\n")

    total = len(registrationNumbers)
    for i, n in enumerate(registrationNumbers):
        if i % 10 == 0:
            print(f"Progress: [{i}/{total}] ({i / total * 100:.2f}%)")

        url = searchURLForRegistrationNumber(n)
        try:
            record, book = getRecordAndBookIDs(url)
        except ScrapeException:
            webbrowser.open_new_tab(url)
            flog.write(
                f"{n}\t{url}\t???\tSearch returned zero or more than one results\n"
            )
            continue
        except Exception as e:
            # Best effort: log any other failure and move on to the next book.
            webbrowser.open_new_tab(url)
            flog.write(f"{n}\t{url}\t???\tUnknown error {e}\n")
            continue

        detailURL = detailURLForBook(record, book)
        try:
            title, detail, isbns, individuals = parseDetail(detailURL)
        except ISBNException:
            # parseDetail raised before returning, so this book's title is
            # unknown; log a placeholder instead of an unbound or stale name.
            webbrowser.open_new_tab(detailURL)
            flog.write(f"{n}\t{detailURL}\t???\tISBN not found\n")
            continue
        except Exception as e:
            webbrowser.open_new_tab(detailURL)
            flog.write(f"{n}\t{detailURL}\t???\tUnknown error {e}\n")
            continue

        if not individuals:
            webbrowser.open_new_tab(detailURL)
            flog.write(
                f"{n}\t{detailURL}\t{title}\tIndividual registration numbers missing\n"
            )
            continue

        if len(isbns) != len(individuals):
            # ISBNs are paired with copies by position, so the counts must match.
            webbrowser.open_new_tab(detailURL)
            flog.write(f"{n}\t{detailURL}\t{title}\tFull ISBNs are absent\n")
            continue

        if len(individuals) == 1:
            # Flagged for manual review, but the row is still written below.
            webbrowser.open_new_tab(detailURL)
            flog.write(
                f"{n}\t{detailURL}\t{title}\tThere is only one individual book here. Please check!\n"
            )

        detail_text = detail if detail is not None else ""
        for registrationNumber, (isbn, extra) in zip(individuals, isbns):
            extra_text = extra if extra is not None else ""
            fresult.write(
                f"{title}\t{registrationNumber}\t{isbn}\t{extra_text}\t{detail_text}\t{detailURL}\n"
            )
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment