learn.astanait.edu.kz parser; paste your cookies from the website into `cookies.txt`.
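`cookies.txt` should contain the raw `Cookie` header value copied from your browser's dev tools, i.e. `name=value` pairs separated by `; `. The cookie names here are only illustrative, e.g. `sessionid=abc123; csrftoken=def456`.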
from httpx import Client
from bs4 import BeautifulSoup, Tag

import re
import json
import os

raw_course_urls: list[str] = [
    "https://learn.astanait.edu.kz/courses/course-v1:AITU+KazHist01+24-25_C1_Y1/course/",
    "https://learn.astanait.edu.kz/courses/course-v1:AITU+IntoProg01+24-25_C1_Y1/course/",
    "https://learn.astanait.edu.kz/courses/course-v1:AITU+ICT101+24-25_C1_Y1/course/",
    "https://learn.astanait.edu.kz/courses/course-v1:AITU+CULT002+24-25_C1_Y1/course/",
    "https://learn.astanait.edu.kz/courses/course-v1:AITU+CALC02+24-25_C1_Y1/course/"
]

# Extract the course id segment, e.g. "course-v1:AITU+KazHist01+24-25_C1_Y1".
course_ids: list[str] = [
    url.split("/")[-3]
    for url in raw_course_urls
]

RE_SPACE: re.Pattern = re.compile(r'\s{2,}')  # runs of two or more whitespace characters

http_client: Client = Client(
    follow_redirects = True,
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0"
    }
)
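
# Load the session cookies saved from the browser: cookies.txt holds
# "name=value" pairs separated by "; ", as copied from the Cookie request header.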
with open("cookies.txt", "r") as file:
    for cookie in file.read().strip().replace("…", "").split("; "):
        if cookie:
            name, value = cookie.split("=", 1)
            http_client.cookies.set(name, value)

def pretty_string(string: str) -> str:
    # Normalize whitespace: newlines/tabs become spaces, runs collapse to one space.
    return RE_SPACE.sub(" ", string.replace("\n", " ").replace("\t", " ").strip())

def get_course_sections(course_id: str) -> list[dict]:
    # Fetch the course outline page and collect each section's title together
    # with its subsections' titles and URLs.
    bs: BeautifulSoup = BeautifulSoup(
        markup = http_client.get(f"https://learn.astanait.edu.kz/courses/{course_id}/course/").content,
        features = "html.parser"
    )
    sections: list[Tag] = bs.find_all("li", class_=["outline-item", "section"])
    sections_data: list[dict] = []
    for section in sections:
        section_title: str = pretty_string(section.find("h3", {"class": "section-title"}).text)
        subsections: list[Tag] = section.find_all("li", class_=["subsection", "accordion"])
        subsections_data: list[dict] = []
        for subsection in subsections:
            subsections_data.append({
                "title": pretty_string(subsection.find("h4", {"class": "subsection-title"}).text),
                "url": pretty_string(subsection.find("a")["href"])
            })
        sections_data.append({
            "title": section_title,
            "subsections": subsections_data
        })
    return sections_data
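
# A subsection page embeds each unit as escaped HTML inside <div class="seq_contents">;
# the last such block holds either a video player (div.video with JSON metadata)
# or a set of problem widgets (div.problems-wrapper).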
def parse_subsection(subsection_url: str, course_url: str) -> dict:
    bs: BeautifulSoup = BeautifulSoup(
        markup = http_client.get(
            url = subsection_url,
            headers = {
                "Referer": course_url
            }
        ).content,
        features = "html.parser"
    )
    print("====================================")
    sub_bs: BeautifulSoup = BeautifulSoup(bs.find_all("div", class_=["seq_contents"])[-1].text, "html.parser")
    video_el: Tag = sub_bs.find("div", class_=["video"])
    if video_el:
        # Video unit: "streams" looks like "<rate>:<YouTube id>", so take
        # everything after the first colon.
        data: dict = json.loads(video_el.attrs["data-metadata"].strip())
        return {
            "url": "https://youtube.com/watch?v=" + data["streams"].split(":", 1)[1]
        }
    # Otherwise this is a problem unit: each widget's actual markup lives in
    # its data-content attribute.
    problems_data: list[dict] = []
    for problem_wrapper_el in sub_bs.find_all("div", class_=["problems-wrapper"]):
        sub_sub_bs: BeautifulSoup = BeautifulSoup(
            markup = problem_wrapper_el.attrs["data-content"].strip(),
            features = "html.parser"
        )
        problem_els: list[Tag] = sub_sub_bs.find_all("div", class_=["problem"])
        for problem_el in problem_els:
            problem_el = problem_el.find("div", class_=["wrapper-problem-response"])
            problem_type: str | None = None
            main_div: Tag = problem_el.contents[-1]
            if "option-input" in main_div.attrs["class"]:
                problem_type = "select"
            problem_data: dict = {
                "question": (
                    # Dropdown problems keep the question text in the label;
                    # other types keep it in the elements before the input block.
                    pretty_string(main_div.find("label").text)
                    if problem_type
                    else
                    "\n".join([
                        content.text.strip()
                        for content in problem_el.contents[:-1]
                        if content.text.strip()
                    ]).strip()
                ),
                "options": []
            }
            if problem_type == "select":
                # Dropdown: every <option> except the "_default" placeholder is
                # an answer; the selected option is the submitted answer.
                select_el: Tag = main_div.find("select")
                for option_el in select_el.find_all("option"):
                    option_el: Tag
                    if option_el.attrs["value"].endswith("_default"):
                        continue
                    problem_data["options"].append({
                        "text": pretty_string(option_el.text),
                        "correct": option_el.attrs.get("selected") is not None
                    })
            else:
                # Radio/checkbox: one .field per choice; correctness is only
                # visible once the problem has been submitted.
                for field_el in main_div.find_all("div", class_=["field"]):
                    field_el: Tag
                    input_el: Tag = field_el.find("input")
                    label_el: Tag = field_el.find("label")
                    problem_type = input_el.attrs["type"]
                    problem_data["options"].append({
                        "text": pretty_string(field_el.text),
                        "correct": (
                            label_el.attrs["class"][-1] == "choicegroup_correct"
                            if "submitted" in input_el.attrs["class"]
                            else
                            None
                        )
                    })
            problem_data["type"] = problem_type
            # Strip the per-option "correct" flags and summarize them on the
            # problem itself (popping mutates the option dicts in place).
            correct_list: list[bool | None] = [
                option.pop("correct")
                for option in problem_data["options"]
            ]
            if correct_list.count(None) != len(problem_data["options"]):
                match problem_type:
                    case "select":
                        # Boolean: whether the submitted answer was marked correct.
                        problem_data["correct_index"] = main_div.find("span", class_=["status", "correct"]) is not None
                    case "radio":
                        try:
                            problem_data["correct_index"] = correct_list.index(True)
                        except ValueError:
                            pass
                    case "checkbox":
                        problem_data["correct_indexes"] = [
                            index
                            for index, correct in enumerate(correct_list)
                            if correct
                        ]
                        if not problem_data["correct_indexes"]:
                            del problem_data["correct_indexes"]
            problems_data.append(problem_data)
    return {
        "problems": problems_data
    }
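
# Main loop: one directory per course, one JSON file per course section.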
for course_id in course_ids:
    fixed_course_id_dirname: str = course_id.replace(":", "-")
    os.makedirs(fixed_course_id_dirname, exist_ok = True)
    course_url: str = f"https://learn.astanait.edu.kz/courses/{course_id}/course/"
    for course_section in get_course_sections(course_id):
        subsection_datas: list[dict] = []
        for subsection in course_section["subsections"]:
            print(subsection["title"])
            print()
            # The subsection request sends this course's outline page as the Referer.
            subsection_datas.append(parse_subsection(subsection["url"], course_url))
            print("\tparsed subsection", subsection["title"])
        fixed_course_section_title: str = course_section["title"].replace(":", "-")
        with open(f"{fixed_course_id_dirname}/{fixed_course_section_title}.json", "w") as file:
            json.dump(
                obj = subsection_datas,
                fp = file,
                ensure_ascii = False,
                indent = 4
            )
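
Each course gets its own directory (the course id with `:` replaced by `-`), holding one JSON file per section. A video subsection is stored as `{"url": ...}` and a problem subsection as `{"problems": [...]}`.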