learn.astanait.edu.kz course parser; paste your cookies from the website into `cookies.txt`
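For reference, the loader below expects `cookies.txt` to hold a single `; `-separated `name=value` string, as copied from the browser's `Cookie` request header. A made-up example of its contents (cookie names and values are placeholders, not real session data):

    csrftoken=abc123; sessionid=def456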
import re
import json
import os

from httpx import Client
from bs4 import BeautifulSoup, Tag
raw_course_urls: list[str] = [
    "https://learn.astanait.edu.kz/courses/course-v1:AITU+KazHist01+24-25_C1_Y1/course/",
    "https://learn.astanait.edu.kz/courses/course-v1:AITU+IntoProg01+24-25_C1_Y1/course/",
    "https://learn.astanait.edu.kz/courses/course-v1:AITU+ICT101+24-25_C1_Y1/course/",
    "https://learn.astanait.edu.kz/courses/course-v1:AITU+CULT002+24-25_C1_Y1/course/",
    "https://learn.astanait.edu.kz/courses/course-v1:AITU+CALC02+24-25_C1_Y1/course/"
]
course_ids: list[str] = [
    url.split("/")[-3]
    for url in raw_course_urls
]
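# e.g. course_ids[0] is "course-v1:AITU+KazHist01+24-25_C1_Y1",
# the third-from-last path segment of raw_course_urls[0]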
# collapses runs of two or more whitespace characters into a single space
RE_SPACE: re.Pattern = re.compile(r'\s{2,}')

http_client: Client = Client(
    follow_redirects = True,
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0"
    }
)
# load session cookies exported from the browser; any "…" characters that
# sneak in when copying values from devtools are stripped out
with open("cookies.txt", "r") as file:
    for cookie in file.read().strip().replace("…", "").split("; "):
        if cookie:
            name, value = cookie.split("=", 1)
            http_client.cookies.set(name, value)
def pretty_string(string: str) -> str:
    # normalize whitespace: newlines and tabs become spaces, runs collapse to one
    return RE_SPACE.sub(" ", string.replace("\n", " ").replace("\t", " ").strip())
def get_course_sections(course_id: str) -> list[dict]:
    # fetch the course outline page and pull out its sections and subsections
    bs: BeautifulSoup = BeautifulSoup(
        markup = http_client.get(f"https://learn.astanait.edu.kz/courses/{course_id}/course/").content,
        features = "html.parser"
    )

    sections: list[Tag] = bs.find_all("li", class_=["outline-item", "section"])
    sections_data: list[dict] = []

    for section in sections:
        section_title: str = pretty_string(section.find("h3", {"class": "section-title"}).text)
        subsections: list[Tag] = section.find_all("li", class_=["subsection", "accordion"])
        subsections_data: list[dict] = []

        for subsection in subsections:
            subsections_data.append({
                "title": pretty_string(subsection.find("h4", {"class": "subsection-title"}).text),
                "url": pretty_string(subsection.find("a")["href"])
            })

        sections_data.append({
            "title": section_title,
            "subsections": subsections_data
        })

    return sections_data
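# A sketch of what get_course_sections returns (the titles and URL below are
# invented for illustration; real values come from the course outline page):
# [
#     {
#         "title": "Week 1",
#         "subsections": [
#             {"title": "Lecture 1 quiz", "url": "https://learn.astanait.edu.kz/..."}
#         ]
#     },
#     ...
# ]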
def parse_subsection(subsection_url: str, course_url: str) -> dict:
    # fetch the subsection page, sending the course page as the Referer
    bs: BeautifulSoup = BeautifulSoup(
        markup = http_client.get(
            url = subsection_url,
            headers = {
                "Referer": course_url
            }
        ).content,
        features = "html.parser"
    )

    print("====================================")

    # re-parse the last "seq_contents" block, whose text is the unit's escaped HTML
    sub_bs: BeautifulSoup = BeautifulSoup(bs.find_all("div", class_=["seq_contents"])[-1].text, "html.parser")
    video_el: Tag = sub_bs.find("div", class_=["video"])

    if video_el:
        # video units carry JSON metadata; "streams" looks like "1.00:<youtube_id>"
        data: dict = json.loads(video_el.attrs["data-metadata"].strip())

        return {
            "url": "https://youtube.com/watch?v=" + data["streams"].split(":", 1)[1]
        }
    problems_data: list[dict] = []

    for problem_wrapper_el in sub_bs.find_all("div", class_=["problems-wrapper"]):
        # each problem's markup is escaped inside its "data-content" attribute
        sub_sub_bs: BeautifulSoup = BeautifulSoup(
            markup = problem_wrapper_el.attrs["data-content"].strip(),
            features = "html.parser"
        )

        problem_els: list[Tag] = sub_sub_bs.find_all("div", class_=["problem"])

        for problem_el in problem_els:
            problem_el = problem_el.find("div", class_=["wrapper-problem-response"])
            problem_type: str | None = None

            # the last child element holds the answer widget (dropdown or choice group)
            main_div: Tag = problem_el.contents[-1]

            if "option-input" in main_div.attrs["class"]:
                problem_type = "select"

            problem_data: dict = {
                "question": (
                    # dropdown problems keep the question in the widget's label;
                    # otherwise it is everything before the answer widget
                    pretty_string(main_div.find("label").text)
                    if problem_type
                    else
                    "\n".join([
                        content.text.strip()
                        for content in problem_el.contents[:-1]
                        if content.text.strip()
                    ]).strip()
                ),
                "options": []
            }
if problem_type == "select": | |
select_el: Tag = main_div.find("select") | |
for option_el in select_el.find_all("option"): | |
option_el: Tag | |
if option_el.attrs["value"].endswith("_default"): | |
continue | |
problem_data["options"].append({ | |
"text": pretty_string(option_el.text), | |
"correct": option_el.attrs.get("selected") is not None | |
}) | |
else: | |
for field_el in main_div.find_all("div", class_=["field"]): | |
field_el: Tag | |
input_el: Tag = field_el.find("input") | |
label_el: Tag = field_el.find("label") | |
problem_type = input_el.attrs["type"] | |
problem_data["options"].append({ | |
"text": pretty_string(field_el.text), | |
"correct": ( | |
bool(["choicegroup_incorrect", "choicegroup_correct"].index(label_el.attrs["class"][-1])) | |
if "submitted" in input_el.attrs["class"] | |
else | |
None | |
) | |
}) | |
problem_data["type"] = problem_type | |
correct_list: list[bool | None] = [] | |
for index, option in enumerate(problem_data["options"]): | |
correct_list.append(option.pop("correct")) | |
problem_data["options"][index] = option | |
if correct_list.count(None) != len(problem_data["options"]): | |
match problem_type: | |
case "select": | |
problem_data["correct_index"] = main_div.find("span", class_=["status", "correct"]) is not None | |
case "radio": | |
try: | |
problem_data["correct_index"] = correct_list.index(True) | |
except ValueError: | |
pass | |
case "checkbox": | |
problem_data["correct_indexes"] = [ | |
index | |
for index, correct in enumerate(correct_list) | |
if correct | |
] | |
if not problem_data["correct_indexes"]: | |
del problem_data["correct_indexes"] | |
problems_data.append(problem_data) | |
return { | |
"problems": problems_data | |
} | |
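# A sketch of the two shapes parse_subsection can return (values are invented
# for illustration): video units yield {"url": "https://youtube.com/watch?v=..."},
# while problem units yield something like
# {
#     "problems": [
#         {
#             "question": "2 + 2 = ?",
#             "options": [{"text": "3"}, {"text": "4"}],
#             "type": "radio",
#             "correct_index": 1
#         }
#     ]
# }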
for course_url, course_id in zip(raw_course_urls, course_ids):
    # one directory per course (":" is not a valid character in some filesystems)
    fixed_course_id_dirname: str = course_id.replace(":", "-")
    os.makedirs(fixed_course_id_dirname, exist_ok = True)

    for course_section in get_course_sections(course_id):
        subsection_datas = []

        for subsection in course_section["subsections"]:
            print(subsection["title"])
            print()

            # send the matching course page as the Referer
            subsection_datas.append(parse_subsection(subsection["url"], course_url))

            print("\tparsed subsection", subsection["title"])

        fixed_course_section_title: str = course_section["title"].replace(":", "-")

        with open(f"{fixed_course_id_dirname}/{fixed_course_section_title}.json", "w") as file:
            json.dump(
                obj = subsection_datas,
                fp = file,
                ensure_ascii = False,
                indent = 4
            )