learn.astanait.edu.kz course parser; paste your cookies from the website into `cookies.txt`
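For reference, the loader below expects `cookies.txt` to hold a single `; `-separated `name=value` string, as copied from the browser's `Cookie` request header. A made-up example of its contents (cookie names and values are placeholders, not real session data):

    csrftoken=abc123; sessionid=def456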
import re
import json
import os

from httpx import Client
from bs4 import BeautifulSoup, Tag
raw_course_urls: list[str] = [
    "https://learn.astanait.edu.kz/courses/course-v1:AITU+KazHist01+24-25_C1_Y1/course/",
    "https://learn.astanait.edu.kz/courses/course-v1:AITU+IntoProg01+24-25_C1_Y1/course/",
    "https://learn.astanait.edu.kz/courses/course-v1:AITU+ICT101+24-25_C1_Y1/course/",
    "https://learn.astanait.edu.kz/courses/course-v1:AITU+CULT002+24-25_C1_Y1/course/",
    "https://learn.astanait.edu.kz/courses/course-v1:AITU+CALC02+24-25_C1_Y1/course/"
]
course_ids: list[str] = [
    url.split("/")[-3]
    for url in raw_course_urls
]
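# e.g. course_ids[0] is "course-v1:AITU+KazHist01+24-25_C1_Y1",
# the third-from-last path segment of raw_course_urls[0]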
# collapses runs of two or more whitespace characters into a single space
RE_SPACE: re.Pattern = re.compile(r'\s{2,}')

http_client: Client = Client(
    follow_redirects = True,
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:132.0) Gecko/20100101 Firefox/132.0"
    }
)
# load session cookies exported from the browser; any "…" characters that
# sneak in when copying values from devtools are stripped out
with open("cookies.txt", "r") as file:
    for cookie in file.read().strip().replace("…", "").split("; "):
        if cookie:
            name, value = cookie.split("=", 1)
            http_client.cookies.set(name, value)
def pretty_string(string: str) -> str:
    # normalize whitespace: newlines and tabs become spaces, runs collapse to one
    return RE_SPACE.sub(" ", string.replace("\n", " ").replace("\t", " ").strip())
def get_course_sections(course_id: str) -> list[dict]:
    # fetch the course outline page and pull out its sections and subsections
    bs: BeautifulSoup = BeautifulSoup(
        markup = http_client.get(f"https://learn.astanait.edu.kz/courses/{course_id}/course/").content,
        features = "html.parser"
    )

    sections: list[Tag] = bs.find_all("li", class_=["outline-item", "section"])
    sections_data: list[dict] = []

    for section in sections:
        section_title: str = pretty_string(section.find("h3", {"class": "section-title"}).text)
        subsections: list[Tag] = section.find_all("li", class_=["subsection", "accordion"])
        subsections_data: list[dict] = []

        for subsection in subsections:
            subsections_data.append({
                "title": pretty_string(subsection.find("h4", {"class": "subsection-title"}).text),
                "url": pretty_string(subsection.find("a")["href"])
            })

        sections_data.append({
            "title": section_title,
            "subsections": subsections_data
        })

    return sections_data
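# A sketch of what get_course_sections returns (the titles and URL below are
# invented for illustration; real values come from the course outline page):
# [
#     {
#         "title": "Week 1",
#         "subsections": [
#             {"title": "Lecture 1 quiz", "url": "https://learn.astanait.edu.kz/..."}
#         ]
#     },
#     ...
# ]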
def parse_subsection(subsection_url: str, course_url: str) -> dict:
    # fetch the subsection page, sending the course page as the Referer
    bs: BeautifulSoup = BeautifulSoup(
        markup = http_client.get(
            url = subsection_url,
            headers = {
                "Referer": course_url
            }
        ).content,
        features = "html.parser"
    )

    print("====================================")

    # re-parse the last "seq_contents" block, whose text is the unit's escaped HTML
    sub_bs: BeautifulSoup = BeautifulSoup(bs.find_all("div", class_=["seq_contents"])[-1].text, "html.parser")
    video_el: Tag = sub_bs.find("div", class_=["video"])

    if video_el:
        # video units carry JSON metadata; "streams" looks like "1.00:<youtube_id>"
        data: dict = json.loads(video_el.attrs["data-metadata"].strip())

        return {
            "url": "https://youtube.com/watch?v=" + data["streams"].split(":", 1)[1]
        }
    problems_data: list[dict] = []

    for problem_wrapper_el in sub_bs.find_all("div", class_=["problems-wrapper"]):
        # each problem's markup is escaped inside its "data-content" attribute
        sub_sub_bs: BeautifulSoup = BeautifulSoup(
            markup = problem_wrapper_el.attrs["data-content"].strip(),
            features = "html.parser"
        )

        problem_els: list[Tag] = sub_sub_bs.find_all("div", class_=["problem"])

        for problem_el in problem_els:
            problem_el = problem_el.find("div", class_=["wrapper-problem-response"])
            problem_type: str | None = None

            # the last child element holds the answer widget (dropdown or choice group)
            main_div: Tag = problem_el.contents[-1]

            if "option-input" in main_div.attrs["class"]:
                problem_type = "select"

            problem_data: dict = {
                "question": (
                    # dropdown problems keep the question in the widget's label;
                    # otherwise it is everything before the answer widget
                    pretty_string(main_div.find("label").text)
                    if problem_type
                    else
                    "\n".join([
                        content.text.strip()
                        for content in problem_el.contents[:-1]
                        if content.text.strip()
                    ]).strip()
                ),
                "options": []
            }
if problem_type == "select": | |
select_el: Tag = main_div.find("select") | |
for option_el in select_el.find_all("option"): | |
option_el: Tag | |
if option_el.attrs["value"].endswith("_default"): | |
continue | |
problem_data["options"].append({ | |
"text": pretty_string(option_el.text), | |
"correct": option_el.attrs.get("selected") is not None | |
}) | |
else: | |
for field_el in main_div.find_all("div", class_=["field"]): | |
field_el: Tag | |
input_el: Tag = field_el.find("input") | |
label_el: Tag = field_el.find("label") | |
problem_type = input_el.attrs["type"] | |
problem_data["options"].append({ | |
"text": pretty_string(field_el.text), | |
"correct": ( | |
bool(["choicegroup_incorrect", "choicegroup_correct"].index(label_el.attrs["class"][-1])) | |
if "submitted" in input_el.attrs["class"] | |
else | |
None | |
) | |
}) | |
problem_data["type"] = problem_type | |
correct_list: list[bool | None] = [] | |
for index, option in enumerate(problem_data["options"]): | |
correct_list.append(option.pop("correct")) | |
problem_data["options"][index] = option | |
if correct_list.count(None) != len(problem_data["options"]): | |
match problem_type: | |
case "select": | |
problem_data["correct_index"] = main_div.find("span", class_=["status", "correct"]) is not None | |
case "radio": | |
try: | |
problem_data["correct_index"] = correct_list.index(True) | |
except ValueError: | |
pass | |
case "checkbox": | |
problem_data["correct_indexes"] = [ | |
index | |
for index, correct in enumerate(correct_list) | |
if correct | |
] | |
if not problem_data["correct_indexes"]: | |
del problem_data["correct_indexes"] | |
problems_data.append(problem_data) | |
return { | |
"problems": problems_data | |
} | |
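# A sketch of the two shapes parse_subsection can return (values are invented
# for illustration): video units yield {"url": "https://youtube.com/watch?v=..."},
# while problem units yield something like
# {
#     "problems": [
#         {
#             "question": "2 + 2 = ?",
#             "options": [{"text": "3"}, {"text": "4"}],
#             "type": "radio",
#             "correct_index": 1
#         }
#     ]
# }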
for course_url, course_id in zip(raw_course_urls, course_ids):
    # one directory per course (":" is not a valid character in some filesystems)
    fixed_course_id_dirname: str = course_id.replace(":", "-")
    os.makedirs(fixed_course_id_dirname, exist_ok = True)

    for course_section in get_course_sections(course_id):
        subsection_datas = []

        for subsection in course_section["subsections"]:
            print(subsection["title"])
            print()

            # send the matching course page as the Referer
            subsection_datas.append(parse_subsection(subsection["url"], course_url))

            print("\tparsed subsection", subsection["title"])

        fixed_course_section_title: str = course_section["title"].replace(":", "-")

        with open(f"{fixed_course_id_dirname}/{fixed_course_section_title}.json", "w") as file:
            json.dump(
                obj = subsection_datas,
                fp = file,
                ensure_ascii = False,
                indent = 4
            )