preraku · January 31, 2025 07:28
diff --git a/fetch_page.py b/fetch_page.py
 import requests
 from bs4 import BeautifulSoup
 import json
 import time


 def scrape_oscars(url):
    """Download an Oscars ceremony page and extract nominees per category.

    The raw HTML is also written to ``response.html`` in the current working
    directory.

    Args:
        url: Address of the ceremony page to download.

    Returns:
        A list of ``{"category": str, "nominees": [{"nominee": str,
        "movie": str}, ...]}`` dicts, one for every category in which at
        least one nominee was found, or ``None`` when the HTTP request failed.
    """
    # Browser-like request headers.
    # "Host" is deliberately not set: it is derived from ``url`` by the HTTP
    # stack, so a hard-coded value would be wrong for any other host.
    # "Accept-Encoding" is deliberately not set: requests then sends its own
    # default, which only lists encodings it is able to decode. Advertising
    # "br"/"zstd" without the optional decoders installed can leave
    # ``response.text`` undecoded, which makes the parse below find nothing.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "en-US,en;q=0.9,hi;q=0.8",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Cookie": "cookiebot-consent--necessary=1; CookieConsent={stamp:%27pAmVLYMcQ614PaM+h0x6sU/1TjRsiEUedF1AGTkWLCJPlBiZb/wk+Q==%27%2Cnecessary:true%2Cpreferences:true%2Cstatistics:true%2Cmarketing:true%2Cmethod:%27implied%27%2Cver:1%2Cutc:1738230798731%2Cregion:%27us-06%27}; cookiebot-consent--preferences=1; cookiebot-consent--statistics=1; cookiebot-consent--marketing=1",
        "Pragma": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        "sec-ch-ua": "\"Google Chrome\";v=\"131\", \"Chromium\";v=\"131\", \"Not_A Brand\";v=\"24\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"macOS\"",
    }

    try:
        response = requests.get(url, headers=headers, timeout=5)
        response.raise_for_status()
    except requests.RequestException as e:
        # Connection problems, timeouts and HTTP error statuses all end here.
        print(f"Error fetching the page: {e}")
        return None

    # Save the response to a file so the page can be re-parsed offline.
    with open("response.html", "w", encoding="utf-8") as f:
        f.write(response.text)

    soup = BeautifulSoup(response.text, "html.parser")

    oscars_data = []
    for category in soup.find_all("div", class_="view-grouping"):
        category_name_tag = category.find("div", class_="view-grouping-header")
        if not category_name_tag:
            # A grouping without a header cannot be labelled; skip it.
            continue
        category_name = category_name_tag.get_text(strip=True)

        nominees = []
        for nominee in category.find_all("div", class_="views-row"):
            nominee_name_tag = nominee.find("div", class_="views-field-title")
            if not nominee_name_tag:
                continue
            movie_name_tag = nominee.find(
                "div", class_="views-field-field-film-title")

            nominees.append({
                "nominee": nominee_name_tag.get_text(strip=True),
                # The film title is optional; fall back to an empty string.
                "movie": movie_name_tag.get_text(strip=True) if movie_name_tag else "",
            })

        # Categories without any usable nominee are left out of the result.
        if nominees:
            oscars_data.append(
                {"category": category_name, "nominees": nominees})

    return oscars_data


 # Download and parse the 2025 ceremony page.
 url = "https://www.oscars.org/oscars/ceremonies/2025"
 oscars_data = scrape_oscars(url)

 # Write the JSON file only when the scrape produced something: ``None``
 # signals a failed request and an empty list means nothing was found.
 if oscars_data:
    with open("oscars_2025.json", "w", encoding="utf-8") as f:
        f.write(json.dumps(oscars_data, indent=4, ensure_ascii=False))
    print("Oscars data saved to oscars_2025.json")

 print("Done!")
diff --git a/scrape.py b/scrape.py
 from bs4 import BeautifulSoup
 import json


 def clean_text(text):
    """Collapse each run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is dropped as well. This repairs titles
    that are broken across lines with padding in between, such as
    "Instruments of a Beating" + newline + spaces + "Heart".
    """
    words = text.split()
    return ' '.join(words)


 # Some awards come out with nominee and film flipped, so they have to be
 # flipped back. Entries that are currently disabled:
 #   "Actor in a Leading Role", "Actress in a Leading Role",
 #   "Actor in a Supporting Role", "Actress in a Supporting Role"
 awards_to_flip = {
    "International Feature Film",
 }

 # This award is too tough to scrape, so it is skipped here and must be
 # added by hand.
 awards_to_ignore = {
    "Music (Original Song)",
 }

 # For best picture, I do not care about the nominee, just the film.
 # I want the first element of the awards list to be "Best Picture".
 # I do not want to add the nominee to the movie data.


 def scrape_oscars():
    """Parse the saved ceremony page into movie and award records.

    Reads ``response.html`` from the current working directory. Categories
    listed in ``awards_to_ignore`` are skipped, and for categories listed in
    ``awards_to_flip`` the scraped nominee and film names are swapped.
    Categories or nominees whose expected elements are missing from the
    markup are skipped instead of aborting the whole parse.

    Returns:
        A ``(movie_list, awards_list)`` tuple.

        * ``movie_list`` holds one dict per film with ``'id'``, ``'title'``
          and ``'poster'`` (always ``''``), plus one
          ``category name -> nominee name`` entry for every category other
          than "Best Picture" in which the film appears.
        * ``awards_list`` holds one dict per award with ``'id'``, ``'name'``
          and ``'nominees'`` (a list of film ids). "Best Picture" is always
          the first entry, with id 0.

    Raises:
        OSError: If ``response.html`` cannot be opened.
    """
    with open("response.html", "r", encoding="utf-8") as f:
        response = f.read()

    soup = BeautifulSoup(response, "html.parser")

    award_categories = soup.find_all(
        'div', class_='paragraph--type--award-category')

    movie_name_to_id = {}
    movie_id_to_movie_data = {}
    # Seed "Best Picture" so that it is the first award and gets id 0.
    award_name_to_award_data = {
        "Best Picture": {
            'id': 0,
            'name': "Best Picture",
            'nominees': [],
        },
    }

    for category in award_categories:
        category_tag = category.find(
            'div', class_='field--name-field-award-category-oscars')
        if category_tag is None:
            # find() returns None when nothing matches; a category without a
            # name cannot be recorded, so skip it.
            continue
        category_name = clean_text(category_tag.get_text(strip=True))
        if category_name in awards_to_ignore:
            continue

        if category_name not in award_name_to_award_data:
            # Ids are handed out in order of first appearance.
            award_name_to_award_data[category_name] = {
                'id': len(award_name_to_award_data),
                'name': category_name,
                'nominees': [],
            }
        award_data = award_name_to_award_data[category_name]

        for nominee in category.find_all(
                'div', class_='paragraph--type--award-honoree'):
            entities_tag = nominee.find(
                'div', class_='field--name-field-award-entities')
            film_tag = nominee.find(
                'div', class_='field--name-field-award-film')
            if entities_tag is None or film_tag is None:
                continue
            nominee_tag = entities_tag.find('div', class_='field__item')
            if nominee_tag is None:
                continue

            nominee_name = clean_text(nominee_tag.get_text(strip=True))
            film_name = clean_text(film_tag.get_text(strip=True))

            if category_name in awards_to_flip:
                film_name, nominee_name = nominee_name, film_name

            film_id = movie_name_to_id.get(film_name)
            if film_id is None:
                # First time this film is seen: assign the next id.
                film_id = len(movie_name_to_id)
                movie_name_to_id[film_name] = film_id
                movie_id_to_movie_data[film_id] = {
                    'id': film_id,
                    'title': film_name,
                    'poster': '',
                }

            if category_name != "Best Picture":
                # For Best Picture only the film matters, not the nominee.
                # NOTE: a second nominee for the same film in the same
                # category overwrites the first one here.
                movie_id_to_movie_data[film_id][category_name] = nominee_name
            award_data['nominees'].append(film_id)

    movie_list = list(movie_id_to_movie_data.values())
    awards_list = list(award_name_to_award_data.values())
    return movie_list, awards_list


 # Parse the saved page and store the (movies, awards) pair as JSON.
 data = scrape_oscars()
 with open("oscars_2025.json", "w", encoding="utf-8") as f:
    f.write(json.dumps(data, indent=4, ensure_ascii=False))
 # Remind the user which parts of the output still need manual work.
 print(f"""Oscars data saved to oscars_2025.json
 You will have to manually add {awards_to_ignore}.
 Please also check the nominees for Best Picture.
      """)
	import requests
	from bs4 import BeautifulSoup
	import json
	import time


	def scrape_oscars(url):
	    """Download an Oscars ceremony page and extract nominees per category.

	    The raw HTML is also written to ``response.html`` in the current working
	    directory.

	    Args:
	        url: Address of the ceremony page to download.

	    Returns:
	        A list of ``{"category": str, "nominees": [{"nominee": str,
	        "movie": str}, ...]}`` dicts, one for every category in which at
	        least one nominee was found, or ``None`` when the HTTP request failed.
	    """
	    # Browser-like request headers.
	    # "Host" is deliberately not set: it is derived from ``url`` by the HTTP
	    # stack, so a hard-coded value would be wrong for any other host.
	    # "Accept-Encoding" is deliberately not set: requests then sends its own
	    # default, which only lists encodings it is able to decode. Advertising
	    # "br"/"zstd" without the optional decoders installed can leave
	    # ``response.text`` undecoded, which makes the parse below find nothing.
	    headers = {
	        # The catch-all media range must be spelled "*/*".
	        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
	        "Accept-Language": "en-US,en;q=0.9,hi;q=0.8",
	        "Cache-Control": "no-cache",
	        "Connection": "keep-alive",
	        "Cookie": "cookiebot-consent--necessary=1; CookieConsent={stamp:%27pAmVLYMcQ614PaM+h0x6sU/1TjRsiEUedF1AGTkWLCJPlBiZb/wk+Q==%27%2Cnecessary:true%2Cpreferences:true%2Cstatistics:true%2Cmarketing:true%2Cmethod:%27implied%27%2Cver:1%2Cutc:1738230798731%2Cregion:%27us-06%27}; cookiebot-consent--preferences=1; cookiebot-consent--statistics=1; cookiebot-consent--marketing=1",
	        "Pragma": "no-cache",
	        "Sec-Fetch-Dest": "document",
	        "Sec-Fetch-Mode": "navigate",
	        "Sec-Fetch-Site": "none",
	        "Sec-Fetch-User": "?1",
	        "Upgrade-Insecure-Requests": "1",
	        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
	        "sec-ch-ua": "\"Google Chrome\";v=\"131\", \"Chromium\";v=\"131\", \"Not_A Brand\";v=\"24\"",
	        "sec-ch-ua-mobile": "?0",
	        "sec-ch-ua-platform": "\"macOS\"",
	    }

	    try:
	        response = requests.get(url, headers=headers, timeout=5)
	        response.raise_for_status()
	    except requests.RequestException as e:
	        # Connection problems, timeouts and HTTP error statuses all end here.
	        print(f"Error fetching the page: {e}")
	        return None

	    # Save the response to a file so the page can be re-parsed offline.
	    with open("response.html", "w", encoding="utf-8") as f:
	        f.write(response.text)

	    soup = BeautifulSoup(response.text, "html.parser")

	    oscars_data = []
	    for category in soup.find_all("div", class_="view-grouping"):
	        category_name_tag = category.find("div", class_="view-grouping-header")
	        if not category_name_tag:
	            # A grouping without a header cannot be labelled; skip it.
	            continue
	        category_name = category_name_tag.get_text(strip=True)

	        nominees = []
	        for nominee in category.find_all("div", class_="views-row"):
	            nominee_name_tag = nominee.find("div", class_="views-field-title")
	            if not nominee_name_tag:
	                continue
	            movie_name_tag = nominee.find(
	                "div", class_="views-field-field-film-title")

	            nominees.append({
	                "nominee": nominee_name_tag.get_text(strip=True),
	                # The film title is optional; fall back to an empty string.
	                "movie": movie_name_tag.get_text(strip=True) if movie_name_tag else "",
	            })

	        # Categories without any usable nominee are left out of the result.
	        if nominees:
	            oscars_data.append(
	                {"category": category_name, "nominees": nominees})

	    return oscars_data


	# Download and parse the 2025 ceremony page.
	url = "https://www.oscars.org/oscars/ceremonies/2025"
	oscars_data = scrape_oscars(url)

	# Write the JSON file only when the scrape produced something: ``None``
	# signals a failed request and an empty list means nothing was found.
	if oscars_data:
	    with open("oscars_2025.json", "w", encoding="utf-8") as f:
	        json.dump(oscars_data, f, indent=4, ensure_ascii=False)
	    print("Oscars data saved to oscars_2025.json")

	print("Done!")
	from bs4 import BeautifulSoup
	import json


	def clean_text(text):
	    """Collapse each run of whitespace in ``text`` into a single space.

	    Leading and trailing whitespace is dropped as well. This repairs titles
	    that are broken across lines with padding in between, such as
	    "Instruments of a Beating" + newline + spaces + "Heart".
	    """
	    return ' '.join(text.split())


	# Some awards come out with nominee and film flipped, so they have to be
	# flipped back. Entries that are currently disabled:
	#   "Actor in a Leading Role", "Actress in a Leading Role",
	#   "Actor in a Supporting Role", "Actress in a Supporting Role"
	awards_to_flip = {
	    "International Feature Film",
	}

	# This award is too tough to scrape, so it is skipped here and must be
	# added by hand.
	awards_to_ignore = {
	    "Music (Original Song)",
	}

	# For best picture, I do not care about the nominee, just the film.
	# I want the first element of the awards list to be "Best Picture".
	# I do not want to add the nominee to the movie data.


	def scrape_oscars():
	    """Parse the saved ceremony page into movie and award records.

	    Reads ``response.html`` from the current working directory. Categories
	    listed in ``awards_to_ignore`` are skipped, and for categories listed in
	    ``awards_to_flip`` the scraped nominee and film names are swapped.
	    Categories or nominees whose expected elements are missing from the
	    markup are skipped instead of aborting the whole parse.

	    Returns:
	        A ``(movie_list, awards_list)`` tuple.

	        * ``movie_list`` holds one dict per film with ``'id'``, ``'title'``
	          and ``'poster'`` (always ``''``), plus one
	          ``category name -> nominee name`` entry for every category other
	          than "Best Picture" in which the film appears.
	        * ``awards_list`` holds one dict per award with ``'id'``, ``'name'``
	          and ``'nominees'`` (a list of film ids). "Best Picture" is always
	          the first entry, with id 0.

	    Raises:
	        OSError: If ``response.html`` cannot be opened.
	    """
	    with open("response.html", "r", encoding="utf-8") as f:
	        response = f.read()

	    soup = BeautifulSoup(response, "html.parser")

	    award_categories = soup.find_all(
	        'div', class_='paragraph--type--award-category')

	    movie_name_to_id = {}
	    movie_id_to_movie_data = {}
	    # Seed "Best Picture" so that it is the first award and gets id 0.
	    award_name_to_award_data = {
	        "Best Picture": {
	            'id': 0,
	            'name': "Best Picture",
	            'nominees': [],
	        },
	    }

	    for category in award_categories:
	        category_tag = category.find(
	            'div', class_='field--name-field-award-category-oscars')
	        if category_tag is None:
	            # find() returns None when nothing matches; a category without a
	            # name cannot be recorded, so skip it.
	            continue
	        category_name = clean_text(category_tag.get_text(strip=True))
	        if category_name in awards_to_ignore:
	            continue

	        if category_name not in award_name_to_award_data:
	            # Ids are handed out in order of first appearance.
	            award_name_to_award_data[category_name] = {
	                'id': len(award_name_to_award_data),
	                'name': category_name,
	                'nominees': [],
	            }
	        award_data = award_name_to_award_data[category_name]

	        for nominee in category.find_all(
	                'div', class_='paragraph--type--award-honoree'):
	            entities_tag = nominee.find(
	                'div', class_='field--name-field-award-entities')
	            film_tag = nominee.find(
	                'div', class_='field--name-field-award-film')
	            if entities_tag is None or film_tag is None:
	                continue
	            nominee_tag = entities_tag.find('div', class_='field__item')
	            if nominee_tag is None:
	                continue

	            nominee_name = clean_text(nominee_tag.get_text(strip=True))
	            film_name = clean_text(film_tag.get_text(strip=True))

	            if category_name in awards_to_flip:
	                film_name, nominee_name = nominee_name, film_name

	            film_id = movie_name_to_id.get(film_name)
	            if film_id is None:
	                # First time this film is seen: assign the next id.
	                film_id = len(movie_name_to_id)
	                movie_name_to_id[film_name] = film_id
	                movie_id_to_movie_data[film_id] = {
	                    'id': film_id,
	                    'title': film_name,
	                    'poster': '',
	                }

	            if category_name != "Best Picture":
	                # For Best Picture only the film matters, not the nominee.
	                # NOTE: a second nominee for the same film in the same
	                # category overwrites the first one here.
	                movie_id_to_movie_data[film_id][category_name] = nominee_name
	            award_data['nominees'].append(film_id)

	    movie_list = list(movie_id_to_movie_data.values())
	    awards_list = list(award_name_to_award_data.values())
	    return movie_list, awards_list


	# Parse the saved page and store the (movies, awards) pair as JSON.
	data = scrape_oscars()
	with open("oscars_2025.json", "w", encoding="utf-8") as f:
	    json.dump(data, f, indent=4, ensure_ascii=False)
	# Remind the user which parts of the output still need manual work.
	print(f"""Oscars data saved to oscars_2025.json
	You will have to manually add {awards_to_ignore}.
	Please also check the nominees for Best Picture.
	""")