Skip to content

Instantly share code, notes, and snippets.

@preraku
Created January 31, 2025 07:28
Show Gist options
  • Save preraku/7c2c92cad0a084bc7e8f7b42ebd40851 to your computer and use it in GitHub Desktop.
Scrape 2025 Oscars nominees from https://www.oscars.org/oscars/ceremonies/2025
import requests
from bs4 import BeautifulSoup
import json
import time
def scrape_oscars(url):
    """Fetch the Oscars ceremony page at *url* and extract nominee data.

    As a side effect, the raw HTML is saved to response.html so it can be
    re-parsed offline later. Returns a list of dicts shaped like
    [{"category": ..., "nominees": [{"nominee": ..., "movie": ...}, ...]}]
    or None when the HTTP request fails.
    """
    # Browser-like headers (including a pre-accepted consent cookie) so the
    # site serves the real page instead of a bot/consent wall.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Encoding": "gzip, deflate, br, zstd",
        "Accept-Language": "en-US,en;q=0.9,hi;q=0.8",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Cookie": "cookiebot-consent--necessary=1; CookieConsent={stamp:%27pAmVLYMcQ614PaM+h0x6sU/1TjRsiEUedF1AGTkWLCJPlBiZb/wk+Q==%27%2Cnecessary:true%2Cpreferences:true%2Cstatistics:true%2Cmarketing:true%2Cmethod:%27implied%27%2Cver:1%2Cutc:1738230798731%2Cregion:%27us-06%27}; cookiebot-consent--preferences=1; cookiebot-consent--statistics=1; cookiebot-consent--marketing=1",
        "Host": "www.oscars.org",
        "Pragma": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        "sec-ch-ua": "\"Google Chrome\";v=\"131\", \"Chromium\";v=\"131\", \"Not_A Brand\";v=\"24\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"macOS\""
    }
    try:
        resp = requests.get(url, headers=headers, timeout=5)
        resp.raise_for_status()
    except requests.RequestException as exc:
        print(f"Error fetching the page: {exc}")
        return None
    page_html = resp.text
    # Keep a local copy of the raw HTML for debugging / offline re-parsing.
    with open("response.html", "w", encoding="utf-8") as out:
        out.write(page_html)
    soup = BeautifulSoup(page_html, "html.parser")
    results = []
    # Each "view-grouping" div is one award category.
    for group in soup.find_all("div", class_="view-grouping"):
        header = group.find("div", class_="view-grouping-header")
        if header is None:
            continue
        entries = []
        for row in group.find_all("div", class_="views-row"):
            name_tag = row.find("div", class_="views-field-title")
            if name_tag is None:
                # Rows without a title are not nominees; skip them.
                continue
            film_tag = row.find("div", class_="views-field-field-film-title")
            entries.append({
                "nominee": name_tag.get_text(strip=True),
                "movie": film_tag.get_text(strip=True) if film_tag else "",
            })
        if entries:
            results.append({
                "category": header.get_text(strip=True),
                "nominees": entries,
            })
    return results
# Driver: scrape the 2025 ceremony page and persist the result as JSON.
url = "https://www.oscars.org/oscars/ceremonies/2025"
oscars_data = scrape_oscars(url)
# Only write the file when the scrape actually produced data.
if oscars_data:
    with open("oscars_2025.json", "w", encoding="utf-8") as out:
        json.dump(oscars_data, out, indent=4, ensure_ascii=False)
    print("Oscars data saved to oscars_2025.json")
print("Done!")
from bs4 import BeautifulSoup
import json
def clean_text(text):
    """Collapse every run of whitespace (including newlines) into one space.

    Fixes scraped text such as "Instruments of a Beating\n   Heart".
    """
    words = text.split()
    return " ".join(words)
# Some awards list nominee and film in swapped order on the page, so we
# have to flip them back while scraping.
awards_to_flip = {
    # "Actor in a Leading Role",
    # "Actress in a Leading Role",
    # "Actor in a Supporting Role",
    # "Actress in a Supporting Role",
    "International Feature Film",
}

# The following award is too tough to scrape, so it must be hand done.
awards_to_ignore = {
    "Music (Original Song)",
}

# For Best Picture, I do not care about the nominee, just the film.
# I want the first element of the awards list to be "Best Picture".
# I do not want to add the nominee to the movie data.
def scrape_oscars():
    """Parse the previously saved response.html into normalized award data.

    Returns a tuple (movie_list, awards_list):
      - movie_list: dicts with 'id', 'title', 'poster' (empty, filled later),
        plus one key per award category naming that film's nominee.
      - awards_list: dicts with 'id', 'name', and 'nominees' (a list of movie
        ids). "Best Picture" is seeded first so it is always element 0.

    Categories in `awards_to_ignore` are skipped; categories in
    `awards_to_flip` have their nominee/film text swapped back.
    """
    with open("response.html", "r", encoding="utf-8") as f:
        response = f.read()
    soup = BeautifulSoup(response, "html.parser")
    award_categories = soup.find_all(
        'div', class_='paragraph--type--award-category')
    movie_name_to_id = {}
    award_name_to_award_data = {}
    movie_id_to_movie_data = {}
    # Seed Best Picture so it is always the first award in the output.
    award_name_to_award_data["Best Picture"] = {
        'id': 0,
        'name': "Best Picture",
        'nominees': [],
    }
    for category in award_categories:
        category_name = clean_text(category.find(
            'div', class_='field--name-field-award-category-oscars').get_text(strip=True))
        if category_name in awards_to_ignore:
            continue
        if category_name not in award_name_to_award_data:
            # Ids follow insertion order (Best Picture already holds id 0).
            award_name_to_award_data[category_name] = {
                'id': len(award_name_to_award_data),
                'name': category_name,
                'nominees': [],
            }
        nominees = category.find_all(
            'div', class_='paragraph--type--award-honoree')
        for nominee in nominees:
            nominee_div = nominee.find(
                'div', class_='field--name-field-award-entities').find('div', class_='field__item')
            nominee_name = clean_text(nominee_div.get_text(strip=True))
            film_name = clean_text(nominee.find_all('div', class_='field--name-field-award-film')[
                0].get_text(strip=True))
            # Some categories present film/nominee in the opposite order.
            if category_name in awards_to_flip:
                film_name, nominee_name = nominee_name, film_name
            # Register the film on first sight; ids follow insertion order.
            if film_name not in movie_name_to_id:
                movie_name_to_id[film_name] = len(movie_name_to_id)
                film_id = movie_name_to_id[film_name]
                movie_id_to_movie_data[film_id] = {
                    'id': film_id,
                    'title': film_name,
                    'poster': '',
                }
            else:
                film_id = movie_name_to_id[film_name]
            # Record the nominee on the movie, except for Best Picture where
            # only the film itself matters.
            if category_name != "Best Picture":
                movie_id_to_movie_data[film_id][category_name] = nominee_name
            award_name_to_award_data[category_name]['nominees'].append(film_id)
    # dicts preserve insertion order, so these lists come out id-ordered.
    return list(movie_id_to_movie_data.values()), list(award_name_to_award_data.values())
# Driver: rebuild the (movies, awards) data from response.html and save it.
data = scrape_oscars()
with open("oscars_2025.json", "w", encoding="utf-8") as out:
    json.dump(data, out, indent=4, ensure_ascii=False)
# Remind the operator about the categories that need manual work.
print(f"""Oscars data saved to oscars_2025.json
You will have to manually add {awards_to_ignore}.
Please also check the nominees for Best Picture.
""")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment