Created
January 31, 2025 07:28
-
-
Save preraku/7c2c92cad0a084bc7e8f7b42ebd40851 to your computer and use it in GitHub Desktop.
Scrape 2025 Oscars nominees from https://www.oscars.org/oscars/ceremonies/2025
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
from bs4 import BeautifulSoup | |
import json | |
import time | |
def scrape_oscars(url, save_path="response.html"):
    """Download an Oscars ceremony page and extract its nominees by category.

    The raw HTML is also written to ``save_path`` so the page can be inspected
    or re-parsed later without another request.

    Args:
        url: Address of the ceremony page to fetch.
        save_path: File that receives the raw HTML. Pass ``None`` to skip
            saving.

    Returns:
        A list of ``{"category": str, "nominees": [{"nominee": str,
        "movie": str}, ...]}`` dicts, one per category that has at least one
        nominee, or ``None`` when the request fails.
    """
    # Browser-like headers. Two headers are intentionally NOT set by hand:
    #   * "Host" - requests derives it from `url`; a fixed value is wrong for
    #     any other host (including cross-host redirects).
    #   * "Accept-Encoding" - requests advertises only the encodings it can
    #     decode in this environment; promising "br"/"zstd" without the
    #     optional decoders installed yields an undecoded body.
    headers = {
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Language": "en-US,en;q=0.9,hi;q=0.8",
        "Cache-Control": "no-cache",
        "Connection": "keep-alive",
        "Cookie": "cookiebot-consent--necessary=1; CookieConsent={stamp:%27pAmVLYMcQ614PaM+h0x6sU/1TjRsiEUedF1AGTkWLCJPlBiZb/wk+Q==%27%2Cnecessary:true%2Cpreferences:true%2Cstatistics:true%2Cmarketing:true%2Cmethod:%27implied%27%2Cver:1%2Cutc:1738230798731%2Cregion:%27us-06%27}; cookiebot-consent--preferences=1; cookiebot-consent--statistics=1; cookiebot-consent--marketing=1",
        "Pragma": "no-cache",
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "none",
        "Sec-Fetch-User": "?1",
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36",
        "sec-ch-ua": "\"Google Chrome\";v=\"131\", \"Chromium\";v=\"131\", \"Not_A Brand\";v=\"24\"",
        "sec-ch-ua-mobile": "?0",
        "sec-ch-ua-platform": "\"macOS\"",
    }

    try:
        response = requests.get(url, headers=headers, timeout=5)
        response.raise_for_status()
    except requests.RequestException as e:
        print(f"Error fetching the page: {e}")
        return None

    # Save the response to a file
    if save_path:
        with open(save_path, "w", encoding="utf-8") as f:
            f.write(response.text)

    soup = BeautifulSoup(response.text, "html.parser")

    oscars_data = []
    for category in soup.find_all("div", class_="view-grouping"):
        category_name_tag = category.find("div", class_="view-grouping-header")
        if category_name_tag is None:
            # A grouping without a header cannot be labelled; skip it.
            continue
        category_name = category_name_tag.get_text(strip=True)

        nominees = []
        for nominee in category.find_all("div", class_="views-row"):
            nominee_name_tag = nominee.find("div", class_="views-field-title")
            if nominee_name_tag is None:
                continue
            movie_name_tag = nominee.find(
                "div", class_="views-field-field-film-title")
            # The film title is optional; fall back to an empty string.
            movie_name = (
                movie_name_tag.get_text(strip=True)
                if movie_name_tag is not None
                else ""
            )
            nominees.append({
                "nominee": nominee_name_tag.get_text(strip=True),
                "movie": movie_name,
            })

        # Categories that yielded no nominees are left out entirely.
        if nominees:
            oscars_data.append(
                {"category": category_name, "nominees": nominees})

    return oscars_data
# Script entry: fetch the 2025 ceremony page and dump the result as JSON.
url = "https://www.oscars.org/oscars/ceremonies/2025"
oscars_data = scrape_oscars(url)

# Nothing is written when the scrape failed (None) or produced no categories.
if oscars_data:
    with open("oscars_2025.json", mode="w", encoding="utf-8") as f:
        json.dump(oscars_data, f, indent=4, ensure_ascii=False)
    print("Oscars data saved to oscars_2025.json")

print("Done!")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from bs4 import BeautifulSoup | |
import json | |
def clean_text(text):
    """Return *text* with every run of whitespace collapsed to one space.

    Leading and trailing whitespace is removed too. This repairs titles that
    were wrapped across lines in the HTML, such as "Instruments of a Beating"
    followed by a newline, indentation and "Heart".
    """
    words = text.split()
    return " ".join(words)
# Specific awards have the nominee and film fields flipped on the page, so
# they have to be flipped back after scraping.
# The acting categories below are currently disabled (not flipped); they are
# kept here, commented out, for reference.
awards_to_flip = {
    # "Actor in a Leading Role",
    # "Actress in a Leading Role",
    # "Actor in a Supporting Role",
    # "Actress in a Supporting Role",
    "International Feature Film",
}

# The following award is too tough to scrape, so it must be done by hand.
# Music (Original Song)
awards_to_ignore = {
    "Music (Original Song)",
}
# For best picture, I do not care about the nominee, just the film. | |
# I want the first element of the awards list to be "Best Picture". | |
# I do not want to add the nominee to the movie data. | |
def scrape_oscars(html_path="response.html"):
    """Parse a saved Oscars ceremony page into movie and award records.

    Categories listed in ``awards_to_ignore`` are skipped. For categories in
    ``awards_to_flip`` the scraped nominee and film values are swapped.
    Categories or honorees missing an expected element are skipped rather
    than aborting the whole run.

    Args:
        html_path: Path of the saved HTML page to parse.

    Returns:
        A ``(movie_list, awards_list)`` tuple.

        * Each movie is a dict with ``'id'``, ``'title'``, ``'poster'``
          (always ``''`` here) plus one ``category name -> nominee name``
          entry for every category it appears in, except "Best Picture".
        * Each award is a dict with ``'id'``, ``'name'`` and ``'nominees'``
          (a list of movie ids). "Best Picture" is always first with id 0,
          even when the page yields no nominees for it.
    """
    with open(html_path, "r", encoding="utf-8") as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    movie_name_to_id = {}
    movie_id_to_movie_data = {}
    # Seed Best Picture so it is guaranteed to be the first award (id 0).
    award_name_to_award_data = {
        "Best Picture": {
            'id': 0,
            'name': "Best Picture",
            'nominees': [],
        },
    }

    award_categories = soup.find_all(
        'div', class_='paragraph--type--award-category')
    for category in award_categories:
        category_name_div = category.find(
            'div', class_='field--name-field-award-category-oscars')
        if category_name_div is None:
            # No heading means the category cannot be named; skip it.
            continue
        category_name = clean_text(category_name_div.get_text(strip=True))

        if category_name in awards_to_ignore:
            continue

        if category_name not in award_name_to_award_data:
            award_name_to_award_data[category_name] = {
                'id': len(award_name_to_award_data),
                'name': category_name,
                'nominees': [],
            }
        award_nominees = award_name_to_award_data[category_name]['nominees']

        for nominee in category.find_all(
                'div', class_='paragraph--type--award-honoree'):
            entities_div = nominee.find(
                'div', class_='field--name-field-award-entities')
            nominee_div = (
                entities_div.find('div', class_='field__item')
                if entities_div is not None
                else None
            )
            film_div = nominee.find(
                'div', class_='field--name-field-award-film')
            if nominee_div is None or film_div is None:
                # Incomplete honoree markup; skip this entry only.
                continue

            nominee_name = clean_text(nominee_div.get_text(strip=True))
            film_name = clean_text(film_div.get_text(strip=True))

            if category_name in awards_to_flip:
                film_name, nominee_name = nominee_name, film_name

            # Films are numbered in order of first appearance.
            if film_name not in movie_name_to_id:
                film_id = len(movie_name_to_id)
                movie_name_to_id[film_name] = film_id
                movie_id_to_movie_data[film_id] = {
                    'id': film_id,
                    'title': film_name,
                    'poster': '',
                }
            else:
                film_id = movie_name_to_id[film_name]

            # For Best Picture only the film matters, not the nominee.
            # NOTE(review): if one film has several honorees in the same
            # category, the last nominee name overwrites the earlier ones and
            # the film id is appended once per honoree - confirm this is
            # intended.
            if category_name != "Best Picture":
                movie_id_to_movie_data[film_id][category_name] = nominee_name

            award_nominees.append(film_id)

    movie_list = list(movie_id_to_movie_data.values())
    awards_list = list(award_name_to_award_data.values())
    return movie_list, awards_list
# Script entry: parse the saved page and write (movies, awards) as JSON.
data = scrape_oscars()

with open("oscars_2025.json", mode="w", encoding="utf-8") as f:
    json.dump(data, f, indent=4, ensure_ascii=False)

# Remind the user about the parts that still need manual attention.
print(f"""Oscars data saved to oscars_2025.json
You will have to manually add {awards_to_ignore}.
Please also check the nominees for Best Picture.
""")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment