-
-
Save rafnixg/32a72280fe199671dbc9f2c5ecd492ed to your computer and use it in GitHub Desktop.
Una clase del BC de Ciencia de datos.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Scrapper IMDB Calendar Mexico""" | |
import csv
import json

import requests
from bs4 import BeautifulSoup

# IMDB calendar page, restricted to the Mexico region (region=MX).
URL = "https://www.imdb.com/calendar/?region=MX"

# Workflow of this script:
# 1. Obtain the HTML markup.
#    - If the HTML file does not exist locally, create it.
#    - If the HTML file exists locally, read its content.
# 2. Extract the information:
#    - Name
#    - Categories
#    - Cast
# 3. Generate a CSV file.
def get_imdb_content(timeout=10):
    """Get the content of the IMDB calendar page

    Keyword Arguments:
        timeout {float} -- Seconds to wait for the server before giving up (default: {10})

    Returns:
        string -- The content of the IMDB calendar page
        None -- If the request was not successful (network error, timeout
                or any status code other than 200)
    """
    # Browser-like User-Agent; presumably IMDB rejects the library default -- not verified here.
    headers = {"User-Agent": "Mozilla/5.0"}
    try:
        # Without an explicit timeout a stalled connection would block forever.
        response = requests.get(URL, headers=headers, timeout=timeout)
    except requests.RequestException:
        # Connection failures and timeouts are reported like any other failed request.
        return None
    if response.status_code == 200:
        return response.text
    return None
def create_imdb_file_local(content):
    """Create a local file (imdb.html) with the content of the IMDB calendar page

    Saving is best-effort: a write failure is printed and otherwise ignored,
    and nothing is written when there is no text to save.

    Arguments:
        content {string} -- The content of the IMDB calendar page
    """
    if not isinstance(content, str):
        # Opening the file first would truncate it and leave an empty imdb.html behind.
        return
    try:
        with open("imdb.html", "w", encoding="UTF-8") as file:
            file.write(content)
    except OSError as error:
        # Message kept in Spanish to match the script's other user-facing output.
        print("No se pudo guardar imdb.html:", error)
def get_imdb_file_local():
    """Get the content of the local file (imdb.html) holding the IMDB calendar page

    Returns:
        string -- The content of the local file with the content of the IMDB calendar page
        None -- If the file does not exist or cannot be read as UTF-8 text
    """
    try:
        with open("imdb.html", "r", encoding="UTF-8") as file:
            return file.read()
    except (OSError, UnicodeDecodeError):
        # A missing or unreadable cache file is an expected situation, not a failure.
        return None
def get_local_imdb_content():
    """Get the content of the IMDB calendar page from the local file or from the IMDB page

    The local file is tried first; the page is downloaded only when the file
    yields no content, and a successful download is saved for the next run.

    Returns:
        string -- The content of the IMDB calendar page
        None -- If neither the local file nor the download produced content
    """
    content = get_imdb_file_local()
    if content:
        return content
    content = get_imdb_content()
    if content:
        # Only cache real content; a failed download must not be written to disk.
        create_imdb_file_local(content)
    return content
def create_movie(tag):
    """Create a movie object from a tag of the IMDB calendar page

    Arguments:
        tag {bs4.element.Tag} -- A tag of the IMDB calendar page

    Returns:
        tuple -- A tuple with the name, categories and cast of the movie.
                 Categories and cast are lists of strings; each is empty
                 when its list is not present in the tag.

    Raises:
        AttributeError -- If the summary container or the title link is missing
    """
    main_div = tag.find("div", {"class": "ipc-metadata-list-summary-item__c"})
    # The title is the text of the link inside the container's first div.
    name = main_div.div.a.text
    # The two lists are told apart only by the "__tl" / "__stl" class.
    ul_categories = main_div.find(
        "ul",
        {
            "class": "ipc-inline-list ipc-inline-list--show-dividers "
            "ipc-inline-list--no-wrap ipc-inline-list--inline "
            "ipc-metadata-list-summary-item__tl base"
        },
    )
    ul_cast = main_div.find(
        "ul",
        {
            "class": "ipc-inline-list ipc-inline-list--show-dividers "
            "ipc-inline-list--no-wrap ipc-inline-list--inline "
            "ipc-metadata-list-summary-item__stl base"
        },
    )
    # Either list may be absent; treat a missing list as empty instead of failing.
    categories = (
        [item.span.text for item in ul_categories.find_all("li")]
        if ul_categories is not None
        else []
    )
    cast = (
        [member.span.text for member in ul_cast.find_all("li")]
        if ul_cast is not None
        else []
    )
    return (name, categories, cast)
def parse_content_html(content):
    """Parse the content of the IMDB calendar page

    Arguments:
        content {string} -- The content of the IMDB calendar page

    Returns:
        list -- A list with the tags of the movies
    """
    soup = BeautifulSoup(content, "html.parser")
    # A single class name matches any element that carries it, so the extra
    # generated-looking classes on these items (e.g. "sc-...") need not be
    # spelled out and cannot break the match when they change.
    return soup.find_all(
        "li",
        {
            "data-testid": "coming-soon-entry",
            "class": "ipc-metadata-list-summary-item",
        },
    )
def create_csv_movies_file(movies):
    """Create a CSV file (movies.csv) with the movies

    Each row holds the name, the categories joined by commas and the cast
    joined by commas.

    Arguments:
        movies {list} -- A list with the movies, as (name, categories, cast) tuples
    """
    # newline="" leaves line endings to the csv writer, as the csv module requires;
    # otherwise platforms that translate newlines get an extra "\r" per row.
    with open("movies.csv", "w", encoding="UTF-8", newline="") as file:
        writer = csv.writer(file, delimiter=",")
        writer.writerow(["name", "categories", "cast"])
        writer.writerows(
            (movie[0], ",".join(movie[1]), ",".join(movie[2])) for movie in movies
        )
def create_json_movies_file(movies):
    """Create a JSON file (movies.json) with the movies

    Arguments:
        movies {list} -- A list with the movies, as (name, categories, cast) tuples
    """
    dict_movies = [
        {"name": movie[0], "categories": movie[1], "cast": movie[2]}
        for movie in movies
    ]
    with open("movies.json", "w", encoding="utf-8") as file:
        # The file is UTF-8, so keep accented characters readable instead of \u escapes.
        json.dump(dict_movies, file, indent=4, ensure_ascii=False)
def get_movies_from_tags(li_tags):
    """Get the movies from the tags of the IMDB calendar page

    Arguments:
        li_tags {list} -- A list with the tags of the movies

    Returns:
        list -- A list with the movies, one create_movie() result per tag, in order
    """
    return [create_movie(tag) for tag in li_tags]
def main():
    """Main function

    Loads the calendar page, extracts the movies and writes them to the CSV
    and JSON files. Prints a message and stops when no content is available.
    """
    content = get_local_imdb_content()
    if not content:
        print("No se pudo obtener el contenido de IMDB")
        return None
    li_tags = parse_content_html(content)
    movies = get_movies_from_tags(li_tags)
    create_csv_movies_file(movies)
    create_json_movies_file(movies)
    return None
# Run the scraper only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment