Skip to content

Instantly share code, notes, and snippets.

@BrunoMoreno
Created July 23, 2021 13:28
Show Gist options
  • Save BrunoMoreno/3af5af53acc8a49c8b1b4f19c6460964 to your computer and use it in GitHub Desktop.
Save BrunoMoreno/3af5af53acc8a49c8b1b4f19c6460964 to your computer and use it in GitHub Desktop.
Scraper for IMDb movies
import requests
from bs4 import BeautifulSoup
import pandas as pd
import csv
# Listing page to scrape: IMDb title search restricted to the "top_1000" group.
url = "https://www.imdb.com/search/title/?groups=top_1000"

# requests never times out by default, so give the call an explicit limit
# instead of letting the script hang on a stalled connection.
results = requests.get(url, timeout=30)
# Fail loudly on 4xx/5xx responses rather than parsing an error page and
# ending up with an empty table.
results.raise_for_status()
soup = BeautifulSoup(results.text, "html.parser")

# One accumulator list per output column; the scraping loop below appends to
# each of them once per movie, which keeps them index-aligned.
titles = []
years = []
time = []  # runtimes; NOTE: this name would clash with the stdlib `time` module if it were imported
imdb_ratings = []
genre = []
votes = []

# Every <div> whose class attribute is exactly "lister-item mode-advanced";
# presumably one per movie on the result page.
movie_div = soup.find_all("div", class_="lister-item mode-advanced")
# Walk every result card and append one value to each column list. The body
# must be indented under the `for`; at column 0 the file does not even parse.
for movie_section in movie_div:
    header = movie_section.h3

    # Title: text of the link inside the card's <h3>.
    title = header.a.text
    titles.append(title)

    # Release year: raw text of the "lister-item-year" span (cleaned later).
    year_text = header.find("span", class_="lister-item-year").text
    years.append(year_text)

    # Rating: text of the first <strong> element in the card.
    rating_text = movie_section.strong.text
    imdb_ratings.append(rating_text)

    # Genre: text of the "genre" span with surrounding whitespace removed.
    genre_text = movie_section.find("span", class_="genre").text.strip()
    genre.append(genre_text)

    # Runtime: raw text of the "runtime" span.
    runtime_text = movie_section.find("span", class_="runtime").text
    time.append(runtime_text)

    # Votes: the first <span name="nv"> in the card holds the vote count.
    nv_spans = movie_section.find_all("span", attrs={"name": "nv"})
    vote_text = nv_spans[0].text
    votes.append(vote_text)
# Assemble the scraped column lists into a table. Column labels and their
# order are paired positionally with the lists collected above.
column_labels = ("Movie", "Year", "RunTime", "imdb", "Genre", "votes")
column_values = (titles, years, time, imdb_ratings, genre, votes)
movies = pd.DataFrame(dict(zip(column_labels, column_values)))
# cleaning
# Keep only the first run of digits from each "Year" string and store it as
# an integer. expand=False makes extract() return a Series, not a one-column
# DataFrame, so the assignment is a plain column-to-column one.
movies["Year"] = movies["Year"].str.extract(r"(\d+)", expand=False).astype(int)

# Both replacements are literal text substitutions. The default of `regex`
# in Series.str.replace differs between pandas versions, so state it
# explicitly to get the same result (and no FutureWarning) everywhere.
movies["RunTime"] = movies["RunTime"].str.replace("min", "minutes", regex=False)
# Strip the thousands separators so the vote counts can be cast to int.
movies["votes"] = movies["votes"].str.replace(",", "", regex=False).astype(int)

# Show the cleaned table and write it to movies.csv in the working
# directory, with a header row and without the index column.
print(movies)
movies.to_csv("movies.csv", index=False, header=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment