Web scraper for WildStar addons from CurseForge. Requirements: https://gist.github.com/Adrian-Turjak/631241e503b8cf4c814a3d5ca2ce8d5e
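Usage is a single command; a sketch follows (the script filename is arbitrary and not part of this gist, but the --start-page flag and the output directories come straight from the code below):

# Usage sketch; replace the filename with whatever you saved this gist as.
#   python wildstar_curse_scraper.py                 # scrape every addon index page
#   python wildstar_curse_scraper.py --start-page 5  # resume from index page 5
# Output: ./wildstar_addons/addon_metadata/ (one folder per addon) and
#         ./wildstar_addons/addons/ (the downloaded addon archives).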
import argparse
import hashlib
import os

import cloudscraper
import yaml
from bs4 import BeautifulSoup
from markdownify import markdownify as md
from requests import exceptions

CURSE_ROOT_URL = "https://www.curseforge.com"
ADDONS_ROOT_URL = f"{CURSE_ROOT_URL}/wildstar/ws-addons"
ROOT_DIR = "wildstar_addons"
METADATA_DIR = f"{ROOT_DIR}/addon_metadata"
FILES_DIR = f"{ROOT_DIR}/addons"

class YamlDumper(yaml.Dumper):
    """Custom dumper to deal with a weird list indentation issue.

    Remove when https://github.com/yaml/pyyaml/issues/234 is solved.
    """

    def increase_indent(self, flow=False, indentless=False):
        return super(YamlDumper, self).increase_indent(flow, False)

def get_page_range(base_page, start_page=None):
    """Work out the range of addon index pages from the pagination links."""
    page_numbers = set()
    pagination_urls = base_page.find_all(class_="pagination-item")
    for page_url in pagination_urls:
        try:
            if page_url.name == "span":
                page_numbers.add(int(page_url.text))
            else:
                page_numbers.add(int(page_url.find("span").text))
        except ValueError:
            # Ignore the '...'
            pass
    page_numbers = sorted(page_numbers)
    # NOTE: End is +1 as we do want to include the last page.
    if start_page:
        return range(start_page, page_numbers[-1] + 1)
    return range(page_numbers[0], page_numbers[-1] + 1)

def get_addons_on_index_page(scraper, page_number):
    """Return detail url and short description for each addon on an index page."""
    page_url = f"{ADDONS_ROOT_URL}?page={page_number}"
    page_html = scraper.get(page_url, timeout=10)
    addon_index_page = BeautifulSoup(page_html.content, "html.parser")
    addon_listings = addon_index_page.find_all(class_="project-listing-row")
    addon_dicts = []
    for listing in addon_listings:
        mod_page_url = listing.find_all("a")[0].attrs["href"]
        short_description = listing.find_all(class_="leading-snug")[0].text
        addon_dicts.append(
            {
                "detail_url": f"{CURSE_ROOT_URL}{mod_page_url}",
                "short_description": short_description.strip(),
            }
        )
    return addon_dicts

def process_addon(scraper, addon_dictionary):
    """Scrape one addon's pages and write its metadata and files to disk."""
    addon_detail_url = addon_dictionary["detail_url"]
    addon_metadata = {
        "id": addon_detail_url.split("/")[-1],
        "short_description": addon_dictionary["short_description"],
        "popularity_rank": addon_dictionary["popularity_rank"],
    }
    print(f"Processing Addon: {addon_metadata['id']}")
    addon_metadata_dir = f"{METADATA_DIR}/{addon_metadata['id']}"
    if not os.path.exists(addon_metadata_dir):
        os.makedirs(addon_metadata_dir)
    page_html = scraper.get(addon_detail_url, timeout=10)
    addon_page = BeautifulSoup(page_html.content, "html.parser")
    addon_metadata.update(
        get_addon_header_details(scraper, addon_page, addon_metadata_dir)
    )
    addon_metadata.update(
        get_addon_side_panel_details(scraper, addon_page, addon_metadata_dir)
    )
    addon_metadata.update(
        get_addon_description(scraper, addon_page, addon_metadata_dir)
    )
    addon_metadata.update(get_addon_source_links(addon_page))
    addon_metadata.update(
        get_addon_file_and_details(scraper, f"{addon_detail_url}/files")
    )
    addon_metadata.update(
        get_addon_images(scraper, f"{addon_detail_url}/screenshots", addon_metadata_dir)
    )
    with open(f"{addon_metadata_dir}/info.yaml", "w") as file:
        yaml.dump(addon_metadata, file, Dumper=YamlDumper, default_flow_style=False)

def get_addon_header_details(scraper, addon_page, addon_metadata_dir):
    """Pull name, downloads, last updated, and API version from the page header."""
    addon_metadata = {}
    header = addon_page.find_all(class_="game-header")[0]
    try:
        icon_src = header.find_all("img")[0].attrs["src"]
        # NOTE: Let's not assume they are all png, and preserve the extension:
        icon_ext = icon_src.split(".")[-1]
        icon_file_name = f"{addon_metadata_dir}/icon.{icon_ext}"
        response = scraper.get(icon_src, stream=True, timeout=10)
        if response.status_code == 200:
            with open(icon_file_name, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
    except IndexError:
        # No icon on the page; skip it.
        pass
    addon_metadata["name"] = header.find_all("h2")[0].text.strip()
    total_downloads = header.find_all("span")[0].text
    addon_metadata["total_downloads"] = int(total_downloads.split()[0].replace(",", ""))
    last_updated = header.find_all("span")[1].find_all("abbr")[0]
    addon_metadata["last_updated"] = last_updated.attrs["title"]
    addon_metadata["last_updated_epoch"] = last_updated.attrs["data-epoch"]
    api_version = header.find_all("span")[2].text.split(": ")[-1]
    # NOTE: let's standardise the api version format:
    addon_metadata["api_version"] = api_version.replace(" ", "").replace("API", "API_")
    return addon_metadata

def get_addon_side_panel_details(scraper, addon_page, addon_metadata_dir):
    """Pull created date, license, categories, and contributors from the side panel."""
    addon_metadata = {}
    side_panel = addon_page.find_all("aside")[0]
    created = side_panel.find_all("span")[4].find_all("abbr")[0]
    addon_metadata["created"] = created.attrs["title"]
    addon_metadata["created_epoch"] = created.attrs["data-epoch"]
    addon_metadata["license_short"] = side_panel.find_all("a")[0].text.strip()
    license_ref = side_panel.find_all("a")[0].attrs["href"]
    response = scraper.get(f"{CURSE_ROOT_URL}{license_ref}", timeout=10)
    if response.status_code == 200:
        license_file_name = f"{addon_metadata_dir}/LICENSE.md"
        with open(license_file_name, "w") as f:
            f.write(md(response.content))
    categories = []
    category_links = side_panel.find_all(class_="pb-4")[1].find_all("a")
    for category in category_links:
        categories.append(
            {
                "id": category.attrs["href"].split("/")[-1],
                "name": category.find_all("figure")[0].attrs["title"],
            }
        )
    addon_metadata["categories"] = categories
    contributors = []
    contributor_links = side_panel.find_all(class_="pb-4")[2].find_all(class_="mb-2")
    for contributor in contributor_links:
        contributors.append(
            {
                "name": contributor.find_all("span")[0].text.strip(),
                "role": contributor.find_all(class_="text-xs")[0].text.strip(),
            }
        )
    addon_metadata["contributors"] = contributors
    return addon_metadata

def get_addon_description(scraper, addon_page, addon_metadata_dir):
    """Save the description as markdown, download its images, and note any git links."""
    addon_metadata = {}
    images_urls = {}
    possible_source_links = []
    description = addon_page.find_all(class_="project-detail__content")[0]
    images = description.find_all("img")
    description_images_dir = f"{addon_metadata_dir}/desc_images"
    if images:
        if not os.path.exists(description_images_dir):
            os.makedirs(description_images_dir)
    for i, image in enumerate(images):
        img_src = image.attrs["src"]
        # NOTE: Let's not assume they are all png, and preserve the extension:
        ext = img_src.split(".")[-1]
        # NOTE: but if there is an extension that is stupid... skip it.
        if len(ext) > 5:
            continue
        relative_file_name = f"desc_images/image_{i}.{ext}"
        file_name = f"{addon_metadata_dir}/{relative_file_name}"
        try:
            response = scraper.get(img_src, stream=True, timeout=10)
            if (
                response.status_code == 200
                and "text" not in response.headers["Content-Type"]
            ):
                with open(file_name, "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                images_urls[img_src] = relative_file_name
        except (exceptions.Timeout, exceptions.InvalidSchema, exceptions.SSLError):
            # NOTE: we don't care too much about these images
            pass
    if images and not images_urls:
        os.rmdir(description_images_dir)
    md_description = md(description.encode())
    # Rewrite image urls in the markdown to point at the local copies.
    for old, new in images_urls.items():
        md_description = md_description.replace(old, new)
    description_file_name = f"{addon_metadata_dir}/description.md"
    with open(description_file_name, "w") as f:
        f.write(md_description)
    links = description.find_all("a")
    for link in links:
        if "git" in link.attrs["href"]:
            possible_source_links.append(link.attrs["href"])
    if possible_source_links:
        addon_metadata["possible_source_links"] = possible_source_links
    return addon_metadata

def get_addon_source_links(addon_page):
    """Grab the issues and source links from the addon's nav bar, if present."""
    addon_metadata = {}
    nav = addon_page.find_all("nav")[1]
    issues = nav.find(id="nav-issues-svg-class-icon-icon-offsite-nav-viewbox-0-0")
    if issues:
        addon_metadata["issues"] = issues.find_all("a")[0].attrs["href"]
    source = nav.find(id="nav-source-svg-class-icon-icon-offsite-nav-viewbox-0-0")
    if source:
        addon_metadata["source"] = source.find_all("a")[0].attrs["href"]
    return addon_metadata

def get_addon_file_and_details(scraper, file_page_url):
    """Download the latest file, verify its md5 checksum, and record its details."""
    addon_metadata = {}
    page_html = scraper.get(file_page_url, timeout=10)
    file_page = BeautifulSoup(page_html.content, "html.parser")
    release = file_page.find_all(class_="box")[1]
    release_name = release.find_all("h3")[0].text.strip()
    download_link = release.find_all("a")[1].attrs["href"] + "/file"
    filename = release.find_all(class_="text-sm")[1].text.strip()
    uploaded_by = release.find_all(class_="text-sm")[3].text.strip()
    uploaded_at = release.find_all(class_="text-sm")[5].find_all("abbr")[0]
    api_version = release.find_all(class_="text-sm")[7].text.strip()
    md5_checksum = release.find_all(class_="text-sm")[13].text
    addon_metadata["files"] = [
        {
            "name": release_name,
            "filename": filename,
            "uploaded_by": uploaded_by,
            "uploaded_at": uploaded_at.attrs["title"],
            "uploaded_at_epoch": uploaded_at.attrs["data-epoch"],
            # NOTE: let's standardise the api version format:
            "api_version": api_version.replace(" ", "").replace("API", "API_"),
            "md5_checksum": md5_checksum,
        }
    ]
    file_location = f"{FILES_DIR}/{filename}"
    response = scraper.get(f"{CURSE_ROOT_URL}{download_link}", stream=True, timeout=30)
    if response.status_code == 200:
        with open(file_location, "wb") as f:
            for chunk in response.iter_content(chunk_size=8192):
                f.write(chunk)
        # Verify the downloaded file against the md5 listed on the page.
        with open(file_location, "rb") as f:
            file_hash = hashlib.md5()
            chunk = f.read(8192)
            while chunk:
                file_hash.update(chunk)
                chunk = f.read(8192)
        downloaded_md5 = file_hash.hexdigest()
        if downloaded_md5 != md5_checksum:
            print(f"expected: {md5_checksum} found: {downloaded_md5}")
            raise Exception(f"File {file_location} does not match checksum.")
    return addon_metadata

def get_addon_images(scraper, addon_images_url, addon_metadata_dir):
    """Download the addon's screenshots along with their titles and descriptions."""
    addon_metadata = {}
    page_html = scraper.get(addon_images_url, timeout=10)
    file_page = BeautifulSoup(page_html.content, "html.parser")
    screenshots = []
    screenshot_elems = file_page.find_all(class_="project-screenshot-page")[0].find_all(
        class_="px-2"
    )
    if screenshot_elems:
        screenshots_dir = f"{addon_metadata_dir}/screenshots"
        if not os.path.exists(screenshots_dir):
            os.makedirs(screenshots_dir)
        for screenshot in screenshot_elems:
            download_link = screenshot.find_all(class_="mb-2")[0].attrs["data-featherlight"]
            filename = download_link.split("/")[-1]
            title = screenshot.find_all("p")[0].text
            description = screenshot.find_all("p")[1].text
            response = scraper.get(download_link, stream=True, timeout=10)
            if response.status_code == 200:
                with open(f"{screenshots_dir}/{filename}", "wb") as f:
                    for chunk in response.iter_content(chunk_size=8192):
                        f.write(chunk)
                screenshots.append(
                    {
                        "filename": filename,
                        "title": title,
                        "description": description,
                    }
                )
    if screenshots:
        addon_metadata["screenshots"] = screenshots
    return addon_metadata

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Scrape Curseforge for Wildstar Addons. "
        "Will output into a relative directory 'wildstar_addons'."
    )
    parser.add_argument(
        "--start-page", metavar="<start_page>", type=int, help="Which page to start from.",
    )
    args = parser.parse_args()
    if not os.path.exists(METADATA_DIR):
        os.makedirs(METADATA_DIR)
    if not os.path.exists(FILES_DIR):
        os.makedirs(FILES_DIR)
    scraper = cloudscraper.create_scraper()
    page_html = scraper.get(ADDONS_ROOT_URL, timeout=10)
    root_page = BeautifulSoup(page_html.content, "html.parser")
    # Listings are ordered by popularity, 20 per index page, so when resuming
    # from a later page the starting rank is offset accordingly.
    popularity_rank = 1
    if args.start_page:
        popularity_rank += (args.start_page - 1) * 20
    for page in get_page_range(root_page, args.start_page):
        print(f"Processing page: {page}")
        for addon in get_addons_on_index_page(scraper, page):
            addon["popularity_rank"] = popularity_rank
            process_addon(scraper, addon)
            popularity_rank += 1