Last active
August 1, 2025 16:10
-
-
Save raldone01/e273f810cd78e8c4a34aec461c83597c to your computer and use it in GitHub Desktop.
webricks.com wanted list html to rebrickable.csv
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# This script parses a webricks.com wanted list HTML file and converts the part data
# into the rebrickable.csv format.
import csv
import sys

from bs4 import BeautifulSoup
# Webrick color labels (written as "<id>-<name>") mapped to Rebrickable
# color IDs. Reference table used for the Rebrickable side:
# https://web.archive.org/web/20250713215351/https://rebrickable.com/colors/
_WEBRICK_TO_REBRICKABLE_COLORS = {
    '26-Black': 0,
    '194-Light Bluish Gray': 71,
    '1-White': 15,
    '23-Blue': 1,
    '191-Bright Light Orange': 191,
    '199-Dark Bluish Gray': 72,
    '138-Dark Tan': 28,
    '28-Green': 2,
    '297-Pearl Gold': 297,
    '21-Red': 4,
    '5-Tan': 19,
    '24-Yellow': 14,
}

# Part IDs as listed by webrick, rewritten to the ID emitted in the CSV.
# NOTE(review): presumably these are the variant/replacement numbers that
# Rebrickable catalogues -- verify against Rebrickable when adding entries.
_WEBRICK_PART_ID_FIXES = {
    '2412': '2412b',
    '3062': '3062b',
    '3839': '3839b',
    '4265c': '4265a',
    '4287': '4287c',
    '44237': '2456',
    '50746': '54200',
    '6143': '3941',
    '6590': '3713',
    '92903': '6005',
    '93888': '3007',
    '3040': '3040b',
    '3049': '3049d',
    '3069': '3069b',
    '3070': '3070b',
    '3794': '3794b',
    '4032': '4032b',
    '44301': '44301b',
    '44567': '44567b',
    '4697': '4697b',
    '48729': '48729b',
    '60470': '60470b',
}

# <dt> labels whose <dd> values are needed to build a record.
_WANTED_LABELS = ('Part ID', 'Color ID')


def _extract_labelled_values(row, wanted_labels):
    """Return {label: text} for each wanted <dt class="label"> found in row."""
    values = {}
    for dt in row.find_all('dt', class_='label'):
        label = dt.get_text(strip=True)
        if label not in wanted_labels:
            continue
        dd = dt.find_next_sibling('dd', class_='values')
        # A label without a value element is treated as a missing field
        # instead of crashing on None.
        if dd is not None:
            values[label] = dd.get_text(strip=True)
    return values


def _extract_quantity(row):
    """Return the cleaned quantity string of row, or '' when it is absent."""
    qty_input = row.find('input', class_='input-text qty')
    if qty_input is None:
        return ''
    raw_value = qty_input.get('value') or ''
    # Strip the plural suffix first; removing only ' piece' would turn
    # '5 pieces' into '5s'.
    return raw_value.replace(' pieces', '').replace(' piece', '').strip()


def parse_html_to_records(html_content: str) -> list:
    """
    Parses the given HTML content to extract part information.

    Every <tr class="item-info"> row is expected to provide a 'Part ID' and a
    'Color ID' (as <dt class="label"> / <dd class="values"> pairs) and a
    quantity (the value of its <input class="input-text qty">). Rows missing
    any of these are reported with a warning and skipped.

    Args:
        html_content: A string containing the HTML source.
    Returns:
        A list of dictionaries, where each dictionary represents a part with
        the keys 'Part', 'Color', 'Quantity' (a string) and 'Is Spare'
        (always False).
    Raises:
        ValueError: If a row's color label has no entry in the color
            translation table.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    records = []

    for row in soup.find_all('tr', class_='item-info'):
        fields = _extract_labelled_values(row, _WANTED_LABELS)
        part_id = fields.get('Part ID')
        color_label = fields.get('Color ID')
        quantity = _extract_quantity(row)

        if not part_id or not color_label or not quantity:
            print("Warning: Missing part information in one of the records.")
            continue

        # Unknown colors are fatal: silently guessing an ID would produce a
        # wrong parts list.
        try:
            color_id = _WEBRICK_TO_REBRICKABLE_COLORS[color_label]
        except KeyError:
            raise ValueError(
                f"Error: Color ID '{color_label}' not found in the translation map."
            ) from None

        fixed_part_id = _WEBRICK_PART_ID_FIXES.get(part_id)
        if fixed_part_id is not None:
            print(f"Fixing part ID '{part_id}' to '{fixed_part_id}'")
            part_id = fixed_part_id

        records.append({
            'Part': part_id,
            'Color': color_id,
            'Quantity': quantity,
            'Is Spare': False,
        })

    return records
def write_records_to_csv(records, output_filename):
    """
    Writes a list of part records to a CSV file.

    On success prints a confirmation line followed by the total number of
    parts (the sum of all quantities). When ``records`` is empty a notice is
    printed and no file is created.

    Args:
        records: A list of part dictionaries with the keys 'Part', 'Color',
            'Quantity' and 'Is Spare'. 'Quantity' must be convertible with
            int() for the total to be computed.
        output_filename: The name of the CSV file to create.
    Raises:
        SystemExit: With exit code 1 if the file cannot be written.
        ValueError: If a 'Quantity' is not an integer string; this is raised
            while computing the total, after the file has been written.
    """
    if not records:
        print("No valid records were found to write to CSV.")
        return

    # The CSV headers are defined by the rebrickable format.
    headers = ['Part', 'Color', 'Quantity', 'Is Spare']

    # Only the file write is guarded, so an unrelated failure in the
    # reporting below is never misreported as a write error.
    try:
        with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.DictWriter(csvfile, fieldnames=headers)
            writer.writeheader()
            writer.writerows(records)
    except OSError:
        print(f"Error: Could not write to file {output_filename}.")
        sys.exit(1)

    print(
        f"Successfully created {output_filename} with {len(records)} records.")
    total_parts = sum(int(record['Quantity']) for record in records)
    print(f"Total parts: {total_parts}")
def main():
    """
    Command-line entry point: converts one wanted list HTML file to CSV.

    Expects exactly one command-line argument, the path of the input file,
    which must end in ``.html`` (case-insensitive). The CSV is written to the
    same path with that trailing extension replaced by ``_rebrickable.csv``.

    Raises:
        SystemExit: With exit code 1 on a usage error, a wrong extension, or
            an input file that cannot be read.
    """
    if len(sys.argv) != 2:
        print("Usage: python script.py <input_html_file>")
        sys.exit(1)

    input_filename = sys.argv[1]
    print(f"Processing input file: {input_filename}")

    html_suffix = '.html'
    if not input_filename.lower().endswith(html_suffix):
        print("Error: Input file must have a .html extension.")
        sys.exit(1)

    # Swap only the trailing extension. str.replace() is case-sensitive and
    # replaces every occurrence, so 'LIST.HTML' would map onto itself (and
    # the CSV would overwrite the input) and 'a.html.b.html' would be mangled.
    output_filename = input_filename[:-len(html_suffix)] + '_rebrickable.csv'

    # Read the contents of the input HTML file
    try:
        with open(input_filename, 'r', encoding='utf-8') as f:
            html_content = f.read()
    except FileNotFoundError:
        print(f"Error: Input file not found at '{input_filename}'")
        sys.exit(1)
    except OSError as err:
        # e.g. the path is a directory or is not readable.
        print(f"Error: Could not read input file '{input_filename}': {err}")
        sys.exit(1)

    # Parse the HTML to extract part data
    records = parse_html_to_records(html_content)

    # Write the extracted records into the output CSV file
    write_records_to_csv(records, output_filename)


# Run the conversion only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment