Created
July 29, 2025 15:28
-
-
Save raldone01/abf43e561a993bbbe8615d38217197f1 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# Filename: mygobricks_differ.py
# Description: Filters a Rebrickable parts CSV so that it only contains the parts
# that are still listed in a Mygobricks "missing parts" CSV.
# It reads both CSV files, compares them by part ID and color, and writes the
# parts that are still missing to a new CSV file.
import argparse | |
import sys | |
from pathlib import Path | |
import pandas as pd | |
# Rebrickable CSV file columns:
# "Part", "Color", "Quantity", "Is Spare"
# 87994, 0, 2, False
# 30374, 0, 1, False
# ...
def parse_rebrickable_csv(file_path):
    """
    Read the Rebrickable CSV and return a cleaned DataFrame.

    Only the "Part", "Color", "Quantity" and "Is Spare" columns are loaded.

    Args:
        file_path: Path to the Rebrickable CSV file.

    Returns:
        A DataFrame with the columns:
        - "Part": part ID as a whitespace-stripped string,
        - "Color" and "Quantity": numeric; unparseable values become NaN,
        - "Is Spare": bool.

    Raises:
        SystemExit: with exit code 1 (after printing the error) if the file
            cannot be read or parsed.
    """
    true_tokens = {"true", "t", "yes", "y", "1"}

    def to_bool(value):
        # astype(bool) would turn every non-empty string (even "False" or
        # "No") and NaN into True, so textual flags are interpreted explicitly.
        if isinstance(value, str):
            return value.strip().lower() in true_tokens
        if pd.isna(value):
            return False
        return bool(value)

    try:
        df = pd.read_csv(
            file_path,
            usecols=["Part", "Color", "Quantity", "Is Spare"],
            # Part IDs are identifiers, not numbers: reading them as text keeps
            # leading zeros and avoids "3001.0" when the column has blanks.
            dtype={"Part": str},
            # Tolerate "a, b, c" style files with a space after each comma.
            skipinitialspace=True,
        )
        df["Part"] = df["Part"].astype(str).str.strip()
        df["Color"] = pd.to_numeric(df["Color"], errors='coerce')
        df["Quantity"] = pd.to_numeric(df["Quantity"], errors='coerce')
        df["Is Spare"] = df["Is Spare"].map(to_bool).astype(bool)
        return df
    except Exception as e:
        print(f"Error reading Rebrickable CSV file: {e}", file=sys.stderr)
        sys.exit(1)
# Mygobricks missing parts CSV file columns:
# "LegoID", "Part", "ColorID", "Quantity", "Reason"
# "73507", "Technic Beam 1 x 11 Thick with Alternating Holes", "0", "4", "undefined"
# "87087", "Brick Special 1 x 1 with Stud on 1 Side", "72", "2", "undefined"
# "undefined", "65473", "0", "8", "undefined"
# "undefined", "38799", "0", "1", "undefined"
# ...
# Note: "Part" holds either a part description (the ID is then in "LegoID")
# or the part ID itself (with "LegoID" set to "undefined").
def parse_mygobricks_missing_csv(file_path):
    """
    Read the Mygobricks missing CSV and return a cleaned DataFrame of still-missing parts.

    The "Part" column of the input holds either a part ID or a free-text
    description. Whenever it does not look like a part ID, the value of
    "LegoID" is used instead, so the returned "Part" column is the best
    available ID for each row.

    Args:
        file_path: Path to the Mygobricks 'missing' CSV file.

    Returns:
        A DataFrame with the columns "Part" (string), "ColorID" and
        "Quantity" (numeric; unparseable values become NaN) and "Reason"
        (string). The "LegoID" column is dropped after it has been used as
        the fallback.

    Raises:
        SystemExit: with exit code 1 (after printing the error) if the file
            cannot be read or parsed.
    """
    valid_lego_part_number_regex = r"^[a-z0-9]+$"
    try:
        df = pd.read_csv(
            file_path,
            usecols=["LegoID", "Part", "ColorID", "Quantity", "Reason"],
            # IDs are identifiers, not numbers: reading them as text keeps
            # leading zeros and avoids "73507.0" when a cell is blank.
            dtype={"LegoID": str, "Part": str},
            # Tolerate '"a", "b"' style files with a space after each comma.
            skipinitialspace=True,
        )
        df["LegoID"] = df["LegoID"].astype(str).str.strip()
        df["Part"] = df["Part"].astype(str).str.strip()
        df["ColorID"] = pd.to_numeric(df["ColorID"], errors='coerce')
        df["Quantity"] = pd.to_numeric(df["Quantity"], errors='coerce')
        df["Reason"] = df["Reason"].astype(str)

        # Entries that don't match the expected part number format
        # (typically descriptions) are replaced by the row's LegoID.
        invalid = ~df["Part"].str.match(valid_lego_part_number_regex, na=False)
        print(f"Dropping {invalid.sum()} invalid part entries from missing CSV")
        df["Part"] = df["Part"].where(~invalid, df["LegoID"])

        df = df.drop(columns=["LegoID"])
        return df
    except Exception as e:
        print(f"Error reading Mygobricks missing CSV file: {e}", file=sys.stderr)
        sys.exit(1)
def main():
    """
    Load both CSV files, keep the Rebrickable rows that are still missing, and write them to a new CSV.

    A Rebrickable row is kept when its ("Part", "Color") pair also appears as
    ("Part", "ColorID") in the Mygobricks missing CSV. Each Rebrickable row is
    written at most once, in its original order, with its original columns.
    The result is saved next to the Rebrickable file as
    "<original stem>_still_missing.csv".
    """
    parser = argparse.ArgumentParser(
        description="Filters the Rebrickable dataset to only include parts still missing from Mygobricks."
    )
    parser.add_argument(
        "rebrickable_path",
        type=Path,
        help="Path to the Rebrickable CSV file (e.g., 'set_parts.csv')."
    )
    parser.add_argument(
        "missing_path",
        type=Path,
        help="Path to the Mygobricks 'missing' CSV file."
    )
    args = parser.parse_args()

    rebrickable_df = parse_rebrickable_csv(args.rebrickable_path)
    missing_df = parse_mygobricks_missing_csv(args.missing_path)
    print(f"Total parts in Rebrickable CSV: {len(rebrickable_df)}")
    print(f"Total parts in Missing CSV: {len(missing_df)}")

    # Collect the part/color pairs that are still missing once, so each
    # Rebrickable row needs a single set lookup instead of a scan over the
    # whole missing list. Rows without a usable part or color can never
    # match and are left out of the lookup set.
    usable_missing = missing_df.dropna(subset=["Part", "ColorID"])
    missing_keys = set(zip(usable_missing["Part"], usable_missing["ColorID"]))

    # A boolean mask keeps every matching row exactly once (even if the
    # missing list contains the same part/color several times) and preserves
    # the column headers when nothing matches.
    still_missing = pd.Series(
        [
            key in missing_keys
            for key in zip(rebrickable_df["Part"], rebrickable_df["Color"])
        ],
        index=rebrickable_df.index,
        dtype=bool,
    )
    output_df = rebrickable_df[still_missing]
    print(f"Parts still missing after comparison: {len(output_df)}")

    # Write the result next to the original file with a new suffix
    output_path = args.rebrickable_path.with_name(
        f"{args.rebrickable_path.stem}_still_missing.csv"
    )
    output_df.to_csv(output_path, index=False)
    print(f"Result saved to: {output_path}")
# Run the command-line tool only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment