Created
August 25, 2025 21:44
-
-
Save danieltomasz/8a0cb174f056c811aa34b81954aa6f35 to your computer and use it in GitHub Desktop.
Python script to extract film rankings from Wikipedia's "Cahiers du Cinéma's Annual Top 10 Lists" into CSV format, handling complex HTML tables with rowspan/colspan.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
Script to parse Cahiers du Cinéma markdown file into CSV format | |
Extracts: Number, Title, Year, Directors | |
Properly handles rowspan and colspan attributes | |
""" | |
import csv | |
import re | |
from lxml import html as lxml_html | |
import html | |
def clean_text(text):
    """Normalize a scraped table cell: decode HTML entities, strip any
    remaining markup tags, and collapse runs of whitespace to single spaces.

    Returns "" for None/empty input.
    """
    if not text:
        return ""
    # Entities first (e.g. "&amp;" -> "&"), then drop leftover tags like <i>.
    unescaped = html.unescape(text)
    without_tags = re.sub(r'<[^>]+>', '', unescaped)
    # Collapse all whitespace (including newlines) and trim the ends.
    return re.sub(r'\s+', ' ', without_tags).strip()
def extract_year_from_header(text):
    """Return the 4-digit year found in a header cell, as a string, or None.

    Decade/range headers (e.g. "1950s (1951-1959)" or anything containing a
    dash) are rejected outright so that a range never sets the current year.
    """
    stripped = text.strip()
    # Any marker of a decade or year-range header disqualifies the cell.
    looks_like_range = 's (' in stripped or '–' in stripped or '-' in stripped
    if looks_like_range:
        return None
    found = re.search(r'\b(19\d{2}|20\d{2})\b', stripped)
    return found.group(1) if found else None
def parse_table_to_2d_array(table_element):
    """
    Parse HTML table into 2D array properly handling rowspan/colspan.
    Returns list of lists representing the table.

    A cell with colspan=c is duplicated into c consecutive columns of the
    current row; a cell with rowspan=r is duplicated into the same column(s)
    of the next r-1 rows via `rowspan_map`. Every output row therefore has
    up to `max_cols` entries and no cell text is lost.

    table_element: an lxml element for a <table> (anything supporting
    .xpath(); assumed well-formed — overlapping spans are not detected).
    """
    rows = table_element.xpath('.//tr')
    if not rows:
        return []
    # First pass: determine table dimensions.
    # Width is the maximum colspan-expanded cell count over all rows; rows
    # that are narrower (because of rowspans from above) are padded later.
    max_cols = 0
    for row in rows:
        cells = row.xpath('.//th | .//td')
        col_count = sum(int(cell.get('colspan', 1)) for cell in cells)
        max_cols = max(max_cols, col_count)
    # Initialize 2D array
    table_data = []
    rowspan_map = {}  # Maps (row, col) -> cell text carried down by a rowspan
    for row_idx, row in enumerate(rows):
        cells = row.xpath('.//th | .//td')
        current_row = []
        col_idx = 0   # position in the expanded (visual) grid
        cell_idx = 0  # position in this row's actual <th>/<td> list
        while col_idx < max_cols:
            # Check if this cell is covered by a rowspan from above.
            # Covered columns consume a grid slot without consuming a cell,
            # so only col_idx advances here, not cell_idx.
            while (row_idx, col_idx) in rowspan_map:
                current_row.append(rowspan_map[(row_idx, col_idx)])
                col_idx += 1
                if col_idx >= max_cols:
                    break
            if col_idx >= max_cols:
                break
            # Get the actual cell
            if cell_idx < len(cells):
                cell = cells[cell_idx]
                cell_text = clean_text(cell.text_content())
                colspan = int(cell.get('colspan', 1))
                rowspan = int(cell.get('rowspan', 1))
                # Add cell value for colspan times
                for c in range(colspan):
                    if col_idx + c < max_cols:
                        current_row.append(cell_text)
                        # Mark future rows if rowspan > 1, so the text is
                        # replayed into the same column of those rows.
                        for r in range(1, rowspan):
                            if row_idx + r < len(rows):
                                rowspan_map[(row_idx + r, col_idx + c)] = cell_text
                col_idx += colspan
                cell_idx += 1
            else:
                # No more cells in this row — pad with empties so every
                # output row reaches max_cols.
                current_row.append('')
                col_idx += 1
        table_data.append(current_row)
    return table_data
# Column-header labels that mark a non-film row and must be skipped.
HEADER_LABELS = ('#', 'English Title', 'Original Title', 'Director(s)', 'Production Country')


def _year_from_header_row(row):
    """Return the year if *row* is a section header announcing one, else None.

    Year headers on the Wikipedia page are single cells spanning the whole
    table, so after colspan expansion every cell holds the same text (or is
    empty). Rows matching that shape but containing no year return None.
    """
    first_cell = row[0] if row else ''
    if first_cell and all(cell == '' or cell == first_cell for cell in row[1:]):
        return extract_year_from_header(first_cell)
    return None


def _rank_from_cell(cell_text):
    """Return the digits of a rank cell like '1.' or '10', or None if not a rank."""
    if not cell_text:
        return None
    stripped = cell_text.replace('.', '').strip()
    return stripped if stripped.isdigit() else None


def _extract_films(doc):
    """Walk every <table> in *doc* and collect film dicts.

    The current year is stateful: a year header row applies to every film row
    that follows it, even across table boundaries (the page splits decades
    into separate tables).
    """
    films = []
    current_year = None
    for table in doc.xpath('//table'):
        for row in parse_table_to_2d_array(table):
            if not row:
                continue
            year = _year_from_header_row(row)
            if year:
                current_year = year
                continue
            first_cell = row[0]
            # Skip column-header rows and "No list was published" rows.
            if first_cell in HEADER_LABELS or 'No list' in first_cell:
                continue
            rank = _rank_from_cell(first_cell)
            # A film row needs a rank, a known year, and at least 4 columns:
            # rank, english title, original title, director(s).
            if not (rank and current_year and len(row) >= 4):
                continue
            title = row[1]      # English title (duplicated from colspan if needed)
            directors = row[3]  # Director(s) column, after the two title columns
            if title and directors:
                films.append({
                    'Number': rank,
                    'Title': title,
                    'Year': current_year,
                    'Directors': directors,
                })
    return films


def _write_films_csv(films, output_path):
    """Write film dicts to *output_path* as fully quoted CSV."""
    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(
            csvfile,
            fieldnames=['Number', 'Title', 'Year', 'Directors'],
            quoting=csv.QUOTE_ALL,
        )
        writer.writeheader()
        writer.writerows(films)


def parse_cahiers_markdown(file_path, output_path):
    """Parse the saved Wikipedia page at *file_path* and extract film data to CSV.

    Reads the file as UTF-8, parses its HTML tables, collects
    (rank, title, year, directors) rows into *output_path*, and prints a
    per-year count summary for verification.

    Raises FileNotFoundError if *file_path* does not exist; may raise lxml
    parse errors on malformed input.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    doc = lxml_html.fromstring(content)

    films = _extract_films(doc)
    _write_films_csv(films, output_path)
    print(f"Extracted {len(films)} films to {output_path}")

    # Print year counts for verification
    year_counts = {}
    for film in films:
        year = film['Year']
        if year:
            year_counts[year] = year_counts.get(year, 0) + 1
    print(f"\nFilms per year: {dict(sorted(year_counts.items()))}")
def main():
    """Entry point: run the parser on the saved Wikipedia page and preview the CSV.

    All failures are reported to stdout rather than raised, so the script
    always exits cleanly when run from the command line.
    """
    input_file = "Cahiers du Cinéma's Annual Top 10 Lists - Wikipedia.md"
    output_file = "cahiers_films.csv"
    try:
        parse_cahiers_markdown(input_file, output_file)
        print("Parsing completed successfully!")
        # Preview: header line plus the first ten data rows.
        print("\nFirst 10 rows:")
        with open(output_file, 'r', encoding='utf-8') as f:
            for row_text in f.readlines()[:11]:
                print(row_text.strip())
    except FileNotFoundError:
        print(f"Error: Could not find input file '{input_file}'")
        print("Please update the input_file path in the script")
    except ImportError:
        print("Error: lxml not installed. Install it with: pip install lxml")
    except Exception as e:
        print(f"Error: {e}")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment