Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save danieltomasz/8a0cb174f056c811aa34b81954aa6f35 to your computer and use it in GitHub Desktop.
Python script to extract film rankings from Wikipedia's "Cahiers du Cinéma's Annual Top 10 Lists" into CSV format, handling complex HTML tables with rowspan/colspan.
#!/usr/bin/env python3
"""
Script to parse Cahiers du Cinéma markdown file into CSV format
Extracts: Number, Title, Year, Directors
Properly handles rowspan and colspan attributes
"""
import csv
import re
from lxml import html as lxml_html
import html
def clean_text(text):
    """Normalize an HTML text fragment.

    Decodes HTML entities, strips any remaining tags, and collapses runs of
    whitespace into single spaces. Falsy input (None, empty string) yields "".
    """
    if not text:
        return ""
    # Decode entities first so that literal tags produced by unescaping
    # (e.g. "&lt;i&gt;" -> "<i>") are also removed by the tag-stripping pass.
    decoded = html.unescape(text)
    untagged = re.sub(r'<[^>]+>', '', decoded)
    # Collapse all whitespace (including non-breaking spaces) and trim.
    return re.sub(r'\s+', ' ', untagged).strip()
def extract_year_from_header(text):
    """Return a 4-digit year string found in *text*, or None.

    Decade/range headers such as "1950s (1950–1959)" or "1991-1995" are
    rejected outright so they cannot be mistaken for a single year.
    """
    stripped = text.strip()
    # Any decade/range marker disqualifies the whole header.
    if any(marker in stripped for marker in ("s (", "–", "-")):
        return None
    # Accept years 1900-2099 only, as whole words.
    year_match = re.search(r'\b(19\d{2}|20\d{2})\b', stripped)
    return year_match.group(1) if year_match else None
def parse_table_to_2d_array(table_element):
    """
    Parse HTML table into 2D array properly handling rowspan/colspan.

    Each cell covered by a colspan or rowspan is filled with a copy of the
    spanning cell's text, so every returned row has up to ``max_cols``
    entries and can be indexed positionally by downstream code.

    Args:
        table_element: an lxml element for a <table> node.

    Returns:
        list[list[str]]: one inner list per <tr>, cleaned cell texts.
    """
    rows = table_element.xpath('.//tr')
    if not rows:
        return []
    # First pass: the widest row (counting colspans) fixes the grid width.
    max_cols = 0
    for row in rows:
        cells = row.xpath('.//th | .//td')
        col_count = sum(int(cell.get('colspan', 1)) for cell in cells)
        max_cols = max(max_cols, col_count)
    # Second pass: lay cells into the grid.
    table_data = []
    # Maps (row_idx, col_idx) -> cell text for positions that an earlier
    # row's rowspan projects into. (Stores the text itself, not a counter.)
    rowspan_map = {}
    for row_idx, row in enumerate(rows):
        cells = row.xpath('.//th | .//td')
        current_row = []
        col_idx = 0
        cell_idx = 0
        while col_idx < max_cols:
            # Consume any positions already claimed by a rowspan from above;
            # the spanning cell's text is repeated into this row.
            while (row_idx, col_idx) in rowspan_map:
                current_row.append(rowspan_map[(row_idx, col_idx)])
                col_idx += 1
                if col_idx >= max_cols:
                    break
            if col_idx >= max_cols:
                break
            # Place the next real cell from this <tr>, if any remain.
            if cell_idx < len(cells):
                cell = cells[cell_idx]
                cell_text = clean_text(cell.text_content())
                colspan = int(cell.get('colspan', 1))
                rowspan = int(cell.get('rowspan', 1))
                # Repeat the text across all columns the cell spans.
                for c in range(colspan):
                    if col_idx + c < max_cols:
                        current_row.append(cell_text)
                        # Reserve the same columns in the rows below that
                        # this cell's rowspan covers.
                        for r in range(1, rowspan):
                            if row_idx + r < len(rows):
                                rowspan_map[(row_idx + r, col_idx + c)] = cell_text
                col_idx += colspan
                cell_idx += 1
            else:
                # Row is shorter than the grid: pad with empty strings.
                current_row.append('')
                col_idx += 1
        table_data.append(current_row)
    return table_data
def parse_cahiers_markdown(file_path, output_path):
    """Parse the markdown file and extract film data to CSV.

    Reads *file_path* (markdown containing embedded HTML tables), walks
    every table, classifies each row as a year header / column header /
    "No list" note / ranked film row, and writes the film rows to
    *output_path* as CSV with columns Number, Title, Year, Directors.
    Also prints a per-year film count for verification.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    # lxml tolerates the markdown wrapper and parses the embedded HTML tables.
    doc = lxml_html.fromstring(content)
    films = []
    # Year state carries across rows: a year-header row sets it, and the
    # following film rows inherit it.
    current_year = None
    tables = doc.xpath('//table')
    for table in tables:
        # Normalize the table so each row is positionally indexable
        # (rowspan/colspan already expanded).
        table_data = parse_table_to_2d_array(table)
        for row in table_data:
            if not row:
                continue
            first_cell = row[0] if row else ''
            # Year header rows: a single value spanning the full width, so
            # after expansion every cell is either empty or equals the first.
            if first_cell and all(cell == '' or cell == first_cell for cell in row[1:]):
                extracted_year = extract_year_from_header(first_cell)
                if extracted_year:
                    current_year = extracted_year
                # NOTE(review): header-like rows are skipped whether or not a
                # year was extracted — confirm this matches the source tables.
                continue
            # Column header rows (the literal table headings).
            if first_cell in ['#', 'English Title', 'Original Title', 'Director(s)', 'Production Country']:
                continue
            # Years with no published list.
            if 'No list' in first_cell or 'No lists' in first_cell:
                continue
            # Film rows start with a rank, possibly written as "1." etc.
            rank = None
            if first_cell and first_cell.replace('.', '').strip().isdigit():
                rank = first_cell.replace('.', '').strip()
            if rank and current_year and len(row) >= 4:
                # Expected column layout after expansion:
                #   0: rank, 1: English title, 2: original title
                #   (duplicates col 1 when the HTML used colspan=2),
                #   3: director(s), 4: production country.
                title = ""
                directors = ""
                if len(row) > 1 and row[1]:
                    title = row[1]
                # Directors are always column 3 after the two title columns.
                if len(row) > 3 and row[3]:
                    directors = row[3]
                # Keep only complete records.
                if title and directors:
                    films.append({
                        'Number': rank,
                        'Title': title,
                        'Year': current_year,
                        'Directors': directors
                    })
    # Write results; QUOTE_ALL keeps commas inside titles/directors safe.
    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['Number', 'Title', 'Year', 'Directors']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
        writer.writeheader()
        for film in films:
            writer.writerow(film)
    print(f"Extracted {len(films)} films to {output_path}")
    # Per-year counts as a quick sanity check of the extraction.
    year_counts = {}
    for film in films:
        year = film['Year']
        if year:
            year_counts[year] = year_counts.get(year, 0) + 1
    print(f"\nFilms per year: {dict(sorted(year_counts.items()))}")
def main():
    """CLI entry point: parse the hard-coded input file and preview the CSV.

    Prints a friendly message for a missing input file and a generic one
    for any other failure; never raises to the shell.
    """
    input_file = "Cahiers du Cinéma's Annual Top 10 Lists - Wikipedia.md"
    output_file = "cahiers_films.csv"
    try:
        parse_cahiers_markdown(input_file, output_file)
        print("Parsing completed successfully!")
        # Preview: header line plus the first 10 data rows.
        print("\nFirst 10 rows:")
        with open(output_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()[:11]
        for line in lines:
            print(line.strip())
    except FileNotFoundError:
        print(f"Error: Could not find input file '{input_file}'")
        print("Please update the input_file path in the script")
    except Exception as e:
        # NOTE(review): the original also caught ImportError here, but lxml is
        # imported at module load time, so a missing lxml raises before main()
        # ever runs — that handler was unreachable and has been removed.
        print(f"Error: {e}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment