Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save danieltomasz/8a0cb174f056c811aa34b81954aa6f35 to your computer and use it in GitHub Desktop.
Python script to extract film rankings from Wikipedia's "Cahiers du Cinéma's Annual Top 10 Lists" into CSV format, handling complex HTML tables with rowspan/colspan.
#!/usr/bin/env python3
"""
Script to parse Cahiers du Cinéma markdown file into CSV format
Extracts: Number, Title, Year, Directors
Properly handles rowspan and colspan attributes
"""
import csv
import re
from lxml import html as lxml_html
import html
def clean_text(text):
    """Normalize an HTML text fragment.

    Decodes HTML entities, strips any remaining tags, and collapses runs of
    whitespace into single spaces. Falsy input (None, empty string) yields "".
    """
    if not text:
        return ""
    # Decode entities first so that literal tags produced by unescaping
    # (e.g. "&lt;i&gt;" -> "<i>") are also removed by the tag-stripping pass.
    decoded = html.unescape(text)
    untagged = re.sub(r'<[^>]+>', '', decoded)
    # Collapse all whitespace (including non-breaking spaces) and trim.
    return re.sub(r'\s+', ' ', untagged).strip()
def extract_year_from_header(text):
    """Return a 4-digit year string found in *text*, or None.

    Decade/range headers such as "1950s (1950–1959)" or "1991-1995" are
    rejected outright so they cannot be mistaken for a single year.
    """
    stripped = text.strip()
    # Any decade/range marker disqualifies the whole header.
    if any(marker in stripped for marker in ("s (", "–", "-")):
        return None
    # Accept years 1900-2099 only, as whole words.
    year_match = re.search(r'\b(19\d{2}|20\d{2})\b', stripped)
    return year_match.group(1) if year_match else None
def parse_table_to_2d_array(table_element):
    """
    Parse HTML table into 2D array properly handling rowspan/colspan.

    Each cell covered by a colspan or rowspan is filled with a copy of the
    spanning cell's text, so every returned row has up to ``max_cols``
    entries and can be indexed positionally by downstream code.

    Args:
        table_element: an lxml element for a <table> node.

    Returns:
        list[list[str]]: one inner list per <tr>, cleaned cell texts.
    """
    rows = table_element.xpath('.//tr')
    if not rows:
        return []
    # First pass: the widest row (counting colspans) fixes the grid width.
    max_cols = 0
    for row in rows:
        cells = row.xpath('.//th | .//td')
        col_count = sum(int(cell.get('colspan', 1)) for cell in cells)
        max_cols = max(max_cols, col_count)
    # Second pass: lay cells into the grid.
    table_data = []
    # Maps (row_idx, col_idx) -> cell text for positions that an earlier
    # row's rowspan projects into. (Stores the text itself, not a counter.)
    rowspan_map = {}
    for row_idx, row in enumerate(rows):
        cells = row.xpath('.//th | .//td')
        current_row = []
        col_idx = 0
        cell_idx = 0
        while col_idx < max_cols:
            # Consume any positions already claimed by a rowspan from above;
            # the spanning cell's text is repeated into this row.
            while (row_idx, col_idx) in rowspan_map:
                current_row.append(rowspan_map[(row_idx, col_idx)])
                col_idx += 1
                if col_idx >= max_cols:
                    break
            if col_idx >= max_cols:
                break
            # Place the next real cell from this <tr>, if any remain.
            if cell_idx < len(cells):
                cell = cells[cell_idx]
                cell_text = clean_text(cell.text_content())
                colspan = int(cell.get('colspan', 1))
                rowspan = int(cell.get('rowspan', 1))
                # Repeat the text across all columns the cell spans.
                for c in range(colspan):
                    if col_idx + c < max_cols:
                        current_row.append(cell_text)
                        # Reserve the same columns in the rows below that
                        # this cell's rowspan covers.
                        for r in range(1, rowspan):
                            if row_idx + r < len(rows):
                                rowspan_map[(row_idx + r, col_idx + c)] = cell_text
                col_idx += colspan
                cell_idx += 1
            else:
                # Row is shorter than the grid: pad with empty strings.
                current_row.append('')
                col_idx += 1
        table_data.append(current_row)
    return table_data
def parse_cahiers_markdown(file_path, output_path):
    """Parse the markdown file and extract film data to CSV.

    Reads *file_path* (markdown containing embedded HTML tables), walks
    every table, classifies each row as a year header / column header /
    "No list" note / ranked film row, and writes the film rows to
    *output_path* as CSV with columns Number, Title, Year, Directors.
    Also prints a per-year film count for verification.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    # lxml tolerates the markdown wrapper and parses the embedded HTML tables.
    doc = lxml_html.fromstring(content)
    films = []
    # Year state carries across rows: a year-header row sets it, and the
    # following film rows inherit it.
    current_year = None
    tables = doc.xpath('//table')
    for table in tables:
        # Normalize the table so each row is positionally indexable
        # (rowspan/colspan already expanded).
        table_data = parse_table_to_2d_array(table)
        for row in table_data:
            if not row:
                continue
            first_cell = row[0] if row else ''
            # Year header rows: a single value spanning the full width, so
            # after expansion every cell is either empty or equals the first.
            if first_cell and all(cell == '' or cell == first_cell for cell in row[1:]):
                extracted_year = extract_year_from_header(first_cell)
                if extracted_year:
                    current_year = extracted_year
                # NOTE(review): header-like rows are skipped whether or not a
                # year was extracted — confirm this matches the source tables.
                continue
            # Column header rows (the literal table headings).
            if first_cell in ['#', 'English Title', 'Original Title', 'Director(s)', 'Production Country']:
                continue
            # Years with no published list.
            if 'No list' in first_cell or 'No lists' in first_cell:
                continue
            # Film rows start with a rank, possibly written as "1." etc.
            rank = None
            if first_cell and first_cell.replace('.', '').strip().isdigit():
                rank = first_cell.replace('.', '').strip()
            if rank and current_year and len(row) >= 4:
                # Expected column layout after expansion:
                #   0: rank, 1: English title, 2: original title
                #   (duplicates col 1 when the HTML used colspan=2),
                #   3: director(s), 4: production country.
                title = ""
                directors = ""
                if len(row) > 1 and row[1]:
                    title = row[1]
                # Directors are always column 3 after the two title columns.
                if len(row) > 3 and row[3]:
                    directors = row[3]
                # Keep only complete records.
                if title and directors:
                    films.append({
                        'Number': rank,
                        'Title': title,
                        'Year': current_year,
                        'Directors': directors
                    })
    # Write results; QUOTE_ALL keeps commas inside titles/directors safe.
    with open(output_path, 'w', newline='', encoding='utf-8') as csvfile:
        fieldnames = ['Number', 'Title', 'Year', 'Directors']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames, quoting=csv.QUOTE_ALL)
        writer.writeheader()
        for film in films:
            writer.writerow(film)
    print(f"Extracted {len(films)} films to {output_path}")
    # Per-year counts as a quick sanity check of the extraction.
    year_counts = {}
    for film in films:
        year = film['Year']
        if year:
            year_counts[year] = year_counts.get(year, 0) + 1
    print(f"\nFilms per year: {dict(sorted(year_counts.items()))}")
def main():
    """CLI entry point: parse the hard-coded input file and preview the CSV.

    Prints a friendly message for a missing input file and a generic one
    for any other failure; never raises to the shell.
    """
    input_file = "Cahiers du Cinéma's Annual Top 10 Lists - Wikipedia.md"
    output_file = "cahiers_films.csv"
    try:
        parse_cahiers_markdown(input_file, output_file)
        print("Parsing completed successfully!")
        # Preview: header line plus the first 10 data rows.
        print("\nFirst 10 rows:")
        with open(output_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()[:11]
        for line in lines:
            print(line.strip())
    except FileNotFoundError:
        print(f"Error: Could not find input file '{input_file}'")
        print("Please update the input_file path in the script")
    except Exception as e:
        # NOTE(review): the original also caught ImportError here, but lxml is
        # imported at module load time, so a missing lxml raises before main()
        # ever runs — that handler was unreachable and has been removed.
        print(f"Error: {e}")


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment