zopieux · March 25, 2026 20:58
diff --git a/extract_changed_sentences.py b/extract_changed_sentences.py
 """
 pandoc --track-changes=all doc_with_tracking.docx -o changes.json
 ./extract_changed_sentences.py changes.json changed_sentences.txt
 """

 import json
 import sys
 import re

 def find_blocks(obj):
    """Collect the text-bearing blocks nested anywhere inside *obj*.

    Lists are searched element by element and dicts value by value, in
    iteration order. A dict whose "t" tag is "Para", "Plain" or "Header" is
    returned as-is and is not searched any further. Anything that is neither
    a dict nor a list contributes nothing.
    """
    if isinstance(obj, list):
        return [hit for child in obj for hit in find_blocks(child)]
    if not isinstance(obj, dict):
        return []
    if obj.get("t") in ("Para", "Plain", "Header"):
        # A block that holds inline text: keep it and stop descending.
        return [obj]
    found = []
    for child in obj.values():
        found += find_blocks(child)
    return found

 def extract_inlines(block):
    """Return the inline-element list carried by *block*.

    "Para" and "Plain" blocks store their inlines directly under "c"; a
    "Header" stores [level, attr, inlines], so its inlines are the third
    item. Any other block type yields an empty list.
    """
    kind = block.get("t")
    payload = block.get("c")
    if kind == "Header":
        return payload[2]
    return payload if kind in ("Para", "Plain") else []

 def get_chunks(element, in_change=False):
    """
    Recursively flatten pandoc inline elements into text chunks.

    Args:
        element: A pandoc JSON inline node (dict), a list of such nodes, or
            any other value (which produces no chunks).
        in_change: True when the element sits inside a tracked insertion.

    Returns:
        A list of dicts: {"text": str, "changed": bool}. A tracked deletion
        is represented by one chunk with empty text and changed=True.
    """
    chunks = []
    if isinstance(element, list):
        for item in element:
            chunks.extend(get_chunks(item, in_change))
    elif isinstance(element, dict):
        t = element.get("t")
        c = element.get("c")

        if t == "Str":
            chunks.append({"text": c, "changed": in_change})
        elif t == "Space":
            chunks.append({"text": " ", "changed": in_change})
        elif t == "Code" and len(c) > 1:
            # Inline code format: [attr, text]. Keep the text so the sentence stays whole.
            chunks.append({"text": c[1], "changed": in_change})
        elif t == "Span":
            # Span format: [[id, classes, key-values], [inlines]]
            classes = c[0][1] if len(c) > 0 and len(c[0]) > 1 else []
            content = c[1] if len(c) > 1 else []

            if "insertion" in classes:
                chunks.extend(get_chunks(content, True))
            elif "deletion" in classes:
                # Deletions mean the sentence was changed, but the text is gone post-change.
                # We append a 0-length chunk flagged as 'changed' so we can mark adjacent chars.
                chunks.append({"text": "", "changed": True})
            elif "comment-start" in classes:
                # The span body is the reviewer's comment, not document text: emit nothing.
                pass
            else:
                chunks.extend(get_chunks(content, in_change))

        # Handle standard wrappers recursively
        elif t in ["Emph", "Underline", "Strong", "Strikeout", "Superscript", "Subscript", "SmallCaps"]:
            chunks.extend(get_chunks(c, in_change))
        elif t == "Quoted":
            # Quoted format: [quote-type, [inlines]]
            chunks.append({"text": '"', "changed": in_change})
            chunks.extend(get_chunks(c[1], in_change))
            chunks.append({"text": '"', "changed": in_change})
        elif t in ["LineBreak", "SoftBreak"]:
            chunks.append({"text": " ", "changed": in_change})
        elif t in ["Link", "Cite", "Image"] and len(c) > 1:
            # In all three, the displayed inlines are the second item.
            chunks.extend(get_chunks(c[1], in_change))

    return chunks

 def process_block(block):
    """Return the post-change sentences of *block* that carry a tracked change.

    The block's inlines are flattened to plain text alongside a
    per-character "changed" mask. Inserted text is flagged directly. Deleted
    text no longer exists in the result, so each deletion flags the
    character that follows it (or the last character when it ends the
    paragraph). The text is then cut into sentences, and a sentence is
    returned when any of its characters is flagged or when a deletion sits
    in the whitespace just before it (for the last sentence: also after it).

    Args:
        block: A pandoc JSON block node ("Para", "Plain" or "Header").

    Returns:
        A list of stripped sentence strings, in paragraph order.
    """
    inlines = extract_inlines(block)
    chunks = get_chunks(inlines)

    parts = []
    changed_mask = []
    deletion_points = []  # text offsets at which deleted content used to sit
    pending_deletion = False

    # Build the plaintext pieces and a parallel boolean mask of the same length
    for chunk in chunks:
        text = chunk["text"]
        is_changed = chunk["changed"]

        if not text:
            # A 0-length chunk carries no characters; if flagged it is a deletion marker.
            if is_changed:
                pending_deletion = True
                deletion_points.append(len(changed_mask))
            continue

        parts.append(text)
        mask_segment = [is_changed] * len(text)

        # If the preceding chunk was a deletion, mark the first character of this segment
        if pending_deletion:
            mask_segment[0] = True
            pending_deletion = False

        changed_mask.extend(mask_segment)

    paragraph_text = "".join(parts)

    # If a deletion happened at the very end of a paragraph
    if pending_deletion and changed_mask:
        changed_mask[-1] = True

    # Heuristic sentence terminator: a run of . ! ? followed by whitespace or
    # end of text, unless it directly follows a single capital letter or a
    # capital+lowercase pair at a word start (keeps "Mr.", "Dr.", "U.K."
    # together). Lookbehinds in Python must be fixed-width, hence two of them.
    pattern = r'(?<!\b[A-Z][a-z])(?<!\b[A-Z])[.!?]+(?=\s|$)'

    spans = []
    start = 0
    for m in re.finditer(pattern, paragraph_text):
        end = m.end()
        spans.append((start, end))
        start = end
        # Skip the whitespace separating this sentence from the next one
        while start < len(paragraph_text) and paragraph_text[start].isspace():
            start += 1

    # Add any remaining text as the last sentence (handles paras without ending punctuation)
    if start < len(paragraph_text):
        spans.append((start, len(paragraph_text)))

    changed_sentences = []
    gap_start = 0
    last_index = len(spans) - 1
    for index, (s, e) in enumerate(spans):
        # The mask catches flagged characters inside the sentence. A deletion
        # located in the skipped whitespace before the sentence (or, for the
        # last sentence, at/after its end) flags a character that belongs to
        # no span, so it is attributed here explicitly.
        touched = any(changed_mask[s:e]) or any(
            gap_start <= p < s or (index == last_index and p >= e)
            for p in deletion_points
        )
        gap_start = e

        sentence_text = paragraph_text[s:e].strip()
        if sentence_text and touched:
            changed_sentences.append(sentence_text)

    return changed_sentences

 def main():
    """Command-line entry point.

    Reads the pandoc JSON file named by the first argument, gathers the
    changed sentences of every text block, and either writes them to the
    file named by the optional second argument or prints them to stdout.
    Sentences are separated by a blank line. Exits with status 1 and a
    usage message on stderr when no input file is given.
    """
    if len(sys.argv) < 2:
        # Show the name the script was invoked as rather than a hard-coded one.
        prog = sys.argv[0] if sys.argv else "extract_changed_sentences.py"
        print(f"Usage: {prog} <input.json> [output.txt]", file=sys.stderr)
        sys.exit(1)

    input_file = sys.argv[1]
    with open(input_file, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # 1. Locate all text blocks and 2. extract their changed sentences
    changed_sentences = []
    for block in find_blocks(data):
        changed_sentences.extend(process_block(block))

    # 3. Format as requested (separated by \n\n)
    output_text = "\n\n".join(changed_sentences)

    if len(sys.argv) >= 3:
        output_file = sys.argv[2]
        with open(output_file, 'w', encoding='utf-8') as f:
            f.write(output_text)
        print(f"Extraction complete. Wrote {len(changed_sentences)} sentences to {output_file}.")
    else:
        print(output_text)

 # Run the command-line entry point only when executed as a script, not on import.
 if __name__ == "__main__":
    main()
	"""
	pandoc --track-changes=all doc_with_tracking.docx -o changes.json
	./extract_changed_sentences.py changes.json changed_sentences.txt
	"""

	import json
	import sys
	import re

	def find_blocks(obj):
	    """Collect the text-bearing blocks nested anywhere inside *obj*.

	    Lists are searched element by element and dicts value by value, in
	    iteration order. A dict whose "t" tag is "Para", "Plain" or "Header" is
	    returned as-is and is not searched any further. Anything that is neither
	    a dict nor a list contributes nothing.
	    """
	    if isinstance(obj, list):
	        return [hit for child in obj for hit in find_blocks(child)]
	    if not isinstance(obj, dict):
	        return []
	    if obj.get("t") in ("Para", "Plain", "Header"):
	        # A block that holds inline text: keep it and stop descending.
	        return [obj]
	    found = []
	    for child in obj.values():
	        found += find_blocks(child)
	    return found

	def extract_inlines(block):
	    """Return the inline-element list carried by *block*.

	    "Para" and "Plain" blocks store their inlines directly under "c"; a
	    "Header" stores [level, attr, inlines], so its inlines are the third
	    item. Any other block type yields an empty list.
	    """
	    kind = block.get("t")
	    payload = block.get("c")
	    if kind == "Header":
	        return payload[2]
	    return payload if kind in ("Para", "Plain") else []

	def get_chunks(element, in_change=False):
	    """
	    Recursively flatten pandoc inline elements into text chunks.

	    Args:
	        element: A pandoc JSON inline node (dict), a list of such nodes, or
	            any other value (which produces no chunks).
	        in_change: True when the element sits inside a tracked insertion.

	    Returns:
	        A list of dicts: {"text": str, "changed": bool}. A tracked deletion
	        is represented by one chunk with empty text and changed=True.
	    """
	    chunks = []
	    if isinstance(element, list):
	        for item in element:
	            chunks.extend(get_chunks(item, in_change))
	    elif isinstance(element, dict):
	        t = element.get("t")
	        c = element.get("c")

	        if t == "Str":
	            chunks.append({"text": c, "changed": in_change})
	        elif t == "Space":
	            chunks.append({"text": " ", "changed": in_change})
	        elif t == "Code" and len(c) > 1:
	            # Inline code format: [attr, text]. Keep the text so the sentence stays whole.
	            chunks.append({"text": c[1], "changed": in_change})
	        elif t == "Span":
	            # Span format: [[id, classes, key-values], [inlines]]
	            classes = c[0][1] if len(c) > 0 and len(c[0]) > 1 else []
	            content = c[1] if len(c) > 1 else []

	            if "insertion" in classes:
	                chunks.extend(get_chunks(content, True))
	            elif "deletion" in classes:
	                # Deletions mean the sentence was changed, but the text is gone post-change.
	                # We append a 0-length chunk flagged as 'changed' so we can mark adjacent chars.
	                chunks.append({"text": "", "changed": True})
	            elif "comment-start" in classes:
	                # The span body is the reviewer's comment, not document text: emit nothing.
	                pass
	            else:
	                chunks.extend(get_chunks(content, in_change))

	        # Handle standard wrappers recursively
	        elif t in ["Emph", "Underline", "Strong", "Strikeout", "Superscript", "Subscript", "SmallCaps"]:
	            chunks.extend(get_chunks(c, in_change))
	        elif t == "Quoted":
	            # Quoted format: [quote-type, [inlines]]
	            chunks.append({"text": '"', "changed": in_change})
	            chunks.extend(get_chunks(c[1], in_change))
	            chunks.append({"text": '"', "changed": in_change})
	        elif t in ["LineBreak", "SoftBreak"]:
	            chunks.append({"text": " ", "changed": in_change})
	        elif t in ["Link", "Cite", "Image"] and len(c) > 1:
	            # In all three, the displayed inlines are the second item.
	            chunks.extend(get_chunks(c[1], in_change))

	    return chunks

	def process_block(block):
	    """Return the post-change sentences of *block* that carry a tracked change.

	    The block's inlines are flattened to plain text alongside a
	    per-character "changed" mask. Inserted text is flagged directly. Deleted
	    text no longer exists in the result, so each deletion flags the
	    character that follows it (or the last character when it ends the
	    paragraph). The text is then cut into sentences, and a sentence is
	    returned when any of its characters is flagged or when a deletion sits
	    in the whitespace just before it (for the last sentence: also after it).

	    Args:
	        block: A pandoc JSON block node ("Para", "Plain" or "Header").

	    Returns:
	        A list of stripped sentence strings, in paragraph order.
	    """
	    inlines = extract_inlines(block)
	    chunks = get_chunks(inlines)

	    parts = []
	    changed_mask = []
	    deletion_points = []  # text offsets at which deleted content used to sit
	    pending_deletion = False

	    # Build the plaintext pieces and a parallel boolean mask of the same length
	    for chunk in chunks:
	        text = chunk["text"]
	        is_changed = chunk["changed"]

	        if not text:
	            # A 0-length chunk carries no characters; if flagged it is a deletion marker.
	            if is_changed:
	                pending_deletion = True
	                deletion_points.append(len(changed_mask))
	            continue

	        parts.append(text)
	        mask_segment = [is_changed] * len(text)

	        # If the preceding chunk was a deletion, mark the first character of this segment
	        if pending_deletion:
	            mask_segment[0] = True
	            pending_deletion = False

	        changed_mask.extend(mask_segment)

	    paragraph_text = "".join(parts)

	    # If a deletion happened at the very end of a paragraph
	    if pending_deletion and changed_mask:
	        changed_mask[-1] = True

	    # Heuristic sentence terminator: a run of . ! ? followed by whitespace or
	    # end of text, unless it directly follows a single capital letter or a
	    # capital+lowercase pair at a word start (keeps "Mr.", "Dr.", "U.K."
	    # together). Lookbehinds in Python must be fixed-width, hence two of them.
	    # The alternation bar must stay unescaped: "\|" would demand a literal "|".
	    pattern = r'(?<!\b[A-Z][a-z])(?<!\b[A-Z])[.!?]+(?=\s|$)'

	    spans = []
	    start = 0
	    for m in re.finditer(pattern, paragraph_text):
	        end = m.end()
	        spans.append((start, end))
	        start = end
	        # Skip the whitespace separating this sentence from the next one
	        while start < len(paragraph_text) and paragraph_text[start].isspace():
	            start += 1

	    # Add any remaining text as the last sentence (handles paras without ending punctuation)
	    if start < len(paragraph_text):
	        spans.append((start, len(paragraph_text)))

	    changed_sentences = []
	    gap_start = 0
	    last_index = len(spans) - 1
	    for index, (s, e) in enumerate(spans):
	        # The mask catches flagged characters inside the sentence. A deletion
	        # located in the skipped whitespace before the sentence (or, for the
	        # last sentence, at/after its end) flags a character that belongs to
	        # no span, so it is attributed here explicitly.
	        touched = any(changed_mask[s:e]) or any(
	            gap_start <= p < s or (index == last_index and p >= e)
	            for p in deletion_points
	        )
	        gap_start = e

	        sentence_text = paragraph_text[s:e].strip()
	        if sentence_text and touched:
	            changed_sentences.append(sentence_text)

	    return changed_sentences

	def main():
	    """Command-line entry point.

	    Reads the pandoc JSON file named by the first argument, gathers the
	    changed sentences of every text block, and either writes them to the
	    file named by the optional second argument or prints them to stdout.
	    Sentences are separated by a blank line. Exits with status 1 and a
	    usage message on stderr when no input file is given.
	    """
	    if len(sys.argv) < 2:
	        # Show the name the script was invoked as rather than a hard-coded one.
	        prog = sys.argv[0] if sys.argv else "extract_changed_sentences.py"
	        print(f"Usage: {prog} <input.json> [output.txt]", file=sys.stderr)
	        sys.exit(1)

	    input_file = sys.argv[1]
	    with open(input_file, 'r', encoding='utf-8') as f:
	        data = json.load(f)

	    # 1. Locate all text blocks and 2. extract their changed sentences
	    changed_sentences = []
	    for block in find_blocks(data):
	        changed_sentences.extend(process_block(block))

	    # 3. Format as requested (separated by \n\n)
	    output_text = "\n\n".join(changed_sentences)

	    if len(sys.argv) >= 3:
	        output_file = sys.argv[2]
	        with open(output_file, 'w', encoding='utf-8') as f:
	            f.write(output_text)
	        print(f"Extraction complete. Wrote {len(changed_sentences)} sentences to {output_file}.")
	    else:
	        print(output_text)

	# Run the command-line entry point only when executed as a script, not on import.
	if __name__ == "__main__":
	    main()
No results found