Skip to content

Instantly share code, notes, and snippets.

@zopieux
Created March 25, 2026 20:58
Show Gist options
  • Select an option

  • Save zopieux/c00b3831e572040a0febdcec030d584b to your computer and use it in GitHub Desktop.

Select an option

Save zopieux/c00b3831e572040a0febdcec030d584b to your computer and use it in GitHub Desktop.
"""
pandoc --track-changes=all doc_with_tracking.docx -o changes.json
./extract_changed_sentences.py changes.json changed_sentences.txt
"""
import json
import sys
import re
def find_blocks(obj):
    """Recursively collect every text-bearing block in a pandoc JSON tree.

    A text-bearing block is a dict whose ``"t"`` tag is ``Para``, ``Plain``
    or ``Header``.  Blocks are returned in document (pre-order) order.

    The search keeps descending *into* a matched block as well: a ``Note``
    (footnote) inline embedded in a paragraph carries its own list of
    blocks, and those paragraphs can hold tracked changes too.  Stopping at
    the outer paragraph would silently drop them.

    Args:
        obj: Any node of the decoded JSON document (dict, list or scalar).

    Returns:
        A list of the block dicts found under ``obj`` (possibly empty).
    """
    blocks = []
    if isinstance(obj, dict):
        # Only grab standard blocks that contain inline text
        if obj.get("t") in ("Para", "Plain", "Header"):
            blocks.append(obj)
        # Descend in every case so that blocks nested in footnotes are found;
        # scalar values (tags, strings, numbers) simply yield nothing.
        for value in obj.values():
            blocks.extend(find_blocks(value))
    elif isinstance(obj, list):
        for item in obj:
            blocks.extend(find_blocks(item))
    return blocks
def extract_inlines(block):
    """Return the inline-element list stored in a text block.

    ``Para`` and ``Plain`` blocks keep their inlines directly under ``"c"``;
    a ``Header`` keeps them as the third entry of ``"c"``.  Any other block
    type yields an empty list.
    """
    kind = block.get("t")
    payload = block.get("c")
    if kind == "Header":
        # Header payload layout: [level, attr, [inlines]]
        return payload[2]
    if kind in ("Para", "Plain"):
        return payload
    return []
def get_chunks(element, in_change=False):
    """Flatten pandoc inline elements into a list of text chunks.

    Args:
        element: An inline element (dict), a list of inline elements, or
            anything else (which contributes no chunks).
        in_change: True when ``element`` sits inside a tracked insertion;
            propagated to every chunk produced below it.

    Returns:
        A list of dicts ``{"text": str, "changed": bool}`` in reading order.
        A tracked deletion is represented by a single zero-length chunk with
        ``changed=True``: the deleted text is not part of the post-change
        paragraph, but its position still has to be remembered.
    """
    chunks = []
    if isinstance(element, list):
        for item in element:
            chunks.extend(get_chunks(item, in_change))
    elif isinstance(element, dict):
        t = element.get("t")
        c = element.get("c")
        if t == "Str":
            chunks.append({"text": c, "changed": in_change})
        elif t == "Space":
            chunks.append({"text": " ", "changed": in_change})
        elif t == "Span":
            # Span payload: [attr, inlines] with attr = [id, classes, key-values]
            classes = c[0][1] if len(c) > 0 and len(c[0]) > 1 else []
            content = c[1] if len(c) > 1 else []
            if "insertion" in classes:
                chunks.extend(get_chunks(content, True))
            elif "deletion" in classes:
                # Deletions mean the sentence was changed, but the text is gone post-change.
                # We append a 0-length chunk flagged as 'changed' so we can mark adjacent chars.
                chunks.append({"text": "", "changed": True})
            else:
                chunks.extend(get_chunks(content, in_change))
        # Handle standard wrappers recursively.  "Underline" is included so
        # that underlined text is not silently dropped from the sentence.
        elif t in ("Emph", "Underline", "Strong", "Strikeout", "Superscript",
                   "Subscript", "SmallCaps"):
            chunks.extend(get_chunks(c, in_change))
        elif t == "Quoted":
            # Quoted payload: [quote-type, inlines]
            chunks.append({"text": '"', "changed": in_change})
            chunks.extend(get_chunks(c[1], in_change))
            chunks.append({"text": '"', "changed": in_change})
        elif t in ("LineBreak", "SoftBreak"):
            chunks.append({"text": " ", "changed": in_change})
        elif t in ("Link", "Cite", "Image") and len(c) > 1:
            # The visible inlines are the second payload entry for all three.
            chunks.extend(get_chunks(c[1], in_change))
        elif t in ("Code", "Math") and len(c) > 1:
            # Code is [attr, text] and Math is [math-type, text]: keep the
            # literal text so the surrounding sentence has no hole in it.
            chunks.append({"text": c[1], "changed": in_change})
    return chunks
def process_block(block):
    """Return the post-change sentences of a block that contain tracked changes.

    The block's inlines are flattened into a plain-text paragraph plus a
    parallel per-character "changed" mask.  The paragraph is then split into
    sentences, and every sentence whose mask contains a True is reported.

    Inserted text is flagged character by character.  A deletion leaves no
    text behind, so it is anchored on the first non-whitespace character
    that follows it (or, at the end of the paragraph, on the last
    non-whitespace character).  Anchoring on non-whitespace matters: the
    whitespace between two sentences belongs to neither sentence span, so a
    marker placed there would be lost.

    Args:
        block: A ``Para``, ``Plain`` or ``Header`` block dict.

    Returns:
        A list of stripped sentence strings, in paragraph order.
    """
    inlines = extract_inlines(block)
    chunks = get_chunks(inlines)

    text_parts = []
    changed_mask = []
    pending_deletion = False
    # Build the plaintext string and a parallel boolean mask of the same length
    for chunk in chunks:
        text = chunk["text"]
        is_changed = chunk["changed"]
        if not text:
            # A 0-length changed chunk is a deletion marker; a 0-length
            # unchanged chunk carries no information at all.
            if is_changed:
                pending_deletion = True
            continue
        mask_segment = [is_changed] * len(text)
        if pending_deletion:
            visible = text.lstrip()
            if visible:
                # Index of the first non-whitespace character of this chunk.
                mask_segment[len(text) - len(visible)] = True
                pending_deletion = False
            # Otherwise the chunk is pure whitespace: keep the deletion
            # pending so it lands on real sentence text.
        text_parts.append(text)
        changed_mask.extend(mask_segment)
    paragraph_text = "".join(text_parts)

    # If a deletion happened at the very end of a paragraph, attach it to the
    # last visible character (nothing to attach to if there is no text left).
    if pending_deletion:
        visible_length = len(paragraph_text.rstrip())
        if visible_length:
            changed_mask[visible_length - 1] = True

    # Sentence terminator: a run of . ! ? followed by whitespace or the end
    # of the text.  The lookbehinds refuse to split right after a single
    # capital letter ("U.K.") or a capital + lowercase pair ("Mr.", "Dr.");
    # as a side effect, two-letter capitalised words such as "It." do not end
    # a sentence either.
    # Note: Regex lookbehinds in Python must be fixed-width
    pattern = r'(?<!\b[A-Z][a-z])(?<!\b[A-Z])[.!?]+(?=\s|$)'
    spans = []
    start = 0
    for m in re.finditer(pattern, paragraph_text):
        end = m.end()
        spans.append((start, end))
        start = end
        # Skip trailing whitespaces for the start of the next sentence
        while start < len(paragraph_text) and paragraph_text[start].isspace():
            start += 1
    # Add any remaining text as the last sentence (handles paras without ending punctuation)
    if start < len(paragraph_text):
        spans.append((start, len(paragraph_text)))

    changed_sentences = []
    for s, e in spans:
        sentence_text = paragraph_text[s:e].strip()
        if not sentence_text:
            continue
        # If any character in this sentence's mask is True, it has a tracked change
        if any(changed_mask[s:e]):
            changed_sentences.append(sentence_text)
    return changed_sentences
def main():
    """Command-line entry point.

    Reads the JSON document named by the first argument, gathers the changed
    sentences of every text block, and emits them separated by blank lines:
    into the file named by the optional second argument, or to standard
    output when no second argument is given.  Exits with status 1 after
    printing a usage line if no input file is supplied.
    """
    argv = sys.argv
    if len(argv) < 2:
        print("Usage: python extract_changes.py <input.json> [output.txt]")
        sys.exit(1)

    with open(argv[1], 'r', encoding='utf-8') as source:
        document = json.load(source)

    # Locate every text block first, then pull the changed sentences out of
    # each one, preserving document order.
    text_blocks = find_blocks(document)
    collected = [
        sentence
        for text_block in text_blocks
        for sentence in process_block(text_block)
    ]

    # Sentences are separated by a blank line (\n\n).
    rendered = "\n\n".join(collected)

    if len(argv) < 3:
        print(rendered)
        return

    destination = argv[2]
    with open(destination, 'w', encoding='utf-8') as sink:
        sink.write(rendered)
    print(f"Extraction complete. Wrote {len(collected)} sentences to {destination}.")


# Run the extraction only when executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment