Created
March 25, 2026 20:58
-
-
Save zopieux/c00b3831e572040a0febdcec030d584b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
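| Extract the post-change sentences that contain tracked changes from a Word document. | |
| | |
| Usage: | |
| pandoc --track-changes=all doc_with_tracking.docx -o changes.json | |
| ./extract_changed_sentences.py changes.json changed_sentences.txt | |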
| """ | |
| import json | |
| import sys | |
| import re | |
def find_blocks(obj):
    """Recursively collect every text-bearing block in a pandoc JSON tree.

    A block is any dict whose ``"t"`` tag is ``Para``, ``Plain`` or ``Header``.
    Blocks are returned in document (depth-first) order; a block that is
    nested inside another matched block is listed after its parent.

    Args:
        obj: Any node of the decoded JSON document (dict, list or scalar).

    Returns:
        A list of the matching block dicts (the original objects, not copies).
    """
    blocks = []
    if isinstance(obj, dict):
        if obj.get("t") in ("Para", "Plain", "Header"):
            blocks.append(obj)
        # Keep descending even after a match: a footnote is an inline ("Note")
        # that carries its own Para/Plain blocks, so stopping at the enclosing
        # paragraph would hide tracked changes made inside footnotes.
        for value in obj.values():
            blocks.extend(find_blocks(value))
    elif isinstance(obj, list):
        for item in obj:
            blocks.extend(find_blocks(item))
    # Scalars (str, int, None, ...) cannot contain blocks.
    return blocks
def extract_inlines(block):
    """Return the inline-element list stored in a text block.

    ``Para`` and ``Plain`` blocks keep their inlines directly in ``"c"``;
    a ``Header`` keeps them as the third item of ``"c"``. Any other block
    type yields an empty list.
    """
    kind = block.get("t")
    payload = block.get("c")
    if kind == "Header":
        # Header content is laid out as [level, attr, inlines].
        return payload[2]
    if kind in ("Para", "Plain"):
        return payload
    return []
def get_chunks(element, in_change=False):
    """Recursively flatten pandoc inline elements into plain-text chunks.

    Args:
        element: An inline element (dict), a list of inline elements, or
            anything else (which contributes nothing).
        in_change: True when ``element`` sits inside an ``insertion`` span,
            so every chunk produced from it is flagged as changed.

    Returns:
        A list of dicts ``{"text": str, "changed": bool}`` in reading order.
        A deletion is reported as a zero-length chunk with ``changed=True``:
        the deleted text is not part of the post-change document, but the
        caller still needs to know that something changed at that position.
    """
    chunks = []
    if isinstance(element, list):
        for item in element:
            chunks.extend(get_chunks(item, in_change))
        return chunks
    if not isinstance(element, dict):
        return chunks

    t = element.get("t")
    c = element.get("c")
    if t == "Str":
        chunks.append({"text": c, "changed": in_change})
    elif t == "Space":
        chunks.append({"text": " ", "changed": in_change})
    elif t == "Span":
        # Span content is [attr, inlines] with attr = [id, classes, key-values].
        classes = c[0][1] if len(c) > 0 and len(c[0]) > 1 else []
        content = c[1] if len(c) > 1 else []
        if "comment-start" in classes or "comment-end" in classes:
            # With --track-changes=all, review comments arrive as spans whose
            # content is the comment itself. That is reviewer annotation, not
            # document text, so it must not be spliced into the sentences.
            pass
        elif "insertion" in classes:
            chunks.extend(get_chunks(content, True))
        elif "deletion" in classes:
            # The deleted text is gone post-change; emit a 0-length marker
            # flagged as changed so the caller can mark adjacent characters.
            chunks.append({"text": "", "changed": True})
        else:
            chunks.extend(get_chunks(content, in_change))
    # Formatting wrappers whose content is simply a list of inlines.
    elif t in ("Emph", "Underline", "Strong", "Strikeout", "Superscript",
               "Subscript", "SmallCaps"):
        chunks.extend(get_chunks(c, in_change))
    elif t == "Quoted":
        # Quoted content is [quote-type, inlines]; re-add the quote marks.
        chunks.append({"text": '"', "changed": in_change})
        chunks.extend(get_chunks(c[1], in_change))
        chunks.append({"text": '"', "changed": in_change})
    elif t in ("LineBreak", "SoftBreak"):
        chunks.append({"text": " ", "changed": in_change})
    elif t in ("Code", "Math") and len(c) > 1:
        # Code is [attr, text] and Math is [math-type, text]; keep the literal
        # text so the surrounding sentence is not left with a hole.
        chunks.append({"text": c[1], "changed": in_change})
    elif t in ("Link", "Cite", "Image") and len(c) > 1:
        # For all three, the displayed inlines are the second item of "c".
        chunks.extend(get_chunks(c[1], in_change))
    # Any other inline type (e.g. Note, RawInline) contributes no text here.
    return chunks
def process_block(block):
    """Return the post-change sentences of a block that contain tracked changes.

    The block's inlines are flattened into one plain-text string plus a
    parallel boolean mask (one flag per character, True = changed). The text
    is then split into sentences and every sentence whose mask window holds
    at least one True flag is returned.

    Args:
        block: A pandoc block dict (as accepted by ``extract_inlines``).

    Returns:
        A list of stripped sentence strings, in paragraph order.
    """
    chunks = get_chunks(extract_inlines(block))

    # Build the plain text and a mask of exactly the same length.
    text_parts = []
    changed_mask = []
    pending_deletion = False
    for chunk in chunks:
        text = chunk["text"]
        is_changed = chunk["changed"]
        if not text:
            # A zero-length changed chunk is a deletion marker; remember it
            # so the next real character can carry the flag.
            if is_changed:
                pending_deletion = True
            continue
        mask_segment = [is_changed] * len(text)
        if pending_deletion:
            mask_segment[0] = True
            pending_deletion = False
        text_parts.append(text)
        changed_mask.extend(mask_segment)
    # A deletion at the very end of the paragraph has no following character,
    # so flag the last existing one instead.
    if pending_deletion and changed_mask:
        changed_mask[-1] = True
    paragraph_text = "".join(text_parts)

    # Sentence terminator: a run of . ! ? followed by whitespace or end of
    # text, unless preceded by a lone capital or a capital + lowercase pair
    # (avoids splitting on things like Mr., Dr., U.K.).
    # Lookbehinds in Python's re module must be fixed-width.
    pattern = r'(?<!\b[A-Z][a-z])(?<!\b[A-Z])[.!?]+(?=\s|$)'
    spans = []
    start = 0
    for m in re.finditer(pattern, paragraph_text):
        end = m.end()
        spans.append((start, end))
        start = end
        # The next sentence starts after any whitespace following the terminator.
        while start < len(paragraph_text) and paragraph_text[start].isspace():
            start += 1
    # Remaining text without closing punctuation forms the last sentence.
    if start < len(paragraph_text):
        spans.append((start, len(paragraph_text)))

    changed_sentences = []
    last_index = len(spans) - 1
    window_start = 0
    for index, (s, e) in enumerate(spans):
        # Mask windows are contiguous: the whitespace skipped before a
        # sentence counts toward that sentence, and whitespace trailing the
        # final sentence counts toward it. Otherwise a deletion flag that
        # landed on such whitespace would fall outside every sentence and the
        # change would be silently dropped.
        window_end = len(paragraph_text) if index == last_index else e
        has_change = any(changed_mask[window_start:window_end])
        window_start = window_end
        if not has_change:
            continue
        sentence_text = paragraph_text[s:e].strip()
        if sentence_text:
            changed_sentences.append(sentence_text)
    return changed_sentences
def main():
    """Command-line entry point.

    Reads the pandoc JSON file named by the first argument, gathers the
    changed sentences of every text block, and joins them with blank lines.
    The result is written to the file named by the optional second argument,
    or printed to standard output when no second argument is given.
    Exits with status 1 after printing a usage line if no input is supplied.
    """
    args = sys.argv[1:]
    if not args:
        print("Usage: python extract_changes.py <input.json> [output.txt]")
        sys.exit(1)

    with open(args[0], 'r', encoding='utf-8') as source:
        document = json.load(source)

    # Locate every text block, then keep the changed sentences of each one.
    found = [
        sentence
        for block in find_blocks(document)
        for sentence in process_block(block)
    ]
    # Sentences are separated by a blank line in the output.
    report = "\n\n".join(found)

    if len(args) < 2:
        print(report)
        return

    destination = args[1]
    with open(destination, 'w', encoding='utf-8') as sink:
        sink.write(report)
    print(f"Extraction complete. Wrote {len(found)} sentences to {destination}.")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment