Last active
April 6, 2026 15:27
-
-
Save aarblaster/10d7e3a10fea051e3bd0ea6b50521a00 to your computer and use it in GitHub Desktop.
A Python script to transform a .vtt file into a usable piece of text.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| # vtt_process.py | |
| # Version: 1.0 | |
| # | |
| # Processes Zoom/Teams .vtt transcript files for research use: | |
| # - Removes timecodes and block numbers | |
| # - Merges consecutive same-speaker blocks into paragraphs | |
| # - Replaces speaker names with interview labels (I / R) | |
| # - Applies US → UK spelling substitutions | |
| # | |
| # Usage: | |
| # python3 vtt_process.py <file.vtt> [--self "Speaker Name"] | |
| # | |
| # The speaker named via --self (default: Rose Tyler) is labelled I. | |
| # You can change this default via the default= value of the --self argument in main(). | |
| # All other speakers are labelled R (or R2, R3 if there are multiple). | |
| # Output is written to <file>_edited.txt in the same directory — non-destructive. | |
| # | |
| # Created by Anthony Arblaster on 28 March 2026. | |
| # | |
| # Copyright Anthony Arblaster 2026. | |
| # – Web: https://codebyanthony.com | |
| # – Mastodon: https://mastodonapp.uk/@aarblaster | |
| # – GitHub: https://github.com/aarblaster | |
| # | |
| # MIT Licence. | |
| # | |
| import sys | |
| import re | |
| import argparse | |
| from pathlib import Path | |
| # ─── US → UK spelling map ───────────────────────────────────────────────────── | |
| # Keys are lowercase. Substitution is case-preserving (see preserve_case). | |
# Lookup table consumed by apply_spelling(): each key is a US spelling and each
# value is its UK replacement. Keys must be lowercase because the lookup is done
# on the lowercased match; the original capitalisation is re-applied afterwards.
#
# NOTE(review): substitutions are whole-word but context-insensitive, so a few
# entries also rewrite usages that are not simple spelling variants — e.g.
# 'program' → 'programme' (computing sense), 'license' → 'licence' (verb),
# 'math' → 'maths' and 'gotten' → 'got' (word choice rather than spelling).
# Confirm these are wanted for verbatim research transcripts.
SPELLING = {
    # -our
    'color': 'colour', 'colors': 'colours', 'colored': 'coloured',
    'coloring': 'colouring', 'colorful': 'colourful', 'colorfully': 'colourfully',
    'honor': 'honour', 'honors': 'honours', 'honored': 'honoured', 'honoring': 'honouring',
    'favor': 'favour', 'favors': 'favours', 'favored': 'favoured', 'favoring': 'favouring',
    'favorite': 'favourite', 'favorites': 'favourites',
    'behavior': 'behaviour', 'behaviors': 'behaviours', 'behavioral': 'behavioural',
    'labor': 'labour', 'labors': 'labours', 'labored': 'laboured', 'laboring': 'labouring',
    'neighbor': 'neighbour', 'neighbors': 'neighbours',
    'neighborhood': 'neighbourhood', 'neighborhoods': 'neighbourhoods',
    'humor': 'humour', 'humors': 'humours', 'humored': 'humoured',
    'glamor': 'glamour',
    'rumor': 'rumour', 'rumors': 'rumours', 'rumored': 'rumoured',
    'valor': 'valour',
    'vigor': 'vigour',
    'odor': 'odour', 'odors': 'odours',
    'armor': 'armour', 'armors': 'armours', 'armored': 'armoured',
    'flavor': 'flavour', 'flavors': 'flavours', 'flavored': 'flavoured',
    # -re
    'theater': 'theatre', 'theaters': 'theatres',
    'center': 'centre', 'centers': 'centres', 'centered': 'centred', 'centering': 'centring',
    'liter': 'litre', 'liters': 'litres',
    'fiber': 'fibre', 'fibers': 'fibres',
    'caliber': 'calibre',
    'somber': 'sombre',
    'specter': 'spectre',
    'saber': 'sabre',
    'maneuver': 'manoeuvre', 'maneuvers': 'manoeuvres',
    'maneuvered': 'manoeuvred', 'maneuvering': 'manoeuvring',
    # -ise/-ize
    'recognize': 'recognise', 'recognizes': 'recognises',
    'recognized': 'recognised', 'recognizing': 'recognising',
    'realize': 'realise', 'realizes': 'realises',
    'realized': 'realised', 'realizing': 'realising', 'realization': 'realisation',
    'organize': 'organise', 'organizes': 'organises',
    'organized': 'organised', 'organizing': 'organising',
    'organization': 'organisation', 'organizations': 'organisations',
    'organizational': 'organisational',
    'prioritize': 'prioritise', 'prioritizes': 'prioritises',
    'prioritized': 'prioritised', 'prioritizing': 'prioritising',
    'emphasize': 'emphasise', 'emphasizes': 'emphasises',
    'emphasized': 'emphasised', 'emphasizing': 'emphasising',
    'memorize': 'memorise', 'memorizes': 'memorises', 'memorized': 'memorised',
    'minimize': 'minimise', 'minimizes': 'minimises', 'minimized': 'minimised',
    'maximize': 'maximise', 'maximizes': 'maximises', 'maximized': 'maximised',
    'visualize': 'visualise', 'visualizes': 'visualises',
    'visualized': 'visualised', 'visualization': 'visualisation',
    'utilize': 'utilise', 'utilizes': 'utilises',
    'utilized': 'utilised', 'utilization': 'utilisation',
    'characterize': 'characterise', 'characterizes': 'characterises',
    'characterized': 'characterised', 'characterizing': 'characterising',
    'categorize': 'categorise', 'categorizes': 'categorises', 'categorized': 'categorised',
    'summarize': 'summarise', 'summarizes': 'summarises', 'summarized': 'summarised',
    'normalize': 'normalise', 'normalizes': 'normalises', 'normalized': 'normalised',
    'symbolize': 'symbolise', 'symbolizes': 'symbolises', 'symbolized': 'symbolised',
    'specialize': 'specialise', 'specializes': 'specialises',
    'specialized': 'specialised', 'specialization': 'specialisation',
    'generalize': 'generalise', 'generalizes': 'generalises',
    'generalized': 'generalised', 'generalization': 'generalisation',
    'formalize': 'formalise', 'formalizes': 'formalises', 'formalized': 'formalised',
    'standardize': 'standardise', 'standardizes': 'standardises', 'standardized': 'standardised',
    'personalize': 'personalise', 'personalizes': 'personalises',
    'personalized': 'personalised', 'personalization': 'personalisation',
    'optimize': 'optimise', 'optimizes': 'optimises',
    'optimized': 'optimised', 'optimization': 'optimisation',
    'dramatize': 'dramatise', 'dramatizes': 'dramatises', 'dramatized': 'dramatised',
    'theorize': 'theorise', 'theorizes': 'theorises', 'theorized': 'theorised',
    'mobilize': 'mobilise', 'mobilizes': 'mobilises', 'mobilized': 'mobilised',
    'stabilize': 'stabilise', 'stabilizes': 'stabilises', 'stabilized': 'stabilised',
    'modernize': 'modernise', 'modernizes': 'modernises', 'modernized': 'modernised',
    'socialize': 'socialise', 'socializes': 'socialises', 'socialized': 'socialised',
    'conceptualize': 'conceptualise', 'conceptualizes': 'conceptualises',
    'conceptualized': 'conceptualised',
    'familiarize': 'familiarise', 'familiarizes': 'familiarises', 'familiarized': 'familiarised',
    'idealize': 'idealise', 'idealizes': 'idealises', 'idealized': 'idealised',
    'rationalize': 'rationalise', 'rationalizes': 'rationalises', 'rationalized': 'rationalised',
    'actualize': 'actualise', 'actualizes': 'actualises', 'actualized': 'actualised',
    'localize': 'localise', 'localizes': 'localises', 'localized': 'localised',
    'legalize': 'legalise', 'legalizes': 'legalises', 'legalized': 'legalised',
    'nationalize': 'nationalise', 'nationalizes': 'nationalises', 'nationalized': 'nationalised',
    'neutralize': 'neutralise', 'neutralizes': 'neutralises', 'neutralized': 'neutralised',
    'sterilize': 'sterilise', 'sterilizes': 'sterilises', 'sterilized': 'sterilised',
    'subsidize': 'subsidise', 'subsidizes': 'subsidises', 'subsidized': 'subsidised',
    'vocalize': 'vocalise', 'vocalizes': 'vocalises', 'vocalized': 'vocalised',
    'penalize': 'penalise', 'penalizes': 'penalises', 'penalized': 'penalised',
    'urbanize': 'urbanise', 'urbanizes': 'urbanises', 'urbanized': 'urbanised',
    'liberalize': 'liberalise', 'liberalizes': 'liberalises', 'liberalized': 'liberalised',
    'materialize': 'materialise', 'materializes': 'materialises', 'materialized': 'materialised',
    'capitalize': 'capitalise', 'capitalizes': 'capitalises', 'capitalized': 'capitalised',
    'scrutinize': 'scrutinise', 'scrutinizes': 'scrutinises', 'scrutinized': 'scrutinised',
    'program': 'programme', 'programs': 'programmes',
    # -yse/-yze
    'analyze': 'analyse', 'analyzes': 'analyses', 'analyzed': 'analysed', 'analyzing': 'analysing',
    'paralyze': 'paralyse', 'paralyzes': 'paralyses', 'paralyzed': 'paralysed',
    'catalyze': 'catalyse', 'catalyzes': 'catalyses', 'catalyzed': 'catalysed',
    # -ence/-ense
    'defense': 'defence', 'defenses': 'defences',
    'offense': 'offence', 'offenses': 'offences',
    'pretense': 'pretence',
    'license': 'licence',
    # -logue/-log
    'dialog': 'dialogue', 'dialogs': 'dialogues',
    'catalog': 'catalogue', 'catalogs': 'catalogues', 'cataloged': 'catalogued',
    'monolog': 'monologue', 'monologs': 'monologues',
    'analog': 'analogue', 'analogs': 'analogues',
    # single l / enrol etc.
    'fulfill': 'fulfil', 'fulfills': 'fulfils', 'fulfillment': 'fulfilment',
    'skillful': 'skilful', 'skillfully': 'skilfully',
    'willful': 'wilful', 'willfully': 'wilfully',
    'enroll': 'enrol', 'enrolls': 'enrols', 'enrollment': 'enrolment',
    'instill': 'instil', 'instills': 'instils',
    # misc
    'pajamas': 'pyjamas',
    'gray': 'grey', 'grays': 'greys', 'grayed': 'greyed', 'graying': 'greying',
    'aging': 'ageing',
    'jewelry': 'jewellery',
    'judgment': 'judgement',
    'acknowledgment': 'acknowledgement',
    'aluminum': 'aluminium',
    'math': 'maths',
    'gotten': 'got',
    'cozy': 'cosy', 'cozier': 'cosier', 'coziest': 'cosiest', 'coziness': 'cosiness',
    'skeptic': 'sceptic', 'skeptical': 'sceptical', 'skepticism': 'scepticism',
}
| # ─── Helpers ────────────────────────────────────────────────────────────────── | |
def preserve_case(original, replacement):
    """
    Return replacement with the same capitalisation pattern as original.

    Three patterns are recognised:
      - original is entirely upper case  → replacement is upper-cased
      - original starts with a capital   → replacement's first letter is capitalised
      - anything else                    → replacement is returned unchanged

    If either string is empty there is no pattern to copy, so replacement is
    returned as-is (previously this raised IndexError).
    """
    # Guard the [0] indexing below against empty input.
    if not original or not replacement:
        return replacement
    if original.isupper():
        return replacement.upper()
    if original[0].isupper():
        return replacement[0].upper() + replacement[1:]
    return replacement
def apply_spelling(text):
    """Swap whole-word US spellings in text for their UK forms, keeping the case pattern."""
    # Longest keys come first in the alternation so that e.g. 'coloring'
    # is tried before 'color'.
    us_words = sorted(SPELLING, key=len, reverse=True)
    alternation = '|'.join(map(re.escape, us_words))
    us_word_re = re.compile(r'\b(' + alternation + r')\b', re.IGNORECASE)

    def to_uk(found):
        # The table is keyed in lowercase; the matched casing is re-applied afterwards.
        us_form = found.group(0)
        uk_form = SPELLING[us_form.lower()]
        return preserve_case(us_form, uk_form)

    return us_word_re.sub(to_uk, text)
| # ─── VTT parsing ────────────────────────────────────────────────────────────── | |
# Teams-style voice span at the start of a cue: <v Speaker Name>utterance</v>
_VOICE_SPAN_RE = re.compile(r'^<v(?:\.[^\s>]+)?\s+([^>]+)>\s*(.*)$', re.DOTALL)
# Any opening/closing voice tag, for stripping leftover markup from the utterance.
_VOICE_TAG_RE = re.compile(r'</?v(?:[.\s][^>]*)?>')
# Zoom-style "Speaker Name: utterance". The lookahead rejects a colon that is
# immediately followed by a digit or slash, so "10:30" or "https://..." inside
# an unprefixed utterance is not mistaken for a speaker prefix.
_SPEAKER_PREFIX_RE = re.compile(r'^([^:]+):(?![/\d])\s*(.+)$', re.DOTALL)


def _cue_payload_lines(raw_block):
    """Return the spoken-text lines of one blank-line-delimited VTT block."""
    lines = [ln.strip() for ln in raw_block.splitlines()]
    lines = [ln for ln in lines if ln]
    timing_at = next((i for i, ln in enumerate(lines) if '-->' in ln), None)
    if timing_at is None:
        # No timecode in this block: only bare block numbers can be discarded safely.
        return [ln for ln in lines if not re.fullmatch(r'\d+', ln)]
    # Everything up to and including the timecode is the cue identifier
    # (numeric for Zoom, GUID-like for Teams) plus the timing line itself.
    return [ln for ln in lines[timing_at + 1:] if '-->' not in ln]


def _split_speaker(text):
    """Split cue text into (speaker, utterance); speaker is None if absent."""
    voice = _VOICE_SPAN_RE.match(text)
    if voice:
        utterance = _VOICE_TAG_RE.sub('', voice.group(2)).strip()
        return voice.group(1).strip(), utterance
    prefixed = _SPEAKER_PREFIX_RE.match(text)
    if prefixed:
        return prefixed.group(1).strip(), prefixed.group(2).strip()
    return None, text


def parse_vtt(content):
    """
    Parse a VTT file into a list of (speaker, text) tuples.

    Cue identifiers (block numbers or Teams GUIDs) and timecode lines are
    discarded. The speaker is taken either from a "Speaker Name: utterance"
    prefix (Zoom) or from a leading <v Speaker Name> voice tag (Teams).
    A cue with no speaker is appended to the previous speaker's text; such a
    cue appearing before any speaker has been seen is dropped.

    A leading byte-order mark and CRLF/CR line endings are tolerated, so the
    function also works on content that was not read with universal newlines.
    """
    # Normalise so the header strip and blank-line split below behave the same
    # regardless of how the caller read the file.
    content = content.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')
    # Strip the WEBVTT header line
    content = re.sub(r'^WEBVTT[^\n]*(?:\n|$)', '', content, count=1)

    blocks = []
    # Blank separator lines may contain stray whitespace, hence \s* rather than \n{2,}.
    for raw_block in re.split(r'\n\s*\n', content.strip()):
        text_lines = _cue_payload_lines(raw_block)
        if not text_lines:
            continue
        speaker, utterance = _split_speaker(' '.join(text_lines))
        if not utterance:
            continue  # e.g. an empty voice span
        if speaker is not None:
            blocks.append((speaker, utterance))
        elif blocks:
            # No speaker prefix — treat as continuation of previous speaker
            prev_speaker, prev_text = blocks[-1]
            blocks[-1] = (prev_speaker, prev_text + ' ' + utterance)
    return blocks
def merge_consecutive_speakers(blocks):
    """Collapse runs of adjacent turns by the same speaker into one paragraph each."""
    if not blocks:
        return []
    first, *rest = blocks
    # Paragraphs are kept as mutable [speaker, text] pairs while being built.
    paragraphs = [list(first)]
    for name, utterance in rest:
        current = paragraphs[-1]
        if name != current[0]:
            paragraphs.append([name, utterance])
            continue
        # Same speaker as the paragraph in progress: extend it with a space.
        current[1] = current[1] + ' ' + utterance
    return list(map(tuple, paragraphs))
def assign_labels(blocks, self_name):
    """
    Replace full speaker names with short labels.
    self_name → I; all others → R, R2, R3 … (numbered in order of first appearance).

    The comparison with self_name ignores letter case and differences in
    spacing, so `--self "rose tyler"` still matches a transcript that says
    "Rose  Tyler". An exact-only match would silently label the interviewer
    as a respondent.

    Returns a new list of (label, text) tuples; blocks is not modified.
    """
    def normalise(name):
        # Collapse runs of whitespace and fold case for a forgiving comparison.
        return ' '.join(name.split()).casefold()

    self_key = normalise(self_name) if self_name is not None else None
    labels = {}
    r_count = 0
    for speaker, _ in blocks:
        if speaker in labels:
            continue
        if self_key is not None and normalise(speaker) == self_key:
            labels[speaker] = 'I'
        else:
            r_count += 1
            labels[speaker] = 'R' if r_count == 1 else f'R{r_count}'
    return [(labels[speaker], text) for speaker, text in blocks]
| # ─── Main ───────────────────────────────────────────────────────────────────── | |
| # The default interviewer name is the default= value of the --self argument in main() below. | |
def main():
    """
    Command-line entry point: convert one .vtt transcript to labelled text.

    Reads the file named on the command line, parses it into speaker turns,
    merges consecutive turns by the same speaker, relabels speakers
    (I / R / R2 …), applies the US → UK spelling map and writes the result to
    <name>_edited.txt next to the input. A per-label turn count is printed
    afterwards.

    Exits with status 1 (message on stderr) if the input does not exist, is
    not a regular file, or cannot be read as UTF-8. Warnings are printed to
    stderr when no turns were found or when no speaker matched --self.
    """
    parser = argparse.ArgumentParser(
        description='Process a Zoom/Teams .vtt transcript for research use.'
    )
    parser.add_argument('input', help='Path to the .vtt file')
    parser.add_argument(
        '--self',
        dest='self_name',
        default='Rose Tyler',  # Change the default name here so you don't need to input your name every time.
        metavar='NAME',
        help='Speaker name to label as I (default: "Rose Tyler")',
    )
    args = parser.parse_args()

    input_path = Path(args.input).expanduser().resolve()
    if not input_path.exists():
        print(f'Error: file not found: {input_path}', file=sys.stderr)
        sys.exit(1)
    if not input_path.is_file():
        # exists() is also true for directories, which read_text() cannot open.
        print(f'Error: not a file: {input_path}', file=sys.stderr)
        sys.exit(1)

    try:
        # utf-8-sig reads plain UTF-8 too, and drops a leading byte-order mark
        # that would otherwise sit in front of the WEBVTT header.
        content = input_path.read_text(encoding='utf-8-sig')
    except (OSError, UnicodeDecodeError) as err:
        print(f'Error: could not read {input_path}: {err}', file=sys.stderr)
        sys.exit(1)

    blocks = parse_vtt(content)
    blocks = merge_consecutive_speakers(blocks)
    blocks = assign_labels(blocks, args.self_name)
    blocks = [(speaker, apply_spelling(text)) for speaker, text in blocks]

    if not blocks:
        print(f'Warning: no speaker turns found in {input_path}', file=sys.stderr)
    elif not any(speaker == 'I' for speaker, _ in blocks):
        # Most likely a typo in --self; without this the mislabelling is silent.
        print(
            f'Warning: no speaker matched "{args.self_name}"; nobody was labelled I.',
            file=sys.stderr,
        )

    output = '\n\n'.join(f'{speaker}: {text}' for speaker, text in blocks)

    # Strip .vtt and any preceding .transcript suffix before naming the output.
    base = input_path.stem  # e.g. "foo.transcript"
    if base.endswith('.transcript'):
        base = base[: -len('.transcript')]
    output_path = input_path.parent / (base + '_edited.txt')
    output_path.write_text(output + '\n', encoding='utf-8')
    print(f'Written to: {output_path}')

    # Summarise how many merged turns each label ended up with.
    speaker_counts = {}
    for speaker, _ in blocks:
        speaker_counts[speaker] = speaker_counts.get(speaker, 0) + 1
    for speaker, count in sorted(speaker_counts.items()):
        print(f'  {speaker}: {count} turn(s)')
# Run the command-line interface only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == '__main__':
    main()
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This gist can be used with a zsh wrapper that makes it easier to run from the terminal. You can see that wrapper at vtt_process.zsh. For more information about using this gist, you can read about it on my site at phd.anthonyarblaster.com/phd/Transcript-Management/.
I developed this as part of my PhD research. You can read more at phd.anthonyarblaster.com.