Skip to content

Instantly share code, notes, and snippets.

@aarblaster
Last active April 6, 2026 15:27
Show Gist options
  • Select an option

  • Save aarblaster/10d7e3a10fea051e3bd0ea6b50521a00 to your computer and use it in GitHub Desktop.

Select an option

Save aarblaster/10d7e3a10fea051e3bd0ea6b50521a00 to your computer and use it in GitHub Desktop.
A Python script to transform a .vtt file into a usable piece of text.
#!/usr/bin/env python3
# vtt_process.py
# Version: 1.0
#
# Processes Zoom/Teams .vtt transcript files for research use:
# - Removes timecodes and block numbers
# - Merges consecutive same-speaker blocks into paragraphs
# - Replaces speaker names with interview labels (I / R)
# - Applies US → UK spelling substitutions
#
# Usage:
# python3 vtt_process.py <file.vtt> [--self "Speaker Name"]
#
# The speaker named via --self (default: Rose Tyler) is labelled I.
# You can change this default in main() (the default= value of the --self argument).
# All other speakers are labelled R (or R2, R3 if there are multiple).
# Output is written to <file>_edited.txt in the same directory — non-destructive.
#
# Created by Anthony Arblaster on 28 March 2026.
#
# Copyright Anthony Arblaster 2026.
# – Web: https://codebyanthony.com
# – Mastodon: https://mastodonapp.uk/@aarblaster
# – GitHub: https://github.com/aarblaster
#
# MIT Licence.
#
import sys
import re
import argparse
from pathlib import Path
# ─── US → UK spelling map ─────────────────────────────────────────────────────
# Keys are lowercase. Substitution is case-preserving (see preserve_case).
# Maps each US spelling to its UK replacement. apply_spelling looks matched
# words up with SPELLING[word.lower()], so every key must stay lowercase.
# Inflected forms are listed explicitly; only exact whole-word keys are replaced.
SPELLING = {
# -or → -our
'color': 'colour', 'colors': 'colours', 'colored': 'coloured',
'coloring': 'colouring', 'colorful': 'colourful', 'colorfully': 'colourfully',
'honor': 'honour', 'honors': 'honours', 'honored': 'honoured', 'honoring': 'honouring',
'favor': 'favour', 'favors': 'favours', 'favored': 'favoured', 'favoring': 'favouring',
'favorite': 'favourite', 'favorites': 'favourites',
'behavior': 'behaviour', 'behaviors': 'behaviours', 'behavioral': 'behavioural',
'labor': 'labour', 'labors': 'labours', 'labored': 'laboured', 'laboring': 'labouring',
'neighbor': 'neighbour', 'neighbors': 'neighbours',
'neighborhood': 'neighbourhood', 'neighborhoods': 'neighbourhoods',
'humor': 'humour', 'humors': 'humours', 'humored': 'humoured',
'glamor': 'glamour',
'rumor': 'rumour', 'rumors': 'rumours', 'rumored': 'rumoured',
'valor': 'valour',
'vigor': 'vigour',
'odor': 'odour', 'odors': 'odours',
'armor': 'armour', 'armors': 'armours', 'armored': 'armoured',
'flavor': 'flavour', 'flavors': 'flavours', 'flavored': 'flavoured',
# -er → -re
'theater': 'theatre', 'theaters': 'theatres',
'center': 'centre', 'centers': 'centres', 'centered': 'centred', 'centering': 'centring',
'liter': 'litre', 'liters': 'litres',
'fiber': 'fibre', 'fibers': 'fibres',
'caliber': 'calibre',
'somber': 'sombre',
'specter': 'spectre',
'saber': 'sabre',
'maneuver': 'manoeuvre', 'maneuvers': 'manoeuvres',
'maneuvered': 'manoeuvred', 'maneuvering': 'manoeuvring',
# -ize → -ise (and -ization → -isation)
'recognize': 'recognise', 'recognizes': 'recognises',
'recognized': 'recognised', 'recognizing': 'recognising',
'realize': 'realise', 'realizes': 'realises',
'realized': 'realised', 'realizing': 'realising', 'realization': 'realisation',
'organize': 'organise', 'organizes': 'organises',
'organized': 'organised', 'organizing': 'organising',
'organization': 'organisation', 'organizations': 'organisations',
'organizational': 'organisational',
'prioritize': 'prioritise', 'prioritizes': 'prioritises',
'prioritized': 'prioritised', 'prioritizing': 'prioritising',
'emphasize': 'emphasise', 'emphasizes': 'emphasises',
'emphasized': 'emphasised', 'emphasizing': 'emphasising',
'memorize': 'memorise', 'memorizes': 'memorises', 'memorized': 'memorised',
'minimize': 'minimise', 'minimizes': 'minimises', 'minimized': 'minimised',
'maximize': 'maximise', 'maximizes': 'maximises', 'maximized': 'maximised',
'visualize': 'visualise', 'visualizes': 'visualises',
'visualized': 'visualised', 'visualization': 'visualisation',
'utilize': 'utilise', 'utilizes': 'utilises',
'utilized': 'utilised', 'utilization': 'utilisation',
'characterize': 'characterise', 'characterizes': 'characterises',
'characterized': 'characterised', 'characterizing': 'characterising',
'categorize': 'categorise', 'categorizes': 'categorises', 'categorized': 'categorised',
'summarize': 'summarise', 'summarizes': 'summarises', 'summarized': 'summarised',
'normalize': 'normalise', 'normalizes': 'normalises', 'normalized': 'normalised',
'symbolize': 'symbolise', 'symbolizes': 'symbolises', 'symbolized': 'symbolised',
'specialize': 'specialise', 'specializes': 'specialises',
'specialized': 'specialised', 'specialization': 'specialisation',
'generalize': 'generalise', 'generalizes': 'generalises',
'generalized': 'generalised', 'generalization': 'generalisation',
'formalize': 'formalise', 'formalizes': 'formalises', 'formalized': 'formalised',
'standardize': 'standardise', 'standardizes': 'standardises', 'standardized': 'standardised',
'personalize': 'personalise', 'personalizes': 'personalises',
'personalized': 'personalised', 'personalization': 'personalisation',
'optimize': 'optimise', 'optimizes': 'optimises',
'optimized': 'optimised', 'optimization': 'optimisation',
'dramatize': 'dramatise', 'dramatizes': 'dramatises', 'dramatized': 'dramatised',
'theorize': 'theorise', 'theorizes': 'theorises', 'theorized': 'theorised',
'mobilize': 'mobilise', 'mobilizes': 'mobilises', 'mobilized': 'mobilised',
'stabilize': 'stabilise', 'stabilizes': 'stabilises', 'stabilized': 'stabilised',
'modernize': 'modernise', 'modernizes': 'modernises', 'modernized': 'modernised',
'socialize': 'socialise', 'socializes': 'socialises', 'socialized': 'socialised',
'conceptualize': 'conceptualise', 'conceptualizes': 'conceptualises',
'conceptualized': 'conceptualised',
'familiarize': 'familiarise', 'familiarizes': 'familiarises', 'familiarized': 'familiarised',
'idealize': 'idealise', 'idealizes': 'idealises', 'idealized': 'idealised',
'rationalize': 'rationalise', 'rationalizes': 'rationalises', 'rationalized': 'rationalised',
'actualize': 'actualise', 'actualizes': 'actualises', 'actualized': 'actualised',
'localize': 'localise', 'localizes': 'localises', 'localized': 'localised',
'legalize': 'legalise', 'legalizes': 'legalises', 'legalized': 'legalised',
'nationalize': 'nationalise', 'nationalizes': 'nationalises', 'nationalized': 'nationalised',
'neutralize': 'neutralise', 'neutralizes': 'neutralises', 'neutralized': 'neutralised',
'sterilize': 'sterilise', 'sterilizes': 'sterilises', 'sterilized': 'sterilised',
'subsidize': 'subsidise', 'subsidizes': 'subsidises', 'subsidized': 'subsidised',
'vocalize': 'vocalise', 'vocalizes': 'vocalises', 'vocalized': 'vocalised',
'penalize': 'penalise', 'penalizes': 'penalises', 'penalized': 'penalised',
'urbanize': 'urbanise', 'urbanizes': 'urbanises', 'urbanized': 'urbanised',
'liberalize': 'liberalise', 'liberalizes': 'liberalises', 'liberalized': 'liberalised',
'materialize': 'materialise', 'materializes': 'materialises', 'materialized': 'materialised',
'capitalize': 'capitalise', 'capitalizes': 'capitalises', 'capitalized': 'capitalised',
'scrutinize': 'scrutinise', 'scrutinizes': 'scrutinises', 'scrutinized': 'scrutinised',
# -am → -amme (not an -ize word; it just lives in this group).
# NOTE(review): every 'program' is rewritten, including computing contexts
# where UK usage normally keeps 'program' — confirm this is wanted.
'program': 'programme', 'programs': 'programmes',
# -yze → -yse
'analyze': 'analyse', 'analyzes': 'analyses', 'analyzed': 'analysed', 'analyzing': 'analysing',
'paralyze': 'paralyse', 'paralyzes': 'paralyses', 'paralyzed': 'paralysed',
'catalyze': 'catalyse', 'catalyzes': 'catalyses', 'catalyzed': 'catalysed',
# -ense → -ence
'defense': 'defence', 'defenses': 'defences',
'offense': 'offence', 'offenses': 'offences',
'pretense': 'pretence',
# NOTE(review): UK usage keeps 'license' for the verb ('licence' is the noun);
# this entry rewrites every occurrence regardless of part of speech.
'license': 'licence',
# -log → -logue
'dialog': 'dialogue', 'dialogs': 'dialogues',
'catalog': 'catalogue', 'catalogs': 'catalogues', 'cataloged': 'catalogued',
'monolog': 'monologue', 'monologs': 'monologues',
'analog': 'analogue', 'analogs': 'analogues',
# double l → single l (fulfil, enrol, …)
'fulfill': 'fulfil', 'fulfills': 'fulfils', 'fulfillment': 'fulfilment',
'skillful': 'skilful', 'skillfully': 'skilfully',
'willful': 'wilful', 'willfully': 'wilfully',
'enroll': 'enrol', 'enrolls': 'enrols', 'enrollment': 'enrolment',
'instill': 'instil', 'instills': 'instils',
# misc — one-off spellings, plus two vocabulary swaps ('math', 'gotten')
# that change the word itself rather than its spelling.
'pajamas': 'pyjamas',
'gray': 'grey', 'grays': 'greys', 'grayed': 'greyed', 'graying': 'greying',
'aging': 'ageing',
'jewelry': 'jewellery',
'judgment': 'judgement',
'acknowledgment': 'acknowledgement',
'aluminum': 'aluminium',
'math': 'maths',
'gotten': 'got',
'cozy': 'cosy', 'cozier': 'cosier', 'coziest': 'cosiest', 'coziness': 'cosiness',
'skeptic': 'sceptic', 'skeptical': 'sceptical', 'skepticism': 'scepticism',
}
# ─── Helpers ──────────────────────────────────────────────────────────────────
def preserve_case(original, replacement):
    """Mirror the capitalisation of *original* onto *replacement*.

    - An all-uppercase original gives an all-uppercase replacement.
    - An original that merely starts with an uppercase character gives a
      replacement whose first character is uppercased; the rest is untouched.
    - Anything else returns the replacement unchanged.
    """
    if original.isupper():
        return replacement.upper()

    starts_upper = original[0].isupper()
    if not starts_upper:
        return replacement

    first, rest = replacement[0], replacement[1:]
    return first.upper() + rest
def apply_spelling(text):
    """Replace US spellings in *text* with the UK equivalents from SPELLING.

    Matching is whole-word (``\\b`` on both sides) and case-insensitive; the
    capitalisation of each matched word is carried over to its replacement
    by preserve_case.

    Args:
        text: The string to convert.

    Returns:
        A copy of *text* with every matched word substituted.
    """
    # Longest keys first, so the alternation tries e.g. 'coloring' before 'color'.
    keys = sorted(SPELLING, key=len, reverse=True)
    pattern = r'\b(' + '|'.join(re.escape(k) for k in keys) + r')\b'

    def replacer(match):
        word = match.group(0)
        # With re.IGNORECASE on a str pattern, a few non-ASCII letters match
        # ASCII ones (e.g. 'ſ', U+017F, matches 's') yet do not lower-case to
        # them, so the lookup can miss. Leave such a word untouched instead
        # of raising KeyError.
        replacement = SPELLING.get(word.lower())
        if replacement is None:
            return word
        return preserve_case(word, replacement)

    return re.sub(pattern, replacer, text, flags=re.IGNORECASE)
# ─── VTT parsing ──────────────────────────────────────────────────────────────
def parse_vtt(content):
    """
    Parse the text of a VTT file into a list of (speaker, text) tuples.

    Cue blocks are separated by blank lines. Within each block, lines that
    consist only of digits (block numbers) and lines containing '-->'
    (timecodes) are discarded; the remaining lines are joined with spaces.
    A block of the form "Speaker Name: utterance" starts a new entry; a block
    without a "name:" prefix is appended to the previous entry. Such a block
    is dropped if there is no previous entry.

    Args:
        content: The full text of the .vtt file.

    Returns:
        A list of (speaker, text) tuples in file order.
    """
    # Drop a leading byte-order mark (it would stop '^WEBVTT' from matching)
    # and normalise CRLF / CR line endings so blank-line detection works on
    # text that was not read with universal newlines.
    content = content.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')

    # Strip the WEBVTT header line
    content = re.sub(r'^WEBVTT[^\n]*\n', '', content, count=1)

    blocks = []
    # Split on blank lines, treating a line holding only spaces or tabs as
    # blank too. Otherwise two cues separated by such a line are read as one
    # block and the second speaker's words are credited to the first.
    for raw_block in re.split(r'\n\s*\n', content.strip()):
        text_lines = []
        for line in raw_block.splitlines():
            line = line.strip()
            if not line:
                continue
            if re.fullmatch(r'\d+', line):
                continue  # block number
            if '-->' in line:
                continue  # timecode
            text_lines.append(line)
        if not text_lines:
            continue

        text = ' '.join(text_lines)
        # Split "Speaker Name: utterance" at the first colon
        match = re.match(r'^([^:]+):\s*(.+)$', text, re.DOTALL)
        if match:
            blocks.append((match.group(1).strip(), match.group(2).strip()))
        elif blocks:
            # No speaker prefix — treat as continuation of previous speaker
            speaker, prev_text = blocks[-1]
            blocks[-1] = (speaker, prev_text + ' ' + text)
    return blocks
def merge_consecutive_speakers(blocks):
    """Collapse each run of adjacent same-speaker blocks into one paragraph.

    Takes a list of (speaker, text) pairs and returns a new list of
    (speaker, text) tuples in which the texts of neighbouring blocks from the
    same speaker are joined with a single space. An empty input gives [].
    """
    paragraphs = []  # (speaker, [text, text, ...]) per run
    for name, utterance in blocks:
        if paragraphs and paragraphs[-1][0] == name:
            paragraphs[-1][1].append(utterance)
        else:
            paragraphs.append((name, [utterance]))
    return [(name, ' '.join(parts)) for name, parts in paragraphs]
def assign_labels(blocks, self_name):
    """
    Swap each speaker name in *blocks* for a short interview label.

    The speaker equal to *self_name* becomes 'I'. Every other speaker gets a
    label in order of first appearance: 'R' for the first, then 'R2', 'R3', …
    Returns a new list of (label, text) tuples.
    """
    mapping = {}
    respondents = 0
    relabelled = []
    for name, text in blocks:
        if name not in mapping:
            if name == self_name:
                mapping[name] = 'I'
            else:
                respondents += 1
                mapping[name] = f'R{respondents}' if respondents > 1 else 'R'
        relabelled.append((mapping[name], text))
    return relabelled
# ─── Main ─────────────────────────────────────────────────────────────────────
# The default interviewer name is the default= value of the --self argument in main() below.
def main(argv=None):
    """Command-line entry point: convert one .vtt transcript to labelled text.

    Reads the input file, parses it into speaker blocks, merges consecutive
    blocks from the same speaker, replaces names with I / R labels, applies
    the US → UK spelling map, and writes the result to
    ``<name>_edited.txt`` next to the input. Prints the output path and a
    per-label turn count.

    Args:
        argv: Optional list of command-line arguments. When None (the
            default), argparse reads them from sys.argv.

    Exits with status 1 if the input path does not exist or is not a
    regular file.
    """
    parser = argparse.ArgumentParser(
        description='Process a Zoom/Teams .vtt transcript for research use.'
    )
    parser.add_argument('input', help='Path to the .vtt file')
    parser.add_argument(
        '--self',
        dest='self_name',
        default='Rose Tyler',  # Change the default name here so you don't need to input your name every time.
        metavar='NAME',
        help='Speaker name to label as I (default: "Rose Tyler")',
    )
    args = parser.parse_args(argv)

    input_path = Path(args.input).expanduser().resolve()
    if not input_path.exists():
        print(f'Error: file not found: {input_path}', file=sys.stderr)
        sys.exit(1)
    # A directory passes exists() but would crash read_text() with a traceback.
    if not input_path.is_file():
        print(f'Error: not a file: {input_path}', file=sys.stderr)
        sys.exit(1)

    # 'utf-8-sig' decodes plain UTF-8 and also drops a leading byte-order
    # mark, which would otherwise sit in front of the WEBVTT header.
    content = input_path.read_text(encoding='utf-8-sig')

    blocks = parse_vtt(content)
    blocks = merge_consecutive_speakers(blocks)
    blocks = assign_labels(blocks, args.self_name)
    blocks = [(speaker, apply_spelling(text)) for speaker, text in blocks]

    output = '\n\n'.join(f'{speaker}: {text}' for speaker, text in blocks)

    # Strip .vtt and any preceding .transcript suffix before naming the output.
    base = input_path.stem  # e.g. "foo.transcript"
    if base.endswith('.transcript'):
        base = base[: -len('.transcript')]
    output_path = input_path.parent / (base + '_edited.txt')
    output_path.write_text(output + '\n', encoding='utf-8')
    print(f'Written to: {output_path}')

    # Summarise how many merged turns each label has.
    speaker_counts = {}
    for speaker, _ in blocks:
        speaker_counts[speaker] = speaker_counts.get(speaker, 0) + 1
    for speaker, count in sorted(speaker_counts.items()):
        print(f' {speaker}: {count} turn(s)')


if __name__ == '__main__':
    main()
@aarblaster
Copy link
Copy Markdown
Author

aarblaster commented Apr 6, 2026

This gist can be used with a zsh wrapper that makes it easy to run from the terminal; you can see that wrapper at vtt_process.zsh. For more information on using this gist, see my site at phd.anthonyarblaster.com/phd/Transcript-Management/.
I developed this as part of my PhD research. You can read more at phd.anthonyarblaster.com.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment