aarblaster · April 6, 2026 15:27 · aarblaster · Apr 6, 2026
diff --git a/vtt_process.py b/vtt_process.py
 #!/usr/bin/env python3
 # vtt_process.py
 # Version: 1.0
 #
 # Processes Zoom/Teams .vtt transcript files for research use:
 #   - Removes timecodes and block numbers
 #   - Merges consecutive same-speaker blocks into paragraphs
 #   - Replaces speaker names with interview labels (I / R)
 #   - Applies US → UK spelling substitutions
 #
 # Usage:
 #   python3 vtt_process.py <file.vtt> [--self "Speaker Name"]
 #
 # The speaker named via --self (default: Rose Tyler) is labelled I.
 # You can change this default via the `default=` value of the --self argument in main().
 # All other speakers are labelled R (or R2, R3 if there are multiple).
 # Output is written to <file>_edited.txt in the same directory — non-destructive.
 #
 # Created by Anthony Arblaster on 28 March 2026.
 #
 # Copyright Anthony Arblaster 2026.
 #    – Web: https://codebyanthony.com
 #    – Mastodon: https://mastodonapp.uk/@aarblaster
 #    – GitHub: https://github.com/aarblaster
 #
 # MIT Licence.
 #


 import sys
 import re
 import argparse
 from pathlib import Path


 # ─── US → UK spelling map ─────────────────────────────────────────────────────
 # Keys are lowercase. Substitution is case-preserving (see preserve_case).

 SPELLING = {
 	# -our
 	'color': 'colour', 'colors': 'colours', 'colored': 'coloured',
 	'coloring': 'colouring', 'colorful': 'colourful', 'colorfully': 'colourfully',
 	'honor': 'honour', 'honors': 'honours', 'honored': 'honoured', 'honoring': 'honouring',
 	'favor': 'favour', 'favors': 'favours', 'favored': 'favoured', 'favoring': 'favouring',
 	'favorite': 'favourite', 'favorites': 'favourites',
 	'behavior': 'behaviour', 'behaviors': 'behaviours', 'behavioral': 'behavioural',
 	'labor': 'labour', 'labors': 'labours', 'labored': 'laboured', 'laboring': 'labouring',
 	'neighbor': 'neighbour', 'neighbors': 'neighbours',
 	'neighborhood': 'neighbourhood', 'neighborhoods': 'neighbourhoods',
 	'humor': 'humour', 'humors': 'humours', 'humored': 'humoured',
 	'glamor': 'glamour',
 	'rumor': 'rumour', 'rumors': 'rumours', 'rumored': 'rumoured',
 	'valor': 'valour',
 	'vigor': 'vigour',
 	'odor': 'odour', 'odors': 'odours',
 	'armor': 'armour', 'armors': 'armours', 'armored': 'armoured',
 	'flavor': 'flavour', 'flavors': 'flavours', 'flavored': 'flavoured',

 	# -re
 	'theater': 'theatre', 'theaters': 'theatres',
 	'center': 'centre', 'centers': 'centres', 'centered': 'centred', 'centering': 'centring',
 	'liter': 'litre', 'liters': 'litres',
 	'fiber': 'fibre', 'fibers': 'fibres',
 	'caliber': 'calibre',
 	'somber': 'sombre',
 	'specter': 'spectre',
 	'saber': 'sabre',
 	'maneuver': 'manoeuvre', 'maneuvers': 'manoeuvres',
 	'maneuvered': 'manoeuvred', 'maneuvering': 'manoeuvring',

 	# -ise/-ize
 	'recognize': 'recognise', 'recognizes': 'recognises',
 	'recognized': 'recognised', 'recognizing': 'recognising',
 	'realize': 'realise', 'realizes': 'realises',
 	'realized': 'realised', 'realizing': 'realising', 'realization': 'realisation',
 	'organize': 'organise', 'organizes': 'organises',
 	'organized': 'organised', 'organizing': 'organising',
 	'organization': 'organisation', 'organizations': 'organisations',
 	'organizational': 'organisational',
 	'prioritize': 'prioritise', 'prioritizes': 'prioritises',
 	'prioritized': 'prioritised', 'prioritizing': 'prioritising',
 	'emphasize': 'emphasise', 'emphasizes': 'emphasises',
 	'emphasized': 'emphasised', 'emphasizing': 'emphasising',
 	'memorize': 'memorise', 'memorizes': 'memorises', 'memorized': 'memorised',
 	'minimize': 'minimise', 'minimizes': 'minimises', 'minimized': 'minimised',
 	'maximize': 'maximise', 'maximizes': 'maximises', 'maximized': 'maximised',
 	'visualize': 'visualise', 'visualizes': 'visualises',
 	'visualized': 'visualised', 'visualization': 'visualisation',
 	'utilize': 'utilise', 'utilizes': 'utilises',
 	'utilized': 'utilised', 'utilization': 'utilisation',
 	'characterize': 'characterise', 'characterizes': 'characterises',
 	'characterized': 'characterised', 'characterizing': 'characterising',
 	'categorize': 'categorise', 'categorizes': 'categorises', 'categorized': 'categorised',
 	'summarize': 'summarise', 'summarizes': 'summarises', 'summarized': 'summarised',
 	'normalize': 'normalise', 'normalizes': 'normalises', 'normalized': 'normalised',
 	'symbolize': 'symbolise', 'symbolizes': 'symbolises', 'symbolized': 'symbolised',
 	'specialize': 'specialise', 'specializes': 'specialises',
 	'specialized': 'specialised', 'specialization': 'specialisation',
 	'generalize': 'generalise', 'generalizes': 'generalises',
 	'generalized': 'generalised', 'generalization': 'generalisation',
 	'formalize': 'formalise', 'formalizes': 'formalises', 'formalized': 'formalised',
 	'standardize': 'standardise', 'standardizes': 'standardises', 'standardized': 'standardised',
 	'personalize': 'personalise', 'personalizes': 'personalises',
 	'personalized': 'personalised', 'personalization': 'personalisation',
 	'optimize': 'optimise', 'optimizes': 'optimises',
 	'optimized': 'optimised', 'optimization': 'optimisation',
 	'dramatize': 'dramatise', 'dramatizes': 'dramatises', 'dramatized': 'dramatised',
 	'theorize': 'theorise', 'theorizes': 'theorises', 'theorized': 'theorised',
 	'mobilize': 'mobilise', 'mobilizes': 'mobilises', 'mobilized': 'mobilised',
 	'stabilize': 'stabilise', 'stabilizes': 'stabilises', 'stabilized': 'stabilised',
 	'modernize': 'modernise', 'modernizes': 'modernises', 'modernized': 'modernised',
 	'socialize': 'socialise', 'socializes': 'socialises', 'socialized': 'socialised',
 	'conceptualize': 'conceptualise', 'conceptualizes': 'conceptualises',
 	'conceptualized': 'conceptualised',
 	'familiarize': 'familiarise', 'familiarizes': 'familiarises', 'familiarized': 'familiarised',
 	'idealize': 'idealise', 'idealizes': 'idealises', 'idealized': 'idealised',
 	'rationalize': 'rationalise', 'rationalizes': 'rationalises', 'rationalized': 'rationalised',
 	'actualize': 'actualise', 'actualizes': 'actualises', 'actualized': 'actualised',
 	'localize': 'localise', 'localizes': 'localises', 'localized': 'localised',
 	'legalize': 'legalise', 'legalizes': 'legalises', 'legalized': 'legalised',
 	'nationalize': 'nationalise', 'nationalizes': 'nationalises', 'nationalized': 'nationalised',
 	'neutralize': 'neutralise', 'neutralizes': 'neutralises', 'neutralized': 'neutralised',
 	'sterilize': 'sterilise', 'sterilizes': 'sterilises', 'sterilized': 'sterilised',
 	'subsidize': 'subsidise', 'subsidizes': 'subsidises', 'subsidized': 'subsidised',
 	'vocalize': 'vocalise', 'vocalizes': 'vocalises', 'vocalized': 'vocalised',
 	'penalize': 'penalise', 'penalizes': 'penalises', 'penalized': 'penalised',
 	'urbanize': 'urbanise', 'urbanizes': 'urbanises', 'urbanized': 'urbanised',
 	'liberalize': 'liberalise', 'liberalizes': 'liberalises', 'liberalized': 'liberalised',
 	'materialize': 'materialise', 'materializes': 'materialises', 'materialized': 'materialised',
 	'capitalize': 'capitalise', 'capitalizes': 'capitalises', 'capitalized': 'capitalised',
 	'scrutinize': 'scrutinise', 'scrutinizes': 'scrutinises', 'scrutinized': 'scrutinised',
 	'program': 'programme', 'programs': 'programmes',

 	# -yse/-yze
 	'analyze': 'analyse', 'analyzes': 'analyses', 'analyzed': 'analysed', 'analyzing': 'analysing',
 	'paralyze': 'paralyse', 'paralyzes': 'paralyses', 'paralyzed': 'paralysed',
 	'catalyze': 'catalyse', 'catalyzes': 'catalyses', 'catalyzed': 'catalysed',

 	# -ence/-ense
 	'defense': 'defence', 'defenses': 'defences',
 	'offense': 'offence', 'offenses': 'offences',
 	'pretense': 'pretence',
 	'license': 'licence',

 	# -logue/-log
 	'dialog': 'dialogue', 'dialogs': 'dialogues',
 	'catalog': 'catalogue', 'catalogs': 'catalogues', 'cataloged': 'catalogued',
 	'monolog': 'monologue', 'monologs': 'monologues',
 	'analog': 'analogue', 'analogs': 'analogues',

 	# single l / enrol etc.
 	'fulfill': 'fulfil', 'fulfills': 'fulfils', 'fulfillment': 'fulfilment',
 	'skillful': 'skilful', 'skillfully': 'skilfully',
 	'willful': 'wilful', 'willfully': 'wilfully',
 	'enroll': 'enrol', 'enrolls': 'enrols', 'enrollment': 'enrolment',
 	'instill': 'instil', 'instills': 'instils',

 	# misc
 	'pajamas': 'pyjamas',
 	'gray': 'grey', 'grays': 'greys', 'grayed': 'greyed', 'graying': 'greying',
 	'aging': 'ageing',
 	'jewelry': 'jewellery',
 	'judgment': 'judgement',
 	'acknowledgment': 'acknowledgement',
 	'aluminum': 'aluminium',
 	'math': 'maths',
 	'gotten': 'got',
 	'cozy': 'cosy', 'cozier': 'cosier', 'coziest': 'cosiest', 'coziness': 'cosiness',
 	'skeptic': 'sceptic', 'skeptical': 'sceptical', 'skepticism': 'scepticism',
 }


 # ─── Helpers ──────────────────────────────────────────────────────────────────

 def preserve_case(original, replacement):
 	"""
 	Return replacement with the same capitalisation pattern as original.

 	An all-uppercase original gives an all-uppercase result; an original with
 	a leading capital gives a result with a leading capital; anything else
 	returns replacement unchanged. If either string is empty there is no
 	pattern to copy, so replacement is returned as-is.
 	"""
 	# Guard first: indexing [0] on an empty string would raise IndexError.
 	if not original or not replacement:
 		return replacement
 	if original.isupper():
 		return replacement.upper()
 	if original[0].isupper():
 		return replacement[0].upper() + replacement[1:]
 	return replacement


 def apply_spelling(text):
 	"""
 	Replace US spellings in text with their UK equivalents, preserving case.

 	Whole words are matched case-insensitively against the keys of SPELLING;
 	each match is replaced by its mapped value, re-cased by preserve_case.
 	Returns the converted string.
 	"""
 	if not SPELLING:
 		# An empty alternation would match the empty string at every word boundary.
 		return text

 	# Sort by length descending so longer matches (e.g. 'coloring') are tried before shorter ones ('color').
 	keys = sorted(SPELLING, key=len, reverse=True)
 	pattern = r'\b(' + '|'.join(re.escape(k) for k in keys) + r')\b'

 	def replacer(match):
 		word = match.group(0)
 		uk_word = SPELLING.get(word.lower())
 		if uk_word is None:
 			# IGNORECASE also matches a few non-ASCII letters (e.g. 'İ', 'ı', 'ſ')
 			# whose lower() is not the ASCII key; leave such words untouched
 			# rather than raising KeyError.
 			return word
 		return preserve_case(word, uk_word)

 	return re.sub(pattern, replacer, text, flags=re.IGNORECASE)


 # ─── VTT parsing ──────────────────────────────────────────────────────────────

 def parse_vtt(content):
 	"""
 	Parse the text of a VTT file into a list of (speaker, text) tuples.

 	Cue identifiers (numeric block numbers, or any first line sitting directly
 	above a timecode) and timecode lines are discarded. The speaker is read
 	from a WebVTT voice tag ("<v Speaker Name>utterance</v>") or from a
 	"Speaker Name: utterance" prefix. A block with neither is appended to the
 	previous speaker's text, or dropped if no speaker has been seen yet.

 	Returns the tuples in file order; an empty list if nothing could be parsed.
 	"""
 	blocks = []

 	# Drop a leading BOM and normalise CRLF/CR line endings so the header and
 	# the blank-line block separators are recognised whatever wrote the file.
 	content = content.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')

 	# Strip the WEBVTT header line
 	content = re.sub(r'^WEBVTT[^\n]*\n', '', content, count=1)

 	for raw_block in re.split(r'\n{2,}', content.strip()):
 		lines = [line.strip() for line in raw_block.splitlines()]
 		lines = [line for line in lines if line]

 		text_lines = []
 		for index, line in enumerate(lines):
 			if re.match(r'^\d+$', line):
 				continue  # block number
 			if '-->' in line:
 				continue  # timecode
 			if index == 0 and len(lines) > 1 and '-->' in lines[1]:
 				continue  # non-numeric cue identifier directly above the timecode
 			text_lines.append(line)

 		if not text_lines:
 			continue

 		text = ' '.join(text_lines)

 		# WebVTT voice tag: "<v Speaker Name>utterance</v>" (optional ".class" after the v)
 		voice = re.match(r'^<v(?:\.[^\s>]*)?\s+([^>]+)>(.*)$', text, re.DOTALL)
 		if voice:
 			speaker = voice.group(1).strip()
 			utterance = voice.group(2).replace('</v>', '').strip()
 			if speaker and utterance:
 				blocks.append((speaker, utterance))
 			continue

 		# Split "Speaker Name: utterance"
 		match = re.match(r'^([^:]+):\s*(.+)$', text, re.DOTALL)
 		if match:
 			blocks.append((match.group(1).strip(), match.group(2).strip()))
 		elif blocks:
 			# No speaker prefix — treat as continuation of previous speaker
 			speaker, prev_text = blocks[-1]
 			blocks[-1] = (speaker, prev_text + ' ' + text)

 	return blocks


 def merge_consecutive_speakers(blocks):
 	"""
 	Collapse runs of adjacent blocks that share a speaker.

 	Takes a list of (speaker, text) tuples and returns a new list in which each
 	unbroken run by one speaker becomes a single tuple whose text is the run's
 	texts joined by single spaces.
 	"""
 	runs = []  # one (speaker, [fragments]) entry per unbroken run
 	for name, fragment in blocks:
 		if runs and runs[-1][0] == name:
 			runs[-1][1].append(fragment)
 		else:
 			runs.append((name, [fragment]))

 	return [(name, ' '.join(fragments)) for name, fragments in runs]


 def assign_labels(blocks, self_name):
 	"""
 	Swap each speaker name for a short interview label.

 	The speaker equal to self_name becomes 'I'. Every other speaker is numbered
 	in order of first appearance: the first is 'R', later ones 'R2', 'R3', …
 	Returns a new list of (label, text) tuples in the original order.
 	"""
 	label_for = {}
 	respondents = 0
 	relabelled = []

 	for name, utterance in blocks:
 		if name not in label_for:
 			if name == self_name:
 				label_for[name] = 'I'
 			else:
 				respondents += 1
 				label_for[name] = f'R{respondents}' if respondents > 1 else 'R'
 		relabelled.append((label_for[name], utterance))

 	return relabelled


 # ─── Main ─────────────────────────────────────────────────────────────────────
 # The default interviewer name is the `default=` value of the --self argument in main() below.

 def main():
 	"""
 	Command-line entry point: convert one .vtt transcript into an edited .txt file.

 	Parses the file named on the command line into speaker turns, merges
 	consecutive turns by the same speaker, relabels speakers (I / R / R2 …),
 	applies the UK spelling substitutions and writes <file>_edited.txt next to
 	the input. Prints the output path and the number of turns per label.
 	Exits with status 1 if the input is missing, is not a file, or cannot be read.
 	"""
 	parser = argparse.ArgumentParser(
 		description='Process a Zoom/Teams .vtt transcript for research use.'
 	)
 	parser.add_argument('input', help='Path to the .vtt file')
 	parser.add_argument(
 		'--self',
 		dest='self_name',
 		default='Rose Tyler', # Change the default name here so you don't need to input your name every time.
 		metavar='NAME',
 		# %(default)s keeps the help text in step with whatever default is set above.
 		help='Speaker name to label as I (default: "%(default)s")',
 	)
 	args = parser.parse_args()

 	input_path = Path(args.input).expanduser().resolve()
 	if not input_path.exists():
 		print(f'Error: file not found: {input_path}', file=sys.stderr)
 		sys.exit(1)
 	if not input_path.is_file():
 		# A directory passes exists() but cannot be read as text.
 		print(f'Error: not a file: {input_path}', file=sys.stderr)
 		sys.exit(1)

 	try:
 		# utf-8-sig reads plain UTF-8 and also drops a leading BOM if present.
 		content = input_path.read_text(encoding='utf-8-sig')
 	except (OSError, UnicodeDecodeError) as exc:
 		print(f'Error: could not read {input_path}: {exc}', file=sys.stderr)
 		sys.exit(1)

 	blocks = parse_vtt(content)
 	blocks = merge_consecutive_speakers(blocks)
 	# Parsed speaker names are stripped, so strip the requested name to match.
 	blocks = assign_labels(blocks, args.self_name.strip())
 	blocks = [(speaker, apply_spelling(text)) for speaker, text in blocks]

 	if not blocks:
 		print('Warning: no speaker turns found; output will be empty.', file=sys.stderr)

 	output = '\n\n'.join(f'{speaker}: {text}' for speaker, text in blocks)

 	# Strip .vtt and any preceding .transcript suffix before naming the output.
 	base = input_path.stem  # e.g. "foo.transcript"
 	if base.endswith('.transcript'):
 		base = base[: -len('.transcript')]
 	output_path = input_path.parent / (base + '_edited.txt')
 	output_path.write_text(output + '\n', encoding='utf-8')

 	print(f'Written to: {output_path}')
 	speaker_counts = {}
 	for speaker, _ in blocks:
 		speaker_counts[speaker] = speaker_counts.get(speaker, 0) + 1
 	for speaker, count in sorted(speaker_counts.items()):
 		print(f'  {speaker}: {count} turn(s)')


 # Run the converter only when executed as a script, not when imported.
 if __name__ == '__main__':
 	main()
	#!/usr/bin/env python3
	# vtt_process.py
	# Version: 1.0
	#
	# Processes Zoom/Teams .vtt transcript files for research use:
	# - Removes timecodes and block numbers
	# - Merges consecutive same-speaker blocks into paragraphs
	# - Replaces speaker names with interview labels (I / R)
	# - Applies US → UK spelling substitutions
	#
	# Usage:
	# python3 vtt_process.py <file.vtt> [--self "Speaker Name"]
	#
	# The speaker named via --self (default: Rose Tyler) is labelled I.
	# You can change this default via the `default=` value of the --self argument in main().
	# All other speakers are labelled R (or R2, R3 if there are multiple).
	# Output is written to <file>_edited.txt in the same directory — non-destructive.
	#
	# Created by Anthony Arblaster on 28 March 2026.
	#
	# Copyright Anthony Arblaster 2026.
	# – Web: https://codebyanthony.com
	# – Mastodon: https://mastodonapp.uk/@aarblaster
	# – GitHub: https://github.com/aarblaster
	#
	# MIT Licence.
	#


	import sys
	import re
	import argparse
	from pathlib import Path


	# ─── US → UK spelling map ─────────────────────────────────────────────────────
	# Keys are lowercase. Substitution is case-preserving (see preserve_case).

	SPELLING = {
	# -our
	'color': 'colour', 'colors': 'colours', 'colored': 'coloured',
	'coloring': 'colouring', 'colorful': 'colourful', 'colorfully': 'colourfully',
	'honor': 'honour', 'honors': 'honours', 'honored': 'honoured', 'honoring': 'honouring',
	'favor': 'favour', 'favors': 'favours', 'favored': 'favoured', 'favoring': 'favouring',
	'favorite': 'favourite', 'favorites': 'favourites',
	'behavior': 'behaviour', 'behaviors': 'behaviours', 'behavioral': 'behavioural',
	'labor': 'labour', 'labors': 'labours', 'labored': 'laboured', 'laboring': 'labouring',
	'neighbor': 'neighbour', 'neighbors': 'neighbours',
	'neighborhood': 'neighbourhood', 'neighborhoods': 'neighbourhoods',
	'humor': 'humour', 'humors': 'humours', 'humored': 'humoured',
	'glamor': 'glamour',
	'rumor': 'rumour', 'rumors': 'rumours', 'rumored': 'rumoured',
	'valor': 'valour',
	'vigor': 'vigour',
	'odor': 'odour', 'odors': 'odours',
	'armor': 'armour', 'armors': 'armours', 'armored': 'armoured',
	'flavor': 'flavour', 'flavors': 'flavours', 'flavored': 'flavoured',

	# -re
	'theater': 'theatre', 'theaters': 'theatres',
	'center': 'centre', 'centers': 'centres', 'centered': 'centred', 'centering': 'centring',
	'liter': 'litre', 'liters': 'litres',
	'fiber': 'fibre', 'fibers': 'fibres',
	'caliber': 'calibre',
	'somber': 'sombre',
	'specter': 'spectre',
	'saber': 'sabre',
	'maneuver': 'manoeuvre', 'maneuvers': 'manoeuvres',
	'maneuvered': 'manoeuvred', 'maneuvering': 'manoeuvring',

	# -ise/-ize
	'recognize': 'recognise', 'recognizes': 'recognises',
	'recognized': 'recognised', 'recognizing': 'recognising',
	'realize': 'realise', 'realizes': 'realises',
	'realized': 'realised', 'realizing': 'realising', 'realization': 'realisation',
	'organize': 'organise', 'organizes': 'organises',
	'organized': 'organised', 'organizing': 'organising',
	'organization': 'organisation', 'organizations': 'organisations',
	'organizational': 'organisational',
	'prioritize': 'prioritise', 'prioritizes': 'prioritises',
	'prioritized': 'prioritised', 'prioritizing': 'prioritising',
	'emphasize': 'emphasise', 'emphasizes': 'emphasises',
	'emphasized': 'emphasised', 'emphasizing': 'emphasising',
	'memorize': 'memorise', 'memorizes': 'memorises', 'memorized': 'memorised',
	'minimize': 'minimise', 'minimizes': 'minimises', 'minimized': 'minimised',
	'maximize': 'maximise', 'maximizes': 'maximises', 'maximized': 'maximised',
	'visualize': 'visualise', 'visualizes': 'visualises',
	'visualized': 'visualised', 'visualization': 'visualisation',
	'utilize': 'utilise', 'utilizes': 'utilises',
	'utilized': 'utilised', 'utilization': 'utilisation',
	'characterize': 'characterise', 'characterizes': 'characterises',
	'characterized': 'characterised', 'characterizing': 'characterising',
	'categorize': 'categorise', 'categorizes': 'categorises', 'categorized': 'categorised',
	'summarize': 'summarise', 'summarizes': 'summarises', 'summarized': 'summarised',
	'normalize': 'normalise', 'normalizes': 'normalises', 'normalized': 'normalised',
	'symbolize': 'symbolise', 'symbolizes': 'symbolises', 'symbolized': 'symbolised',
	'specialize': 'specialise', 'specializes': 'specialises',
	'specialized': 'specialised', 'specialization': 'specialisation',
	'generalize': 'generalise', 'generalizes': 'generalises',
	'generalized': 'generalised', 'generalization': 'generalisation',
	'formalize': 'formalise', 'formalizes': 'formalises', 'formalized': 'formalised',
	'standardize': 'standardise', 'standardizes': 'standardises', 'standardized': 'standardised',
	'personalize': 'personalise', 'personalizes': 'personalises',
	'personalized': 'personalised', 'personalization': 'personalisation',
	'optimize': 'optimise', 'optimizes': 'optimises',
	'optimized': 'optimised', 'optimization': 'optimisation',
	'dramatize': 'dramatise', 'dramatizes': 'dramatises', 'dramatized': 'dramatised',
	'theorize': 'theorise', 'theorizes': 'theorises', 'theorized': 'theorised',
	'mobilize': 'mobilise', 'mobilizes': 'mobilises', 'mobilized': 'mobilised',
	'stabilize': 'stabilise', 'stabilizes': 'stabilises', 'stabilized': 'stabilised',
	'modernize': 'modernise', 'modernizes': 'modernises', 'modernized': 'modernised',
	'socialize': 'socialise', 'socializes': 'socialises', 'socialized': 'socialised',
	'conceptualize': 'conceptualise', 'conceptualizes': 'conceptualises',
	'conceptualized': 'conceptualised',
	'familiarize': 'familiarise', 'familiarizes': 'familiarises', 'familiarized': 'familiarised',
	'idealize': 'idealise', 'idealizes': 'idealises', 'idealized': 'idealised',
	'rationalize': 'rationalise', 'rationalizes': 'rationalises', 'rationalized': 'rationalised',
	'actualize': 'actualise', 'actualizes': 'actualises', 'actualized': 'actualised',
	'localize': 'localise', 'localizes': 'localises', 'localized': 'localised',
	'legalize': 'legalise', 'legalizes': 'legalises', 'legalized': 'legalised',
	'nationalize': 'nationalise', 'nationalizes': 'nationalises', 'nationalized': 'nationalised',
	'neutralize': 'neutralise', 'neutralizes': 'neutralises', 'neutralized': 'neutralised',
	'sterilize': 'sterilise', 'sterilizes': 'sterilises', 'sterilized': 'sterilised',
	'subsidize': 'subsidise', 'subsidizes': 'subsidises', 'subsidized': 'subsidised',
	'vocalize': 'vocalise', 'vocalizes': 'vocalises', 'vocalized': 'vocalised',
	'penalize': 'penalise', 'penalizes': 'penalises', 'penalized': 'penalised',
	'urbanize': 'urbanise', 'urbanizes': 'urbanises', 'urbanized': 'urbanised',
	'liberalize': 'liberalise', 'liberalizes': 'liberalises', 'liberalized': 'liberalised',
	'materialize': 'materialise', 'materializes': 'materialises', 'materialized': 'materialised',
	'capitalize': 'capitalise', 'capitalizes': 'capitalises', 'capitalized': 'capitalised',
	'scrutinize': 'scrutinise', 'scrutinizes': 'scrutinises', 'scrutinized': 'scrutinised',
	'program': 'programme', 'programs': 'programmes',

	# -yse/-yze
	'analyze': 'analyse', 'analyzes': 'analyses', 'analyzed': 'analysed', 'analyzing': 'analysing',
	'paralyze': 'paralyse', 'paralyzes': 'paralyses', 'paralyzed': 'paralysed',
	'catalyze': 'catalyse', 'catalyzes': 'catalyses', 'catalyzed': 'catalysed',

	# -ence/-ense
	'defense': 'defence', 'defenses': 'defences',
	'offense': 'offence', 'offenses': 'offences',
	'pretense': 'pretence',
	'license': 'licence',

	# -logue/-log
	'dialog': 'dialogue', 'dialogs': 'dialogues',
	'catalog': 'catalogue', 'catalogs': 'catalogues', 'cataloged': 'catalogued',
	'monolog': 'monologue', 'monologs': 'monologues',
	'analog': 'analogue', 'analogs': 'analogues',

	# single l / enrol etc.
	'fulfill': 'fulfil', 'fulfills': 'fulfils', 'fulfillment': 'fulfilment',
	'skillful': 'skilful', 'skillfully': 'skilfully',
	'willful': 'wilful', 'willfully': 'wilfully',
	'enroll': 'enrol', 'enrolls': 'enrols', 'enrollment': 'enrolment',
	'instill': 'instil', 'instills': 'instils',

	# misc
	'pajamas': 'pyjamas',
	'gray': 'grey', 'grays': 'greys', 'grayed': 'greyed', 'graying': 'greying',
	'aging': 'ageing',
	'jewelry': 'jewellery',
	'judgment': 'judgement',
	'acknowledgment': 'acknowledgement',
	'aluminum': 'aluminium',
	'math': 'maths',
	'gotten': 'got',
	'cozy': 'cosy', 'cozier': 'cosier', 'coziest': 'cosiest', 'coziness': 'cosiness',
	'skeptic': 'sceptic', 'skeptical': 'sceptical', 'skepticism': 'scepticism',
	}


	# ─── Helpers ──────────────────────────────────────────────────────────────────

	def preserve_case(original, replacement):
		"""
		Return replacement with the same capitalisation pattern as original.

		An all-uppercase original gives an all-uppercase result; an original with
		a leading capital gives a result with a leading capital; anything else
		returns replacement unchanged. If either string is empty there is no
		pattern to copy, so replacement is returned as-is.
		"""
		# Guard first: indexing [0] on an empty string would raise IndexError.
		if not original or not replacement:
			return replacement
		if original.isupper():
			return replacement.upper()
		if original[0].isupper():
			return replacement[0].upper() + replacement[1:]
		return replacement


	def apply_spelling(text):
		"""
		Replace US spellings in text with their UK equivalents, preserving case.

		Whole words are matched case-insensitively against the keys of SPELLING;
		each match is replaced by its mapped value, re-cased by preserve_case.
		Returns the converted string.
		"""
		if not SPELLING:
			# An empty alternation would match the empty string at every word boundary.
			return text

		# Sort by length descending so longer matches (e.g. 'coloring') are tried before shorter ones ('color').
		# The separator must be a bare '|': an escaped '\|' would be a literal pipe, not alternation.
		keys = sorted(SPELLING, key=len, reverse=True)
		pattern = r'\b(' + '|'.join(re.escape(k) for k in keys) + r')\b'

		def replacer(match):
			word = match.group(0)
			uk_word = SPELLING.get(word.lower())
			if uk_word is None:
				# IGNORECASE also matches a few non-ASCII letters (e.g. 'İ', 'ı', 'ſ')
				# whose lower() is not the ASCII key; leave such words untouched
				# rather than raising KeyError.
				return word
			return preserve_case(word, uk_word)

		return re.sub(pattern, replacer, text, flags=re.IGNORECASE)


	# ─── VTT parsing ──────────────────────────────────────────────────────────────

	def parse_vtt(content):
		"""
		Parse the text of a VTT file into a list of (speaker, text) tuples.

		Cue identifiers (numeric block numbers, or any first line sitting directly
		above a timecode) and timecode lines are discarded. The speaker is read
		from a WebVTT voice tag ("<v Speaker Name>utterance</v>") or from a
		"Speaker Name: utterance" prefix. A block with neither is appended to the
		previous speaker's text, or dropped if no speaker has been seen yet.

		Returns the tuples in file order; an empty list if nothing could be parsed.
		"""
		blocks = []

		# Drop a leading BOM and normalise CRLF/CR line endings so the header and
		# the blank-line block separators are recognised whatever wrote the file.
		content = content.lstrip('\ufeff').replace('\r\n', '\n').replace('\r', '\n')

		# Strip the WEBVTT header line
		content = re.sub(r'^WEBVTT[^\n]*\n', '', content, count=1)

		for raw_block in re.split(r'\n{2,}', content.strip()):
			lines = [line.strip() for line in raw_block.splitlines()]
			lines = [line for line in lines if line]

			text_lines = []
			for index, line in enumerate(lines):
				if re.match(r'^\d+$', line):
					continue  # block number
				if '-->' in line:
					continue  # timecode
				if index == 0 and len(lines) > 1 and '-->' in lines[1]:
					continue  # non-numeric cue identifier directly above the timecode
				text_lines.append(line)

			if not text_lines:
				continue

			text = ' '.join(text_lines)

			# WebVTT voice tag: "<v Speaker Name>utterance</v>" (optional ".class" after the v)
			voice = re.match(r'^<v(?:\.[^\s>]*)?\s+([^>]+)>(.*)$', text, re.DOTALL)
			if voice:
				speaker = voice.group(1).strip()
				utterance = voice.group(2).replace('</v>', '').strip()
				if speaker and utterance:
					blocks.append((speaker, utterance))
				continue

			# Split "Speaker Name: utterance"
			match = re.match(r'^([^:]+):\s*(.+)$', text, re.DOTALL)
			if match:
				blocks.append((match.group(1).strip(), match.group(2).strip()))
			elif blocks:
				# No speaker prefix — treat as continuation of previous speaker
				speaker, prev_text = blocks[-1]
				blocks[-1] = (speaker, prev_text + ' ' + text)

		return blocks


	def merge_consecutive_speakers(blocks):
		"""
		Collapse runs of adjacent blocks that share a speaker.

		Takes a list of (speaker, text) tuples and returns a new list in which each
		unbroken run by one speaker becomes a single tuple whose text is the run's
		texts joined by single spaces.
		"""
		runs = []  # one (speaker, [fragments]) entry per unbroken run
		for name, fragment in blocks:
			if runs and runs[-1][0] == name:
				runs[-1][1].append(fragment)
			else:
				runs.append((name, [fragment]))

		return [(name, ' '.join(fragments)) for name, fragments in runs]


	def assign_labels(blocks, self_name):
		"""
		Swap each speaker name for a short interview label.

		The speaker equal to self_name becomes 'I'. Every other speaker is numbered
		in order of first appearance: the first is 'R', later ones 'R2', 'R3', …
		Returns a new list of (label, text) tuples in the original order.
		"""
		label_for = {}
		respondents = 0
		relabelled = []

		for name, utterance in blocks:
			if name not in label_for:
				if name == self_name:
					label_for[name] = 'I'
				else:
					respondents += 1
					label_for[name] = f'R{respondents}' if respondents > 1 else 'R'
			relabelled.append((label_for[name], utterance))

		return relabelled


	# ─── Main ─────────────────────────────────────────────────────────────────────
	# The default interviewer name is the `default=` value of the --self argument in main() below.

	def main():
		"""
		Command-line entry point: convert one .vtt transcript into an edited .txt file.

		Parses the file named on the command line into speaker turns, merges
		consecutive turns by the same speaker, relabels speakers (I / R / R2 …),
		applies the UK spelling substitutions and writes <file>_edited.txt next to
		the input. Prints the output path and the number of turns per label.
		Exits with status 1 if the input is missing, is not a file, or cannot be read.
		"""
		parser = argparse.ArgumentParser(
			description='Process a Zoom/Teams .vtt transcript for research use.'
		)
		parser.add_argument('input', help='Path to the .vtt file')
		parser.add_argument(
			'--self',
			dest='self_name',
			default='Rose Tyler', # Change the default name here so you don't need to input your name every time.
			metavar='NAME',
			# %(default)s keeps the help text in step with whatever default is set above.
			help='Speaker name to label as I (default: "%(default)s")',
		)
		args = parser.parse_args()

		input_path = Path(args.input).expanduser().resolve()
		if not input_path.exists():
			print(f'Error: file not found: {input_path}', file=sys.stderr)
			sys.exit(1)
		if not input_path.is_file():
			# A directory passes exists() but cannot be read as text.
			print(f'Error: not a file: {input_path}', file=sys.stderr)
			sys.exit(1)

		try:
			# utf-8-sig reads plain UTF-8 and also drops a leading BOM if present.
			content = input_path.read_text(encoding='utf-8-sig')
		except (OSError, UnicodeDecodeError) as exc:
			print(f'Error: could not read {input_path}: {exc}', file=sys.stderr)
			sys.exit(1)

		blocks = parse_vtt(content)
		blocks = merge_consecutive_speakers(blocks)
		# Parsed speaker names are stripped, so strip the requested name to match.
		blocks = assign_labels(blocks, args.self_name.strip())
		blocks = [(speaker, apply_spelling(text)) for speaker, text in blocks]

		if not blocks:
			print('Warning: no speaker turns found; output will be empty.', file=sys.stderr)

		output = '\n\n'.join(f'{speaker}: {text}' for speaker, text in blocks)

		# Strip .vtt and any preceding .transcript suffix before naming the output.
		base = input_path.stem # e.g. "foo.transcript"
		if base.endswith('.transcript'):
			base = base[: -len('.transcript')]
		output_path = input_path.parent / (base + '_edited.txt')
		output_path.write_text(output + '\n', encoding='utf-8')

		print(f'Written to: {output_path}')
		speaker_counts = {}
		for speaker, _ in blocks:
			speaker_counts[speaker] = speaker_counts.get(speaker, 0) + 1
		for speaker, count in sorted(speaker_counts.items()):
			print(f' {speaker}: {count} turn(s)')


	# Run the converter only when executed as a script, not when imported.
	if __name__ == '__main__':
		main()
No results found