pplantinga · May 26, 2025 18:20
diff --git a/segment_textgrid.py b/segment_textgrid.py
 """Segments an audio file based on a TextGrid file from PRAAT, loosely based on the following repo:

 https://github.com/ThiagoCF05/PraatSegmentation

 Assumes you want new files from all "named" segments in the file:

    xmin = {start time}
    xmax = {end time}
    text = "{name}"

 Author: Peter Plantinga
 """
 from scipy.io import wavfile
 from dataclasses import dataclass
 import argparse
 import pathlib
 import re

 # Captures (xmin, xmax, text) as strings for each interval whose text is
 # non-empty; `.+` needs at least one character, so `text = ""` never matches.
 interval_matcher = re.compile(
    r'xmin = (\d+\.?\d*)\s+'
    r'xmax = (\d+\.?\d*)\s+'
    r'text = "(.+)"',
    re.MULTILINE,
 )


 def load_intervals(text):
    """Parse every named interval found in TextGrid text.

    Args:
        text: Contents of a TextGrid file as a single string.

    Returns:
        A list of ``Interval`` objects, one per match of
        ``interval_matcher``, in the order the matches occur in ``text``.
    """
    parsed = []
    for groups in interval_matcher.findall(text):
        parsed.append(Interval.from_match(groups))
    return parsed


 @dataclass
 class Interval:
    """A named stretch of time taken from a TextGrid annotation.

    ``start`` and ``stop`` hold the numbers parsed from the ``xmin`` and
    ``xmax`` fields; ``name`` holds the quoted ``text`` value.
    """
    name: str
    start: float
    stop: float

    @classmethod
    def from_match(cls, match):
        """Build an interval from one ``(xmin, xmax, text)`` match.

        Args:
            match: Indexable holding the start time, end time and name as
                strings, in that order.

        Returns:
            A new ``Interval`` with both times converted to ``float``.
        """
        label = match[2]
        begin = float(match[0])
        end = float(match[1])
        return cls(name=label, start=begin, stop=end)


 @dataclass
 class TextGrid:
    """The named intervals read from one TextGrid file.

    ``name`` is the file stem when the object is built by ``from_file``;
    ``intervals`` is the list produced by ``load_intervals``.
    """
    name: str
    intervals: list[Interval]

    @classmethod
    def from_file(cls, filepath):
        """Load a TextGrid from disk.

        Args:
            filepath: ``pathlib.Path`` of the TextGrid file (``.stem`` is
                read from it).

        Returns:
            A ``TextGrid`` named after the file stem and holding every
            interval that ``load_intervals`` finds in the file text.
        """
        name = filepath.stem

        with open(filepath) as f:
            filetext = f.read()

        return cls(name, load_intervals(filetext))

    def write_segments(self, write_dir, audio, name):
        """Write one .wav file per interval into ``write_dir``.

        Args:
            write_dir: ``pathlib.Path`` of the output directory.
            audio: ``(sample_rate, samples)`` pair, as unpacked below.
            name: Prefix for the output files; each file is called
                ``{name}_{interval name}.wav``.
        """
        fs, samples = audio

        for interval in self.intervals:
            # int() truncates the fractional sample position.
            first = int(interval.start * fs)
            last = int(interval.stop * fs)

            # Append ".wav" to the file name directly. Path.with_suffix()
            # replaces everything after the last dot, which truncated the
            # name whenever the prefix or the interval name contained a dot
            # (e.g. "rec.v2_hello" became "rec.wav").
            # NOTE(review): intervals sharing a name map to the same path;
            # presumably the later one overwrites the earlier - confirm.
            fname = write_dir / f"{name}_{interval.name}.wav"
            wavfile.write(fname, fs, samples[first:last])


 def main(read_dir, write_dir):
    """Segment every .wav file in ``read_dir`` using its sibling .TextGrid.

    Args:
        read_dir: Directory holding ``*.wav`` files, each with a
            ``.TextGrid`` file of the same stem next to it.
        write_dir: Directory that receives the segment files. It is
            created (including parents) when it does not exist yet.

    Raises:
        FileNotFoundError: Propagated from ``TextGrid.from_file`` when a
            .wav file has no matching .TextGrid file.
    """
    read_dir = pathlib.Path(read_dir)
    write_dir = pathlib.Path(write_dir)

    # Writing a segment into a directory that does not exist would fail,
    # so make sure the destination is there before processing anything.
    write_dir.mkdir(parents=True, exist_ok=True)

    for faudio in read_dir.glob("*.wav"):
        print('Reading file', faudio)
        audio = wavfile.read(faudio)
        fgrid = faudio.with_suffix('.TextGrid')
        print('Reading file', fgrid)
        grid = TextGrid.from_file(fgrid)

        print('Writing segments for', len(grid.intervals), 'intervals')
        grid.write_segments(write_dir, audio, faudio.stem)


 if __name__ == '__main__':
    # Command-line entry point: two positional directories, source then destination.
    cli = argparse.ArgumentParser(
        description='Segmentation of a .wav file based on its .TextGrid annotation.',
    )
    cli.add_argument("src", type=str, help="directory with the read files")
    cli.add_argument("dest", type=str, help="directory where the chunks should be written")
    options = cli.parse_args()

    main(options.src, options.dest)
	"""Segments an audio file based on a TextGrid file from PRAAT, loosely based on the following repo:

	https://github.com/ThiagoCF05/PraatSegmentation

	Assumes you want new files from all "named" segments in the file:

	xmin = {start time}
	xmax = {end time}
	text = "{name}"

	Author: Peter Plantinga
	"""
	from scipy.io import wavfile
	from dataclasses import dataclass
	import argparse
	import pathlib
	import re

	# Captures (xmin, xmax, text) as strings for each interval with non-empty
	# text. The fractional part must be `\d*`: with a bare `\d` a value such as
	# "0" or "12.345" could never match, so those intervals were dropped.
	interval_matcher = re.compile(r'xmin = (\d+\.?\d*)\s+xmax = (\d+\.?\d*)\s+text = "(.+)"', re.MULTILINE)


	def load_intervals(text):
	    """Return an ``Interval`` for every match of ``interval_matcher`` in ``text``.

	    Args:
	        text: Contents of a TextGrid file as a single string.

	    Returns:
	        A list of ``Interval`` objects in the order the matches occur.
	    """
	    matches = interval_matcher.findall(text)
	    return [Interval.from_match(m) for m in matches]


	@dataclass
	class Interval:
	    """A named stretch of time taken from a TextGrid annotation.

	    ``start`` and ``stop`` hold the numbers parsed from the ``xmin`` and
	    ``xmax`` fields; ``name`` holds the quoted ``text`` value.
	    """
	    name: str
	    start: float
	    stop: float

	    @classmethod
	    def from_match(cls, match):
	        """Build an interval from one ``(xmin, xmax, text)`` match.

	        Args:
	            match: Indexable holding the start time, end time and name
	                as strings, in that order.

	        Returns:
	            A new ``Interval`` with both times converted to ``float``.
	        """
	        return cls(match[2], float(match[0]), float(match[1]))


	@dataclass
	class TextGrid:
	    """The named intervals read from one TextGrid file.

	    ``name`` is the file stem when the object is built by ``from_file``;
	    ``intervals`` is the list produced by ``load_intervals``.
	    """
	    name: str
	    intervals: list[Interval]

	    @classmethod
	    def from_file(cls, filepath):
	        """Load a TextGrid from disk.

	        Args:
	            filepath: ``pathlib.Path`` of the TextGrid file (``.stem``
	                is read from it).

	        Returns:
	            A ``TextGrid`` named after the file stem and holding every
	            interval that ``load_intervals`` finds in the file text.
	        """
	        name = filepath.stem

	        with open(filepath) as f:
	            filetext = f.read()
	        intervals = load_intervals(filetext)

	        return cls(name, intervals)

	    def write_segments(self, write_dir, audio, name):
	        """Write one .wav file per interval into ``write_dir``.

	        Args:
	            write_dir: ``pathlib.Path`` of the output directory.
	            audio: ``(sample_rate, samples)`` pair, as unpacked below.
	            name: Prefix for the output files; each file is called
	                ``{name}_{interval name}.wav``.
	        """
	        fs, audio = audio

	        for interval in self.intervals:
	            # int() truncates the fractional sample position.
	            s1 = int(interval.start * fs)
	            s2 = int(interval.stop * fs)

	            # Append ".wav" directly instead of using Path.with_suffix(),
	            # which replaces everything after the last dot and so cut
	            # off dotted prefixes or interval names.
	            fname = write_dir / (name + '_' + interval.name + '.wav')
	            wavfile.write(fname, fs, audio[s1:s2])


	def main(read_dir, write_dir):
	    """Segment every .wav file in ``read_dir`` using its sibling .TextGrid.

	    Args:
	        read_dir: Directory holding ``*.wav`` files, each with a
	            ``.TextGrid`` file of the same stem next to it.
	        write_dir: Directory that receives the segment files. It is
	            created (including parents) when it does not exist yet.
	    """
	    read_dir = pathlib.Path(read_dir)
	    write_dir = pathlib.Path(write_dir)

	    # Make sure the destination exists before any segment is written.
	    write_dir.mkdir(parents=True, exist_ok=True)

	    for faudio in read_dir.glob("*.wav"):
	        print('Reading file', faudio)
	        audio = wavfile.read(faudio)
	        fgrid = faudio.with_suffix('.TextGrid')
	        print('Reading file', fgrid)
	        grid = TextGrid.from_file(fgrid)

	        print('Writing segments for', len(grid.intervals), 'intervals')
	        grid.write_segments(write_dir, audio, faudio.stem)


	if __name__ == '__main__':
	    # Command-line entry point: two positional directories, source then destination.
	    parser = argparse.ArgumentParser(description='Segmentation of a .wav file based on its .TextGrid annotation.')
	    parser.add_argument("src", help="directory with the read files", type=str)
	    parser.add_argument("dest", help="directory where the chunks should be written", type=str)
	    args = parser.parse_args()

	    main(args.src, args.dest)