Created
April 4, 2025 19:43
-
-
Save pplantinga/945839c24d22cb59835aa4a3ebbf235e to your computer and use it in GitHub Desktop.
Segment File from PRAAT TextGrid
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Segments an audio file based on a TextGrid file from PRAAT.

Loosely based on the following repo:
https://github.com/ThiagoCF05/PraatSegmentation

Assumes you want a new file for every "named" segment in the TextGrid file,
i.e. every interval of the form:

    xmin = {start time}
    xmax = {end time}
    text = "{name}"

Author: Peter Plantinga
"""
from textgrid_py3 import TextGrid | |
from scipy.io import wavfile | |
from dataclasses import dataclass | |
import argparse | |
import pathlib | |
import re | |
# Matches one labelled interval of a TextGrid file: an ``xmin = <number>``
# entry, an ``xmax = <number>`` entry and a ``text = "<label>"`` entry,
# separated only by whitespace.  ``.+`` needs at least one character, so
# intervals whose text is empty ("") never match.
interval_matcher = re.compile(
    r'xmin = (\d+\.?\d*)\s+xmax = (\d+\.?\d*)\s+text = "(.+)"',
    re.MULTILINE,
)


def load_intervals(text):
    """Return an ``Interval`` for every labelled interval found in *text*.

    Args:
        text: Contents of a TextGrid file as a single string.

    Returns:
        A list of ``Interval`` objects in the order the matches occur in
        *text*; the list is empty when nothing matches.
    """
    return [
        Interval.from_match(groups)
        for groups in interval_matcher.findall(text)
    ]
@dataclass
class Interval:
    """One labelled stretch of time taken from a TextGrid file.

    Attributes:
        name: Label text of the interval.
        start: Start time, converted from the captured ``xmin`` value.
        stop: End time, converted from the captured ``xmax`` value.
    """

    name: str
    start: float
    stop: float

    @classmethod
    def from_match(cls, match):
        """Build an ``Interval`` from one regex match.

        Args:
            match: Indexable of strings ordered ``(xmin, xmax, text)``, as
                produced by ``interval_matcher.findall``.

        Returns:
            A new ``Interval`` whose times have been converted to ``float``.
        """
        return cls(name=match[2], start=float(match[0]), stop=float(match[1]))
@dataclass
class TextGrid:
    """The labelled intervals parsed from one TextGrid file.

    Attributes:
        name: Stem of the file the intervals were read from.
        intervals: Labelled intervals, in the order they appear in the file.
    """

    name: str
    # Quoted forward reference: the annotation is not evaluated when the
    # class is created, so it is only needed by type checkers.
    intervals: "list[Interval]"

    @classmethod
    def from_file(cls, filepath):
        """Parse *filepath* and return the resulting ``TextGrid``.

        Args:
            filepath: ``pathlib.Path`` of the TextGrid file; its stem becomes
                the ``name`` of the returned object.

        Returns:
            A ``TextGrid`` holding every interval ``load_intervals`` finds.
        """
        name = filepath.stem
        # NOTE(review): the file is decoded with the platform default
        # encoding -- confirm this matches how the TextGrid files were saved.
        with open(filepath) as f:
            filetext = f.read()
        return cls(name, load_intervals(filetext))

    def write_segments(self, write_dir, audio, name):
        """Write one ``.wav`` file per interval into *write_dir*.

        Each file is named ``{name}_{interval.name}.wav``. Intervals that
        share a label map to the same file name, so a later one overwrites
        an earlier one.

        Args:
            write_dir: ``pathlib.Path`` of an existing output directory.
            audio: ``(sample_rate, samples)`` pair as returned by
                ``scipy.io.wavfile.read``.
            name: Prefix for the output file names.
        """
        fs, samples = audio
        for interval in self.intervals:
            # Interval times are multiplied by the sample rate and truncated
            # to get sample indices.
            first = int(interval.start * fs)
            last = int(interval.stop * fs)
            # Append the extension instead of using Path.with_suffix():
            # with_suffix() would replace everything after the last dot in
            # the label (e.g. "Mr. Smith" -> "<name>_Mr.wav").
            fname = write_dir / f"{name}_{interval.name}.wav"
            wavfile.write(fname, fs, samples[first:last])
def main(read_dir, write_dir):
    """Segment every ``.wav`` file in *read_dir* using its TextGrid file.

    For each ``<stem>.wav`` the annotation is expected at
    ``<stem>.TextGrid`` in the same directory. Audio files without an
    annotation are reported and skipped.

    Args:
        read_dir: Directory containing the ``.wav`` / ``.TextGrid`` pairs.
        write_dir: Directory the segment files are written to; it is
            created (including parents) when it does not exist.
    """
    read_dir = pathlib.Path(read_dir)
    write_dir = pathlib.Path(write_dir)
    # The segment writer opens files directly and does not create missing
    # directories, so make sure the destination exists up front.
    write_dir.mkdir(parents=True, exist_ok=True)

    # Take a sorted snapshot of the listing before writing anything: if
    # write_dir is the same directory as read_dir, freshly written segments
    # must not be picked up as new inputs during the run.
    for faudio in sorted(read_dir.glob("*.wav")):
        fgrid = faudio.with_suffix('.TextGrid')
        if not fgrid.is_file():
            # One missing annotation should not abort the whole batch.
            print('Skipping file', faudio, '(no TextGrid found)')
            continue
        print('Reading file', faudio)
        audio = wavfile.read(faudio)
        print('Reading file', fgrid)
        grid = TextGrid.from_file(fgrid)
        print('Writing segments for', len(grid.intervals), 'intervals')
        grid.write_segments(write_dir, audio, faudio.stem)
if __name__ == '__main__':
    # Command-line entry point: two positional directory arguments, handed
    # straight to main().
    cli = argparse.ArgumentParser(
        description='Segmentation of a .wav file based on its .TextGrid annotation.',
    )
    cli.add_argument("src", type=str, help="directory with the read files")
    cli.add_argument("dest", type=str, help="directory where the chunks should be written")
    options = cli.parse_args()
    main(options.src, options.dest)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment