# --- GitHub Gist page residue (kept for provenance, commented out so it is not parsed as code) ---
# Skip to content
#
# Instantly share code, notes, and snippets.
#
# @kenenbek
# Created May 30, 2025 11:54
# Show Gist options
#   • Save kenenbek/97a82255c874977f7ae269faffc73f3d to your computer and use it in GitHub Desktop.
# Save kenenbek/97a82255c874977f7ae269faffc73f3d to your computer and use it in GitHub Desktop.
import json
import os, re, argparse, shutil
import sys
import multiprocessing
from functools import partial
from silero_vad import load_silero_vad, read_audio, get_speech_timestamps
from pydub import AudioSegment
from dotenv import load_dotenv
import google.generativeai as genai
import pandas as pd
from tqdm import tqdm
import random
import csv
import librosa
# Load environment variables from .env file.
# transcribe_audio_gemini later reads GOOGLE_API_KEY from the environment.
load_dotenv()
# Load Silero VAD model once at import time; split_and_save passes this
# instance to get_speech_timestamps.
MODEL = load_silero_vad()
# Gemini model name that depersonalize_one_file passes to
# transcribe_audio_gemini. The trailing comment records another model name.
GOOGLE_ASR_MODEL = "gemini-2.5-flash-preview-04-17" #"gemini-2.5-pro-preview-05-06"
def split_and_save(audio_path, out_dir, min_silence_duration):
    """Split an audio file into speech segments and export each one as WAV.

    Speech regions are detected with the module-level Silero VAD ``MODEL`` on
    a 16 kHz read of the file; the segments themselves are cut from the pydub
    decoding of the same file.

    Args:
        audio_path: Path to the input audio file.
        out_dir: Directory that receives the segments. If it already exists it
            is deleted and recreated, so any previous contents are lost.
        min_silence_duration: Forwarded to ``get_speech_timestamps`` as
            ``min_silence_duration_ms``.

    Returns:
        The segment file names (not full paths) in detection order. Each name
        has the form ``segment_<index>_<start>_<end>.wav`` with a 1-based
        index and the start/end timestamps rounded to two decimals.
    """
    audio = AudioSegment.from_file(audio_path)
    wav = read_audio(audio_path, sampling_rate=16000)
    speech_timestamps = get_speech_timestamps(
        wav, MODEL, return_seconds=True, sampling_rate=16000,
        min_silence_duration_ms=min_silence_duration, threshold=0.5,
    )

    # Always start from an empty directory so segments left over from a
    # previous run are never mixed with the new ones.
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    os.makedirs(out_dir)

    outputs = []
    for idx, ts in enumerate(speech_timestamps, start=1):
        start_s = ts["start"]
        end_s = ts["end"]
        # Build the name once and reuse it for both the export path and the
        # returned list. Reusing the f-string's own quote character inside a
        # replacement field is a SyntaxError before Python 3.12, so the
        # timestamps are pulled into locals first.
        segment_name = f"segment_{idx}_{round(start_s, 2)}_{round(end_s, 2)}.wav"
        # Timestamps are in seconds (return_seconds=True); the pydub slice
        # below is indexed in milliseconds.
        segment = audio[int(start_s * 1000):int(end_s * 1000)]
        segment.export(os.path.join(out_dir, segment_name), format="wav")
        outputs.append(segment_name)
    return outputs
def process_folder(folder_path, min_silence_duration=400):
    """Split every ``.wav`` file directly inside ``folder_path`` into segments.

    For each file ``<name>.wav`` the segments are written by ``split_and_save``
    into the sibling directory ``<folder_path>/<name>``.

    Args:
        folder_path: Directory to scan (not recursive).
        min_silence_duration: Value forwarded to ``split_and_save``. The
            default of 400 matches the default of the ``--msd`` command-line
            option.

    Returns:
        None.
    """
    for file in os.listdir(folder_path):
        # The extension check is case-insensitive, so ".WAV" is accepted too.
        if not file.lower().endswith('.wav'):
            continue
        file_path = os.path.join(folder_path, file)
        base_name = os.path.splitext(file)[0]
        out_dir = os.path.join(folder_path, base_name)
        # split_and_save requires the silence duration; omitting it raised
        # TypeError on the first .wav file.
        split_and_save(file_path, out_dir, min_silence_duration)
# Instruction prompt (in Kyrgyz) sent to Gemini together with each audio
# segment by transcribe_audio_gemini. In summary it tells the model to:
#   * act as a transcription assistant for Kyrgyz-language recordings (with
#     occasional Russian words) from a bank call centre in Kyrgyzstan;
#   * always produce a full transcription and never put a semicolon in the
#     transcribed text;
#   * append ";1" after the transcription when the customer (not the
#     operator) states, or starts to state, personal passport data, and ";0"
#     otherwise;
#   * treat as passport data the *values* of: passport series and number,
#     date of issue, issuing authority, and personal identification number
#     (PIN); a full name or a plain calendar date does not count.
# depersonalize_one_file relies on the ";<flag>" suffix when parsing replies.
PROMPT = """
Сиз кыргыз тилиндеги аудиону транскрипциялоо боюнча жардамчысыз.
Сизге Кыргызстандагы банктын колл-борборунан кыргыз тилинде (кээде орусча сөздөр менен) аудио файлдар жөнөтүлөт.
Сиздин милдеттериңиз:
Ар дайым аудио файлдын толук транскрипциясын жасаңыз.
Транскрипция текстине чекит менен үтүр белгисин кошпоңуз.
Транскрипцияны кылдаттык менен анализдеңиз. Эгерде колдонуучу паспорттук жеке маалыматтарды айтса же айта баштаса,
анда транскрипциядан кийин ;1 деп кошуңуз.
Эгерде жогоруда көрсөтүлгөн паспорттук жеке маалыматтар транскрипцияда кездешпесе, анда транскрипциядан кийин ;0 деп кошуңуз.
Көңүл буруңуз, бул оператор сураган нерсе эмес, а колдонуучу жооп берген нерсе болушу керек.
Бул сөздөрдүн өздөрү эмес, а алардын маанилери паспорттун жеке маалыматы болуп эсептелет:
Паспорттун сериясы жана номери (мисалы, ID1234567, AN0123456)
Паспорттун берилген күнү (мисалы, 01.01.2020)
Паспортту берген орган (мисалы, МКК 211011)
Жарандын жеке идентификациялык номери (ПИН) (мисалы, 20803199000769)
Маанилүү: Аты-жөнү (ФИО) бул контекстте паспорттун жеке маалыматы болуп эсептелбейт.
Жөн эле календардык дата – бул жеке маалыматтар эмес.
"""
def natural_sort_key(s):
    """
    Key function for natural sorting (sorts numbers in strings numerically).
    Example: "file2" < "file10" (unlike lexicographic sorting)

    Args:
        s: The string to build a key for.
    Returns:
        A list alternating lower-cased text pieces and ints, always starting
        and ending with a text piece (possibly empty), e.g.
        "file10" -> ["file", 10, ""].
    """
    # Splitting on a capturing group yields text, digits, text, ... so the odd
    # positions are exactly the ASCII digit runs. Deciding by position instead
    # of str.isdigit() keeps non-ASCII digit characters (e.g. "²") as text:
    # isdigit() accepts them, but int() may reject them, and converting them
    # would put an int where other keys hold a str.
    return [int(piece) if position % 2 else piece.lower()
            for position, piece in enumerate(re.split(r'([0-9]+)', s))]
def listdir_lsv(directory):
    """
    List the entries of `directory` in natural order, the way `ls -1v` does.
    Ordering is delegated to `natural_sort_key`.
    """
    entries = os.listdir(directory)
    entries.sort(key=natural_sort_key)
    return entries
def transcribe_audio_gemini(file_path: str, model: str = "gemini-1.5-pro") -> str:
    """
    Transcribe an audio file using Google's Gemini API and return the transcription.
    Args:
        file_path: Path to the audio file
        model: Model to use for transcription (default: gemini-1.5-pro)
    Returns:
        The text of the model's reply to PROMPT. If anything fails while
        talking to the API, the error is printed and the sentinel string
        "Error;0" is returned instead, so a batch run is not aborted by one
        bad segment.
    Raises:
        ValueError: If API key is missing
        FileNotFoundError: If audio file doesn't exist
    """
    # Check for API key
    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError("GOOGLE_API_KEY not found. Please set it in a .env file.")
    # Check if file exists
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Audio file not found: {file_path}")
    try:
        # Initialize Gemini client
        genai.configure(api_key=api_key)
        model_instance = genai.GenerativeModel(model)
        # Upload file to Gemini (the upload API takes the path directly, so
        # the file does not need to be read into memory here).
        uploaded_file = genai.upload_file(file_path)
        try:
            # Generate transcription
            response = model_instance.generate_content(
                [PROMPT, uploaded_file],
                generation_config={
                    "temperature": 0,
                    "response_mime_type": "text/plain",
                },
            )
            return response.text
        finally:
            # Clean up the uploaded file even when generation fails, so
            # failed requests do not leave uploads behind.
            genai.delete_file(uploaded_file.name)
    except Exception as e:
        # Deliberate best-effort: report the problem and hand back a reply in
        # the "<text>;<flag>" shape that callers parse.
        print(str(e))
        return "Error;0"
def get_audio_duration(file_path):
    """Return the duration of the audio file at ``file_path``.

    Args:
        file_path: Path to the audio file.

    Returns:
        The value reported by ``librosa.get_duration`` for that file
        (documented by librosa as a duration in seconds).
    """
    # librosa 0.10 deprecated the ``filename`` keyword in favour of ``path``.
    return librosa.get_duration(path=file_path)
def _parse_transcription(reply):
    """Split a ``"<text>;<flag>"`` model reply into ``(text, pid)``; malformed replies give ``("Error", 0)``."""
    if reply.count(";") != 1:
        return "Error", 0
    text, flag = reply.split(";")
    # The flag is stripped before it is checked: a reply such as "text;1\n"
    # would otherwise fail the digit test and be treated as "no personal data".
    flag = flag.strip()
    if flag.isascii() and flag.isdigit():
        return text, int(flag)
    return text, 0


def depersonalize_one_file(audio_file, msd):
    """Find the first speech segment in which personal passport data is spoken.

    The file is split into speech segments with ``split_and_save`` (written to
    a directory named like the audio file without its extension). Segments are
    then transcribed one by one with ``transcribe_audio_gemini`` using
    ``GOOGLE_ASR_MODEL``, stopping at the first segment whose reply carries
    the flag ``1``.

    Args:
        audio_file: Path to the audio file to analyse.
        msd: Minimum silence duration forwarded to ``split_and_save``.

    Returns:
        ``(segment_number, start_time)`` of the first flagged segment, both
        taken from the segment's file name, or ``(None, None)`` when no
        segment is flagged.
    """
    base_name = os.path.splitext(os.path.basename(audio_file))[0]
    out_dir = os.path.splitext(audio_file)[0]
    segment_names = split_and_save(audio_file, out_dir=out_dir, min_silence_duration=msd)
    for segment_name in tqdm(segment_names):
        reply = transcribe_audio_gemini(os.path.join(out_dir, segment_name), GOOGLE_ASR_MODEL)
        text, pid = _parse_transcription(reply)
        if pid != 1:
            continue
        # Segment files are named segment_<number>_<start>_<end>.wav.
        name_parts = os.path.splitext(segment_name)[0].split("_")
        segment_number = int(name_parts[1])
        start_time = float(name_parts[2])
        print(base_name, segment_number, text)
        # Later segments are not transcribed once the first hit is found.
        return segment_number, start_time
    return None, None
def process_single_file(folder_path, msd, entry):
    """Run ``depersonalize_one_file`` on one entry, resolved inside ``folder_path``.

    Only the final path component of ``entry`` is used; it is joined onto
    ``folder_path`` to locate the file.

    Returns:
        ``(file_name, segment, start)`` where ``segment`` and ``start`` are
        the two values returned by ``depersonalize_one_file``.
    """
    file_name = os.path.basename(entry)
    located = os.path.join(folder_path, file_name)
    segment, start = depersonalize_one_file(located, msd)
    return file_name, segment, start
def depersonalize_specific_dir(folder_path, msd, specific_files=None):
    """Process a fixed set of files in parallel and write the results to CSV.

    Each entry is handled by ``process_single_file`` in a pool of 6 worker
    processes. The results are written to
    ``label_studio_data_test/deperson_13_files.csv`` with the columns
    ``path`` (file name), ``segment`` and ``start``.

    Args:
        folder_path: Directory in which the files are looked up.
        msd: Minimum silence duration forwarded to the per-file processing.
        specific_files: Optional sequence of file paths or names to process;
            only the final path component of each is used. When omitted, the
            built-in list below is used.

    Returns:
        None.
    """
    if specific_files is None:
        specific_files = ['/data/upload/23/8fe3015a-20250301_2035_0550590092_1231.mp3',
                          '/data/upload/23/d97d6a84-20250301_2032_0507070300_1173.mp3',
                          '/data/upload/23/7dd04129-20250301_2032_0508170821_1199.mp3',
                          '/data/upload/23/ce2e11c4-20250301_2013_0708414823_1120.mp3',
                          '/data/upload/23/374a1d69-1727757709.315706.mp3',
                          '/data/upload/23/1c792137-20250301_2021_0552296662_1173.mp3',
                          '/data/upload/23/53d71024-20250301_2024_0771299991_1262.mp3',
                          '/data/upload/23/a449514a-20250301_2021_0700008240_1227.mp3',
                          '/data/upload/23/0c685918-20250301_2021_0707440688_1231.mp3',
                          '/data/upload/23/c57856b3-20250301_2030_0556555113_1165.mp3',
                          '/data/upload/23/d9ff8da2-20250301_2017_0502721972_1173.mp3',
                          '/data/upload/23/ddb3b480-20250301_2020_0700010510_1165.mp3']

    # Create the output directory before the expensive work starts, so a
    # missing directory cannot make the final CSV write fail.
    output_dir = "label_studio_data_test"
    os.makedirs(output_dir, exist_ok=True)

    # Create a pool of workers
    with multiprocessing.Pool(processes=6) as pool:
        # Create a partial function with the fixed arguments
        process_func = partial(process_single_file, folder_path, msd)
        # Process files in parallel; imap yields results in input order
        results = list(tqdm(
            pool.imap(process_func, specific_files),
            total=len(specific_files),
            desc="Processing files"
        ))

    # Store results in the same order as specific_files
    output = {
        'path': [basename for basename, _, _ in results],
        'segment': [segment for _, segment, _ in results],
        'start': [start for _, _, start in results],
    }
    out_df = pd.DataFrame(output)
    out_df.to_csv(os.path.join(output_dir, 'deperson_13_files.csv'), index=False,
                  quoting=csv.QUOTE_MINIMAL, sep=',')
if __name__ == "__main__":
    # Command-line entry point: --audio is a file or a directory, --msd is the
    # minimum silence duration handed to the processing functions.
    cli = argparse.ArgumentParser(description="Transcribe an audio file.")
    cli.add_argument('--audio', type=str, required=True, help='Path to the audio file or directory')
    cli.add_argument('--msd', type=int, required=False, help='min_silence_duration', default=400)
    options = cli.parse_args()
    print("You passed audio file:", options.audio)
    # A directory goes to the batch routine, anything else is treated as a
    # single audio file.
    handler = depersonalize_specific_dir if os.path.isdir(options.audio) else depersonalize_one_file
    handler(options.audio, options.msd)
# Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment