Created
May 30, 2025 11:54
-
-
Save kenenbek/97a82255c874977f7ae269faffc73f3d to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import os, re, argparse, shutil | |
import sys | |
import multiprocessing | |
from functools import partial | |
from silero_vad import load_silero_vad, read_audio, get_speech_timestamps | |
from pydub import AudioSegment | |
from dotenv import load_dotenv | |
import google.generativeai as genai | |
import pandas as pd | |
from tqdm import tqdm | |
import random | |
import csv | |
import librosa | |
# Read settings from a local .env file into the process environment
# (the Gemini API key is looked up there later via os.getenv).
load_dotenv()

# Silero voice-activity-detection model, loaded once when the module is imported.
MODEL = load_silero_vad()

# Gemini model name used for transcription.
# Alternative that was tried: "gemini-2.5-pro-preview-05-06"
GOOGLE_ASR_MODEL = "gemini-2.5-flash-preview-04-17"
def split_and_save(audio_path, out_dir, min_silence_duration):
    """
    Split an audio file into speech segments and write each one as a WAV file.

    Speech regions are detected with the Silero VAD model on a 16 kHz copy of
    the audio (threshold 0.5); the segments themselves are cut from the audio
    as loaded by pydub. `out_dir` is recreated on every call, so anything it
    previously contained is deleted.

    Args:
        audio_path: Path to the source audio file.
        out_dir: Directory that receives the segment files (wiped if it exists).
        min_silence_duration: Forwarded to the VAD as `min_silence_duration_ms`.

    Returns:
        List of segment file names (not full paths) in detection order. Each
        name is `segment_<index>_<start>_<end>.wav`, with a 1-based index and
        start/end times in seconds rounded to 2 decimals.
    """
    audio = AudioSegment.from_file(audio_path)
    wav = read_audio(audio_path, sampling_rate=16000)
    speech_timestamps = get_speech_timestamps(
        wav, MODEL, return_seconds=True, sampling_rate=16000,
        min_silence_duration_ms=min_silence_duration, threshold=0.5
    )

    # Always start from an empty directory so segments left over from an
    # earlier run never mix with the new ones.
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    os.makedirs(out_dir)

    outputs = []
    for idx, ts in enumerate(speech_timestamps, start=1):
        start_s = ts['start']
        end_s = ts['end']
        # Build the name once and use it for both the export and the result.
        # Double quotes outside / no quotes inside keeps this valid on Python
        # versions older than 3.12, which reject reused quotes in f-strings.
        segment_name = f"segment_{idx}_{round(start_s, 2)}_{round(end_s, 2)}.wav"
        # Timestamps are in seconds; the audio is sliced by milliseconds.
        start_ms = int(start_s * 1000)
        end_ms = int(end_s * 1000)
        segment = audio[start_ms:end_ms]
        segment.export(os.path.join(out_dir, segment_name), format='wav')
        outputs.append(segment_name)
    return outputs
def process_folder(folder_path, min_silence_duration=400):
    """
    Split every `.wav` file directly inside `folder_path` into speech segments.

    For a file `<name>.wav` the segments are written to the sub-directory
    `<folder_path>/<name>` (see `split_and_save`, which wipes that directory
    first).

    Args:
        folder_path: Directory to scan (not recursive).
        min_silence_duration: Minimum silence, in milliseconds, forwarded to
            `split_and_save`. Defaults to 400, the same default the command
            line uses for `--msd`.
    """
    for file in os.listdir(folder_path):
        # Extension check is case-insensitive; everything else is ignored.
        if not file.lower().endswith('.wav'):
            continue
        file_path = os.path.join(folder_path, file)
        base_name = os.path.splitext(file)[0]
        out_dir = os.path.join(folder_path, base_name)
        # split_and_save requires the silence threshold; omitting it raised
        # TypeError on the first .wav file.
        split_and_save(file_path, out_dir, min_silence_duration)
# Instruction prompt (in Kyrgyz) sent to Gemini together with each audio segment.
# In short, it tells the model to:
#   * fully transcribe Kyrgyz call-centre audio from a bank (Russian words may occur);
#   * never put a semicolon inside the transcription text itself;
#   * append ";1" after the transcription if the customer states, or starts to
#     state, passport personal data, and ";0" otherwise;
#   * only count what the customer answers, not what the operator asks;
#   * treat as passport personal data the *values* of: passport series and number,
#     date of issue, issuing authority, personal identification number (PIN);
#   * not treat a full name or a plain calendar date as personal data.
# The caller relies on the trailing ";<flag>" convention to parse the reply.
PROMPT = """
Сиз кыргыз тилиндеги аудиону транскрипциялоо боюнча жардамчысыз.
Сизге Кыргызстандагы банктын колл-борборунан кыргыз тилинде (кээде орусча сөздөр менен) аудио файлдар жөнөтүлөт.
Сиздин милдеттериңиз:
Ар дайым аудио файлдын толук транскрипциясын жасаңыз.
Транскрипция текстине чекит менен үтүр белгисин кошпоңуз.
Транскрипцияны кылдаттык менен анализдеңиз. Эгерде колдонуучу паспорттук жеке маалыматтарды айтса же айта баштаса,
анда транскрипциядан кийин ;1 деп кошуңуз.
Эгерде жогоруда көрсөтүлгөн паспорттук жеке маалыматтар транскрипцияда кездешпесе, анда транскрипциядан кийин ;0 деп кошуңуз.
Көңүл буруңуз, бул оператор сураган нерсе эмес, а колдонуучу жооп берген нерсе болушу керек.
Бул сөздөрдүн өздөрү эмес, а алардын маанилери паспорттун жеке маалыматы болуп эсептелет:
Паспорттун сериясы жана номери (мисалы, ID1234567, AN0123456)
Паспорттун берилген күнү (мисалы, 01.01.2020)
Паспортту берген орган (мисалы, МКК 211011)
Жарандын жеке идентификациялык номери (ПИН) (мисалы, 20803199000769)
Маанилүү: Аты-жөнү (ФИО) бул контекстте паспорттун жеке маалыматы болуп эсептелбейт.
Жөн эле календардык дата – бул жеке маалыматтар эмес.
"""
def natural_sort_key(s):
    """
    Key function for natural sorting (sorts numbers in strings numerically).

    Example: "file2" < "file10" (unlike lexicographic sorting)

    Args:
        s: The string to build a sort key for.

    Returns:
        A list alternating lower-cased text pieces (even positions) and
        integers (odd positions), e.g. "file10" -> ['file', 10, ''].
    """
    # Splitting on a capturing group always yields text, digits, text, ...
    # so the odd positions are exactly the ASCII digit runs. Deciding by
    # position instead of str.isdigit() avoids two failure modes: characters
    # such as '²' that isdigit() accepts but int() rejects, and non-ASCII
    # digits turning a text slot into an int (which breaks list comparison).
    parts = re.split(r'([0-9]+)', s)
    return [int(part) if index % 2 else part.lower()
            for index, part in enumerate(parts)]
def listdir_lsv(directory):
    """
    List a directory's entries in natural order, similar to `ls -1v`.

    Args:
        directory: Path of the directory to list.

    Returns:
        The entry names ordered by `natural_sort_key`.
    """
    entries = os.listdir(directory)
    entries.sort(key=natural_sort_key)
    return entries
def transcribe_audio_gemini(file_path: str, model: str = "gemini-1.5-pro") -> str:
    """
    Transcribe an audio file using Google's Gemini API and return the transcription.

    The file is uploaded, sent to the model together with PROMPT, and the
    uploaded copy is deleted again whether or not generation succeeds.

    Args:
        file_path: Path to the WAV audio file
        model: Model to use for transcription (default: gemini-1.5-pro)

    Returns:
        The model's reply text. If anything goes wrong while talking to
        Gemini, the error is printed and the sentinel "Error;0" is returned
        instead of raising.

    Raises:
        ValueError: If API key is missing
        FileNotFoundError: If audio file doesn't exist
    """
    # Check for API key
    api_key = os.getenv("GOOGLE_API_KEY")
    if not api_key:
        raise ValueError("GOOGLE_API_KEY not found. Please set it in a .env file.")

    # Check if file exists
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"Audio file not found: {file_path}")

    try:
        # Initialize Gemini client
        genai.configure(api_key=api_key)
        model_instance = genai.GenerativeModel(model)

        # Upload file to Gemini (the upload reads the file itself, so there is
        # no need to load the audio bytes here).
        uploaded_file = genai.upload_file(file_path)
        try:
            # Generate transcription
            response = model_instance.generate_content(
                [PROMPT, uploaded_file],
                generation_config={
                    "temperature": 0,
                    "response_mime_type": "text/plain"
                }
            )
            return response.text
        finally:
            # Clean up the uploaded file even when generation or reading the
            # reply fails, so failed calls do not leave uploads behind.
            genai.delete_file(uploaded_file.name)
    except Exception as e:
        # Deliberate best-effort: one failed segment must not abort a whole
        # batch, so report the problem and hand back a parseable sentinel.
        print(str(e))
        return "Error;0"
def get_audio_duration(file_path):
    """
    Return the duration of an audio file as reported by `librosa.get_duration`.

    Args:
        file_path: Path to the audio file.

    Returns:
        The duration value returned by librosa (seconds, per librosa's docs).
    """
    try:
        # `path` is the current keyword; `filename` is deprecated in newer
        # librosa releases.
        return librosa.get_duration(path=file_path)
    except TypeError:
        # Older librosa releases only accept the legacy keyword name.
        return librosa.get_duration(filename=file_path)
def _parse_transcription(raw):
    """Split a model reply of the form `<text>;<flag>` into `(text, flag)`.

    Returns `("Error", 0)` when the reply contains no `;` at all, and a flag
    of 0 when the part after the last `;` is not a number.
    """
    # Strip first: replies often end with a newline, and "1\n" would otherwise
    # fail the numeric check and be silently treated as "no personal data".
    # Splitting on the LAST ';' keeps the flag usable even if the model put a
    # stray ';' inside the transcription text.
    text, separator, flag = raw.strip().rpartition(";")
    if not separator:
        return "Error", 0
    flag = flag.strip()
    # isdecimal() (unlike isdigit()) only accepts characters int() can parse.
    return text.strip(), int(flag) if flag.isdecimal() else 0


def depersonalize_one_file(audio_file, msd):
    """
    Find the first speech segment of a recording that contains passport data.

    The audio is split into speech segments (written to a directory named
    after the file without its extension, next to the file). Segments are
    transcribed one by one with Gemini until one is flagged with `;1`; the
    remaining segments are not transcribed.

    Args:
        audio_file: Path to the audio file.
        msd: Minimum silence duration in milliseconds used for splitting.

    Returns:
        Tuple `(segment_number, start_time)` of the first flagged segment,
        taken from its file name (1-based index, start in seconds), or
        `(None, None)` if no segment was flagged.
    """
    base_name = os.path.splitext(os.path.basename(audio_file))[0]
    out_dir = os.path.splitext(audio_file)[0]
    segment_names = split_and_save(audio_file, out_dir=out_dir, min_silence_duration=msd)

    current_segment_num = None
    start_time = None
    for segment_name in tqdm(segment_names):
        filepath = os.path.join(out_dir, segment_name)
        transcribe = transcribe_audio_gemini(filepath, GOOGLE_ASR_MODEL)
        text, pid = _parse_transcription(transcribe)
        if pid == 1:
            # Segment files are named segment_<index>_<start>_<end>.wav, so the
            # index and start time can be read back from the name.
            name_parts = os.path.splitext(segment_name)[0].split("_")
            current_segment_num = int(name_parts[1])
            start_time = float(name_parts[2])
            print(base_name, current_segment_num, text)
            # Only the first flagged segment matters; stop spending API calls.
            break
    return current_segment_num, start_time
def process_single_file(folder_path, msd, entry):
    """
    Run `depersonalize_one_file` for one entry of a batch.

    Only the base name of `entry` is used; the file is looked up inside
    `folder_path`.

    Args:
        folder_path: Directory that holds the audio file.
        msd: Minimum silence duration passed through to the splitter.
        entry: Path (or name) whose base name identifies the file.

    Returns:
        Tuple `(basename, segment, start)` where `segment` and `start` are
        whatever `depersonalize_one_file` returned.
    """
    name = os.path.basename(entry)
    segment, start = depersonalize_one_file(os.path.join(folder_path, name), msd)
    return name, segment, start
def depersonalize_specific_dir(folder_path, msd):
    """
    Process a fixed list of recordings in parallel and write a CSV report.

    Only the base names of the hard-coded paths below are used; each file is
    looked up inside `folder_path` (see `process_single_file`). The report is
    written to `label_studio_data_test/deperson_13_files.csv` with the columns
    `path`, `segment` and `start`, in the same order as the list.

    Args:
        folder_path: Directory that contains the listed audio files.
        msd: Minimum silence duration in milliseconds used for splitting.
    """
    specific_files = ['/data/upload/23/8fe3015a-20250301_2035_0550590092_1231.mp3',
                      '/data/upload/23/d97d6a84-20250301_2032_0507070300_1173.mp3',
                      '/data/upload/23/7dd04129-20250301_2032_0508170821_1199.mp3',
                      '/data/upload/23/ce2e11c4-20250301_2013_0708414823_1120.mp3',
                      '/data/upload/23/374a1d69-1727757709.315706.mp3',
                      '/data/upload/23/1c792137-20250301_2021_0552296662_1173.mp3',
                      '/data/upload/23/53d71024-20250301_2024_0771299991_1262.mp3',
                      '/data/upload/23/a449514a-20250301_2021_0700008240_1227.mp3',
                      '/data/upload/23/0c685918-20250301_2021_0707440688_1231.mp3',
                      '/data/upload/23/c57856b3-20250301_2030_0556555113_1165.mp3',
                      '/data/upload/23/d9ff8da2-20250301_2017_0502721972_1173.mp3',
                      '/data/upload/23/ddb3b480-20250301_2020_0700010510_1165.mp3']

    # Make sure the report directory exists before any work starts; otherwise
    # a missing directory only surfaces in to_csv after every file has already
    # been split and transcribed.
    report_dir = "label_studio_data_test"
    os.makedirs(report_dir, exist_ok=True)

    # Create a partial function with the fixed arguments
    process_func = partial(process_single_file, folder_path, msd)

    # Create a pool of workers
    with multiprocessing.Pool(processes=6) as pool:
        # imap yields results in input order, so rows line up with specific_files.
        results = list(tqdm(
            pool.imap(process_func, specific_files),
            total=len(specific_files),
            desc="Processing files"
        ))

    output = {
        'path': [basename for basename, _, _ in results],
        'segment': [segment for _, segment, _ in results],
        'start': [start for _, _, start in results],
    }
    out_df = pd.DataFrame(output)
    out_df.to_csv(os.path.join(report_dir, 'deperson_13_files.csv'), index=False,
                  quoting=csv.QUOTE_MINIMAL, sep=',')
if __name__ == "__main__":
    # Command-line entry point: --audio may be a single file or a directory.
    cli = argparse.ArgumentParser(description="Transcribe an audio file.")
    cli.add_argument('--audio', type=str, required=True, help='Path to the audio file or directory')
    cli.add_argument('--msd', type=int, required=False, help='min_silence_duration', default=400)
    args = cli.parse_args()

    print("You passed audio file:", args.audio)

    # A directory triggers the batch run; anything else is handled as one file.
    handler = depersonalize_specific_dir if os.path.isdir(args.audio) else depersonalize_one_file
    handler(args.audio, args.msd)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment