Transcribe audio using OpenAI
import argparse
import os

from openai import OpenAI
from pydub import AudioSegment


def chunk_audio_by_size(audio, max_size_bytes=25000000, target_duration_ms=60000):
    """
    Splits an audio file into smaller chunks based on a maximum size limit.

    Args:
        audio (AudioSegment): Audio file as a pydub AudioSegment.
        max_size_bytes (int): Maximum size of each chunk in bytes.
        target_duration_ms (int): Target duration of chunks in milliseconds.

    Returns:
        list of AudioSegment: List of audio chunks.
    """
    chunks = []
    start = 0
    while start < len(audio):
        end = start + target_duration_ms
        chunk = audio[start:end]
        # Ensure chunk size is within the limit
        while len(chunk.raw_data) > max_size_bytes and target_duration_ms > 1000:
            target_duration_ms //= 2  # Halve the duration to reduce size
            chunk = audio[start:start + target_duration_ms]
        if len(chunk.raw_data) > max_size_bytes:
            raise ValueError("Cannot create a chunk under the size limit.")
        chunks.append(chunk)
        start += target_duration_ms
    return chunks


def transcribe_audio_chunk(client, audio_chunk, temp_file_path):
    """
    Transcribes a single audio chunk using the OpenAI API.

    Args:
        client (OpenAI): Initialized OpenAI client.
        audio_chunk (AudioSegment): Audio chunk to transcribe.
        temp_file_path (str): Temporary file path to save the audio chunk.

    Returns:
        str: Transcription text for the chunk.
    """
    audio_chunk.export(temp_file_path, format="wav")
    with open(temp_file_path, "rb") as audio_file:
        response = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file
        )
    if hasattr(response, 'text'):
        return response.text
    else:
        raise ValueError("Transcription text attribute not found in the response")


def transcribe_audio(audio_path):
    # Read the OpenAI API key from an environment variable
    openai_api_key = os.environ.get('OPENAI_API_KEY')
    if not openai_api_key:
        raise ValueError("OPENAI_API_KEY environment variable not set")

    # Initialize the OpenAI client with the API key
    client = OpenAI(api_key=openai_api_key)

    # Load and split the audio file
    audio = AudioSegment.from_file(audio_path)
    audio_chunks = chunk_audio_by_size(audio)

    transcription_parts = []
    temp_file_path = "temp_chunk.wav"
    for i, chunk in enumerate(audio_chunks):
        print(f"Transcribing chunk {i + 1}/{len(audio_chunks)}...")
        try:
            transcription_part = transcribe_audio_chunk(client, chunk, temp_file_path)
            transcription_parts.append(transcription_part)
        except Exception as e:
            print(f"Error transcribing chunk {i + 1}: {e}")
            transcription_parts.append("[Error in transcription]")

    # Combine all transcription parts
    transcription_text = "\n".join(transcription_parts)
    os.remove(temp_file_path)  # Clean up the temporary file
    return transcription_text


def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Transcribe audio file using OpenAI API.')
    parser.add_argument('--audio-file', type=str, required=True, help='Path to the audio file')
    args = parser.parse_args()

    # Transcribe the audio
    transcription_text = transcribe_audio(args.audio_file)

    # Save the transcription to a text file
    output_file = args.audio_file + '.txt'
    with open(output_file, 'w') as file:
        file.write(transcription_text)
    print(f"Transcription saved to {output_file}")


if __name__ == '__main__':
    main()
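
For reference, a minimal usage sketch (not part of the gist itself): it assumes the code above is saved locally as transcribe.py, that ffmpeg is installed so pydub can decode the input, and that OPENAI_API_KEY is exported in the environment. The file name interview.mp3 is a placeholder.

# Command-line usage (hypothetical file name):
#   export OPENAI_API_KEY="sk-..."
#   python transcribe.py --audio-file interview.mp3
#   # writes the transcript to interview.mp3.txt

# Programmatic usage, importing the gist as a module:
from transcribe import transcribe_audio  # assumes the script was saved as transcribe.py

text = transcribe_audio("interview.mp3")  # splits the audio, transcribes each chunk, joins the text
print(text[:200])  # preview the start of the transcript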