Transcribe audio using OpenAI
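The script below splits a recording into chunks that stay under the OpenAI transcription endpoint's 25 MB upload limit, sends each chunk to the whisper-1 model, and joins the results into a single transcript. It expects the openai and pydub packages to be installed (pydub needs ffmpeg on the PATH for non-WAV input) and reads the API key from the OPENAI_API_KEY environment variable.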
import argparse
import os

from openai import OpenAI
from pydub import AudioSegment


def chunk_audio_by_size(audio, max_size_bytes=25000000, target_duration_ms=60000):
    """
    Splits an audio file into smaller chunks based on a maximum size limit.

    Args:
        audio (AudioSegment): Audio file as a pydub AudioSegment.
        max_size_bytes (int): Maximum size of each chunk in bytes.
        target_duration_ms (int): Target duration of chunks in milliseconds.

    Returns:
        list of AudioSegment: List of audio chunks.
    """
    chunks = []
    start = 0
    while start < len(audio):
        end = start + target_duration_ms
        chunk = audio[start:end]
        # Ensure chunk size is within the limit
        while len(chunk.raw_data) > max_size_bytes and target_duration_ms > 1000:
            target_duration_ms //= 2  # Halve the duration to reduce size
            chunk = audio[start:start + target_duration_ms]
        if len(chunk.raw_data) > max_size_bytes:
            raise ValueError("Cannot create a chunk under the size limit.")
        chunks.append(chunk)
        start += target_duration_ms
    return chunks


def transcribe_audio_chunk(client, audio_chunk, temp_file_path):
    """
    Transcribes a single audio chunk using the OpenAI API.

    Args:
        client (OpenAI): Initialized OpenAI client.
        audio_chunk (AudioSegment): Audio chunk to transcribe.
        temp_file_path (str): Temporary file path to save the audio chunk.

    Returns:
        str: Transcription text for the chunk.
    """
    audio_chunk.export(temp_file_path, format="wav")
    with open(temp_file_path, "rb") as audio_file:
        response = client.audio.transcriptions.create(
            model="whisper-1",
            file=audio_file
        )
    if hasattr(response, 'text'):
        return response.text
    else:
        raise ValueError("Transcription text attribute not found in the response")


def transcribe_audio(audio_path):
    """
    Transcribes an audio file, splitting it into size-limited chunks first.

    Args:
        audio_path (str): Path to the audio file.

    Returns:
        str: Combined transcription text.
    """
    # Read the OpenAI API key from an environment variable
    openai_api_key = os.environ.get('OPENAI_API_KEY')
    if not openai_api_key:
        raise ValueError("OPENAI_API_KEY environment variable not set")

    # Initialize the OpenAI client with the API key
    client = OpenAI(api_key=openai_api_key)

    # Load and split the audio file
    audio = AudioSegment.from_file(audio_path)
    audio_chunks = chunk_audio_by_size(audio)

    transcription_parts = []
    temp_file_path = "temp_chunk.wav"
    for i, chunk in enumerate(audio_chunks):
        print(f"Transcribing chunk {i + 1}/{len(audio_chunks)}...")
        try:
            transcription_part = transcribe_audio_chunk(client, chunk, temp_file_path)
            transcription_parts.append(transcription_part)
        except Exception as e:
            print(f"Error transcribing chunk {i + 1}: {e}")
            transcription_parts.append("[Error in transcription]")

    # Combine all transcription parts
    transcription_text = "\n".join(transcription_parts)
    if os.path.exists(temp_file_path):
        os.remove(temp_file_path)  # Clean up the temporary file
    return transcription_text


def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='Transcribe audio file using OpenAI API.')
    parser.add_argument('--audio-file', type=str, required=True, help='Path to the audio file')
    args = parser.parse_args()

    # Transcribe the audio
    transcription_text = transcribe_audio(args.audio_file)

    # Save the transcription to a text file
    output_file = args.audio_file + '.txt'
    with open(output_file, 'w') as file:
        file.write(transcription_text)
    print(f"Transcription saved to {output_file}")


if __name__ == '__main__':
    main()
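Run it with the API key exported and --audio-file pointing at the recording; the transcript is written next to the input with a .txt suffix. To reuse the transcription logic from other Python code instead of the CLI, here is a minimal sketch, assuming the gist is saved as transcribe.py and using a placeholder input path:

# Minimal programmatic use: a sketch assuming this gist is saved as
# transcribe.py and OPENAI_API_KEY is already set in the environment.
# "recording.mp3" is a placeholder path, not part of the original gist.
from transcribe import transcribe_audio

text = transcribe_audio("recording.mp3")
print(text)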