Created
November 12, 2024 15:49
-
-
Save 7shi/10557dcd1a354b6213228dde641db0ab to your computer and use it in GitHub Desktop.
[py] Upload a wave file and transcribe it
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Upload a WAV file to the Gemini API, ask the model to transcribe it as a
# Markdown table (timecode / speaker / caption), and save the reply next to
# the audio file as "<name>.md" (or "<name>_2.md", "<name>_3.md", ... when
# earlier outputs already exist). The uploaded file is deleted from the
# service afterwards, whether or not the transcription succeeded.

import os

import google.generativeai as genai

# The key is read from the environment; a missing GEMINI_API_KEY raises
# KeyError here, before any upload is attempted.
genai.configure(api_key=os.environ["GEMINI_API_KEY"])

generation_config = {
    "temperature": 1,
    "top_p": 0.95,
    "top_k": 40,
    "max_output_tokens": 8192,
    "response_mime_type": "text/plain",
}

model = genai.GenerativeModel(
    model_name="models/gemini-1.5-flash-exp-0827",
    generation_config=generation_config,
)

# Input audio file; the output file name is derived from it below.
wav = "src.wav"

# The table header and sample row show the model the expected output layout.
# lstrip() drops the newline that follows the opening triple quote.
prompt = """
Please transcribe this conversation, in the table format of timecode, speaker, caption. Use speaker A, speaker B, etc. to identify speakers.
|timecode|speaker|caption|
|---|---|---|
|00:00|A|text|
""".lstrip()

# Stays None until the upload succeeds, so the cleanup in `finally` can tell
# whether there is anything to delete.
file = None
try:
    file = genai.upload_file(wav, mime_type="audio/wav")
    print(f"Uploaded file '{file.display_name}' as: {file.uri}")

    # The audio is supplied as the first user turn; the prompt follows it.
    chat_session = model.start_chat(history=[
        {"role": "user", "parts": [file]}
    ])
    response = chat_session.send_message(prompt)

    # Pick the first output name that does not exist yet, so repeated runs
    # never overwrite an earlier transcription.
    fn = os.path.splitext(wav)[0]
    i = 1
    md = f"{fn}.md"
    while os.path.exists(md):
        i += 1
        md = f"{fn}_{i}.md"

    # Explicit UTF-8: the platform default encoding may be unable to
    # represent non-ASCII transcript text.
    with open(md, "w", encoding="utf-8") as f:
        f.write(response.text)
    print(f"Transcription saved to: {md}")
except Exception as e:
    # Top-level boundary of the script: report the failure and fall through
    # to the cleanup below.
    print(e)
finally:
    if file is not None:
        genai.delete_file(file.name)
        print(f"Deleted file '{file.display_name}' from: {file.uri}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment