Last active
August 12, 2025 00:03
-
-
Save AshtonIzmev/b028fa8d73cb4a07bbaefd06c77d396d to your computer and use it in GitHub Desktop.
OpenAI utils
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os

from openai import OpenAI

# Shared module-level client used by gpt_request().
# The key is read from the OPENAI_API_KEY environment variable instead of being
# embedded in the source, so the file can be shared without leaking a secret.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
def gpt_request(prompt, model="gpt-4o-mini", max_tokens=100):
    """Send a single-turn user prompt to the chat completions endpoint.

    Args:
        prompt: Text sent as the only ``user`` message.
        model: Model name forwarded to the API.
        max_tokens: Token limit forwarded to the API.

    Returns:
        The content of the first choice with surrounding whitespace stripped,
        or ``None`` when anything raises (the error is printed, not re-raised).
    """
    request = {
        "model": model,
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "n": 1,
        "stop": None,
        "temperature": 0.7,
    }
    try:
        completion = client.chat.completions.create(**request)
        first_message = completion.choices[0].message
        return first_message.content.strip()
    except Exception as err:  # deliberate best-effort: report and return None
        print(f"An error occurred: {err}")
        return None
# Example usage. These statements sit at module level, so the request is sent
# as soon as the file is executed or imported.
prompt = "What is the capital of France?"
result = gpt_request(prompt)  # None if the request failed (see gpt_request)
def get_ada_embedding(text, model="text-embedding-3-small"):
    """Return the embedding vector for ``text``.

    Args:
        text: The string to embed; it is sent as a one-element input list.
        model: Embedding model name. Defaults to the model that was previously
            hard-coded, so existing callers are unaffected.

    Returns:
        The ``embedding`` attribute of the first item in the API response.
    """
    # Local import: this snippet reads the environment but has no module-level
    # ``import os`` of its own.
    import os

    # A dedicated client is built on every call, with the key taken from the
    # OPENAI_API_KEY environment variable.
    embedding_client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
    response = embedding_client.embeddings.create(input=[text], model=model)
    return response.data[0].embedding
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
async def transcribe_audio(self, file_path: str, user_key: str) -> str:
    """Transcribe an audio file using the Deepgram API, then delete the file.

    Args:
        file_path: Path of the audio file on disk.
        user_key: Identifier that is only used in the completion log line.

    Returns:
        The transcript of the first alternative of the first channel. When the
        file is missing or empty, a ``"Transcription failed: ..."`` message is
        returned instead of raising. Errors raised while reading the file or
        calling the API are not caught here.
    """
    logger.info(f"Transcribing audio file: {file_path}")

    # Guard clauses: bail out early with a message for unusable input files.
    if not os.path.exists(file_path):
        logger.error(f"File not found: {file_path}")
        return f"Transcription failed: File not found: {file_path}"

    if os.path.getsize(file_path) == 0:
        logger.error(f"File is empty: {file_path}")
        return f"Transcription failed: File is empty: {file_path}"

    # Transcription settings: nova-2 model, smart formatting, French audio.
    options = PrerecordedOptions(model="nova-2", smart_format=True, language="fr")

    # The whole file is loaded into memory and handed over as raw bytes.
    with open(file_path, "rb") as audio:
        payload: FileSource = {"buffer": audio.read()}

    prerecorded = self.deepgram.listen.prerecorded.v("1")
    response = prerecorded.transcribe_file(payload, options)

    first_channel = response.results.channels[0]
    transcript = first_channel.alternatives[0].transcript

    # Best-effort cleanup: a failed delete is logged but does not fail the call.
    try:
        os.remove(file_path)
        logger.info(f"Deleted audio file: {file_path}")
    except Exception as exc:
        logger.error(f"Failed to delete audio file {file_path}: {str(exc)}")

    logger.info(f"Transcription completed for user: {user_key}")
    logger.info(f"Transcript: {transcript}")
    return transcript
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import langfun as lf | |
| import pyglove as pg | |
# Output schema handed to langfun as `schema=Structure` in PROMPT().
# Documented with `#` comments rather than a docstring so the class definition
# seen by the library stays exactly as it was.
class Structure(pg.Object):
    short_summary: str
    title: str
    keywords: list[str]
    why_interesting: str
def PROMPT(case_details):
    """Render the structured-output prompt text for one case.

    Args:
        case_details: Text interpolated into the ``Case details:`` line.

    Returns:
        The ``.text`` of the prompt produced by ``lf.query_prompt`` for the
        ``Structure`` schema.
    """
    # Local import: this snippet has no module-level ``import os``.
    import os

    rendered = lf.query_prompt(
        f'''Do what you need to to
Case details: {case_details}
Format your response strictly according to the Structure class, without any additional explanation.''',
        schema=Structure,
        # The key comes from the environment instead of a literal in the source.
        lm=lf.llms.Gpt4o(api_key=os.getenv("OPENAI_API_KEY")),
    )
    return rendered.text
def messages(t):
    """Build the chat message list (system + user) for one input text.

    Args:
        t: Text passed to ``PROMPT`` to produce the user message content.

    Returns:
        A two-element list: the fixed system message followed by the user
        message whose content is ``PROMPT(t)``.
    """
    system_message = {"role": "system", "content": "You are a XXX"}
    user_message = {"role": "user", "content": PROMPT(t)}
    return [system_message, user_message]
# One serialized batch request per entry of `pdf_contents_analyzed`: the dict
# key is used as the request id and the entry's 'content' value is turned into
# chat messages.
# NOTE(review): `generate_jsonl_data`, `model`, `max_tokens` and
# `pdf_contents_analyzed` are not defined in this snippet; confirm they are in
# scope where this runs.
jsonl_data = [generate_jsonl_data(
    k, model, max_tokens, messages(v['content'])) for k, v in pdf_contents_analyzed.items()]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import langfun as lf | |
| import pyglove as pg | |
# Output schema used by get_accident_voiture_details() (`schema=` argument).
# Field names are French; presumably they hold the last name and first name of
# the person involved in the car accident (confirm with the data owner).
class AccidentVoitureDetails(pg.Object):
    nom_personne_accidentee: str
    prenom_personne_accidentee: str
def get_accident_voiture_details(file_text):
    """Query GPT-4o with ``file_text`` using the AccidentVoitureDetails schema.

    Args:
        file_text: Text used as the prompt.

    Returns:
        Whatever ``lf.query`` returns for the ``AccidentVoitureDetails`` schema.
    """
    return lf.query(
        prompt=file_text,
        schema=AccidentVoitureDetails,
        lm=lf.llms.Gpt4o(),
    )
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from openai import OpenAI | |
| from dotenv import load_dotenv | |
| import os | |
# Load variables from a .env file before the API key is looked up below.
load_dotenv()
# Module-level client; the key is read from the OPENAI_API_KEY variable.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# Instruction text for the classification task. The output fields it names
# (is_match, category, keyword, confidence, explanation) are the same ones
# declared in `json_schema_generic`.
PROMPT_GENERIC = """
You are an expert analyst specializing in data classification. Your task is to analyze a given input and determine if it matches specific criteria based on the provided guidelines.
The inputs are often short, abbreviated, and may not explicitly contain certain keywords. You must look for specific patterns and indicators.
**Analysis and Response Instructions:**
* Analyze the user-provided input.
* If you find a clear or potential match, set `is_match` to `True`. Identify the `keyword`, select the most appropriate `category`, and provide a `confidence` score.
* Use high confidence (0.8-1.0) for explicit terms that strongly indicate a match.
* Use medium confidence (0.4-0.7) for terms that likely indicate a match but aren't certain.
* Use low confidence (0.1-0.3) for ambiguous terms that might indicate a match.
* If the input does not appear to match the criteria, set `is_match` to `False`, set `confidence` to 0.0, and briefly state the reason in the `explanation`.
**Example 1:**
* **Input:** `EXAMPLE INPUT 1`
* **Output:** `{"is_match": true, "category": "Category A", "keyword": "EXAMPLE", "confidence": 1.0, "explanation": "Clear match for Category A."}`
**Example 2:**
* **Input:** `EXAMPLE INPUT 2`
* **Output:** `{"is_match": false, "category": null, "keyword": null, "confidence": 0.0, "explanation": "No matching criteria found."}`
**Example 3:**
* **Input:** `EXAMPLE INPUT 3`
* **Output:** `{"is_match": true, "category": "Category B", "keyword": "INPUT 3", "confidence": 0.95, "explanation": "Strong indicator of Category B."}`
"""
# JSON schema for the structured output of the pattern-analysis prompt.
# All five properties are required and no extra properties are allowed.
json_schema_generic = {
    "name": "pattern_analysis",
    "schema": {
        "type": "object",
        "properties": {
            "is_match": {
                "type": "boolean",
                "description": "True if the input matches the specified criteria",
            },
            "category": {
                "type": ["string", "null"],
                # `enum` restricts the value to the listed entries, so null
                # has to be listed explicitly; otherwise the "no match" answer
                # (`"category": null`) requested by the prompt is invalid.
                "enum": [
                    "Category A", "Category B", "Category C",
                    "Category D", "Category E", "Category F",
                    "Category G", "Category H", "Category I",
                    "Category J", "Category K", "Category L",
                    "Category M", "Category N", "Other",
                    None,
                ],
                "description": "Category of match if detected",
            },
            "keyword": {
                "type": ["string", "null"],
                "description": "Specific keyword or phrase that suggests a match",
            },
            "confidence": {
                "type": "number",
                "minimum": 0.0,
                "maximum": 1.0,
                "description": "Confidence score from 0.0 to 1.0",
            },
            "explanation": {
                "type": "string",
                "description": "Brief explanation of the analysis",
            },
        },
        "required": ["is_match", "category", "keyword", "confidence", "explanation"],
        "additionalProperties": False,
    },
    "strict": True,
}
def generate_jsonl_data_structured(id, model, messages, response_format):
    """Serialize one chat-completions batch request that sets a response format.

    Args:
        id: Value appended to ``request-`` to form the ``custom_id``.
        model: Model name placed in the request body.
        messages: Chat messages placed in the request body.
        response_format: Value placed under ``response_format`` in the body.

    Returns:
        A JSON string describing a POST to ``/v1/chat/completions``.
    """
    import json

    body = {
        "model": model,
        "messages": messages,
        "response_format": response_format,
    }
    request = {
        "custom_id": f"request-{id}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": body,
    }
    return json.dumps(request)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def generate_jsonl_data(id, model, max_tokens, messages):
    """Serialize one chat-completions batch request as a JSON string.

    Args:
        id: Value appended to ``request-`` to form the ``custom_id``.
        model: Model name placed in the request body.
        max_tokens: Token limit placed in the request body.
        messages: Chat messages placed in the request body.

    Returns:
        A JSON string describing a POST to ``/v1/chat/completions``.
    """
    import json

    payload = dict(model=model, messages=messages, max_tokens=max_tokens)
    envelope = dict(
        custom_id=f"request-{id}",
        method="POST",
        url="/v1/chat/completions",
        body=payload,
    )
    return json.dumps(envelope)
def create_batch_job(client, jsonl_data, description="nightly eval job"):
    """Upload the request lines as a batch input file and start a batch job.

    Args:
        client: Object exposing ``files.create`` and ``batches.create``.
        jsonl_data: Iterable of already-serialized request strings, one per
            request; they are joined with newlines before upload.
        description: Text stored under ``metadata["description"]`` of the job.
            Defaults to the value that was previously hard-coded.

    Returns:
        The object returned by ``client.batches.create``.

    Raises:
        ValueError: If ``jsonl_data`` contains no request lines.
    """
    import io

    lines = list(jsonl_data)
    if not lines:
        # Fail before any network call instead of uploading an empty file.
        raise ValueError("jsonl_data is empty: nothing to submit as a batch")

    payload = "\n".join(lines).encode('utf-8')
    batch_input_file = client.files.create(
        file=io.BytesIO(payload),
        purpose="batch",
    )
    return client.batches.create(
        input_file_id=batch_input_file.id,
        endpoint="/v1/chat/completions",
        completion_window="24h",
        metadata={"description": description},
    )
def check_batch_job(client, batch_job_id):
    """Retrieve a batch job by id.

    Args:
        client: Object exposing ``batches.retrieve``.
        batch_job_id: Identifier of the batch job to look up.

    Returns:
        Whatever ``client.batches.retrieve`` returns for that id.
    """
    batch_job = client.batches.retrieve(batch_job_id)
    return batch_job
def get_batch_job_results(client, output_file_id):
    """Fetch the content of a batch job's output file.

    Args:
        client: Object exposing ``files.content``.
        output_file_id: Identifier of the output file to download.

    Returns:
        Whatever ``client.files.content`` returns for that file id.
    """
    file_content = client.files.content(output_file_id)
    return file_content
def get_zipped_results_list(jsonl_data, results: list):
    """Pair every submitted request with its result from several output files.

    Args:
        jsonl_data: Iterable of serialized request strings (as submitted).
        results: List of objects whose ``.text`` holds JSON lines, each with a
            ``custom_id`` and a ``response.body.choices[0].message.content``.

    Returns:
        A list of dicts with keys ``custom_id``, ``prompt`` (content of the
        request's first message), ``user`` (content of its second message) and
        ``response`` (content of the first choice), in request order.

    Raises:
        ValueError: If a request has no result with the same ``custom_id``.
    """
    import json

    requests_by_id = {}
    for raw_request in jsonl_data:
        request = json.loads(raw_request)
        requests_by_id[request['custom_id']] = request

    responses_by_id = {}
    for result in results:
        # Split on '\n' only (JSON strings may legally contain other Unicode
        # line separators) and skip blank lines, so an empty output file or a
        # stray empty line no longer aborts parsing with a JSON decode error.
        for line in result.text.split('\n'):
            if not line.strip():
                continue
            parsed = json.loads(line)
            responses_by_id[parsed['custom_id']] = parsed

    zipped_results = []
    for custom_id, data in requests_by_id.items():
        if custom_id not in responses_by_id:
            raise ValueError(f"No matching result found for custom_id: {custom_id}")
        result = responses_by_id[custom_id]
        request_messages = data['body']['messages']
        zipped_results.append({
            'custom_id': custom_id,
            'prompt': request_messages[0]['content'],
            'user': request_messages[1]['content'],
            'response': result['response']['body']['choices'][0]['message']['content'],
        })
    return zipped_results
def get_zipped_results_str(jsonl_data, results):
    """Pair every submitted request with its result from one output file.

    Args:
        jsonl_data: Iterable of serialized request strings (as submitted).
        results: Object whose ``.text`` holds JSON lines, each with a
            ``custom_id`` and a ``response.body.choices[0].message.content``.

    Returns:
        A list of dicts with keys ``custom_id``, ``prompt`` (content of the
        request's first message) and ``response`` (content of the first
        choice), in request order.

    Raises:
        ValueError: If a request has no result with the same ``custom_id``.
    """
    import json

    requests_by_id = {}
    for raw_request in jsonl_data:
        request = json.loads(raw_request)
        requests_by_id[request['custom_id']] = request

    # Split on '\n' only (JSON strings may legally contain other Unicode line
    # separators) and skip blank lines, so an empty output file or a stray
    # empty line no longer aborts parsing with a JSON decode error.
    responses_by_id = {}
    for line in results.text.split('\n'):
        if not line.strip():
            continue
        parsed = json.loads(line)
        responses_by_id[parsed['custom_id']] = parsed

    zipped_results = []
    for custom_id, data in requests_by_id.items():
        if custom_id not in responses_by_id:
            raise ValueError(f"No matching result found for custom_id: {custom_id}")
        result = responses_by_id[custom_id]
        zipped_results.append({
            'custom_id': custom_id,
            'prompt': data['body']['messages'][0]['content'],
            'response': result['response']['body']['choices'][0]['message']['content'],
        })
    return zipped_results
| ############ | |
| ############ | |
| ############ | |
import os
from datetime import datetime

from openai import OpenAI

# End-to-end example: build request lines, submit them as a batch, poll the
# job and print the answers.
# The key is read from OPENAI_API_KEY rather than written into the source.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
model = "gpt-4o-mini"
max_tokens = 100
messages = [
    {"role": "user", "content": "What is the capital of France?"},
    {"role": "user", "content": "What is the capital of Germany?"},
    {"role": "user", "content": "What is the capital of Spain?"},
    {"role": "user", "content": "What is the capital of Italy?"},
    {"role": "user", "content": "What is the capital of Portugal?"},
    {"role": "user", "content": "What is the capital of Switzerland?"},
    {"role": "user", "content": "What is the capital of Brazil?"},
    {"role": "user", "content": "What is the capital of Canada?"},
    {"role": "user", "content": "What is the capital of Japan?"},
]
# Each message becomes its own single-message request, numbered by position.
jsonl_data = [generate_jsonl_data(i, model, max_tokens, [msg]) for i, msg in enumerate(messages)]

batch_job = create_batch_job(client, jsonl_data)
print(batch_job)

check_job = check_batch_job(client, batch_job.id)
print(check_job)

# Poll again to pick up the latest status and the output file id.
check_job = check_batch_job(client, batch_job.id)
output_file_id = check_job.output_file_id

# `completed_at` is None while the job is still running, so only format it
# once it is set instead of failing on int(None).
completed_at = (
    datetime.fromtimestamp(int(check_job.completed_at)).strftime('%Y-%m-%d %H:%M:%S')
    if check_job.completed_at is not None
    else "not completed yet"
)
print("Batch Job Details:")
print(f" ID: {check_job.id}")
print(f" Created At: {datetime.fromtimestamp(int(check_job.created_at)).strftime('%Y-%m-%d %H:%M:%S')}")
print(f" Status: {check_job.status}")
print(f" Completed At: {completed_at}")
print(f" Output File ID: {output_file_id}")

if output_file_id is None:
    # Nothing to download yet; re-run the polling part once the job is done.
    results = None
    print("No output file available yet; results were not fetched.")
else:
    results = get_batch_job_results(client, output_file_id)
    # Print the zipped results. A single output file is handled by
    # get_zipped_results_str; no function named get_zipped_results is defined.
    for item in get_zipped_results_str(jsonl_data, results):
        print(f"Custom ID: {item['custom_id']}")
        print(f"Prompt: {item['prompt']}")
        print(f"Response: {item['response']}")
        print()
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from pydantic import BaseModel | |
| from openai import OpenAI | |
| from dotenv import load_dotenv | |
| import os | |
# Load variables from a .env file before the API key is looked up below.
load_dotenv()
# Module-level client used by translate_to_english(); key from OPENAI_API_KEY.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
# Response model passed as `response_format` in translate_to_english().
# Documented with `#` comments rather than a docstring so the schema derived
# from the model is left untouched.
class TranslationToEnglish(BaseModel):
    text: str  # presumably the translated text; confirm with callers
def translate_to_english(text: str) -> "TranslationToEnglish":
    """Ask gpt-4o-mini to translate ``text`` to English as a parsed model.

    Args:
        text: The text to translate; sent as the user message.

    Returns:
        The ``parsed`` attribute of the first choice's message. With
        ``response_format=TranslationToEnglish`` that is the parsed model
        object (read its ``.text`` for the string), not a plain ``str`` as the
        previous annotation claimed.
        NOTE(review): ``parsed`` may presumably be ``None`` when the model
        refuses; confirm against the SDK documentation.
    """
    completion = client.beta.chat.completions.parse(
        model="gpt-4o-mini",
        messages=[
            {"role": "system", "content": "Translate the following text to English."},
            {"role": "user", "content": f"{text}"},
        ],
        response_format=TranslationToEnglish,
    )
    return completion.choices[0].message.parsed
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Price table keyed by model name. Each value is
# (prompt-token price, completion-token price) per 1,000,000 tokens, which is
# how UsageTracker.add() applies it. The currency is presumably USD; confirm.
# NOTE(review): the "o1-mini" figures look like the o1 / o1-preview list price;
# verify against the current pricing page.
OPENAI_PRICING = {
    "gpt-4o-mini": (0.15, 0.6),
    "gpt-4o": (2.5, 10),
    "o1-mini": (15, 60),
}
@dataclass
class UsageTracker:
    """Running totals of token usage and cost across API calls.

    All fields default to zero, so ``UsageTracker()`` creates an empty tracker;
    passing explicit starting values still works as before.
    """

    # Total prompt tokens recorded so far.
    prompt_tokens: int = 0
    # Total completion tokens recorded so far.
    completion_tokens: int = 0
    # Accumulated price computed from OPENAI_PRICING.
    total_price: float = 0.0

    def add(self, prompt_tokens: int, completion_tokens: int, model_name: str):
        """Record one call's usage and add its cost to the running total.

        Args:
            prompt_tokens: Prompt tokens consumed by the call.
            completion_tokens: Completion tokens produced by the call.
            model_name: Key into ``OPENAI_PRICING``.

        Raises:
            KeyError: If ``model_name`` has no entry in ``OPENAI_PRICING``.
        """
        # Look the rates up first: an unknown model then raises before any
        # counter changes, so the tracker is never left half-updated.
        prompt_rate, completion_rate = OPENAI_PRICING[model_name]
        self.prompt_tokens += prompt_tokens
        self.completion_tokens += completion_tokens
        # Rates are expressed per 1,000,000 tokens.
        self.total_price += (
            prompt_rate * prompt_tokens / 1000000
            + completion_rate * completion_tokens / 1000000
        )
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Module-level client used by text_to_speech(); key read from OPENAI_API_KEY.
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))
async def text_to_speech(text: str, user_key: str) -> Path:
    """Synthesize ``text`` to an MP3 file and return the file's path.

    Args:
        text: Text to speak; it is sent prefixed with ``"[French] "``.
        user_key: Used to build the file name ``response_<user_key>.mp3``.

    Returns:
        Path of the written audio file inside ``audio_service.audio_dir``.
        Previously the function fell off the end and returned ``None`` despite
        its ``-> Path`` annotation.
    """
    logger.info(f"Converting text to speech: {text[:50]}...")

    audio_filename = f"response_{user_key}.mp3"
    output_path = Path(os.path.join(audio_service.audio_dir, audio_filename))

    response = client.audio.speech.create(
        model="tts-1",
        voice="alloy",
        input=f"[French] {text}",
    )

    # Write the audio to disk chunk by chunk as yielded by the response.
    with open(output_path, 'wb') as f:
        for chunk in response.iter_bytes():
            f.write(chunk)

    logger.info(f"Audio file saved successfully at: {output_path}")
    return output_path
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment