Translates sentences of English text to bilingual Chinese and English with emojis
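Requirements, as implied by the imports and constants below: Python 3 with the nltk, requests, and diskcache packages, plus a Gemini API key pasted into API_KEY. The script reads English text from how.txt, splits it into sentences with NLTK, sends them to the Gemini API in batches for translation into Simplified Chinese with emojis, and caches both the sentence splitting and the translations on disk so re-runs do not repeat API calls.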
import json
import nltk
import requests
import hashlib
from diskcache import Cache
from time import sleep

# Set up a disk-backed cache that caches forever
cache = Cache('disk_cache')

# Download the NLTK sentence tokenizer models (punkt_tab is used by newer NLTK versions)
nltk.download('punkt_tab')
nltk.download('punkt')

# Paste your Gemini API key here
API_KEY = ""
API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-pro-exp-02-05:generateContent?key={API_KEY}"


def translate_array(sentences):
    """Translates an array of sentences, returning the model's JSON text (an array of strings)."""
    key = "translate:" + hashlib.sha256(json.dumps(sentences, sort_keys=True).encode('utf-8')).hexdigest()
    if key in cache:
        return cache[key]
    else:
        print("Cache miss for translation, calling API...")
        sleep(20)  # Conservative pause before each uncached call to respect rate limits
        input_json = json.dumps(sentences, ensure_ascii=False)
        user_message = f"""Return an array of translated output, translating the input array, and adding emojis to each sentence, output the JSON and nothing else. The input text to translate follows (inside the input tags).
<input>
{input_json}
</input>"""
        system_prompt = """Translate the array of input text to Simplified Chinese with emojis. Make the translation idiomatic and high-quality, suitable for a native speaker with as similar meaning as possible while being idiomatic. Add relevant emojis to enhance readability and engagement. Be creative but ensure the emojis fit the context. Translate without additional remarks. Do not "talk back", just present the Chinese translation as an array of translated strings, a string for each input string."""
        data = {
            "contents": [
                {"role": "user", "parts": [{"text": user_message}]}
            ],
            "systemInstruction": {
                "role": "user",
                "parts": [{"text": system_prompt}]
            },
            "generationConfig": {
                "temperature": 0.7,
                "topP": 0.95,
                "maxOutputTokens": 6096,
                "responseMimeType": "application/json",
                # Constrain the model to return a JSON array of strings
                "responseSchema": {
                    "type": "array",
                    "items": {"type": "string"}
                }
            }
        }
        response = requests.post(API_URL, headers={'Content-Type': 'application/json'}, data=json.dumps(data))
        response.raise_for_status()
        sleep(0.6)
        result = response.json()['candidates'][0]['content']['parts'][0]['text']
        cache.set(key, result, expire=None)  # Cache forever
        return result


def get_sentences(text):
    """Caches sentence splitting of input text using nltk."""
    key = "sentences:" + hashlib.sha256(text.encode('utf-8')).hexdigest()
    if key in cache:
        # print("Cache hit for sentence splitting!")
        return cache[key]
    else:
        print("Cache miss for sentence splitting, processing...")
        sentences = nltk.sent_tokenize(text)
        cache.set(key, sentences, expire=None)  # Cache forever
        return sentences


def process_input_file_batched(input_file, batch_size=20):
    """Processes the input file in batches of sentences."""
    with open(input_file, 'r', encoding='utf-8') as f:
        text = f.read()
    sentences = get_sentences(text)
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i + batch_size]
        json_response_text = translate_array(batch)
        translations = json.loads(json_response_text)
        for j, translation in enumerate(translations):
            english_sentence = batch[j].strip() if j < len(batch) else ""
            # Print the Chinese translation followed by the original English sentence
            print("\n\n")
            print(translation)
            print(english_sentence)


process_input_file_batched("how.txt")
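Output goes to stdout as pairs: each Simplified Chinese translation (with emojis) followed by the English sentence it was translated from, separated by blank lines. The sleep(20) before each uncached request is presumably there to stay within rate limits for the experimental model; it can be shortened if your quota allows.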