Translates sentences of English text to bilingual Chinese and English with emojis
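Requirements, as implied by the imports and constants below: Python 3 with the nltk, requests, and diskcache packages, plus a Gemini API key pasted into API_KEY. The script reads English text from how.txt, splits it into sentences with NLTK, sends them to the Gemini API in batches for translation into Simplified Chinese with emojis, and caches both the sentence splitting and the translations on disk so re-runs do not repeat API calls.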
import json
import nltk
import requests
import hashlib
from diskcache import Cache
from time import sleep

# Set up a disk-backed cache that caches forever
cache = Cache('disk_cache')

# Download the NLTK sentence tokenizer models (punkt_tab is used by newer NLTK versions)
nltk.download('punkt_tab')
nltk.download('punkt')

# Paste your Gemini API key here
API_KEY = ""
API_URL = f"https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-pro-exp-02-05:generateContent?key={API_KEY}"


def translate_array(sentences):
    """Translates an array of sentences, returning the model's JSON text (an array of strings)."""
    key = "translate:" + hashlib.sha256(json.dumps(sentences, sort_keys=True).encode('utf-8')).hexdigest()
    if key in cache:
        return cache[key]
    else:
        print("Cache miss for translation, calling API...")
        sleep(20)  # Conservative pause before each uncached call to respect rate limits
        input_json = json.dumps(sentences, ensure_ascii=False)
        user_message = f"""Return an array of translated output, translating the input array, and adding emojis to each sentence, output the JSON and nothing else. The input text to translate follows (inside the input tags).
<input>
{input_json}
</input>"""
        system_prompt = """Translate the array of input text to Simplified Chinese with emojis. Make the translation idiomatic and high-quality, suitable for a native speaker with as similar meaning as possible while being idiomatic. Add relevant emojis to enhance readability and engagement. Be creative but ensure the emojis fit the context. Translate without additional remarks. Do not "talk back", just present the Chinese translation as an array of translated strings, a string for each input string."""
        data = {
            "contents": [
                {"role": "user", "parts": [{"text": user_message}]}
            ],
            "systemInstruction": {
                "role": "user",
                "parts": [{"text": system_prompt}]
            },
            "generationConfig": {
                "temperature": 0.7,
                "topP": 0.95,
                "maxOutputTokens": 6096,
                "responseMimeType": "application/json",
                # Constrain the model to return a JSON array of strings
                "responseSchema": {
                    "type": "array",
                    "items": {"type": "string"}
                }
            }
        }
        response = requests.post(API_URL, headers={'Content-Type': 'application/json'}, data=json.dumps(data))
        response.raise_for_status()
        sleep(0.6)
        result = response.json()['candidates'][0]['content']['parts'][0]['text']
        cache.set(key, result, expire=None)  # Cache forever
        return result


def get_sentences(text):
    """Caches sentence splitting of input text using nltk."""
    key = "sentences:" + hashlib.sha256(text.encode('utf-8')).hexdigest()
    if key in cache:
        # print("Cache hit for sentence splitting!")
        return cache[key]
    else:
        print("Cache miss for sentence splitting, processing...")
        sentences = nltk.sent_tokenize(text)
        cache.set(key, sentences, expire=None)  # Cache forever
        return sentences


def process_input_file_batched(input_file, batch_size=20):
    """Processes the input file in batches of sentences."""
    with open(input_file, 'r', encoding='utf-8') as f:
        text = f.read()
    sentences = get_sentences(text)
    for i in range(0, len(sentences), batch_size):
        batch = sentences[i:i + batch_size]
        json_response_text = translate_array(batch)
        translations = json.loads(json_response_text)
        for j, translation in enumerate(translations):
            english_sentence = batch[j].strip() if j < len(batch) else ""
            # Print the Chinese translation followed by the original English sentence
            print("\n\n")
            print(translation)
            print(english_sentence)


process_input_file_batched("how.txt")
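Output goes to stdout as pairs: each Simplified Chinese translation (with emojis) followed by the English sentence it was translated from, separated by blank lines. The sleep(20) before each uncached request is presumably there to stay within rate limits for the experimental model; it can be shortened if your quota allows.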