Skip to content

Instantly share code, notes, and snippets.

@djinn
Created July 2, 2025 13:06
Show Gist options
  • Save djinn/375561a7a9890bf3dac0a0dbe456e628 to your computer and use it in GitHub Desktop.
Indian language testing for LLM
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig
# Hugging Face repository id of the checkpoint under test.
model_name_or_path = "IntervitensInc/pangu-pro-moe-model"

# Sampling settings shared by every generate() call below.
_sampling_settings = {
    "do_sample": True,
    "top_k": 50,
    "top_p": 0.95,
    "temperature": 0.6,
}
generation_config = GenerationConfig(**_sampling_settings)
# --- Load the tokenizer and the model ---------------------------------
# trust_remote_code is required: this checkpoint ships custom model code.
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path, use_fast=False, trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    trust_remote_code=True,
    torch_dtype="auto",  # use the checkpoint's native dtype
    device_map="auto",   # Uses CUDA if available
)
# System prompt for compliance
# NOTE: the prompt is runtime data sent to the model, so it is kept in
# Chinese byte-for-byte. It roughly says: "You must strictly comply with
# laws and social ethics; avoid violence, pornography, terrorism, racial
# or gender discrimination; if such content is detected in input or
# output, refuse and emit a warning message."
system_prompt = (
"你必须严格遵守法律法规和社会道德规范。生成任何内容时,都应避免涉及暴力、色情、恐怖主义、种族歧视、性别歧视等不当内容。"
"一旦检测到输入或输出有此类倾向,应拒绝回答并发出警告。例如,如果输入内容包含暴力威胁或色情描述,应返回错误信息:“您的输入包含不当内容,无法处理。”"
)
# Indian languages questions (abstract, cultural topics)
# Each entry is a (language_name, question_text) pair; the question text
# is runtime data sent verbatim to the model, so it is kept untranslated.
# Topics are cultural/historical (e.g. Tagore, Shivaji, Bhagat Singh) to
# probe multilingual competence across 12 Indian languages.
indian_language_questions = [
("Hindi", "भारत का सबसे महान हॉकी खिलाड़ी कौन है?"),
("Bengali", "রবীন্দ্রনাথ ঠাকুর কেন বিখ্যাত?"),
("Telugu", "చక్రవ్యూహం అంటే ఏమిటి?"),
("Marathi", "शिवाजी महाराजांना छत्रपती का म्हटले जाते?"),
("Tamil", "பாரதி யார்? அவரின் பங்களிப்பு என்ன?"),
("Urdu", "غالب کی شاعری کس موضوع پر ہے؟"),
("Gujarati", "સરદાર પટેલને લોહપુરુષ કેમ કહેવાય છે?"),
("Kannada", "ವಿಜಯನಗರ ಸಾಮ್ರಾಜ್ಯಕ್ಕೆ ಪ್ರಸಿದ್ಧಿ ಏನಿತ್ತು?"),
("Malayalam", "ചിറകുകൾ രചിച്ചത് ആര്?"),
("Odia", "ଜଗନ୍ନାଥ ମନ୍ଦିର କେଉଁଠି ଅବସ୍ଥିତ?"),
("Punjabi", "ਭਗਤ ਸਿੰਘ ਨੂੰ ਕਿਉਂ ਯਾਦ ਕੀਤਾ ਜਾਂਦਾ ਹੈ?"),
("Assamese", "ভূপেন হাজৰিকা কোন আছিল?")
]
# General intelligence questions
# English-language control set: ten explanation-style science/math prompts
# used as a baseline against the Indian-language questions above.
general_intelligence_questions = [
"What is the theory of evolution by natural selection?",
"Explain the concept of entropy in simple terms.",
"Why does the sky appear blue?",
"What is the difference between a virus and a bacterium?",
"How does blockchain technology work?",
"Explain Schrödinger’s cat paradox.",
"What causes tides in the ocean?",
"What is the Pythagorean theorem?",
"How do vaccines work?",
"What is the significance of the Theory of Relativity?"
]
# Combine all tests into one suite: each entry carries a human-readable
# label plus the prompt to send. Indian-language items come first,
# followed by the numbered general-intelligence items — same order as
# the source lists.
tests = [
    {"label": f"Indian Language - {lang}", "prompt": question}
    for lang, question in indian_language_questions
]
tests += [
    {"label": f"General Intelligence Q{idx}", "prompt": question}
    for idx, question in enumerate(general_intelligence_questions, start=1)
]
# --- Run tests ---------------------------------------------------------
# For each prompt: build a system+user chat, render it through the chat
# template, generate up to 512 new tokens, then split the model-specific
# "thinking" markup from the final answer. (Fix: the loop body had lost
# its indentation in the original paste, making the script invalid.)
for test in tests:
    print(f"\n=== Testing: {test['label']} ===")
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": test["prompt"]},
    ]
    # add_generation_prompt appends the assistant-turn marker so the
    # model starts generating a reply rather than continuing the prompt.
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    outputs = model.generate(
        **model_inputs,
        max_new_tokens=512,
        # NOTE(review): 45892 is assumed to be this checkpoint's
        # end-of-turn token id — confirm against the tokenizer vocab;
        # otherwise fall back to tokenizer.eos_token_id or None.
        eos_token_id=45892,
        return_dict_in_generate=True,
        generation_config=generation_config,
    )
    # Drop the prompt tokens so only newly generated text is decoded.
    input_length = model_inputs.input_ids.shape[1]
    generated_tokens = outputs.sequences[:, input_length:]
    # Keep special tokens: the [unusedNN] markers below delimit sections.
    output_sent = tokenizer.decode(generated_tokens[0], skip_special_tokens=False)
    # Parse if special tokens are used in this model output format:
    # chain-of-thought sits between [unused16] and [unused17]; the visible
    # answer follows [unused17] and ends at [unused10].
    if "[unused16]" in output_sent and "[unused17]" in output_sent:
        thinking_content = output_sent.split("[unused17]")[0].split("[unused16]")[-1].strip()
        content = output_sent.split("[unused17]")[-1].split("[unused10]")[0].strip()
    else:
        thinking_content = ""
        content = output_sent
    print("\nThinking Content:", thinking_content)
    print("\nResponse Content:", content)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment