Indian language testing for an LLM
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, GenerationConfig

model_name_or_path = "IntervitensInc/pangu-pro-moe-model"

generation_config = GenerationConfig(
    do_sample=True,
    top_k=50,
    top_p=0.95,
    temperature=0.6
)
# Load the tokenizer and the model
tokenizer = AutoTokenizer.from_pretrained(
    model_name_or_path,
    use_fast=False,
    trust_remote_code=True
)
model = AutoModelForCausalLM.from_pretrained(
    model_name_or_path,
    trust_remote_code=True,
    torch_dtype="auto",
    device_map="auto"  # uses CUDA if available
)
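
# Optional sanity check (not in the original gist): report where the weights landed
# and which dtype torch_dtype="auto" resolved to, before running the full sweep.
first_param = next(model.parameters())
print(f"Model weights on {first_param.device}, dtype {first_param.dtype}")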
# System prompt for compliance (kept in Chinese so the model sees the original wording).
# English gloss: "You must strictly comply with laws, regulations, and social ethics.
# When generating any content, avoid inappropriate material involving violence,
# pornography, terrorism, racial discrimination, gender discrimination, etc. If such
# a tendency is detected in the input or output, refuse to answer and issue a warning.
# For example, if the input contains violent threats or pornographic descriptions,
# return the error message: 'Your input contains inappropriate content and cannot be processed.'"
system_prompt = (
    "你必须严格遵守法律法规和社会道德规范。生成任何内容时,都应避免涉及暴力、色情、恐怖主义、种族歧视、性别歧视等不当内容。"
    "一旦检测到输入或输出有此类倾向,应拒绝回答并发出警告。例如,如果输入内容包含暴力威胁或色情描述,应返回错误信息:“您的输入包含不当内容,无法处理。”"
)
# Indian language questions (abstract, cultural topics); English glosses in comments
indian_language_questions = [
    ("Hindi", "भारत का सबसे महान हॉकी खिलाड़ी कौन है?"),       # Who is India's greatest hockey player?
    ("Bengali", "রবীন্দ্রনাথ ঠাকুর কেন বিখ্যাত?"),             # Why is Rabindranath Tagore famous?
    ("Telugu", "చక్రవ్యూహం అంటే ఏమిటి?"),                      # What is a chakravyuha?
    ("Marathi", "शिवाजी महाराजांना छत्रपती का म्हटले जाते?"),   # Why is Shivaji Maharaj called Chhatrapati?
    ("Tamil", "பாரதி யார்? அவரின் பங்களிப்பு என்ன?"),          # Who was Bharathi? What was his contribution?
    ("Urdu", "غالب کی شاعری کس موضوع پر ہے؟"),                 # What themes does Ghalib's poetry deal with?
    ("Gujarati", "સરદાર પટેલને લોહપુરુષ કેમ કહેવાય છે?"),       # Why is Sardar Patel called the Iron Man?
    ("Kannada", "ವಿಜಯನಗರ ಸಾಮ್ರಾಜ್ಯಕ್ಕೆ ಪ್ರಸಿದ್ಧಿ ಏನಿತ್ತು?"),    # What was the Vijayanagara Empire famous for?
    ("Malayalam", "ചിറകുകൾ രചിച്ചത് ആര്?"),                    # Who wrote "Chirakukal"?
    ("Odia", "ଜଗନ୍ନାଥ ମନ୍ଦିର କେଉଁଠି ଅବସ୍ଥିତ?"),               # Where is the Jagannath Temple located?
    ("Punjabi", "ਭਗਤ ਸਿੰਘ ਨੂੰ ਕਿਉਂ ਯਾਦ ਕੀਤਾ ਜਾਂਦਾ ਹੈ?"),       # Why is Bhagat Singh remembered?
    ("Assamese", "ভূপেন হাজৰিকা কোন আছিল?")                    # Who was Bhupen Hazarika?
]
# General intelligence questions
general_intelligence_questions = [
    "What is the theory of evolution by natural selection?",
    "Explain the concept of entropy in simple terms.",
    "Why does the sky appear blue?",
    "What is the difference between a virus and a bacterium?",
    "How does blockchain technology work?",
    "Explain Schrödinger’s cat paradox.",
    "What causes tides in the ocean?",
    "What is the Pythagorean theorem?",
    "How do vaccines work?",
    "What is the significance of the Theory of Relativity?"
]
# Combine all tests
tests = []

# Add Indian language questions
for lang, q in indian_language_questions:
    tests.append({
        "label": f"Indian Language - {lang}",
        "prompt": q
    })

# Add general intelligence questions
for idx, q in enumerate(general_intelligence_questions, start=1):
    tests.append({
        "label": f"General Intelligence Q{idx}",
        "prompt": q
    })
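
# Optional cross-check (an assumption, not part of the original gist): the parsing
# below treats "[unused10]" as the end-of-answer marker, so the id the tokenizer
# assigns to it can be compared against the hardcoded eos_token_id=45892 in generate().
eos_candidate = tokenizer.convert_tokens_to_ids("[unused10]")
print(f"Tokenizer maps '[unused10]' to id {eos_candidate}; generate() hardcodes 45892.")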
# Run tests
for test in tests:
    print(f"\n=== Testing: {test['label']} ===")
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": test["prompt"]}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
    outputs = model.generate(
        **model_inputs,
        max_new_tokens=512,
        eos_token_id=45892,  # check your tokenizer's vocab for the correct id; use None if unsure
        return_dict_in_generate=True,
        generation_config=generation_config
    )
    # Decode only the newly generated tokens, keeping special tokens for parsing
    input_length = model_inputs.input_ids.shape[1]
    generated_tokens = outputs.sequences[:, input_length:]
    output_sent = tokenizer.decode(generated_tokens[0], skip_special_tokens=False)
    # Split out the "thinking" span if the model brackets it with special tokens
    if "[unused16]" in output_sent and "[unused17]" in output_sent:
        thinking_content = output_sent.split("[unused17]")[0].split("[unused16]")[-1].strip()
        content = output_sent.split("[unused17]")[-1].split("[unused10]")[0].strip()
    else:
        thinking_content = ""
        content = output_sent
    print("\nThinking Content:", thinking_content)
    print("\nResponse Content:", content)