Created
December 20, 2025 18:31
-
-
Save grahamking/7c866fa029c028ed760ae52e9987bc53 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| //! Evals using ort | |
| //! https://github.com/grahamking/ort | |
| //! | |
| //! MIT License | |
| //! Copyright (c) 2025 Graham King | |
| //! | |
| //! Blind model evaluations. How else you gonna choose which models to use? | |
| //! | |
| //! Reads model IDs and prompts. Runs every prompt against every model, one at a time. | |
| //! Writes the results in a directory hierarchy. | |
| //! Make a MODELS_FILE and PROMPTS_FILE each with only two entries and try it, you'll see. | |
| //! | |
| //! https://darkcoding.net/software/personal-ai-evals-aug-2025/ | |
| use ort_openrouter_core::{ | |
| CancelToken, Context, Message, OrtResult, PromptOpts, ReasoningConfig, ReasoningEffort, | |
| Response, ThinkEvent, ort_err, ort_from_err, prompt, utils::get_env, | |
| }; | |
| use std::io::Write as _; | |
| use std::env; | |
| use std::fs; | |
| use std::fs::File; | |
| use std::path::{Path, PathBuf}; | |
| use std::time::{SystemTime, UNIX_EPOCH}; | |
/// Secret aliases handed out to the models so their answers can be compared blind.
/// There is one alias per model, so the length of this list caps the number of models.
const CAT_NAMES: [&str; 15] = [
    "Luna",
    "Milo",
    "Oliver",
    "Bella",
    "Chloe",
    "Simba",
    "Nala",
    "Kitty",
    "Shadow",
    "Gizmo",
    "Coco",
    "Misty",
    "Tiger",
    "Salem",
    "Pumpkin",
];
/// System prompt sent with every request. Edit freely to suit your evals.
const SYSTEM_PROMPT: &str = concat!(
    "Make your answer concise but complete. ",
    "No yapping. Direct professional tone. No emoji.",
);
/// Prints the command-line usage summary to stderr and terminates the process
/// with exit status 2. Never returns.
fn print_usage_and_exit() -> ! {
    // The text ends with its own newline, and `eprintln!` appends one more.
    const USAGE: &str = concat!(
        "eval --models <models-file> --prompts <prompts-file> --out <dir>\n",
        "- models-file is a list of model IDs (e.g. 'moonshotai/kimi-k2') one per line.\n",
        "- prompts-file is a list of prompts one per line\n",
        "- out dir is a directory to write the results to\n",
        "See https://github.com/grahamking/ort for full docs.\n",
    );
    eprintln!("{USAGE}");
    std::process::exit(2);
}
/// Command-line arguments for an eval run. All three paths are required.
struct Args {
    /// File listing one model ID per line. Anything following the ID after a
    /// space asks for reasoning to be enabled for that model.
    /// At most 15 models: each one needs its own cat name. Add names to support more.
    models_file: PathBuf,
    /// File listing the prompts to run, one per line.
    prompts_file: PathBuf,
    /// Existing directory that receives the results, one sub-directory per prompt.
    out_dir: PathBuf,
}
| fn main() -> OrtResult<()> { | |
| let api_key = get_env(c"OPENROUTER_API_KEY"); | |
| if api_key.is_empty() { | |
| eprintln!("OPENROUTER_API_KEY is not set."); | |
| std::process::exit(1); | |
| } | |
| // This is how we would stop all the running evals on ctrl-c, see main.rs | |
| let cancel_token = CancelToken::init(); | |
| let args = parse_args(); | |
| let models: Vec<String> = fs::read_to_string(&args.models_file) | |
| .map_err(ort_from_err) | |
| .context("Reading models file")? | |
| .lines() | |
| .map(str::to_string) | |
| .collect(); | |
| let prompts: Vec<String> = fs::read_to_string(&args.prompts_file) | |
| .map_err(ort_from_err) | |
| .context("Reading prompts file")? | |
| .lines() | |
| .map(str::to_string) | |
| .collect(); | |
| for (eval_num, prompt) in prompts.into_iter().enumerate() { | |
| run_prompt( | |
| &api_key, | |
| cancel_token, | |
| eval_num, | |
| &prompt, | |
| &models, | |
| &args.out_dir, | |
| )?; | |
| } | |
| Ok(()) | |
| } | |
| fn parse_args() -> Args { | |
| let args: Vec<String> = env::args().collect(); | |
| if args.len() != 7 { | |
| print_usage_and_exit(); | |
| } | |
| let mut models_file: PathBuf = Default::default(); | |
| let mut prompts_file: PathBuf = Default::default(); | |
| let mut out_dir: PathBuf = Default::default(); | |
| let mut i = 1; | |
| while i < args.len() { | |
| let arg = &args[i]; | |
| match arg.as_str() { | |
| "-h" | "--help" => print_usage_and_exit(), | |
| "--models" => { | |
| i += 1; | |
| if i >= args.len() { | |
| eprintln!("Missing value for --models"); | |
| print_usage_and_exit(); | |
| } | |
| models_file = args[i].clone().into(); | |
| if !models_file.exists() || !models_file.is_file() { | |
| eprintln!("File not found: {}", models_file.display()); | |
| std::process::exit(3); | |
| } | |
| i += 1; | |
| } | |
| "--prompts" => { | |
| i += 1; | |
| if i >= args.len() { | |
| eprintln!("Missing value for --prompts"); | |
| print_usage_and_exit(); | |
| } | |
| prompts_file = args[i].clone().into(); | |
| if !prompts_file.exists() || !prompts_file.is_file() { | |
| eprintln!("File not found: {}", prompts_file.display()); | |
| std::process::exit(3); | |
| } | |
| i += 1; | |
| } | |
| "--out" => { | |
| i += 1; | |
| if i >= args.len() { | |
| eprintln!("Missing value for --out"); | |
| print_usage_and_exit(); | |
| } | |
| out_dir = args[i].clone().into(); | |
| if !out_dir.exists() || !out_dir.is_dir() { | |
| eprintln!("Directory does not exist: {}", out_dir.display()); | |
| std::process::exit(3); | |
| } | |
| i += 1; | |
| } | |
| s if s.starts_with('-') => { | |
| eprintln!("Unknown flag: {s}"); | |
| print_usage_and_exit(); | |
| } | |
| _ => { | |
| print_usage_and_exit(); | |
| } | |
| } | |
| } | |
| Args { | |
| models_file, | |
| prompts_file, | |
| out_dir, | |
| } | |
| } | |
| fn run_prompt( | |
| api_key: &str, | |
| cancel_token: CancelToken, | |
| eval_num: usize, | |
| prompt: &str, | |
| models: &[String], | |
| out_dir: &Path, | |
| ) -> OrtResult<()> { | |
| println!("\n-- {prompt}"); | |
| // Randomize so their names are not predictable | |
| let mut names: Vec<String> = CAT_NAMES.iter().map(|n| n.to_string()).collect(); | |
| shuffle_strings(&mut names); | |
| // Make the eval directory | |
| let dir_name = PathBuf::from(out_dir).join(format!("eval{eval_num}")); | |
| fs::create_dir_all(&dir_name).map_err(ort_from_err)?; | |
| // Save the prompt | |
| let prompt_path = Path::new(&dir_name).join("prompt"); | |
| fs::write(prompt_path, format!("{prompt}\n")).map_err(ort_from_err)?; | |
| let mut key_file = File::create(Path::new(&dir_name).join("key")).map_err(ort_from_err)?; | |
| for (model_num, model) in models.iter().enumerate() { | |
| let parts: Vec<_> = model.split(' ').collect(); | |
| let enable_reasoning = parts.len() > 1; | |
| println!( | |
| "{} {}", | |
| parts[0], | |
| if enable_reasoning { "reasoning" } else { "" } | |
| ); | |
| let common = PromptOpts { | |
| prompt: None, | |
| // We clone the model name because the struct takes ownership of the String. | |
| models: vec![parts[0].to_string()], | |
| system: Some(SYSTEM_PROMPT.to_string()), | |
| priority: None, | |
| provider: None, | |
| show_reasoning: Some(true), | |
| reasoning: Some(ReasoningConfig { | |
| enabled: true, | |
| effort: Some(ReasoningEffort::Medium), | |
| ..Default::default() | |
| }), | |
| quiet: Some(false), | |
| merge_config: true, | |
| }; | |
| let cat_name = &names[model_num]; | |
| let mut out = File::create(Path::new(&dir_name).join(format!("{cat_name}.txt"))) | |
| .map_err(ort_from_err)?; | |
| let messages = vec![Message::user(prompt.to_string())]; | |
| let queue = prompt::start_prompt_thread(api_key, cancel_token, vec![], common, messages, 0); | |
| let mut consumer = queue.consumer(); | |
| while let Some(data) = consumer.get_next() { | |
| if cancel_token.is_cancelled() { | |
| break; | |
| } | |
| match data { | |
| Response::Start => {} | |
| Response::Think(think) => match think { | |
| ThinkEvent::Start => { | |
| let _ = write!(out, "<think>"); | |
| } | |
| ThinkEvent::Content(s) => { | |
| let _ = write!(out, "{s}"); | |
| } | |
| ThinkEvent::Stop => { | |
| let _ = write!(out, "</think>\n\n"); | |
| } | |
| }, | |
| Response::Content(content) => { | |
| let _ = write!(out, "{content}"); | |
| } | |
| Response::Stats(stats) => { | |
| let _ = writeln!(key_file, "{cat_name}: {stats}"); | |
| } | |
| Response::Error(err) => { | |
| return ort_err(err.to_string()); | |
| } | |
| Response::None => { | |
| eprintln!("Response::None means we read the wrong Queue position"); | |
| } | |
| } | |
| } | |
| let _ = writeln!(out); | |
| let _ = out.flush(); | |
| let _ = key_file.flush(); | |
| } | |
| Ok(()) | |
| } | |
/// Advances `seed` by one step of a 64-bit xorshift generator (shifts 13, 7, 17)
/// and returns the new state.
///
/// A seed of 0 stays 0, so callers should start from a non-zero seed.
fn xorshift(seed: &mut u64) -> u64 {
    let mut state = *seed;
    state ^= state << 13;
    state ^= state >> 7;
    state ^= state << 17;
    *seed = state;
    state
}
| fn shuffle_strings(vec: &mut [String]) { | |
| let mut seed = SystemTime::now() | |
| .duration_since(UNIX_EPOCH) | |
| .unwrap() | |
| .as_nanos() as u64; | |
| let len = vec.len(); | |
| for i in (1..len).rev() { | |
| let j = (xorshift(&mut seed) % (i as u64 + 1)) as usize; | |
| vec.swap(i, j); | |
| } | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment