jrosell · July 2, 2025 16:36 · jrosell · Jul 2, 2025
diff --git a/ai-evals.R b/ai-evals.R
 rlang::check_installed(c("vitals", "ellmer", "dplyr", "ggplot2"))

 library(vitals)
 library(ellmer)
 library(dplyr)
 library(ggplot2)

 # Three single-step addition prompts, each paired with its expected answer.
 eval_df <- tribble(
   ~input,        ~target,
   "What's 2+2?", "4",
   "What's 2+3?", "5",
   "What's 2+4?", "6"
 )

 # One task definition reused for every model: `generate()` is the solver and
 # `model_graded_qa()` is the scorer.
 tsk <- Task$new(
   scorer = model_graded_qa(),
   solver = generate(),
   dataset = eval_df
 )

 # Each model is evaluated on its own clone of the task, so `tsk` itself is
 # never evaluated. The llama run happens first, then the qwen run.
 tsk_llama <- tsk$clone()$eval(
   solver_chat = chat_ollama(model = "llama3.2:3b")
 )
 tsk_qwen <- tsk$clone()$eval(
   solver_chat = chat_ollama(model = "qwen3:4b")
 )

 # Combine both evaluated tasks into one object, named `llama` and `qwen`.
 tsk_eval <- vitals_bind(llama = tsk_llama, qwen = tsk_qwen)

 # Horizontal stacked bar chart: one bar per model, filled by grade.
 # The single-letter grades I / P / C are relabelled through `factor()`'s
 # paired `levels` / `labels`; any other value (including NA) becomes NA.
 tsk_eval |>
   mutate(
     score = factor(
       score,
       levels = c("I", "P", "C"),
       labels = c("Incorrect", "Partially correct", "Correct"),
       ordered = TRUE
     )
   ) |>
   rename(model = task) |>
   ggplot(aes(y = model, fill = score)) +
   geom_bar() +
   # `breaks = rev` reverses the order of the legend entries.
   scale_fill_brewer(palette = "RdYlGn", breaks = rev)
	rlang::check_installed(c("vitals", "ellmer", "dplyr", "ggplot2"))

	library(vitals)
	library(ellmer)
	library(dplyr)
	library(ggplot2)

	# Evaluation dataset: each `input` prompt is paired with its expected
	# `target` answer.
	eval_df <- tibble(
	  input = c("What's 2+2?", "What's 2+3?", "What's 2+4?"),
	  target = c("4", "5", "6")
	)

	# Task definition shared by both model runs.
	tsk <- Task$new(
	  dataset = eval_df,
	  solver = generate(),
	  scorer = model_graded_qa()
	)

	# Evaluate a clone of the task with each chat model (llama first, then
	# qwen); `tsk` itself is left unevaluated.
	tsk_llama <- tsk$clone()$eval(
	  solver_chat = chat_ollama(model = "llama3.2:3b")
	)
	tsk_qwen <- tsk$clone()$eval(
	  solver_chat = chat_ollama(model = "qwen3:4b")
	)

	# Bind the two evaluated tasks together under the names `llama` and `qwen`.
	tsk_eval <- vitals_bind(llama = tsk_llama, qwen = tsk_qwen)

	# Horizontal stacked bar chart: one bar per model, filled by grade.
	# Note: the pipe must be written `|>`; the escaped form `\|>` is a syntax
	# error in R.
	tsk_eval |>
	  rename(model = task) |>
	  mutate(
	    # Expand the single-letter grades into readable labels, ordered from
	    # worst to best. Values other than I / P / C become NA.
	    score = factor(
	      case_when(
	        score == "I" ~ "Incorrect",
	        score == "P" ~ "Partially correct",
	        score == "C" ~ "Correct"
	      ),
	      levels = c("Incorrect", "Partially correct", "Correct"),
	      ordered = TRUE
	    )
	  ) |>
	  ggplot(aes(y = model, fill = score)) +
	  geom_bar() +
	  # `breaks = rev` reverses the order of the legend entries.
	  scale_fill_brewer(breaks = rev, palette = "RdYlGn")