@pszemraj
Created December 10, 2025 04:43
Inference with rnj-1
import time

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer


# 1. Basic Timer Context Manager
class Timer:
    def __init__(self, name="Task"):
        self.name = name

    def __enter__(self):
        self.start_time = time.perf_counter()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.end_time = time.perf_counter()
        self.elapsed = self.end_time - self.start_time
        print(f"[{self.name}] finished in {self.elapsed:.4f} seconds.")


# 2. Load model and tokenizer
model_id = "EssentialAI/rnj-1-instruct"
print(f"Loading model: {model_id}...")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    dtype=torch.bfloat16,  # recent transformers; older releases use torch_dtype
    device_map="auto",
)
tokenizer = AutoTokenizer.from_pretrained(model_id)
print("Model and tokenizer loaded successfully.")

# 3. Build the chat prompt
messages = [
    {
        "role": "system",
        "content": "You are a helpful AI assistant.",
    },  # Optional system message
    {"role": "user", "content": "Who are you?"},
]
input_ids = tokenizer.apply_chat_template(
    messages, add_generation_prompt=True, return_tensors="pt"
).to(model.device)

# --- Generate Prediction --- #
print("Generating prediction...")
with Timer("Model Generation"):
    output_ids = model.generate(
        input_ids,
        max_new_tokens=50,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.2,
        top_p=0.95,
    )

# Decode only the newly generated tokens, skipping the prompt
response = tokenizer.decode(
    output_ids[0][input_ids.shape[-1]:], skip_special_tokens=True
)
print(response)
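
If you want output to appear token by token rather than all at once after generate() returns, transformers ships a TextStreamer that decodes and prints tokens as generation proceeds. Below is a minimal sketch reusing the model, tokenizer, input_ids, and Timer defined above; the streamer and generation settings here are illustrative additions, not part of the original gist:

from transformers import TextStreamer

# TextStreamer prints decoded tokens to stdout as they are produced.
# skip_prompt=True suppresses re-printing the input prompt; extra
# keyword arguments are forwarded to tokenizer.decode().
streamer = TextStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

with Timer("Streaming Generation"):
    model.generate(
        input_ids,
        max_new_tokens=50,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        temperature=0.2,
        top_p=0.95,
        streamer=streamer,
    )

Because Timer stores the elapsed time on the instance, wrapping generate() in `with Timer(...) as t:` also lets you estimate rough throughput afterwards as (output_ids.shape[-1] - input_ids.shape[-1]) / t.elapsed tokens per second.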