Install MLX LM:
pip install mlx-lm
And run:
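For reference, here is a minimal sketch of generating with the mlx_lm Python API; the model repo is just an example of an MLX-converted checkpoint, not one prescribed above.

from mlx_lm import load, generate

# Example MLX-converted checkpoint from the mlx-community org (illustrative choice).
model, tokenizer = load("mlx-community/Mistral-7B-Instruct-v0.3-4bit")
text = generate(model, tokenizer, prompt="Explain speculative decoding in two sentences.", max_tokens=200)
print(text)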
import datasets
import transformers
import vllm
from tqdm import tqdm

model_dir = 'meta-llama/Llama-2-7b-chat-hf'

# prompt from R1
system_prompt = "The user will ask you a question, and you should solve it. " \
    "You should first think about the reasoning process in the mind and then provide the user with the answer. " \
# outlines/processor/structured.py
...
class GuideLogitsProcessor(OutlinesLogitsProcessor):
    """Bias generation using a finite

    Attributes
    ----------
    tokenizer
        The tokenizer used to convert tokens to ids.
    guide
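For context, this processor is the machinery behind outlines' high-level constrained generators; a small sketch using the outlines 0.x API (the model choice and pattern are illustrative).

import outlines

model = outlines.models.transformers("Qwen/Qwen2.5-0.5B-Instruct")
# The guide masks tokens that would break the pattern at each decoding step.
generator = outlines.generate.regex(model, r"(yes|no)")
print(generator("Is 7 a prime number? Answer yes or no: "))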
import tqdm
import numpy as np
import torch
import torch.distributed as dist
import transformers

def extract_xml_answer(text: str) -> str:
    # Return whatever sits between the last <final_answer> ... </final_answer> pair.
    answer = text.split("<final_answer>")[-1]
    answer = answer.split("</final_answer>")[0]
    return answer.strip()
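A quick check of the helper on a tagged completion:

sample = "Some chain of thought here. <final_answer>42</final_answer>"
assert extract_xml_answer(sample) == "42"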
# train_grpo.py
#
# See https://github.com/willccbb/verifiers for ongoing developments
#
"""
citation:
@misc{brown2025grpodemo,
    title={Granular Format Rewards for Eliciting Mathematical Reasoning Capabilities in Small Language Models},
    author={Brown, William},
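The title points at format-based rewards; below is a hypothetical sketch of one such reward, scoring completions that wrap their result in the expected tags (the function name and scoring are illustrative, not taken from train_grpo.py).

import re

def format_reward(completions: list[str]) -> list[float]:
    # 1.0 when a completion contains exactly one well-formed
    # <final_answer>...</final_answer> block, 0.0 otherwise.
    pattern = re.compile(r"<final_answer>.+?</final_answer>", re.DOTALL)
    return [1.0 if len(pattern.findall(c)) == 1 else 0.0 for c in completions]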
You are an LLM Assistant for the Hugging Face DuckDB SQL Console. The user will ask you questions about the data in the DuckDB database and you will answer them using SQL. Use the context provided to answer the user's questions and decide which tables to query. Only respond with SQL and comments if needed.
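For example, a question like "how many rows are in the train split?" should come back as SQL only; here is a hypothetical round trip through the duckdb Python client, with a made-up stand-in table for illustration.

import duckdb

# Hypothetical stand-in "train" table, then the kind of query the assistant is expected to return.
duckdb.sql("CREATE TABLE train AS SELECT * FROM (VALUES (1, 'a'), (2, 'b')) AS t(id, text)")
print(duckdb.sql("SELECT COUNT(*) AS n_rows FROM train").df())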
def generate_speculative(
    model: nn.Module,
    draft_model: nn.Module,
    tokenizer: Union[PreTrainedTokenizer, TokenizerWrapper],
    prompt: str,
    max_tokens: int = 100,
    verbose: bool = False,
    formatter: Optional[Callable] = None,
    **kwargs,
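Assuming this function lives alongside mlx_lm's load helper, a call might look like the following; the model repos are examples of a large target model paired with a small draft model from the same family.

from mlx_lm import load

model, tokenizer = load("mlx-community/Qwen2.5-7B-Instruct-4bit")
draft_model, _ = load("mlx-community/Qwen2.5-0.5B-Instruct-4bit")

# Return value assumed to be the generated text.
text = generate_speculative(
    model,
    draft_model,
    tokenizer,
    prompt="Write a haiku about autumn.",
    max_tokens=100,
)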
"""Run `pip install ollama 'fastapi[standard]' phidata` to install dependencies.""" | |
"Pull the local model with - ollama pull hhao/qwen2.5-coder-tools:32b" | |
from textwrap import dedent | |
from datetime import datetime | |
from phi.agent import Agent | |
from phi.playground import Playground, serve_playground_app | |
from phi.model.ollama import Ollama |
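A hedged sketch of wiring this up into a phidata agent and serving it in the Playground; the agent settings and the app.py filename are assumptions, not taken from the original example.

agent = Agent(
    name="Local Coder",
    model=Ollama(id="hhao/qwen2.5-coder-tools:32b"),
    instructions=dedent(f"""\
        You are a local coding assistant. Today is {datetime.now():%Y-%m-%d}.
        Prefer short, runnable examples."""),
    markdown=True,
)

app = Playground(agents=[agent]).get_app()

if __name__ == "__main__":
    # "app:app" assumes this file is saved as app.py.
    serve_playground_app("app:app", reload=True)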
import streamlit as st
import concurrent.futures  # We'll do computations in separate processes!
import mymodule             # This is where you'll do the computation

# Your st calls must go inside this IF block.
if __name__ == '__main__':
    st.write("Starting a long computation on another process")

    # Pick max number of concurrent processes. Depends on how heavy your computation is,
    # and how powerful your machine is.
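    # Hypothetical continuation (not from the original snippet): submit the heavy
    # function from mymodule to a process pool and display its result.
    MAX_WORKERS = 2  # illustrative; tune to your workload and machine

    with concurrent.futures.ProcessPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # mymodule.long_computation is a stand-in name for your expensive function.
        future = executor.submit(mymodule.long_computation, 10_000_000)
        st.write("Result:", future.result())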