microsoft/bitnet-b1.58-2B-4T
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"machine_shape": "hm",
"authorship_tag": "ABX9TyMR9PY/dD0m1RzjasG6mtm6",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/chottokun/106eef0c6a1892e01ed58f0097b82486/microsoft-bitnet-b1-58-2b-4t.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "a8Z_R8saEeMo",
"outputId": "3c0b79af-226c-41dc-d405-8ff0e784deee"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"Collecting git+https://github.com/shumingma/transformers.git\n",
" Cloning https://github.com/shumingma/transformers.git to /tmp/pip-req-build-hfuvzjqu\n",
" Running command git clone --filter=blob:none --quiet https://github.com/shumingma/transformers.git /tmp/pip-req-build-hfuvzjqu\n",
" Resolved https://github.com/shumingma/transformers.git to commit e326683e468142d0b20b53f1e9fda1222e8e286a\n",
" Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n",
" Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n",
" Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n",
"Requirement already satisfied: filelock in /usr/local/lib/python3.11/dist-packages (from transformers==4.52.0.dev0) (3.18.0)\n",
"Requirement already satisfied: huggingface-hub<1.0,>=0.30.0 in /usr/local/lib/python3.11/dist-packages (from transformers==4.52.0.dev0) (0.30.2)\n",
"Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.11/dist-packages (from transformers==4.52.0.dev0) (2.0.2)\n",
"Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.11/dist-packages (from transformers==4.52.0.dev0) (24.2)\n",
"Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.11/dist-packages (from transformers==4.52.0.dev0) (6.0.2)\n",
"Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.11/dist-packages (from transformers==4.52.0.dev0) (2024.11.6)\n",
"Requirement already satisfied: requests in /usr/local/lib/python3.11/dist-packages (from transformers==4.52.0.dev0) (2.32.3)\n",
"Requirement already satisfied: tokenizers<0.22,>=0.21 in /usr/local/lib/python3.11/dist-packages (from transformers==4.52.0.dev0) (0.21.1)\n",
"Requirement already satisfied: safetensors>=0.4.3 in /usr/local/lib/python3.11/dist-packages (from transformers==4.52.0.dev0) (0.5.3)\n",
"Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.11/dist-packages (from transformers==4.52.0.dev0) (4.67.1)\n",
"Requirement already satisfied: fsspec>=2023.5.0 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub<1.0,>=0.30.0->transformers==4.52.0.dev0) (2025.3.2)\n",
"Requirement already satisfied: typing-extensions>=3.7.4.3 in /usr/local/lib/python3.11/dist-packages (from huggingface-hub<1.0,>=0.30.0->transformers==4.52.0.dev0) (4.13.1)\n",
"Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.11/dist-packages (from requests->transformers==4.52.0.dev0) (3.4.1)\n",
"Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.11/dist-packages (from requests->transformers==4.52.0.dev0) (3.10)\n",
"Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.11/dist-packages (from requests->transformers==4.52.0.dev0) (2.3.0)\n",
"Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.11/dist-packages (from requests->transformers==4.52.0.dev0) (2025.1.31)\n"
]
}
],
"source": [
"!pip install git+https://github.com/shumingma/transformers.git\n"
]
},
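{
"cell_type": "markdown",
"metadata": {},
"source": [
"Optional sanity check (a minimal sketch, not part of the original run): confirm that the dev build installed from the fork is the active `transformers` and whether a CUDA device is visible. In this session the model runs on CPU, as the stderr of the next cell shows."
]
},
{
"cell_type": "code",
"source": [
"import torch\n",
"import transformers\n",
"\n",
"# The pip log above resolved the fork to transformers 4.52.0.dev0.\n",
"print(\"transformers:\", transformers.__version__)\n",
"print(\"torch:\", torch.__version__)\n",
"print(\"CUDA available:\", torch.cuda.is_available())\n"
],
"metadata": {},
"execution_count": null,
"outputs": []
},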
{
"cell_type": "code",
"source": [
"import torch\n",
"from transformers import AutoModelForCausalLM, AutoTokenizer\n",
"\n",
"model_id = \"microsoft/bitnet-b1.58-2B-4T\"\n",
"\n",
"# Load tokenizer and model\n",
"tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
"model = AutoModelForCausalLM.from_pretrained(\n",
" model_id,\n",
" torch_dtype=torch.bfloat16\n",
")\n",
"\n",
"# Apply the chat template\n",
"messages = [\n",
" {\"role\": \"system\", \"content\": \"You are a helpful AI assistant.\"},\n",
" {\"role\": \"user\", \"content\": \"How are you?\"},\n",
"]\n",
"prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
"chat_input = tokenizer(prompt, return_tensors=\"pt\").to(model.device)\n",
"\n",
"# Generate response\n",
"chat_outputs = model.generate(**chat_input, max_new_tokens=2048)\n",
"response = tokenizer.decode(chat_outputs[0][chat_input['input_ids'].shape[-1]:], skip_special_tokens=True) # Decode only the response part\n",
"print(\"\\nAssistant Response:\", response)\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "hSYDik4LEqat",
"outputId": "ff39989c-3840-4fa0-c964-0a82ecf8118c"
},
"execution_count": 2,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"No CUDA runtime is found, using CUDA_HOME='/usr/local/cuda'\n",
"Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
"The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"Assistant Response: As an AI, I don't have feelings or emotions, but I'm here and ready to assist you! How can I help you today?\n"
]
}
]
},
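{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick size check for the model loaded above (a sketch added for illustration, not part of the original run). With `torch_dtype=torch.bfloat16`, the plain `transformers` path keeps the weights in bf16, so expect roughly 2 bytes per parameter rather than the packed 1.58-bit size that `bitnet.cpp` would use."
]
},
{
"cell_type": "code",
"source": [
"# Parameter count and in-memory footprint of the loaded checkpoint.\n",
"n_params = sum(p.numel() for p in model.parameters())\n",
"print(f\"Parameters: {n_params / 1e9:.2f} B\")\n",
"print(f\"Memory footprint: {model.get_memory_footprint() / 1e9:.2f} GB\")\n"
],
"metadata": {},
"execution_count": null,
"outputs": []
},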
{
"cell_type": "code",
"source": [
"import torch\n",
"from transformers import AutoModelForCausalLM, AutoTokenizer\n",
"\n",
"def generate_response(messages, model, tokenizer, max_new_tokens=50):\n",
" prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
" chat_input = tokenizer(prompt, return_tensors=\"pt\").to(model.device)\n",
" chat_outputs = model.generate(**chat_input, max_new_tokens=max_new_tokens)\n",
" response = tokenizer.decode(chat_outputs[0][chat_input['input_ids'].shape[-1]:], skip_special_tokens=True)\n",
" return response\n",
"\n",
"# Example usage (assuming model and tokenizer are already loaded as in the original code):\n",
"messages = [\n",
" {\"role\": \"system\", \"content\": \"You are a helpful AI assistant.\"},\n",
" {\"role\": \"user\", \"content\": \"How are you?\"},\n",
"]\n",
"\n",
"response = generate_response(messages, model, tokenizer)\n",
"print(\"\\nAssistant Response:\", response)\n",
"\n",
"messages = [\n",
" {\"role\": \"system\", \"content\": \"You are a helpful AI assistant.\"},\n",
" {\"role\": \"user\", \"content\": \"What is the capital of France?\"},\n",
"]\n",
"response = generate_response(messages, model, tokenizer)\n",
"print(\"\\nAssistant Response:\", response)\n"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "Gz95n15VEz9y",
"outputId": "635c57f7-9897-4d26-a14d-bd014d1b953a"
},
"execution_count": 3,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n",
"Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"Assistant Response: I'm just a computer program, so I don't have feelings, but I'm here and ready to help you! How can I assist you today?\n",
"\n",
"Assistant Response: The capital of France is Paris.\n"
]
}
]
},
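{
"cell_type": "markdown",
"metadata": {},
"source": [
"The helper above decodes greedily and triggers the `Setting pad_token_id` warning seen in stderr. The sketch below (an addition, not from the original run) passes `pad_token_id` explicitly to silence it and turns on sampling; the `temperature` and `top_p` values are illustrative, not tuned for this model."
]
},
{
"cell_type": "code",
"source": [
"# Sampled generation with an explicit pad_token_id (reuses `messages` from above).\n",
"prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
"chat_input = tokenizer(prompt, return_tensors=\"pt\").to(model.device)\n",
"out = model.generate(**chat_input, max_new_tokens=50, do_sample=True, temperature=0.7, top_p=0.9, pad_token_id=tokenizer.eos_token_id)\n",
"print(tokenizer.decode(out[0][chat_input[\"input_ids\"].shape[-1]:], skip_special_tokens=True))\n"
],
"metadata": {},
"execution_count": null,
"outputs": []
},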
{
"cell_type": "code",
"source": [
"%%time\n",
"messages = [\n",
" {\"role\": \"system\", \"content\": \"You are a helpful AI assistant.\"},\n",
" {\"role\": \"user\", \"content\": \"Can you speak Japanese?\"},\n",
"]\n",
"\n",
"response = generate_response(messages, model, tokenizer)\n",
"print(\"\\nAssistant Response:\", response)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "5Kzg4vB5GJ4a",
"outputId": "05047c25-0980-4c0c-aec9-a44c763a1110"
},
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"Assistant Response: はい、日本語を話せます。でも、your request is about a language, not a translation. If you need help with Japanese, feel free to ask.\n",
"CPU times: user 54 s, sys: 44.7 ms, total: 54 s\n",
"Wall time: 13.6 s\n"
]
}
]
},
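{
"cell_type": "markdown",
"metadata": {},
"source": [
"`%%time` reports whole-cell time, which folds in tokenization and decoding. The sketch below (an addition, assuming the objects defined above) times the `generate` call alone and derives a rough tokens-per-second figure."
]
},
{
"cell_type": "code",
"source": [
"import time\n",
"\n",
"# Time only the generate() call and report tokens per second.\n",
"prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)\n",
"chat_input = tokenizer(prompt, return_tensors=\"pt\").to(model.device)\n",
"start = time.perf_counter()\n",
"out = model.generate(**chat_input, max_new_tokens=50, pad_token_id=tokenizer.eos_token_id)\n",
"elapsed = time.perf_counter() - start\n",
"new_tokens = out.shape[-1] - chat_input[\"input_ids\"].shape[-1]\n",
"print(f\"{new_tokens} new tokens in {elapsed:.1f}s -> {new_tokens / elapsed:.1f} tok/s\")\n"
],
"metadata": {},
"execution_count": null,
"outputs": []
},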
{
"cell_type": "code",
"source": [
"%%time\n",
"messages = [\n",
" {\"role\": \"system\", \"content\": \"You are a helpful AI assistant.\"},\n",
" {\"role\": \"user\", \"content\": \"まどか☆マギカで一番かわいいのは誰?\"},\n",
"]\n",
"\n",
"response = generate_response(messages, model, tokenizer)\n",
"print(\"\\nAssistant Response:\", response)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "R8QjlibDFFEh",
"outputId": "4ca13052-045d-4016-8b02-903d54340ed9"
},
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"Assistant Response: マギカは、多くの人々とアニメで登場し、多くの人々と引用することが多いです。もちろん、彼女は、多くの人々とかわいい humanoid humanoid_characterに大好きです。\n",
"CPU times: user 1min 8s, sys: 98.8 ms, total: 1min 8s\n",
"Wall time: 17 s\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"%%time\n",
"messages = [\n",
" {\"role\": \"system\", \"content\": \"You are a helpful AI assistant.\"},\n",
" {\"role\": \"user\", \"content\": \"Who is the cutest character in Madoka Magica?\"},\n",
"]\n",
"\n",
"response = generate_response(messages, model, tokenizer)\n",
"print(\"\\nAssistant Response:\", response)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "mpe-7IHuFvWW",
"outputId": "59d7e347-9e55-474b-824c-86881414a0ba"
},
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"Assistant Response: The cutest character in \"Madoka Magica\" is often considered to be Kyubey, the magical creature who offers girls the chance to become magical girls. Kyubey is known for its innocent and childlike appearance, which adds to its\n",
"CPU times: user 1min 8s, sys: 79.1 ms, total: 1min 8s\n",
"Wall time: 17.2 s\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"%%time\n",
"messages = [\n",
" {\"role\": \"system\", \"content\": \"You are a helpful AI assistant.\"},\n",
" {\"role\": \"user\", \"content\": \"Calculate (1 + 1 \\times 2 + 3 \\div 2 + 2^{10}). After completing the calculation, verify the result. Finally, provide the calculation process and the final answer.\"},\n",
"]\n",
"\n",
"response = generate_response(messages, model, tokenizer)\n",
"print(\"\\nAssistant Response:\", response)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "SXw5YFPUGBWx",
"outputId": "fc5dc0d8-961f-40a5-f2c8-9df4e4c52a9e"
},
"execution_count": 7,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.\n"
]
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"\n",
"Assistant Response: To calculate the given expression, we will follow the order of operations, often remembered by the acronym PEMDAS (Parentheses, Exponents, Multiplication and Division (from left to right), Addition and Subtraction (from left to right)).\n",
"\n",
"\n",
"CPU times: user 1min 21s, sys: 92.9 ms, total: 1min 21s\n",
"Wall time: 20.3 s\n"
]
}
]
}
]
}