@AshtonIzmev
Last active August 25, 2025 23:08
#%pip install sentence_transformers
#%pip install torch
###########################
import os
# Maximize tokenizer throughput on CPU threads
os.environ["TOKENIZERS_PARALLELISM"] = "true"
import torch
from sentence_transformers import SentenceTransformer
# Faster matmul on the A40 without a meaningful precision loss for inference
torch.backends.cuda.matmul.allow_tf32 = True
torch.set_float32_matmul_precision("high")
###########################
# Example corpus (replace with your real documents; larger lists improve GPU saturation)
import pickle
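# Hypothetical bootstrap (not in the original gist): write a small synthetic
# corpus when documents.pkl is missing, so the script runs end to end.
if not os.path.exists("documents.pkl"):
    toy_corpus = [f"Synthetic document {i} about topic {i % 7}." for i in range(10_000)]
    with open("documents.pkl", "wb") as f:
        pickle.dump(toy_corpus, f)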
with open("documents.pkl", "rb") as f:
documents = pickle.load(f)
###########################
# Load model on GPU
torch.backends.cudnn.benchmark = True
model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B", device="cuda")
model.max_seq_length = 512
def find_largest_batch_size(sentences):
    """Probe encode() with shrinking batch sizes until one fits in GPU memory."""
    # Try larger-to-smaller batch sizes to fill the GPU; adjust candidates as needed
    for b in [2048, 1536, 1024, 768, 512, 384, 256, 192, 128, 96, 64, 48, 32]:
        try:
            probe = sentences[: min(len(sentences), b * 2)]
            model.encode(
                probe,
                batch_size=b,
                device="cuda",
                show_progress_bar=False,
                convert_to_numpy=True,
                normalize_embeddings=False,
                num_workers=os.cpu_count(),
            )
            torch.cuda.synchronize()
            return b
        except RuntimeError as e:
            if "out of memory" in str(e).lower():
                torch.cuda.empty_cache()
            else:
                raise
    return 32
batch_size = find_largest_batch_size(documents)
print(batch_size)
# Encode in large batches to drive high GPU utilization
full_embeddings = model.encode(
    documents,
    batch_size=batch_size,
    device="cuda",
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=False,
    num_workers=os.cpu_count(),
)
print(f"Batch size used: {batch_size}")
print(f"Shape of full embeddings: {full_embeddings.shape}")
###########################
import pickle
with open("full_embeddings.pkl", "wb") as f:
    pickle.dump(full_embeddings, f)
"""
Vector Store Populator
This module handles creating embeddings and populating the vector database.
Responsibilities:
- Initialize SQLite vector database
- Generate embeddings using OpenAI (or a local Qwen3 model)
- Store embeddings with metadata
"""
import random
import sqlite_vec
import numpy as np
import sqlite3
from typing import List
from contextlib import contextmanager
from pydantic import BaseModel
from utils.openai_client import openai_client, use_mock
from utils.llms.schemas import Verse, Fatwa, Hadith
from utils.token_utils import token_counter
# Vector store configuration
DB_NAME = "data_rag.db"
EMBEDDING_TABLE = "data_embeddings"
METADATA_TABLE = "data_metadata"
EMBEDDING_DIM = 1024  # native dimension of Qwen/Qwen3-Embedding-0.6B

@contextmanager
def get_vector_db_connection():
    """Context manager for a SQLite connection with the vector extension loaded."""
    with sqlite3.connect(DB_NAME) as db:
        db.enable_load_extension(True)
        sqlite_vec.load(db)
        db.enable_load_extension(False)
        yield db
def serialize_f32(arr):
    """Serialize a numpy array to float32 bytes for SQLite storage."""
    return np.array(arr, dtype=np.float32).tobytes()
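# Companion helper (added for illustration, not in the original gist): decode
# a BLOB written by serialize_f32 back into a float32 vector.
def deserialize_f32(blob: bytes) -> np.ndarray:
    """Deserialize SQLite BLOB bytes back to a numpy float32 array."""
    return np.frombuffer(blob, dtype=np.float32)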
def init_vector_database():
    """Initialize the vector database with tables."""
    print(f"Initializing vector database: {DB_NAME}")
    with get_vector_db_connection() as db:
        # Drop existing tables so the schema is rebuilt from scratch
        db.execute(f"DROP TABLE IF EXISTS {EMBEDDING_TABLE}")
        db.execute(f"DROP TABLE IF EXISTS {METADATA_TABLE}")
        # Create the embedding table (vec0 virtual table from sqlite-vec)
        db.execute(f"""
            CREATE VIRTUAL TABLE IF NOT EXISTS {EMBEDDING_TABLE} USING vec0(
                objectid TEXT,
                embedding float[{EMBEDDING_DIM}]
            )
        """)
        # Create the metadata table
        db.execute(f"""
            CREATE TABLE IF NOT EXISTS {METADATA_TABLE} (
                objectid TEXT PRIMARY KEY,
                content_type TEXT,
                text_content TEXT,
                date_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)
        # Defensive cleanup (a no-op right after the drops above)
        db.execute(f"DELETE FROM {EMBEDDING_TABLE}")
        db.execute(f"DELETE FROM {METADATA_TABLE}")
        db.commit()
    print("Vector database initialized successfully")
def get_embedding(text: str) -> List[float]:
    """Get embedding for text using the OpenAI API."""
    if use_mock:
        return [random.random() for _ in range(EMBEDDING_DIM)]
    try:
        response = openai_client.embeddings.create(
            model="text-embedding-3-small",
            input=text,
            # text-embedding-3-small defaults to 1536 dimensions; request
            # EMBEDDING_DIM so the vectors match the vec0 table schema
            dimensions=EMBEDDING_DIM,
        )
        return response.data[0].embedding
    except Exception as e:
        print(f"Error getting embedding: {e}")
        raise
def get_hf_embedding(document: str) -> List[float]:
    """Get embedding for text using the local Qwen3 model."""
    from sentence_transformers import SentenceTransformer
    # Note: loading the model on every call is expensive; cache it at module
    # level if this function is called in a loop
    model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")
    # Encode the document; Qwen3-Embedding-0.6B natively outputs 1024-dim vectors
    full_embeddings = model.encode([document])
    # Truncate to the desired MRL dimension (a no-op here, since EMBEDDING_DIM
    # equals the model's native 1024 dimensions)
    mrl_embeddings = full_embeddings[:, :EMBEDDING_DIM]
    return mrl_embeddings[0]  # the first (and only) embedding, as a 1D array
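# Sketch (not in the original gist): nearest-neighbour lookup against the
# populated store. The `embedding MATCH ? AND k = ?` form follows sqlite-vec's
# documented vec0 KNN query interface; `search_similar` itself is a
# hypothetical helper.
def search_similar(query: str, k: int = 5):
    """Return the k closest (objectid, distance) pairs for a query string."""
    query_blob = serialize_f32(get_hf_embedding(query))
    with get_vector_db_connection() as db:
        return db.execute(
            f"""
            SELECT objectid, distance
            FROM {EMBEDDING_TABLE}
            WHERE embedding MATCH ? AND k = ?
            ORDER BY distance
            """,
            (query_blob, k),
        ).fetchall()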