#%pip install sentence_transformers
#%pip install torch
###########################
import os

# Maximize tokenizer throughput on CPU threads
os.environ["TOKENIZERS_PARALLELISM"] = "true"

import torch
from sentence_transformers import SentenceTransformer

# Allow TF32 matmul on Ampere GPUs such as the A40 for faster inference
# (negligible precision impact for embedding workloads)
torch.backends.cuda.matmul.allow_tf32 = True
torch.set_float32_matmul_precision("high")
###########################
# Example corpus (replace with your real documents; larger lists improve GPU saturation)
import pickle

with open("documents.pkl", "rb") as f:
    documents = pickle.load(f)
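# For illustration only (not part of the gist): documents.pkl can be produced
# from a plain Python list of strings, e.g.
#     docs = ["first document text", "second document text"]
#     with open("documents.pkl", "wb") as f:
#         pickle.dump(docs, f)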
###########################
# Load model on GPU
torch.backends.cudnn.benchmark = True
model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B", device="cuda")
model.max_seq_length = 512

def find_largest_batch_size(sentences):
    # Probe from larger to smaller batch sizes to fill the GPU; adjust candidates as needed
    for b in [2048, 1536, 1024, 768, 512, 384, 256, 192, 128, 96, 64, 48, 32]:
        try:
            probe = sentences[: min(len(sentences), b * 2)]
            model.encode(
                probe,
                batch_size=b,
                device="cuda",
                show_progress_bar=False,
                convert_to_numpy=True,
                normalize_embeddings=False,
                num_workers=os.cpu_count(),
            )
            torch.cuda.synchronize()
            return b
        except RuntimeError as e:
            if "out of memory" in str(e).lower():
                # This batch size does not fit; free cached memory and try the next candidate
                torch.cuda.empty_cache()
            else:
                raise
    return 32

batch_size = find_largest_batch_size(documents)
print(batch_size)
# Encode in large batches to drive high GPU utilization
full_embeddings = model.encode(
    documents,
    batch_size=batch_size,
    device="cuda",
    show_progress_bar=True,
    convert_to_numpy=True,
    normalize_embeddings=False,
    num_workers=os.cpu_count(),
)
print(f"Batch size used: {batch_size}")
print(f"Shape of full embeddings: {full_embeddings.shape}")
###########################
import pickle

with open("full_embeddings.pkl", "wb") as f:
    pickle.dump(full_embeddings, f)
| """ | |
| Vector Store Populator | |
| This module handles creating embeddings and populating the vector database. | |
| Responsibilities: | |
| - Initialize SQLite vector database | |
| - Generate embeddings using OpenAI | |
| - Store embeddings with metadata | |
| """ | |
| import random | |
| import sqlite_vec | |
| import numpy as np | |
| import sqlite3 | |
| from typing import List | |
| from contextlib import contextmanager | |
| from pydantic import BaseModel | |
| from utils.openai_client import openai_client, use_mock | |
| from utils.llms.schemas import Verse, Fatwa, Hadith | |
| from utils.token_utils import token_counter | |
# Vector store configuration
DB_NAME = "data_rag.db"
EMBEDDING_TABLE = "data_embeddings"
METADATA_TABLE = "data_metadata"
EMBEDDING_DIM = 1024  # Qwen/Qwen3-Embedding-0.6B

@contextmanager
def get_vector_db_connection():
    """Context manager for a SQLite connection with the sqlite-vec extension loaded."""
    with sqlite3.connect(DB_NAME) as db:
        db.enable_load_extension(True)
        sqlite_vec.load(db)
        db.enable_load_extension(False)
        yield db
def serialize_f32(arr):
    """Serialize an array of floats to raw float32 bytes for SQLite storage."""
    return np.array(arr, dtype=np.float32).tobytes()

def init_vector_database():
    """Initialize the vector database with embedding and metadata tables."""
    print(f"Initializing vector database: {DB_NAME}")
    with get_vector_db_connection() as db:
        # Drop existing tables for a clean rebuild
        db.execute(f"DROP TABLE IF EXISTS {EMBEDDING_TABLE}")
        db.execute(f"DROP TABLE IF EXISTS {METADATA_TABLE}")
        # Create the vec0 virtual table that holds the embeddings
        db.execute(f"""
            CREATE VIRTUAL TABLE IF NOT EXISTS {EMBEDDING_TABLE} USING vec0(
                objectid TEXT,
                embedding float[{EMBEDDING_DIM}]
            )
        """)
        # Create the metadata table
        db.execute(f"""
            CREATE TABLE IF NOT EXISTS {METADATA_TABLE} (
                objectid TEXT PRIMARY KEY,
                content_type TEXT,
                text_content TEXT,
                date_created TIMESTAMP DEFAULT CURRENT_TIMESTAMP
            )
        """)
        # Clear any existing data
        db.execute(f"DELETE FROM {EMBEDDING_TABLE}")
        db.execute(f"DELETE FROM {METADATA_TABLE}")
        db.commit()
    print("Vector database initialized successfully")
def get_embedding(text: str) -> List[float]:
    """Get an embedding for text using the OpenAI API."""
    if use_mock:
        return [random.random() for _ in range(EMBEDDING_DIM)]
    try:
        response = openai_client.embeddings.create(
            model="text-embedding-3-small",
            input=text,
            dimensions=EMBEDDING_DIM,  # match the vec0 schema (the model defaults to 1536 dims)
        )
        return response.data[0].embedding
    except Exception as e:
        print(f"Error getting embedding: {e}")
        raise
def get_hf_embedding(document: str) -> List[float]:
    """Get an embedding for text using a local Hugging Face model."""
    from sentence_transformers import SentenceTransformer
    # Load the model (note: reloading on every call is slow; cache it for repeated use)
    model = SentenceTransformer("Qwen/Qwen3-Embedding-0.6B")
    # Encode the document to get the model's full-dimension embedding
    full_embeddings = model.encode([document])
    # Truncate to the desired MRL dimension (Matryoshka truncation keeps the leading components)
    mrl_dimension = EMBEDDING_DIM
    mrl_embeddings = full_embeddings[:, :mrl_dimension]
    # Return the first (and only) embedding as a flat list of floats
    return mrl_embeddings[0].tolist()
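
The excerpt above ends with the embedding helpers; the "store embeddings with metadata" responsibility listed in the module docstring is not shown. Below is a minimal sketch of that step, assuming the schema created by init_vector_database. The function name store_embedding is illustrative and not taken from the gist.

def store_embedding(objectid: str, content_type: str, text_content: str,
                    embedding: List[float]) -> None:
    """Illustrative sketch: persist one embedding plus its metadata."""
    with get_vector_db_connection() as db:
        # The vec0 virtual table expects the vector serialized as float32 bytes
        db.execute(
            f"INSERT INTO {EMBEDDING_TABLE} (objectid, embedding) VALUES (?, ?)",
            (objectid, serialize_f32(embedding)),
        )
        db.execute(
            f"INSERT INTO {METADATA_TABLE} (objectid, content_type, text_content) VALUES (?, ?, ?)",
            (objectid, content_type, text_content),
        )
        db.commit()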