Last active
July 4, 2025 16:25
-
-
Save mizchi/de8a19ef473c1fe881c4a541a415bcc2 to your computer and use it in GitHub Desktop.
My Portable RAG
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * My Portable RAG
 * $ pnpm add sqlite-vec @ai-sdk/google ai
 * SQLite Vector Search + Google AI Embeddings
 *
 * Required environment variables:
 *   GOOGLE_GENERATIVE_AI_API_KEY=your-api-key
 *
 * Usage:
 *   # Index text content
 *   node rag.ts index --db=index.db --text="Content to search" --title="Article Title" --url="https://example.com"
 *
 *   # Index file content
 *   node rag.ts index --db=index.db --file=document.txt --title="Document"
 *
 *   # Search
 *   node rag.ts query --db=index.db "search query"
 *
 * Options:
 *   --db: Database file path (default: index.db)
 *   -t, --text: Text to index
 *   -f, --file: File path to index
 *   --title: Document title (optional)
 *   --url: Document URL (optional)
 *   -k: Number of search results (default: 5)
 */
import { DatabaseSync } from "node:sqlite"; | |
import * as sqliteVec from "sqlite-vec"; | |
import { google } from "@ai-sdk/google"; | |
import { embed, embedMany } from "ai"; | |
const DIMENSION = 768; // Google Embedding dimension | |
const DEFAULT_DB_PATH = "index.db"; | |
/**
 * Split `text` into windows of at most `size` UTF-16 code units, where each
 * window starts `size - overlap` code units after the previous one.
 *
 * The final window may be shorter than `size`, and may be fully contained in
 * the one before it when the text ends inside the overlap region.
 *
 * @param text - Text to split.
 * @param size - Maximum window length; must be greater than 0.
 * @param overlap - Code units shared between consecutive windows; must be
 *   smaller than `size` so that the window start always advances.
 * @returns The windows in document order; an empty array for empty text.
 * @throws RangeError if `size` is not positive or `overlap >= size` (the
 *   latter would otherwise make the loop never terminate).
 */
export function chunk(text: string, size = 1000, overlap = 100): string[] {
  // Written as negated comparisons so that NaN is rejected as well.
  if (!(size > 0)) {
    throw new RangeError(`chunk size must be greater than 0, got ${size}`);
  }
  const step = size - overlap;
  if (!(step > 0)) {
    throw new RangeError(
      `chunk overlap (${overlap}) must be smaller than size (${size})`
    );
  }

  const chunks: string[] = [];
  for (let start = 0; start < text.length; start += step) {
    chunks.push(text.slice(start, start + size));
  }
  return chunks;
}
/**
 * Re-order search results by adding a keyword bonus to each score.
 *
 * The query is lower-cased and split on whitespace; every resulting word that
 * occurs (case-insensitively, as a substring) in a result's `content` adds
 * `0.1` to that result's score. Results are returned highest score first.
 *
 * Empty tokens (produced by an empty query or by leading/trailing whitespace)
 * are ignored: `"".includes("")` is always true, so counting them would
 * inflate every score by the same meaningless amount.
 *
 * @param query - Free-text query whose words are matched against `content`.
 * @param results - Candidates to score; the array and its items are not mutated.
 * @returns A new array of copied items with boosted `score`, sorted descending.
 */
export function rerank<T extends { content: string; score: number }>(
  query: string,
  results: T[]
): T[] {
  const words = query
    .toLowerCase()
    .split(/\s+/)
    .filter((word) => word.length > 0);

  return results
    .map((result) => {
      // Lower-case once per result instead of once per query word.
      const haystack = result.content.toLowerCase();
      const hits = words.filter((word) => haystack.includes(word)).length;
      return { ...result, score: result.score + hits * 0.1 };
    })
    .sort((a, b) => b.score - a.score);
}
/**
 * Open a SQLite database, load the sqlite-vec extension into it and make sure
 * the tables used by this module exist.
 *
 * Two tables are created if missing: `items` (chunk text plus optional title
 * and URL) and the `vec_items` vec0 virtual table holding one
 * `DIMENSION`-wide float embedding per row.
 *
 * @param path - Database file path; defaults to an in-memory database.
 * @returns The opened connection. The caller is responsible for closing it.
 */
export function createDB(path = ":memory:"): DatabaseSync {
  const schema = `
    CREATE TABLE IF NOT EXISTS items (
      id INTEGER PRIMARY KEY,
      content TEXT,
      title TEXT,
      url TEXT
    );
    CREATE VIRTUAL TABLE IF NOT EXISTS vec_items USING vec0(embedding float[${DIMENSION}]);
  `;

  // Extension loading must be allowed at construction time for sqlite-vec.
  const database = new DatabaseSync(path, { allowExtension: true });
  sqliteVec.load(database);
  database.exec(schema);
  return database;
}
/**
 * Store one chunk and its embedding.
 *
 * A row is inserted into `items`, then the embedding is inserted into
 * `vec_items` under the same rowid so the two can be joined later. Both
 * inserts run inside a SAVEPOINT: if the vector insert fails (for example on
 * a dimension mismatch), the `items` row is rolled back instead of being left
 * behind without an embedding. A SAVEPOINT is used rather than BEGIN so this
 * also works when the caller already has a transaction open.
 *
 * @param db - Open database whose schema matches the one set up by `createDB`.
 * @param content - Chunk text.
 * @param embedding - Embedding values; stored as packed 32-bit floats.
 * @param metadata - Optional title/URL; missing or empty values are stored as NULL.
 * @returns The rowid shared by the `items` and `vec_items` rows.
 * @throws Rethrows whatever error the database raised, after rolling back.
 */
export function save(
  db: DatabaseSync,
  content: string,
  embedding: number[],
  metadata?: { title?: string; url?: string }
): number {
  const vector = new Uint8Array(new Float32Array(embedding).buffer);

  db.exec("SAVEPOINT rag_save");
  try {
    const { lastInsertRowid } = db
      .prepare("INSERT INTO items (content, title, url) VALUES (?, ?, ?)")
      .run(content, metadata?.title || null, metadata?.url || null);

    // The rowid is passed as a BigInt so it is bound as an integer.
    db.prepare("INSERT INTO vec_items (rowid, embedding) VALUES (?, ?)").run(
      BigInt(lastInsertRowid),
      vector
    );

    db.exec("RELEASE rag_save");
    return Number(lastInsertRowid);
  } catch (error) {
    try {
      db.exec("ROLLBACK TO rag_save; RELEASE rag_save");
    } catch {
      // Keep the original failure; a rollback error would only mask it.
    }
    throw error;
  }
}
/**
 * Find the `k` stored chunks whose embeddings are nearest to `embedding`.
 *
 * Runs a KNN `MATCH` query against `vec_items`, joins the matching rows with
 * `items`, and returns them ordered by ascending distance.
 *
 * The rows returned by the driver are untyped, so each one is converted
 * explicitly instead of being returned as-is under a type it was never
 * checked against.
 *
 * @param db - Open database whose schema matches the one set up by `createDB`.
 * @param embedding - Query vector; sent as packed 32-bit floats.
 * @param k - Maximum number of neighbours to return.
 * @returns Matching rows, nearest first.
 * @throws RangeError if `k` is not a finite, non-negative number.
 */
export function search(
  db: DatabaseSync,
  embedding: number[],
  k = 5
): Array<{
  id: number;
  content: string;
  title: string | null;
  url: string | null;
  distance: number;
}> {
  // Fail early with a clear message (e.g. for NaN coming from CLI parsing)
  // instead of handing the database an unusable limit.
  if (!Number.isFinite(k) || k < 0) {
    throw new RangeError(`k must be a non-negative number, got ${k}`);
  }

  const vector = new Uint8Array(new Float32Array(embedding).buffer);
  const rows: unknown[] = db
    .prepare(
      `
      SELECT i.id, i.content, i.title, i.url, v.distance
      FROM vec_items v
      JOIN items i ON i.id = v.rowid
      WHERE v.embedding MATCH ? AND k = ?
      ORDER BY v.distance
      `
    )
    .all(vector, k);

  return rows.map((row) => {
    const record = row as Record<string, unknown>;
    return {
      id: Number(record.id),
      content: typeof record.content === "string" ? record.content : "",
      title: typeof record.title === "string" ? record.title : null,
      url: typeof record.url === "string" ? record.url : null,
      distance: Number(record.distance),
    };
  });
}
/**
 * Chunk `text`, embed every chunk in one batched request and store the
 * chunks with their embeddings.
 *
 * All rows for one call are written inside a single SAVEPOINT, so a failure
 * part-way through leaves no partially indexed document behind and the
 * database commits once instead of once per chunk. A SAVEPOINT is used rather
 * than BEGIN so this also works when the caller already has a transaction open.
 *
 * @param db - Open database whose schema matches the one set up by `createDB`.
 * @param text - Full document text.
 * @param metadata - Optional title/URL stored with every chunk.
 * @param chunkSize - Maximum chunk length passed to `chunk`.
 * @returns The rowids of the stored chunks, in document order; an empty array
 *   when the text yields no chunks (no embedding request is made then).
 * @throws Rethrows embedding or database errors; database changes made by
 *   this call are rolled back first.
 */
export async function index(
  db: DatabaseSync,
  text: string,
  metadata?: { title?: string; url?: string },
  chunkSize = 1000
): Promise<number[]> {
  const chunks = chunk(text, chunkSize);
  if (chunks.length === 0) {
    // Nothing to embed; avoid a pointless (and possibly rejected) API call.
    return [];
  }

  // One batched request for all chunks.
  const { embeddings } = await embedMany({
    model: google.textEmbeddingModel("text-embedding-004"),
    values: chunks,
  });

  const ids: number[] = [];
  db.exec("SAVEPOINT rag_index");
  try {
    for (const [i, chunkText] of chunks.entries()) {
      // Fall back to a zero vector if the provider returned fewer embeddings
      // than chunks, so the chunk text is still stored.
      const embedding = embeddings[i] || Array(DIMENSION).fill(0);
      ids.push(save(db, chunkText, embedding, metadata));
    }
    db.exec("RELEASE rag_index");
  } catch (error) {
    try {
      db.exec("ROLLBACK TO rag_index; RELEASE rag_index");
    } catch {
      // Keep the original failure; a rollback error would only mask it.
    }
    throw error;
  }
  return ids;
}
/**
 * End-to-end retrieval: embed the query, fetch nearest chunks, rerank them by
 * keyword overlap and return the best `k`.
 *
 * Twice as many candidates as requested are fetched from the vector index so
 * that reranking has room to promote results before the list is cut to `k`.
 * Distances are converted to scores with `1 / (1 + distance)`, so a smaller
 * distance gives a higher score.
 *
 * @param db - Open database whose schema matches the one set up by `createDB`.
 * @param query - Free-text query.
 * @param k - Number of results to return.
 * @returns Up to `k` results, highest score first.
 */
export async function ragSearch(
  db: DatabaseSync,
  query: string,
  k = 5
): Promise<
  Array<{
    content: string;
    title: string | null;
    url: string | null;
    score: number;
  }>
> {
  const model = google.textEmbeddingModel("text-embedding-004");
  const { embedding: queryVector } = await embed({ model, value: query });

  // Over-fetch, then turn each distance into a similarity-style score.
  const candidates = search(db, queryVector, k * 2).map(
    ({ content, title, url, distance }) => ({
      content,
      title,
      url,
      score: 1 / (1 + distance),
    })
  );

  const ranked = rerank(query, candidates);
  return ranked.slice(0, k);
}
// CLI entry point: runs only when this file is executed directly, not when it
// is imported as a module.
//
// The script path is converted with pathToFileURL() instead of being glued
// onto "file://" by hand, so the comparison also holds on Windows and for
// paths containing characters that URLs percent-encode (spaces, non-ASCII).
if (
  process.argv[1] !== undefined &&
  import.meta.url ===
    (await import("node:url")).pathToFileURL(process.argv[1]).href
) {
  const { parseArgs } = await import("node:util");
  const fs = await import("node:fs/promises");
  const cmd = process.argv[2];
  switch (cmd) {
    case "q":
    case "query": {
      const parsed = parseArgs({
        args: process.argv.slice(3),
        options: {
          db: { type: "string" },
          k: { type: "string", short: "k" },
        },
        allowPositionals: true,
      });
      const dbPath = parsed.values.db || DEFAULT_DB_PATH;
      const query = parsed.positionals[0] || "";
      const k = parsed.values.k ? Number(parsed.values.k) : 5;

      // Validate before opening the database or calling the embedding API.
      if (!query.trim()) {
        console.error("Please specify a search query");
        process.exit(1);
      }
      if (!Number.isInteger(k) || k <= 0) {
        console.error(`-k must be a positive integer, got: ${parsed.values.k}`);
        process.exit(1);
      }

      const db = createDB(dbPath);
      try {
        const results = await ragSearch(db, query, k);
        // Display results
        results.forEach((r, i) => {
          console.log(`\n${i + 1}. ${r.title || "(Untitled)"}`);
          if (r.url) console.log(` URL: ${r.url}`);
          console.log(` Score: ${r.score.toFixed(3)}`);
          console.log(` Content: ${r.content.substring(0, 100)}...`);
        });
      } finally {
        // Close the connection even when the search fails.
        db.close();
      }
      break;
    }
    case "i":
    case "index": {
      const parsed = parseArgs({
        args: process.argv.slice(3),
        options: {
          db: { type: "string" },
          text: { type: "string", short: "t" },
          file: { type: "string", short: "f" },
          title: { type: "string" },
          url: { type: "string" },
        },
        allowPositionals: true,
      });
      const dbPath = parsed.values.db || DEFAULT_DB_PATH;
      // --text wins over --file when both are given.
      const content =
        parsed.values.text ||
        (parsed.values.file
          ? await fs.readFile(parsed.values.file, "utf-8")
          : "");
      if (!content) {
        console.error("Please specify text or file path");
        process.exit(1);
      }
      const metadata = {
        title: parsed.values.title,
        url: parsed.values.url,
      };

      const db = createDB(dbPath);
      try {
        const ids = await index(db, content, metadata);
        console.log(`Indexed ${ids.length} chunks`);
        if (metadata.title) console.log(`Title: ${metadata.title}`);
        if (metadata.url) console.log(`URL: ${metadata.url}`);
      } finally {
        // Close the connection even when indexing fails.
        db.close();
      }
      break;
    }
    default: {
      console.error(`Unknown command: ${cmd || "(none)"}`);
      console.error("\nUsage:");
      console.error(
        " node rag.ts index --db=index.db --text='text' --title='Title' --url='URL'"
      );
      console.error(
        " node rag.ts index --db=index.db --file=file.txt --title='Title'"
      );
      console.error(" node rag.ts query --db=index.db 'search query'");
      console.error("\nCommands:");
      console.error(" i, index Index text content");
      console.error(" q, query Search indexed content");
      process.exit(1);
    }
  }
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment