Created
March 27, 2021 16:29
-
-
Save tehbeard/30ed38acf6fc4ba5dd8a7dd771ad1382 to your computer and use it in GitHub Desktop.
Mucking around with Full text search and IndexedDB.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import { openDB, IDBPDatabase, DBSchema, IDBPTransaction,IDBCursorDirection } from "idb/with-async-ittr"; | |
import { stemmer } from "./stemmer"; | |
const TBL_ENTRY = "FTSEntry"; | |
const IDX_BY_SCORE = "byScore"; | |
const IDX_BY_DOC_ID = "byDocId"; | |
type FTSEntry = { id: any, word: string , count: number }; | |
interface FullTextSchema extends DBSchema { | |
[TBL_ENTRY]: { key: number, value: FTSEntry, indexes: { [IDX_BY_SCORE]: [string, number], [IDX_BY_DOC_ID]: any } } | |
} | |
type Tokenizer = (input: string) => string[] | |
type TokenProcessor = (input: string[]) => string[] | |
const defaultTokenizer = input => input.replace(/[^\w\s]/," ").split(/\s/) | |
const processorWrapper = (fn: (a:string) => string) => input => input.map(fn); | |
const lowercaseProcessor = processorWrapper( a => a.toLowerCase()); | |
const DEFAULT_STOP_WORDS = [ | |
'the', 'be', 'to', 'of', 'and', 'a', 'in', 'that', 'have', | |
'I', 'it', 'for', 'not', 'on', 'with', 'he', 'as', 'you', | |
'do', 'at', 'this', 'but', 'his', 'by', 'from' | |
]; | |
const stopwordProcessorFactory = stopwords => input => input.filter(s => !stopwords.includes(s)); | |
const blankBlcoker = input => input.filter(s => s!=""); | |
const stemmerProcessor = processorWrapper( a => stemmer(a)); | |
export class FullTextDb { | |
private db: IDBPDatabase<FullTextSchema>; | |
private tokenizer: Tokenizer = defaultTokenizer; | |
private processors: TokenProcessor[] = [ | |
lowercaseProcessor, | |
blankBlcoker, | |
stopwordProcessorFactory(DEFAULT_STOP_WORDS), | |
stemmerProcessor | |
]; | |
async open(name:string) | |
{ | |
this.db = await openDB<FullTextSchema>(name, 1, { | |
blocked() | |
{ | |
alert("BLOCKED"); | |
}, | |
terminated() | |
{ | |
alert("TERMINATED"); | |
}, | |
upgrade(db, oldVersion, newVersion, txn) { | |
const entry = db.createObjectStore(TBL_ENTRY,{ autoIncrement: true }) | |
entry.createIndex(IDX_BY_DOC_ID, "id"); | |
entry.createIndex(IDX_BY_SCORE, ["word", "count"]); | |
} | |
}); | |
} | |
async index(id: any, corpus: string) | |
{ | |
const rawTokens = this.tokenizer(corpus); | |
const processedTokens = this.processors.reduce( (t,cb) => cb(t), rawTokens); | |
const tokenCount = processedTokens.reduce( (o, v) => { | |
o[v] = (o[v] ?? 0) + 1; | |
return o; | |
}, {}) as Record<string, number>; | |
// console.log({ | |
// rawTokens, | |
// processedTokens, | |
// tokenCount | |
// }); | |
const txn = this.db.transaction(TBL_ENTRY,"readwrite"); | |
const oldKeys = await txn.store.index(IDX_BY_DOC_ID).getAllKeys(IDBKeyRange.only(id)); | |
for(let oldKey of oldKeys) | |
{ | |
txn.store.delete(oldKey); | |
} | |
for(let [word, count] of Object.entries(tokenCount)){ | |
txn.store.put({ | |
id, | |
word, | |
count | |
}) | |
} | |
await txn.done; | |
} | |
async search(terms: string) | |
{ | |
const rawTokens = this.tokenizer(terms); | |
const processedTokens = Array.from(new Set(this.processors.reduce( (t,cb) => cb(t), rawTokens))); | |
const txn = this.db.transaction(TBL_ENTRY, "readonly"); | |
const idx = txn.store.index(IDX_BY_SCORE); | |
const results = {}; | |
const [ | |
totalDocCount, | |
cursors | |
] = await Promise.all([ | |
(async () => { | |
let c = 0; | |
const it = txn.store.index(IDX_BY_DOC_ID).iterate(null, "nextunique") | |
for await(let i of it) | |
{ | |
c++; | |
} | |
return c; | |
})(), | |
Promise.all( | |
processedTokens.map( | |
async tkn => { | |
const key = IDBKeyRange.bound([tkn,1], [tkn, Number.MAX_SAFE_INTEGER]); | |
return { | |
cursor: idx.iterate(key, "prev"), | |
total: await idx.count(key) | |
}; | |
} | |
) | |
) | |
]); | |
for(let {cursor, total} of cursors) | |
{ | |
for await (let entry of cursor) | |
{ | |
results[entry.value.id] = { | |
...(results[entry.value.id] ?? {}), | |
[entry.value.word]: entry.value.count * Math.log( totalDocCount / total ) | |
} | |
} | |
} | |
const ranked = Object.entries( results ).map( ([ id, scores ]) => { | |
const totalScore = Object.values(scores).reduce( (a,b) => a+b, 0); | |
return { id, scores, totalScore }; | |
}).sort( ({totalScore:a},{totalScore: b}) => a == b ? 0 : Math.sign(b-a) ); | |
return ranked; | |
} | |
close() | |
{ | |
this.db.close(); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment