Created
June 26, 2020 00:24
-
-
Save Rayraegah/f0be8cc59edd71843acbc8764146f300 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/* | |
A Succinct Trie for Javascript | |
By Steve Hanov | |
Released to the public domain. | |
This file contains functions for creating a succinctly encoded trie structure | |
from a list of words. The trie is encoded to a succinct bit string using the | |
method of Jacobson (1989). The bitstring is then encoded using BASE-64. | |
The resulting trie does not have to be decoded to be used. This file also | |
contains functions for looking up a word in the BASE-64 encoded data, in | |
O(mlogn) time, where m is the number of letters in the target word, and n is | |
the number of nodes in the trie. | |
Objects for encoding: | |
TrieNode | |
Trie | |
BitWriter | |
Objects for decoding: | |
BitString | |
FrozenTrieNode | |
FrozenTrie | |
QUICK USAGE: | |
Suppose we let data be some output of the demo encoder: | |
var data = { | |
"nodeCount": 37, | |
"directory": "BMIg", | |
"trie": "v2qqqqqqqpIUn4A5JZyBZ4ggCKh55ZZgBA5ZZd5vIEl1wx8g8A" | |
}; | |
var frozenTrie = new FrozenTrie( Data.trie, Data.directory, Data.nodeCount); | |
alert( frozenTrie.lookup( "hello" ) ); // outputs true | |
alert( frozenTrie.lookup( "kwijibo" ) ); // outputs false | |
*/ | |
// Configure the bit writing and reading functions to work natively in BASE-64 | |
// encoding. That way, we don't have to convert back and forth to bytes. | |
const BASE64 = | |
"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789-_"; | |
/** | |
The width of each unit of the encoding, in bits. Here we use 6, for base-64 | |
encoding. | |
*/ | |
const W = 6; | |
/** | |
Returns the character unit that represents the given value. If this were | |
binary data, we would simply return id. | |
*/ | |
function CHR(id) | |
{ | |
return BASE64[id]; | |
} | |
/** | |
Returns the decimal value of the given character unit. | |
*/ | |
const BASE64_CACHE = {"A" : 0, "B" : 1, "C" : 2, "D" : 3, "E" : 4, "F" : 5, "G" : | |
6, "H" : 7, "I" : 8, "J" : 9, "K" : 10, "L" : 11, "M" : 12, "N" : 13, "O" : | |
14, "P" : 15, "Q" : 16, "R" : 17, "S" : 18, "T" : 19, "U" : 20, "V" : | |
21, "W" : 22, "X" : 23, "Y" : 24, "Z" : 25, "a" : 26, "b" : 27, "c" : | |
28, "d" : 29, "e" : 30, "f" : 31, "g" : 32, "h" : 33, "i" : 34, "j" : | |
35, "k" : 36, "l" : 37, "m" : 38, "n" : 39, "o" : 40, "p" : 41, "q" : | |
42, "r" : 43, "s" : 44, "t" : 45, "u" : 46, "v" : 47, "w" : 48, "x" : | |
49, "y" : 50, "z" : 51, "0" : 52, "1" : 53, "2" : 54, "3" : 55, "4" : | |
56, "5" : 57, "6" : 58, "7" : 59, "8" : 60, "9" : 61, "-" : 62, "_" : | |
63}; | |
function ORD(ch) | |
{ | |
// Used to be: return BASE64.indexOf(ch); | |
return BASE64_CACHE[ch]; | |
} | |
/** | |
Fixed values for the L1 and L2 table sizes in the Rank Directory | |
*/ | |
const L1 = 32*32; | |
const L2 = 32; | |
/** | |
The BitWriter will create a stream of bytes, letting you write a certain | |
number of bits at a time. This is part of the encoder, so it is not | |
optimized for memory or speed. | |
*/ | |
class BitWriter { | |
constructor() { | |
this.init(); | |
} | |
init() { | |
this.bits = []; | |
} | |
/** | |
Write some data to the bit string. The number of bits must be 32 or | |
fewer. | |
*/ | |
write(data, numBits) { | |
for( let i = numBits - 1; i >= 0; i-- ) { | |
if ( data & ( 1 << i ) ) { | |
this.bits.push(1); | |
} else { | |
this.bits.push(0); | |
} | |
} | |
} | |
/** | |
Get the bitstring represented as a javascript string of bytes | |
*/ | |
getData() { | |
const chars = []; | |
let b = 0; | |
let i = 0; | |
for ( let j = 0; j < this.bits.length; j++ ) { | |
b = ( b << 1 ) | this.bits[j]; | |
i += 1; | |
if ( i === W ) { | |
chars.push( CHR(b) ); | |
i = b = 0; | |
} | |
} | |
if ( i ) { | |
chars.push( CHR(b << ( W - i )) ); | |
} | |
return chars.join(""); | |
} | |
/** | |
Returns the bits as a human readable binary string for debugging | |
*/ | |
getDebugString(group) { | |
const chars = []; | |
let i = 0; | |
for( let j = 0; j < this.bits.length; j++ ) { | |
chars.push( `${this.bits[j]}` ); | |
i++; | |
if ( i === group ) { | |
chars.push( ' ' ); | |
i = 0; | |
} | |
} | |
return chars.join(""); | |
} | |
} | |
/** | |
Given a string of data (eg, in BASE-64), the BitString class supports | |
reading or counting a number of bits from an arbitrary position in the | |
string. | |
*/ | |
class BitString { | |
constructor(str) { | |
this.init( str ); | |
} | |
init(str) { | |
this.bytes = str; | |
this.length = this.bytes.length * W; | |
} | |
/** | |
Returns the internal string of bytes | |
*/ | |
getData() { | |
return this.bytes; | |
} | |
/** | |
Returns a decimal number, consisting of a certain number, n, of bits | |
starting at a certain position, p. | |
*/ | |
get(p, n) { | |
// case 1: bits lie within the given byte | |
if ( ( p % W ) + n <= W ) { | |
return ( ORD( this.bytes[ p / W | 0 ] ) & BitString.MaskTop[ p % W ] ) >> | |
( W - p % W - n ); | |
// case 2: bits lie incompletely in the given byte | |
} else { | |
let result = ( ORD( this.bytes[ p / W | 0 ] ) & | |
BitString.MaskTop[ p % W ] ); | |
const l = W - p % W; | |
p += l; | |
n -= l; | |
while ( n >= W ) { | |
result = (result << W) | ORD( this.bytes[ p / W | 0 ] ); | |
p += W; | |
n -= W; | |
} | |
if ( n > 0 ) { | |
result = (result << n) | ( ORD( this.bytes[ p / W | 0 ] ) >> | |
( W - n ) ); | |
} | |
return result; | |
} | |
} | |
/** | |
Counts the number of bits set to 1 starting at position p and | |
ending at position p + n | |
*/ | |
count(p, n) { | |
let count = 0; | |
while( n >= 8 ) { | |
count += BitString.BitsInByte[ this.get( p, 8 ) ]; | |
p += 8; | |
n -= 8; | |
} | |
return count + BitString.BitsInByte[ this.get( p, n ) ]; | |
} | |
/** | |
Returns the number of bits set to 1 up to and including position x. | |
This is the slow implementation used for testing. | |
*/ | |
rank(x) { | |
let rank = 0; | |
for( let i = 0; i <= x; i++ ) { | |
if ( this.get(i, 1) ) { | |
rank++; | |
} | |
} | |
return rank; | |
} | |
} | |
BitString.MaskTop = [ | |
0x3f, 0x1f, 0x0f, 0x07, 0x03, 0x01, 0x00 | |
]; | |
BitString.BitsInByte = [ | |
0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4, 1, 2, 2, 3, 2, 3, 3, 4, 2, | |
3, 3, 4, 3, 4, 4, 5, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, | |
3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, | |
4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, | |
3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, | |
6, 6, 7, 1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5, 2, 3, 3, 4, 3, 4, | |
4, 5, 3, 4, 4, 5, 4, 5, 5, 6, 2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, | |
6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 2, 3, 3, 4, 3, 4, 4, 5, | |
3, 4, 4, 5, 4, 5, 5, 6, 3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 3, | |
4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7, 4, 5, 5, 6, 5, 6, 6, 7, 5, 6, | |
6, 7, 6, 7, 7, 8 | |
]; | |
/** | |
The rank directory allows you to build an index to quickly compute the | |
rank() and select() functions. The index can itself be encoded as a binary | |
string. | |
*/ | |
class RankDirectory { | |
constructor(directoryData, bitData, numBits, l1Size, l2Size) { | |
this.init(directoryData, bitData, numBits, l1Size, l2Size); | |
} | |
init(directoryData, bitData, numBits, l1Size, l2Size) { | |
this.directory = new BitString( directoryData ); | |
this.data = new BitString( bitData ); | |
this.l1Size = l1Size; | |
this.l2Size = l2Size; | |
this.l1Bits = Math.ceil( Math.log( numBits ) / Math.log( 2 ) ); | |
this.l2Bits = Math.ceil( Math.log( l1Size ) / Math.log( 2 ) ); | |
this.sectionBits = (l1Size / l2Size - 1) * this.l2Bits + this.l1Bits; | |
this.numBits = numBits; | |
} | |
/** | |
Returns the string representation of the directory. | |
*/ | |
getData() { | |
return this.directory.getData(); | |
} | |
/** | |
Returns the number of 1 or 0 bits (depending on the "which" parameter) to | |
to and including position x. | |
*/ | |
rank(which, x) { | |
if ( which === 0 ) { | |
return x - this.rank( 1, x ) + 1; | |
} | |
let rank = 0; | |
let o = x; | |
let sectionPos = 0; | |
if ( o >= this.l1Size ) { | |
sectionPos = ( o / this.l1Size | 0 ) * this.sectionBits; | |
rank = this.directory.get( sectionPos - this.l1Bits, this.l1Bits ); | |
o = o % this.l1Size; | |
} | |
if ( o >= this.l2Size ) { | |
sectionPos += ( o / this.l2Size | 0 ) * this.l2Bits; | |
rank += this.directory.get( sectionPos - this.l2Bits, this.l2Bits ); | |
} | |
rank += this.data.count( x - x % this.l2Size, x % this.l2Size + 1 ); | |
return rank; | |
} | |
/** | |
Returns the position of the y'th 0 or 1 bit, depending on the "which" | |
parameter. | |
*/ | |
select(which, y) { | |
let high = this.numBits; | |
let low = -1; | |
let val = -1; | |
while ( high - low > 1 ) { | |
const probe = (high + low) / 2 | 0; | |
const r = this.rank( which, probe ); | |
if ( r === y ) { | |
// We have to continue searching after we have found it, | |
// because we want the _first_ occurrence. | |
val = probe; | |
high = probe; | |
} else if ( r < y ) { | |
low = probe; | |
} else { | |
high = probe; | |
} | |
} | |
return val; | |
} | |
} | |
/** | |
Used to build a rank directory from the given input string. | |
@param data A javascript string containing the data, as readable using the | |
BitString object. | |
@param numBits The number of bits to index. | |
@param l1Size The number of bits that each entry in the Level 1 table | |
summarizes. This should be a multiple of l2Size. | |
@param l2Size The number of bits that each entry in the Level 2 table | |
summarizes. | |
*/ | |
RankDirectory.Create = (data, numBits, l1Size, l2Size) => { | |
const bits = new BitString( data ); | |
let p = 0; | |
let i = 0; | |
let count1 = 0; | |
let count2 = 0; | |
const l1bits = Math.ceil( Math.log( numBits ) / Math.log(2) ); | |
const l2bits = Math.ceil( Math.log( l1Size ) / Math.log(2) ); | |
const directory = new BitWriter(); | |
while( p + l2Size <= numBits ) { | |
count2 += bits.count( p, l2Size ); | |
i += l2Size; | |
p += l2Size; | |
if ( i === l1Size ) { | |
count1 += count2; | |
directory.write( count1, l1bits ); | |
count2 = 0; | |
i = 0; | |
} else { | |
directory.write( count2, l2bits ); | |
} | |
} | |
return new RankDirectory( directory.getData(), data, numBits, l1Size, l2Size ); | |
}; | |
/** | |
A Trie node, for use in building the encoding trie. This is not needed for | |
the decoder. | |
*/ | |
function TrieNode( letter ) | |
{ | |
this.letter = letter; | |
this.final = false; | |
this.children = []; | |
} | |
class Trie { | |
constructor() { | |
this.init(); | |
} | |
init() { | |
this.previousWord = ""; | |
this.root = new TrieNode(' '); | |
this.cache = [ this.root ]; | |
this.nodeCount = 1; | |
} | |
/** | |
Returns the number of nodes in the trie | |
*/ | |
getNodeCount() { | |
return this.nodeCount; | |
} | |
/** | |
Inserts a word into the trie. This function is fastest if the words are | |
inserted in alphabetical order. | |
*/ | |
insert(word) { | |
let commonPrefix = 0; | |
for( var i = 0; i < Math.min( word.length, this.previousWord.length ); | |
i++ ) | |
{ | |
if ( word[i] !== this.previousWord[i] ) { break; } | |
commonPrefix += 1; | |
} | |
this.cache.length = commonPrefix + 1; | |
let node = this.cache[ this.cache.length - 1 ]; | |
for( i = commonPrefix; i < word.length; i++ ) { | |
const next = new TrieNode( word[i] ); | |
this.nodeCount++; | |
node.children.push( next ); | |
this.cache.push( next ); | |
node = next; | |
} | |
node.final = true; | |
this.previousWord = word; | |
} | |
/** | |
Apply a function to each node, traversing the trie in level order. | |
*/ | |
apply(fn) { | |
const level = [ this.root ]; | |
while( level.length > 0 ) { | |
const node = level.shift(); | |
for( let i = 0; i < node.children.length; i++ ) { | |
level.push( node.children[i] ); | |
} | |
fn( node ); | |
} | |
} | |
/** | |
Encode the trie and all of its nodes. Returns a string representing the | |
encoded data. | |
*/ | |
encode() { | |
// Write the unary encoding of the tree in level order. | |
const bits = new BitWriter(); | |
bits.write( 0x02, 2 ); | |
this.apply( ({children}) => { | |
for( let i = 0; i < children.length; i++ ) { | |
bits.write( 1, 1 ); | |
} | |
bits.write( 0, 1 ); | |
}); | |
// Write the data for each node, using 6 bits for node. 1 bit stores | |
// the "final" indicator. The other 5 bits store one of the 26 letters | |
// of the alphabet. | |
const a = ("a").charCodeAt(0); | |
this.apply( ({letter, final}) => { | |
let value = letter.charCodeAt(0) - a; | |
if ( final ) { | |
value |= 0x20; | |
} | |
bits.write( value, 6 ); | |
}); | |
return bits.getData(); | |
} | |
} | |
/** | |
This class is used for traversing the succinctly encoded trie. | |
*/ | |
class FrozenTrieNode { | |
constructor(trie, index, letter, final, firstChild, childCount) { | |
this.trie = trie; | |
this.index = index; | |
this.letter = letter; | |
this.final = final; | |
this.firstChild = firstChild; | |
this.childCount = childCount; | |
} | |
/** | |
Returns the number of children. | |
*/ | |
getChildCount() { | |
return this.childCount; | |
} | |
/** | |
Returns the FrozenTrieNode for the given child. | |
@param index The 0-based index of the child of this node. For example, if | |
the node has 5 children, and you wanted the 0th one, pass in 0. | |
*/ | |
getChild(index) { | |
return this.trie.getNodeByIndex( this.firstChild + index ); | |
} | |
} | |
/** | |
The FrozenTrie is used for looking up words in the encoded trie. | |
@param data A string representing the encoded trie. | |
@param directoryData A string representing the RankDirectory. The global L1 | |
and L2 constants are used to determine the L1Size and L2size. | |
@param nodeCount The number of nodes in the trie. | |
*/ | |
class FrozenTrie { | |
constructor(data, directoryData, nodeCount) { | |
this.init( data, directoryData, nodeCount ); | |
} | |
init(data, directoryData, nodeCount) { | |
this.data = new BitString( data ); | |
this.directory = new RankDirectory( directoryData, data, | |
nodeCount * 2 + 1, L1, L2 ); | |
// The position of the first bit of the data in 0th node. In non-root | |
// nodes, this would contain 6-bit letters. | |
this.letterStart = nodeCount * 2 + 1; | |
} | |
/** | |
Retrieve the FrozenTrieNode of the trie, given its index in level-order. | |
This is a private function that you don't have to use. | |
*/ | |
getNodeByIndex(index) { | |
// retrieve the 6-bit letter. | |
const final = this.data.get( this.letterStart + index * 6, 1 ) === 1; | |
const letter = String.fromCharCode( | |
this.data.get( this.letterStart + index * 6 + 1, 5 ) + | |
'a'.charCodeAt(0)); | |
const firstChild = this.directory.select( 0, index+1 ) - index; | |
// Since the nodes are in level order, this nodes children must go up | |
// until the next node's children start. | |
const childOfNextNode = this.directory.select( 0, index + 2 ) - index - 1; | |
return new FrozenTrieNode( this, index, letter, final, firstChild, | |
childOfNextNode - firstChild ); | |
} | |
/** | |
Retrieve the root node. You can use this node to obtain all of the other | |
nodes in the trie. | |
*/ | |
getRoot() { | |
return this.getNodeByIndex( 0 ); | |
} | |
/** | |
Look-up a word in the trie. Returns true if and only if the word exists | |
in the trie. | |
*/ | |
lookup(word) { | |
let node = this.getRoot(); | |
for ( let i = 0; i < word.length; i++ ) { | |
let child; | |
let j = 0; | |
for ( ; j < node.getChildCount(); j++ ) { | |
child = node.getChild( j ); | |
if ( child.letter === word[i] ) { | |
break; | |
} | |
} | |
if ( j === node.getChildCount() ) { | |
return false; | |
} | |
node = child; | |
} | |
return node.final; | |
} | |
} | |
/************************************************************************************************** | |
DEMONSTATION APPLICATION FUNCTIONS | |
*************************************************************************************************/ | |
/** | |
Load a dictionary asynchronously. | |
*/ | |
function loadDictionary() | |
{ | |
let xmlHttpReq; | |
try { | |
xmlHttpReq = new XMLHttpRequest(); | |
} catch ( trymicrosoft ) { | |
try { | |
xmlHttpReq = new ActiveXObject("Msxml2.XMLHTTP"); | |
} catch(othermicrosoft) { | |
try { | |
xmlHttpReq = new ActiveXObject("Microsoft.XMLHTTP"); | |
} catch(failed) { | |
xmlHttpReq = null; | |
} | |
} | |
} | |
strUrl = "ospd3.txt"; | |
xmlHttpReq.open("GET", "ospd3.txt", true); | |
xmlHttpReq.setRequestHeader('Content-Type', 'application/x-www-form-urlencoded'); | |
xmlHttpReq.onreadystatechange = () => { | |
if (xmlHttpReq.readyState === 4) { | |
if (xmlHttpReq.status === 200 ) { | |
document.getElementById("input").value = | |
xmlHttpReq.responseText; | |
} else if ( xmlHttpReq.message ) { | |
alert( xmlHttpReq.message ); | |
} else { | |
alert( "Network error. Check internet connection" ); | |
} | |
} | |
}; | |
xmlHttpReq.send(""); | |
} | |
/** | |
Encode the trie in the input text box. | |
*/ | |
function go() | |
{ | |
// create a trie | |
const trie = new Trie(); | |
// split the words of the input up. Sort them for faster trie insertion. | |
const words = document.getElementById("input").value.split(/\s+/); | |
words.sort(); | |
const regex = /^[a-z]+$/; | |
for ( let i = 0; i < words.length; i++ ) { | |
// To save space, our encoding handles only the letters a-z. Ignore | |
// words that contain other characters. | |
const word = words[i].toLowerCase(); | |
if ( word.match( /^[a-z]+$/ ) ) { | |
trie.insert( word ); | |
} | |
} | |
// Encode the trie. | |
const trieData = trie.encode(); | |
// Encode the rank directory | |
const directory = RankDirectory.Create( trieData, trie.getNodeCount() * 2 + | |
1, L1, L2 ); | |
let output; | |
output = `{\n "nodeCount": ${trie.getNodeCount()},\n`; | |
output += ` "directory": "${directory.getData()}",\n`; | |
output += ` "trie": "${trieData}"\n`; | |
output += "}\n"; | |
document.getElementById("output").value = output; | |
document.getElementById("encodeStatus").innerHTML = | |
`Encoded ${document.getElementById("input").value.length} bytes to ${output.length} bytes.`; | |
} | |
/** | |
Decode the data in the output textarea, and use it to check if a word exists | |
in the dictionary. | |
*/ | |
function lookup() | |
{ | |
let status = ""; | |
try | |
{ | |
const json = eval( `(${document.getElementById("output").value})` ); | |
const ftrie = new FrozenTrie( json.trie, json.directory, json.nodeCount | |
); | |
const word = document.getElementById("lookup").value; | |
if ( ftrie.lookup( document.getElementById("lookup").value ) ) { | |
status = `"${word}' is in the dictionary.`; | |
} else { | |
status = `"${word}' IS NOT in the dictionary.`; | |
} | |
} catch ( e ) { | |
status = "Error. Have you encoded the dictionary yet?"; | |
} | |
document.getElementById("status").innerHTML = status; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment