Created
May 24, 2019 21:36
-
-
Save vladiibine/f01202ee77f664587b84cfb20ab22046 to your computer and use it in GitHub Desktop.
Trie for storing text. Very inefficient memory-wise
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import copy | |
import unittest | |
class Node: | |
__slots__ = ["count", "next"] | |
def __init__(self, count, next_): | |
self.count = count | |
self.next = next_ | |
class VladTrie: | |
"""Also a good name would have been VeryInefficientTrie | |
Stores words that share common roots more "efficiently". | |
At the moment, this obviously uses more memory than the text itself | |
It is therefore of no use! (well, maybe can serve as a fancier counter) | |
""" | |
__slots__ = ["data"] | |
def __init__(self, *strings, compress=False): | |
self.data = {"next": {}, "count":0} | |
# self.data = Node(0, None) | |
for s in strings: | |
self.add(s, compress=False) | |
if compress: | |
self.data = self.create_compressed_trie(self.data) | |
def add(self, s, node=None, compress=True): | |
if node: | |
current_node = node | |
else: | |
current_node = self.data | |
for index, c in enumerate(s): | |
if c in current_node["next"]: | |
current_node["next"][c]["count"] += 1 | |
else: | |
current_node["next"][c] = {"count": 1, "next": {}} | |
current_node = current_node["next"][c] | |
if compress: | |
self.data = self.create_compressed_trie(self.data) | |
@classmethod | |
def _create_shorter_next_node(cls, char, node, parent_count): | |
if len(node["next"]) == 0: | |
return char, node | |
elif len(node["next"]) > 1: | |
new_node = {"count": node["count"], "next": {}} | |
for key, next_node in node["next"].items(): | |
new_key, shorter_next_node = cls._create_shorter_next_node( | |
key, next_node, node["count"] | |
) | |
new_node["next"][new_key] = shorter_next_node | |
return char, new_node | |
else: # len(node["next"]) == 1 | |
chars = [char] | |
cursor_node = node | |
while len(cursor_node["next"]) == 1: | |
parent_count = cursor_node["count"] | |
backup_node = cursor_node | |
new_key, cursor_node = list(cursor_node["next"].items())[0] | |
current_count = cursor_node["count"] | |
if parent_count != current_count: | |
# We have gone too far. | |
# We're in the Trie("a", "ab") situation, and need to | |
# allow "a" to exist as a key alone | |
cursor_node = backup_node | |
break | |
chars.append(new_key) | |
cursor_replacement_node = {"count": cursor_node["count"], "next": {}} | |
for key, child_node in cursor_node["next"].items(): | |
new_key, new_child = cls._create_shorter_next_node( | |
key, child_node, | |
parent_count | |
) | |
cursor_replacement_node["next"][new_key] = new_child | |
return ''.join(chars), cursor_replacement_node | |
def create_compressed_trie(self, node=None): | |
if node is None: | |
node = self.data | |
_, new_node = self._create_shorter_next_node('', node, 0) | |
return new_node | |
def dump_dict(self, compress=True): | |
if compress: | |
return copy.deepcopy(self.create_compressed_trie(self.data)) | |
else: | |
return copy.deepcopy(self.data) | |
class TestCreateCompressedTrie(unittest.TestCase): | |
def test_simpler_trie(self): | |
t = VladTrie("aa", "aabb1", "aabb2") | |
d = t.create_compressed_trie() | |
self.assertEqual({ | |
"count": 0, | |
"next": { | |
"aa": { | |
"count": 3, | |
"next": { | |
"bb": { | |
"count": 2, | |
"next": { | |
"1": { | |
"count": 1, | |
"next": {} | |
}, | |
"2": { | |
"count": 1, | |
"next": {} | |
} | |
} | |
}, | |
} | |
} | |
} | |
}, d) | |
def test_second_branching(self): | |
t = VladTrie("aadd", "aaddx", "aaddy", "aaddzz1", "aaddzz2") | |
d = t.create_compressed_trie() | |
self.assertEqual({ | |
"count": 0, | |
"next": { | |
"aadd": { | |
"count": 5, | |
"next": { | |
"x": { | |
"count": 1, | |
"next": {} | |
}, | |
"y": { | |
"count": 1, | |
"next": {} | |
}, | |
"zz": { | |
"count": 2, | |
"next": { | |
"1": { | |
"count": 1, | |
"next": {} | |
}, | |
"2": { | |
"count": 1, | |
"next": {}}}}}}} | |
}, d) | |
def test_complex_trie(self): | |
t = VladTrie("aa", "aab", "aabc", "aabcddd", "aabcdddx", "aabcdddy", "aabcdddzz1", "aabcdddzz2") | |
d = t.create_compressed_trie() | |
self.assertEqual({ | |
"count": 0, | |
"next": { | |
"aa": { | |
"count": 8, | |
"next": { | |
"b": { | |
"count": 7, | |
"next": { | |
"c": { | |
"count": 6, | |
"next": { | |
"ddd": { | |
"count": 5, | |
"next": { | |
"x": { | |
"count": 1, | |
"next": {} | |
}, | |
"y": { | |
"count": 1, | |
"next": {} | |
}, | |
"zz": { | |
"count": 2, | |
"next": { | |
"1": { | |
"count": 1, | |
"next": {} | |
}, | |
"2": { | |
"count": 1, | |
"next": {}}}}}}}}}}}}} | |
}, d) | |
class TestCreateShorterNextNode(unittest.TestCase): | |
def test_single_letter_simple_shortening(self): | |
t = VladTrie("fo") | |
new_key, new_node = VladTrie._create_shorter_next_node( | |
"f", t.data["next"]["f"], 0) | |
self.assertEqual("fo", new_key) | |
self.assertEqual({"count": 1, "next": {}}, new_node) | |
def test_two_letter_simple_shortening(self): | |
t = VladTrie("bar") | |
new_key, new_node = VladTrie._create_shorter_next_node( | |
"b", t.data["next"]["b"], 0 | |
) | |
self.assertEqual("bar", new_key) | |
self.assertEqual({"count": 1, "next": {}}, new_node) | |
def test_two_letter_shortening_then_stop(self): | |
t = VladTrie("qqq", "qqw") | |
new_key, new_node = VladTrie._create_shorter_next_node( | |
"q", t.data["next"]["q"], 0 | |
) | |
self.assertEqual("qq", new_key) | |
self.assertEqual({ | |
"count": 2, | |
"next": { | |
"q": { | |
"count":1, | |
"next":{}}, | |
"w":{ | |
"count":1, | |
"next":{}}}}, | |
new_node | |
) | |
def test_two_chains(self): | |
t = VladTrie("qqqqii", "qqqqxxxzzz", "qqqqxxxttt") | |
new_key, new_node = VladTrie._create_shorter_next_node( | |
"q", t.data["next"]["q"], 0 | |
) | |
self.assertEqual("qqqq", new_key) | |
self.assertEqual({ | |
"count": 3, | |
"next": { | |
"ii": { | |
"count": 1, | |
"next": {}}, | |
"xxx": { | |
"count": 2, | |
"next": { | |
"zzz": { | |
"count": 1, | |
"next": {},}, | |
"ttt": { | |
"count": 1, | |
"next": {}}}}, | |
}}, new_node) | |
class Tests(unittest.TestCase): | |
def test_add_1_string(self): | |
t = VladTrie("bostan") | |
self.assertEqual( | |
t.dump_dict(), | |
{ | |
"count": 0, | |
"next": | |
{"bostan": {"count": 1, "next": {}}} | |
} | |
) | |
def test_add_2_unrelated_strings(self): | |
t = VladTrie("bostan", "masina") | |
self.assertEqual( | |
t.dump_dict(), | |
{ | |
"count": 0, | |
"next": | |
{ | |
"bostan": {"count": 1, "next": {}}, | |
"masina": {"count": 1, "next": {}} | |
} | |
}, | |
) | |
def test_add_2_related_strings_included_in_each_other(self): | |
t = VladTrie("bostan", "bostanilor") | |
self.assertEqual( | |
t.dump_dict(), | |
{ | |
"count": 0, | |
"next": | |
{"bostan": { | |
"count": 2, | |
"next": { | |
"ilor": { | |
"count": 1, | |
"next": {}}}}} | |
} | |
) | |
def test_spread_one_word(self): | |
t = VladTrie("bar") | |
self.assertEqual( | |
t.dump_dict(False), | |
{ | |
"next": { | |
"b": { | |
"count": 1, | |
"next": { | |
"a": { | |
"count": 1, | |
"next": { | |
"r": { | |
"count": 1, | |
"next": {}}}}}}}, | |
"count": 0} | |
) | |
def test_spread_two_words(self): | |
t = VladTrie("bar", "foo") | |
self.assertEqual( | |
t.dump_dict(False), | |
{ | |
"next": { | |
"b": { | |
"count": 1, | |
"next": { | |
"a": { | |
"count": 1, | |
"next": { | |
"r": { | |
"count": 1, | |
"next": {}}}}}}, | |
"f": { | |
"count": 1, | |
"next": { | |
"o": { | |
"count": 1, | |
"next": { | |
"o": { | |
"count": 1, | |
"next": {}}}}}}, | |
}, | |
"count": 0 | |
} | |
) | |
def test_spread_2_identical_words(self): | |
t = VladTrie("foo", "foo") | |
self.assertEqual( | |
t.dump_dict(compress=False), | |
{ | |
"next": { | |
"f": { | |
"count": 2, | |
"next": { | |
"o": { | |
"count": 2, | |
"next": { | |
"o": { | |
"count": 2, | |
"next": {}}}}}}, | |
}, | |
"count": 0 | |
} | |
) | |
def test_spread_2_words_with_common_roots(self): | |
t = VladTrie("bar", "baz") | |
self.assertEqual( | |
t.dump_dict(compress=False), | |
{ | |
"next": { | |
"b": { | |
"count": 2, | |
"next": { | |
"a": { | |
"count": 2, | |
"next": { | |
"r": { | |
"count": 1, | |
"next": {} | |
}, | |
"z": { | |
"count": 1, | |
"next": {} | |
} | |
}}}}, | |
}, | |
"count": 0 | |
} | |
) | |
def test_spread_3_words_with_common_roots(self): | |
t = VladTrie("qqq", "qww", "qwe") | |
self.assertEqual( | |
t.dump_dict(compress=False), | |
{ | |
"count": 0, | |
"next": { | |
"q": { | |
"count": 3, | |
"next": { | |
"q": { | |
"count": 1, | |
"next": { | |
"q": { | |
"count": 1, | |
"next": {}, | |
} | |
} | |
}, | |
"w": { | |
"count": 2, | |
"next": { | |
"w": { | |
"count": 1, | |
"next": {} | |
}, | |
"e": { | |
"count": 1, | |
"next": {} | |
} | |
} | |
}, | |
} | |
} | |
} | |
} | |
) | |
def main(): | |
# add 1 string s1 to a dict D1 = {s1: d1} , where d1 = {"count": 1} | |
# retrieve another string s2 | |
# compare it char by char to the string in temp_list1 | |
# if char number 0 if the strings doesn't match, place s2 into temp_list1 | |
# if s1 == s2, then increment d1["count"] | |
# if s1 has some chars at the beginning in common with s2 at the beginning, then | |
# remove d1 from temp_list1 | |
# add d2 = {"str": s1[:X], "count": 2} to | |
# add 1 string s1 to dict d1 = {s1: {"count": 1, "}} | |
# retrieve a new string s2 | |
# for each incremental slice of s2 (s2[:1], s2[:2], ...), check if such a | |
# string is found in the dict d1 | |
# if such a substring is found, increment the int value of that key | |
# also, | |
# elif no such substring (or the entire string) is found, add d1[s2] = 1 | |
# | |
pass | |
if __name__ == '__main__': | |
# main() | |
unittest.main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment