vladiibine · May 24, 2019 21:36
diff --git a/vlad_trie.py b/vlad_trie.py
 import copy
 import unittest


 class Node:
    __slots__ = ["count", "next"]

    def __init__(self, count, next_):
        self.count = count
        self.next = next_


 class VladTrie:
    """Also a good name would have been VeryInefficientTrie

    Stores words that share common roots more "efficiently".
    At the moment, this obviously uses more memory than the text itself

    It is therefore of no use! (well, maybe can serve as a fancier counter)
    """
    __slots__ = ["data"]

    def __init__(self, *strings, compress=False):
        self.data = {"next": {}, "count":0}
        # self.data = Node(0, None)

        for s in strings:
            self.add(s, compress=False)

        if compress:
            self.data = self.create_compressed_trie(self.data)


    def add(self, s, node=None, compress=True):
        if node:
            current_node = node
        else:
            current_node = self.data


        for index, c in enumerate(s):
            if c in current_node["next"]:
                current_node["next"][c]["count"] += 1

            else:
                current_node["next"][c] = {"count": 1, "next": {}}

            current_node = current_node["next"][c]

        if compress:
            self.data = self.create_compressed_trie(self.data)


    @classmethod
    def _create_shorter_next_node(cls, char, node, parent_count):
        if len(node["next"]) == 0:
            return char, node

        elif len(node["next"]) > 1:
            new_node = {"count": node["count"], "next": {}}

            for key, next_node in node["next"].items():
                new_key, shorter_next_node = cls._create_shorter_next_node(
                    key, next_node, node["count"]
                )
                new_node["next"][new_key] = shorter_next_node

            return char, new_node

        else:  # len(node["next"]) == 1
            chars = [char]

            cursor_node = node
            while len(cursor_node["next"]) == 1:
                parent_count = cursor_node["count"]
                backup_node = cursor_node
                new_key, cursor_node = list(cursor_node["next"].items())[0]
                current_count = cursor_node["count"]

                if parent_count != current_count:
                    # We have gone too far.
                    # We're in the Trie("a", "ab") situation, and need to
                    # allow "a" to exist as a key alone
                    cursor_node = backup_node
                    break
                chars.append(new_key)

            cursor_replacement_node = {"count": cursor_node["count"], "next": {}}

            for key, child_node in cursor_node["next"].items():
                new_key, new_child = cls._create_shorter_next_node(
                    key, child_node,
                    parent_count
                )
                cursor_replacement_node["next"][new_key] = new_child

            return ''.join(chars), cursor_replacement_node


    def create_compressed_trie(self, node=None):
        if node is None:
            node = self.data

        _, new_node = self._create_shorter_next_node('', node, 0)

        return new_node

    def dump_dict(self, compress=True):
        if compress:
            return copy.deepcopy(self.create_compressed_trie(self.data))
        else:
            return copy.deepcopy(self.data)


 class TestCreateCompressedTrie(unittest.TestCase):
    def test_simpler_trie(self):
        t = VladTrie("aa", "aabb1", "aabb2")
        d = t.create_compressed_trie()

        self.assertEqual({
            "count": 0,
            "next": {
                "aa": {
                    "count": 3,
                    "next": {
                        "bb": {
                            "count": 2,
                            "next": {
                                "1": {
                                    "count": 1,
                                    "next": {}
                                },
                                "2": {
                                    "count": 1,
                                    "next": {}
                                }
                            }
                        },
                    }
                }
            }
        }, d)

    def test_second_branching(self):
        t = VladTrie("aadd", "aaddx", "aaddy", "aaddzz1", "aaddzz2")

        d = t.create_compressed_trie()

        self.assertEqual({
            "count": 0,
            "next": {
                "aadd": {
                    "count": 5,
                    "next": {
                        "x": {
                            "count": 1,
                            "next": {}
                        },
                        "y": {
                            "count": 1,
                            "next": {}
                        },
                        "zz": {
                            "count": 2,
                            "next": {
                                "1": {
                                    "count": 1,
                                    "next": {}
                                },
                                "2": {
                                    "count": 1,
                                    "next": {}}}}}}}
        }, d)

    def test_complex_trie(self):
        t = VladTrie("aa", "aab", "aabc", "aabcddd", "aabcdddx", "aabcdddy", "aabcdddzz1", "aabcdddzz2")

        d = t.create_compressed_trie()

        self.assertEqual({
            "count": 0,
            "next": {
                "aa": {
                    "count": 8,
                    "next": {
                        "b": {
                            "count": 7,
                            "next": {
                                "c": {
                                    "count": 6,
                                    "next": {
                                        "ddd": {
                                            "count": 5,
                                            "next": {
                                                "x": {
                                                    "count": 1,
                                                    "next": {}
                                                },
                                                "y": {
                                                    "count": 1,
                                                    "next": {}
                                                },
                                                "zz": {
                                                    "count": 2,
                                                    "next": {
                                                        "1": {
                                                            "count": 1,
                                                            "next": {}
                                                        },
                                                        "2": {
                                                            "count": 1,
                                                            "next": {}}}}}}}}}}}}}
        }, d)

 class TestCreateShorterNextNode(unittest.TestCase):
    def test_single_letter_simple_shortening(self):
        t = VladTrie("fo")
        new_key, new_node = VladTrie._create_shorter_next_node(
            "f", t.data["next"]["f"], 0)

        self.assertEqual("fo", new_key)
        self.assertEqual({"count": 1, "next": {}}, new_node)

    def test_two_letter_simple_shortening(self):
        t = VladTrie("bar")
        new_key, new_node = VladTrie._create_shorter_next_node(
            "b", t.data["next"]["b"], 0
        )
        self.assertEqual("bar", new_key)
        self.assertEqual({"count": 1, "next": {}}, new_node)

    def test_two_letter_shortening_then_stop(self):
        t = VladTrie("qqq", "qqw")
        new_key, new_node = VladTrie._create_shorter_next_node(
            "q", t.data["next"]["q"], 0
        )

        self.assertEqual("qq", new_key)
        self.assertEqual({
            "count": 2,
            "next": {
                "q": {
                    "count":1,
                    "next":{}},
                "w":{
                    "count":1,
                    "next":{}}}},
            new_node
        )

    def test_two_chains(self):
        t = VladTrie("qqqqii", "qqqqxxxzzz", "qqqqxxxttt")
        new_key, new_node = VladTrie._create_shorter_next_node(
            "q", t.data["next"]["q"], 0
        )

        self.assertEqual("qqqq", new_key)
        self.assertEqual({
            "count": 3,
            "next": {
                "ii": {
                    "count": 1,
                    "next": {}},
                "xxx": {
                    "count": 2,
                    "next": {
                        "zzz": {
                            "count": 1,
                            "next": {},},
                        "ttt": {
                            "count": 1,
                            "next": {}}}},
            }}, new_node)




 class Tests(unittest.TestCase):
    def test_add_1_string(self):
        t = VladTrie("bostan")

        self.assertEqual(
            t.dump_dict(),
            {
                "count": 0,
                "next":
                    {"bostan": {"count": 1, "next": {}}}
            }
        )

    def test_add_2_unrelated_strings(self):
        t = VladTrie("bostan", "masina")

        self.assertEqual(
            t.dump_dict(),
            {
                "count": 0,
                "next":
                    {
                        "bostan": {"count": 1, "next": {}},
                        "masina": {"count": 1, "next": {}}
                    }
            },
        )

    def test_add_2_related_strings_included_in_each_other(self):
        t = VladTrie("bostan", "bostanilor")

        self.assertEqual(
            t.dump_dict(),
            {
                "count": 0,
                "next":
                    {"bostan": {
                        "count": 2,
                        "next": {
                            "ilor": {
                                "count": 1,
                                "next": {}}}}}
            }
        )

    def test_spread_one_word(self):
        t = VladTrie("bar")

        self.assertEqual(
            t.dump_dict(False),
            {
                "next": {
                    "b": {
                        "count": 1,
                        "next": {
                            "a": {
                                "count": 1,
                                "next": {
                                    "r": {
                                        "count": 1,
                                        "next": {}}}}}}},
                "count": 0}
        )

    def test_spread_two_words(self):
        t = VladTrie("bar", "foo")
        self.assertEqual(
            t.dump_dict(False),
            {
                "next": {
                    "b": {
                        "count": 1,
                        "next": {
                            "a": {
                                "count": 1,
                                "next": {
                                    "r": {
                                        "count": 1,
                                        "next": {}}}}}},
                    "f": {
                        "count": 1,
                        "next": {
                            "o": {
                                "count": 1,
                                "next": {
                                    "o": {
                                        "count": 1,
                                        "next": {}}}}}},
                },
                "count": 0
            }
        )

    def test_spread_2_identical_words(self):
        t = VladTrie("foo", "foo")

        self.assertEqual(
            t.dump_dict(compress=False),
                        {
                "next": {
                    "f": {
                        "count": 2,
                        "next": {
                            "o": {
                                "count": 2,
                                "next": {
                                    "o": {
                                        "count": 2,
                                        "next": {}}}}}},
                },
                "count": 0
            }
        )

    def test_spread_2_words_with_common_roots(self):
        t = VladTrie("bar", "baz")
        self.assertEqual(
            t.dump_dict(compress=False),
            {
                "next": {
                    "b": {
                        "count": 2,
                        "next": {
                            "a": {
                                "count": 2,
                                "next": {
                                    "r": {
                                        "count": 1,
                                        "next": {}
                                    },
                                    "z": {
                                        "count": 1,
                                        "next": {}
                                    }
                                }}}},
                },
                "count": 0
            }

        )

    def test_spread_3_words_with_common_roots(self):
        t = VladTrie("qqq", "qww", "qwe")
        self.assertEqual(
            t.dump_dict(compress=False),
            {
                "count": 0,
                "next": {
                    "q": {
                        "count": 3,
                        "next": {
                            "q": {
                                "count": 1,
                                "next": {
                                    "q": {
                                        "count": 1,
                                        "next": {},
                                    }
                                }
                            },
                            "w": {
                                "count": 2,
                                "next": {
                                    "w": {
                                        "count": 1,
                                        "next": {}
                                    },
                                    "e": {
                                        "count": 1,
                                        "next": {}
                                    }
                                }
                            },

                        }
                    }
                }
            }
        )


 def main():
    # add 1 string s1 to a dict D1 = {s1: d1} , where d1 = {"count": 1}
    # retrieve another string s2
    # compare it char by char to the string in temp_list1
        # if char number 0 if the strings doesn't match, place s2 into temp_list1
        # if s1 == s2, then increment d1["count"]
        # if s1 has some chars at the beginning in common with s2 at the beginning, then
            # remove d1 from temp_list1
            # add d2 = {"str": s1[:X], "count": 2} to

    # add 1 string s1 to dict d1 = {s1: {"count": 1, "}}
    # retrieve a new string s2
    # for each incremental slice of s2 (s2[:1], s2[:2], ...), check if such a
    #   string is found in the dict d1
    # if such a substring is found, increment the int value of that key
    #   also,
    # elif no such substring (or the entire string) is found, add d1[s2] = 1
    #

    pass


 if __name__ == '__main__':
    # main()
    unittest.main()
	import copy
	import unittest


	class Node:
	__slots__ = ["count", "next"]

	def __init__(self, count, next_):
	self.count = count
	self.next = next_


	class VladTrie:
	"""Also a good name would have been VeryInefficientTrie

	Stores words that share common roots more "efficiently".
	At the moment, this obviously uses more memory than the text itself

	It is therefore of no use! (well, maybe can serve as a fancier counter)
	"""
	__slots__ = ["data"]

	def __init__(self, *strings, compress=False):
	self.data = {"next": {}, "count":0}
	# self.data = Node(0, None)

	for s in strings:
	self.add(s, compress=False)

	if compress:
	self.data = self.create_compressed_trie(self.data)


	def add(self, s, node=None, compress=True):
	if node:
	current_node = node
	else:
	current_node = self.data


	for index, c in enumerate(s):
	if c in current_node["next"]:
	current_node["next"][c]["count"] += 1

	else:
	current_node["next"][c] = {"count": 1, "next": {}}

	current_node = current_node["next"][c]

	if compress:
	self.data = self.create_compressed_trie(self.data)


	@classmethod
	def _create_shorter_next_node(cls, char, node, parent_count):
	if len(node["next"]) == 0:
	return char, node

	elif len(node["next"]) > 1:
	new_node = {"count": node["count"], "next": {}}

	for key, next_node in node["next"].items():
	new_key, shorter_next_node = cls._create_shorter_next_node(
	key, next_node, node["count"]
	)
	new_node["next"][new_key] = shorter_next_node

	return char, new_node

	else: # len(node["next"]) == 1
	chars = [char]

	cursor_node = node
	while len(cursor_node["next"]) == 1:
	parent_count = cursor_node["count"]
	backup_node = cursor_node
	new_key, cursor_node = list(cursor_node["next"].items())[0]
	current_count = cursor_node["count"]

	if parent_count != current_count:
	# We have gone too far.
	# We're in the Trie("a", "ab") situation, and need to
	# allow "a" to exist as a key alone
	cursor_node = backup_node
	break
	chars.append(new_key)

	cursor_replacement_node = {"count": cursor_node["count"], "next": {}}

	for key, child_node in cursor_node["next"].items():
	new_key, new_child = cls._create_shorter_next_node(
	key, child_node,
	parent_count
	)
	cursor_replacement_node["next"][new_key] = new_child

	return ''.join(chars), cursor_replacement_node


	def create_compressed_trie(self, node=None):
	if node is None:
	node = self.data

	_, new_node = self._create_shorter_next_node('', node, 0)

	return new_node

	def dump_dict(self, compress=True):
	if compress:
	return copy.deepcopy(self.create_compressed_trie(self.data))
	else:
	return copy.deepcopy(self.data)


	class TestCreateCompressedTrie(unittest.TestCase):
	def test_simpler_trie(self):
	t = VladTrie("aa", "aabb1", "aabb2")
	d = t.create_compressed_trie()

	self.assertEqual({
	"count": 0,
	"next": {
	"aa": {
	"count": 3,
	"next": {
	"bb": {
	"count": 2,
	"next": {
	"1": {
	"count": 1,
	"next": {}
	},
	"2": {
	"count": 1,
	"next": {}
	}
	}
	},
	}
	}
	}
	}, d)

	def test_second_branching(self):
	t = VladTrie("aadd", "aaddx", "aaddy", "aaddzz1", "aaddzz2")

	d = t.create_compressed_trie()

	self.assertEqual({
	"count": 0,
	"next": {
	"aadd": {
	"count": 5,
	"next": {
	"x": {
	"count": 1,
	"next": {}
	},
	"y": {
	"count": 1,
	"next": {}
	},
	"zz": {
	"count": 2,
	"next": {
	"1": {
	"count": 1,
	"next": {}
	},
	"2": {
	"count": 1,
	"next": {}}}}}}}
	}, d)

	def test_complex_trie(self):
	t = VladTrie("aa", "aab", "aabc", "aabcddd", "aabcdddx", "aabcdddy", "aabcdddzz1", "aabcdddzz2")

	d = t.create_compressed_trie()

	self.assertEqual({
	"count": 0,
	"next": {
	"aa": {
	"count": 8,
	"next": {
	"b": {
	"count": 7,
	"next": {
	"c": {
	"count": 6,
	"next": {
	"ddd": {
	"count": 5,
	"next": {
	"x": {
	"count": 1,
	"next": {}
	},
	"y": {
	"count": 1,
	"next": {}
	},
	"zz": {
	"count": 2,
	"next": {
	"1": {
	"count": 1,
	"next": {}
	},
	"2": {
	"count": 1,
	"next": {}}}}}}}}}}}}}
	}, d)

	class TestCreateShorterNextNode(unittest.TestCase):
	def test_single_letter_simple_shortening(self):
	t = VladTrie("fo")
	new_key, new_node = VladTrie._create_shorter_next_node(
	"f", t.data["next"]["f"], 0)

	self.assertEqual("fo", new_key)
	self.assertEqual({"count": 1, "next": {}}, new_node)

	def test_two_letter_simple_shortening(self):
	t = VladTrie("bar")
	new_key, new_node = VladTrie._create_shorter_next_node(
	"b", t.data["next"]["b"], 0
	)
	self.assertEqual("bar", new_key)
	self.assertEqual({"count": 1, "next": {}}, new_node)

	def test_two_letter_shortening_then_stop(self):
	t = VladTrie("qqq", "qqw")
	new_key, new_node = VladTrie._create_shorter_next_node(
	"q", t.data["next"]["q"], 0
	)

	self.assertEqual("qq", new_key)
	self.assertEqual({
	"count": 2,
	"next": {
	"q": {
	"count":1,
	"next":{}},
	"w":{
	"count":1,
	"next":{}}}},
	new_node
	)

	def test_two_chains(self):
	t = VladTrie("qqqqii", "qqqqxxxzzz", "qqqqxxxttt")
	new_key, new_node = VladTrie._create_shorter_next_node(
	"q", t.data["next"]["q"], 0
	)

	self.assertEqual("qqqq", new_key)
	self.assertEqual({
	"count": 3,
	"next": {
	"ii": {
	"count": 1,
	"next": {}},
	"xxx": {
	"count": 2,
	"next": {
	"zzz": {
	"count": 1,
	"next": {},},
	"ttt": {
	"count": 1,
	"next": {}}}},
	}}, new_node)




	class Tests(unittest.TestCase):
	def test_add_1_string(self):
	t = VladTrie("bostan")

	self.assertEqual(
	t.dump_dict(),
	{
	"count": 0,
	"next":
	{"bostan": {"count": 1, "next": {}}}
	}
	)

	def test_add_2_unrelated_strings(self):
	t = VladTrie("bostan", "masina")

	self.assertEqual(
	t.dump_dict(),
	{
	"count": 0,
	"next":
	{
	"bostan": {"count": 1, "next": {}},
	"masina": {"count": 1, "next": {}}
	}
	},
	)

	def test_add_2_related_strings_included_in_each_other(self):
	t = VladTrie("bostan", "bostanilor")

	self.assertEqual(
	t.dump_dict(),
	{
	"count": 0,
	"next":
	{"bostan": {
	"count": 2,
	"next": {
	"ilor": {
	"count": 1,
	"next": {}}}}}
	}
	)

	def test_spread_one_word(self):
	t = VladTrie("bar")

	self.assertEqual(
	t.dump_dict(False),
	{
	"next": {
	"b": {
	"count": 1,
	"next": {
	"a": {
	"count": 1,
	"next": {
	"r": {
	"count": 1,
	"next": {}}}}}}},
	"count": 0}
	)

	def test_spread_two_words(self):
	t = VladTrie("bar", "foo")
	self.assertEqual(
	t.dump_dict(False),
	{
	"next": {
	"b": {
	"count": 1,
	"next": {
	"a": {
	"count": 1,
	"next": {
	"r": {
	"count": 1,
	"next": {}}}}}},
	"f": {
	"count": 1,
	"next": {
	"o": {
	"count": 1,
	"next": {
	"o": {
	"count": 1,
	"next": {}}}}}},
	},
	"count": 0
	}
	)

	def test_spread_2_identical_words(self):
	t = VladTrie("foo", "foo")

	self.assertEqual(
	t.dump_dict(compress=False),
	{
	"next": {
	"f": {
	"count": 2,
	"next": {
	"o": {
	"count": 2,
	"next": {
	"o": {
	"count": 2,
	"next": {}}}}}},
	},
	"count": 0
	}
	)

	def test_spread_2_words_with_common_roots(self):
	t = VladTrie("bar", "baz")
	self.assertEqual(
	t.dump_dict(compress=False),
	{
	"next": {
	"b": {
	"count": 2,
	"next": {
	"a": {
	"count": 2,
	"next": {
	"r": {
	"count": 1,
	"next": {}
	},
	"z": {
	"count": 1,
	"next": {}
	}
	}}}},
	},
	"count": 0
	}

	)

	def test_spread_3_words_with_common_roots(self):
	t = VladTrie("qqq", "qww", "qwe")
	self.assertEqual(
	t.dump_dict(compress=False),
	{
	"count": 0,
	"next": {
	"q": {
	"count": 3,
	"next": {
	"q": {
	"count": 1,
	"next": {
	"q": {
	"count": 1,
	"next": {},
	}
	}
	},
	"w": {
	"count": 2,
	"next": {
	"w": {
	"count": 1,
	"next": {}
	},
	"e": {
	"count": 1,
	"next": {}
	}
	}
	},

	}
	}
	}
	}
	)


	def main():
	# add 1 string s1 to a dict D1 = {s1: d1} , where d1 = {"count": 1}
	# retrieve another string s2
	# compare it char by char to the string in temp_list1
	# if char number 0 if the strings doesn't match, place s2 into temp_list1
	# if s1 == s2, then increment d1["count"]
	# if s1 has some chars at the beginning in common with s2 at the beginning, then
	# remove d1 from temp_list1
	# add d2 = {"str": s1[:X], "count": 2} to

	# add 1 string s1 to dict d1 = {s1: {"count": 1, "}}
	# retrieve a new string s2
	# for each incremental slice of s2 (s2[:1], s2[:2], ...), check if such a
	# string is found in the dict d1
	# if such a substring is found, increment the int value of that key
	# also,
	# elif no such substring (or the entire string) is found, add d1[s2] = 1
	#

	pass


	if __name__ == '__main__':
	# main()
	unittest.main()