blazs · January 11, 2018 07:48
diff --git a/huffman.py b/huffman.py
 """
 Quick and dirty prototypical implementation of the Huffman encoding.

 Blaz Sovdat ([email protected]), December 20th 2017
 """


 import collections
 import enum
 import heapq
 import functools
 import sys
 import pickle
 import bitarray


 class Command(enum.Enum):
    COMPRESS = 'compress'
    DECOMPRESS = 'decompress'


 @functools.total_ordering
 class Node:
    def __init__(self, freq, left=None, right=None, letter=None):
        self.freq = freq
        self.left = left
        self.right = right
        self.letter = letter

    def __lt__(self, other):
        return self.freq < other.freq

    def __eq__(self, other):
        return self.freq == other.freq

    def is_leaf(self):
        return self.left is None and self.right is None

    def __repr__(self):
        return "{},{}".format(str(self.freq), str(self.letter))


 def compute_frequecies(text):
    d = collections.defaultdict(int)
    for c in text:
        d[c] += 1
    return d


 def build_huffman(freqs):
    heap = [Node(v, letter=k) for (k, v) in freqs.items()]
    heapq.heapify(heap)
    while len(heap) > 1:
        u = heapq.heappop(heap)
        v = heapq.heappop(heap)
        w = Node(u.freq+v.freq, left=u, right=v)
        heapq.heappush(heap, w)
    return heapq.heappop(heap)


 def build_codes(huff_tree):
    table = {}
    stack = [(huff_tree, '')]
    while len(stack) > 0:
        (node, code) = stack.pop()
        if node.is_leaf():
            table[node.letter] = code # bitarray.bitarray(code)
        else: # Each node in the tree of a Huffman code has either both siblings or none (by construction)
            stack.append((node.left, code + '0'))
            stack.append((node.right, code + '1'))
    return table


 def encode(text, codes):
    return bitarray.bitarray(''.join([codes[ch] for ch in text]))


 def decode(code, tree):
    """
    Because the Huffman encoding is a prefix code, we have the property that
    no code is a prefix of another code. This follows trivially from the fact
    that the codes are represented by the Huffman tree. We use that propety to
    decode efficiently.
    """
    text = []
    i = 0
    root = tree
    while i < len(code):
        node = root
        while not node.is_leaf():
            if i >= len(code):
                return text
            if not code[i]:
                node = node.left
            else:
                node = node.right
            i += 1
        text.append(node.letter)
    return bytearray(text)


 def compress(fnm_in, fnm_out):
    with open(fnm_in, "rb") as fin:
        text = fin.read()
        freqs = compute_frequecies(text)
        tree = build_huffman(freqs)
        codes = build_codes(tree)
        code = encode(text, codes)
        with open(fnm_out, 'wb') as fout:
            pickle.dump((tree, code), fout)


 def decompress(fnm_in, fnm_o):
    with open(fnm_in, 'rb') as fin, open(fnm_o, 'wb') as fout:
        tree, code = pickle.load(fin)
        fout.write(decode(code, tree))


 if __name__ == '__main__':
    if len(sys.argv) < 4:
        sys.exit(1)
 
    cmd, fnm_i, fnm_o = Command(sys.argv[1]), sys.argv[2], sys.argv[3]
    {
        Command.COMPRESS: compress,
        Command.DECOMPRESS: decompress
    }[cmd](fnm_i, fnm_o)
	"""
	Quick and dirty prototypical implementation of the Huffman encoding.

	Blaz Sovdat ([email protected]), December 20th 2017
	"""


	import collections
	import enum
	import heapq
	import functools
	import sys
	import pickle
	import bitarray


	class Command(enum.Enum):
	COMPRESS = 'compress'
	DECOMPRESS = 'decompress'


	@functools.total_ordering
	class Node:
	def __init__(self, freq, left=None, right=None, letter=None):
	self.freq = freq
	self.left = left
	self.right = right
	self.letter = letter

	def __lt__(self, other):
	return self.freq < other.freq

	def __eq__(self, other):
	return self.freq == other.freq

	def is_leaf(self):
	return self.left is None and self.right is None

	def __repr__(self):
	return "{},{}".format(str(self.freq), str(self.letter))


	def compute_frequecies(text):
	d = collections.defaultdict(int)
	for c in text:
	d[c] += 1
	return d


	def build_huffman(freqs):
	heap = [Node(v, letter=k) for (k, v) in freqs.items()]
	heapq.heapify(heap)
	while len(heap) > 1:
	u = heapq.heappop(heap)
	v = heapq.heappop(heap)
	w = Node(u.freq+v.freq, left=u, right=v)
	heapq.heappush(heap, w)
	return heapq.heappop(heap)


	def build_codes(huff_tree):
	table = {}
	stack = [(huff_tree, '')]
	while len(stack) > 0:
	(node, code) = stack.pop()
	if node.is_leaf():
	table[node.letter] = code # bitarray.bitarray(code)
	else: # Each node in the tree of a Huffman code has either both siblings or none (by construction)
	stack.append((node.left, code + '0'))
	stack.append((node.right, code + '1'))
	return table


	def encode(text, codes):
	return bitarray.bitarray(''.join([codes[ch] for ch in text]))


	def decode(code, tree):
	"""
	Because the Huffman encoding is a prefix code, we have the property that
	no code is a prefix of another code. This follows trivially from the fact
	that the codes are represented by the Huffman tree. We use that propety to
	decode efficiently.
	"""
	text = []
	i = 0
	root = tree
	while i < len(code):
	node = root
	while not node.is_leaf():
	if i >= len(code):
	return text
	if not code[i]:
	node = node.left
	else:
	node = node.right
	i += 1
	text.append(node.letter)
	return bytearray(text)


	def compress(fnm_in, fnm_out):
	with open(fnm_in, "rb") as fin:
	text = fin.read()
	freqs = compute_frequecies(text)
	tree = build_huffman(freqs)
	codes = build_codes(tree)
	code = encode(text, codes)
	with open(fnm_out, 'wb') as fout:
	pickle.dump((tree, code), fout)


	def decompress(fnm_in, fnm_o):
	with open(fnm_in, 'rb') as fin, open(fnm_o, 'wb') as fout:
	tree, code = pickle.load(fin)
	fout.write(decode(code, tree))


	if __name__ == '__main__':
	if len(sys.argv) < 4:
	sys.exit(1)

	cmd, fnm_i, fnm_o = Command(sys.argv[1]), sys.argv[2], sys.argv[3]
	{
	Command.COMPRESS: compress,
	Command.DECOMPRESS: decompress
	}[cmd](fnm_i, fnm_o)