Skip to content

Instantly share code, notes, and snippets.

@blazs
Last active January 11, 2018 07:48
Show Gist options
  • Save blazs/f313507eaba3a7279b22eac4a325d113 to your computer and use it in GitHub Desktop.
Save blazs/f313507eaba3a7279b22eac4a325d113 to your computer and use it in GitHub Desktop.
Quick&dirty implementation of the Huffman encoding.
"""
Quick and dirty prototypical implementation of the Huffman encoding.
Blaz Sovdat ([email protected]), December 20th 2017
"""
import collections
import enum
import heapq
import functools
import sys
import pickle
import bitarray
class Command(enum.Enum):
COMPRESS = 'compress'
DECOMPRESS = 'decompress'
@functools.total_ordering
class Node:
def __init__(self, freq, left=None, right=None, letter=None):
self.freq = freq
self.left = left
self.right = right
self.letter = letter
def __lt__(self, other):
return self.freq < other.freq
def __eq__(self, other):
return self.freq == other.freq
def is_leaf(self):
return self.left is None and self.right is None
def __repr__(self):
return "{},{}".format(str(self.freq), str(self.letter))
def compute_frequecies(text):
d = collections.defaultdict(int)
for c in text:
d[c] += 1
return d
def build_huffman(freqs):
heap = [Node(v, letter=k) for (k, v) in freqs.items()]
heapq.heapify(heap)
while len(heap) > 1:
u = heapq.heappop(heap)
v = heapq.heappop(heap)
w = Node(u.freq+v.freq, left=u, right=v)
heapq.heappush(heap, w)
return heapq.heappop(heap)
def build_codes(huff_tree):
table = {}
stack = [(huff_tree, '')]
while len(stack) > 0:
(node, code) = stack.pop()
if node.is_leaf():
table[node.letter] = code # bitarray.bitarray(code)
else: # Each node in the tree of a Huffman code has either both siblings or none (by construction)
stack.append((node.left, code + '0'))
stack.append((node.right, code + '1'))
return table
def encode(text, codes):
return bitarray.bitarray(''.join([codes[ch] for ch in text]))
def decode(code, tree):
"""
Because the Huffman encoding is a prefix code, we have the property that
no code is a prefix of another code. This follows trivially from the fact
that the codes are represented by the Huffman tree. We use that propety to
decode efficiently.
"""
text = []
i = 0
root = tree
while i < len(code):
node = root
while not node.is_leaf():
if i >= len(code):
return text
if not code[i]:
node = node.left
else:
node = node.right
i += 1
text.append(node.letter)
return bytearray(text)
def compress(fnm_in, fnm_out):
with open(fnm_in, "rb") as fin:
text = fin.read()
freqs = compute_frequecies(text)
tree = build_huffman(freqs)
codes = build_codes(tree)
code = encode(text, codes)
with open(fnm_out, 'wb') as fout:
pickle.dump((tree, code), fout)
def decompress(fnm_in, fnm_o):
with open(fnm_in, 'rb') as fin, open(fnm_o, 'wb') as fout:
tree, code = pickle.load(fin)
fout.write(decode(code, tree))
if __name__ == '__main__':
if len(sys.argv) < 4:
sys.exit(1)
cmd, fnm_i, fnm_o = Command(sys.argv[1]), sys.argv[2], sys.argv[3]
{
Command.COMPRESS: compress,
Command.DECOMPRESS: decompress
}[cmd](fnm_i, fnm_o)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment