Last active
January 11, 2018 07:48
-
-
Save blazs/f313507eaba3a7279b22eac4a325d113 to your computer and use it in GitHub Desktop.
Quick&dirty implementation of the Huffman encoding.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
""" | |
Quick and dirty prototypical implementation of the Huffman encoding. | |
Blaz Sovdat ([email protected]), December 20th 2017 | |
""" | |
import collections | |
import enum | |
import heapq | |
import functools | |
import sys | |
import pickle | |
import bitarray | |
class Command(enum.Enum): | |
COMPRESS = 'compress' | |
DECOMPRESS = 'decompress' | |
@functools.total_ordering | |
class Node: | |
def __init__(self, freq, left=None, right=None, letter=None): | |
self.freq = freq | |
self.left = left | |
self.right = right | |
self.letter = letter | |
def __lt__(self, other): | |
return self.freq < other.freq | |
def __eq__(self, other): | |
return self.freq == other.freq | |
def is_leaf(self): | |
return self.left is None and self.right is None | |
def __repr__(self): | |
return "{},{}".format(str(self.freq), str(self.letter)) | |
def compute_frequecies(text): | |
d = collections.defaultdict(int) | |
for c in text: | |
d[c] += 1 | |
return d | |
def build_huffman(freqs): | |
heap = [Node(v, letter=k) for (k, v) in freqs.items()] | |
heapq.heapify(heap) | |
while len(heap) > 1: | |
u = heapq.heappop(heap) | |
v = heapq.heappop(heap) | |
w = Node(u.freq+v.freq, left=u, right=v) | |
heapq.heappush(heap, w) | |
return heapq.heappop(heap) | |
def build_codes(huff_tree): | |
table = {} | |
stack = [(huff_tree, '')] | |
while len(stack) > 0: | |
(node, code) = stack.pop() | |
if node.is_leaf(): | |
table[node.letter] = code # bitarray.bitarray(code) | |
else: # Each node in the tree of a Huffman code has either both siblings or none (by construction) | |
stack.append((node.left, code + '0')) | |
stack.append((node.right, code + '1')) | |
return table | |
def encode(text, codes): | |
return bitarray.bitarray(''.join([codes[ch] for ch in text])) | |
def decode(code, tree): | |
""" | |
Because the Huffman encoding is a prefix code, we have the property that | |
no code is a prefix of another code. This follows trivially from the fact | |
that the codes are represented by the Huffman tree. We use that propety to | |
decode efficiently. | |
""" | |
text = [] | |
i = 0 | |
root = tree | |
while i < len(code): | |
node = root | |
while not node.is_leaf(): | |
if i >= len(code): | |
return text | |
if not code[i]: | |
node = node.left | |
else: | |
node = node.right | |
i += 1 | |
text.append(node.letter) | |
return bytearray(text) | |
def compress(fnm_in, fnm_out): | |
with open(fnm_in, "rb") as fin: | |
text = fin.read() | |
freqs = compute_frequecies(text) | |
tree = build_huffman(freqs) | |
codes = build_codes(tree) | |
code = encode(text, codes) | |
with open(fnm_out, 'wb') as fout: | |
pickle.dump((tree, code), fout) | |
def decompress(fnm_in, fnm_o): | |
with open(fnm_in, 'rb') as fin, open(fnm_o, 'wb') as fout: | |
tree, code = pickle.load(fin) | |
fout.write(decode(code, tree)) | |
if __name__ == '__main__': | |
if len(sys.argv) < 4: | |
sys.exit(1) | |
cmd, fnm_i, fnm_o = Command(sys.argv[1]), sys.argv[2], sys.argv[3] | |
{ | |
Command.COMPRESS: compress, | |
Command.DECOMPRESS: decompress | |
}[cmd](fnm_i, fnm_o) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment