Skip to content

Instantly share code, notes, and snippets.

@CoderCowMoo
Created January 21, 2025 16:50
Show Gist options
  • Save CoderCowMoo/01feaa711dd0431938f444a1751260d4 to your computer and use it in GitHub Desktop.
Save CoderCowMoo/01feaa711dd0431938f444a1751260d4 to your computer and use it in GitHub Desktop.
Arithmetic Coding Simple 1.2mb - 697.41kb
# OK time to arithmetic code. V1 because its so simplistic as a concept
# I'll make a dictionary of all unique symbols in the string (rather than
# a predefined dict)
# and I'll go ahead and subdivide and add it to a running total.
from collections import Counter
from sys import getsizeof
text = "Hello world!"
text_dc = {s:i for i,s in enumerate(sorted(set(text)))}
print(text_dc)
# OK now to encode, we'll
def encode(text, text_dc):
#another thing we want for arithmetic coding is the probability of each letter
counts = Counter(text)
total_count = len(text)
probabilities = {s: counts[s] / total_count for s in counts}
# print(probabilities)
# yet another thing according to gemini is cumulative probabilities
# which im not understanding just yet, but maybe implementing will help
sorted_symbols = sorted(text_dc.keys())
cumulative_prob = 0
cumulative_probabilities = {}
for s in sorted_symbols:
cumulative_probabilities[s] = cumulative_prob
cumulative_prob += probabilities[s]
# print(cumulative_probabilities)
low = 0.0
high = 1.0
# from what I understanding boss, the point of low and high is to get the
# encoded message. I thought like a single value would be added to/subtracted
# to get the message, but the message is literally between low and high
# like the value between them is the kinda precision/leeway of the message?
underflow_bits = 0
output_bits = []
for char in text:
range_size = high - low
cumulative_probability_lower = cumulative_probabilities[char]
cumulative_probability_higher = cumulative_probability_lower + probabilities[char]
high = low + range_size * cumulative_probability_higher
low = low + range_size * cumulative_probability_lower
# print(f"Character: {char}, Index: {text_dc[char]}, Low: {low:.14f}, High: {high:.14f}")
# gemini made this
while True:
if high < 0.5:
output_bits.append(0)
for _ in range(underflow_bits):
output_bits.append(1)
underflow_bits = 0
low = low * 2
high = high * 2
elif low >= 0.5:
output_bits.append(1)
for _ in range(underflow_bits):
output_bits.append(0)
underflow_bits = 0
low = 2 * (low - 0.5)
high = 2 * (high - 0.5)
elif low >= 0.25 and high < 0.75:
underflow_bits += 1
low = 2 * (low - 0.25)
high = 2 * (high - 0.25)
else: break
underflow_bits += 1
if low < 0.25:
output_bits.append(0)
for _ in range(underflow_bits):
output_bits.append(1)
else:
output_bits.append(1)
for _ in range(underflow_bits):
output_bits.append(0)
encoded_number = "".join(map(str, output_bits))
return encoded_number, cumulative_probabilities, probabilities
# I FINALLY UNDERSTANDING THIS ONE BOSS.
# it feels good to understand, but its not intuitive yet, like I can't rotate
# the cum_probs in my mind, so I'll try implement decoding by myself completely.
# https://www.youtube.com/watch?v=7vfqhoJVwuc
# according to above we're not done. We also need to binary subdivide until
# we get a chunk that lies completely between low and high
# i think this is so that we get a binary string that represents our compressed text
# https://web.archive.org/web/20241217122540/https://marknelson.us/posts/2014/10/19/data-compression-with-arithmetic-coding.html
# so from the above post, apparently we don't NEED to binary subdivide, thats just a better method???
# what i've implemented above is called the infinite precision method
# because as the length of the text gets bigger, it'll need more and more precision.
# so I'm going to comment it all and write up a bitstream method
# def decode(text, text_dc, low, high):
# # so its basically the reverse of the encoding exactly no?
# # I have to find out how to get the previous low and high, so range_size would be known
# # and cumulative_probability_higher and lower should be known if I reverse the
# # cum_prob dictionary right?
# # I'll copy over the cum_prob code.
# counts = Counter(text)
# total_count = len(text)
# probabilities = {s: counts[s] / total_count for s in counts}
#
# sorted_symbols = sorted(text_dc.keys())
# cumulative_prob = 0
# cumulative_probabilities = {}
# for s in sorted_symbols:
# cumulative_probabilities[s] = cumulative_prob
# cumulative_prob += probabilities[s]
# # up to here is the exact same code as encode. Probably should make this a
# # function, but I'll do that later
#
def decode(encoded_number, cumulative_probabilities, probabilities, text_dc, length):
# Reverse the symbol-to-index mapping for easier lookup during decoding
index_to_symbol = {i: s for s, i in text_dc.items()}
low = 0.0
high = 1.0
decoded_text = ""
encoded_value = int(encoded_number, 2) / (2 ** len(encoded_number)) # Convert binary string to a value between 0 and 1
for _ in range(length): # Decode for the length of the original text
range_size = high - low
# Find the symbol whose range contains encoded_value
for symbol in sorted(cumulative_probabilities, key=cumulative_probabilities.get):
symbol_low = cumulative_probabilities[symbol]
symbol_high = symbol_low + probabilities[symbol]
scaled_low = low + range_size * symbol_low
scaled_high = low + range_size * symbol_high
if scaled_low <= encoded_value < scaled_high:
decoded_text += symbol
low = scaled_low
high = scaled_high
break
# Renormalization (mirror the encoder)
while True:
if high < 0.5:
low = low * 2
high = high * 2
encoded_value = encoded_value * 2
elif low >= 0.5:
low = 2 * (low - 0.5)
high = 2 * (high - 0.5)
encoded_value = 2 * (encoded_value - 0.5)
elif low >= 0.25 and high < 0.75:
low = 2 * (low - 0.25)
high = 2 * (high - 0.25)
encoded_value = 2 * (encoded_value - 0.25)
else:
break
return decoded_text
encoded_number, cumulative_probabilities, probabilities = encode(text, text_dc)
print(f"Encoded number: {encoded_number}")
decoded_text = decode(encoded_number, cumulative_probabilities, probabilities, text_dc, len(text))
print(f"Decoded Text: {decoded_text}")
# I also want to try compressing a big text file and seeing the ratio
# https://www.gutenberg.org/cache/epub/15/pg15.txt
file = open("pg15.txt")
file_text = file.read()
text_dc = {s:i for i,s in enumerate(sorted(set(file_text)))}
encoded_number, cumulative_probabilities, probabilities = encode(file_text, text_dc)
print(f"encoded_number bytes: {getsizeof(encoded_number)}")
print(f"cumulative_probabilities bytes: {getsizeof(cumulative_probabilities)}")
print(f"probabilities bytes: {getsizeof(probabilities)}")
print(f"text_dc bytes: {getsizeof(text_dc)}")
print(f"Encoded number length: {(len(encoded_number)/8)/1000:.2f} KiloBytes")
print(text_dc)
file.close()
# What are the next steps of the operation?
# well the way arithmetic coding can be kinda goated, is if you use a good probabilities generator for the text. idk how that would work.
-------------/compression % ls -lah
total 4360
drwxr-xr-x@ 8 -------- staff 256B 21 Jan 19:43 .
drwxr-xr-x 5 -------- staff 160B 19 Jan 12:11 ..
-rw-r--r--@ 1 -------- staff 7.5K 21 Jan 17:43 arithmetic_coding_v1.py
-rw-r--r--@ 1 -------- staff 278B 19 Jan 15:27 goal.md
-rw-r--r--@ 1 -------- staff 716B 21 Jan 19:42 huffman_v1.py
-rw-r--r--@ 1 -------- staff 1.2M 21 Jan 13:23 pg15.txt
-rw-r--r--@ 1 -------- staff 500K 21 Jan 13:23 pg15.txt.gz
-rw-r--r--@ 1 ------- staff 410K 21 Jan 13:23 pg15.txt.xz
-------------/compression % python3 arithmetic_coding_v1.py
{' ': 0, '!': 1, 'H': 2, 'd': 3, 'e': 4, 'l': 5, 'o': 6, 'r': 7, 'w': 8}
Encoded number: 0011001011000111011111111111111101110
Decoded Text: Hello world!
encoded_number bytes: 5579341
cumulative_probabilities bytes: 3328
probabilities bytes: 3328
text_dc bytes: 3328
Encoded number length: 697.41 KiloBytes
{'\n': 0, ' ': 1, '!': 2, '#': 3, '$': 4, '%': 5, '&': 6, '(': 7, ')': 8, '*': 9, ',': 10, '-': 11, '.': 12, '/': 13, '0': 14, '1': 15, '2': 16, '3': 17, '4': 18, '5': 19, '6': 20, '7': 21, '8': 22, '9': 23, ':': 24, ';': 25, '?': 26, 'A': 27, 'B': 28, 'C': 29, 'D': 30, 'E': 31, 'F': 32, 'G': 33, 'H': 34, 'I': 35, 'J': 36, 'K': 37, 'L': 38, 'M': 39, 'N': 40, 'O': 41, 'P': 42, 'Q': 43, 'R': 44, 'S': 45, 'T': 46, 'U': 47, 'V': 48, 'W': 49, 'X': 50, 'Y': 51, 'Z': 52, '[': 53, ']': 54, '_': 55, 'a': 56, 'b': 57, 'c': 58, 'd': 59, 'e': 60, 'f': 61, 'g': 62, 'h': 63, 'i': 64, 'j': 65, 'k': 66, 'l': 67, 'm': 68, 'n': 69, 'o': 70, 'p': 71, 'q': 72, 'r': 73, 's': 74, 't': 75, 'u': 76, 'v': 77, 'w': 78, 'x': 79, 'y': 80, 'z': 81, '£': 82, 'â': 83, 'æ': 84, 'è': 85, 'é': 86, 'ù': 87, 'Œ': 88, 'œ': 89, 'η': 90, 'ο': 91, 'ς': 92, 'τ': 93, 'ϰ': 94, 'ו': 95, 'ח': 96, '—': 97, '‘': 98, '’': 99, '“': 100, '”': 101, '•': 102, '™': 103, '\ufeff': 104}
// redacting information removed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment