Arithmetic Coding Simple: 1.2 MB to 697.41 KB
# OK time to arithmetic code. V1 because it's so simplistic as a concept.
# I'll make a dictionary of all unique symbols in the string (rather than
# a predefined dict)
# and I'll go ahead and subdivide and add it to a running total.
from collections import Counter
from sys import getsizeof

text = "Hello world!"
text_dc = {s: i for i, s in enumerate(sorted(set(text)))}
print(text_dc)

# OK now to encode, we'll need per-symbol probabilities and cumulative probabilities
def encode(text, text_dc):
    # another thing we want for arithmetic coding is the probability of each letter
    counts = Counter(text)
    total_count = len(text)
    probabilities = {s: counts[s] / total_count for s in counts}
    # print(probabilities)
    # yet another thing according to gemini is cumulative probabilities,
    # which I'm not understanding just yet, but maybe implementing will help
    sorted_symbols = sorted(text_dc.keys())
    cumulative_prob = 0
    cumulative_probabilities = {}
    for s in sorted_symbols:
        cumulative_probabilities[s] = cumulative_prob
        cumulative_prob += probabilities[s]
    # print(cumulative_probabilities)
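    # e.g. for "Hello world!" (12 chars, from the Counter above), in sorted symbol order:
    #   probabilities: ' ':1/12, '!':1/12, 'H':1/12, 'd':1/12, 'e':1/12, 'l':3/12, 'o':2/12, 'r':1/12, 'w':1/12
    #   cumulative:    ' ':0/12, '!':1/12, 'H':2/12, 'd':3/12, 'e':4/12, 'l':5/12, 'o':8/12, 'r':10/12, 'w':11/12
    # so each symbol owns the slice [cumulative, cumulative + probability) of [0, 1)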
    low = 0.0
    high = 1.0
    # from what I understand, boss, the point of low and high is to get the
    # encoded message. I thought a single value would be added to/subtracted from
    # to get the message, but the message is literally between low and high,
    # like the width between them is the kind of precision/leeway of the message?
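    # worked example: the first symbol of "Hello world!" is 'H', which owns [2/12, 3/12),
    # so after one step low = 0.1666..., high = 0.25; each later symbol narrows the
    # interval further, and any single number inside the final interval pins down the message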
    underflow_bits = 0
    output_bits = []
    for char in text:
        range_size = high - low
        cumulative_probability_lower = cumulative_probabilities[char]
        cumulative_probability_higher = cumulative_probability_lower + probabilities[char]
        high = low + range_size * cumulative_probability_higher
        low = low + range_size * cumulative_probability_lower
        # print(f"Character: {char}, Index: {text_dc[char]}, Low: {low:.14f}, High: {high:.14f}")
        # gemini made this
        while True:
            if high < 0.5:
                # interval entirely in the lower half: emit 0 (plus pending underflow 1s) and rescale
                output_bits.append(0)
                for _ in range(underflow_bits):
                    output_bits.append(1)
                underflow_bits = 0
                low = low * 2
                high = high * 2
            elif low >= 0.5:
                # interval entirely in the upper half: emit 1 (plus pending underflow 0s) and rescale
                output_bits.append(1)
                for _ in range(underflow_bits):
                    output_bits.append(0)
                underflow_bits = 0
                low = 2 * (low - 0.5)
                high = 2 * (high - 0.5)
            elif low >= 0.25 and high < 0.75:
                # interval straddles 0.5: defer the decision and remember it as an underflow bit
                underflow_bits += 1
                low = 2 * (low - 0.25)
                high = 2 * (high - 0.25)
            else:
                break
    # flush: emit one last bit (plus any pending underflow bits) so the output
    # identifies a value inside the final [low, high) interval
    underflow_bits += 1
    if low < 0.25:
        output_bits.append(0)
        for _ in range(underflow_bits):
            output_bits.append(1)
    else:
        output_bits.append(1)
        for _ in range(underflow_bits):
            output_bits.append(0)
    encoded_number = "".join(map(str, output_bits))
    return encoded_number, cumulative_probabilities, probabilities
# I'M FINALLY UNDERSTANDING THIS ONE, BOSS.
# it feels good to understand, but it's not intuitive yet, like I can't rotate
# the cum_probs in my mind, so I'll try to implement decoding by myself completely.
# https://www.youtube.com/watch?v=7vfqhoJVwuc
# according to the above we're not done. We also need to binary subdivide until
# we get a chunk that lies completely between low and high
# i think this is so that we get a binary string that represents our compressed text
# https://web.archive.org/web/20241217122540/https://marknelson.us/posts/2014/10/19/data-compression-with-arithmetic-coding.html
# so from the above post, apparently we don't NEED to binary subdivide, that's just a better method???
# what I've implemented above is called the infinite precision method,
# because as the length of the text gets bigger, it'll need more and more precision.
# so I'm going to comment it all out and write up a bitstream method
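# a quick sketch of the bitstream idea (just restating what decode() below already does):
#   bits = encoded_number                      # e.g. "0011..." produced by encode()
#   value = int(bits, 2) / 2 ** len(bits)      # read the bits as a binary fraction 0.b1b2b3...
# that single value lies inside the final [low, high) interval, so walking the
# cumulative-probability slices with it recovers the original symbols one by one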
# def decode(text, text_dc, low, high):
#     # so it's basically the reverse of the encoding exactly, no?
#     # I have to find out how to get the previous low and high, so range_size would be known
#     # and cumulative_probability_higher and lower should be known if I reverse the
#     # cum_prob dictionary right?
#     # I'll copy over the cum_prob code.
#     counts = Counter(text)
#     total_count = len(text)
#     probabilities = {s: counts[s] / total_count for s in counts}
#
#     sorted_symbols = sorted(text_dc.keys())
#     cumulative_prob = 0
#     cumulative_probabilities = {}
#     for s in sorted_symbols:
#         cumulative_probabilities[s] = cumulative_prob
#         cumulative_prob += probabilities[s]
#     # up to here is the exact same code as encode. Probably should make this a
#     # function, but I'll do that later
#
def decode(encoded_number, cumulative_probabilities, probabilities, text_dc, length):
    # Reverse the symbol-to-index mapping for easier lookup during decoding
    index_to_symbol = {i: s for s, i in text_dc.items()}
    low = 0.0
    high = 1.0
    decoded_text = ""
    encoded_value = int(encoded_number, 2) / (2 ** len(encoded_number))  # Convert binary string to a value between 0 and 1
    for _ in range(length):  # Decode for the length of the original text
        range_size = high - low
        # Find the symbol whose range contains encoded_value
        for symbol in sorted(cumulative_probabilities, key=cumulative_probabilities.get):
            symbol_low = cumulative_probabilities[symbol]
            symbol_high = symbol_low + probabilities[symbol]
            scaled_low = low + range_size * symbol_low
            scaled_high = low + range_size * symbol_high
            if scaled_low <= encoded_value < scaled_high:
                decoded_text += symbol
                low = scaled_low
                high = scaled_high
                break
        # Renormalization (mirror the encoder)
        while True:
            if high < 0.5:
                low = low * 2
                high = high * 2
                encoded_value = encoded_value * 2
            elif low >= 0.5:
                low = 2 * (low - 0.5)
                high = 2 * (high - 0.5)
                encoded_value = 2 * (encoded_value - 0.5)
            elif low >= 0.25 and high < 0.75:
                low = 2 * (low - 0.25)
                high = 2 * (high - 0.25)
                encoded_value = 2 * (encoded_value - 0.25)
            else:
                break
    return decoded_text
encoded_number, cumulative_probabilities, probabilities = encode(text, text_dc)
print(f"Encoded number: {encoded_number}")
decoded_text = decode(encoded_number, cumulative_probabilities, probabilities, text_dc, len(text))
print(f"Decoded Text: {decoded_text}")
# I also want to try compressing a big text file and seeing the ratio
# (a rough ratio check is sketched after this block)
# https://www.gutenberg.org/cache/epub/15/pg15.txt
file = open("pg15.txt", encoding="utf-8")
file_text = file.read()
text_dc = {s: i for i, s in enumerate(sorted(set(file_text)))}
encoded_number, cumulative_probabilities, probabilities = encode(file_text, text_dc)
print(f"encoded_number bytes: {getsizeof(encoded_number)}")
print(f"cumulative_probabilities bytes: {getsizeof(cumulative_probabilities)}")
print(f"probabilities bytes: {getsizeof(probabilities)}")
print(f"text_dc bytes: {getsizeof(text_dc)}")
print(f"Encoded number length: {(len(encoded_number)/8)/1000:.2f} KiloBytes")
print(text_dc)
file.close()
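
# rough compression-ratio sketch (not run here, just the idea):
#   compressed_bytes = len(encoded_number) / 8
#   original_bytes = len(file_text.encode("utf-8"))
#   print(f"ratio: {compressed_bytes / original_bytes:.2%} of the original size")
# for pg15.txt that works out to roughly 697 KB of bitstream for a ~1.2 MB input (see the run below)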
# What are the next steps of the operation?
# well the way arithmetic coding can be kinda goated is if you use a good
# probability model (a "probabilities generator") for the text. idk exactly how
# that would work yet; a minimal sketch of one direction follows below.
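
# one possible direction (a minimal sketch, not wired into encode/decode above;
# the class name and interface are just illustrative): an adaptive order-0 model
# that re-estimates symbol probabilities from the counts seen so far, so the
# coder adapts to the text instead of using one fixed table.
class AdaptiveModel:
    def __init__(self, symbols):
        # start every symbol at count 1 so nothing ever has zero probability
        self.counts = {s: 1 for s in symbols}
        self.total = len(self.counts)

    def probabilities(self):
        # current probability estimate for each symbol
        return {s: c / self.total for s, c in self.counts.items()}

    def update(self, symbol):
        # bump the count after coding a symbol; encoder and decoder must
        # apply the same updates in the same order to stay in sync
        self.counts[symbol] += 1
        self.total += 1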
-------------/compression % ls -lah
total 4360
drwxr-xr-x@ 8 -------- staff 256B 21 Jan 19:43 .
drwxr-xr-x 5 -------- staff 160B 19 Jan 12:11 ..
-rw-r--r--@ 1 -------- staff 7.5K 21 Jan 17:43 arithmetic_coding_v1.py
-rw-r--r--@ 1 -------- staff 278B 19 Jan 15:27 goal.md
-rw-r--r--@ 1 -------- staff 716B 21 Jan 19:42 huffman_v1.py
-rw-r--r--@ 1 -------- staff 1.2M 21 Jan 13:23 pg15.txt
-rw-r--r--@ 1 -------- staff 500K 21 Jan 13:23 pg15.txt.gz
-rw-r--r--@ 1 ------- staff 410K 21 Jan 13:23 pg15.txt.xz
-------------/compression % python3 arithmetic_coding_v1.py
{' ': 0, '!': 1, 'H': 2, 'd': 3, 'e': 4, 'l': 5, 'o': 6, 'r': 7, 'w': 8}
Encoded number: 0011001011000111011111111111111101110
Decoded Text: Hello world!
encoded_number bytes: 5579341
cumulative_probabilities bytes: 3328
probabilities bytes: 3328
text_dc bytes: 3328
Encoded number length: 697.41 KiloBytes
{'\n': 0, ' ': 1, '!': 2, '#': 3, '$': 4, '%': 5, '&': 6, '(': 7, ')': 8, '*': 9, ',': 10, '-': 11, '.': 12, '/': 13, '0': 14, '1': 15, '2': 16, '3': 17, '4': 18, '5': 19, '6': 20, '7': 21, '8': 22, '9': 23, ':': 24, ';': 25, '?': 26, 'A': 27, 'B': 28, 'C': 29, 'D': 30, 'E': 31, 'F': 32, 'G': 33, 'H': 34, 'I': 35, 'J': 36, 'K': 37, 'L': 38, 'M': 39, 'N': 40, 'O': 41, 'P': 42, 'Q': 43, 'R': 44, 'S': 45, 'T': 46, 'U': 47, 'V': 48, 'W': 49, 'X': 50, 'Y': 51, 'Z': 52, '[': 53, ']': 54, '_': 55, 'a': 56, 'b': 57, 'c': 58, 'd': 59, 'e': 60, 'f': 61, 'g': 62, 'h': 63, 'i': 64, 'j': 65, 'k': 66, 'l': 67, 'm': 68, 'n': 69, 'o': 70, 'p': 71, 'q': 72, 'r': 73, 's': 74, 't': 75, 'u': 76, 'v': 77, 'w': 78, 'x': 79, 'y': 80, 'z': 81, '£': 82, 'â': 83, 'æ': 84, 'è': 85, 'é': 86, 'ù': 87, 'Œ': 88, 'œ': 89, 'η': 90, 'ο': 91, 'ς': 92, 'τ': 93, 'ϰ': 94, 'ו': 95, 'ח': 96, '—': 97, '‘': 98, '’': 99, '“': 100, '”': 101, '•': 102, '™': 103, '\ufeff': 104}
// note: usernames and paths redacted