CoderCowMoo · January 21, 2025 16:50
diff --git a/arithmetic_coding_v1.py b/arithmetic_coding_v1.py
 # OK time to arithmetic code. V1 because its so simplistic as a concept
 # I'll make a dictionary of all unique symbols in the string (rather than
 # a predefined dict)
 # and I'll go ahead and subdivide and add it to a running total.
 from collections import Counter
 from sys import getsizeof

 text = "Hello world!"
 text_dc = {s:i for i,s in enumerate(sorted(set(text)))}
 print(text_dc)

 # OK now to encode, we'll
 def encode(text, text_dc):
    #another thing we want for arithmetic coding is the probability of each letter
    counts = Counter(text)
    total_count = len(text)
    probabilities = {s: counts[s] / total_count for s in counts}
    # print(probabilities)

    # yet another thing according to gemini is cumulative probabilities
    # which im not understanding just yet, but maybe implementing will help
    sorted_symbols = sorted(text_dc.keys())
    cumulative_prob = 0
    cumulative_probabilities = {}
    for s in sorted_symbols:
        cumulative_probabilities[s] = cumulative_prob
        cumulative_prob += probabilities[s]
    # print(cumulative_probabilities)

    low = 0.0
    high = 1.0

    # from what I understanding boss, the point of low and high is to get the
    # encoded message. I thought like a single value would be added to/subtracted
    # to get the message, but the message is literally between low and high
    # like the value between them is the kinda precision/leeway of the message?

    underflow_bits = 0
    output_bits = []

    for char in text:
        range_size = high - low

        cumulative_probability_lower = cumulative_probabilities[char]
        cumulative_probability_higher = cumulative_probability_lower + probabilities[char]

        high = low + range_size * cumulative_probability_higher
        low = low + range_size * cumulative_probability_lower

        # print(f"Character: {char}, Index: {text_dc[char]}, Low: {low:.14f}, High: {high:.14f}")

        # gemini made this
        while True:
            if high < 0.5:
                output_bits.append(0)
                for _ in range(underflow_bits):
                    output_bits.append(1)
                underflow_bits = 0
                low = low * 2
                high = high * 2
            elif low >= 0.5:
                output_bits.append(1)
                for _ in range(underflow_bits):
                    output_bits.append(0)
                underflow_bits = 0
                low = 2 * (low - 0.5)
                high = 2 * (high - 0.5)
            elif low >= 0.25 and high < 0.75:
                underflow_bits += 1
                low = 2 * (low - 0.25)
                high = 2 * (high - 0.25)
            else: break

    underflow_bits += 1
    if low < 0.25:
        output_bits.append(0)
        for _ in range(underflow_bits):
            output_bits.append(1)
    else:
        output_bits.append(1)
        for _ in range(underflow_bits):
            output_bits.append(0)

    encoded_number = "".join(map(str, output_bits))
    return encoded_number, cumulative_probabilities, probabilities

    # I FINALLY UNDERSTANDING THIS ONE BOSS.
    # it feels good to understand, but its not intuitive yet, like I can't rotate
    # the cum_probs in my mind, so I'll try implement decoding by myself completely.

    # https://www.youtube.com/watch?v=7vfqhoJVwuc
    # according to above we're not done. We also need to binary subdivide until
    # we get a chunk that lies completely between low and high
    # i think this is so that we get a binary string that represents our compressed text
    # https://web.archive.org/web/20241217122540/https://marknelson.us/posts/2014/10/19/data-compression-with-arithmetic-coding.html
    # so from the above post, apparently we don't NEED to binary subdivide, thats just a better method???
    # what i've implemented above is called the infinite precision method
    # because as the length of the text gets bigger, it'll need more and more precision.
    # so I'm going to comment it all and write up a bitstream method


 # def decode(text, text_dc, low, high):
 #     # so its basically the reverse of the encoding exactly no?
 #     # I have to find out how to get the previous low and high, so range_size would be known
 #     # and cumulative_probability_higher and lower should be known if I reverse the
 #     # cum_prob dictionary right?
 #     # I'll copy over the cum_prob code.
 #     counts = Counter(text)
 #     total_count = len(text)
 #     probabilities = {s: counts[s] / total_count for s in counts}
 #
 #     sorted_symbols = sorted(text_dc.keys())
 #     cumulative_prob = 0
 #     cumulative_probabilities = {}
 #     for s in sorted_symbols:
 #         cumulative_probabilities[s] = cumulative_prob
 #         cumulative_prob += probabilities[s]
 #     # up to here is the exact same code as encode. Probably should make this a
 #     # function, but I'll do that later
 #
 def decode(encoded_number, cumulative_probabilities, probabilities, text_dc, length):

    # Reverse the symbol-to-index mapping for easier lookup during decoding
    index_to_symbol = {i: s for s, i in text_dc.items()}

    low = 0.0
    high = 1.0
    decoded_text = ""
    encoded_value = int(encoded_number, 2) / (2 ** len(encoded_number)) # Convert binary string to a value between 0 and 1

    for _ in range(length):  # Decode for the length of the original text
        range_size = high - low

        # Find the symbol whose range contains encoded_value
        for symbol in sorted(cumulative_probabilities, key=cumulative_probabilities.get):
            symbol_low = cumulative_probabilities[symbol]
            symbol_high = symbol_low + probabilities[symbol]
            
            scaled_low = low + range_size * symbol_low
            scaled_high = low + range_size * symbol_high

            if scaled_low <= encoded_value < scaled_high:
                decoded_text += symbol
                low = scaled_low
                high = scaled_high
                break
        
        # Renormalization (mirror the encoder)
        while True:
            if high < 0.5:
                low = low * 2
                high = high * 2
                encoded_value = encoded_value * 2
            elif low >= 0.5:
                low = 2 * (low - 0.5)
                high = 2 * (high - 0.5)
                encoded_value = 2 * (encoded_value - 0.5)
            elif low >= 0.25 and high < 0.75:
                low = 2 * (low - 0.25)
                high = 2 * (high - 0.25)
                encoded_value = 2 * (encoded_value - 0.25)
            else:
                break

    return decoded_text

 encoded_number, cumulative_probabilities, probabilities = encode(text, text_dc)
 print(f"Encoded number: {encoded_number}")
 decoded_text = decode(encoded_number, cumulative_probabilities, probabilities, text_dc, len(text))
 print(f"Decoded Text: {decoded_text}")

 # I also want to try compressing a big text file and seeing the ratio
 # https://www.gutenberg.org/cache/epub/15/pg15.txt
 file = open("pg15.txt")
 file_text = file.read()
 text_dc = {s:i for i,s in enumerate(sorted(set(file_text)))}
 encoded_number, cumulative_probabilities, probabilities = encode(file_text, text_dc)
 print(f"encoded_number bytes: {getsizeof(encoded_number)}")
 print(f"cumulative_probabilities bytes: {getsizeof(cumulative_probabilities)}")
 print(f"probabilities bytes: {getsizeof(probabilities)}")
 print(f"text_dc bytes: {getsizeof(text_dc)}")
 print(f"Encoded number length: {(len(encoded_number)/8)/1000:.2f} KiloBytes")
 print(text_dc)
 file.close()

 # What are the next steps of the operation?
 # well the way arithmetic coding can be kinda goated, is if you use a good probabilities generator for the text. idk how that would work.
diff --git a/ratio for moby dick.txt b/ratio for moby dick.txt
 -------------/compression % ls -lah
 total 4360
 drwxr-xr-x@ 8 --------  staff   256B 21 Jan 19:43 .
 drwxr-xr-x  5 --------  staff   160B 19 Jan 12:11 ..
 -rw-r--r--@ 1 --------  staff   7.5K 21 Jan 17:43 arithmetic_coding_v1.py
 -rw-r--r--@ 1 --------  staff   278B 19 Jan 15:27 goal.md
 -rw-r--r--@ 1 --------  staff   716B 21 Jan 19:42 huffman_v1.py
 -rw-r--r--@ 1 --------  staff   1.2M 21 Jan 13:23 pg15.txt
 -rw-r--r--@ 1 --------  staff   500K 21 Jan 13:23 pg15.txt.gz
 -rw-r--r--@ 1 -------  staff   410K 21 Jan 13:23 pg15.txt.xz
 -------------/compression % python3 arithmetic_coding_v1.py
 {' ': 0, '!': 1, 'H': 2, 'd': 3, 'e': 4, 'l': 5, 'o': 6, 'r': 7, 'w': 8}
 Encoded number: 0011001011000111011111111111111101110
 Decoded Text: Hello world!
 encoded_number bytes: 5579341
 cumulative_probabilities bytes: 3328
 probabilities bytes: 3328
 text_dc bytes: 3328
 Encoded number length: 697.41 KiloBytes
 {'\n': 0, ' ': 1, '!': 2, '#': 3, '$': 4, '%': 5, '&': 6, '(': 7, ')': 8, '*': 9, ',': 10, '-': 11, '.': 12, '/': 13, '0': 14, '1': 15, '2': 16, '3': 17, '4': 18, '5': 19, '6': 20, '7': 21, '8': 22, '9': 23, ':': 24, ';': 25, '?': 26, 'A': 27, 'B': 28, 'C': 29, 'D': 30, 'E': 31, 'F': 32, 'G': 33, 'H': 34, 'I': 35, 'J': 36, 'K': 37, 'L': 38, 'M': 39, 'N': 40, 'O': 41, 'P': 42, 'Q': 43, 'R': 44, 'S': 45, 'T': 46, 'U': 47, 'V': 48, 'W': 49, 'X': 50, 'Y': 51, 'Z': 52, '[': 53, ']': 54, '_': 55, 'a': 56, 'b': 57, 'c': 58, 'd': 59, 'e': 60, 'f': 61, 'g': 62, 'h': 63, 'i': 64, 'j': 65, 'k': 66, 'l': 67, 'm': 68, 'n': 69, 'o': 70, 'p': 71, 'q': 72, 'r': 73, 's': 74, 't': 75, 'u': 76, 'v': 77, 'w': 78, 'x': 79, 'y': 80, 'z': 81, '£': 82, 'â': 83, 'æ': 84, 'è': 85, 'é': 86, 'ù': 87, 'Œ': 88, 'œ': 89, 'η': 90, 'ο': 91, 'ς': 92, 'τ': 93, 'ϰ': 94, 'ו': 95, 'ח': 96, '—': 97, '‘': 98, '’': 99, '“': 100, '”': 101, '•': 102, '™': 103, '\ufeff': 104}

 // redacting information removed
	# OK time to arithmetic code. V1 because its so simplistic as a concept
	# I'll make a dictionary of all unique symbols in the string (rather than
	# a predefined dict)
	# and I'll go ahead and subdivide and add it to a running total.
	from collections import Counter
	from sys import getsizeof

	text = "Hello world!"
	text_dc = {s:i for i,s in enumerate(sorted(set(text)))}
	print(text_dc)

	# OK now to encode, we'll
	def encode(text, text_dc):
	#another thing we want for arithmetic coding is the probability of each letter
	counts = Counter(text)
	total_count = len(text)
	probabilities = {s: counts[s] / total_count for s in counts}
	# print(probabilities)

	# yet another thing according to gemini is cumulative probabilities
	# which im not understanding just yet, but maybe implementing will help
	sorted_symbols = sorted(text_dc.keys())
	cumulative_prob = 0
	cumulative_probabilities = {}
	for s in sorted_symbols:
	cumulative_probabilities[s] = cumulative_prob
	cumulative_prob += probabilities[s]
	# print(cumulative_probabilities)

	low = 0.0
	high = 1.0

	# from what I understanding boss, the point of low and high is to get the
	# encoded message. I thought like a single value would be added to/subtracted
	# to get the message, but the message is literally between low and high
	# like the value between them is the kinda precision/leeway of the message?

	underflow_bits = 0
	output_bits = []

	for char in text:
	range_size = high - low

	cumulative_probability_lower = cumulative_probabilities[char]
	cumulative_probability_higher = cumulative_probability_lower + probabilities[char]

	high = low + range_size * cumulative_probability_higher
	low = low + range_size * cumulative_probability_lower

	# print(f"Character: {char}, Index: {text_dc[char]}, Low: {low:.14f}, High: {high:.14f}")

	# gemini made this
	while True:
	if high < 0.5:
	output_bits.append(0)
	for _ in range(underflow_bits):
	output_bits.append(1)
	underflow_bits = 0
	low = low * 2
	high = high * 2
	elif low >= 0.5:
	output_bits.append(1)
	for _ in range(underflow_bits):
	output_bits.append(0)
	underflow_bits = 0
	low = 2 * (low - 0.5)
	high = 2 * (high - 0.5)
	elif low >= 0.25 and high < 0.75:
	underflow_bits += 1
	low = 2 * (low - 0.25)
	high = 2 * (high - 0.25)
	else: break

	underflow_bits += 1
	if low < 0.25:
	output_bits.append(0)
	for _ in range(underflow_bits):
	output_bits.append(1)
	else:
	output_bits.append(1)
	for _ in range(underflow_bits):
	output_bits.append(0)

	encoded_number = "".join(map(str, output_bits))
	return encoded_number, cumulative_probabilities, probabilities

	# I FINALLY UNDERSTANDING THIS ONE BOSS.
	# it feels good to understand, but its not intuitive yet, like I can't rotate
	# the cum_probs in my mind, so I'll try implement decoding by myself completely.

	# https://www.youtube.com/watch?v=7vfqhoJVwuc
	# according to above we're not done. We also need to binary subdivide until
	# we get a chunk that lies completely between low and high
	# i think this is so that we get a binary string that represents our compressed text
	# https://web.archive.org/web/20241217122540/https://marknelson.us/posts/2014/10/19/data-compression-with-arithmetic-coding.html
	# so from the above post, apparently we don't NEED to binary subdivide, thats just a better method???
	# what i've implemented above is called the infinite precision method
	# because as the length of the text gets bigger, it'll need more and more precision.
	# so I'm going to comment it all and write up a bitstream method


	# def decode(text, text_dc, low, high):
	# # so its basically the reverse of the encoding exactly no?
	# # I have to find out how to get the previous low and high, so range_size would be known
	# # and cumulative_probability_higher and lower should be known if I reverse the
	# # cum_prob dictionary right?
	# # I'll copy over the cum_prob code.
	# counts = Counter(text)
	# total_count = len(text)
	# probabilities = {s: counts[s] / total_count for s in counts}
	#
	# sorted_symbols = sorted(text_dc.keys())
	# cumulative_prob = 0
	# cumulative_probabilities = {}
	# for s in sorted_symbols:
	# cumulative_probabilities[s] = cumulative_prob
	# cumulative_prob += probabilities[s]
	# # up to here is the exact same code as encode. Probably should make this a
	# # function, but I'll do that later
	#
	def decode(encoded_number, cumulative_probabilities, probabilities, text_dc, length):

	# Reverse the symbol-to-index mapping for easier lookup during decoding
	index_to_symbol = {i: s for s, i in text_dc.items()}

	low = 0.0
	high = 1.0
	decoded_text = ""
	encoded_value = int(encoded_number, 2) / (2 ** len(encoded_number)) # Convert binary string to a value between 0 and 1

	for _ in range(length): # Decode for the length of the original text
	range_size = high - low

	# Find the symbol whose range contains encoded_value
	for symbol in sorted(cumulative_probabilities, key=cumulative_probabilities.get):
	symbol_low = cumulative_probabilities[symbol]
	symbol_high = symbol_low + probabilities[symbol]

	scaled_low = low + range_size * symbol_low
	scaled_high = low + range_size * symbol_high

	if scaled_low <= encoded_value < scaled_high:
	decoded_text += symbol
	low = scaled_low
	high = scaled_high
	break

	# Renormalization (mirror the encoder)
	while True:
	if high < 0.5:
	low = low * 2
	high = high * 2
	encoded_value = encoded_value * 2
	elif low >= 0.5:
	low = 2 * (low - 0.5)
	high = 2 * (high - 0.5)
	encoded_value = 2 * (encoded_value - 0.5)
	elif low >= 0.25 and high < 0.75:
	low = 2 * (low - 0.25)
	high = 2 * (high - 0.25)
	encoded_value = 2 * (encoded_value - 0.25)
	else:
	break

	return decoded_text

	encoded_number, cumulative_probabilities, probabilities = encode(text, text_dc)
	print(f"Encoded number: {encoded_number}")
	decoded_text = decode(encoded_number, cumulative_probabilities, probabilities, text_dc, len(text))
	print(f"Decoded Text: {decoded_text}")

	# I also want to try compressing a big text file and seeing the ratio
	# https://www.gutenberg.org/cache/epub/15/pg15.txt
	file = open("pg15.txt")
	file_text = file.read()
	text_dc = {s:i for i,s in enumerate(sorted(set(file_text)))}
	encoded_number, cumulative_probabilities, probabilities = encode(file_text, text_dc)
	print(f"encoded_number bytes: {getsizeof(encoded_number)}")
	print(f"cumulative_probabilities bytes: {getsizeof(cumulative_probabilities)}")
	print(f"probabilities bytes: {getsizeof(probabilities)}")
	print(f"text_dc bytes: {getsizeof(text_dc)}")
	print(f"Encoded number length: {(len(encoded_number)/8)/1000:.2f} KiloBytes")
	print(text_dc)
	file.close()

	# What are the next steps of the operation?
	# well the way arithmetic coding can be kinda goated, is if you use a good probabilities generator for the text. idk how that would work.
	-------------/compression % ls -lah
	total 4360
	drwxr-xr-x@ 8 -------- staff 256B 21 Jan 19:43 .
	drwxr-xr-x 5 -------- staff 160B 19 Jan 12:11 ..
	-rw-r--r--@ 1 -------- staff 7.5K 21 Jan 17:43 arithmetic_coding_v1.py
	-rw-r--r--@ 1 -------- staff 278B 19 Jan 15:27 goal.md
	-rw-r--r--@ 1 -------- staff 716B 21 Jan 19:42 huffman_v1.py
	-rw-r--r--@ 1 -------- staff 1.2M 21 Jan 13:23 pg15.txt
	-rw-r--r--@ 1 -------- staff 500K 21 Jan 13:23 pg15.txt.gz
	-rw-r--r--@ 1 ------- staff 410K 21 Jan 13:23 pg15.txt.xz
	-------------/compression % python3 arithmetic_coding_v1.py
	{' ': 0, '!': 1, 'H': 2, 'd': 3, 'e': 4, 'l': 5, 'o': 6, 'r': 7, 'w': 8}
	Encoded number: 0011001011000111011111111111111101110
	Decoded Text: Hello world!
	encoded_number bytes: 5579341
	cumulative_probabilities bytes: 3328
	probabilities bytes: 3328
	text_dc bytes: 3328
	Encoded number length: 697.41 KiloBytes
	{'\n': 0, ' ': 1, '!': 2, '#': 3, '$': 4, '%': 5, '&': 6, '(': 7, ')': 8, '*': 9, ',': 10, '-': 11, '.': 12, '/': 13, '0': 14, '1': 15, '2': 16, '3': 17, '4': 18, '5': 19, '6': 20, '7': 21, '8': 22, '9': 23, ':': 24, ';': 25, '?': 26, 'A': 27, 'B': 28, 'C': 29, 'D': 30, 'E': 31, 'F': 32, 'G': 33, 'H': 34, 'I': 35, 'J': 36, 'K': 37, 'L': 38, 'M': 39, 'N': 40, 'O': 41, 'P': 42, 'Q': 43, 'R': 44, 'S': 45, 'T': 46, 'U': 47, 'V': 48, 'W': 49, 'X': 50, 'Y': 51, 'Z': 52, '[': 53, ']': 54, '_': 55, 'a': 56, 'b': 57, 'c': 58, 'd': 59, 'e': 60, 'f': 61, 'g': 62, 'h': 63, 'i': 64, 'j': 65, 'k': 66, 'l': 67, 'm': 68, 'n': 69, 'o': 70, 'p': 71, 'q': 72, 'r': 73, 's': 74, 't': 75, 'u': 76, 'v': 77, 'w': 78, 'x': 79, 'y': 80, 'z': 81, '£': 82, 'â': 83, 'æ': 84, 'è': 85, 'é': 86, 'ù': 87, 'Œ': 88, 'œ': 89, 'η': 90, 'ο': 91, 'ς': 92, 'τ': 93, 'ϰ': 94, 'ו': 95, 'ח': 96, '—': 97, '‘': 98, '’': 99, '“': 100, '”': 101, '•': 102, '™': 103, '\ufeff': 104}

	// redacting information removed