Skip to content

Instantly share code, notes, and snippets.

@arduinka55055
Last active February 2, 2024 10:10
Show Gist options
  • Save arduinka55055/3ee02921f2bbcfcafe4fce393ef4e9ff to your computer and use it in GitHub Desktop.
Save arduinka55055/3ee02921f2bbcfcafe4fce393ef4e9ff to your computer and use it in GitHub Desktop.
Лабораторна 1, досліджуємо ентропію тексту
## -*- coding: utf-8 -*-
# Dependencies: pip install numpy matplotlib
# (requests is not used by this script)
import numpy as np
import matplotlib.pyplot as plt
# Load the text whose entropy is analysed. The context manager guarantees the
# file is closed even if reading or UTF-8 decoding raises.
with open("/Users/denis/Downloads/eneida.txt", "r", encoding="utf-8") as file:
    text = file.read()
# Fold case so upper- and lower-case forms of a letter are counted together.
text = text.lower()
# Count how often each symbol of interest occurs in the text.
# Printable ASCII (code points 33..126) is skipped; every other character --
# non-ASCII letters, space, newline and other control characters -- is counted.
char_count = {}
for char in text:
    if 32 < ord(char) < 127:
        continue
    # Give whitespace a visible stand-in, since the keys are later used as
    # chart labels: space -> '_', newline -> '\'.
    if char == ' ':
        char = '_'
    elif char == '\n':
        char = '\\'
    char_count[char] = char_count.get(char, 0) + 1
# Reorder the counts from most to least frequent (the sort is stable, so
# symbols with equal counts keep their first-seen order).
char_count = dict(sorted(char_count.items(), key=lambda item: item[1], reverse=True))
print(char_count)
# Show the raw counts as their own figure. Without show() here the bars would
# stay on the current axes and be overlaid with the next bar chart drawn.
plt.bar(char_count.keys(), char_count.values())
plt.show()
# Estimate each symbol's probability as its share of the *counted* symbols.
# len(text) must not be the denominator: it also includes the printable-ASCII
# characters that were excluded from char_count, which would make the
# probabilities sum to less than 1.
total_counted = sum(char_count.values())
char_prob = {char: count / total_counted for char, count in char_count.items()}
# Sort alphabetically (by symbol) for the first chart.
char_prob = dict(sorted(char_prob.items()))
# Chart 1: bar chart of the probabilities in the dict's current order.
plt.bar(x=char_prob.keys(), height=char_prob.values())
plt.show()
# Chart 2: the same data ranked from most to least probable.
# char_prob is rebound in that order for the steps that follow.
char_prob = {
    symbol: char_prob[symbol]
    for symbol in sorted(char_prob, key=char_prob.get, reverse=True)
}
plt.bar(x=char_prob.keys(), height=char_prob.values())
plt.show()
# Chart 3: pie chart of the same ranked probabilities.
plt.pie(x=char_prob.values(), labels=char_prob.keys())
plt.show()
# Entropy in bits per symbol: H = -sum(p * log2(p)) over all symbols.
# The positive-signed sum is accumulated first and negated once at the end.
weighted_log_sum = 0
for probability in char_prob.values():
    weighted_log_sum += probability * np.log2(probability)
entropy = -weighted_log_sum
print("Entropy: ", entropy)
# Compress the text with zlib and report the compression ratio.
import zlib
# Encode once and measure both sizes in bytes. len(text) counts characters,
# and non-ASCII characters occupy more than one byte in UTF-8, so using it
# would under-report the plain size and the ratio.
raw = text.encode('utf-8')
compressed = zlib.compress(raw)
print("Bytes plain: ", len(raw))
print("Bytes compressed: ", len(compressed))
print("Compression ratio: ", len(raw)/len(compressed))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment