Last active
February 2, 2024 10:10
-
-
Save arduinka55055/3ee02921f2bbcfcafe4fce393ef4e9ff to your computer and use it in GitHub Desktop.
Лабораторна 1, досліджуємо ентропію тексту
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*-
# Lab 1: measuring the entropy of a (Ukrainian) text.
#
# Reads a UTF-8 text file, tallies the frequency of every non-ASCII
# character (plus space and newline, shown as '_' and '\' so they are
# visible on plot labels), plots the distribution, computes the Shannon
# entropy, and compares against the zlib compression ratio as an
# empirical cross-check.
#
# pip install requests
# pip install matplotlib
# pip install numpy
import zlib

import numpy as np


def char_counts(text):
    """Tally characters of *text*, skipping printable ASCII (codes 33-126).

    The text is lowercased first.  Space is replaced with '_' and newline
    with a backslash so both show up as plot labels.  Returns a dict
    mapping character -> count, ordered by descending count.
    """
    counts = {}
    for ch in text.lower():
        # Skip printable ASCII: the lab studies the Cyrillic alphabet,
        # whitespace, and anything outside the basic Latin range.
        if 32 < ord(ch) < 127:
            continue
        if ch == ' ':
            ch = '_'
        elif ch == '\n':
            ch = '\\'
        counts[ch] = counts.get(ch, 0) + 1
    return dict(sorted(counts.items(), key=lambda kv: kv[1], reverse=True))


def char_probabilities(counts):
    """Convert a count table into probabilities that sum to 1.

    BUGFIX: the original divided each count by len(text), but len(text)
    still includes the printable-ASCII characters that char_counts
    filtered out, so the probabilities did not sum to 1 and the entropy
    was underestimated.  Normalizing by the total of the *counted*
    characters gives a proper probability distribution.
    """
    total = sum(counts.values())
    return {ch: n / total for ch, n in counts.items()}


def entropy(probs):
    """Shannon entropy in bits: H = -sum(p * log2(p))."""
    return -sum(p * np.log2(p) for p in probs.values())


def main(path="/Users/denis/Downloads/eneida.txt"):
    """Run the full analysis on the UTF-8 text file at *path*."""
    # matplotlib is imported lazily so the pure functions above can be
    # used (and tested) without a plotting backend installed.
    import matplotlib.pyplot as plt

    # `with` guarantees the file is closed even if read() raises.
    with open(path, "r", encoding="utf-8") as f:
        text = f.read()

    counts = char_counts(text)
    print(counts)

    probs = char_probabilities(counts)

    # Histogram of probabilities in alphabetical order.
    # (The original also drew a raw-count bar chart onto these same axes
    # without showing it first, corrupting this plot — removed.)
    by_alpha = dict(sorted(probs.items()))
    plt.bar(by_alpha.keys(), by_alpha.values())
    plt.show()

    # Histogram ordered by descending probability.
    by_prob = dict(sorted(probs.items(), key=lambda kv: kv[1], reverse=True))
    plt.bar(by_prob.keys(), by_prob.values())
    plt.show()

    # Pie chart of the same distribution.
    plt.pie(by_prob.values(), labels=by_prob.keys())
    plt.show()

    print("Entropy: ", entropy(probs))

    # Empirical cross-check: a good compressor approaches the entropy
    # bound, so the ratio hints at the text's redundancy.
    compressed = zlib.compress(text.encode('utf-8'))
    print("Bytes plain: ", len(text))
    print("Bytes compressed: ", len(compressed))
    print("Compression ratio: ", len(text) / len(compressed))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment