Created
June 28, 2022 15:06
-
-
Save FilipDominec/912b18147842ed5de7adbf3fab1413c9 to your computer and use it in GitHub Desktop.
Searches for the charset conversion that would turn a known correct string into a given wrongly encoded one
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/python3
#-*- coding: utf-8 -*-
# Searches for the charset conversion that would turn a known correct string into a given wrongly encoded one
# Public domain, written by Filip Dominec 2022
# EXAMPLES of (wrong, correct) pairs:
#wrong, correct = "╪ konstrukЯnб ¤eчenб", "ě konstrukční řešení"
#wrong, correct = "slouÄeninovĂ˝ch", "sloučeninových"
#wrong, correct = "pøípravu slouèeninových polovodièù", "přípravu sloučeninových polovodičů"
#wrong, correct = "ý", "ý"
#wrong, correct = "Pro přípravu sloučeninových polovodičů vyuľívá jako zdrojové materiály", "Pro přípravu sloučeninových polovodičů využívá jako zdrojové materiály"
#wrong, correct = "à", "ů"
#wrong, correct = "v∞m╪r", "výměr"
#wrong, correct = "slouèeninových", "sloučeninových"
#wrong, correct = "vyuľívá","využívá"
#wrong, correct = "vyu¾ívá", "využívá"
# The pair under investigation: `wrong` is the garbled text as it appears,
# `correct` is what the same text should read.
wrong, correct = "M╪²ení", "Měření"
import os | |
## Try all encodings (big table!) | |
def encodinglist():  # https://stackoverflow.com/questions/1728376/get-a-list-of-all-the-encodings-python-can-encode-to
    """Return a sorted list of the encodings this Python can encode text to.

    Every entry in the directory of the standard ``encodings`` package is
    treated as a candidate codec name (file extension stripped).  A candidate
    is kept when encoding an empty string with it succeeds.  The names "idna"
    and "punycode" are always left out.  Underscores in the returned names
    are replaced by dashes (e.g. ``utf_8`` becomes ``utf-8``).

    Returns:
        A list of unique encoding names in ascending order.
    """
    encodings_dir = os.path.dirname(__import__("encodings").__file__)
    found = set()  # a set, so that several files sharing one stem yield one name
    for entry in os.listdir(encodings_dir):
        name = os.path.splitext(entry)[0]
        if name in ("idna", "punycode"):
            continue
        try:
            "".encode(name)
        except Exception:
            # Not a usable text codec (helper module, platform-specific codec,
            # bytes-to-bytes codec, ...).  Unlike a bare ``except``, this
            # still lets KeyboardInterrupt and SystemExit propagate.
            continue
        found.add(name.replace("_", "-"))
    return sorted(found)
enclist = encodinglist()

## Narrow list of likely encodings
#enclist = ['ascii', 'utf8', 'latin-1']
#win_encs = [f'Windows-125{n}' for n in range(8)]
#iso_encs = [f'ISO-8859-{n}' for n in range(1,10) ]
#enclist = enclist + win_encs + iso_encs

# Results collected while the matrix below is printed.
possible_froms = []      # encodings the correct text may really have been stored in
possible_tos = []        # encodings it may have been (mis)interpreted as
possible_solutions = []  # matching (really encoded, interpreted as) pairs

# Pad every encoding name to a common width so rows and columns line up.
enclen = max(len(c) for c in enclist)
enclist_aligned = [f"{enc:{enclen}} " for enc in enclist]

# Column header: the padded names are transposed, so each encoding name is
# written vertically above its column.  The backslash is escaped because
# "\ " is not a valid escape sequence.
print("REALLY ENCODED: \\ BUT INTERPRETED AS:")
for ll in ("".join(j) for j in zip(*enclist_aligned)):
    print(" "*enclen + " " + ll)

# One row per "really encoded" candidate f, one character per "interpreted as"
# candidate t:  X = this pair reproduces the correct text, E = the conversion
# raised an error, blank = same encoding on both sides, · = no match.
for f, a in zip(enclist, enclist_aligned):
    print(a, end="")
    for t in enclist:
        try:
            # Undo the suspected mistake: turn the garbled text back into
            # bytes with t, then decode those bytes with f.
            co = wrong.encode(t, "ignore").decode(f, "ignore")
        except Exception:
            # Some codecs fail even with the "ignore" error handler; mark the
            # cell so the columns stay aligned.  KeyboardInterrupt still
            # propagates, unlike with a bare ``except``.
            print("E", end="")
            continue
        if co == correct:
            print("X", end="")
            possible_froms.append(f)
            possible_tos.append(t)
            possible_solutions.append((f, t))
        else:
            print("·" if f != t else " ", end="")
    print()

print(f"Conclusion: when '{correct}' is encoded as:\n\t{set(possible_froms)}\nbut (mis)interpreted as:\n\t{set(possible_tos)},\n it may appear as '{wrong}'")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Example output: