Created
June 28, 2022 15:06
-
-
Save FilipDominec/912b18147842ed5de7adbf3fab1413c9 to your computer and use it in GitHub Desktop.
Searches for the charset conversion that would turn a known correct string into a given wrongly encoded one
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/python3 | |
| #-*- coding: utf-8 -*- | |
| # Searches for the charset conversion that would turn a known correct string into a given wrongly encoded one | |
| # Public domain, written by Filip Dominec 2022 | |
| # EXAMPLES: | |
| #wrong, correct = "โช konstrukะฏnะฑ ยคeัenะฑ", "ฤ konstrukฤnรญ ลeลกenรญ" | |
| #wrong, correct = "slouรยeninovฤหch", "slouฤeninovรฝch" | |
| #wrong, correct = "pรธรญpravu slouรจeninovรฝch polovodiรจรน", "pลรญpravu slouฤeninovรฝch polovodiฤลฏ" | |
| #wrong, correct = "รฝ", "รฝ" | |
| #wrong, correct = "Pro pลรญpravu slouฤeninovรฝch polovodiฤลฏ vyuฤพรญvรก jako zdrojovรฉ materiรกly", "Pro pลรญpravu slouฤeninovรฝch polovodiฤลฏ vyuลพรญvรก jako zdrojovรฉ materiรกly" | |
| #wrong, correct = "ร ", "ลฏ" | |
| #wrong, correct = "vโmโชr", "vรฝmฤr" | |
| #wrong, correct = "slouรจeninovรฝch", "slouฤeninovรฝch" | |
| #wrong, correct = "vyuฤพรญvรก","vyuลพรญvรก" | |
| #wrong, correct = "vyuยพรญvรก", "vyuลพรญvรก" | |
# Active test case: `wrong` is the garbled text as it appears on screen, `correct` is
# what it should read. Here the Czech word was stored as CP852 bytes but displayed as
# CP437. Both literals must be saved in this file as genuine UTF-8.
wrong, correct = "M╪²ení", "Měření"
| import os | |
| ## Try all encodings (big table!) | |
def encodinglist():  # https://stackoverflow.com/questions/1728376/get-a-list-of-all-the-encodings-python-can-encode-to
    """Return a sorted list of the text encodings bundled with this Python.

    Every entry in the directory of the standard ``encodings`` package is
    treated as a candidate codec name and probed by encoding an empty string
    with it. Entries that are not usable as str -> bytes encodings
    (``__init__``, ``aliases``, binary transforms such as ``base64_codec``,
    platform-specific codecs that are unavailable, ...) are skipped.

    ``idna`` and ``punycode`` are excluded on purpose. Underscores in the
    module names are replaced with hyphens in the returned names.
    """
    import encodings  # local import: only needed to locate the package directory

    package_dir = os.path.dirname(encodings.__file__)
    names = set()  # a set avoids duplicates if both foo.py and foo.pyc are present
    for filename in os.listdir(package_dir):
        name = os.path.splitext(filename)[0]
        if name in ("idna", "punycode"):
            continue
        try:
            "".encode(name)
        except (LookupError, UnicodeError):
            # LookupError: not a (text) encoding; UnicodeError: e.g. the "undefined" codec.
            continue
        names.add(name.replace("_", "-"))
    return sorted(names)
# Candidate encodings: every text codec bundled with this Python (big table!).
enclist = encodinglist()

## Narrow list of likely encodings
#enclist = ['ascii', 'utf8', 'latin-1']
#win_encs = [f'Windows-125{n}' for n in range(8)]
#iso_encs = [f'ISO-8859-{n}' for n in range(1,10) ]
#enclist = enclist + win_encs + iso_encs

# Results of the search. For every hit, `f` is the encoding the text was really
# stored in and `t` is the encoding it was wrongly displayed with.
possible_froms = []
possible_tos = []
possible_solutions = []

# Pad all names to the same width so that they can serve as row labels and,
# transposed character by character, as vertical column labels.
enclen = max((len(c) for c in enclist), default=0)
enclist_aligned = [f"{enc:{enclen}} " for enc in enclist]

print("REALLY ENCODED: \\ BUT INTERPRETED AS:")
for ll in ("".join(j) for j in zip(*enclist_aligned)):
    print(" "*enclen + " " + ll)

# One row per "real" encoding `f`, one single-character cell per "misinterpreting"
# encoding `t`:  X = reproduces `correct`,  E = conversion raised,
# middle dot = no match,  blank = no match on the diagonal (f == t).
for f, a in zip(enclist, enclist_aligned):
    print(a, end="")
    for t in enclist:
        try:
            # Undo the suspected mistake: recover the raw bytes using the wrong
            # encoding, then decode them with the supposedly real one.
            co = wrong.encode(t, "ignore").decode(f, "ignore")
        except Exception:
            # Best effort over all codecs; unlike a bare except this still lets
            # Ctrl-C abort the (long) search.
            print("E", end="")
            continue
        if co == correct:
            print("X", end="")
            possible_froms.append(f)
            possible_tos.append(t)
            possible_solutions.append((f, t))
        else:
            # "\u00b7" is the middle dot; every cell must be exactly one character wide.
            print("\u00b7" if f != t else " ", end="")
    print()

if possible_solutions:
    print(f"Conclusion: when '{correct}' is encoded as:\n\t{set(possible_froms)}\nbut (mis)interpreted as:\n\t{set(possible_tos)},\n it may appear as '{wrong}'")
else:
    print(f"Conclusion: no pair of encodings was found that turns '{correct}' into '{wrong}'")
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Example output: