-
Introduction: 'another_4433185' gathers different ways of converting Latin-1 characters (such as accented letters) in the Unicode range 0..255 into plain 7-bit ASCII/HTML strings.
-
Original author: this was originally fetched from cookedapple. It was a very basic function (see 'l2h').
-
Content - Files: README.md - this markdown (help) file; l2h - converts Unicode chars to HTML (Python), incomplete; letter2html.py - generates HTML or Python script for handling Latin-1.
-
-
Save serrasqueiro/c22acb8c3044747bd083e9f720de23e7 to your computer and use it in GitHub Desktop.
convert unicode chars to html (incomplete)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Latin-1 code point -> ASCII-only replacement text.
# Accented letters that have an entry with an HTML named entity use it;
# the remaining letters fall back to a bare ASCII approximation, and
# symbols use a short "{name}" placeholder or a look-alike character.
_L2H_XLATE = {
    0xc0: '&Agrave;', 0xc1: '&Aacute;', 0xc2: '&Acirc;', 0xc3: '&Atilde;',
    0xc4: '&Auml;', 0xc5: '&Aring;',
    0xc6: 'Ae', 0xc7: 'C',
    0xc8: '&Egrave;', 0xc9: '&Eacute;', 0xca: 'E', 0xcb: 'E',
    0xcc: '&Igrave;', 0xcd: '&Iacute;', 0xce: 'I', 0xcf: 'I',
    0xd0: 'Th', 0xd1: 'N',
    0xd2: '&Ograve;', 0xd3: '&Oacute;', 0xd4: 'O', 0xd5: 'O', 0xd6: 'O',
    0xd8: 'O',
    # 0xda is U-acute; it used to share the U-grave value of 0xd9.
    0xd9: '&Ugrave;', 0xda: '&Uacute;', 0xdb: 'U', 0xdc: 'U',
    0xdd: 'Y', 0xde: 'th', 0xdf: 'ss',
    0xe0: '&agrave;', 0xe1: '&aacute;', 0xe2: '&acirc;', 0xe3: '&atilde;',
    0xe4: '&auml;', 0xe5: '&aring;',
    0xe6: 'ae', 0xe7: 'c',
    0xe8: '&egrave;', 0xe9: '&eacute;', 0xea: 'e', 0xeb: 'e',
    0xec: '&igrave;', 0xed: '&iacute;', 0xee: 'i', 0xef: 'i',
    0xf0: 'th', 0xf1: 'n',
    0xf2: '&ograve;', 0xf3: '&oacute;', 0xf4: 'o', 0xf5: 'o', 0xf6: 'o',
    0xf8: 'o',
    0xf9: '&ugrave;', 0xfa: '&uacute;', 0xfb: 'u', 0xfc: 'u',
    0xfd: 'y', 0xfe: 'th', 0xff: 'y',
    0xa1: '!', 0xa2: '{cent}', 0xa3: '{pound}', 0xa4: '{currency}',
    0xa5: '{yen}', 0xa6: '|', 0xa7: '{section}', 0xa8: '{umlaut}',
    0xa9: '{C}', 0xaa: '{^a}', 0xab: '<<', 0xac: '{not}',
    0xad: '-', 0xae: '{R}', 0xaf: '_', 0xb0: '{degrees}',
    0xb1: '{+/-}', 0xb2: '{^2}', 0xb3: '{^3}', 0xb4: "'",
    0xb5: '{micro}', 0xb6: '{paragraph}', 0xb7: '*', 0xb8: '{cedilla}',
    0xb9: '{^1}', 0xba: '{^o}', 0xbb: '>>',
    0xbc: '{1/4}', 0xbd: '{1/2}', 0xbe: '{3/4}', 0xbf: '?',
    0xd7: '*', 0xf7: '/',
}


def latin1_to_html(unicrap):
    """Return a plain-ASCII rendering of a text string for use in HTML.

    Each character is handled by its code point:

    * code points listed in the translation table (0xa1..0xff) are replaced
      by the table value: an HTML named entity, an ASCII approximation, or a
      "{name}" placeholder;
    * 7-bit ASCII characters (below 0x80) are copied unchanged; note that
      '<', '>' and '&' are NOT escaped;
    * every other character (0x80..0xa0 and anything above 0xff) is dropped.

    Parameters:
        unicrap: the text to convert; iterated one character at a time.

    Returns:
        The converted string; an empty input yields an empty string.
    """
    pieces = []
    for ch in unicrap:
        code = ord(ch)
        replacement = _L2H_XLATE.get(code)
        if replacement is not None:
            pieces.append(replacement)
        elif code < 0x80:
            pieces.append(str(ch))
        # Anything else has no ASCII rendering in the table: drop it.
    return ''.join(pieces)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# letter2html.py -- (c)2019 Henrique Moreira
"""
letter2html.py - simple Python functions to convert text to HTML
"""
#
# test this script
#
def test_letter2html(outFile, inArgs):
    """Self-test: check the module singleton and dump a fresh table.

    Parameters:
        outFile: accepted for the caller's convenience; not used here.
        inArgs: command-line arguments; not used here.

    Returns:
        0 when the checks pass (an AssertionError is raised otherwise).
    """
    # The import-time singleton must hold at least one conversion.
    assert translateTable.nConverts > 0
    # Building a table with debug=1 triggers its debug dump as a side effect.
    TranslateTable(debug=1)
    return 0
# Latin-1 code point -> ASCII-only replacement text, shared by
# TranslateTable and latin1_to_html().  Values are an HTML named entity,
# a bare ASCII approximation, or a "{name}" placeholder.
_LATIN1_XLATE = {
    0xc0: '&Agrave;', 0xc1: '&Aacute;', 0xc2: '&Acirc;', 0xc3: '&Atilde;',
    0xc4: '&Auml;', 0xc5: '&Aring;',
    0xc6: 'Ae', 0xc7: '&Ccedil;',
    0xc8: '&Egrave;', 0xc9: '&Eacute;', 0xca: 'E', 0xcb: 'E',
    0xcc: '&Igrave;', 0xcd: '&Iacute;', 0xce: 'I', 0xcf: 'I',
    0xd0: 'Th', 0xd1: 'N',
    0xd2: '&Ograve;', 0xd3: '&Oacute;', 0xd4: 'O', 0xd5: 'O', 0xd6: 'O',
    0xd8: 'O',
    # 0xda is U-acute; it used to share the U-grave value of 0xd9.
    0xd9: '&Ugrave;', 0xda: '&Uacute;', 0xdb: 'U', 0xdc: 'U',
    0xdd: 'Y', 0xde: 'th', 0xdf: 'ss',
    0xe0: '&agrave;', 0xe1: '&aacute;', 0xe2: '&acirc;', 0xe3: '&atilde;',
    0xe4: '&auml;', 0xe5: '&aring;',
    0xe6: 'ae', 0xe7: 'c',
    0xe8: '&egrave;', 0xe9: '&eacute;', 0xea: 'e', 0xeb: 'e',
    0xec: '&igrave;', 0xed: '&iacute;', 0xee: 'i', 0xef: 'i',
    0xf0: 'th', 0xf1: 'n',
    0xf2: '&ograve;', 0xf3: '&oacute;', 0xf4: 'o', 0xf5: 'o', 0xf6: 'o',
    0xf8: 'o',
    0xf9: '&ugrave;', 0xfa: '&uacute;', 0xfb: 'u', 0xfc: 'u',
    0xfd: 'y', 0xfe: 'th', 0xff: 'y',
    0xa1: '!', 0xa2: '{cent}', 0xa3: '{pound}', 0xa4: '{currency}',
    0xa5: '&yen;',
    0xa6: '|',  # broken bar
    0xa7: '{section}', 0xa8: '{umlaut}',
    0xa9: '&copy;', 0xaa: '{^a}', 0xab: '<<', 0xac: '&not;',
    0xad: '-', 0xae: '{R}', 0xaf: '_', 0xb0: '{degrees}',
    0xb1: '{+/-}', 0xb2: '{^2}', 0xb3: '{^3}', 0xb4: "'",
    0xb5: '{micro}', 0xb6: '{paragraph}', 0xb7: '*',
    0xb8: '&cedil;',  # spacing cedilla (not the letter c-cedilla)
    0xb9: '{^1}', 0xba: '{^o}', 0xbb: '>>',
    0xbc: '{1/4}', 0xbd: '{1/2}', 0xbe: '{3/4}',
    0xbf: '?',  # inverted question mark
    0xd7: '*',  # multiplication sign
    0xf7: '/',  # division sign
}


class TranslateTable:
    """Holder for the Latin-1 to HTML/ASCII translation table.

    Attributes:
        xlate: private copy of the translation table (code point -> text).
        nConverts: number of entries in ``xlate``.
    """

    def __init__(self, debug=0):
        """Build the table and, when ``debug`` is non-zero, dump it.

        Parameters:
            debug: 0 is silent; 1 prints an HTML listing of the table;
                2 or more additionally prints the raw keyword arguments.
        """
        self.xlate = dict(_LATIN1_XLATE)
        self.nConverts = len(self.xlate)
        self.hash_convertions(debug, relate=self.xlate)

    @staticmethod
    def _hint_str(info, code):
        """Return an HTML link to the fileformat.info page for ``code``."""
        return ("<a {} href='https://www.fileformat.info/info/unicode/char/"
                "{:04x}/index.htm'>{}</a>").format("target='_blank'", code, info)

    def hash_convertions(self, debug, **tables):
        """Optionally print an HTML report about a translation table.

        Parameters:
            debug: verbosity level (see ``__init__``).
            **tables: keyword arguments; ``relate`` is required and must map
                code points in 0..255 to non-empty replacement strings.

        Returns:
            True.

        Raises:
            KeyError: if ``relate`` is not supplied.
            AssertionError: in debug mode, if a listed value is empty or
                starts with '&' without ending in ';'.
        """
        if debug >= 2:
            for name, value in tables.items():
                print("key: <b>{}</b>".format(name), "is: <pre>", value, "</pre>\n")
            print("<hr>")
        table = tables["relate"]
        if debug > 0:
            self._dump_table(table)
        return True

    def _dump_table(self, table):
        """Print entity entries, then the other entries, then unused codes."""
        used = [0] * 256
        # First pass lists values that start with "&" (HTML entities); the
        # second pass (empty prefix) lists whatever has not been shown yet.
        for kind in ("&", ""):
            for code, html in table.items():
                assert len(html) > 0
                if used[code] > 0:
                    continue
                label = ("k: 0x{:02x}".format(code) + " "
                         + self._hint_str("{:d}d".format(code), code))
                if kind == "":
                    # Escape angle brackets so values such as '<<' show up
                    # literally in the HTML report.
                    show = html.replace("<", "&lt;").replace(">", "&gt;")
                    print(label, "; html is:", show, "<BR/>")
                    used[code] += 1
                elif html.startswith(kind):
                    print(label, "; html string is:", html, "<BR/>")
                    used[code] += 1
                if html.startswith("&"):
                    # An entity must be terminated by a semicolon.
                    assert html.endswith(";")
        # Report every 8-bit code point that has no entry in the table.
        for code in range(0x80, 256):
            if code == 0xa0:
                print("<hr>Now ASCII after 0xA0 (160d)<br>")
            if used[code] <= 0:
                hint = self._hint_str("info", code)
                print("<p>unused k:", "0x{:02x} #{:03};".format(code, code), hint, "</p>")


def latin1_to_html(s):
    """Return a plain-ASCII rendering of a text string for use in HTML.

    Each character is handled by its code point:

    * code points listed in the translation table (0xa1..0xff) are replaced
      by the table value: an HTML named entity, an ASCII approximation, or a
      "{name}" placeholder;
    * 7-bit ASCII characters (below 0x80) are copied unchanged; note that
      '<', '>' and '&' are NOT escaped;
    * every other character (0x80..0xa0 and anything above 0xff) is dropped.

    Parameters:
        s: the text to convert; iterated one character at a time.

    Returns:
        The converted string; an empty input yields an empty string.
    """
    pieces = []
    for ch in s:
        code = ord(ch)
        replacement = _LATIN1_XLATE.get(code)
        if replacement is not None:
            pieces.append(replacement)
        elif code < 0x80:
            pieces.append(str(ch))
        # Anything else has no ASCII rendering in the table: drop it.
    return ''.join(pieces)
#
# Global / singletons
#
# Shared table instance, built once at import time with the default debug=0.
translateTable = TranslateTable()


#
# Test suite
#
if __name__ == "__main__":
    import sys

    # Run the self-test against stdout and use its result as the exit status.
    sys.exit(test_letter2html(sys.stdout, sys.argv[1:]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment