Last active
August 29, 2015 14:27
-
-
Save riccardomurri/3c3ccec30f037be174d3 to your computer and use it in GitHub Desktop.
Two simple functions for rendering a Unicode string using ASCII characters only.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python
# -*- encoding: utf-8 -*- | |
# | |
""" | |
Two simple functions for rendering a Unicode string using ASCII | |
characters only. | |
The only sensible applications are on words in a Latin-derived
alphabet (i.e., anything that could be rendered in an ISO-8859-*
character set); anything else will just be replaced by a string of
"unprintable" marks.
""" | |
def asciify_crudely(unistr, default='?'):
    # NOTE: under Python 2 the docstring must be a *unicode* literal,
    # otherwise the doctests (which contain non-ASCII text) fail.
    u"""
    Render unicode string `unistr` using ASCII characters only.

    The rendering is quite crude in that any non-ASCII character is
    just replaced with the glyph `?`::

      >>> asciify_crudely(u'pâté')
      'p?t?'
      >>> asciify_crudely(u'PÂTÉ')
      'P?T?'

    The character to be substituted for untranslatable characters can
    be passed as second argument::

      >>> asciify_crudely(u'pâté', '*')
      'p*t*'

    ASCII characters (code points below 128) are copied through
    unchanged; an empty input yields an empty string.
    """
    # Decide by code point rather than by attempting an ASCII
    # decode/encode round-trip: the comparison behaves identically on
    # Python 2 and Python 3 and needs no exception handling.
    # `str(ch)` keeps the result a native `str` on Python 2 as well.
    return ''.join(
        str(ch) if ord(ch) < 128 else default
        for ch in unistr
    )
def latinify(unistr, default='?'):
    # NOTE: under Python 2 the docstring must be a *unicode* literal,
    # otherwise the doctests (which contain non-ASCII text) fail.
    u"""
    Render unicode string `unistr` using ASCII characters only.

    Latin letters with diacritical marks are substituted with their
    "bare" equivalent::

      >>> latinify(u'pâté')
      'pate'
      >>> latinify(u'PÂTÉ')
      'PATE'

    Letters which have no direct equivalent in the latin alphabet are
    replaced with the glyph `?`::

      >>> latinify(u'Sigurður Þórarinsson')
      'Sigur?ur ?orarinsson'

    The character to be substituted for untranslatable characters can
    be passed as second argument::

      >>> latinify(u'Sigurður Þórarinsson', '*')
      'Sigur*ur *orarinsson'

    Characters that have no Unicode name at all, or whose name does
    not follow the ``LATIN <case> LETTER <X> ...`` pattern, are
    replaced with `default` as well.
    """
    from unicodedata import name

    converted = []
    for unich in unistr:
        # Plain ASCII is copied through untouched; testing the code
        # point works the same way on Python 2 and Python 3.
        if ord(unich) < 128:
            converted.append(str(unich))
            continue
        # Deduce a latin letter equivalent from the Unicode character
        # name; e.g., since `name(u'á') == 'LATIN SMALL LETTER A WITH
        # ACUTE'`, translate `á` to `a`.  Some characters are named
        # "LATIN ... LETTER" although no direct equivalent exists in
        # the Latin alphabet (e.g., Þ, "LATIN CAPITAL LETTER THORN");
        # those are excluded by requiring that the letter name is a
        # single character.
        #
        # `name(unich, '')` avoids a ValueError for unnamed code
        # points, and the length check avoids an IndexError for short
        # names such as "LATIN CROSS".
        what = name(unich, '').split()
        if (len(what) >= 4
                and what[0] == 'LATIN'
                and what[2] == 'LETTER'
                and len(what[3]) == 1):
            if what[1] == 'SMALL':
                ch = what[3].lower()
            else:  # e.g. what[1] == 'CAPITAL'
                ch = what[3].upper()
        else:
            ch = default
        converted.append(ch)
    return ''.join(converted)
if __name__ == '__main__':
    # When executed as a script, run the doctests embedded in the
    # docstrings of this module.  Whitespace differences between the
    # expected and the actual output are ignored.
    import doctest
    flags = doctest.NORMALIZE_WHITESPACE
    doctest.testmod(name='asciify', optionflags=flags)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment