-
-
Save poppingtonic/06efae823d38e6d5a1f455c564572919 to your computer and use it in GitHub Desktop.
English to Tengwar transliterator
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# This file converts English text to Tengwar, using my own personal
# preferences for transliterating Tengwar.
#
# Currently, the output that is created is intended for use with the Tengwar
# Annatar font and related font families.
#
# Example usage:
# >>> print(convert("This was a triumph. I'm making a note here: huge success!"))
#
# -- then paste the resulting text into a document rendered in Tengwar Annatar.
#
# Chelsea Voss, 2015
# Examples can either yield a single character, or a single character after a
# carrier.
# In addition to this, some characters have multiple values, and those depend
# on the previous values.
# Special characters: T for theta, D for eth,
# R for pre-vowel r, S and Z for vowel-less s and z,
# Q for rd, L for ld, W for wh, C for ch, K for kh, G for gh, X for sh, H for
# zh, N for ng.
def dictzip(str1, str2):
    """Map each item of ``str1`` to the item at the same index of ``str2``.

    When ``str1`` contains the same item more than once, the pairing that
    appears last wins (ordinary dict assignment semantics).

    Args:
        str1: sequence (typically a string) supplying the keys.
        str2: sequence of the same length supplying the values.

    Returns:
        A new dict with one entry per distinct item of ``str1``.

    Raises:
        ValueError: if the two sequences differ in length.
    """
    # An explicit check instead of ``assert`` so it still runs under ``-O``.
    if len(str1) != len(str2):
        raise ValueError(
            "dictzip needs sequences of equal length, got %d and %d"
            % (len(str1), len(str2)))
    # zip() works on Python 2 and 3 alike; the previous xrange() loop did not.
    return dict(zip(str1, str2))
# English has two different pronunciations of 'th', and Tengwar distinguishes
# between them. TODO: use a library to determine which 'th' we're dealing
# with. In the meantime: voiced 'th' is the rare one, so the word lists below
# enumerate the cases that are treated as voiced.

# Word beginnings: replace only the first instance of th.
voiced_th_prefices = (
    'their these those although them thine thy thou there'
).split()

# Word beginnings: replace only the second instance of th.
voiced_th_special_prefices = 'thither'.split()

# Must be alone -- punctuation may extend them, but consider 'thank' -- these
# aren't prefixes.
voiced_th_solo_prefices = 'that this than they thee though'.split()

# Matched anywhere inside a word; should have only one th apiece.
voiced_th_always_safe = """
    feather together bathing bathe
    father mother clothing clothe
    brother weather either gather
    other another worthy rather soothing
    soothe smooth leather tether breathe
    breathing lathe seethe seething scathe
    scathing teethe teething loath
    loathing neither thence rhythm
    slither southern bother altogether
    lather hither
""".split()
def replace_th(inp):
    """Mark the voiced 'th' sounds of a word by rewriting them as 'TH'.

    Four word lists are consulted in turn, each on the result of the
    previous step:

    * ``voiced_th_always_safe`` -- matched anywhere inside the word;
    * ``voiced_th_solo_prefices`` -- matched only when the word is exactly
      that entry;
    * ``voiced_th_prefices`` -- matched at the start of the word;
    * ``voiced_th_special_prefices`` -- matched at the start of the word,
      and only the second 'th' of the entry is marked.

    Args:
        inp: the word to inspect (the caller passes it lower-cased).

    Returns:
        The word with every 'th' judged to be voiced replaced by 'TH'.
    """
    word = inp

    # str.replace is a no-op when the stem is absent, so no membership test
    # is needed before substituting.
    for stem in voiced_th_always_safe:
        word = word.replace(stem, stem.replace('th', 'TH'))

    # Whole-word matches only.
    if word in voiced_th_solo_prefices:
        word = word.replace('th', 'TH')

    for prefix in voiced_th_prefices:
        if word.startswith(prefix):
            word = word.replace(prefix, prefix.replace('th', 'TH'))

    for prefix in voiced_th_special_prefices:
        if word.startswith(prefix):
            # Upper-case the first two 'th's, then put the first one back.
            marked = prefix.replace('th', 'TH', 2).replace('TH', 'th', 1)
            word = word.replace(prefix, marked)

    return word
# Punctuation and whitespace -> keystrokes for the Tengwar Annatar font.
punctuation = {
    # Sentence punctuation.
    '.': u'-',
    ',': u'\xb7',
    '!': u'\xc1',
    '?': u'\xc0',
    ';': u'\xc3',
    ':': '-',
    # Quotes, dashes and slashes.
    '"': u'\xbb',
    '\'': u'\xb2',
    '`': u'\xb1',
    '_': u'\xc2',
    '-': u'\\',
    '/': u'\u203a',
    '\\': u'\u203a',
    # Symbols (several map to multi-keystroke sequences).
    '@': '1E',
    '#': '9dE1x#',
    '$': u'k\xa1',
    '%': 'q6R85$1',
    '^': 'z7D1R',
    '&': '5#2',
    '*': u'\u02c6',
    '=': u'\xac',
    '+': u'` \xb0',
    '|': u'\xbd',
    # Whitespace.
    ' ': ' ',
    '\n': '\n',
    '\t': u'\xb7-\xb7',
}
# Every opening bracket shares one glyph and every closing bracket another.
punctuation.update(dict.fromkeys('<[{(', '*'))
punctuation.update(dict.fromkeys('>]})', 'I'))
def elfify_start(inp):
    """Transliterate a whole English text into Tengwar Annatar keystrokes.

    The text is split into tokens -- runs of word characters and
    apostrophes, or single punctuation/whitespace characters -- and each
    token is converted with ``elfify_token``. Characters that match neither
    token pattern are dropped.

    Args:
        inp: the English text to convert.

    Returns:
        A unicode string holding the concatenated conversion of every token.
    """
    import re
    # The hyphen is escaped so that it is a literal '-' rather than the
    # accidental range "'-_" it used to form, and '\t' is listed because the
    # punctuation table defines a glyph for it.
    tokens = re.findall(
        r"[\w']+|[.,!?;\"'\-_`:<>/\\\[\](){}@#$%^&*=+| \n\t]", inp)
    # u''.join yields unicode on Python 2 and str on Python 3, without
    # relying on the Python-2-only ``unicode`` builtin.
    return u''.join(elfify_token(token) for token in tokens)
def elfify_token(item):
    """Convert a single token produced by the tokenizer.

    Args:
        item: a punctuation/whitespace character, a run of digits, or a
            word (possibly containing apostrophes).

    Returns:
        The ``punctuation`` entry for a known punctuation token, the result
        of ``elfify_number`` for an all-digit token, and otherwise the
        result of ``elfify_word`` on the token with its apostrophes removed.
    """
    glyph = punctuation.get(item)
    if glyph is not None:
        return glyph
    if item.isdigit():
        return elfify_number(int(item))
    # Apostrophes carry no sound: "I'm" is transliterated as "Im".
    return elfify_word(item.replace("'", ""))
def elfify_number(num):
    """Return the keystrokes for a number.

    Numerals are not implemented yet: every number, whatever its value,
    is rendered as the same five-backtick placeholder.

    Args:
        num: the number to convert (currently unused).

    Returns:
        The placeholder string.
    """
    # TODO: implement fancy base-12 Elvish numerals
    placeholder = '`' * 5
    return placeholder
def elfify_word(inp):
    """Transliterate one English word into Tengwar Annatar keystrokes.

    The word is lower-cased and rewritten into an intermediate spelling
    (voiced 'th' -> 'TH', hard/soft c and g, pre-vowel 'r' -> 'R',
    'q' -> 'k', consonant 'y' -> 'Y'). A final 's' and a final silent 'e'
    are set aside, the remainder is converted by ``elfify_postfix``, and
    the endings are re-attached as dedicated keystrokes.

    Args:
        inp: the word to convert; may be empty.

    Returns:
        The keystroke string for the word ('' for an empty word).
    """

    def mark_before(text, letter, followers, marked):
        # Swap `letter` for `marked` wherever the next character is one of
        # `followers`. The last character has no successor and is kept.
        head = ''.join(
            marked if here == letter and following in followers else here
            for here, following in zip(text, text[1:])
        )
        return head + text[-1:]

    word = inp.lower()
    if word == '':
        return word

    # 'of' and 'the' have dedicated shorthand keystrokes.
    shorthand = {'of': 'W', 'the': '@'}
    if word in shorthand:
        return shorthand[word]

    # Detect voiced th, replace with TH
    word = replace_th(word)

    # Detect hard and soft c and g: the choice depends on the next letter.
    letters = []
    for here, following in zip(word, word[1:]):
        if here == 'g' and following in 'eiy':
            here = 'j'
        elif here == 'c':
            if following in 'eiy':
                here = 's'
            elif following == 'h':
                here = 'C'  # Ch
            else:
                here = 'k'
        letters.append(here)
    # A word-final c has no following letter and is always hard.
    final = word[-1:]
    letters.append('k' if final == 'c' else final)
    word = ''.join(letters)

    # Detect places where we can use the pre-vowel r
    word = mark_before(word, 'r', 'aeiouy', 'R')

    # q == k
    word = word.replace('q', 'k')

    # A y directly before a vowel is the consonant y (henceforth Y);
    # every other y is left as a vowel.
    word = mark_before(word, 'y', 'aeiou', 'Y')

    # Detach the ending s if we notice one... and it's not after aiou
    has_trailing_s = (
        len(word) > 1 and word[-1] == 's' and word[-2] not in 'aiou'
    )
    if has_trailing_s:
        word = word[:-1]

    # Detach the ending e if we notice one -- the letter before it must
    # not be a vowel, and the word must be at least three letters long.
    has_trailing_e = (
        len(word) >= 3 and word[-1] == 'e' and word[-2] not in 'aeiouy'
    )
    if has_trailing_e:
        word = word[:-1]

    # Elfification
    output = elfify_postfix(word) if word else carrier

    # Detect places where we can use the not-post-vowel s and z: an s ('i')
    # or z (',') keystroke that is not followed by a vowel mark gets its
    # alternate form. The last keystroke is never examined.
    alternate = {'i': '8', ',': 'k'}
    glyphs = []
    for glyph, follower in zip(output, output[1:]):
        if glyph in alternate and follower not in vowels:
            glyph = alternate[glyph]
        glyphs.append(glyph)
    output = ''.join(glyphs) + output[-1:]

    # Add the ending e if we detached it earlier
    if has_trailing_e:
        output = output + 'O'

    # Add the ending s if we detached it earlier; which hook is used
    # depends on the keystroke it attaches to.
    if has_trailing_s:
        last = output[-1]
        if last in '7um8k':
            hook = u'\xc5'
        elif last in 'qwertyo':
            hook = u'\xc6'
        elif last in 'l9':
            hook = u'\xa5'
        else:
            hook = '_'
        output = output + hook

    return output
# Single letters of the intermediate spelling -> Tengwar Annatar keystrokes.
# The two strings are paired position by position. '-' appears several times
# as a key (presumably marking tengwar with no single-letter source), so only
# its last pairing, '-' -> '.', survives.
consonants = dict(zip('tdnrRhpbfvmwsj--lYkg-z-',
                      u'125679qwertyisghjlzxn,.'))

# Two-letter combinations that are written with a single tengwa.
doubles = {
    'sh': 'd', 'zh': 'f',
    'ch': 'a', 'Ch': 'a',
    'ph': 'e', 'kh': 'c',
    'gh': 'v', 'wh': 'o',
    'ng': 'b',
    'rd': 'u', 'ld': 'm',
    'th': '3',
    'TH': '4',  # voiced
}

# Four keystroke variants for each vowel mark; which one is used depends on
# the keystroke that carries the mark (see vowels_for_consonants).
vowel_series = {
    'a': '#EDC',
    'e': '$RFV',
    'i': '%TGB',
    'o': '^YHN',
    'u': '&UJM',
    'y': u'\xd8\xd9\xda\xdb',
}

# Every a/e/i/o/u vowel-mark keystroke, in series order (the y series is
# not included).
vowels = ''.join(vowel_series[letter] for letter in 'aeiou')

# Index into the output of vowel_series.
# For example, a 0 before an A yields #.
# Position in the tuple == index; each string lists the keystrokes that
# take that variant.
vowels_for_consonants = {
    glyph: index
    for index, glyphs in enumerate((
        '2wsx4rfv5tgbjmo',  # index 0
        '1qdc6y',           # index 1
        'az3ehn7ui,l.',     # index 2
        '`~9',              # index 3
    ))
    for glyph in glyphs
}

# Carrier keystrokes; the short one is the default carrier.
short_carrier = '`'
long_carrier = '~'
carrier = short_carrier
def elfify_postfix(postfix):
    """Convert a word in intermediate spelling into Tengwar keystrokes.

    The text is consumed left to right, one unit at a time:

    * a non-alphabetic character becomes the placeholder '`';
    * a letter pair listed in ``doubles`` becomes its single keystroke;
    * a vowel becomes a mark placed after the first keystroke of the unit
      that follows it, or after a carrier when the vowel ends the word or
      is followed by another vowel;
    * a letter listed in ``consonants`` becomes its keystroke;
    * 'x' becomes the two keystrokes ``u'z\\xe6'``.

    The work is done in a single loop rather than one recursive call per
    character, so long tokens cannot exhaust the interpreter's recursion
    limit.

    Args:
        postfix: the text to convert; may be empty.

    Returns:
        The keystroke string ('' for empty input).

    Raises:
        NotImplementedError: for an alphabetic character that is not a
            vowel, not a known consonant, not 'x' and not the start of a
            known double. The message holds the character and the text
            that follows it.
    """
    pieces = []
    pending_vowel = None  # vowel waiting for the keystroke that carries it
    pos = 0
    while pos < len(postfix):
        letter = postfix[pos]
        step = 1
        unit = None

        if not letter.isalpha():
            # TODO: Actually add the appropriate character
            unit = '`'
        else:
            # Check whether we can apply a double.
            for double in doubles:
                if postfix.startswith(double, pos):
                    unit = doubles[double]
                    step = len(double)
                    break

        if unit is None:
            if letter in vowel_series:
                upcoming = postfix[pos + 1:pos + 2]
                if upcoming and upcoming not in vowel_series:
                    # The mark rides on whatever the next unit turns out to
                    # be, so defer it until that unit has been converted.
                    pending_vowel = letter
                    pos += 1
                    continue
                # End of the word, or another vowel follows: use a carrier.
                unit = (carrier
                        + vowel_series[letter][vowels_for_consonants[carrier]])
            elif letter in consonants:
                # TODO: Maybe add a doubler ('") if the next consonant ==
                # the same thing!
                unit = consonants[letter]
            elif letter == 'x':
                unit = u'z\xe6'
            else:
                raise NotImplementedError(
                    "%s, %s" % (letter, postfix[pos + 1:]))

        if pending_vowel is not None:
            # The variant of the mark depends on the keystroke carrying it.
            bearer = unit[0]
            mark = vowel_series[pending_vowel][vowels_for_consonants[bearer]]
            unit = bearer + mark + unit[1:]
            pending_vowel = None

        pieces.append(unit)
        pos += step

    # TODO: Fancy n-bars and w-bars here.
    return ''.join(pieces)
# Public entry point: ``convert`` is another name for ``elfify_start``.
convert = elfify_start
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment