Last active
December 11, 2024 00:40
-
-
Save csvoss/e58302f7394a57860c46 to your computer and use it in GitHub Desktop.
English to Tengwar transliterator
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
""" | |
This file converts English text to Tengwar, using my own personal preferences for | |
transliterating Tengwar (as extracted from the Tengwar Textbook). | |
Currently, the output that is created is intended for use with the Tengwar Annatar font | |
and related font families. | |
Warning: this code is extremely messy, as I basically hacked it up over the course | |
of a few hours between midnight and 3am one day. | |
Example usage: | |
>>> from english_to_tengwar import convert | |
>>> convert("This was a triumph. I'm making a note here: huge success!") | |
-- then paste the resulting text into a document rendered in Tengwar Annatar. | |
To run unit tests on this file: | |
$ python -m unittest english_to_tengwar | |
Examples can either yield a single character, or a single character after a carrier. | |
In addition to this, some characters have multiple values, and those depend on the | |
previous values. | |
Special characters: T for theta, D for eth | |
R for pre-vowel r, S and Z for vowel-less s and z | |
Q for rd, L for ld, W for wh, C for ch, K for kh, G for gh, X for sh, H for | |
zh, N for ng | |
""" | |
import re | |
from typing import Dict | |
from unittest import TestCase | |
def dictzip(str1: str, str2: str) -> Dict[str, str]:
    """Build a character-to-character mapping from two equal-length strings.

    The character at each position of ``str1`` becomes a key whose value is
    the character at the same position of ``str2``. If a character occurs
    more than once in ``str1``, its last pairing wins.

    Raises:
        AssertionError: if the two strings differ in length.
    """
    assert len(str1) == len(str2)
    return {key: str2[position] for position, key in enumerate(str1)}
# English has two different pronunciations of 'th' (voiced and voiceless), and
# Tengwar distinguishes between them. TODO: use a library to determine which
# 'th' we're dealing with. In the meantime, voiced 'th' is the rare one, so the
# word lists below enumerate the cases that replace_th() treats as voiced.

# Word beginnings: replace_th() applies these when the word starts with the
# entry, and marks the entry's 'th' as voiced.
voiced_th_prefices = [
    "their",
    "these",
    "those",
    "although",
    "them",
    "thine",
    "thy",
    "thou",
    "there",
]
# Word beginnings with two 'th' pairs where only the second one is voiced.
voiced_th_special_prefices = ["thither"]
# Whole words only: replace_th() compares these against the entire word, so a
# longer word such as 'thank' is not caught by 'than'. These are not prefixes.
voiced_th_solo_prefices = ["that", "this", "than", "they", "thee", "though"]
# Matched anywhere inside a word. Each entry should contain exactly one 'th'.
voiced_th_always_safe = [
    "feather",
    "together",
    "bathing",
    "bathe",
    "father",
    "mother",
    "clothing",
    "clothe",
    "brother",
    "weather",
    "either",
    "gather",
    "other",
    "another",
    "worthy",
    "rather",
    "soothing",
    "soothe",
    "smooth",
    "leather",
    "tether",
    "breathe",
    "breathing",
    "lathe",
    "seethe",
    "seething",
    "scathe",
    "scathing",
    "teethe",
    "teething",
    "loath",
    "loathing",
    "neither",
    "thence",
    "rhythm",
    "slither",
    "southern",
    "bother",
    "altogether",
    "lather",
    "hither",
]
def replace_th(inp):
    """Mark the voiced 'th' sounds of a word by rewriting them as 'TH'.

    The word is checked against the four module-level word lists in turn;
    every 'th' that a list identifies as voiced is upper-cased, and all other
    text is returned unchanged.

    Args:
        inp: the word to scan (tengwar_word passes it already lower-cased).

    Returns:
        The word with its voiced 'th' pairs replaced by 'TH'.
    """
    word = inp

    # Stems that are voiced wherever they occur, even inside a longer word.
    for stem in voiced_th_always_safe:
        word = word.replace(stem, stem.replace("th", "TH"))

    # Entries that only count when they are the entire word.
    if word in voiced_th_solo_prefices:
        word = word.replace("th", "TH")

    # Entries that must start the word; every occurrence of the entry is
    # then rewritten, not just the leading one.
    for stem in voiced_th_prefices:
        if word.startswith(stem):
            word = word.replace(stem, stem.replace("th", "TH"))

    # Entries whose second 'th' is the voiced one: upper-case the first two
    # pairs, then put the first pair back in lower case.
    for stem in voiced_th_special_prefices:
        if word.startswith(stem):
            voiced = stem.replace("th", "TH", 2).replace("TH", "th", 1)
            word = word.replace(stem, voiced)

    return word
# Maps each single-character punctuation or whitespace token to the text that
# is emitted for it. tengwar_token() returns the mapped value verbatim for any
# token found here.
punctuation = {
    ".": "-",
    ",": "\xb7",
    "!": "\xc1",
    "?": "\xc0",
    ";": "\xc3",
    '"': "\xbb",
    "'": "\xb2",
    "_": "·",
    "-": "·",
    "`": "\xb1",
    ":": "-",
    "/": "\u203a",
    "\\": "\u203a",
    # Every kind of opening bracket shares one glyph, and every closing
    # bracket shares another.
    "<": "Œ",
    ">": "œ",
    "[": "Œ",
    "]": "œ",
    "{": "Œ",
    "}": "œ",
    "(": "Œ",
    ")": "œ",
    # NOTE(review): the multi-character values below look like the symbol's
    # English name spelled out in Tengwar (e.g. "&" as "and") -- confirm.
    "@": "1E",
    "#": "9dE1x#",
    "$": "k\xa1",
    "%": "q6R85$1",
    "^": "z7D1R",
    "&": "5#2",
    "*": "\u02c6",
    "=": "\xac",
    "+": "` \xb0",
    "|": "\xbd",
    # Spaces and newlines pass through unchanged.
    " ": " ",
    "\n": "\n",
    "\t": "\xb7-\xb7",
}
def tengwar_start(inp) -> str:
    """Transliterate a passage of English text into Tengwar Annatar keystrokes.

    The text is split into tokens: runs of letters/digits, and single
    punctuation or whitespace characters. Each token is converted with
    tengwar_token() and the results are concatenated. Characters that match
    neither kind of token (for example typographic quotes) are dropped.

    Args:
        inp: the English text to convert.

    Returns:
        The converted text, meant to be rendered in the Tengwar Annatar font.
    """
    # The hyphen is escaped so it is a literal character: written as `'-_` it
    # formed an accidental range from "'" to "_". The tab is listed because
    # the punctuation table has an entry for it; without it tabs were silently
    # discarded and that entry could never be reached.
    tokens = re.findall(
        r"[^\W_]+|[.,!\?;\"'\-_`:<>/\\\[\]\(\){}@#$%^&\*=\+| \n\t]", inp
    )
    return "".join(tengwar_token(token) for token in tokens)
def tengwar_token(item):
    """Convert a single token produced by tengwar_start().

    Args:
        item: either one punctuation/whitespace character or a run of
            letters and digits.

    Returns:
        The punctuation table entry for a punctuation token, the output of
        tengwar_number() for an all-decimal token, and the output of
        tengwar_word() for anything else.
    """
    if item in punctuation:
        return punctuation[item]
    # isdecimal() rather than isdigit(): isdigit() is also true for characters
    # such as superscript digits, which int() rejects with ValueError. Tokens
    # like that now fall through to tengwar_word() instead of crashing.
    if item.isdecimal():
        return tengwar_number(int(item))
    return tengwar_word(item.replace("'", ""))
def tengwar_number(num: int) -> str:
    """Return the placeholder emitted in place of any number.

    Args:
        num: the number to render; currently ignored.

    Returns:
        A fixed run of five short-carrier characters.
    """
    # TODO: implement fancy base-12 Elvish numerals
    placeholder = "`````"
    return placeholder
def tengwar_word(inp) -> str:
    """Transliterate one word into Tengwar Annatar keystrokes.

    The word is lower-cased and normalised into an intermediate spelling
    (voiced 'th' as 'TH', soft g as j, c as s/C/k, pre-vowel r as R, q as k,
    pre-vowel y as Y). A final s and a final e are split off, the remainder is
    converted by tengwar_postfix(), and the split-off endings are re-attached
    as their own glyphs.

    Args:
        inp: the word to convert; an empty string is returned unchanged.

    Returns:
        The keystrokes for the word.
    """
    word = inp.lower()
    if word == "":
        return word

    # 'of' and 'the' each have a dedicated shorthand glyph.
    shorthand = {"of": "W", "the": "@"}
    if word in shorthand:
        return shorthand[word]

    # Detect voiced th, replace with TH
    word = replace_th(word)

    # Soft g (directly before e, i or y) is written as j.
    word = re.sub(r"g(?=[eiy])", "j", word)
    # Soft c (directly before e, i or y) is written as s. A c before h becomes
    # the marker C so that "Ch" is picked up as a double later. Every other c,
    # including one at the end of the word, is a hard k.
    word = re.sub(r"c(?=[eiy])", "s", word)
    word = re.sub(r"c(?=h)", "C", word)
    word = word.replace("c", "k")

    # An r directly before a vowel (or y) takes the pre-vowel form R.
    word = re.sub(r"r(?=[aeiouy])", "R", word)

    # q == k
    word = word.replace("q", "k")

    # A y directly before a vowel is the consonant y (henceforth Y); any other
    # y stays a vowel. Same idea as the r rule above.
    word = re.sub(r"y(?=[aeiou])", "Y", word)

    # Detach a final s, unless it is the only letter or follows a, i, o or u.
    has_trailing_s = (
        len(word) > 1 and word.endswith("s") and word[-2] not in "aiou"
    )
    if has_trailing_s:
        word = word[:-1]

    # Detach a final e when at least three letters remain and the letter
    # before it is not a vowel or y.
    has_trailing_e = (
        len(word) >= 3 and word.endswith("e") and word[-2] not in "aeiouy"
    )
    if has_trailing_e:
        word = word[:-1]

    # Elfification
    output = tengwar_postfix(word) if word else carrier

    # s ("i") and z (",") switch to their alternate glyphs ("8" and "k") when
    # the glyph that follows is not one of the characters in `vowels`. The
    # last glyph has nothing after it and is never switched.
    alternate = {"i": "8", ",": "k"}
    switched = [
        glyph if following in vowels else alternate.get(glyph, glyph)
        for glyph, following in zip(output, output[1:])
    ]
    output = "".join(switched) + output[-1:]

    # Add the ending e if we detached it earlier
    if has_trailing_e:
        output += "O"

    # Add the ending s if we detached it earlier; which hook is used depends
    # on the glyph it attaches to.
    if has_trailing_s:
        hooks = (("7um8k", "\xc5"), ("qwertyo", "\xc6"), ("l9", "\xa5"))
        output += next(
            (hook for bases, hook in hooks if output[-1] in bases), "_"
        )

    return output
# Single-letter consonants of the intermediate spelling -> output glyph.
# The two strings are paired position by position. "-" appears several times
# on the key side, so only its last pairing survives in the resulting dict.
consonants = dictzip("tdnrRhpbfvmwsj--lYkg-z-", "125679qwertyisghjlzxn,.")
# Two-letter combinations that are written with a single glyph.
# tengwar_postfix() tries these before looking at single letters.
doubles = {
    "sh": "d",
    "zh": "f",
    "ch": "a",
    "Ch": "a",
    "ph": "e",
    "kh": "c",
    "gh": "v",
    "wh": "o",
    "ng": "b",
    "rd": "u",
    "ld": "m",
    "th": "3",
    "TH": "4",  # voiced
}
# For each vowel, the four variants of its mark. Which variant is used is
# chosen by the glyph the mark is placed on (see vowels_for_consonants).
vowel_series = {
    "a": "#EDC",
    "e": "$RFV",
    "i": "%TGB",
    "o": "^YHN",
    "u": "&UJM",
    "y": "\xd8\xd9\xda\xdb",
}
# The a/e/i/o/u vowel-mark characters in one string, used for membership
# tests by tengwar_word(). The "y" series is not part of this string.
vowels = "#EDC$RFV%TGB^YHN&UJM"
# Maps an output glyph to the index (0-3) into a vowel_series entry that
# selects the variant placed on that glyph.
# For example, index 0 applied to "a" yields "#".
vowels_for_consonants = {
    "`": 3,
    "~": 3,
    "1": 1,
    "q": 1,
    "a": 2,
    "z": 2,
    "2": 0,
    "w": 0,
    "s": 0,
    "x": 0,
    "3": 2,
    "e": 2,
    "d": 1,
    "c": 1,
    "4": 0,
    "r": 0,
    "f": 0,
    "v": 0,
    "5": 0,
    "t": 0,
    "g": 0,
    "b": 0,
    "6": 1,
    "y": 1,
    "h": 2,
    "n": 2,
    "7": 2,
    "u": 2,
    "j": 0,
    "m": 0,
    "i": 2,
    ",": 2,
    "9": 3,
    "o": 0,
    "l": 2,
    ".": 2,
}
# Carrier glyphs hold a vowel mark when there is no consonant to put it on.
short_carrier = "`"
carrier = short_carrier  # the carrier used throughout this module
long_carrier = "~"
def tengwar_postfix(postfix):
    """Recursively convert the rest of a normalised word into glyphs.

    One unit is consumed from the front of ``postfix`` per call: a
    non-alphabetic character, a two-letter double, a vowel, a consonant or
    the letter x. The remainder is converted by a recursive call.

    Args:
        postfix: the not-yet-converted tail of the word, in the intermediate
            spelling produced by tengwar_word().

    Returns:
        The glyphs for ``postfix``; an empty string for empty input.

    Raises:
        NotImplementedError: for an alphabetic character that has no entry in
            any of the lookup tables.
    """
    if not postfix:
        return ""

    head, tail = postfix[0], postfix[1:]

    # TODO: Actually add the appropriate character
    if not head.isalpha():
        return "`" + tengwar_postfix(tail)

    # Check whether we can apply a double -- if so, apply and recurse
    for pair, glyph in doubles.items():
        if postfix.startswith(pair):
            return glyph + tengwar_postfix(postfix[len(pair):])

    # A vowel is written as a mark on the glyph that follows it, so that
    # glyph is emitted first and the mark second.
    if head in vowel_series:
        if not tail or tail[0] in vowel_series:
            # End of the word, or another vowel comes next: nothing to put
            # the mark on, so it sits on a carrier.
            mark = vowel_series[head][vowels_for_consonants[carrier]]
            return carrier + mark + tengwar_postfix(tail)
        # Otherwise convert the remainder first, then slot the mark in right
        # after its leading glyph.
        converted = tengwar_postfix(tail)
        base = converted[0]
        mark = vowel_series[head][vowels_for_consonants[base]]
        return base + mark + converted[1:]

    # If it's a consonant, add it!
    # TODO: Maybe add a doubler ('") if the next consonant == the same thing!
    if head in consonants:
        return consonants[head] + tengwar_postfix(tail)

    if head == "x":
        return "z\xe6" + tengwar_postfix(tail)

    # TODO: Fancy n-bars and w-bars here.
    # Otherwise, raise an error!
    raise NotImplementedError("%s, %s" % (head, tail))
# Public entry point: `convert` is an alias for tengwar_start, matching the
# usage shown in the module docstring.
convert = tengwar_start
class TengwarTest(TestCase):
    """Regression test that checks convert() against a known-good sample text."""

    # Raise unittest's cap on the length of the diff shown on failure.
    maxDiff = 10000

    def test_noop(self):
        # The fixture holds pairs separated by a blank line; each pair is an
        # English line followed by the expected output line.
        for pair in blog_post_for_unittest.split("\n\n"):
            first, second = pair.strip().split("\n")
            expected = second.strip()
            # NOTE(review): both arguments to replace() look like a single
            # space here -- confirm against the original file whether one of
            # them is a different whitespace character.
            actual = convert(first).strip().replace(" ", " ")
            self.assertEqual(expected, actual)
blog_post_for_unittest = r""" | |
Transliterating Tengwar | |
175#8j1T7F1Eb% 1b$y6E | |
Tengwar is a writing system invented by J.R.R. Tolkien for use by the elves of Middle-Earth. Lately, I’ve learned how to write in Tengwar – not by learning any Elvish language, but by learning how to transliterate English into Tengwar using the instructions found in the Tengwar Textbook. | |
1b$y6E iG `C y71Tb% 88Ú1t$ 5%r5$12$ w`Û s-6-6- 1j^z`B5$ e6Y iJO w`Û @ j$rO_ W t2%2jO·`V6E3- j1Ej$`Û· `Br`V j`V6E52$ 9yY 1`N y71TO 5% 1b$y6E 51Y w`Û j`V6E5b% 5#`Û j$rdT jb#`Ms#O· w1U w`Û j`V6E5b% 9yY 1`N 175#8j1T7F1EO b$jdT 5%1`N 1b$y6E iJb% @ 5%817zJ1`B5^_ e`N5&2 5% @ 1b$y6E 1zFæ1w`NzH- | |
I’ve found this writing system to be useful for writing down small notes-to-self, and I’ve become quite good at writing it. | |
`Br`V e`N5&2 4iG y71Tb% 88Ú1t$ 1`N w`V iJeFj& e6Y y71Tb% 2yY5 8tj#j 51YO_·1`N·8j$e· 5#2 `Br`V wzFt^O z`M1TO x`N2^ 1E y71Tb% 1T- | |
Problem is, I’m still no good at reading Tengwar. Writing characters down on a piece of paper feels fluid and easy, but once I take a step back and look at the page, it’s incomprehensible at a glance. I have to sound the words out, character by character, if I want to read what I have written. | |
q7w^jt$ iG· `Bt 81j%j 5`N x`N2^ 1E 7`V2#b% 1b$y6E- y71Tb% a7DzD16R_ 2yY5 5^ `C q`BiFO W qqE6R e`Vj$_ ej`M2% 5#2 `ViD`Û· w1U 5^iO `B 1zDO `C 81qR wzDz 5#2 j`NzH 1E @ qs#O· 1Ti 5%zt^q79V5$8w%jO 1E `C xj5#iO- `B 9r#O 1`N 8`N5&2 @ yuH_ `N1U· a7DzD16R w`Û a7DzD16R· eG `B y5#1 1`N 7`V2# o1E `B 9r#O y71T15$- | |
However, Tengwar is incredibly pretty. I want to get more practice reading it. What if I could read whatever I want with this writing system? If I only had a script that could convert English text into readable Tengwar for me! | |
9yYr$6R· 1b$y6E iG 5%z72$w%j`Û q71R1`Û- `B y5#1 1`N s1R t7HO q7zD1iGO 7`V2#b% 1T- o1E eG `B z`Nm& 7`V2# o1Er$6R `B y5#1 y3G 4iG y71Tb% 88Ú1t$À eG `B 5^j`Û 92# `C 8z7qT1 41E z`Nm& z5^r6R1 b$jdT 1zFæ1 5%1`N 7`V2#w#jO 1b$y6E e6Y t`VÁ | |
As a glance through the Tengwar Textbook will demonstrate, Tengwar is pretty complicated. There isn’t a single standard way to write in English using Tengwar: there are a variety of “modes”, each of which has a different set of rules. I have my own personal way of writing in Tengwar that combines some features of each of those modes that I like. | |
iD `C xj5#iO 37`Nv& @ 1b$y6E 1zFæ1w`NzH yj%j 2t$5^8171EO· 1b$y6E iG q71R1`Û zt^qjzG1E2$- 47FO iG51 `C 8b%jO 815#2uD y`C`Û 1`N y71TO 5% b$jdT iJb% 1b$y6E- 47FO 7DO `C r7D`B1R`Û W t2^O_· `VaD W oaG 9iD `C 2eGe7F5$1 81R W 7j&O_- `B 9r#O t`Û yY5 q6R85^j# y`C`Û W y71Tb% 5% 1b$y6E 41E zt^w5%O_ 8t^O e`V1E7JO_ W `VaD W 4iHO t2^O_ 41E `B jzGO- | |
Even though there are already some scripts around the Internet that will claim to transliterate English to Tengwar, they don’t necessarily follow my mode of writing, or even a standard mode. Thus I decided, one evening, to write my own script. | |
r$5$ 4`Nv& 47FO 7DO j#7`V2#`Û 8t^O 8z7qT1_ 7D`N5&2 @ 5%16R51R 41E yj%j zj`Ct% 1`N 175#8j1T7F1EO b$jdT 1`N 1b$y6E· 4`V`Û 25^1 5iFiF87Dj%`Û ej^jyY t`Û t2^O W y71Tb%· 6Y r$5$ `C 815#2uD t2^O- 3iJ `B 2iF2%2$· 5^O r$5$b%· 1`N y71TO t`Û yY5 8z7qT1- | |
I like the look of the Tengwar Annatar font, so the script would convert English text to the characters needed to render Tengwar text in that font. Eventually, I may extend it so that I can also output Tengwar using TengwarScript, a TeX package. Writing a script with Tengwar Annatar in mind is the more difficult task of the two because of the way it typesets vowels (tehtar), so adding support for TengwarScript onto the existing script would be easy. | |
`B jzGO @ j`NzH W @ 1b$y6E 5#51E6E e5^1· 8`N @ 8z7qT1 y`Nm& z5^r6R1 b$jdT 1zFæ1 1`N @ a7DzD16R_ 5`V2$2$ 1`N 75$26R 1b$y6E 1zFæ1 5% 41E e5^1- r$5$1`Mj#j`Û· `B t`C`Û zFæ15$2 1T 8`N 41E `B z5# j#8`N `N1Uq1U 1b$y6E iJb% 1b$y6E8z7qT1· `C 1zFæ qzDzs#O- y71Tb% `C 8z7qT1 y3G 1b$y6E 5#51E6E 5% t5%2 iG @ t7HO 2eGezGj&1 1iDz W @ 1y`N wzF`CiJO W @ y`C`Û 1T 1qÙiF1R_ ryYj$_ Œ19V16Eœ· 8`N 2#2b% 8qUq6Y1 e6Y 1b$y6E8z7qT1 5^1`N @ zFæiG1b% 8z7qT1 y`Nm& w`V `ViD`Û- | |
As I built this thing and debugged the little errors and inconsistencies that I noticed here and there, I kept track of what it output as the result for the sentence (“This was a triumph. I’m making a note here: huge success!”) that I used for testing. Tengwar has various little complexities – the R-rule, a distinction between voiced and voiceless ‘th’, double consonants like ‘ch’ and ‘ph’ and ‘ng’ and ‘rd’, and vowel carriers – that make correct transliteration more difficult. When put together, the history of my testing string provides a visualization of my progress against these complexities as I improved the script. | |
iD `B w`Mj%1 4iG 3b% 5#2 2w$x&s2$ @ j1T1jO 6R76Y_ 5#2 5%z5^8iG15$8`B`V_ 41E `B 51YiG2$ 97FO 5#2 47FO· `B zqR1 17zDz W o1E 1T `N1Uq1U iD @ 7iFj&1 e6Y @ 85$15$iO Œ4iG yiD `C 17`Bt&e- `Bt tzDb% `C 51YO 97FO- 9s&O 8zJ8iF_Áœ 41E `B iJ2$ e6Y 1iF1b%- 1b$y6E 9iD r7D`B`NiJ j1T1jO zt^qjzFæ1T`B`V_ @ 6·7j&O· `C 2iG15%z1`B5^ w1Ry`V5$ r`NiG2$ 5#2 r`NiGj$iF_ 3· 2`Nw&jO z5^85^5#1_ jzGO a 5#2 e 5#2 b 5#2 u· 5#2 ryYj$ z6E7`B6R_ 41E tzDO z6Y7zF1 175#8j1T7F1E`B5^ t7HO 2eGezGj&1- o5$ q1U 1s^4$6R· @ 9iG17H`Û W t`Û 1iF1b% 817b% q7r^2%O_ `C riG`Mj#,G1E`B5^ W t`Û q7x^7iF_ x#`C5%81 4iFO zt^qjzFæ1T`B`V_ iD `B t%q7r^2$ @ 8z7qT1- | |
With the finished product, now I can take my favorite poems and stories, pass them through the transliterator, render the resulting text using the Tengwar Annatar font, and send that document to my Kindle! There are still some little details which could be improved upon, but I’m pleased with the result so far. | |
y3G @ e5%dT2$ q72^zJ1· 5yY `B z5# 1zDO t`Û er#7H1TO q`Nt$_ 5#2 817H`B`V_· qiD_ 4t$ 37`Nv& @ 175#8j1T7F1E6Y· 75$26R @ 7iFj&1b% 1zFæ1 iJb% @ 1b$y6E 5#51E6E e5^1· 5#2 85$2 41E 2zHt&5$1 1`N t`Û z5%2jOÁ 47FO 7DO 81j%j 8t^O j1T1jO 21R`Cj%_ oaG z`Nm& w`V t%q7r^2$ qU5^· w1U `Bt qj`ViD2$ y3G @ 7iFj&1 8`N e6E- | |
(english_to_tengwar.py on GitHub Gist) | |
Œb$jdT·1`N·1b$y6E-q`Û 5^ s3Gw& siG1œ | |
""" |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment