# Created
# May 2, 2024 04:51
# -
# -
# Save algonacci/22bfa346ffcfb0a3d251ec963fb21c25 to your computer and use it in GitHub Desktop.
# Transliterate from Arabic to Indonesian
# This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
# Learn more about bidirectional Unicode characters
# Arabic base letters -> Latin rendering used by arabic_to_latin().
# The table is lossy: several letters share one Latin output
# (e.g. three different letters all become 'h').
# Note that the entry for 'ع' already carries a vowel ("'a").
MAPPING = {
    'ا': 'a', 'أ': 'a', 'إ': 'i', 'آ': 'a', 'ب': 'b', 'ت': 't', 'ث': 'th', 'ج': 'j', 'ح': 'h',
    'خ': 'kh', 'د': 'd', 'ذ': 'dh', 'ر': 'r', 'ز': 'z', 'س': 's', 'ش': 'sh', 'ص': 's',
    'ض': 'd', 'ط': 't', 'ظ': 'z', 'ع': "'a", 'غ': 'gh', 'ف': 'f', 'ق': 'q', 'ك': 'k',
    'ل': 'l', 'م': 'm', 'ن': 'n', 'ه': 'h', 'و': 'w', 'ي': 'y', 'ئ': 'y', 'ى': 'a',
    'ة': 'h', 'ؤ': 'w',
}

# Combining marks (harakat) -> Latin text appended after the preceding letter.
# Sukun and Shadda contribute no text of their own; arabic_to_latin()
# treats Shadda specially by repeating the previous letter.
VOWELS = {
    '\u064E': 'a',   # Fatha
    '\u0650': 'i',   # Kasra
    '\u064F': 'u',   # Damma
    '\u064B': 'an',  # Fathatan
    '\u064D': 'in',  # Kasratan
    '\u064C': 'un',  # Dammatan
    '\u0652': '',    # Sukun
    '\u0651': '',    # Shadda
}

# Characters that end a word; each one is emitted as a single ASCII space.
WORD_SEPARATORS = {'\u0020', '\u00A0'}  # Space and No-Break Space
def arabic_to_latin(text: str) -> str:
    """Transliterate Arabic text into lowercase Latin characters.

    Each character of ``text`` is handled in order:

    * a character in ``WORD_SEPARATORS`` becomes a single space;
    * a letter in ``MAPPING`` is replaced by its Latin rendering, except
      that any alif form following a non-vowel output character is always
      written as ``'a'``;
    * a mark in ``VOWELS`` is written only when the output so far ends in
      something other than ``a``/``i``/``o``/``u`` (so a vowel is never
      stacked directly on another vowel);
    * a shadda repeats the Latin rendering of the character immediately
      before it in ``text``;
    * every other character is dropped.

    NOTE: the shadda doubling only triggers when the shadda directly
    follows a base letter in ``text``. If another mark sits between the
    letter and the shadda, the letter is not doubled.

    Args:
        text: The Arabic string to transliterate.

    Returns:
        The Latin transliteration; an empty string for empty input.
    """
    latin_vowels = 'aiou'
    alif_forms = '\u0627\u0623\u0625\u0622'
    shadda = '\u0651'

    pieces = []
    # Last Latin character emitted so far; '' until something is emitted.
    tail = ''

    for index, char in enumerate(text):
        if char in WORD_SEPARATORS:
            piece = ' '
        else:
            piece = ''
            if char in MAPPING:
                piece = MAPPING[char]
                # An alif form after a consonant (or a space) is a plain 'a',
                # whatever its own table entry says.
                if tail and tail not in latin_vowels and char in alif_forms:
                    piece = 'a'
            elif char in VOWELS:
                # Only add a vowel mark when the output does not already
                # end in a vowel.
                if tail and tail not in latin_vowels:
                    piece = VOWELS[char]

            # Shadda doubles the letter that immediately precedes it.
            if char == shadda and index > 0 and text[index - 1] in MAPPING:
                piece += MAPPING[text[index - 1]]

        if piece:
            pieces.append(piece)
            tail = piece[-1]

    # Join once at the end instead of growing a string with += in the loop.
    return ''.join(pieces)
# Example usage: transliterate a fully vocalised greeting and print it.
arabic_text = "اَلسَّلَامُ عَلَيْكُمْ"
latin_text = arabic_to_latin(arabic_text)
print("Latin:", latin_text)
# Output:
# Latin: alsalaamu 'alaykum
# Sign up for free
# to join this conversation on GitHub.
# Already have an account?
# Sign in to comment