Created
November 26, 2019 19:44
-
-
Save pypt/4357bf9ea68191715b127e8eea46806a to your computer and use it in GitHub Desktop.
lang_id.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import logging | |
from typing import Optional | |
from urllib.parse import urlparse | |
import cld2 | |
# Configure the root logger to emit DEBUG and above. Note that this runs as a
# side effect of importing this module.
logging.basicConfig(level=logging.DEBUG)

# Presumably a placeholder identifier for an undetermined language.
# NOTE(review): not referenced by any function visible in this file -- confirm
# whether code elsewhere relies on it.
UNKNOWN_LANGUAGE_CODE = 'un-UN'
def language_code_for_text(text: str) -> Optional[str]:
    """
    Guess the language of a text input.

    Detection is delegated to CLD2: https://pypi.org/project/cld2-cffi/

    :param text: Text; must be a non-empty string.
    :return: ISO 639-1 language code, e.g. "en", or None if language couldn't be determined.
    :raises AssertionError: If the text is None or empty (only while assertions are enabled).
    """
    assert text is not None, "Text is None."
    assert len(text), "Text is empty."

    try:
        is_reliable, _text_bytes_found, details = cld2.detect(text)
    except Exception as ex:
        # Best effort: a detection failure (e.g. input that CLD2 refuses to process) is reported
        # as "language unknown" instead of being propagated to the caller.
        logging.warning(f"Unable to determine language for text '{text[0:40]}...': {ex}")
        return None

    if not is_reliable:
        logging.warning(f"Language guess is not reliable for text '{text[0:40]}...'")
        # Still stick with it though

    if not details:
        logging.warning(f"Language could not be guessed for text '{text[0:40]}...'")
        return None

    # Guesses are ordered best-first
    best_guess_code = details[0].language_code

    # CLD2 reports an undetermined language as a placeholder detection with the code "un" (see
    # the example output in the cld2-cffi README); that is not a real language code, so treat
    # it the same way as having no guess at all.
    if not best_guess_code or best_guess_code == 'un':
        logging.warning(f"Language could not be guessed for text '{text[0:40]}...'")
        return None

    return best_guess_code
def country_tld_from_url(url: str) -> Optional[str]:
    """
    Extract country TLD from URL.

    The last label of the hostname is returned as-is; it is not checked against a list of
    country codes, so generic TLDs such as "com" are returned too.

    :param url: URL, e.g. "https://www.bbc.co.uk/news/politics/eu-regions/vote2014_sitemap.xml".
    :return: Country TLD of URL without the prefix period, e.g. "uk", or None if there's no TLD.
    """
    if not url:
        return None

    try:
        hostname = urlparse(url).hostname
    except Exception as ex:
        logging.warning(f"Unable to parse URL {url}: {ex}")
        return None

    # urlparse() yields no hostname for inputs without a "//" authority part, e.g.
    # "www.bbc.co.uk/news" or "/just/a/path"; bail out instead of calling .split() on None.
    if not hostname:
        logging.warning(f"No hostname found in URL {url}")
        return None

    # Drop the root-zone dot of fully qualified hostnames ("example.com.") so that the last
    # label is the actual TLD and not an empty string.
    hostname_parts = hostname.rstrip('.').split('.')

    if len(hostname_parts) < 2:
        logging.warning(f"No TLD found in URL {url}")
        return None

    return hostname_parts[-1]
def _same_region(*tlds: str) -> dict:
    """Map every given country TLD to the region of the same name, e.g. "au" -> "AU"."""
    return {tld: tld.upper() for tld in tlds}


# Languages that always map to a single region, whatever the URL hint says:
# lowercased language code -> region subtag.
_FIXED_REGIONS = {
    # Language code == lowercased region
    'de': 'DE',
    'hr': 'HR',
    'is': 'IS',
    'it': 'IT',
    'lv': 'LV',
    'lt': 'LT',
    'hu': 'HU',
    'nl': 'NL',
    'pl': 'PL',
    'ro': 'RO',
    'sk': 'SK',
    'fi': 'FI',
    'tr': 'TR',
    'bg': 'BG',
    'ru': 'RU',
    'th': 'TH',
    # Languages in India
    'gu': 'IN',
    'kn': 'IN',  # Kannada
    'ml': 'IN',
    'mr': 'IN',
    'te': 'IN',
    'hi': 'IN',
    # Everything else
    'af': 'ZA',
    'am': 'ET',
    'hy': 'AM',
    'az': 'AZ',
    'id': 'ID',
    'ms': 'MY',
    'ca': 'ES',
    'cs': 'CZ',
    'da': 'DK',
    'eu': 'ES',
    'fil': 'PH',
    'gl': 'ES',
    'ka': 'GE',
    'zu': 'ZA',
    'jv': 'ID',
    'km': 'KH',
    'lo': 'LA',
    'ne': 'NP',
    'nb': 'NO',
    'si': 'LK',
    'sl': 'SI',  # Slovenia is "SI"; "SL" would be Sierra Leone
    'su': 'ID',
    'sv': 'SE',
    'vi': 'VN',
    'el': 'GR',
    'sr': 'RS',
    'uk': 'UA',
    'he': 'IL',
    'ko': 'KR',
    'ja': 'JP',
}

# Languages with several supported dialects:
# lowercased language code -> (country TLD -> region subtag, fallback region subtag).
_REGIONS_BY_TLD = {
    'bn': (_same_region('in'), 'BD'),
    'en': (
        {
            # British sites live under ".uk" but the region subtag is "GB"
            'uk': 'GB',
            **_same_region('au', 'ca', 'gh', 'in', 'ie', 'ke', 'nz', 'ng', 'ph', 'sg', 'za', 'tz'),
        },
        'US',
    ),
    'es': (
        _same_region(
            'ar', 'bo', 'cl', 'co', 'cr', 'ec', 'sv', 'es', 'us', 'gt',
            'hn', 'mx', 'ni', 'pa', 'py', 'pe', 'pr', 'do', 'uy', 've',
        ),
        'ES',
    ),
    'fr': (_same_region('ca'), 'FR'),
    'pt': (_same_region('br'), 'PT'),
    'sw': (_same_region('tz'), 'KE'),
    'ta': (_same_region('sg', 'lk', 'my'), 'IN'),
    # Fallback -- more Urdu speakers in India than Pakistan
    'ur': (_same_region('pk'), 'IN'),
    # Fallback -- Egyptian Arabic is the most popular dialect
    'ar': (
        _same_region('il', 'jo', 'ae', 'bh', 'dz', 'sa', 'iq', 'kw', 'ma', 'tn', 'om', 'ps', 'qa', 'lb'),
        'EG',
    ),
}

# Chinese does not follow the "<language>-<REGION>" pattern, so complete identifiers are listed:
# lowercased language code -> (country TLD -> identifier, fallback identifier).
_CHINESE_IDENTIFIERS = {
    # Chinese (simplified)
    'zh': ({'hk': 'zh-HK'}, 'zh'),
    'zh-hans': ({'hk': 'zh-HK'}, 'zh'),
    # Chinese (traditional)
    'yue': ({'tw': 'zh-TW'}, 'yue-Hant-HK'),
    'zh-hant': ({'tw': 'zh-TW'}, 'yue-Hant-HK'),
}


def iso_639_1_code_to_bcp_47_identifier(iso_639_1_code: str, url_hint: Optional[str] = None) -> Optional[str]:
    """
    Convert ISO 639-1 language code to BCP-47 identifier.

    Google Cloud requires for us to pass the language as a BCP-47 identifier:

    https://cloud.google.com/speech-to-text/docs/languages

    so we have to do some guessing about the dialect the audio data is going to be in.

    The language code is matched case-insensitively. A handful of codes that are not strictly
    ISO 639-1 ("fil", "yue", "zh-Hans", "zh-Hant") are accepted too.

    :param iso_639_1_code: ISO 639-1 language code, e.g. "en".
    :param url_hint: Optional URL hint to use for guessing the dialect used; only the TLD of its
        hostname is looked at.
    :return: BCP-47 identifier, e.g. "en-US", or None if the identifier can't be determined.
    """
    if not iso_639_1_code:
        logging.warning("ISO 639-1 code is unset.")
        return None

    tld = country_tld_from_url(url_hint) if url_hint else None

    # All lookup tables are keyed by lowercased codes, so e.g. "zh-Hant" has to be looked up as
    # "zh-hant".
    language = iso_639_1_code.lower()

    if language in _FIXED_REGIONS:
        return f"{language}-{_FIXED_REGIONS[language]}"

    if language in _REGIONS_BY_TLD:
        region_by_tld, fallback_region = _REGIONS_BY_TLD[language]
        # A missing TLD (None) is simply not in the table, so it ends up with the fallback
        return f"{language}-{region_by_tld.get(tld, fallback_region)}"

    if language in _CHINESE_IDENTIFIERS:
        identifier_by_tld, fallback_identifier = _CHINESE_IDENTIFIERS[language]
        return identifier_by_tld.get(tld, fallback_identifier)

    return None
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment