Created
December 13, 2018 23:52
-
-
Save palimondo/ae9f507735314e508a7aee739f1be6b9 to your computer and use it in GitHub Desktop.
String Benchmarking Corpus UDHR
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/// Text corpus in various scripts and languages.
///
/// Every entry is Article 1 of the Universal Declaration of Human Rights;
/// the raw value of each case is the article text in that language.
/// Individual entries are documented in the format:
///     Language[, Variant] - Script
///
/// Statistics on language use are from Ethnologue (2018), as cited on
/// https://en.wikipedia.org/wiki/List_of_languages_by_total_number_of_speakers
///  * T  - Estimated Total Speakers (not reliable)
///  * L1 - First Language Speakers
///
/// Case names are language codes. For macrolanguages, the most used member
/// language is chosen to represent the group. For example, Mandarin Chinese
/// (`cmn`) stands in for the Chinese macrolanguage in the corpus.
///
/// Most literals end their source lines with `\`, which joins them without
/// inserting a newline, so those raw values are single-line strings. Only
/// `emo` contains real line breaks.
enum UniversalDeclaration: String, CaseIterable {
  /// English - Latn (Basic Latin).
  /// T: 1.121 billion, L1: 378 million
  case eng = """
    All human beings are born free and equal in dignity and rights. \
    They are endowed with reason and conscience and should act towards \
    one another in a spirit of brotherhood.
    """
  /// Chinese, Mandarin (Simplified) - Hans.
  /// T: 1.107 billion, L1: 908 million
  case cmn = """
    人人生而自由,在尊严和权利上一律平等。他们赋有理性和良心,并应以兄弟关系的精神相对待。
    """
  /// Hindi - Deva
  /// T: 534 million, L1: 260 million
  case hin = """
    सभी मनुष्यों को गौरव और अधिकारों के मामले में जन्मजात स्वतन्त्रता और समानता प्राप्त है । \
    उन्हें बुद्धि और अन्तरात्मा की देन प्राप्त है और परस्पर उन्हें भाईचारे के भाव से बर्ताव करना चाहिए ।
    """
  /// Spanish - Latn (Latin-1 Supplement)
  /// T: 512 million, L1: 442 million
  case spa = """
    Todos los seres humanos nacen libres e iguales en dignidad y derechos y, \
    dotados como están de razón y conciencia, deben comportarse fraternalmente \
    los unos con los otros.
    """
  /// Arabic - Arab
  /// T: 315 million
  case arb = """
    يولد جميع الناس أحرارًا متساوين في الكرامة والحقوق. \
    وقد وهبوا عقلاً وضميرًا وعليهم أن يعامل بعضهم بعضًا بروح الإخاء.
    """
  /// French - Latn (Latin-1 Supplement)
  /// T: 285 million, L1: 77 million
  case fra = """
    Tous les êtres humains naissent libres et égaux en dignité et en droits. \
    Ils sont doués de raison et de conscience et doivent agir les uns envers \
    les autres dans un esprit de fraternité.
    """
  /// Malay - Latn
  /// T: 281 million, L1: 77 million
  case zlm = """
    Semua manusia dilahirkan bebas dan samarata dari segi kemuliaan dan hak-hak. \
    Mereka mempunyai pemikiran dan perasaan hati dan hendaklah bertindak di \
    antara satu sama lain dengan semangat persaudaraan.
    """
  /// Russian - Cyrl
  /// T: 264 million, L1: 154 million
  case rus = """
    Все люди рождаются свободными и равными в своем достоинстве и правах. \
    Они наделены разумом и совестью и должны поступать в отношении друг друга \
    в духе братства.
    """
  /// Bengali - Beng
  /// T: 261 million, L1: 242 million
  case ben = """
    সমস্ত মানুষ স্বাধীনভাবে সমান মর্যাদা এবং অধিকার নিয়ে জন্মগ্রহণ করে। \
    তাঁদের বিবেক এবং বুদ্ধি আছে; সুতরাং সকলেরই একে অপরের প্রতি ভ্রাতৃত্বসুলভ মনোভাব নিয়ে আচরণ করা উচিত।
    """
  /// Portuguese (Brazil) - Latn (Latin-1 Supplement)
  /// T: 237 million, L1: 222 million
  case por = """
    Todos os seres humanos nascem livres e iguais em dignidade e direitos. \
    São dotados de razão e consciência e devem agir em relação uns aos outros \
    com espírito de fraternidade.
    """
  /// Urdu - Arab
  /// T: 163 million, L1: 70 million
  case urd = """
    تمام انسان آزاد اور حقوق و عزت کے اعتبار سے برابر پیدا ہوئے ہیں۔ \
    انہیں ضمیر اور عقل ودیعت ہوئی ہے۔ اس لئے انہیں ایک دوسرے کے ساتھ بھائی \
    چارے کا سلوک کرنا چاہیئے۔
    """
  /// German - Latn
  /// T: 132 million, L1: 76 million
  case deu = """
    Alle Menschen sind frei und gleich an Würde und Rechten geboren. \
    Sie sind mit Vernunft und Gewissen begabt und sollen einander im Geist \
    der Brüderlichkeit begegnen.
    """
  /// Japanese - Jpan
  /// T: 128 million, L1: 128 million
  case jpn = """
    すべての人間は、生まれながらにして自由であり、かつ、尊厳と権利とについて平等である。\
    人間は、理性と良心とを授けられており、互いに同胞の精神をもって行動しなければならない。
    """
  /// Panjabi, Western (Lahnda) - Arab
  /// T: 119 million, L1: 119 million
  case pnb = """
    سارے انسان آزاد تے حقوق تے عزت دے لحاظ نال برابر پیدا ہوندے نیں ۔ ۔ \
    اوہ عقل سمجھ تے چنگے مندے دی پچھان تے احساس رکھدے نے ایس واسطے \
    اوہناں نوں اک دوجے نال بھائی چارے والا سلوک کرنا چاہی دا اے ۔ ۔
    """
  /// Panjabi, Eastern - Guru
  /// T: 29 million, L1: 29 million
  case pan = """
    ਸਾਰਾ ਮਨੁੱਖੀ ਪਰਿਵਾਰ ਆਪਣੀ ਮਹਿਮਾ, ਸ਼ਾਨ ਅਤੇ ਹੱਕਾਂ ਦੇ ਪੱਖੋਂ ਜਨਮ ਤੋਂ ਹੀ ਆਜ਼ਾਦ ਹੈ ਅਤੇ ਸੁਤੇ ਸਿੱਧ ਸਾਰੇ ਲੋਕ ਬਰਾਬਰ ਹਨ । \
    ਉਨ੍ਹਾਂ ਸਭਨਾ ਨੂੰ ਤਰਕ ਅਤੇ ਜ਼ਮੀਰ ਦੀ ਸੌਗਾਤ ਮਿਲੀ ਹੋਈ ਹੈ ਅਤੇ ਉਨ੍ਹਾਂ ਨੂੰ ਭਰਾਤਰੀਭਾਵ ਦੀ ਭਾਵਨਾ ਰਖਦਿਆਂ ਆਪਸ ਵਿਚ ਵਿਚਰਣਾ ਚਾਹੀਦਾ ਹੈ ।
    """
  /// Farsi - Arab
  /// T: 110 million, L1: 60 million
  case pes = """
    تمام افراد بشر آزاد بدنیا میایند و از لحاظ حیثیت و حقوق با هم برابرند. \
    همه دارای عقل و وجدان میباشند و باید نسبت بیکدیگر با روح برادری رفتار کنند.
    """
  /// Swahili - Latn
  /// T: ? million, L1: ? million
  /// TODO: speaker figures were never filled in; take them from the
  /// Ethnologue (2018) list cited in the type's documentation.
  case swh = """
    Watu wote wamezaliwa huru, hadhi na haki zao ni sawa. \
    Wote wamejaliwa akili na dhamiri, hivyo yapasa watendeane kindugu.
    """
  /// Javanese - Java
  /// T: 84 million, L1: 84 million
  ///
  /// Each sentence is followed by an explicit U+200B (zero width space),
  /// written as an escape so that it stays visible in the source.
  case jav = """
    ꧋ꦱꦧꦼꦤ꧀ꦲꦸꦮꦺꦴꦁꦏꦭꦲꦶꦂꦫꦏꦺꦏꦤ꧀ꦛꦶꦩꦂꦢꦶꦏꦭꦤ꧀ꦢꦂꦧꦺꦩꦂꦠꦧꦠ꧀ꦭꦤ꧀ꦲꦏ꧀ꦲꦏ꧀ꦏꦁꦥꦝ꧉\u{200B}\
    ꦏꦧꦺꦃꦥꦶꦤꦫꦶꦁꦔꦤ꧀ꦲꦏꦭ꧀ꦭꦤ꧀ꦏꦭ꧀ꦧꦸꦱꦂꦠꦏꦲꦗꦧ꧀ꦥꦱꦿꦮꦸꦁꦔꦤ꧀ꦲꦁꦒꦺꦴꦤ꧀ꦤꦺꦩꦼꦩꦶꦠꦿꦤ꧀ꦱꦶꦗꦶꦭꦤ꧀ꦱꦶꦗꦶꦤꦺꦏꦤ꧀ꦛꦶꦗꦶꦮꦺꦴꦱꦸꦩꦢꦸꦭꦸꦂ꧉\u{200B}
    """
  /// Telugu - Telu
  /// T: 80 million, L1: 75 million
  case tel = """
    ప్రతిపత్తిస్వత్వముల విషయమున మానవులెల్లరును జన్మతః స్వతంత్రులును సమానులును నగుదురు. \
    వారు వివేచన-అంతఃకరణ సంపన్నులగుటచే పరస్పరము భ్రాతృభావముతో వర్తింపవలయును.
    """
  /// Turkish - Latn (Extended-A)
  /// T: 79 million, L1: 75 million
  case tur = """
    Bütün insanlar hür, haysiyet ve haklar bakımından eşit doğarlar. \
    Akıl ve vicdana sahiptirler ve birbirlerine karşı kardeşlik zihniyeti \
    ile hareket etmelidirler.
    """
  /// Korean - Hang
  /// T: 77 million, L1: 77 million
  case kor = """
    모든 인간은 태어날 때부터 자유로우며 그 존엄과 권리에 있어 동등하다. \
    인간은 천부적으로 이성과 양심을 부여받았으며 서로 형제애의 정신으로 행동하여야 한다.
    """
  /// Marathi - Deva
  /// T: 75 million, L1: 72 million
  case mar = """
    सर्व मानवी व्यक्ति जन्मतःच स्वतंत्र आहेत व त्यांना समान प्रतिष्ठा व समान अधिकार आहेत. \
    त्यांना विचारशक्ति व सदसविद्वेकबुद्धि लाभलेली आहे. व त्यांनी एकमेकांशी बंधुत्याच्या भावनेने आचरण करावे.
    """
  /// Tamil - Taml
  /// T: 75 million, L1: 67 million
  case tam = """
    மனிதப் பிறிவியினர் சகலரும் சுதந்திரமாகவே பிறக்கின்றனர்; அவர்கள் மதிப்பிலும், உரிமைகளிலும் \
    சமமானவர்கள், அவர்கள் நியாயத்தையும் மனச்சாட்சியையும் இயற்பண்பாகப் பெற்றவர்கள். \
    அவர்கள் ஒருவருடனொருவர் சகோதர உணர்வுப் பாங்கில் நடந்துகொள்ளல் வேண்டும்.
    """
  /// Vietnamese - Latn
  /// T: 68 million, L1: 68 million
  case vie = """
    Tất cả mọi người sinh ra đều được tự do và bình đẳng về nhân phẩm và quyền. \
    Mọi con người đều được tạo hoá ban cho lý trí và lương tâm và cần phải đối \
    xử với nhau trong tình bằng hữu.
    """
  /// Vietnamese - Hani
  /// Same language as `vie`, written in Han-derived characters; no separate
  /// speaker statistics are recorded for this variant.
  case vieh = """
    畢哿每𠊛生𠚢調得自由吧平等𧗱人品吧權。每𡥵𠊛調得造化頒朱理智吧良心吧勤沛對處𢭲膮𥪝情朋友。
    """
  /// Italian - Latn
  /// T: 68 million, L1: 65 million
  case ita = """
    Tutti gli esseri umani nascono liberi ed eguali in dignità e diritti. \
    Essi sono dotati di ragione e di coscienza e devono agire gli uni verso \
    gli altri in spirito di fratellanza.
    """
  /// Hausa - Latn
  /// T: 63 million, L1: 44 million
  case hau = """
    Su dai ƴan‐adam, ana haifuwarsu ne duka ƴantattu, kuma kowannensu na da \
    mutunci da hakkoki daidai da na kowa. Suna da hankali da tunani, saboda \
    haka duk abin da za su aikata wa juna, ya kamata su yi shi a cikin ƴan‐uwanci.
    """
  /// Thai - Thai
  /// T: ? million, L1: ? million
  /// TODO: speaker figures were never filled in; take them from the
  /// Ethnologue (2018) list cited in the type's documentation.
  ///
  /// NOTE(review): the text contains a bracketed insertion
  /// (`[เกียรติศักดิ์]`) that looks like an editorial correction carried over
  /// from the source document. It is kept verbatim so that measured sizes do
  /// not change - confirm whether it is intended to be part of the corpus.
  case tha = """
    มนุษย์ทั้งหลายเกิดมามีอิสระและเสมอภาคกันในเกียรติศักด[เกียรติศักดิ์]และสิทธิ \
    ต่างมีเหตุผลและมโนธรรม และควรปฏิบัติต่อกันด้วยเจตนารมณ์แห่งภราดรภาพ
    """
  // Template for adding a new entry (plain `//` comments so that the text
  // does not become part of the documentation of the case that follows):
  //   /// Lang - Script
  //   /// T: million, L1: million
  //   case xxx = """
  //     """
  /// Pictorial Math - Emoji+Math
  /// T: 42?, L1: 1?
  ///
  /// The only entry whose raw value spans several lines. The `|` on the
  /// third line is part of the text.
  case emo = """
    ∀👤 ∈ 👥,🤰➡️👶: 👤 ∈ 🗽
    👤 ≡ 👤: {🤝,📜}
    👥 = {👤 | 🧠, 👼🤔👿}
    👤∼👤: 👬👻
    """
}
// Number of entries in the corpus.
print(UniversalDeclaration.allCases.count)

// For every entry, the length of its text in each of String's views:
// characters, Unicode scalars, UTF-16 code units and UTF-8 code units.
let stats: [(lang: String, chars: Int, scalars: Int, utf16: Int, utf8: Int)] =
  UniversalDeclaration.allCases.map { article1 in
    let text: String = article1.rawValue
    return (lang: String(describing: article1),
            chars: text.count,
            scalars: text.unicodeScalars.count,
            utf16: text.utf16.count,
            utf8: text.utf8.count)
  }

// Print the statistics as a pipe-separated table: header row, separator
// row, then one row per entry.
print("lang | char | scal | utf16 | utf8")
print(repeatElement("---", count: 5).joined(separator: "|"))
print(stats
  .map { "\($0.lang) | \($0.chars) | \($0.scalars) | \($0.utf16) | \($0.utf8)" }
  .joined(separator: "\n")
)
//print(stats.map({String(describing: $0)}).joined(separator: "\n"))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment