Created
July 2, 2020 20:59
-
-
Save piegu/d320edd105537b53a95cae31a266998f to your computer and use it in GitHub Desktop.
English tokenizer applied to a text in 3 languages — from Byte-Level-BPE_universal_tokenizer_but.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# English pre-trained tokenizer on a text in 3 languages (en, pt, fr).
# The same sentence (about the architect Soufflot) in English, Portuguese
# and French, to compare how an English-trained BPE tokenizer handles each.
text_en = 'Jacques-Germain Soufflot (Irancy, July 22, 1713 - Paris, August 29, 1780) was a French architect, initiator of the architectural style of Neoclassicism.'
text_pt = 'Jacques-Germain Soufflot (Irancy, 22 de julho de 1713 — Paris, 29 de agosto de 1780) foi um arquitecto francês, iniciador do estilo arquitectónico do Neoclassicismo.'
text_fr = 'Jacques-Germain Soufflot (Irancy, 22 juillet 1713 - Paris, 29 août 1780) était un architecte français, initiateur du style architectural du néoclassicisme.'
langs = ['en', 'pt', 'fr']
texts = [text_en, text_pt, text_fr]

# Print each text with its language code.
# Fix: zip(*[langs, texts]) is an obfuscated spelling of zip(langs, texts).
# NOTE(review): TitledStr is a display helper defined elsewhere in the notebook.
for lang, text in zip(langs, texts):
    print(f'({lang}) {TitledStr(text)}\n')
# Number and list of "classical" tokens (i.e. tokens separated by blanks).
# Fix: the original counted with text.split() but listed with text.split(" ");
# the two disagree when a text contains consecutive spaces — use one split
# for both so the printed count always matches the printed list.
for lang, text in zip(langs, texts):
    words = text.split()
    print(f'({lang} - {len(words)} tokens) {TitledStr(words)}\n')
# Number and list of sub-word tokens after tokenization by the imported
# BPE GPT2TokenizerFast (trained on an English corpus — so non-English
# texts are expected to fragment into more tokens).
# Fix: zip(*[langs, texts]) simplified to the idiomatic zip(langs, texts).
for lang, text in zip(langs, texts):
    toks = tokenizer_en.tokenize(text)
    print(f'({lang} - {len(toks)} tokens) {TitledStr(toks)}\n')
# Number and list of token ids after tokenization + numericalization by the
# imported BPE GPT2TokenizerFast (trained on an English corpus).
# Fix: zip(*[langs, texts]) simplified to the idiomatic zip(langs, texts).
for lang, text in zip(langs, texts):
    toks_ids = tokenizer_en.encode(text)
    print(f'({lang} - {len(toks_ids)} tokens) {TitledStr(toks_ids)}\n')
# Round-trip check: encode each text to ids, then decode back to a string
# to confirm the tokenizer is lossless on these texts.
# Fix: zip(*[langs, texts]) simplified to the idiomatic zip(langs, texts).
for lang, text in zip(langs, texts):
    toks_ids = tokenizer_en.encode(text)
    text_decoded = tokenizer_en.decode(toks_ids)
    print(f'({lang}) {TitledStr(text_decoded)}\n')
# Graph: grouped bar chart comparing, per language, the token count from a
# plain whitespace split vs. from the English GPT2TokenizerFast.
# source: https://matplotlib.org/3.2.1/gallery/lines_bars_and_markers/barchart.html#sphx-glr-gallery-lines-bars-and-markers-barchart-py
text_split = []
toks_split = []
for text in texts:
    text_split.append(len(text.split()))
    toks_split.append(len(tokenizer_en.encode(text)))

labels = langs
# Bar centers at 1..n, each series offset by +/- half the bar width.
# Generalized from the hard-coded [1., 2., 3.] so the chart stays correct
# if languages are added to or removed from `langs`.
centers = np.arange(1, len(labels) + 1, dtype=float)
xy = list(centers - 0.2)
xz = list(centers + 0.2)
y = text_split
z = toks_split

ax = plt.subplot(111)
ax.bar(xy, y, width=0.4, color='b', align='center')
ax.bar(xz, z, width=0.4, color='g', align='center')
ax.set_xlabel('languages')
ax.set_xticks(range(1, len(labels) + 1))
ax.set_xticklabels(labels)
ax.set_ylabel('number of tokens')
# Fix: legend said 'GPTTokenizerFast' but the class used above is
# GPT2TokenizerFast.
ax.legend(['split(" ")', 'GPT2TokenizerFast (en)'])
ax.set_title('Number of tokens by tokenization method and lang')
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment