Created
July 2, 2020 20:59
-
-
Save piegu/d320edd105537b53a95cae31a266998f to your computer and use it in GitHub Desktop.
English tokenizer applied to a text in 3 languages — from Byte-Level-BPE_universal_tokenizer_but.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# English pre-trained tokenizer on a text in 3 languages (en, pt, fr).
# The same sentence (about the architect Soufflot) in English, Portuguese
# and French, to compare how an English-trained BPE tokenizer handles each.
text_en = 'Jacques-Germain Soufflot (Irancy, July 22, 1713 - Paris, August 29, 1780) was a French architect, initiator of the architectural style of Neoclassicism.'
text_pt = 'Jacques-Germain Soufflot (Irancy, 22 de julho de 1713 — Paris, 29 de agosto de 1780) foi um arquitecto francês, iniciador do estilo arquitectónico do Neoclassicismo.'
text_fr = 'Jacques-Germain Soufflot (Irancy, 22 juillet 1713 - Paris, 29 août 1780) était un architecte français, initiateur du style architectural du néoclassicisme.'
langs = ['en', 'pt', 'fr']
texts = [text_en, text_pt, text_fr]

# Print each text with its language code.
# Fix: zip(*[langs, texts]) is an obfuscated spelling of zip(langs, texts).
# NOTE(review): TitledStr is a display helper defined elsewhere in the notebook.
for lang, text in zip(langs, texts):
    print(f'({lang}) {TitledStr(text)}\n')
# Number and list of "classical" tokens (i.e. tokens separated by blanks).
# Fix: the original counted with text.split() but listed with text.split(" ");
# the two disagree when a text contains consecutive spaces — use one split
# for both so the printed count always matches the printed list.
for lang, text in zip(langs, texts):
    words = text.split()
    print(f'({lang} - {len(words)} tokens) {TitledStr(words)}\n')
# Number and list of sub-word tokens after tokenization by the imported
# BPE GPT2TokenizerFast (trained on an English corpus — so non-English
# texts are expected to fragment into more tokens).
# Fix: zip(*[langs, texts]) simplified to the idiomatic zip(langs, texts).
for lang, text in zip(langs, texts):
    toks = tokenizer_en.tokenize(text)
    print(f'({lang} - {len(toks)} tokens) {TitledStr(toks)}\n')
# Number and list of token ids after tokenization + numericalization by the
# imported BPE GPT2TokenizerFast (trained on an English corpus).
# Fix: zip(*[langs, texts]) simplified to the idiomatic zip(langs, texts).
for lang, text in zip(langs, texts):
    toks_ids = tokenizer_en.encode(text)
    print(f'({lang} - {len(toks_ids)} tokens) {TitledStr(toks_ids)}\n')
# Round-trip check: encode each text to ids, then decode back to a string
# to confirm the tokenizer is lossless on these texts.
# Fix: zip(*[langs, texts]) simplified to the idiomatic zip(langs, texts).
for lang, text in zip(langs, texts):
    toks_ids = tokenizer_en.encode(text)
    text_decoded = tokenizer_en.decode(toks_ids)
    print(f'({lang}) {TitledStr(text_decoded)}\n')
# Graph: grouped bar chart comparing, per language, the token count from a
# plain whitespace split vs. from the English GPT2TokenizerFast.
# source: https://matplotlib.org/3.2.1/gallery/lines_bars_and_markers/barchart.html#sphx-glr-gallery-lines-bars-and-markers-barchart-py
text_split = []
toks_split = []
for text in texts:
    text_split.append(len(text.split()))
    toks_split.append(len(tokenizer_en.encode(text)))

labels = langs
# Bar centers at 1..n, each series offset by +/- half the bar width.
# Generalized from the hard-coded [1., 2., 3.] so the chart stays correct
# if languages are added to or removed from `langs`.
centers = np.arange(1, len(labels) + 1, dtype=float)
xy = list(centers - 0.2)
xz = list(centers + 0.2)
y = text_split
z = toks_split

ax = plt.subplot(111)
ax.bar(xy, y, width=0.4, color='b', align='center')
ax.bar(xz, z, width=0.4, color='g', align='center')
ax.set_xlabel('languages')
ax.set_xticks(range(1, len(labels) + 1))
ax.set_xticklabels(labels)
ax.set_ylabel('number of tokens')
# Fix: legend said 'GPTTokenizerFast' but the class used above is
# GPT2TokenizerFast.
ax.legend(['split(" ")', 'GPT2TokenizerFast (en)'])
ax.set_title('Number of tokens by tokenization method and lang')
plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment