BPE Encoding Tutorial from Huggingface
```python
from transformers import AutoTokenizer

# Load GPT-2's pretrained tokenizer. We only reuse its byte-level
# pre-tokenizer below; the BPE vocabulary itself is trained from scratch.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
```
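A quick check worth doing first (this cell is an addition, not part of the original notebook): everything below goes through `tokenizer.backend_tokenizer`, which only exists on the fast, Rust-backed tokenizers that `AutoTokenizer` returns by default.

```python
# Sanity check: backend_tokenizer is only available on fast tokenizers.
assert tokenizer.is_fast
```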
```python
# A tiny training corpus: four sentences from the Hugging Face course.
corpus = [
    "This is the Hugging Face Course.",
    "This chapter is about tokenization.",
    "This section shows several tokenizer algorithms.",
    "Hopefully, you will be able to understand how they are trained and generate tokens.",
]
```
```python
# GPT-2's byte-level pre-tokenizer splits on whitespace and punctuation.
# Note the two spaces after "Gunnar": the second one shows up as a
# standalone 'Ġ' token, and the newline as 'Ċ'.
tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str("Gunnar  eats\ncake.")
```

Output:

```
[('Gunnar', (0, 6)),
 ('Ġ', (6, 7)),
 ('Ġeats', (7, 12)),
 ('Ċ', (12, 13)),
 ('cake', (13, 17)),
 ('.', (17, 18))]
```
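Where do 'Ġ' and 'Ċ' come from? GPT-2 works on bytes, and its byte-to-unicode table gives every unprintable byte a printable stand-in by shifting it up by 256. A minimal sketch of the two cases seen above:

```python
# For low bytes (below ord('!')), GPT-2's byte-to-unicode table maps
# byte b to chr(256 + b): space (32) -> 'Ġ', newline (10) -> 'Ċ'.
print(chr(256 + ord(" ")))   # Ġ
print(chr(256 + ord("\n")))  # Ċ
```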
```python
from collections import defaultdict

# Count how often each pre-tokenized word occurs in the corpus.
word_freqs = defaultdict(int)

for text in corpus:
    words_with_offsets = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    new_words = [word for word, offset in words_with_offsets]
    for word in new_words:
        word_freqs[word] += 1

print(word_freqs)
```

Output:

```
defaultdict(<class 'int'>, {'This': 3, 'Ġis': 2, 'Ġthe': 1, 'ĠHugging': 1, 'ĠFace': 1, 'ĠCourse': 1, '.': 4, 'Ġchapter': 1, 'Ġabout': 1, 'Ġtokenization': 1, 'Ġsection': 1, 'Ġshows': 1, 'Ġseveral': 1, 'Ġtokenizer': 1, 'Ġalgorithms': 1, 'Hopefully': 1, ',': 1, 'Ġyou': 1, 'Ġwill': 1, 'Ġbe': 1, 'Ġable': 1, 'Ġto': 1, 'Ġunderstand': 1, 'Ġhow': 1, 'Ġthey': 1, 'Ġare': 1, 'Ġtrained': 1, 'Ġand': 1, 'Ġgenerate': 1, 'Ġtokens': 1})
```
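The same frequency table can be built in one expression with `collections.Counter`; a minimal equivalent sketch:

```python
from collections import Counter

# Equivalent to the loop above; the Counter compares equal to the defaultdict.
word_freqs_check = Counter(
    word
    for text in corpus
    for word, _ in tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
)
assert word_freqs_check == word_freqs
```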
```python
# Collect the base alphabet: every character that appears in the corpus.
alphabet = []

for word in word_freqs.keys():
    for letter in word:
        if letter not in alphabet:
            alphabet.append(letter)
alphabet.sort()

print(alphabet)
```

Output:

```
[',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u', 'v', 'w', 'y', 'z', 'Ġ']
```
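Equivalently, a set comprehension plus `sorted()` yields the same alphabet; both sort by code point, which is why 'Ġ' (U+0120) comes after the ASCII characters:

```python
# Same alphabet, built from the set of all characters in all words.
assert sorted({c for word in word_freqs for c in word}) == alphabet
```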
```python
# Start the vocabulary with GPT-2's special end-of-text token,
# followed by the 30 base characters.
vocab = ["<|endoftext|>"] + alphabet.copy()
len(vocab)
```

Output:

```
31
```
```python
# Split every word into individual characters; merges will grow these back.
splits = {word: [c for c in word] for word in word_freqs.keys()}
splits
```

Output:

```
{'This': ['T', 'h', 'i', 's'],
 'Ġis': ['Ġ', 'i', 's'],
 'Ġthe': ['Ġ', 't', 'h', 'e'],
 'ĠHugging': ['Ġ', 'H', 'u', 'g', 'g', 'i', 'n', 'g'],
 'ĠFace': ['Ġ', 'F', 'a', 'c', 'e'],
 'ĠCourse': ['Ġ', 'C', 'o', 'u', 'r', 's', 'e'],
 '.': ['.'],
 'Ġchapter': ['Ġ', 'c', 'h', 'a', 'p', 't', 'e', 'r'],
 'Ġabout': ['Ġ', 'a', 'b', 'o', 'u', 't'],
 'Ġtokenization': ['Ġ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'a', 't', 'i', 'o', 'n'],
 'Ġsection': ['Ġ', 's', 'e', 'c', 't', 'i', 'o', 'n'],
 'Ġshows': ['Ġ', 's', 'h', 'o', 'w', 's'],
 'Ġseveral': ['Ġ', 's', 'e', 'v', 'e', 'r', 'a', 'l'],
 'Ġtokenizer': ['Ġ', 't', 'o', 'k', 'e', 'n', 'i', 'z', 'e', 'r'],
 'Ġalgorithms': ['Ġ', 'a', 'l', 'g', 'o', 'r', 'i', 't', 'h', 'm', 's'],
 'Hopefully': ['H', 'o', 'p', 'e', 'f', 'u', 'l', 'l', 'y'],
 ',': [','],
 'Ġyou': ['Ġ', 'y', 'o', 'u'],
 'Ġwill': ['Ġ', 'w', 'i', 'l', 'l'],
 'Ġbe': ['Ġ', 'b', 'e'],
 'Ġable': ['Ġ', 'a', 'b', 'l', 'e'],
 'Ġto': ['Ġ', 't', 'o'],
 'Ġunderstand': ['Ġ', 'u', 'n', 'd', 'e', 'r', 's', 't', 'a', 'n', 'd'],
 'Ġhow': ['Ġ', 'h', 'o', 'w'],
 'Ġthey': ['Ġ', 't', 'h', 'e', 'y'],
 'Ġare': ['Ġ', 'a', 'r', 'e'],
 'Ġtrained': ['Ġ', 't', 'r', 'a', 'i', 'n', 'e', 'd'],
 'Ġand': ['Ġ', 'a', 'n', 'd'],
 'Ġgenerate': ['Ġ', 'g', 'e', 'n', 'e', 'r', 'a', 't', 'e'],
 'Ġtokens': ['Ġ', 't', 'o', 'k', 'e', 'n', 's']}
```
```python
def compute_pair_freqs(splits):
    """Count the corpus frequency of each pair of adjacent symbols.

    Relies on the global word_freqs, so each word's pairs are
    weighted by how often the word occurs.
    """
    pair_freqs = defaultdict(int)
    for word, freq in word_freqs.items():
        split = splits[word]
        if len(split) == 1:
            continue
        for i in range(len(split) - 1):
            pair = (split[i], split[i + 1])
            pair_freqs[pair] += freq
    return pair_freqs
```
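For reference, zipping a split against itself shifted by one yields the same adjacent pairs and handles single-symbol words for free; a sketch:

```python
def compute_pair_freqs_zip(splits):
    # Same counts: zip(split, split[1:]) yields each adjacent pair once,
    # and is empty for single-symbol words, so no length check is needed.
    pair_freqs = defaultdict(int)
    for word, freq in word_freqs.items():
        split = splits[word]
        for pair in zip(split, split[1:]):
            pair_freqs[pair] += freq
    return pair_freqs
```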
```python
pair_freqs = compute_pair_freqs(splits)

# Peek at the first few pair counts.
for i, key in enumerate(pair_freqs.keys()):
    print(f"{key}: {pair_freqs[key]}")
    if i >= 5:
        break
```

Output:

```
('T', 'h'): 3
('h', 'i'): 3
('i', 's'): 5
('Ġ', 'i'): 2
('Ġ', 't'): 7
('t', 'h'): 3
```
```python
# Find the most frequent pair; ties go to the pair seen first.
best_pair = ""
max_freq = None

for pair, freq in pair_freqs.items():
    if max_freq is None or max_freq < freq:
        best_pair = pair
        max_freq = freq

print(best_pair, max_freq)
```

Output:

```
('Ġ', 't') 7
```
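The loop above is just an argmax over the dict; `max()` with a key function does the same, and both break ties in favor of the first-inserted pair:

```python
# One-line equivalent: max() over the keys, using the counts as the key.
best_pair = max(pair_freqs, key=pair_freqs.get)
print(best_pair, pair_freqs[best_pair])
```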
```python
def merge_pair(a, b, splits):
    """Replace every adjacent (a, b) occurrence with the merged symbol a + b."""
    for word in word_freqs:
        split = splits[word]
        if len(split) == 1:
            continue

        i = 0
        while i < len(split) - 1:
            if split[i] == a and split[i + 1] == b:
                split = split[:i] + [a + b] + split[i + 2 :]
            else:
                i += 1
        splits[word] = split
    return splits
```
```python
# Record the first merge rule and add the new symbol to the vocabulary.
merges = {("Ġ", "t"): "Ġt"}
vocab.append("Ġt")
```
```python
splits = merge_pair("Ġ", "t", splits)
print(splits["Ġtrained"])
```

Output:

```
['Ġt', 'r', 'a', 'i', 'n', 'e', 'd']
```
```python
# Keep merging the most frequent pair until the vocabulary reaches 50 symbols.
vocab_size = 50

while len(vocab) < vocab_size:
    pair_freqs = compute_pair_freqs(splits)
    best_pair = ""
    max_freq = None
    for pair, freq in pair_freqs.items():
        if max_freq is None or max_freq < freq:
            best_pair = pair
            max_freq = freq
    splits = merge_pair(*best_pair, splits)
    merges[best_pair] = best_pair[0] + best_pair[1]
    vocab.append(best_pair[0] + best_pair[1])
```
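Because Python dicts preserve insertion order, `merges` now records the rules in training order, which is exactly what `tokenize` below replays. If you wanted to persist them, GPT-2-style tokenizers store one space-separated pair per line in a `merges.txt` file; a hedged sketch (the real file also carries a `#version` header line):

```python
# Sketch: write the learned merge rules in the merges.txt layout used by
# GPT-2-style tokenizers (one "left right" pair per line, in merge order).
with open("merges.txt", "w", encoding="utf-8") as f:
    for left, right in merges:
        f.write(f"{left} {right}\n")
```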
```python
merges
```

Output:

```
{('Ġ', 't'): 'Ġt',
 ('i', 's'): 'is',
 ('e', 'r'): 'er',
 ('Ġ', 'a'): 'Ġa',
 ('Ġt', 'o'): 'Ġto',
 ('e', 'n'): 'en',
 ('T', 'h'): 'Th',
 ('Th', 'is'): 'This',
 ('o', 'u'): 'ou',
 ('s', 'e'): 'se',
 ('Ġto', 'k'): 'Ġtok',
 ('Ġtok', 'en'): 'Ġtoken',
 ('n', 'd'): 'nd',
 ('Ġ', 'is'): 'Ġis',
 ('Ġt', 'h'): 'Ġth',
 ('Ġth', 'e'): 'Ġthe',
 ('i', 'n'): 'in',
 ('Ġa', 'b'): 'Ġab',
 ('Ġtoken', 'i'): 'Ġtokeni'}
```
```python
vocab
```

Output:

```
['<|endoftext|>', ',', '.', 'C', 'F', 'H', 'T', 'a', 'b', 'c', 'd', 'e',
 'f', 'g', 'h', 'i', 'k', 'l', 'm', 'n', 'o', 'p', 'r', 's', 't', 'u',
 'v', 'w', 'y', 'z', 'Ġ', 'Ġt', 'is', 'er', 'Ġa', 'Ġto', 'en', 'Th',
 'This', 'ou', 'se', 'Ġtok', 'Ġtoken', 'nd', 'Ġis', 'Ġth', 'Ġthe', 'in',
 'Ġab', 'Ġtokeni']
```
```python
def tokenize(text):
    """Tokenize text by replaying the learned merges in training order."""
    pre_tokenize_result = tokenizer.backend_tokenizer.pre_tokenizer.pre_tokenize_str(text)
    pre_tokenized_text = [word for word, offset in pre_tokenize_result]
    splits = [[l for l in word] for word in pre_tokenized_text]
    for pair, merge in merges.items():
        for idx, split in enumerate(splits):
            i = 0
            while i < len(split) - 1:
                if split[i] == pair[0] and split[i + 1] == pair[1]:
                    split = split[:i] + [merge] + split[i + 2 :]
                else:
                    i += 1
            splits[idx] = split

    return sum(splits, [])
```
```python
tokenize("This is not a token.")
```

Output:

```
['This', 'Ġis', 'Ġ', 'n', 'o', 't', 'Ġa', 'Ġtoken', '.']
```
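With only 19 learned merges, "not" never fuses and its leading space is left as a bare 'Ġ'. For comparison, you can run the same sentence through the pretrained GPT-2 tokenizer loaded at the top, which was trained with roughly 50,000 merges:

```python
# The real GPT-2 tokenizer should produce word-level tokens
# (e.g. 'Ġnot') where our toy vocabulary falls back to characters.
print(tokenizer.tokenize("This is not a token."))
```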