Token image dataset creation using Stanford OCR and Gutenberg
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Creating an image dataset using character images from Stanford OCR and tokens from Gutenberg"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Imports"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"from sets import Set\t\t# Python 2 'sets' module; the built-in set would also work\n",
"from collections import defaultdict, Counter\n",
"import pickle\n",
"import nltk\n",
"import re\n",
"import os\n",
"from string import lower\t# Python 2 only\n",
"from sklearn.model_selection import train_test_split\n",
"import matplotlib.pyplot as plt"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Creating a character image variants dictionary"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Character : a\n",
"Number of variants: 4021\n",
"Character : c\n",
"Number of variants: 2072\n",
"Character : b\n",
"Number of variants: 1282\n",
"Character : e\n",
"Number of variants: 4945\n",
"Character : d\n",
"Number of variants: 1441\n",
"Character : g\n",
"Number of variants: 2471\n",
"Character : f\n",
"Number of variants: 921\n",
"Character : i\n",
"Number of variants: 4027\n",
"Character : h\n",
"Number of variants: 861\n",
"Character : k\n",
"Number of variants: 909\n",
"Character : j\n",
"Number of variants: 189\n",
"Character : m\n",
"Number of variants: 1587\n",
"Character : l\n",
"Number of variants: 1696\n",
"Character : o\n",
"Number of variants: 3854\n",
"Character : n\n",
"Number of variants: 4988\n",
"Character : q\n",
"Number of variants: 341\n",
"Character : p\n",
"Number of variants: 1377\n",
"Character : s\n",
"Number of variants: 1394\n",
"Character : r\n",
"Number of variants: 2634\n",
"Character : u\n",
"Number of variants: 2538\n",
"Character : t\n",
"Number of variants: 2126\n",
"Character : w\n",
"Number of variants: 520\n",
"Character : v\n",
"Number of variants: 661\n",
"Character : y\n",
"Number of variants: 1221\n",
"Character : x\n",
"Number of variants: 413\n",
"Character : z\n",
"Number of variants: 1091\n"
]
}
],
"source": [ | |
"data_file = open('letter.data', 'r')\n", | |
"char_pixel_dict = defaultdict(Set)\t# key : character, value : set of pixel array variants for that character\n", | |
"\n", | |
"\n", | |
"for line in data_file:\n", | |
"\tdata_arr = line.split()\n", | |
"\tchar = data_arr[1]\n", | |
"\n", | |
"\tchar_label = ord(char) - 97\n", | |
"\n", | |
"\tchar_features = map(lambda x : int(x), data_arr[6:])\n", | |
"\n", | |
"\tchar_set = char_pixel_dict.get(char, Set())\n", | |
"\n", | |
"\tchar_feature_tup = tuple(char_features)\n", | |
"\n", | |
"\tchar_set.add(char_feature_tup)\n", | |
"\n", | |
"\tchar_pixel_dict[char] = char_set\n", | |
"\n", | |
" \n", | |
"\n", | |
"for key, val in char_pixel_dict.items():\n", | |
"\tprint \"Character : \" + str(key)\n", | |
"\tprint \"Number of variants: \" + str(len(val))\n", | |
"\n", | |
"\n", | |
"with open(\"./letter_variants.npy\", \"w\") as f:\n", | |
"\tpickle.dump(char_pixel_dict, f)" | |
] | |
}, | |
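{
"cell_type": "markdown",
"metadata": {},
"source": [
"A quick sanity check (an illustrative sketch, not part of the original pipeline): render one stored variant of the letter `a`. This assumes each Stanford OCR bitmap is 16 rows by 8 columns, which is what the `reshape(..., 16, -1)` in the next section implies."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: draw one stored variant of 'a' as a bitmap\n",
"# (the 16x8 shape is an assumption inferred from the reshape used later)\n",
"a_variant = np.array(list(char_pixel_dict['a'])[0])\n",
"plt.imshow(a_variant.reshape(16, -1), cmap='gray')\n",
"plt.show()"
]
},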
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Script to create images and labels for tokens extracted from the book Moby Dick"
]
},
{
"cell_type": "code",
"execution_count": 22,
"metadata": {},
"outputs": [],
"source": [
"def get_dir(dir_name):\n",
"\tdir_path = os.path.join(os.getcwd(), dir_name)\n",
"\n",
"\tif not os.path.exists(dir_path):\n",
"\t\tos.makedirs(dir_path)\n",
"\n",
"\treturn dir_path\n",
"\n",
"\n",
"def init_gutenberg():\n",
"\tnltk.download('gutenberg')\n",
"\n",
"\n",
"def get_book_tokens_gt_len(book_name, length):\n",
"\t# unique, purely alphabetic tokens of at least `length` characters, lower-cased\n",
"\tbook_tokens = nltk.corpus.gutenberg.words(book_name)\n",
"\tbook_token_set = Set(book_tokens)\n",
"\tbook_token_list = map(lambda x: x.strip(), book_token_set)\n",
"\tbook_words = filter(lambda x: re.match(r'^[a-zA-Z]+$', x), book_token_list)\n",
"\tbook_words_gt_eq_len = filter(lambda x: len(x) >= length, book_words)\n",
"\n",
"\tbook_words_gt_eq_len = map(lambda x: lower(x), book_words_gt_eq_len)\n",
"\n",
"\treturn np.array(book_words_gt_eq_len)\n",
"\n",
"\n",
"def _get_file_name(img_num, file_type=\"img\"):\n",
"\tif file_type == \"img\":\n",
"\t\tfile_name = \"img_\" + str(img_num) + \".png\"\n",
"\telse:\n",
"\t\tfile_name = \"img_\" + str(img_num) + \".txt\"\n",
"\n",
"\treturn file_name\n",
"\n",
"\n",
"def _save_img_in_folder(dir_path, img_num, word_arr):\n",
"\tfile_name = _get_file_name(img_num)\n",
"\n",
"\tfile_path = os.path.join(dir_path, file_name)\n",
"\n",
"\tplt.imsave(file_path, word_arr)\n",
"\n",
"\n",
"def _draw_and_save_word_in_dir(img_num, img_arr, dir_path):\n",
"\t# one bitmap per row -> reshape to (n_chars + 1, 16, 8) and stack horizontally\n",
"\timg_arr = img_arr.reshape(img_arr.shape[0], 16, -1)\n",
"\n",
"\tword_arr = img_arr[0]\n",
"\n",
"\tfor i in range(1, len(img_arr)):\n",
"\t\tword_arr = np.hstack((word_arr, np.zeros((16, 2), dtype=int)))\n",
"\t\tword_arr = np.hstack((word_arr, img_arr[i]))\n",
"\n",
"\tword_arr *= 255\n",
"\tword_arr = word_arr[:, 8:]\t\t# drop the first 8 cols (the all-zero seed image); chars are separated by 2-col padding\n",
"\n",
"\t_save_img_in_folder(dir_path, img_num, word_arr)\n",
"\n",
"\n",
"def _write_word_arr_in_dir(img_num, img_arr, dir_path):\n",
"\tfile_name = _get_file_name(img_num, \"txt\")\n",
"\n",
"\tfile_path = os.path.join(dir_path, file_name)\n",
"\n",
"\timg_arr = img_arr[1:]\t\t# drop the all-zero seed row\n",
"\n",
"\twith open(file_path, \"w\") as f:\n",
"\t\tfor img in img_arr:\n",
"\t\t\tline = \" \".join([str(num) for num in img]) + \"\\n\"\n",
"\t\t\tf.write(line)\n",
"\n",
"\n",
"def save_word_to_dir(word_arr, word_num, dir_path):\n",
"\t_draw_and_save_word_in_dir(word_num, word_arr, dir_path)\n",
"\t_write_word_arr_in_dir(word_num, word_arr, dir_path)\n",
"\n",
"\n",
"def get_char_img_len(char_to_img_arr_dict):\n",
"\t# number of pixels per character image (all variants have the same length)\n",
"\tlist_of_a_variants = list(char_to_img_arr_dict['a'])\n",
"\n",
"\treturn len(list_of_a_variants[0])\n",
"\n",
"\n",
"def get_random_variant_of_char(char_to_img_arr_dict, char):\n",
"\tchar_variants_list = list(char_to_img_arr_dict[char])\n",
"\tnum_variants = len(char_variants_list)\n",
"\n",
"\t#print \"Char : \" + char\n",
"\t#print \"Number of variants : \" + str(num_variants)\n",
"\n",
"\trandom_variant_idx = np.random.choice(num_variants)\n",
"\n",
"\trandom_variant = char_variants_list[random_variant_idx]\n",
"\n",
"\treturn np.array(random_variant)\n",
"\n",
"\n",
"def save_word_imgs_to_dir(words, dir_path, char_to_img_arr_dict):\n",
"\tchar_feature_len = get_char_img_len(char_to_img_arr_dict)\n",
"\n",
"\tfor word_num, word in enumerate(words):\n",
"\t\t# seed row of zeros; one randomly chosen variant per character is stacked below it\n",
"\t\tword_arr = np.zeros((1, char_feature_len), dtype=int)\n",
"\n",
"\t\tfor char in word:\n",
"\t\t\tvariant_of_char = get_random_variant_of_char(char_to_img_arr_dict, char)\n",
"\t\t\tword_arr = np.vstack((word_arr, variant_of_char))\n",
"\n",
"\t\tsave_word_to_dir(word_arr, word_num, dir_path)\n",
"\n",
"\n",
"def write_words_list_to_dir(words, file_path):\n",
"\twith open(file_path, 'w') as word_file:\n",
"\t\tfor word_num, word in enumerate(words):\n",
"\t\t\tword_file.write(str(word_num) + \" \" + word + \"\\n\")\n",
"\n",
"\n",
"def filter_words_for_stratification(words_list, test_split):\n",
"\t# stratified splitting needs enough examples of every word length;\n",
"\t# drop lengths that occur fewer than ceil(test_split * 10) times\n",
"\tlen_words_list = map(lambda x: len(x), words_list)\n",
"\n",
"\tmin_words_req_per_len = int(np.ceil(test_split * 10))\n",
"\n",
"\tlen_words_counter = Counter(len_words_list)\n",
"\n",
"\tprint \"Len words counter \" + str(len_words_counter)\n",
"\n",
"\tnot_allowed_lengths = [x[0] for x in len_words_counter.items() if x[1] < min_words_req_per_len]\n",
"\n",
"\tprint \"Not allowed lengths \" + str(not_allowed_lengths)\n",
"\n",
"\tlen_words_list = np.array(len_words_list)\n",
"\n",
"\tmask = np.zeros(len(words_list), dtype=bool)\n",
"\n",
"\tfor n_length in not_allowed_lengths:\n",
"\t\tn_len_mask = (len_words_list == n_length)\n",
"\t\tmask = np.ma.mask_or(mask, n_len_mask)\n",
"\n",
"\tinverted_mask = np.invert(mask)\n",
"\n",
"\treturn words_list[inverted_mask]\n",
"\n",
"\n",
"def create_data_set(words_list, char_to_img_arr_dict, test_split):\n",
"\twords_list = filter_words_for_stratification(words_list, test_split)\n",
"\n",
"\tprint \"Number of tokens after filtering \" + str(len(words_list))\n",
"\n",
"\tlen_words_list = map(lambda x: len(x), words_list)\n",
"\n",
"\t# stratify by word length so train and test get similar length distributions\n",
"\twords_train, words_test, _, _ = train_test_split(words_list, len_words_list, test_size=test_split, shuffle=True, stratify=len_words_list)\n",
"\n",
"\tdata_dir = get_dir('data')\n",
"\n",
"\ttrain_dir_path = get_dir('data/train_words')\n",
"\ttest_dir_path = get_dir('data/test_words')\n",
"\n",
"\ttrain_words_file_path = os.path.join(data_dir, 'train_words.txt')\n",
"\ttest_words_file_path = os.path.join(data_dir, 'test_words.txt')\n",
"\n",
"\tsave_word_imgs_to_dir(words_train, train_dir_path, char_to_img_arr_dict)\n",
"\tsave_word_imgs_to_dir(words_test, test_dir_path, char_to_img_arr_dict)\n",
"\n",
"\twrite_words_list_to_dir(words_train, train_words_file_path)\n",
"\twrite_words_list_to_dir(words_test, test_words_file_path)"
]
},
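{
"cell_type": "markdown",
"metadata": {},
"source": [
"To make the layout in `_draw_and_save_word_in_dir` concrete, here is a minimal sketch of how a word image is assembled: character bitmaps are stacked horizontally with a 2-column gap between them. Random 16x8 arrays stand in for real letter variants, so this cell is illustrative only."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: assemble a 3-character word image from synthetic 16x8 bitmaps\n",
"# (random arrays stand in for the real letter variants)\n",
"chars = [np.random.randint(0, 2, (16, 8)) for _ in range(3)]\n",
"\n",
"word_img = chars[0]\n",
"for c in chars[1:]:\n",
"\tword_img = np.hstack((word_img, np.zeros((16, 2), dtype=int), c))\n",
"\n",
"print \"Word image shape: \" + str(word_img.shape)\t# (16, 3*8 + 2*2) = (16, 28)"
]
},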
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Running the data preparation script"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"[nltk_data] Downloading package gutenberg to /Users/ady/nltk_data...\n",
"[nltk_data] Package gutenberg is already up-to-date!\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Number of extracted tokens 18862\n",
"Len words counter Counter({7: 3005, 6: 2941, 8: 2708, 5: 2393, 9: 2174, 4: 1701, 10: 1440, 11: 881, 3: 589, 12: 550, 13: 286, 14: 122, 15: 48, 16: 13, 17: 9, 18: 1, 20: 1})\n",
"Not allowed lengths [18, 20]\n",
"Number of tokens after filtering 18860\n",
"Dataset preparation successful. Check ./data\n"
]
}
],
"source": [
"init_gutenberg()\n",
"\n",
"mb_dick_tokens = get_book_tokens_gt_len('melville-moby_dick.txt', 3)\n",
"\n",
"print \"Number of extracted tokens \" + str(len(mb_dick_tokens))\n",
"\n",
"with open(\"letter_variants.npy\", \"rb\") as f:\n",
"    char_to_img_arr_dict = pickle.load(f)\n",
"\n",
"create_data_set(mb_dick_tokens, char_to_img_arr_dict, 0.2)\n",
"\n",
"print \"Dataset preparation successful. Check ./data\""
]
},
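{
"cell_type": "markdown",
"metadata": {},
"source": [
"As a final check, a sketch (assuming the cell above ran and wrote at least one sample) that reads back the first generated training image and its pixel-array label file; the `img_0.*` names follow the `_get_file_name` convention above."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Sketch: read back one generated training sample\n",
"# (assumes img_0.png / img_0.txt were written by the cell above)\n",
"sample_img = plt.imread('./data/train_words/img_0.png')\n",
"\n",
"with open('./data/train_words/img_0.txt') as f:\n",
"\tsample_arr = np.array([map(int, line.split()) for line in f])\n",
"\n",
"print \"Image shape: \" + str(sample_img.shape)\t# likely (16, width, 4): imsave writes RGBA\n",
"print \"Array shape: \" + str(sample_arr.shape)\t# one 128-pixel row per character"
]
}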
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.16"
}
},
"nbformat": 4,
"nbformat_minor": 2
} |