cleaning captions
import os
import json
import random

import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer

train_path = 'training_data'
TRAIN_LABEL_PATH = os.path.join(train_path, 'training_label.json')

# fraction of the data used for training; the rest becomes the validation split
train_split = 0.85

# load the JSON file holding the training captions
with open(TRAIN_LABEL_PATH) as data_file:
    y_data = json.load(data_file)

# train_list collects [caption, video ID] pairs
# vocab_list collects the training captions used to fit the tokenizer
train_list = []
vocab_list = []
for y in y_data:
    for caption in y['caption']:
        caption = "<bos> " + caption + " <eos>"
        # keep only captions whose token count (including the added
        # <bos> and <eos> markers) lies between 6 and 10
        if len(caption.split()) > 10 or len(caption.split()) < 6:
            continue
        train_list.append([caption, y['id']])

print(len(train_list))
random.shuffle(train_list)
training_list = train_list[:int(len(train_list) * train_split)]
validation_list = train_list[int(len(train_list) * train_split):]

for train in training_list:
    vocab_list.append(train[0])

# fit the tokenizer on the training captions, keeping the 1500 most frequent words
tokenizer = Tokenizer(num_words=1500)
tokenizer.fit_on_texts(vocab_list)

# load all the pre-extracted video feature arrays into a dictionary keyed by video ID
x_data = {}
TRAIN_FEATURE_DIR = os.path.join('training_data', 'feat')
for filename in os.listdir(TRAIN_FEATURE_DIR):
    f = np.load(os.path.join(TRAIN_FEATURE_DIR, filename))
    x_data[filename[:-4]] = f  # strip the '.npy' extension to recover the video ID
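For context, here is a minimal sketch (not part of the original gist) of how the fitted tokenizer and the feature dictionary might be used together downstream. The maxlen of 10 mirrors the caption-length filter above but is an assumed choice, as is pulling the first training pair for the demo.

from tensorflow.keras.preprocessing.sequence import pad_sequences

caption, video_id = training_list[0]
# convert the caption to integer IDs; words outside the top 1500 are dropped
seq = tokenizer.texts_to_sequences([caption])
# pad to the maximum caption length kept above (10 tokens) -- an assumed value
padded = pad_sequences(seq, maxlen=10, padding='post')
# look up the pre-extracted video features for the same clip
features = x_data[video_id]
print(padded.shape, features.shape)

Note that the Tokenizer's default filters strip the angle brackets, so "<bos>" and "<eos>" are indexed as the plain tokens "bos" and "eos"; this does not break the pipeline as long as the same tokenizer is used consistently.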