cleaning captions
import os
import json
import random

import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer

train_path = 'training_data'
TRAIN_LABEL_PATH = os.path.join(train_path, 'training_label.json')

# fraction of the data used for training; the rest becomes the validation split
train_split = 0.85

# load the JSON file holding the training captions
with open(TRAIN_LABEL_PATH) as data_file:
    y_data = json.load(data_file)

# train_list collects [caption, video ID] pairs
# vocab_list collects the training captions used to fit the tokenizer
train_list = []
vocab_list = []
for y in y_data:
    for caption in y['caption']:
        caption = "<bos> " + caption + " <eos>"
        # keep only captions whose token count (including the added
        # <bos> and <eos> markers) lies between 6 and 10
        if len(caption.split()) > 10 or len(caption.split()) < 6:
            continue
        train_list.append([caption, y['id']])

print(len(train_list))
random.shuffle(train_list)
training_list = train_list[:int(len(train_list) * train_split)]
validation_list = train_list[int(len(train_list) * train_split):]

for train in training_list:
    vocab_list.append(train[0])

# fit the tokenizer on the training captions, keeping the 1500 most frequent words
tokenizer = Tokenizer(num_words=1500)
tokenizer.fit_on_texts(vocab_list)

# load all the pre-extracted video feature arrays into a dictionary keyed by video ID
x_data = {}
TRAIN_FEATURE_DIR = os.path.join('training_data', 'feat')
for filename in os.listdir(TRAIN_FEATURE_DIR):
    f = np.load(os.path.join(TRAIN_FEATURE_DIR, filename))
    x_data[filename[:-4]] = f  # strip the '.npy' extension to recover the video ID
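For context, here is a minimal sketch (not part of the original gist) of how the fitted tokenizer and the feature dictionary might be used together downstream. The maxlen of 10 mirrors the caption-length filter above but is an assumed choice, as is pulling the first training pair for the demo.

from tensorflow.keras.preprocessing.sequence import pad_sequences

caption, video_id = training_list[0]
# convert the caption to integer IDs; words outside the top 1500 are dropped
seq = tokenizer.texts_to_sequences([caption])
# pad to the maximum caption length kept above (10 tokens) -- an assumed value
padded = pad_sequences(seq, maxlen=10, padding='post')
# look up the pre-extracted video features for the same clip
features = x_data[video_id]
print(padded.shape, features.shape)

Note that the Tokenizer's default filters strip the angle brackets, so "<bos>" and "<eos>" are indexed as the plain tokens "bos" and "eos"; this does not break the pipeline as long as the same tokenizer is used consistently.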