# Tokenize SNLI/XNLI sentence pairs for BERT fine-tuning.
# Assumes `tokenizer` is a pretrained HuggingFace tokenizer, e.g.
# tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased').
import numpy as np
from tqdm import tqdm


def bert_tokenizer_v2(sent1, sent2, MAX_LEN):
    # For two-sentence input
    encoded_dict = tokenizer.encode_plus(
        text=sent1,
        text_pair=sent2,
        add_special_tokens=True,     # Add '[CLS]' and '[SEP]'
        max_length=MAX_LEN,          # Pad & truncate all sentences.
        padding='max_length',        # Replaces the deprecated pad_to_max_length=True.
        truncation=True,
        return_attention_mask=True,  # Construct attention masks.
    )

    input_id = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask']  # Differentiates padding from non-padding tokens.
    token_type_id = encoded_dict['token_type_ids']   # Differentiates the two sentences (segment 0 vs. 1).

    return input_id, attention_mask, token_type_id
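
# Hedged usage sketch (not part of the original gist): the sentence pair below
# is illustrative only; it checks that the three outputs are aligned,
# fixed-length sequences.
sample_id, sample_mask, sample_type = bert_tokenizer_v2(
    'A soccer game with multiple males playing.',
    'Some men are playing a sport.',
    MAX_LEN)
assert len(sample_id) == len(sample_mask) == len(sample_type) == MAX_LEN
# sample_type is 0 across '[CLS] sent1 [SEP]' and 1 across 'sent2 [SEP]';
# padding positions are masked out by sample_mask.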
input_ids = []
attention_masks = []
token_type_ids = []

for sent1, sent2 in tqdm(train_data_snli_xnli[['sentence1', 'sentence2']].values,
                         total=len(train_data_snli_xnli)):
    input_id, attention_mask, token_type_id = bert_tokenizer_v2(sent1, sent2, MAX_LEN)

    input_ids.append(input_id)
    attention_masks.append(attention_mask)
    token_type_ids.append(token_type_id)

train_snli_xnli_input_ids = np.array(input_ids, dtype=np.int64)
train_snli_xnli_attention_masks = np.array(attention_masks, dtype=np.int64)
train_snli_xnli_type_ids = np.array(token_type_ids, dtype=np.int64)
train_snli_xnli_inputs = (train_snli_xnli_input_ids, train_snli_xnli_attention_masks, train_snli_xnli_type_ids)
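
# Hedged sketch (my addition, not from the gist): one way to consume the tuple
# above is a small Keras classification head on top of TFBertModel. The
# checkpoint name and the 3-way NLI label space are assumptions.
import tensorflow as tf
from transformers import TFBertModel

bert = TFBertModel.from_pretrained('bert-base-multilingual-cased')

ids_in = tf.keras.layers.Input(shape=(MAX_LEN,), dtype=tf.int64)
mask_in = tf.keras.layers.Input(shape=(MAX_LEN,), dtype=tf.int64)
type_in = tf.keras.layers.Input(shape=(MAX_LEN,), dtype=tf.int64)

pooled = bert(input_ids=ids_in, attention_mask=mask_in,
              token_type_ids=type_in)[1]  # pooled '[CLS]' representation
logits = tf.keras.layers.Dense(3, activation='softmax')(pooled)

cls_model = tf.keras.Model([ids_in, mask_in, type_in], logits)
cls_model.compile(optimizer=tf.keras.optimizers.Adam(3e-5),
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
# cls_model.fit(train_snli_xnli_inputs, train_data_labels,  # labels assumed available
#               epochs=2, batch_size=32)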
# Alternative preprocessing path: assumes a gluonnlp-style `vocab` (with
# bos/eos/sep/padding special tokens) and a SentencePiece `tokenizer`, i.e. a
# different tokenizer/vocab pair than the HuggingFace one used above.
from tensorflow.keras.preprocessing.sequence import pad_sequences

train_data_sents = list()

for train_sent_1, train_sent_2 in tqdm(train_data_snli_xnli[['sentence1', 'sentence2']].values,
                                       total=len(train_data_snli_xnli)):
    train_tokenized_sent_1 = vocab[tokenizer(clean_text(train_sent_1))]
    train_tokenized_sent_2 = vocab[tokenizer(clean_text(train_sent_2))]

    # Layout: [BOS] sent1 (padded to SENT_MAX_LEN) [SEP] sent2 (padded) [EOS]
    tokens = [vocab[vocab.bos_token]]
    tokens += pad_sequences([train_tokenized_sent_1],
                            SENT_MAX_LEN,
                            value=vocab[vocab.padding_token],
                            padding='post').tolist()[0]
    tokens += [vocab[vocab.sep_token]]
    tokens += pad_sequences([train_tokenized_sent_2],
                            SENT_MAX_LEN,
                            value=vocab[vocab.padding_token],
                            padding='post').tolist()[0]
    tokens += [vocab[vocab.eos_token]]

    train_data_sents.append(tokens)

train_data_sents = np.array(train_data_sents, dtype=np.int64)
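
# Sanity check (my addition): every row built above has the fixed layout
# [BOS] + SENT_MAX_LEN tokens + [SEP] + SENT_MAX_LEN tokens + [EOS],
# i.e. a constant width of 2 * SENT_MAX_LEN + 3.
assert train_data_sents.shape[1] == 2 * SENT_MAX_LEN + 3
assert train_data_sents[0][0] == vocab[vocab.bos_token]
assert train_data_sents[0][-1] == vocab[vocab.eos_token]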