import numpy as np
from tqdm import tqdm

# Assumes `tokenizer` (a Hugging Face BERT tokenizer), `MAX_LEN`, and the
# `train_data_snli_xnli` DataFrame are defined elsewhere in the notebook.

def bert_tokenizer_v2(sent1, sent2, MAX_LEN):
    # For two-sentence input
    encoded_dict = tokenizer.encode_plus(
        text=sent1,
        text_pair=sent2,
        add_special_tokens=True,    # Add '[CLS]' and '[SEP]'
        max_length=MAX_LEN,         # Pad & truncate all sentences.
        pad_to_max_length=True,
        return_attention_mask=True  # Construct attention masks.
    )

    input_id = encoded_dict['input_ids']
    attention_mask = encoded_dict['attention_mask']  # Attention mask (differentiates padding from non-padding).
    token_type_id = encoded_dict['token_type_ids']   # Segment ids (differentiate the two sentences).

    return input_id, attention_mask, token_type_id

input_ids = []
attention_masks = []
token_type_ids = []

for sent1, sent2 in tqdm(train_data_snli_xnli[['sentence1', 'sentence2']].values,
                         total=len(train_data_snli_xnli)):
    input_id, attention_mask, token_type_id = bert_tokenizer_v2(sent1, sent2, MAX_LEN)

    input_ids.append(input_id)
    attention_masks.append(attention_mask)
    token_type_ids.append(token_type_id)

train_snli_xnli_input_ids = np.array(input_ids, dtype=np.int64)
train_snli_xnli_attention_masks = np.array(attention_masks, dtype=np.int64)
train_snli_xnli_type_ids = np.array(token_type_ids, dtype=np.int64)
train_snli_xnli_inputs = (train_snli_xnli_input_ids, train_snli_xnli_attention_masks, train_snli_xnli_type_ids)
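
A quick way to sanity-check the packed inputs is to look at the array shapes and decode one example back to text. This is a hedged sketch; it assumes the Hugging Face `tokenizer` used above is still in scope.

# Hypothetical sanity check on the encoded arrays.
print(train_snli_xnli_input_ids.shape)                  # (num_examples, MAX_LEN)
print(tokenizer.decode(train_snli_xnli_input_ids[0]))   # [CLS] sent1 [SEP] sent2 [SEP] [PAD] ...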
import numpy as np
from tqdm import tqdm
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Assumes `tokenizer` (a SentencePiece tokenizer), its matching `vocab`,
# `SENT_MAX_LEN`, a `clean_text` preprocessing helper, and the
# `train_data_snli_xnli` DataFrame are defined elsewhere in the notebook.

train_data_sents = list()

for train_sent_1, train_sent_2 in tqdm(train_data_snli_xnli[['sentence1', 'sentence2']].values,
                                       total=len(train_data_snli_xnli)):
    # Tokenize each sentence and map the tokens to vocabulary ids.
    train_tokenized_sent_1 = vocab[tokenizer(clean_text(train_sent_1))]
    train_tokenized_sent_2 = vocab[tokenizer(clean_text(train_sent_2))]

    # Build [BOS] sent1 [SEP] sent2 [EOS], padding each sentence to SENT_MAX_LEN.
    tokens = [vocab[vocab.bos_token]]
    tokens += pad_sequences([train_tokenized_sent_1],
                            SENT_MAX_LEN,
                            value=vocab[vocab.padding_token],
                            padding='post').tolist()[0]
    tokens += [vocab[vocab.sep_token]]
    tokens += pad_sequences([train_tokenized_sent_2],
                            SENT_MAX_LEN,
                            value=vocab[vocab.padding_token],
                            padding='post').tolist()[0]
    tokens += [vocab[vocab.eos_token]]

    train_data_sents.append(tokens)

train_data_sents = np.array(train_data_sents, dtype=np.int64)
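
As a quick sanity check, the packed ids can be mapped back to tokens. This sketch assumes `vocab` is a gluonnlp-style vocabulary exposing `to_tokens` (an assumption; the gist does not show how `vocab` is built).

# Hypothetical sanity check: inspect the first packed example.
print(train_data_sents.shape)                          # (num_examples, 2 * SENT_MAX_LEN + 3)
print(vocab.to_tokens(train_data_sents[0].tolist()))   # assumes a gluonnlp-style vocab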