from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

max_length_test = 20
test_sentence = 'Test tokenization sentence. Followed by another sentence'

# add special tokens
test_sentence_with_special_tokens = '[CLS]' + test_sentence + '[SEP]'
tokenized = tokenizer.tokenize(test_sentence_with_special_tokens)
print('tokenized', tokenized)

# convert tokens to ids in the WordPiece vocabulary
input_ids = tokenizer.convert_tokens_to_ids(tokenized)
# precalculate the pad length so we can reuse it below
padding_length = max_length_test - len(input_ids)

# attention should focus only on the real (non-padded) tokens, so build the mask before padding
attention_mask = [1] * len(input_ids)

# pad token ids with the [PAD] id (0) for sequences shorter than our max length
input_ids = input_ids + ([0] * padding_length)

# do not focus attention on padded tokens
attention_mask = attention_mask + ([0] * padding_length)
# token type ids distinguish sentence pairs (needed e.g. for question answering); with a single sequence we set them all to 0
token_type_ids = [0] * max_length_test
bert_input = {
    "token_ids": input_ids,
    "token_type_ids": token_type_ids,
    "attention_mask": attention_mask
}
print(bert_input)
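
# --- Optional sketch: the same preprocessing in a single call ---
# The manual steps above can also be done with the tokenizer's encode_plus helper.
# Keyword names vary by transformers version: recent releases use padding='max_length'
# and truncation=True as shown here, while versions from around 2020 used
# pad_to_max_length=True instead.
encoded = tokenizer.encode_plus(
    test_sentence,
    add_special_tokens=True,      # inserts [CLS] and [SEP] for us
    max_length=max_length_test,
    padding='max_length',         # pad with the [PAD] id (0) up to max_length
    truncation=True,
    return_attention_mask=True,
)
# encode_plus returns the same three lists, keyed as the model expects them
print(encoded['input_ids'])
print(encoded['token_type_ids'])
print(encoded['attention_mask'])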