from transformers import BertTokenizer

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

max_length_test = 20
test_sentence = 'Test tokenization sentence. Followed by another sentence'

# add special tokens
test_sentence_with_special_tokens = '[CLS]' + test_sentence + '[SEP]'
tokenized = tokenizer.tokenize(test_sentence_with_special_tokens)
print('tokenized', tokenized)

# convert tokens to ids in the WordPiece vocabulary
input_ids = tokenizer.convert_tokens_to_ids(tokenized)
# precalculate the pad length so we can reuse it below
padding_length = max_length_test - len(input_ids)

# attention should focus only on the real (non-padded) tokens, so build the mask before padding
attention_mask = [1] * len(input_ids)

# pad token ids with the [PAD] id (0) for sequences shorter than our max length
input_ids = input_ids + ([0] * padding_length)

# do not focus attention on padded tokens
attention_mask = attention_mask + ([0] * padding_length)
# token type ids distinguish sentence pairs (needed e.g. for question answering); with a single sequence we set them all to 0
token_type_ids = [0] * max_length_test
bert_input = {
    "token_ids": input_ids,
    "token_type_ids": token_type_ids,
    "attention_mask": attention_mask
}
print(bert_input)
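
# --- Optional sketch: the same preprocessing in a single call ---
# The manual steps above can also be done with the tokenizer's encode_plus helper.
# Keyword names vary by transformers version: recent releases use padding='max_length'
# and truncation=True as shown here, while versions from around 2020 used
# pad_to_max_length=True instead.
encoded = tokenizer.encode_plus(
    test_sentence,
    add_special_tokens=True,      # inserts [CLS] and [SEP] for us
    max_length=max_length_test,
    padding='max_length',         # pad with the [PAD] id (0) up to max_length
    truncation=True,
    return_attention_mask=True,
)
# encode_plus returns the same three lists, keyed as the model expects them
print(encoded['input_ids'])
print(encoded['token_type_ids'])
print(encoded['attention_mask'])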