Created October 8, 2019 14:34
import torch
from transformers import BertModel, BertTokenizer

# Creating an instance of BertModel
bert_model = BertModel.from_pretrained('bert-base-uncased')

# Creating an instance of the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Specifying the maximum sequence length
T = 12

sentence = 'I really enjoyed this movie a lot.'

# Step 1: Tokenize
tokens = tokenizer.tokenize(sentence)

# Step 2: Add [CLS] and [SEP]
tokens = ['[CLS]'] + tokens + ['[SEP]']

# Step 3: Pad the tokens to length T and build the attention mask
padded_tokens = tokens + ['[PAD]' for _ in range(T - len(tokens))]
attn_mask = [1 if token != '[PAD]' else 0 for token in padded_tokens]

# Step 4: Segment ids (all zeros, since there is only one segment)
seg_ids = [0 for _ in range(len(padded_tokens))]  # Optional for single-sentence inputs

# Step 5: Get the BERT vocabulary index for each token
token_ids = tokenizer.convert_tokens_to_ids(padded_tokens)

# Converting everything to torch tensors before feeding them to bert_model
token_ids = torch.tensor(token_ids).unsqueeze(0)  # Shape: [1, 12]
attn_mask = torch.tensor(attn_mask).unsqueeze(0)  # Shape: [1, 12]
seg_ids = torch.tensor(seg_ids).unsqueeze(0)      # Shape: [1, 12]

# Feed them to BERT
hidden_reps, cls_head = bert_model(token_ids, attention_mask=attn_mask,
                                   token_type_ids=seg_ids)

print(hidden_reps.shape)
# Out: torch.Size([1, 12, 768])
print(cls_head.shape)
# Out: torch.Size([1, 768])
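As an aside (not part of the original gist): a minimal sketch, assuming a recent transformers release (>= 3.0), where the tokenizer can perform steps 1-5 in a single call:

# One-call equivalent of steps 1-5: tokenize, add [CLS]/[SEP], pad to max_length,
# and build the attention mask and token type ids
encoded = tokenizer(sentence, padding='max_length', max_length=T, return_tensors='pt')
token_ids = encoded['input_ids']        # Shape: [1, 12]
attn_mask = encoded['attention_mask']   # Shape: [1, 12]
seg_ids = encoded['token_type_ids']     # Shape: [1, 12]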
Hello Kabir, thank you for this tutorial. I've tried running this example but encountered an issue where the hidden_reps variable returns the string last_hidden_state instead of a tensor.
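A likely cause, assuming the example is run with transformers 4.x: BertModel now returns a ModelOutput object by default, so tuple-unpacking the result yields the field names (strings such as 'last_hidden_state') rather than the tensors themselves. A minimal sketch of a fix, reusing the tensors defined in the gist above, is to access the output by attribute, or to request the legacy tuple format with return_dict=False:

# Access the fields of the ModelOutput directly
outputs = bert_model(token_ids, attention_mask=attn_mask, token_type_ids=seg_ids)
hidden_reps = outputs.last_hidden_state  # Shape: [1, 12, 768]
cls_head = outputs.pooler_output         # Shape: [1, 768]

# Alternatively, keep the original unpacking by asking for plain tuples
hidden_reps, cls_head = bert_model(token_ids, attention_mask=attn_mask,
                                   token_type_ids=seg_ids, return_dict=False)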