Makefile:

# required by (\ SHELL COMMANDS \)
SHELL:=/bin/bash

VIRT_ENV_FOLDER = ~/.local/share/virtualenvs/xnli
SOURCE_VIRT_ENV = source $(VIRT_ENV_FOLDER)/bin/activate

.PHONY: train

train:
	( \
	    $(SOURCE_VIRT_ENV); \
	    python trainer.py \
	        --output_dir ./models/speed_camembert_max_len_128_fp16_dynamic_padding_smart_batching_batch_64_seed_321 \
	        --overwrite_output_dir \
	        --save_steps 0 \
	        --seed 321 \
	        --num_train_epochs 1 \
	        --learning_rate 5e-5 \
	        --per_gpu_train_batch_size 64 \
	        --gradient_accumulation_steps 1 \
	        --per_gpu_eval_batch_size 64 \
	        --max_seq_len 128 \
	        --dynamic_padding \
	        --smart_batching \
	        --fp16 \
	        --evaluate_during_training ; \
	)

# --smart_batching
# --dynamic_padding
# --fp16
trainer.py:

import logging
import os
import random
import time
from dataclasses import dataclass, field
from typing import Dict, List, Optional

import numpy as np
import torch
from torch.utils.data.dataset import Dataset, IterableDataset
from torch.utils.tensorboard import SummaryWriter
from transformers import AutoTokenizer, EvalPrediction, Trainer, HfArgumentParser, TrainingArguments, \
    AutoModelForSequenceClassification, set_seed, AutoConfig
from transformers import PreTrainedTokenizer, DataCollator, PreTrainedModel
import wandb

set_seed(123)
label_codes = {"contradictory": 0, "contradiction": 0, "neutral": 1, "entailment": 2}
wandb.init(project="speed_training")


class MyTrainer(Trainer):
    def _setup_wandb(self):
        wandb.init(project="speed_training",
                   config=vars(self.args),
                   name=self.args.output_dir)
        wandb.watch(self.model, log="gradients", log_freq=self.args.logging_steps)


@dataclass
class Example:
    text_a: str
    text_b: str
    label: int


@dataclass
class Features:
    input_ids: List[int]
    attention_mask: List[int]
    label: int


@dataclass
class ModelParameters:
    max_seq_len: Optional[int] = field(
        default=None,
        metadata={"help": "max seq len"},
    )
    dynamic_padding: bool = field(
        default=False,
        metadata={"help": "limit pad size at batch level"},
    )
    smart_batching: bool = field(
        default=False,
        metadata={"help": "build batch of similar sizes"},
    )
    dynamic_batch_size: bool = field(
        default=False,
        metadata={"help": "build batch of similar sizes"},
    )
class TextDataset(Dataset):
    def __init__(self, tokenizer: PreTrainedTokenizer, pad_to_max_length: bool, max_len: int,
                 examples: List[Example]) -> None:
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.examples: List[Example] = examples
        self.current = 0
        self.pad_to_max_length = pad_to_max_length

    def encode(self, ex: Example) -> Features:
        encode_dict = self.tokenizer.encode_plus(text=ex.text_a,
                                                 text_pair=ex.text_b,
                                                 add_special_tokens=True,
                                                 max_length=self.max_len,
                                                 pad_to_max_length=self.pad_to_max_length,
                                                 return_token_type_ids=False,
                                                 return_attention_mask=True,
                                                 return_overflowing_tokens=False,
                                                 return_special_tokens_mask=False,
                                                 )
        return Features(input_ids=encode_dict["input_ids"],
                        attention_mask=encode_dict["attention_mask"],
                        label=ex.label)

    def __getitem__(self, idx) -> Features:
        # Trainer doesn't support IterableDataset (it requires a sampler), so idx is
        # ignored on purpose: examples are served in their pre-built, length-sorted
        # order, even though the default sampler passes random indices.
        if self.current == len(self.examples):
            self.current = 0
        ex = self.examples[self.current]
        self.current += 1
        return self.encode(ex=ex)

    def __len__(self):
        return len(self.examples)


def pad_seq(seq: List[int], max_batch_len: int, pad_value: int) -> List[int]:
    return seq + (max_batch_len - len(seq)) * [pad_value]
@dataclass
class SmartCollator(DataCollator):
    pad_token_id: int

    def collate_batch(self, batch: List[Features]) -> Dict[str, torch.Tensor]:
        batch_inputs = list()
        batch_attention_masks = list()
        labels = list()
        # dynamic padding: pad to the longest sequence of this batch only
        max_size = max([len(ex.input_ids) for ex in batch])
        for item in batch:
            batch_inputs += [pad_seq(item.input_ids, max_size, self.pad_token_id)]
            batch_attention_masks += [pad_seq(item.attention_mask, max_size, 0)]
            labels.append(item.label)
        return {"input_ids": torch.tensor(batch_inputs, dtype=torch.long),
                "attention_mask": torch.tensor(batch_attention_masks, dtype=torch.long),
                "labels": torch.tensor(labels, dtype=torch.long)
                }
def load_transformers_model(pretrained_model_name_or_path: str,
                            use_cuda: bool,
                            mixed_precision: bool) -> PreTrainedModel:
    config = AutoConfig.from_pretrained(pretrained_model_name_or_path=pretrained_model_name_or_path,
                                        num_labels=3)
    model = AutoModelForSequenceClassification.from_pretrained(
        pretrained_model_name_or_path=pretrained_model_name_or_path,
        config=config)
    if use_cuda and torch.cuda.is_available():
        device = torch.device('cuda')
        model.to(device)
    if mixed_precision:
        try:
            from apex import amp
            model = amp.initialize(model, opt_level='O1')
        except ImportError:
            raise ImportError("Please install apex from https://www.github.com/nvidia/apex to use fp16 training.")
    return model
def load_train_data(path: str, sort: bool) -> List[Example]:
    sentences = list()
    with open(path) as f:
        first = False
        for line in f:
            if not first:  # skip the header line
                first = True
                continue
            text_a, text_b, label = line.rstrip().split("\t")
            length = len(text_a) + len(text_b)
            sentences.append((length, Example(text_a=text_a, text_b=text_b, label=label_codes[label])))
    if sort:
        # smart batching: sort by character length so similar-size examples end up together
        sentences.sort(key=lambda x: x[0])
    return [e for (_, e) in sentences]


def load_dev_data(path: str) -> List[Example]:
    sentences = list()
    with open(path) as f:
        for raw_line in f:
            line = raw_line.split("\t")
            if line[0] != "fr":
                continue
            text_a = line[6]
            text_b = line[7]
            label = line[1]
            length = len(text_a) + len(text_b)
            sentences.append((length, Example(text_a=text_a, text_b=text_b, label=label_codes[label])))
    sentences.sort(key=lambda x: x[0])
    return [e for (_, e) in sentences]


def build_batches(sentences: List[Example], batch_size: int) -> List[Example]:
    # take random contiguous windows of batch_size examples from the (possibly length-sorted)
    # list: batch composition is preserved while the order of the batches is shuffled
    batch_ordered_sentences = list()
    while len(sentences) > 0:
        to_take = min(batch_size, len(sentences))
        select = random.randint(0, len(sentences) - to_take)
        batch_ordered_sentences += sentences[select:select + to_take]
        del sentences[select:select + to_take]
    return batch_ordered_sentences
if __name__ == "__main__":
    parser = HfArgumentParser((TrainingArguments, ModelParameters))
    training_args, model_args = parser.parse_args_into_dataclasses()  # type: (TrainingArguments, ModelParameters)
    train_sentences = load_train_data(path="resources/XNLI-MT-1.0/multinli/multinli.train.fr.tsv",
                                      sort=model_args.smart_batching)
    tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path="camembert-base")
    if model_args.max_seq_len:
        max_sequence_len = model_args.max_seq_len
    else:
        # fall back to the token length of the longest training pair
        longest_sentence = max(train_sentences, key=lambda ex: len(ex.text_a) + len(ex.text_b))
        max_sequence_len = len(tokenizer.encode(text=longest_sentence.text_a, text_pair=longest_sentence.text_b))
    train_batches = build_batches(sentences=train_sentences, batch_size=training_args.per_gpu_train_batch_size)
    valid_sentences = load_dev_data(path="resources/XNLI-1.0/xnli.test.tsv")
    valid_batches = build_batches(sentences=valid_sentences, batch_size=training_args.per_gpu_eval_batch_size)
    train_set = TextDataset(tokenizer=tokenizer,
                            max_len=max_sequence_len,
                            examples=train_batches,
                            pad_to_max_length=not model_args.dynamic_padding)
    valid_set = TextDataset(tokenizer=tokenizer,
                            max_len=max_sequence_len,
                            examples=valid_batches,
                            pad_to_max_length=not model_args.dynamic_padding)
    model = load_transformers_model(pretrained_model_name_or_path="camembert-base",
                                    use_cuda=True,
                                    mixed_precision=False)

    def compute_metrics(p: EvalPrediction) -> Dict:
        preds = np.argmax(p.predictions, axis=1)
        return {"acc": (preds == p.label_ids).mean()}

    trainer = MyTrainer(
        model=model,
        args=training_args,
        train_dataset=train_set,
        # data_collator=IdentityCollator(pad_token_id=tokenizer.pad_token_id),
        data_collator=SmartCollator(pad_token_id=tokenizer.pad_token_id),
        tb_writer=SummaryWriter(log_dir='logs', flush_secs=10),
        eval_dataset=valid_set,
        compute_metrics=compute_metrics,
    )
    start_time = time.time()
    trainer.train()
    wandb.config.update(model_args)
    wandb.config.update(training_args)
    wandb.log({"training time": int((time.time() - start_time) / 60)})
    trainer.save_model()
    trainer.evaluate()
    logging.info("*** Evaluate ***")
    result = trainer.evaluate()
    wandb.log(result)
    output_eval_file = os.path.join(training_args.output_dir, "eval_results.txt")
    with open(output_eval_file, "w") as writer:
        logging.info("***** Eval results *****")
        for key, value in result.items():
            logging.info(" %s = %s", key, value)
            writer.write("%s = %s\n" % (key, value))
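
Side note (not part of the gist): a tiny self-contained sketch of what `--smart_batching` plus `--dynamic_padding` buy you. Sorting by length before `build_batches` means every contiguous window of `batch_size` examples has similar lengths, so padding each batch to its own maximum wastes little compute. The `toy_build_batches` helper below is hypothetical and only mirrors the windowing logic of `build_batches` on plain strings:

```python
import random
from typing import List


def toy_build_batches(items: List[str], batch_size: int) -> List[str]:
    # same random-window logic as build_batches in trainer.py, on plain strings
    ordered: List[str] = []
    while items:
        to_take = min(batch_size, len(items))
        select = random.randint(0, len(items) - to_take)
        ordered += items[select:select + to_take]
        del items[select:select + to_take]
    return ordered


sentences = ["x" * random.randint(5, 120) for _ in range(16)]
sentences.sort(key=len)  # what --smart_batching does before batching
ordered = toy_build_batches(sentences, batch_size=4)
for start in range(0, len(ordered), 4):
    batch = ordered[start:start + 4]
    # each batch only needs padding up to its own longest member
    print([len(s) for s in batch], "-> padded to", max(len(s) for s in batch))
```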
You are right!
Cool stuff, thanks for sharing :)
```
TypeError                                 Traceback (most recent call last)
<ipython-input-...> in <module>
     39
     40 @dataclass
---> 41 class SmartCollator(DataCollator):
     42     pad_token_id: int
     43

TypeError: function() argument 'code' must be code, not str
```
Hello! When I try your work, I get the exception above. I simply can't debug it; could you please tell me why it happens? Thank you.
Which version of Python are you using?
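
For anyone hitting the same `TypeError`: on recent `transformers` releases, `DataCollator` is no longer a class you can inherit from (it became a type alias), which is what makes `@dataclass class SmartCollator(DataCollator)` fail, and `Trainer` now calls the collator directly instead of calling `collate_batch`. A minimal sketch of an adapted collator, assuming a recent `transformers` and reusing `Features` and `pad_seq` from `trainer.py` above:

```python
from dataclasses import dataclass
from typing import Dict, List

import torch


@dataclass
class SmartCollator:  # no DataCollator base class needed on recent versions
    pad_token_id: int

    def __call__(self, batch: List[Features]) -> Dict[str, torch.Tensor]:
        # pad every sequence to the longest one of this batch only
        max_size = max(len(ex.input_ids) for ex in batch)
        batch_inputs = [pad_seq(ex.input_ids, max_size, self.pad_token_id) for ex in batch]
        batch_attention_masks = [pad_seq(ex.attention_mask, max_size, 0) for ex in batch]
        labels = [ex.label for ex in batch]
        return {"input_ids": torch.tensor(batch_inputs, dtype=torch.long),
                "attention_mask": torch.tensor(batch_attention_masks, dtype=torch.long),
                "labels": torch.tensor(labels, dtype=torch.long)}

    # the older transformers release this gist targets calls .collate_batch() instead
    collate_batch = __call__
```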
Hi, how can I run this code in Colab? Please help me.
You have an error on line 181. The function's return type annotation should be `List[List[Example]]`. This also applies to other functions below.
By the way, why doesn't TextDataset's `__getitem__` use the `idx` parameter in any way?
This code is very old, and I think most of these ideas are now implemented in the library itself.
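
For readers finding this now: a rough sketch (untested here, recent `transformers` assumed) of the built-in equivalents — `DataCollatorWithPadding` for dynamic padding, `group_by_length=True` as an approximation of smart batching, and native `fp16` without apex. `train_set` is a placeholder for any dataset yielding `input_ids` / `attention_mask` / `labels`:

```python
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          DataCollatorWithPadding, Trainer, TrainingArguments)

tokenizer = AutoTokenizer.from_pretrained("camembert-base")
model = AutoModelForSequenceClassification.from_pretrained("camembert-base", num_labels=3)

training_args = TrainingArguments(
    output_dir="./models/xnli_fr",      # hypothetical output path
    num_train_epochs=1,
    learning_rate=5e-5,
    per_device_train_batch_size=64,
    fp16=True,                          # native mixed precision, no apex required
    group_by_length=True,               # batches of similar lengths (smart batching)
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,            # placeholder dataset, see note above
    data_collator=DataCollatorWithPadding(tokenizer=tokenizer),  # pad per batch (dynamic padding)
)
trainer.train()
```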
Hi Michaël,
Great POC! Is it possible that you are evaluating twice (unnecessarily)? (Lines 246 and 248.)
Best,