
Commit 09a2f40

sshleifer and Pradhy729 authored
Seq2SeqDataset uses linecache to save memory by @Pradhy729 (#5792)
Co-authored-by: Pradhy729 <49659913+Pradhy729@users.noreply.github.com>
1 parent 4b506a3 commit 09a2f40

File tree: 6 files changed (+181, -169 lines)

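The commit title says the new Seq2SeqDataset reads examples through `linecache` instead of holding whole files in memory. The actual implementation lives in `examples/seq2seq/utils.py`, which is one of the six changed files but is not shown in this extract, so the snippet below is only a minimal sketch of the idea under stated assumptions: the class name `LinecacheSeq2SeqDataset`, its constructor, and the returned batch keys are illustrative, not the code from the commit.

```python
# Minimal sketch of the linecache idea (illustrative; not the utils.py code).
# linecache.getline reads one line from disk on demand and caches it, so the
# dataset never materializes the full .source/.target files in memory.
import linecache
from pathlib import Path

from torch.utils.data import Dataset


class LinecacheSeq2SeqDataset(Dataset):  # hypothetical name
    def __init__(self, tokenizer, data_dir, type_path="train", max_source_length=1024, max_target_length=56):
        self.src_file = Path(data_dir) / f"{type_path}.source"
        self.tgt_file = Path(data_dir) / f"{type_path}.target"
        self.tokenizer = tokenizer
        self.max_source_length = max_source_length
        self.max_target_length = max_target_length
        # Only the number of examples is computed up front, not the text itself.
        with open(self.src_file) as f:
            self.n_examples = sum(1 for _ in f)

    def __len__(self):
        return self.n_examples

    def __getitem__(self, index):
        # linecache is 1-indexed; lines are paged in lazily and cached.
        source_line = linecache.getline(str(self.src_file), index + 1).rstrip("\n")
        target_line = linecache.getline(str(self.tgt_file), index + 1).rstrip("\n")
        source = self.tokenizer(source_line, max_length=self.max_source_length,
                                truncation=True, padding="max_length", return_tensors="pt")
        target = self.tokenizer(target_line, max_length=self.max_target_length,
                                truncation=True, padding="max_length", return_tensors="pt")
        return {
            "input_ids": source["input_ids"].squeeze(0),
            "attention_mask": source["attention_mask"].squeeze(0),
            "decoder_input_ids": target["input_ids"].squeeze(0),
        }
```

The memory saving comes from reading one line per `__getitem__` call rather than loading CNN/DailyMail-sized `.source`/`.target` files at construction time.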

Diff for: examples/seq2seq/README.md (+14, -32)

@@ -7,27 +7,24 @@ For `bertabs` instructions, see `bertabs/README.md`.
 
 
 ### Data
-
-CNN/DailyMail data
+XSUM Data:
 ```bash
 cd examples/seq2seq
-wget https://s3.amazonaws.com/datasets.huggingface.co/summarization/cnn_dm.tgz
-tar -xzvf cnn_dm.tgz
-
-export CNN_DIR=${PWD}/cnn_dm
+wget https://s3.amazonaws.com/datasets.huggingface.co/summarization/xsum.tar.gz
+tar -xzvf xsum.tar.gz
+export XSUM_DIR=${PWD}/xsum
 ```
-
 this should make a directory called cnn_dm/ with files like `test.source`.
 To use your own data, copy that files format. Each article to be summarized is on its own line.
 
-XSUM Data:
+CNN/DailyMail data
 ```bash
 cd examples/seq2seq
-wget https://s3.amazonaws.com/datasets.huggingface.co/summarization/xsum.tar.gz
-tar -xzvf xsum.tar.gz
-export XSUM_DIR=${PWD}/xsum
-```
+wget https://s3.amazonaws.com/datasets.huggingface.co/summarization/cnn_dm.tgz
+tar -xzvf cnn_dm.tgz
 
+export CNN_DIR=${PWD}/cnn_dm
+```
 
 WMT16 English-Romanian Translation Data:
 ```bash
@@ -40,7 +37,7 @@ export ENRO_DIR=${PWD}/wmt_en_ro
 If you are using your own data, it must be formatted as one directory with 6 files: train.source, train.target, val.source, val.target, test.source, test.target.
 The `.source` files are the input, the `.target` files are the desired output.
 
-
+
 ### Tips and Tricks
 
 General Tips:
@@ -64,6 +61,10 @@ Summarization Tips:
 - If you are finetuning on your own dataset, start from `distilbart-cnn-12-6` if you want long summaries and `distilbart-xsum-12-6` if you want short summaries.
 (It rarely makes sense to start from `bart-large` unless you are a researching finetuning methods).
 
+**Update 2018-07-18**
+Datasets: Seq2SeqDataset will be used for all models besides MBart, for which MBartDataset will be used.**
+A new dataset is needed to support multilingual tasks.
+
 ### Summarization Finetuning
 Run/modify `finetune.sh`
 
@@ -78,8 +79,6 @@ The following command should work on a 16GB GPU:
     --model_name_or_path facebook/bart-large
 ```
 
-
-
 ### Translation Finetuning
 
 First, follow the wmt_en_ro download instructions.
@@ -124,23 +123,6 @@ from transformers import AutoModelForSeq2SeqLM
 model = AutoModelForSeq2SeqLM.from_pretrained(f'{output_dir}/best_tfmr')
 ```
 
-#### XSUM Shared Task
-Compare XSUM results with others by using `--logger_name wandb_shared`. This requires `wandb` registration.
-
-Here is an example command, but you can do whatever you want. Hopefully this will make debugging and collaboration easier!
-```bash
-WANDB_PROJECT='hf_xsum' ./finetune.sh \
-    --data_dir $XSUM_DIR \
-    --output_dir xsum_frozen_embs \
-    --model_name_or_path facebook/bart-large \
-    --train_batch_size 16 --eval_batch_size 16 --freeze_embeds --freeze_encoder \
-    --num_train_epochs 6 \
-    --max_target_length=60 --val_max_target_length=60 --test_max_target_length=100 \
-    --logger_name wandb
-```
-
-You can see your wandb logs [here](https://app.wandb.ai/sshleifer/hf_xsum?workspace=user-)
-
 ### Evaluation Commands
 
 To create summaries for each article in dataset, we use `run_eval.py`, here are a few commands that run eval for different tasks and models.

Diff for: examples/seq2seq/distillation.py (+4, -22)

@@ -15,28 +15,15 @@
 
 try:
     from .finetune import SummarizationModule
-    from .initialization_utils import init_student, copy_layers
-    from .utils import (
-        use_task_specific_params,
-        SummarizationDataset,
-        pickle_load,
-        freeze_params,
-        assert_all_frozen,
-        any_requires_grad,
-    )
     from .finetune import main as ft_main
+    from .initialization_utils import init_student, copy_layers
+    from .utils import use_task_specific_params, pickle_load, freeze_params, assert_all_frozen, any_requires_grad
+
 except ImportError:
     from finetune import SummarizationModule
     from finetune import main as ft_main
     from initialization_utils import init_student, copy_layers
-    from utils import (
-        use_task_specific_params,
-        SummarizationDataset,
-        pickle_load,
-        freeze_params,
-        assert_all_frozen,
-        any_requires_grad,
-    )
+    from utils import use_task_specific_params, pickle_load, freeze_params, assert_all_frozen, any_requires_grad
 
 
 class BartSummarizationDistiller(SummarizationModule):
@@ -115,11 +102,6 @@ def copy_t5_to_student(self, d_layers_to_copy, e_layers_to_copy, hparams, studen
         if self.different_encoder:
             copy_layers(teacher.encoder.block, student.encoder.block, e_layers_to_copy)
 
-    def get_dataset(self, type_path) -> SummarizationDataset:
-        n_obs = self.n_obs[type_path]
-        dataset = SummarizationDataset(self.tokenizer, type_path=type_path, n_obs=n_obs, **self.dataset_kwargs)
-        return dataset
-
     def calc_mse_loss(self, teacher_outputs: torch.Tensor, student_outputs: torch.Tensor, mask) -> torch.FloatTensor:
         if mask is not None:
             # mask has False at padding_idx

Diff for: examples/seq2seq/finetune.py (+12, -6)

@@ -21,7 +21,6 @@
     from .utils import (
         assert_all_frozen,
         use_task_specific_params,
-        SummarizationDataset,
         lmap,
         flatten_list,
         pickle_save,
@@ -32,12 +31,17 @@
         get_git_info,
         ROUGE_KEYS,
         calculate_bleu_score,
+        Seq2SeqDataset,
+        MBartDataset,
     )
+
     from .callbacks import Seq2SeqLoggingCallback, get_checkpoint_callback
 except ImportError:
     from utils import (
+        Seq2SeqDataset,
+        MBartDataset,
+        assert_all_frozen,
         use_task_specific_params,
-        SummarizationDataset,
         lmap,
         flatten_list,
         pickle_save,
@@ -48,7 +52,6 @@
         get_git_info,
         ROUGE_KEYS,
         calculate_bleu_score,
-        assert_all_frozen,
     )
     from callbacks import Seq2SeqLoggingCallback, get_checkpoint_callback
 
@@ -100,6 +103,7 @@ def __init__(self, hparams, **kwargs):
         self.hparams.git_sha = get_git_info()["repo_sha"]
         self.num_workers = hparams.num_workers
         self.decoder_start_token_id = None
+        self.dataset_class = Seq2SeqDataset
 
     def freeze_embeds(self):
         """Freeze token embeddings and positional embeddings for bart, just token embeddings for t5."""
@@ -163,7 +167,7 @@ def calc_generative_metrics(self, preds, target) -> Dict:
 
     def _generative_step(self, batch: dict) -> dict:
         pad_token_id = self.tokenizer.pad_token_id
-        source_ids, source_mask, y = SummarizationDataset.trim_seq2seq_batch(batch, pad_token_id)
+        source_ids, source_mask, y = Seq2SeqDataset.trim_seq2seq_batch(batch, pad_token_id)
         t0 = time.time()
         generated_ids = self.model.generate(
             input_ids=source_ids,
@@ -187,10 +191,10 @@ def test_step(self, batch, batch_idx):
     def test_epoch_end(self, outputs):
         return self.validation_epoch_end(outputs, prefix="test")
 
-    def get_dataset(self, type_path) -> SummarizationDataset:
+    def get_dataset(self, type_path) -> Seq2SeqDataset:
         n_obs = self.n_obs[type_path]
         max_target_length = self.target_lens[type_path]
-        dataset = SummarizationDataset(
+        dataset = self.dataset_class(
             self.tokenizer,
             type_path=type_path,
             n_obs=n_obs,
@@ -303,6 +307,8 @@ def __init__(self, hparams, **kwargs):
             self.dataset_kwargs["tgt_lang"] = hparams.tgt_lang
         if self.model.config.decoder_start_token_id is None and isinstance(self.tokenizer, MBartTokenizer):
             self.decoder_start_token_id = self.tokenizer.lang_code_to_id[hparams.tgt_lang]
+        if isinstance(self.tokenizer, MBartTokenizer):
+            self.dataset_class = MBartDataset
 
     def calc_generative_metrics(self, preds, target) -> dict:
         return calculate_bleu_score(preds, target)
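The finetune.py hunks above introduce a `self.dataset_class` attribute: SummarizationModule defaults it to `Seq2SeqDataset`, `get_dataset` builds whatever class that attribute names, and TranslationModule swaps in `MBartDataset` when the tokenizer is an `MBartTokenizer`, matching the README note. Below is a runnable toy of that dispatch pattern only; the stub classes, the `is_mbart` flag, and the simplified constructors are stand-ins, not the real module or dataset code.

```python
# Toy version of the dataset-dispatch pattern from the diff above.
# Seq2SeqDataset / MBartDataset are stubs; only the selection logic is the point.
class Seq2SeqDataset:
    def __init__(self, tokenizer, type_path):
        self.tokenizer, self.type_path = tokenizer, type_path


class MBartDataset(Seq2SeqDataset):
    pass


class SummarizationModule:
    def __init__(self, tokenizer):
        self.tokenizer = tokenizer
        self.dataset_class = Seq2SeqDataset  # default, as in __init__ above

    def get_dataset(self, type_path):
        # get_dataset no longer names a concrete class, so subclasses only
        # need to reassign self.dataset_class.
        return self.dataset_class(self.tokenizer, type_path=type_path)


class TranslationModule(SummarizationModule):
    def __init__(self, tokenizer, is_mbart=False):
        super().__init__(tokenizer)
        if is_mbart:  # the real code checks isinstance(self.tokenizer, MBartTokenizer)
            self.dataset_class = MBartDataset


print(type(TranslationModule("tok", is_mbart=True).get_dataset("train")).__name__)  # MBartDataset
```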

Diff for: examples/seq2seq/test_seq2seq_examples.py (+47, -16)

@@ -9,23 +9,25 @@
 
 import pytest
 import torch
+from pytest import param
 from torch.utils.data import DataLoader
 
-from transformers import AutoTokenizer
+from transformers import AutoTokenizer, MBartTokenizer
 from transformers.testing_utils import require_multigpu
 
 from .distillation import distill_main, evaluate_checkpoint
 from .finetune import main
 from .pack_dataset import pack_data_dir
 from .run_eval import generate_summaries_or_translations, run_generate
-from .utils import SummarizationDataset, lmap, load_json
+from .utils import MBartDataset, Seq2SeqDataset, lmap, load_json
 
 
 logging.basicConfig(level=logging.DEBUG)
 
 logger = logging.getLogger()
 CUDA_AVAILABLE = torch.cuda.is_available()
 CHEAP_ARGS = {
+    "label_smoothing_eps": 0.2,
     "logger_name": "default",
     "length_penalty": 0.5,
     "cache_dir": "",
@@ -80,11 +82,11 @@
 
 
 def _dump_articles(path: Path, articles: list):
-    with path.open("w") as f:
-        f.write("\n".join(articles))
+    content = "\n".join(articles)
+    Path(path).open("w").writelines(content)
 
 
-ARTICLES = [" Sam ate lunch today", "Sams lunch ingredients"]
+ARTICLES = [" Sam ate lunch today.", "Sams lunch ingredients."]
 SUMMARIES = ["A very interesting story about what I ate for lunch.", "Avocado, celery, turkey, coffee"]
 T5_TINY = "patrickvonplaten/t5-tiny-random"
 BART_TINY = "sshleifer/bart-tiny-random"
@@ -208,7 +210,7 @@ def test_run_eval_bart(model):
 
 
 @pytest.mark.parametrize(
-    ["model"], [pytest.param(T5_TINY), pytest.param(BART_TINY), pytest.param(MBART_TINY), pytest.param(MARIAN_TINY)]
+    ["model"], [pytest.param(T5_TINY), pytest.param(BART_TINY), pytest.param(MBART_TINY), pytest.param(MARIAN_TINY)],
 )
 def test_finetune(model):
     args_d: dict = CHEAP_ARGS.copy()
@@ -260,22 +262,50 @@ def test_pack_dataset():
     assert orig_paths == new_paths
 
 
-@pytest.mark.parametrize(
-    ["tok"], [pytest.param(T5_TINY), pytest.param(BART_TINY), pytest.param(MBART_TINY), pytest.param(MARIAN_TINY)]
-)
-def test_dataset(tok):
-    tokenizer = AutoTokenizer.from_pretrained(tok)
+def test_mbart_dataset_truncation():
+    tokenizer = MBartTokenizer.from_pretrained(MBART_TINY)
     tmp_dir = make_test_data_dir()
     max_len_source = max(len(tokenizer.encode(a)) for a in ARTICLES)
     max_len_target = max(len(tokenizer.encode(a)) for a in SUMMARIES)
-    trunc_target = 4
-    train_dataset = SummarizationDataset(
+    trunc = 4
+    src_lang, tgt_lang = "ro_RO", "de_DE"  # NOT WHAT IT WAS TRAINED ON
+    train_dataset = MBartDataset(
         tokenizer,
         data_dir=tmp_dir,
         type_path="train",
-        max_source_length=20,
-        max_target_length=trunc_target,
-        tgt_lang="ro_RO",
+        max_source_length=trunc,
+        max_target_length=1000,  # ignored
+        src_lang=src_lang,
+        tgt_lang=tgt_lang,
+    )
+    dataloader = DataLoader(train_dataset, batch_size=2, collate_fn=train_dataset.collate_fn)
+    for batch in dataloader:
+        assert isinstance(batch, dict)
+        assert batch["attention_mask"].shape == batch["input_ids"].shape
+        # show that articles were trimmed.
+        assert batch["input_ids"].shape[1] == trunc
+        # show that targets are the same len
+        assert batch["decoder_input_ids"].shape[1] == trunc
+        # check language codes in correct place
+        assert batch["decoder_input_ids"][0, 0].item() == tokenizer.lang_code_to_id[tgt_lang]
+        assert batch["decoder_input_ids"][0, -1].item() == tokenizer.eos_token_id
+        assert batch["input_ids"][0, -2].item() == tokenizer.eos_token_id
+        assert batch["input_ids"][0, -1].item() == tokenizer.lang_code_to_id[src_lang]
+
+        assert max_len_target > trunc  # Truncated
+        assert max_len_source > trunc
+        break  # No need to test every batch
+
+
+@pytest.mark.parametrize(["tok"], [pytest.param(T5_TINY), pytest.param(BART_TINY), param(MARIAN_TINY)])
+def test_summarization_dataset_truncation(tok):
+    tokenizer = AutoTokenizer.from_pretrained(tok)
+    tmp_dir = make_test_data_dir()
+    max_len_source = max(len(tokenizer.encode(a)) for a in ARTICLES)
+    max_len_target = max(len(tokenizer.encode(a)) for a in SUMMARIES)
+    trunc_target = 4
+    train_dataset = Seq2SeqDataset(
+        tokenizer, data_dir=tmp_dir, type_path="train", max_source_length=20, max_target_length=trunc_target,
     )
     dataloader = DataLoader(train_dataset, batch_size=2, collate_fn=train_dataset.collate_fn)
     for batch in dataloader:
@@ -286,3 +316,4 @@ def test_dataset(tok):
         # show that targets were truncated
         assert batch["decoder_input_ids"].shape[1] == trunc_target  # Truncated
         assert max_len_target > trunc_target  # Truncated
+        break  # No need to test every batch
