Commit 239dd23

[Follow up 213]
Masked indices should have -1 and not -100. Updating documentation + scripts that were forgotten
1 parent 522c5b5 commit 239dd23

14 files changed: +37 −37 lines changed
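
As a quick reference for why the label value matters: whatever sentinel is written into the labels has to match the ignore index the loss criterion is built with. A minimal standalone sketch (not code from this commit) using PyTorch's nn.CrossEntropyLoss with the -1 convention adopted here; PyTorch's own default ignore_index is -100, so it is passed explicitly:

import torch
import torch.nn as nn

vocab_size = 10
logits = torch.randn(4, vocab_size)    # scores for 4 token positions
labels = torch.tensor([3, -1, 7, -1])  # -1 marks positions with nothing to predict

# The ignore index must match the sentinel used in the labels; PyTorch defaults to -100.
loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
loss = loss_fct(logits, labels)        # averaged over the two real targets only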

examples/contrib/run_openai_gpt.py (+1 −1)

@@ -81,7 +81,7 @@ def pre_process_datasets(encoded_datasets, input_len, cap_length, start_token, d
         n_batch = len(dataset)
         input_ids = np.zeros((n_batch, 2, input_len), dtype=np.int64)
         mc_token_ids = np.zeros((n_batch, 2), dtype=np.int64)
-        lm_labels = np.full((n_batch, 2, input_len), fill_value=-100, dtype=np.int64)
+        lm_labels = np.full((n_batch, 2, input_len), fill_value=-1, dtype=np.int64)
         mc_labels = np.zeros((n_batch,), dtype=np.int64)
         for i, (story, cont1, cont2, mc_label), in enumerate(dataset):
             with_cont1 = [start_token] + story[:cap_length] + [delimiter_token] + cont1[:cap_length] + [clf_token]

examples/distillation/distiller.py (+5 −5)

@@ -109,7 +109,7 @@ def __init__(
         self.last_log = 0

         self.ce_loss_fct = nn.KLDivLoss(reduction="batchmean")
-        self.lm_loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
+        self.lm_loss_fct = nn.CrossEntropyLoss()
         if self.alpha_mse > 0.0:
             self.mse_loss_fct = nn.MSELoss(reduction="sum")
         if self.alpha_cos > 0.0:
@@ -200,7 +200,7 @@ def prepare_batch_mlm(self, batch):
         -------
             token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM.
             attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
-            mlm_labels: `torch.tensor(bs, seq_length)` - The masked languge modeling labels. There is a -100 where there is nothing to predict.
+            mlm_labels: `torch.tensor(bs, seq_length)` - The masked languge modeling labels. There is a -1 where there is nothing to predict.
         """
         token_ids, lengths = batch
         token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
@@ -244,7 +244,7 @@ def prepare_batch_mlm(self, batch):
         )
         token_ids = token_ids.masked_scatter(pred_mask, _token_ids)

-        mlm_labels[~pred_mask] = -100  # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility
+        mlm_labels[~pred_mask] = -1  # previously `mlm_labels[1-pred_mask] = -1`, cf pytorch 1.2.0 compatibility

         # sanity checks
         assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size
@@ -265,15 +265,15 @@ def prepare_batch_clm(self, batch):
         -------
             token_ids: `torch.tensor(bs, seq_length)` - The token ids after the modifications for MLM.
             attn_mask: `torch.tensor(bs, seq_length)` - The attention mask for the self-attention.
-            clm_labels: `torch.tensor(bs, seq_length)` - The causal languge modeling labels. There is a -100 where there is nothing to predict.
+            clm_labels: `torch.tensor(bs, seq_length)` - The causal languge modeling labels. There is a -1 where there is nothing to predict.
         """
         token_ids, lengths = batch
         token_ids, lengths = self.round_batch(x=token_ids, lengths=lengths)
         assert token_ids.size(0) == lengths.size(0)

         attn_mask = torch.arange(token_ids.size(1), dtype=torch.long, device=lengths.device) < lengths[:, None]
         clm_labels = token_ids.new(token_ids.size()).copy_(token_ids)
-        clm_labels[~attn_mask] = -100  # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility
+        clm_labels[~attn_mask] = -1  # previously `clm_labels[1-attn_mask] = -1`, cf pytorch 1.2.0 compatibility

         # sanity checks
         assert 0 <= token_ids.min() <= token_ids.max() < self.vocab_size

examples/run_lm_finetuning.py (+1 −1)

@@ -206,7 +206,7 @@ def mask_tokens(inputs: torch.Tensor, tokenizer: PreTrainedTokenizer, args) -> T
         padding_mask = labels.eq(tokenizer.pad_token_id)
         probability_matrix.masked_fill_(padding_mask, value=0.0)
     masked_indices = torch.bernoulli(probability_matrix).bool()
-    labels[~masked_indices] = -100  # We only compute loss on masked tokens
+    labels[~masked_indices] = -1  # We only compute loss on masked tokens

     # 80% of the time, we replace masked input tokens with tokenizer.mask_token ([MASK])
     indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
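
A hedged, self-contained sketch of the masking recipe this hunk belongs to (the helper name mask_tokens_sketch and its arguments are illustrative; the real mask_tokens also zeroes out special tokens and padding via the tokenizer before sampling):

import torch

def mask_tokens_sketch(inputs, mask_token_id, vocab_size, mlm_probability=0.15):
    # inputs: LongTensor of token ids; modified in place, as in the original helper.
    labels = inputs.clone()
    probability_matrix = torch.full(labels.shape, mlm_probability)
    masked_indices = torch.bernoulli(probability_matrix).bool()
    labels[~masked_indices] = -1  # loss is only computed on masked positions

    # 80% of masked positions are replaced with the [MASK] token id
    indices_replaced = torch.bernoulli(torch.full(labels.shape, 0.8)).bool() & masked_indices
    inputs[indices_replaced] = mask_token_id

    # 10% get a random token, the remaining 10% keep the original token
    indices_random = torch.bernoulli(torch.full(labels.shape, 0.5)).bool() & masked_indices & ~indices_replaced
    inputs[indices_random] = torch.randint(vocab_size, labels.shape, dtype=torch.long)[indices_random]
    return inputs, labels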

src/transformers/modeling_albert.py (+2 −2)

@@ -632,8 +632,8 @@ def forward(
         r"""
         masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for computing the masked language modeling loss.
-            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with
+            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with
             labels in ``[0, ..., config.vocab_size]``

         Returns:

src/transformers/modeling_bert.py (+7 −7)

@@ -846,8 +846,8 @@ def forward(
         r"""
         masked_lm_labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
             Labels for computing the masked language modeling loss.
-            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
+            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``
         next_sentence_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
             Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring)
@@ -948,13 +948,13 @@ def forward(
         r"""
         masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for computing the masked language modeling loss.
-            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
+            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``
         lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for computing the left-to-right language modeling loss (next word prediction).
-            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
+            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``

         Returns:
@@ -1015,7 +1015,7 @@ def forward(
         # 2. If `lm_labels` is provided we are in a causal scenario where we
         # try to predict the next token for each input in the decoder.
         if masked_lm_labels is not None:
-            loss_fct = CrossEntropyLoss()  # -100 index = padding token
+            loss_fct = CrossEntropyLoss()  # -1 index = padding token
             masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
             outputs = (masked_lm_loss,) + outputs
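
The loss call in the last hunk flattens scores and labels before handing them to the criterion; a standalone sketch of that reshape with toy shapes, passing ignore_index=-1 explicitly since PyTorch's default is -100 (the hunk above relies on however CrossEntropyLoss is configured):

import torch
import torch.nn as nn

bs, seq_len, vocab_size = 2, 6, 30522
prediction_scores = torch.randn(bs, seq_len, vocab_size)
masked_lm_labels = torch.full((bs, seq_len), -1, dtype=torch.long)
masked_lm_labels[0, 2] = 1012  # only a few positions carry real targets
masked_lm_labels[1, 4] = 2003

loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
masked_lm_loss = loss_fct(prediction_scores.view(-1, vocab_size), masked_lm_labels.view(-1))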

src/transformers/modeling_ctrl.py (+2 −2)

@@ -479,8 +479,8 @@ def forward(
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
-            All labels set to ``-100`` are ignored (masked), the loss is only
+            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
+            All labels set to ``-1`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``

         Return:

src/transformers/modeling_distilbert.py (+2 −2)

@@ -517,8 +517,8 @@ def forward(self, input_ids=None, attention_mask=None, head_mask=None, inputs_em
         r"""
         masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for computing the masked language modeling loss.
-            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
+            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``

         Returns:

src/transformers/modeling_gpt2.py (+3 −3)

@@ -547,8 +547,8 @@ def forward(
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
-            All labels set to ``-100`` are ignored (masked), the loss is only
+            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
+            All labels set to ``-1`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``

         Return:
@@ -655,7 +655,7 @@ def forward(
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
             Indices are selected in ``[-1, 0, ..., config.vocab_size]``
-            All labels set to ``-100`` are ignored (masked), the loss is only
+            All labels set to ``-1`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``
         mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`, defaults to :obj:`None`)
             Labels for computing the multiple choice classification loss.

src/transformers/modeling_openai.py (+3 −3)

@@ -516,8 +516,8 @@ def forward(
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
-            All labels set to ``-100`` are ignored (masked), the loss is only
+            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
+            All labels set to ``-1`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``

         Return:
@@ -621,7 +621,7 @@ def forward(
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
             Indices are selected in ``[-1, 0, ..., config.vocab_size]``
-            All labels set to ``-100`` are ignored (masked), the loss is only
+            All labels set to ``-1`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``
         mc_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size)`, `optional`, defaults to :obj:`None`)
             Labels for computing the multiple choice classification loss.

src/transformers/modeling_roberta.py (+2 −2)

@@ -200,8 +200,8 @@ def forward(
         r"""
         masked_lm_labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for computing the masked language modeling loss.
-            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
-            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
+            Indices should be in ``[-1, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring)
+            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``

         Returns:

src/transformers/modeling_t5.py (+3 −3)

@@ -802,8 +802,8 @@ class T5WithLMHeadModel(T5PreTrainedModel):
     r"""
         **lm_labels**: (`optional`) ``torch.LongTensor`` of shape ``(batch_size, sequence_length)``:
             Labels for computing the masked language modeling loss.
-            Indices should either be in ``[0, ..., config.vocab_size]`` or -100 (see ``input_ids`` docstring).
-            Tokens with indices set to ``-100`` are ignored (masked), the loss is only computed for the tokens with labels
+            Indices should either be in ``[0, ..., config.vocab_size]`` or -1 (see ``input_ids`` docstring).
+            Tokens with indices set to ``-1`` are ignored (masked), the loss is only computed for the tokens with labels
             in ``[0, ..., config.vocab_size]``.

     Outputs: `Tuple` comprising various elements depending on the configuration (config) and inputs:
@@ -906,7 +906,7 @@ def forward(self, **kwargs):
         if lm_labels is not None:
             shift_logits = lm_logits[..., :-1, :].contiguous()
             shift_labels = lm_labels[..., 1:].contiguous()
-            loss_fct = CrossEntropyLoss(ignore_index=-100)
+            loss_fct = CrossEntropyLoss()
             loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
             decoder_outputs = (
                 loss,
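
For the shift in the second hunk: position i predicts token i+1, so the last logit column and the first label column are dropped before the loss. A standalone sketch with toy shapes; ignore_index=-1 is passed explicitly here as an assumption matching the -1 label convention, whereas the hunk itself constructs CrossEntropyLoss with its default:

import torch
import torch.nn as nn

bs, seq_len, vocab_size = 2, 5, 32128
lm_logits = torch.randn(bs, seq_len, vocab_size)
lm_labels = torch.randint(vocab_size, (bs, seq_len))

shift_logits = lm_logits[..., :-1, :].contiguous()
shift_labels = lm_labels[..., 1:].contiguous()
loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))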

src/transformers/modeling_transfo_xl.py (+2 −2)

@@ -858,8 +858,8 @@ def forward(self, input_ids=None, mems=None, head_mask=None, inputs_embeds=None,
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
-            All labels set to ``-100`` are ignored (masked), the loss is only
+            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
+            All labels set to ``-1`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``

         Return:

src/transformers/modeling_xlm.py (+2 −2)

@@ -667,8 +667,8 @@ def forward(
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
-            All labels set to ``-100`` are ignored (masked), the loss is only
+            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
+            All labels set to ``-1`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``

         Return:

src/transformers/modeling_xlnet.py (+2 −2)

@@ -993,8 +993,8 @@ def forward(
         labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size, sequence_length)`, `optional`, defaults to :obj:`None`):
             Labels for language modeling.
             Note that the labels **are shifted** inside the model, i.e. you can set ``lm_labels = input_ids``
-            Indices are selected in ``[-100, 0, ..., config.vocab_size]``
-            All labels set to ``-100`` are ignored (masked), the loss is only
+            Indices are selected in ``[-1, 0, ..., config.vocab_size]``
+            All labels set to ``-1`` are ignored (masked), the loss is only
             computed for labels in ``[0, ..., config.vocab_size]``

         Return:
