
Commit 8bf7312

Add AlbertForPreTraining and TFAlbertForPreTraining models. (#4057)
* Add AlbertForPreTraining and TFAlbertForPreTraining models.
* PyTorch conversion
* TensorFlow conversion
* style

Co-authored-by: Lysandre <lysandre.debut@reseau.eseo.fr>
1 parent c99fe03 commit 8bf7312

9 files changed: +263 -14 lines

src/transformers/__init__.py (+2)

@@ -287,6 +287,7 @@
     from .modeling_albert import (
         AlbertPreTrainedModel,
         AlbertModel,
+        AlbertForPreTraining,
         AlbertForMaskedLM,
         AlbertForSequenceClassification,
         AlbertForQuestionAnswering,
@@ -490,6 +491,7 @@
         TFAlbertPreTrainedModel,
         TFAlbertMainLayer,
         TFAlbertModel,
+        TFAlbertForPreTraining,
         TFAlbertForMaskedLM,
         TFAlbertForSequenceClassification,
         TFAlbertForQuestionAnswering,
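With these exports in place, both heads are importable from the package root. A minimal import check (a sketch; assumes this commit of transformers is installed with both PyTorch and TensorFlow available):

# Minimal sanity check for the new top-level exports.
from transformers import AlbertForPreTraining, TFAlbertForPreTraining

print(AlbertForPreTraining.__module__)    # transformers.modeling_albert
print(TFAlbertForPreTraining.__module__)  # transformers.modeling_tf_albert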

src/transformers/convert_albert_original_tf_checkpoint_to_pytorch.py (+2 -2)

@@ -20,7 +20,7 @@

 import torch

-from transformers import AlbertConfig, AlbertForMaskedLM, load_tf_weights_in_albert
+from transformers import AlbertConfig, AlbertForPreTraining, load_tf_weights_in_albert


 logging.basicConfig(level=logging.INFO)
@@ -30,7 +30,7 @@ def convert_tf_checkpoint_to_pytorch(tf_checkpoint_path, albert_config_file, pyt
     # Initialise PyTorch model
     config = AlbertConfig.from_json_file(albert_config_file)
     print("Building PyTorch model from configuration: {}".format(str(config)))
-    model = AlbertForMaskedLM(config)
+    model = AlbertForPreTraining(config)

     # Load weights from tf checkpoint
     load_tf_weights_in_albert(model, config, tf_checkpoint_path)
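The conversion script now instantiates the full pre-training model, so the TF checkpoint's MLM and SOP weights both land in the exported PyTorch model. Roughly the same flow as a standalone sketch (the file names are hypothetical placeholders; the real script takes them as CLI arguments and also saves the state dict at the end):

# Sketch of the updated conversion flow; paths below are hypothetical placeholders.
import torch
from transformers import AlbertConfig, AlbertForPreTraining, load_tf_weights_in_albert

config = AlbertConfig.from_json_file("albert_config.json")
model = AlbertForPreTraining(config)  # previously AlbertForMaskedLM, which dropped the SOP weights

# Copy the TF checkpoint tensors into the PyTorch module, SOP head included.
load_tf_weights_in_albert(model, config, "model.ckpt-best")

torch.save(model.state_dict(), "pytorch_model.bin")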

src/transformers/convert_pytorch_checkpoint_to_tf2.py (+5 -5)

@@ -46,7 +46,7 @@
     OpenAIGPTConfig,
     RobertaConfig,
     T5Config,
-    TFAlbertForMaskedLM,
+    TFAlbertForPreTraining,
     TFBertForPreTraining,
     TFBertForQuestionAnswering,
     TFBertForSequenceClassification,
@@ -109,7 +109,7 @@
         DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
         CTRLLMHeadModel,
         CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
-        AlbertForMaskedLM,
+        AlbertForPreTraining,
         ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
         T5ForConditionalGeneration,
         T5_PRETRAINED_MODEL_ARCHIVE_MAP,
@@ -148,7 +148,7 @@
         DISTILBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
         CTRLLMHeadModel,
         CTRL_PRETRAINED_MODEL_ARCHIVE_MAP,
-        AlbertForMaskedLM,
+        AlbertForPreTraining,
         ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
         T5ForConditionalGeneration,
         T5_PRETRAINED_MODEL_ARCHIVE_MAP,
@@ -318,8 +318,8 @@
     ),
     "albert": (
         AlbertConfig,
-        TFAlbertForMaskedLM,
-        AlbertForMaskedLM,
+        TFAlbertForPreTraining,
+        AlbertForPreTraining,
         ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
         ALBERT_PRETRAINED_CONFIG_ARCHIVE_MAP,
     ),
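The "albert" entry of the script's MODEL_CLASSES table now pairs TFAlbertForPreTraining with AlbertForPreTraining, so PyTorch ALBERT checkpoints are converted to TF2 through the full pre-training architecture. Outside this script, the same pairing can be exercised with the generic cross-framework loader; a sketch, with placeholder local paths:

# Load a local PyTorch ALBERT pre-training checkpoint into the TF2 class and
# re-save it as a TF2 checkpoint (directory names are hypothetical placeholders).
from transformers import TFAlbertForPreTraining

tf_model = TFAlbertForPreTraining.from_pretrained("./albert-pytorch-checkpoint", from_pt=True)
tf_model.save_pretrained("./albert-tf2-checkpoint")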

src/transformers/modeling_albert.py (+124 -1)

@@ -111,7 +111,8 @@ def load_tf_weights_in_albert(model, config, tf_checkpoint_path):

         # No ALBERT model currently handles the next sentence prediction task
         if "seq_relationship" in name:
-            continue
+            name = name.replace("seq_relationship/output_", "sop_classifier/classifier/")
+            name = name.replace("weights", "weight")

         name = name.split("/")

@@ -568,6 +569,115 @@ def forward(
         return outputs


+@add_start_docstrings(
+    """Albert Model with two heads on top as done during the pre-training: a `masked language modeling` head and
+    a `sentence order prediction (classification)` head. """,
+    ALBERT_START_DOCSTRING,
+)
+class AlbertForPreTraining(AlbertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.albert = AlbertModel(config)
+        self.predictions = AlbertMLMHead(config)
+        self.sop_classifier = AlbertSOPHead(config)
+
+        self.init_weights()
+        self.tie_weights()
+
+    def tie_weights(self):
+        self._tie_or_clone_weights(self.predictions.decoder, self.albert.embeddings.word_embeddings)
+
+    def get_output_embeddings(self):
+        return self.predictions.decoder
+
+    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        masked_lm_labels=None,
+        sentence_order_label=None,
+    ):
+        r"""
+        masked_lm_labels (``torch.LongTensor`` of shape ``(batch_size, sequence_length)``, `optional`, defaults to :obj:`None`):
+            Labels for computing the masked language modeling loss.
+            Indices should be in ``[-100, 0, ..., config.vocab_size]`` (see ``input_ids`` docstring).
+            Tokens with indices set to ``-100`` are ignored (masked); the loss is only computed for the tokens with labels
+            in ``[0, ..., config.vocab_size]``.
+        sentence_order_label (``torch.LongTensor`` of shape ``(batch_size,)``, `optional`, defaults to :obj:`None`):
+            Labels for computing the sentence order prediction (classification) loss. Input should be a sequence pair (see :obj:`input_ids` docstring).
+            Indices should be in ``[0, 1]``.
+            ``0`` indicates original order (sequence A, then sequence B),
+            ``1`` indicates switched order (sequence B, then sequence A).
+
+    Returns:
+        :obj:`tuple(torch.FloatTensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
+        loss (`optional`, returned when both ``masked_lm_labels`` and ``sentence_order_label`` are provided) ``torch.FloatTensor`` of shape ``(1,)``:
+            Total loss as the sum of the masked language modeling loss and the sentence order prediction (classification) loss.
+        prediction_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        sop_scores (:obj:`torch.FloatTensor` of shape :obj:`(batch_size, 2)`):
+            Prediction scores of the sentence order prediction (classification) head (scores of original vs. switched
+            order before SoftMax).
+        hidden_states (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
+            Tuple of :obj:`torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (:obj:`tuple(torch.FloatTensor)`, `optional`, returned when ``config.output_attentions=True``):
+            Tuple of :obj:`torch.FloatTensor` (one for each layer) of shape
+            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+
+    Examples::
+
+        from transformers import AlbertTokenizer, AlbertForPreTraining
+        import torch
+
+        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
+        model = AlbertForPreTraining.from_pretrained('albert-base-v2')
+
+        input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
+        outputs = model(input_ids)
+
+        prediction_scores, sop_scores = outputs[:2]
+
+        """
+
+        outputs = self.albert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+        )
+
+        sequence_output, pooled_output = outputs[:2]
+
+        prediction_scores = self.predictions(sequence_output)
+        sop_scores = self.sop_classifier(pooled_output)
+
+        outputs = (prediction_scores, sop_scores,) + outputs[2:]  # add hidden states and attention if they are here
+
+        if masked_lm_labels is not None and sentence_order_label is not None:
+            loss_fct = CrossEntropyLoss()
+            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), masked_lm_labels.view(-1))
+            sentence_order_loss = loss_fct(sop_scores.view(-1, 2), sentence_order_label.view(-1))
+            total_loss = masked_lm_loss + sentence_order_loss
+            outputs = (total_loss,) + outputs
+
+        return outputs  # (loss), prediction_scores, sop_scores, (hidden_states), (attentions)
+
+
 class AlbertMLMHead(nn.Module):
     def __init__(self, config):
         super().__init__()
@@ -592,6 +702,19 @@ def forward(self, hidden_states):
         return prediction_scores


+class AlbertSOPHead(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+
+        self.dropout = nn.Dropout(config.classifier_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+    def forward(self, pooled_output):
+        dropout_pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(dropout_pooled_output)
+        return logits
+
+
 @add_start_docstrings(
     "Albert Model with a `language modeling` head on top.", ALBERT_START_DOCSTRING,
 )
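Beyond the docstring example above, the new head can be driven with both label sets to get the combined MLM + SOP loss. A sketch of that path (the labels here are crude stand-ins, not a real masking pipeline, and the public albert-base-v2 checkpoint may not ship trained SOP weights):

# Exercise the combined loss path of AlbertForPreTraining (labels are stand-ins).
import torch
from transformers import AlbertTokenizer, AlbertForPreTraining

tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
model = AlbertForPreTraining.from_pretrained("albert-base-v2")

encoding = tokenizer.encode_plus("First segment.", "Second segment.", return_tensors="pt")

masked_lm_labels = encoding["input_ids"].clone()  # real pre-training would mask ~15% of tokens
sentence_order_label = torch.tensor([0])          # 0 = original order, 1 = switched order

total_loss, prediction_scores, sop_scores = model(
    encoding["input_ids"],
    attention_mask=encoding["attention_mask"],
    token_type_ids=encoding["token_type_ids"],
    masked_lm_labels=masked_lm_labels,
    sentence_order_label=sentence_order_label,
)[:3]
total_loss.backward()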

src/transformers/modeling_auto.py (+2 -1)

@@ -43,6 +43,7 @@
 from .modeling_albert import (
     ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
     AlbertForMaskedLM,
+    AlbertForPreTraining,
     AlbertForQuestionAnswering,
     AlbertForSequenceClassification,
     AlbertForTokenClassification,
@@ -189,7 +190,7 @@
     [
         (T5Config, T5ForConditionalGeneration),
         (DistilBertConfig, DistilBertForMaskedLM),
-        (AlbertConfig, AlbertForMaskedLM),
+        (AlbertConfig, AlbertForPreTraining),
         (CamembertConfig, CamembertForMaskedLM),
         (XLMRobertaConfig, XLMRobertaForMaskedLM),
         (BartConfig, BartForConditionalGeneration),
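Because MODEL_WITH_LM_HEAD_MAPPING now points AlbertConfig at AlbertForPreTraining, the auto class resolves ALBERT checkpoints to the pre-training head at this commit:

# AutoModelWithLMHead now yields the pre-training model for ALBERT checkpoints.
from transformers import AutoModelWithLMHead

model = AutoModelWithLMHead.from_pretrained("albert-base-v2")
print(type(model).__name__)  # AlbertForPreTraining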

src/transformers/modeling_tf_albert.py (+67 -1)

@@ -475,7 +475,6 @@ def call(self, hidden_states):
         hidden_states = self.activation(hidden_states)
         hidden_states = self.LayerNorm(hidden_states)
         hidden_states = self.decoder(hidden_states, mode="linear") + self.decoder_bias
-        hidden_states = hidden_states + self.bias
         return hidden_states


@@ -718,6 +717,73 @@ def call(self, inputs, **kwargs):
         return outputs


+@add_start_docstrings(
+    """Albert Model with two heads on top for pre-training:
+    a `masked language modeling` head and a `sentence order prediction` (classification) head. """,
+    ALBERT_START_DOCSTRING,
+)
+class TFAlbertForPreTraining(TFAlbertPreTrainedModel):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.albert = TFAlbertMainLayer(config, name="albert")
+        self.predictions = TFAlbertMLMHead(config, self.albert.embeddings, name="predictions")
+        self.sop_classifier = TFAlbertSOPHead(config, name="sop_classifier")
+
+    def get_output_embeddings(self):
+        return self.albert.embeddings
+
+    @add_start_docstrings_to_callable(ALBERT_INPUTS_DOCSTRING)
+    def call(self, inputs, **kwargs):
+        r"""
+    Return:
+        :obj:`tuple(tf.Tensor)` comprising various elements depending on the configuration (:class:`~transformers.AlbertConfig`) and inputs:
+        prediction_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        sop_scores (:obj:`tf.Tensor` of shape :obj:`(batch_size, 2)`):
+            Prediction scores of the sentence order prediction (classification) head (scores of original vs. switched order before SoftMax).
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when :obj:`config.output_hidden_states=True`):
+            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer)
+            of shape :obj:`(batch_size, sequence_length, hidden_size)`.
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``config.output_attentions=True``):
+            Tuple of :obj:`tf.Tensor` (one for each layer) of shape
+            :obj:`(batch_size, num_heads, sequence_length, sequence_length)`.
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention heads.
+
+    Examples::
+
+        import tensorflow as tf
+        from transformers import AlbertTokenizer, TFAlbertForPreTraining
+
+        tokenizer = AlbertTokenizer.from_pretrained('albert-base-v2')
+        model = TFAlbertForPreTraining.from_pretrained('albert-base-v2')
+
+        input_ids = tf.constant(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True))[None, :]  # Batch size 1
+        outputs = model(input_ids)
+
+        prediction_scores, sop_scores = outputs[:2]
+        """
+
+        outputs = self.albert(inputs, **kwargs)
+        sequence_output, pooled_output = outputs[:2]
+        prediction_scores = self.predictions(sequence_output)
+        sop_scores = self.sop_classifier(pooled_output, training=kwargs.get("training", False))
+        outputs = (prediction_scores, sop_scores) + outputs[2:]
+        return outputs
+
+
+class TFAlbertSOPHead(tf.keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+
+        self.dropout = tf.keras.layers.Dropout(config.classifier_dropout_prob)
+        self.classifier = tf.keras.layers.Dense(
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier",
+        )
+
+    def call(self, pooled_output, training: bool):
+        dropout_pooled_output = self.dropout(pooled_output, training=training)
+        logits = self.classifier(dropout_pooled_output)
+        return logits
+
+
 @add_start_docstrings("""Albert Model with a `language modeling` head on top. """, ALBERT_START_DOCSTRING)
 class TFAlbertForMaskedLM(TFAlbertPreTrainedModel):
     def __init__(self, config, *inputs, **kwargs):
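A short inference sketch for the TF class, interpreting the SOP logits; the softmax step is illustrative, and the public checkpoint may initialize the SOP head randomly, so the probabilities themselves are not meaningful:

# Run TFAlbertForPreTraining on a sentence pair and turn the SOP logits into probabilities.
import tensorflow as tf
from transformers import AlbertTokenizer, TFAlbertForPreTraining

tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
model = TFAlbertForPreTraining.from_pretrained("albert-base-v2")

inputs = tokenizer.encode_plus("First segment.", "Second segment.", return_tensors="tf")
prediction_scores, sop_scores = model(inputs)[:2]

sop_probs = tf.nn.softmax(sop_scores, axis=-1)  # [P(original order), P(switched order)]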

src/transformers/modeling_tf_auto.py (+3 -2)

@@ -36,6 +36,7 @@
 from .modeling_tf_albert import (
     TF_ALBERT_PRETRAINED_MODEL_ARCHIVE_MAP,
     TFAlbertForMaskedLM,
+    TFAlbertForPreTraining,
     TFAlbertForQuestionAnswering,
     TFAlbertForSequenceClassification,
     TFAlbertModel,
@@ -132,7 +133,7 @@
     [
         (T5Config, TFT5ForConditionalGeneration),
         (DistilBertConfig, TFDistilBertForMaskedLM),
-        (AlbertConfig, TFAlbertForMaskedLM),
+        (AlbertConfig, TFAlbertForPreTraining),
         (RobertaConfig, TFRobertaForMaskedLM),
         (BertConfig, TFBertForPreTraining),
         (OpenAIGPTConfig, TFOpenAIGPTLMHeadModel),
@@ -412,7 +413,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *model_args, **kwargs):
             in the `pretrained_model_name_or_path` string (in the following order):
                 - contains `t5`: :class:`~transformers.TFT5ModelWithLMHead` (T5 model)
                 - contains `distilbert`: :class:`~transformers.TFDistilBertForMaskedLM` (DistilBERT model)
-                - contains `albert`: :class:`~transformers.TFAlbertForMaskedLM` (ALBERT model)
+                - contains `albert`: :class:`~transformers.TFAlbertForPreTraining` (ALBERT model)
                 - contains `roberta`: :class:`~transformers.TFRobertaForMaskedLM` (RoBERTa model)
                 - contains `bert`: :class:`~transformers.TFBertForPreTraining` (Bert model)
                 - contains `openai-gpt`: :class:`~transformers.TFOpenAIGPTLMHeadModel` (OpenAI GPT model)
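Mirroring the PyTorch auto mapping, TFAutoModelWithLMHead now resolves ALBERT checkpoints to TFAlbertForPreTraining at this commit:

# The TF auto class now yields the pre-training model for ALBERT checkpoints.
from transformers import TFAutoModelWithLMHead

model = TFAutoModelWithLMHead.from_pretrained("albert-base-v2")
print(type(model).__name__)  # TFAlbertForPreTraining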
