
Commit 37ed3ab

Authored May 13, 2021
Enable option for subword regularization in more tokenizers. (#11417)
* improve slow class tok usage at xlm rob
* add subword regularization for barthez
* improve barthez tok. test
* fix tokenizer tests
* add subword regularization for camembert
* add subword regularization for deberta v2 tokenizer
* add more doc to deberta v2 tokenizer
* add subword regularization for speech to text tok.
* fix sp_model_kwargs type in speech 2 text tok.
* add subword regularization for M2M100 tok.
* add more concrete type hints
* fix tests for m2m100 and s2t tok.
* add missing Any import
* fix syntax error in m2m100 tok.
* fix unpickle of m2m100 and s2t tok.
* fix test of m2m100 and s2t tok.
* improve unpickle of deberta v2 tok.
* add test for pickle of barthez & camembert
* fix pickle of barthez & camembert
* add test for deberta v2 tok. pickle
* fix m2m100 tok. pickle
* fix s2t tok. pickle
* add subword regularization to albert tok.
* refactor subword reg. test into TokenizerTesterMixin
  improve albert tok. test
  remove sample argument form albert tok.
  check subword reg. using TokenizerTesterMixin
  improve tok. tests
  improve xlm roberta tok. tests
  improve xlm roberta tok. tests
* add subword regularization for big bird t.
* improve xlm roberta tok. test
* add subword regularization for mbart50 tok.
* add subword regularization for pegasus tok.
* add subword regularization for reformer tok.
* add subword regularization for T5 tok.
* fix t5 tok. test formatting
* add subword regularization for xlm_proph. tok.
* add subword regularization for xlnet tok.
* add subword regularization for gert_gen tok.
* add typing to tokenizers
* add typing to xlm rob. tok
* add subword regularization for marian tok.
* add reverse tok. test
* fix marian tok test
* fix marian tok test
* fix casing in tok. tests
* fix style of tok. common test
* fix deberta v2 tok test
* add type annotations to tok. tests
* add type annotations to tok. __init__
* add typing to kokenizer
* add type annotations to tok. __init__
* don't specify the default when it's None
* fix barthez tok. doc
* move sentencepiece tok. tests to TokenizerTesterMixin
* fix unused imports
* fix albert tok. test
* add comment to sentencepiece test options
* fix Any import at big bird tok.
* fix Any import at xlm prophetnet tok.
* empty commit to trigger CI
1 parent fa84540 · commit 37ed3ab

33 files changed, +578 −181 lines changed
 

‎src/transformers/models/albert/tokenization_albert.py

+29 −10
@@ -18,7 +18,7 @@
 import os
 import unicodedata
 from shutil import copyfile
-from typing import List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple

 import sentencepiece as spm

@@ -102,6 +102,20 @@ class AlbertTokenizer(PreTrainedTokenizer):
         mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
+        sp_model_kwargs (:obj:`dict`, `optional`):
+            Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
+            <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+
+            - ``enable_sampling``: Enable subword regularization.
+            - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+              - ``nbest_size = {0,1}``: No sampling is performed.
+              - ``nbest_size > 1``: samples from the nbest_size results.
+              - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+                using forward-filtering-and-backward-sampling algorithm.
+
+            - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.

     Attributes:
         sp_model (:obj:`SentencePieceProcessor`):
@@ -125,11 +139,14 @@ def __init__(
         pad_token="<pad>",
         cls_token="[CLS]",
         mask_token="[MASK]",
+        sp_model_kwargs: Optional[Dict[str, Any]] = None,
         **kwargs
-    ):
+    ) -> None:
         # Mask token behave like a normal word, i.e. include the space before it
         mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+
         super().__init__(
             do_lower_case=do_lower_case,
             remove_space=remove_space,
@@ -141,6 +158,7 @@ def __init__(
             pad_token=pad_token,
             cls_token=cls_token,
             mask_token=mask_token,
+            sp_model_kwargs=self.sp_model_kwargs,
             **kwargs,
         )

@@ -149,7 +167,7 @@ def __init__(
         self.keep_accents = keep_accents
         self.vocab_file = vocab_file

-        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
         self.sp_model.Load(vocab_file)

     @property
@@ -168,7 +186,12 @@ def __getstate__(self):

     def __setstate__(self, d):
         self.__dict__ = d
-        self.sp_model = spm.SentencePieceProcessor()
+
+        # for backward compatibility
+        if not hasattr(self, "sp_model_kwargs"):
+            self.sp_model_kwargs = {}
+
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
         self.sp_model.Load(self.vocab_file)

     def preprocess_text(self, inputs):
@@ -186,14 +209,10 @@ def preprocess_text(self, inputs):

         return outputs

-    def _tokenize(self, text, sample=False):
+    def _tokenize(self, text: str) -> List[str]:
         """Tokenize a string."""
         text = self.preprocess_text(text)
-
-        if not sample:
-            pieces = self.sp_model.EncodeAsPieces(text)
-        else:
-            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
+        pieces = self.sp_model.encode(text, out_type=str)
         new_pieces = []
         for piece in pieces:
             if len(piece) > 1 and piece[-1] == str(",") and piece[-2].isdigit():
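The new ``sp_model_kwargs`` argument documented above can be exercised as follows. This is a minimal sketch, not part of the commit; the checkpoint name and sampling values are illustrative.

from transformers import AlbertTokenizer

# Sketch: forward sp_model_kwargs through from_pretrained to enable subword
# regularization (sampling) in the slow tokenizer. Checkpoint name is illustrative.
tokenizer = AlbertTokenizer.from_pretrained(
    "albert-base-v2",
    sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1},
)

# With sampling enabled, repeated calls may segment the same text differently.
print(tokenizer.tokenize("Subword regularization samples a segmentation."))
print(tokenizer.tokenize("Subword regularization samples a segmentation."))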

‎src/transformers/models/barthez/tokenization_barthez.py

+29 −6
@@ -17,7 +17,7 @@

 import os
 from shutil import copyfile
-from typing import List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple

 import sentencepiece as spm

@@ -89,6 +89,20 @@ class BarthezTokenizer(PreTrainedTokenizer):
             modeling. This is the token which the model will try to predict.
         additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
             Additional special tokens used by the tokenizer.
+        sp_model_kwargs (:obj:`dict`, `optional`):
+            Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
+            <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+
+            - ``enable_sampling``: Enable subword regularization.
+            - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+              - ``nbest_size = {0,1}``: No sampling is performed.
+              - ``nbest_size > 1``: samples from the nbest_size results.
+              - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+                using forward-filtering-and-backward-sampling algorithm.
+
+            - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.

     Attributes:
         sp_model (:obj:`SentencePieceProcessor`):
@@ -110,11 +124,14 @@ def __init__(
         unk_token="<unk>",
         pad_token="<pad>",
         mask_token="<mask>",
+        sp_model_kwargs: Optional[Dict[str, Any]] = None,
         **kwargs
-    ):
+    ) -> None:
         # Mask token behave like a normal word, i.e. include the space before it
         mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+
         super().__init__(
             bos_token=bos_token,
             eos_token=eos_token,
@@ -123,11 +140,12 @@ def __init__(
             cls_token=cls_token,
             pad_token=pad_token,
             mask_token=mask_token,
+            sp_model_kwargs=self.sp_model_kwargs,
             **kwargs,
         )

         self.vocab_file = vocab_file
-        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
         self.sp_model.Load(str(vocab_file))

         self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}
@@ -219,8 +237,8 @@ def get_vocab(self):
         vocab.update(self.added_tokens_encoder)
         return vocab

-    def _tokenize(self, text):
-        return self.sp_model.EncodeAsPieces(text)
+    def _tokenize(self, text: str) -> List[str]:
+        return self.sp_model.encode(text, out_type=str)

     def _convert_token_to_id(self, token):
         """Converts a token (str) in an id using the vocab."""
@@ -243,7 +261,12 @@ def __getstate__(self):

     def __setstate__(self, d):
         self.__dict__ = d
-        self.sp_model = spm.SentencePieceProcessor()
+
+        # for backward compatibility
+        if not hasattr(self, "sp_model_kwargs"):
+            self.sp_model_kwargs = {}
+
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
         self.sp_model.Load(self.vocab_file)

     def convert_tokens_to_string(self, tokens):
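The pickling fix above can be sanity-checked by round-tripping the tokenizer and confirming that ``sp_model_kwargs`` survives. A sketch, not from the commit; the checkpoint name is illustrative.

import pickle

from transformers import BarthezTokenizer

# Sketch: the backward-compatible __setstate__ above restores the SentencePiece
# processor with the same sp_model_kwargs after unpickling.
tokenizer = BarthezTokenizer.from_pretrained(
    "moussaKam/barthez",  # illustrative checkpoint name
    sp_model_kwargs={"enable_sampling": True, "nbest_size": -1, "alpha": 0.1},
)

restored = pickle.loads(pickle.dumps(tokenizer))
assert restored.sp_model_kwargs == tokenizer.sp_model_kwargs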

‎src/transformers/models/bert_generation/tokenization_bert_generation.py

+29 −10
@@ -17,7 +17,7 @@

 import os
 from shutil import copyfile
-from typing import List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple

 import sentencepiece as spm

@@ -58,6 +58,20 @@ class BertGenerationTokenizer(PreTrainedTokenizer):
             token instead.
         pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
             The token used for padding, for example when batching sequences of different lengths.
+        sp_model_kwargs (:obj:`dict`, `optional`):
+            Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
+            <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:
+
+            - ``enable_sampling``: Enable subword regularization.
+            - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+              - ``nbest_size = {0,1}``: No sampling is performed.
+              - ``nbest_size > 1``: samples from the nbest_size results.
+              - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+                using forward-filtering-and-backward-sampling algorithm.
+
+            - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.
     """

     vocab_files_names = VOCAB_FILES_NAMES
@@ -74,21 +88,25 @@ def __init__(
         unk_token="<unk>",
         pad_token="<pad>",
         sep_token="<::::>",
+        sp_model_kwargs: Optional[Dict[str, Any]] = None,
         **kwargs
-    ):
+    ) -> None:
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+
         # Add extra_ids to the special token list
         super().__init__(
             bos_token=bos_token,
             eos_token=eos_token,
             unk_token=unk_token,
             pad_token=pad_token,
             sep_token=sep_token,
+            sp_model_kwargs=self.sp_model_kwargs,
             **kwargs,
         )

         self.vocab_file = vocab_file

-        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
         self.sp_model.Load(vocab_file)

     @property
@@ -107,16 +125,17 @@ def __getstate__(self):

     def __setstate__(self, d):
         self.__dict__ = d
-        self.sp_model = spm.SentencePieceProcessor()
+
+        # for backward compatibility
+        if not hasattr(self, "sp_model_kwargs"):
+            self.sp_model_kwargs = {}
+
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
         self.sp_model.Load(self.vocab_file)

-    def _tokenize(self, text, sample=False):
+    def _tokenize(self, text: str) -> List[str]:
         """Take as input a string and return a list of strings (tokens) for words/sub-words"""
-        if not sample:
-            pieces = self.sp_model.EncodeAsPieces(text)
-        else:
-            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
-        return pieces
+        return self.sp_model.encode(text, out_type=str)

     def _convert_token_to_id(self, token):
         """Converts a token (str) in an id using the vocab."""

‎src/transformers/models/big_bird/tokenization_big_bird.py

+28 −10
@@ -17,7 +17,7 @@

 import os
 from shutil import copyfile
-from typing import List, Optional, Tuple
+from typing import Any, Dict, List, Optional, Tuple

 import sentencepiece as spm

@@ -74,7 +74,20 @@ class BigBirdTokenizer(PreTrainedTokenizer):
         mask_token (:obj:`str`, `optional`, defaults to :obj:`"[MASK]"`):
             The token used for masking values. This is the token used when training this model with masked language
             modeling. This is the token which the model will try to predict.
+        sp_model_kwargs (:obj:`dict`, `optional`):
+            Will be passed to the ``SentencePieceProcessor.__init__()`` method. The `Python wrapper for SentencePiece
+            <https://github.com/google/sentencepiece/tree/master/python>`__ can be used, among other things, to set:

+            - ``enable_sampling``: Enable subword regularization.
+            - ``nbest_size``: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+              - ``nbest_size = {0,1}``: No sampling is performed.
+              - ``nbest_size > 1``: samples from the nbest_size results.
+              - ``nbest_size < 0``: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+                using forward-filtering-and-backward-sampling algorithm.
+
+            - ``alpha``: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.
     """

     vocab_files_names = VOCAB_FILES_NAMES
@@ -93,8 +106,9 @@ def __init__(
         sep_token="[SEP]",
         mask_token="[MASK]",
         cls_token="[CLS]",
+        sp_model_kwargs: Optional[Dict[str, Any]] = None,
         **kwargs
-    ):
+    ) -> None:
         bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
         eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
         unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
@@ -105,6 +119,8 @@ def __init__(
         # Mask token behave like a normal word, i.e. include the space before it
         mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+
         super().__init__(
             bos_token=bos_token,
             eos_token=eos_token,
@@ -113,12 +129,13 @@ def __init__(
             sep_token=sep_token,
             mask_token=mask_token,
             cls_token=cls_token,
+            sp_model_kwargs=self.sp_model_kwargs,
             **kwargs,
         )

         self.vocab_file = vocab_file

-        self.sp_model = spm.SentencePieceProcessor()
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
         self.sp_model.Load(vocab_file)

     @property
@@ -137,16 +154,17 @@ def __getstate__(self):

     def __setstate__(self, d):
         self.__dict__ = d
-        self.sp_model = spm.SentencePieceProcessor()
+
+        # for backward compatibility
+        if not hasattr(self, "sp_model_kwargs"):
+            self.sp_model_kwargs = {}
+
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
         self.sp_model.Load(self.vocab_file)

-    def _tokenize(self, text, sample=False):
+    def _tokenize(self, text: str) -> List[str]:
         """Take as input a string and return a list of strings (tokens) for words/sub-words"""
-        if not sample:
-            pieces = self.sp_model.EncodeAsPieces(text)
-        else:
-            pieces = self.sp_model.SampleEncodeAsPieces(text, 64, 0.1)
-        return pieces
+        return self.sp_model.encode(text, out_type=str)

     def _convert_token_to_id(self, token):
         """Converts a token (str) in an id using the vocab."""
