Commit 21ca148

is_pretokenized -> is_split_into_words (#7236)
* is_pretokenized -> is_split_into_words
* Fix tests
1 parent 324f361 commit 21ca148

9 files changed: +144 -74 lines changed

docs/source/custom_datasets.rst

+3 -3
@@ -324,7 +324,7 @@ which we'll use in a moment:
     id2tag = {id: tag for tag, id in tag2id.items()}

 To encode the tokens, we'll use a pre-trained DistilBert tokenizer. We can tell the tokenizer that we're dealing
-with ready-split tokens rather than full sentence strings by passing ``is_pretokenized=True``. We'll also pass
+with ready-split tokens rather than full sentence strings by passing ``is_split_into_words=True``. We'll also pass
 ``padding=True`` and ``truncation=True`` to pad the sequences to be the same length. Lastly, we can tell the model
 to return information about the tokens which are split by the wordpiece tokenization process, which we will need in
 a moment.
@@ -333,8 +333,8 @@ a moment.

     from transformers import DistilBertTokenizerFast
     tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
-    train_encodings = tokenizer(train_texts, is_pretokenized=True, return_offsets_mapping=True, padding=True, truncation=True)
-    val_encodings = tokenizer(val_texts, is_pretokenized=True, return_offsets_mapping=True, padding=True, truncation=True)
+    train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
+    val_encodings = tokenizer(val_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

 Great, so now our tokens are nicely encoded in the format that they need to be in to feed them into our DistilBert
 model below.
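
For quick reference, a runnable version of the updated tutorial call might look like the following. This is a minimal sketch: the word lists below are made up for illustration and are not the tutorial's training data.

    # Sketch: encode word-split sentences for token classification with the new argument name.
    from transformers import DistilBertTokenizerFast

    # Illustrative pre-split sentences (stand-ins for the tutorial's train_texts).
    train_texts = [["HuggingFace", "is", "based", "in", "NYC"],
                   ["I", "love", "Paris"]]

    tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-cased")
    train_encodings = tokenizer(
        train_texts,
        is_split_into_words=True,     # inputs are lists of words, not raw strings
        return_offsets_mapping=True,  # needed later to align labels with wordpieces
        padding=True,
        truncation=True,
    )
    print(train_encodings["input_ids"][0])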

docs/source/preprocessing.rst

+6 -6
@@ -290,12 +290,12 @@ predictions in `named entity recognition (NER) <https://en.wikipedia.org/wiki/Na
 if that was the case) but just split into words (which is often the first step in subword tokenization algorithms
 like BPE).

-If you want to use pre-tokenized inputs, just set :obj:`is_pretokenized=True` when passing your inputs to the
+If you want to use pre-tokenized inputs, just set :obj:`is_split_into_words=True` when passing your inputs to the
 tokenizer. For instance, we have:

 .. code-block::

-    >>> encoded_input = tokenizer(["Hello", "I'm", "a", "single", "sentence"], is_pretokenized=True)
+    >>> encoded_input = tokenizer(["Hello", "I'm", "a", "single", "sentence"], is_split_into_words=True)
     >>> print(encoded_input)
     {'input_ids': [101, 8667, 146, 112, 182, 170, 1423, 5650, 102],
      'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0],
@@ -312,7 +312,7 @@ like this:
     batch_sentences = [["Hello", "I'm", "a", "single", "sentence"],
                        ["And", "another", "sentence"],
                        ["And", "the", "very", "very", "last", "one"]]
-    encoded_inputs = tokenizer(batch_sentences, is_pretokenized=True)
+    encoded_inputs = tokenizer(batch_sentences, is_split_into_words=True)

 or a batch of pair sentences like this:

@@ -321,7 +321,7 @@ or a batch of pair sentences like this:
     batch_of_second_sentences = [["I'm", "a", "sentence", "that", "goes", "with", "the", "first", "sentence"],
                                  ["And", "I", "should", "be", "encoded", "with", "the", "second", "sentence"],
                                  ["And", "I", "go", "with", "the", "very", "last", "one"]]
-    encoded_inputs = tokenizer(batch_sentences, batch_of_second_sentences, is_pretokenized=True)
+    encoded_inputs = tokenizer(batch_sentences, batch_of_second_sentences, is_split_into_words=True)

 And you can add padding, truncation as well as directly return tensors like before:

@@ -330,14 +330,14 @@ And you can add padding, truncation as well as directly return tensors like befo
     ## PYTORCH CODE
     batch = tokenizer(batch_sentences,
                       batch_of_second_sentences,
-                      is_pretokenized=True,
+                      is_split_into_words=True,
                       padding=True,
                       truncation=True,
                       return_tensors="pt")
     ## TENSORFLOW CODE
     batch = tokenizer(batch_sentences,
                       batch_of_second_sentences,
-                      is_pretokenized=True,
+                      is_split_into_words=True,
                       padding=True,
                       truncation=True,
                       return_tensors="tf")
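
A self-contained sketch of the pair-batch call above, under the assumption of a fast BERT checkpoint such as bert-base-cased (the doc snippet itself does not name one):

    # Sketch: pre-tokenized sentence pairs, padded/truncated, returned as PyTorch tensors.
    from transformers import BertTokenizerFast

    tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")  # assumed checkpoint

    batch_sentences = [["Hello", "I'm", "a", "single", "sentence"],
                       ["And", "another", "sentence"],
                       ["And", "the", "very", "very", "last", "one"]]
    batch_of_second_sentences = [["I'm", "a", "sentence", "that", "goes", "with", "the", "first", "sentence"],
                                 ["And", "I", "should", "be", "encoded", "with", "the", "second", "sentence"],
                                 ["And", "I", "go", "with", "the", "very", "last", "one"]]

    batch = tokenizer(batch_sentences,
                      batch_of_second_sentences,
                      is_split_into_words=True,
                      padding=True,
                      truncation=True,
                      return_tensors="pt")
    print(batch["input_ids"].shape)  # (3, length of the longest padded pair)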

src/transformers/tokenization_gpt2.py

+30 -10
@@ -17,6 +17,7 @@

 import json
 import os
+import warnings
 from functools import lru_cache

 import regex as re
@@ -121,7 +122,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):

     .. note::

-        When used with ``is_pretokenized=True``, this tokenizer will add a space before each word (even the first one).
+        When used with ``is_split_into_words=True``, this tokenizer will add a space before each word (even the first one).

     This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
     should refer to the superclass for more information regarding methods.
@@ -288,9 +289,16 @@ def save_vocabulary(self, save_directory):

         return vocab_file, merge_file

-    def prepare_for_tokenization(self, text, is_pretokenized=False, **kwargs):
+    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
+        if "is_pretokenized" in kwargs:
+            warnings.warn(
+                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
+                FutureWarning,
+            )
+            is_split_into_words = kwargs.pop("is_pretokenized")
+
         add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
-        if is_pretokenized or add_prefix_space:
+        if is_split_into_words or add_prefix_space:
             text = " " + text
         return (text, kwargs)

@@ -317,7 +325,7 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):

     .. note::

-        When used with ``is_pretokenized=True``, this tokenizer needs to be instantiated with
+        When used with ``is_split_into_words=True``, this tokenizer needs to be instantiated with
         ``add_prefix_space=True``.

     This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
@@ -377,19 +385,31 @@ def __init__(
         self.add_prefix_space = add_prefix_space

     def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
-
-        is_pretokenized = kwargs.get("is_pretokenized", False)
-        assert self.add_prefix_space or not is_pretokenized, (
+        if "is_pretokenized" in kwargs:
+            warnings.warn(
+                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
+                FutureWarning,
+            )
+            is_split_into_words = kwargs.pop("is_pretokenized")
+
+        is_split_into_words = kwargs.get("is_split_into_words", False)
+        assert self.add_prefix_space or not is_split_into_words, (
             f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
             "to use it with pretokenized inputs."
         )

         return super()._batch_encode_plus(*args, **kwargs)

     def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
-
-        is_pretokenized = kwargs.get("is_pretokenized", False)
-        assert self.add_prefix_space or not is_pretokenized, (
+        if "is_pretokenized" in kwargs:
+            warnings.warn(
+                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
+                FutureWarning,
+            )
+            is_split_into_words = kwargs.pop("is_pretokenized")
+
+        is_split_into_words = kwargs.get("is_split_into_words", False)
+        assert self.add_prefix_space or not is_split_into_words, (
            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
            "to use it with pretokenized inputs."
        )
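
The assertions above keep the existing constraint under the new name: pre-split input to the fast GPT-2 tokenizer still requires ``add_prefix_space=True`` at instantiation. A minimal sketch of the valid call:

    # Sketch: GPT2TokenizerFast with word-split input, new argument name.
    from transformers import GPT2TokenizerFast

    # Without add_prefix_space=True, _encode_plus/_batch_encode_plus raise the
    # assertion shown in the diff above when is_split_into_words=True is passed.
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", add_prefix_space=True)

    encoded = tokenizer(["Hello", "world"], is_split_into_words=True)
    print(encoded["input_ids"])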

src/transformers/tokenization_roberta.py

+12 -5
@@ -14,7 +14,7 @@
 # limitations under the License.
 """Tokenization classes for RoBERTa."""

-
+import warnings
 from typing import List, Optional

 from tokenizers.processors import RobertaProcessing
@@ -81,7 +81,7 @@ class RobertaTokenizer(GPT2Tokenizer):

     .. note::

-        When used with ``is_pretokenized=True``, this tokenizer will add a space before each word (even the first one).
+        When used with ``is_split_into_words=True``, this tokenizer will add a space before each word (even the first one).

     This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
     should refer to the superclass for more information regarding methods.
@@ -251,9 +251,16 @@ def create_token_type_ids_from_sequences(
             return len(cls + token_ids_0 + sep) * [0]
         return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

-    def prepare_for_tokenization(self, text, is_pretokenized=False, **kwargs):
+    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
+        if "is_pretokenized" in kwargs:
+            warnings.warn(
+                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
+                FutureWarning,
+            )
+            is_split_into_words = kwargs.pop("is_pretokenized")
+
         add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
-        if (is_pretokenized or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
+        if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
             text = " " + text
         return (text, kwargs)

@@ -280,7 +287,7 @@ class RobertaTokenizerFast(GPT2TokenizerFast):

     .. note::

-        When used with ``is_pretokenized=True``, this tokenizer needs to be instantiated with
+        When used with ``is_split_into_words=True``, this tokenizer needs to be instantiated with
         ``add_prefix_space=True``.

     This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the methods. Users
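
The RoBERTa changes mirror the GPT-2 ones: the slow tokenizer's ``prepare_for_tokenization`` prepends a space to word-split input, and the fast tokenizer keeps the ``add_prefix_space=True`` requirement. A small sketch under the new name (the roberta-base checkpoint is assumed for illustration):

    # Sketch: RoBERTa slow and fast tokenizers with word-split input.
    from transformers import RobertaTokenizer, RobertaTokenizerFast

    slow = RobertaTokenizer.from_pretrained("roberta-base")
    fast = RobertaTokenizerFast.from_pretrained("roberta-base", add_prefix_space=True)

    words = ["Hello", "world"]
    print(slow(words, is_split_into_words=True)["input_ids"])
    print(fast(words, is_split_into_words=True)["input_ids"])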

src/transformers/tokenization_utils.py

+36 -11
@@ -19,6 +19,7 @@
 import itertools
 import re
 import unicodedata
+import warnings
 from typing import Any, Dict, List, Optional, Tuple, Union, overload

 from .file_utils import add_end_docstrings
@@ -250,6 +251,12 @@ def tokenize(self, text: TextInput, **kwargs) -> List[str]:
         Returns:
             :obj:`List[str]`: The list of tokens.
         """
+        if "is_pretokenized" in kwargs:
+            warnings.warn(
+                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
+                FutureWarning,
+            )
+            kwargs["is_split_into_words"] = kwargs.pop("is_pretokenized")
         # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors
         all_special_tokens_extended = dict(
             (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
@@ -402,7 +409,7 @@ def _encode_plus(
         truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
         max_length: Optional[int] = None,
         stride: int = 0,
-        is_pretokenized: bool = False,
+        is_split_into_words: bool = False,
         pad_to_multiple_of: Optional[int] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
@@ -419,17 +426,19 @@ def get_input_ids(text):
                 tokens = self.tokenize(text, **kwargs)
                 return self.convert_tokens_to_ids(tokens)
             elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
-                if is_pretokenized:
-                    tokens = list(itertools.chain(*(self.tokenize(t, is_pretokenized=True, **kwargs) for t in text)))
+                if is_split_into_words:
+                    tokens = list(
+                        itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text))
+                    )
                     return self.convert_tokens_to_ids(tokens)
                 else:
                     return self.convert_tokens_to_ids(text)
             elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
                 return text
             else:
-                if is_pretokenized:
+                if is_split_into_words:
                     raise ValueError(
-                        f"Input {text} is not valid. Should be a string or a list/tuple of strings when `is_pretokenized=True`."
+                        f"Input {text} is not valid. Should be a string or a list/tuple of strings when `is_split_into_words=True`."
                     )
                 else:
                     raise ValueError(
@@ -445,6 +454,13 @@ def get_input_ids(text):
                 "https://github.com/huggingface/transformers/pull/2674"
             )

+        if "is_pretokenized" in kwargs:
+            warnings.warn(
+                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
+                FutureWarning,
+            )
+            is_split_into_words = kwargs.pop("is_pretokenized")
+
         first_ids = get_input_ids(text)
         second_ids = get_input_ids(text_pair) if text_pair is not None else None

@@ -482,7 +498,7 @@ def _batch_encode_plus(
         truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
         max_length: Optional[int] = None,
         stride: int = 0,
-        is_pretokenized: bool = False,
+        is_split_into_words: bool = False,
         pad_to_multiple_of: Optional[int] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
@@ -499,8 +515,10 @@ def get_input_ids(text):
                 tokens = self.tokenize(text, **kwargs)
                 return self.convert_tokens_to_ids(tokens)
             elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
-                if is_pretokenized:
-                    tokens = list(itertools.chain(*(self.tokenize(t, is_pretokenized=True, **kwargs) for t in text)))
+                if is_split_into_words:
+                    tokens = list(
+                        itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text))
+                    )
                     return self.convert_tokens_to_ids(tokens)
                 else:
                     return self.convert_tokens_to_ids(text)
@@ -518,11 +536,18 @@ def get_input_ids(text):
                 "transformers.PreTrainedTokenizerFast."
             )

+        if "is_pretokenized" in kwargs:
+            warnings.warn(
+                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
+                FutureWarning,
+            )
+            is_split_into_words = kwargs.pop("is_pretokenized")
+
         input_ids = []
         for ids_or_pair_ids in batch_text_or_text_pairs:
             if not isinstance(ids_or_pair_ids, (list, tuple)):
                 ids, pair_ids = ids_or_pair_ids, None
-            elif is_pretokenized and not isinstance(ids_or_pair_ids[0], (list, tuple)):
+            elif is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple)):
                 ids, pair_ids = ids_or_pair_ids, None
             else:
                 ids, pair_ids = ids_or_pair_ids
@@ -616,7 +641,7 @@ def _batch_prepare_for_model(
         return batch_outputs

     def prepare_for_tokenization(
-        self, text: str, is_pretokenized: bool = False, **kwargs
+        self, text: str, is_split_into_words: bool = False, **kwargs
     ) -> Tuple[str, Dict[str, Any]]:
         """
         Performs any necessary transformations before tokenization.
@@ -627,7 +652,7 @@ def prepare_for_tokenization(
         Args:
             test (:obj:`str`):
                 The text to prepare.
-            is_pretokenized (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            is_split_into_words (:obj:`bool`, `optional`, defaults to :obj:`False`):
                 Whether or not the text has been pretokenized.
             kwargs:
                 Keyword arguments to use for the tokenization.