@@ -19,6 +19,7 @@
 import itertools
 import re
 import unicodedata
+import warnings
 from typing import Any, Dict, List, Optional, Tuple, Union, overload
 
 from .file_utils import add_end_docstrings
@@ -250,6 +251,12 @@ def tokenize(self, text: TextInput, **kwargs) -> List[str]:
         Returns:
             :obj:`List[str]`: The list of tokens.
         """
+        if "is_pretokenized" in kwargs:
+            warnings.warn(
+                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
+                FutureWarning,
+            )
+            kwargs["is_split_into_words"] = kwargs.pop("is_pretokenized")
         # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors
         all_special_tokens_extended = dict(
             (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
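
Note: the shim above keeps the deprecated kwarg working while emitting a FutureWarning. A minimal sketch of the resulting behavior, assuming a slow (pure-Python) tokenizer such as BertTokenizer with the bert-base-uncased checkpoint:

    import warnings

    from transformers import BertTokenizer

    tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        # The deprecated kwarg still works ...
        old = tokenizer.tokenize("hello world", is_pretokenized=True)

    # ... but now raises a FutureWarning and maps onto the new name.
    assert any(issubclass(w.category, FutureWarning) for w in caught)
    assert old == tokenizer.tokenize("hello world", is_split_into_words=True)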
@@ -402,7 +409,7 @@ def _encode_plus(
         truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
         max_length: Optional[int] = None,
         stride: int = 0,
-        is_pretokenized: bool = False,
+        is_split_into_words: bool = False,
         pad_to_multiple_of: Optional[int] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
@@ -419,17 +426,19 @@ def get_input_ids(text):
                 tokens = self.tokenize(text, **kwargs)
                 return self.convert_tokens_to_ids(tokens)
             elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
-                if is_pretokenized:
-                    tokens = list(itertools.chain(*(self.tokenize(t, is_pretokenized=True, **kwargs) for t in text)))
+                if is_split_into_words:
+                    tokens = list(
+                        itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text))
+                    )
                     return self.convert_tokens_to_ids(tokens)
                 else:
                     return self.convert_tokens_to_ids(text)
             elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
                 return text
             else:
-                if is_pretokenized:
+                if is_split_into_words:
                     raise ValueError(
-                        f"Input {text} is not valid. Should be a string or a list/tuple of strings when `is_pretokenized=True`."
+                        f"Input {text} is not valid. Should be a string or a list/tuple of strings when `is_split_into_words=True`."
                     )
                 else:
                     raise ValueError(
@@ -445,6 +454,13 @@ def get_input_ids(text):
                 "https://github.com/huggingface/transformers/pull/2674"
             )
 
+        if "is_pretokenized" in kwargs:
+            warnings.warn(
+                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
+                FutureWarning,
+            )
+            is_split_into_words = kwargs.pop("is_pretokenized")
+
         first_ids = get_input_ids(text)
         second_ids = get_input_ids(text_pair) if text_pair is not None else None
 
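
For reference, the new `is_split_into_words` branch in `get_input_ids` above treats the input as a list of words, tokenizes each word separately, and chains the sub-tokens. A rough stand-alone equivalent, assuming `tokenizer` is the slow tokenizer from the previous sketch:

    import itertools

    words = ["Hello", "world"]
    tokens = list(
        itertools.chain(*(tokenizer.tokenize(w, is_split_into_words=True) for w in words))
    )
    ids = tokenizer.convert_tokens_to_ids(tokens)

    # Should match the high-level call (no special tokens added):
    assert ids == tokenizer.encode(words, is_split_into_words=True, add_special_tokens=False)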
@@ -482,7 +498,7 @@ def _batch_encode_plus(
         truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
         max_length: Optional[int] = None,
         stride: int = 0,
-        is_pretokenized: bool = False,
+        is_split_into_words: bool = False,
         pad_to_multiple_of: Optional[int] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
@@ -499,8 +515,10 @@ def get_input_ids(text):
                 tokens = self.tokenize(text, **kwargs)
                 return self.convert_tokens_to_ids(tokens)
             elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
-                if is_pretokenized:
-                    tokens = list(itertools.chain(*(self.tokenize(t, is_pretokenized=True, **kwargs) for t in text)))
+                if is_split_into_words:
+                    tokens = list(
+                        itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text))
+                    )
                     return self.convert_tokens_to_ids(tokens)
                 else:
                     return self.convert_tokens_to_ids(text)
@@ -518,11 +536,18 @@ def get_input_ids(text):
                 "transformers.PreTrainedTokenizerFast."
             )
 
+        if "is_pretokenized" in kwargs:
+            warnings.warn(
+                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
+                FutureWarning,
+            )
+            is_split_into_words = kwargs.pop("is_pretokenized")
+
         input_ids = []
         for ids_or_pair_ids in batch_text_or_text_pairs:
             if not isinstance(ids_or_pair_ids, (list, tuple)):
                 ids, pair_ids = ids_or_pair_ids, None
-            elif is_pretokenized and not isinstance(ids_or_pair_ids[0], (list, tuple)):
+            elif is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple)):
                 ids, pair_ids = ids_or_pair_ids, None
             else:
                 ids, pair_ids = ids_or_pair_ids
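
Note on the branch above: with `is_split_into_words=True`, a batch entry whose first element is a string is read as one pre-split sequence, while an entry whose first element is itself a list/tuple is read as a `(text, text_pair)` pair. A sketch, reusing the `tokenizer` from the earlier sketches:

    batch = [
        ["Hello", "world"],                    # first element is a str -> single sequence
        (["Hello", "world"], ["How", "are"]),  # first element is a list -> (text, text_pair)
    ]
    encoded = tokenizer.batch_encode_plus(batch, is_split_into_words=True)
    assert len(encoded["input_ids"]) == 2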
@@ -616,7 +641,7 @@ def _batch_prepare_for_model(
         return batch_outputs
 
     def prepare_for_tokenization(
-        self, text: str, is_pretokenized: bool = False, **kwargs
+        self, text: str, is_split_into_words: bool = False, **kwargs
     ) -> Tuple[str, Dict[str, Any]]:
         """
         Performs any necessary transformations before tokenization.
@@ -627,7 +652,7 @@ def prepare_for_tokenization(
         Args:
             text (:obj:`str`):
                 The text to prepare.
-            is_pretokenized (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            is_split_into_words (:obj:`bool`, `optional`, defaults to :obj:`False`):
                 Whether or not the text has been pretokenized.
             kwargs:
                 Keyword arguments to use for the tokenization.
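
Subclasses customize this hook by overriding it with the renamed kwarg. A sketch with a hypothetical LowercaseTokenizer (not part of the library; a usable subclass would also need vocabulary handling such as `_tokenize`):

    from typing import Any, Dict, Tuple

    from transformers import PreTrainedTokenizer

    class LowercaseTokenizer(PreTrainedTokenizer):
        def prepare_for_tokenization(
            self, text: str, is_split_into_words: bool = False, **kwargs
        ) -> Tuple[str, Dict[str, Any]]:
            # Lowercase the raw text; return unconsumed kwargs so tokenize() can warn on leftovers.
            return (text.lower(), kwargs)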