Commit 21ca148

is_pretokenized -> is_split_into_words (#7236)
* is_pretokenized -> is_split_into_words
* Fix tests
1 parent 324f361 commit 21ca148

9 files changed: +144 -74 lines changed

docs/source/custom_datasets.rst

+3 -3
@@ -324,7 +324,7 @@ which we'll use in a moment:
     id2tag = {id: tag for tag, id in tag2id.items()}

 To encode the tokens, we'll use a pre-trained DistilBert tokenizer. We can tell the tokenizer that we're dealing
-with ready-split tokens rather than full sentence strings by passing ``is_pretokenized=True``. We'll also pass
+with ready-split tokens rather than full sentence strings by passing ``is_split_into_words=True``. We'll also pass
 ``padding=True`` and ``truncation=True`` to pad the sequences to be the same length. Lastly, we can tell the model
 to return information about the tokens which are split by the wordpiece tokenization process, which we will need in
 a moment.
@@ -333,8 +333,8 @@ a moment.

     from transformers import DistilBertTokenizerFast
     tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')
-    train_encodings = tokenizer(train_texts, is_pretokenized=True, return_offsets_mapping=True, padding=True, truncation=True)
-    val_encodings = tokenizer(val_texts, is_pretokenized=True, return_offsets_mapping=True, padding=True, truncation=True)
+    train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
+    val_encodings = tokenizer(val_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

 Great, so now our tokens are nicely encoded in the format that they need to be in to feed them into our DistilBert
 model below.
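
For quick reference, a runnable version of the updated tutorial call might look like the following. This is a minimal sketch: the word lists below are made up for illustration and are not the tutorial's training data.

    # Sketch: encode word-split sentences for token classification with the new argument name.
    from transformers import DistilBertTokenizerFast

    # Illustrative pre-split sentences (stand-ins for the tutorial's train_texts).
    train_texts = [["HuggingFace", "is", "based", "in", "NYC"],
                   ["I", "love", "Paris"]]

    tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-cased")
    train_encodings = tokenizer(
        train_texts,
        is_split_into_words=True,     # inputs are lists of words, not raw strings
        return_offsets_mapping=True,  # needed later to align labels with wordpieces
        padding=True,
        truncation=True,
    )
    print(train_encodings["input_ids"][0])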

docs/source/preprocessing.rst

+6 -6
@@ -290,12 +290,12 @@ predictions in `named entity recognition (NER) <https://en.wikipedia.org/wiki/Na
 if that was the case) but just split into words (which is often the first step in subword tokenization algorithms
 like BPE).

-If you want to use pre-tokenized inputs, just set :obj:`is_pretokenized=True` when passing your inputs to the
+If you want to use pre-tokenized inputs, just set :obj:`is_split_into_words=True` when passing your inputs to the
 tokenizer. For instance, we have:

 .. code-block::

-    >>> encoded_input = tokenizer(["Hello", "I'm", "a", "single", "sentence"], is_pretokenized=True)
+    >>> encoded_input = tokenizer(["Hello", "I'm", "a", "single", "sentence"], is_split_into_words=True)
     >>> print(encoded_input)
     {'input_ids': [101, 8667, 146, 112, 182, 170, 1423, 5650, 102],
      'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0],
@@ -312,7 +312,7 @@ like this:
     batch_sentences = [["Hello", "I'm", "a", "single", "sentence"],
                        ["And", "another", "sentence"],
                        ["And", "the", "very", "very", "last", "one"]]
-    encoded_inputs = tokenizer(batch_sentences, is_pretokenized=True)
+    encoded_inputs = tokenizer(batch_sentences, is_split_into_words=True)

 or a batch of pair sentences like this:

@@ -321,7 +321,7 @@ or a batch of pair sentences like this:
     batch_of_second_sentences = [["I'm", "a", "sentence", "that", "goes", "with", "the", "first", "sentence"],
                                  ["And", "I", "should", "be", "encoded", "with", "the", "second", "sentence"],
                                  ["And", "I", "go", "with", "the", "very", "last", "one"]]
-    encoded_inputs = tokenizer(batch_sentences, batch_of_second_sentences, is_pretokenized=True)
+    encoded_inputs = tokenizer(batch_sentences, batch_of_second_sentences, is_split_into_words=True)

 And you can add padding, truncation as well as directly return tensors like before:

@@ -330,14 +330,14 @@ And you can add padding, truncation as well as directly return tensors like befo
     ## PYTORCH CODE
     batch = tokenizer(batch_sentences,
                       batch_of_second_sentences,
-                      is_pretokenized=True,
+                      is_split_into_words=True,
                       padding=True,
                       truncation=True,
                       return_tensors="pt")
     ## TENSORFLOW CODE
     batch = tokenizer(batch_sentences,
                       batch_of_second_sentences,
-                      is_pretokenized=True,
+                      is_split_into_words=True,
                       padding=True,
                       truncation=True,
                       return_tensors="tf")
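
A self-contained sketch of the pair-batch call above, under the assumption of a fast BERT checkpoint such as bert-base-cased (the doc snippet itself does not name one):

    # Sketch: pre-tokenized sentence pairs, padded/truncated, returned as PyTorch tensors.
    from transformers import BertTokenizerFast

    tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")  # assumed checkpoint

    batch_sentences = [["Hello", "I'm", "a", "single", "sentence"],
                       ["And", "another", "sentence"],
                       ["And", "the", "very", "very", "last", "one"]]
    batch_of_second_sentences = [["I'm", "a", "sentence", "that", "goes", "with", "the", "first", "sentence"],
                                 ["And", "I", "should", "be", "encoded", "with", "the", "second", "sentence"],
                                 ["And", "I", "go", "with", "the", "very", "last", "one"]]

    batch = tokenizer(batch_sentences,
                      batch_of_second_sentences,
                      is_split_into_words=True,
                      padding=True,
                      truncation=True,
                      return_tensors="pt")
    print(batch["input_ids"].shape)  # (3, length of the longest padded pair)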

src/transformers/tokenization_gpt2.py

+30 -10
@@ -17,6 +17,7 @@

 import json
 import os
+import warnings
 from functools import lru_cache

 import regex as re
@@ -121,7 +122,7 @@ class GPT2Tokenizer(PreTrainedTokenizer):

     .. note::

-        When used with ``is_pretokenized=True``, this tokenizer will add a space before each word (even the first one).
+        When used with ``is_split_into_words=True``, this tokenizer will add a space before each word (even the first one).

     This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
     should refer to the superclass for more information regarding methods.
@@ -288,9 +289,16 @@ def save_vocabulary(self, save_directory):

         return vocab_file, merge_file

-    def prepare_for_tokenization(self, text, is_pretokenized=False, **kwargs):
+    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
+        if "is_pretokenized" in kwargs:
+            warnings.warn(
+                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
+                FutureWarning,
+            )
+            is_split_into_words = kwargs.pop("is_pretokenized")
+
         add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
-        if is_pretokenized or add_prefix_space:
+        if is_split_into_words or add_prefix_space:
             text = " " + text
         return (text, kwargs)

@@ -317,7 +325,7 @@ class GPT2TokenizerFast(PreTrainedTokenizerFast):

     .. note::

-        When used with ``is_pretokenized=True``, this tokenizer needs to be instantiated with
+        When used with ``is_split_into_words=True``, this tokenizer needs to be instantiated with
         ``add_prefix_space=True``.

     This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
@@ -377,19 +385,31 @@ def __init__(
         self.add_prefix_space = add_prefix_space

     def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
-
-        is_pretokenized = kwargs.get("is_pretokenized", False)
-        assert self.add_prefix_space or not is_pretokenized, (
+        if "is_pretokenized" in kwargs:
+            warnings.warn(
+                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
+                FutureWarning,
+            )
+            is_split_into_words = kwargs.pop("is_pretokenized")
+
+        is_split_into_words = kwargs.get("is_split_into_words", False)
+        assert self.add_prefix_space or not is_split_into_words, (
             f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
             "to use it with pretokenized inputs."
         )

         return super()._batch_encode_plus(*args, **kwargs)

     def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
-
-        is_pretokenized = kwargs.get("is_pretokenized", False)
-        assert self.add_prefix_space or not is_pretokenized, (
+        if "is_pretokenized" in kwargs:
+            warnings.warn(
+                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
+                FutureWarning,
+            )
+            is_split_into_words = kwargs.pop("is_pretokenized")
+
+        is_split_into_words = kwargs.get("is_split_into_words", False)
+        assert self.add_prefix_space or not is_split_into_words, (
            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
            "to use it with pretokenized inputs."
        )
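
The assertions above keep the existing constraint under the new name: pre-split input to the fast GPT-2 tokenizer still requires ``add_prefix_space=True`` at instantiation. A minimal sketch of the valid call:

    # Sketch: GPT2TokenizerFast with word-split input, new argument name.
    from transformers import GPT2TokenizerFast

    # Without add_prefix_space=True, _encode_plus/_batch_encode_plus raise the
    # assertion shown in the diff above when is_split_into_words=True is passed.
    tokenizer = GPT2TokenizerFast.from_pretrained("gpt2", add_prefix_space=True)

    encoded = tokenizer(["Hello", "world"], is_split_into_words=True)
    print(encoded["input_ids"])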

src/transformers/tokenization_roberta.py

+12 -5
@@ -14,7 +14,7 @@
 # limitations under the License.
 """Tokenization classes for RoBERTa."""

-
+import warnings
 from typing import List, Optional

 from tokenizers.processors import RobertaProcessing
@@ -81,7 +81,7 @@ class RobertaTokenizer(GPT2Tokenizer):

     .. note::

-        When used with ``is_pretokenized=True``, this tokenizer will add a space before each word (even the first one).
+        When used with ``is_split_into_words=True``, this tokenizer will add a space before each word (even the first one).

     This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the methods. Users
     should refer to the superclass for more information regarding methods.
@@ -251,9 +251,16 @@ def create_token_type_ids_from_sequences(
             return len(cls + token_ids_0 + sep) * [0]
         return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

-    def prepare_for_tokenization(self, text, is_pretokenized=False, **kwargs):
+    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
+        if "is_pretokenized" in kwargs:
+            warnings.warn(
+                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
+                FutureWarning,
+            )
+            is_split_into_words = kwargs.pop("is_pretokenized")
+
         add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
-        if (is_pretokenized or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
+        if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
             text = " " + text
         return (text, kwargs)

@@ -280,7 +287,7 @@ class RobertaTokenizerFast(GPT2TokenizerFast):

     .. note::

-        When used with ``is_pretokenized=True``, this tokenizer needs to be instantiated with
+        When used with ``is_split_into_words=True``, this tokenizer needs to be instantiated with
         ``add_prefix_space=True``.

     This tokenizer inherits from :class:`~transformers.PreTrainedTokenizerFast` which contains most of the methods. Users
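
The RoBERTa changes mirror the GPT-2 ones: the slow tokenizer's ``prepare_for_tokenization`` prepends a space to word-split input, and the fast tokenizer keeps the ``add_prefix_space=True`` requirement. A small sketch under the new name (the roberta-base checkpoint is assumed for illustration):

    # Sketch: RoBERTa slow and fast tokenizers with word-split input.
    from transformers import RobertaTokenizer, RobertaTokenizerFast

    slow = RobertaTokenizer.from_pretrained("roberta-base")
    fast = RobertaTokenizerFast.from_pretrained("roberta-base", add_prefix_space=True)

    words = ["Hello", "world"]
    print(slow(words, is_split_into_words=True)["input_ids"])
    print(fast(words, is_split_into_words=True)["input_ids"])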

src/transformers/tokenization_utils.py

+36 -11
@@ -19,6 +19,7 @@
 import itertools
 import re
 import unicodedata
+import warnings
 from typing import Any, Dict, List, Optional, Tuple, Union, overload

 from .file_utils import add_end_docstrings
@@ -250,6 +251,12 @@ def tokenize(self, text: TextInput, **kwargs) -> List[str]:
         Returns:
             :obj:`List[str]`: The list of tokens.
         """
+        if "is_pretokenized" in kwargs:
+            warnings.warn(
+                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
+                FutureWarning,
+            )
+            kwargs["is_split_into_words"] = kwargs.pop("is_pretokenized")
         # Simple mapping string => AddedToken for special tokens with specific tokenization behaviors
         all_special_tokens_extended = dict(
             (str(t), t) for t in self.all_special_tokens_extended if isinstance(t, AddedToken)
@@ -402,7 +409,7 @@ def _encode_plus(
         truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
         max_length: Optional[int] = None,
         stride: int = 0,
-        is_pretokenized: bool = False,
+        is_split_into_words: bool = False,
         pad_to_multiple_of: Optional[int] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
@@ -419,17 +426,19 @@ def get_input_ids(text):
                 tokens = self.tokenize(text, **kwargs)
                 return self.convert_tokens_to_ids(tokens)
             elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
-                if is_pretokenized:
-                    tokens = list(itertools.chain(*(self.tokenize(t, is_pretokenized=True, **kwargs) for t in text)))
+                if is_split_into_words:
+                    tokens = list(
+                        itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text))
+                    )
                     return self.convert_tokens_to_ids(tokens)
                 else:
                     return self.convert_tokens_to_ids(text)
             elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], int):
                 return text
             else:
-                if is_pretokenized:
+                if is_split_into_words:
                     raise ValueError(
-                        f"Input {text} is not valid. Should be a string or a list/tuple of strings when `is_pretokenized=True`."
+                        f"Input {text} is not valid. Should be a string or a list/tuple of strings when `is_split_into_words=True`."
                     )
                 else:
                     raise ValueError(
@@ -445,6 +454,13 @@ def get_input_ids(text):
                 "https://github.com/huggingface/transformers/pull/2674"
             )

+        if "is_pretokenized" in kwargs:
+            warnings.warn(
+                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
+                FutureWarning,
+            )
+            is_split_into_words = kwargs.pop("is_pretokenized")
+
         first_ids = get_input_ids(text)
         second_ids = get_input_ids(text_pair) if text_pair is not None else None

@@ -482,7 +498,7 @@ def _batch_encode_plus(
         truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
         max_length: Optional[int] = None,
         stride: int = 0,
-        is_pretokenized: bool = False,
+        is_split_into_words: bool = False,
         pad_to_multiple_of: Optional[int] = None,
         return_tensors: Optional[Union[str, TensorType]] = None,
         return_token_type_ids: Optional[bool] = None,
@@ -499,8 +515,10 @@ def get_input_ids(text):
                 tokens = self.tokenize(text, **kwargs)
                 return self.convert_tokens_to_ids(tokens)
             elif isinstance(text, (list, tuple)) and len(text) > 0 and isinstance(text[0], str):
-                if is_pretokenized:
-                    tokens = list(itertools.chain(*(self.tokenize(t, is_pretokenized=True, **kwargs) for t in text)))
+                if is_split_into_words:
+                    tokens = list(
+                        itertools.chain(*(self.tokenize(t, is_split_into_words=True, **kwargs) for t in text))
+                    )
                     return self.convert_tokens_to_ids(tokens)
                 else:
                     return self.convert_tokens_to_ids(text)
@@ -518,11 +536,18 @@ def get_input_ids(text):
                 "transformers.PreTrainedTokenizerFast."
             )

+        if "is_pretokenized" in kwargs:
+            warnings.warn(
+                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
+                FutureWarning,
+            )
+            is_split_into_words = kwargs.pop("is_pretokenized")
+
         input_ids = []
         for ids_or_pair_ids in batch_text_or_text_pairs:
             if not isinstance(ids_or_pair_ids, (list, tuple)):
                 ids, pair_ids = ids_or_pair_ids, None
-            elif is_pretokenized and not isinstance(ids_or_pair_ids[0], (list, tuple)):
+            elif is_split_into_words and not isinstance(ids_or_pair_ids[0], (list, tuple)):
                 ids, pair_ids = ids_or_pair_ids, None
             else:
                 ids, pair_ids = ids_or_pair_ids
@@ -616,7 +641,7 @@ def _batch_prepare_for_model(
         return batch_outputs

     def prepare_for_tokenization(
-        self, text: str, is_pretokenized: bool = False, **kwargs
+        self, text: str, is_split_into_words: bool = False, **kwargs
     ) -> Tuple[str, Dict[str, Any]]:
         """
         Performs any necessary transformations before tokenization.
@@ -627,7 +652,7 @@ def prepare_for_tokenization(
         Args:
             test (:obj:`str`):
                 The text to prepare.
-            is_pretokenized (:obj:`bool`, `optional`, defaults to :obj:`False`):
+            is_split_into_words (:obj:`bool`, `optional`, defaults to :obj:`False`):
                 Whether or not the text has been pretokenized.
             kwargs:
                 Keyword arguments to use for the tokenization.