41
41
model_type_to_module_name ,
42
42
replace_list_option_in_docstrings ,
43
43
)
44
+ from .dynamic import get_class_from_dynamic_module
44
45
45
46
46
47
logger = logging .get_logger (__name__ )
@@ -412,6 +413,10 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
412
413
Whether or not to try to load the fast version of the tokenizer.
413
414
tokenizer_type (:obj:`str`, `optional`):
414
415
Tokenizer type to be loaded.
416
+ trust_remote_code (:obj:`bool`, `optional`, defaults to :obj:`False`):
417
+ Whether or not to allow for custom tokenizers defined on the Hub in their own tokenization files. This option
418
+ should only be set to :obj:`True` for repositories you trust and in which you have read the code, as it
419
+ will execute code present on the Hub on your local machine.
415
420
kwargs (additional keyword arguments, `optional`):
416
421
Will be passed to the Tokenizer ``__init__()`` method. Can be used to set special tokens like
417
422
``bos_token``, ``eos_token``, ``unk_token``, ``sep_token``, ``pad_token``, ``cls_token``,
@@ -436,6 +441,7 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
436
441
437
442
use_fast = kwargs .pop ("use_fast" , True )
438
443
tokenizer_type = kwargs .pop ("tokenizer_type" , None )
444
+ trust_remote_code = kwargs .pop ("trust_remote_code" , False )
439
445
440
446
# First, let's see whether the tokenizer_type is passed so that we can leverage it
441
447
if tokenizer_type is not None :
@@ -464,17 +470,45 @@ def from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs):
464
470
# Next, let's try to use the tokenizer_config file to get the tokenizer class.
465
471
tokenizer_config = get_tokenizer_config (pretrained_model_name_or_path , ** kwargs )
466
472
config_tokenizer_class = tokenizer_config .get ("tokenizer_class" )
473
+ tokenizer_auto_map = tokenizer_config .get ("auto_map" )
467
474
468
475
# If that did not work, let's try to use the config.
469
476
if config_tokenizer_class is None :
470
477
if not isinstance (config , PretrainedConfig ):
471
- config = AutoConfig .from_pretrained (pretrained_model_name_or_path , ** kwargs )
478
+ config = AutoConfig .from_pretrained (
479
+ pretrained_model_name_or_path , trust_remote_code = trust_remote_code , ** kwargs
480
+ )
472
481
config_tokenizer_class = config .tokenizer_class
482
+ if hasattr (config , "auto_map" ) and "AutoTokenizer" in config .auto_map :
483
+ tokenizer_auto_map = config .auto_map ["AutoTokenizer" ]
473
484
474
485
# If we have the tokenizer class from the tokenizer config or the model config we're good!
475
486
if config_tokenizer_class is not None :
476
487
tokenizer_class = None
477
- if use_fast and not config_tokenizer_class .endswith ("Fast" ):
488
+ if tokenizer_auto_map is not None :
489
+ if not trust_remote_code :
490
+ raise ValueError (
491
+ f"Loading { pretrained_model_name_or_path } requires you to execute the tokenizer file in that repo "
492
+ "on your local machine. Make sure you have read the code there to avoid malicious use, then set "
493
+ "the option `trust_remote_code=True` to remove this error."
494
+ )
495
+ if kwargs .get ("revision" , None ) is None :
496
+ logger .warn (
497
+ "Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure "
498
+ "no malicious code has been contributed in a newer revision."
499
+ )
500
+
501
+ if use_fast and tokenizer_auto_map [1 ] is not None :
502
+ class_ref = tokenizer_auto_map [1 ]
503
+ else :
504
+ class_ref = tokenizer_auto_map [0 ]
505
+
506
+ module_file , class_name = class_ref .split ("." )
507
+ tokenizer_class = get_class_from_dynamic_module (
508
+ pretrained_model_name_or_path , module_file + ".py" , class_name , ** kwargs
509
+ )
510
+
511
+ elif use_fast and not config_tokenizer_class .endswith ("Fast" ):
478
512
tokenizer_class_candidate = f"{ config_tokenizer_class } Fast"
479
513
tokenizer_class = tokenizer_class_from_name (tokenizer_class_candidate )
480
514
if tokenizer_class is None :
0 commit comments