
Commit 30ca529

Make the sacremoses dependency optional (#17049)

* Make sacremoses optional
* Pickle

1 parent bb2e088, commit 30ca529

4 files changed: +63 −12 lines

setup.py (+1 −2)

@@ -288,6 +288,7 @@ def run(self):
         "nltk",
         "GitPython",
         "hf-doc-builder",
+        'sacremoses'
     )
     + extras["retrieval"]
     + extras["modelcreation"]
@@ -365,7 +366,6 @@ def run(self):
     "protobuf",
     "regex",
     "requests",
-    "sacremoses",
     "sentencepiece",
     "torch",
     "tokenizers",
@@ -383,7 +383,6 @@ def run(self):
     deps["pyyaml"],  # used for the model cards metadata
     deps["regex"],  # for OpenAI GPT
     deps["requests"],  # for downloading models over HTTPS
-    deps["sacremoses"],  # for XLM
     deps["tokenizers"],
     deps["tqdm"],  # progress bars in model download and training scripts
 ]
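
Taken together, the setup.py changes move sacremoses out of the hard install_requires and out of the core deps table; it now ships only through an extras group (apparently the testing extras, judging by the surrounding deps_list). Anything that still wants the package must check for it at runtime. A minimal sketch of such a probe (illustrative only, not code from this commit):

import importlib.util

# Hedged sketch: detect whether the now-optional sacremoses package is
# installed before touching Moses-based tokenizers. find_spec returns
# None when no installed distribution provides the module.
if importlib.util.find_spec("sacremoses") is None:
    print("sacremoses is missing; run `pip install sacremoses` to use the XLM/FSMT tokenizers")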

src/transformers/dependency_versions_check.py (+1 −1)

@@ -23,7 +23,7 @@
 # order specific notes:
 # - tqdm must be checked before tokenizers
 
-pkgs_to_check_at_runtime = "python tqdm regex sacremoses requests packaging filelock numpy tokenizers".split()
+pkgs_to_check_at_runtime = "python tqdm regex requests packaging filelock numpy tokenizers".split()
 if sys.version_info < (3, 7):
     pkgs_to_check_at_runtime.append("dataclasses")
 if sys.version_info < (3, 8):
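
Removing sacremoses from pkgs_to_check_at_runtime is the logical companion: an optional package cannot be version-checked unconditionally at import time. For intuition, a simplified sketch of what a runtime check over this list amounts to (the real helper in transformers is more involved, so treat this as an approximation):

from importlib.metadata import PackageNotFoundError, version

# Approximate sketch: each listed package must resolve to an installed
# distribution, otherwise importing the library should fail loudly.
for pkg in "tqdm regex requests packaging filelock numpy tokenizers".split():
    try:
        print(f"{pkg}=={version(pkg)}")
    except PackageNotFoundError as exc:
        raise ImportError(f"transformers requires {pkg}, but it is not installed") from exc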

src/transformers/models/fsmt/tokenization_fsmt.py (+31 −5)

@@ -21,8 +21,6 @@
 import unicodedata
 from typing import Dict, List, Optional, Tuple
 
-import sacremoses as sm
-
 from ...tokenization_utils import PreTrainedTokenizer
 from ...utils import logging
 
@@ -212,6 +210,16 @@ def __init__(
             **kwargs,
         )
 
+        try:
+            import sacremoses
+        except ImportError:
+            raise ImportError(
+                "You need to install sacremoses to use XLMTokenizer. "
+                "See https://pypi.org/project/sacremoses/ for installation."
+            )
+
+        self.sm = sacremoses
+
         self.src_vocab_file = src_vocab_file
         self.tgt_vocab_file = tgt_vocab_file
         self.merges_file = merges_file
@@ -254,21 +262,21 @@ def vocab_size(self) -> int:
 
     def moses_punct_norm(self, text, lang):
         if lang not in self.cache_moses_punct_normalizer:
-            punct_normalizer = sm.MosesPunctNormalizer(lang=lang)
+            punct_normalizer = self.sm.MosesPunctNormalizer(lang=lang)
             self.cache_moses_punct_normalizer[lang] = punct_normalizer
         return self.cache_moses_punct_normalizer[lang].normalize(text)
 
     def moses_tokenize(self, text, lang):
         if lang not in self.cache_moses_tokenizer:
-            moses_tokenizer = sm.MosesTokenizer(lang=lang)
+            moses_tokenizer = self.sm.MosesTokenizer(lang=lang)
             self.cache_moses_tokenizer[lang] = moses_tokenizer
         return self.cache_moses_tokenizer[lang].tokenize(
             text, aggressive_dash_splits=True, return_str=False, escape=True
         )
 
     def moses_detokenize(self, tokens, lang):
         if lang not in self.cache_moses_tokenizer:
-            moses_detokenizer = sm.MosesDetokenizer(lang=self.tgt_lang)
+            moses_detokenizer = self.sm.MosesDetokenizer(lang=self.tgt_lang)
             self.cache_moses_detokenizer[lang] = moses_detokenizer
         return self.cache_moses_detokenizer[lang].detokenize(tokens)
 
@@ -516,3 +524,21 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =
             index += 1
 
         return src_vocab_file, tgt_vocab_file, merges_file
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["sm"] = None
+        return state
+
+    def __setstate__(self, d):
+        self.__dict__ = d
+
+        try:
+            import sacremoses
+        except ImportError:
+            raise ImportError(
+                "You need to install sacremoses to use XLMTokenizer. "
+                "See https://pypi.org/project/sacremoses/ for installation."
+            )
+
+        self.sm = sacremoses
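
The commit message's second bullet, "Pickle", is what the __getstate__/__setstate__ pair above addresses: a module object cannot itself be pickled, so the tokenizer drops its sacremoses reference before serialization and re-imports it on load. A self-contained sketch of the same pattern (the class name is illustrative, not from the commit):

import pickle

class MosesHolder:
    """Illustrative stand-in for a tokenizer holding a module reference."""

    def __init__(self):
        import sacremoses

        self.sm = sacremoses  # pickle.dumps() on a module raises TypeError

    def __getstate__(self):
        state = self.__dict__.copy()
        state["sm"] = None  # drop the unpicklable module reference
        return state

    def __setstate__(self, d):
        self.__dict__ = d
        import sacremoses  # re-import when the object is unpickled

        self.sm = sacremoses

restored = pickle.loads(pickle.dumps(MosesHolder()))
assert restored.sm is not None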

src/transformers/models/xlm/tokenization_xlm.py (+30 −4)

@@ -22,8 +22,6 @@
 import unicodedata
 from typing import List, Optional, Tuple
 
-import sacremoses as sm
-
 from ...tokenization_utils import PreTrainedTokenizer
 from ...utils import logging
 
@@ -629,6 +627,16 @@ def __init__(
             **kwargs,
         )
 
+        try:
+            import sacremoses
+        except ImportError:
+            raise ImportError(
+                "You need to install sacremoses to use XLMTokenizer. "
+                "See https://pypi.org/project/sacremoses/ for installation."
+            )
+
+        self.sm = sacremoses
+
         # cache of sm.MosesPunctNormalizer instance
         self.cache_moses_punct_normalizer = dict()
         # cache of sm.MosesTokenizer instance
@@ -659,15 +667,15 @@ def do_lower_case(self):
 
     def moses_punct_norm(self, text, lang):
         if lang not in self.cache_moses_punct_normalizer:
-            punct_normalizer = sm.MosesPunctNormalizer(lang=lang)
+            punct_normalizer = self.sm.MosesPunctNormalizer(lang=lang)
             self.cache_moses_punct_normalizer[lang] = punct_normalizer
         else:
             punct_normalizer = self.cache_moses_punct_normalizer[lang]
         return punct_normalizer.normalize(text)
 
     def moses_tokenize(self, text, lang):
         if lang not in self.cache_moses_tokenizer:
-            moses_tokenizer = sm.MosesTokenizer(lang=lang)
+            moses_tokenizer = self.sm.MosesTokenizer(lang=lang)
             self.cache_moses_tokenizer[lang] = moses_tokenizer
         else:
             moses_tokenizer = self.cache_moses_tokenizer[lang]
@@ -970,3 +978,21 @@ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] =
             index += 1
 
         return vocab_file, merge_file
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["sm"] = None
+        return state
+
+    def __setstate__(self, d):
+        self.__dict__ = d
+
+        try:
+            import sacremoses
+        except ImportError:
+            raise ImportError(
+                "You need to install sacremoses to use XLMTokenizer. "
+                "See https://pypi.org/project/sacremoses/ for installation."
+            )
+
+        self.sm = sacremoses
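
With both tokenizers converted, the failure mode shifts from breaking `import transformers` outright to a targeted error raised only when sacremoses is actually needed. A hedged usage sketch ("xlm-mlm-en-2048" is the standard XLM checkpoint on the Hub):

from transformers import XLMTokenizer

# In an environment without sacremoses, instantiation now raises the
# actionable ImportError added above rather than failing at import time.
try:
    tokenizer = XLMTokenizer.from_pretrained("xlm-mlm-en-2048")
    print(tokenizer.tokenize("Hello world!"))
except ImportError as err:
    print(err)  # points at https://pypi.org/project/sacremoses/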
