Commit 8189977

[Core Tokenization] Support a fix for spm fast models (#26678)
* fix
* last attempt
* current work
* fix forward compatibility
* save all special tokens
* current state
* revert additional changes
* updates
* remove tokenizer.model
* add a test and the fix
* nit
* revert one more break
* fix typefield issue
* quality
* more tests
* fix fields for FC
* more nits?
* new additional changes
* how
* some updates
* the fix
* where do we stand
* nits
* nits
* revert unrelated changes
* nits nits nits
* styling
* don't break llama just yet
* revert llama changes
* safe arg check
* fixup
* Add a test for T5
* Necessary changes
* Tests passing; added tokens need to not be normalized. If the added tokens are normalized, it triggers the stripping, which seems to be unwanted for normal functioning
* Add even more tests, when normalization is set to True (which does not work 😓)
* Add even more tests, when normalization is set to True (which does not work 😓)
* Update to main
* nits
* fmt
* more and more tests
* comments
* revert change as tests are failing
* make the test more readable
* nits
* refactor the test
* nit
* updates
* simplify
* style
* style
* style convert slow
* Update src/transformers/convert_slow_tokenizer.py
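The normalization point in the bullets above ("added tokens need to not be normalized") can be checked directly against the fast tokenizer. A minimal sketch, assuming a transformers version that includes this commit, the tokenizers backend, and cached access to the t5-base checkpoint; the expected outputs are the ones pinned by the test added below:

```python
from transformers import AddedToken, T5TokenizerFast

text = "Hey!<new_token_test_>. How</s>Hey <new_token_test_>!"

# normalized=False: the added token is matched on the raw input, so per the new
# test no stray "▁" appears around "<new_token_test_>".
fast = T5TokenizerFast.from_pretrained("t5-base", legacy=False, from_slow=True)
fast.add_tokens(AddedToken("<new_token_test_>", rstrip=False, lstrip=False, normalized=False))
print(fast.tokenize(text))

# normalized=True (the default for non-special added tokens): normalization runs
# before matching, and the new test expects an extra "▁" next to the added token.
fast = T5TokenizerFast.from_pretrained("t5-base", legacy=False, from_slow=True)
fast.add_tokens(AddedToken("<new_token_test_>", rstrip=False, lstrip=False, normalized=True))
print(fast.tokenize(text))
```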
1 parent a1668cc commit 8189977

File tree

2 files changed (+47 -5 lines)

src/transformers/convert_slow_tokenizer.py (+12 -5)
@@ -552,15 +552,22 @@ def tokenizer(self, proto):
 
     def normalizer(self, proto):
         precompiled_charsmap = proto.normalizer_spec.precompiled_charsmap
+        _normalizers = [
+            normalizers.Strip(left=False, right=True),  # stripping is important
+            normalizers.Replace(Regex(" {2,}"), "▁"),
+        ]
         if not precompiled_charsmap:
-            return normalizers.Sequence([normalizers.Replace(Regex(" {2,}"), " ")])
+            return normalizers.Sequence(_normalizers)
         else:
-            return normalizers.Sequence(
-                [normalizers.Precompiled(precompiled_charsmap), normalizers.Replace(Regex(" {2,}"), " ")]
-            )
+            return normalizers.Sequence([normalizers.Precompiled(precompiled_charsmap)] + _normalizers)
 
     def pre_tokenizer(self, replacement, add_prefix_space):
-        return pre_tokenizers.Metaspace(replacement=replacement, add_prefix_space=add_prefix_space)
+        prepend_scheme = "always"
+        if hasattr(self.original_tokenizer, "legacy") and not self.original_tokenizer.legacy:
+            prepend_scheme = "first"
+        return pre_tokenizers.Metaspace(
+            replacement=replacement, add_prefix_space=add_prefix_space, prepend_scheme=prepend_scheme
+        )
 
     def post_processor(self):
         return None
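For context, the two converter pieces touched above can be exercised in isolation with the tokenizers library. A minimal sketch of the no-precompiled-charsmap branch, assuming tokenizers >= 0.14 (where Metaspace accepts prepend_scheme) and using "▁" as the replacement; it only mirrors the components the diff wires together, not the full converter:

```python
from tokenizers import Regex, normalizers, pre_tokenizers

# Normalizer branch without a precompiled charsmap: strip trailing whitespace,
# then collapse runs of 2+ spaces into the SentencePiece "▁" marker.
norm = normalizers.Sequence(
    [
        normalizers.Strip(left=False, right=True),
        normalizers.Replace(Regex(" {2,}"), "▁"),
    ]
)
print(norm.normalize_str("Hey   there  "))  # expected: "Hey▁there"

# Metaspace with prepend_scheme="first" (the non-legacy path): "▁" is only
# prepended to the first section, so text that follows a special token does
# not pick up a spurious prefix space.
pre = pre_tokenizers.Metaspace(replacement="▁", prepend_scheme="first")
print(pre.pre_tokenize_str("Hey friend"))
```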

tests/models/t5/test_tokenization_t5.py (+35 -0)
@@ -424,6 +424,41 @@ def test_some_edge_cases(self):
         self.assertEqual(tokens, [])
         self.assertEqual(tokens, tokenizer.sp_model.encode("▁", out_type=str))
 
+    def test_fast_slow_edge_cases(self):
+        # We are testing spaces before and spaces after special tokens + space transformations
+        slow_tokenizer = T5Tokenizer.from_pretrained("t5-base", legacy=False)
+        fast_tokenizer = T5TokenizerFast.from_pretrained("t5-base", legacy=False, from_slow=True)
+        slow_tokenizer.add_tokens(AddedToken("<new_token_test_>", rstrip=False, lstrip=False, normalized=False))
+        fast_tokenizer.add_tokens(AddedToken("<new_token_test_>", rstrip=False, lstrip=False, normalized=False))
+
+        edge_case = "Hey!<new_token_test_>. How</s>Hey <new_token_test_>!"
+        EXPECTED_SLOW = ["▁Hey", "!", "<new_token_test_>", ".", "▁How", "</s>", "He", "y", "<new_token_test_>", "!"]  # fmt: skip
+        with self.subTest(f"slow {edge_case} normalized = False"):
+            self.assertEqual(slow_tokenizer.tokenize(edge_case), EXPECTED_SLOW)
+        with self.subTest(f"Fast {edge_case} normalized = False"):
+            self.assertEqual(fast_tokenizer.tokenize(edge_case), EXPECTED_SLOW)
+
+        hard_case = "Hey! <new_token_test_>. How</s> Hey <new_token_test_> ! . "
+        EXPECTED_SLOW = ["▁Hey", "!", "<new_token_test_>", ".", "▁How", "</s>", "▁Hey", "<new_token_test_>", "▁", "!", "▁", "."]  # fmt: skip
+        with self.subTest(f"slow {edge_case} normalized = False"):
+            self.assertEqual(slow_tokenizer.tokenize(hard_case), EXPECTED_SLOW)
+        with self.subTest(f"fast {edge_case} normalized = False"):
+            self.assertEqual(fast_tokenizer.tokenize(hard_case), EXPECTED_SLOW)
+
+        fast_tokenizer = T5TokenizerFast.from_pretrained("t5-base", legacy=False, from_slow=True)
+        fast_tokenizer.add_tokens(AddedToken("<new_token_test_>", rstrip=False, lstrip=False, normalized=True))
+
+        # `normalized=True` is the default normalization scheme when adding a token. Normalize -> don't strip the space.
+        # the issue now is that our slow tokenizer should NOT strip the space if we want to simulate sentencepiece token addition.
+
+        EXPECTED_FAST = ["▁Hey", "!", "<new_token_test_>", ".", "▁How", "</s>", "He", "y", "▁", "<new_token_test_>", "!"]  # fmt: skip
+        with self.subTest(f"fast {edge_case} normalized = True"):
+            self.assertEqual(fast_tokenizer.tokenize(edge_case), EXPECTED_FAST)
+
+        EXPECTED_FAST = ['▁Hey', '!', '▁', '<new_token_test_>', '.', '▁How', '</s>', '▁Hey', '▁', '<new_token_test_>', '▁', '!', '▁', '.']  # fmt: skip
+        with self.subTest(f"fast {edge_case} normalized = False"):
+            self.assertEqual(fast_tokenizer.tokenize(hard_case), EXPECTED_FAST)
+
 
 @require_sentencepiece
 @require_tokenizers
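The parity the new test asserts (fast matching slow when the added token uses normalized=False) can also be re-run as a quick standalone check outside pytest. A sketch under the same assumptions as above, plus sentencepiece installed for the slow tokenizer:

```python
from transformers import AddedToken, T5Tokenizer, T5TokenizerFast

slow = T5Tokenizer.from_pretrained("t5-base", legacy=False)
fast = T5TokenizerFast.from_pretrained("t5-base", legacy=False, from_slow=True)
for tok in (slow, fast):
    tok.add_tokens(AddedToken("<new_token_test_>", rstrip=False, lstrip=False, normalized=False))

# The two edge cases from the test: the converted fast tokenizer should now
# produce exactly the same pieces as the slow SentencePiece one.
for text in (
    "Hey!<new_token_test_>. How</s>Hey <new_token_test_>!",
    "Hey! <new_token_test_>. How</s> Hey <new_token_test_> ! . ",
):
    assert slow.tokenize(text) == fast.tokenize(text), text
print("slow and fast tokenizers agree on both edge cases")
```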
