Skip to content

Commit 6de4ee6

Browse files
Wav2Vec2 phoneme CTC tokenizer optimisation (#16817)
* Solved href rendering issue in heading Markdown references in headings such as '####' don't render well. Replaced it with <h4>...<a></a></h4> banners. * PhonemeTokenizer optimization using phonemizer lib The backend should only be initialized once, otherwise it is reloaded. Added `init_backend` function, initializes a backend attribute. Phonemize re-uses self.backend. Should give ~10 times faster phonemization. * formatted file with make style * Documentation suggestion Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update /tokenization_wav2vec2_phoneme.py based on PR suggestion Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Update CONTRIBUTING.md Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
1 parent 306c9ee commit 6de4ee6

File tree

2 files changed

+24
-12
lines changed

2 files changed

+24
-12
lines changed

CONTRIBUTING.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -368,8 +368,7 @@ For documentation strings, 🤗 Transformers follows the [google style](https://
368368
Check our [documentation writing guide](https://github.com/huggingface/transformers/tree/main/docs#writing-documentation---specification)
369369
for more information.
370370

371-
#### This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md)
372-
371+
**This guide was heavily inspired by the awesome [scikit-learn guide to contributing](https://github.com/scikit-learn/scikit-learn/blob/main/CONTRIBUTING.md).**
373372

374373
### Develop on Windows
375374

src/transformers/models/wav2vec2_phoneme/tokenization_wav2vec2_phoneme.py

Lines changed: 23 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -158,6 +158,9 @@ def __init__(
158158
self.phonemizer_lang = phonemizer_lang
159159
self.phonemizer_backend = phonemizer_backend
160160

161+
if do_phonemize:
162+
self.init_backend(self.phonemizer_lang)
163+
161164
with open(vocab_file, encoding="utf-8") as vocab_handle:
162165
self.encoder = json.load(vocab_handle)
163166
self.decoder = {v: k for k, v in self.encoder.items()}
@@ -169,6 +172,18 @@ def vocab_size(self) -> int:
169172
def get_vocab(self) -> Dict:
170173
return dict(self.encoder, **self.added_tokens_encoder)
171174

175+
def init_backend(self, phonemizer_lang: str):
176+
"""
177+
Initializes the backend.
178+
179+
Args:
180+
phonemizer_lang (`str`): The language to be used.
181+
"""
182+
requires_backends(self, "phonemizer")
183+
from phonemizer.backend import BACKENDS
184+
185+
self.backend = BACKENDS[self.phonemizer_backend](phonemizer_lang, language_switch="remove-flags")
186+
172187
def prepare_for_tokenization(
173188
self,
174189
text: str,
@@ -209,6 +224,7 @@ def prepare_for_tokenization(
209224
# set the correct phonemizer language
210225
if phonemizer_lang is not None:
211226
self.phonemizer_lang = phonemizer_lang
227+
self.init_backend(phonemizer_lang)
212228

213229
return (text, {})
214230

@@ -234,23 +250,20 @@ def _tokenize(self, text, **kwargs):
234250
return tokens
235251

236252
def phonemize(self, text: str, phonemizer_lang: Optional[str] = None) -> str:
237-
requires_backends(self, "phonemizer")
238-
239-
from phonemizer import phonemize
240253
from phonemizer.separator import Separator
241254

242255
word_delimiter = self.word_delimiter_token + " " if self.word_delimiter_token is not None else ""
243-
phonemizer_lang = phonemizer_lang if phonemizer_lang is not None else self.phonemizer_lang
256+
if phonemizer_lang is not None and phonemizer_lang != self.phonemizer_lang:
257+
self.init_backend(phonemizer_lang)
258+
else:
259+
phonemizer_lang = self.phonemizer_lang
244260

245261
separator = Separator(phone=self.phone_delimiter_token, word=word_delimiter, syllable="")
246-
phonemes = phonemize(
247-
text,
248-
language=phonemizer_lang,
249-
backend=self.phonemizer_backend,
262+
phonemes = self.backend.phonemize(
263+
[text],
250264
separator=separator,
251-
language_switch="remove-flags",
252265
)
253-
phonemes = phonemes.strip()
266+
phonemes = phonemes[0].strip()
254267

255268
return phonemes
256269

0 commit comments

Comments
 (0)