Skip to content

Commit 3ca2b6f

Browse files
Merge pull request #2 from kkew3/fix-notimplementederror
bug fix: NotImplementedError when constructing CharacterTokenizer
2 parents 94a5d5b + 703bde8 commit 3ca2b6f

File tree

1 file changed

+15
-12
lines changed

1 file changed

+15
-12
lines changed

charactertokenizer/core.py

+15 -12
Original file line number | Diff line number | Diff line change
@@ -41,6 +41,18 @@ def __init__(self, characters: Sequence[str], model_max_length: int, **kwargs):
4141

4242
mask_token = AddedToken("[MASK]", lstrip=True, rstrip=False)
4343

44+
self._vocab_str_to_int = {
45+
"[CLS]": 0,
46+
"[SEP]": 1,
47+
"[BOS]": 2,
48+
"[MASK]": 3,
49+
"[PAD]": 4,
50+
"[RESERVED]": 5,
51+
"[UNK]": 6,
52+
**{ch: i + 7 for i, ch in enumerate(characters)},
53+
}
54+
self._vocab_int_to_str = {v: k for k, v in self._vocab_str_to_int.items()}
55+
4456
super().__init__(
4557
bos_token=bos_token,
4658
eos_token=eos_token,
@@ -54,22 +66,13 @@ def __init__(self, characters: Sequence[str], model_max_length: int, **kwargs):
5466
**kwargs,
5567
)
5668

57-
self._vocab_str_to_int = {
58-
"[CLS]": 0,
59-
"[SEP]": 1,
60-
"[BOS]": 2,
61-
"[MASK]": 3,
62-
"[PAD]": 4,
63-
"[RESERVED]": 5,
64-
"[UNK]": 6,
65-
**{ch: i + 7 for i, ch in enumerate(characters)},
66-
}
67-
self._vocab_int_to_str = {v: k for k, v in self._vocab_str_to_int.items()}
68-
6969
@property
7070
def vocab_size(self) -> int:
7171
return len(self._vocab_str_to_int)
7272

73+
def get_vocab(self):
74+
return self._vocab_str_to_int
75+
7376
def _tokenize(self, text: str) -> List[str]:
7477
return list(text)
7578

0 commit comments

Comments
 (0)