FlameF0X committed on
Commit
d655584
·
verified ·
1 Parent(s): 72510fa

Create tokenization_i3.py

Browse files
Files changed (1) hide show
  1. tokenization_i3.py +48 -0
tokenization_i3.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from transformers import PreTrainedTokenizer
3
+
4
class I3Tokenizer(PreTrainedTokenizer):
    """Character-pair ("chunk") tokenizer backed by a JSON vocabulary file.

    The vocabulary JSON must contain ``chunk_to_idx`` (str -> int) and
    ``idx_to_chunk`` (str(int) -> str) mappings.
    """

    def __init__(self, vocab_file, **kwargs):
        # Load the vocabulary BEFORE calling super().__init__():
        # PreTrainedTokenizer's constructor may invoke get_vocab() /
        # vocab_size, which require these attributes to exist already.
        with open(vocab_file, "r", encoding="utf-8") as f:
            vocab_data = json.load(f)
        self.chunk_to_idx = vocab_data["chunk_to_idx"]
        # JSON object keys are strings; restore integer ids.
        self.idx_to_chunk = {int(k): v for k, v in vocab_data["idx_to_chunk"].items()}
        # BUG FIX: the original assigned ``self.vocab_size`` here, which
        # collides with the read-only ``vocab_size`` property below and
        # raises AttributeError at construction time.  The size is now
        # derived from the mapping instead of stored.
        super().__init__(**kwargs)

    @property
    def vocab_size(self):
        """Number of chunks in the vocabulary."""
        return len(self.chunk_to_idx)

    def get_vocab(self):
        """Return the full token -> id mapping (PreTrainedTokenizer contract)."""
        return dict(self.chunk_to_idx)

    def _tokenize(self, text):
        """Greedily split lowercased *text* into known 2-character chunks.

        A character that does not start a known chunk is skipped, so the
        tokenization is lossy for out-of-vocabulary character pairs.
        """
        text = text.lower()
        pos = 0
        tokens = []
        while pos < len(text):
            chunk = text[pos:pos + 2]
            if chunk in self.chunk_to_idx:
                tokens.append(chunk)
                pos += 2
            else:
                pos += 1  # unknown pair: drop one character and retry
        return tokens

    def _convert_token_to_id(self, token):
        # Unknown tokens fall back to id 0.
        return self.chunk_to_idx.get(token, 0)

    def _convert_id_to_token(self, index):
        # Unknown ids map to the empty string.
        return self.idx_to_chunk.get(index, "")

    def convert_tokens_to_string(self, tokens):
        # Chunks are raw text slices, so simple concatenation reconstructs
        # the (lowercased, lossy) input.
        return "".join(tokens)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Serialize the vocabulary to JSON in *save_directory*.

        ``filename_prefix`` is accepted for compatibility with
        ``PreTrainedTokenizer.save_pretrained``, which passes it through.
        Returns a 1-tuple containing the written file path.
        """
        name = "tokenizer.json" if filename_prefix is None else f"{filename_prefix}-tokenizer.json"
        vocab_file = f"{save_directory}/{name}"
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump({
                "chunk_to_idx": self.chunk_to_idx,
                "idx_to_chunk": self.idx_to_chunk,
                "vocab_size": len(self.chunk_to_idx),
            }, f)
        return (vocab_file,)