i3-lab
/

i3-12m

import json
import os

from transformers import PreTrainedTokenizer
class I3Tokenizer(PreTrainedTokenizer):
    """Slow tokenizer that splits lower-cased text into two-character chunks.

    The vocabulary is a JSON file with two mappings:

    * ``chunk_to_idx``: chunk string -> integer id
    * ``idx_to_chunk``: id (stored as a string key, as JSON requires) -> chunk

    A ``vocab_size`` entry may also be present in the file; it is ignored on
    load because the size is always derived from ``chunk_to_idx``.
    """

    def __init__(self, vocab_file, **kwargs):
        """Load the chunk vocabulary from *vocab_file* and initialise the base class.

        Args:
            vocab_file: Path to the JSON vocabulary file described on the class.
            **kwargs: Forwarded unchanged to ``PreTrainedTokenizer.__init__``.

        Raises:
            OSError: If *vocab_file* cannot be opened.
            KeyError: If ``chunk_to_idx`` or ``idx_to_chunk`` is missing.
        """
        # The vocabulary must exist before the base constructor runs: recent
        # transformers releases query get_vocab() from inside
        # PreTrainedTokenizer.__init__ while registering special tokens.
        with open(vocab_file, "r", encoding="utf-8") as f:
            vocab_data = json.load(f)
        self.chunk_to_idx = vocab_data["chunk_to_idx"]
        # JSON object keys are always strings; restore the integer ids.
        self.idx_to_chunk = {
            int(k): v for k, v in vocab_data["idx_to_chunk"].items()
        }
        # NOTE: vocab_data["vocab_size"] is deliberately not assigned.
        # `vocab_size` is a read-only property below, so assigning to it
        # would raise AttributeError.
        super().__init__(**kwargs)

    @property
    def vocab_size(self):
        """Number of entries in ``chunk_to_idx``."""
        return len(self.chunk_to_idx)

    def get_vocab(self):
        """Return a copy of the chunk -> id mapping, including added tokens."""
        vocab = dict(self.chunk_to_idx)
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _tokenize(self, text):
        """Split *text* into the two-character chunks known to the vocabulary.

        The text is lower-cased, then scanned left to right. When the
        two-character window at the current position is a vocabulary entry it
        is emitted and the scan advances past it. Otherwise the scan advances
        by one character and that character produces no token.

        Args:
            text: The string to tokenize.

        Returns:
            A list of chunk strings, each a key of ``chunk_to_idx``.
        """
        # Kept identical to the reference chunk encoder so ids match training.
        text = text.lower()
        pos = 0
        tokens = []
        while pos < len(text):
            # At the final character this window is only one character long.
            chunk = text[pos:pos + 2]
            if chunk in self.chunk_to_idx:
                tokens.append(chunk)
                pos += 2
            else:
                pos += 1
        return tokens

    def _convert_token_to_id(self, token):
        """Return the id for *token*, or 0 when the token is not in the vocabulary."""
        return self.chunk_to_idx.get(token, 0)

    def _convert_id_to_token(self, index):
        """Return the chunk for *index*, or an empty string for an unknown id."""
        return self.idx_to_chunk.get(index, "")

    def convert_tokens_to_string(self, tokens):
        """Concatenate *tokens* back into a single string with no separator."""
        return "".join(tokens)

    def save_vocabulary(self, save_directory, filename_prefix=None):
        """Write the vocabulary to ``tokenizer.json`` inside *save_directory*.

        Args:
            save_directory: Existing directory to write into.
            filename_prefix: Optional prefix; when given the file is named
                ``<prefix>-tokenizer.json``. ``save_pretrained`` passes this
                argument by keyword, so it must be accepted.

        Returns:
            A one-element tuple holding the path of the written file.
        """
        file_name = "tokenizer.json"
        if filename_prefix:
            file_name = f"{filename_prefix}-{file_name}"
        vocab_file = os.path.join(save_directory, file_name)
        with open(vocab_file, "w", encoding="utf-8") as f:
            json.dump(
                {
                    "chunk_to_idx": self.chunk_to_idx,
                    # json.dump turns the int keys into strings; __init__
                    # converts them back on load.
                    "idx_to_chunk": self.idx_to_chunk,
                    "vocab_size": self.vocab_size,
                },
                f,
            )
        return (vocab_file,)