Upload tokenizer

a9cb2e6 almost 3 years ago

33.3 kB

	import os
	import re
	from typing import List, Optional, Tuple
	from shutil import copyfile
	import sentencepiece as spm
	import warnings
	import logging
	import json
	import multiprocessing
	from collections import Counter
	from typing import Collection, Callable, Dict
	from tokenizers import NormalizedString, PreTokenizedString
	from transformers.tokenization_utils import PreTrainedTokenizer
	from tokenizers import Tokenizer, pre_tokenizers, models
	from pythainlp.tokenize import word_tokenize
	from pythainlp.corpus import thai_syllables, thai_words
	from pythainlp.util.trie import Trie
	from functools import partial


	try:
	from thai2transformers.helper import get_file_size, multi_imap
	except ModuleNotFoundError:
	import sys
	sys.path.append('../scripts') # path hacking
	from thai2transformers.helper import get_file_size, multi_imap

	logger = logging.getLogger()

	VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}

	SPIECE_UNDERLINE = '▁'
	SPACE_TOKEN = "<_>"
	DEPRECATED_SPACE_TOKEN = '<th_roberta_space_token>'
	SEFR_SPLIT_TOKEN = '<\|>'
	ADDITIONAL_SPECIAL_TOKENS = ['<s>', '<pad>', '</s>', '<unk>', '<mask>', SPACE_TOKEN, '\n']
	ADDITIONAL_SPECIAL_TOKENS_EXCLUDE_SPACE_TOKEN = \
	[e for e in ADDITIONAL_SPECIAL_TOKENS if e != SPACE_TOKEN]
	SET_ADDITIONAL_SPECIAL_TOKENS = frozenset(ADDITIONAL_SPECIAL_TOKENS)

	PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
	"th-roberta-base": 514,
	}

	# Store pre tokenizer function (text cutter)
	PRE_TOKENIZERS_MAP = {'newmm': partial(
	word_tokenize,
	custom_dict=Trie(frozenset(set(thai_words()).union(set(ADDITIONAL_SPECIAL_TOKENS))))
	),
	'syllable': partial(
	word_tokenize,
	custom_dict=Trie(frozenset(set(thai_syllables()).union(set(ADDITIONAL_SPECIAL_TOKENS))))
	),
	}

	_nb_cores = multiprocessing.cpu_count()


	def split_additional_special_token(texts):
	"""
	Split list of text by additional special exclude space token.

	Args:
	texts: list of text.

	Returns:
	list_of_pre_cut_texts: list of list of pre cut text.

	Examples::

	>>> split_additional_special_token(['hello world</s></s>'])
	[['hello world', '</s>', '</s>']]
	"""
	# Construct regex pattern to match additional special tokens exlude space token.
	# Not sure, if we need to escape the token but this seems to do fine.
	group = '\|'.join(ADDITIONAL_SPECIAL_TOKENS_EXCLUDE_SPACE_TOKEN)
	splitter = re.compile(f'({group})')
	list_of_pre_cut_texts = []
	for text in texts:
	pre_cut_texts = []
	# Split the text this will inculde the additional token itself
	# and some time empty string in case of splitting consecutive
	# additional token.
	for e in splitter.split(text):
	# Filter out empty string space except if the string is additional
	# special token itself.
	if len(e) > 0 and (not e.isspace() or e in ADDITIONAL_SPECIAL_TOKENS):
	# Replace space token with actual space, since we want
	# to pass space into the cutter.
	pre_cut_texts.append(e.replace(SPACE_TOKEN, ' '))
	list_of_pre_cut_texts.append(pre_cut_texts)
	return list_of_pre_cut_texts


	def sefr_cut_tokenize(texts, n_jobs=1, chunk_size=200):
	"""
	Cut list of texts using sefr_cut.

	Args:
	texts:
	list of texts.
	n_jobs:
	Number of multiprocessing cores. -1 will use all avaliable cores.
	1 will use single core. Defaults to 1.
	chunk_size:
	size of each cutting pass in case of multiprocessing. Defaults to 200.

	Returns:
	final_list_of_cut_texts: list of list of cut text.

	Examples::
	>>> sefr_cut_tokenize(['hello world</s></s>'])
	[['hello', '<_>', 'world', '</s>', '</s>']]
	"""
	if n_jobs != 1 and isinstance(texts, list):
	n_jobs = n_jobs if n_jobs != -1 else multiprocessing.cpu_count()
	return multi_imap(texts, chunk_size=chunk_size,
	f=sefr_cut_tokenize, n_cores=n_jobs)
	if not isinstance(texts, list):
	return sefr_cut_tokenize([texts])[0]
	# We need to import the library inside the function itself to be able to use
	# multiprocessing correctly. If we did not do this. Most of the times,
	# the lock will stuck and the program will hang up.
	import sefr_cut
	import tensorflow as tf
	# Try to run tensorflow in single thread mode so we can limit the program to
	# a single process this usually give speed up with multiprocessing.
	# Because sefr_cut do tokenize each text sperately anyway. So there is not
	# much speed up to gain by using tensorflow with parallelism in conjunction
	# with multiprocessing module.
	os.environ['OMP_NUM_THREADS'] = '1'
	tf.config.threading.set_intra_op_parallelism_threads(1)
	tf.config.threading.set_inter_op_parallelism_threads(1)
	sefr_cut.load_model(engine='best')

	list_of_pre_cut_texts = split_additional_special_token(texts)
	list_of_cut_texts = []
	for pre_cut_texts in list_of_pre_cut_texts:
	cut_texts = []
	for pre_cut_text in pre_cut_texts:
	if pre_cut_text not in SET_ADDITIONAL_SPECIAL_TOKENS:
	# Tokenize pre_cut_text if it is not additional special tokens
	cut_texts.extend(sefr_cut.tokenize(pre_cut_text)[0])
	else:
	# Append token as is
	cut_texts.append(pre_cut_text)
	list_of_cut_texts.append(cut_texts)

	# Put SPACE_TOKEN back
	list_of_cut_texts = [[cut_text.replace(' ', SPACE_TOKEN) for cut_text in cut_texts]
	for cut_texts in list_of_cut_texts]

	# Split SPACE_TOKEN out of text
	final_list_of_cut_texts = []
	splitter = re.compile(f'({SPACE_TOKEN})')
	for cut_texts in list_of_cut_texts:
	final_cut_texts = []
	for cut_text in cut_texts:
	if SPACE_TOKEN in cut_text and cut_text != SPACE_TOKEN:
	final_cut_texts.extend([e for e in splitter.split(cut_text) if len(e) > 0])
	else:
	final_cut_texts.append(cut_text)
	final_list_of_cut_texts.append(final_cut_texts)
	return final_list_of_cut_texts


	# Should we do this a bit cleaner?
	PRE_TOKENIZERS_MAP['sefr_cut'] = partial(sefr_cut_tokenize, n_jobs=-1)

	sefr_cut_splitter = re.compile(f'({re.escape(SEFR_SPLIT_TOKEN)})')


	def fake_sefr_cut_keep_split_token(text):
	"""
	Split text at SEFR_SPLIT_TOKEN and kept split token.

	Args:
	text: string.

	Returns:
	list: tokens.

	Examples::

	>>> SEFR_SPLIT_TOKEN
	'<\|>'
	>>> fake_sefr_cut_keep_split_token(f'hello{SEFR_SPLIT_TOKEN}world')
	['hello', '<\|>', 'world']
	"""
	return [e for e in sefr_cut_splitter.split(text) if len(e) > 0]


	def fake_sefr_cut(text):
	"""
	Split text at SEFR_SPLIT_TOKEN.

	Args:
	text: string.

	Returns:
	list: tokens.

	Examples::

	>>> SEFR_SPLIT_TOKEN
	'<\|>'
	>>> fake_sefr_cut(f'hello{SEFR_SPLIT_TOKEN}world')
	['hello', 'world']
	"""
	return text.split(SEFR_SPLIT_TOKEN)


	PRE_TOKENIZERS_MAP['fake_sefr_cut'] = fake_sefr_cut
	PRE_TOKENIZERS_MAP['fake_sefr_cut_keep_split_token'] = fake_sefr_cut_keep_split_token


	class CustomPreTokenizer:
	def __init__(self, pre_tokenize_func: Callable):
	self.pre_tokenize_func = pre_tokenize_func

	def split(
	self, n: int, normalized_string: NormalizedString
	) -> Collection[NormalizedString]:
	# is argument n needs?
	break_i = []
	total_i = 0
	for word in self.pre_tokenize_func(str(normalized_string)):
	total_i += len(word)
	break_i.append(total_i)
	splits = []
	last = 0
	for (i, char) in enumerate(str(normalized_string)):
	if i in break_i:
	splits.append(normalized_string[last:i])
	last = i
	splits.append(normalized_string[last:])
	return splits

	def pre_tokenize(self, pretok: PreTokenizedString):
	pretok.split(self.split)


	class FakeSefrCustomTokenizer(CustomPreTokenizer):
	"""
	CustomPreTokenizer that skip SEFR_SPLIT_TOKEN

	Args:
	pre_tokenizer_func: pre tokenize function.
	"""

	def split(
	self, n: int, normalized_string: NormalizedString
	) -> Collection[NormalizedString]:
	# We have to operate on original normalized string since it track aligment or something
	kept_indices = []
	p = 0
	for word in self.pre_tokenize_func(str(normalized_string)):
	if word != SEFR_SPLIT_TOKEN:
	kept_indices.append((p, p + len(word)))
	p += len(word)
	splits = []
	for start, stop in kept_indices:
	splits.append(normalized_string[start:stop])
	return splits


	class WordLevelTrainer:
	"""
	Trainer for word level tokenizer.

	Args:
	pre_tokenize_func:
	pre tokenize function.
	input_files:
	text files for vocabulary creation.
	additional_special_token:
	special tokens that will be explicitly added in vocabulary.
	vocab_size:
	size of vocabulary.
	vocab_min_freq:
	minimum frequency required to kept the word in vocabulary.
	progress:
	show progress.

	Examples::

	>>> trainer = WordLevelTrainer(pre_tokenize_func=pre_tokenizer_func,
	vocab_size=custom_args.vocab_size,
	vocab_min_freq=custom_args.vocab_min_freq,
	input_files=train_files,
	additional_special_tokens=additional_special_tokens)
	>>> trainer.count_parallel()
	>>> trainer.save_vocab(custom_args.output_file)
	"""

	def __init__(
	self,
	pre_tokenize_func: Callable,
	input_files: str,
	additional_special_tokens: Collection[str],
	vocab_size: int = None,
	vocab_min_freq: int = None,
	progress: bool = True
	):
	self.pre_tokenize_func = pre_tokenize_func
	self.vocab_size = vocab_size
	self.special_tokens = additional_special_tokens
	self.input_files = input_files
	self.vocab = None
	self.freq = None
	self.vocab_min_freq = vocab_min_freq
	self.progress = progress
	if self.vocab_min_freq is not None and self.vocab_size is not None:
	raise AttributeError('use only vocab_min_freq or vocab_size')

	def count_one(self, fname: str) -> Counter:
	with open(fname, "r") as f:
	file_size = get_file_size(f)
	words = []
	i = 0
	while True:
	line = f.readline()
	if line:
	line = line.strip()
	if len(line) > 0 and not line.isspace():
	words.extend(self.pre_tokenize_func(line))
	else:
	break
	i += 1
	if self.progress and i % 5000 == 0:
	print(f'\rProcessed {f.tell() / file_size * 100:.2f}%',
	flush=True, end=' ')
	return Counter(words)

	def count_parallel(self, nb_cores: int = _nb_cores) -> Dict[(str, int)]:
	counters = [self.count_one(fname) for fname in self.input_files]
	# disable multiprocessing for now for easier debugging
	# with multiprocessing.Pool(nb_cores) as pool:
	# counters = pool.map(self.count_one, self.input_files)
	counter_all = sum(counters, Counter())
	# Remove special token from counter_all since this will
	# interfere with vocabulary creation later
	# for example if only '<s>' is in counter and addtional tokens = ['<s>']
	# the return vocab will be {'<s>': 1} instead of expected {'<s>': 0}
	# if we didnt remove '<s>' from counter_all
	special_tok_freq = {}
	for tok in self.special_tokens:
	if tok in counter_all:
	special_tok_freq[tok] = counter_all[tok]
	del counter_all[tok]
	if self.vocab_size is not None:
	counter_all.most_common(self.vocab_size)
	else:
	counter_all = [(key, value) for key, value in counter_all.items()
	if value >= self.vocab_min_freq]
	self.freq = [(tok, special_tok_freq.get(tok, 0))
	for tok in self.special_tokens] + counter_all
	self.vocab = dict((c[0], i) for i, c in enumerate(self.freq))
	return self.vocab

	def save_vocab(self, output_path: str):
	os.makedirs(os.path.dirname(output_path), exist_ok=True)
	with open(output_path, "w") as f:
	json.dump(self.vocab, f)


	class ThaiRobertaTokenizer(PreTrainedTokenizer):
	"""
	Adapted from :class:`~transformers.CamembertTokenizer`. Construct a
	Thai Roberta tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.

	This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
	Users should refer to this superclass for more information regarding those methods.

	Args:
	vocab_file (:obj:`str`):
	`SentencePiece <https://github.com/google/sentencepiece>`__ file (generally has a `.spm` extension) that
	contains the vocabulary necessary to instantiate a tokenizer.
	bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
	The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.

	.. note::

	When building a sequence using special tokens, this is not the token that is used for the beginning
	of sequence. The token used is the :obj:`cls_token`.
	eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
	The end of sequence token.

	.. note::

	When building a sequence using special tokens, this is not the token that is used for the end
	of sequence. The token used is the :obj:`sep_token`.
	sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
	The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences
	for sequence classification or for a text and a question for question answering.
	It is also used as the last token of a sequence built with special tokens.
	cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
	The classifier token which is used when doing sequence classification (classification of the whole
	sequence instead of per-token classification). It is the first token of the sequence when built with
	special tokens.
	unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
	The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
	token instead.
	pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
	The token used for padding, for example when batching sequences of different lengths.
	mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
	The token used for masking values. This is the token used when training this model with masked language
	modeling. This is the token which the model will try to predict.
	additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`["<s>NOTUSED", "</s>NOTUSED"]`):
	Additional special tokens used by the tokenizer.

	Attributes:
	sp_model (:obj:`SentencePieceProcessor`):
	The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
	"""

	vocab_files_names = VOCAB_FILES_NAMES
	max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
	model_input_names = ["attention_mask"]

	def __init__(
	self,
	vocab_file,
	bos_token="<s>",
	eos_token="</s>",
	sep_token="</s>",
	cls_token="<s>",
	unk_token="<unk>",
	pad_token="<pad>",
	mask_token="<mask>",
	additional_special_tokens=[SPACE_TOKEN],
	**kwargs
	):
	super().__init__(
	bos_token=bos_token,
	eos_token=eos_token,
	unk_token=unk_token,
	sep_token=sep_token,
	cls_token=cls_token,
	pad_token=pad_token,
	mask_token=mask_token,
	additional_special_tokens=additional_special_tokens,
	**kwargs,
	)
	self.sp_model = spm.SentencePieceProcessor()
	self.sp_model.Load(str(vocab_file))
	self.vocab_file = vocab_file

	def build_inputs_with_special_tokens(
	self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
	) -> List[int]:
	"""
	Build model inputs from a sequence or a pair of sequence for sequence classification tasks
	by concatenating and adding special tokens.
	An CamemBERT sequence has the following format:

	- single sequence: ``<s> X </s>``
	- pair of sequences: ``<s> A </s></s> B </s>``

	Args:
	token_ids_0 (:obj:`List[int]`):
	List of IDs to which the special tokens will be added.
	token_ids_1 (:obj:`List[int]`, `optional`):
	Optional second list of IDs for sequence pairs.

	Returns:
	:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
	"""

	if token_ids_1 is None:
	return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
	cls = [self.cls_token_id]
	sep = [self.sep_token_id]
	return cls + token_ids_0 + sep + sep + token_ids_1 + sep

	def get_special_tokens_mask(
	self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
	) -> List[int]:
	"""
	Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
	special tokens using the tokenizer ``prepare_for_model`` method.

	Args:
	token_ids_0 (:obj:`List[int]`):
	List of IDs.
	token_ids_1 (:obj:`List[int]`, `optional`):
	Optional second list of IDs for sequence pairs.
	already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
	Whether or not the token list is already formatted with special tokens for the model.

	Returns:
	:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
	"""
	if already_has_special_tokens:
	if token_ids_1 is not None:
	raise ValueError(
	"You should not supply a second sequence if the provided sequence of "
	"ids is already formated with special tokens for the model."
	)
	return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))

	if token_ids_1 is None:
	return [1] + ([0] * len(token_ids_0)) + [1]
	return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]

	def create_token_type_ids_from_sequences(
	self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
	) -> List[int]:
	"""
	Create a mask from the two sequences passed to be used in a sequence-pair classification task.
	CamemBERT, like RoBERTa, does not make use of token type ids, therefore a list of zeros is returned.

	Args:
	token_ids_0 (:obj:`List[int]`):
	List of IDs.
	token_ids_1 (:obj:`List[int]`, `optional`):
	Optional second list of IDs for sequence pairs.

	Returns:
	:obj:`List[int]`: List of zeros.
	"""
	sep = [self.sep_token_id]
	cls = [self.cls_token_id]

	if token_ids_1 is None:
	return len(cls + token_ids_0 + sep) * [0]
	return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

	@property
	def vocab_size(self):
	return len(self.sp_model)

	def get_vocab(self):
	vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
	return vocab

	def _tokenize(self, text):
	return self.sp_model.EncodeAsPieces(text)

	def _convert_token_to_id(self, token):
	""" Converts a token (str) in an id using the vocab. """
	return self.sp_model.PieceToId(token)

	def _convert_id_to_token(self, index):
	"""Converts an index (integer) in a token (str) using the vocab."""
	return self.sp_model.IdToPiece(index)

	def __getstate__(self):
	state = self.__dict__.copy()
	state["sp_model"] = None
	return state

	def __setstate__(self, d):
	self.__dict__ = d
	self.sp_model = spm.SentencePieceProcessor()
	self.sp_model.Load(self.vocab_file)

	def convert_tokens_to_string(self, tokens):
	"""Converts a sequence of tokens (strings for sub-words) in a single string."""
	out_string = "".join(tokens).replace(SPIECE_UNDERLINE, "\n").strip()
	return out_string

	def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
	if not os.path.isdir(save_directory):
	logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
	return
	out_vocab_file = os.path.join(
	save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
	)

	if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
	copyfile(self.vocab_file, out_vocab_file)

	return (out_vocab_file,)

	def prepare_for_tokenization(self, text, space_token=SPACE_TOKEN, is_split_into_words=False, **kwargs):
	if "is_pretokenized" in kwargs:
	warnings.warn(
	"`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
	FutureWarning,
	)
	is_split_into_words = kwargs.pop("is_pretokenized")

	# replace empty space with special space token

	text = text.replace(' ', space_token)

	return (text, kwargs)


	class BaseThaiWordsTokenizer(PreTrainedTokenizer):
	"""Base cass for word level tokenizer."""

	def build_inputs_with_special_tokens(
	self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
	) -> List[int]:
	"""
	Build model inputs from a sequence or a pair of sequence for sequence classification tasks
	by concatenating and adding special tokens.
	An CamemBERT sequence has the following format:

	- single sequence: ``<s> X </s>``
	- pair of sequences: ``<s> A </s></s> B </s>``

	Args:
	token_ids_0 (:obj:`List[int]`):
	List of IDs to which the special tokens will be added.
	token_ids_1 (:obj:`List[int]`, `optional`):
	Optional second list of IDs for sequence pairs.

	Returns:
	:obj:`List[int]`: List of `input IDs <../glossary.html#input-ids>`__ with the appropriate special tokens.
	"""
	if token_ids_1 is None:
	return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
	cls = [self.cls_token_id]
	sep = [self.sep_token_id]
	return cls + token_ids_0 + sep + sep + token_ids_1 + sep

	def get_special_tokens_mask(
	self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
	) -> List[int]:
	"""
	Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
	special tokens using the tokenizer ``prepare_for_model`` method.

	Args:
	token_ids_0 (:obj:`List[int]`):
	List of IDs.
	token_ids_1 (:obj:`List[int]`, `optional`):
	Optional second list of IDs for sequence pairs.
	already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
	Whether or not the token list is already formatted with special tokens for the model.

	Returns:
	:obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
	"""
	if already_has_special_tokens:
	if token_ids_1 is not None:
	raise ValueError(
	"You should not supply a second sequence if the provided sequence of "
	"ids is already formated with special tokens for the model."
	)
	return list(map(lambda x: 1 if x in [self.sep_token_id, self.cls_token_id] else 0, token_ids_0))

	if token_ids_1 is None:
	return [1] + ([0] * len(token_ids_0)) + [1]
	return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]

	def create_token_type_ids_from_sequences(
	self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
	) -> List[int]:
	"""
	Create a mask from the two sequences passed to be used in a sequence-pair classification task.
	CamemBERT, like RoBERTa, does not make use of token type ids, therefore a list of zeros is returned.

	Args:
	token_ids_0 (:obj:`List[int]`):
	List of IDs.
	token_ids_1 (:obj:`List[int]`, `optional`):
	Optional second list of IDs for sequence pairs.

	Returns:
	:obj:`List[int]`: List of zeros.
	"""
	sep = [self.sep_token_id]
	cls = [self.cls_token_id]

	if token_ids_1 is None:
	return len(cls + token_ids_0 + sep) * [0]
	return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

	@property
	def vocab_size(self):
	return len(self.tokenizer_model.get_vocab())

	def get_vocab(self):
	vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
	return vocab

	def _tokenize(self, text):
	return self.tokenizer_model.encode(text).tokens

	def _convert_token_to_id(self, token):
	""" Converts a token (str) in an id using the vocab. """
	i = self.tokenizer_model.token_to_id(token)
	if i is None:
	return self.unk_token_id
	else:
	return i

	def _convert_id_to_token(self, index):
	"""Converts an index (integer) in a token (str) using the vocab."""
	return self.tokenizer_model.id_to_token(index)

	def convert_tokens_to_string(self, tokens):
	"""Converts a sequence of tokens (strings for sub-words) in a single string."""
	out_string = "".join(tokens).strip()
	return out_string

	def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
	if not os.path.isdir(save_directory):
	logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
	return
	out_vocab_file = os.path.join(
	save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
	)

	if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
	copyfile(self.vocab_file, out_vocab_file)

	return (out_vocab_file,)

	def prepare_for_tokenization(self, text, space_token=SPACE_TOKEN, is_split_into_words=False, **kwargs):
	if "is_pretokenized" in kwargs:
	warnings.warn(
	"`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
	FutureWarning,
	)
	is_split_into_words = kwargs.pop("is_pretokenized")

	# replace empty space with special space token

	text = text.replace(' ', space_token)

	return (text, kwargs)

	def __getstate__(self):
	# What is this funcion even do?
	raise NotImplementedError

	def __setstate__(self, d):
	# What is this funcion even do?
	raise NotImplementedError


	class ThaiWordsNewmmTokenizer(BaseThaiWordsTokenizer):
	"""
	Newmm tokenizer.
	"""
	vocab_files_names = {"vocab_file": "newmm.json"} # vocabulary file location in folder

	def __init__(
	self,
	vocab_file,
	bos_token="<s>",
	eos_token="</s>",
	sep_token="</s>",
	cls_token="<s>",
	unk_token="<unk>",
	pad_token="<pad>",
	mask_token="<mask>",
	additional_special_tokens=ADDITIONAL_SPECIAL_TOKENS,
	**kwargs
	):
	super().__init__(
	bos_token=bos_token,
	eos_token=eos_token,
	unk_token=unk_token,
	sep_token=sep_token,
	cls_token=cls_token,
	pad_token=pad_token,
	mask_token=mask_token,
	additional_special_tokens=ADDITIONAL_SPECIAL_TOKENS,
	**kwargs,
	)
	pre_tokenizer_func = PRE_TOKENIZERS_MAP['newmm']
	custom_pre_tokenizer = pre_tokenizers.PreTokenizer.custom(
	CustomPreTokenizer(pre_tokenizer_func))
	tokenizer = Tokenizer(models.WordLevel.from_file(vocab_file))
	tokenizer.pre_tokenizer = custom_pre_tokenizer
	self.tokenizer_model = tokenizer
	self.vocab_file = vocab_file

	def __getstate__(self):
	state = self.__dict__.copy()
	state["tokenizer_model"] = None
	return state

	def __setstate__(self, d):
	self.__dict__ = d
	pre_tokenizer_func = PRE_TOKENIZERS_MAP['newmm']
	custom_pre_tokenizer = pre_tokenizers.PreTokenizer.custom(
	CustomPreTokenizer(pre_tokenizer_func))
	tokenizer = Tokenizer(models.WordLevel.from_file(self.vocab_file))
	tokenizer.pre_tokenizer = custom_pre_tokenizer
	self.tokenizer_model = tokenizer


	class ThaiWordsSyllableTokenizer(BaseThaiWordsTokenizer):
	"""
	Syllable tokenizer.
	"""
	vocab_files_names = {"vocab_file": "syllable.json"}

	def __init__(
	self,
	vocab_file,
	bos_token="<s>",
	eos_token="</s>",
	sep_token="</s>",
	cls_token="<s>",
	unk_token="<unk>",
	pad_token="<pad>",
	mask_token="<mask>",
	additional_special_tokens=ADDITIONAL_SPECIAL_TOKENS,
	**kwargs
	):
	super().__init__(
	bos_token=bos_token,
	eos_token=eos_token,
	unk_token=unk_token,
	sep_token=sep_token,
	cls_token=cls_token,
	pad_token=pad_token,
	mask_token=mask_token,
	additional_special_tokens=ADDITIONAL_SPECIAL_TOKENS,
	**kwargs,
	)
	pre_tokenizer_func = PRE_TOKENIZERS_MAP['syllable']
	custom_pre_tokenizer = pre_tokenizers.PreTokenizer.custom(
	CustomPreTokenizer(pre_tokenizer_func))
	tokenizer = Tokenizer(models.WordLevel.from_file(vocab_file))
	tokenizer.pre_tokenizer = custom_pre_tokenizer
	self.tokenizer_model = tokenizer
	self.vocab_file = vocab_file

	def __getstate__(self):
	state = self.__dict__.copy()
	state["tokenizer_model"] = None
	return state

	def __setstate__(self, d):
	self.__dict__ = d
	pre_tokenizer_func = PRE_TOKENIZERS_MAP['syllable']
	custom_pre_tokenizer = pre_tokenizers.PreTokenizer.custom(
	CustomPreTokenizer(pre_tokenizer_func))
	tokenizer = Tokenizer(models.WordLevel.from_file(self.vocab_file))
	tokenizer.pre_tokenizer = custom_pre_tokenizer
	self.tokenizer_model = tokenizer


	class FakeSefrCutTokenizer(BaseThaiWordsTokenizer):
	"""
	FakeSefrCut tokenizer.
	"""
	vocab_files_names = {"vocab_file": "fake_sefr_cut.json"}

	def __init__(
	self,
	vocab_file,
	bos_token="<s>",
	eos_token="</s>",
	sep_token="</s>",
	cls_token="<s>",
	unk_token="<unk>",
	pad_token="<pad>",
	mask_token="<mask>",
	additional_special_tokens=ADDITIONAL_SPECIAL_TOKENS,
	**kwargs
	):
	super().__init__(
	bos_token=bos_token,
	eos_token=eos_token,
	unk_token=unk_token,
	sep_token=sep_token,
	cls_token=cls_token,
	pad_token=pad_token,
	mask_token=mask_token,
	additional_special_tokens=ADDITIONAL_SPECIAL_TOKENS,
	**kwargs,
	)
	pre_tokenizer_func = PRE_TOKENIZERS_MAP['fake_sefr_cut_keep_split_token']
	custom_pre_tokenizer = pre_tokenizers.PreTokenizer.custom(
	FakeSefrCustomTokenizer(pre_tokenizer_func))
	tokenizer = Tokenizer(models.WordLevel.from_file(vocab_file))
	tokenizer.pre_tokenizer = custom_pre_tokenizer
	self.tokenizer_model = tokenizer
	self.vocab_file = vocab_file