import os
import re
from typing import List, Optional, Tuple
from shutil import copyfile
import sentencepiece as spm
import warnings
import logging
import json
import multiprocessing
from collections import Counter
from typing import Collection, Callable, Dict
from tokenizers import NormalizedString, PreTokenizedString
from transformers.tokenization_utils import PreTrainedTokenizer
from tokenizers import Tokenizer, pre_tokenizers, models
from pythainlp.tokenize import word_tokenize
from pythainlp.corpus import thai_syllables, thai_words
from pythainlp.util.trie import Trie
from functools import partial


try:
    from thai2transformers.helper import get_file_size, multi_imap
except ModuleNotFoundError:
    import sys
    sys.path.append('../scripts')  # path hacking
    from thai2transformers.helper import get_file_size, multi_imap

# NOTE(review): getLogger() without a name returns the root logger, so records
# emitted through it are not attributed to this module.
logger = logging.getLogger()

# File name used for the SentencePiece model when a vocabulary is saved.
VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}

# Marker character that ThaiRobertaTokenizer.convert_tokens_to_string replaces
# when joining pieces (presumably the SentencePiece word-boundary symbol).
SPIECE_UNDERLINE = '▁'
# Token substituted for every ' ' before tokenization.
SPACE_TOKEN = "<_>"
# Not referenced anywhere in the visible part of this file.
DEPRECATED_SPACE_TOKEN = '<th_roberta_space_token>'
# Delimiter marking pre-computed word boundaries for the fake_sefr_cut splitters.
SEFR_SPLIT_TOKEN = '<|>'
# Tokens that must survive pre-tokenization as single units.
ADDITIONAL_SPECIAL_TOKENS = ['<s>', '<pad>', '</s>', '<unk>', '<mask>', SPACE_TOKEN, '\n']
# Same list without the space token.
ADDITIONAL_SPECIAL_TOKENS_EXCLUDE_SPACE_TOKEN = \
    [e for e in ADDITIONAL_SPECIAL_TOKENS if e != SPACE_TOKEN]
# Frozen set form of the list above, for membership tests.
SET_ADDITIONAL_SPECIAL_TOKENS = frozenset(ADDITIONAL_SPECIAL_TOKENS)

# Exposed as ``max_model_input_sizes`` on ThaiRobertaTokenizer.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "th-roberta-base": 514,
}

# Store pre tokenizer function (text cutter)
# Each entry maps a name to a callable that cuts a text into words. The
# 'newmm' and 'syllable' entries are ``word_tokenize`` bound to a custom
# dictionary that also contains the additional special tokens.
PRE_TOKENIZERS_MAP = {'newmm': partial(
    word_tokenize,
    custom_dict=Trie(frozenset(set(thai_words()).union(set(ADDITIONAL_SPECIAL_TOKENS))))
    ),
                      'syllable': partial(
    word_tokenize,
    custom_dict=Trie(frozenset(set(thai_syllables()).union(set(ADDITIONAL_SPECIAL_TOKENS))))
    ),
    }

# Default value of ``nb_cores`` in WordLevelTrainer.count_parallel.
_nb_cores = multiprocessing.cpu_count()


def split_additional_special_token(texts):
    """
    Split each text around the additional special tokens (space token excluded).

    The special tokens themselves are kept as separate elements. Empty
    fragments and whitespace-only fragments are dropped, unless the fragment
    is itself an additional special token (for example ``'\\n'``). Every
    ``SPACE_TOKEN`` inside a kept fragment is turned back into a real space so
    that the fragment can be handed to a word cutter.

    Args:
        texts: list of text.

    Returns:
        list_of_pre_cut_texts: list of list of pre cut text.

    Examples::

        >>> split_additional_special_token(['hello world</s></s>'])
        [['hello world', '</s>', '</s>']]
    """
    # Escape every token so it is always matched literally, even if a token
    # containing a regex metacharacter is added to the list later.
    group = '|'.join(re.escape(token)
                     for token in ADDITIONAL_SPECIAL_TOKENS_EXCLUDE_SPACE_TOKEN)
    # The capturing group makes ``split`` return the matched tokens as well.
    splitter = re.compile(f'({group})')
    list_of_pre_cut_texts = []
    for text in texts:
        # Splitting consecutive special tokens yields empty strings; those are
        # filtered out together with whitespace-only fragments.
        pre_cut_texts = [
            e.replace(SPACE_TOKEN, ' ')
            for e in splitter.split(text)
            if e and (not e.isspace() or e in SET_ADDITIONAL_SPECIAL_TOKENS)
        ]
        list_of_pre_cut_texts.append(pre_cut_texts)
    return list_of_pre_cut_texts


def sefr_cut_tokenize(texts, n_jobs=1, chunk_size=200):
    """
    Tokenize texts with sefr_cut while keeping additional special tokens intact.

    Args:
        texts:
            list of texts, or a single text.
        n_jobs:
            Number of multiprocessing cores. -1 uses every available core,
            1 runs in the current process. Defaults to 1.
        chunk_size:
            number of texts handed to a worker at once when multiprocessing
            is used. Defaults to 200.

    Returns:
        list of list of tokens when ``texts`` is a list, otherwise the list of
        tokens for the single text.

    Examples::
        >>> sefr_cut_tokenize(['hello world</s></s>'])
        [['hello', '<_>', 'world', '</s>', '</s>']]
    """
    texts_is_list = isinstance(texts, list)
    if texts_is_list and n_jobs != 1:
        if n_jobs == -1:
            n_jobs = multiprocessing.cpu_count()
        # Each worker re-enters this function with the default n_jobs=1.
        return multi_imap(texts, chunk_size=chunk_size,
                          f=sefr_cut_tokenize, n_cores=n_jobs)
    if not texts_is_list:
        # Single text: wrap it, tokenize, and unwrap the only result.
        return sefr_cut_tokenize([texts])[0]

    # The heavy libraries are imported here rather than at module level so
    # that multiprocessing works reliably; importing them globally tends to
    # leave the workers stuck on a lock.
    import sefr_cut
    import tensorflow as tf
    # sefr_cut handles one text at a time, so tensorflow parallelism brings
    # little on top of multiprocessing; keep every process single threaded.
    os.environ['OMP_NUM_THREADS'] = '1'
    tf.config.threading.set_intra_op_parallelism_threads(1)
    tf.config.threading.set_inter_op_parallelism_threads(1)
    sefr_cut.load_model(engine='best')

    space_splitter = re.compile(f'({SPACE_TOKEN})')
    results = []
    for segments in split_additional_special_token(texts):
        # Cut every ordinary segment; special tokens pass through untouched.
        words = []
        for segment in segments:
            if segment in SET_ADDITIONAL_SPECIAL_TOKENS:
                words.append(segment)
            else:
                words.extend(sefr_cut.tokenize(segment)[0])

        tokens = []
        for word in words:
            # Restore the space token, then isolate it when it is glued to
            # other characters.
            word = word.replace(' ', SPACE_TOKEN)
            if word != SPACE_TOKEN and SPACE_TOKEN in word:
                tokens.extend(part for part in space_splitter.split(word) if part)
            else:
                tokens.append(word)
        results.append(tokens)
    return results


# Register sefr_cut with n_jobs=-1, i.e. using every available core.
# TODO: consider a cleaner registration mechanism than mutating the map here.
PRE_TOKENIZERS_MAP['sefr_cut'] = partial(sefr_cut_tokenize, n_jobs=-1)

# Capturing pattern for SEFR_SPLIT_TOKEN, so ``split`` also returns the token.
sefr_cut_splitter = re.compile(f'({re.escape(SEFR_SPLIT_TOKEN)})')


def fake_sefr_cut_keep_split_token(text):
    """
    Cut text at every SEFR_SPLIT_TOKEN, keeping the token as its own element.

    Empty fragments produced by the split are discarded.

    Args:
        text: string.

    Returns:
        list: tokens, including the split tokens.

    Examples::

        >>> SEFR_SPLIT_TOKEN
        '<|>'
        >>> fake_sefr_cut_keep_split_token(f'hello{SEFR_SPLIT_TOKEN}world')
        ['hello', '<|>', 'world']
    """
    pieces = []
    for piece in sefr_cut_splitter.split(text):
        if piece:
            pieces.append(piece)
    return pieces


def fake_sefr_cut(text):
    """
    Cut text at every SEFR_SPLIT_TOKEN and drop the token itself.

    Unlike ``fake_sefr_cut_keep_split_token``, empty fragments are kept.

    Args:
        text: string.

    Returns:
        list: tokens.

    Examples::

        >>> SEFR_SPLIT_TOKEN
        '<|>'
        >>> fake_sefr_cut(f'hello{SEFR_SPLIT_TOKEN}world')
        ['hello', 'world']
    """
    separator = SEFR_SPLIT_TOKEN
    pieces = text.split(separator)
    return pieces


# Register both SEFR_SPLIT_TOKEN based splitters: the first drops the split
# token, the second keeps it as a separate element.
PRE_TOKENIZERS_MAP['fake_sefr_cut'] = fake_sefr_cut
PRE_TOKENIZERS_MAP['fake_sefr_cut_keep_split_token'] = fake_sefr_cut_keep_split_token


class CustomPreTokenizer:
    """
    Pre tokenizer that splits a normalized string with a word cutting function.

    Args:
        pre_tokenize_func:
            callable taking a string and returning its words in order. The
            word lengths are used as offsets into the original string, so the
            words are expected to concatenate back to the input.
    """

    def __init__(self, pre_tokenize_func: Callable):
        self.pre_tokenize_func = pre_tokenize_func

    def split(
        self, n: int, normalized_string: NormalizedString
    ) -> Collection[NormalizedString]:
        """
        Slice ``normalized_string`` at the word boundaries found by the cutter.

        Args:
            n: unused here; supplied by ``PreTokenizedString.split``.
            normalized_string: the string to split.

        Returns:
            list of slices of ``normalized_string``, one per word.
        """
        text = str(normalized_string)
        # Cumulative word lengths are the offsets at which a new word starts.
        break_points = set()
        offset = 0
        for word in self.pre_tokenize_func(text):
            offset += len(word)
            break_points.add(offset)

        # Slice the original object rather than ``text`` so whatever it
        # tracks alongside the characters is preserved. Offsets at or past the
        # end are ignored; the final slice below covers the tail.
        splits = []
        last = 0
        for i in sorted(p for p in break_points if p < len(text)):
            splits.append(normalized_string[last:i])
            last = i
        splits.append(normalized_string[last:])
        return splits

    def pre_tokenize(self, pretok: PreTokenizedString):
        """Split ``pretok`` in place using :meth:`split`."""
        pretok.split(self.split)


class FakeSefrCustomTokenizer(CustomPreTokenizer):
    """
    CustomPreTokenizer variant that leaves SEFR_SPLIT_TOKEN out of the result.

    Args:
        pre_tokenize_func: pre tokenize function.
    """

    def split(
        self, n: int, normalized_string: NormalizedString
    ) -> Collection[NormalizedString]:
        """Return the slices of ``normalized_string`` that are not split tokens."""
        # Slices are taken from the original normalized string (not from a
        # plain str copy) so that the object keeps tracking its own state.
        spans = []
        cursor = 0
        for word in self.pre_tokenize_func(str(normalized_string)):
            end = cursor + len(word)
            if word != SEFR_SPLIT_TOKEN:
                spans.append((cursor, end))
            # Split tokens still consume characters, so always advance.
            cursor = end
        return [normalized_string[start:stop] for start, stop in spans]


class WordLevelTrainer:
    """
    Trainer for word level tokenizer.

    Counts the words produced by ``pre_tokenize_func`` over the input files
    and builds a ``{token: id}`` vocabulary in which the special tokens come
    first, in the order given.

    Args:
        pre_tokenize_func:
            pre tokenize function.
        input_files:
            text files for vocabulary creation (a single path is accepted too).
        additional_special_tokens:
            special tokens that will be explicitly added in vocabulary.
        vocab_size:
            number of most frequent non-special words to keep.
        vocab_min_freq:
            minimum frequency required to keep a word in vocabulary.
        progress:
            show progress.

    Raises:
        AttributeError: if both ``vocab_size`` and ``vocab_min_freq`` are given.

    Examples::

        >>> trainer = WordLevelTrainer(pre_tokenize_func=pre_tokenizer_func,
                                       vocab_size=custom_args.vocab_size,
                                       vocab_min_freq=custom_args.vocab_min_freq,
                                       input_files=train_files,
                                       additional_special_tokens=additional_special_tokens)
        >>> trainer.count_parallel()
        >>> trainer.save_vocab(custom_args.output_file)
    """

    def __init__(
        self,
        pre_tokenize_func: Callable,
        input_files: str,
        additional_special_tokens: Collection[str],
        vocab_size: Optional[int] = None,
        vocab_min_freq: Optional[int] = None,
        progress: bool = True
    ):
        self.pre_tokenize_func = pre_tokenize_func
        self.vocab_size = vocab_size
        self.special_tokens = additional_special_tokens
        self.input_files = input_files
        self.vocab = None
        self.freq = None
        self.vocab_min_freq = vocab_min_freq
        self.progress = progress
        if self.vocab_min_freq is not None and self.vocab_size is not None:
            raise AttributeError('use only vocab_min_freq or vocab_size')

    def count_one(self, fname: str) -> Counter:
        """Count the words of every non-blank line in ``fname``."""
        counts = Counter()
        # The corpus is read as UTF-8 explicitly instead of relying on the
        # platform default encoding.
        with open(fname, "r", encoding="utf-8") as f:
            file_size = get_file_size(f)
            i = 0
            # ``readline`` is used (rather than iterating the file) so that
            # ``f.tell()`` stays usable for the progress report.
            while True:
                line = f.readline()
                if not line:
                    break
                line = line.strip()
                if line:
                    # Updating the counter per line avoids holding every word
                    # of the file in memory at once.
                    counts.update(self.pre_tokenize_func(line))
                i += 1
                if self.progress and i % 5000 == 0:
                    print(f'\rProcessed {f.tell() / file_size * 100:.2f}%',
                          flush=True, end=' ')
        return counts

    def count_parallel(self, nb_cores: int = _nb_cores) -> Dict[(str, int)]:
        """
        Count words over all input files and build the vocabulary.

        Args:
            nb_cores: currently unused; files are processed sequentially.

        Returns:
            dict mapping each token to its id. Special tokens get the first
            ids, followed by the kept words.
        """
        input_files = self.input_files
        if isinstance(input_files, str):
            # A single path must not be iterated character by character.
            input_files = [input_files]

        counter_all = Counter()
        for fname in input_files:
            counter_all.update(self.count_one(fname))

        # Take the special tokens out of the counter so that they are not
        # listed twice: they are always placed at the front of the vocabulary.
        special_tok_freq = {}
        for tok in self.special_tokens:
            if tok in counter_all:
                special_tok_freq[tok] = counter_all.pop(tok)

        if self.vocab_size is not None:
            kept = counter_all.most_common(self.vocab_size)
        else:
            # With no limit at all, every counted word is kept.
            min_freq = 0 if self.vocab_min_freq is None else self.vocab_min_freq
            kept = [(key, value) for key, value in counter_all.items()
                    if value >= min_freq]

        self.freq = [(tok, special_tok_freq.get(tok, 0))
                     for tok in self.special_tokens] + kept
        self.vocab = {tok: i for i, (tok, _) in enumerate(self.freq)}
        return self.vocab

    def save_vocab(self, output_path: str):
        """Write the vocabulary as JSON to ``output_path``, creating its directory."""
        directory = os.path.dirname(output_path)
        if directory:
            # ``os.makedirs('')`` raises, so skip it for a bare file name.
            os.makedirs(directory, exist_ok=True)
        with open(output_path, "w") as f:
            json.dump(self.vocab, f)


class ThaiRobertaTokenizer(PreTrainedTokenizer):
    """
    Adapted from :class:`~transformers.CamembertTokenizer`. Construct a
    Thai Roberta tokenizer. Based on `SentencePiece <https://github.com/google/sentencepiece>`__.

    This tokenizer inherits from :class:`~transformers.PreTrainedTokenizer` which contains most of the main methods.
    Users should refer to this superclass for more information regarding those methods.

    Args:
        vocab_file (:obj:`str`):
            `SentencePiece <https://github.com/google/sentencepiece>`__ model file that
            contains the vocabulary necessary to instantiate a tokenizer.
        bos_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
            The beginning of sequence token.

            .. note::

                When building a sequence using special tokens, this is not the token that is used for the beginning
                of sequence. The token used is the :obj:`cls_token`.
        eos_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
            The end of sequence token.

            .. note::

                When building a sequence using special tokens, this is not the token that is used for the end
                of sequence. The token used is the :obj:`sep_token`.
        sep_token (:obj:`str`, `optional`, defaults to :obj:`"</s>"`):
            The separator token, placed after each sequence when special tokens are added.
        cls_token (:obj:`str`, `optional`, defaults to :obj:`"<s>"`):
            The classifier token, placed first when special tokens are added.
        unk_token (:obj:`str`, `optional`, defaults to :obj:`"<unk>"`):
            The unknown token.
        pad_token (:obj:`str`, `optional`, defaults to :obj:`"<pad>"`):
            The token used for padding.
        mask_token (:obj:`str`, `optional`, defaults to :obj:`"<mask>"`):
            The token used for masking values.
        additional_special_tokens (:obj:`List[str]`, `optional`, defaults to :obj:`[SPACE_TOKEN]`):
            Additional special tokens used by the tokenizer.

    Attributes:
        sp_model (:obj:`SentencePieceProcessor`):
            The `SentencePiece` processor that is used for every conversion (string, tokens and IDs).
    """

    vocab_files_names = VOCAB_FILES_NAMES
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    model_input_names = ["attention_mask"]

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        additional_special_tokens=[SPACE_TOKEN],
        **kwargs
    ):
        # Hand the base class its own copy so that neither the default list
        # nor a caller's list ends up shared between tokenizer instances.
        if isinstance(additional_special_tokens, list):
            additional_special_tokens = list(additional_special_tokens)
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )
        self.sp_model = spm.SentencePieceProcessor()
        self.sp_model.Load(str(vocab_file))
        self.vocab_file = vocab_file

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences by adding special tokens.

        - single sequence: ``<s> X </s>``
        - pair of sequences: ``<s> A </s></s> B </s>``

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of input IDs with the appropriate special tokens.
        """
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        if token_ids_1 is None:
            return cls + token_ids_0 + sep
        return cls + token_ids_0 + sep + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Build a mask marking where the special tokens are (or will be) in a sequence.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.

        Raises:
            ValueError: if ``already_has_special_tokens`` is set and a second sequence is given.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formated with special tokens for the model."
                )
            # Only the separator and classifier ids are reported as special.
            special_ids = (self.sep_token_id, self.cls_token_id)
            return [1 if token_id in special_ids else 0 for token_id in token_ids_0]

        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Return token type ids for the sequence(s); this tokenizer always returns zeros.

        The length matches the output of :meth:`build_inputs_with_special_tokens`.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of zeros.
        """
        if token_ids_1 is None:
            # <s> X </s>
            return [0] * (len(token_ids_0) + 2)
        # <s> A </s></s> B </s>
        return [0] * (len(token_ids_0) + len(token_ids_1) + 4)

    @property
    def vocab_size(self):
        """Number of pieces in the SentencePiece model."""
        return len(self.sp_model)

    def get_vocab(self):
        """Return a ``{token: id}`` dict covering every id below ``vocab_size``."""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        return vocab

    def _tokenize(self, text):
        """Encode ``text`` into SentencePiece pieces."""
        return self.sp_model.EncodeAsPieces(text)

    def _convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab. """
        return self.sp_model.PieceToId(token)

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.sp_model.IdToPiece(index)

    def __getstate__(self):
        """Return the instance state without the SentencePiece processor."""
        state = self.__dict__.copy()
        state["sp_model"] = None
        return state

    def __setstate__(self, d):
        """Restore the state and reload the SentencePiece model from ``vocab_file``."""
        self.__dict__ = d
        self.sp_model = spm.SentencePieceProcessor()
        # Same ``str`` conversion as in ``__init__`` so that a path-like
        # ``vocab_file`` is handled identically after unpickling.
        self.sp_model.Load(str(self.vocab_file))

    def convert_tokens_to_string(self, tokens):
        """Join tokens into one string, replacing SPIECE_UNDERLINE and stripping the ends."""
        # NOTE(review): the marker is replaced with a newline rather than a
        # space or an empty string - confirm this is intended.
        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, "\n").strip()
        return out_string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Copy the SentencePiece model file into ``save_directory``.

        Args:
            save_directory: existing directory to save into.
            filename_prefix: optional prefix, joined to the file name with ``-``.

        Returns:
            one-element tuple with the saved file path, or ``None`` (after
            logging an error) when ``save_directory`` is not a directory.
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
        )

        # Nothing to copy when saving over the file that was loaded.
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)

    def prepare_for_tokenization(self, text, space_token=SPACE_TOKEN, is_split_into_words=False, **kwargs):
        """
        Replace every space in ``text`` with ``space_token`` before tokenization.

        Args:
            text: the text to prepare.
            space_token: token substituted for each ``' '``.
            is_split_into_words: accepted for API compatibility; not used here.
            **kwargs: returned to the caller; a deprecated ``is_pretokenized``
                entry is removed with a ``FutureWarning``.

        Returns:
            tuple ``(text, kwargs)``.
        """
        if "is_pretokenized" in kwargs:
            warnings.warn(
                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
                FutureWarning,
            )
            is_split_into_words = kwargs.pop("is_pretokenized")

        # replace empty space with special space token
        text = text.replace(' ', space_token)

        return (text, kwargs)


class BaseThaiWordsTokenizer(PreTrainedTokenizer):
    """
    Base class for word level tokenizer.

    Subclasses are expected to set ``self.tokenizer_model`` (used for every
    conversion) and ``self.vocab_file`` (used when saving), and to declare
    their vocabulary file name in ``vocab_files_names``.
    """

    def build_inputs_with_special_tokens(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Build model inputs from a sequence or a pair of sequences by adding special tokens.

        - single sequence: ``<s> X </s>``
        - pair of sequences: ``<s> A </s></s> B </s>``

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of input IDs with the appropriate special tokens.
        """
        cls = [self.cls_token_id]
        sep = [self.sep_token_id]
        if token_ids_1 is None:
            return cls + token_ids_0 + sep
        return cls + token_ids_0 + sep + sep + token_ids_1 + sep

    def get_special_tokens_mask(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
    ) -> List[int]:
        """
        Build a mask marking where the special tokens are (or will be) in a sequence.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.
            already_has_special_tokens (:obj:`bool`, `optional`, defaults to :obj:`False`):
                Whether or not the token list is already formatted with special tokens for the model.

        Returns:
            :obj:`List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.

        Raises:
            ValueError: if ``already_has_special_tokens`` is set and a second sequence is given.
        """
        if already_has_special_tokens:
            if token_ids_1 is not None:
                raise ValueError(
                    "You should not supply a second sequence if the provided sequence of "
                    "ids is already formated with special tokens for the model."
                )
            # Only the separator and classifier ids are reported as special.
            special_ids = (self.sep_token_id, self.cls_token_id)
            return [1 if token_id in special_ids else 0 for token_id in token_ids_0]

        if token_ids_1 is None:
            return [1] + ([0] * len(token_ids_0)) + [1]
        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]

    def create_token_type_ids_from_sequences(
        self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
    ) -> List[int]:
        """
        Return token type ids for the sequence(s); this tokenizer always returns zeros.

        The length matches the output of :meth:`build_inputs_with_special_tokens`.

        Args:
            token_ids_0 (:obj:`List[int]`):
                List of IDs.
            token_ids_1 (:obj:`List[int]`, `optional`):
                Optional second list of IDs for sequence pairs.

        Returns:
            :obj:`List[int]`: List of zeros.
        """
        if token_ids_1 is None:
            # <s> X </s>
            return [0] * (len(token_ids_0) + 2)
        # <s> A </s></s> B </s>
        return [0] * (len(token_ids_0) + len(token_ids_1) + 4)

    @property
    def vocab_size(self):
        """Number of entries in the underlying tokenizer model's vocabulary."""
        return len(self.tokenizer_model.get_vocab())

    def get_vocab(self):
        """Return a ``{token: id}`` dict covering every id below ``vocab_size``."""
        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
        return vocab

    def _tokenize(self, text):
        """Encode ``text`` with the underlying tokenizer model and return its tokens."""
        return self.tokenizer_model.encode(text).tokens

    def _convert_token_to_id(self, token):
        """ Converts a token (str) in an id using the vocab. """
        i = self.tokenizer_model.token_to_id(token)
        # Tokens missing from the vocabulary map to the unknown token id.
        return self.unk_token_id if i is None else i

    def _convert_id_to_token(self, index):
        """Converts an index (integer) in a token (str) using the vocab."""
        return self.tokenizer_model.id_to_token(index)

    def convert_tokens_to_string(self, tokens):
        """Converts a sequence of tokens (strings for sub-words) in a single string."""
        out_string = "".join(tokens).strip()
        return out_string

    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
        """
        Copy the vocabulary file into ``save_directory``.

        The file is saved under the name the concrete class declares in
        ``vocab_files_names`` so that the saved directory contains the file
        that class looks for. The module-level default name is used only when
        the class declares none.

        Args:
            save_directory: existing directory to save into.
            filename_prefix: optional prefix, joined to the file name with ``-``.

        Returns:
            one-element tuple with the saved file path, or ``None`` (after
            logging an error) when ``save_directory`` is not a directory.
        """
        if not os.path.isdir(save_directory):
            logger.error("Vocabulary path ({}) should be a directory".format(save_directory))
            return
        declared_names = getattr(self, "vocab_files_names", None) or {}
        vocab_file_name = declared_names.get("vocab_file", VOCAB_FILES_NAMES["vocab_file"])
        out_vocab_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + vocab_file_name
        )

        # Nothing to copy when saving over the file that was loaded.
        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)

    def prepare_for_tokenization(self, text, space_token=SPACE_TOKEN, is_split_into_words=False, **kwargs):
        """
        Replace every space in ``text`` with ``space_token`` before tokenization.

        Args:
            text: the text to prepare.
            space_token: token substituted for each ``' '``.
            is_split_into_words: accepted for API compatibility; not used here.
            **kwargs: returned to the caller; a deprecated ``is_pretokenized``
                entry is removed with a ``FutureWarning``.

        Returns:
            tuple ``(text, kwargs)``.
        """
        if "is_pretokenized" in kwargs:
            warnings.warn(
                "`is_pretokenized` is deprecated and will be removed in a future version, use `is_split_into_words` instead.",
                FutureWarning,
            )
            is_split_into_words = kwargs.pop("is_pretokenized")

        # replace empty space with special space token
        text = text.replace(' ', space_token)

        return (text, kwargs)

    def __getstate__(self):
        # Pickling support must be provided by the concrete subclass.
        raise NotImplementedError

    def __setstate__(self, d):
        # Unpickling support must be provided by the concrete subclass.
        raise NotImplementedError


class ThaiWordsNewmmTokenizer(BaseThaiWordsTokenizer):
    """
    Word level tokenizer whose words are cut by the 'newmm' pre tokenizer.

    The vocabulary is a WordLevel JSON file. The ``additional_special_tokens``
    argument is accepted but the module level ADDITIONAL_SPECIAL_TOKENS list
    is what gets registered.
    """
    # Name of the vocabulary file inside a saved tokenizer folder.
    vocab_files_names = {"vocab_file": "newmm.json"}

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        additional_special_tokens=ADDITIONAL_SPECIAL_TOKENS,
        **kwargs
    ):
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            additional_special_tokens=ADDITIONAL_SPECIAL_TOKENS,
            **kwargs,
        )
        self.tokenizer_model = self._load_tokenizer_model(vocab_file)
        self.vocab_file = vocab_file

    @staticmethod
    def _load_tokenizer_model(vocab_file):
        """Build the WordLevel tokenizer for ``vocab_file`` with the newmm cutter."""
        cutter = pre_tokenizers.PreTokenizer.custom(
            CustomPreTokenizer(PRE_TOKENIZERS_MAP['newmm']))
        word_level = Tokenizer(models.WordLevel.from_file(vocab_file))
        word_level.pre_tokenizer = cutter
        return word_level

    def __getstate__(self):
        """Return the instance state without the tokenizer model."""
        snapshot = self.__dict__.copy()
        snapshot["tokenizer_model"] = None
        return snapshot

    def __setstate__(self, d):
        """Restore the state and rebuild the tokenizer model from ``vocab_file``."""
        self.__dict__ = d
        self.tokenizer_model = self._load_tokenizer_model(self.vocab_file)


class ThaiWordsSyllableTokenizer(BaseThaiWordsTokenizer):
    """
    Word level tokenizer whose units are cut by the 'syllable' pre tokenizer.

    The vocabulary is a WordLevel JSON file. The ``additional_special_tokens``
    argument is accepted but the module level ADDITIONAL_SPECIAL_TOKENS list
    is what gets registered.
    """
    # Name of the vocabulary file inside a saved tokenizer folder.
    vocab_files_names = {"vocab_file": "syllable.json"}

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        additional_special_tokens=ADDITIONAL_SPECIAL_TOKENS,
        **kwargs
    ):
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            additional_special_tokens=ADDITIONAL_SPECIAL_TOKENS,
            **kwargs,
        )
        self.tokenizer_model = self._load_tokenizer_model(vocab_file)
        self.vocab_file = vocab_file

    @staticmethod
    def _load_tokenizer_model(vocab_file):
        """Build the WordLevel tokenizer for ``vocab_file`` with the syllable cutter."""
        cutter = pre_tokenizers.PreTokenizer.custom(
            CustomPreTokenizer(PRE_TOKENIZERS_MAP['syllable']))
        word_level = Tokenizer(models.WordLevel.from_file(vocab_file))
        word_level.pre_tokenizer = cutter
        return word_level

    def __getstate__(self):
        """Return the instance state without the tokenizer model."""
        snapshot = self.__dict__.copy()
        snapshot["tokenizer_model"] = None
        return snapshot

    def __setstate__(self, d):
        """Restore the state and rebuild the tokenizer model from ``vocab_file``."""
        self.__dict__ = d
        self.tokenizer_model = self._load_tokenizer_model(self.vocab_file)


class FakeSefrCutTokenizer(BaseThaiWordsTokenizer):
    """
    Word level tokenizer for text whose word boundaries are marked with
    SEFR_SPLIT_TOKEN.

    The vocabulary is a WordLevel JSON file. The ``additional_special_tokens``
    argument is accepted but the module level ADDITIONAL_SPECIAL_TOKENS list
    is what gets registered.
    """
    # Name of the vocabulary file inside a saved tokenizer folder.
    vocab_files_names = {"vocab_file": "fake_sefr_cut.json"}

    def __init__(
        self,
        vocab_file,
        bos_token="<s>",
        eos_token="</s>",
        sep_token="</s>",
        cls_token="<s>",
        unk_token="<unk>",
        pad_token="<pad>",
        mask_token="<mask>",
        additional_special_tokens=ADDITIONAL_SPECIAL_TOKENS,
        **kwargs
    ):
        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            sep_token=sep_token,
            cls_token=cls_token,
            pad_token=pad_token,
            mask_token=mask_token,
            additional_special_tokens=ADDITIONAL_SPECIAL_TOKENS,
            **kwargs,
        )
        # The cutter keeps the split token; FakeSefrCustomTokenizer then
        # leaves it out of the splits it returns.
        cutter = pre_tokenizers.PreTokenizer.custom(
            FakeSefrCustomTokenizer(PRE_TOKENIZERS_MAP['fake_sefr_cut_keep_split_token']))
        word_level = Tokenizer(models.WordLevel.from_file(vocab_file))
        word_level.pre_tokenizer = cutter
        self.tokenizer_model = word_level
        self.vocab_file = vocab_file