Source code for pororo.tasks.tokenization

"""Tokenization related modeling class"""

import json
import os
import re
from abc import abstractmethod
from typing import List, Optional
from unicodedata import normalize

from kss import split_sentences

from pororo.tasks.utils.base import PororoFactoryBase, PororoSimpleBase
from pororo.tasks.utils.download_utils import download_or_load


class PororoTokenizationFactory(PororoFactoryBase):
    """
    Tokenize a sentence with the user-selected tokenization model (vocabulary).

    Args:
        sent: (str) sentence to be tokenized

    Returns:
        List[str]: tokenized token list

    Examples:
        >>> tk = Pororo(task="tokenization", lang="ko", model="bpe32k.ko")
        >>> tk("하늘을 나는 새를 보았다")
        ["_하늘", "을", "_나는", "_새", "를", "_보", "았다"]
        >>> tk = Pororo(task="tokenization", lang="en", model="roberta")
        >>> tk("I love you")
        ['I', 'Ġlove', 'Ġyou']
        >>> tk('''If the values aren’t unique, there is no unique inversion of the dictionary anyway or, with other words, inverting does not make sense.''')
        ['If', 'Ġthe', 'Ġvalues', 'Ġaren', 'âĢ', 'Ļ', 't', 'Ġunique', ',', 'Ġthere', 'Ġis', 'Ġno', 'Ġunique', 'Ġin', 'version', 'Ġof', 'Ġthe', 'Ġdictionary', 'Ġanyway', 'Ġor', ',', 'Ġwith', 'Ġother', 'Ġwords', ',', 'Ġinver', 'ting', 'Ġdoes', 'Ġnot', 'Ġmake', 'Ġsense', '.']

    """

    def __init__(self, task: str, lang: str, model: Optional[str]):
        super().__init__(task, lang, model)
    @staticmethod
    def get_available_langs():
        return ["en", "ko", "ja", "zh"]
    @staticmethod
    def get_available_models():
        return {
            "en": [
                "moses",
                "bpe32k.en",
                "roberta",
                "sent_en",
            ],
            "ko": [
                "bpe4k.ko",
                "bpe8k.ko",
                "bpe16k.ko",
                "bpe32k.ko",
                "bpe64k.ko",
                "unigram4k.ko",
                "unigram8k.ko",
                "unigram16k.ko",
                "unigram32k.ko",
                "unigram64k.ko",
                "jpe4k.ko",
                "jpe8k.ko",
                "jpe16k.ko",
                "jpe32k.ko",
                "jpe64k.ko",
                "mecab.bpe4k.ko",
                "mecab.bpe8k.ko",
                "mecab.bpe16k.ko",
                "mecab.bpe32k.ko",
                "mecab.bpe64k.ko",
                "char",
                "jamo",
                "word",
                "mecab_ko",
                "sent_ko",
            ],
            "ja": [
                "mecab",
                "bpe8k.ja",
                "sent_ja",
            ],
            "zh": [
                "jieba",
                "sent_zh",
            ],
        }
    def load(self, device: str):
        """
        Load user-selected task-specific model

        Args:
            device (str): device information

        Returns:
            object: User-selected task-specific model

        """
        if "sent" in self.config.n_model:
            import nltk

            try:
                nltk.data.find("tokenizers/punkt")
            except LookupError:
                nltk.download("punkt")

            from nltk.tokenize import sent_tokenize

            return PororoSentTokenizer(sent_tokenize, self.config)

        if self.config.n_model == "mecab_ko":
            try:
                import mecab
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install python-mecab-ko with: `pip install python-mecab-ko`"
                )
            model = mecab.MeCab()
            return PororoMecabKoTokenizer(model, self.config)

        if self.config.n_model == "char":
            return PororoCharTokenizer(self.config)

        if self.config.n_model == "jamo":
            return PororoJamoTokenizer(self.config)

        if self.config.n_model == "word":
            return PororoWordTokenizer(self.config)

        if self.config.n_model == "roberta":
            from fairseq.data.encoders.gpt2_bpe import get_encoder

            encoder = download_or_load("misc/encoder.json", self.config.lang)
            vocab = download_or_load("misc/vocab.bpe", self.config.lang)
            model = get_encoder(encoder, vocab)

            with open(encoder, "r") as f_vocab:
                vocab = json.load(f_vocab)
                inv_dict = {v: k for k, v in vocab.items()}

            return PororoRoBERTaTokenizer(model, vocab, inv_dict, self.config)

        if self.config.n_model == "moses":
            try:
                from sacremoses import MosesDetokenizer, MosesTokenizer
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install sacremoses with: `pip install sacremoses`")
            model = MosesTokenizer(lang="en")
            detok = MosesDetokenizer(lang="en")
            return PororoMosesTokenizer(model, detok, self.config)

        if self.config.n_model == "jieba":
            try:
                import jieba
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install jieba with: `pip install jieba`")
            model = jieba.cut
            return PororoJiebaTokenizer(model, self.config)

        if self.config.n_model == "mecab":
            try:
                import fugashi
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install fugashi with: `pip install fugashi`")

            try:
                import ipadic
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install ipadic with: `pip install ipadic`")

            dic_dir = ipadic.DICDIR
            mecabrc = os.path.join(dic_dir, "mecabrc")
            mecab_option = "-d {} -r {} ".format(
                dic_dir,
                mecabrc,
            )
            model = fugashi.GenericTagger(mecab_option)
            return PororoMecabTokenizer(model, self.config)

        else:
            from pororo.tasks.utils.tokenizer import CustomTokenizer

            path = download_or_load(
                f"tokenizers/{self.config.n_model}.zip",
                self.config.lang,
            )

            ext = "json" if "unigram" not in self.config.n_model else "txt"
            merges_filename = (f"{path}/merges.txt"
                               if "unigram" not in self.config.n_model else None)

            model = CustomTokenizer.from_file(
                vocab_filename=f"{path}/vocab.{ext}",
                merges_filename=merges_filename,
                normalize=True if "jpe" not in self.config.n_model else False,
            )
            if "jpe" in self.config.n_model:
                return PororoJamoPairTokenizer(model, self.config)
            if "mecab.bpe" in self.config.n_model:
                return PororoMecabSPTokenizer(model, self.config)
            return PororoSPTokenizer(model, self.config)
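
# Illustrative usage sketch (not part of the original module): the factory above
# is normally reached through the top-level `Pororo` entry point shown in the
# class docstring. The model name "mecab_ko" comes from `get_available_models()`
# above, and the example assumes its optional dependency (python-mecab-ko) is
# installed; return values are indicative only.
def _example_factory_usage():
    from pororo import Pororo

    tk = Pororo(task="tokenization", lang="ko", model="mecab_ko")
    tokens = tk("하늘을 나는 새를 보았다")  # dispatches to PororoMecabKoTokenizer.predict
    return tk.detokenize(tokens)  # joins the tokens back into (roughly) the input text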

class PororoTokenizerBase(PororoSimpleBase):

    @abstractmethod
    def detokenize(self, tokens: List[str]):
        raise NotImplementedError("`detokenize()` is not implemented")

    @abstractmethod
    def convert_tokens_to_ids(self, tokens: List[str]):
        raise NotImplementedError(
            "`convert_tokens_to_ids()` is not implemented")
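
# Hedged sketch (not part of the original module): a concrete tokenizer only has
# to subclass `PororoTokenizerBase` and provide `predict` (plus `detokenize`
# where a round trip makes sense). The whitespace splitter below is purely
# hypothetical and exists only to illustrate the expected interface.
class _ExampleWhitespaceTokenizer(PororoTokenizerBase):

    def __init__(self, config):
        super().__init__(config)

    def detokenize(self, tokens: List[str]) -> str:
        return " ".join(tokens)

    def predict(self, text: str, **kwargs) -> List[str]:
        return text.strip().split()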

class PororoSentTokenizer(PororoTokenizerBase):

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model

    def cj_tokenize(self, text: str):
        text = text.replace("。", "。[SEP]")
        text = text.replace("!", "![SEP]")
        text = text.replace("?", "?[SEP]")

        if "[SEP]" in text:
            sents = text.split("[SEP]")
            sents = sents[:-1]
        else:
            sents = [text]

        return sents

    def predict(self, text: str, **kwargs) -> List[str]:
        if self.lang in ["zh", "ja"]:
            return self.cj_tokenize(text)
        elif self.lang == "ko":
            return split_sentences(text)
        return self._model(text)
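
# Hedged usage sketch (not part of the original module): sentence splitting is
# selected with the "sent_*" model names from `get_available_models()`. The
# class above delegates to `kss.split_sentences` for Korean, NLTK's
# `sent_tokenize` for English, and `cj_tokenize` for Chinese/Japanese. The input
# text here is illustrative.
def _example_sentence_split():
    from pororo import Pororo

    sent = Pororo(task="tokenization", lang="ko", model="sent_ko")
    # Returns a list with one string per sentence.
    return sent("하늘을 나는 새를 보았다. 나는 그 새를 다시 보았다.")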

class PororoMecabKoTokenizer(PororoTokenizerBase):

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model

    def detokenize(self, tokens: List[str]):
        text = "".join(tokens).replace("▃", " ").strip()
        return text

    def predict(
        self,
        text: str,
        **kwargs,
    ) -> List[str]:
        preserve_whitespace = kwargs.get("preserve_whitespace", True)

        text = text.strip()
        text_ptr = 0
        results = list()
        for unit in self._model.parse(text):
            token = unit[0]

            if preserve_whitespace:
                if text[text_ptr] == " ":
                    # Move text pointer to whitespace token to reserve whitespace
                    # cf. to prevent double white-space, we move pointer to next eojeol
                    while text[text_ptr] == " ":
                        text_ptr += 1
                    results.append(" ")

            results.append(token)
            text_ptr += len(token)

        return results
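
# Hedged usage sketch (not part of the original module): `predict` accepts a
# `preserve_whitespace` keyword (default True). When True, a literal " " token
# is emitted wherever the input had whitespace, so word boundaries survive a
# plain join; with False only the morpheme tokens are returned. Assumes the
# `Pororo` wrapper forwards keyword arguments to `predict` and that
# python-mecab-ko is installed.
def _example_preserve_whitespace():
    from pororo import Pororo

    tk = Pororo(task="tokenization", lang="ko", model="mecab_ko")
    with_spaces = tk("하늘을 나는 새", preserve_whitespace=True)
    morphemes_only = tk("하늘을 나는 새", preserve_whitespace=False)
    return with_spaces, morphemes_only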

class PororoMosesTokenizer(PororoTokenizerBase):

    def __init__(self, model, detok, config):
        super().__init__(config)
        self._model = model
        self._detok = detok

    def detokenize(self, tokens: List[str]):
        return self._detok.detokenize(tokens)

    def predict(self, text: str, **kwargs) -> List[str]:
        return self._model.tokenize(text)
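
# Hedged usage sketch (not part of the original module): the class above is a
# thin wrapper around sacremoses, so the round trip can also be reproduced with
# the library directly (assumes `pip install sacremoses`); output shown in the
# comments is indicative.
def _example_moses_roundtrip():
    from sacremoses import MosesDetokenizer, MosesTokenizer

    tok = MosesTokenizer(lang="en")
    detok = MosesDetokenizer(lang="en")
    tokens = tok.tokenize("Hello, world!")  # e.g. ['Hello', ',', 'world', '!']
    return detok.detokenize(tokens)  # "Hello, world!"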

class PororoJiebaTokenizer(PororoTokenizerBase):

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model

    def detokenize(self, tokens: List[str]):
        return "".join(tokens)

    def predict(self, text: str, **kwargs) -> List[str]:
        return list(self._model(text))
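
# Hedged usage sketch (not part of the original module): `load` passes
# `jieba.cut` in as the model, so `predict` simply materialises the generator
# that jieba returns (assumes `pip install jieba`); the segmentation shown in
# the comment is indicative.
def _example_jieba_cut():
    import jieba

    return list(jieba.cut("我爱北京天安门"))  # e.g. ['我', '爱', '北京', '天安门']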

class PororoMecabTokenizer(PororoTokenizerBase):

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model

    def detokenize(self, tokens: List[str]):
        return "".join(tokens)

    def predict(self, text: str, **kwargs) -> List[str]:
        parsed = self._model.parse(text)

        res = []
        for line in parsed.split("\n"):
            if line == "EOS":
                break
            toks = line.split("\t")
            res.append(toks[0])
        return res

class PororoWordTokenizer(PororoTokenizerBase):

    def __init__(self, config):
        super().__init__(config)

    def detokenize(self, tokens: List[str]) -> str:
        """
        Untokenizing a text undoes the tokenizing operation, restoring
        punctuation and spaces to the places that people expect them to be.
        Ideally, `untokenize(tokenize(text))` should be identical to `text`,
        except for line breaks.
        """
        text = " ".join(tokens)

        step1 = text.replace("`` ", '"').replace(" ''", '"')
        step1 = step1.replace(". . .", "...")
        step2 = step1.replace(" ( ", " (").replace(" ) ", ") ")
        step3 = re.sub(r' ([.,:;?!%]+)([ \'"`])', r"\1\2", step2)
        step4 = re.sub(r" ([.,:;?!%]+)$", r"\1", step3)
        step5 = step4.replace(" '", "'").replace(" n't", "n't")
        step5 = step5.replace("can not", "cannot")
        step6 = step5.replace(" ` ", " '")
        return step6.strip()

    def predict(self, text: str, **kwargs) -> List[str]:
        return re.findall(r"\w+|[^\w\s]", text, re.UNICODE)
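
# Illustration (not part of the original module): `predict` above splits text
# into runs of word characters and single punctuation marks, so its behaviour
# can be checked with the same regular expression in isolation.
def _example_word_regex():
    import re

    pattern = r"\w+|[^\w\s]"
    # -> ['Don', "'", 't', 'panic', ',', 'it', "'", 's', 'fine', '.']
    return re.findall(pattern, "Don't panic, it's fine.", re.UNICODE)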

class PororoCharTokenizer(PororoTokenizerBase):

    def __init__(self, config):
        super().__init__(config)

    def detokenize(self, tokens: List[str]):
        text = "".join(tokens).replace("▁", " ").strip()
        return text

    def predict(self, text: str, **kwargs) -> List[str]:
        text = text.strip().replace(" ", "▁")
        return list(text)
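
# Illustration (not part of the original module): character tokenization marks
# spaces with "▁" so the input (modulo outer whitespace) can be rebuilt by
# `detokenize`; the snippet replays that logic without the Pororo wrapper.
def _example_char_roundtrip(text: str = "하늘을 나는 새"):
    tokens = list(text.strip().replace(" ", "▁"))  # same as predict above
    restored = "".join(tokens).replace("▁", " ").strip()  # same as detokenize above
    return tokens, restored == text.strip()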

class PororoJamoTokenizer(PororoTokenizerBase):

    def __init__(self, config):
        super().__init__(config)

    def detokenize(self, tokens: List[str]):
        return normalize("NFKC", "".join(tokens)).replace("▁", " ")

    def predict(self, text: str, **kwargs) -> List[str]:
        return list("▁".join(
            [normalize("NFKD", token) for token in text.strip().split(" ")]))
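
# Illustration (not part of the original module): jamo tokenization relies on
# Unicode NFKD to decompose precomposed Hangul syllables into individual jamo
# and on NFKC to recompose them, exactly as the two methods above do.
def _example_jamo_roundtrip(word: str = "한글"):
    from unicodedata import normalize

    jamo = list(normalize("NFKD", word))  # e.g. ['ᄒ', 'ᅡ', 'ᆫ', 'ᄀ', 'ᅳ', 'ᆯ']
    return normalize("NFKC", "".join(jamo)) == word  # True: recomposition restores the word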

class PororoJamoPairTokenizer(PororoTokenizerBase):

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model

    def detokenize(self, tokens: List[str]):
        tokens = list("".join(tokens).replace("▁", " ").strip())
        return normalize("NFKC", "".join(tokens)).replace("▁", " ")

    def predict(self, text: str, **kwargs) -> List[str]:
        text = "▁".join(
            [normalize("NFKD", token) for token in text.strip().split(" ")])
        tokenized = self._model.segment(text.strip())
        return tokenized

class PororoSPTokenizer(PororoTokenizerBase):

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model

    def detokenize(self, tokens: List[str]):
        text = "".join(tokens).replace("▁", " ").strip()
        return text

    def predict(self, text: str, **kwargs):
        tokenized = self._model.segment(text.strip())
        return tokenized

class PororoMecabSPTokenizer(PororoTokenizerBase):

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model

    def detokenize(self, tokens: List[str]):
        text = "".join(tokens).replace("▁", " ").strip()
        return text

    def predict(self, text: str, **kwargs):
        tokenized = self._model.segment(text)
        return tokenized

class PororoRoBERTaTokenizer(PororoTokenizerBase):

    def __init__(self, model, vocab, inv_dict, config):
        super().__init__(config)
        self._model = model
        self._vocab = vocab
        self._inv_dict = inv_dict

    def convert_tokens_to_ids(self, tokens: List[str]):
        return [self._vocab[token] for token in tokens]

    def predict(self, text: str, **kwargs):
        tokens = self._model.encode(text)
        tokens = [self._inv_dict[token] for token in tokens]
        return tokens
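
# Hedged usage sketch (not part of the original module): the "roberta" model is
# the GPT-2 byte-level BPE loaded via fairseq's `get_encoder` above, which is
# why spaces appear as the "Ġ" prefix in the class docstring examples. Assumes
# fairseq is installed and the encoder/vocab files can be downloaded.
def _example_roberta_tokens_to_ids():
    from pororo import Pororo

    tk = Pororo(task="tokenization", lang="en", model="roberta")
    tokens = tk("I love you")  # ['I', 'Ġlove', 'Ġyou'] per the docstring above
    return tk.convert_tokens_to_ids(tokens)  # maps token strings back to vocabulary ids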