"""Part-Of-Speech Tagging related modeling class"""

import os
import re
from typing import List, Optional, Tuple, Union

from pororo.tasks.utils.base import PororoFactoryBase, PororoSimpleBase


class PororoPosFactory(PororoFactoryBase):
    """
    Conduct Part-of-Speech tagging

    Korean (`mecab-ko`)

        - dataset: N/A
        - metric: N/A

    Japanese (`mecab-ipadic`)

        - dataset: N/A
        - metric: N/A

    English (`nltk`)

        - dataset: N/A
        - metric: N/A

    Chinese (`jieba`)

        - dataset: N/A
        - metric: N/A

    Args:
        sent (str): input sentence to be tagged

    Returns:
        List[Tuple[str, str]]: list of token and its corresponding pos tag tuple

    Examples:
        >>> pos = Pororo(task="pos", lang="ko")
        >>> pos("안녕하세요. 제 이름은 카터입니다.")
        [('안녕', 'NNG'), ('하', 'XSV'), ('시', 'EP'), ('어요', 'EF'), ('.', 'SF'), (' ', 'SPACE'), ('저', 'NP'), ('의', 'JKG'), (' ', 'SPACE'), ('이름', 'NNG'), ('은', 'JX'), (' ', 'SPACE'), ('카터', 'NNP'), ('이', 'VCP'), ('ᄇ니다', 'EF'), ('.', 'SF')]
        >>> pos = Pororo("pos", lang="ja")
        >>> pos("日本語でペラペラではないです")
        [('日本語', '名詞'), ('で', '助詞'), ('ペラペラ', '副詞'), ('で', '助動詞'), ('は', '助詞'), ('ない', '助動詞'), ('です', '助動詞')]
        >>> pos = Pororo("pos", lang="en")
        >>> pos("The striped bats are hanging, on their feet for best.")
        [('The', 'DT'), (' ', 'SPACE'), ('striped', 'JJ'), (' ', 'SPACE'), ('bats', 'NNS'), (' ', 'SPACE'), ('are', 'VBP'), (' ', 'SPACE'), ('hanging', 'VBG'), (',', ','), (' ', 'SPACE'), ('on', 'IN'), (' ', 'SPACE'), ('their', 'PRP$'), (' ', 'SPACE'), ('feet', 'NNS'), (' ', 'SPACE'), ('for', 'IN'), (' ', 'SPACE'), ('best', 'JJS'), ('.', '.')]
        >>> pos = Pororo("pos", lang="zh")
        >>> pos("乒乓球拍卖完了")
        [('乒乓球', 'n'), ('拍卖', 'v'), ('完', 'v'), ('了', 'ul')]
    """

    def __init__(self, task: str, lang: str, model: Optional[str]):
        super().__init__(task, lang, model)
    @staticmethod
    def get_available_langs():
        return ["en", "ko", "ja", "zh"]
    @staticmethod
    def get_available_models():
        return {
            "en": ["nltk"],
            "ko": ["mecab-ko"],
            "ja": ["mecab-ipadic"],
            "zh": ["jieba"],
        }
    def load(self, device: str):
        """
        Load user-selected task-specific model

        Args:
            device (str): device information

        Returns:
            object: User-selected task-specific model
        """
        if self.config.n_model == "nltk":
            import nltk

            try:
                nltk.data.find("tokenizers/punkt")
            except LookupError:
                nltk.download("punkt")

            try:
                nltk.data.find("taggers/averaged_perceptron_tagger")
            except LookupError:
                nltk.download("averaged_perceptron_tagger")
            return PororoNLTKPosTagger(nltk, self.config)

        if self.config.n_model == "mecab-ko":
            try:
                import mecab
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install python-mecab-ko with: `pip install python-mecab-ko`"
                )
            model = mecab.MeCab()
            return PororoMecabPos(model, self.config)

        if self.config.n_model == "mecab-ipadic":
            try:
                import fugashi
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install fugashi with: `pip install fugashi`")

            try:
                import ipadic
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install ipadic with: `pip install ipadic`")

            dic_dir = ipadic.DICDIR
            mecabrc = os.path.join(dic_dir, "mecabrc")
            mecab_option = "-d {} -r {} ".format(
                dic_dir,
                mecabrc,
            )
            model = fugashi.GenericTagger(mecab_option)
            return PororoMecabJap(model, self.config)

        if self.config.n_model == "jieba":
            try:
                import jieba  # noqa
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install jieba with: `pip install jieba`")
            import jieba.posseg as jieba_pos

            model = jieba_pos
            return PororoJieba(model, self.config)

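# For reference, load() dispatches on the configured model name as follows
# (a summary of the branches above, not additional behavior):
#
#   "nltk"         -> PororoNLTKPosTagger  (English)
#   "mecab-ko"     -> PororoMecabPos       (Korean)
#   "mecab-ipadic" -> PororoMecabJap       (Japanese)
#   "jieba"        -> PororoJieba          (Chinese)
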
class PororoMecabPos(PororoSimpleBase):

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model

    def _postprocess(
        self,
        unit,
    ) -> Tuple[str, Union[Tuple[str, str], List[Tuple[str, str]]]]:
        """
        Examples:
            >>> parse('나\tNP,*,F,나,*,*,*,*')
            나/NP
            >>> parse('산다\tVV+EC,*,F,산다,Inflect,VV,EC,사/VV/*+ᆫ다/EC/*')
            사/VV+ᆫ다/EC
            >>> parse("', '\tSC,*,*,*,*,*,*,*")
            ,/SC
        """
        # Split each analysis line on tab, not comma, since commas frequently
        # appear in the input sentence itself
        morph = unit[0]
        features = unit[1]

        pos = features.pos
        analysis = features.expression

        if analysis and ("+" in analysis):
            if "*" in analysis:
                token = [m.rsplit("/", 1)[0] for m in analysis.split("+")]
                token = [(t.split("/")[0], t.split("/")[1]) for t in token]
            else:
                analysis = (analysis.replace("+/", "[PLUS]/").replace(
                    "+", "[SEP]").replace("[PLUS]", "+"))
                token = [(pair.split("/")[0], pair.split("/")[1])
                         for pair in analysis.split("[SEP]")]
        else:
            token = (morph, pos)

        return morph, token
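    # Editorial worked example of the `[PLUS]`/`[SEP]` swap above, using an
    # illustrative expression whose surface form is a literal "+" (the tag
    # names here are only placeholders):
    #   "+/SY+이/VCP"
    #     -> "[PLUS]/SY+이/VCP"      (protect "+/" pairs)
    #     -> "[PLUS]/SY[SEP]이/VCP"  (mark the real separators)
    #     -> "+/SY[SEP]이/VCP"       (restore the protected "+")
    #   so splitting on "[SEP]" yields [('+', 'SY'), ('이', 'VCP')].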
    def stringfy(self, result: List[Tuple[str, str]]) -> str:
        res_str = ""
        for pair in result:
            if pair[1] == "SPACE":
                res_str = res_str[:-1]  # drop the trailing "+" before a space
                res_str += " "
            else:
                res_str += f"{pair[0]}/{pair[1]}+"
        return res_str[:-1]  # drop the final trailing "+"
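    # Worked example for stringfy(), traced against the loop above:
    #   stringfy([('안녕', 'NNG'), ('하', 'XSV'), (' ', 'SPACE'), ('이름', 'NNG')])
    #   -> '안녕/NNG+하/XSV 이름/NNG'
    # i.e. pairs are joined with "+" and SPACE markers become plain spaces.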
    def predict(
        self,
        sent: str,
        **kwargs,
    ) -> Union[List[Tuple[str, str]], str]:
        """
        Conduct Part-of-Speech tagging using mecab-ko

        Args:
            sent (str): input sentence to be tagged
            return_surface (bool): whether to return mecab surface forms
            return_string (bool): whether to return the result as a single string

        Returns:
            List[Tuple[str, str]]: list of token and its corresponding pos tag tuple
        """
        return_surface = kwargs.get("return_surface", False)
        return_string = kwargs.get("return_string", False)

        sent = sent.strip()
        sent_ptr = 0
        results = []

        if return_surface:
            analyzed = self._model.pos(sent)
        else:
            analyzed = self._model.parse(sent)

        for unit in analyzed:
            if not return_surface:
                morph, token = self._postprocess(unit)
            else:
                token = unit
                morph = unit[0]

            if sent[sent_ptr] == " ":
                # Move the sentence pointer past whitespace so it is preserved
                # as a token; skip consecutive spaces to the next eojeol to
                # avoid emitting double whitespace
                while sent[sent_ptr] == " ":
                    sent_ptr += 1
                results.append((" ", "SPACE"))

            if isinstance(token, tuple):
                results.append(token)
            elif isinstance(token, list):
                results.extend(token)

            sent_ptr += len(morph)

        if return_string:
            return self.stringfy(results)
        return results

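# Hedged usage sketch for the predict() kwargs above (assumes the public
# `Pororo` entry point shown in the factory docstring; not original source):
#
#   pos = Pororo(task="pos", lang="ko")
#   pos("안녕하세요.", return_string=True)
#   # -> '안녕/NNG+하/XSV+시/EP+어요/EF+./SF'
#   pos("안녕하세요.", return_surface=True)
#   # -> surface-form pairs straight from mecab.pos(), without decomposition
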
class PororoMecabJap(PororoSimpleBase):

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model
    def predict(self, sent: str, **kwargs):
        """
        Conduct Part-of-Speech tagging using the mecab and ipadic modules

        Args:
            sent (str): input sentence to be tagged

        Returns:
            List[Tuple[str, str]]: list of token and its corresponding pos tag tuple
        """
        mecab_output = self._model.parse(sent)

        pairs = list()
        for line in mecab_output.split("\n"):
            if line == "EOS":
                break
            token, tag = line.split("\t")
            tags = tag.split(",")
            pairs.append((token, tags[0]))
        return pairs

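# For reference, ipadic-formatted mecab output consists of lines of the form
# "surface\tPOS,POS-subtype,...,reading,pronunciation" terminated by "EOS";
# predict() keeps only the first (coarse POS) feature field. A sketch of the
# expected format (illustrative, not captured output):
#
#   日本語\t名詞,一般,*,*,*,*,日本語,ニホンゴ,ニホンゴ
#   EOS
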
class PororoJieba(PororoSimpleBase):

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model
    def predict(self, sent: str, **kwargs):
        """
        Conduct Part-of-Speech tagging using the jieba module

        Args:
            sent (str): input sentence to be tagged

        Returns:
            List[Tuple[str, str]]: list of token and its corresponding pos tag tuple
        """
        jieba_output = self._model.cut(sent)
        return [(word.word, word.flag) for word in list(jieba_output)]

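# jieba's posseg pairs expose `.word` and `.flag` attributes, so predict()
# above is equivalent to this sketch (example taken from the factory
# docstring):
#
#   >>> import jieba.posseg as jieba_pos
#   >>> [(w.word, w.flag) for w in jieba_pos.cut("乒乓球拍卖完了")]
#   [('乒乓球', 'n'), ('拍卖', 'v'), ('完', 'v'), ('了', 'ul')]
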
class PororoNLTKPosTagger(PororoSimpleBase):

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model

    def _clean(self, sent: str):
        """
        Cleanse input sentence

        Args:
            sent (str): input sentence to be cleansed

        Returns:
            str: cleansed output sentence
        """
        sent = sent.strip()
        sent = re.sub(r"\s", " ", sent)
        sent = re.sub(" +", " ", sent)
        return sent

    def _align(self, sent: str, tokens: List[Tuple[str, str]]):
        """
        Align sentence with tagged token pairs

        Args:
            sent (str): original input sentence
            tokens (List[Tuple[str, str]]): list of token and pos tag pair tuple

        Returns:
            List[Tuple[str, str]]: list of aligned token and pos tag pair tuple

        Examples:
            >>> sent = "The striped bats are hanging, on their feet for best."
            >>> tokens = [('The', 'DT'), ('striped', 'JJ'), ('bats', 'NNS'), ('are', 'VBP'), ('hanging', 'VBG'), (',', ','), ('on', 'IN'), ('their', 'PRP$'), ('feet', 'NNS'), ('for', 'IN'), ('best', 'JJS'), ('.', '.')]
            >>> align(sent, tokens)
            [('The', 'DT'), (' ', 'SPACE'), ('striped', 'JJ'), (' ', 'SPACE'), ('bats', 'NNS'), (' ', 'SPACE'), ('are', 'VBP'), (' ', 'SPACE'), ('hanging', 'VBG'), (',', ','), (' ', 'SPACE'), ('on', 'IN'), (' ', 'SPACE'), ('their', 'PRP$'), (' ', 'SPACE'), ('feet', 'NNS'), (' ', 'SPACE'), ('for', 'IN'), (' ', 'SPACE'), ('best', 'JJS'), ('.', '.')]
        """
        result = list()

        while True:
            token = tokens.pop(0)
            word = token[0]

            # correct strange behaviors of nltk `word_tokenize`
            # https://github.com/nltk/nltk/issues/1630
            if (word in ("``", "''")) and (sent[0] == '"'):
                word = '"'
                token = ('"', '"')

            if (word == "...") and (sent[0] == "…"):  # ellipsis
                word = "…"
                token = ("…", "…")

            if sent.startswith(f"{word} "):
                sent = sent[len(f"{word} "):]
                result.append(token)
                result.append((" ", "SPACE"))
            elif sent.startswith(word):
                sent = sent[len(word):]
                result.append(token)
            else:
                raise ValueError(f"Cannot align token {token} to: {sent}")

            if not tokens:
                break

        return result
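    # Editorial note: nltk's word_tokenize() rewrites ASCII double quotes as
    # `` (opening) and '' (closing), e.g.
    #   word_tokenize('He said "hi"') -> ['He', 'said', '``', 'hi', "''"]
    # which is why _align() maps both back to '"' before matching.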
    def predict(self, sent: str, **kwargs):
        """
        Conduct Part-of-Speech tagging using NLTK modules

        Args:
            sent (str): input sentence to be tagged

        Returns:
            List[Tuple[str, str]]: list of token and its corresponding pos tag tuple
        """
        cleaned = self._clean(sent)
        words = self._model.word_tokenize(cleaned)
        pos_tags = self._model.pos_tag(words)
        # align against the cleaned sentence so that the whitespace
        # normalization in _clean cannot break character-level alignment
        return self._align(cleaned, pos_tags)
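
# Minimal smoke test (editorial sketch; relies only on the package-level
# `Pororo` entry point already shown in the factory docstring):
if __name__ == "__main__":
    from pororo import Pororo

    pos = Pororo(task="pos", lang="en")
    print(pos("The striped bats are hanging, on their feet for best."))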