Source code for pororo.tasks.lemmatization

"""Lemmatization related modeling class"""

from typing import Optional

from pororo.tasks.utils.base import PororoFactoryBase, PororoSimpleBase


class PororoLemmatizationFactory(PororoFactoryBase):
    """
    Conduct lemmatization

    English (`nltk`)

        - dataset: N/A
        - metric: N/A

    Args:
        sent (str): input sentence to be lemmatized

    Returns:
        List[str]: lemma list generated by NLTK

    Examples:
        >>> lemma = Pororo(task="lemma", lang="en")
        >>> lemma("The striped bats are hanging, on their feet for best.")
        ['The', 'striped', 'bat', 'be', 'hang', ',', 'on', 'their', 'foot', 'for', 'best', '.']

    """

    def __init__(self, task: str, lang: str, model: Optional[str]):
        super().__init__(task, lang, model)

    @staticmethod
    def get_available_langs():
        """Return the language codes supported by this task."""
        supported_langs = ["en"]
        return supported_langs

    @staticmethod
    def get_available_models():
        """Return a mapping from language code to the model names usable for it."""
        models_by_lang = {"en": ["nltk"]}
        return models_by_lang

    def load(self, device: str):
        """
        Load user-selected task-specific model

        Args:
            device (str): device information (not referenced by the NLTK path)

        Returns:
            object: User-selected task-specific model, or None when the
                configured model name is not "nltk"

        """
        if self.config.n_model != "nltk":
            # No other backend is handled here; keep the implicit-None result.
            return None

        import nltk

        # (lookup path, download package) pairs, checked in this order.
        # Anything missing from the local NLTK data directory is downloaded.
        required_resources = (
            ("tokenizers/punkt", "punkt"),
            ("corpora/wordnet", "wordnet"),
            ("taggers/averaged_perceptron_tagger", "averaged_perceptron_tagger"),
        )
        for resource_path, package_name in required_resources:
            try:
                nltk.data.find(resource_path)
            except LookupError:
                nltk.download(package_name)

        from nltk.stem import WordNetLemmatizer

        lemmatizer = WordNetLemmatizer()
        return PororoNLTKLemmatize(nltk, lemmatizer, self.config)
class PororoNLTKLemmatize(PororoSimpleBase):
    """Lemmatize English text by tokenizing, POS-tagging and lemmatizing each token."""

    # Lower-cased first letter of an NLTK tag -> WordNet POS code.
    # "j" (adjective tags such as "JJ") maps to WordNet's "a"; any letter
    # not listed here falls back to noun ("n") in `_get_wordnet_pos`.
    _WORDNET_POS = {"a": "a", "j": "a", "n": "n", "r": "r", "v": "v"}

    def __init__(self, nltk, model, config):
        """
        Store the NLTK module and the lemmatizer used for prediction

        Args:
            nltk: object providing `word_tokenize` and `pos_tag`
            model: object providing `lemmatize(word, pos)`
            config: task configuration forwarded to the base class

        """
        super().__init__(config)
        self._nltk = nltk
        self._model = model

    def _get_wordnet_pos(self, pos: str):
        """
        Get wordnet style pos tag

        Args:
            pos (str): pos tag generated by NLTK

        Returns:
            str: wordnet style pos tag, one of "a", "n", "v", "r";
                "n" is used for any unrecognized or empty tag

        """
        # Slice instead of indexing so an empty tag yields "" (-> "n")
        # rather than raising IndexError.
        initial = pos[:1].lower()
        return self._WORDNET_POS.get(initial, "n")

    def predict(self, sent: str, **kwargs):
        """
        Conduct lemmatization with NLTK module

        Args:
            sent (str): input sentence to be lemmatized

        Returns:
            List[str]: lemma list generated by NLTK

        """
        tokens = self._nltk.word_tokenize(sent)
        tagged_tokens = self._nltk.pos_tag(tokens)
        return [
            self._model.lemmatize(token, self._get_wordnet_pos(tag))
            for token, tag in tagged_tokens
        ]