"""Lemmatization related modeling class"""
from typing import Optional
from pororo.tasks.utils.base import PororoFactoryBase, PororoSimpleBase
class PororoLemmatizationFactory(PororoFactoryBase):
    """
    Conduct lemmatization

    English (`nltk`)

        - dataset: N/A
        - metric: N/A

    Args:
        sent (str): input sentence to be lemmatized

    Returns:
        List[str]: lemma list generated by NLTK

    Examples:
        >>> lemma = Pororo(task="lemma", lang="en")
        >>> lemma("The striped bats are hanging, on their feet for best.")
        ['The', 'striped', 'bat', 'be', 'hang', ',', 'on', 'their', 'foot', 'for', 'best', '.']
    """

    def __init__(self, task: str, lang: str, model: Optional[str]):
        super().__init__(task, lang, model)

    @staticmethod
    def get_available_langs():
        # Only English is supported for lemmatization
        return ["en"]

    @staticmethod
    def get_available_models():
        return {
            "en": ["nltk"],
        }

    def load(self, device: str):
        """
        Load user-selected task-specific model

        Args:
            device (str): device information

        Returns:
            object: User-selected task-specific model
        """
        if self.config.n_model == "nltk":
            import nltk

            # Fetch each required NLTK resource lazily: probe the local data
            # path first and download only on a LookupError (cache miss).
            resources = (
                ("tokenizers/punkt", "punkt"),
                ("corpora/wordnet", "wordnet"),
                ("taggers/averaged_perceptron_tagger", "averaged_perceptron_tagger"),
            )
            for resource_path, package in resources:
                try:
                    nltk.data.find(resource_path)
                except LookupError:
                    nltk.download(package)

            from nltk.stem import WordNetLemmatizer

            model = WordNetLemmatizer()
            return PororoNLTKLemmatize(nltk, model, self.config)
class PororoNLTKLemmatize(PororoSimpleBase):
    """Lemmatizer backed by NLTK's WordNetLemmatizer"""

    def __init__(self, nltk, model, config):
        super().__init__(config)
        self._nltk = nltk    # nltk module: provides word_tokenize / pos_tag
        self._model = model  # nltk.stem.WordNetLemmatizer instance

    def _get_wordnet_pos(self, pos: str) -> str:
        """
        Get wordnet style pos tag

        Args:
            pos (str): pos tag generated by NLTK (Penn Treebank style)

        Returns:
            str: wordnet style pos tag (one of "a", "n", "v", "r")
        """
        tag = pos[0].lower()
        # Penn Treebank adjective tags start with "J"; WordNet expects "a"
        if tag == "j":
            tag = "a"
        # Any tag WordNet does not recognize falls back to noun
        return tag if tag in "anvr" else "n"

    def predict(self, sent: str, **kwargs):
        """
        Conduct lemmatization with NLTK module

        Args:
            sent (str): input sentence to be lemmatized

        Returns:
            List[str]: lemma list generated by NLTK
        """
        tokens = self._nltk.word_tokenize(sent)
        return [
            self._model.lemmatize(word, self._get_wordnet_pos(tag))
            for word, tag in self._nltk.pos_tag(tokens)
        ]