"""Tokenization related modeling class"""
import json
import os
import re
from abc import abstractmethod
from typing import List, Optional
from unicodedata import normalize
from kss import split_sentences
from pororo.tasks.utils.base import PororoFactoryBase, PororoSimpleBase
from pororo.tasks.utils.download_utils import download_or_load


class PororoTokenizationFactory(PororoFactoryBase):
    """
    Tokenize a sentence with the vocabulary (model) of your choice.

    Args:
        sent: (str) sentence to be tokenized

    Returns:
        List[str]: list of tokens

    Examples:
        >>> tk = Pororo(task="tokenization", lang="ko", model="bpe32k.ko")
        >>> tk("하늘을 나는 새를 보았다")
        ["_하늘", "을", "_나는", "_새", "를", "_보", "았다"]
        >>> tk = Pororo(task="tokenization", lang="en", model="roberta")
        >>> tk("I love you")
        ['I', 'Ġlove', 'Ġyou']
        >>> tk('''If the values aren’t unique, there is no unique inversion of the dictionary anyway or, with other words, inverting does not make sense.''')
        ['If', 'Ġthe', 'Ġvalues', 'Ġaren', 'âĢ', 'Ļ', 't', 'Ġunique', ',', 'Ġthere', 'Ġis', 'Ġno', 'Ġunique', 'Ġin', 'version', 'Ġof', 'Ġthe', 'Ġdictionary', 'Ġanyway', 'Ġor', ',', 'Ġwith', 'Ġother', 'Ġwords', ',', 'Ġinver', 'ting', 'Ġdoes', 'Ġnot', 'Ġmake', 'Ġsense', '.']

    """

    def __init__(self, task: str, lang: str, model: Optional[str]):
        super().__init__(task, lang, model)

    @staticmethod
    def get_available_langs():
        return ["en", "ko", "ja", "zh"]

    @staticmethod
    def get_available_models():
        return {
            "en": [
                "moses",
                "bpe32k.en",
                "roberta",
                "sent_en",
            ],
            "ko": [
                "bpe4k.ko",
                "bpe8k.ko",
                "bpe16k.ko",
                "bpe32k.ko",
                "bpe64k.ko",
                "unigram4k.ko",
                "unigram8k.ko",
                "unigram16k.ko",
                "unigram32k.ko",
                "unigram64k.ko",
                "jpe4k.ko",
                "jpe8k.ko",
                "jpe16k.ko",
                "jpe32k.ko",
                "jpe64k.ko",
                "mecab.bpe4k.ko",
                "mecab.bpe8k.ko",
                "mecab.bpe16k.ko",
                "mecab.bpe32k.ko",
                "mecab.bpe64k.ko",
                "char",
                "jamo",
                "word",
                "mecab_ko",
                "sent_ko",
            ],
            "ja": [
                "mecab",
                "bpe8k.ja",
                "sent_ja",
            ],
            "zh": [
                "jieba",
                "sent_zh",
            ],
        }
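
    # Usage sketch (not part of the library): the model string selects which
    # tokenizer class load() below returns, e.g.
    #
    #     tk = Pororo(task="tokenization", lang="ko", model="mecab_ko")
    #     tk("하늘을 나는 새를 보았다")
    #
    # Exact-match names ("mecab_ko", "char", "jamo", "word", "roberta",
    # "moses", "jieba", "mecab") map to dedicated classes, "sent_*" models
    # to PororoSentTokenizer, and the remaining subword vocabularies are
    # loaded through CustomTokenizer and dispatched by substring
    # ("jpe" -> PororoJamoPairTokenizer, "mecab.bpe" -> PororoMecabSPTokenizer,
    # otherwise PororoSPTokenizer).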

    def load(self, device: str):
        """
        Load user-selected task-specific model

        Args:
            device (str): device information

        Returns:
            object: User-selected task-specific model

        """
        if "sent" in self.config.n_model:
            import nltk

            try:
                nltk.data.find("tokenizers/punkt")
            except LookupError:
                nltk.download("punkt")

            from nltk.tokenize import sent_tokenize

            return PororoSentTokenizer(sent_tokenize, self.config)

        if self.config.n_model == "mecab_ko":
            try:
                import mecab
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install python-mecab-ko with: `pip install python-mecab-ko`"
                )
            model = mecab.MeCab()
            return PororoMecabKoTokenizer(model, self.config)

        if self.config.n_model == "char":
            return PororoCharTokenizer(self.config)

        if self.config.n_model == "jamo":
            return PororoJamoTokenizer(self.config)

        if self.config.n_model == "word":
            return PororoWordTokenizer(self.config)

        if self.config.n_model == "roberta":
            from fairseq.data.encoders.gpt2_bpe import get_encoder

            encoder = download_or_load("misc/encoder.json", self.config.lang)
            vocab = download_or_load("misc/vocab.bpe", self.config.lang)
            model = get_encoder(encoder, vocab)

            with open(encoder, "r") as f_vocab:
                vocab = json.load(f_vocab)
                inv_dict = {v: k for k, v in vocab.items()}

            return PororoRoBERTaTokenizer(model, vocab, inv_dict, self.config)

        if self.config.n_model == "moses":
            try:
                from sacremoses import MosesDetokenizer, MosesTokenizer
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install sacremoses with: `pip install sacremoses`")
            model = MosesTokenizer(lang="en")
            detok = MosesDetokenizer(lang="en")
            return PororoMosesTokenizer(model, detok, self.config)

        if self.config.n_model == "jieba":
            try:
                import jieba
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install jieba with: `pip install jieba`")
            model = jieba.cut
            return PororoJiebaTokenizer(model, self.config)

        if self.config.n_model == "mecab":
            try:
                import fugashi
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install fugashi with: `pip install fugashi`")
            try:
                import ipadic
            except ModuleNotFoundError as error:
                raise error.__class__(
                    "Please install ipadic with: `pip install ipadic`")

            dic_dir = ipadic.DICDIR
            mecabrc = os.path.join(dic_dir, "mecabrc")
            mecab_option = "-d {} -r {} ".format(
                dic_dir,
                mecabrc,
            )
            model = fugashi.GenericTagger(mecab_option)
            return PororoMecabTokenizer(model, self.config)
        else:
            from pororo.tasks.utils.tokenizer import CustomTokenizer

            path = download_or_load(
                f"tokenizers/{self.config.n_model}.zip",
                self.config.lang,
            )

            ext = "json" if "unigram" not in self.config.n_model else "txt"
            merges_filename = (f"{path}/merges.txt" if "unigram"
                               not in self.config.n_model else None)

            model = CustomTokenizer.from_file(
                vocab_filename=f"{path}/vocab.{ext}",
                merges_filename=merges_filename,
                normalize=True if "jpe" not in self.config.n_model else False,
            )
            if "jpe" in self.config.n_model:
                return PororoJamoPairTokenizer(model, self.config)
            if "mecab.bpe" in self.config.n_model:
                return PororoMecabSPTokenizer(model, self.config)
            return PororoSPTokenizer(model, self.config)


class PororoTokenizerBase(PororoSimpleBase):

    @abstractmethod
    def detokenize(self, tokens: List[str]):
        raise NotImplementedError("`detokenize()` is not implemented")

    @abstractmethod
    def convert_tokens_to_ids(self, tokens: List[str]):
        raise NotImplementedError(
            "`convert_tokens_to_ids()` is not implemented")


class PororoSentTokenizer(PororoTokenizerBase):

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model

    def cj_tokenize(self, text: str):
        # Insert a delimiter after Chinese/Japanese sentence-final punctuation,
        # then split on it. Empty fragments (e.g. after a trailing "。") are
        # discarded so that a final unpunctuated sentence is not lost.
        text = text.replace("。", "。[SEP]")
        text = text.replace("!", "![SEP]")
        text = text.replace("?", "?[SEP]")

        if "[SEP]" in text:
            sents = [sent for sent in text.split("[SEP]") if sent]
        else:
            sents = [text]

        return sents

    def predict(self, text: str, **kwargs) -> List[str]:
        if self.lang in ["zh", "ja"]:
            return self.cj_tokenize(text)
        if self.lang == "ko":
            return split_sentences(text)
        return self._model(text)
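
    # Illustrative sketch (not part of the library): for English the injected
    # model is NLTK's sent_tokenize, so
    #     tok.predict("I came home. It was raining.")
    # returns ['I came home.', 'It was raining.']; Korean uses
    # kss.split_sentences, and Chinese/Japanese fall back to the simple
    # punctuation-based cj_tokenize() above.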


class PororoMecabKoTokenizer(PororoTokenizerBase):

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model

    def detokenize(self, tokens: List[str]):
        text = "".join(tokens).replace("▃", " ").strip()
        return text

    def predict(
        self,
        text: str,
        **kwargs,
    ) -> List[str]:
        preserve_whitespace = kwargs.get("preserve_whitespace", True)

        text = text.strip()
        text_ptr = 0
        results = list()

        for unit in self._model.parse(text):
            token = unit[0]
            if preserve_whitespace:
                if text[text_ptr] == " ":
                    # Move the text pointer past the whitespace and emit a
                    # single whitespace token so spacing is preserved.
                    # Advancing to the start of the next eojeol prevents
                    # emitting double spaces.
                    while text[text_ptr] == " ":
                        text_ptr += 1
                    results.append(" ")
            results.append(token)
            text_ptr += len(token)

        return results
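
    # Illustrative sketch (exact morpheme boundaries depend on the installed
    # mecab-ko dictionary): with the default preserve_whitespace=True, a
    # sentence like "하늘을 나는" comes back roughly as
    #     ['하늘', '을', ' ', '나', '는']
    # i.e. morphemes in order, with a single ' ' token wherever the input had
    # whitespace; detokenize() then joins the tokens back into plain text.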


class PororoMosesTokenizer(PororoTokenizerBase):

    def __init__(self, model, detok, config):
        super().__init__(config)
        self._model = model
        self._detok = detok

    def detokenize(self, tokens: List[str]):
        return self._detok.detokenize(tokens)

    def predict(self, text: str, **kwargs) -> List[str]:
        return self._model.tokenize(text)


class PororoJiebaTokenizer(PororoTokenizerBase):

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model

    def detokenize(self, tokens: List[str]):
        return "".join(tokens)

    def predict(self, text: str, **kwargs) -> List[str]:
        return list(self._model(text))


class PororoMecabTokenizer(PororoTokenizerBase):

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model

    def detokenize(self, tokens: List[str]):
        return "".join(tokens)

    def predict(self, text: str, **kwargs) -> List[str]:
        parsed = self._model.parse(text)

        res = []
        for line in parsed.split("\n"):
            if line == "EOS":
                break
            toks = line.split("\t")
            res.append(toks[0])
        return res


class PororoWordTokenizer(PororoTokenizerBase):

    def __init__(self, config):
        super().__init__(config)

    def detokenize(self, tokens: List[str]) -> str:
        """
        Detokenizing undoes the tokenizing operation, restoring punctuation
        and spaces to the places that people expect them to be. Ideally,
        `detokenize(predict(text))` should be identical to `text`, except
        for line breaks.
        """
        text = " ".join(tokens)

        step1 = text.replace("`` ", '"').replace(" ''", '"')
        step1 = step1.replace(". . .", "...")
        step2 = step1.replace(" ( ", " (").replace(" ) ", ") ")
        step3 = re.sub(r' ([.,:;?!%]+)([ \'"`])', r"\1\2", step2)
        step4 = re.sub(r" ([.,:;?!%]+)$", r"\1", step3)
        step5 = step4.replace(" '", "'").replace(" n't", "n't")
        step5 = step5.replace("can not", "cannot")
        step6 = step5.replace(" ` ", " '")
        return step6.strip()

    def predict(self, text: str, **kwargs) -> List[str]:
        return re.findall(r"\w+|[^\w\s]", text, re.UNICODE)
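
    # Sanity-check sketch (not part of the library): predict() splits runs of
    # word characters from punctuation, and detokenize() re-attaches trailing
    # punctuation, e.g.
    #
    #     tok = PororoWordTokenizer(config)
    #     tok.predict("Hello, world!")                   # ['Hello', ',', 'world', '!']
    #     tok.detokenize(['Hello', ',', 'world', '!'])   # 'Hello, world!'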


class PororoCharTokenizer(PororoTokenizerBase):

    def __init__(self, config):
        super().__init__(config)

    def detokenize(self, tokens: List[str]):
        text = "".join(tokens).replace("▁", " ").strip()
        return text

    def predict(self, text: str, **kwargs) -> List[str]:
        text = text.strip().replace(" ", "▁")
        return list(text)
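
    # Illustrative sketch (not part of the library): spaces are mapped to "▁"
    # and every remaining character becomes its own token, e.g.
    #
    #     tok = PororoCharTokenizer(config)
    #     tok.predict("나는 밥")                   # ['나', '는', '▁', '밥']
    #     tok.detokenize(['나', '는', '▁', '밥'])  # '나는 밥'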


class PororoJamoTokenizer(PororoTokenizerBase):

    def __init__(self, config):
        super().__init__(config)

    def detokenize(self, tokens: List[str]):
        return normalize("NFKC", "".join(tokens)).replace("▁", " ")

    def predict(self, text: str, **kwargs) -> List[str]:
        return list("▁".join(
            [normalize("NFKD", token) for token in text.strip().split(" ")]))


class PororoJamoPairTokenizer(PororoTokenizerBase):

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model

    def detokenize(self, tokens: List[str]):
        tokens = list("".join(tokens).replace("▁", " ").strip())
        return normalize("NFKC", "".join(tokens)).replace("▁", " ")

    def predict(self, text: str, **kwargs) -> List[str]:
        text = "▁".join(
            [normalize("NFKD", token) for token in text.strip().split(" ")])
        tokenized = self._model.segment(text.strip())
        return tokenized


class PororoSPTokenizer(PororoTokenizerBase):

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model

    def detokenize(self, tokens: List[str]):
        text = "".join(tokens).replace("▁", " ").strip()
        return text

    def predict(self, text: str, **kwargs):
        tokenized = self._model.segment(text.strip())
        return tokenized


class PororoMecabSPTokenizer(PororoTokenizerBase):

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model

    def detokenize(self, tokens: List[str]):
        text = "".join(tokens).replace("▁", " ").strip()
        return text

    def predict(self, text: str, **kwargs):
        tokenized = self._model.segment(text)
        return tokenized


class PororoRoBERTaTokenizer(PororoTokenizerBase):

    def __init__(self, model, vocab, inv_dict, config):
        super().__init__(config)
        self._model = model
        self._vocab = vocab
        self._inv_dict = inv_dict

    def convert_tokens_to_ids(self, tokens: List[str]):
        return [self._vocab[token] for token in tokens]

    def predict(self, text: str, **kwargs):
        tokens = self._model.encode(text)
        tokens = [self._inv_dict[token] for token in tokens]
        return tokens