# Source code for pororo.tasks.grammatical_error_correction

"""Grammatical Error Correction related modeling class"""

import re
from typing import List, Optional, Union

from pororo.tasks.utils.base import (
    PororoFactoryBase,
    PororoGenerationBase,
    PororoSimpleBase,
)
from pororo.tasks.utils.download_utils import download_or_load


class PororoGecFactory(PororoFactoryBase):
    """
    Grammatical error correction

    English (`transformer.base.en.gec`)

        - dataset: FCE, W&I+LOCNESS
        - metric: TBU

    English (`transformer.base.en.char_gec`)

        - dataset: xfspell
        - metric: TBU
        - ref: http://www.realworldnlpbook.com/blog/unreasonable-effectiveness-of-transformer-spell-checker.html

    Korean (`charbert.base.ko.spacing`)

        - dataset: Internal data (based on Wikipedia)
        - metric: F1 (89.51)

    Args:
        text (str): input sentence to fix grammatical error
        beam (int): size of beam search
        temperature (float): temperature for sampling
        top_k (int): variable for top k sampling
        top_p (float): variable for top p sampling
        no_repeat_ngram_size (int): no repeat ngram size
        len_penalty (float): length penalty ratio

    Examples:
        >>> gec = Pororo(task="gec", lang="en")
        >>> gec("This apple are so sweet.")
        "This apple is so sweet."
        >>> gec("'I've love you, before I meet her!'")
        "'I've loved you, before I met her!"
        >>> # It works better if I use two modules in succession with `correct_spell` option
        >>> # Of course, it requires more computation and time.
        >>> gec("Travel by bus is exspensive , bored and annoying .")  # bad result
        'Travel by bus is exspensive, boring and annoying.'
        >>> gec("Travel by bus is exspensive , bored and annoying .", correct_spell=True)  # better result
        'Travelling by bus is expensive, boring, and annoying.'
        >>> spacing = Pororo(task="gec", lang="ko")
        >>> spacing("카 카오브 레인에서는 무슨 일을 하 나 요?")
        '카카오브레인에서는 무슨 일을 하나요?'
        >>> spacing("아버지가방에들어간다.")
        '아버지가 방에 들어간다.'

    Notes:
        Korean error correction is beta version.
        It only supports spacing correction currently.

    """

    def __init__(self, task: str, lang: str, model: Optional[str]):
        super().__init__(task, lang, model)

    @staticmethod
    def get_available_langs():
        """Return the language codes this task supports."""
        return ["en", "ko"]

    @staticmethod
    def get_available_models():
        """Return the model names available for each supported language."""
        english_models = [
            "transformer.base.en.gec",
            "transformer.base.en.char_gec",
        ]
        korean_models = ["charbert.base.ko.spacing"]
        return {"en": english_models, "ko": korean_models}

    def load(self, device: str):
        """
        Load user-selected task-specific model

        The wrapper class is picked from the configured model name:
        names containing `charbert` yield `PororoBertSpacing`, names
        containing `transformer` yield `PororoTransformerGecChar` (when the
        name also contains `char`) or `PororoTransformerGec`.

        Args:
            device (str): device information

        Returns:
            object: User-selected task-specific model
                (None when the model name matches neither family)

        """
        if "charbert" in self.config.n_model:
            from pororo.models.brainbert import CharBrainRobertaModel

            spacing_model = CharBrainRobertaModel.load_model(
                f"bert/{self.config.n_model}",
                self.config.lang,
            )
            spacing_model = spacing_model.eval().to(device)
            print(
                "As of now, this beta model tries to correct spacing errors in Korean text."
            )
            return PororoBertSpacing(spacing_model, self.config)

        if "transformer" not in self.config.n_model:
            # Neither model family matched: nothing to build.
            return None

        from fairseq.models.transformer import TransformerModel

        from pororo.tasks.utils.tokenizer import CustomTokenizer

        assets = download_or_load(
            f"transformer/{self.config.n_model}",
            self.config.lang,
        )

        generator = TransformerModel.from_pretrained(
            model_name_or_path=assets.path,
            checkpoint_file=f"{self.config.n_model}.pt",
            data_name_or_path=assets.dict_path,
            source_lang=assets.src_dict,
            target_lang=assets.tgt_dict,
        )
        generator = generator.eval().to(device)

        # The character-level variant does its own tokenization.
        if "char" in self.config.n_model:
            return PororoTransformerGecChar(generator, self.config)

        tokenizer = None
        if assets.src_tok:
            tokenizer = CustomTokenizer.from_file(
                vocab_filename=f"{assets.src_tok}/vocab.json",
                merges_filename=f"{assets.src_tok}/merges.txt",
            )

        return PororoTransformerGec(generator, tokenizer, device, self.config)
class PororoTransformerGecChar(PororoGenerationBase):
    """Character-level corrector built on a sequence-to-sequence model.

    Input text is split into single characters before translation; characters
    outside `self._chars` are swapped for the placeholder `▮` and restored
    after generation.
    """

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model
        # Punctuation character class used by `_despace_puncts`.
        self._symbols = "[:.,!?\"']"
        # Characters passed to the model unchanged; anything else becomes `▮`.
        self._chars = set(
            "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ .,!?'-"
        )

    def _preprocess(self, text: str):
        """
        Preprocess input sentence into space-separated character tokens

        Unsupported characters are replaced with `▮` and collected so that
        they can be restored later; spaces are replaced with `▁`.

        Args:
            text (str): input sentence

        Returns:
            Tuple[str, List[str]]: space-joined character tokens and the
                list of replaced (unknown) characters in input order

        """
        text = text.strip()

        chars = []
        unks = []
        for ch in text:
            if ch in self._chars:
                chars.append(ch)
            else:
                chars.append("▮")
                unks.append(ch)

        text = "".join(chars)
        text = re.sub(" +", " ", text).strip()
        tokens = [ch if ch != " " else "▁" for ch in text]

        # Only the first 1023 tokens are kept
        # (presumably the model's maximum input length - TODO confirm).
        return " ".join(tokens[:1023]), unks

    def _despace_puncts(self, text: str):
        """
        Remove the spaces around punctuation

        Args:
            text (str): input sentence

        Returns:
            str: sentence with spaces after quotes and before punctuation removed

        """
        text = re.sub("([\"']) ", r"\1", text, count=0)
        return re.sub(f" ({self._symbols}+)", r"\1", text, count=0)

    def _postprocess(self, output: str, unks: List):
        """
        Postprocess output sentence to restore unknown characters and whitespace

        Args:
            output (str): output sentence generated by model
            unks (List[str]): pre-replaced unknown token lists

        Returns:
            str: postprocessed output sentence

        """
        if unks:
            # The model may generate more placeholders than the input had;
            # once `unks` is exhausted the placeholder is kept as-is instead
            # of raising IndexError.
            remaining = iter(unks)
            restored = []
            for char in output:
                if char == "▮":
                    char = next(remaining, char)
                restored.append(char)
            output = "".join(restored)

        # Drop the separators between character tokens, then turn the
        # whitespace marker back into real spaces.
        output = output.replace(" ", "").replace("▁", " ").strip()
        return output

    def predict(
        self,
        text: str,
        beam: int = 5,
        temperature: float = 1.0,
        top_k: int = -1,
        top_p: float = -1,
        no_repeat_ngram_size: int = 4,
        len_penalty: float = 1.0,
    ):
        """
        Conduct grammar error correction

        Args:
            text (str): input sentence
            beam (int): beam search size
            temperature (float): temperature scale
            top_k (int): top-K sampling vocabulary size
            top_p (float): top-p sampling ratio
            no_repeat_ngram_size (int): no repeat ngram size
            len_penalty (float): length penalty ratio

        Returns:
            str: grammatically corrected sentence

        """
        # Sampling is switched on as soon as either sampling knob is set.
        sampling = top_k != -1 or top_p != -1

        text, unks = self._preprocess(text)

        output = self._model.translate(
            text,
            beam=beam,
            sampling=sampling,
            temperature=temperature,
            sampling_topk=top_k,
            sampling_topp=top_p,
            max_len_a=1,
            max_len_b=50,
            no_repeat_ngram_size=no_repeat_ngram_size,
            lenpen=len_penalty,
        )
        output = self._postprocess(output, unks)

        return output
class PororoTransformerGec(PororoGenerationBase):
    """Grammatical error corrector built on a sequence-to-sequence model.

    Text is punctuation-spaced, contraction-split and segmented by the
    tokenizer before translation; the inverse steps are applied to the
    output. An optional spell-correction pass can run first
    (`correct_spell=True` in `predict`).
    """

    def __init__(self, model, tokenizer, device, config):
        super().__init__(config)
        self._model = model
        self._tokenizer = tokenizer
        # Punctuation character class used when (de)spacing punctuation.
        self._symbols = "[:.,!?\"']"
        self._clitics = ("n't", "'ll", "'s", "'m", "'ve", "'d", "'re")
        self._device = device
        # Spell corrector, created on first use by `_correct_spell`.
        self._corrector = None

    def _space_puncts(self, text: str):
        """
        Put spaces around punctuation marks

        Words containing a clitic, and words with more than one period,
        are left untouched.

        Args:
            text (str): input sentence

        Returns:
            str: processed string

        Examples:
            noise!He -> noise ! He

        """
        _text = []
        for word in text.strip().split():
            detect_clitic = any(
                re.search(clitic, word) for clitic in self._clitics
            )
            has_symbol = re.search(self._symbols, word) is not None

            # More than one period marks an abbreviation, e.g., `U.S.` is correct.
            if has_symbol and not detect_clitic and word.count(".") <= 1:
                word = re.sub(f"({self._symbols})", r" \1 ", word, count=0)

            _text.append(word)
        return " ".join(_text)

    def _space_contracts(self, text: str):
        """
        Split clitics from the word they are attached to

        Args:
            text (str): input sentence

        Returns:
            str: processed string

        Examples:
            haven't -> have n't

        """
        _text = []
        for w in text.split():
            for clitic in self._clitics:
                w = re.sub(f"([A-Za-z])({clitic})", r"\1 \2", w)
            _text.append(w)
        return " ".join(_text)

    def _collapse_spaces(self, text):
        """
        Collapse runs of spaces into a single space

        Args:
            text (str): input string

        Returns:
            str: processed string

        """
        text = re.sub(" +", " ", text)
        return text

    def _despace_puncts(self, text: str):
        """
        Inverse function of _space_puncts(self, text)

        Args:
            text (str): input sentence

        Returns:
            str: processed string

        """
        text = re.sub("([\"']) ", r"\1", text, count=0)
        return re.sub(f" ({self._symbols}+)", r"\1", text, count=0)

    def _despace_contracts(self, text: str):
        """
        Inverse function of _space_contracts(self, text)

        Args:
            text (str): input sentence

        Returns:
            str: processed string

        """
        for clitic in self._clitics:
            text = re.sub(f" ({clitic})", r"\1", text, count=0)
        return text

    def _preprocess(self, text: str):
        """
        Preprocess using simple methods

        Args:
            text (str): input sentence

        Returns:
            str: preprocessed string (space-joined tokenizer pieces)

        """
        text = self._space_puncts(text)
        text = self._space_contracts(text)
        text = self._collapse_spaces(text)
        pieces = self._tokenizer.segment(text.strip())
        return " ".join(pieces)

    def _postprocess(self, output: str):
        """
        Postprocess output sentence to replace whitespace

        Args:
            output (str): sentence to postprocess

        Returns:
            str: postprocessed string

        """
        # Drop the separators between pieces, then turn the whitespace
        # marker back into real spaces.
        output = output.replace(" ", "").replace("▁", " ").strip()
        output = self._despace_puncts(output)
        output = self._despace_contracts(output)
        return output

    def _correct_spell(self, text: str):
        """
        Conduct error correction for spell

        The spell corrector is created on first use and reused afterwards.

        Args:
            text (str): input sentence

        Returns:
            str: result of spell error correction

        """
        if self._corrector is None:
            factory = PororoGecFactory(
                task="gec",
                lang="en",
                model="transformer.base.en.char_gec",
            )
            # Cache only the loaded model: if `load` raises, `_corrector`
            # stays None so the next call retries instead of calling the
            # factory object as if it were the model.
            self._corrector = factory.load(self._device)

        return self._grammar_postprocess(self._corrector(text))

    def _grammar_postprocess(self, text: str):
        """
        Postprocess output sentence

        Re-attaches clitics, removes the padding spaces inside paired
        delimiters and removes the space before punctuation.

        Args:
            text (str): sentence to postprocess

        Returns:
            str: postprocessed string

        """
        text = re.sub(" '(s|t|ll|m|re|ve|d)", r"'\1", text)

        # pairs
        for pair in ("()", "<>", "{}", "[]", "''", '""'):
            opening, closing = pair
            opening_ = re.escape(opening)
            closing_ = re.escape(closing)
            text = re.sub(
                f"{opening_} +([^{closing_}]+) +{closing_}",
                rf"{opening}\1{closing}",
                text,
            )

        text = re.sub(" ([:.,!?])", r"\1", text)

        return text

    def predict(
        self,
        text: str,
        beam: int = 5,
        temperature: float = 1.0,
        top_k: int = -1,
        top_p: float = -1,
        no_repeat_ngram_size: int = 4,
        len_penalty: float = 1.0,
        **kwargs,
    ):
        """
        Conduct grammar error correction

        Args:
            text (str): input sentence
            beam (int): beam search size
            temperature (float): temperature scale
            top_k (int): top-K sampling vocabulary size
            top_p (float): top-p sampling ratio
            no_repeat_ngram_size (int): no repeat ngram size
            len_penalty (float): length penalty ratio
            correct_spell (bool): passed via kwargs (default False); when
                true, the input is first run through the spell corrector
                and the final output gets an extra clean-up pass

        Returns:
            str: grammatically corrected sentence

        Examples:
            >>> gec = Pororo(task="gec", model="transformer.base.en.gec", lang="en")
            >>> gec("This apple are so sweet.")
            "This apple is so sweet."
            >>> gec("'I've love you, before I meet her!'")
            "'I've loved you, before I met her!"

        """
        correct_spell = kwargs.get("correct_spell", False)

        # Sampling is switched on as soon as either sampling knob is set.
        sampling = top_k != -1 or top_p != -1

        if correct_spell:
            text = self._correct_spell(text)

        text = self._preprocess(text)

        output = self._model.translate(
            text,
            beam=beam,
            sampling=sampling,
            temperature=temperature,
            sampling_topk=top_k,
            sampling_topp=top_p,
            max_len_a=1,
            max_len_b=50,
            no_repeat_ngram_size=no_repeat_ngram_size,
            lenpen=len_penalty,
        )

        output = self._postprocess(output)

        return self._grammar_postprocess(output) if correct_spell else output
class PororoBertSpacing(PororoSimpleBase):
    """Spacing corrector driven by a per-token tagging model."""

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model

    def _postprocess(self, tokens: List) -> str:
        """
        Postprocess spacing correction result

        Label handling: "0" keeps the token as-is, "1" inserts a space
        before the token, "2" strips the whitespace marker from the token.
        Tokens with any other label are dropped.

        Args:
            tokens (List): list of (token, label) pairs, i.e. each character
                and its predicted label

        Returns:
            str: postprocessed and spacing corrected sentence

        """
        pieces = []
        for token, label in tokens:
            if label == "0":
                pieces.append(token)
            elif label == "1":
                pieces.append(f"▁{token}")
            elif label == "2":
                pieces.append(token.replace("▁", ""))
        # Join once instead of growing a string with `+=` in the loop.
        return "".join(pieces).replace("▁", " ").strip()

    def predict(self, text: str, **kwargs) -> Union[List[str], str]:
        """
        Conduct spacing correction

        Args:
            text: (str) sentence to be spacing error corrected;
                a list of sentences is also accepted

        Returns:
            str: spacing error corrected sentence when exactly one sentence
                was processed; otherwise a list of corrected sentences
                (empty list for an empty batch)

        """
        if isinstance(text, str):
            text = [text]

        # An empty batch has nothing to tag; skip the model call.
        if isinstance(text, (list, tuple)) and not text:
            return []

        result = self._model.predict_tags(text)
        li_result = [self._postprocess(r) for r in result]

        # A single result is unwrapped; zero results no longer raise IndexError.
        if len(li_result) == 1:
            return li_result[0]
        return li_result