Source code for pororo.tasks.named_entity_recognition

"""Named Entity Recognition related modeling class"""

import re
from collections import defaultdict
from typing import List, Optional, Tuple

from pororo.tasks.utils.base import PororoFactoryBase, PororoSimpleBase
from pororo.tasks.utils.download_utils import download_or_load


class PororoNerFactory(PororoFactoryBase):
    """
    Conduct named entity recognition

    English (`roberta.base.en.ner`)

        - dataset: OntoNotes 5.0
        - metric: F1 (91.63)

    Korean (`charbert.base.ko.ner`)

        - dataset: https://corpus.korean.go.kr/ named entity analysis corpus (개체명 분석 말뭉치)
        - metric: F1 (89.63)

    Japanese (`jaberta.base.ja.ner`)

        - dataset: Kyoto University Web Document Leads Corpus
        - metric: F1 (76.74)
        - ref: https://github.com/ku-nlp/KWDLC

    Chinese (`zhberta.base.zh.ner`)

        - dataset: OntoNotes 5.0
        - metric: F1 (79.06)

    Args:
        sent: (str) sentence to be sequence labeled

    Returns:
        List[Tuple[str, str]]: token and its predicted tag tuple list

    Examples:
        >>> ner = Pororo(task="ner", lang="en")
        >>> ner("It was in midfield where Arsenal took control of the game, and that was mainly down to Thomas Partey and Mohamed Elneny.")
        [('It', 'O'), ('was', 'O'), ('in', 'O'), ('midfield', 'O'), ('where', 'O'), ('Arsenal', 'ORG'), ('took', 'O'), ('control', 'O'), ('of', 'O'), ('the', 'O'), ('game', 'O'), (',', 'O'), ('and', 'O'), ('that', 'O'), ('was', 'O'), ('mainly', 'O'), ('down', 'O'), ('to', 'O'), ('Thomas Partey', 'PERSON'), ('and', 'O'), ('Mohamed Elneny', 'PERSON'), ('.', 'O')]
        >>> ner = Pororo(task="ner", lang="ko")
        >>> ner("손흥민은 28세의 183 센티미터, 77 킬로그램이며, 현재 주급은 약 3억 원이다.")
        [('손흥민', 'PERSON'), ('은', 'O'), (' ', 'O'), ('28세', 'QUANTITY'), ('의', 'O'), (' ', 'O'), ('183 센티미터', 'QUANTITY'), (',', 'O'), (' ', 'O'), ('77 킬로그램', 'QUANTITY'), ('이며,', 'O'), (' ', 'O'), ('현재', 'O'), (' ', 'O'), ('주급은', 'O'), (' ', 'O'), ('약 3억 원', 'QUANTITY'), ('이다.', 'O')]
        >>> # `apply_wsd` : for korean, you can use Word Sense Disambiguation module to get more specific tag
        >>> ner("손흥민은 28세의 183 센티미터, 77 킬로그램이며, 현재 주급은 약 3억 원이다.", apply_wsd=True)
        [('손흥민', 'PERSON'), ('은', 'O'), (' ', 'O'), ('28세', 'AGE'), ('의', 'O'), (' ', 'O'), ('183 센티미터', 'LENGTH/DISTANCE'), (',', 'O'), (' ', 'O'), ('77 킬로그램', 'WEIGHT'), ('이며,', 'O'), (' ', 'O'), ('현재', 'O'), (' ', 'O'), ('주급은', 'O'), (' ', 'O'), ('약 3억 원', 'MONEY'), ('이다.', 'O')]
        >>> ner = Pororo(task="ner", lang="zh")
        >>> ner("毛泽东(1893年12月26日-1976年9月9日),字润之,湖南湘潭人。中华民国大陆时期、中国共产党和中华人民共和国的重要政治家、经济家、军事家、战略家、外交家和诗人。")
        [('毛泽东', 'PERSON'), ('(', 'O'), ('1893年12月26日-1976年9月9日', 'DATE'), (')', 'O'), (',', 'O'), ('字润之', 'O'), (',', 'O'), ('湖南', 'GPE'), ('湘潭', 'GPE'), ('人', 'O'), ('。', 'O'), ('中华民国大陆时期', 'GPE'), ('、', 'O'), ('中国共产党', 'ORG'), ('和', 'O'), ('中华人民共和国', 'GPE'), ('的', 'O'), ('重', 'O'), ('要', 'O'), ('政', 'O'), ('治', 'O'), ('家', 'O'), ('、', 'O'), ('经', 'O'), ('济', 'O'), ('家', 'O'), ('、', 'O'), ('军', 'O'), ('事', 'O'), ('家', 'O'), ('、', 'O'), ('战', 'O'), ('略', 'O'), ('家', 'O'), ('、', 'O'), ('外', 'O'), ('交', 'O'), ('家', 'O'), ('和', 'O'), ('诗', 'O'), ('人', 'O'), ('。', 'O')]
        >>> ner = Pororo(task="ner", lang="ja")
        >>> ner("豊臣 秀吉、または羽柴 秀吉は、戦国時代から安土桃山時代にかけての武将、大名。天下人、武家関白、太閤。三英傑の一人。")
        [('豊臣秀吉', 'PERSON'), ('、', 'O'), ('または', 'O'), ('羽柴秀吉', 'PERSON'), ('は', 'O'), ('、', 'O'), ('戦国時代', 'DATE'), ('から', 'O'), ('安土桃山時代', 'DATE'), ('にかけて', 'O'), ('の', 'O'), ('武将', 'O'), ('、', 'O'), ('大名', 'O'), ('。', 'O'), ('天下', 'O'), ('人', 'O'), ('、', 'O'), ('武家', 'O'), ('関白', 'O'), ('、太閤', 'O'), ('。', 'O'), ('三', 'O'), ('英', 'O'), ('傑', 'O'), ('の', 'O'), ('一', 'O'), ('人', 'O'), ('。', 'O')]

    """

    def __init__(self, task: str, lang: str, model: Optional[str]):
        super().__init__(task, lang, model)

    @staticmethod
    def get_available_langs():
        """
        List the language codes this task supports

        Returns:
            List[str]: supported language codes

        """
        return ["en", "ko", "zh", "ja"]

    @staticmethod
    def get_available_models():
        """
        List the model names available for each supported language

        Returns:
            Dict[str, List[str]]: language code to model name list

        """
        return {
            "en": ["roberta.base.en.ner"],
            "ko": ["charbert.base.ko.ner"],
            "zh": ["zhberta.base.zh.ner"],
            "ja": ["jaberta.base.ja.ner"],
        }

    def load(self, device):
        """
        Load user-selected task-specific model

        The model family is picked by substring match on `self.config.n_model`.
        When no family matches, the method falls through and returns None.

        Args:
            device (str): device information

        Returns:
            object: User-selected task-specific model

        """
        if "roberta" in self.config.n_model:
            from pororo.models.brainbert import CustomRobertaModel

            model = (CustomRobertaModel.load_model(
                f"bert/{self.config.n_model}",
                self.config.lang,
            ).eval().to(device))
            return PororoBertNerEn(model, self.config)

        if "charbert" in self.config.n_model:
            from pororo.models.brainbert import CharBrainRobertaModel
            from pororo.tasks.tokenization import PororoTokenizationFactory

            model = (CharBrainRobertaModel.load_model(
                f"bert/{self.config.n_model}",
                self.config.lang,
            ).eval().to(device))

            sent_tokenizer = PororoTokenizationFactory(
                task="tokenization",
                model="sent_ko",
                lang=self.config.lang,
            ).load(device)

            items_path = download_or_load(
                f"misc/wiki.{self.config.lang}.items",
                self.config.lang,
            )

            # Rows are tab separated: origin tag, target tag, surface word.
            # The mapping is keyed as wsd_dict[origin][word] -> target.
            wsd_dict = defaultdict(dict)
            # The context manager closes the handle (it used to be leaked).
            # NOTE(review): UTF-8 is assumed for this resource so decoding
            # does not depend on the platform locale -- confirm.
            with open(items_path, "r", encoding="utf8") as f_wsd_dict:
                for line in f_wsd_dict:
                    origin, target, word = line.strip().split("\t")
                    wsd_dict[origin][word] = target

            return PororoBertCharNer(
                model,
                sent_tokenizer,
                wsd_dict,
                device,
                self.config,
            )

        if "zhberta" in self.config.n_model:
            from pororo.models.brainbert import ZhbertaModel

            model = (ZhbertaModel.load_model(
                f"bert/{self.config.n_model}",
                self.config.lang,
            ).eval().to(device))
            return PororoBertNerZh(model, self.config)

        if "jaberta" in self.config.n_model:
            from pororo.models.brainbert import JabertaModel

            model = (JabertaModel.load_model(
                f"bert/{self.config.n_model}",
                self.config.lang,
            ).eval().to(device))
            return PororoBertNerJa(model, self.config)
class PororoBertNerEn(PororoSimpleBase):
    """
    English named entity recognizer

    Wraps a model exposing `predict_tags` and merges its token-level
    (token, label) output into entity-level (word, tag) pairs.
    """

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model

    def _postprocess(self, tags: List[Tuple[str, str]]):
        """
        Postprocess NER tags to concatenate BIO

        Consecutive tokens are glued to the current word while their label
        contains "I". A trailing "." on a finished word is split off into
        its own ('.', 'O') pair, and empty words are dropped from the result.

        Args:
            tags (List[Tuple[str, str]]): inferenced (token, label) pair list

        Returns:
            List[Tuple[str, str]]: merged (word, tag) pair list; empty when
                `tags` is empty

        """
        # Nothing to merge; the code below indexes tags[0] unconditionally
        if not tags:
            return []

        def _remove_tail(tag):
            # Drop the two-character scheme prefix of hyphenated labels
            if "-" in tag:
                tag = tag[2:]
            return tag

        result = list()

        def _emit(word, tag):
            # Close the current word, detaching a sentence-final period
            word = word.strip()
            if word.endswith("."):
                result.append((word[:-1], _remove_tail(tag)))
                result.append((".", "O"))
            else:
                result.append((word, _remove_tail(tag)))

        word = tags[0][0]
        tag = tags[0][1]

        for token, label in tags[1:]:
            # NOTE(review): this is a substring test, so any label that
            # merely contains the letter "I" also continues the word --
            # confirm against the model's label vocabulary.
            if "I" in label:
                word += token
            else:
                _emit(word, tag)
                word = token
                tag = label

        _emit(word, tag)

        return [pair for pair in result if pair[0] != ""]

    def predict(self, sent: str, **kwargs):
        """
        Conduct named entity recognition with english RoBERTa

        Args:
            sent: (str) sentence to be sequence labeled

        Returns:
            List[Tuple[str, str]]: token and its predicted tag tuple list

        """
        return self._postprocess(self._model.predict_tags(sent))
class PororoBertCharNer(PororoSimpleBase):
    """
    Character-level named entity recognizer

    Splits the input into sentences, tags each sentence with the wrapped
    model, merges character-level tags into entities and optionally refines
    TERM / QUANTITY tags with regex templates and a WSD model.
    """

    def __init__(
        self,
        model,
        sent_tokenizer,
        wsd_dict,
        device,
        config,
    ):
        super().__init__(config)
        self._model = model
        self._sent_tokenizer = sent_tokenizer

        # Short tag emitted by the model -> human readable tag
        self._tag = {
            "PS": "PERSON",
            "LC": "LOCATION",
            "OG": "ORGANIZATION",
            "AF": "ARTIFACT",
            "DT": "DATE",
            "TI": "TIME",
            "CV": "CIVILIZATION",
            "AM": "ANIMAL",
            "PT": "PLANT",
            "QT": "QUANTITY",
            "FD": "STUDY_FIELD",
            "TR": "THEORY",
            "EV": "EVENT",
            "MT": "MATERIAL",
            "TM": "TERM",
        }

        self._device = device
        self._wsd_dict = wsd_dict

        # Lazily populated on the first `_apply_wsd` call
        self._wsd = None
        self._cls2cat = None
        self._quant2cat = None
        self._term2cat = None

    def _template_match(self, text, expression2category):
        """
        Apply template match using regular expression

        Args:
            text (str): text to be searched
            expression2category (dict): regular expression dict

        Returns:
            str: category of the first matching expression in dict order,
                or None when nothing matches

        """
        for expression, category in expression2category.items():
            if re.search(expression, text) is not None:
                return category
        return None

    def apply_dict(self, tags: List[Tuple[str, str]]):
        """
        Apply pre-defined dictionary to get detail tag info

        Args:
            tags (List[Tuple[str, str]]): inference word-tag pair result

        Returns:
            List[Tuple[str, str]]: dict-applied result

        """
        result = []
        for pair in tags:
            word, tag = pair
            # Membership is tested first so a defaultdict is never grown
            if (tag in self._wsd_dict) and (word in self._wsd_dict[tag]):
                result.append((word, self._wsd_dict[tag][word].upper()))
            else:
                result.append(pair)
        return result

    def _apply_wsd(self, tags: List[Tuple[str, str]]):
        """
        Apply Word Sense Disambiguation to get detail tag info

        TERM and QUANTITY entities are first matched against regex
        templates. QUANTITY entities with no template match are wrapped in
        `{`/`}` markers, the marked text is passed to the WSD model, and the
        morphemes found between the markers are looked up in the classifier
        table to choose a category ("QUANTITY" is kept when none is found).

        Args:
            tags (List[Tuple[str, str]]): inference word-tag pair result;
                updated in place

        Returns:
            List[Tuple[str, str]]: wsd-applied result (the same list object)

        """
        if self._wsd is None:
            from pororo.tasks import PororoWsdFactory
            self._wsd = PororoWsdFactory(
                task="wsd",
                lang="ko",
                model="transformer.large.ko.wsd",
            ).load(self._device)

        if self._cls2cat is None:
            self._cls2cat = dict()
            cls_path = download_or_load(
                "misc/wsd.cls.txt",
                self.config.lang,
            )
            # `with` closes the handle, which was previously leaked
            with open(cls_path, "r", encoding="utf8") as f_cls:
                lines = f_cls.read().strip().splitlines()
            for line in lines:
                morph, homonymno, category = line.split()
                classifier = f"{morph}__NNB__{homonymno}"  # bound noun
                self._cls2cat[classifier] = category

        if self._quant2cat is None:
            self._quant2cat = dict()
            self._term2cat = dict()
            template_path = download_or_load(
                "misc/re.templates.txt",
                self.config.lang,
            )
            # NOTE(review): read as UTF-8 like the classifier table so
            # decoding does not depend on the platform locale -- confirm.
            with open(template_path, "r", encoding="utf8") as f_templates:
                lines = f_templates.read().strip().splitlines()
            for line in lines:
                # Only the first two spaces split; the rest is the regex
                category, ner_category, expression = line.split(" ", 2)
                if ner_category == "QUANTITY":
                    self._quant2cat[expression] = category
                elif ner_category == "TERM":
                    self._term2cat[expression] = category

        input_text_with_markers = str()
        target_token_ids = []
        for idx, ner_token in enumerate(tags):
            surface, tag = ner_token

            # as {} will be used as special symbols, literal braces in the
            # text are swapped for their fullwidth forms (written as escapes
            # so the replacement cannot silently degrade into a no-op)
            surface = surface.replace("{", "\uff5b")
            surface = surface.replace("}", "\uff5d")

            if tag == "TERM":
                cat = self._template_match(surface, self._term2cat)
                if cat is not None:
                    tags[idx] = (surface, cat)
                input_text_with_markers += surface
            elif tag == "QUANTITY":
                cat = self._template_match(surface, self._quant2cat)
                if cat is not None:
                    tags[idx] = (surface, cat)
                    input_text_with_markers += surface
                else:
                    # No template hit: defer the decision to the WSD model
                    target_token_ids.append(idx)
                    input_text_with_markers += "{" + surface + "}"
            else:
                input_text_with_markers += surface

        wsd_results = self._wsd(input_text_with_markers)

        # `action` is True while scanning inside a {...} span that has not
        # been categorized yet; `has_category` records whether the current
        # span already produced a category.
        action = False
        has_category = False
        categories = []
        for wsd_token in wsd_results:
            morph, tag, homonymno = wsd_token[:3]
            if morph == "{":
                has_category = False
                action = True
            elif morph == "}":
                if has_category is False:
                    categories.append("QUANTITY")  # original category
                has_category = False
                action = False

            if action:
                if homonymno is None:
                    homonymno = "00"

                query = f"{morph}__{tag}__{homonymno}"
                if query in self._cls2cat:
                    category = self._cls2cat[query]
                    categories.append(category)
                    has_category = True
                    action = False

        # One category is expected per marked span
        assert len(target_token_ids) == len(categories)

        for target_token_id, cat in zip(target_token_ids, categories):
            tags[target_token_id] = (tags[target_token_id][0], cat)

        return tags

    def _postprocess(self, tags: List[Tuple[str, str]]):
        """
        Postprocess character tags to concatenate BIO

        Args:
            tags (List[Tuple[str, str]]): character token and its
                corresponding tag tuple list

        Returns:
            List[Tuple[str, str]]: postprocessed entity token and its
                corresponding tag tuple list; empty when `tags` is empty

        """
        # Nothing to merge; the code below indexes tags[0] unconditionally
        if not tags:
            return []

        def _remove_tail(tag: str):
            # Drop the two-character scheme suffix of hyphenated tags
            if "-" in tag:
                tag = tag[:-2]
            return tag

        result = list()

        tmp_word = tags[0][0]
        prev_tag = _remove_tail(tags[0][1])
        for pair in tags[1:]:
            char = pair[0]
            ori_tag = pair[1]
            tag = _remove_tail(ori_tag)

            # A word-boundary character that does not continue an entity
            # closes the current word and emits an explicit space pair
            if ("▁" in char) and ("-I" not in ori_tag):
                result.append((tmp_word, prev_tag))
                result.append((" ", "O"))

                tmp_word = char
                prev_tag = tag
                continue

            if (tag == prev_tag) and (("-I" in ori_tag) or "O" in ori_tag):
                tmp_word += char
            elif (tag != prev_tag) and ("-I" in ori_tag) and (tag != "O"):
                tmp_word += char
            else:
                result.append((tmp_word, prev_tag))
                tmp_word = char

            prev_tag = tag
        result.append((tmp_word, prev_tag))

        # Turn boundary markers back into spaces and trim each entity,
        # leaving the explicit space pairs untouched
        result = [(pair[0].replace("▁", " ").strip(),
                   pair[1]) if pair[0] != " " else (" ", "O")
                  for pair in result]
        return result

    def predict(
        self,
        text: str,
        **kwargs,
    ):
        """
        Conduct named entity recognition with character BERT

        Args:
            text: (str) sentence to be sequence labeled
            apply_wsd: (bool) whether to apply wsd to get more specific label information
            ignore_labels: (list) labels to be ignored

        Returns:
            List[Tuple[str, str]]: token and its predicted tag tuple list

        """
        apply_wsd = kwargs.get("apply_wsd", False)
        ignore_labels = kwargs.get("ignore_labels", [])

        texts = text.strip().split("\n")

        result = []
        for line in texts:
            for sent in self._sent_tokenizer(line.strip()):
                res = self._model.predict_tags(sent)
                res = [
                    pair for pair in self._postprocess(res)
                    if pair[1] not in ignore_labels
                ]
                # Expand short model tags to their readable names
                res = [(
                    pair[0],
                    self._tag[pair[1]],
                ) if pair[1] in self._tag else pair for pair in res]
                res = res if not apply_wsd else self._apply_wsd(res)
                result.extend(self.apply_dict(res))
                # Separator between sentences
                result.extend([(" ", "O")])
        # Drop the separator that trails the last sentence
        return result[:-1]
class PororoBertNerZh(PororoSimpleBase):
    """
    Chinese named entity recognizer

    Wraps a model exposing `predict_tags` and merges its token-level
    (token, label) output into entity-level (word, tag) pairs.
    """

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model

    def _postprocess(
        self,
        tags: List[Tuple[str, str]],
    ):
        """
        Postprocess NER tags to concatenate BIO

        Consecutive tokens are glued to the current word while their label
        contains "I"; any other label closes the word and starts a new one.

        Args:
            tags (List[Tuple[str, str]]): inferenced (token, label) pair list

        Returns:
            List[Tuple[str, str]]: merged (word, tag) pair list; empty when
                `tags` is empty

        """
        # Nothing to merge; the code below indexes tags[0] unconditionally
        if not tags:
            return []

        def _remove_tail(tag):
            # Drop the two-character scheme prefix of hyphenated labels
            if "-" in tag:
                tag = tag[2:]
            return tag

        result = list()

        word = tags[0][0]
        tag = tags[0][1]

        for token, label in tags[1:]:
            # NOTE(review): this is a substring test, so any label that
            # merely contains the letter "I" also continues the word --
            # confirm against the model's label vocabulary.
            if "I" in label:
                word += token
                continue

            result.append((word.strip(), _remove_tail(tag)))
            word = token
            tag = label

        # Flush the word still being accumulated
        result.append((word.strip(), _remove_tail(tag)))
        return result

    def predict(self, sent: str, **kwargs):
        """
        Conduct named entity recognition with Chinese RoBERTa

        Args:
            sent: (str) sentence to be sequence labeled

        Returns:
            List[Tuple[str, str]]: token and its predicted tag tuple list

        """
        tags = self._model.predict_tags(sent)
        return self._postprocess(tags)
class PororoBertNerJa(PororoSimpleBase):
    """
    Japanese named entity recognizer

    Wraps a model exposing `predict_tags` and merges its token-level
    (token, label) output into entity-level (word, tag) pairs.
    """

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model

    def _postprocess(
        self,
        tags: List[Tuple[str, str]],
    ):
        """
        Postprocess NER tags to concatenate BIO

        Consecutive tokens are glued to the current word while their label
        contains "I"; any other label closes the word and starts a new one.
        "##" markers are removed from every finished word.

        Args:
            tags (List[Tuple[str, str]]): inferenced (token, label) pair list

        Returns:
            List[Tuple[str, str]]: merged (word, tag) pair list; empty when
                `tags` is empty

        """
        # Nothing to merge; the code below indexes tags[0] unconditionally
        if not tags:
            return []

        def _remove_tail(tag):
            # Drop the two-character scheme prefix of hyphenated labels
            if "-" in tag:
                tag = tag[2:]
            return tag

        result = list()

        word = tags[0][0]
        tag = tags[0][1]

        for token, label in tags[1:]:
            # NOTE(review): this is a substring test, so any label that
            # merely contains the letter "I" also continues the word --
            # confirm against the model's label vocabulary.
            if "I" in label:
                word += token
                continue

            result.append((word.strip().replace("##", ""), _remove_tail(tag)))
            word = token
            tag = label

        # Flush the word still being accumulated
        result.append((word.strip().replace("##", ""), _remove_tail(tag)))
        return result

    def predict(self, sent: str, **kwargs):
        """
        Conduct named entity recognition with Japanese RoBERTa

        Args:
            sent: (str) sentence to be sequence labeled

        Returns:
            List[Tuple[str, str]]: token and its predicted tag tuple list

        """
        return self._postprocess(self._model.predict_tags(sent))