"""Named Entity Recognition related modeling class"""
import re
from collections import defaultdict
from typing import List, Optional, Tuple
from pororo.tasks.utils.base import PororoFactoryBase, PororoSimpleBase
from pororo.tasks.utils.download_utils import download_or_load
class PororoNerFactory(PororoFactoryBase):
    """
    Conduct named entity recognition

    English (`roberta.base.en.ner`)
        - dataset: OntoNotes 5.0
        - metric: F1 (91.63)

    Korean (`charbert.base.ko.ner`)
        - dataset: https://corpus.korean.go.kr/ 개체명 분석 말뭉치
        - metric: F1 (89.63)

    Japanese (`jaberta.base.ja.ner`)
        - dataset: Kyoto University Web Document Leads Corpus
        - metric: F1 (76.74)
        - ref: https://github.com/ku-nlp/KWDLC

    Chinese (`zhberta.base.zh.ner`)
        - dataset: OntoNotes 5.0
        - metric: F1 (79.06)

    Args:
        sent: (str) sentence to be sequence labeled

    Returns:
        List[Tuple[str, str]]: token and its predicted tag tuple list

    Examples:
        >>> ner = Pororo(task="ner", lang="en")
        >>> ner("It was in midfield where Arsenal took control of the game, and that was mainly down to Thomas Partey and Mohamed Elneny.")
        [('It', 'O'), ('was', 'O'), ('in', 'O'), ('midfield', 'O'), ('where', 'O'), ('Arsenal', 'ORG'), ('took', 'O'), ('control', 'O'), ('of', 'O'), ('the', 'O'), ('game', 'O'), (',', 'O'), ('and', 'O'), ('that', 'O'), ('was', 'O'), ('mainly', 'O'), ('down', 'O'), ('to', 'O'), ('Thomas Partey', 'PERSON'), ('and', 'O'), ('Mohamed Elneny', 'PERSON'), ('.', 'O')]
        >>> ner = Pororo(task="ner", lang="ko")
        >>> ner("손흥민은 28세의 183 센티미터, 77 킬로그램이며, 현재 주급은 약 3억 원이다.")
        [('손흥민', 'PERSON'), ('은', 'O'), (' ', 'O'), ('28세', 'QUANTITY'), ('의', 'O'), (' ', 'O'), ('183 센티미터', 'QUANTITY'), (',', 'O'), (' ', 'O'), ('77 킬로그램', 'QUANTITY'), ('이며,', 'O'), (' ', 'O'), ('현재', 'O'), (' ', 'O'), ('주급은', 'O'), (' ', 'O'), ('약 3억 원', 'QUANTITY'), ('이다.', 'O')]
        >>> # `apply_wsd` : for korean, you can use Word Sense Disambiguation module to get more specific tag
        >>> ner("손흥민은 28세의 183 센티미터, 77 킬로그램이며, 현재 주급은 약 3억 원이다.", apply_wsd=True)
        [('손흥민', 'PERSON'), ('은', 'O'), (' ', 'O'), ('28세', 'AGE'), ('의', 'O'), (' ', 'O'), ('183 센티미터', 'LENGTH/DISTANCE'), (',', 'O'), (' ', 'O'), ('77 킬로그램', 'WEIGHT'), ('이며,', 'O'), (' ', 'O'), ('현재', 'O'), (' ', 'O'), ('주급은', 'O'), (' ', 'O'), ('약 3억 원', 'MONEY'), ('이다.', 'O')]
        >>> ner = Pororo(task="ner", lang="zh")
        >>> ner("毛泽东(1893年12月26日-1976年9月9日),字润之,湖南湘潭人。中华民国大陆时期、中国共产党和中华人民共和国的重要政治家、经济家、军事家、战略家、外交家和诗人。")
        [('毛泽东', 'PERSON'), ('(', 'O'), ('1893年12月26日-1976年9月9日', 'DATE'), (')', 'O'), (',', 'O'), ('字润之', 'O'), (',', 'O'), ('湖南', 'GPE'), ('湘潭', 'GPE'), ('人', 'O'), ('。', 'O'), ('中华民国大陆时期', 'GPE'), ('、', 'O'), ('中国共产党', 'ORG'), ('和', 'O'), ('中华人民共和国', 'GPE'), ('的', 'O'), ('重', 'O'), ('要', 'O'), ('政', 'O'), ('治', 'O'), ('家', 'O'), ('、', 'O'), ('经', 'O'), ('济', 'O'), ('家', 'O'), ('、', 'O'), ('军', 'O'), ('事', 'O'), ('家', 'O'), ('、', 'O'), ('战', 'O'), ('略', 'O'), ('家', 'O'), ('、', 'O'), ('外', 'O'), ('交', 'O'), ('家', 'O'), ('和', 'O'), ('诗', 'O'), ('人', 'O'), ('。', 'O')]
        >>> ner = Pororo(task="ner", lang="ja")
        >>> ner("豊臣 秀吉、または羽柴 秀吉は、戦国時代から安土桃山時代にかけての武将、大名。天下人、武家関白、太閤。三英傑の一人。")
        [('豊臣秀吉', 'PERSON'), ('、', 'O'), ('または', 'O'), ('羽柴秀吉', 'PERSON'), ('は', 'O'), ('、', 'O'), ('戦国時代', 'DATE'), ('から', 'O'), ('安土桃山時代', 'DATE'), ('にかけて', 'O'), ('の', 'O'), ('武将', 'O'), ('、', 'O'), ('大名', 'O'), ('。', 'O'), ('天下', 'O'), ('人', 'O'), ('、', 'O'), ('武家', 'O'), ('関白', 'O'), ('、太閤', 'O'), ('。', 'O'), ('三', 'O'), ('英', 'O'), ('傑', 'O'), ('の', 'O'), ('一', 'O'), ('人', 'O'), ('。', 'O')]
    """

    def __init__(self, task: str, lang: str, model: Optional[str]):
        super().__init__(task, lang, model)

    @staticmethod
    def get_available_langs():
        """Return the language codes this task supports."""
        return ["en", "ko", "zh", "ja"]

    @staticmethod
    def get_available_models():
        """Return the available model identifiers, keyed by language."""
        return {
            "en": ["roberta.base.en.ner"],
            "ko": ["charbert.base.ko.ner"],
            "zh": ["zhberta.base.zh.ner"],
            "ja": ["jaberta.base.ja.ner"],
        }

    def load(self, device):
        """
        Load user-selected task-specific model

        Args:
            device (str): device information

        Returns:
            object: User-selected task-specific model
        """
        if "roberta" in self.config.n_model:
            from pororo.models.brainbert import CustomRobertaModel

            model = (CustomRobertaModel.load_model(
                f"bert/{self.config.n_model}",
                self.config.lang,
            ).eval().to(device))
            return PororoBertNerEn(model, self.config)

        if "charbert" in self.config.n_model:
            from pororo.models.brainbert import CharBrainRobertaModel
            from pororo.tasks.tokenization import PororoTokenizationFactory

            model = (CharBrainRobertaModel.load_model(
                f"bert/{self.config.n_model}",
                self.config.lang,
            ).eval().to(device))

            sent_tokenizer = PororoTokenizationFactory(
                task="tokenization",
                model="sent_ko",
                lang=self.config.lang,
            ).load(device)

            # BUGFIX: the dictionary file handle was previously opened and
            # never closed; a context manager guarantees it is released.
            # File format: origin-tag <TAB> detail-tag <TAB> word per line.
            wsd_dict = defaultdict(dict)
            with open(
                    download_or_load(
                        f"misc/wiki.{self.config.lang}.items",
                        self.config.lang,
                    ),
                    "r",
                    encoding="utf-8",
            ) as f_wsd_dict:
                for line in f_wsd_dict:
                    origin, target, word = line.strip().split("\t")
                    wsd_dict[origin][word] = target

            return PororoBertCharNer(
                model,
                sent_tokenizer,
                wsd_dict,
                device,
                self.config,
            )

        if "zhberta" in self.config.n_model:
            from pororo.models.brainbert import ZhbertaModel

            model = (ZhbertaModel.load_model(
                f"bert/{self.config.n_model}",
                self.config.lang,
            ).eval().to(device))
            return PororoBertNerZh(model, self.config)

        if "jaberta" in self.config.n_model:
            from pororo.models.brainbert import JabertaModel

            model = (JabertaModel.load_model(
                f"bert/{self.config.n_model}",
                self.config.lang,
            ).eval().to(device))
            return PororoBertNerJa(model, self.config)
class PororoBertNerEn(PororoSimpleBase):
    """English NER wrapper around a RoBERTa sequence tagger."""

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model

    def _postprocess(self, tags: List[Tuple[str, str]]):
        """
        Postprocess NER tags to concatenate BIO

        Args:
            tags (List[Tuple[str, str]]): inferenced (token, label) pair list

        Returns:
            List[Tuple[str, str]]: postprocessed BIO scheme tag list
        """

        def _remove_tail(tag):
            # "B-PERSON" / "I-PERSON" -> "PERSON"; "O" is returned unchanged.
            if "-" in tag:
                tag = tag[2:]
            return tag

        def _flush(word, tag, result):
            # Emit the accumulated word; split a trailing period off as its
            # own "O" token so sentence-final punctuation is never tagged.
            word = word.strip()
            if word.endswith("."):
                result.append((word[:-1], _remove_tail(tag)))
                result.append((".", "O"))
            else:
                result.append((word, _remove_tail(tag)))

        result = list()
        word = tags[0][0]
        tag = tags[0][1]
        for token, label in tags[1:]:
            # BUGFIX: the original condition was `"I" in label`, which also
            # matched begin-tags whose entity name merely contains the letter
            # "I" (e.g. "B-ORDINAL", "B-CARDINAL" in OntoNotes) and wrongly
            # merged a new entity into the previous word. Only genuine
            # continuation tags start with "I-".
            if label.startswith("I-"):
                word += token
            else:
                _flush(word, tag, result)
                word = token
                tag = label
        _flush(word, tag, result)
        # Drop empty surfaces produced by stripping whitespace-only tokens.
        return [pair for pair in result if pair[0] != ""]

    def predict(self, sent: str, **kwargs):
        """
        Conduct named entity recognition with english RoBERTa

        Args:
            sent: (str) sentence to be sequence labeled

        Returns:
            List[Tuple[str, str]]: token and its predicted tag tuple list
        """
        return self._postprocess(self._model.predict_tags(sent))
class PororoBertCharNer(PororoSimpleBase):
    """Korean character-level BERT NER with dictionary and optional WSD refinement."""

    def __init__(
        self,
        model,
        sent_tokenizer,
        wsd_dict,
        device,
        config,
    ):
        super().__init__(config)
        self._model = model
        self._sent_tokenizer = sent_tokenizer
        # Maps the model's two-letter tag codes to human-readable labels.
        self._tag = {
            "PS": "PERSON",
            "LC": "LOCATION",
            "OG": "ORGANIZATION",
            "AF": "ARTIFACT",
            "DT": "DATE",
            "TI": "TIME",
            "CV": "CIVILIZATION",
            "AM": "ANIMAL",
            "PT": "PLANT",
            "QT": "QUANTITY",
            "FD": "STUDY_FIELD",
            "TR": "THEORY",
            "EV": "EVENT",
            "MT": "MATERIAL",
            "TM": "TERM",
        }
        self._device = device
        self._wsd_dict = wsd_dict
        # WSD resources are loaded lazily on the first `apply_wsd=True` call.
        self._wsd = None
        self._cls2cat = None
        self._quant2cat = None
        self._term2cat = None

    def _template_match(self, text, expression2category):
        """
        Apply template match using regular expression

        Args:
            text (str): text to be searched
            expression2category (dict): regular expression dict

        Returns:
            str: regex matched category (implicitly None when nothing matches)
        """
        # First pattern that matches anywhere in `text` wins.
        for expression, category in expression2category.items():
            if re.search(expression, text) is not None:
                return category

    def apply_dict(self, tags: List[Tuple[str, str]]):
        """
        Apply pre-defined dictionary to get detail tag info

        Args:
            tags (List[Tuple[str, str]]): inference word-tag pair result

        Returns:
            List[Tuple[str, str]]: dict-applied result
        """
        result = []
        for pair in tags:
            word, tag = pair
            # Upgrade the coarse tag when the (tag, word) pair has a more
            # specific entry in the wiki-derived dictionary.
            if (tag in self._wsd_dict.keys()) and (word in self._wsd_dict[tag]):
                result.append((word, self._wsd_dict[tag][word].upper()))
            else:
                result.append(pair)
        return result

    def _apply_wsd(self, tags: List[Tuple[str, str]]):
        """
        Apply Word Sense Disambiguation to get detail tag info

        Args:
            tags (List[Tuple[str, str]]): inference word-tag pair result

        Returns:
            List[Tuple[str, str]]: wsd-applied result (mutates and returns `tags`)
        """
        # Lazily load the (expensive) WSD model on first use.
        if self._wsd is None:
            from pororo.tasks import PororoWsdFactory
            self._wsd = PororoWsdFactory(
                task="wsd",
                lang="ko",
                model="transformer.large.ko.wsd",
            ).load(self._device)

        # classifier (bound noun + homonym number) -> fine-grained category
        if self._cls2cat is None:
            self._cls2cat = dict()
            lines = (open(
                download_or_load(
                    "misc/wsd.cls.txt",
                    self.config.lang,
                ),
                "r",
                encoding="utf8",
            ).read().strip().splitlines())
            for line in lines:
                morph, homonymno, category = line.split()
                classifier = f"{morph}__NNB__{homonymno}"  # bound noun
                self._cls2cat[classifier] = category

        # regex templates that refine QUANTITY / TERM surfaces directly
        if self._quant2cat is None:
            self._quant2cat = dict()
            self._term2cat = dict()
            lines = (open(
                download_or_load(
                    "misc/re.templates.txt",
                    self.config.lang,
                ),
                "r",
            ).read().strip().splitlines())
            for line in lines:
                category, ner_category, expression = line.split(" ", 2)
                if ner_category == "QUANTITY":
                    self._quant2cat[expression] = category
                elif ner_category == "TERM":
                    self._term2cat[expression] = category

        # Rebuild the input text, wrapping unresolved QUANTITY spans in
        # {...} markers so the WSD model can classify them in context.
        input_text_with_markers = str()
        target_token_ids = []
        for idx, ner_token in enumerate(tags):
            surface, tag = ner_token
            # as {} will be used as special symbols
            # NOTE(review): as rendered, these two replaces are no-ops; the
            # upstream source presumably substituted fullwidth braces so that
            # literal braces in the text cannot collide with the markers
            # added below — confirm against the original repository.
            surface = surface.replace("{", "{")
            surface = surface.replace("}", "}")
            if tag == "TERM":
                cat = self._template_match(surface, self._term2cat)
                if cat is not None:
                    tags[idx] = (surface, cat)
                input_text_with_markers += surface
            elif tag == "QUANTITY":
                cat = self._template_match(surface, self._quant2cat)
                if cat is not None:
                    # resolved by regex template; no WSD needed
                    tags[idx] = (surface, cat)
                    input_text_with_markers += surface
                else:
                    # defer to WSD: remember the index and mark the span
                    target_token_ids.append(idx)
                    input_text_with_markers += "{" + surface + "}"
            else:
                input_text_with_markers += surface

        # Scan the WSD output as a small state machine: between a "{" and
        # its matching "}", the first classifier found in `_cls2cat`
        # determines the category; otherwise fall back to QUANTITY.
        wsd_results = self._wsd(input_text_with_markers)
        action = False
        has_category = False
        categories = []
        for wsd_token in wsd_results:
            morph, tag, homonymno = wsd_token[:3]
            if morph == "{":
                has_category = False
                action = True
            elif morph == "}":
                if has_category is False:
                    categories.append("QUANTITY")  # original category
                has_category = False
                action = False
            if action:
                if homonymno is None:
                    homonymno = "00"
                query = f"{morph}__{tag}__{homonymno}"
                if query in self._cls2cat:
                    category = self._cls2cat[query]
                    categories.append(category)
                    has_category = True
                    action = False

        # Exactly one category must have been produced per marked span.
        assert len(target_token_ids) == len(categories)

        for target_token_id, cat in zip(target_token_ids, categories):
            tags[target_token_id] = (tags[target_token_id][0], cat)
        return tags

    def _postprocess(self, tags: List[Tuple[str, str]]):
        """
        Postprocess character tags to concatenate BIO

        Args:
            tags (List[Tuple[str, str]]): character token and its corresponding tag tuple list

        Returns:
            List(Tuple[str, str]): postprocessed entity token and its corresponding tag tuple list
        """

        def _remove_tail(tag: str):
            # Suffix BIO scheme: "PERSON-B" / "PERSON-I" -> "PERSON".
            if "-" in tag:
                tag = tag[:-2]
            return tag

        result = list()
        tmp_word = tags[0][0]
        prev_ori_tag = tags[0][1]
        prev_tag = _remove_tail(prev_ori_tag)
        for _, pair in enumerate(tags[1:]):
            char = pair[0]
            ori_tag = pair[1]
            tag = _remove_tail(ori_tag)
            # "▁" marks a word boundary: flush the pending word and emit an
            # explicit space token, unless the char continues an entity (-I).
            if ("▁" in char) and ("-I" not in ori_tag):
                result.append((tmp_word, prev_tag))
                result.append((" ", "O"))
                tmp_word = char
                prev_tag = tag
                continue
            # Same entity continued, or consecutive O chars: keep accumulating.
            if (tag == prev_tag) and (("-I" in ori_tag) or "O" in ori_tag):
                tmp_word += char
            # Continuation tag with a different label: still merge (model noise).
            elif (tag != prev_tag) and ("-I" in ori_tag) and (tag != "O"):
                tmp_word += char
            else:
                result.append((tmp_word, prev_tag))
                tmp_word = char
                prev_tag = tag
        result.append((tmp_word, prev_tag))

        # Restore spaces from "▁" markers; keep explicit separator tokens.
        result = [(pair[0].replace("▁", " ").strip(),
                   pair[1]) if pair[0] != " " else (" ", "O")
                  for pair in result]
        return result

    def predict(
        self,
        text: str,
        **kwargs,
    ):
        """
        Conduct named entity recognition with character BERT

        Args:
            text: (str) sentence to be sequence labeled
            apply_wsd: (bool) whether to apply wsd to get more specific label information
            ignore_labels: (list) labels to be ignored

        Returns:
            List[Tuple[str, str]]: token and its predicted tag tuple list
        """
        apply_wsd = kwargs.get("apply_wsd", False)
        ignore_labels = kwargs.get("ignore_labels", [])

        texts = text.strip().split("\n")

        result = []
        for text in texts:
            for sent in self._sent_tokenizer(text.strip()):
                res = self._model.predict_tags(sent)
                # Drop pairs whose label the caller asked to ignore.
                res = [
                    pair for pair in self._postprocess(res)
                    if pair[1] not in ignore_labels
                ]
                # Map two-letter model tags to readable names (PS -> PERSON).
                res = [(
                    pair[0],
                    self._tag[pair[1]],
                ) if pair[1] in self._tag else pair for pair in res]
                res = res if not apply_wsd else self._apply_wsd(res)
                result.extend(self.apply_dict(res))
                result.extend([(" ", "O")])
        # Trim the trailing separator appended after the last sentence.
        return result[:-1]
class PororoBertNerZh(PororoSimpleBase):
    """Chinese NER wrapper around a RoBERTa sequence tagger."""

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model

    def _postprocess(
        self,
        tags: List[Tuple[str, str]],
    ):
        """
        Postprocess NER tags to concatenate BIO

        Args:
            tags (List[Tuple[str, str]]): inferenced (token, label) pair list

        Returns:
            List[Tuple[str, str]]: postprocessed BIO scheme tag list
        """

        def _remove_tail(tag):
            # "B-GPE" / "I-GPE" -> "GPE"; "O" is returned unchanged.
            if "-" in tag:
                tag = tag[2:]
            return tag

        result = list()
        word = tags[0][0]
        tag = tags[0][1]
        for token, label in tags[1:]:
            # BUGFIX: the original condition was `"I" in label`, which also
            # matched begin-tags whose entity name contains the letter "I"
            # (e.g. "B-ORDINAL", "B-CARDINAL" in OntoNotes) and wrongly merged
            # a new entity into the previous word. Only genuine continuation
            # tags start with "I-".
            if label.startswith("I-"):
                word += token
            else:
                result.append((word.strip(), _remove_tail(tag)))
                word = token
                tag = label
        result.append((word.strip(), _remove_tail(tag)))
        return result

    def predict(self, sent: str, **kwargs):
        """
        Conduct named entity recognition with Chinese RoBERTa

        Args:
            sent: (str) sentence to be sequence labeled

        Returns:
            List[Tuple[str, str]]: token and its predicted tag tuple list
        """
        tags = self._model.predict_tags(sent)
        return self._postprocess(tags)
class PororoBertNerJa(PororoSimpleBase):
    """Japanese NER wrapper around a RoBERTa sequence tagger."""

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model

    def _postprocess(
        self,
        tags: List[Tuple[str, str]],
    ):
        """
        Postprocess NER tags to concatenate BIO

        Args:
            tags (List[Tuple[str, str]]): inferenced (token, label) pair list

        Returns:
            List[Tuple[str, str]]: postprocessed BIO scheme tag list
        """

        def _remove_tail(tag):
            # "B-DATE" / "I-DATE" -> "DATE"; "O" is returned unchanged.
            if "-" in tag:
                tag = tag[2:]
            return tag

        result = list()
        word = tags[0][0]
        tag = tags[0][1]
        for token, label in tags[1:]:
            # BUGFIX: the original condition was `"I" in label`, which also
            # matched begin-tags whose entity name contains the letter "I"
            # (e.g. "B-ORGANIZATION") and wrongly merged a new entity into the
            # previous word. Only genuine continuation tags start with "I-".
            if label.startswith("I-"):
                word += token
            else:
                # "##" marks WordPiece continuation pieces; strip it from the
                # merged surface form.
                result.append((word.strip().replace("##", ""),
                               _remove_tail(tag)))
                word = token
                tag = label
        result.append((word.strip().replace("##", ""), _remove_tail(tag)))
        return result

    def predict(self, sent: str, **kwargs):
        """
        Conduct named entity recognition with Japanese RoBERTa

        Args:
            sent: (str) sentence to be sequence labeled

        Returns:
            List[Tuple[str, str]]: token and its predicted tag tuple list
        """
        return self._postprocess(self._model.predict_tags(sent))