Source code for pororo.tasks.contextualized_embedding

"""Contextualized Embedding related modeling class"""

from typing import Optional

from pororo.tasks.utils.base import PororoFactoryBase, PororoSimpleBase


[docs]class PororoContextualFactory(PororoFactoryBase): """ Conduct contextualized embedding English (`roberta.base.en`) - dataset: N/A - metric: N/A Korean (`brainbert.base.ko`) - dataset: N/A - metric: N/A Japanese (`jaberta.base.ja`) - dataset: N/A - metric: N/A Chinese (`zhberta.base.zh`) - dataset: N/A - metric: N/A Args: sent (str): input sentence to be contextualized embedded Returns: np.array: sentence embedding with subword units Examples: >>> cse = Pororo(task="cse", lang="ko") >>> cse("하늘을 나는 새") array([[92.53, 20.24, 32.32, ...], ..., [63.24, 53.19, 45.78, ...]], dtype=float32) # (len(subwords), hidden_dim) >>> cse = Pororo(task="cse", lang="zh") >>> cse("一群人抬头看着建筑物屋顶边缘的3人。") array([[ 0.61136365, 0.24613665, 0.6259908 , ..., 0.32798234, 0.10512973, -0.06808531],..., [-0.00931012, -0.04459633, 1.0253953 , ..., 0.30732906, 0.22213839, 0.25226325]], dtype=float32) >>> cse = Pororo(task="cse", lang="ja") >>> cse("おはようございます") array([[-0.26724914, -0.23364174, -0.07206455, ..., 0.30293447, -0.36008322, 0.24684878], ..., [-0.7470922 , -0.30342472, -0.64015895, ..., -0.17556943, 0.10660946, -0.17191087]], dtype=float32) """ def __init__(self, task: str, lang: str, model: Optional[str]): super().__init__(task, lang, model)
[docs] @staticmethod def get_available_langs(): return ["en", "ko", "zh", "ja"]
[docs] @staticmethod def get_available_models(): return { "en": ["roberta.base.en"], "ko": ["brainbert.base.ko"], "zh": ["zhberta.base.zh"], "ja": ["jaberta.base.ja"], }
[docs] def load(self, device: str): """ Load user-selected task-specific model Args: device (str): device information Returns: object: User-selected task-specific model """ if "roberta" in self.config.n_model: from pororo.models.brainbert import CustomRobertaModel model = (CustomRobertaModel.load_model( f"bert/{self.config.n_model}", self.config.lang, ).eval().to(device)) return PororoBertContextualized(model, self.config, device) if "brainbert" in self.config.n_model: from pororo.models.brainbert import BrainRobertaModel model = BrainRobertaModel.load_model( f"bert/{self.config.n_model}", self.config.lang, ).eval().to(device) return PororoBertContextualized(model, self.config, device) if "jaberta" in self.config.n_model: from pororo.models.brainbert import JabertaModel model = JabertaModel.load_model( f"bert/{self.config.n_model}", self.config.lang, ).eval().to(device) return PororoBertContextualized(model, self.config, device) if "zhberta" in self.config.n_model: from pororo.models.brainbert import ZhbertaModel model = ZhbertaModel.load_model( f"bert/{self.config.n_model}", self.config.lang, ).eval().to(device) return PororoBertContextualized(model, self.config, device)
[docs]class PororoBertContextualized(PororoSimpleBase): def __init__(self, model, config, device): super().__init__(config) self._model = model self._device = device
[docs] def predict(self, sent: str, **kwargs): """ Conduct contextualized embedding Args: sent (str): input sentence to be contextualized embedded Returns: np.array: sentence embedding with subword units """ indices = self._model.encode(sent).to(self._device) features, _ = self._model.model( indices.unsqueeze(0), features_only=True, ) return features.squeeze(0).detach().cpu().numpy()