# Source code for pororo.tasks.semantic_textual_similarity

"""Semantic Textual Similarity related modeling class"""

from typing import Optional

from scipy import spatial

from pororo.tasks.utils.base import PororoBiencoderBase, PororoFactoryBase
from pororo.tasks.utils.download_utils import download_or_load


class PororoStsFactory(PororoFactoryBase):
    """
    Semantic textual similarity: score how similar two sentences are

    Korean (`brainbert.base.ko.korsts`)

        - dataset: KorSTS (Ham et al. 2020)
        - metric: Spearman (83.00)

    Korean (`brainsbert.base.ko.kornli.korsts`)

        - dataset: KorSTS (Ham et al. 2020)
        - metric: Spearman (83.46)

    English (`roberta.base.en.sts`)

        - dataset: STS-B (Daniel Cer et al. 2017)
        - metric: Spearman (91.2)

    Japanese (`jaberta.base.ja.sts`)

        - dataset: Translated `STS-B` (Daniel Cer et al. 2017)
        - metric: Spearman (82.80)

    Chinese (`zhberta.base.zh.sts`)

        - dataset: Translated `STS-B` (Daniel Cer et al. 2017)
        - metric: Spearman (83.65)

    Examples:
        >>> sts = Pororo(task="similarity", lang="ko")
        >>> sts("나는 동물을 좋아하는 사람이야", "강아지를 좋아하는 아버지")  # ["I am a person who likes animals.", "A father who likes puppies."]
        0.415
        >>> sts = Pororo(task="similarity", lang="ja")
        >>> sts("ベビーパンダがスライドを下ります。", "パンダがスライドを下って滑ります。")  # ["The baby panda goes down the slide.", "The panda slides down the slide."]
        0.746
        >>> sts = Pororo(task="similarity", lang="zh")
        >>> sts('三名男子在街上做同样的舞蹈。', '街上有三个无衬衫的男人在跳舞。')  # ["Three men are doing the same dance on the street.", "There are three shirtless men dancing on the street."]
        0.669
        >>> sts = Pororo(task="similarity", lang="en")
        >>> sts("Two dogs and one cat sitting on couch.", "Two dogs and a cat resting on a couch.")
        0.921

    """

    def __init__(self, task: str, lang: str, model: Optional[str]):
        super().__init__(task, lang, model)

    @staticmethod
    def get_available_langs():
        """Return the language codes supported by this task."""
        return ["en", "ko", "ja", "zh"]

    @staticmethod
    def get_available_models():
        """Return the available model names, keyed by language code."""
        return {
            "en": ["roberta.base.en.sts"],
            "ko": [
                "brainbert.base.ko.korsts",
                "brainsbert.base.ko.kornli.korsts",
            ],
            "ja": ["jaberta.base.ja.sts", "jasbert.base.ja.nli.sts"],
            "zh": ["zhberta.base.zh.sts", "zhsbert.base.zh.nli.sts"],
        }

    def _load_bert_sts(self, model_cls, device: str):
        """
        Load a checkpoint through `model_cls` and wrap it for the STS task

        Args:
            model_cls: class exposing `load_model(path, lang)`
            device (str): device information

        Returns:
            PororoBertSts: wrapper around the model in eval mode on `device`

        """
        model = model_cls.load_model(
            f"bert/{self.config.n_model}",
            self.config.lang,
        )
        return PororoBertSts(model.eval().to(device), self.config)

    def load(self, device: str):
        """
        Load user-selected task-specific model

        The model name is matched by substring, in a fixed order. Backend
        modules are imported inside the matching branch, so only the
        selected backend gets imported.

        Args:
            device (str): device information

        Returns:
            object: User-selected task-specific model

        Raises:
            ValueError: if the configured model name matches no known backend

        """
        n_model = self.config.n_model

        if "brainbert" in n_model:
            from pororo.models.brainbert import BrainRobertaModel

            return self._load_bert_sts(BrainRobertaModel, device)

        if "jaberta" in n_model:
            from pororo.models.brainbert import JabertaModel

            return self._load_bert_sts(JabertaModel, device)

        if "zhberta" in n_model:
            from pororo.models.brainbert import ZhbertaModel

            return self._load_bert_sts(ZhbertaModel, device)

        # Matches the sentence-BERT names (`brainsbert`, `jasbert`, `zhsbert`).
        if "sbert" in n_model:
            from sentence_transformers import SentenceTransformer

            path = download_or_load(
                f"sbert/{n_model}",
                self.config.lang,
            )
            model = SentenceTransformer(path).eval().to(device)
            return PororoSBertSts(model, self.config)

        # Checked last so the more specific names above take precedence.
        if "roberta" in n_model:
            from pororo.models.brainbert import CustomRobertaModel

            return self._load_bert_sts(CustomRobertaModel, device)

        # Fail loudly rather than implicitly returning None.
        raise ValueError(f"Unsupported similarity model: {n_model}")


class PororoBertSts(PororoBiencoderBase):
    """Semantic textual similarity scorer backed by a BERT-style model"""

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model

    def predict(self, sent_a: str, sent_b: str, **kwargs) -> float:
        """
        Conduct semantic textual similarity task with BERT

        Args:
            sent_a (str): first sentence to be encoded
            sent_b (str): second sentence to be encoded
            **kwargs: accepted for signature parity with the S-BERT
                predictor; not used here

        Returns:
            float: similarity score rounded to three decimal places

        """
        sim = self._model.predict_output(sent_a, sent_b)
        # Format-then-parse rounds the score to three decimals and yields a
        # plain Python float whatever numeric type the model returned.
        return float(f"{sim:.3f}")


class PororoSBertSts(PororoBiencoderBase):
    """Semantic textual similarity scorer backed by a sentence encoder"""

    def __init__(self, model, config):
        super().__init__(config)
        self._model = model

    def predict(self, sent_a: str, sent_b: str, **kwargs) -> float:
        """
        Conduct semantic textual similarity task with S-BERT

        Args:
            sent_a (str): first sentence to be encoded
            sent_b (str): second sentence to be encoded
            **kwargs: accepted but not used by this predictor

        Returns:
            float: cosine similarity of the two sentence embeddings,
                rounded to three decimal places

        """
        # Encode both sentences in a single call.
        encoded = self._model.encode([sent_a, sent_b])
        vec_a, vec_b = encoded[0], encoded[-1]
        # scipy returns cosine *distance*; similarity is its complement.
        sim = 1 - spatial.distance.cosine(vec_a, vec_b)
        return float(f"{sim:.3f}")