Source code for pororo.tasks.word_embedding

"""Word Embedding related modeling class"""

from collections import OrderedDict
from typing import Optional

from whoosh.qparser import QueryParser

from pororo.tasks.utils.base import PororoFactoryBase, PororoSimpleBase
from pororo.tasks.utils.download_utils import download_or_load


[docs]class PororoWordFactory(PororoFactoryBase):
    """
    Get vector or find similar word and entity from pretrained model using wikipedia

    See also:
        Wikipedia2Vec: An Efficient Toolkit for Learning and Visualizing the Embeddings of Words and Entities from Wikipedia (https://arxiv.org/abs/1812.06280)

    English (`wikipedia2vec.en`)

        - dataset: enwiki-20180420
        - metric: N/A

    Korean (`wikipedia2vec.ko`)

        - dataset: kowiki-20200720
        - metric: N/A

    Japanese (`wikipedia2vec.ja`)

        - dataset: jawiki-20180420
        - metric: N/A

    Chinese (`wikipedia2vec.zh`)

        - dataset: zhwiki-20180420
        - metric: N/A

    Args:
        query (str): input query
        top_n (int): number of result word or entity (need for `find_similar_words`)
        group (bool): return grouped dictionary or not (need for `find_similar_words`)

    Notes:
        PororoWikipedia2Vec has two different kinds of output format, as listed below.
        1. 'something' (word) : word2vec result (non-hyperlink in wikipedia documents)
        2. 'something' (other) : entity2vec result (hyperlink in wikipedia documents)

    Examples:
        >>> word2vec = Pororo("word2vec", lang="ko")
        >>> word2vec("사과")  # vector search
        OrderedDict([
            ('사과 (word)',
                tensor([-0.2660, -0.2157, -0.3058, -0.5231, ..., 0.0905, -0.0078,  0.6168,  0.6907], device='cuda:0')),
            ('사과 (pome;fruit of Maloideae;fruit)',
                tensor([ 0.6187, -0.9504, -1.5744,  0.1751, ..., 0.0470,  0.4685,  0.7006, -0.3036], device='cuda:0')),
            ('사향사과 (religious concept)',
                tensor([-0.0748, -0.5694, -1.3145, -1.8251, ..., -0.0657,  0.9534,  0.1697, -0.8623], device='cuda:0')),
            ('사과 (교육) (liberal arts education)',
                tensor([-3.6215e-02, -1.0046e-01, -5.8013e-01, -3.4734e-01, ..., -1.1415e-01,  6.7168e-02,  8.6065e-01, -7.3844e-01], device='cuda:0')),
            ('사과 (영화) (film)',
                tensor([-0.2731, -0.2932, -0.2658, -0.0709, ..., 0.0279,  0.4272, -0.0810, -0.1934], device='cuda:0')),
            ('사과 (행위) (intentional human action)',
                tensor([-0.2321, -0.4228, -0.2982, -0.6823, ..., -0.3684,  0.4122,  0.7825, -0.2925], device='cuda:0'))
        ])
        >>> word2vec.find_similar_words("카카오")  # word or entity search
        OrderedDict([
            ('카카오 (word)', ['몰랑이 (television series)', 'NHN벅스 (business)', '나뚜루 ()', '쿠키런: 오븐브레이크 (video game;mobile game)', '네이버 오디오 클립 ()']),
            ('카카오 (taxon)', ['커피나무 (taxon)', '코코아콩 (seed;intermediate good)', '커피콩 (seed;product)', '카카오 매스 (food ingredient;food;intermediate good)', '콜라나무속 (taxon)']),
            ('카카오 (2006~2014년 기업) (business)', ['줌 (포털 사이트) (website)', '넷츠고 ()', '줌인터넷 ()', 'SK커뮤니케이션즈 (1999~2007년 기업) ()', '드림위즈 (website)']),
            ('카카오 (기업) (enterprise;business)', ['분류:카카오 (Wikimedia category)', '카카오 (2006~2014년 기업) (business)', '줌인터넷 ()', '줌 (포털 사이트) (website)', '네이버 (기업) (enterprise;business)'])
        ])
        >>> word2vec.find_similar_words("카카오", group=True)  # word or entity search using grouping
        OrderedDict([
            ('카카오 (word)',
                OrderedDict([('television series', ['몰랑이']), ('business', ['NHN벅스']), ('', ['나뚜루', '네이버 오디오클립']), ('video game', ['쿠키런: 오븐브레이크']), ('mobile game', ['쿠키런: 오븐브레이크'])])),
            ('카카오 (taxon)',
                OrderedDict([('taxon', ['커피나무', '콜라나무속']), ('seed', ['코코아콩', '커피콩']), ('intermediate good', ['코코아콩', '카카오 매스']), ('product', ['커피콩']), ('food ingredient', ['카카오 매스']), ('food', ['카카오 매스'])])),
            ('카카오 (2006~2014년 기업) (business)',
                OrderedDict([('website', ['줌 (포털 사이트)', '드림위즈']), ('', ['넷츠고', '줌인터넷', 'SK커뮤니케이션즈 (1999~2007년 기업)'])])),
            ('카카오 (기업) (enterprise;business)',
                OrderedDict([('Wikimedia category', ['분류:카카오']), ('business', ['카카오 (2006~2014년 기업)', '네이버 (기업)']), ('', ['줌인터넷']), ('website', ['줌 (포털 사이트)']), ('enterprise', ['네이버 (기업)'])]))
        ])
        >>> word2vec = Pororo("word2vec", lang="en")
        >>> word2vec("apple")  # vector search
        OrderedDict([
            ('apple (word)',
                tensor([-1.8115e-01,  1.1258e+00, -3.3197e-01,  1.6572e-01,  ..., -6.4689e-01,  6.3094e-02, -8.8036e-02, -2.1675e-01], device='cuda:0')),
            ('Apple (fruit;pome;fruit of Maloideae)',
                tensor([-3.2076e-02,  1.5557e+00,  7.0766e-01, -7.8812e-01, ..., -4.7607e-02,  3.4023e-01,  5.3378e-01, -2.7254e-01], device='cuda:0')),
            ('Muggsy Bogues (human)',
                tensor([-1.0721,  0.9283,  1.2894,  0.4695, ..., 0.1366,  0.5774,  0.0939,  0.9778], device='cuda:0')),
            ('Ariane Passenger Payload Experiment (communications satellite)',
                tensor([ 7.5558e-02, -6.4360e-01,  2.9888e-01,  1.8166e-02,  ..., -7.9919e-01,  2.8561e-01, -4.6676e-01,  2.1841e-01], device='cuda:0')),
            ('Apple Inc. (business;enterprise;NASDAQ-100;giants of the web;Dow Jones Industrial Average)',
                tensor([-0.6466,  1.1077, -0.5390,  0.5268, ..., 0.0375,  0.3269,  1.4260, -0.0849], device='cuda:0')),
            ('Apple Records (record label)',
                tensor([-0.2443,  1.3124,  0.4259,  0.8220,  ..., -0.0310,  0.6967, -1.7474,  0.4733], device='cuda:0')),
            ('Apple (album) (studio album)',
                tensor([ 0.9694,  0.7516,  0.9456, -0.2018, ..., -0.0952, -0.3208, -1.1855,  0.1000], device='cuda:0')),
            ('Apple (automobile) (motor car)',
                tensor([ 0.0273, -0.0827,  0.3302,  0.0199, ..., 0.1942,  0.2985, -0.6952, -0.2728], device='cuda:0')),
            ('Apple River (Illinois) (river)',
                tensor([-0.2683,  1.0154,  0.3947, -0.4488,  ..., 0.3037,  0.0535, -0.4189,  1.3587], device='cuda:0')),
            ('The Apple (Star Trek: The Original Series) (Star Trek episode;television series episode)',
                tensor([ 2.9253e-01,  6.0142e-01,  5.8198e-01,  1.5138e-01, ..., -4.2186e-01,  9.4759e-01, -6.0089e-02,  1.0352e+00], device='cuda:0')),
            ('The Apple (1980 film) (film)',
                tensor([ 1.0943,  0.3313,  1.5675, -1.4343,  ..., -0.2276,  0.5506, -1.5071,  1.0106], device='cuda:0'))
        ])
        >>> word2vec.find_similar_words("apple")
            OrderedDict([
                ('apple (word)', ['blackberry (word)', 'silentype (word)', 'Apple Inc. (business;enterprise;NASDAQ-100;giants of the web;Dow Jones Industrial Average)', 'paulared (word)', 'trueimage (word)']),
                ('Apple (fruit;pome;fruit of Maloideae)', ['Pear (taxon)', 'Apricot (fruit)', 'Plum (taxon)', 'Peach (taxon)', 'Cherry (fruit;drupe)']),
                ('Muggsy Bogues (human)', ['Tom Gugliotta (human)', 'Billy Owens (human)', 'David Wingate (basketball) (human)', '1995–96 Cleveland Cavaliers season (basketball team season)', ':1989–90 Denver Nuggets season (misc)']),
                ('Ariane Passenger Payload Experiment (communications satellite)', ['INSAT-3E (communications satellite)', 'INSAT-3B (communications satellite)', 'INSAT-4E (communications satellite)', 'Rohini (satellite) (artificial satellite)', 'Bhaskara (satellite) (Earth observation satellite)']),
                ('Apple Inc. (business;enterprise;NASDAQ-100;giants of the web;Dow Jones Industrial Average)', ['Steve Jobs (human)', 'IPhone (model series;smartphone)', 'apple (word)', 'IPad (model series;tablet computer)', 'IOS 7 (mobile operating system;iOS;version, edition, or translation)']),
                ('Apple Records (record label)', ['Apple Corps (business;enterprise)', 'Come and Get It: The Best of Apple Records (compilation album;Apple Records Box Set)', 'beatles (word)', 'Maybe Tomorrow (The Iveys album) (album)', 'Maybe Tomorrow (The Iveys song) (Maybe Tomorrow;single)']),
                ('Apple (album) (studio album)', ['Shine (Mother Love Bone EP) (extended play)', 'Mother Love Bone (musical group)', 'The Rockfords (album) (album)', 'Temple of the Dog (album) (album)', 'Chloe Dancer/Crown of Thorns (Shine;song;single)']),
                ('Apple (automobile) (motor car)', ['Dayton Electric (automobile manufacturer)', 'Courier Car Co (automobile manufacturer)', 'Binghamton Electric (automobile manufacturer)', 'Century (automobile) (automobile manufacturer)', 'Babcock Electric Carriage Company (business)']),
                ('Apple River (Illinois) (river)', ['Little Menominee River (stream;river)', 'Plum River (river)', 'Nl:Lijst van rivieren in Illinois (misc)', "Fr:Liste des fleuves de l'Illinois (misc)", 'Sinsinawa River (river)']),
                ('The Apple (Star Trek: The Original Series) (Star Trek episode;television series episode)', ["Mudd's Women (television film;Star Trek episode;television series episode)", 'That Which Survives (Star Trek episode;television series episode)', 'Return to Tomorrow (Star Trek episode;television series episode)', 'The Deadly Years (Star Trek episode;television series episode)', 'By Any Other Name (Star Trek episode;television series episode)']),
                ('The Apple (1980 film) (film)', ['EST and The Forum in popular culture (cultural depiction)', "The Devil's Rain (Wikimedia disambiguation page)", 'Jesus Christ Superstar (film) (film)', 'Shock Treatment (film)', 'Xanadu (film) (film)'])
            ])
        >>> word2vec.find_similar_words("apple", top_n=3, group=True)
        OrderedDict([
            ('apple (word)',
                OrderedDict([('word', ['blackberry', 'silentype']), ('business', ['Apple Inc.']), ('enterprise', ['Apple Inc.']), ('NASDAQ-100', ['Apple Inc.']), ('giants of the web', ['Apple Inc.']), ('Dow Jones Industrial Average', ['Apple Inc.'])])),
            ('Apple (fruit;pome;fruit of Maloideae)',
                OrderedDict([('taxon', ['Pear', 'Plum']), ('fruit', ['Apricot'])])),
            ('Muggsy Bogues (human)',
                OrderedDict([('human', ['Tom Gugliotta', 'Billy Owens', 'David Wingate (basketball)'])])),
            ('Ariane Passenger Payload Experiment (communications satellite)',
                OrderedDict([('communications satellite', ['INSAT-3E', 'INSAT-3B', 'INSAT-4E'])])),
            ('Apple Inc. (business;enterprise;NASDAQ-100;giants of the web;Dow Jones Industrial Average)',
                OrderedDict([('human', ['Steve Jobs']), ('model series', ['IPhone']), ('smartphone', ['IPhone']), ('word', ['apple'])])),
            ('Apple Records (record label)',
                OrderedDict([('business', ['Apple Corps']), ('enterprise', ['Apple Corps']), ('compilation album', ['Come and Get It: The Best of Apple Records']), ('Apple Records Box Set', ['Come and Get It: The Best of Apple Records']), ('word', ['beatles'])])),
            ('Apple (album) (studio album)',
                OrderedDict([('extended play', ['Shine (Mother Love Bone EP)']), ('musical group', ['Mother Love Bone']), ('album', ['The Rockfords (album)'])])),
            ('Apple (automobile) (motor car)',
                OrderedDict([('automobile manufacturer', ['Dayton Electric', 'Courier Car Co', 'Binghamton Electric'])])),
            ('Apple River (Illinois) (river)',
                OrderedDict([('stream', ['Little Menominee River']), ('river', ['Little Menominee River', 'Plum River']), ('misc', ['Nl:Lijst van rivieren in Illinois'])])),
            ('The Apple (Star Trek: The Original Series) (Star Trek episode;television series episode)',
                OrderedDict([('television film', ["Mudd's Women"]), ('Star Trek episode', ["Mudd's Women", 'That Which Survives', 'Return to Tomorrow']), ('television series episode', ["Mudd's Women", 'That Which Survives', 'Return to Tomorrow'])])),
            ('The Apple (1980 film) (film)',
                OrderedDict([('cultural depiction', ['EST and The Forum in popular culture']), ('Wikimedia disambiguation page', ["The Devil's Rain"]), ('film', ['Jesus Christ Superstar (film)'])]))
        ])
        >>> word2vec = Pororo("word2vec", lang="ja")
        >>> word2vec("リンゴ")
        OrderedDict([
            ('リンゴ (word)', tensor([ 0.1310, -0.1558,  0.8368,  0.3689,  ..., 0.0253, -0.0910,  0.1332,  0.0920], device='cuda:0')),
            ('リンゴ (fruit;fruit of Maloideae;pome)', tensor([ 0.4617, -0.3032,  1.5106,  0.7717,  ..., -0.2006,  0.2382, -0.1939,  0.2378], device='cuda:0')),
            ('リンゴ (アルバム) (album)', tensor([-0.7952,  0.3122, -0.1794,  0.5237,  ...,  -0.4918, -0.1221, -0.0287,  0.6898], device='cuda:0'))
        ])
        >>> word2vec.find_similar_words("リンゴ")
        OrderedDict([
            ('リンゴ (word)', ['サクランボ (word)', 'イチゴ (word)', 'スターキングデリシャス (word)', 'ジュース (word)', 'アスパラガス (word)']),
            ('リンゴ (fruit;fruit of Maloideae;pome)', ['イチゴ (taxon)', 'モモ (taxon)', 'ブドウ (grape juice;berry)', 'ナシ (taxon)', 'サクランボ (drupe;fruit)']),
            ('リンゴ (アルバム) (album)', ['グッドナイト・ウィーン (album;studio album)', '想い出のフォトグラフ (Ringo;single;song)', '明日への願い (single)', "オール・シングス・マスト・パス (George Harrison's albums in chronological order;triple album;studio album)", 'バック・オフ・ブーガルー (Stop and Smell the Roses;single;song)'])
        ])
        >>> word2vec.find_similar_words("リンゴ", top_n=3, group=True)
        OrderedDict([
            ('リンゴ (word)',
                OrderedDict([('word', ['サクランボ', 'イチゴ', 'スターキングデリシャス'])])),
            ('リンゴ (fruit;fruit of Maloideae;pome)',
                OrderedDict([('taxon', ['イチゴ', 'モモ']), ('grape juice', ['ブドウ']), ('berry', ['ブドウ'])])),
            ('リンゴ (アルバム) (album)',
                OrderedDict([('album', ['グッドナイト・ウィーン']), ('studio album', ['グッドナイト・ウィーン']), ('Ringo', ['想い出のフォトグラフ']), ('single', ['想い出のフォトグラフ', '明日への願い']), ('song', ['想い出のフォトグラフ'])]))
        ])
        >>> word2vec = Pororo("word2vec", lang="zh")
        >>> word2vec("苹果")
        OrderedDict([
            ('苹果 (word)', tensor([-0.1839,  0.5122, -0.1008,  0.0722, ..., 0.3404, -0.2146,  0.3418, -0.3336], device='cuda:0')),
            ('苹果 (fruit;fruit of Maloideae;pome)', tensor([-0.5241,  0.2368, -1.1965, -0.5834,  ..., 0.3141, -0.7297,  0.5291, -0.2308], device='cuda:0')),
            ('苹果 (电影) (film)', tensor([-0.7060,  0.0215,  0.6849,  0.4374, ..., -0.1802,  0.3402, -0.9224, -0.1029], device='cuda:0')),
            ('蘋果公司 (NASDAQ-100;giants of the web;business;enterprise;Dow Jones Industrial Average)', tensor([-0.8581,  0.2706,  0.0931,  0.1566,  ..., -0.3404, -0.6099,  0.3207, -1.0029], device='cuda:0'))
        ])
        >>> word2vec.find_similar_words("苹果")
        OrderedDict([
            ('苹果 (word)', ['苹果公司 (word)', '黑莓 (word)', '苹果皮 (word)', '树莓 (word)', 'ibookstore (word)']),
            ('苹果 (fruit;fruit of Maloideae;pome)', ['杏仁 (apricot;stone;culinary nuts)', '梨 (taxon)', '無花果 (taxon)', '葡萄 (grape juice;berry)', '桃 (taxon)']),
            ('苹果 (电影) (film)', ['盲山 (film)', '我的父親母親 (misc)', '闯关东 (电视剧) (television program)', '摇摆de婚约 (misc)', '北京遇上西雅圖 (misc)']),
            ('蘋果公司 (NASDAQ-100;giants of the web;business;enterprise;Dow Jones Industrial Average)', ['苹果公司 (word)', 'IOS 9 (iOS;operating system;mobile operating system)', '苹果公司 (misc)', 'MacBook Air (Ultrabook;computer model;MacBook;Apple Macintosh)', 'WWDC (misc)'])
        ])
        >>> word2vec.find_similar_words("苹果", top_n=3, group=True)
        OrderedDict([
            ('苹果 (word)',
                OrderedDict([('word', ['苹果公司', '黑莓', '苹果皮'])])),
            ('苹果 (fruit;fruit of Maloideae;pome)',
                OrderedDict([('apricot', ['杏仁']), ('stone', ['杏仁']), ('culinary nuts', ['杏仁']), ('taxon', ['梨', '無花果'])])),
            ('苹果 (电影) (film)',
                OrderedDict([('film', ['盲山']), ('misc', ['我的父親母親']), ('television program', ['闯关东 (电视剧)'])])),
            ('蘋果公司 (NASDAQ-100;giants of the web;business;enterprise;Dow Jones Industrial Average)',
                OrderedDict([('word', ['苹果公司']), ('iOS', ['IOS 9']), ('operating system', ['IOS 9']), ('mobile operating system', ['IOS 9']), ('misc', ['苹果公司'])]))
        ])

    """

    def __init__(self, task: str, lang: str, model: Optional[str]) -> None:
        """
        Initialize the factory by delegating to the base-class initializer

        Args:
            task (str): task name, forwarded unchanged
            lang (str): language code, forwarded unchanged
            model (Optional[str]): model name, forwarded unchanged

        """
        # No factory-specific state is set here; everything is handled by the parent class.
        super().__init__(task, lang, model)

[docs]    @staticmethod
    def get_available_langs():
        return ["en", "ko", "ja", "zh"]

[docs]    @staticmethod
    def get_available_models():
        return {
            "en": ["wikipedia2vec.en"],
            "ko": ["wikipedia2vec.ko"],
            "ja": ["wikipedia2vec.ja"],
            "zh": ["wikipedia2vec.zh"],
        }

[docs]    def load(self, device: str):
        """
        Load user-selected task-specific model

        Args:
            device (str): device information

        Returns:
            object: User-selected task-specific model

        """
        if "wikipedia2vec" in self.config.n_model:
            import whoosh.index as index

            from pororo.models.wikipedia2vec import Wikipedia2Vec

            vec_map = {
                "ko": "kowiki_20200720_100d.pkl",
                "en": "enwiki_20180420_100d.pkl",
                "ja": "jawiki_20180420_100d.pkl",
                "zh": "zhwiki_20180420_100d.pkl",
            }

            f_wikipedia2vec = download_or_load(
                f"misc/{vec_map[self.config.lang]}",
                self.config.lang,
            )
            wikipedia2vec = Wikipedia2Vec(f_wikipedia2vec, device)

            f_index = download_or_load(
                f"misc/{self.config.lang}_indexdir.zip",
                self.config.lang,
            )
            index_dir = index.open_dir(f_index)
            return PororoWikipedia2Vec(wikipedia2vec, index_dir, self.config)


[docs]class PororoWikipedia2Vec(PororoSimpleBase):

    def __init__(self, model, index_dir, config):
        """
        Keep references to the embedding model and the search index

        Args:
            model: embedding model queried for words, entities and vectors
            index_dir: opened search index used to resolve entities
            config: task configuration handed to the base class

        """
        super().__init__(config)
        self._model, self._ix = model, index_dir

    def _normalize(self, query):
        """
        normalize input query

        Args:
            query (str): input query

        Returns:
            str: normalized input qeury
        """
        searchterm = query.lower()
        searchterm = searchterm.replace(" ", "_")
        return searchterm

    def _get_word_vector(self, word: str):
        """
        get word vector from word string

        Args:
            word (str): word string

        Returns:
            OrderedDict: {word_string: word_vector}

        """
        headword2vec = OrderedDict()
        Word = self._model.get_word(word)

        if Word is not None:
            vec = self._model.get_word_vector(word)
            headword = f"{Word.text} (word)"
            headword2vec[headword] = vec

        return headword2vec

    def _get_entity_vectors(self, entity: str):
        """
        fetch the vectors of every indexed entity matching the given string

        The string is parsed as a query against the "searchterms" field of
        the index. Hits without a "wiki_title", and titles the model does
        not know, are skipped.

        Args:
            entity (str): entity string

        Returns:
            OrderedDict: {"<wiki_title> (<categories>)": entity_vector}

        """
        vectors = OrderedDict()

        with self._ix.searcher() as searcher:
            parsed = QueryParser("searchterms", self._ix.schema).parse(entity)

            for hit in searcher.search(parsed):
                if "wiki_title" not in hit:
                    continue

                title = hit["wiki_title"]
                categories = hit["categories"]
                label = f"{title} ({categories})"

                # The index may list titles that have no embedding in the model.
                if self._model.get_entity(title) is None:
                    continue

                vectors[label] = self._model.get_entity_vector(title)

        return vectors

    @staticmethod
    def _append(headword, relative, headword2relatives):
        """
        append relative to dictionary

        Args:
            headword: head word
            relative: relative word or entity dictionary
            headword2relatives: given result dictionary

        """

        if headword in headword2relatives:
            headword2relatives[headword].append(relative)
        else:
            headword2relatives[headword] = [relative]

    def _postprocess(self, headword2relatives):
        """
        postprocessing for better output format

        Args:
            headword2relatives (OrderedDict):

        Returns:
            OrderedDict: postprocessed output

        """
        new_headword2relatives = OrderedDict()
        for headword, relatives in headword2relatives.items():
            cat2words = OrderedDict()
            for relative in relatives:
                word, category = relative.rsplit(" (", 1)
                category = category[:-1]
                categories = category.split(";")
                for category in categories:
                    self._append(category, word, cat2words)
            new_headword2relatives[headword] = cat2words

        return new_headword2relatives

[docs]    def find_similar_words(self, query, top_n=5, group=False):
        """
        find similar words from input query

        Args:
            query (str): input query
            top_n (int): number of result
            group (bool): return grouped dictionary or not

        Returns:
            OrderedDict: word or entity search result

        """

        searchterm = self._normalize(query)

        # Final return
        headword2relatives = OrderedDict()

        with self._ix.searcher() as searcher:
            # Word
            Word = self._model.get_word(searchterm)
            if Word is not None:
                word = Word.text
                headword = f"{word} (word)"
                results = self._model.most_similar(Word, top_n + 1)
                # note that the first result is the word itself.
                if len(results) > 1:
                    for result in results[1:]:  # returned by wikipedia2vec
                        if hasattr(result[0], "text"):  # word
                            relative = result[0].text
                            relative_ = f"{relative} (word)"
                            self._append(
                                headword,
                                relative_,
                                headword2relatives,
                            )
                        else:  # entity
                            relative = result[0].title
                            idx = result[0].index.item()

                            from_idx = QueryParser(
                                "entity_idx",
                                self._ix.schema,
                            ).parse(str(idx))
                            hits = searcher.search(from_idx)
                            if len(hits) > 0:
                                category = hits[0]["categories"]
                                relative_ = f"{relative} ({category})"
                                self._append(
                                    headword,
                                    relative_,
                                    headword2relatives,
                                )
                            else:
                                relative_ = f"{relative} (misc)"
                                self._append(
                                    headword,
                                    relative_,
                                    headword2relatives,
                                )

            # Entity
            from_searchterms = QueryParser(
                "searchterms",
                self._ix.schema,
            ).parse(searchterm)
            hits = searcher.search(from_searchterms)

            # returned by indexer <Hit {'categories': 'human', 'display': 'Messi', 'wiki_title': 'Messi (2014 film)'}>
            for hit in hits:
                wiki_title = hit["wiki_title"]
                Entity = self._model.get_entity(wiki_title)
                entity = Entity.title
                category = hit["categories"]
                headword = f"{entity} ({category})"

                results = self._model.most_similar(Entity, top_n + 1)
                # note that the first result is the word itself.
                if len(results) > 1:
                    for result in results[1:]:
                        if hasattr(result[0], "text"):  # word
                            relative = result[0].text
                            relative_ = f"{relative} (word)"
                            self._append(
                                headword,
                                relative_,
                                headword2relatives,
                            )
                        else:  # entity
                            relative = result[0].title
                            idx = result[0].index.item()

                            from_idx = QueryParser(
                                "entity_idx",
                                self._ix.schema,
                            ).parse(str(idx))
                            hits = searcher.search(from_idx)
                            if len(hits) > 0:
                                category = hits[0]["categories"]
                                relative_ = f"{relative} ({category})"
                                self._append(
                                    headword,
                                    relative_,
                                    headword2relatives,
                                )
                            else:
                                relative_ = f"{relative} (misc)"
                                self._append(
                                    headword,
                                    relative_,
                                    headword2relatives,
                                )

        return self._postprocess(
            headword2relatives) if group else headword2relatives

[docs]    def predict(self, query: str, **kwargs):
        """
        predict to find similar words or entities

        Args:
            query (str): input qeury

        Returns:
            OrderedDict: vector search result

        """

        searchterm = self._normalize(query)
        word2vec = self._get_word_vector(searchterm)
        entity2vec = self._get_entity_vectors(searchterm)
        word2vec.update(entity2vec)

        if not word2vec:
            raise ValueError(f"Oops! {query} does NOT exist in our database.")
        return word2vec