Source code for pororo.tasks.word_embedding

"""Word Embedding related modeling class"""

from collections import OrderedDict
from typing import Optional

from whoosh.qparser import QueryParser

from pororo.tasks.utils.base import PororoFactoryBase, PororoSimpleBase
from pororo.tasks.utils.download_utils import download_or_load


class PororoWordFactory(PororoFactoryBase):
    """
    Get vector or find similar word and entity from pretrained model using wikipedia

    See also:
        Wikipedia2Vec: An Efficient Toolkit for Learning and Visualizing the Embeddings of Words and Entities from Wikipedia (https://arxiv.org/abs/1812.06280)

    English (`wikipedia2vec.en`)

        - dataset: enwiki-20180420
        - metric: N/A

    Korean (`wikipedia2vec.ko`)

        - dataset: kowiki-20200720
        - metric: N/A

    Japanese (`wikipedia2vec.ja`)

        - dataset: jawiki-20180420
        - metric: N/A

    Chinese (`wikipedia2vec.zh`)

        - dataset: zhwiki-20180420
        - metric: N/A

    Args:
        query (str): input query
        top_n (int): number of result word or entity (need for `find_similar_words`)
        group (bool): return grouped dictionary or not (need for `find_similar_words`)

    Notes:
        PororoWikipedia2Vec has two different kinds of output format following below.
        1. 'something' (word) : word2vec result (non-hyperlink in wikipedia documents)
        2. 'something' (other) : entity2vec result (hyperlink in wikipedia documents)

    Examples:
        >>> word2vec = Pororo("word2vec", lang="ko")
        >>> word2vec("사과")  # vector search
        OrderedDict([
            ('사과 (word)', tensor([-0.2660, -0.2157, -0.3058, -0.5231, ..., 0.0905, -0.0078, 0.6168, 0.6907], device='cuda:0')),
            ('사과 (pome;fruit of Maloideae;fruit)', tensor([ 0.6187, -0.9504, -1.5744, 0.1751, ..., 0.0470, 0.4685, 0.7006, -0.3036], device='cuda:0')),
            ('사향사과 (religious concept)', tensor([-0.0748, -0.5694, -1.3145, -1.8251, ..., -0.0657, 0.9534, 0.1697, -0.8623], device='cuda:0')),
            ('사과 (교육) (liberal arts education)', tensor([-3.6215e-02, -1.0046e-01, -5.8013e-01, -3.4734e-01, ..., -1.1415e-01, 6.7168e-02, 8.6065e-01, -7.3844e-01], device='cuda:0')),
            ('사과 (영화) (film)', tensor([-0.2731, -0.2932, -0.2658, -0.0709, ..., 0.0279, 0.4272, -0.0810, -0.1934], device='cuda:0')),
            ('사과 (행위) (intentional human action)', tensor([-0.2321, -0.4228, -0.2982, -0.6823, ..., -0.3684, 0.4122, 0.7825, -0.2925], device='cuda:0'))
        ])
        >>> word2vec.find_similar_words("카카오")  # word or entity search
        OrderedDict([
            ('카카오 (word)', ['몰랑이 (television series)', 'NHN벅스 (business)', '나뚜루 ()', '쿠키런: 오븐브레이크 (video game;mobile game)', '네이버 오디오 클립 ()']),
            ('카카오 (taxon)', ['커피나무 (taxon)', '코코아콩 (seed;intermediate good)', '커피콩 (seed;product)', '카카오 매스 (food ingredient;food;intermediate good)', '콜라나무속 (taxon)']),
            ('카카오 (2006~2014년 기업) (business)', ['줌 (포털 사이트) (website)', '넷츠고 ()', '줌인터넷 ()', 'SK커뮤니케이션즈 (1999~2007년 기업) ()', '드림위즈 (website)']),
            ('카카오 (기업) (enterprise;business)', ['분류:카카오 (Wikimedia category)', '카카오 (2006~2014년 기업) (business)', '줌인터넷 ()', '줌 (포털 사이트) (website)', '네이버 (기업) (enterprise;business)'])
        ])
        >>> word2vec.find_similar_words("카카오", group=True)  # word or entity search using grouping
        OrderedDict([
            ('카카오 (word)', OrderedDict([('television series', ['몰랑이']), ('business', ['NHN벅스']), ('', ['나뚜루', '네이버 오디오클립']), ('video game', ['쿠키런: 오븐브레이크']), ('mobile game', ['쿠키런: 오븐브레이크'])])),
            ('카카오 (taxon)', OrderedDict([('taxon', ['커피나무', '콜라나무속']), ('seed', ['코코아콩', '커피콩']), ('intermediate good', ['코코아콩', '카카오 매스']), ('product', ['커피콩']), ('food ingredient', ['카카오 매스']), ('food', ['카카오 매스'])])),
            ('카카오 (2006~2014년 기업) (business)', OrderedDict([('website', ['줌 (포털 사이트)', '드림위즈']), ('', ['넷츠고', '줌인터넷', 'SK커뮤니케이션즈 (1999~2007년 기업)'])])),
            ('카카오 (기업) (enterprise;business)', OrderedDict([('Wikimedia category', ['분류:카카오']), ('business', ['카카오 (2006~2014년 기업)', '네이버 (기업)']), ('', ['줌인터넷']), ('website', ['줌 (포털 사이트)']), ('enterprise', ['네이버 (기업)'])]))
        ])
        >>> word2vec = Pororo("word2vec", lang="en")
        >>> word2vec("apple")  # vector search
        OrderedDict([
            ('apple (word)', tensor([-1.8115e-01, 1.1258e+00, -3.3197e-01, 1.6572e-01, ..., -6.4689e-01, 6.3094e-02, -8.8036e-02, -2.1675e-01], device='cuda:0')),
            ('Apple (fruit;pome;fruit of Maloideae)', tensor([-3.2076e-02, 1.5557e+00, 7.0766e-01, -7.8812e-01, ..., -4.7607e-02, 3.4023e-01, 5.3378e-01, -2.7254e-01], device='cuda:0')),
            ('Muggsy Bogues (human)', tensor([-1.0721, 0.9283, 1.2894, 0.4695, ..., 0.1366, 0.5774, 0.0939, 0.9778], device='cuda:0')),
            ('Ariane Passenger Payload Experiment (communications satellite)', tensor([ 7.5558e-02, -6.4360e-01, 2.9888e-01, 1.8166e-02, ..., -7.9919e-01, 2.8561e-01, -4.6676e-01, 2.1841e-01], device='cuda:0')),
            ('Apple Inc. (business;enterprise;NASDAQ-100;giants of the web;Dow Jones Industrial Average)', tensor([-0.6466, 1.1077, -0.5390, 0.5268, ..., 0.0375, 0.3269, 1.4260, -0.0849], device='cuda:0')),
            ('Apple Records (record label)', tensor([-0.2443, 1.3124, 0.4259, 0.8220, ..., -0.0310, 0.6967, -1.7474, 0.4733], device='cuda:0')),
            ('Apple (album) (studio album)', tensor([ 0.9694, 0.7516, 0.9456, -0.2018, ..., -0.0952, -0.3208, -1.1855, 0.1000], device='cuda:0')),
            ('Apple (automobile) (motor car)', tensor([ 0.0273, -0.0827, 0.3302, 0.0199, ..., 0.1942, 0.2985, -0.6952, -0.2728], device='cuda:0')),
            ('Apple River (Illinois) (river)', tensor([-0.2683, 1.0154, 0.3947, -0.4488, ..., 0.3037, 0.0535, -0.4189, 1.3587], device='cuda:0')),
            ('The Apple (Star Trek: The Original Series) (Star Trek episode;television series episode)', tensor([ 2.9253e-01, 6.0142e-01, 5.8198e-01, 1.5138e-01, ..., -4.2186e-01, 9.4759e-01, -6.0089e-02, 1.0352e+00], device='cuda:0')),
            ('The Apple (1980 film) (film)', tensor([ 1.0943, 0.3313, 1.5675, -1.4343, ..., -0.2276, 0.5506, -1.5071, 1.0106], device='cuda:0'))
        ])
        >>> word2vec.find_similar_words("apple")
        OrderedDict([
            ('apple (word)', ['blackberry (word)', 'silentype (word)', 'Apple Inc. (business;enterprise;NASDAQ-100;giants of the web;Dow Jones Industrial Average)', 'paulared (word)', 'trueimage (word)']),
            ('Apple (fruit;pome;fruit of Maloideae)', ['Pear (taxon)', 'Apricot (fruit)', 'Plum (taxon)', 'Peach (taxon)', 'Cherry (fruit;drupe)']),
            ('Muggsy Bogues (human)', ['Tom Gugliotta (human)', 'Billy Owens (human)', 'David Wingate (basketball) (human)', '1995–96 Cleveland Cavaliers season (basketball team season)', ':1989–90 Denver Nuggets season (misc)']),
            ('Ariane Passenger Payload Experiment (communications satellite)', ['INSAT-3E (communications satellite)', 'INSAT-3B (communications satellite)', 'INSAT-4E (communications satellite)', 'Rohini (satellite) (artificial satellite)', 'Bhaskara (satellite) (Earth observation satellite)']),
            ('Apple Inc. (business;enterprise;NASDAQ-100;giants of the web;Dow Jones Industrial Average)', ['Steve Jobs (human)', 'IPhone (model series;smartphone)', 'apple (word)', 'IPad (model series;tablet computer)', 'IOS 7 (mobile operating system;iOS;version, edition, or translation)']),
            ('Apple Records (record label)', ['Apple Corps (business;enterprise)', 'Come and Get It: The Best of Apple Records (compilation album;Apple Records Box Set)', 'beatles (word)', 'Maybe Tomorrow (The Iveys album) (album)', 'Maybe Tomorrow (The Iveys song) (Maybe Tomorrow;single)']),
            ('Apple (album) (studio album)', ['Shine (Mother Love Bone EP) (extended play)', 'Mother Love Bone (musical group)', 'The Rockfords (album) (album)', 'Temple of the Dog (album) (album)', 'Chloe Dancer/Crown of Thorns (Shine;song;single)']),
            ('Apple (automobile) (motor car)', ['Dayton Electric (automobile manufacturer)', 'Courier Car Co (automobile manufacturer)', 'Binghamton Electric (automobile manufacturer)', 'Century (automobile) (automobile manufacturer)', 'Babcock Electric Carriage Company (business)']),
            ('Apple River (Illinois) (river)', ['Little Menominee River (stream;river)', 'Plum River (river)', 'Nl:Lijst van rivieren in Illinois (misc)', "Fr:Liste des fleuves de l'Illinois (misc)", 'Sinsinawa River (river)']),
            ('The Apple (Star Trek: The Original Series) (Star Trek episode;television series episode)', ["Mudd's Women (television film;Star Trek episode;television series episode)", 'That Which Survives (Star Trek episode;television series episode)', 'Return to Tomorrow (Star Trek episode;television series episode)', 'The Deadly Years (Star Trek episode;television series episode)', 'By Any Other Name (Star Trek episode;television series episode)']),
            ('The Apple (1980 film) (film)', ['EST and The Forum in popular culture (cultural depiction)', "The Devil's Rain (Wikimedia disambiguation page)", 'Jesus Christ Superstar (film) (film)', 'Shock Treatment (film)', 'Xanadu (film) (film)'])
        ])
        >>> word2vec.find_similar_words("apple", top_n=3, group=True)
        OrderedDict([
            ('apple (word)', OrderedDict([('word', ['blackberry', 'silentype']), ('business', ['Apple Inc.']), ('enterprise', ['Apple Inc.']), ('NASDAQ-100', ['Apple Inc.']), ('giants of the web', ['Apple Inc.']), ('Dow Jones Industrial Average', ['Apple Inc.'])])),
            ('Apple (fruit;pome;fruit of Maloideae)', OrderedDict([('taxon', ['Pear', 'Plum']), ('fruit', ['Apricot'])])),
            ('Muggsy Bogues (human)', OrderedDict([('human', ['Tom Gugliotta', 'Billy Owens', 'David Wingate (basketball)'])])),
            ('Ariane Passenger Payload Experiment (communications satellite)', OrderedDict([('communications satellite', ['INSAT-3E', 'INSAT-3B', 'INSAT-4E'])])),
            ('Apple Inc. (business;enterprise;NASDAQ-100;giants of the web;Dow Jones Industrial Average)', OrderedDict([('human', ['Steve Jobs']), ('model series', ['IPhone']), ('smartphone', ['IPhone']), ('word', ['apple'])])),
            ('Apple Records (record label)', OrderedDict([('business', ['Apple Corps']), ('enterprise', ['Apple Corps']), ('compilation album', ['Come and Get It: The Best of Apple Records']), ('Apple Records Box Set', ['Come and Get It: The Best of Apple Records']), ('word', ['beatles'])])),
            ('Apple (album) (studio album)', OrderedDict([('extended play', ['Shine (Mother Love Bone EP)']), ('musical group', ['Mother Love Bone']), ('album', ['The Rockfords (album)'])])),
            ('Apple (automobile) (motor car)', OrderedDict([('automobile manufacturer', ['Dayton Electric', 'Courier Car Co', 'Binghamton Electric'])])),
            ('Apple River (Illinois) (river)', OrderedDict([('stream', ['Little Menominee River']), ('river', ['Little Menominee River', 'Plum River']), ('misc', ['Nl:Lijst van rivieren in Illinois'])])),
            ('The Apple (Star Trek: The Original Series) (Star Trek episode;television series episode)', OrderedDict([('television film', ["Mudd's Women"]), ('Star Trek episode', ["Mudd's Women", 'That Which Survives', 'Return to Tomorrow']), ('television series episode', ["Mudd's Women", 'That Which Survives', 'Return to Tomorrow'])])),
            ('The Apple (1980 film) (film)', OrderedDict([('cultural depiction', ['EST and The Forum in popular culture']), ('Wikimedia disambiguation page', ["The Devil's Rain"]), ('film', ['Jesus Christ Superstar (film)'])]))
        ])
        >>> word2vec = Pororo("word2vec", lang="ja")
        >>> word2vec("リンゴ")
        OrderedDict([
            ('リンゴ (word)', tensor([ 0.1310, -0.1558, 0.8368, 0.3689, ..., 0.0253, -0.0910, 0.1332, 0.0920], device='cuda:0')),
            ('リンゴ (fruit;fruit of Maloideae;pome)', tensor([ 0.4617, -0.3032, 1.5106, 0.7717, ..., -0.2006, 0.2382, -0.1939, 0.2378], device='cuda:0')),
            ('リンゴ (アルバム) (album)', tensor([-0.7952, 0.3122, -0.1794, 0.5237, ..., -0.4918, -0.1221, -0.0287, 0.6898], device='cuda:0'))
        ])
        >>> word2vec.find_similar_words("リンゴ")
        OrderedDict([
            ('リンゴ (word)', ['サクランボ (word)', 'イチゴ (word)', 'スターキングデリシャス (word)', 'ジュース (word)', 'アスパラガス (word)']),
            ('リンゴ (fruit;fruit of Maloideae;pome)', ['イチゴ (taxon)', 'モモ (taxon)', 'ブドウ (grape juice;berry)', 'ナシ (taxon)', 'サクランボ (drupe;fruit)']),
            ('リンゴ (アルバム) (album)', ['グッドナイト・ウィーン (album;studio album)', '想い出のフォトグラフ (Ringo;single;song)', '明日への願い (single)', "オール・シングス・マスト・パス (George Harrison's albums in chronological order;triple album;studio album)", 'バック・オフ・ブーガルー (Stop and Smell the Roses;single;song)'])
        ])
        >>> word2vec.find_similar_words("リンゴ", top_n=3, group=True)
        OrderedDict([
            ('リンゴ (word)', OrderedDict([('word', ['サクランボ', 'イチゴ', 'スターキングデリシャス'])])),
            ('リンゴ (fruit;fruit of Maloideae;pome)', OrderedDict([('taxon', ['イチゴ', 'モモ']), ('grape juice', ['ブドウ']), ('berry', ['ブドウ'])])),
            ('リンゴ (アルバム) (album)', OrderedDict([('album', ['グッドナイト・ウィーン']), ('studio album', ['グッドナイト・ウィーン']), ('Ringo', ['想い出のフォトグラフ']), ('single', ['想い出のフォトグラフ', '明日への願い']), ('song', ['想い出のフォトグラフ'])]))
        ])
        >>> word2vec = Pororo("word2vec", lang="zh")
        >>> word2vec("苹果")
        OrderedDict([
            ('苹果 (word)', tensor([-0.1839, 0.5122, -0.1008, 0.0722, ..., 0.3404, -0.2146, 0.3418, -0.3336], device='cuda:0')),
            ('苹果 (fruit;fruit of Maloideae;pome)', tensor([-0.5241, 0.2368, -1.1965, -0.5834, ..., 0.3141, -0.7297, 0.5291, -0.2308], device='cuda:0')),
            ('苹果 (电影) (film)', tensor([-0.7060, 0.0215, 0.6849, 0.4374, ..., -0.1802, 0.3402, -0.9224, -0.1029], device='cuda:0')),
            ('蘋果公司 (NASDAQ-100;giants of the web;business;enterprise;Dow Jones Industrial Average)', tensor([-0.8581, 0.2706, 0.0931, 0.1566, ..., -0.3404, -0.6099, 0.3207, -1.0029], device='cuda:0'))
        ])
        >>> word2vec.find_similar_words("苹果")
        OrderedDict([
            ('苹果 (word)', ['苹果公司 (word)', '黑莓 (word)', '苹果皮 (word)', '树莓 (word)', 'ibookstore (word)']),
            ('苹果 (fruit;fruit of Maloideae;pome)', ['杏仁 (apricot;stone;culinary nuts)', '梨 (taxon)', '無花果 (taxon)', '葡萄 (grape juice;berry)', '桃 (taxon)']),
            ('苹果 (电影) (film)', ['盲山 (film)', '我的父親母親 (misc)', '闯关东 (电视剧) (television program)', '摇摆de婚约 (misc)', '北京遇上西雅圖 (misc)']),
            ('蘋果公司 (NASDAQ-100;giants of the web;business;enterprise;Dow Jones Industrial Average)', ['苹果公司 (word)', 'IOS 9 (iOS;operating system;mobile operating system)', '苹果公司 (misc)', 'MacBook Air (Ultrabook;computer model;MacBook;Apple Macintosh)', 'WWDC (misc)'])
        ])
        >>> word2vec.find_similar_words("苹果", top_n=3, group=True)
        OrderedDict([
            ('苹果 (word)', OrderedDict([('word', ['苹果公司', '黑莓', '苹果皮'])])),
            ('苹果 (fruit;fruit of Maloideae;pome)', OrderedDict([('apricot', ['杏仁']), ('stone', ['杏仁']), ('culinary nuts', ['杏仁']), ('taxon', ['梨', '無花果'])])),
            ('苹果 (电影) (film)', OrderedDict([('film', ['盲山']), ('misc', ['我的父親母親']), ('television program', ['闯关东 (电视剧)'])])),
            ('蘋果公司 (NASDAQ-100;giants of the web;business;enterprise;Dow Jones Industrial Average)', OrderedDict([('word', ['苹果公司']), ('iOS', ['IOS 9']), ('operating system', ['IOS 9']), ('mobile operating system', ['IOS 9']), ('misc', ['苹果公司'])]))
        ])

    """

    def __init__(self, task: str, lang: str, model: Optional[str]):
        super().__init__(task, lang, model)

    @staticmethod
    def get_available_langs():
        """Return the language codes this task supports."""
        return ["en", "ko", "ja", "zh"]

    @staticmethod
    def get_available_models():
        """Return the model names available for each supported language."""
        # Every language ships exactly one model, named after its language code.
        return {
            lang: [f"wikipedia2vec.{lang}"]
            for lang in ("en", "ko", "ja", "zh")
        }

    def load(self, device: str):
        """
        Load user-selected task-specific model

        Args:
            device (str): device information

        Returns:
            object: User-selected task-specific model

        """
        if "wikipedia2vec" not in self.config.n_model:
            # No other model family is handled by this factory.
            return None

        # Imported lazily so the dependencies are only needed when a model is loaded.
        import whoosh.index as index

        from pororo.models.wikipedia2vec import Wikipedia2Vec

        embedding_files = {
            "ko": "kowiki_20200720_100d.pkl",
            "en": "enwiki_20180420_100d.pkl",
            "ja": "jawiki_20180420_100d.pkl",
            "zh": "zhwiki_20180420_100d.pkl",
        }

        embedding_path = download_or_load(
            f"misc/{embedding_files[self.config.lang]}",
            self.config.lang,
        )
        embedding = Wikipedia2Vec(embedding_path, device)

        index_path = download_or_load(
            f"misc/{self.config.lang}_indexdir.zip",
            self.config.lang,
        )
        opened_index = index.open_dir(index_path)

        return PororoWikipedia2Vec(embedding, opened_index, self.config)
class PororoWikipedia2Vec(PororoSimpleBase):
    """
    Vector lookup and similar-item search over a word/entity embedding model

    Args:
        model: embedding model; this class calls its `get_word`,
            `get_word_vector`, `get_entity`, `get_entity_vector` and
            `most_similar` methods
        index_dir: opened whoosh index; it is queried through the
            `searchterms` and `entity_idx` fields, and the `wiki_title` and
            `categories` stored fields of its hits are read
        config: task configuration forwarded to `PororoSimpleBase`

    """

    def __init__(self, model, index_dir, config):
        super().__init__(config)
        self._model = model
        self._ix = index_dir

    def _normalize(self, query: str) -> str:
        """
        normalize input query

        Args:
            query (str): input query

        Returns:
            str: lower-cased query with spaces replaced by underscores

        """
        return query.lower().replace(" ", "_")

    def _get_word_vector(self, word: str):
        """
        get word vector from word string

        Args:
            word (str): word string

        Returns:
            OrderedDict: {word_string: word_vector}, empty if the model does
                not know the word

        """
        headword2vec = OrderedDict()

        found = self._model.get_word(word)
        if found is not None:
            headword2vec[f"{found.text} (word)"] = self._model.get_word_vector(
                word)

        return headword2vec

    def _get_entity_vectors(self, entity: str):
        """
        get entity vector from entity string

        Args:
            entity (str): entity string

        Returns:
            OrderedDict: {entity_string: entity_vector}

        """
        headword2vec = OrderedDict()

        with self._ix.searcher() as searcher:
            query = QueryParser("searchterms", self._ix.schema).parse(entity)
            for hit in searcher.search(query):
                if "wiki_title" not in hit:
                    continue

                wiki_title = hit["wiki_title"]
                category = hit["categories"]

                # The index may list titles the embedding model does not contain.
                if self._model.get_entity(wiki_title) is None:
                    continue

                headword = f"{wiki_title} ({category})"
                headword2vec[headword] = self._model.get_entity_vector(
                    wiki_title)

        return headword2vec

    @staticmethod
    def _append(headword, relative, headword2relatives):
        """
        append relative to dictionary

        Args:
            headword: head word
            relative: relative word or entity
            headword2relatives: result dictionary, updated in place

        """
        headword2relatives.setdefault(headword, []).append(relative)

    def _postprocess(self, headword2relatives):
        """
        postprocessing for better output format

        Each relative of the form `"name (cat1;cat2)"` is regrouped under
        every one of its categories.

        Args:
            headword2relatives (OrderedDict): {headword: [relative, ...]}

        Returns:
            OrderedDict: {headword: OrderedDict({category: [name, ...]})}

        """
        grouped = OrderedDict()

        for headword, relatives in headword2relatives.items():
            cat2words = OrderedDict()
            for relative in relatives:
                # Split on the last " (" because names may contain parentheses too.
                word, category_field = relative.rsplit(" (", 1)
                # Drop the closing parenthesis before splitting the categories.
                for category in category_field[:-1].split(";"):
                    self._append(category, word, cat2words)
            grouped[headword] = cat2words

        return grouped

    def _format_relative(self, item, searcher, idx_parser) -> str:
        """
        build the display string of one item returned by `most_similar`

        Args:
            item: word (has a `text` attribute) or entity (has `title` and
                `index` attributes)
            searcher: open searcher of the whoosh index
            idx_parser: query parser bound to the `entity_idx` field

        Returns:
            str: `"<text> (word)"` for words, `"<title> (<categories>)"` for
                entities, with `misc` as category when the index has no entry

        """
        if hasattr(item, "text"):
            return f"{item.text} (word)"

        title = item.title
        idx_hits = searcher.search(idx_parser.parse(str(item.index.item())))
        category = idx_hits[0]["categories"] if len(idx_hits) > 0 else "misc"
        return f"{title} ({category})"

    def _collect_relatives(
        self,
        item,
        headword: str,
        top_n: int,
        searcher,
        headword2relatives,
    ):
        """
        append the nearest neighbours of `item` under `headword`

        Args:
            item: word or entity object accepted by the model's `most_similar`
            headword (str): key under which the neighbours are stored
            top_n (int): number of neighbours to keep
            searcher: open searcher of the whoosh index
            headword2relatives: result dictionary, updated in place

        """
        idx_parser = QueryParser("entity_idx", self._ix.schema)

        # One extra result is requested because the first one is the item itself.
        results = self._model.most_similar(item, top_n + 1)
        for result in results[1:]:
            self._append(
                headword,
                self._format_relative(result[0], searcher, idx_parser),
                headword2relatives,
            )

    def find_similar_words(self, query: str, top_n: int = 5, group: bool = False):
        """
        find similar words from input query

        Index hits without a `wiki_title`, or whose title is unknown to the
        embedding model, are skipped.

        Args:
            query (str): input query
            top_n (int): number of result
            group (bool): return grouped dictionary or not

        Returns:
            OrderedDict: word or entity search result

        """
        searchterm = self._normalize(query)
        headword2relatives = OrderedDict()

        with self._ix.searcher() as searcher:
            # Word
            word = self._model.get_word(searchterm)
            if word is not None:
                self._collect_relatives(
                    word,
                    f"{word.text} (word)",
                    top_n,
                    searcher,
                    headword2relatives,
                )

            # Entity
            from_searchterms = QueryParser(
                "searchterms",
                self._ix.schema,
            ).parse(searchterm)
            for hit in searcher.search(from_searchterms):
                if "wiki_title" not in hit:
                    continue

                entity = self._model.get_entity(hit["wiki_title"])
                if entity is None:
                    # Indexed title that the embedding model does not contain.
                    continue

                category = hit["categories"]
                self._collect_relatives(
                    entity,
                    f"{entity.title} ({category})",
                    top_n,
                    searcher,
                    headword2relatives,
                )

        if group:
            return self._postprocess(headword2relatives)
        return headword2relatives

    def predict(self, query: str, **kwargs):
        """
        look up the word vector and entity vectors matching the query

        Args:
            query (str): input query

        Returns:
            OrderedDict: vector search result

        Raises:
            ValueError: if neither a word nor an entity matches the query

        """
        searchterm = self._normalize(query)

        word2vec = self._get_word_vector(searchterm)
        word2vec.update(self._get_entity_vectors(searchterm))

        if not word2vec:
            raise ValueError(f"Oops! {query} does NOT exist in our database.")

        return word2vec