Word Embedding

Word Embedding related modeling class

class pororo.tasks.word_embedding.PororoWordFactory(task: str, lang: str, model: Optional[str])[source]

Bases: pororo.tasks.utils.base.PororoFactoryBase

Get the vector of a word or entity, or find similar words and entities, using a model pretrained on Wikipedia

See also

Wikipedia2Vec: An Efficient Toolkit for Learning and Visualizing the Embeddings of Words and Entities from Wikipedia (https://arxiv.org/abs/1812.06280)

English (wikipedia2vec.en)

  • dataset: enwiki-20180420

  • metric: N/A

Korean (wikipedia2vec.ko)

  • dataset: kowiki-20200720

  • metric: N/A

Japanese (wikipedia2vec.ja)

  • dataset: jawiki-20180420

  • metric: N/A

Chinese (wikipedia2vec.zh)

  • dataset: zhwiki-20180420

  • metric: N/A

Parameters
  • query (str) – input query

  • top_n (int) – number of similar words or entities to return (only used by find_similar_words)

  • group (bool) – whether to return the results as a grouped dictionary (only used by find_similar_words)

Notes

PororoWikipedia2Vec has two different kinds of output format, as follows. 1. ‘something’ (word) : word2vec result (non-hyperlink in wikipedia documents) 2. ‘something’ (other) : entity2vec result (hyperlink in wikipedia documents)

Examples

>>> word2vec = Pororo("word2vec", lang="ko")
>>> word2vec("사과")  # vector search
OrderedDict([
    ('사과 (word)',
        tensor([-0.2660, -0.2157, -0.3058, -0.5231, ..., 0.0905, -0.0078,  0.6168,  0.6907], device='cuda:0')),
    ('사과 (pome;fruit of Maloideae;fruit)',
        tensor([ 0.6187, -0.9504, -1.5744,  0.1751, ..., 0.0470,  0.4685,  0.7006, -0.3036], device='cuda:0')),
    ('사향사과 (religious concept)',
        tensor([-0.0748, -0.5694, -1.3145, -1.8251, ..., -0.0657,  0.9534,  0.1697, -0.8623], device='cuda:0')),
    ('사과 (교육) (liberal arts education)',
        tensor([-3.6215e-02, -1.0046e-01, -5.8013e-01, -3.4734e-01, ..., -1.1415e-01,  6.7168e-02,  8.6065e-01, -7.3844e-01], device='cuda:0')),
    ('사과 (영화) (film)',
        tensor([-0.2731, -0.2932, -0.2658, -0.0709, ..., 0.0279,  0.4272, -0.0810, -0.1934], device='cuda:0')),
    ('사과 (행위) (intentional human action)',
        tensor([-0.2321, -0.4228, -0.2982, -0.6823, ..., -0.3684,  0.4122,  0.7825, -0.2925], device='cuda:0'))
])
>>> word2vec.find_similar_words("카카오")  # word or entity search
OrderedDict([
    ('카카오 (word)', ['몰랑이 (television series)', 'NHN벅스 (business)', '나뚜루 ()', '쿠키런: 오븐브레이크 (video game;mobile game)', '네이버 오디오 클립 ()']),
    ('카카오 (taxon)', ['커피나무 (taxon)', '코코아콩 (seed;intermediate good)', '커피콩 (seed;product)', '카카오 매스 (food ingredient;food;intermediate good)', '콜라나무속 (taxon)']),
    ('카카오 (2006~2014년 기업) (business)', ['줌 (포털 사이트) (website)', '넷츠고 ()', '줌인터넷 ()', 'SK커뮤니케이션즈 (1999~2007년 기업) ()', '드림위즈 (website)']),
    ('카카오 (기업) (enterprise;business)', ['분류:카카오 (Wikimedia category)', '카카오 (2006~2014년 기업) (business)', '줌인터넷 ()', '줌 (포털 사이트) (website)', '네이버 (기업) (enterprise;business)'])
])
>>> word2vec.find_similar_words("카카오", group=True)  # word or entity search using grouping
OrderedDict([
    ('카카오 (word)',
        OrderedDict([('television series', ['몰랑이']), ('business', ['NHN벅스']), ('', ['나뚜루', '네이버 오디오클립']), ('video game', ['쿠키런: 오븐브레이크']), ('mobile game', ['쿠키런: 오븐브레이크'])])),
    ('카카오 (taxon)',
        OrderedDict([('taxon', ['커피나무', '콜라나무속']), ('seed', ['코코아콩', '커피콩']), ('intermediate good', ['코코아콩', '카카오 매스']), ('product', ['커피콩']), ('food ingredient', ['카카오 매스']), ('food', ['카카오 매스'])])),
    ('카카오 (2006~2014년 기업) (business)',
        OrderedDict([('website', ['줌 (포털 사이트)', '드림위즈']), ('', ['넷츠고', '줌인터넷', 'SK커뮤니케이션즈 (1999~2007년 기업)'])])),
    ('카카오 (기업) (enterprise;business)',
        OrderedDict([('Wikimedia category', ['분류:카카오']), ('business', ['카카오 (2006~2014년 기업)', '네이버 (기업)']), ('', ['줌인터넷']), ('website', ['줌 (포털 사이트)']), ('enterprise', ['네이버 (기업)'])]))
])
>>> word2vec = Pororo("word2vec", lang="en")
>>> word2vec("apple")  # vector search
OrderedDict([
    ('apple (word)',
        tensor([-1.8115e-01,  1.1258e+00, -3.3197e-01,  1.6572e-01,  ..., -6.4689e-01,  6.3094e-02, -8.8036e-02, -2.1675e-01], device='cuda:0')),
    ('Apple (fruit;pome;fruit of Maloideae)',
        tensor([-3.2076e-02,  1.5557e+00,  7.0766e-01, -7.8812e-01, ..., -4.7607e-02,  3.4023e-01,  5.3378e-01, -2.7254e-01], device='cuda:0')),
    ('Muggsy Bogues (human)',
        tensor([-1.0721,  0.9283,  1.2894,  0.4695, ..., 0.1366,  0.5774,  0.0939,  0.9778], device='cuda:0')),
    ('Ariane Passenger Payload Experiment (communications satellite)',
        tensor([ 7.5558e-02, -6.4360e-01,  2.9888e-01,  1.8166e-02,  ..., -7.9919e-01,  2.8561e-01, -4.6676e-01,  2.1841e-01], device='cuda:0')),
    ('Apple Inc. (business;enterprise;NASDAQ-100;giants of the web;Dow Jones Industrial Average)',
        tensor([-0.6466,  1.1077, -0.5390,  0.5268, ..., 0.0375,  0.3269,  1.4260, -0.0849], device='cuda:0')),
    ('Apple Records (record label)',
        tensor([-0.2443,  1.3124,  0.4259,  0.8220,  ..., -0.0310,  0.6967, -1.7474,  0.4733], device='cuda:0')),
    ('Apple (album) (studio album)',
        tensor([ 0.9694,  0.7516,  0.9456, -0.2018, ..., -0.0952, -0.3208, -1.1855,  0.1000], device='cuda:0')),
    ('Apple (automobile) (motor car)',
        tensor([ 0.0273, -0.0827,  0.3302,  0.0199, ..., 0.1942,  0.2985, -0.6952, -0.2728], device='cuda:0')),
    ('Apple River (Illinois) (river)',
        tensor([-0.2683,  1.0154,  0.3947, -0.4488,  ..., 0.3037,  0.0535, -0.4189,  1.3587], device='cuda:0')),
    ('The Apple (Star Trek: The Original Series) (Star Trek episode;television series episode)',
        tensor([ 2.9253e-01,  6.0142e-01,  5.8198e-01,  1.5138e-01, ..., -4.2186e-01,  9.4759e-01, -6.0089e-02,  1.0352e+00], device='cuda:0')),
    ('The Apple (1980 film) (film)',
        tensor([ 1.0943,  0.3313,  1.5675, -1.4343,  ..., -0.2276,  0.5506, -1.5071,  1.0106], device='cuda:0'))
])
>>> word2vec.find_similar_words("apple")
OrderedDict([
    ('apple (word)', ['blackberry (word)', 'silentype (word)', 'Apple Inc. (business;enterprise;NASDAQ-100;giants of the web;Dow Jones Industrial Average)', 'paulared (word)', 'trueimage (word)']),
    ('Apple (fruit;pome;fruit of Maloideae)', ['Pear (taxon)', 'Apricot (fruit)', 'Plum (taxon)', 'Peach (taxon)', 'Cherry (fruit;drupe)']),
    ('Muggsy Bogues (human)', ['Tom Gugliotta (human)', 'Billy Owens (human)', 'David Wingate (basketball) (human)', '1995–96 Cleveland Cavaliers season (basketball team season)', ':1989–90 Denver Nuggets season (misc)']),
    ('Ariane Passenger Payload Experiment (communications satellite)', ['INSAT-3E (communications satellite)', 'INSAT-3B (communications satellite)', 'INSAT-4E (communications satellite)', 'Rohini (satellite) (artificial satellite)', 'Bhaskara (satellite) (Earth observation satellite)']),
    ('Apple Inc. (business;enterprise;NASDAQ-100;giants of the web;Dow Jones Industrial Average)', ['Steve Jobs (human)', 'IPhone (model series;smartphone)', 'apple (word)', 'IPad (model series;tablet computer)', 'IOS 7 (mobile operating system;iOS;version, edition, or translation)']),
    ('Apple Records (record label)', ['Apple Corps (business;enterprise)', 'Come and Get It: The Best of Apple Records (compilation album;Apple Records Box Set)', 'beatles (word)', 'Maybe Tomorrow (The Iveys album) (album)', 'Maybe Tomorrow (The Iveys song) (Maybe Tomorrow;single)']),
    ('Apple (album) (studio album)', ['Shine (Mother Love Bone EP) (extended play)', 'Mother Love Bone (musical group)', 'The Rockfords (album) (album)', 'Temple of the Dog (album) (album)', 'Chloe Dancer/Crown of Thorns (Shine;song;single)']),
    ('Apple (automobile) (motor car)', ['Dayton Electric (automobile manufacturer)', 'Courier Car Co (automobile manufacturer)', 'Binghamton Electric (automobile manufacturer)', 'Century (automobile) (automobile manufacturer)', 'Babcock Electric Carriage Company (business)']),
    ('Apple River (Illinois) (river)', ['Little Menominee River (stream;river)', 'Plum River (river)', 'Nl:Lijst van rivieren in Illinois (misc)', "Fr:Liste des fleuves de l'Illinois (misc)", 'Sinsinawa River (river)']),
    ('The Apple (Star Trek: The Original Series) (Star Trek episode;television series episode)', ["Mudd's Women (television film;Star Trek episode;television series episode)", 'That Which Survives (Star Trek episode;television series episode)', 'Return to Tomorrow (Star Trek episode;television series episode)', 'The Deadly Years (Star Trek episode;television series episode)', 'By Any Other Name (Star Trek episode;television series episode)']),
    ('The Apple (1980 film) (film)', ['EST and The Forum in popular culture (cultural depiction)', "The Devil's Rain (Wikimedia disambiguation page)", 'Jesus Christ Superstar (film) (film)', 'Shock Treatment (film)', 'Xanadu (film) (film)'])
])
>>> word2vec.find_similar_words("apple", top_n=3, group=True)
OrderedDict([
    ('apple (word)',
        OrderedDict([('word', ['blackberry', 'silentype']), ('business', ['Apple Inc.']), ('enterprise', ['Apple Inc.']), ('NASDAQ-100', ['Apple Inc.']), ('giants of the web', ['Apple Inc.']), ('Dow Jones Industrial Average', ['Apple Inc.'])])),
    ('Apple (fruit;pome;fruit of Maloideae)',
        OrderedDict([('taxon', ['Pear', 'Plum']), ('fruit', ['Apricot'])])),
    ('Muggsy Bogues (human)',
        OrderedDict([('human', ['Tom Gugliotta', 'Billy Owens', 'David Wingate (basketball)'])])),
    ('Ariane Passenger Payload Experiment (communications satellite)',
        OrderedDict([('communications satellite', ['INSAT-3E', 'INSAT-3B', 'INSAT-4E'])])),
    ('Apple Inc. (business;enterprise;NASDAQ-100;giants of the web;Dow Jones Industrial Average)',
        OrderedDict([('human', ['Steve Jobs']), ('model series', ['IPhone']), ('smartphone', ['IPhone']), ('word', ['apple'])])),
    ('Apple Records (record label)',
        OrderedDict([('business', ['Apple Corps']), ('enterprise', ['Apple Corps']), ('compilation album', ['Come and Get It: The Best of Apple Records']), ('Apple Records Box Set', ['Come and Get It: The Best of Apple Records']), ('word', ['beatles'])])),
    ('Apple (album) (studio album)',
        OrderedDict([('extended play', ['Shine (Mother Love Bone EP)']), ('musical group', ['Mother Love Bone']), ('album', ['The Rockfords (album)'])])),
    ('Apple (automobile) (motor car)',
        OrderedDict([('automobile manufacturer', ['Dayton Electric', 'Courier Car Co', 'Binghamton Electric'])])),
    ('Apple River (Illinois) (river)',
        OrderedDict([('stream', ['Little Menominee River']), ('river', ['Little Menominee River', 'Plum River']), ('misc', ['Nl:Lijst van rivieren in Illinois'])])),
    ('The Apple (Star Trek: The Original Series) (Star Trek episode;television series episode)',
        OrderedDict([('television film', ["Mudd's Women"]), ('Star Trek episode', ["Mudd's Women", 'That Which Survives', 'Return to Tomorrow']), ('television series episode', ["Mudd's Women", 'That Which Survives', 'Return to Tomorrow'])])),
    ('The Apple (1980 film) (film)',
        OrderedDict([('cultural depiction', ['EST and The Forum in popular culture']), ('Wikimedia disambiguation page', ["The Devil's Rain"]), ('film', ['Jesus Christ Superstar (film)'])]))
])
>>> word2vec = Pororo("word2vec", lang="ja")
>>> word2vec("リンゴ")
OrderedDict([
    ('リンゴ (word)', tensor([ 0.1310, -0.1558,  0.8368,  0.3689,  ..., 0.0253, -0.0910,  0.1332,  0.0920], device='cuda:0')),
    ('リンゴ (fruit;fruit of Maloideae;pome)', tensor([ 0.4617, -0.3032,  1.5106,  0.7717,  ..., -0.2006,  0.2382, -0.1939,  0.2378], device='cuda:0')),
    ('リンゴ (アルバム) (album)', tensor([-0.7952,  0.3122, -0.1794,  0.5237,  ...,  -0.4918, -0.1221, -0.0287,  0.6898], device='cuda:0'))
])
>>> word2vec.find_similar_words("リンゴ")
OrderedDict([
    ('リンゴ (word)', ['サクランボ (word)', 'イチゴ (word)', 'スターキングデリシャス (word)', 'ジュース (word)', 'アスパラガス (word)']),
    ('リンゴ (fruit;fruit of Maloideae;pome)', ['イチゴ (taxon)', 'モモ (taxon)', 'ブドウ (grape juice;berry)', 'ナシ (taxon)', 'サクランボ (drupe;fruit)']),
    ('リンゴ (アルバム) (album)', ['グッドナイト・ウィーン (album;studio album)', '想い出のフォトグラフ (Ringo;single;song)', '明日への願い (single)', "オール・シングス・マスト・パス (George Harrison's albums in chronological order;triple album;studio album)", 'バック・オフ・ブーガルー (Stop and Smell the Roses;single;song)'])
])
>>> word2vec.find_similar_words("リンゴ", top_n=3, group=True)
OrderedDict([
    ('リンゴ (word)',
        OrderedDict([('word', ['サクランボ', 'イチゴ', 'スターキングデリシャス'])])),
    ('リンゴ (fruit;fruit of Maloideae;pome)',
        OrderedDict([('taxon', ['イチゴ', 'モモ']), ('grape juice', ['ブドウ']), ('berry', ['ブドウ'])])),
    ('リンゴ (アルバム) (album)',
        OrderedDict([('album', ['グッドナイト・ウィーン']), ('studio album', ['グッドナイト・ウィーン']), ('Ringo', ['想い出のフォトグラフ']), ('single', ['想い出のフォトグラフ', '明日への願い']), ('song', ['想い出のフォトグラフ'])]))
])
>>> word2vec = Pororo("word2vec", lang="zh")
>>> word2vec("苹果")
OrderedDict([
    ('苹果 (word)', tensor([-0.1839,  0.5122, -0.1008,  0.0722, ..., 0.3404, -0.2146,  0.3418, -0.3336], device='cuda:0')),
    ('苹果 (fruit;fruit of Maloideae;pome)', tensor([-0.5241,  0.2368, -1.1965, -0.5834,  ..., 0.3141, -0.7297,  0.5291, -0.2308], device='cuda:0')),
    ('苹果 (电影) (film)', tensor([-0.7060,  0.0215,  0.6849,  0.4374, ..., -0.1802,  0.3402, -0.9224, -0.1029], device='cuda:0')),
    ('蘋果公司 (NASDAQ-100;giants of the web;business;enterprise;Dow Jones Industrial Average)', tensor([-0.8581,  0.2706,  0.0931,  0.1566,  ..., -0.3404, -0.6099,  0.3207, -1.0029], device='cuda:0'))
])
>>> word2vec.find_similar_words("苹果")
OrderedDict([
    ('苹果 (word)', ['苹果公司 (word)', '黑莓 (word)', '苹果皮 (word)', '树莓 (word)', 'ibookstore (word)']),
    ('苹果 (fruit;fruit of Maloideae;pome)', ['杏仁 (apricot;stone;culinary nuts)', '梨 (taxon)', '無花果 (taxon)', '葡萄 (grape juice;berry)', '桃 (taxon)']),
    ('苹果 (电影) (film)', ['盲山 (film)', '我的父親母親 (misc)', '闯关东 (电视剧) (television program)', '摇摆de婚约 (misc)', '北京遇上西雅圖 (misc)']),
    ('蘋果公司 (NASDAQ-100;giants of the web;business;enterprise;Dow Jones Industrial Average)', ['苹果公司 (word)', 'IOS 9 (iOS;operating system;mobile operating system)', '苹果公司 (misc)', 'MacBook Air (Ultrabook;computer model;MacBook;Apple Macintosh)', 'WWDC (misc)'])
])
>>> word2vec.find_similar_words("苹果", top_n=3, group=True)
OrderedDict([
    ('苹果 (word)',
        OrderedDict([('word', ['苹果公司', '黑莓', '苹果皮'])])),
    ('苹果 (fruit;fruit of Maloideae;pome)',
        OrderedDict([('apricot', ['杏仁']), ('stone', ['杏仁']), ('culinary nuts', ['杏仁']), ('taxon', ['梨', '無花果'])])),
    ('苹果 (电影) (film)',
        OrderedDict([('film', ['盲山']), ('misc', ['我的父親母親']), ('television program', ['闯关东 (电视剧)'])])),
    ('蘋果公司 (NASDAQ-100;giants of the web;business;enterprise;Dow Jones Industrial Average)',
        OrderedDict([('word', ['苹果公司']), ('iOS', ['IOS 9']), ('operating system', ['IOS 9']), ('mobile operating system', ['IOS 9']), ('misc', ['苹果公司'])]))
])
static get_available_langs()[source]
static get_available_models()[source]
load(device: str)[source]

Load user-selected task-specific model

Parameters

device (str) – device information

Returns

User-selected task-specific model

Return type

object

class pororo.tasks.word_embedding.PororoWikipedia2Vec(model, index_dir, config)[source]

Bases: pororo.tasks.utils.base.PororoSimpleBase

find_similar_words(query, top_n=5, group=False)[source]

Find words or entities similar to the input query

Parameters
  • query (str) – input query

  • top_n (int) – number of results to return

  • group (bool) – return grouped dictionary or not

Returns

word or entity search result

Return type

OrderedDict

predict(query: str, **kwargs)[source]

Search for the vectors of the words and entities matching the input query

Parameters

query (str) – input query

Returns

vector search result

Return type

OrderedDict