.. Copyright (C) 2001-2023 NLTK Project
.. For license information, see LICENSE.TXT

=================
WordNet Interface
=================

WordNet is just another NLTK corpus reader, and can be imported like this:

>>> from nltk.corpus import wordnet

For more compact code, we recommend:

>>> from nltk.corpus import wordnet as wn

-----
Words
-----

Look up a word using ``synsets()``; this function has an optional ``pos`` argument
which lets you constrain the part of speech of the word:

>>> wn.synsets('dog')
[Synset('dog.n.01'), Synset('frump.n.01'), Synset('dog.n.03'), Synset('cad.n.01'),
Synset('frank.n.02'), Synset('pawl.n.01'), Synset('andiron.n.01'), Synset('chase.v.01')]
>>> wn.synsets('dog', pos=wn.VERB)
[Synset('chase.v.01')]

The other parts of speech are ``NOUN``, ``ADJ`` and ``ADV``.
A synset is identified with a 3-part name of the form: word.pos.nn:

>>> wn.synset('dog.n.01')
Synset('dog.n.01')
>>> print(wn.synset('dog.n.01').definition())
a member of the genus Canis (probably descended from the common wolf) that has been domesticated by man since prehistoric times; occurs in many breeds
>>> len(wn.synset('dog.n.01').examples())
1
>>> print(wn.synset('dog.n.01').examples()[0])
the dog barked all night
>>> wn.synset('dog.n.01').lemmas()
[Lemma('dog.n.01.dog'), Lemma('dog.n.01.domestic_dog'), Lemma('dog.n.01.Canis_familiaris')]
>>> [str(lemma.name()) for lemma in wn.synset('dog.n.01').lemmas()]
['dog', 'domestic_dog', 'Canis_familiaris']
>>> wn.lemma('dog.n.01.dog').synset()
Synset('dog.n.01')

The WordNet corpus reader gives access to the Open Multilingual
WordNet, using ISO-639 language codes. These languages are not
loaded by default, but only lazily, when needed.

>>> wn.langs()
['eng']

>>> wn.synsets(b'\xe7\x8a\xac'.decode('utf-8'), lang='jpn')
[Synset('dog.n.01'), Synset('spy.n.01')]

>>> wn.synset('spy.n.01').lemma_names('jpn')
['いぬ', 'まわし者', 'スパイ', '回し者', '回者', '密偵',
'工作員', '廻し者', '廻者', '探', '探り', '犬', '秘密捜査員',
'諜報員', '諜者', '間者', '間諜', '隠密']

>>> sorted(wn.langs())
['als', 'arb', 'bul', 'cat', 'cmn', 'dan', 'ell', 'eng', 'eus',
'fin', 'fra', 'glg', 'heb', 'hrv', 'ind', 'isl', 'ita', 'ita_iwn',
'jpn', 'lit', 'nld', 'nno', 'nob', 'pol', 'por', 'ron', 'slk',
'slv', 'spa', 'swe', 'tha', 'zsm']

>>> wn.synset('dog.n.01').lemma_names('ita')
['Canis_familiaris', 'cane']
>>> wn.lemmas('cane', lang='ita')
[Lemma('dog.n.01.cane'), Lemma('cramp.n.02.cane'), Lemma('hammer.n.01.cane'), Lemma('bad_person.n.01.cane'),
Lemma('incompetent.n.01.cane')]
>>> sorted(wn.synset('dog.n.01').lemmas('dan'))
[Lemma('dog.n.01.hund'), Lemma('dog.n.01.k\xf8ter'),
Lemma('dog.n.01.vovhund'), Lemma('dog.n.01.vovse')]

>>> sorted(wn.synset('dog.n.01').lemmas('por'))
[Lemma('dog.n.01.cachorra'), Lemma('dog.n.01.cachorro'), Lemma('dog.n.01.cadela'), Lemma('dog.n.01.c\xe3o')]

>>> dog_lemma = wn.lemma(b'dog.n.01.c\xc3\xa3o'.decode('utf-8'), lang='por')
>>> dog_lemma
Lemma('dog.n.01.c\xe3o')
>>> dog_lemma.lang()
'por'
>>> len(list(wordnet.all_lemma_names(pos='n', lang='jpn')))
66031

The synonyms of a word are returned as a nested list of synonyms of the different senses of
the input word in the given language, since these different senses are not mutual synonyms:

>>> wn.synonyms('car')
[['auto', 'automobile', 'machine', 'motorcar'], ['railcar', 'railroad_car', 'railway_car'], ['gondola'], ['elevator_car'], ['cable_car']]
>>> wn.synonyms('coche', lang='spa')
[['auto', 'automóvil', 'carro', 'máquina', 'turismo', 'vehículo'], ['automotor', 'vagón'], ['vagón', 'vagón_de_pasajeros']]


-------
Synsets
-------

`Synset`: a set of synonyms that share a common meaning.

>>> dog = wn.synset('dog.n.01')
>>> dog.hypernyms()
[Synset('canine.n.02'), Synset('domestic_animal.n.01')]
>>> dog.hyponyms()
[Synset('basenji.n.01'), Synset('corgi.n.01'), Synset('cur.n.01'), Synset('dalmatian.n.02'), ...]
>>> dog.member_holonyms()
[Synset('canis.n.01'), Synset('pack.n.06')]
>>> dog.root_hypernyms()
[Synset('entity.n.01')]
>>> wn.synset('dog.n.01').lowest_common_hypernyms(wn.synset('cat.n.01'))
[Synset('carnivore.n.01')]

Each synset contains one or more lemmas, which represent a specific
sense of a specific word.

Note that some relations are defined by WordNet only over Lemmas:

>>> good = wn.synset('good.a.01')
>>> good.antonyms()
Traceback (most recent call last):
  File "<stdin>", line 1, in <module>
AttributeError: 'Synset' object has no attribute 'antonyms'
>>> good.lemmas()[0].antonyms()
[Lemma('bad.a.01.bad')]

The relations that are currently defined in this way are `antonyms`,
`derivationally_related_forms` and `pertainyms`.

If you know the byte offset used to identify a synset in the original
Princeton WordNet data file, you can use that to instantiate the synset
in NLTK:

>>> wn.synset_from_pos_and_offset('n', 4543158)
Synset('wagon.n.01')
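
Conversely, ``offset()`` returns the byte offset that identifies a synset
(an illustrative round trip added here, not part of the original howto):

>>> wn.synset('wagon.n.01').offset()
4543158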

Likewise, instantiate a synset from a known sense key:

>>> wn.synset_from_sense_key("driving%1:04:03::")
Synset('drive.n.06')


------
Lemmas
------

>>> eat = wn.lemma('eat.v.03.eat')
>>> eat
Lemma('feed.v.06.eat')
>>> print(eat.key())
eat%2:34:02::
>>> eat.count()
4
>>> wn.lemma_from_key(eat.key())
Lemma('feed.v.06.eat')
>>> wn.lemma_from_key(eat.key()).synset()
Synset('feed.v.06')
>>> wn.lemma_from_key('feebleminded%5:00:00:retarded:00')
Lemma('backward.s.03.feebleminded')
>>> for lemma in wn.synset('eat.v.03').lemmas():
...     print(lemma, lemma.count())
...
Lemma('feed.v.06.feed') 3
Lemma('feed.v.06.eat') 4
>>> for lemma in wn.lemmas('eat', 'v'):
...     print(lemma, lemma.count())
...
Lemma('eat.v.01.eat') 61
Lemma('eat.v.02.eat') 13
Lemma('feed.v.06.eat') 4
Lemma('eat.v.04.eat') 0
Lemma('consume.v.05.eat') 0
Lemma('corrode.v.01.eat') 0
>>> wn.lemma('jump.v.11.jump')
Lemma('jump.v.11.jump')

Lemmas can also have relations between them:

>>> vocal = wn.lemma('vocal.a.01.vocal')
>>> vocal.derivationally_related_forms()
[Lemma('vocalize.v.02.vocalize')]
>>> vocal.pertainyms()
[Lemma('voice.n.02.voice')]
>>> vocal.antonyms()
[Lemma('instrumental.a.01.instrumental')]

The three relations above exist only on lemmas, not on synsets.

-----------
Verb Frames
-----------

>>> wn.synset('think.v.01').frame_ids()
[5, 9]
>>> for lemma in wn.synset('think.v.01').lemmas():
...     print(lemma, lemma.frame_ids())
...     print(" | ".join(lemma.frame_strings()))
...
Lemma('think.v.01.think') [5, 9]
Something think something Adjective/Noun | Somebody think somebody
Lemma('think.v.01.believe') [5, 9]
Something believe something Adjective/Noun | Somebody believe somebody
Lemma('think.v.01.consider') [5, 9]
Something consider something Adjective/Noun | Somebody consider somebody
Lemma('think.v.01.conceive') [5, 9]
Something conceive something Adjective/Noun | Somebody conceive somebody
>>> wn.synset('stretch.v.02').frame_ids()
[8]
>>> for lemma in wn.synset('stretch.v.02').lemmas():
...     print(lemma, lemma.frame_ids())
...     print(" | ".join(lemma.frame_strings()))
...
Lemma('stretch.v.02.stretch') [8, 2]
Somebody stretch something | Somebody stretch
Lemma('stretch.v.02.extend') [8]
Somebody extend something


----------
Similarity
----------

>>> dog = wn.synset('dog.n.01')
>>> cat = wn.synset('cat.n.01')

>>> hit = wn.synset('hit.v.01')
>>> slap = wn.synset('slap.v.01')


``synset1.path_similarity(synset2):``
Return a score denoting how similar two word senses are, based on the
shortest path that connects the senses in the is-a (hypernym/hyponym)
taxonomy. The score is in the range 0 to 1. By default, a fake root node
is added for verbs, so in cases where a path previously could not be
found (and None was returned), a value is now returned. The old behavior
can be restored by setting ``simulate_root`` to False.
A score of 1 represents identity, i.e. comparing a sense with itself
will return 1.

>>> dog.path_similarity(cat)
0.2...

>>> hit.path_similarity(slap)
0.142...

>>> wn.path_similarity(hit, slap)
0.142...

>>> print(hit.path_similarity(slap, simulate_root=False))
None

>>> print(wn.path_similarity(hit, slap, simulate_root=False))
None

``synset1.lch_similarity(synset2):``
Leacock-Chodorow Similarity:
Return a score denoting how similar two word senses are, based on the
shortest path that connects the senses (as above) and the maximum depth
of the taxonomy in which the senses occur. The relationship is given
as -log(p/2d) where p is the shortest path length and d the taxonomy
depth.

>>> dog.lch_similarity(cat)
2.028...

>>> hit.lch_similarity(slap)
1.312...

>>> wn.lch_similarity(hit, slap)
1.312...

>>> print(hit.lch_similarity(slap, simulate_root=False))
None

>>> print(wn.lch_similarity(hit, slap, simulate_root=False))
None
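
As an added illustrative check (not part of the original howto), the 2.028
score above can be reconstructed from the dog-cat path length and the depth
of the noun taxonomy; NLTK uses the edge count plus one for p, and d is 19
for the WordNet 3.0 noun hierarchy:

>>> from math import log
>>> round(-log((dog.shortest_path_distance(cat) + 1) / (2.0 * 19)), 3)
2.028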

``synset1.wup_similarity(synset2):``
Wu-Palmer Similarity:
Return a score denoting how similar two word senses are, based on the
depth of the two senses in the taxonomy and that of their Least Common
Subsumer (most specific ancestor node). Note that at this time the
scores given do **not** always agree with those given by Pedersen's Perl
implementation of Wordnet Similarity.

The LCS does not necessarily feature in the shortest path connecting the
two senses, as it is by definition the common ancestor deepest in the
taxonomy, not closest to the two senses. Typically, however, it will so
feature. Where multiple candidates for the LCS exist, the one whose
shortest path to the root node is the longest will be selected. Where
the LCS has multiple paths to the root, the longer path is used for
the purposes of the calculation.

>>> dog.wup_similarity(cat)
0.857...

>>> hit.wup_similarity(slap)
0.25

>>> wn.wup_similarity(hit, slap)
0.25

>>> print(hit.wup_similarity(slap, simulate_root=False))
None

>>> print(wn.wup_similarity(hit, slap, simulate_root=False))
None

``wordnet_ic``
Information Content:
Load an information content file from the wordnet_ic corpus.

>>> from nltk.corpus import wordnet_ic
>>> brown_ic = wordnet_ic.ic('ic-brown.dat')
>>> semcor_ic = wordnet_ic.ic('ic-semcor.dat')

Or you can create an information content dictionary from a corpus (or
anything that has a words() method).

>>> from nltk.corpus import genesis
>>> genesis_ic = wn.ic(genesis, False, 0.0)
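
The two positional arguments above are ``weight_senses_equally`` and
``smoothing``; assuming the current ``ic()`` signature, the same call can be
spelled out with keywords (an equivalent restatement added for clarity):

>>> genesis_ic = wn.ic(genesis, weight_senses_equally=False, smoothing=0.0)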

``synset1.res_similarity(synset2, ic):``
Resnik Similarity:
Return a score denoting how similar two word senses are, based on the
Information Content (IC) of the Least Common Subsumer (most specific
ancestor node). Note that for any similarity measure that uses
information content, the result is dependent on the corpus used to
generate the information content and the specifics of how the
information content was created.

>>> dog.res_similarity(cat, brown_ic)
7.911...
>>> dog.res_similarity(cat, genesis_ic)
7.204...

``synset1.jcn_similarity(synset2, ic):``
Jiang-Conrath Similarity:
Return a score denoting how similar two word senses are, based on the
Information Content (IC) of the Least Common Subsumer (most specific
ancestor node) and that of the two input Synsets. The relationship is
given by the equation 1 / (IC(s1) + IC(s2) - 2 * IC(lcs)).

>>> dog.jcn_similarity(cat, brown_ic)
0.449...
>>> dog.jcn_similarity(cat, genesis_ic)
0.285...
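
As an added cross-check (not part of the original howto), the Jiang-Conrath
score can be rebuilt from ``information_content`` and the Resnik score, since
IC(lcs) is exactly what ``res_similarity`` returns:

>>> from nltk.corpus.reader.wordnet import information_content
>>> ic_dog = information_content(dog, brown_ic)
>>> ic_cat = information_content(cat, brown_ic)
>>> jcn = 1.0 / (ic_dog + ic_cat - 2 * dog.res_similarity(cat, brown_ic))
>>> abs(jcn - dog.jcn_similarity(cat, brown_ic)) < 1e-6
True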

``synset1.lin_similarity(synset2, ic):``
Lin Similarity:
Return a score denoting how similar two word senses are, based on the
Information Content (IC) of the Least Common Subsumer (most specific
ancestor node) and that of the two input Synsets. The relationship is
given by the equation 2 * IC(lcs) / (IC(s1) + IC(s2)).

>>> dog.lin_similarity(cat, semcor_ic)
0.886...
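
Similarly (an added illustrative check), the Lin score equals twice the
Resnik score divided by the summed IC of the two synsets:

>>> from nltk.corpus.reader.wordnet import information_content
>>> ic_dog = information_content(dog, semcor_ic)
>>> ic_cat = information_content(cat, semcor_ic)
>>> lin = 2.0 * dog.res_similarity(cat, semcor_ic) / (ic_dog + ic_cat)
>>> abs(lin - dog.lin_similarity(cat, semcor_ic)) < 1e-6
True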


---------------------
Access to all Synsets
---------------------

Iterate over all the noun synsets:

>>> for synset in list(wn.all_synsets('n'))[:10]:
...     print(synset)
...
Synset('entity.n.01')
Synset('physical_entity.n.01')
Synset('abstraction.n.06')
Synset('thing.n.12')
Synset('object.n.01')
Synset('whole.n.02')
Synset('congener.n.03')
Synset('living_thing.n.01')
Synset('organism.n.01')
Synset('benthos.n.02')

Get all the synsets of a word, possibly restricted by POS:

>>> wn.synsets('dog')
[Synset('dog.n.01'), Synset('frump.n.01'), Synset('dog.n.03'), Synset('cad.n.01'), ...]
>>> wn.synsets('dog', pos='v')
[Synset('chase.v.01')]

Walk through the noun synsets looking at their hypernyms:

>>> from itertools import islice
>>> for synset in islice(wn.all_synsets('n'), 5):
...     print(synset, synset.hypernyms())
...
Synset('entity.n.01') []
Synset('physical_entity.n.01') [Synset('entity.n.01')]
Synset('abstraction.n.06') [Synset('entity.n.01')]
Synset('thing.n.12') [Synset('physical_entity.n.01')]
Synset('object.n.01') [Synset('physical_entity.n.01')]


------
Morphy
------

Look up forms not in WordNet, with the help of Morphy:

>>> wn.morphy('denied', wn.NOUN)
>>> print(wn.morphy('denied', wn.VERB))
deny
>>> wn.synsets('denied', wn.NOUN)
[]
>>> wn.synsets('denied', wn.VERB)
[Synset('deny.v.01'), Synset('deny.v.02'), Synset('deny.v.03'), Synset('deny.v.04'),
Synset('deny.v.05'), Synset('traverse.v.03'), Synset('deny.v.07')]

Morphy uses a combination of inflectional ending rules and exception
lists to handle a variety of different possibilities:

>>> print(wn.morphy('dogs'))
dog
>>> print(wn.morphy('churches'))
church
>>> print(wn.morphy('aardwolves'))
aardwolf
>>> print(wn.morphy('abaci'))
abacus
>>> print(wn.morphy('book', wn.NOUN))
book
>>> wn.morphy('hardrock', wn.ADV)
>>> wn.morphy('book', wn.ADJ)
>>> wn.morphy('his', wn.NOUN)
>>>
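
When Morphy cannot find a base form for the requested part of speech, it
returns None, which the interactive prompt simply does not display; printing
the result makes this explicit (an added illustration):

>>> print(wn.morphy('book', wn.ADJ))
None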

---------------
Synset Closures
---------------

Compute transitive closures of synsets

>>> dog = wn.synset('dog.n.01')
>>> hypo = lambda s: s.hyponyms()
>>> hyper = lambda s: s.hypernyms()
>>> list(dog.closure(hypo, depth=1)) == dog.hyponyms()
True
>>> list(dog.closure(hyper, depth=1)) == dog.hypernyms()
True
>>> list(dog.closure(hypo))
[Synset('basenji.n.01'), Synset('corgi.n.01'), Synset('cur.n.01'),
Synset('dalmatian.n.02'), Synset('great_pyrenees.n.01'),
Synset('griffon.n.02'), Synset('hunting_dog.n.01'), Synset('lapdog.n.01'),
Synset('leonberg.n.01'), Synset('mexican_hairless.n.01'),
Synset('newfoundland.n.01'), Synset('pooch.n.01'), Synset('poodle.n.01'), ...]
>>> list(dog.closure(hyper))
[Synset('canine.n.02'), Synset('domestic_animal.n.01'), Synset('carnivore.n.01'), Synset('animal.n.01'),
Synset('placental.n.01'), Synset('organism.n.01'), Synset('mammal.n.01'), Synset('living_thing.n.01'),
Synset('vertebrate.n.01'), Synset('whole.n.02'), Synset('chordate.n.01'), Synset('object.n.01'),
Synset('physical_entity.n.01'), Synset('entity.n.01')]
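
As an added illustrative check, the full hypernym closure contains exactly the
synsets that appear on the hypernym paths of ``dog``, excluding ``dog`` itself:

>>> set(dog.closure(hyper)) == {s for path in dog.hypernym_paths() for s in path} - {dog}
True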


----------------
Regression Tests
----------------

Bug 85: morphy returns the base form of a word, if its input is given
as a base form for a POS for which that word is not defined:

>>> wn.synsets('book', wn.NOUN)
[Synset('book.n.01'), Synset('book.n.02'), Synset('record.n.05'), Synset('script.n.01'), Synset('ledger.n.01'), Synset('book.n.06'), Synset('book.n.07'), Synset('koran.n.01'), Synset('bible.n.01'), Synset('book.n.10'), Synset('book.n.11')]
>>> wn.synsets('book', wn.ADJ)
[]
>>> wn.morphy('book', wn.NOUN)
'book'
>>> wn.morphy('book', wn.ADJ)
>>>

Bug 160: wup_similarity breaks when the two synsets have no common hypernym

>>> t = wn.synsets('picasso')[0]
>>> m = wn.synsets('male')[1]
>>> t.wup_similarity(m)
0.631...

Issue #2278: wup_similarity not commutative when comparing a noun and a verb.
Patch #2650 resolved this error; as a result, the following use of
wup_similarity no longer returns None.

>>> t = wn.synsets('titan')[1]
>>> s = wn.synsets('say', wn.VERB)[0]
>>> t.wup_similarity(s)
0.142...

Bug 21: "instance of" not included in LCS (very similar to bug 160)

>>> a = wn.synsets("writings")[0]
>>> b = wn.synsets("scripture")[0]
>>> brown_ic = wordnet_ic.ic('ic-brown.dat')
>>> a.jcn_similarity(b, brown_ic)
0.175...

Bug 221: Verb root IC is zero

>>> from nltk.corpus.reader.wordnet import information_content
>>> s = wn.synsets('say', wn.VERB)[0]
>>> information_content(s, brown_ic)
4.623...

Bug 161: Comparison between WN keys/lemmas should not be case sensitive

>>> k = wn.synsets("jefferson")[0].lemmas()[0].key()
>>> wn.lemma_from_key(k)
Lemma('jefferson.n.01.Jefferson')
>>> wn.lemma_from_key(k.upper())
Lemma('jefferson.n.01.Jefferson')

Bug 99: WordNet root_hypernyms gives incorrect results

>>> from nltk.corpus import wordnet as wn
>>> for s in wn.all_synsets(wn.NOUN):
...     if s.root_hypernyms()[0] != wn.synset('entity.n.01'):
...         print(s, s.root_hypernyms())
...
>>>

Bug 382: JCN Division by zero error

>>> tow = wn.synset('tow.v.01')
>>> shlep = wn.synset('shlep.v.02')
>>> from nltk.corpus import wordnet_ic
>>> brown_ic = wordnet_ic.ic('ic-brown.dat')
>>> tow.jcn_similarity(shlep, brown_ic)
1...e+300

Bug 428: Depth is zero for instance nouns

>>> s = wn.synset("lincoln.n.01")
>>> s.max_depth() > 0
True

Bug 429: Information content smoothing used old reference to all_synsets

>>> genesis_ic = wn.ic(genesis, True, 1.0)

Bug 430: all_synsets used wrong pos lookup when synsets were cached

>>> for ii in wn.all_synsets(): pass
>>> for ii in wn.all_synsets(): pass

Bug 470: shortest_path_distance ignored instance hypernyms

>>> google = wordnet.synsets("google")[0]
>>> earth = wordnet.synsets("earth")[0]
>>> google.wup_similarity(earth)
0.1...

Bug 484: similarity metrics returned -1 instead of None for no LCS

>>> t = wn.synsets('fly', wn.VERB)[0]
>>> s = wn.synsets('say', wn.VERB)[0]
>>> print(s.shortest_path_distance(t))
None
>>> print(s.path_similarity(t, simulate_root=False))
None
>>> print(s.lch_similarity(t, simulate_root=False))
None
>>> print(s.wup_similarity(t, simulate_root=False))
None

Bug 427: "pants" does not return all the senses it should

>>> from nltk.corpus import wordnet
>>> wordnet.synsets("pants",'n')
[Synset('bloomers.n.01'), Synset('pant.n.01'), Synset('trouser.n.01'), Synset('gasp.n.01')]

Bug 482: Some nouns not being lemmatised by WordNetLemmatizer().lemmatize

>>> from nltk.stem.wordnet import WordNetLemmatizer
>>> WordNetLemmatizer().lemmatize("eggs", pos="n")
'egg'
>>> WordNetLemmatizer().lemmatize("legs", pos="n")
'leg'

Bug 284: instance hypernyms not used in similarity calculations

>>> wn.synset('john.n.02').lch_similarity(wn.synset('dog.n.01'))
1.335...
>>> wn.synset('john.n.02').wup_similarity(wn.synset('dog.n.01'))
0.571...
>>> wn.synset('john.n.02').res_similarity(wn.synset('dog.n.01'), brown_ic)
2.224...
>>> wn.synset('john.n.02').jcn_similarity(wn.synset('dog.n.01'), brown_ic)
0.075...
>>> wn.synset('john.n.02').lin_similarity(wn.synset('dog.n.01'), brown_ic)
0.252...
>>> wn.synset('john.n.02').hypernym_paths()
[[Synset('entity.n.01'), ..., Synset('john.n.02')]]

Issue 541: add domains to wordnet

>>> wn.synset('code.n.03').topic_domains()
[Synset('computer_science.n.01')]
>>> wn.synset('pukka.a.01').region_domains()
[Synset('india.n.01')]
>>> wn.synset('freaky.a.01').usage_domains()
[Synset('slang.n.02')]

Issue 629: wordnet failures when Python is run with -O optimizations

>>> # Run the test suite with python -O to check this
>>> wn.synsets("brunch")
[Synset('brunch.n.01'), Synset('brunch.v.01')]

Issue 395: wordnet returns incorrect result for lowest_common_hypernyms of chef and policeman

>>> wn.synset('policeman.n.01').lowest_common_hypernyms(wn.synset('chef.n.01'))
[Synset('person.n.01')]

Bug https://github.com/nltk/nltk/issues/1641: Non-English lemmas containing capital letters cannot be looked up using wordnet.lemmas() or wordnet.synsets()

>>> wn.lemmas('Londres', lang='fra')
[Lemma('united_kingdom.n.01.Londres'), Lemma('london.n.01.Londres'), Lemma('london.n.02.Londres')]
>>> wn.lemmas('londres', lang='fra')
[Lemma('united_kingdom.n.01.Londres'), Lemma('london.n.01.Londres'), Lemma('london.n.02.Londres')]

Patch https://github.com/nltk/nltk/pull/2065: adding 3 functions (relations) to the WordNet class

>>> wn.synsets("computer_science")[0].in_topic_domains()[2]
Synset('access_time.n.01')
>>> wn.synsets("France")[0].in_region_domains()[18]
Synset('french.n.01')
>>> wn.synsets("slang")[1].in_usage_domains()[18]
Synset('can-do.s.01')

Issue 2721: WordNetCorpusReader.ic() does not add smoothing to N

>>> class FakeCorpus:
...     def words(self): return ['word']
...
>>> fake_ic = wn.ic(FakeCorpus(), False, 1.0)
>>> word = wn.synset('word.n.01')
>>> information_content(word, fake_ic) > 0
True

Issue 3077: Incorrect part-of-speech filtering in all_synsets

>>> next(wn.all_synsets(pos="a"))
Synset('able.a.01')
>>> next(wn.all_synsets(pos="s"))
Synset('emergent.s.02')
>>> wn.add_omw()
>>> next(wn.all_synsets(lang="hrv"))
Synset('able.a.01')
>>> next(wn.all_synsets(lang="hrv", pos="n"))
Synset('entity.n.01')
>>> next(wn.all_synsets(lang="hrv", pos="v"))
Synset('breathe.v.01')
>>> next(wn.all_synsets(lang="hrv", pos="s"))
Synset('ideological.s.02')
>>> next(wn.all_synsets(lang="hrv", pos="a"))
Synset('able.a.01')


------------------------------------------------
Endlessness vs. intractability in relation trees
------------------------------------------------

1. Endlessness
--------------

Until NLTK v3.5, the ``tree()`` function looped forever on symmetric
relations (verb_groups, attributes, and most also_sees). In the current
version, ``tree()`` detects and discards these cycles:

>>> from pprint import pprint
>>> pprint(wn.synset('bound.a.01').tree(lambda s:s.also_sees()))
[Synset('bound.a.01'),
 [Synset('unfree.a.02'),
  [Synset('confined.a.02'),
   [Synset('restricted.a.01'), [Synset('classified.a.02')]]],
  [Synset('dependent.a.01')],
  [Synset('restricted.a.01'),
   [Synset('classified.a.02')],
   [Synset('confined.a.02')]]]]

Specifying the "cut_mark" parameter increases verbosity, so that the cycles
are mentioned in the output, together with the level where they occur:

>>> pprint(wn.synset('bound.a.01').tree(lambda s:s.also_sees(),cut_mark='...'))
[Synset('bound.a.01'),
 [Synset('unfree.a.02'),
  "Cycle(Synset('bound.a.01'),-3,...)",
  [Synset('confined.a.02'),
   [Synset('restricted.a.01'),
    [Synset('classified.a.02')],
    "Cycle(Synset('confined.a.02'),-5,...)",
    "Cycle(Synset('unfree.a.02'),-5,...)"],
   "Cycle(Synset('unfree.a.02'),-4,...)"],
  [Synset('dependent.a.01'), "Cycle(Synset('unfree.a.02'),-4,...)"],
  [Synset('restricted.a.01'),
   [Synset('classified.a.02')],
   [Synset('confined.a.02'),
    "Cycle(Synset('restricted.a.01'),-5,...)",
    "Cycle(Synset('unfree.a.02'),-5,...)"],
   "Cycle(Synset('unfree.a.02'),-4,...)"]]]


2. Intractability
-----------------

However, even after discarding the infinite cycles, some trees can remain
intractable, due to combinatorial explosion in a relation. This happens in
WordNet, because the ``also_sees()`` relation has a big Strongly Connected
Component (_SCC_) consisting of 758 synsets, where any member node is
transitively connected by the same relation to all other members of the
same SCC. This produces intractable relation trees for each of these 758
synsets, i.e. trees that are too big to compute or display on any computer.

For example, the synset 'concrete.a.01' is a member of the largest SCC,
so its ``also_sees()`` tree is intractable, and can normally only be handled
by limiting the ``depth`` parameter to display a small number of levels:

>>> from pprint import pprint
>>> pprint(wn.synset('concrete.a.01').tree(lambda s:s.also_sees(),cut_mark='...',depth=2))
[Synset('concrete.a.01'),
 [Synset('practical.a.01'),
  "Cycle(Synset('concrete.a.01'),0,...)",
  [Synset('possible.a.01'), '...'],
  [Synset('realistic.a.01'), '...'],
  [Synset('serviceable.a.01'), '...']],
 [Synset('real.a.01'),
  "Cycle(Synset('concrete.a.01'),0,...)",
  [Synset('genuine.a.01'), '...'],
  [Synset('realistic.a.01'), '...'],
  [Synset('sincere.a.01'), '...']],
 [Synset('tangible.a.01'), "Cycle(Synset('concrete.a.01'),0,...)"]]
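
To get a feel for the size of that component, here is a small breadth-first
sketch (added for illustration, not part of the original tests) that counts
the synsets reachable from 'concrete.a.01' through ``also_sees()``; since the
synset belongs to an SCC of 758 members, the count runs into the hundreds:

>>> from collections import deque
>>> start = wn.synset('concrete.a.01')
>>> seen, queue = {start}, deque([start])
>>> while queue:
...     for nxt in queue.popleft().also_sees():
...         if nxt not in seen:
...             seen.add(nxt)
...             queue.append(nxt)
...
>>> len(seen) > 700
True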


2.1 First solution: ``acyclic_tree()``
......................................

On the other hand, the new ``acyclic_tree()`` function can also handle the
intractable cases. The ``also_sees()`` acyclic tree of 'concrete.a.01' is
several hundred lines long, so here is a simpler example, concerning a much
smaller SCC: with only five members, the SCC that includes 'bound.a.01' is
tractable with the normal ``tree()`` function, as seen above.

But while ``tree()`` only prunes redundancy within local branches, ``acyclic_tree()``
prunes the tree globally, thus discarding any additional redundancy, and
produces a tree that includes all reachable nodes (i.e., a **spanning tree**).
This tree is **minimal** because it includes the reachable nodes only once,
but it is not necessarily a **Minimum Spanning Tree** (MST), because the
depth-first search strategy does not guarantee that nodes are reached
through the lowest number of links (as breadth-first search would).

>>> pprint(wn.synset('bound.a.01').acyclic_tree(lambda s:s.also_sees()))
[Synset('bound.a.01'),
 [Synset('unfree.a.02'),
  [Synset('confined.a.02'),
   [Synset('restricted.a.01'), [Synset('classified.a.02')]]],
  [Synset('dependent.a.01')]]]

Again, specifying the ``cut_mark`` parameter increases verbosity, so that the
cycles are mentioned in the output, together with the level where they occur:

>>> pprint(wn.synset('bound.a.01').acyclic_tree(lambda s:s.also_sees(),cut_mark='...'))
[Synset('bound.a.01'),
 [Synset('unfree.a.02'),
  "Cycle(Synset('bound.a.01'),-3,...)",
  [Synset('confined.a.02'),
   [Synset('restricted.a.01'),
    [Synset('classified.a.02')],
    "Cycle(Synset('confined.a.02'),-5,...)",
    "Cycle(Synset('unfree.a.02'),-5,...)"],
   "Cycle(Synset('unfree.a.02'),-4,...)"],
  [Synset('dependent.a.01'), "Cycle(Synset('unfree.a.02'),-4,...)"],
  "Cycle(Synset('restricted.a.01'),-3,...)"]]


2.2 Better solution: mst()
..........................

A Minimum Spanning Tree (MST) spans all the nodes of a relation subgraph once,
while guaranteeing that each node is reached through the shortest path possible.
In unweighted relation graphs like WordNet, an MST can be computed very efficiently
in linear time, using Breadth-First Search (BFS). Like acyclic_tree(), the new
``unweighted_minimum_spanning_tree()`` function (imported in the Wordnet
module as ``mst``) handles intractable trees, such as the example discussed above:
``wn.synset('concrete.a.01').mst(lambda s:s.also_sees())``.

But, while the also_sees() acyclic_tree of 'bound.a.01' reaches
'classified.a.02' through four links, using depth-first search as seen above
(bound.a.01 > unfree.a.02 > confined.a.02 > restricted.a.01 > classified.a.02),
in the following MST, the path to 'classified.a.02' is the shortest possible,
consisting of only three links (bound.a.01 > unfree.a.02 > restricted.a.01 >
classified.a.02):

>>> pprint(wn.synset('bound.a.01').mst(lambda s:s.also_sees()))
[Synset('bound.a.01'),
 [Synset('unfree.a.02'),
  [Synset('confined.a.02')],
  [Synset('dependent.a.01')],
  [Synset('restricted.a.01'), [Synset('classified.a.02')]]]]


------------------------------------
Loading alternative Wordnet versions
------------------------------------

>>> print("Wordnet {}".format(wn.get_version()))
Wordnet 3.0

>>> from nltk.corpus import wordnet31 as wn31
>>> print("Wordnet {}".format(wn31.get_version()))
Wordnet 3.1

>>> print(wn.synset('restrain.v.01').hyponyms())
[Synset('confine.v.03'), Synset('control.v.02'), Synset('hold.v.36'), Synset('inhibit.v.04')]

>>> print(wn31.synset('restrain.v.01').hyponyms())
[Synset('enchain.v.01'), Synset('fetter.v.01'), Synset('ground.v.02'), Synset('impound.v.02'), Synset('pen_up.v.01'), Synset('pinion.v.01'), Synset('pound.v.06'), Synset('tie_down.v.01')]

>>> print(wn31.synset('restrain.v.04').hyponyms())
[Synset('baffle.v.03'), Synset('confine.v.02'), Synset('control.v.02'), Synset('hold.v.36'), Synset('rule.v.07'), Synset('swallow.v.06'), Synset('wink.v.04')]


-------------
Teardown test
-------------

>>> from nltk.corpus import wordnet
>>> wordnet._unload()