#!/usr/bin/env python3
# Student name: NAME
# Student number: NUMBER
# UTORid: ID
# Copyright By PowCoder代写 加微信 powcoder
from collections import Counter
from typing import *
from nltk.corpus import wordnet as wn
from nltk.corpus.reader.wordnet import Synset
import numpy as np
from numpy.linalg import norm
from q0 import stop_tokenize
from wsd import evaluate, load_eval, load_word2vec, WSDToken
def mfs(sentence: Sequence[WSDToken], word_index: int) -> Synset:
    """Most frequent sense of a word.

    **IMPORTANT**: when looking up the word in WordNet, make sure you use the
    lemma of the word, *not* the wordform. See the WSDToken class in wsd.py
    for the relevant class attributes.

    Args:
        sentence (list of WSDToken): The sentence containing the word to be
            disambiguated.
        word_index (int): The index of the target word in the sentence.

    Returns:
        Synset: The most frequent sense for the given word.

    Raises:
        NotImplementedError: Always, until this stub is implemented.
    """
    raise NotImplementedError
def lesk(sentence: Sequence[WSDToken], word_index: int) -> Synset:
    """Simplified Lesk algorithm.

    **IMPORTANT**: when looking up the word in WordNet, make sure you use the
    lemma of the word, *not* the wordform. For other cases, such as gathering
    the context words, use the wordform. See the WSDToken class in wsd.py for
    the relevant class attributes.

    Args:
        sentence (list of WSDToken): The sentence containing the word to be
            disambiguated.
        word_index (int): The index of the target word in the sentence.

    Returns:
        Synset: The prediction of the correct sense for the given word.

    Raises:
        NotImplementedError: Always, until this stub is implemented.
    """
    raise NotImplementedError
def lesk_ext(sentence: Sequence[WSDToken], word_index: int) -> Synset:
    """Extended Lesk algorithm.

    **IMPORTANT**: when looking up the word in WordNet, make sure you use the
    lemma of the word, *not* the wordform. For other cases, such as gathering
    the context words, use the wordform. See the WSDToken class in wsd.py for
    the relevant class attributes.

    Args:
        sentence (list of WSDToken): The sentence containing the word to be
            disambiguated.
        word_index (int): The index of the target word in the sentence.

    Returns:
        Synset: The prediction of the correct sense for the given word.

    Raises:
        NotImplementedError: Always, until this stub is implemented.
    """
    raise NotImplementedError
def lesk_cos(sentence: Sequence[WSDToken], word_index: int) -> Synset:
    """Extended Lesk algorithm using cosine similarity.

    **IMPORTANT**: when looking up the word in WordNet, make sure you use the
    lemma of the word, *not* the wordform. For other cases, such as gathering
    the context words, use the wordform. See the WSDToken class in wsd.py for
    the relevant class attributes.

    Args:
        sentence (list of WSDToken): The sentence containing the word to be
            disambiguated.
        word_index (int): The index of the target word in the sentence.

    Returns:
        Synset: The prediction of the correct sense for the given word.

    Raises:
        NotImplementedError: Always, until this stub is implemented.
    """
    raise NotImplementedError
def lesk_cos_onesided(sentence: Sequence[WSDToken], word_index: int) -> Synset:
    """Extended Lesk algorithm using one-sided cosine similarity.

    **IMPORTANT**: when looking up the word in WordNet, make sure you use the
    lemma of the word, *not* the wordform. For other cases, such as gathering
    the context words, use the wordform. See the WSDToken class in wsd.py for
    the relevant class attributes.

    Args:
        sentence (list of WSDToken): The sentence containing the word to be
            disambiguated.
        word_index (int): The index of the target word in the sentence.

    Returns:
        Synset: The prediction of the correct sense for the given word.

    Raises:
        NotImplementedError: Always, until this stub is implemented.
    """
    raise NotImplementedError
def lesk_w2v(sentence: Sequence[WSDToken], word_index: int,
             vocab: Mapping[str, int], word2vec: np.ndarray) -> Synset:
    """Extended Lesk algorithm using word2vec-based cosine similarity.

    **IMPORTANT**: when looking up the word in WordNet, make sure you use the
    lemma of the word, *not* the wordform. For other cases, such as gathering
    the context words, use the wordform. See the WSDToken class in wsd.py for
    the relevant class attributes.

    To look up the vector for a word, first you need to look up the word's
    index in the word2vec matrix, which you can then use to get the specific
    vector. More directly, you can look up a string s using word2vec[vocab[s]].

    To look up the vector for a *single word*, use the following rules:

    * If the word exists in the vocabulary, then return the corresponding
      vector.
    * Otherwise, if the lower-cased version of the word exists in the
      vocabulary, return the corresponding vector for the lower-cased version.
    * Otherwise, return a vector of all zeros. You'll need to ensure that
      this vector has the same dimensions as the word2vec vectors.

    But some wordforms are actually multi-word expressions and contain spaces.
    word2vec can handle multi-word expressions, but uses the underscore
    character to separate words rather than spaces. So, to look up a string
    that has a space in it, use the following rules:

    * If the string has a space in it, replace the space characters with
      underscore characters and then follow the above steps on the new string
      (i.e., try the string as-is, then the lower-cased version if that
      fails), but do not return the zero vector if the lookup fails.
    * If the version with underscores doesn't yield anything, split the
      string into multiple words according to the spaces and look each word
      up individually according to the rules in the above paragraph (i.e.,
      as-is, lower-cased, then zero). Take the mean of the vectors for each
      word and return that.

    Recursion will make for more compact code for these.

    Args:
        sentence (list of WSDToken): The sentence containing the word to be
            disambiguated.
        word_index (int): The index of the target word in the sentence.
        vocab (dictionary mapping str to int): The word2vec vocabulary,
            mapping strings to their respective indices in the word2vec array.
        word2vec (np.ndarray): The word2vec word vectors, as a VxD matrix,
            where V is the vocabulary and D is the dimensionality of the word
            vectors.

    Returns:
        Synset: The prediction of the correct sense for the given word.

    Raises:
        NotImplementedError: Always, until this stub is implemented.
    """
    raise NotImplementedError
if __name__ == '__main__':
    # Seed NumPy's global random generator so repeated runs are reproducible.
    np.random.seed(1234)
    eval_data = load_eval()

    # These WSD functions take only (sentence, word_index), so they can all
    # be passed to evaluate() the same way.
    for wsd_func in [mfs, lesk, lesk_ext, lesk_cos, lesk_cos_onesided]:
        evaluate(eval_data, wsd_func)

    # lesk_w2v also needs the values returned by load_word2vec(), which are
    # unpacked and forwarded as extra positional arguments.
    evaluate(eval_data, lesk_w2v, *load_word2vec())
# 程序代写 CS代考 加微信: powcoder QQ: 1823890830 Email: powcoder@163.com