WSD ELMo
'''
data from: http://lcl.uniroma1.it/wsdeval/

(0.1) Run convert_all_wsd_datasets to convert the raw datasets to JsonFile format.
(0.2) Run remove_semeval2007_gold to remove semeval2007 from the gold data.
(1) Dump embeddings for the datasets to hdf5 files (see the dump_embeddings
    sketch below).
(2) Compute the average word embedding for each sense in the training data
    with generate_sense_vectors.
(3) Write evaluation files with generate_evaluation_files.
(4) Concatenate the predicted files:
    cat *level1* > all.predicted.level1.bidirectional_2x4096_512_cnn2048.txt
(5) Evaluate -- from ~/data/wsd/WSD_Evaluation_Framework/Evaluation_Datasets run:
    java Scorer ALL/ALL_except_semeval2007.gold.key.txt ~/data/wsd/converted/all.predicted.level2.bidirectional_2x4096_512_cnn2048.txt
'''
# The data format after conversion is JsonFile, one sentence per line.
# Each sentence is a list of tokens; each token is a dict with various attributes
# including lemma, pos, token, and senses / id if it's annotated.
# Each dataset is named X.json, e.g. semcor.json
#
# hdf5 files of embeddings are keyed by str(line_number_in_training_file),
# with shape (num_layers, num_tokens, dim) embeddings for each sentence.
# They are named X.LM_NAME.hdf5, e.g. semcor.bidirectional_2x4096_512_cnn2048_skip.hdf5.
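# As a quick illustration of that hdf5 layout (a sketch only -- the file name
# below is the example name from the comment above, adjust to whichever
# LM_NAME you actually dumped):
#
#   with h5py.File('semcor.bidirectional_2x4096_512_cnn2048_skip.hdf5', 'r') as f:
#       emb = f['0'][...]   # embeddings for the first sentence in semcor.json
#       # emb.shape == (num_layers, num_tokens, dim), e.g. (3, n_tokens, 1024) for ELMo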
import os
import json

import h5py
import numpy as np
class JsonFile(object):
    '''
    A flat text file where each line is one json object.

    # to read through a file line by line
    with JsonFile('file.json', 'r') as fin:
        for line in fin:
            # line is the deserialized json object
            pass

    # to write a file object by object
    with JsonFile('file.json', 'w') as fout:
        fout.write({'key1': 5, 'key2': 'token'})
        fout.write({'key1': 0, 'key2': 'the'})
    '''
    def __init__(self, *args, **kwargs):
        self._args = args
        self._kwargs = kwargs

    def __iter__(self):
        for line in self._file:
            yield json.loads(line)

    def write(self, item):
        item_as_json = json.dumps(item, ensure_ascii=False)
        encoded = '{0}\n'.format(item_as_json)
        self._file.write(encoded)

    def __enter__(self):
        self._file = open(*self._args, **self._kwargs)
        self._file.__enter__()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self._file.__exit__(exc_type, exc_val, exc_tb)
def read_gold_data(fname):
    '''Read a gold key file into a map from instance id -> set of sense keys.'''
    gold_data = {}
    with open(fname, 'r') as fin:
        for line in fin:
            ls = line.strip().split()
            lid = ls[0]
            if lid not in gold_data:
                gold_data[lid] = set()
            for sense in ls[1:]:
                gold_data[lid].add(sense)
    return gold_data
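# For reference, a gold key file has one instance id per line followed by one
# or more WordNet sense keys. The ids and keys below are illustrative only:
#
#   d000.s000.t000 art%1:06:00::
#   d000.s000.t001 long%3:00:02::
#
# so read_gold_data would return {'d000.s000.t000': {'art%1:06:00::'}, ...}.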
def read_wsd_data(fname, fname_gold):
    '''Parse a WSD Evaluation Framework .data.xml file and attach gold senses.'''
    from lxml import etree

    gold_data = read_gold_data(fname_gold)

    with open(fname, 'r') as fin:
        data = fin.read()
    corpus = etree.fromstring(data.encode('utf-8'))

    sentences = []
    for node in corpus.iterdescendants():
        if node.tag == 'sentence':
            sentence = []
            for token_node in node.iterdescendants():
                token = {
                    'token': token_node.text,
                    'lemma': token_node.attrib['lemma'],
                    'pos': token_node.attrib['pos']
                }
                if token_node.tag == 'instance':
                    # annotated instances also carry an id and the gold senses
                    token['id'] = token_node.attrib['id']
                    token['senses'] = []
                    for sense in gold_data[token['id']]:
                        lemma, _, ss = sense.partition('%')
                        assert lemma == token['lemma']
                        token['senses'].append(ss)
                sentence.append(token)
            sentences.append(sentence)
    return sentences
def get_dataset_metadata(wsd_framework_root):
    '''Return (name, path prefix) pairs for the training and evaluation datasets.'''
    return [
        [
            'semcor',
            os.path.join(
                wsd_framework_root, 'Training_Corpora', 'SemCor', 'semcor'
            )
        ], [
            'senseval2',
            os.path.join(
                wsd_framework_root, 'Evaluation_Datasets', 'senseval2',
                'senseval2'
            )
        ], [
            'senseval3',
            os.path.join(
                wsd_framework_root, 'Evaluation_Datasets', 'senseval3',
                'senseval3'
            )
        ], [
            'semeval2015',
            os.path.join(
                wsd_framework_root, 'Evaluation_Datasets', 'semeval2015',
                'semeval2015'
            )
        ], [
            'semeval2013',
            os.path.join(
                wsd_framework_root, 'Evaluation_Datasets', 'semeval2013',
                'semeval2013'
            )
        ]
    ]
def convert_all_wsd_datasets(outdir, wsd_framework_root):
    '''Step (0.1): convert each dataset from XML to one JsonFile per dataset.'''
    datasets = get_dataset_metadata(wsd_framework_root)
    for ds_name, ds_root in datasets:
        data = read_wsd_data(ds_root + '.data.xml', ds_root + '.gold.key.txt')
        with JsonFile(os.path.join(outdir, ds_name + '.json'), 'w') as fout:
            for line in data:
                fout.write(line)
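# Step (1), dumping the LM embeddings, is not part of this file. Below is a
# minimal sketch assuming allennlp's ElmoEmbedder is installed (allennlp 0.x);
# dump_embeddings is a hypothetical helper name and the default ELMo weights
# are assumed -- swap in whichever LM produced the hdf5 names used above.
def dump_embeddings(rootdir, ds_name, lm_name):
    from allennlp.commands.elmo import ElmoEmbedder

    elmo = ElmoEmbedder()
    fname_in = os.path.join(rootdir, ds_name + '.json')
    fname_out = os.path.join(rootdir, ds_name + '.' + lm_name + '.hdf5')
    with JsonFile(fname_in, 'r') as fin, h5py.File(fname_out, 'w') as fout:
        for sent_id, line in enumerate(fin):
            # split n-gram tokens like "New York" so token counts line up with
            # the start/end bookkeeping in generate_sense_vectors below
            tokens = []
            for token in line:
                tokens.extend(token['token'].strip().split())
            # shape (num_layers=3, num_tokens, 1024) for the default ELMo model
            embeddings = elmo.embed_sentence(tokens)
            fout.create_dataset(str(sent_id), data=embeddings, dtype=np.float32)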
def make_synset_ids(lemma_ids):
    '''
    lemma_ids = {'lemma pos' -> {'sense' -> sense_id}, ...}

    Returns (lemma_pos_sense_synset, synset_ids) where lemma_pos_sense_synset
    maps 'lemma pos sense' -> wordnet synset and synset_ids maps synset -> id.
    '''
    from nltk.corpus import wordnet as wn

    pos_map = {'NOUN': wn.NOUN, 'VERB': wn.VERB, 'ADJ': wn.ADJ, 'ADV': wn.ADV}

    # (lemma, pos, sense) -> synset
    lemma_pos_sense_synset = {}
    # {synset -> id}
    synset_ids = {}

    for lemma_pos, senses in sorted(lemma_ids.items()):
        for sense in sorted(senses.keys()):
            lemma, pos = lemma_pos.split()
            # find the synset whose lemma key matches this lemma%sense
            the_synset = None
            for synset in wn.synsets(lemma, pos=pos_map[pos]):
                for lem in synset.lemmas():
                    wn_sense = lem.key()
                    if '{0}%{1}'.format(lemma, sense) == wn_sense:
                        the_synset = synset
            assert the_synset is not None
            key = ' '.join([lemma, pos, sense])
            lemma_pos_sense_synset[key] = the_synset
            if the_synset not in synset_ids:
                next_id = len(synset_ids)
                synset_ids[the_synset] = next_id

    return lemma_pos_sense_synset, synset_ids
def get_wn_first_sense(lemma, pos):
    '''Return the WordNet first-sense key for (lemma, pos), used as a fallback.'''
    from nltk.corpus import wordnet as wn

    pos_map = {'NOUN': wn.NOUN, 'VERB': wn.VERB, 'ADJ': wn.ADJ, 'ADV': wn.ADV}
    first_synset = wn.synsets(lemma, pos=pos_map[pos])[0]

    # get the key of the first lemma that is the same as the input lemma
    found = False
    for lem in first_synset.lemmas():
        key = lem.key()
        if key.startswith('{}%'.format(lemma)):
            found = True
            break
    assert found
    return key
def make_training_senses_vocab(rootdir):
    '''
    Build the training sense vocabulary from semcor.json.

    Returns a map:
        'lemma pos' -> {'sense1': id, 'sense2': id, ...}
    where the ids are unique integers over all (lemma, pos, sense) triples
    seen in the training data, e.g. (illustrative values only):
        {'art NOUN': {'1:06:00::': 0, '1:09:00::': 1}, ...}
    '''
    fname_in = os.path.join(rootdir, 'semcor.json')

    # a map from 'lemma pos' -> [id, {sense1, sense2, ...}]
    lemmas = {}
    with JsonFile(fname_in, 'r') as fin:
        for line in fin:
            for token in line:
                if 'senses' in token:
                    pos = token['pos']
                    lemma = token['lemma']
                    key = lemma + ' ' + pos
                    if key not in lemmas:
                        next_id = len(lemmas)
                        lemmas[key] = [next_id, set()]
                    for sense in token['senses']:
                        lemmas[key][1].add(sense)

    # finally assign each sense a unique integer id
    ret = {}
    next_id = 0
    for key, val in lemmas.items():
        ret[key] = {}
        for sense in val[1]:
            ret[key][sense] = next_id
            next_id += 1
    return ret
def generate_sense_vectors(rootdir, lm, lm_dim=1024, num_lm_layers=3):
    '''Step (2): average the contextual embeddings of each training sense.'''
    lemma_ids = make_training_senses_vocab(rootdir)
    # also checks that every training sense maps to a wordnet synset
    lemma_pos_sense_synset, synset_ids = make_synset_ids(lemma_ids)

    # one vector per (lemma, pos, sense) id in the training vocabulary
    n_vectors = sum(len(senses) for senses in lemma_ids.values())
    sense_vectors = np.zeros((n_vectors, num_lm_layers, lm_dim), dtype=np.float32)
    sense_counts = np.zeros(n_vectors)

    fname_in = os.path.join(rootdir, 'semcor.json')
    fname_embeddings = os.path.join(rootdir, 'semcor.' + lm + '.hdf5')
    with JsonFile(fname_in, 'r') as fin, \
            h5py.File(fname_embeddings, 'r') as fembed:
        for sent_id, line in enumerate(fin):
            start = 0
            sentence_embeddings = fembed['{}'.format(sent_id)][...]
            if sent_id % 100 == 0:
                print(sent_id)
            for token in line:
                # some tokens are n-grams like "New York"
                n_tokens = len(token['token'].strip().split())
                end = start + n_tokens
                if 'senses' in token:
                    # get the context vector by averaging over the n-gram
                    context_vector = (sentence_embeddings[:, start:end, :]
                                      ).mean(axis=1)
                    key = token['lemma'] + ' ' + token['pos']
                    for sense in token['senses']:
                        sense_id = lemma_ids[key][sense]
                        sense_vectors[sense_id, :, :] += context_vector
                        sense_counts[sense_id] += 1
                start = end
            assert start == sentence_embeddings.shape[1]

    # take the average vectors
    for k in range(n_vectors):
        sense_vectors[k, :, :] /= sense_counts[k]

    # write to output files
    fileroot = 'semcor.sense_ids.' + lm
    with open(os.path.join(rootdir, fileroot + '.json'), 'w') as fout:
        fout.write(json.dumps(lemma_ids))
    with h5py.File(os.path.join(rootdir, fileroot + '.hdf5'), 'w') as fout:
        _ = fout.create_dataset(
            'sense_vectors', data=sense_vectors, dtype=np.float32
        )
def remove_semeval2007_gold(fname_in, fname_out):
    '''Step (0.2): remove the semeval2007 instances from a gold key file.'''
    with open(fname_in, 'r') as fin, open(fname_out, 'w') as fout:
        for line in fin:
            if not line.startswith('semeval2007'):
                fout.write(line)
def generate_evaluation_files(rootdir, lm, outdir=None, num_lm_layers=3):
    '''Step (3): predict senses for each evaluation dataset, one file per LM layer.'''
    if outdir is None:
        outdir = rootdir

    # load the sense vectors
    fileroot = 'semcor.sense_ids.' + lm
    fname_sense_vectors = os.path.join(rootdir, fileroot + '.hdf5')
    with h5py.File(fname_sense_vectors, 'r') as fin:
        sense_vectors = fin['sense_vectors'][...]

    # l2 normalize each vector so the dot products below are cosine similarities
    norms = np.sqrt(
        np.sum(sense_vectors * sense_vectors, axis=2, keepdims=True)
    )
    sense_vectors /= norms

    # load the sense_ids
    with open(os.path.join(rootdir, fileroot + '.json'), 'r') as fin:
        sense_ids = json.load(fin)
    lemma_pos_sense_synset, synset_ids = make_synset_ids(sense_ids)
    for ds in [
        #'semeval2007', 'semeval2015', 'senseval3', 'semeval2013', 'senseval2'
        'semeval2015', 'senseval3', 'semeval2013', 'senseval2'
    ]:
        # predictions for each layer (token, layer1, layer2, etc.)
        predictions = []
        for ii in range(num_lm_layers):
            predictions.append([])
        token_ids = []
        # 0 = lemma/pos unseen in training, 1 = single training sense, 2 = nearest neighbor
        category = []

        fname_in = os.path.join(rootdir, ds + '.json')
        fname_embeddings = os.path.join(rootdir, ds + '.' + lm + '.hdf5')

        n_senses_oov = 0
        n_senses_single = 0
        n_senses_multiple = 0

        with JsonFile(fname_in, 'r') as fin, \
                h5py.File(fname_embeddings, 'r') as fembed:
            for sent_id, line in enumerate(fin):
                start = 0
                sentence_embeddings = fembed['{}'.format(sent_id)][...]
                for token in line:
                    # some tokens are n-grams like "New York"
                    n_tokens = len(token['token'].strip().split())
                    end = start + n_tokens
                    if 'senses' in token:
                        token_ids.append(token['id'])
                        # get the context vector and l2 normalize it
                        context_vector = (
                            sentence_embeddings[:, start:end, :]
                        ).mean(axis=1)
                        cv_norm = np.sqrt(
                            (context_vector * context_vector).sum(
                                axis=1, keepdims=True
                            )
                        )
                        context_vector /= cv_norm

                        key = token['lemma'] + ' ' + token['pos']
                        if key not in sense_ids:
                            # this lemma/pos was not in training data --
                            # fall back to the first sense from wordnet
                            predicted_sense = get_wn_first_sense(
                                token['lemma'], token['pos']
                            )
                            for kk in range(num_lm_layers):
                                predictions[kk].append(predicted_sense)
                            n_senses_oov += 1
                            category.append(0)
                        elif len(sense_ids[key]) == 1:
                            # only one sense in training -- predict it directly
                            predicted_sense = list(sense_ids[key].keys())[0]
                            pred_with_lemma = '{0}%{1}'.format(
                                token['lemma'], predicted_sense
                            )
                            for kk in range(num_lm_layers):
                                predictions[kk].append(pred_with_lemma)
                            n_senses_single += 1
                            category.append(1)
                        else:
                            n_senses_multiple += 1
                            category.append(2)
                            # get the sense whose vector is closest to the context
                            candidates = {}
                            for sense, sid in sense_ids[key].items():
                                sv = sense_vectors[sid, :, :]
                                # cosine similarity per layer, shape (num_lm_layers,)
                                scores = (sv * context_vector).sum(axis=1)
                                candidates[sense] = scores
                            # take the highest scoring sense for each layer
                            pp = []
                            for kk in range(num_lm_layers):
                                pp.append([None, -100])
                            for sense, scores in candidates.items():
                                for kk in range(num_lm_layers):
                                    if scores[kk] > pp[kk][1]:
                                        pp[kk] = [sense, scores[kk]]
                            for kk in range(num_lm_layers):
                                predictions[kk].append(
                                    '{0}%{1}'.format(token['lemma'], pp[kk][0])
                                )
                    start = end
                assert start == sentence_embeddings.shape[1]
        # write predictions to one file per layer
        print(ds, n_senses_oov, n_senses_single, n_senses_multiple)
        fname_str = lm
        for k in range(num_lm_layers):
            level = 'level{}'.format(k)
            fname_out = os.path.join(
                outdir,
                '{0}.predicted.{1}.{2}.txt'.format(ds, level, fname_str)
            )
            with open(fname_out, 'w') as fout:
                for tid, pred in zip(token_ids, predictions[k]):
                    fout.write('{0}.{1} {2}\n'.format(ds, tid, pred))

        # the baselines: record which branch produced each prediction
        fname_out = os.path.join(
            outdir, '{0}.baseline.{1}.txt'.format(ds, fname_str)
        )
        with open(fname_out, 'w') as fout:
            for tid, pred in zip(token_ids, category):
                if pred == 0:
                    fout.write(
                        '{0}.{1} {2}\n'.format(ds, tid, 'wn_first_sense')
                    )
                elif pred == 1:
                    fout.write('{0}.{1} {2}\n'.format(ds, tid, 'single_sense'))
                elif pred == 2:
                    fout.write('{0}.{1} {2}\n'.format(ds, tid, 'nn'))
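# A minimal end-to-end driver tying the steps together (a sketch only -- the
# directory paths below are hypothetical and should point at your own copy of
# the WSD Evaluation Framework and an output directory for the converted data):
if __name__ == '__main__':
    wsd_framework_root = '/path/to/WSD_Evaluation_Framework'
    converted_dir = '/path/to/converted'
    lm = 'bidirectional_2x4096_512_cnn2048'

    # (0.1) convert the XML datasets to JsonFile
    convert_all_wsd_datasets(converted_dir, wsd_framework_root)
    # (0.2) drop semeval2007 from the combined gold key file
    remove_semeval2007_gold(
        os.path.join(wsd_framework_root, 'Evaluation_Datasets', 'ALL',
                     'ALL.gold.key.txt'),
        os.path.join(wsd_framework_root, 'Evaluation_Datasets', 'ALL',
                     'ALL_except_semeval2007.gold.key.txt')
    )
    # (1) dump embeddings to hdf5 (see the dump_embeddings sketch above)
    # (2) average the training embeddings per sense
    generate_sense_vectors(converted_dir, lm)
    # (3) write one prediction file per dataset and layer
    generate_evaluation_files(converted_dir, lm)
    # (4)-(5) concatenate the per-dataset predictions and run the Java Scorer
    # as shown in the module docstring.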