'''
WSD ELMo: word sense disambiguation by nearest-neighbor matching against
averaged ELMo sense vectors.

Data from: http://lcl.uniroma1.it/wsdeval/

(0.1) Run convert_all_wsd_datasets to convert the raw datasets to JsonFile format.
(0.2) Run remove_semeval2007_gold to remove semeval2007 from the gold key data.
(1) Dump embeddings for the datasets to hdf5 files (not part of this gist;
    one option is sketched in dump_embeddings below).
(2) Compute the average word embedding for each sense in the training data
    with generate_sense_vectors.
(3) Write evaluation files with generate_evaluation_files.
(4) Concatenate the per-dataset predicted files for each layer, e.g.:
    cat *level1* > all.predicted.level1.bidirectional_2x4096_512_cnn2048.txt
(5) Evaluate! From ~/data/wsd/WSD_Evaluation_Framework/Evaluation_Datasets run:
    java Scorer ALL/ALL_except_semeval2007.gold.key.txt ~/data/wsd/converted/all.predicted.level2.bidirectional_2x4096_512_cnn2048.txt

See the __main__ sketch at the bottom of this file for how the steps chain
together.
'''
# After conversion the data format is JsonFile, one sentence per line.
# Each sentence is a list of tokens; each token is a dict with various
# attributes including lemma, pos, token, and senses / id if it's annotated.
# Each dataset is written to X.json, e.g. semcor.json.
#
# hdf5 files of embeddings are keyed by the sentence's line number in the
# json file (as a string), with one (num_layers, num_tokens, dim) array of
# embeddings per sentence.  They are named X.LM_NAME.hdf5,
# e.g. semcor.bidirectional_2x4096_512_cnn2048_skip.hdf5.
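#
# For illustration only (hypothetical ids and sense keys), an annotated token
# in X.json looks roughly like:
#   {"token": "plants", "lemma": "plant", "pos": "NOUN",
#    "id": "d000.s000.t000", "senses": ["1:03:00::"]}
# while an unannotated token carries only "token", "lemma" and "pos".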
import os
import h5py
import json
import numpy as np


class JsonFile(object):
    '''
    A flat text file where each line is one json object.

    # to read through a file line by line
    with JsonFile('file.json', 'r') as fin:
        for line in fin:
            # line is the deserialized json object
            pass

    # to write a file object by object
    with JsonFile('file.json', 'w') as fout:
        fout.write({'key1': 5, 'key2': 'token'})
        fout.write({'key1': 0, 'key2': 'the'})
    '''
    def __init__(self, *args, **kwargs):
        self._args = args
        self._kwargs = kwargs

    def __iter__(self):
        for line in self._file:
            yield json.loads(line)

    def write(self, item):
        item_as_json = json.dumps(item, ensure_ascii=False)
        encoded = '{0}\n'.format(item_as_json)
        self._file.write(encoded)

    def __enter__(self):
        self._file = open(*self._args, **self._kwargs)
        self._file.__enter__()
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self._file.__exit__(exc_type, exc_val, exc_tb)
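

# The embedding dump in step (1) is not part of this gist.  Below is a minimal
# sketch of one way to produce hdf5 files in the expected format using
# allennlp's ElmoEmbedder.  The original vectors came from a different bilm
# checkpoint ("bidirectional_2x4096_512_cnn2048"), so the model, the function
# name and the default lm_name here are illustrative assumptions only.
def dump_embeddings(rootdir, ds_name, lm_name='elmo_original'):
    # lazy import so the rest of the module does not require allennlp
    from allennlp.commands.elmo import ElmoEmbedder

    elmo = ElmoEmbedder()
    fname_in = os.path.join(rootdir, ds_name + '.json')
    fname_out = os.path.join(rootdir, '{0}.{1}.hdf5'.format(ds_name, lm_name))
    with JsonFile(fname_in, 'r') as fin, h5py.File(fname_out, 'w') as fout:
        for sent_id, sentence in enumerate(fin):
            # n-gram tokens like "New York" are split into words so the
            # embeddings line up with the alignment logic used below
            words = ' '.join(t['token'] for t in sentence).split()
            # shape (num_layers=3, num_tokens, 1024)
            embeddings = elmo.embed_sentence(words)
            fout.create_dataset(
                str(sent_id), data=embeddings, dtype=np.float32
            )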


def read_gold_data(fname):
    # read a gold key file; each line is "instance_id sense_key [sense_key ...]"
    gold_data = {}
    with open(fname, 'r') as fin:
        for line in fin:
            ls = line.strip().split()
            lid = ls[0]
            if lid not in gold_data:
                gold_data[lid] = set()
            for sense in ls[1:]:
                gold_data[lid].add(sense)
    return gold_data


def read_wsd_data(fname, fname_gold):
    # parse one dataset from the WSD Evaluation Framework: the .data.xml file
    # holds the tokenized sentences, the .gold.key.txt file the gold sense keys
    from lxml import etree

    gold_data = read_gold_data(fname_gold)

    with open(fname, 'r') as fin:
        data = fin.read()
    corpus = etree.fromstring(data.encode('utf-8'))

    sentences = []
    for node in corpus.iterdescendants():
        if node.tag == 'sentence':
            sentence = []
            for token_node in node.iterdescendants():
                token = {
                    'token': token_node.text,
                    'lemma': token_node.attrib['lemma'],
                    'pos': token_node.attrib['pos']
                }
                if token_node.tag == 'instance':
                    token['id'] = token_node.attrib['id']
                    token['senses'] = []
                    for sense in gold_data[token['id']]:
                        lemma, _, ss = sense.partition('%')
                        assert lemma == token['lemma']
                        token['senses'].append(ss)
                sentence.append(token)
            sentences.append(sentence)

    return sentences


def get_dataset_metadata(wsd_framework_root):
    return [
        [
            'semcor',
            os.path.join(
                wsd_framework_root, 'Training_Corpora', 'SemCor', 'semcor'
            )
        ], [
            'senseval2',
            os.path.join(
                wsd_framework_root, 'Evaluation_Datasets', 'senseval2',
                'senseval2'
            )
        ], [
            'senseval3',
            os.path.join(
                wsd_framework_root, 'Evaluation_Datasets', 'senseval3',
                'senseval3'
            )
        ], [
            'semeval2015',
            os.path.join(
                wsd_framework_root, 'Evaluation_Datasets', 'semeval2015',
                'semeval2015'
            )
        ], [
            'semeval2013',
            os.path.join(
                wsd_framework_root, 'Evaluation_Datasets', 'semeval2013',
                'semeval2013'
            )
        ]
    ]


def convert_all_wsd_datasets(outdir, wsd_framework_root):
    datasets = get_dataset_metadata(wsd_framework_root)
    for ds in datasets:
        ds_name, ds_root = ds
        data = read_wsd_data(ds_root + '.data.xml', ds_root + '.gold.key.txt')
        with JsonFile(os.path.join(outdir, ds_name + '.json'), 'w') as fout:
            for line in data:
                fout.write(line)


def make_synset_ids(lemma_ids):
    '''
    lemma_ids = {'lemma pos' -> {sense -> sense_id}, ...}
    (the format returned by make_training_senses_vocab)

    Returns (lemma_pos_sense_synset, synset_ids), where lemma_pos_sense_synset
    maps 'lemma pos sense' -> wordnet Synset and synset_ids maps
    Synset -> integer id.
    '''
    from nltk.corpus import wordnet as wn
    pos_map = {'NOUN': wn.NOUN, 'VERB': wn.VERB, 'ADJ': wn.ADJ, 'ADV': wn.ADV}

    # 'lemma pos sense' -> synset
    lemma_pos_sense_synset = {}
    # {synset -> id}
    synset_ids = {}

    for lemma_pos, senses in sorted(lemma_ids.items()):
        for sense in sorted(senses.keys()):
            lemma, pos = lemma_pos.split()
            the_synset = None
            for synset in wn.synsets(lemma, pos=pos_map[pos]):
                for lem in synset.lemmas():
                    wn_sense = lem.key()
                    if '{0}%{1}'.format(lemma, sense) == wn_sense:
                        the_synset = synset
            assert the_synset is not None
            key = ' '.join([lemma, pos, sense])
            lemma_pos_sense_synset[key] = the_synset
            if the_synset not in synset_ids:
                next_id = len(synset_ids)
                synset_ids[the_synset] = next_id

    return lemma_pos_sense_synset, synset_ids


def get_wn_first_sense(lemma, pos):
    from nltk.corpus import wordnet as wn
    pos_map = {'NOUN': wn.NOUN, 'VERB': wn.VERB, 'ADJ': wn.ADJ, 'ADV': wn.ADV}

    first_synset = wn.synsets(lemma, pos=pos_map[pos])[0]

    # get the first lemma that is the same as the input lemma
    found = False
    for lem in first_synset.lemmas():
        key = lem.key()
        if key.startswith('{}%'.format(lemma)):
            found = True
            break
    assert found

    return key


def make_training_senses_vocab(rootdir):
    # returns a map:
    #   'lemma pos' -> {'sense1': id, 'sense2': id, ...}
    # where the ids are unique across all (lemma, pos, sense) triples
    # seen in the training data
    fname_in = os.path.join(rootdir, 'semcor.json')

    # a map from 'lemma pos' -> [id, {sense1, sense2, ...}]
    lemmas = {}
    with JsonFile(fname_in, 'r') as fin:
        for line in fin:
            for token in line:
                if 'senses' in token:
                    pos = token['pos']
                    lemma = token['lemma']
                    key = lemma + ' ' + pos
                    if key not in lemmas:
                        next_id = len(lemmas)
                        lemmas[key] = [next_id, set()]
                    for sense in token['senses']:
                        lemmas[key][1].add(sense)

    # finally change the sets of senses to dicts mapping sense -> unique id
    ret = {}
    next_id = 0
    for key, val in lemmas.items():
        ret[key] = {}
        for sense in val[1]:
            ret[key][sense] = next_id
            next_id += 1

    return ret
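
# For illustration only (hypothetical sense-key suffixes), the structure
# returned by make_training_senses_vocab looks like:
#   {'plant NOUN': {'1:03:00::': 0, '1:20:00::': 1},
#    'run VERB': {'2:38:00::': 2},
#    ...}
# i.e. keys are 'lemma pos' strings, and each sense-key suffix (the part
# after the '%' in a WordNet sense key) maps to an id that is unique across
# the whole training vocabulary.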


def generate_sense_vectors(rootdir, lm, lm_dim=1024, num_lm_layers=3):
    lemma_ids = make_training_senses_vocab(rootdir)
    # also validates that every training sense key exists in wordnet
    lemma_pos_sense_synset, synset_ids = make_synset_ids(lemma_ids)

    # one vector per training sense; sense ids are unique across all
    # 'lemma pos' keys, so the number of vectors is the total sense count
    n_vectors = sum(len(senses) for senses in lemma_ids.values())
    sense_vectors = np.zeros(
        (n_vectors, num_lm_layers, lm_dim), dtype=np.float32
    )
    sense_counts = np.zeros(n_vectors)

    fname_in = os.path.join(rootdir, 'semcor.json')
    fname_embeddings = os.path.join(rootdir, 'semcor.' + lm + '.hdf5')
    with JsonFile(fname_in, 'r') as fin, \
            h5py.File(fname_embeddings, 'r') as fembed:
        for sent_id, line in enumerate(fin):
            start = 0
            sentence_embeddings = fembed['{}'.format(sent_id)][...]
            if sent_id % 100 == 0:
                print(sent_id)
            for token in line:
                # some tokens are n-grams like "New York"
                n_tokens = len(token['token'].strip().split())
                end = start + n_tokens
                if 'senses' in token:
                    # get the context vector
                    context_vector = (
                        sentence_embeddings[:, start:end, :]
                    ).mean(axis=1)
                    key = token['lemma'] + ' ' + token['pos']
                    for sense in token['senses']:
                        sense_id = lemma_ids[key][sense]
                        sense_vectors[sense_id, :, :] += context_vector
                        sense_counts[sense_id] += 1
                start = end
            assert start == sentence_embeddings.shape[1]

    # take the average vectors
    for k in range(n_vectors):
        sense_vectors[k, :, :] /= sense_counts[k]

    # write to output file!
    fileroot = 'semcor.sense_ids.' + lm
    with open(os.path.join(rootdir, fileroot + '.json'), 'w') as fout:
        fout.write(json.dumps(lemma_ids))
    with h5py.File(os.path.join(rootdir, fileroot + '.hdf5'), 'w') as fout:
        _ = fout.create_dataset(
            'sense_vectors', data=sense_vectors, dtype=np.float32
        )


def remove_semeval2007_gold(fname_in, fname_out):
    with open(fname_in, 'r') as fin, open(fname_out, 'w') as fout:
        for line in fin:
            if not line.startswith('semeval2007'):
                fout.write(line)


def generate_evaluation_files(rootdir, lm, outdir=None, num_lm_layers=3):
    if outdir is None:
        outdir = rootdir

    # load the sense vectors
    fileroot = 'semcor.sense_ids.' + lm
    fname_sense_vectors = os.path.join(rootdir, fileroot + '.hdf5')
    with h5py.File(fname_sense_vectors, 'r') as fin:
        sense_vectors = fin['sense_vectors'][...]
    # l2 normalize each vector
    norms = np.sqrt(
        np.sum(sense_vectors * sense_vectors, axis=2, keepdims=True)
    )
    sense_vectors /= norms

    # load the sense_ids
    with open(os.path.join(rootdir, fileroot + '.json'), 'r') as fin:
        sense_ids = json.load(fin)
    lemma_pos_sense_synset, synset_ids = make_synset_ids(sense_ids)

    for ds in [
        # 'semeval2007', 'semeval2015', 'senseval3', 'semeval2013', 'senseval2'
        'semeval2015', 'senseval3', 'semeval2013', 'senseval2'
    ]:
        # predictions for each layer (token, layer1, layer2, etc.)
        predictions = []
        for ii in range(num_lm_layers):
            predictions.append([])
        token_ids = []
        category = []

        fname_in = os.path.join(rootdir, ds + '.json')
        fname_embeddings = os.path.join(rootdir, ds + '.' + lm + '.hdf5')

        n_senses_oov = 0
        n_senses_single = 0
        n_senses_multiple = 0

        with JsonFile(fname_in, 'r') as fin, \
                h5py.File(fname_embeddings, 'r') as fembed:
            for sent_id, line in enumerate(fin):
                start = 0
                sentence_embeddings = fembed['{}'.format(sent_id)][...]
                for token in line:
                    # some tokens are n-grams like "New York"
                    n_tokens = len(token['token'].strip().split())
                    end = start + n_tokens
                    if 'senses' in token:
                        token_ids.append(token['id'])
                        # get the context vector
                        context_vector = (
                            sentence_embeddings[:, start:end, :]
                        ).mean(axis=1)
                        cv_norm = np.sqrt(
                            (context_vector * context_vector).sum(
                                axis=1, keepdims=True
                            )
                        )
                        context_vector /= cv_norm

                        key = token['lemma'] + ' ' + token['pos']
                        if key not in sense_ids:
                            # this lemma/pos was not in training data,
                            # get the first sense from wordnet as fallback
                            predicted_sense = get_wn_first_sense(
                                token['lemma'], token['pos']
                            )
                            for kk in range(num_lm_layers):
                                predictions[kk].append(predicted_sense)
                            n_senses_oov += 1
                            category.append(0)
                        elif len(sense_ids[key]) == 1:
                            # only one sense...
                            predicted_sense = list(sense_ids[key].keys())[0]
                            pred_with_lemma = '{0}%{1}'.format(
                                token['lemma'], predicted_sense
                            )
                            for kk in range(num_lm_layers):
                                predictions[kk].append(pred_with_lemma)
                            n_senses_single += 1
                            category.append(1)
                        else:
                            n_senses_multiple += 1
                            category.append(2)
                            # get the sense with closest vector to context
                            candidates = {}
                            for sense, sid in sense_ids[key].items():
                                sv = sense_vectors[sid, :, :]
                                scores = (sv * context_vector).sum(axis=1)
                                candidates[sense] = scores

                            # get the predicted values
                            pp = []
                            for kk in range(num_lm_layers):
                                pp.append([None, -100])
                            for sense, scores in candidates.items():
                                for kk in range(num_lm_layers):
                                    if scores[kk] > pp[kk][1]:
                                        pp[kk] = [sense, scores[kk]]
                            for kk in range(num_lm_layers):
                                predictions[kk].append(
                                    '{0}%{1}'.format(token['lemma'], pp[kk][0])
                                )
                    start = end
                assert start == sentence_embeddings.shape[1]

        # write predictions to a file
        print(ds, n_senses_oov, n_senses_single, n_senses_multiple)
        fname_str = lm
        for k in range(num_lm_layers):
            level = 'level{}'.format(k)
            fname_out = os.path.join(
                outdir,
                '{0}.predicted.{1}.{2}.txt'.format(ds, level, fname_str)
            )
            with open(fname_out, 'w') as fout:
                for tid, pred in zip(token_ids, predictions[k]):
                    fout.write('{0}.{1} {2}\n'.format(ds, tid, pred))

        # the baselines
        fname_out = os.path.join(
            outdir, '{0}.baseline.{1}.txt'.format(ds, fname_str)
        )
        with open(fname_out, 'w') as fout:
            for tid, pred in zip(token_ids, category):
                if pred == 0:
                    fout.write(
                        '{0}.{1} {2}\n'.format(ds, tid, 'wn_first_sense')
                    )
                elif pred == 1:
                    fout.write('{0}.{1} {2}\n'.format(ds, tid, 'single_sense'))
                elif pred == 2:
                    fout.write('{0}.{1} {2}\n'.format(ds, tid, 'nn'))
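

# A sketch of how the steps in the module docstring chain together.  The
# directory layout and lm name below are assumptions -- adjust to your setup.
# Step (1), dumping embeddings, is handled separately (see dump_embeddings
# above), and steps (4)/(5) are run from the shell as described in the
# docstring.
if __name__ == '__main__':
    wsd_framework_root = os.path.expanduser(
        '~/data/wsd/WSD_Evaluation_Framework'
    )
    rootdir = os.path.expanduser('~/data/wsd/converted')
    lm = 'bidirectional_2x4096_512_cnn2048'

    # (0.1) convert the raw XML datasets to JsonFile format
    convert_all_wsd_datasets(rootdir, wsd_framework_root)

    # (0.2) remove semeval2007 from the combined gold key file
    all_dir = os.path.join(wsd_framework_root, 'Evaluation_Datasets', 'ALL')
    remove_semeval2007_gold(
        os.path.join(all_dir, 'ALL.gold.key.txt'),
        os.path.join(all_dir, 'ALL_except_semeval2007.gold.key.txt'),
    )

    # (2) average the embeddings of each sense in the training data
    generate_sense_vectors(rootdir, lm)

    # (3) write the per-dataset prediction and baseline files
    generate_evaluation_files(rootdir, lm)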