We present the steps we took to compute word embeddings over the Wikipedia corpus, leveraging libSkylark's randomized SVD capability. This is followed by examples of visualizing the results and by performance evaluation comparisons.
We used gensim.corpora.WikiCorpus for parsing the Wikipedia dump and hyperwords for converting it to a Positive Pointwise Mutual Information (PPMI) matrix.
The conversion is a heavily parameterized, multistage process:
- corpus2pairs.py produces a 100GB file with all word pairs within a window around each word exceeding a threshold frequency; subsampling is used to mitigate the effect of very common words.
- pairs2counts.sh produces a 10GB file with the counts of all unique word pairs.
- counts2vocab.py generates counts for individual words and their contexts (also words in our setting).
- counts2pmi.py computes the PPMI matrix.
For a word w and context c, PMI(w,c) is defined as the log of the ratio between their joint probability and the product of their marginal probabilities: PMI(w,c) = log(P(w,c) / (P(w) P(c))). For PPMI we zero out the negative entries of the PMI matrix. PPMI is the special case (k=1) of SPPMI (Shifted PPMI): SPPMI(w,c) = max(PMI(w,c) - log(k), 0).
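As a concrete illustration of these definitions (a minimal sketch on a made-up toy count matrix; this is not the actual counts2pmi.py output or implementation), PPMI and SPPMI can be computed from a word-context count matrix as follows:
import numpy as np

# Toy word-context co-occurrence counts (rows: words, columns: contexts).
counts = np.array([[10.,  2.,  0.],
                   [ 3.,  8.,  1.],
                   [ 0.,  1., 12.]])

total = counts.sum()
p_wc = counts / total                    # joint probabilities P(w, c)
p_w = counts.sum(axis=1) / total         # marginal probabilities P(w)
p_c = counts.sum(axis=0) / total         # marginal probabilities P(c)

# PMI(w, c) = log(P(w, c) / (P(w) * P(c))); zero counts yield -inf entries,
# which the maximum() calls below clip to zero.
with np.errstate(divide='ignore'):
    pmi = np.log(p_wc / np.outer(p_w, p_c))

ppmi = np.maximum(pmi, 0)                # PPMI: zero out negative entries
k = 5                                    # shift parameter
sppmi = np.maximum(pmi - np.log(k), 0)   # SPPMI(w, c) = max(PMI(w, c) - log(k), 0)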
To put things into perspective: Skip-Gram with Negative Sampling (SGNS), the approach underlying word2vec, can be seen as implicitly factorizing the shifted PMI matrix PMI(w,c) - log(k), while GloVe implicitly factorizes the log-count matrix (i.e. the log of the counts as produced by pairs2counts.sh above), shifted by bias terms for the entire word and context vocabularies. word2vec and GloVe are the most popular packages for computing word embeddings. Using PPMI directly, or performing a matrix factorization (SVD) of PPMI or SPPMI (for some k > 1), are alternative approaches also used for this task.
In what follows we assume that self-describing path names have already been set to point to valid locations. We start with package imports and some utilities that will be used throughout.
# standard packages
import os
import cPickle as pickle
import subprocess
# numpy, scipy and matplotlib
# http://www.scipy.org/
import numpy as np
from scipy.stats.stats import spearmanr
import matplotlib.pyplot as plt
# nltk
# http://www.nltk.org/
import nltk
# hyperwords
# https://bitbucket.org/omerlevy/hyperwords
import hyperwords
from hyperwords.representations.matrix_serializer import load_vocabulary, save_vocabulary
# wordcloud
# https://github.com/amueller/word_cloud
import wordcloud
On the development machine (azmodan), execute this cell to initialize the paths to valid locations.
top_dir = '/Westmarch/workshop/notebooks'
libskylark_install_dir = os.path.join(top_dir, 'install') # top installation dir for libskylark
font_path = os.path.join(top_dir, 'Vera.ttf') # fonts needed for the visualizations
ppmi_libsvm_path = os.path.join(top_dir, 'matrix.libsvm') # PPMI matrix in LIBSVM format
U_path = os.path.join(top_dir, 'out.U.txt') # matrix of left singular vectors
pmi_words_vocab_path = os.path.join(top_dir, 'pmi.words.vocab') # hyperwords-generated word-to-index mapping
randsvd_word_embed_path = os.path.join(top_dir, 'randsvd.embed.pickle') # word embeddings we compute (randsvd)
randsvd_noun_embed_path = os.path.join(top_dir, 'randsvd.noun.embed.pickle') # noun embeddings we compute (randsvd)
# word embeddings computed with GloVe
glove_word_embed_path = os.path.join(top_dir, 'glove.6B.100d.txt')
# Normalize the rows of a matrix
def normalize(matrix):
norm = np.sqrt(np.sum(matrix * matrix, axis=1))
matrix = matrix / norm[:, np.newaxis]
return matrix
# Convert a key-to-vector mapping into a matrix; also return
# corresponding key-to-index mapping and its inverse
def dict_to_matrix(data):
keys = data.keys()
key2idx = {}
idx2key = {}
for idx, key in enumerate(keys):
key2idx[key] = idx
idx2key[idx]= key
vectors = data.values()
keys_size = len(keys)
vector_size = len(vectors[0])
matrix = np.zeros((keys_size, vector_size))
for key, vector in data.iteritems():
idx = key2idx[key]
matrix[idx, :] = vector
return matrix, key2idx, idx2key
# Given a matrix containing word embeddings as its rows,
# word-to-index mapping and its inverse, return the topk most similar words
# to the input target word and their similarity scores
def compute_affinity(matrix,
key2idx,
idx2key,
target,
topk = 100):
vector = matrix[key2idx[target], :]
affinity = np.dot(matrix, vector)
indices = np.argsort(affinity)[::-1]
top_indices = indices[:topk]
top_keys = [idx2key[idx] for idx in top_indices]
top_scores = affinity[top_indices]
return top_keys, top_scores
# As compute_affinity(), but produces a wordcloud for the topk most similar
# words to the target one; in this representation the similarity rank determines
# the relative size ("more similar" means a larger font size). font_path is
# assumed to point to a valid TrueType font file.
def visualize_affinity(matrix,
key2idx,
idx2key,
target,
topk = 100,
font_path = font_path):
top_keys, top_scores = compute_affinity(matrix, key2idx, idx2key, target, topk)
top_keys_freq = zip(top_keys, range(topk, 0, -1))
cloud = wordcloud.WordCloud(font_path=font_path)
cloud.generate_from_frequencies(top_keys_freq)
    plt.axis('off')
    plt.imshow(cloud, cmap=plt.cm.gray)
# For a given mapping word -> vector word_vectors, return the similarity of
# word1 and word2 (the dot product, i.e. cosine similarity for normalized vectors)
def similarity(word1, word2, word_vectors):
v1 = word_vectors[word1]
v2 = word_vectors[word2]
return np.dot(v1, v2)
# Return Spearman rho for a given key -> vector embedding mapping
# against data in standard similarity test file in method_path;
# used and total pairs with user supplied scores from the file are
# also returned
def evaluate_word_similarity(word_vectors, method_path):
test_data = []
with open(method_path) as f:
for line in f:
x, y, sim = line.strip().lower().split()
test_data.append(((x, y), sim))
results = []
misses = 0
    for (x, y), sim in test_data:
        try:
            results.append((similarity(x, y, word_vectors), sim))
        except KeyError:
            # skip pairs with a word missing from the embedding vocabulary
            misses += 1
    actual, expected = zip(*results)
    total = len(test_data)
    used = total - misses
    return spearmanr(actual, expected)[0], used, total
We compute the randomized truncated SVD of the PPMI matrix using the skylark_svd executable; the top 100 singular triplets are approximated.
num_dims = 100 # Size of each embedding vector
executable_path = os.path.join(libskylark_install_dir, 'bin', 'skylark_svd')
command = '%s --sparse --rank=%d %s' % (executable_path, num_dims, ppmi_libsvm_path)
process = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE)
outdata, errdata = process.communicate()
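For readers without a libSkylark installation, here is a minimal sketch of a randomized truncated SVD in plain NumPy (in the spirit of Halko, Martinsson and Tropp); the function name, its parameters, and the ppmi variable in the usage comment are our own illustrative choices, not part of libSkylark or of the pipeline above.
import numpy as np

def randomized_svd(A, rank, oversampling=10, num_power_iters=2):
    # Approximate the top `rank` singular triplets of A (a dense ndarray or a
    # scipy.sparse matrix) via a Gaussian range sketch plus power iterations.
    m, n = A.shape
    omega = np.random.randn(n, rank + oversampling)    # Gaussian test matrix
    Y = A.dot(omega)                                   # sample the range of A
    for _ in range(num_power_iters):
        Y, _ = np.linalg.qr(Y)                         # re-orthonormalize for stability
        Y = A.dot(A.T.dot(Y))                          # power iteration sharpens the spectrum
    Q, _ = np.linalg.qr(Y)                             # orthonormal basis for the sampled range
    B = A.T.dot(Q).T                                   # B = Q^T A, computed without densifying A
    Ub, S, Vt = np.linalg.svd(B, full_matrices=False)  # exact SVD of the small matrix
    U = Q.dot(Ub)                                      # lift back to the original space
    return U[:, :rank], S[:rank], Vt[:rank, :]

# Illustrative usage (the variable `ppmi` is assumed to hold the PPMI matrix as a
# scipy.sparse matrix; loading it from the LIBSVM file is not shown here):
# U, S, Vt = randomized_svd(ppmi, num_dims)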
The embeddings are generally a function of the left singular vectors (the matrix U), with columns scaled entrywise by a power of the corresponding singular values. Here we suppress the singular values altogether (i.e. raise them to a vanishing exponent), in line with experimental reports of improved qualitative performance for this choice, and save the results.
U = np.loadtxt(U_path)
U = normalize(U) # Normalizes the rows of U; utility
key2idx, keys = load_vocabulary(pmi_words_vocab_path)
randsvd_word_embed = {}
for key, idx in key2idx.iteritems():
randsvd_word_embed[key] = U[idx]
# Save the embedding vectors for words
with open(randsvd_word_embed_path, 'w') as f:
pickle.dump(randsvd_word_embed, f)
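If one wants to experiment with a nonzero singular-value exponent instead, the general scaling is a one-liner; the sketch below assumes the singular values are available in a text file (the path out.S.txt is hypothetical) and reuses the normalize() utility defined above.
# Hypothetical: scale the columns of U by the singular values raised to exponent p;
# p = 0 recovers the embeddings above, while p = 0.5 and p = 1 are other common choices.
p = 0.5
S = np.loadtxt(os.path.join(top_dir, 'out.S.txt'))  # hypothetical singular-values file
U_scaled = normalize(U * (S ** p))                  # broadcasts S over the columns of U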
Working only with nouns is more natural in some applications, e.g. keyword expansion for search; we use NLTK's part-of-speech tagger to extract the noun subset.
randsvd_noun_embed = {}
for key, vector in randsvd_word_embed.iteritems():
text = nltk.word_tokenize(key)
word, word_type = nltk.pos_tag(text)[0]
if word_type == 'NN':
randsvd_noun_embed[key] = vector
# Save the embedding vectors for nouns
with open(randsvd_noun_embed_path, 'w') as f:
pickle.dump(randsvd_noun_embed, f)
We now produce wordclouds of the most similar terms around some example target words. The results suggest that our embeddings indeed capture meaningful similarities.
randsvd_noun_matrix, randsvd_noun2idx, randsvd_idx2noun = dict_to_matrix(randsvd_noun_embed)
visualize_affinity(randsvd_noun_matrix, randsvd_noun2idx, randsvd_idx2noun, 'medicine')
visualize_affinity(randsvd_noun_matrix, randsvd_noun2idx, randsvd_idx2noun, 'eggplant')
visualize_affinity(randsvd_noun_matrix, randsvd_noun2idx, randsvd_idx2noun, 'sustainability')
To report actual scores:
topk = 20
target = 'sustainability'
similar_nouns, similarity_scores = compute_affinity(randsvd_noun_matrix, randsvd_noun2idx, randsvd_idx2noun,
target, topk)
print "Top %d similar words to '%s' and similarity scores" % (topk, target)
print '-' * 70
for noun, score in zip(similar_nouns, similarity_scores):
print '%20s: %f' % (noun, score)
We load the word embeddings for Wikipedia generated with GloVe and publicly available at http://www-nlp.stanford.edu/data/glove.6B.100d.txt.gz. These are 100-dimensional vectors, like the ones we have computed with the libSkylark randsvd approach.
glove_word_embed = {}
with open(glove_word_embed_path, 'r') as f:
for line in f:
content = line.split(' ')
word = content[0]
glove_word_embed[word] = np.array([float(x) for x in content[1:]])
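One observation on the comparison setup (ours, not part of the original text): similarity() uses raw dot products, and the randsvd vectors were row-normalized above while the GloVe vectors as loaded are not; to make the comparison strictly cosine-based, the GloVe vectors can be normalized the same way:
# Optional (not in the original pipeline): normalize the GloVe vectors so that
# the dot products computed by similarity() are cosine similarities, matching
# the treatment of the randsvd embeddings above.
for word in glove_word_embed:
    vector = glove_word_embed[word]
    glove_word_embed[word] = vector / np.linalg.norm(vector)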
We compare the word-similarity performance of the GloVe-generated and our randsvd-generated embeddings, using a methodology and a suite of test datasets similar to those in related papers by Omer Levy (the hyperwords package developer).
method_paths = {'WS353' : os.path.join(top_dir, 'ws353.txt'),
'MEN' : os.path.join(top_dir, 'bruni_men.txt'),
'LUONG_RARE' : os.path.join(top_dir, 'luong_rare.txt'),
'RADINSKY_MTURK' : os.path.join(top_dir, 'radinsky_mturk.txt'),
'WS353_RELATEDNESS' : os.path.join(top_dir, 'ws353_relatedness.txt'),
'WS353_SIMILARITY' : os.path.join(top_dir, 'ws353_similarity.txt')
}
word_embeddings = {'randsvd_wikipedia_words' : randsvd_word_embed,
'glove_wikipedia_words' : glove_word_embed}
for method_tag, method_path in method_paths.iteritems():
print
for word_tag, word_vectors in word_embeddings.iteritems():
spearman_rho, used, total = evaluate_word_similarity(word_vectors, method_path)
print 'Spearman rho for %25s, %6s (%d/%d pairs): %f' % (word_tag, method_tag, used, total, spearman_rho)
We note that randsvd performs consistently better than GloVe in these word similarity tasks.