skipthoughts.py

'''
Skip-thought vectors
https://github.com/ryankiros/skip-thoughts
'''
import os

import theano
import theano.tensor as tensor

import cPickle as pkl
import numpy
import copy
import nltk

from collections import OrderedDict, defaultdict
from scipy.linalg import norm
from nltk.tokenize import word_tokenize

profile = False

#-----------------------------------------------------------------------------#
# Specify model and table locations here
#-----------------------------------------------------------------------------#
path_to_models = 'Data/skipthoughts/'
path_to_tables = 'Data/skipthoughts/'
#-----------------------------------------------------------------------------#

path_to_umodel = path_to_models + 'uni_skip.npz'
path_to_bmodel = path_to_models + 'bi_skip.npz'


def load_model():
	"""
	Load the model with saved tables
	"""
	# Load model options
	print 'Loading model parameters...'
	with open('%s.pkl'%path_to_umodel, 'rb') as f:
		uoptions = pkl.load(f)
	with open('%s.pkl'%path_to_bmodel, 'rb') as f:
		boptions = pkl.load(f)

	# Load parameters
	uparams = init_params(uoptions)
	uparams = load_params(path_to_umodel, uparams)
	utparams = init_tparams(uparams)
	bparams = init_params_bi(boptions)
	bparams = load_params(path_to_bmodel, bparams)
	btparams = init_tparams(bparams)

	# Extractor functions
	print 'Compiling encoders...'
	embedding, x_mask, ctxw2v = build_encoder(utparams, uoptions)
	f_w2v = theano.function([embedding, x_mask], ctxw2v, name='f_w2v')
	embedding, x_mask, ctxw2v = build_encoder_bi(btparams, boptions)
	f_w2v2 = theano.function([embedding, x_mask], ctxw2v, name='f_w2v2')

	# Tables
	print 'Loading tables...'
	utable, btable = load_tables()

	# Store everything we need in a dictionary
	print 'Packing up...'
	model = {}
	model['uoptions'] = uoptions
	model['boptions'] = boptions
	model['utable'] = utable
	model['btable'] = btable
	model['f_w2v'] = f_w2v
	model['f_w2v2'] = f_w2v2

	return model


def load_tables():
	"""
	Load the tables
	"""
	words = []
	utable = numpy.load(path_to_tables + 'utable.npy')
	btable = numpy.load(path_to_tables + 'btable.npy')
	f = open(path_to_tables + 'dictionary.txt', 'rb')
	for line in f:
		words.append(line.decode('utf-8').strip())
	f.close()
	utable = OrderedDict(zip(words, utable))
	btable = OrderedDict(zip(words, btable))
	return utable, btable


def encode(model, X, use_norm=True, verbose=True, batch_size=128, use_eos=False):
	"""
	Encode sentences in the list X. Each entry will return a vector
	"""
	# first, do preprocessing
	X = preprocess(X)

	# word dictionary and init
	d = defaultdict(lambda : 0)
	for w in model['utable'].keys():
		d[w] = 1
	ufeatures = numpy.zeros((len(X), model['uoptions']['dim']), dtype='float32')
	bfeatures = numpy.zeros((len(X), 2 * model['boptions']['dim']), dtype='float32')

	# length dictionary
	ds = defaultdict(list)
	captions = [s.split() for s in X]
	for i,s in enumerate(captions):
		ds[len(s)].append(i)

	# Get features. This encodes by length, in order to avoid wasting computation
	for k in ds.keys():
		if verbose:
			print k
		numbatches = len(ds[k]) / batch_size + 1
		for minibatch in range(numbatches):
			caps = ds[k][minibatch::numbatches]

			if use_eos:
				uembedding = numpy.zeros((k+1, len(caps), model['uoptions']['dim_word']), dtype='float32')
				bembedding = numpy.zeros((k+1, len(caps), model['boptions']['dim_word']), dtype='float32')
			else:
				uembedding = numpy.zeros((k, len(caps), model['uoptions']['dim_word']), dtype='float32')
				bembedding = numpy.zeros((k, len(caps), model['boptions']['dim_word']), dtype='float32')
			for ind, c in enumerate(caps):
				caption = captions[c]
				for j in range(len(caption)):
					if d[caption[j]] > 0:
						uembedding[j,ind] = model['utable'][caption[j]]
						bembedding[j,ind] = model['btable'][caption[j]]
					else:
						uembedding[j,ind] = model['utable']['UNK']
						bembedding[j,ind] = model['btable']['UNK']
				if use_eos:
					uembedding[-1,ind] = model['utable']['<eos>']
					bembedding[-1,ind] = model['btable']['<eos>']
			if use_eos:
				uff = model['f_w2v'](uembedding, numpy.ones((len(caption)+1,len(caps)), dtype='float32'))
				bff = model['f_w2v2'](bembedding, numpy.ones((len(caption)+1,len(caps)), dtype='float32'))
			else:
				uff = model['f_w2v'](uembedding, numpy.ones((len(caption),len(caps)), dtype='float32'))
				bff = model['f_w2v2'](bembedding, numpy.ones((len(caption),len(caps)), dtype='float32'))
			if use_norm:
				for j in range(len(uff)):
					uff[j] /= norm(uff[j])
					bff[j] /= norm(bff[j])
			for ind, c in enumerate(caps):
				ufeatures[c] = uff[ind]
				bfeatures[c] = bff[ind]
	
	features = numpy.c_[ufeatures, bfeatures]
	return features


def preprocess(text):
	"""
	Preprocess text for encoder
	"""
	X = []
	sent_detector = nltk.data.load('tokenizers/punkt/english.pickle')
	for t in text:
		sents = sent_detector.tokenize(t)
		result = ''
		for s in sents:
			tokens = word_tokenize(s)
			result += ' ' + ' '.join(tokens)
		X.append(result)
	return X


def nn(model, text, vectors, query, k=5):
	"""
	Return the nearest neighbour sentences to query
	text: list of sentences
	vectors: the corresponding representations for text
	query: a string to search
	"""
	qf = encode(model, [query])
	qf /= norm(qf)
	scores = numpy.dot(qf, vectors.T).flatten()
	sorted_args = numpy.argsort(scores)[::-1]
	sentences = [text[a] for a in sorted_args[:k]]
	print 'QUERY: ' + query
	print 'NEAREST: '
	for i, s in enumerate(sentences):
		print s, sorted_args[i]


def word_features(table):
	"""
	Extract word features into a normalized matrix
	"""
	features = numpy.zeros((len(table), 620), dtype='float32')
	keys = table.keys()
	for i in range(len(table)):
		f = table[keys[i]]
		features[i] = f / norm(f)
	return features


def nn_words(table, wordvecs, query, k=10):
	"""
	Get the nearest neighbour words
	"""
	keys = table.keys()
	qf = table[query]
	scores = numpy.dot(qf, wordvecs.T).flatten()
	sorted_args = numpy.argsort(scores)[::-1]
	words = [keys[a] for a in sorted_args[:k]]
	print 'QUERY: ' + query
	print 'NEAREST: '
	for i, w in enumerate(words):
		print w


def _p(pp, name):
	"""
	make prefix-appended name
	"""
	return '%s_%s'%(pp, name)


def init_tparams(params):
	"""
	initialize Theano shared variables according to the initial parameters
	"""
	tparams = OrderedDict()
	for kk, pp in params.iteritems():
		tparams[kk] = theano.shared(params[kk], name=kk)
	return tparams


def load_params(path, params):
	"""
	load parameters
	"""
	pp = numpy.load(path)
	for kk, vv in params.iteritems():
		if kk not in pp:
			warnings.warn('%s is not in the archive'%kk)
			continue
		params[kk] = pp[kk]
	return params


# layers: 'name': ('parameter initializer', 'feedforward')
layers = {'gru': ('param_init_gru', 'gru_layer')}

def get_layer(name):
	fns = layers[name]
	return (eval(fns[0]), eval(fns[1]))


def init_params(options):
	"""
	initialize all parameters needed for the encoder
	"""
	params = OrderedDict()

	# embedding
	params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word'])

	# encoder: GRU
	params = get_layer(options['encoder'])[0](options, params, prefix='encoder',
											  nin=options['dim_word'], dim=options['dim'])
	return params


def init_params_bi(options):
	"""
	initialize all paramters needed for bidirectional encoder
	"""
	params = OrderedDict()

	# embedding
	params['Wemb'] = norm_weight(options['n_words_src'], options['dim_word'])

	# encoder: GRU
	params = get_layer(options['encoder'])[0](options, params, prefix='encoder',
											  nin=options['dim_word'], dim=options['dim'])
	params = get_layer(options['encoder'])[0](options, params, prefix='encoder_r',
											  nin=options['dim_word'], dim=options['dim'])
	return params


def build_encoder(tparams, options):
	"""
	build an encoder, given pre-computed word embeddings
	"""
	# word embedding (source)
	embedding = tensor.tensor3('embedding', dtype='float32')
	x_mask = tensor.matrix('x_mask', dtype='float32')

	# encoder
	proj = get_layer(options['encoder'])[1](tparams, embedding, options,
											prefix='encoder',
											mask=x_mask)
	ctx = proj[0][-1]

	return embedding, x_mask, ctx


def build_encoder_bi(tparams, options):
	"""
	build bidirectional encoder, given pre-computed word embeddings
	"""
	# word embedding (source)
	embedding = tensor.tensor3('embedding', dtype='float32')
	embeddingr = embedding[::-1]
	x_mask = tensor.matrix('x_mask', dtype='float32')
	xr_mask = x_mask[::-1]

	# encoder
	proj = get_layer(options['encoder'])[1](tparams, embedding, options,
											prefix='encoder',
											mask=x_mask)
	projr = get_layer(options['encoder'])[1](tparams, embeddingr, options,
											 prefix='encoder_r',
											 mask=xr_mask)

	ctx = tensor.concatenate([proj[0][-1], projr[0][-1]], axis=1)

	return embedding, x_mask, ctx


# some utilities
def ortho_weight(ndim):
	W = numpy.random.randn(ndim, ndim)
	u, s, v = numpy.linalg.svd(W)
	return u.astype('float32')


def norm_weight(nin,nout=None, scale=0.1, ortho=True):
	if nout == None:
		nout = nin
	if nout == nin and ortho:
		W = ortho_weight(nin)
	else:
		W = numpy.random.uniform(low=-scale, high=scale, size=(nin, nout))
	return W.astype('float32')


def param_init_gru(options, params, prefix='gru', nin=None, dim=None):
	"""
	parameter init for GRU
	"""
	if nin == None:
		nin = options['dim_proj']
	if dim == None:
		dim = options['dim_proj']
	W = numpy.concatenate([norm_weight(nin,dim),
						   norm_weight(nin,dim)], axis=1)
	params[_p(prefix,'W')] = W
	params[_p(prefix,'b')] = numpy.zeros((2 * dim,)).astype('float32')
	U = numpy.concatenate([ortho_weight(dim),
						   ortho_weight(dim)], axis=1)
	params[_p(prefix,'U')] = U

	Wx = norm_weight(nin, dim)
	params[_p(prefix,'Wx')] = Wx
	Ux = ortho_weight(dim)
	params[_p(prefix,'Ux')] = Ux
	params[_p(prefix,'bx')] = numpy.zeros((dim,)).astype('float32')

	return params


def gru_layer(tparams, state_below, options, prefix='gru', mask=None, **kwargs):
	"""
	Forward pass through GRU layer
	"""
	nsteps = state_below.shape[0]
	if state_below.ndim == 3:
		n_samples = state_below.shape[1]
	else:
		n_samples = 1

	dim = tparams[_p(prefix,'Ux')].shape[1]

	if mask == None:
		mask = tensor.alloc(1., state_below.shape[0], 1)

	def _slice(_x, n, dim):
		if _x.ndim == 3:
			return _x[:, :, n*dim:(n+1)*dim]
		return _x[:, n*dim:(n+1)*dim]

	state_below_ = tensor.dot(state_below, tparams[_p(prefix, 'W')]) + tparams[_p(prefix, 'b')]
	state_belowx = tensor.dot(state_below, tparams[_p(prefix, 'Wx')]) + tparams[_p(prefix, 'bx')]
	U = tparams[_p(prefix, 'U')]
	Ux = tparams[_p(prefix, 'Ux')]

	def _step_slice(m_, x_, xx_, h_, U, Ux):
		preact = tensor.dot(h_, U)
		preact += x_

		r = tensor.nnet.sigmoid(_slice(preact, 0, dim))
		u = tensor.nnet.sigmoid(_slice(preact, 1, dim))

		preactx = tensor.dot(h_, Ux)
		preactx = preactx * r
		preactx = preactx + xx_

		h = tensor.tanh(preactx)

		h = u * h_ + (1. - u) * h
		h = m_[:,None] * h + (1. - m_)[:,None] * h_

		return h

	seqs = [mask, state_below_, state_belowx]
	_step = _step_slice

	rval, updates = theano.scan(_step,
								sequences=seqs,
								outputs_info = [tensor.alloc(0., n_samples, dim)],
								non_sequences = [tparams[_p(prefix, 'U')],
												 tparams[_p(prefix, 'Ux')]],
								name=_p(prefix, '_layers'),
								n_steps=nsteps,
								profile=profile,
								strict=True)
	rval = [rval]
	return rval