Commit 636bb23: init
xuehy committed Apr 14, 2017 (parent: 7efa8d2)
Showing 30 changed files with 392,814 additions and 0 deletions.
16 changes: 16 additions & 0 deletions README.org
@@ -0,0 +1,16 @@
#+OPTIONS: ^:nil
* Open-Ended Video-QA

Code and dataset for the paper /Unifying the Video and Question Attentions for Open-Ended Video Question Answering/.

* Dataset
- [[./dataset/file_map.tsv][file_map]]: the URLs of the videos
- [[./dataset/QA.tsv][QA]]: the question-answer pairs (see the loading sketch at the end of this README)

* Code
The =src= directory contains implementations of the methods in the paper.

* Dependencies
- [[https://github.com/Theano][Theano]]
- [[https://github.com/mila-udem/blocks][Blocks]]
- Python >= 3.4
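
To make the TSV layout concrete, here is a minimal loading sketch. The column layout named in the comment is an assumption and should be checked against the files themselves.

#+BEGIN_SRC python
import csv

# Iterate over the question-answer pairs; each row is one tab-separated record.
with open('dataset/QA.tsv', newline='') as f:
    for row in csv.reader(f, delimiter='\t'):
        print(row)  # e.g. [video_id, question, answer] -- an assumed layout
        break
#+END_SRC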
287,824 changes: 287,824 additions & 0 deletions dataset/QA.tsv


102,068 changes: 102,068 additions & 0 deletions dataset/file_map.tsv


54 changes: 54 additions & 0 deletions src/decoder.py
@@ -0,0 +1,54 @@
import theano
import theano.tensor as T
from blocks.bricks import Linear
from blocks.initialization import Constant, IsotropicGaussian
from blocks.bricks.recurrent import LSTM
from theano.tensor.nnet import relu


class seqDecoder:
    def __init__(self, feature_dim, memory_dim, fc1_dim, fc2_dim):
        # Projects the output of GRU_A to the 4 * memory_dim gate inputs
        # expected by GRU_B.
        self.W = Linear(input_dim=feature_dim,
                        output_dim=memory_dim * 4,
                        weights_init=IsotropicGaussian(0.01),
                        biases_init=Constant(0),
                        use_bias=False,
                        name='seqDecoder_W')
        # Despite their names, GRU_A and GRU_B are LSTM bricks.
        self.GRU_A = LSTM(feature_dim,
                          name='seqDecoder_A',
                          weights_init=IsotropicGaussian(0.01),
                          biases_init=Constant(0))
        self.GRU_B = LSTM(memory_dim,
                          name='seqDecoder_B',
                          weights_init=IsotropicGaussian(0.01),
                          biases_init=Constant(0))
        self.W.initialize()
        self.GRU_A.initialize()
        self.GRU_B.initialize()
        self.fc1 = Linear(input_dim=memory_dim,
                          output_dim=fc1_dim,
                          weights_init=IsotropicGaussian(0.01),
                          biases_init=Constant(0),
                          name='fc1')
        self.fc2 = Linear(input_dim=fc1_dim,
                          output_dim=fc2_dim,
                          weights_init=IsotropicGaussian(0.01),
                          biases_init=Constant(0),
                          name='fc2')

        self.fc1.initialize()
        self.fc2.initialize()

    # A: the initial state of GRU_A
    # B: the initial state of GRU_B
    # padding: a constant tensor fed as input at every decoding step,
    #          shaped output_length x batch_size x (4 * feature_dim)
    def apply(self, output_length, A, B, padding):
        # output_length x batch_size x feature_dim (cells are discarded)
        A_, _ = self.GRU_A.apply(padding, states=A)
        WA_ = self.W.apply(A_)
        # output_length x batch_size x memory_dim
        B_, _ = self.GRU_B.apply(WA_, states=B)
        # batch_size x output_length x memory_dim
        B_ = B_.swapaxes(0, 1)
        fc1_r = relu(self.fc1.apply(B_))
        fc2_r = relu(self.fc2.apply(fc1_r))
        return fc2_r
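
A minimal usage sketch for seqDecoder, with hypothetical dimensions. The decoder runs for as many steps as the padding tensor provides, seeding the two LSTMs with A and B:

import theano
import theano.tensor as T
from decoder import seqDecoder

# Hypothetical dimensions for illustration only.
batch_size, feature_dim, memory_dim = 8, 256, 512
decoder = seqDecoder(feature_dim, memory_dim, fc1_dim=1024, fc2_dim=4000)

A = T.matrix('A')               # batch_size x feature_dim
B = T.matrix('B')               # batch_size x memory_dim
padding = T.tensor3('padding')  # output_length x batch_size x (4 * feature_dim)

# output: batch_size x output_length x fc2_dim
out = decoder.apply(10, A, B, padding)
f = theano.function([A, B, padding], out)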
38 changes: 38 additions & 0 deletions src/embedding_layer.py
@@ -0,0 +1,38 @@
import theano
import theano.tensor as T
from blocks.bricks import Linear
from blocks.initialization import Constant, IsotropicGaussian


class embeddingLayer:
    def __init__(self, word_dim, visual_dim, joint_dim):
        self.word_embed = Linear(word_dim,
                                 joint_dim,
                                 name='word_to_joint',
                                 weights_init=IsotropicGaussian(0.01),
                                 biases_init=Constant(0))
        self.visual_embed = Linear(visual_dim,
                                   joint_dim,
                                   name='visual_to_joint',
                                   weights_init=IsotropicGaussian(0.01),
                                   biases_init=Constant(0))
        self.word_embed.initialize()
        self.visual_embed.initialize()

    # words: batch_size x q x word_dim
    # video: batch_size x video_length x visual_dim
    # u1, u2: question summaries; their concatenation must have dimension word_dim
    def apply(self, words, video, u1, u2):
        w = self.word_embed.apply(words)
        v = self.visual_embed.apply(video)
        w = T.tanh(w)
        v = T.tanh(v)
        u = T.concatenate([u1, u2], axis=1)
        u = self.word_embed.apply(u)
        return w, v, u

    def apply_sentence(self, words, u1, u2):
        w = self.word_embed.apply(words)
        w = T.tanh(w)
        u = T.concatenate([u1, u2], axis=1)
        u = self.word_embed.apply(u)
        return w, u
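
A small shape sketch for embeddingLayer, again with hypothetical dimensions; note that u1 and u2 must concatenate to word_dim:

import theano
import theano.tensor as T
from embedding_layer import embeddingLayer

# Hypothetical dimensions; word_dim must equal dim(u1) + dim(u2).
word_dim, visual_dim, joint_dim = 600, 512, 400
layer = embeddingLayer(word_dim, visual_dim, joint_dim)

words = T.tensor3('words')  # batch_size x q x word_dim
video = T.tensor3('video')  # batch_size x video_length x visual_dim
u1 = T.matrix('u1')         # batch_size x 300
u2 = T.matrix('u2')         # batch_size x 300

w, v, u = layer.apply(words, video, u1, u2)
f = theano.function([words, video, u1, u2], [w, v, u])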
97 changes: 97 additions & 0 deletions src/forgettable.py
@@ -0,0 +1,97 @@
import theano
import theano.tensor as T
from blocks.bricks import Softmax
from gif_encoder import *
from rereader_seq import *
from question_encoder import *
from rewatcher_seq import *
from embedding_layer import *
from decoder import *


class forgettable:
    def __init__(self, batch_size, output_length,
                 visual_dim, word_dim,
                 visual_feature_dim,
                 question_feature_dim,
                 joint_dim,
                 memory_dim,
                 output_dim,
                 fc1_dim,
                 fc2_dim,
                 voc_size):
        # the video encoder
        self.video_encoder = visualEncoder(
            visual_dim,
            visual_feature_dim)
        self.sentence_encoder = questionEncoder(
            word_dim,
            question_feature_dim)
        self.toJoint = embeddingLayer(
            2 * question_feature_dim,
            2 * visual_feature_dim,
            joint_dim)
        self.rewatcher = impatientLayer(
            joint_dim,
            memory_dim,
            output_dim)
        self.rereader = iwLayer(
            joint_dim,
            memory_dim,
            output_dim)
        self.seq_gen = seqDecoder(
            joint_dim,
            output_dim,
            fc1_dim,
            fc2_dim)
        self.softmax_layer = Softmax()
        self.bs = batch_size
        self.output_length = output_length
        self.voc_size = voc_size

    def build_model(self, frame, q, q_rev, mask, maskMat, mask01, padding):
        bs = self.bs
        # visual_dim -> visual_feature_dim
        video_embedding = self.video_encoder.apply(frame)
        # word_dim -> question_feature_dim
        question_embedding, u1, u2 = self.sentence_encoder.apply(q, q_rev, mask, bs)
        # -> joint_dim
        questionJoint, videoJoint, u = self.toJoint.apply(words=question_embedding,
                                                          video=video_embedding,
                                                          u1=u1,
                                                          u2=u2)
        # r_q: bs x joint_dim, w_q: bs x output_dim
        r_q, seq_r_q = self.rewatcher.apply(videoJoint, questionJoint,
                                            mask, bs)
        w_q, seq_w_q = self.rereader.apply(videoJoint, questionJoint,
                                           maskMat, bs)
        fc_r = self.seq_gen.apply(self.output_length, r_q, w_q, padding)
        fc = fc_r.reshape((self.bs * self.output_length, self.voc_size))
        self.softmax_result = self.softmax_layer.apply(fc)
        self.pred = T.argmax(self.softmax_result, axis=1)
        self.pred = self.pred.reshape((self.bs, self.output_length))

    # groundtruth_: batch_size x output_length
    # mask_01: batch_size x output_length; a 0-1 matrix where 0 marks
    #          the padding positions of the answer
    def loss(self, groundtruth_, mask_01):
        mask = mask_01.flatten()
        gt = groundtruth_.flatten()

        # probability assigned to the ground-truth token at each position
        self.p = self.softmax_result[T.arange(self.bs * self.output_length),
                                     gt]
        self.cost_ = T.log(self.p + 1e-20)
        # masked negative log-likelihood, averaged over the batch
        self.cost = -T.sum(self.cost_ * mask) / self.bs
        self.cost.name = 'softmax_cost'
        return self.cost

    # counts a sequence as wrong if any unmasked token differs
    def error(self, groundtruth, mask_01):
        return T.neq(T.sum(T.neq(self.pred, groundtruth) * mask_01, axis=1), 0).sum() / self.bs

    def predict(self):
        return self.pred
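
A rough sketch of wiring forgettable into a training cost. All dimensions are hypothetical, and the exact shapes of mask, maskMat, and padding depend on question_encoder, rewatcher_seq, and rereader_seq, which are defined elsewhere in this commit:

import theano
import theano.tensor as T
from forgettable import forgettable

# Hypothetical sizes; fc2_dim must equal voc_size for the final reshape.
model = forgettable(batch_size=8, output_length=10,
                    visual_dim=4096, word_dim=600,
                    visual_feature_dim=256, question_feature_dim=300,
                    joint_dim=400, memory_dim=512, output_dim=400,
                    fc1_dim=1024, fc2_dim=4000, voc_size=4000)

frame = T.tensor3('frame')      # batch_size x video_length x visual_dim
q = T.tensor3('q')              # question word vectors, forward order
q_rev = T.tensor3('q_rev')      # question word vectors, reversed
mask = T.matrix('mask')         # question mask (assumed shape)
maskMat = T.tensor3('maskMat')  # attention mask (assumed shape)
mask01 = T.matrix('mask01')     # 0-1 mask over answer positions
padding = T.tensor3('padding')  # constant decoder input
gt = T.imatrix('gt')            # batch_size x output_length token ids

model.build_model(frame, q, q_rev, mask, maskMat, mask01, padding)
cost = model.loss(gt, mask01)   # masked NLL, ready for a Theano/Blocks optimizer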
48 changes: 48 additions & 0 deletions src/gif_encoder.py
@@ -0,0 +1,48 @@
import theano
import theano.tensor as T
from blocks.bricks import Linear
from blocks.initialization import Constant, IsotropicGaussian
from blocks.bricks.recurrent import LSTM


class visualEncoder:
    def __init__(self, visual_dim, hidden_dim):
        self.forward_lstm = LSTM(hidden_dim,
                                 name='visual_forward_lstm',
                                 weights_init=IsotropicGaussian(0.01),
                                 biases_init=Constant(0))
        self.backward_lstm = LSTM(hidden_dim,
                                  name='visual_backward_lstm',
                                  weights_init=IsotropicGaussian(0.01),
                                  biases_init=Constant(0))
        # input-to-hidden projections produce the 4 * hidden_dim gate inputs
        self.x_to_h_forward = Linear(visual_dim,
                                     hidden_dim * 4,
                                     name='visual_forward_x_to_h',
                                     weights_init=IsotropicGaussian(0.01),
                                     biases_init=Constant(0))
        self.x_to_h_backward = Linear(visual_dim,
                                      hidden_dim * 4,
                                      name='visual_backward_x_to_h',
                                      weights_init=IsotropicGaussian(0.01),
                                      biases_init=Constant(0))

        self.forward_lstm.initialize()
        self.backward_lstm.initialize()
        self.x_to_h_forward.initialize()
        self.x_to_h_backward.initialize()

    # fixed video_length
    # frames: batch_size x video_length x visual_dim
    def apply(self, frames):
        Wx = self.x_to_h_forward.apply(frames)
        # the backward pass consumes the frames in reverse order
        Wx_r = self.x_to_h_backward.apply(frames[:, ::-1, :])
        # video_length x batch_size x (4 * hidden_dim)
        Wx = Wx.swapaxes(0, 1)
        Wx_r = Wx_r.swapaxes(0, 1)
        # each: video_length x batch_size x hidden_dim
        hf, cf = self.forward_lstm.apply(Wx)
        hb, cb = self.backward_lstm.apply(Wx_r)
        # re-reverse the backward states and concatenate:
        # video_length x batch_size x (2 * hidden_dim)
        h = T.concatenate([hf, hb[::-1]], axis=2)
        # batch_size x video_length x (2 * hidden_dim)
        return h.swapaxes(0, 1)
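
A minimal sketch of running visualEncoder over a symbolic batch of per-frame features, with hypothetical dimensions:

import theano
import theano.tensor as T
from gif_encoder import visualEncoder

visual_dim, hidden_dim = 4096, 256  # hypothetical sizes
encoder = visualEncoder(visual_dim, hidden_dim)

frames = T.tensor3('frames')  # batch_size x video_length x visual_dim
h = encoder.apply(frames)     # batch_size x video_length x (2 * hidden_dim)
f = theano.function([frames], h)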