Showing 30 changed files with 392,814 additions and 0 deletions.
@@ -0,0 +1,16 @@
#+OPTIONS: ^:nil
* Open-Ended Video-QA

* Unifying the Video and Question Attentions for Open-Ended Video Question Answering

* Dataset
- [[./dataset/file_map.tsv][file_map]]: contains the URLs of the videos
- [[./dataset/QA.tsv][QA]]: contains the question-answer pairs

* Code
** Contains the implementations of the methods described in the paper

* Dependency
- [[https://github.com/Theano][Theano]]
- [[https://github.com/mila-udem/blocks][Blocks]]
- Python >= 3.4
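The Dataset section above lists two TSV files. Below is a minimal loading sketch; the exact column layout (a video identifier plus URL in file_map.tsv, and a video identifier with question and answer text in QA.tsv) is an assumption for illustration, not something documented in this commit.

# Hypothetical loader for the dataset files; the column layout is assumed, not documented.
import csv

def load_tsv(path):
    with open(path, newline='', encoding='utf-8') as f:
        return [row for row in csv.reader(f, delimiter='\t')]

file_map = load_tsv('dataset/file_map.tsv')  # assumed rows: video id, video URL
qa_pairs = load_tsv('dataset/QA.tsv')        # assumed rows: video id, question, answer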
@@ -0,0 +1,54 @@
import theano
import theano.tensor as T
from blocks.bricks import Linear, Softmax
from blocks.initialization import Constant, IsotropicGaussian
from blocks.bricks.recurrent import GatedRecurrent, LSTM
from theano.tensor.nnet import relu
from theano.tensor.nnet.nnet import sigmoid


class seqDecoder:
    def __init__(self, feature_dim, memory_dim, fc1_dim, fc2_dim):
        # projects the output of GRU_A into the (4 x memory_dim) input expected by GRU_B
        self.W = Linear(input_dim=feature_dim,
                        output_dim=memory_dim * 4,
                        weights_init=IsotropicGaussian(0.01),
                        biases_init=Constant(0),
                        use_bias=False,
                        name='seqDecoder_W')
        # despite the GRU_* names, both recurrent bricks are LSTMs
        self.GRU_A = LSTM(feature_dim,
                          name='seqDecoder_A',
                          weights_init=IsotropicGaussian(0.01),
                          biases_init=Constant(0))
        self.GRU_B = LSTM(memory_dim,
                          name='seqDecoder_B',
                          weights_init=IsotropicGaussian(0.01),
                          biases_init=Constant(0))
        self.W.initialize()
        self.GRU_A.initialize()
        self.GRU_B.initialize()
        self.fc1 = Linear(input_dim=memory_dim,
                          output_dim=fc1_dim,
                          weights_init=IsotropicGaussian(0.01),
                          biases_init=Constant(0),
                          name='fc1')
        self.fc2 = Linear(input_dim=fc1_dim,
                          output_dim=fc2_dim,
                          weights_init=IsotropicGaussian(0.01),
                          biases_init=Constant(0),
                          name='fc2')

        self.fc1.initialize()
        self.fc2.initialize()

    # A: the initial state of GRU_A (the re-watcher encoding)
    # B: the initial state of GRU_B (the re-reader encoding)
    # padding: a constant tensor fed as the step-wise input of GRU_A
    def apply(self, output_length, A, B, padding):
        A_, garbage = self.GRU_A.apply(padding, states=A)
        WA_ = self.W.apply(A_)
        # output_length x batch_size x output_dim
        B_, garbage = self.GRU_B.apply(WA_, states=B)
        # batch_size x output_length x output_dim
        B_ = B_.swapaxes(0, 1)
        fc1_r = relu(self.fc1.apply(B_))
        fc2_r = relu(self.fc2.apply(fc1_r))
        return fc2_r
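A minimal usage sketch for seqDecoder on symbolic inputs; the dimensions, variable names, and the padding shape below are assumptions inferred from the apply signature and from how Blocks LSTMs consume inputs, not values taken from this repository.

# Hypothetical usage of seqDecoder; all dimensions are illustrative assumptions.
batch_size, output_length = 16, 10
feature_dim, memory_dim, fc1_dim, voc_size = 512, 512, 1024, 4000

decoder = seqDecoder(feature_dim, memory_dim, fc1_dim, voc_size)
A = T.matrix('A')  # batch_size x feature_dim, e.g. the re-watcher encoding
B = T.matrix('B')  # batch_size x memory_dim, e.g. the re-reader encoding
# constant step-wise input for GRU_A: output_length x batch_size x (4 * feature_dim)
padding = T.zeros((output_length, batch_size, 4 * feature_dim))
scores = decoder.apply(output_length, A, B, padding)  # batch_size x output_length x voc_size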
@@ -0,0 +1,38 @@
import theano
import theano.tensor as T
from blocks.bricks import Linear, Softmax
from blocks.initialization import Constant, IsotropicGaussian


class embeddingLayer:
    def __init__(self, word_dim, visual_dim, joint_dim):
        # linear maps that embed question words and video features into a joint space
        self.word_embed = Linear(word_dim,
                                 joint_dim,
                                 name='word_to_joint',
                                 weights_init=IsotropicGaussian(0.01),
                                 biases_init=Constant(0))
        self.visual_embed = Linear(visual_dim,
                                   joint_dim,
                                   name='visual_to_joint',
                                   weights_init=IsotropicGaussian(0.01),
                                   biases_init=Constant(0))
        self.word_embed.initialize()
        self.visual_embed.initialize()

    # words: batch_size x q x word_dim
    # video: batch_size x video_length x visual_dim
    # u1, u2: question summary vectors, concatenated and embedded with the word map
    def apply(self, words, video, u1, u2):
        w = self.word_embed.apply(words)
        v = self.visual_embed.apply(video)
        w = T.tanh(w)
        v = T.tanh(v)
        u = T.concatenate([u1, u2], axis=1)
        u = self.word_embed.apply(u)
        return w, v, u

    # same as apply, but for a question alone (no video input)
    def apply_sentence(self, words, u1, u2):
        w = self.word_embed.apply(words)
        w = T.tanh(w)
        u = T.concatenate([u1, u2], axis=1)
        u = self.word_embed.apply(u)
        return w, u
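A minimal sketch of calling embeddingLayer on symbolic tensors; the dimensions, and the assumption that u1 and u2 each carry half of word_dim, are illustrative and inferred from the concatenation in apply.

# Hypothetical usage of embeddingLayer; shapes are illustrative assumptions.
word_dim, visual_dim, joint_dim = 512, 1024, 512
embed = embeddingLayer(word_dim, visual_dim, joint_dim)

words = T.tensor3('words')  # batch_size x q x word_dim
video = T.tensor3('video')  # batch_size x video_length x visual_dim
u1 = T.matrix('u1')         # batch_size x (word_dim // 2), assumed
u2 = T.matrix('u2')         # batch_size x (word_dim // 2), assumed
w, v, u = embed.apply(words, video, u1, u2)  # each mapped into the joint space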
@@ -0,0 +1,97 @@
import theano
import theano.tensor as T
from blocks.bricks import Linear, Softmax
from blocks.initialization import Constant
from blocks.bricks import recurrent
from blocks.bricks.recurrent import LSTM, Bidirectional
from gif_encoder import *
from rereader_seq import *
from question_encoder import *
from rewatcher_seq import *
from embedding_layer import *
from blocks.bricks.cost import CategoricalCrossEntropy
from theano.tensor.extra_ops import to_one_hot
from decoder import *


class forgettable:
    def __init__(self, batch_size, output_length,
                 visual_dim, word_dim,
                 visual_feature_dim,
                 question_feature_dim,
                 joint_dim,
                 memory_dim,
                 output_dim,
                 fc1_dim,
                 fc2_dim,
                 voc_size):
        # the video encoder
        self.video_encoder = visualEncoder(
            visual_dim,
            visual_feature_dim)
        # the question encoder
        self.sentence_encoder = questionEncoder(
            word_dim,
            question_feature_dim)
        # embeds question and video features into the joint space
        self.toJoint = embeddingLayer(
            2 * question_feature_dim,
            2 * visual_feature_dim,
            joint_dim)
        # the re-watching attention over the video
        self.rewatcher = impatientLayer(
            joint_dim,
            memory_dim,
            output_dim)
        # the re-reading attention over the question
        self.rereader = iwLayer(
            joint_dim,
            memory_dim,
            output_dim)
        # the answer decoder
        self.seq_gen = seqDecoder(
            joint_dim,
            output_dim,
            fc1_dim,
            fc2_dim)
        self.softmax_layer = Softmax()
        self.bs = batch_size
        self.output_length = output_length
        self.voc_size = voc_size

    def build_model(self, frame, q, q_rev, mask, maskMat, mask01, padding):
        bs = self.bs
        # visual_dim -> visual feature dim
        video_embedding = self.video_encoder.apply(frame)
        # word_dim -> question feature dim
        question_embedding, u1, u2 = self.sentence_encoder.apply(q, q_rev, mask, bs)
        # -> joint_dim
        questionJoint, videoJoint, u = self.toJoint.apply(words=question_embedding,
                                                          video=video_embedding,
                                                          u1=u1,
                                                          u2=u2)
        # bs x joint_dim, bs x output_dim
        r_q, seq_r_q = self.rewatcher.apply(videoJoint, questionJoint,
                                            mask, bs)
        w_q, seq_w_q = self.rereader.apply(videoJoint, questionJoint,
                                           maskMat, bs)
        # decode the answer and apply a per-step softmax over the vocabulary
        fc_r = self.seq_gen.apply(self.output_length, r_q, w_q, padding)
        fc = fc_r.reshape((self.bs * self.output_length, self.voc_size))
        self.softmax_result = self.softmax_layer.apply(fc)
        self.pred = T.argmax(self.softmax_result, axis=1)
        self.pred = self.pred.reshape((self.bs, self.output_length))

    # groundtruth_: batch_size x output_length
    # mask_01: batch_size x output_length
    # this mask is a 0-1 matrix where 0 indicates the padding area of the answer
    def loss(self, groundtruth_, mask_01):
        mask = mask_01.flatten()
        gt = groundtruth_.flatten()

        # masked negative log-likelihood of the ground-truth words, averaged over the batch
        self.p = self.softmax_result[T.arange(self.bs * self.output_length),
                                     gt]
        self.cost_ = T.log(self.p + 1e-20)
        self.cost = -T.sum(self.cost_ * mask) / self.bs
        self.cost.name = 'softmax_cost'
        return self.cost

    def error(self, groundtruth, mask_01):
        # an answer counts as wrong if any non-padding word differs from the ground truth
        return T.neq(T.sum(T.neq(self.pred, groundtruth) * mask_01, axis=1), 0).sum() / self.bs

    def predict(self):
        return self.pred
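A hedged sketch of wiring the forgettable model into a Theano loss function; every dimension, tensor shape, and variable name below is an illustrative assumption (in particular the shapes of maskMat and padding, which are not documented in this commit).

# Hypothetical wiring of the model; all dimensions and shapes are assumptions.
bs, out_len, voc = 16, 10, 4000
model = forgettable(batch_size=bs, output_length=out_len,
                    visual_dim=4096, word_dim=300,
                    visual_feature_dim=256, question_feature_dim=256,
                    joint_dim=512, memory_dim=512, output_dim=512,
                    fc1_dim=1024, fc2_dim=voc, voc_size=voc)

frame = T.tensor3('frame')      # bs x video_length x visual_dim
q = T.tensor3('q')              # bs x q_len x word_dim
q_rev = T.tensor3('q_rev')      # the question with word order reversed
mask = T.matrix('mask')         # question mask
maskMat = T.tensor3('maskMat')  # mask used by the re-reader (shape assumed)
mask01 = T.matrix('mask01')     # 0-1 answer mask
padding = T.tensor3('padding')  # constant decoder input (shape assumed)
gt = T.imatrix('gt')            # bs x output_length ground-truth word indices

model.build_model(frame, q, q_rev, mask, maskMat, mask01, padding)
cost = model.loss(gt, mask01)
f_loss = theano.function([frame, q, q_rev, mask, maskMat, mask01, padding, gt],
                         cost, on_unused_input='ignore')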
@@ -0,0 +1,48 @@
import theano
import theano.tensor as T
from blocks.bricks import Linear, Softmax
from blocks.initialization import Constant, IsotropicGaussian
from blocks.bricks import recurrent
from blocks.bricks.recurrent import LSTM, Bidirectional


class visualEncoder:
    def __init__(self, visual_dim, hidden_dim):
        # bidirectional LSTM over the frame features
        self.forward_lstm = LSTM(hidden_dim,
                                 name='visual_forward_lstm',
                                 weights_init=IsotropicGaussian(0.01),
                                 biases_init=Constant(0))
        self.backward_lstm = LSTM(hidden_dim,
                                  name='visual_backward_lstm',
                                  weights_init=IsotropicGaussian(0.01),
                                  biases_init=Constant(0))
        # input-to-hidden projections (the LSTM bricks expect inputs of size 4 * hidden_dim)
        self.x_to_h_forward = Linear(visual_dim,
                                     hidden_dim * 4,
                                     name='visual_forward_x_to_h',
                                     weights_init=IsotropicGaussian(0.01),
                                     biases_init=Constant(0))
        self.x_to_h_backward = Linear(visual_dim,
                                      hidden_dim * 4,
                                      name='visual_backward_x_to_h',
                                      weights_init=IsotropicGaussian(0.01),
                                      biases_init=Constant(0))

        self.forward_lstm.initialize()
        self.backward_lstm.initialize()
        self.x_to_h_forward.initialize()
        self.x_to_h_backward.initialize()

    # fixed video_length
    # frames: batch_size x video_length x visual_dim
    def apply(self, frames):
        Wx = self.x_to_h_forward.apply(frames)
        Wx_r = self.x_to_h_backward.apply(frames[:, ::-1, :])
        # video_length x batch_size x (4 x hidden_dim)
        Wx = Wx.swapaxes(0, 1)
        Wx_r = Wx_r.swapaxes(0, 1)
        # nSteps x batch_size x dim
        hf, cf = self.forward_lstm.apply(Wx)
        hb, cb = self.backward_lstm.apply(Wx_r)
        # video_length x batch_size x (2 x hidden_dim)
        h = T.concatenate([hf, hb[::-1]], axis=2)
        # batch_size x video_length x (2 x hidden_dim)
        return h.swapaxes(0, 1)
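A minimal sketch of running visualEncoder on symbolic frame features; the dimensions are illustrative assumptions.

# Hypothetical usage of visualEncoder; dimensions are illustrative assumptions.
visual_dim, hidden_dim = 4096, 256
encoder = visualEncoder(visual_dim, hidden_dim)

frames = T.tensor3('frames')      # batch_size x video_length x visual_dim
h = encoder.apply(frames)         # batch_size x video_length x (2 * hidden_dim)
f = theano.function([frames], h)  # compile once for a quick shape check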