Showing 30 changed files with 392,814 additions and 0 deletions.
@@ -0,0 +1,16 @@
#+OPTIONS: ^:nil
* Open-Ended Video-QA

* Unifying the Video and Question Attentions for Open-Ended Video Question Answering

* Dataset
- [[./dataset/file_map.tsv][file_map]]: contains the URLs of the videos
- [[./dataset/QA.tsv][QA]]: contains the question-answer pairs

* Code
** Contains the implementations of the methods described in the paper

* Dependency
- [[https://github.com/Theano][Theano]]
- [[https://github.com/mila-udem/blocks][Blocks]]
- Python >= 3.4
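The Dataset section above lists two TSV files. Below is a minimal loading sketch; the exact column layout (a video identifier plus URL in file_map.tsv, and a video identifier with question and answer text in QA.tsv) is an assumption for illustration, not something documented in this commit.

# Hypothetical loader for the dataset files; the column layout is assumed, not documented.
import csv

def load_tsv(path):
    with open(path, newline='', encoding='utf-8') as f:
        return [row for row in csv.reader(f, delimiter='\t')]

file_map = load_tsv('dataset/file_map.tsv')  # assumed rows: video id, video URL
qa_pairs = load_tsv('dataset/QA.tsv')        # assumed rows: video id, question, answer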
@@ -0,0 +1,54 @@
import theano
import theano.tensor as T
from blocks.bricks import Linear, Softmax
from blocks.initialization import Constant, IsotropicGaussian
from blocks.bricks.recurrent import GatedRecurrent, LSTM
from theano.tensor.nnet import relu
from theano.tensor.nnet.nnet import sigmoid


class seqDecoder:
    def __init__(self, feature_dim, memory_dim, fc1_dim, fc2_dim):
        # projects the output of GRU_A into the (4 x memory_dim) input expected by GRU_B
        self.W = Linear(input_dim=feature_dim,
                        output_dim=memory_dim * 4,
                        weights_init=IsotropicGaussian(0.01),
                        biases_init=Constant(0),
                        use_bias=False,
                        name='seqDecoder_W')
        # despite the GRU_* names, both recurrent bricks are LSTMs
        self.GRU_A = LSTM(feature_dim,
                          name='seqDecoder_A',
                          weights_init=IsotropicGaussian(0.01),
                          biases_init=Constant(0))
        self.GRU_B = LSTM(memory_dim,
                          name='seqDecoder_B',
                          weights_init=IsotropicGaussian(0.01),
                          biases_init=Constant(0))
        self.W.initialize()
        self.GRU_A.initialize()
        self.GRU_B.initialize()
        self.fc1 = Linear(input_dim=memory_dim,
                          output_dim=fc1_dim,
                          weights_init=IsotropicGaussian(0.01),
                          biases_init=Constant(0),
                          name='fc1')
        self.fc2 = Linear(input_dim=fc1_dim,
                          output_dim=fc2_dim,
                          weights_init=IsotropicGaussian(0.01),
                          biases_init=Constant(0),
                          name='fc2')

        self.fc1.initialize()
        self.fc2.initialize()

    # A: the initial state of GRU_A (the re-watcher encoding)
    # B: the initial state of GRU_B (the re-reader encoding)
    # padding: a constant tensor fed as the step-wise input of GRU_A
    def apply(self, output_length, A, B, padding):
        A_, garbage = self.GRU_A.apply(padding, states=A)
        WA_ = self.W.apply(A_)
        # output_length x batch_size x output_dim
        B_, garbage = self.GRU_B.apply(WA_, states=B)
        # batch_size x output_length x output_dim
        B_ = B_.swapaxes(0, 1)
        fc1_r = relu(self.fc1.apply(B_))
        fc2_r = relu(self.fc2.apply(fc1_r))
        return fc2_r
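A minimal usage sketch for seqDecoder on symbolic inputs; the dimensions, variable names, and the padding shape below are assumptions inferred from the apply signature and from how Blocks LSTMs consume inputs, not values taken from this repository.

# Hypothetical usage of seqDecoder; all dimensions are illustrative assumptions.
batch_size, output_length = 16, 10
feature_dim, memory_dim, fc1_dim, voc_size = 512, 512, 1024, 4000

decoder = seqDecoder(feature_dim, memory_dim, fc1_dim, voc_size)
A = T.matrix('A')  # batch_size x feature_dim, e.g. the re-watcher encoding
B = T.matrix('B')  # batch_size x memory_dim, e.g. the re-reader encoding
# constant step-wise input for GRU_A: output_length x batch_size x (4 * feature_dim)
padding = T.zeros((output_length, batch_size, 4 * feature_dim))
scores = decoder.apply(output_length, A, B, padding)  # batch_size x output_length x voc_size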
@@ -0,0 +1,38 @@
import theano
import theano.tensor as T
from blocks.bricks import Linear, Softmax
from blocks.initialization import Constant, IsotropicGaussian


class embeddingLayer:
    def __init__(self, word_dim, visual_dim, joint_dim):
        # linear maps that embed question words and video features into a joint space
        self.word_embed = Linear(word_dim,
                                 joint_dim,
                                 name='word_to_joint',
                                 weights_init=IsotropicGaussian(0.01),
                                 biases_init=Constant(0))
        self.visual_embed = Linear(visual_dim,
                                   joint_dim,
                                   name='visual_to_joint',
                                   weights_init=IsotropicGaussian(0.01),
                                   biases_init=Constant(0))
        self.word_embed.initialize()
        self.visual_embed.initialize()

    # words: batch_size x q x word_dim
    # video: batch_size x video_length x visual_dim
    # u1, u2: question summary vectors, concatenated and embedded with the word map
    def apply(self, words, video, u1, u2):
        w = self.word_embed.apply(words)
        v = self.visual_embed.apply(video)
        w = T.tanh(w)
        v = T.tanh(v)
        u = T.concatenate([u1, u2], axis=1)
        u = self.word_embed.apply(u)
        return w, v, u

    # same as apply, but for a question alone (no video input)
    def apply_sentence(self, words, u1, u2):
        w = self.word_embed.apply(words)
        w = T.tanh(w)
        u = T.concatenate([u1, u2], axis=1)
        u = self.word_embed.apply(u)
        return w, u
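A minimal sketch of calling embeddingLayer on symbolic tensors; the dimensions, and the assumption that u1 and u2 each carry half of word_dim, are illustrative and inferred from the concatenation in apply.

# Hypothetical usage of embeddingLayer; shapes are illustrative assumptions.
word_dim, visual_dim, joint_dim = 512, 1024, 512
embed = embeddingLayer(word_dim, visual_dim, joint_dim)

words = T.tensor3('words')  # batch_size x q x word_dim
video = T.tensor3('video')  # batch_size x video_length x visual_dim
u1 = T.matrix('u1')         # batch_size x (word_dim // 2), assumed
u2 = T.matrix('u2')         # batch_size x (word_dim // 2), assumed
w, v, u = embed.apply(words, video, u1, u2)  # each mapped into the joint space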
@@ -0,0 +1,97 @@
import theano
import theano.tensor as T
from blocks.bricks import Linear, Softmax
from blocks.initialization import Constant
from blocks.bricks import recurrent
from blocks.bricks.recurrent import LSTM, Bidirectional
from gif_encoder import *
from rereader_seq import *
from question_encoder import *
from rewatcher_seq import *
from embedding_layer import *
from blocks.bricks.cost import CategoricalCrossEntropy
from theano.tensor.extra_ops import to_one_hot
from decoder import *


class forgettable:
    def __init__(self, batch_size, output_length,
                 visual_dim, word_dim,
                 visual_feature_dim,
                 question_feature_dim,
                 joint_dim,
                 memory_dim,
                 output_dim,
                 fc1_dim,
                 fc2_dim,
                 voc_size):
        # the video encoder
        self.video_encoder = visualEncoder(
            visual_dim,
            visual_feature_dim)
        # the question encoder
        self.sentence_encoder = questionEncoder(
            word_dim,
            question_feature_dim)
        # embeds question and video features into the joint space
        self.toJoint = embeddingLayer(
            2 * question_feature_dim,
            2 * visual_feature_dim,
            joint_dim)
        # the re-watching attention over the video
        self.rewatcher = impatientLayer(
            joint_dim,
            memory_dim,
            output_dim)
        # the re-reading attention over the question
        self.rereader = iwLayer(
            joint_dim,
            memory_dim,
            output_dim)
        # the answer decoder
        self.seq_gen = seqDecoder(
            joint_dim,
            output_dim,
            fc1_dim,
            fc2_dim)
        self.softmax_layer = Softmax()
        self.bs = batch_size
        self.output_length = output_length
        self.voc_size = voc_size

    def build_model(self, frame, q, q_rev, mask, maskMat, mask01, padding):
        bs = self.bs
        # visual_dim -> visual feature dim
        video_embedding = self.video_encoder.apply(frame)
        # word_dim -> question feature dim
        question_embedding, u1, u2 = self.sentence_encoder.apply(q, q_rev, mask, bs)
        # -> joint_dim
        questionJoint, videoJoint, u = self.toJoint.apply(words=question_embedding,
                                                          video=video_embedding,
                                                          u1=u1,
                                                          u2=u2)
        # bs x joint_dim, bs x output_dim
        r_q, seq_r_q = self.rewatcher.apply(videoJoint, questionJoint,
                                            mask, bs)
        w_q, seq_w_q = self.rereader.apply(videoJoint, questionJoint,
                                           maskMat, bs)
        # decode the answer and apply a per-step softmax over the vocabulary
        fc_r = self.seq_gen.apply(self.output_length, r_q, w_q, padding)
        fc = fc_r.reshape((self.bs * self.output_length, self.voc_size))
        self.softmax_result = self.softmax_layer.apply(fc)
        self.pred = T.argmax(self.softmax_result, axis=1)
        self.pred = self.pred.reshape((self.bs, self.output_length))

    # groundtruth_: batch_size x output_length
    # mask_01: batch_size x output_length
    # this mask is a 0-1 matrix where 0 indicates the padding area of the answer
    def loss(self, groundtruth_, mask_01):
        mask = mask_01.flatten()
        gt = groundtruth_.flatten()

        # masked negative log-likelihood of the ground-truth words, averaged over the batch
        self.p = self.softmax_result[T.arange(self.bs * self.output_length),
                                     gt]
        self.cost_ = T.log(self.p + 1e-20)
        self.cost = -T.sum(self.cost_ * mask) / self.bs
        self.cost.name = 'softmax_cost'
        return self.cost

    def error(self, groundtruth, mask_01):
        # an answer counts as wrong if any non-padding word differs from the ground truth
        return T.neq(T.sum(T.neq(self.pred, groundtruth) * mask_01, axis=1), 0).sum() / self.bs

    def predict(self):
        return self.pred
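A hedged sketch of wiring the forgettable model into a Theano loss function; every dimension, tensor shape, and variable name below is an illustrative assumption (in particular the shapes of maskMat and padding, which are not documented in this commit).

# Hypothetical wiring of the model; all dimensions and shapes are assumptions.
bs, out_len, voc = 16, 10, 4000
model = forgettable(batch_size=bs, output_length=out_len,
                    visual_dim=4096, word_dim=300,
                    visual_feature_dim=256, question_feature_dim=256,
                    joint_dim=512, memory_dim=512, output_dim=512,
                    fc1_dim=1024, fc2_dim=voc, voc_size=voc)

frame = T.tensor3('frame')      # bs x video_length x visual_dim
q = T.tensor3('q')              # bs x q_len x word_dim
q_rev = T.tensor3('q_rev')      # the question with word order reversed
mask = T.matrix('mask')         # question mask
maskMat = T.tensor3('maskMat')  # mask used by the re-reader (shape assumed)
mask01 = T.matrix('mask01')     # 0-1 answer mask
padding = T.tensor3('padding')  # constant decoder input (shape assumed)
gt = T.imatrix('gt')            # bs x output_length ground-truth word indices

model.build_model(frame, q, q_rev, mask, maskMat, mask01, padding)
cost = model.loss(gt, mask01)
f_loss = theano.function([frame, q, q_rev, mask, maskMat, mask01, padding, gt],
                         cost, on_unused_input='ignore')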
@@ -0,0 +1,48 @@
import theano
import theano.tensor as T
from blocks.bricks import Linear, Softmax
from blocks.initialization import Constant, IsotropicGaussian
from blocks.bricks import recurrent
from blocks.bricks.recurrent import LSTM, Bidirectional


class visualEncoder:
    def __init__(self, visual_dim, hidden_dim):
        # bidirectional LSTM over the frame features
        self.forward_lstm = LSTM(hidden_dim,
                                 name='visual_forward_lstm',
                                 weights_init=IsotropicGaussian(0.01),
                                 biases_init=Constant(0))
        self.backward_lstm = LSTM(hidden_dim,
                                  name='visual_backward_lstm',
                                  weights_init=IsotropicGaussian(0.01),
                                  biases_init=Constant(0))
        # input-to-hidden projections (the LSTM bricks expect inputs of size 4 * hidden_dim)
        self.x_to_h_forward = Linear(visual_dim,
                                     hidden_dim * 4,
                                     name='visual_forward_x_to_h',
                                     weights_init=IsotropicGaussian(0.01),
                                     biases_init=Constant(0))
        self.x_to_h_backward = Linear(visual_dim,
                                      hidden_dim * 4,
                                      name='visual_backward_x_to_h',
                                      weights_init=IsotropicGaussian(0.01),
                                      biases_init=Constant(0))

        self.forward_lstm.initialize()
        self.backward_lstm.initialize()
        self.x_to_h_forward.initialize()
        self.x_to_h_backward.initialize()

    # fixed video_length
    # frames: batch_size x video_length x visual_dim
    def apply(self, frames):
        Wx = self.x_to_h_forward.apply(frames)
        Wx_r = self.x_to_h_backward.apply(frames[:, ::-1, :])
        # video_length x batch_size x (4 x hidden_dim)
        Wx = Wx.swapaxes(0, 1)
        Wx_r = Wx_r.swapaxes(0, 1)
        # nSteps x batch_size x dim
        hf, cf = self.forward_lstm.apply(Wx)
        hb, cb = self.backward_lstm.apply(Wx_r)
        # video_length x batch_size x (2 x hidden_dim)
        h = T.concatenate([hf, hb[::-1]], axis=2)
        # batch_size x video_length x (2 x hidden_dim)
        return h.swapaxes(0, 1)
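A minimal sketch of running visualEncoder on symbolic frame features; the dimensions are illustrative assumptions.

# Hypothetical usage of visualEncoder; dimensions are illustrative assumptions.
visual_dim, hidden_dim = 4096, 256
encoder = visualEncoder(visual_dim, hidden_dim)

frames = T.tensor3('frames')      # batch_size x video_length x visual_dim
h = encoder.apply(frames)         # batch_size x video_length x (2 * hidden_dim)
f = theano.function([frames], h)  # compile once for a quick shape check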