import argparse
import subprocess
import re
import aux
import os
import sys
from RCNNObjectExtractor import RCNNObjectExtractor
import string
import glob
from timeit import default_timer as timer
from nltk.stem.wordnet import WordNetLemmatizer


class SemiSupervisedVDR:
def __init__(self, args):
self.args = args
    '''
    Look through the dependency parse of this sentence to find the best
    candidates for the subject and object. The current implementation looks
    for the following dependency relations:
        nsubj -> Subject
        dobj  -> Object
    A matching token is stripped of punctuation, lowercased, and lemmatized
    (via normalizeToken) so there is a better chance of it firing the
    relevant object detector. If the sentence does not contain a verb, we
    back off to the first two tokens tagged NN. A worked example appears
    below.
    '''
    def extractSubjectObject(self, sentence):
        sub = None
        obj = None
        # First pass: take the nsubj token as the subject and the dobj token
        # as the object (CoNLL-X fields: word[2] = lemma, word[4] = POS tag,
        # word[7] = dependency relation)
        for word in sentence:
            if sub is None and word[7] == "nsubj":
                sub = self.normalizeToken(word[2])
            if obj is None and word[7] == "dobj":
                obj = self.normalizeToken(word[2])
        # The sentence did not contain a verb, so we need to back off to the
        # tokens tagged with NN (word[4]). Brute-force take the first two
        # distinct tokens tagged with NN.
        #
        # Example use: A big cow in a field. -> cow, field.
        if sub is None:
            for word in sentence:
                if word[4] == "NN":
                    sub = self.normalizeToken(word[2])
                    break
        if obj is None:
            for word in sentence:
                if word[4] == "NN":
                    proposal = self.normalizeToken(word[2])
                    if proposal != sub:
                        obj = proposal
                        break
        return sub, obj

    def normalizeToken(self, token):
        '''Strip punctuation from a token, lowercase it, and lemmatize it as a noun.'''
        token = token.translate(string.maketrans("", ""), string.punctuation)
        return WordNetLemmatizer().lemmatize(token.lower(), pos="n")
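    # A minimal worked example for extractSubjectObject (illustrative only;
    # the sentence and its parse are hypothetical, laid out in the CoNLL-X
    # field order assumed above):
    #
    #   sentence = [
    #       # ID   FORM     LEMMA   CPOS  POS   FEATS HEAD DEPREL
    #       ["1", "A",     "a",    "DT", "DT", "_", "2", "det",   "_", "_"],
    #       ["2", "man",   "man",  "NN", "NN", "_", "3", "nsubj", "_", "_"],
    #       ["3", "rides", "ride", "VB", "VB", "_", "0", "ROOT",  "_", "_"],
    #       ["4", "a",     "a",    "DT", "DT", "_", "5", "det",   "_", "_"],
    #       ["5", "bike",  "bike", "NN", "NN", "_", "3", "dobj",  "_", "_"],
    #   ]
    #   self.extractSubjectObject(sentence)  ->  ("man", "bike")
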
def createTrainingData(self):
# Prepare the RCNN Object Extractions in advance because this is memory
# intensive and should only happen once
start = timer()
# General arguments for the RCNNExtractor
rcnn_args = argparse.Namespace()
rcnn_args.clustersfile = self.args.clusters
rcnn_args.hdffile = self.args.images
rcnn_args.output = self.args.images
rcnn_args.verbose = self.args.verbose
rcnn_args.training = True
clusters = aux.load_clusters(rcnn_args.clustersfile)
print
print "Step 1: Gathering semi-supervised VDR training data"
training_files = glob.glob("%s/*.malt" % self.args.descriptions)
training_files.sort()
training_parsed = aux.load_conll("%s/descriptions-tagged-conll-parsed" % self.args.descriptions)
useful_detections = []
        for idx, training_image in enumerate(training_files):
            sub = None
            obj = None
            others = []
if self.args.verbose:
print "---------------------"
print "Processing example %s" % training_image
# Extract the most conservative subject and object from the first
# sentence description of the image
if self.args.vlt:
sentence = training_parsed[2*idx]
else:
sentence = training_parsed[idx]
sub, obj = self.extractSubjectObject(sentence)
            # Only try to create a semi-supervised training instance if we
            # found both a subject and an object
            image_path = training_image.replace(self.args.descriptions, "")
            if sub is not None and obj is not None:
if self.args.verbose:
print "Searching for %s and %s in the image" % (sub, obj)
detector = RCNNObjectExtractor(rcnn_args)
detector.args.sub = sub
detector.args.obj = obj
detector.args.others = others
                detector.args.imagefilename = "%s/%s" % (self.args.images, re.sub(r"\.malt$", ".hdf", image_path))
                detector_output = detector.process_hdf()
                if not detector_output:
if self.args.verbose:
print "Didn't find the desired objects, continuing to the next image\n"
continue
conll_file = self.xml2conll(rcnn_args.output, image_path, sub, obj, clusters)
useful_detections.append(conll_file)
else:
print "Didn't find a subject and object in the description, discarding"
        # Now we create target-parsed-train from the examples we managed to find in the data
handle = open("semi_supervised_list","w")
handle.write("cat %s > %s/target-parsed-train" % (aux.glist_to_str(useful_detections), self.args.images))
handle.close()
subprocess.check_call(["sh", "semi_supervised_list"])
os.remove("semi_supervised_list")
self.createAnnotationsFiles(useful_detections)
end = timer()
        print "Created semi-supervised VDR training data in %f seconds" % (end - start)
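
    # A pure-Python alternative to the shell `cat` step above (a sketch only,
    # assuming the detection files are plain text; not used by the pipeline
    # as written):
    #
    #   with open("%s/target-parsed-train" % self.args.images, "w") as out:
    #       for conll_file in useful_detections:
    #           with open(conll_file) as f:
    #               out.write(f.read())
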
'''
The annotations file is read by the VDR Parser to get at the object regions directly.
'''
def createAnnotationsFiles(self, detections):
target = "train"
ahandle = open("%s/annotations-%s" % (self.args.images, target), "w")
ihandle = open("%s/images-%s" % (self.args.images, target), "w")
        for f in detections:
            xmlname = re.sub(r"\.conll$", ".xml", f)
            jpgname = re.sub(r"-[1-5]\.semi\.conll$", ".jpg", f)
            # N.B. this prefix is machine-specific; it should probably become
            # a command-line argument
            prefix = "/export/scratch2/elliott/src/private"
            ahandle.write("%s/%s\n" % (prefix, xmlname))
            ihandle.write("%s/%s\n" % (prefix, jpgname))
ahandle.close()
ihandle.close()
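    # For a hypothetical detection file "out/dog-1.semi.conll", the lines
    # written are (with the machine-specific prefix above as <prefix>):
    #   annotations-train: <prefix>/out/dog-1.semi.xml
    #   images-train:      <prefix>/out/dog.jpg
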
    def clustered_label(self, name, clusters):
        '''Return the cluster label for name, or name itself if it is unclustered.'''
        return clusters.get(name, name)
    '''
    Reads the content of the XML file into memory and creates a semi-supervised
    VDR from the data. The object is attached to the subject (typically a
    person), labelled with the automatically calculated spatial relationship;
    all other objects are attached to the ROOT.
    '''
    def xml2conll(self, output_directory, filename, subject_label, object_label, clusters):
        semi_objects = aux.parse_xml("%s/%s" % (output_directory, re.sub(r"\.malt$", ".semi.xml", filename)))
        conll_path = "%s/%s" % (output_directory, re.sub(r"\.malt$", ".semi.conll", filename))
        handle = open(conll_path, "w")
        clustered_target = self.clustered_label(object_label, clusters)
        # First pass over the XML data to find the subject, because we will
        # want to attach objects to it
        subject_idx = None
        for idx, o in enumerate(semi_objects):
            if self.clustered_label(o[0], clusters) == subject_label or o[0] == subject_label:
                subject_idx = idx
        # We always write idx+1 because the CoNLL-X format is 1-indexed, not 0-indexed
        for idx, o in enumerate(semi_objects):
            s = None
            #cobject = self.clustered_label(o[0], clusters)
            cobject = o[0]
            # The subject (typically a person) gets a special ROOT attachment (HEAD = 0)
            if idx == subject_idx:
                sc = aux.centroid(o[1:4])
                s_centroid = "%s|%s|%s" % (sc[0], sc[1], o[5])
                s = "%d\t%s\t%s\tNN\tNN\t%s\t%s\t%s\t_\t_\n" % (idx + 1, o[0], o[0], s_centroid, 0, "-")
            elif cobject == clustered_target or cobject == object_label:
                if subject_idx is None:
                    # Unusual situation, but it needs to be covered: no subject
                    # was detected, so attach the object to the ROOT
                    oc = aux.centroid(o[1:4])
                    o_centroid = "%s|%s|%s" % (oc[0], oc[1], o[5])
                    s = "%d\t%s\t%s\tNN\tNN\t%s\t%s\t%s\t_\t_\n" % (idx + 1, o[0], o[0], o_centroid, 0, "-")
                else:
                    # Get the subject and object bounding box co-ordinates
                    s_bounds = "|".join(str(x) for x in semi_objects[subject_idx][1:6])
                    o_bounds = "|".join(str(x) for x in o[1:6])
                    # Get the centroids of the subject and the object
                    oc = aux.centroid(o[1:4])
                    sc = aux.centroid(semi_objects[subject_idx][1:4])
                    o_centroid = "%s|%s|%s" % (oc[0], oc[1], o[5])
                    s_centroid = "%s|%s|%s" % (sc[0], sc[1], semi_objects[subject_idx][5])
                    # Calculate the best spatial relationship from the image pixels
                    relationship = aux.calculate_spatial_relation(o_centroid, s_centroid, o_bounds, s_bounds)
                    s = "%d\t%s\t%s\tNN\tNN\t%s\t%s\t%s\t_\t_\n" % (idx + 1, o[0], o[0], o_centroid, subject_idx + 1, relationship)
            # This is a background object; attach it to the ROOT (HEAD = 0)
            elif cobject != subject_label:
                oc = aux.centroid(o[1:4])
                o_centroid = "%s|%s|%s" % (oc[0], oc[1], o[5])
                s = "%d\t%s\t%s\tNN\tNN\t%s\t%s\t%s\t_\t_\n" % (idx + 1, o[0], o[0], o_centroid, 0, "-")
            # A duplicate detection of the subject class matches none of the
            # branches above, so only write rows that were actually built
            if s is not None:
                handle.write(s)
handle.write("\n")
handle.close()
        if self.args.verbose:
            print "Written semi-supervised CoNLL-X file to %s\n" % conll_path
        return conll_path
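
    # An illustrative fragment of the emitted (tab-separated) CoNLL-X output;
    # the labels, centroid strings, and the "beside" relation are made up:
    #
    #   1   man    man    NN  NN  110|85|0.97    0  -       _  _
    #   2   bike   bike   NN  NN  140|160|0.91   1  beside  _  _
    #   3   tree   tree   NN  NN  300|120|0.88   0  -       _  _
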
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Creates semi-supervised Visual Dependency Representations that can be used to augment the gold-standard training data for the VDR Parser.")
parser.add_argument("--images", help="Path to the extracted visual data", required=True)
    parser.add_argument("--descriptions", help="Path to the directory containing the parsed description files", required=True)
    parser.add_argument("--clusters", help="Path to the clusters file", required=True)
parser.add_argument("--verbose", help="Should the output be verbose?", action="store_true")
parser.add_argument("--vlt", help="VLT data set?", action="store_true")
if len(sys.argv)==1:
parser.print_help()
sys.exit(1)
c = SemiSupervisedVDR(parser.parse_args())
c.createTrainingData()
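
# Example invocation (the paths are hypothetical):
#   python SemiSupervisedVDR.py --images data/hdf --descriptions data/descriptions \
#       --clusters data/clusters --verbose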