forked from pierluigiferrari/ssd_keras
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathssd_output_decoder.py
530 lines (471 loc) · 42.3 KB
/
ssd_output_decoder.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
'''
Includes:
* Functions to decode and filter raw SSD model output. These are only needed if the
SSD model does not have a `DecodeDetections` layer.
* Functions to perform greedy non-maximum suppression
Copyright (C) 2018 Pierluigi Ferrari
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
'''
from __future__ import division
import numpy as np
from bounding_box_utils.bounding_box_utils import iou, convert_coordinates
def greedy_nms(y_pred_decoded, iou_threshold=0.45, coords='corners', border_pixels='half'):
'''
Perform greedy non-maximum suppression on the input boxes.
Greedy NMS works by selecting the box with the highest score and
removing all boxes around it that are too close to it measured by IoU-similarity.
Out of the boxes that are left over, once again the one with the highest
score is selected and so on, until no boxes with too much overlap are left.
Arguments:
y_pred_decoded (list): A batch of decoded predictions. For a given batch size `n` this
is a list of length `n` where each list element is a 2D Numpy array.
For a batch item with `k` predicted boxes this 2D Numpy array has
shape `(k, 6)`, where each row contains the coordinates of the respective
box in the format `[class_id, score, xmin, xmax, ymin, ymax]`.
Technically, the number of columns doesn't have to be 6, it can be
arbitrary as long as the first four elements of each row are
`xmin`, `xmax`, `ymin`, `ymax` (in this order) and the last element
is the score assigned to the prediction. Note that this function is
agnostic to the scale of the score or what it represents.
iou_threshold (float, optional): All boxes with a Jaccard similarity of
greater than `iou_threshold` with a locally maximal box will be removed
from the set of predictions, where 'maximal' refers to the box score.
coords (str, optional): The coordinate format of `y_pred_decoded`.
Can be one of the formats supported by `iou()`.
border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
to the boxes. If 'exclude', the border pixels do not belong to the boxes.
If 'half', then one of each of the two horizontal and vertical borders belong
to the boxex, but not the other.
Returns:
The predictions after removing non-maxima. The format is the same as the input format.
'''
y_pred_decoded_nms = []
for batch_item in y_pred_decoded: # For the labels of each batch item...
boxes_left = np.copy(batch_item)
maxima = [] # This is where we store the boxes that make it through the non-maximum suppression
while boxes_left.shape[0] > 0: # While there are still boxes left to compare...
maximum_index = np.argmax(boxes_left[:,1]) # ...get the index of the next box with the highest confidence...
maximum_box = np.copy(boxes_left[maximum_index]) # ...copy that box and...
maxima.append(maximum_box) # ...append it to `maxima` because we'll definitely keep it
boxes_left = np.delete(boxes_left, maximum_index, axis=0) # Now remove the maximum box from `boxes_left`
if boxes_left.shape[0] == 0: break # If there are no boxes left after this step, break. Otherwise...
similarities = iou(boxes_left[:,2:], maximum_box[2:], coords=coords, mode='element-wise', border_pixels=border_pixels) # ...compare (IoU) the other left over boxes to the maximum box...
boxes_left = boxes_left[similarities <= iou_threshold] # ...so that we can remove the ones that overlap too much with the maximum box
y_pred_decoded_nms.append(np.array(maxima))
return y_pred_decoded_nms
def _greedy_nms(predictions, iou_threshold=0.45, coords='corners', border_pixels='half'):
'''
The same greedy non-maximum suppression algorithm as above, but slightly modified for use as an internal
function for per-class NMS in `decode_detections()`.
'''
boxes_left = np.copy(predictions)
maxima = [] # This is where we store the boxes that make it through the non-maximum suppression
while boxes_left.shape[0] > 0: # While there are still boxes left to compare...
maximum_index = np.argmax(boxes_left[:,0]) # ...get the index of the next box with the highest confidence...
maximum_box = np.copy(boxes_left[maximum_index]) # ...copy that box and...
maxima.append(maximum_box) # ...append it to `maxima` because we'll definitely keep it
boxes_left = np.delete(boxes_left, maximum_index, axis=0) # Now remove the maximum box from `boxes_left`
if boxes_left.shape[0] == 0: break # If there are no boxes left after this step, break. Otherwise...
similarities = iou(boxes_left[:,1:], maximum_box[1:], coords=coords, mode='element-wise', border_pixels=border_pixels) # ...compare (IoU) the other left over boxes to the maximum box...
boxes_left = boxes_left[similarities <= iou_threshold] # ...so that we can remove the ones that overlap too much with the maximum box
return np.array(maxima)
def _greedy_nms2(predictions, iou_threshold=0.45, coords='corners', border_pixels='half'):
'''
The same greedy non-maximum suppression algorithm as above, but slightly modified for use as an internal
function in `decode_detections_fast()`.
'''
boxes_left = np.copy(predictions)
maxima = [] # This is where we store the boxes that make it through the non-maximum suppression
while boxes_left.shape[0] > 0: # While there are still boxes left to compare...
maximum_index = np.argmax(boxes_left[:,1]) # ...get the index of the next box with the highest confidence...
maximum_box = np.copy(boxes_left[maximum_index]) # ...copy that box and...
maxima.append(maximum_box) # ...append it to `maxima` because we'll definitely keep it
boxes_left = np.delete(boxes_left, maximum_index, axis=0) # Now remove the maximum box from `boxes_left`
if boxes_left.shape[0] == 0: break # If there are no boxes left after this step, break. Otherwise...
similarities = iou(boxes_left[:,2:], maximum_box[2:], coords=coords, mode='element-wise', border_pixels=border_pixels) # ...compare (IoU) the other left over boxes to the maximum box...
boxes_left = boxes_left[similarities <= iou_threshold] # ...so that we can remove the ones that overlap too much with the maximum box
return np.array(maxima)
def decode_detections(y_pred,
confidence_thresh=0.01,
iou_threshold=0.45,
top_k=200,
input_coords='centroids',
normalize_coords=True,
img_height=None,
img_width=None,
border_pixels='half'):
'''
Convert model prediction output back to a format that contains only the positive box predictions
(i.e. the same format that `SSDInputEncoder` takes as input).
After the decoding, two stages of prediction filtering are performed for each class individually:
First confidence thresholding, then greedy non-maximum suppression. The filtering results for all
classes are concatenated and the `top_k` overall highest confidence results constitute the final
predictions for a given batch item. This procedure follows the original Caffe implementation.
For a slightly different and more efficient alternative to decode raw model output that performs
non-maximum suppresion globally instead of per class, see `decode_detections_fast()` below.
Arguments:
y_pred (array): The prediction output of the SSD model, expected to be a Numpy array
of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)`, where `#boxes` is the total number of
boxes predicted by the model per image and the last axis contains
`[one-hot vector for the classes, 4 predicted coordinate offsets, 4 anchor box coordinates, 4 variances]`.
confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
positive class in order to be considered for the non-maximum suppression stage for the respective class.
A lower value will result in a larger part of the selection process being done by the non-maximum suppression
stage, while a larger value will result in a larger part of the selection process happening in the confidence
thresholding stage.
iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold`
with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
to the box score.
top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
non-maximum suppression stage.
input_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids'
for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format
`(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
coordinates. Requires `img_height` and `img_width` if set to `True`.
img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`.
img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`.
border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
to the boxes. If 'exclude', the border pixels do not belong to the boxes.
If 'half', then one of each of the two horizontal and vertical borders belong
to the boxex, but not the other.
Returns:
A python list of length `batch_size` where each list element represents the predicted boxes
for one image and contains a Numpy array of shape `(boxes, 6)` where each row is a box prediction for
a non-background class for the respective image in the format `[class_id, confidence, xmin, ymin, xmax, ymax]`.
'''
if normalize_coords and ((img_height is None) or (img_width is None)):
raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width))
# 1: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates
y_pred_decoded_raw = np.copy(y_pred[:,:,:-8]) # Slice out the classes and the four offsets, throw away the anchor coordinates and variances, resulting in a tensor of shape `[batch, n_boxes, n_classes + 4 coordinates]`
if input_coords == 'centroids':
y_pred_decoded_raw[:,:,[-2,-1]] = np.exp(y_pred_decoded_raw[:,:,[-2,-1]] * y_pred[:,:,[-2,-1]]) # exp(ln(w(pred)/w(anchor)) / w_variance * w_variance) == w(pred) / w(anchor), exp(ln(h(pred)/h(anchor)) / h_variance * h_variance) == h(pred) / h(anchor)
y_pred_decoded_raw[:,:,[-2,-1]] *= y_pred[:,:,[-6,-5]] # (w(pred) / w(anchor)) * w(anchor) == w(pred), (h(pred) / h(anchor)) * h(anchor) == h(pred)
y_pred_decoded_raw[:,:,[-4,-3]] *= y_pred[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] # (delta_cx(pred) / w(anchor) / cx_variance) * cx_variance * w(anchor) == delta_cx(pred), (delta_cy(pred) / h(anchor) / cy_variance) * cy_variance * h(anchor) == delta_cy(pred)
y_pred_decoded_raw[:,:,[-4,-3]] += y_pred[:,:,[-8,-7]] # delta_cx(pred) + cx(anchor) == cx(pred), delta_cy(pred) + cy(anchor) == cy(pred)
y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='centroids2corners')
elif input_coords == 'minmax':
y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
y_pred_decoded_raw[:,:,[-4,-3]] *= np.expand_dims(y_pred[:,:,-7] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
y_pred_decoded_raw[:,:,[-2,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-6], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='minmax2corners')
elif input_coords == 'corners':
y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
y_pred_decoded_raw[:,:,[-4,-2]] *= np.expand_dims(y_pred[:,:,-6] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
y_pred_decoded_raw[:,:,[-3,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-7], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
else:
raise ValueError("Unexpected value for `input_coords`. Supported input coordinate formats are 'minmax', 'corners' and 'centroids'.")
# 2: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that
if normalize_coords:
y_pred_decoded_raw[:,:,[-4,-2]] *= img_width # Convert xmin, xmax back to absolute coordinates
y_pred_decoded_raw[:,:,[-3,-1]] *= img_height # Convert ymin, ymax back to absolute coordinates
# 3: Apply confidence thresholding and non-maximum suppression per class
n_classes = y_pred_decoded_raw.shape[-1] - 4 # The number of classes is the length of the last axis minus the four box coordinates
y_pred_decoded = [] # Store the final predictions in this list
for batch_item in y_pred_decoded_raw: # `batch_item` has shape `[n_boxes, n_classes + 4 coords]`
pred = [] # Store the final predictions for this batch item here
for class_id in range(1, n_classes): # For each class except the background class (which has class ID 0)...
single_class = batch_item[:,[class_id, -4, -3, -2, -1]] # ...keep only the confidences for that class, making this an array of shape `[n_boxes, 5]` and...
threshold_met = single_class[single_class[:,0] > confidence_thresh] # ...keep only those boxes with a confidence above the set threshold.
if threshold_met.shape[0] > 0: # If any boxes made the threshold...
maxima = _greedy_nms(threshold_met, iou_threshold=iou_threshold, coords='corners', border_pixels=border_pixels) # ...perform NMS on them.
maxima_output = np.zeros((maxima.shape[0], maxima.shape[1] + 1)) # Expand the last dimension by one element to have room for the class ID. This is now an arrray of shape `[n_boxes, 6]`
maxima_output[:,0] = class_id # Write the class ID to the first column...
maxima_output[:,1:] = maxima # ...and write the maxima to the other columns...
pred.append(maxima_output) # ...and append the maxima for this class to the list of maxima for this batch item.
# Once we're through with all classes, keep only the `top_k` maxima with the highest scores
if pred: # If there are any predictions left after confidence-thresholding...
pred = np.concatenate(pred, axis=0)
if top_k != 'all' and pred.shape[0] > top_k: # If we have more than `top_k` results left at this point, otherwise there is nothing to filter,...
top_k_indices = np.argpartition(pred[:,1], kth=pred.shape[0]-top_k, axis=0)[pred.shape[0]-top_k:] # ...get the indices of the `top_k` highest-score maxima...
pred = pred[top_k_indices] # ...and keep only those entries of `pred`...
else:
pred = np.array(pred) # Even if empty, `pred` must become a Numpy array.
y_pred_decoded.append(pred) # ...and now that we're done, append the array of final predictions for this batch item to the output list
return y_pred_decoded
def decode_detections_fast(y_pred,
confidence_thresh=0.5,
iou_threshold=0.45,
top_k='all',
input_coords='centroids',
normalize_coords=True,
img_height=None,
img_width=None,
border_pixels='half'):
'''
Convert model prediction output back to a format that contains only the positive box predictions
(i.e. the same format that `enconde_y()` takes as input).
Optionally performs confidence thresholding and greedy non-maximum suppression after the decoding stage.
Note that the decoding procedure used here is not the same as the procedure used in the original Caffe implementation.
For each box, the procedure used here assigns the box's highest confidence as its predicted class. Then it removes
all boxes for which the highest confidence is the background class. This results in less work for the subsequent
non-maximum suppression, because the vast majority of the predictions will be filtered out just by the fact that
their highest confidence is for the background class. It is much more efficient than the procedure of the original
implementation, but the results may also differ.
Arguments:
y_pred (array): The prediction output of the SSD model, expected to be a Numpy array
of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)`, where `#boxes` is the total number of
boxes predicted by the model per image and the last axis contains
`[one-hot vector for the classes, 4 predicted coordinate offsets, 4 anchor box coordinates, 4 variances]`.
confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in any positive
class required for a given box to be considered a positive prediction. A lower value will result
in better recall, while a higher value will result in better precision. Do not use this parameter with the
goal to combat the inevitably many duplicates that an SSD will produce, the subsequent non-maximum suppression
stage will take care of those.
iou_threshold (float, optional): `None` or a float in [0,1]. If `None`, no non-maximum suppression will be
performed. If not `None`, greedy NMS will be performed after the confidence thresholding stage, meaning
all boxes with a Jaccard similarity of greater than `iou_threshold` with a locally maximal box will be removed
from the set of predictions, where 'maximal' refers to the box score.
top_k (int, optional): 'all' or an integer with number of highest scoring predictions to be kept for each batch item
after the non-maximum suppression stage. If 'all', all predictions left after the NMS stage will be kept.
input_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids'
for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format
`(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
coordinates. Requires `img_height` and `img_width` if set to `True`.
img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`.
img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`.
border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
to the boxes. If 'exclude', the border pixels do not belong to the boxes.
If 'half', then one of each of the two horizontal and vertical borders belong
to the boxex, but not the other.
Returns:
A python list of length `batch_size` where each list element represents the predicted boxes
for one image and contains a Numpy array of shape `(boxes, 6)` where each row is a box prediction for
a non-background class for the respective image in the format `[class_id, confidence, xmin, xmax, ymin, ymax]`.
'''
if normalize_coords and ((img_height is None) or (img_width is None)):
raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width))
# 1: Convert the classes from one-hot encoding to their class ID
y_pred_converted = np.copy(y_pred[:,:,-14:-8]) # Slice out the four offset predictions plus two elements whereto we'll write the class IDs and confidences in the next step
y_pred_converted[:,:,0] = np.argmax(y_pred[:,:,:-12], axis=-1) # The indices of the highest confidence values in the one-hot class vectors are the class ID
y_pred_converted[:,:,1] = np.amax(y_pred[:,:,:-12], axis=-1) # Store the confidence values themselves, too
# 2: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates
if input_coords == 'centroids':
y_pred_converted[:,:,[4,5]] = np.exp(y_pred_converted[:,:,[4,5]] * y_pred[:,:,[-2,-1]]) # exp(ln(w(pred)/w(anchor)) / w_variance * w_variance) == w(pred) / w(anchor), exp(ln(h(pred)/h(anchor)) / h_variance * h_variance) == h(pred) / h(anchor)
y_pred_converted[:,:,[4,5]] *= y_pred[:,:,[-6,-5]] # (w(pred) / w(anchor)) * w(anchor) == w(pred), (h(pred) / h(anchor)) * h(anchor) == h(pred)
y_pred_converted[:,:,[2,3]] *= y_pred[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] # (delta_cx(pred) / w(anchor) / cx_variance) * cx_variance * w(anchor) == delta_cx(pred), (delta_cy(pred) / h(anchor) / cy_variance) * cy_variance * h(anchor) == delta_cy(pred)
y_pred_converted[:,:,[2,3]] += y_pred[:,:,[-8,-7]] # delta_cx(pred) + cx(anchor) == cx(pred), delta_cy(pred) + cy(anchor) == cy(pred)
y_pred_converted = convert_coordinates(y_pred_converted, start_index=-4, conversion='centroids2corners')
elif input_coords == 'minmax':
y_pred_converted[:,:,2:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
y_pred_converted[:,:,[2,3]] *= np.expand_dims(y_pred[:,:,-7] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
y_pred_converted[:,:,[4,5]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-6], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
y_pred_converted[:,:,2:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
y_pred_converted = convert_coordinates(y_pred_converted, start_index=-4, conversion='minmax2corners')
elif input_coords == 'corners':
y_pred_converted[:,:,2:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
y_pred_converted[:,:,[2,4]] *= np.expand_dims(y_pred[:,:,-6] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
y_pred_converted[:,:,[3,5]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-7], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
y_pred_converted[:,:,2:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
else:
raise ValueError("Unexpected value for `coords`. Supported values are 'minmax', 'corners' and 'centroids'.")
# 3: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that
if normalize_coords:
y_pred_converted[:,:,[2,4]] *= img_width # Convert xmin, xmax back to absolute coordinates
y_pred_converted[:,:,[3,5]] *= img_height # Convert ymin, ymax back to absolute coordinates
# 4: Decode our huge `(batch, #boxes, 6)` tensor into a list of length `batch` where each list entry is an array containing only the positive predictions
y_pred_decoded = []
for batch_item in y_pred_converted: # For each image in the batch...
boxes = batch_item[np.nonzero(batch_item[:,0])] # ...get all boxes that don't belong to the background class,...
boxes = boxes[boxes[:,1] >= confidence_thresh] # ...then filter out those positive boxes for which the prediction confidence is too low and after that...
if iou_threshold: # ...if an IoU threshold is set...
boxes = _greedy_nms2(boxes, iou_threshold=iou_threshold, coords='corners', border_pixels=border_pixels) # ...perform NMS on the remaining boxes.
if top_k != 'all' and boxes.shape[0] > top_k: # If we have more than `top_k` results left at this point...
top_k_indices = np.argpartition(boxes[:,1], kth=boxes.shape[0]-top_k, axis=0)[boxes.shape[0]-top_k:] # ...get the indices of the `top_k` highest-scoring boxes...
boxes = boxes[top_k_indices] # ...and keep only those boxes...
y_pred_decoded.append(boxes) # ...and now that we're done, append the array of final predictions for this batch item to the output list
return y_pred_decoded
################################################################################################
# Debugging tools, not relevant for normal use
################################################################################################
# The functions below are for debugging, so you won't normally need them. That is,
# unless you need to debug your model, of course.
def decode_detections_debug(y_pred,
confidence_thresh=0.01,
iou_threshold=0.45,
top_k=200,
input_coords='centroids',
normalize_coords=True,
img_height=None,
img_width=None,
variance_encoded_in_target=False,
border_pixels='half'):
'''
This decoder performs the same processing as `decode_detections()`, but the output format for each left-over
predicted box is `[box_id, class_id, confidence, xmin, ymin, xmax, ymax]`.
That is, in addition to the usual data, each predicted box has the internal index of that box within
the model (`box_id`) prepended to it. This allows you to know exactly which part of the model made a given
box prediction; in particular, it allows you to know which predictor layer made a given prediction.
This can be useful for debugging.
Arguments:
y_pred (array): The prediction output of the SSD model, expected to be a Numpy array
of shape `(batch_size, #boxes, #classes + 4 + 4 + 4)`, where `#boxes` is the total number of
boxes predicted by the model per image and the last axis contains
`[one-hot vector for the classes, 4 predicted coordinate offsets, 4 anchor box coordinates, 4 variances]`.
confidence_thresh (float, optional): A float in [0,1), the minimum classification confidence in a specific
positive class in order to be considered for the non-maximum suppression stage for the respective class.
A lower value will result in a larger part of the selection process being done by the non-maximum suppression
stage, while a larger value will result in a larger part of the selection process happening in the confidence
thresholding stage.
iou_threshold (float, optional): A float in [0,1]. All boxes with a Jaccard similarity of greater than `iou_threshold`
with a locally maximal box will be removed from the set of predictions for a given class, where 'maximal' refers
to the box score.
top_k (int, optional): The number of highest scoring predictions to be kept for each batch item after the
non-maximum suppression stage.
input_coords (str, optional): The box coordinate format that the model outputs. Can be either 'centroids'
for the format `(cx, cy, w, h)` (box center coordinates, width, and height), 'minmax' for the format
`(xmin, xmax, ymin, ymax)`, or 'corners' for the format `(xmin, ymin, xmax, ymax)`.
normalize_coords (bool, optional): Set to `True` if the model outputs relative coordinates (i.e. coordinates in [0,1])
and you wish to transform these relative coordinates back to absolute coordinates. If the model outputs
relative coordinates, but you do not want to convert them back to absolute coordinates, set this to `False`.
Do not set this to `True` if the model already outputs absolute coordinates, as that would result in incorrect
coordinates. Requires `img_height` and `img_width` if set to `True`.
img_height (int, optional): The height of the input images. Only needed if `normalize_coords` is `True`.
img_width (int, optional): The width of the input images. Only needed if `normalize_coords` is `True`.
border_pixels (str, optional): How to treat the border pixels of the bounding boxes.
Can be 'include', 'exclude', or 'half'. If 'include', the border pixels belong
to the boxes. If 'exclude', the border pixels do not belong to the boxes.
If 'half', then one of each of the two horizontal and vertical borders belong
to the boxex, but not the other.
Returns:
A python list of length `batch_size` where each list element represents the predicted boxes
for one image and contains a Numpy array of shape `(boxes, 7)` where each row is a box prediction for
a non-background class for the respective image in the format `[box_id, class_id, confidence, xmin, ymin, xmax, ymax]`.
'''
if normalize_coords and ((img_height is None) or (img_width is None)):
raise ValueError("If relative box coordinates are supposed to be converted to absolute coordinates, the decoder needs the image size in order to decode the predictions, but `img_height == {}` and `img_width == {}`".format(img_height, img_width))
# 1: Convert the box coordinates from the predicted anchor box offsets to predicted absolute coordinates
y_pred_decoded_raw = np.copy(y_pred[:,:,:-8]) # Slice out the classes and the four offsets, throw away the anchor coordinates and variances, resulting in a tensor of shape `[batch, n_boxes, n_classes + 4 coordinates]`
if input_coords == 'centroids':
if variance_encoded_in_target:
# Decode the predicted box center x and y coordinates.
y_pred_decoded_raw[:,:,[-4,-3]] = y_pred_decoded_raw[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] + y_pred[:,:,[-8,-7]]
# Decode the predicted box width and heigt.
y_pred_decoded_raw[:,:,[-2,-1]] = np.exp(y_pred_decoded_raw[:,:,[-2,-1]]) * y_pred[:,:,[-6,-5]]
else:
# Decode the predicted box center x and y coordinates.
y_pred_decoded_raw[:,:,[-4,-3]] = y_pred_decoded_raw[:,:,[-4,-3]] * y_pred[:,:,[-6,-5]] * y_pred[:,:,[-4,-3]] + y_pred[:,:,[-8,-7]]
# Decode the predicted box width and heigt.
y_pred_decoded_raw[:,:,[-2,-1]] = np.exp(y_pred_decoded_raw[:,:,[-2,-1]] * y_pred[:,:,[-2,-1]]) * y_pred[:,:,[-6,-5]]
y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='centroids2corners')
elif input_coords == 'minmax':
y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
y_pred_decoded_raw[:,:,[-4,-3]] *= np.expand_dims(y_pred[:,:,-7] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
y_pred_decoded_raw[:,:,[-2,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-6], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
y_pred_decoded_raw = convert_coordinates(y_pred_decoded_raw, start_index=-4, conversion='minmax2corners')
elif input_coords == 'corners':
y_pred_decoded_raw[:,:,-4:] *= y_pred[:,:,-4:] # delta(pred) / size(anchor) / variance * variance == delta(pred) / size(anchor) for all four coordinates, where 'size' refers to w or h, respectively
y_pred_decoded_raw[:,:,[-4,-2]] *= np.expand_dims(y_pred[:,:,-6] - y_pred[:,:,-8], axis=-1) # delta_xmin(pred) / w(anchor) * w(anchor) == delta_xmin(pred), delta_xmax(pred) / w(anchor) * w(anchor) == delta_xmax(pred)
y_pred_decoded_raw[:,:,[-3,-1]] *= np.expand_dims(y_pred[:,:,-5] - y_pred[:,:,-7], axis=-1) # delta_ymin(pred) / h(anchor) * h(anchor) == delta_ymin(pred), delta_ymax(pred) / h(anchor) * h(anchor) == delta_ymax(pred)
y_pred_decoded_raw[:,:,-4:] += y_pred[:,:,-8:-4] # delta(pred) + anchor == pred for all four coordinates
else:
raise ValueError("Unexpected value for `input_coords`. Supported input coordinate formats are 'minmax', 'corners' and 'centroids'.")
# 2: If the model predicts normalized box coordinates and they are supposed to be converted back to absolute coordinates, do that
if normalize_coords:
y_pred_decoded_raw[:,:,[-4,-2]] *= img_width # Convert xmin, xmax back to absolute coordinates
y_pred_decoded_raw[:,:,[-3,-1]] *= img_height # Convert ymin, ymax back to absolute coordinates
# 3: For each batch item, prepend each box's internal index to its coordinates.
y_pred_decoded_raw2 = np.zeros((y_pred_decoded_raw.shape[0], y_pred_decoded_raw.shape[1], y_pred_decoded_raw.shape[2] + 1)) # Expand the last axis by one.
y_pred_decoded_raw2[:,:,1:] = y_pred_decoded_raw
y_pred_decoded_raw2[:,:,0] = np.arange(y_pred_decoded_raw.shape[1]) # Put the box indices as the first element for each box via broadcasting.
y_pred_decoded_raw = y_pred_decoded_raw2
# 4: Apply confidence thresholding and non-maximum suppression per class
n_classes = y_pred_decoded_raw.shape[-1] - 5 # The number of classes is the length of the last axis minus the four box coordinates and minus the index
y_pred_decoded = [] # Store the final predictions in this list
for batch_item in y_pred_decoded_raw: # `batch_item` has shape `[n_boxes, n_classes + 4 coords]`
pred = [] # Store the final predictions for this batch item here
for class_id in range(1, n_classes): # For each class except the background class (which has class ID 0)...
single_class = batch_item[:,[0, class_id + 1, -4, -3, -2, -1]] # ...keep only the confidences for that class, making this an array of shape `[n_boxes, 6]` and...
threshold_met = single_class[single_class[:,1] > confidence_thresh] # ...keep only those boxes with a confidence above the set threshold.
if threshold_met.shape[0] > 0: # If any boxes made the threshold...
maxima = _greedy_nms_debug(threshold_met, iou_threshold=iou_threshold, coords='corners', border_pixels=border_pixels) # ...perform NMS on them.
maxima_output = np.zeros((maxima.shape[0], maxima.shape[1] + 1)) # Expand the last dimension by one element to have room for the class ID. This is now an arrray of shape `[n_boxes, 6]`
maxima_output[:,0] = maxima[:,0] # Write the box index to the first column...
maxima_output[:,1] = class_id # ...and write the class ID to the second column...
maxima_output[:,2:] = maxima[:,1:] # ...and write the rest of the maxima data to the other columns...
pred.append(maxima_output) # ...and append the maxima for this class to the list of maxima for this batch item.
# Once we're through with all classes, keep only the `top_k` maxima with the highest scores
pred = np.concatenate(pred, axis=0)
if pred.shape[0] > top_k: # If we have more than `top_k` results left at this point, otherwise there is nothing to filter,...
top_k_indices = np.argpartition(pred[:,2], kth=pred.shape[0]-top_k, axis=0)[pred.shape[0]-top_k:] # ...get the indices of the `top_k` highest-score maxima...
pred = pred[top_k_indices] # ...and keep only those entries of `pred`...
y_pred_decoded.append(pred) # ...and now that we're done, append the array of final predictions for this batch item to the output list
return y_pred_decoded
def _greedy_nms_debug(predictions, iou_threshold=0.45, coords='corners', border_pixels='half'):
'''
The same greedy non-maximum suppression algorithm as above, but slightly modified for use as an internal
function for per-class NMS in `decode_detections_debug()`. The difference is that it keeps the indices of all
left-over boxes for each batch item, which allows you to know which predictor layer predicted a given output
box and is thus useful for debugging.
'''
boxes_left = np.copy(predictions)
maxima = [] # This is where we store the boxes that make it through the non-maximum suppression
while boxes_left.shape[0] > 0: # While there are still boxes left to compare...
maximum_index = np.argmax(boxes_left[:,1]) # ...get the index of the next box with the highest confidence...
maximum_box = np.copy(boxes_left[maximum_index]) # ...copy that box and...
maxima.append(maximum_box) # ...append it to `maxima` because we'll definitely keep it
boxes_left = np.delete(boxes_left, maximum_index, axis=0) # Now remove the maximum box from `boxes_left`
if boxes_left.shape[0] == 0: break # If there are no boxes left after this step, break. Otherwise...
similarities = iou(boxes_left[:,2:], maximum_box[2:], coords=coords, mode='element-wise', border_pixels=border_pixels) # ...compare (IoU) the other left over boxes to the maximum box...
boxes_left = boxes_left[similarities <= iou_threshold] # ...so that we can remove the ones that overlap too much with the maximum box
return np.array(maxima)
def get_num_boxes_per_pred_layer(predictor_sizes, aspect_ratios, two_boxes_for_ar1):
'''
Returns a list of the number of boxes that each predictor layer predicts.
`aspect_ratios` must be a nested list, containing a list of aspect ratios
for each predictor layer.
'''
num_boxes_per_pred_layer = []
for i in range(len(predictor_sizes)):
if two_boxes_for_ar1:
num_boxes_per_pred_layer.append(predictor_sizes[i][0] * predictor_sizes[i][1] * (len(aspect_ratios[i]) + 1))
else:
num_boxes_per_pred_layer.append(predictor_sizes[i][0] * predictor_sizes[i][1] * len(aspect_ratios[i]))
return num_boxes_per_pred_layer
def get_pred_layers(y_pred_decoded, num_boxes_per_pred_layer):
'''
For a given prediction tensor decoded with `decode_detections_debug()`, returns a list
with the indices of the predictor layers that made each predictions.
That is, this function lets you know which predictor layer is responsible
for a given prediction.
Arguments:
y_pred_decoded (array): The decoded model output tensor. Must have been
decoded with `decode_detections_debug()` so that it contains the internal box index
for each predicted box.
num_boxes_per_pred_layer (list): A list that contains the total number
of boxes that each predictor layer predicts.
'''
pred_layers_all = []
cum_boxes_per_pred_layer = np.cumsum(num_boxes_per_pred_layer)
for batch_item in y_pred_decoded:
pred_layers = []
for prediction in batch_item:
if (prediction[0] < 0) or (prediction[0] >= cum_boxes_per_pred_layer[-1]):
raise ValueError("Box index is out of bounds of the possible indices as given by the values in `num_boxes_per_pred_layer`.")
for i in range(len(cum_boxes_per_pred_layer)):
if prediction[0] < cum_boxes_per_pred_layer[i]:
pred_layers.append(i)
break
pred_layers_all.append(pred_layers)
return pred_layers_all