diff --git a/README.md b/README.md index 79ee89f..250f4d5 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # SSD-based object and text detection with Keras This repository contains the implementation of various approaches to object detection in general and text detection/recognition in particular. -Its code was initially used to carry out the experiments for the author's master thesis [End-to-End Scene Text Recognition based on Artificial Neural Networks](http://83.169.39.135/thesis/thesis.pdf) and later extended with the implementation of more recent approaches. +Its code was initially used to carry out the experiments for the author's master thesis [End-to-End Scene Text Recognition based on Artificial Neural Networks](http://46.163.79.21/thesis/thesis.pdf) and later extended with the implementation of more recent approaches. ## Technical background @@ -60,19 +60,19 @@ The usage of the code is quite straightforward, clone the repository and run the ## Pretrained models Pretrained SSD models can be converted from the [original Caffe implementation](https://github.com/weiliu89/caffe/tree/ssd). -#### [Converted SSD300 VOC](http://83.169.39.135/ssd_detectors/ssd300_voc_weights_fixed.zip) +#### [Converted SSD300 VOC](http://46.163.79.21/ssd_detectors/ssd300_voc_weights_fixed.zip) PASCAL VOC 07+12+COCO SSD300* from Caffe implementation -#### [Converted SSD512 VOC](http://83.169.39.135/ssd_detectors/ssd512_voc_weights_fixed.zip) +#### [Converted SSD512 VOC](http://46.163.79.21/ssd_detectors/ssd512_voc_weights_fixed.zip) PASCAL VOC 07+12+COCO SSD512* from Caffe implementation -#### [Converted SSD300 COCO](http://83.169.39.135/ssd_detectors/ssd300_coco_weights_fixed.zip) +#### [Converted SSD300 COCO](http://46.163.79.21/ssd_detectors/ssd300_coco_weights_fixed.zip) COCO trainval35k SSD300* from Caffe implementation -#### [Converted SSD512 COCO](http://83.169.39.135/ssd_detectors/ssd512_coco_weights_fixed.zip) +#### [Converted SSD512 COCO](http://46.163.79.21/ssd_detectors/ssd512_coco_weights_fixed.zip) COCO trainval35k SSD512* from Caffe implementation -#### [SegLink](http://83.169.39.135/ssd_detectors/201809231008_sl512_synthtext.zip) +#### [SegLink](http://46.163.79.21/ssd_detectors/201809231008_sl512_synthtext.zip) initialized with converted SSD512 weights trained and tested on subsets of SynthText segment_threshold 0.60 @@ -83,7 +83,7 @@ f-measure 0.869 parameters 24,358,681 model size 94 MB -#### [SegLink with DSOD backbone and Focal Loss](http://83.169.39.135/ssd_detectors/201806021007_dsodsl512_synthtext.zip) +#### [SegLink with DSOD backbone and Focal Loss](http://46.163.79.21/ssd_detectors/201806021007_dsodsl512_synthtext.zip) trained and tested on subsets of SynthText segment_threshold 0.60 link_threshold 0.50 @@ -93,7 +93,7 @@ f-measure 0.932 parameters 12,905,177 model size 50 MB -#### [TextBoxes++ with DSOD backbone and Focal Loss](http://83.169.39.135/ssd_detectors/201906190710_dsodtbpp512fl_synthtext.zip) +#### [TextBoxes++ with DSOD backbone and Focal Loss](http://46.163.79.21/ssd_detectors/201906190710_dsodtbpp512fl_synthtext.zip) trained and tested on subsets of SynthText threshold 0.35 precision 0.984 @@ -102,7 +102,7 @@ f-measure 0.934 parameters 23,477,798 model size 91 MB -#### [TextBoxes++ with dense blocks, separable convolution and Focal Loss](http://83.169.39.135/ssd_detectors/202003070004_dstbpp512fl_synthtext.zip) +#### [TextBoxes++ with dense blocks, separable convolution and Focal Loss](http://46.163.79.21/ssd_detectors/202003070004_dstbpp512fl_synthtext.zip) 
the number of parameters has been reduced by ≈ 94% compared to the original TextBoxes++ with VGG backbone (35,763,078 parameters) trained and tested on subsets of SynthText threshold 0.45 @@ -113,7 +113,7 @@ parameters 2,226,374 model size 9 MB -#### [CRNN with LSTM](http://83.169.39.135/ssd_detectors/201806162129_crnn_lstm_synthtext.zip) +#### [CRNN with LSTM](http://46.163.79.21/ssd_detectors/201806162129_crnn_lstm_synthtext.zip) trained and tested on cropped word level bounding boxes from SynthText mean editdistance 0.332 mean normalized editdistance 0.081 @@ -124,7 +124,7 @@ parameters 8,747,351 model size 34 MB runtime (GPU) 114 ms ± 2.75 ms -#### [CRNN with GRU](http://83.169.39.135/ssd_detectors/201806190711_crnn_gru_synthtext.zip) +#### [CRNN with GRU](http://46.163.79.21/ssd_detectors/201806190711_crnn_gru_synthtext.zip) trained and tested on cropped word level bounding boxes from SynthText mean editdistance 0.333 mean normalized editdistance 0.081 @@ -135,7 +135,7 @@ parameters 7,959,895 model size 31 MB runtime (GPU) 85.1 ms ± 1.19 ms -#### [CRNN with CNN](http://83.169.39.135/ssd_detectors/202001131747_crnn_cnn_synthtext.zip) +#### [CRNN with CNN](http://46.163.79.21/ssd_detectors/202001131747_crnn_cnn_synthtext.zip) fully convolutional architecture for the recognition stage (probably not optimal) trained and tested on cropped word level bounding boxes from SynthText mean editdistance 0.355 @@ -147,7 +147,7 @@ parameters 7,877,719 model size 31 MB runtime (GPU) 3.68 ms ± 24.5 µs -#### [CRNN with CNN concat](http://83.169.39.135/ssd_detectors/202002030820_crnn_cnn_synthtext_concat_continued.zip) +#### [CRNN with CNN concat](http://46.163.79.21/ssd_detectors/202002030820_crnn_cnn_synthtext_concat_continued.zip) fine-tuned fully convolutional model on concatenated word images from SynthText mean editdistance 1.842 @@ -193,5 +193,5 @@ iterations 600k+100k

#### SegLink with DenseNet, Focal Loss and CRNN end-to-end real-time recogniton -[](http://83.169.39.135/ssd_detectors/dsodslcrnn_end2end_record.mp4) +[](http://46.163.79.21/ssd_detectors/dsodslcrnn_end2end_record.mp4) diff --git a/SL_predict.ipynb b/SL_predict.ipynb index 8b301ae..bf80ae2 100644 --- a/SL_predict.ipynb +++ b/SL_predict.ipynb @@ -4,7 +4,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } }, "outputs": [], "source": [ @@ -27,7 +30,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } }, "outputs": [], "source": [ @@ -41,7 +47,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } }, "outputs": [], "source": [ @@ -55,7 +64,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } }, "outputs": [], "source": [ @@ -74,7 +86,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } }, "outputs": [], "source": [ @@ -89,7 +104,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } }, "outputs": [], "source": [ @@ -102,8 +120,7 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true, - "scrolled": false + "tags": [] }, "outputs": [], "source": [ @@ -138,7 +155,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } }, "outputs": [], "source": [ @@ -164,7 +184,9 @@ "execution_count": null, "metadata": { "collapsed": true, - "scrolled": false + "jupyter": { + "outputs_hidden": true + } }, "outputs": [], "source": [ @@ -186,7 +208,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } }, "outputs": [], "source": [] @@ -195,7 +220,10 @@ "cell_type": "code", "execution_count": null, "metadata": { - "collapsed": true + "collapsed": true, + "jupyter": { + "outputs_hidden": true + } }, "outputs": [], "source": [] @@ -263,5 +291,5 @@ } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 } diff --git a/data_utils.py b/data_utils.py index 8241d95..d332fa6 100644 --- a/data_utils.py +++ b/data_utils.py @@ -79,14 +79,14 @@ def lighting(self, img): def horizontal_flip(self, img, y): if np.random.random() < self.hflip_prob: - img = img[:, ::-1] - y[:, [0, 2]] = 1 - y[:, [2, 0]] + img = img[:,::-1] + y[:,(0,2)] = 1 - y[:,(2,0)] return img, y def vertical_flip(self, img, y): if np.random.random() < self.vflip_prob: - img = img[::-1] - y[:, [1, 3]] = 1 - y[:, [3, 1]] + img = img[::-1,:] + y[:,(1,3)] = 1 - y[:,(3,1)] return img, y def random_sized_crop(self, img, targets): diff --git a/utils/model.py b/utils/model.py index ce84b2d..846c863 100644 --- a/utils/model.py +++ b/utils/model.py @@ -122,7 +122,16 @@ def plot_parameter_statistic(model, layer_types=['Dense', 'Conv2D'], trainable=T offset += np.array(counts_non_trainable, dtype=int) legend.append('non-trainable') if outputs: - counts_outputs = [np.sum([np.sum([np.prod(s[1:]) for s in n.output_shapes]) for n in l._inbound_nodes]) for l in layers] + 
#counts_outputs = [np.sum([np.sum([np.prod(s[1:]) for s in n.output_shapes]) for n in l._inbound_nodes]) for l in layers] + counts_outputs = [] + for l in layers: + shapes = [] + for n in l._inbound_nodes: + if type(n.output_shapes) == list: + shapes.extend(n.output_shapes) + else: + shapes.append(n.output_shapes) + counts_outputs.append(np.sum([np.prod(s[1:]) for s in shapes])) plt.barh(y, counts_outputs, align='center', color=colors[2], left=offset) offset += np.array(counts_outputs, dtype=int) legend.append('outputs') diff --git a/utils/training.py b/utils/training.py index 8da4c21..a33f9bc 100644 --- a/utils/training.py +++ b/utils/training.py @@ -126,12 +126,13 @@ def reduced_focal_loss(y_true, y_pred, gamma=2., alpha=1., th=0.5): return tf.reduce_sum(loss, axis=-1) -def ciou_loss(y_true, y_pred): +def ciou_loss(y_true, y_pred, variant='diou'): '''Compute Distance-IoU loss. # Arguments y_true: Ground truth bounding boxes, tensor of shape (..., 4) y_pred: Predicted bounding boxes, tensor of shape (..., 4) + variant: 'diou', 'ciou', 'logciou' # Returns loss: Distance-IoU loss, tensor of shape (...) @@ -196,18 +197,16 @@ def ciou_loss(y_true, y_pred): w_temp = 2 * w_pred ar = (8 / (np.pi ** 2)) * arctan * ((w_pred - w_temp) * h_pred) - # calculate diou - diouk = 1-iouk + u - - # calculate ciou - #ciouk = 1-iouk + u + alpha*ar - - # "I found that -log(IoU) is more stable and converge faster than (1-IoU)" - #ciouk = -tf.math.log(iouk) + u + alpha*ar - - return diouk - #return ciouk - + # calculate diou, ciou, ... + if variant == 'diou': + return 1-iouk + u + elif variant == 'ciou': + return 1-iouk + u + alpha*ar + elif variant == 'logciou': + # "I found that -log(IoU) is more stable and converge faster than (1-IoU)" + return -tf.math.log(iouk) + u + alpha*ar + else: + return None class LearningRateDecay(Callback): def __init__(self, methode='linear', base_lr=1e-3, n_desired=40000, desired=0.1, bias=0.0, minimum=0.1): @@ -477,7 +476,7 @@ def filter_signal(x, y, window_length=1000): return x, y -def plot_log(log_dirs, names=None, limits=None, window_length=250, filtered_only=False, autoscale=True): +def plot_log(log_dirs, names=None, limits=None, window_length=250, filtered_only=False, autoscale=True, legend_loc='best'): """Plots and compares the training log contained in './checkpoints/'. # Arguments @@ -494,7 +493,7 @@ def plot_log(log_dirs, names=None, limits=None, window_length=250, filtered_only Different batch size leads to different epoch length. """ - loss_terms = {'loss', 'error'} + loss_terms = {'loss', 'error', 'abs'} metric_terms = {'precision', 'recall', 'fmeasure', 'accuracy', 'sparsity', 'visibility'} if type(log_dirs) == str: @@ -580,7 +579,7 @@ def plot_log(log_dirs, names=None, limits=None, window_length=250, filtered_only if ymax > 0: plt.title(k, y=1.05) - plt.legend() + plt.legend(loc=legend_loc) ax1 = plt.gca() ax1.set_xlim(xmin, xmax) @@ -611,7 +610,7 @@ def plot_history(log_dirs, names=None, limits=None, autoscale=True): - loss_terms = {'loss', 'error'} + loss_terms = {'loss', 'error', 'abs'} metric_terms = {'precision', 'recall', 'fmeasure', 'accuracy', 'sparsity', 'visibility'} if type(log_dirs) == str:
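The patch above folds the three IoU-based variants into a single `ciou_loss(y_true, y_pred, variant=...)` dispatch. A minimal usage sketch, not part of the patch: it assumes the repository root is on the Python path, TensorFlow 2.x eager execution, and that boxes are encoded as (xmin, ymin, xmax, ymax) tensors — an assumption about the encoding, since the hunk does not show it explicitly.

```python
import tensorflow as tf
from utils.training import ciou_loss  # signature per this patch: ciou_loss(y_true, y_pred, variant='diou')

# Toy ground-truth and predicted boxes; the corner encoding is assumed, not taken from the patch.
y_true = tf.constant([[0.10, 0.10, 0.60, 0.60]], dtype=tf.float32)
y_pred = tf.constant([[0.15, 0.05, 0.65, 0.55]], dtype=tf.float32)

# Compare the three variants on the same box pair.
for variant in ('diou', 'ciou', 'logciou'):
    loss = ciou_loss(y_true, y_pred, variant=variant)
    print(variant, float(tf.reduce_mean(loss)))
```

Note that the new dispatch returns None for an unrecognized variant string, which would only surface later when the loss value is used; raising a ValueError instead would fail faster.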