Merge pull request #11 from CodeWithKyrian/add-object-detection-pipeline

Add Object Detection Pipeline
CodeWithKyrian · Apr 3, 2024 · eb27c1e · eb27c1e
2 parents ae63823 + 3e130e2
commit eb27c1e
Show file tree

Hide file tree

Showing 21 changed files with 686 additions and 40 deletions.
diff --git a/examples/bootstrap.php b/examples/bootstrap.php
@@ -6,5 +6,7 @@
 
 require_once './vendor/autoload.php';
 
-Transformers::setup()->apply();
+Transformers::setup()
+//    ->setImageDriver(\Codewithkyrian\Transformers\Utils\ImageDriver::GD)
+    ->apply();
 
diff --git a/examples/image-test.php b/examples/image-test.php
@@ -2,18 +2,55 @@
 
 declare(strict_types=1);
 
-use Codewithkyrian\Transformers\Processors\AutoProcessor;
-use Codewithkyrian\Transformers\Utils\Image1;
+use Codewithkyrian\Transformers\Transformers;
 use Codewithkyrian\Transformers\Utils\Image;
-use function Codewithkyrian\Transformers\Utils\memoryUsage;
+use Codewithkyrian\Transformers\Utils\ImageDriver;
+use Codewithkyrian\Transformers\Utils\Tensor;
 use function Codewithkyrian\Transformers\Utils\timeUsage;
 
 require_once './bootstrap.php';
 
-$processor = AutoProcessor::fromPretrained('Xenova/vit-base-patch16-224');
+function toTensorTest(ImageDriver $imageDriver): Tensor
+{
+    timeUsage();
 
-$image = Image::read('images/kyrian-cartoon.jpeg');
+    Transformers::setup()
+        ->setImageDriver($imageDriver)
+        ->apply();
 
-$imageInputs = $processor($image);
+    $image = Image::read('images/butterfly.jpg');
 
-dd($imageInputs['pixel_values']->shape(), $imageInputs['original_sizes'], $imageInputs['reshaped_input_sizes']);
+    $image->rgb();
+
+    $tensor =  $image->toTensor();
+
+    dump("$imageDriver->name (toTensor) : ". timeUsage());
+
+    return $tensor;
+}
+
+function fromTensorTest(ImageDriver $imageDriver, Tensor $tensor) : Image
+{
+    // Same bare call as at the top of toTensorTest(); presumably marks the start of the timed span.
+    timeUsage();
+
+    Transformers::setup()
+        ->setImageDriver($imageDriver)
+        ->apply();
+
+    $image = Image::fromTensor($tensor);
+    dump("$imageDriver->name (fromTensor) : ". timeUsage());
+    return $image;
+}
+
+// Run the test. $tensor is reassigned on every call, so the fromTensor runs below all receive the VIPS result.
+dump("------------ toTensor ------------");
+$tensor = toTensorTest(ImageDriver::IMAGICK);
+$tensor = toTensorTest(ImageDriver::GD);
+$tensor = toTensorTest(ImageDriver::VIPS);
+
+
+dump("------------ fromTensor ------------");
+$image = fromTensorTest(ImageDriver::IMAGICK, $tensor);
+$image = fromTensorTest(ImageDriver::GD, $tensor);
+$image = fromTensorTest(ImageDriver::VIPS, $tensor);
diff --git a/examples/pipelines/object-detection.php b/examples/pipelines/object-detection.php
@@ -0,0 +1,21 @@
+<?php
+
+declare(strict_types=1);
+
+namespace Codewithkyrian\Transformers\Pipelines;
+
+use function Codewithkyrian\Transformers\Utils\memoryUsage;
+use function Codewithkyrian\Transformers\Utils\timeUsage;
+
+require_once './bootstrap.php';
+
+ini_set('memory_limit', '-1'); // -1 removes PHP's memory limit for this run
+
+$detector = pipeline('object-detection', 'Xenova/detr-resnet-50');
+
+$img = __DIR__. '/../images/cats.jpg'; // resolved relative to this file, not the working directory
+
+$output = $detector($img, threshold: 0.9);
+
+dd($output, timeUsage(), memoryUsage());
+
diff --git a/src/FeatureExtractors/DetrFeatureExtractor.php b/src/FeatureExtractors/DetrFeatureExtractor.php
@@ -0,0 +1,53 @@
+<?php
+
+declare(strict_types=1);
+
+
+namespace Codewithkyrian\Transformers\FeatureExtractors;
+
+use Codewithkyrian\Transformers\Models\Output\ObjectDetectionOutput;
+use Codewithkyrian\Transformers\Models\Output\ModelOutput;
+use Codewithkyrian\Transformers\Processors\Processor;
+use Codewithkyrian\Transformers\Utils\Image;
+use Codewithkyrian\Transformers\Utils\Tensor;
+use Interop\Polite\Math\Matrix\NDArray;
+
+class DetrFeatureExtractor extends ImageFeatureExtractor
+{
+    /**
+     * Runs the parent feature extraction on the image(s) and adds a pixel mask filled
+     * with 1s, shaped [first dimension of `pixel_values`, 64, 64].
+     *
+     * @param Image|array $images The image(s) to extract features from.
+     * @param mixed ...$args Extra arguments, forwarded to the parent extractor.
+     * @return array The parent's 'pixel_values' and the generated 'pixel_mask' Tensor.
+     */
+    public function __invoke(Image|array $images, ...$args): array
+    {
+        // Unpack the variadic so the parent receives the same arguments we were given,
+        // not one extra positional array wrapping all of them.
+        $result = parent::__invoke($images, ...$args);
+
+        // TODO support differently-sized images, for now assume all images are the same size.
+        // TODO support different mask sizes (not just 64x64)
+        // Currently, just fill pixel mask with 1s
+        $maskSize = [$result['pixel_values']->shape()[0], 64, 64];
+        $pixelMaskData = array_fill(0, array_product($maskSize), 1);
+        $pixelMask = new Tensor($pixelMaskData, NDArray::int64, $maskSize);
+
+        return ['pixel_values' => $result['pixel_values'], 'pixel_mask' => $pixelMask];
+    }
+
+    /**
+     * Post-processes the outputs of the model (for object detection).
+     * @param ObjectDetectionOutput $outputs The outputs of the model that must be post-processed
+     * @param float $threshold The threshold to use for the scores.
+     * @param array|null $targetSizes The sizes of the original images.
+     * @param bool $isZeroShot Whether zero-shot object detection was performed.
+     * @return array An array of objects containing the post-processed outputs.
+     */
+    public function postProcessObjectDetection(ObjectDetectionOutput $outputs, float $threshold = 0.5, ?array $targetSizes = null, bool $isZeroShot = false): array
+    {
+        return Processor::postProcessObjectDetection($outputs, $threshold, $targetSizes, $isZeroShot);
+    }
+}
diff --git a/src/FeatureExtractors/ImageFeatureExtractor.php b/src/FeatureExtractors/ImageFeatureExtractor.php
@@ -294,8 +294,8 @@ private function calculateReflectOffset(int $val, int $max): int
      */
     public function rescale(array &$pixelData): void
     {
-        foreach ($pixelData as &$pixel) {
-            $pixel *= $this->rescaleFactor;
+        for ($i = 0, $total = count($pixelData); $i < $total; ++$i) { // length is loop-invariant
+            $pixelData[$i] *= $this->rescaleFactor;
         }
     }
 
@@ -337,14 +337,15 @@ public function getResizeOutputImageSize(Image $image, int|array|null $size): ar
             $newWidth = $srcWidth * $shortResizeFactor;
             $newHeight = $srcHeight * $shortResizeFactor;
 
-            // Downscale to ensure the largest dimension is longestEdge
+            // The new width and height might be greater than `longest_edge`, so
+            // we downscale to ensure the largest dimension is longestEdge
             $longResizeFactor = $longestEdge !== null
                 ? min($longestEdge / $newWidth, $longestEdge / $newHeight)
                 : 1;
 
             // Round to avoid floating point precision issues
-            $finalWidth = (int)floor($newWidth * $longResizeFactor);
-            $finalHeight = (int)floor($newHeight * $longResizeFactor);
+            $finalWidth = (int)floor(round($newWidth * $longResizeFactor, 2));
+            $finalHeight = (int)floor(round($newHeight * $longResizeFactor, 2));
 
             if ($this->sizeDivisibility !== null) {
                 [$finalWidth, $finalHeight] = $this->enforceSizeDivisibility([$finalWidth, $finalHeight], $this->sizeDivisibility);
@@ -453,11 +454,14 @@ public function preprocess(
 
         $reshapedInputSize = [$image->height(), $image->width()];
 
+
         // All pixel-level manipulation occurs with data in the hwc format (height, width, channels),
         // to emulate the behavior of the original Python code (w/ numpy).
         $pixelData = $image->pixelData();
+
         $imgShape = [$image->height(), $image->width(), $image->channels];
 
+
         if ($this->doRescale) {
             $this->rescale($pixelData);
         }

diff --git a/src/Models/Auto/AutoModel.php b/src/Models/Auto/AutoModel.php
@@ -18,6 +18,9 @@ class AutoModel extends PretrainedMixin
         "clip" => \Codewithkyrian\Transformers\Models\Pretrained\CLIPModel::class,
         "vit" => \Codewithkyrian\Transformers\Models\Pretrained\ViTModel::class,
         "deit" => \Codewithkyrian\Transformers\Models\Pretrained\DeiTModel::class,
+
+        'detr' => \Codewithkyrian\Transformers\Models\Pretrained\DetrModel::class,
+        'yolos' => \Codewithkyrian\Transformers\Models\Pretrained\YolosModel::class,
     ];
 
     const ENCODER_DECODER_MODEL_MAPPING = [

diff --git a/src/Models/Auto/AutoModelForObjectDetection.php b/src/Models/Auto/AutoModelForObjectDetection.php
@@ -0,0 +1,19 @@
+<?php
+
+declare(strict_types=1);
+
+
+namespace Codewithkyrian\Transformers\Models\Auto;
+
+class AutoModelForObjectDetection extends PretrainedMixin
+{
+    // Model-type key => object detection model class. NOTE(review): presumably read by PretrainedMixin - confirm.
+    const MODEL_CLASS_MAPPING = [
+        'detr' => \Codewithkyrian\Transformers\Models\Pretrained\DetrForObjectDetection::class,
+        'yolos' => \Codewithkyrian\Transformers\Models\Pretrained\YolosForObjectDetection::class,
+    ];
+
+    const MODEL_CLASS_MAPPINGS = [
+        self::MODEL_CLASS_MAPPING,
+    ];
+}
diff --git a/src/Models/Output/DetrSegmentationOutput.php b/src/Models/Output/DetrSegmentationOutput.php
@@ -0,0 +1,28 @@
+<?php
+
+declare(strict_types=1);
+
+
+namespace Codewithkyrian\Transformers\Models\Output;
+
+use Codewithkyrian\Transformers\Utils\Tensor;
+
+class DetrSegmentationOutput implements ModelOutput
+{
+    /**
+     * @param Tensor $logits Classification logits (including no-object) for all queries.
+     * @param Tensor $predBoxes Boxes for all queries as (center_x, center_y, width, height), normalized in [0, 1] relative to each image in the batch (disregarding possible padding).
+     * @param Tensor $predMasks Segmentation masks for all queries.
+     */
+    public function __construct(
+        public readonly Tensor $logits,
+        public readonly Tensor $predBoxes,
+        public readonly Tensor $predMasks,
+    ) {}
+
+    /** Builds the output from an array holding 'logits', 'pred_boxes' and 'pred_masks'. */
+    public static function fromOutput(array $array): self
+    {
+        return new self(logits: $array['logits'], predBoxes: $array['pred_boxes'], predMasks: $array['pred_masks']);
+    }
+}
diff --git a/src/Models/Output/ObjectDetectionOutput.php b/src/Models/Output/ObjectDetectionOutput.php
@@ -0,0 +1,27 @@
+<?php
+
+declare(strict_types=1);
+
+
+namespace Codewithkyrian\Transformers\Models\Output;
+
+use Codewithkyrian\Transformers\Utils\Tensor;
+
+class ObjectDetectionOutput implements ModelOutput
+{
+    /**
+     * Read-only pair of tensors: classification logits and predicted boxes.
+     * @param Tensor $logits Classification logits (including no-object) for all queries.
+     * @param Tensor $predBoxes Boxes for all queries as (center_x, center_y, width, height), normalized in [0, 1] relative to each image in the batch (disregarding possible padding).
+     */
+    public function __construct(
+        public readonly Tensor $logits,
+        public readonly Tensor $predBoxes,
+    ) {}
+
+    /** Builds the output from an array holding 'logits' and 'pred_boxes'. */
+    public static function fromOutput(array $array): self
+    {
+        return new self(logits: $array['logits'], predBoxes: $array['pred_boxes']);
+    }
+}
diff --git a/src/Models/Pretrained/DetrForObjectDetection.php b/src/Models/Pretrained/DetrForObjectDetection.php
@@ -0,0 +1,16 @@
+<?php
+
+declare(strict_types=1);
+
+
+namespace Codewithkyrian\Transformers\Models\Pretrained;
+
+use Codewithkyrian\Transformers\Models\Output\ObjectDetectionOutput;
+
+class DetrForObjectDetection extends DetrPretrainedModel
+{
+    public function __invoke(array $modelInputs): ObjectDetectionOutput
+    {
+        return ObjectDetectionOutput::fromOutput(parent::__invoke($modelInputs)); // wraps the parent's output array in a typed object
+    }
+}
diff --git a/src/Models/Pretrained/DetrForSegmentation.php b/src/Models/Pretrained/DetrForSegmentation.php
@@ -0,0 +1,16 @@
+<?php
+
+declare(strict_types=1);
+
+
+namespace Codewithkyrian\Transformers\Models\Pretrained;
+
+use Codewithkyrian\Transformers\Models\Output\DetrSegmentationOutput;
+
+class DetrForSegmentation extends DetrPretrainedModel
+{
+    public function __invoke(array $modelInputs): DetrSegmentationOutput
+    {
+        return DetrSegmentationOutput::fromOutput(parent::__invoke($modelInputs)); // wraps the parent's output array in a typed object
+    }
+}
diff --git a/src/Models/Pretrained/DetrModel.php b/src/Models/Pretrained/DetrModel.php
@@ -0,0 +1,11 @@
+<?php
+
+declare(strict_types=1);
+
+
+namespace Codewithkyrian\Transformers\Models\Pretrained;
+
+class DetrModel extends DetrPretrainedModel
+{
+    // Empty body: all behaviour is inherited from DetrPretrainedModel.
+}
diff --git a/src/Models/Pretrained/DetrPretrainedModel.php b/src/Models/Pretrained/DetrPretrainedModel.php
@@ -0,0 +1,11 @@
+<?php
+
+declare(strict_types=1);
+
+
+namespace Codewithkyrian\Transformers\Models\Pretrained;
+
+class DetrPretrainedModel extends PretrainedModel
+{
+    // Empty body: all behaviour is inherited from PretrainedModel.
+}
diff --git a/src/Models/Pretrained/YolosForObjectDetection.php b/src/Models/Pretrained/YolosForObjectDetection.php
@@ -0,0 +1,16 @@
+<?php
+
+declare(strict_types=1);
+
+
+namespace Codewithkyrian\Transformers\Models\Pretrained;
+
+use Codewithkyrian\Transformers\Models\Output\ObjectDetectionOutput;
+
+class YolosForObjectDetection extends YolosPretrainedModel
+{
+    public function __invoke(array $modelInputs): ObjectDetectionOutput
+    {
+        return ObjectDetectionOutput::fromOutput(parent::__invoke($modelInputs)); // wraps the parent's output array in a typed object
+    }
+}
diff --git a/src/Models/Pretrained/YolosModel.php b/src/Models/Pretrained/YolosModel.php
@@ -0,0 +1,11 @@
+<?php
+
+declare(strict_types=1);
+
+
+namespace Codewithkyrian\Transformers\Models\Pretrained;
+
+class YolosModel extends YolosPretrainedModel
+{
+    // Empty body: all behaviour is inherited from YolosPretrainedModel.
+}
diff --git a/src/Models/Pretrained/YolosPretrainedModel.php b/src/Models/Pretrained/YolosPretrainedModel.php
@@ -0,0 +1,11 @@
+<?php
+
+declare(strict_types=1);
+
+
+namespace Codewithkyrian\Transformers\Models\Pretrained;
+
+class YolosPretrainedModel extends PretrainedModel
+{
+    // Empty body: all behaviour is inherited from PretrainedModel.
+}