Merge pull request #17 from CodeWithKyrian/add-image-to-image-pipeline

Add image to image pipeline
CodeWithKyrian · Apr 8, 2024 · d0b1f0d · d0b1f0d
2 parents bc3ef74 + 7ea5560
commit d0b1f0d
Show file tree

Hide file tree

Showing 14 changed files with 199 additions and 27 deletions.
diff --git a/examples/pipelines/image-to-image.php b/examples/pipelines/image-to-image.php
@@ -0,0 +1,22 @@
+<?php
+
+declare(strict_types=1);
+
+use Codewithkyrian\Transformers\Generation\Streamers\StdOutStreamer;
+use function Codewithkyrian\Transformers\Pipelines\pipeline;
+use function Codewithkyrian\Transformers\Utils\memoryUsage;
+use function Codewithkyrian\Transformers\Utils\timeUsage;
+
+// Load the examples bootstrap; a "./" path is resolved against the current working directory.
+require_once './bootstrap.php';
+
+// Raise the memory limit for this run.
+ini_set('memory_limit', '2048M');
+
+// Build an image-to-image pipeline backed by the Swin2SR x2 super-resolution model.
+$upscaler = pipeline('image-to-image', 'Xenova/swin2SR-classical-sr-x2-64');
+
+// Local path of the image to upscale.
+$sourceImage = __DIR__. '/../images/butterfly.jpg';
+
+$upscaledImage = $upscaler($sourceImage);
+
+// Write the upscaled result next to the source image.
+$upscaledImage->save(__DIR__. '/../images/butterfly-super-resolution.jpg');
+
+// Dump the output size together with the elapsed time and memory usage.
+dd($upscaledImage->size(), timeUsage(), memoryUsage());
diff --git a/src/FeatureExtractors/ImageFeatureExtractor.php b/src/FeatureExtractors/ImageFeatureExtractor.php
@@ -16,13 +16,13 @@ class ImageFeatureExtractor extends FeatureExtractor
      * The mean values for image normalization.
      * @var int|int[]|null
      */
-    protected int|array $imageMean;
+    protected int|array|null $imageMean;
 
     /**
      * The standard deviation values for image normalization.
      * @var int|int[]|null
      */
-    protected int|array $imageStd;
+    protected int|array|null $imageStd;
 
     /*
      * What method to use for resampling.
@@ -65,14 +65,13 @@ class ImageFeatureExtractor extends FeatureExtractor
     protected array|int|null $cropSize;
     protected ?bool $doConvertRGB;
     protected ?bool $doCropMargin;
-    protected ?array $padSize;
+    protected array|int|null $padSize;
     protected ?bool $doPad;
 
     public function __construct(public array $config)
     {
-
-        $this->imageMean = $config['image_mean'] ?? $config['mean'];
-        $this->imageStd = $config['image_std'] ?? $config['std'];
+        $this->imageMean = $config['image_mean'] ?? $config['mean'] ?? null;
+        $this->imageStd = $config['image_std'] ?? $config['std'] ?? null;
 
         $this->resample = $config['resample'] ?? 2; // 2 => bilinear
         $this->doRescale = $config['do_rescale'] ?? true;
@@ -493,7 +492,7 @@ public function preprocess(
         // Perform padding after rescaling/normalizing
         if ($doPad ?? $this->doPad) {
             if ($this->padSize !== null) {
-                $pixelData = $this->padImage($pixelData, $imgShape, $this->padSize);
+                [$pixelData, $imgShape] = $this->padImage($pixelData, $imgShape, $this->padSize);
             } elseif ($this->sizeDivisibility !== null) {
                 [$paddedWidth, $paddedHeight] = $this->enforceSizeDivisibility([$imgShape[1], $imgShape[0]], $this->sizeDivisibility);
                 [$pixelData, $imgShape] = $this->padImage($pixelData, $imgShape, ['width' => $paddedWidth, 'height' => $paddedHeight]);

diff --git a/src/FeatureExtractors/Swin2SRImageProcessor.php b/src/FeatureExtractors/Swin2SRImageProcessor.php
@@ -0,0 +1,33 @@
+<?php
+
+declare(strict_types=1);
+
+
+namespace Codewithkyrian\Transformers\FeatureExtractors;
+
+/**
+ * Image processor for Swin2SR models.
+ *
+ * Overrides `padImage()` so that `$padSize` is treated as a window size: the image is padded until both
+ * its width and its height are multiples of that value.
+ */
+class Swin2SRImageProcessor extends ImageFeatureExtractor
+{
+    /**
+     * Pads the image so that its width and height become multiples of `$padSize`.
+     *
+     * The padding itself is delegated to the parent implementation, always using the 'symmetric' mode,
+     * no centering and a constant value of -1. The `$mode`, `$center` and `$constantValues` arguments are
+     * accepted to keep the signature compatible with the parent but are not forwarded.
+     *
+     * @param array $pixelData The pixel data to pad.
+     * @param array $imgShape The image shape, read as [height, width, ...].
+     * @param int|array $padSize The window size. Must be a positive integer for this processor.
+     * @param string $mode Not used by this override.
+     * @param bool $center Not used by this override.
+     * @param int $constantValues Not used by this override.
+     * @return array The result of the parent `padImage()` call.
+     * @throws \InvalidArgumentException If `$padSize` is not a positive integer.
+     */
+    public function padImage(
+        array     $pixelData,
+        array     $imgShape,
+        int|array $padSize,
+        string    $mode = 'constant',
+        bool      $center = false,
+        int       $constantValues = 0
+    ): array
+    {
+        // NOTE: In this case, `padSize` represents the size of the sliding window for the local attention.
+        // In other words, the image is padded so that its width and height are multiples of `padSize`.
+        // It is used as a modulus below, so an array or a non-positive value cannot be handled.
+        if (!is_int($padSize) || $padSize <= 0) {
+            throw new \InvalidArgumentException(sprintf(
+                'Swin2SRImageProcessor::padImage() expects $padSize to be a positive integer, %s given.',
+                is_int($padSize) ? (string)$padSize : get_debug_type($padSize)
+            ));
+        }
+
+        [$imageHeight, $imageWidth] = $imgShape;
+
+        // NOTE: For Swin2SR models, the original python implementation adds padding even when the image's width/height is already
+        // a multiple of `pad_size`. However, this is most likely a bug (PR: https://github.com/mv-lab/swin2sr/pull/19).
+        // For this reason, we only add padding when the image's width/height is not a multiple of `pad_size`.
+        $paddedSize = [
+            'width' => $imageWidth + ($padSize - $imageWidth % $padSize) % $padSize,
+            'height' => $imageHeight + ($padSize - $imageHeight % $padSize) % $padSize,
+        ];
+
+        return parent::padImage($pixelData, $imgShape, $paddedSize, 'symmetric', false, -1);
+    }
+}
diff --git a/src/Models/Auto/AutoModel.php b/src/Models/Auto/AutoModel.php
@@ -24,6 +24,7 @@ class AutoModel extends PretrainedMixin
         'yolos' => \Codewithkyrian\Transformers\Models\Pretrained\YOLOSModel::class,
         'owlvit' => \Codewithkyrian\Transformers\Models\Pretrained\OwlVitModel::class,
         'owlv2' => \Codewithkyrian\Transformers\Models\Pretrained\OwlV2Model::class,
+        'swin2sr' => \Codewithkyrian\Transformers\Models\Pretrained\Swin2SRModel::class,
     ];
 
     const ENCODER_DECODER_MODEL_MAPPING = [

diff --git a/src/Models/Auto/AutoModelForImageToImage.php b/src/Models/Auto/AutoModelForImageToImage.php
@@ -0,0 +1,17 @@
+<?php
+
+declare(strict_types=1);
+
+
+namespace Codewithkyrian\Transformers\Models\Auto;
+
+/**
+ * Auto-model class for image-to-image models.
+ *
+ * It only declares which pretrained model class belongs to each supported model type.
+ */
+class AutoModelForImageToImage extends PretrainedMixin
+{
+    /**
+     * Model type => pretrained model class.
+     */
+    const MODEL_CLASS_MAPPING = [
+        'swin2sr' => \Codewithkyrian\Transformers\Models\Pretrained\Swin2SRForImageSuperResolution::class,
+    ];
+
+    /**
+     * List of mappings exposed to the base class through `static::MODEL_CLASS_MAPPINGS`.
+     */
+    const MODEL_CLASS_MAPPINGS = [self::MODEL_CLASS_MAPPING];
+}
diff --git a/src/Models/Auto/PretrainedMixin.php b/src/Models/Auto/PretrainedMixin.php
@@ -10,6 +10,7 @@
 use Codewithkyrian\Transformers\Models\Pretrained\PretrainedModel;
 use Codewithkyrian\Transformers\Utils\AutoConfig;
 use Symfony\Component\Console\Output\OutputInterface;
+use function Codewithkyrian\Transformers\Utils\timeUsage;
 
 /**
  * Base class of all AutoModels. Contains the `from_pretrained` function
@@ -50,6 +51,7 @@ public static function fromPretrained(
         ?OutputInterface $output = null
     ): PretrainedModel
     {
+
         $config = AutoConfig::fromPretrained($modelNameOrPath, $config, $cacheDir, $revision, $output);
 
         foreach (static::MODEL_CLASS_MAPPINGS as $modelClassMapping) {

diff --git a/src/Models/Pretrained/PretrainedModel.php b/src/Models/Pretrained/PretrainedModel.php
@@ -33,6 +33,7 @@
 use OnnxRuntime\InferenceSession;
 use Symfony\Component\Console\Output\OutputInterface;
 use function Codewithkyrian\Transformers\Utils\array_some;
+use function Codewithkyrian\Transformers\Utils\timeUsage;
 
 /**
  * A base class for pre-trained models that provides the model configuration and an ONNX session.
@@ -92,7 +93,6 @@ public static function fromPretrained(
             $config = AutoConfig::fromPretrained($modelNameOrPath, $config, $cacheDir, $revision, $output);
         }
 
-
         switch ($modelArchitecture) {
             case ModelArchitecture::DecoderOnly:
             {
@@ -153,9 +153,11 @@ public static function fromPretrained(
                     echo "WARNING: {$modelArchitecture->value} is not a valid model group. Defaulting to EncoderOnly.";
                 }
 
+
                 $session = self::constructSession(modelNameOrPath: $modelNameOrPath,
                     fileName: 'model', cacheDir: $cacheDir, revision: $revision, output: $output);
 
+
                 return new static($config, $session, $modelArchitecture);
             }
         }

diff --git a/src/Models/Pretrained/Swin2SRForImageSuperResolution.php b/src/Models/Pretrained/Swin2SRForImageSuperResolution.php
@@ -0,0 +1,14 @@
+<?php
+
+declare(strict_types=1);
+
+
+namespace Codewithkyrian\Transformers\Models\Pretrained;
+
+/**
+ * Swin2SR Model transformer with an upsampler head on top for image super resolution and restoration.
+ *
+ * Declares no members of its own; everything is inherited from Swin2SRPretrainedModel.
+ */
+class Swin2SRForImageSuperResolution extends Swin2SRPretrainedModel
+{
+}
diff --git a/src/Models/Pretrained/Swin2SRModel.php b/src/Models/Pretrained/Swin2SRModel.php
@@ -0,0 +1,11 @@
+<?php
+
+declare(strict_types=1);
+
+
+namespace Codewithkyrian\Transformers\Models\Pretrained;
+
+/**
+ * Swin2SR model class.
+ *
+ * Declares no members of its own; everything is inherited from Swin2SRPretrainedModel.
+ */
+class Swin2SRModel extends Swin2SRPretrainedModel
+{
+}
diff --git a/src/Models/Pretrained/Swin2SRPretrainedModel.php b/src/Models/Pretrained/Swin2SRPretrainedModel.php
@@ -0,0 +1,11 @@
+<?php
+
+declare(strict_types=1);
+
+
+namespace Codewithkyrian\Transformers\Models\Pretrained;
+
+/**
+ * Common parent of the Swin2SR model classes.
+ *
+ * Extends PretrainedModel without adding or overriding anything.
+ */
+class Swin2SRPretrainedModel extends PretrainedModel
+{
+}
diff --git a/src/Pipelines/ImageToImagePipeline.php b/src/Pipelines/ImageToImagePipeline.php
@@ -0,0 +1,62 @@
+<?php
+
+declare(strict_types=1);
+
+
+namespace Codewithkyrian\Transformers\Pipelines;
+
+use Codewithkyrian\Transformers\Utils\Image;
+use Codewithkyrian\Transformers\Utils\Tensor;
+use Interop\Polite\Math\Matrix\NDArray;
+use function Codewithkyrian\Transformers\Utils\prepareImages;
+use function Codewithkyrian\Transformers\Utils\timeUsage;
+
+/**
+ * Image to Image pipeline using any `AutoModelForImageToImage`. This pipeline generates an image based on a previous image input.
+ *
+ * **Example:** Super-resolution w/ `Xenova/swin2SR-classical-sr-x2-64`
+ * ```php
+ * $upscaler = pipeline('image-to-image', 'Xenova/swin2SR-classical-sr-x2-64');
+ * $url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/butterfly.jpg';
+ * $output = $upscaler($url);
+ * // Image {
+ * //   data: array(786432) [ 41, 31, 24,  43, ... ],
+ * //   width: 512,
+ * //   height: 512,
+ * //   channels: 3
+ * // }
+ * ```
+ */
+class ImageToImagePipeline extends Pipeline
+{
+    /**
+     * Runs the model on the given image input(s) and turns every reconstruction into an Image.
+     *
+     * @param array|string $inputs The image input(s), passed to `prepareImages()`.
+     * @param mixed ...$args Not used by this pipeline.
+     * @return Image|Image[] The list of images when more than one was produced, otherwise the first image.
+     * @throws \Exception
+     */
+    public function __invoke(array|string $inputs, ...$args): array|Image
+    {
+        $images = prepareImages($inputs);
+        $modelInputs = ($this->processor)($images);
+        $modelOutputs = $this->model->__invoke($modelInputs);
+
+        $results = [];
+
+        /** @var Tensor $reconstruction */
+        foreach ($modelOutputs['reconstruction'] as $reconstruction) {
+            // Drop size-1 dimensions, limit values to [0, 1], then scale and round them to 8-bit integers.
+            $squeezed = $reconstruction->squeeze();
+            $scaled = $squeezed->clamp(0, 1)->multiplyScalar(255);
+            $pixels = $scaled->round()->to(NDArray::uint8);
+
+            $results[] = Image::fromTensor($pixels);
+        }
+
+        if (count($results) > 1) {
+            return $results;
+        }
+
+        return $results[0];
+    }
+}
diff --git a/src/Pipelines/Pipeline.php b/src/Pipelines/Pipeline.php
@@ -10,6 +10,8 @@
 use Codewithkyrian\Transformers\PretrainedTokenizers\PretrainedTokenizer;
 use Codewithkyrian\Transformers\Processors\AutoProcessor;
 use Codewithkyrian\Transformers\Processors\Processor;
+use Codewithkyrian\Transformers\Utils\Image;
+use Codewithkyrian\Transformers\Utils\Tensor;
 use Symfony\Component\Console\Output\OutputInterface;
 use function Codewithkyrian\Transformers\Utils\timeUsage;
 
@@ -29,7 +31,7 @@ public function __construct(
      * @param ...$args
      * @return array|Tensor|Image
      */
-    public function __invoke(array|string $inputs, ...$args): array
+    public function __invoke(array|string $inputs, ...$args): array|Tensor|Image
     {
         return [];
     }

diff --git a/src/Pipelines/Task.php b/src/Pipelines/Task.php
@@ -7,6 +7,7 @@
 use Codewithkyrian\Transformers\Models\Auto\AutoModel;
 use Codewithkyrian\Transformers\Models\Auto\AutoModelForCausalLM;
 use Codewithkyrian\Transformers\Models\Auto\AutoModelForImageClassification;
+use Codewithkyrian\Transformers\Models\Auto\AutoModelForImageToImage;
 use Codewithkyrian\Transformers\Models\Auto\AutoModelForImageFeatureExtraction;
 use Codewithkyrian\Transformers\Models\Auto\AutoModelForMaskedLM;
 use Codewithkyrian\Transformers\Models\Auto\AutoModelForObjectDetection;
@@ -44,6 +45,7 @@ enum Task: string
     case ImageClassification = 'image-classification';
     case ImageFeatureExtraction = 'image-feature-extraction';
     case ZeroShotImageClassification = 'zero-shot-image-classification';
+    case ImageToImage = 'image-to-image';
 
     case ObjectDetection = 'object-detection';
     case ZeroShotObjectDetection = 'zero-shot-object-detection';
@@ -83,6 +85,8 @@ public function pipeline(PretrainedModel $model, ?PretrainedTokenizer $tokenizer
 
             self::ZeroShotImageClassification => new ZeroShotImageClassificationPipeline($this, $model, $tokenizer, $processor),
 
+            self::ImageToImage => new ImageToImagePipeline($this, $model, processor: $processor),
+
             self::ObjectDetection => new ObjectDetectionPipeline($this, $model, $tokenizer, $processor),
 
             self::ZeroShotObjectDetection => new ZeroShotObjectDetectionPipeline($this, $model, $tokenizer, $processor),
@@ -121,6 +125,8 @@ public function defaultModelName(): string
 
             self::ZeroShotImageClassification => 'Xenova/clip-vit-base-patch32', // Original: 'openai/clip-vit-base-patch32'
 
+            self::ImageToImage => 'Xenova/swin2SR-classical-sr-x2-64', // Original: 'caidas/swin2SR-classical-sr-x2-64'
+
             self::ObjectDetection => 'Xenova/detr-resnet-50', // Original: 'facebook/detr-resnet-50',
 
             self::ZeroShotObjectDetection => 'Xenova/owlvit-base-patch32', // Original: 'google/owlvit-base-patch32',
@@ -166,6 +172,8 @@ public function autoModel(
 
             self::ZeroShotImageClassification => AutoModel::fromPretrained($modelNameOrPath, $quantized, $config, $cacheDir, $revision, $modelFilename, $output),
 
+            self::ImageToImage => AutoModelForImageToImage::fromPretrained($modelNameOrPath, $quantized, $config, $cacheDir, $revision, $modelFilename, $output),
+
             self::ObjectDetection => AutoModelForObjectDetection::fromPretrained($modelNameOrPath, $quantized, $config, $cacheDir, $revision, $modelFilename, $output),
 
             self::ZeroShotObjectDetection => AutoModelForZeroShotObjectDetection::fromPretrained($modelNameOrPath, $quantized, $config, $cacheDir, $revision, $modelFilename, $output),
@@ -184,6 +192,7 @@ public function autoTokenizer(
         return match ($this) {
 
             self::ImageClassification,
+            self::ImageToImage,
             self::ImageFeatureExtraction,
             self::ObjectDetection => null,
 
@@ -221,6 +230,7 @@ public function autoProcessor(
             self::ImageClassification,
             self::ImageFeatureExtraction,
             self::ZeroShotImageClassification,
+            self::ImageToImage,
             self::ObjectDetection,
             self::ZeroShotObjectDetection => AutoProcessor::fromPretrained($modelNameOrPath, $config, $cacheDir, $revision, $output),
 

diff --git a/src/Utils/Tensor.php b/src/Utils/Tensor.php
@@ -598,23 +598,9 @@ public function squeeze(?int $dim = null): static
     {
         $mo = self::getMo();
 
-        $result = clone $this;
-
-        if ($dim === null) {
-            $result->buffer = array_filter($result->buffer, fn($value) => $value !== 1);
-            $result->shape = array_filter($result->shape, fn($value) => $value !== 1);
-        } else {
-            $dim = $result->safeIndex($dim, $result->ndim());
-
-            if ($result->shape[$dim] !== 1) {
-                throw new Exception("DimensionError: cannot select an axis to squeeze out which has size not equal to one");
-            }
-
-            array_splice($result->buffer, $dim, 1);
-            array_splice($result->shape, $dim, 1);
-        }
+        $ndArray = $mo->la()->squeeze($this, $dim);
 
-        return $result;
+        return new static($ndArray->buffer(), $ndArray->dtype(), $ndArray->shape(), $ndArray->offset());
     }
 
     /**
@@ -684,10 +670,10 @@ public function round(): static
     /**
      * Performs Tensor dtype conversion.
      *
-     * @param string $dtype The target data type.
+     * @param int $dtype The target data type.
      * @return static The converted tensor.
      */
-    public function to(string $dtype): static
+    public function to(int $dtype): static
     {
         if ($this->dtype() === $dtype) {
             return $this;