Commit

Merge pull request #16 from CodeWithKyrian/add-image-feature-extraction-pipeline

Add image feature extraction pipeline
CodeWithKyrian authored Apr 8, 2024
2 parents 1fb1f68 + f78e1ec commit bc3ef74
Showing 10 changed files with 207 additions and 0 deletions.
19 changes: 19 additions & 0 deletions examples/pipelines/image-feature-extraction.php
@@ -0,0 +1,19 @@
<?php

declare(strict_types=1);


use Codewithkyrian\Transformers\Generation\Streamers\StdOutStreamer;
use function Codewithkyrian\Transformers\Pipelines\pipeline;
use function Codewithkyrian\Transformers\Utils\memoryUsage;
use function Codewithkyrian\Transformers\Utils\timeUsage;

require_once './bootstrap.php';

$imageFeatureExtractor = pipeline('image-feature-extraction', 'Xenova/vit-base-patch16-224-in21k');

$url = __DIR__ . '/../images/cats.jpg';

$features = $imageFeatureExtractor($url);

dd(count($features[0]), timeUsage(), memoryUsage());
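Because the pipeline's `__invoke()` is typed `array|string $inputs`, the same extractor can also process a batch of images in one call. A minimal sketch under that assumption (and assuming `prepareImages()` accepts both local paths and URLs, as the examples in this PR suggest):

```php
// Hypothetical batch call: returns one feature array per input image.
$features = $imageFeatureExtractor([
    __DIR__ . '/../images/cats.jpg',
    'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png',
]);
```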
1 change: 1 addition & 0 deletions src/Models/Auto/AutoModel.php
@@ -18,6 +18,7 @@ class AutoModel extends PretrainedMixin
"clip" => \Codewithkyrian\Transformers\Models\Pretrained\CLIPModel::class,
"vit" => \Codewithkyrian\Transformers\Models\Pretrained\ViTModel::class,
"deit" => \Codewithkyrian\Transformers\Models\Pretrained\DeiTModel::class,
"siglip" => \Codewithkyrian\Transformers\Models\Pretrained\SigLipModel::class,

'detr' => \Codewithkyrian\Transformers\Models\Pretrained\DETRModel::class,
'yolos' => \Codewithkyrian\Transformers\Models\Pretrained\YOLOSModel::class,
20 changes: 20 additions & 0 deletions src/Models/Auto/AutoModelForImageFeatureExtraction.php
@@ -0,0 +1,20 @@
<?php

declare(strict_types=1);


namespace Codewithkyrian\Transformers\Models\Auto;

class AutoModelForImageFeatureExtraction extends PretrainedMixin
{
const MODEL_CLASS_MAPPING = [
'clip' => \Codewithkyrian\Transformers\Models\Pretrained\CLIPVisionModelWithProjection::class,
'siglip' => \Codewithkyrian\Transformers\Models\Pretrained\SiglipVisionModel::class,
];

const MODEL_CLASS_MAPPINGS = [
self::MODEL_CLASS_MAPPING,
AutoModel::ENCODER_ONLY_MODEL_MAPPING,
AutoModel::DECODER_ONLY_MODEL_MAPPING,
];
}
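For reference, a minimal sketch of using the new auto class directly instead of going through a pipeline, assuming `fromPretrained()` defaults for everything after the model name (mirroring the positional call in `Task.php` below):

```php
use Codewithkyrian\Transformers\Models\Auto\AutoModelForImageFeatureExtraction;

// 'clip' resolves to CLIPVisionModelWithProjection via MODEL_CLASS_MAPPING;
// unmatched model types fall back to the encoder-only/decoder-only mappings.
$model = AutoModelForImageFeatureExtraction::fromPretrained('Xenova/clip-vit-base-patch32');
```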
25 changes: 25 additions & 0 deletions src/Models/Pretrained/CLIPVisionModelWithProjection.php
@@ -0,0 +1,25 @@
<?php

declare(strict_types=1);


namespace Codewithkyrian\Transformers\Models\Pretrained;

use Codewithkyrian\Transformers\Models\ModelArchitecture;
use Codewithkyrian\Transformers\Utils\AutoConfig;
use Symfony\Component\Console\Output\OutputInterface;

/**
* CLIP Vision Model with a projection layer on top (a linear layer on top of the pooled output)
*
* Particularly useful for image feature extraction tasks.
*/
class CLIPVisionModelWithProjection extends CLIPPretrainedModel
{
public static function fromPretrained(string $modelNameOrPath, bool $quantized = true, AutoConfig|array $config = null, ?string $cacheDir = null, ?string $token = null, string $revision = 'main', ?string $modelFilename = null, ModelArchitecture $modelArchitecture = ModelArchitecture::EncoderOnly, ?OutputInterface $output = null): PretrainedModel
{
// Update default model file name if not provided
$modelFilename ??= 'vision_model';
return parent::fromPretrained($modelNameOrPath, $quantized, $config, $cacheDir, $token, $revision, $modelFilename, $modelArchitecture, $output);
}
}
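A short sketch of what the `$modelFilename` override above means for callers; the exact checkpoint layout is the library's existing convention and not part of this diff:

```php
use Codewithkyrian\Transformers\Models\Pretrained\CLIPVisionModelWithProjection;

// With no $modelFilename argument, the override makes this load the checkpoint's
// 'vision_model' weights rather than the repository's default model file.
$visionModel = CLIPVisionModelWithProjection::fromPretrained('Xenova/clip-vit-base-patch32');
```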
14 changes: 14 additions & 0 deletions src/Models/Pretrained/SiglipModel.php
@@ -0,0 +1,14 @@
<?php

declare(strict_types=1);


namespace Codewithkyrian\Transformers\Models\Pretrained;

/**
* SigLIP Text and Vision Model with projection layers on top
*/
class SiglipModel extends SiglipPretrainedModel
{

}
11 changes: 11 additions & 0 deletions src/Models/Pretrained/SiglipPretrainedModel.php
@@ -0,0 +1,11 @@
<?php

declare(strict_types=1);


namespace Codewithkyrian\Transformers\Models\Pretrained;

class SiglipPretrainedModel extends PretrainedModel
{

}
23 changes: 23 additions & 0 deletions src/Models/Pretrained/SiglipTextModel.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<?php

declare(strict_types=1);


namespace Codewithkyrian\Transformers\Models\Pretrained;

use Codewithkyrian\Transformers\Models\ModelArchitecture;
use Codewithkyrian\Transformers\Utils\AutoConfig;
use Symfony\Component\Console\Output\OutputInterface;

/**
* The text model from SigLIP without any head or projection on top.
*/
class SiglipTextModel extends SiglipPretrainedModel
{
public static function fromPretrained(string $modelNameOrPath, bool $quantized = true, AutoConfig|array $config = null, ?string $cacheDir = null, ?string $token = null, string $revision = 'main', ?string $modelFilename = null, ModelArchitecture $modelArchitecture = ModelArchitecture::EncoderOnly, ?OutputInterface $output = null): PretrainedModel
{
// Update default model file name if not provided
$modelFilename ??= 'text_model';
return parent::fromPretrained($modelNameOrPath, $quantized, $config, $cacheDir, $token, $revision, $modelFilename, $modelArchitecture, $output);
}
}
20 changes: 20 additions & 0 deletions src/Models/Pretrained/SiglipVisionModel.php
@@ -0,0 +1,20 @@
<?php

declare(strict_types=1);


namespace Codewithkyrian\Transformers\Models\Pretrained;

use Codewithkyrian\Transformers\Models\ModelArchitecture;
use Codewithkyrian\Transformers\Utils\AutoConfig;
use Symfony\Component\Console\Output\OutputInterface;

/**
 * The vision model from SigLIP without any head or projection on top.
 */
class SiglipVisionModel extends CLIPPretrainedModel
{
public static function fromPretrained(string $modelNameOrPath, bool $quantized = true, AutoConfig|array $config = null, ?string $cacheDir = null, ?string $token = null, string $revision = 'main', ?string $modelFilename = null, ModelArchitecture $modelArchitecture = ModelArchitecture::EncoderOnly, ?OutputInterface $output = null): PretrainedModel
{
// Update default model file name if not provided
$modelFilename ??= 'vision_model';
return parent::fromPretrained($modelNameOrPath, $quantized, $config, $cacheDir, $token, $revision, $modelFilename, $modelArchitecture, $output);
}
}
64 changes: 64 additions & 0 deletions src/Pipelines/ImageFeatureExtractionPipeline.php
@@ -0,0 +1,64 @@
<?php

declare(strict_types=1);


namespace Codewithkyrian\Transformers\Pipelines;

use function Codewithkyrian\Transformers\Utils\prepareImages;

/**
* Image feature extraction pipeline using no model head. This pipeline extracts the hidden
* states from the base transformer, which can be used as features in downstream tasks.
*
* **Example:** Perform image feature extraction with `Xenova/vit-base-patch16-224-in21k`.
* ```php
* $imageFeatureExtractor = pipeline('image-feature-extraction', 'Xenova/vit-base-patch16-224-in21k');
* $url = 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png';
* $features = $imageFeatureExtractor($url);
* // Returns a nested array of shape [1, 197, 768] containing the hidden states
* ```
*
* **Example:** Compute image embeddings with `Xenova/clip-vit-base-patch32`.
* ```php
* $imageFeatureExtractor = pipeline('image-feature-extraction', 'Xenova/clip-vit-base-patch32');
* $url = 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png';
* $features = $imageFeatureExtractor($url);
* // Returns a nested array of shape [1, 512] containing the image embeddings
* ```
*/
class ImageFeatureExtractionPipeline extends Pipeline
{
public function __invoke(array|string $inputs, ...$args): array
{
$pool = $args['pool'] ?? null;
$preparedImages = prepareImages($inputs);


['pixel_values' => $pixelValues] = ($this->processor)($preparedImages);

$output = $this->model->__invoke(['pixel_values' => $pixelValues]);

$result = [];

if ($pool) {
if (!isset($output['pooler_output'])) {
throw new \Exception("No pooled output was returned. Make sure the model has a 'pooler' layer when using the 'pool' option.");
}

$result = $output['pooler_output'];
} else {
$result = $output['last_hidden_state'] ?? $output['logits'] ?? $output['image_embeds'];
}

return $result->toArray();
}
}
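A brief usage sketch for the `pool` flag handled above. The named argument is collected into the variadic `$args`; whether a given checkpoint actually exposes a `pooler_output` depends on the exported model, so the pooled call is an assumption here:

```php
use function Codewithkyrian\Transformers\Pipelines\pipeline;

$imageFeatureExtractor = pipeline('image-feature-extraction', 'Xenova/vit-base-patch16-224-in21k');
$url = 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png';

// Default: unpooled output (last_hidden_state, logits, or image_embeds).
$features = $imageFeatureExtractor($url);

// Pooled output; throws if the model returns no 'pooler_output'.
$pooled = $imageFeatureExtractor($url, pool: true);
```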
10 changes: 10 additions & 0 deletions src/Pipelines/Task.php
@@ -7,6 +7,7 @@
use Codewithkyrian\Transformers\Models\Auto\AutoModel;
use Codewithkyrian\Transformers\Models\Auto\AutoModelForCausalLM;
use Codewithkyrian\Transformers\Models\Auto\AutoModelForImageClassification;
use Codewithkyrian\Transformers\Models\Auto\AutoModelForImageFeatureExtraction;
use Codewithkyrian\Transformers\Models\Auto\AutoModelForMaskedLM;
use Codewithkyrian\Transformers\Models\Auto\AutoModelForObjectDetection;
use Codewithkyrian\Transformers\Models\Auto\AutoModelForQuestionAnswering;
@@ -41,6 +42,7 @@ enum Task: string

case ImageToText = 'image-to-text';
case ImageClassification = 'image-classification';
case ImageFeatureExtraction = 'image-feature-extraction';
case ZeroShotImageClassification = 'zero-shot-image-classification';

case ObjectDetection = 'object-detection';
@@ -77,6 +79,8 @@ public function pipeline(PretrainedModel $model, ?PretrainedTokenizer $tokenizer

self::ImageClassification => new ImageClassificationPipeline($this, $model, processor: $processor),

self::ImageFeatureExtraction => new ImageFeatureExtractionPipeline($this, $model, processor: $processor),

self::ZeroShotImageClassification => new ZeroShotImageClassificationPipeline($this, $model, $tokenizer, $processor),

self::ObjectDetection => new ObjectDetectionPipeline($this, $model, $tokenizer, $processor),
@@ -113,6 +117,8 @@ public function defaultModelName(): string

self::ImageClassification => 'Xenova/vit-base-patch16-224', // Original: 'google/vit-base-patch16-224'

self::ImageFeatureExtraction => 'Xenova/vit-base-patch16-224-in21k', // Original: 'google/vit-base-patch16-224-in21k'

self::ZeroShotImageClassification => 'Xenova/clip-vit-base-patch32', // Original: 'openai/clip-vit-base-patch32'

self::ObjectDetection => 'Xenova/detr-resnet-50', // Original: 'facebook/detr-resnet-50',
@@ -156,6 +162,8 @@ public function autoModel(

self::ImageClassification => AutoModelForImageClassification::fromPretrained($modelNameOrPath, $quantized, $config, $cacheDir, $revision, $modelFilename, $output),

self::ImageFeatureExtraction => AutoModelForImageFeatureExtraction::fromPretrained($modelNameOrPath, $quantized, $config, $cacheDir, $revision, $modelFilename, $output),

self::ZeroShotImageClassification => AutoModel::fromPretrained($modelNameOrPath, $quantized, $config, $cacheDir, $revision, $modelFilename, $output),

self::ObjectDetection => AutoModelForObjectDetection::fromPretrained($modelNameOrPath, $quantized, $config, $cacheDir, $revision, $modelFilename, $output),
@@ -176,6 +184,7 @@ public function autoTokenizer(
return match ($this) {

self::ImageClassification,
self::ImageFeatureExtraction,
self::ObjectDetection => null,


@@ -210,6 +219,7 @@ public function autoProcessor(

self::ImageToText,
self::ImageClassification,
self::ImageFeatureExtraction,
self::ZeroShotImageClassification,
self::ObjectDetection,
self::ZeroShotObjectDetection => AutoProcessor::fromPretrained($modelNameOrPath, $config, $cacheDir, $revision, $output),
