Commit

Merge pull request #16 from CodeWithKyrian/add-image-feature-extraction-pipeline

Add image feature extraction pipeline
CodeWithKyrian authored Apr 8, 2024
2 parents 1fb1f68 + f78e1ec commit bc3ef74
Showing 10 changed files with 207 additions and 0 deletions.
19 changes: 19 additions & 0 deletions examples/pipelines/image-feature-extraction.php
@@ -0,0 +1,19 @@
<?php

declare(strict_types=1);


use Codewithkyrian\Transformers\Generation\Streamers\StdOutStreamer;
use function Codewithkyrian\Transformers\Pipelines\pipeline;
use function Codewithkyrian\Transformers\Utils\memoryUsage;
use function Codewithkyrian\Transformers\Utils\timeUsage;

require_once './bootstrap.php';

$imageFeatureExtractor = pipeline('image-feature-extraction', 'Xenova/vit-base-patch16-224-in21k');

$url = __DIR__ . '/../images/cats.jpg';

$features = $imageFeatureExtractor($url);

dd(count($features[0]), timeUsage(), memoryUsage());
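Because the pipeline's `__invoke()` is typed `array|string $inputs`, the same extractor can also process a batch of images in one call. A minimal sketch under that assumption (and assuming `prepareImages()` accepts both local paths and URLs, as the examples in this PR suggest):

```php
// Hypothetical batch call: returns one feature array per input image.
$features = $imageFeatureExtractor([
    __DIR__ . '/../images/cats.jpg',
    'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png',
]);
```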
1 change: 1 addition & 0 deletions src/Models/Auto/AutoModel.php
@@ -18,6 +18,7 @@ class AutoModel extends PretrainedMixin
"clip" => \Codewithkyrian\Transformers\Models\Pretrained\CLIPModel::class,
"vit" => \Codewithkyrian\Transformers\Models\Pretrained\ViTModel::class,
"deit" => \Codewithkyrian\Transformers\Models\Pretrained\DeiTModel::class,
"siglip" => \Codewithkyrian\Transformers\Models\Pretrained\SigLipModel::class,

'detr' => \Codewithkyrian\Transformers\Models\Pretrained\DETRModel::class,
'yolos' => \Codewithkyrian\Transformers\Models\Pretrained\YOLOSModel::class,
20 changes: 20 additions & 0 deletions src/Models/Auto/AutoModelForImageFeatureExtraction.php
@@ -0,0 +1,20 @@
<?php

declare(strict_types=1);


namespace Codewithkyrian\Transformers\Models\Auto;

class AutoModelForImageFeatureExtraction extends PretrainedMixin
{
const MODEL_CLASS_MAPPING = [
'clip' => \Codewithkyrian\Transformers\Models\Pretrained\CLIPVisionModelWithProjection::class,
'siglip' => \Codewithkyrian\Transformers\Models\Pretrained\SiglipVisionModel::class,
];

const MODEL_CLASS_MAPPINGS = [
self::MODEL_CLASS_MAPPING,
AutoModel::ENCODER_ONLY_MODEL_MAPPING,
AutoModel::DECODER_ONLY_MODEL_MAPPING,
];
}
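For reference, a minimal sketch of using the new auto class directly instead of going through a pipeline, assuming `fromPretrained()` defaults for everything after the model name (mirroring the positional call in `Task.php` below):

```php
use Codewithkyrian\Transformers\Models\Auto\AutoModelForImageFeatureExtraction;

// 'clip' resolves to CLIPVisionModelWithProjection via MODEL_CLASS_MAPPING;
// unmatched model types fall back to the encoder-only/decoder-only mappings.
$model = AutoModelForImageFeatureExtraction::fromPretrained('Xenova/clip-vit-base-patch32');
```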
25 changes: 25 additions & 0 deletions src/Models/Pretrained/CLIPVisionModelWithProjection.php
@@ -0,0 +1,25 @@
<?php

declare(strict_types=1);


namespace Codewithkyrian\Transformers\Models\Pretrained;

use Codewithkyrian\Transformers\Models\ModelArchitecture;
use Codewithkyrian\Transformers\Utils\AutoConfig;
use Symfony\Component\Console\Output\OutputInterface;

/**
* CLIP Vision Model with a projection layer on top (a linear layer on top of the pooled output)
*
* Particularly useful for image feature extraction tasks.
*/
class CLIPVisionModelWithProjection extends CLIPPretrainedModel
{
public static function fromPretrained(string $modelNameOrPath, bool $quantized = true, AutoConfig|array $config = null, ?string $cacheDir = null, ?string $token = null, string $revision = 'main', ?string $modelFilename = null, ModelArchitecture $modelArchitecture = ModelArchitecture::EncoderOnly, ?OutputInterface $output = null): PretrainedModel
{
// Update default model file name if not provided
$modelFilename ??= 'vision_model';
return parent::fromPretrained($modelNameOrPath, $quantized, $config, $cacheDir, $token, $revision, $modelFilename, $modelArchitecture, $output);
}
}
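A short sketch of what the `$modelFilename` override above means for callers; the exact checkpoint layout is the library's existing convention and not part of this diff:

```php
use Codewithkyrian\Transformers\Models\Pretrained\CLIPVisionModelWithProjection;

// With no $modelFilename argument, the override makes this load the checkpoint's
// 'vision_model' weights rather than the repository's default model file.
$visionModel = CLIPVisionModelWithProjection::fromPretrained('Xenova/clip-vit-base-patch32');
```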
14 changes: 14 additions & 0 deletions src/Models/Pretrained/SiglipModel.php
@@ -0,0 +1,14 @@
<?php

declare(strict_types=1);


namespace Codewithkyrian\Transformers\Models\Pretrained;

/**
* SigLIP Text and Vision Model with projection layers on top
*/
class SiglipModel extends SiglipPretrainedModel
{

}
11 changes: 11 additions & 0 deletions src/Models/Pretrained/SiglipPretrainedModel.php
@@ -0,0 +1,11 @@
<?php

declare(strict_types=1);


namespace Codewithkyrian\Transformers\Models\Pretrained;

class SiglipPretrainedModel extends PretrainedModel
{

}
23 changes: 23 additions & 0 deletions src/Models/Pretrained/SiglipTextModel.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
<?php

declare(strict_types=1);


namespace Codewithkyrian\Transformers\Models\Pretrained;

use Codewithkyrian\Transformers\Models\ModelArchitecture;
use Codewithkyrian\Transformers\Utils\AutoConfig;
use Symfony\Component\Console\Output\OutputInterface;

/**
* The text model from SigLIP without any head or projection on top.
*/
class SiglipTextModel extends SiglipPretrainedModel
{
public static function fromPretrained(string $modelNameOrPath, bool $quantized = true, AutoConfig|array $config = null, ?string $cacheDir = null, ?string $token = null, string $revision = 'main', ?string $modelFilename = null, ModelArchitecture $modelArchitecture = ModelArchitecture::EncoderOnly, ?OutputInterface $output = null): PretrainedModel
{
// Update default model file name if not provided
$modelFilename ??= 'text_model';
return parent::fromPretrained($modelNameOrPath, $quantized, $config, $cacheDir, $token, $revision, $modelFilename, $modelArchitecture, $output);
}
}
20 changes: 20 additions & 0 deletions src/Models/Pretrained/SiglipVisionModel.php
@@ -0,0 +1,20 @@
<?php

declare(strict_types=1);


namespace Codewithkyrian\Transformers\Models\Pretrained;

use Codewithkyrian\Transformers\Models\ModelArchitecture;
use Codewithkyrian\Transformers\Utils\AutoConfig;
use Symfony\Component\Console\Output\OutputInterface;

/**
 * The vision model from SigLIP without any head or projection on top.
 */
class SiglipVisionModel extends CLIPPretrainedModel
{
public static function fromPretrained(string $modelNameOrPath, bool $quantized = true, AutoConfig|array $config = null, ?string $cacheDir = null, ?string $token = null, string $revision = 'main', ?string $modelFilename = null, ModelArchitecture $modelArchitecture = ModelArchitecture::EncoderOnly, ?OutputInterface $output = null): PretrainedModel
{
// Update default model file name if not provided
$modelFilename ??= 'vision_model';
return parent::fromPretrained($modelNameOrPath, $quantized, $config, $cacheDir, $token, $revision, $modelFilename, $modelArchitecture, $output);
}
}
64 changes: 64 additions & 0 deletions src/Pipelines/ImageFeatureExtractionPipeline.php
@@ -0,0 +1,64 @@
<?php

declare(strict_types=1);


namespace Codewithkyrian\Transformers\Pipelines;

use function Codewithkyrian\Transformers\Utils\prepareImages;

/**
* Image feature extraction pipeline using no model head. This pipeline extracts the hidden
* states from the base transformer, which can be used as features in downstream tasks.
*
* **Example:** Perform image feature extraction with `Xenova/vit-base-patch16-224-in21k`.
* ```php
* $imageFeatureExtractor = pipeline('image-feature-extraction', 'Xenova/vit-base-patch16-224-in21k');
* $url = 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png';
* $features = $imageFeatureExtractor($url);
* // Returns a nested array of shape [1, 197, 768] containing the hidden states
* ```
*
* **Example:** Compute image embeddings with `Xenova/clip-vit-base-patch32`.
* ```php
* $imageFeatureExtractor = pipeline('image-feature-extraction', 'Xenova/clip-vit-base-patch32');
* $url = 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png';
* $features = $imageFeatureExtractor($url);
* // Returns a nested array of shape [1, 512] containing the image embeddings
* ```
*/
class ImageFeatureExtractionPipeline extends Pipeline
{
public function __invoke(array|string $inputs, ...$args): array
{
$pool = $args['pool'] ?? null;
$preparedImages = prepareImages($inputs);


['pixel_values' => $pixelValues] = ($this->processor)($preparedImages);

$output = $this->model->__invoke(['pixel_values' => $pixelValues]);

$result = [];

if ($pool) {
if (!isset($output['pooler_output'])) {
throw new \Exception("No pooled output was returned. Make sure the model has a 'pooler' layer when using the 'pool' option.");
}

$result = $output['pooler_output'];
} else {
$result = $output['last_hidden_state'] ?? $output['logits'] ?? $output['image_embeds'];
}

return $result->toArray();
}
}
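A brief usage sketch for the `pool` flag handled above. The named argument is collected into the variadic `$args`; whether a given checkpoint actually exposes a `pooler_output` depends on the exported model, so the pooled call is an assumption here:

```php
use function Codewithkyrian\Transformers\Pipelines\pipeline;

$imageFeatureExtractor = pipeline('image-feature-extraction', 'Xenova/vit-base-patch16-224-in21k');
$url = 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png';

// Default: unpooled output (last_hidden_state, logits, or image_embeds).
$features = $imageFeatureExtractor($url);

// Pooled output; throws if the model returns no 'pooler_output'.
$pooled = $imageFeatureExtractor($url, pool: true);
```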
10 changes: 10 additions & 0 deletions src/Pipelines/Task.php
@@ -7,6 +7,7 @@
use Codewithkyrian\Transformers\Models\Auto\AutoModel;
use Codewithkyrian\Transformers\Models\Auto\AutoModelForCausalLM;
use Codewithkyrian\Transformers\Models\Auto\AutoModelForImageClassification;
use Codewithkyrian\Transformers\Models\Auto\AutoModelForImageFeatureExtraction;
use Codewithkyrian\Transformers\Models\Auto\AutoModelForMaskedLM;
use Codewithkyrian\Transformers\Models\Auto\AutoModelForObjectDetection;
use Codewithkyrian\Transformers\Models\Auto\AutoModelForQuestionAnswering;
@@ -41,6 +42,7 @@ enum Task: string

case ImageToText = 'image-to-text';
case ImageClassification = 'image-classification';
case ImageFeatureExtraction = 'image-feature-extraction';
case ZeroShotImageClassification = 'zero-shot-image-classification';

case ObjectDetection = 'object-detection';
@@ -77,6 +79,8 @@ public function pipeline(PretrainedModel $model, ?PretrainedTokenizer $tokenizer

self::ImageClassification => new ImageClassificationPipeline($this, $model, processor: $processor),

self::ImageFeatureExtraction => new ImageFeatureExtractionPipeline($this, $model, processor: $processor),

self::ZeroShotImageClassification => new ZeroShotImageClassificationPipeline($this, $model, $tokenizer, $processor),

self::ObjectDetection => new ObjectDetectionPipeline($this, $model, $tokenizer, $processor),
@@ -113,6 +117,8 @@ public function defaultModelName(): string

self::ImageClassification => 'Xenova/vit-base-patch16-224', // Original: 'google/vit-base-patch16-224'

self::ImageFeatureExtraction => 'Xenova/vit-base-patch16-224-in21k', // Original: 'google/vit-base-patch16-224-in21k'

self::ZeroShotImageClassification => 'Xenova/clip-vit-base-patch32', // Original: 'openai/clip-vit-base-patch32'

self::ObjectDetection => 'Xenova/detr-resnet-50', // Original: 'facebook/detr-resnet-50',
@@ -156,6 +162,8 @@ public function autoModel(

self::ImageClassification => AutoModelForImageClassification::fromPretrained($modelNameOrPath, $quantized, $config, $cacheDir, $revision, $modelFilename, $output),

self::ImageFeatureExtraction => AutoModelForImageFeatureExtraction::fromPretrained($modelNameOrPath, $quantized, $config, $cacheDir, $revision, $modelFilename, $output),

self::ZeroShotImageClassification => AutoModel::fromPretrained($modelNameOrPath, $quantized, $config, $cacheDir, $revision, $modelFilename, $output),

self::ObjectDetection => AutoModelForObjectDetection::fromPretrained($modelNameOrPath, $quantized, $config, $cacheDir, $revision, $modelFilename, $output),
@@ -176,6 +184,7 @@ public function autoTokenizer(
return match ($this) {

self::ImageClassification,
self::ImageFeatureExtraction,
self::ObjectDetection => null,


@@ -210,6 +219,7 @@ public function autoProcessor(

self::ImageToText,
self::ImageClassification,
self::ImageFeatureExtraction,
self::ZeroShotImageClassification,
self::ObjectDetection,
self::ZeroShotObjectDetection => AutoProcessor::fromPretrained($modelNameOrPath, $config, $cacheDir, $revision, $output),
