-
Notifications
You must be signed in to change notification settings - Fork 35
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #16 from CodeWithKyrian/add-image-feature-extraction-pipeline
Add image feature extraction pipeline
Showing
10 changed files
with
207 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
<?php | ||
|
||
declare(strict_types=1); | ||
|
||
|
||
use Codewithkyrian\Transformers\Generation\Streamers\StdOutStreamer; | ||
use function Codewithkyrian\Transformers\Pipelines\pipeline; | ||
use function Codewithkyrian\Transformers\Utils\memoryUsage; | ||
use function Codewithkyrian\Transformers\Utils\timeUsage; | ||
|
||
require_once './bootstrap.php'; | ||
|
||
// Build an image feature extraction pipeline backed by the ViT checkpoint.
$extractor = pipeline('image-feature-extraction', 'Xenova/vit-base-patch16-224-in21k');

// Local sample image, resolved relative to this script's directory.
$imagePath = __DIR__ . '/../images/cats.jpg';

// Run the pipeline on the image; the result is a nested array of features.
$embeddings = $extractor($imagePath);

// Dump the size of the first feature entry together with timing and memory statistics.
dd(count($embeddings[0]), timeUsage(), memoryUsage());
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
<?php | ||
|
||
declare(strict_types=1); | ||
|
||
|
||
namespace Codewithkyrian\Transformers\Models\Auto; | ||
|
||
/**
 * Auto-model registry for the image feature extraction task.
 *
 * Declares which pretrained model class to use for a given model type key.
 * NOTE(review): the lookup itself is presumably implemented in PretrainedMixin,
 * which is not visible in this file — confirm how MODEL_CLASS_MAPPINGS is consumed.
 */
class AutoModelForImageFeatureExtraction extends PretrainedMixin
{
    /**
     * Task-specific mapping from model type key to the vision model class.
     *
     * @var array<string, class-string>
     */
    public const MODEL_CLASS_MAPPING = [
        'clip' => \Codewithkyrian\Transformers\Models\Pretrained\CLIPVisionModelWithProjection::class,
        'siglip' => \Codewithkyrian\Transformers\Models\Pretrained\SiglipVisionModel::class,
    ];

    /**
     * All mappings available to this auto-model: the task-specific mapping first,
     * followed by the generic encoder-only and decoder-only mappings from AutoModel.
     *
     * @var list<array<string, class-string>>
     */
    public const MODEL_CLASS_MAPPINGS = [
        self::MODEL_CLASS_MAPPING,
        AutoModel::ENCODER_ONLY_MODEL_MAPPING,
        AutoModel::DECODER_ONLY_MODEL_MAPPING,
    ];
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
<?php | ||
|
||
declare(strict_types=1); | ||
|
||
|
||
namespace Codewithkyrian\Transformers\Models\Pretrained; | ||
|
||
use Codewithkyrian\Transformers\Models\ModelArchitecture; | ||
use Codewithkyrian\Transformers\Utils\AutoConfig; | ||
use Symfony\Component\Console\Output\OutputInterface; | ||
|
||
/**
 * CLIP vision model with a projection layer on top (a linear layer on top of the pooled output).
 *
 * Particularly useful for image feature extraction tasks.
 */
class CLIPVisionModelWithProjection extends CLIPPretrainedModel
{
    /**
     * Loads the pretrained model, defaulting the model file name to 'vision_model'
     * when the caller does not supply one. Every argument is otherwise forwarded
     * unchanged to the parent implementation.
     *
     * The `$config` parameter is declared explicitly nullable: relying on the
     * `= null` default to make a type nullable is deprecated as of PHP 8.4.
     */
    public static function fromPretrained(
        string $modelNameOrPath,
        bool $quantized = true,
        AutoConfig|array|null $config = null,
        ?string $cacheDir = null,
        ?string $token = null,
        string $revision = 'main',
        ?string $modelFilename = null,
        ModelArchitecture $modelArchitecture = ModelArchitecture::EncoderOnly,
        ?OutputInterface $output = null,
    ): PretrainedModel {
        // Only the default changes; an explicitly passed file name is respected.
        $modelFilename ??= 'vision_model';

        return parent::fromPretrained(
            $modelNameOrPath,
            $quantized,
            $config,
            $cacheDir,
            $token,
            $revision,
            $modelFilename,
            $modelArchitecture,
            $output,
        );
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,14 @@ | ||
<?php | ||
|
||
declare(strict_types=1); | ||
|
||
|
||
namespace Codewithkyrian\Transformers\Models\Pretrained; | ||
|
||
/**
 * SigLIP text and vision model with projection layers on top.
 *
 * Adds no behaviour of its own; everything is inherited from SiglipPretrainedModel.
 */
class SiglipModel extends SiglipPretrainedModel {}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
<?php | ||
|
||
declare(strict_types=1); | ||
|
||
|
||
namespace Codewithkyrian\Transformers\Models\Pretrained; | ||
|
||
/**
 * Common base class for SigLIP models.
 *
 * Currently declares no members of its own beyond what PretrainedModel provides.
 */
class SiglipPretrainedModel extends PretrainedModel {}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
<?php | ||
|
||
declare(strict_types=1); | ||
|
||
|
||
namespace Codewithkyrian\Transformers\Models\Pretrained; | ||
|
||
use Codewithkyrian\Transformers\Models\ModelArchitecture; | ||
use Codewithkyrian\Transformers\Utils\AutoConfig; | ||
use Symfony\Component\Console\Output\OutputInterface; | ||
|
||
/**
 * The text model from SigLIP without any head or projection on top.
 */
class SiglipTextModel extends SiglipPretrainedModel
{
    /**
     * Loads the pretrained model, defaulting the model file name to 'text_model'
     * when the caller does not supply one. Every argument is otherwise forwarded
     * unchanged to the parent implementation.
     *
     * The `$config` parameter is declared explicitly nullable: relying on the
     * `= null` default to make a type nullable is deprecated as of PHP 8.4.
     */
    public static function fromPretrained(
        string $modelNameOrPath,
        bool $quantized = true,
        AutoConfig|array|null $config = null,
        ?string $cacheDir = null,
        ?string $token = null,
        string $revision = 'main',
        ?string $modelFilename = null,
        ModelArchitecture $modelArchitecture = ModelArchitecture::EncoderOnly,
        ?OutputInterface $output = null,
    ): PretrainedModel {
        // Only the default changes; an explicitly passed file name is respected.
        $modelFilename ??= 'text_model';

        return parent::fromPretrained(
            $modelNameOrPath,
            $quantized,
            $config,
            $cacheDir,
            $token,
            $revision,
            $modelFilename,
            $modelArchitecture,
            $output,
        );
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
<?php | ||
|
||
declare(strict_types=1); | ||
|
||
|
||
namespace Codewithkyrian\Transformers\Models\Pretrained; | ||
|
||
use Codewithkyrian\Transformers\Models\ModelArchitecture; | ||
use Codewithkyrian\Transformers\Utils\AutoConfig; | ||
use Symfony\Component\Console\Output\OutputInterface; | ||
|
||
/**
 * SigLIP vision model, registered under the 'siglip' key for image feature extraction.
 *
 * NOTE(review): this class extends CLIPPretrainedModel rather than
 * SiglipPretrainedModel — confirm that this is intentional.
 */
class SiglipVisionModel extends CLIPPretrainedModel
{
    /**
     * Loads the pretrained model, defaulting the model file name to 'vision_model'
     * when the caller does not supply one. Every argument is otherwise forwarded
     * unchanged to the parent implementation.
     *
     * The `$config` parameter is declared explicitly nullable: relying on the
     * `= null` default to make a type nullable is deprecated as of PHP 8.4.
     */
    public static function fromPretrained(
        string $modelNameOrPath,
        bool $quantized = true,
        AutoConfig|array|null $config = null,
        ?string $cacheDir = null,
        ?string $token = null,
        string $revision = 'main',
        ?string $modelFilename = null,
        ModelArchitecture $modelArchitecture = ModelArchitecture::EncoderOnly,
        ?OutputInterface $output = null,
    ): PretrainedModel {
        // Only the default changes; an explicitly passed file name is respected.
        $modelFilename ??= 'vision_model';

        return parent::fromPretrained(
            $modelNameOrPath,
            $quantized,
            $config,
            $cacheDir,
            $token,
            $revision,
            $modelFilename,
            $modelArchitecture,
            $output,
        );
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,64 @@ | ||
<?php | ||
|
||
declare(strict_types=1); | ||
|
||
|
||
namespace Codewithkyrian\Transformers\Pipelines; | ||
|
||
use function Codewithkyrian\Transformers\Utils\prepareImages; | ||
|
||
/**
 * Image feature extraction pipeline using no model head. This pipeline extracts the hidden
 * states from the base transformer, which can be used as features in downstream tasks.
 *
 * The selected model output is converted with `toArray()`, so callers receive a nested
 * PHP array rather than a tensor object.
 *
 * **Example:** Perform image feature extraction with `Xenova/vit-base-patch16-224-in21k`.
 * ```php
 * $imageFeatureExtractor = pipeline('image-feature-extraction', 'Xenova/vit-base-patch16-224-in21k');
 * $url = 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png';
 * $features = $imageFeatureExtractor($url);
 * // Nested array mirroring a tensor of shape [ 1, 197, 768 ] (151296 values)
 * ```
 *
 * **Example:** Compute image embeddings with `Xenova/clip-vit-base-patch32`.
 * ```php
 * $imageFeatureExtractor = pipeline('image-feature-extraction', 'Xenova/clip-vit-base-patch32');
 * $url = 'https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/cats.png';
 * $features = $imageFeatureExtractor($url);
 * // Nested array mirroring a tensor of shape [ 1, 512 ] (512 values)
 * ```
 */
class ImageFeatureExtractionPipeline extends Pipeline
{
    /**
     * Extracts features from one or more images.
     *
     * @param array|string $inputs Image input(s); passed as-is to prepareImages().
     * @param mixed ...$args Optional named arguments. Only `pool` is read: when truthy,
     *                       the model's `pooler_output` is returned instead of the
     *                       default output.
     *
     * @return array The selected model output converted via `toArray()`.
     *
     * @throws \Exception When `pool` is requested but the model returns no `pooler_output`,
     *                    or when none of the expected default outputs is present.
     */
    public function __invoke(array|string $inputs, ...$args): array
    {
        $pool = $args['pool'] ?? null;

        $preparedImages = prepareImages($inputs);

        ['pixel_values' => $pixelValues] = ($this->processor)($preparedImages);

        $output = $this->model->__invoke(['pixel_values' => $pixelValues]);

        if ($pool) {
            if (!isset($output['pooler_output'])) {
                throw new \Exception("No pooled output was returned. Make sure the model has a 'pooler' layer when using the 'pool' option.");
            }

            return $output['pooler_output']->toArray();
        }

        // Take the first available output, in order of preference. The trailing `?? null`
        // avoids an undefined-key warning when the model provides none of them.
        $result = $output['last_hidden_state'] ?? $output['logits'] ?? $output['image_embeds'] ?? null;

        if ($result === null) {
            // Fail with a clear message instead of calling toArray() on null.
            throw new \Exception("The model returned none of the expected outputs ('last_hidden_state', 'logits', 'image_embeds').");
        }

        return $result->toArray();
    }
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters