Skip to content

Commit

Permalink
Merge pull request #14 from CodeWithKyrian/add-zero-shot-object-detec…
Browse files Browse the repository at this point in the history
…tion-pipeline

Add Zero Shot Object Detection Pipeline and OwlVit models
  • Loading branch information
CodeWithKyrian authored Apr 5, 2024
2 parents f794e9e + e4abee3 commit daa10f3
Show file tree
Hide file tree
Showing 21 changed files with 303 additions and 8 deletions.
2 changes: 1 addition & 1 deletion bin/transformers
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ $application = new Application();
try {
$application->setName('Transformers PHP CLI');

$application->add(new Codewithkyrian\Transformers\Commands\InitCommand());
// $application->add(new Codewithkyrian\Transformers\Commands\InitCommand());
$application->add(new Codewithkyrian\Transformers\Commands\DownloadModelCommand());

$application->run();
Expand Down
Binary file added examples/images/astronaut.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added examples/images/beach.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
4 changes: 2 additions & 2 deletions examples/pipelines/image-to-text.php
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@
//$captioner = pipeline('image-to-text', 'Xenova/vit-gpt2-image-captioning');
$captioner = pipeline('image-to-text', 'Xenova/trocr-small-handwritten');

$streamer = StdOutStreamer::make($captioner->tokenizer);
//$streamer = StdOutStreamer::make($captioner->tokenizer);

//$url = __DIR__. '/../images/cats.jpg';
$url = __DIR__. '/../images/beach.png';
//$url = __DIR__. '/../images/handwriting.jpg';
//$url = __DIR__. '/../images/handwriting3.png';
$url = __DIR__. '/../images/handwriting4.jpeg';
Expand Down
24 changes: 24 additions & 0 deletions examples/pipelines/zero-shot-object-detection.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
<?php

declare(strict_types=1);

namespace Codewithkyrian\Transformers\Pipelines;

use function Codewithkyrian\Transformers\Utils\memoryUsage;
use function Codewithkyrian\Transformers\Utils\timeUsage;

// Load bootstrap.php; the './' path is resolved against the current working directory.
require_once './bootstrap.php';

// Remove the PHP memory limit for this run.
ini_set('memory_limit', '-1');

$detector = pipeline('zero-shot-object-detection', 'Xenova/owlvit-base-patch32');

// Alternative input: uncomment this pair (and comment out the pair below) to try it.
// Previously both pairs were live, so this one was silently overwritten.
//$url = __DIR__. '/../images/astronaut.png';
//$candidateLabels = ['human face', 'rocket', 'helmet', 'american flag'];

$url = __DIR__. '/../images/beach.png';
$candidateLabels = ['hat', 'book', 'sunglasses', 'camera'];

// topK and threshold are passed as named arguments alongside the candidate labels.
$output = $detector($url, $candidateLabels, topK: 4, threshold: 0.05);

// Dump the detections together with the timing and memory helpers' results.
dd($output, timeUsage(), memoryUsage());
25 changes: 25 additions & 0 deletions src/FeatureExtractors/OwlViTFeatureExtractor.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
<?php

declare(strict_types=1);


namespace Codewithkyrian\Transformers\FeatureExtractors;

use Codewithkyrian\Transformers\Models\Output\ObjectDetectionOutput;
use Codewithkyrian\Transformers\Processors\Processor;

/**
 * Feature extractor for OwlViT models.
 *
 * Extends ImageFeatureExtractor with an object-detection post-processing step
 * that is delegated to Processor::postProcessObjectDetection().
 */
class OwlViTFeatureExtractor extends ImageFeatureExtractor
{
    /**
     * Post-processes raw object-detection outputs.
     *
     * Every argument is forwarded unchanged, in the same order, to
     * Processor::postProcessObjectDetection().
     *
     * @param ObjectDetectionOutput $outputs     The model outputs to post-process.
     * @param float                 $threshold   The threshold to use for the scores.
     * @param array|null            $targetSizes The sizes of the original images, or null.
     * @param bool                  $isZeroShot  Whether zero-shot object detection was performed.
     *
     * @return array The post-processed outputs, exactly as returned by the Processor.
     */
    public function postProcessObjectDetection(
        ObjectDetectionOutput $outputs,
        float $threshold = 0.5,
        ?array $targetSizes = null,
        bool $isZeroShot = false,
    ): array {
        $postProcessed = Processor::postProcessObjectDetection(
            $outputs,
            $threshold,
            $targetSizes,
            $isZeroShot,
        );

        return $postProcessed;
    }
}
11 changes: 11 additions & 0 deletions src/FeatureExtractors/Owlv2ImageProcessor.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<?php

declare(strict_types=1);


namespace Codewithkyrian\Transformers\FeatureExtractors;

/**
 * Image processor for Owlv2 models.
 *
 * Declares no members of its own; all behavior is inherited from
 * OwlViTFeatureExtractor.
 */
class Owlv2ImageProcessor extends OwlViTFeatureExtractor
{

}
6 changes: 5 additions & 1 deletion src/Models/Auto/AutoModel.php
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,8 @@ class AutoModel extends PretrainedMixin

'detr' => \Codewithkyrian\Transformers\Models\Pretrained\DETRModel::class,
'yolos' => \Codewithkyrian\Transformers\Models\Pretrained\YOLOSModel::class,
// Spelling must match the declared class/file names exactly (OwlViTModel, Owlv2Model):
// a class-name-to-path autoloader cannot find a differently-cased name on a
// case-sensitive filesystem.
'owlvit' => \Codewithkyrian\Transformers\Models\Pretrained\OwlViTModel::class,
'owlv2' => \Codewithkyrian\Transformers\Models\Pretrained\Owlv2Model::class,
];

const ENCODER_DECODER_MODEL_MAPPING = [
Expand Down Expand Up @@ -48,7 +50,9 @@ class AutoModel extends PretrainedMixin
AutoModelForMaskedLM::MODEL_CLASS_MAPPING,
AutoModelForQuestionAnswering::MODEL_CLASS_MAPPING,
AutoModelForImageClassification::MODEL_CLASS_MAPPING,
AutoModelForVision2Seq::MODEL_CLASS_MAPPING
AutoModelForVision2Seq::MODEL_CLASS_MAPPING,
AutoModelForObjectDetection::MODEL_CLASS_MAPPING,
AutoModelForZeroShotObjectDetection::MODEL_CLASS_MAPPING,
];


Expand Down
19 changes: 19 additions & 0 deletions src/Models/Auto/AutoModelForZeroShotObjectDetection.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
<?php

declare(strict_types=1);


namespace Codewithkyrian\Transformers\Models\Auto;

/**
 * Auto-model class for zero-shot object detection.
 *
 * Only declares the lookup tables; everything else comes from PretrainedMixin.
 * NOTE(review): how PretrainedMixin consumes these constants is not visible here.
 */
class AutoModelForZeroShotObjectDetection extends PretrainedMixin
{
    /** Model key => class implementing zero-shot object detection for that model. */
    public const MODEL_CLASS_MAPPING = [
        'owlvit' => \Codewithkyrian\Transformers\Models\Pretrained\OwlViTForObjectDetection::class,
        'owlv2' => \Codewithkyrian\Transformers\Models\Pretrained\Owlv2ForObjectDetection::class,
    ];

    /** List of mapping tables for this auto class; it holds the single table above. */
    public const MODEL_CLASS_MAPPINGS = [self::MODEL_CLASS_MAPPING];
}
16 changes: 16 additions & 0 deletions src/Models/Pretrained/OwlViTForObjectDetection.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<?php

declare(strict_types=1);


namespace Codewithkyrian\Transformers\Models\Pretrained;

use Codewithkyrian\Transformers\Models\Output\ObjectDetectionOutput;

/**
 * OwlViT model whose invocation result is exposed as an ObjectDetectionOutput.
 */
class OwlViTForObjectDetection extends OwlViTPretrainedModel
{
    /**
     * Runs the parent model and wraps what it returns.
     *
     * @param array $modelInputs Inputs forwarded unchanged to the parent __invoke().
     *
     * @return ObjectDetectionOutput Built by ObjectDetectionOutput::fromOutput() from the parent's result.
     */
    public function __invoke(array $modelInputs): ObjectDetectionOutput
    {
        $rawOutput = parent::__invoke($modelInputs);

        return ObjectDetectionOutput::fromOutput($rawOutput);
    }
}
11 changes: 11 additions & 0 deletions src/Models/Pretrained/OwlViTModel.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<?php

declare(strict_types=1);


namespace Codewithkyrian\Transformers\Models\Pretrained;

/**
 * Plain OwlViT model.
 *
 * Adds nothing on top of OwlViTPretrainedModel; it exists as a distinct,
 * nameable class.
 */
class OwlViTModel extends OwlViTPretrainedModel
{

}
11 changes: 11 additions & 0 deletions src/Models/Pretrained/OwlViTPretrainedModel.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<?php

declare(strict_types=1);


namespace Codewithkyrian\Transformers\Models\Pretrained;

/**
 * Common base class of the OwlViT model classes.
 *
 * Declares no members of its own; all behavior is inherited from PretrainedModel.
 */
class OwlViTPretrainedModel extends PretrainedModel
{

}
16 changes: 16 additions & 0 deletions src/Models/Pretrained/Owlv2ForObjectDetection.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
<?php

declare(strict_types=1);


namespace Codewithkyrian\Transformers\Models\Pretrained;

use Codewithkyrian\Transformers\Models\Output\ObjectDetectionOutput;

/**
 * Owlv2 model whose invocation result is exposed as an ObjectDetectionOutput.
 */
class Owlv2ForObjectDetection extends Owlv2PretrainedModel
{
    /**
     * Runs the parent model and wraps what it returns.
     *
     * @param array $modelInputs Inputs forwarded unchanged to the parent __invoke().
     *
     * @return ObjectDetectionOutput Built by ObjectDetectionOutput::fromOutput() from the parent's result.
     */
    public function __invoke(array $modelInputs): ObjectDetectionOutput
    {
        $rawOutput = parent::__invoke($modelInputs);

        return ObjectDetectionOutput::fromOutput($rawOutput);
    }
}
11 changes: 11 additions & 0 deletions src/Models/Pretrained/Owlv2Model.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<?php

declare(strict_types=1);


namespace Codewithkyrian\Transformers\Models\Pretrained;

/**
 * Plain Owlv2 model.
 *
 * Adds nothing on top of Owlv2PretrainedModel; it exists as a distinct,
 * nameable class.
 */
class Owlv2Model extends Owlv2PretrainedModel
{

}
11 changes: 11 additions & 0 deletions src/Models/Pretrained/Owlv2PretrainedModel.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<?php

declare(strict_types=1);


namespace Codewithkyrian\Transformers\Models\Pretrained;

/**
 * Common base class of the Owlv2 model classes.
 *
 * Declares no members of its own; all behavior is inherited from PretrainedModel.
 */
class Owlv2PretrainedModel extends PretrainedModel
{

}
14 changes: 12 additions & 2 deletions src/Pipelines/Task.php
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
use Codewithkyrian\Transformers\Models\Auto\AutoModelForSequenceClassification;
use Codewithkyrian\Transformers\Models\Auto\AutoModelForTokenClassification;
use Codewithkyrian\Transformers\Models\Auto\AutoModelForVision2Seq;
use Codewithkyrian\Transformers\Models\Auto\AutoModelForZeroShotObjectDetection;
use Codewithkyrian\Transformers\Models\Pretrained\PretrainedModel;
use Codewithkyrian\Transformers\PretrainedTokenizers\AutoTokenizer;
use Codewithkyrian\Transformers\PretrainedTokenizers\PretrainedTokenizer;
Expand Down Expand Up @@ -43,6 +44,7 @@ enum Task: string
case ZeroShotImageClassification = 'zero-shot-image-classification';

case ObjectDetection = 'object-detection';
case ZeroShotObjectDetection = 'zero-shot-object-detection';


public function pipeline(PretrainedModel $model, ?PretrainedTokenizer $tokenizer, ?Processor $processor): Pipeline
Expand Down Expand Up @@ -78,6 +80,8 @@ public function pipeline(PretrainedModel $model, ?PretrainedTokenizer $tokenizer
self::ZeroShotImageClassification => new ZeroShotImageClassificationPipeline($this, $model, $tokenizer, $processor),

self::ObjectDetection => new ObjectDetectionPipeline($this, $model, $tokenizer, $processor),

self::ZeroShotObjectDetection => new ZeroShotObjectDetectionPipeline($this, $model, $tokenizer, $processor),
};
}

Expand Down Expand Up @@ -112,6 +116,8 @@ public function defaultModelName(): string
self::ZeroShotImageClassification => 'Xenova/clip-vit-base-patch32', // Original: 'openai/clip-vit-base-patch32'

self::ObjectDetection => 'Xenova/detr-resnet-50', // Original: 'facebook/detr-resnet-50',

self::ZeroShotObjectDetection => 'Xenova/owlvit-base-patch32', // Original: 'google/owlvit-base-patch32',
};
}

Expand Down Expand Up @@ -153,6 +159,8 @@ public function autoModel(
self::ZeroShotImageClassification => AutoModel::fromPretrained($modelNameOrPath, $quantized, $config, $cacheDir, $revision, $modelFilename, $output),

self::ObjectDetection => AutoModelForObjectDetection::fromPretrained($modelNameOrPath, $quantized, $config, $cacheDir, $revision, $modelFilename, $output),

self::ZeroShotObjectDetection => AutoModelForZeroShotObjectDetection::fromPretrained($modelNameOrPath, $quantized, $config, $cacheDir, $revision, $modelFilename, $output),
};
}

Expand Down Expand Up @@ -185,7 +193,8 @@ public function autoTokenizer(
self::TokenClassification,
self::Ner,
self::ImageToText,
self::ZeroShotImageClassification => AutoTokenizer::fromPretrained($modelNameOrPath, $quantized, $config, $cacheDir, $revision, null, $output),
self::ZeroShotImageClassification,
self::ZeroShotObjectDetection => AutoTokenizer::fromPretrained($modelNameOrPath, $quantized, $config, $cacheDir, $revision, null, $output),
};
}

Expand All @@ -202,7 +211,8 @@ public function autoProcessor(
self::ImageToText,
self::ImageClassification,
self::ZeroShotImageClassification,
self::ObjectDetection => AutoProcessor::fromPretrained($modelNameOrPath, $config, $cacheDir, $revision, $output),
self::ObjectDetection,
self::ZeroShotObjectDetection => AutoProcessor::fromPretrained($modelNameOrPath, $config, $cacheDir, $revision, $output),


self::SentimentAnalysis,
Expand Down
104 changes: 104 additions & 0 deletions src/Pipelines/ZeroShotObjectDetectionPipeline.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
<?php

declare(strict_types=1);


namespace Codewithkyrian\Transformers\Pipelines;

use Codewithkyrian\Transformers\Models\Output\ObjectDetectionOutput;
use Codewithkyrian\Transformers\Utils\Tensor;
use function Codewithkyrian\Transformers\Utils\getBoundingBox;
use function Codewithkyrian\Transformers\Utils\prepareImages;

/**
 * Zero-shot object detection pipeline. This pipeline predicts bounding boxes of
 * objects when you provide an image and a set of candidate labels.
 *
 * **Example:** Zero-shot object detection w/ `Xenova/owlvit-base-patch32`.
 * ```php
 * $detector = pipeline('zero-shot-object-detection', 'Xenova/owlvit-base-patch32');
 * $url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/astronaut.png';
 * $candidateLabels = ['human face', 'rocket', 'helmet', 'american flag'];
 * $output = $detector($url, $candidateLabels);
 * // [
 * //   [
 * //     'score' => 0.24392342567443848,
 * //     'label' => 'human face',
 * //     'box' => ['xmin' => 180, 'ymin' => 67, 'xmax' => 274, 'ymax' => 175]
 * //   ],
 * //   ...
 * // ]
 * ```
 *
 * **Example:** Zero-shot object detection w/ `Xenova/owlvit-base-patch32` (returning top 4 matches and setting a threshold).
 * ```php
 * $detector = pipeline('zero-shot-object-detection', 'Xenova/owlvit-base-patch32');
 * $url = 'https://huggingface.co/datasets/Xenova/transformers.js-docs/resolve/main/beach.png';
 * $candidateLabels = ['hat', 'book', 'sunglasses', 'camera'];
 * $output = $detector($url, $candidateLabels, topK: 4, threshold: 0.05);
 * // [
 * //   [
 * //     'score' => 0.1606510728597641,
 * //     'label' => 'sunglasses',
 * //     'box' => ['xmin' => 347, 'ymin' => 229, 'xmax' => 429, 'ymax' => 264]
 * //   ],
 * //   ...
 * // ]
 * ```
 */
class ZeroShotObjectDetectionPipeline extends Pipeline
{
    /**
     * Detects the candidate labels in one image or in each image of a list.
     *
     * @param array|string $inputs A single image, or an array of images; handed to prepareImages().
     * @param mixed ...$args The candidate labels as the first positional argument (or as the
     *                       named argument `candidateLabels`), plus optional named arguments:
     *                       - `threshold` (default 0.1): score threshold passed to post-processing;
     *                       - `topK` (default null): keep only this many highest-scoring detections per image;
     *                       - `percentage` (default false): when true, no image size is passed to
     *                         post-processing and getBoundingBox() receives false as its second
     *                         argument (presumably yielding relative instead of pixel boxes - confirm).
     *
     * @return array For an array input, one list of detections per image; for a single input,
     *               that image's list. Each detection has the keys `score`, `label` and `box`,
     *               ordered by descending score.
     *
     * @throws \InvalidArgumentException When no candidate labels are supplied.
     */
    public function __invoke(array|string $inputs, ...$args): array
    {
        // Fail early with a clear message instead of an undefined-key warning
        // followed by a null being handed to the tokenizer.
        $candidateLabels = $args[0]
            ?? $args['candidateLabels']
            ?? throw new \InvalidArgumentException(
                'Candidate labels are required as the second argument of the zero-shot object detection pipeline.'
            );
        $threshold = $args['threshold'] ?? 0.1;
        $topK = $args['topK'] ?? null;
        $percentage = $args['percentage'] ?? false;

        $isBatched = is_array($inputs);

        $preparedImages = prepareImages($inputs);

        // Tokenize the labels once; the same text inputs are reused for every image.
        $textInputs = $this->tokenizer->tokenize($candidateLabels, padding: true, truncation: true);

        // Run the processor over all images in one call.
        $modelInputs = ($this->processor)($preparedImages);

        $toReturn = [];
        foreach ($preparedImages as $i => $image) {
            // Without a target size the post-processor is not given the original image dimensions.
            $imageSize = $percentage ? null : [[$image->height(), $image->width()]];

            // Take this image's pixel values and add a leading axis via unsqueeze(0).
            $pixelValues = Tensor::fromNdArray($modelInputs['pixel_values'][$i])->unsqueeze(0);

            // Run the model with both text and pixel inputs.
            /** @var ObjectDetectionOutput $output */
            $output = $this->model->__invoke(array_merge($textInputs, ['pixel_values' => $pixelValues]));

            // Post-process in zero-shot mode; only one image was submitted, hence index 0.
            $processed = $this->processor->featureExtractor->postProcessObjectDetection($output, $threshold, $imageSize, true)[0];

            $result = [];
            foreach ($processed['boxes'] as $j => $box) {
                $result[] = [
                    'score' => $processed['scores'][$j],
                    // The class index refers to a position in the candidate label list.
                    'label' => $candidateLabels[$processed['classes'][$j]],
                    'box' => getBoundingBox($box, !$percentage),
                ];
            }

            // Highest score first.
            usort($result, static fn ($a, $b) => $b['score'] <=> $a['score']);

            if ($topK !== null) {
                $result = array_slice($result, 0, $topK);
            }

            $toReturn[] = $result;
        }

        return $isBatched ? $toReturn : $toReturn[0];
    }
}
11 changes: 11 additions & 0 deletions src/Processors/OwlViTProcessor.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
<?php

declare(strict_types=1);


namespace Codewithkyrian\Transformers\Processors;

/**
 * Processor for OwlViT models.
 *
 * Declares no members of its own; all behavior is inherited from Processor.
 */
class OwlViTProcessor extends Processor
{

}
Loading

0 comments on commit daa10f3

Please sign in to comment.