Merge pull request #207 from enoch3712/59-image-too-big-for-the-llm-automatic-quality-handler

set_max_size for image added to the DL
enoch3712 · Jan 20, 2025 · 135f0d3 · 135f0d3
2 parents e615a58 + 46a4c5d
commit 135f0d3
Show file tree

Hide file tree

Showing 4 changed files with 41 additions and 9 deletions.
diff --git a/docs/assets/document_loader.png b/docs/assets/document_loader.png
diff --git a/docs/core-concepts/document-loaders/index.md b/docs/core-concepts/document-loaders/index.md
@@ -95,7 +95,15 @@ for page in pages:
     image = page.get("image")  # Available in vision mode
 ```
 
+### Image Resizing
+
+```python
+loader = DocumentLoader()
+loader.set_max_image_size(2000)  # downscale page images so neither side exceeds 2000 px
+```
+
 ### Image Conversion
+
 The base loader includes utilities for converting documents to images:
 
 ```python
@@ -111,10 +119,8 @@ images = loader.convert_to_images(
 All Document Loaders implement these core methods:
 
 - `load(source)`: Main entry point for loading documents
-- `load_content_from_file(file_path)`: Process files from disk
-- `load_content_from_stream(stream)`: Process BytesIO streams
-- `load_content_list(source)`: Load and split into pages
 - `set_vision_mode(enabled)`: Enable/disable vision mode
+- `set_max_image_size(size)`: Set the maximum width/height (in pixels) of rendered page images; larger images are downscaled, keeping their aspect ratio
 
 ## Best Practices
 
@@ -158,6 +164,4 @@ ExtractThinker provides several specialized Document Loaders:
 - `Nanonets` <span class="coming-soon">Coming Soon</span>: API-based document processing
 - `Mindee` <span class="coming-soon">Coming Soon</span>: Specialized document parsing APIs
 - `Rossum` <span class="coming-soon">Coming Soon</span>: AI-powered document understanding
-- `Kofax` <span class="coming-soon">Coming Soon</span>: Intelligent document processing
-
-For more examples and advanced usage, check out the [examples directory](examples/) in the repository.
+- `Kofax` <span class="coming-soon">Coming Soon</span>: Intelligent document processing
diff --git a/extract_thinker/document_loader/document_loader.py b/extract_thinker/document_loader/document_loader.py
@@ -21,6 +21,11 @@ def __init__(self, content: Any = None, cache_ttl: int = 300):
         self.file_path = None
         self.cache = TTLCache(maxsize=100, ttl=cache_ttl)
         self.vision_mode = False
+        self.max_image_size = None  # Changed to None by default
+
+    def set_max_image_size(self, size: int) -> None:
+        """Set the maximum width/height allowed for rendered page images.
+
+        When a limit is set, ``_resize_if_needed`` downscales any image whose
+        width or height exceeds ``size``, preserving its aspect ratio. The
+        default of ``None`` (set in ``__init__``) leaves images unresized.
+
+        Args:
+            size: Upper bound applied to both width and height, in the units
+                of ``image.size`` (pixels for PIL images).
+        """
+        self.max_image_size = size
 
     def set_vision_mode(self, enabled: bool = True) -> None:
         """Enable or disable vision mode processing."""
@@ -113,6 +118,27 @@ def _convert_stream_to_images(self, file_stream: io.BytesIO, scale: float) -> Di
         # If it's not an image, proceed with the conversion
         return self._convert_pdf_to_images(pdfium.PdfDocument(file_stream), scale)
 
+    def _resize_if_needed(self, image: Image.Image) -> Image.Image:
+        """Resize image if it exceeds maximum dimensions while maintaining aspect ratio.
+        
+        Args:
+            image: PIL Image object
+            
+        Returns:
+            PIL Image object (resized if necessary)
+        """
+        if self.max_image_size is None:  # Skip resizing if max_image_size not set
+            return image
+
+        width, height = image.size
+        if width > self.max_image_size or height > self.max_image_size:
+            # Calculate scaling factor to fit within max dimensions
+            scale = self.max_image_size / max(width, height)
+            new_width = int(width * scale)
+            new_height = int(height * scale)
+            return image.resize((new_width, new_height), Image.Resampling.LANCZOS)
+        return image
+
     def _convert_pdf_to_images(self, pdf_file, scale: float) -> Dict[int, bytes]:
         # Get all pages at once
         renderer = pdf_file.render(
@@ -124,6 +150,8 @@ def _convert_pdf_to_images(self, pdf_file, scale: float) -> Dict[int, bytes]:
         # Convert all images to bytes and store in dictionary
         final_images = {}
         for page_index, image in enumerate(renderer):
+            # Resize image if needed
+            image = self._resize_if_needed(image)
             image_byte_array = BytesIO()
             image.save(image_byte_array, format="jpeg", optimize=True)
             final_images[page_index] = image_byte_array.getvalue()

diff --git a/mkdocs.yml b/mkdocs.yml
@@ -79,9 +79,9 @@ nav:
           - Paginate: core-concepts/completion-strategies/paginate.md
   - "Examples":
       - Resume Processing: examples/resume-processing.md
-      - Azure Document Intelligence: examples/azure-document-intelligence.md
-      - AWS Textract: examples/aws-textract.md
-      - Google Document AI: examples/google-document-ai.md
+      - Azure Stack: examples/azure-stack.md
+      - AWS Stack: examples/aws-stack.md
+      - Google Stack: examples/google-stack.md
       - Local Processing: examples/local-processing.md
       - Groq Processing: examples/groq-processing.md