Skip to content

Commit

Permalink
fix: Added retries for whisper-status API in LLMW v2 (#126)
Browse files Browse the repository at this point in the history
* Added retries for whisper-status API in LLMW v2

* Corrected mentions of LLMWhisperer

* Corrected mentions of LLMWhisperer - 2

* Reverted LLMW error message key back to

* Fixed imports, used MimeType constant and minor error handling changes

* Bumped version to 0.54.0rc4

---------

Signed-off-by: Chandrasekharan M <[email protected]>
  • Loading branch information
chandrasekharan-zipstack authored Dec 3, 2024
1 parent 265d5b9 commit a1399a6
Show file tree
Hide file tree
Showing 19 changed files with 128 additions and 98 deletions.
2 changes: 1 addition & 1 deletion src/unstract/sdk/__init__.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
__version__ = "0.54.0rc3"
__version__ = "0.54.0rc4"


def get_sdk_version():
Expand Down
43 changes: 34 additions & 9 deletions src/unstract/sdk/adapters/utils.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import logging
from pathlib import Path

import filetype
Expand All @@ -6,8 +7,11 @@
from requests.exceptions import RequestException

from unstract.sdk.adapters.constants import Common
from unstract.sdk.constants import MimeType
from unstract.sdk.file_storage import FileStorage, FileStorageProvider

logger = logging.getLogger(__name__)


class AdapterUtils:
@staticmethod
Expand All @@ -25,17 +29,38 @@ def get_msg_from_request_exc(
Returns:
str: Error message returned by the server
"""
if hasattr(err, "response"):
err_response: Response = err.response # type: ignore
if err_response.headers["Content-Type"] == "application/json":
err_json = err_response.json()
if message_key in err_json:
return str(err_json[message_key])
elif err_response.headers["Content-Type"] == "text/plain":
return err_response.text # type: ignore
if not hasattr(err, "response"):
return default_err

err_response: Response = err.response # type: ignore
err_content_type = err_response.headers.get("Content-Type")

if not err_content_type:
logger.warning(
f"Content-Type header not found in {err_response}, "
f"returning {default_err}"
)
return default_err

if err_content_type == MimeType.JSON:
err_json = err_response.json()
if message_key in err_json:
return str(err_json[message_key])
else:
logger.warning(
f"Unable to parse error with key '{message_key}' for "
f"'{err_json}', returning '{default_err}' instead."
)
elif err_content_type == MimeType.TEXT:
return err_response.text # type: ignore
else:
logger.warning(
f"Unhandled err_response type '{err_content_type}' "
f"for {err_response}, returning {default_err}"
)
return default_err

# ToDo: get_file_mime_type() to be removed once migrated to FileStorage
# TODO: get_file_mime_type() to be removed once migrated to FileStorage
# FileStorage has mime_type() which could be used instead.
@staticmethod
def get_file_mime_type(
Expand Down
3 changes: 2 additions & 1 deletion src/unstract/sdk/adapters/x2text/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from unstract.sdk.adapters.exceptions import AdapterError
from unstract.sdk.adapters.utils import AdapterUtils
from unstract.sdk.adapters.x2text.constants import X2TextConstants
from unstract.sdk.constants import MimeType
from unstract.sdk.file_storage import FileStorage, FileStorageProvider

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -111,7 +112,7 @@ def make_request(
X2TextConstants.PLATFORM_SERVICE_API_KEY
)
headers = {
"accept": "application/json",
"accept": MimeType.JSON,
"Authorization": f"Bearer {platform_service_api_key}",
}
body = {
Expand Down
4 changes: 2 additions & 2 deletions src/unstract/sdk/adapters/x2text/llm_whisperer/README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Unstract LLM Whisperer X2Text Adapter
# Unstract LLMWhisperer X2Text Adapter

## Env variables

The below env variables are resolved by LLM Whisperer adapter
The below env variables are resolved by LLMWhisperer adapter

| Variable | Description |
| ---------------------------- | -------------------------------------------------------------------------------------------- |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,7 @@ class WhispererEndpoint:


class WhispererEnv:
"""Env variables for LLM whisperer.
"""Env variables for LLMWhisperer.
Can be used to alter behaviour at runtime.
Expand Down Expand Up @@ -89,7 +89,7 @@ class WhisperStatus:


class WhispererDefaults:
"""Defaults meant for LLM whisperer."""
"""Defaults meant for LLMWhisperer."""

MEDIAN_FILTER_SIZE = 0
GAUSSIAN_BLUR_RADIUS = 0.0
Expand All @@ -104,4 +104,3 @@ class WhispererDefaults:
PAGE_SEPARATOR = "<<< >>>"
MARK_VERTICAL_LINES = False
MARK_HORIZONTAL_LINES = False

Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
WhisperStatus,
)
from unstract.sdk.adapters.x2text.x2text_adapter import X2TextAdapter
from unstract.sdk.constants import MimeType
from unstract.sdk.file_storage import FileStorage, FileStorageProvider

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -61,13 +62,13 @@ def get_json_schema() -> str:
return schema

def _get_request_headers(self) -> dict[str, Any]:
"""Obtains the request headers to authenticate with LLM Whisperer.
"""Obtains the request headers to authenticate with LLMWhisperer.
Returns:
dict[str, Any]: Request headers
"""
return {
"accept": "application/json",
"accept": MimeType.JSON,
WhispererHeader.UNSTRACT_KEY: self.config.get(WhispererConfig.UNSTRACT_KEY),
}

Expand All @@ -79,11 +80,11 @@ def _make_request(
params: Optional[dict[str, Any]] = None,
data: Optional[Any] = None,
) -> Response:
"""Makes a request to LLM whisperer service.
"""Makes a request to LLMWhisperer service.
Args:
request_method (HTTPMethod): HTTPMethod to call. Can be GET or POST
request_endpoint (str): LLM whisperer endpoint to hit
request_endpoint (str): LLMWhisperer endpoint to hit
headers (Optional[dict[str, Any]], optional): Headers to pass.
Defaults to None.
params (Optional[dict[str, Any]], optional): Query params to pass.
Expand Down Expand Up @@ -119,15 +120,15 @@ def _make_request(
except ConnectionError as e:
logger.error(f"Adapter error: {e}")
raise ExtractorError(
"Unable to connect to LLM Whisperer service, please check the URL"
"Unable to connect to LLMWhisperer service, please check the URL"
)
except Timeout as e:
msg = "Request to LLM whisperer has timed out"
msg = "Request to LLMWhisperer has timed out"
logger.error(f"{msg}: {e}")
raise ExtractorError(msg)
except HTTPError as e:
logger.error(f"Adapter error: {e}")
default_err = "Error while calling the LLM Whisperer service"
default_err = "Error while calling the LLMWhisperer service"
msg = AdapterUtils.get_msg_from_request_exc(
err=e, message_key="message", default_err=default_err
)
Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"title": "LLM Whisperer X2Text",
"title": "LLMWhisperer X2Text",
"type": "object",
"required": [
"adapter_name",
Expand All @@ -11,14 +11,14 @@
"type": "string",
"title": "Name",
"default": "",
"description": "Provide a unique name for this adapter instance. Example: LLM Whisperer 1"
"description": "Provide a unique name for this adapter instance. Example: LLMWhisperer 1"
},
"url": {
"type": "string",
"title": "URL",
"format": "uri",
"default": "https://llmwhisperer-api.unstract.com",
"description": "Provide the URL of the LLM Whisperer service. Please note that this version of LLM Whisperer is deprecated."
"description": "Provide the URL of the LLMWhisperer service. Please note that this version of LLMWhisperer is deprecated."
},
"unstract_key": {
"type": "string",
Expand Down
4 changes: 2 additions & 2 deletions src/unstract/sdk/adapters/x2text/llm_whisperer_v2/README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
# Unstract LLM Whisperer v2 X2Text Adapter
# Unstract LLMWhisperer v2 X2Text Adapter

## Env variables

The below env variables are resolved by LLM Whisperer adapter
The below env variables are resolved by LLMWhisperer adapter

| Variable | Description |
| ---------------------------- | -------------------------------------------------------------------------------------------- |
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ class WhispererEndpoint:


class WhispererEnv:
"""Env variables for LLM whisperer.
"""Env variables for LLMWhisperer.
Can be used to alter behaviour at runtime.
Expand All @@ -42,10 +42,13 @@ class WhispererEnv:
LLMWhisperer's status API. Defaults to 30s
MAX_POLLS: Total number of times to poll the status API.
Set to -1 to poll indefinitely. Defaults to 30
STATUS_RETRIES: Number of times to retry calling LLMWhisperer's status API
on failure during polling. Defaults to 5.
"""

POLL_INTERVAL = "ADAPTER_LLMW_POLL_INTERVAL"
MAX_POLLS = "ADAPTER_LLMW_MAX_POLLS"
STATUS_RETRIES = "ADAPTER_LLMW_STATUS_RETRIES"


class WhispererConfig:
Expand Down Expand Up @@ -84,7 +87,7 @@ class WhisperStatus:


class WhispererDefaults:
"""Defaults meant for LLM whisperer."""
"""Defaults meant for LLMWhisperer."""

MEDIAN_FILTER_SIZE = 0
GAUSSIAN_BLUR_RADIUS = 0.0
Expand All @@ -94,6 +97,7 @@ class WhispererDefaults:
HORIZONTAL_STRETCH_FACTOR = 1.0
POLL_INTERVAL = int(os.getenv(WhispererEnv.POLL_INTERVAL, 30))
MAX_POLLS = int(os.getenv(WhispererEnv.MAX_POLLS, 30))
STATUS_RETRIES = int(os.getenv(WhispererEnv.STATUS_RETRIES, 5))
PAGES_TO_EXTRACT = ""
PAGE_SEPARATOR = "<<<"
MARK_VERTICAL_LINES = False
Expand Down
46 changes: 28 additions & 18 deletions src/unstract/sdk/adapters/x2text/llm_whisperer_v2/src/helper.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,23 +20,22 @@
WhispererHeader,
WhisperStatus,
)
from unstract.sdk.file_storage.fs_impl import FileStorage
from unstract.sdk.file_storage.fs_provider import FileStorageProvider
from unstract.sdk.constants import MimeType
from unstract.sdk.file_storage import FileStorage, FileStorageProvider

logger = logging.getLogger(__name__)


class LLMWhispererHelper:

@staticmethod
def get_request_headers(config: dict[str, Any]) -> dict[str, Any]:
"""Obtains the request headers to authenticate with LLM Whisperer.
"""Obtains the request headers to authenticate with LLMWhisperer.
Returns:
dict[str, Any]: Request headers
"""
return {
"accept": "application/json",
"accept": MimeType.JSON,
WhispererHeader.UNSTRACT_KEY: config.get(WhispererConfig.UNSTRACT_KEY),
}

Expand All @@ -49,11 +48,11 @@ def make_request(
params: Optional[dict[str, Any]] = None,
data: Optional[Any] = None,
) -> Response:
"""Makes a request to LLM whisperer service.
"""Makes a request to LLMWhisperer service.
Args:
request_method (HTTPMethod): HTTPMethod to call. Can be GET or POST
request_endpoint (str): LLM whisperer endpoint to hit
request_endpoint (str): LLMWhisperer endpoint to hit
headers (Optional[dict[str, Any]], optional): Headers to pass.
Defaults to None.
params (Optional[dict[str, Any]], optional): Query params to pass.
Expand Down Expand Up @@ -89,15 +88,15 @@ def make_request(
except ConnectionError as e:
logger.error(f"Adapter error: {e}")
raise ExtractorError(
"Unable to connect to LLM Whisperer service, please check the URL"
"Unable to connect to LLMWhisperer service, please check the URL"
)
except Timeout as e:
msg = "Request to LLM whisperer has timed out"
msg = "Request to LLMWhisperer has timed out"
logger.error(f"{msg}: {e}")
raise ExtractorError(msg)
except HTTPError as e:
logger.error(f"Adapter error: {e}")
default_err = "Error while calling the LLM Whisperer service"
default_err = "Error while calling the LLMWhisperer service"
msg = AdapterUtils.get_msg_from_request_exc(
err=e, message_key="message", default_err=default_err
)
Expand Down Expand Up @@ -197,14 +196,16 @@ def check_status_until_ready(
"""
POLL_INTERVAL = WhispererDefaults.POLL_INTERVAL
MAX_POLLS = WhispererDefaults.MAX_POLLS
STATUS_RETRY_THRESHOLD = WhispererDefaults.STATUS_RETRIES
status_retry_count = 0
request_count = 0

# Check status in fixed intervals up to max poll count.
while True:
request_count += 1
logger.info(
f"Checking status with interval: {POLL_INTERVAL}s"
f", request count: {request_count} [max: {MAX_POLLS}]"
f"Checking status for whisper-hash '{whisper_hash}' with interval: "
f"{POLL_INTERVAL}s, request count: {request_count} [max: {MAX_POLLS}]"
)
status_response = LLMWhispererHelper.make_request(
config=config,
Expand All @@ -216,19 +217,28 @@ def check_status_until_ready(
if status_response.status_code == 200:
status_data = status_response.json()
status = status_data.get(WhisperStatus.STATUS, WhisperStatus.UNKNOWN)
logger.info(f"Whisper status for {whisper_hash}: {status}")
logger.info(f"Whisper status for '{whisper_hash}': {status}")
if status in [WhisperStatus.PROCESSED, WhisperStatus.DELIVERED]:
break
else:
raise ExtractorError(
"Error checking LLMWhisperer status: "
f"{status_response.status_code} - {status_response.text}"
)
if status_retry_count >= STATUS_RETRY_THRESHOLD:
raise ExtractorError(
f"Error checking LLMWhisperer status for whisper-hash "
f"'{whisper_hash}': {status_response.text}"
)
else:
status_retry_count += 1
logger.warning(
f"Whisper status for '{whisper_hash}' failed "
f"{status_retry_count} time(s), retrying... "
f"[threshold: {STATUS_RETRY_THRESHOLD}]: {status_response.text}"
)

# Exit with error if max poll count is reached
if request_count >= MAX_POLLS:
raise ExtractorError(
"Unable to extract text after attempting" f" {request_count} times"
f"Unable to extract text for whisper-hash '{whisper_hash}' "
f"after attempting {request_count} times"
)
time.sleep(POLL_INTERVAL)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@
)
from unstract.sdk.adapters.x2text.llm_whisperer_v2.src.helper import LLMWhispererHelper
from unstract.sdk.adapters.x2text.x2text_adapter import X2TextAdapter
from unstract.sdk.file_storage.fs_impl import FileStorage
from unstract.sdk.file_storage.fs_provider import FileStorageProvider
from unstract.sdk.file_storage import FileStorage, FileStorageProvider

logger = logging.getLogger(__name__)

Expand Down
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
{
"title": "LLM Whisperer X2Text v2",
"title": "LLMWhisperer X2Text v2",
"type": "object",
"required": [
"adapter_name",
Expand All @@ -11,7 +11,7 @@
"type": "string",
"title": "Name",
"default": "llm-whisperer-v2",
"description": "Provide a unique name for this adapter instance. Example: LLM Whisperer 1"
"description": "Provide a unique name for this adapter instance. Example: LLMWhisperer 1"
},
"url": {
"type": "string",
Expand Down
Loading

0 comments on commit a1399a6

Please sign in to comment.