Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,5 +11,7 @@ wheels/
/scratch/
/prices/source_prices/
.claude/

.coverage
TODO.md
/.html_cache/
2 changes: 1 addition & 1 deletion packages/python/genai_prices/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -701,7 +701,7 @@
id='fireworks',
name='Fireworks',
api_pattern='https://api\\.fireworks\\.ai',
pricing_urls=['https://fireworks.ai/pricing'],
pricing_urls=['https://fireworks.ai/models?filter=Featured'],
model_match=ClauseStartsWith(starts_with='accounts/fireworks/models/'),
models=[
ModelInfo(
Expand Down
2 changes: 1 addition & 1 deletion prices/data.json

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion prices/data_slim.json

Large diffs are not rendered by default.

6 changes: 6 additions & 0 deletions prices/providers/.schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,12 @@
"description": "Date indicating when the prices were last checked for discrepancies.",
"title": "Prices Checked"
},
"prices_checked_ai": {
"format": "date",
"type": "string",
"description": "Date indicating when the prices were last checked or updated by AI.",
"title": "Prices Checked Ai"
},
"collapse": {
"default": true,
"description": "Flag indicating whether this price should be collapsed into other prices.",
Expand Down
2 changes: 1 addition & 1 deletion prices/providers/fireworks.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
name: Fireworks
id: fireworks
pricing_urls:
- https://fireworks.ai/pricing
- https://fireworks.ai/models?filter=Featured
api_pattern: 'https://api\.fireworks\.ai'
model_match:
starts_with: accounts/fireworks/models/
Expand Down
8 changes: 7 additions & 1 deletion prices/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,4 +10,10 @@ readme = "README.md"
# Do not publish this package to PyPI
classifiers = ["Private :: do not release"]
requires-python = ">=3.9"
dependencies = ["httpx>=0.28.1", "pydantic>=2.11.7", "ruamel-yaml>=0.18.14"]
dependencies = [
"beautifulsoup4>=4.13.4",
"httpx>=0.28.1",
"pydantic>=2.11.7",
"pydantic-ai>=0.4.2",
"ruamel-yaml>=0.18.14",
]
4 changes: 3 additions & 1 deletion prices/src/prices/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
from .inject_providers import inject_providers
from .package_data import package_data
from .price_discrepancies import check_for_price_discrepancies, update_price_discrepancies
from .source_ai import get_ai_prices
from .source_litellm import get_litellm_prices
from .source_openrouter import get_openrouter_prices, update_from_openrouter
from .source_simonw_prices import get_simonw_prices
Expand All @@ -19,12 +20,13 @@ def main():
get_litellm_prices,
get_openrouter_prices,
get_simonw_prices,
get_ai_prices,
update_price_discrepancies,
check_for_price_discrepancies,
package_data,
inject_providers,
)
if len(sys.argv) == 2:
if len(sys.argv) >= 2:
command = sys.argv[1]
action = next((f for f in actions if f.__name__ == command), None)
if action:
Expand Down
1 change: 1 addition & 0 deletions prices/src/prices/inject_providers.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@


def inject_providers():
"""Injects a list of providers into the README.md file."""
readme_path = root_dir / 'README.md'
readme_content = readme_path.read_text()
text, count = re.subn(
Expand Down
22 changes: 11 additions & 11 deletions prices/src/prices/price_discrepancies.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,13 +12,13 @@ def update_price_discrepancies(check_threshold: date | None = None):
prices = load_source_prices()
providers_yml = get_providers_yaml()
if check_threshold is None:
check_threshold = date.today() - timedelta(days=30)
check_threshold = date.today() - timedelta(days=0)

print(f'Checking price discrepancies since {check_threshold}')
found = False

for provider_yml in providers_yml.values():
discs = 0
ai_updates = 0
for source, source_prices in prices.items():
if provider_prices := source_prices.get(provider_yml.provider.id):
for model_id, price in provider_prices.items():
Expand All @@ -28,17 +28,17 @@ def update_price_discrepancies(check_threshold: date | None = None):
if prices_conflict(model.prices, price):
provider_yml.set_price_discrepency(model.id, source, price)
discs += 1

if discs:
if not found:
found = True
print('price discrepancies:')
print(f'{provider_yml.provider.name:>20}: {discs}')
else:
provider_yml.set_model_field(model.id, 'prices_checked_ai', date.today())
ai_updates += 1

if discs or ai_updates:
if discs:
print(f'{provider_yml.provider.name:>20}: {discs} discrepancies')
if ai_updates:
print(f'{provider_yml.provider.name:>20}: {ai_updates} AI updates')
provider_yml.save()

if not found:
print('no price discrepancies found')


def check_for_price_discrepancies() -> int:
"""List price discrepancies between providers and source prices.
Expand Down
200 changes: 200 additions & 0 deletions prices/src/prices/source_ai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,200 @@
from __future__ import annotations

import asyncio
import hashlib
import os
import re
import sys

import httpx
from bs4 import BeautifulSoup, Comment, Tag
from pydantic import BaseModel, Field
from pydantic_ai import Agent, RunContext

from . import source_prices, types, update, utils

zenrows_api_key = os.getenv('ZENROWS_API_KEY')


def get_ai_prices():
    """Retrieves AI prices for a given provider using zenrows.com and Pydantic AI."""
    # The provider id is the first argument after the command name. Use `>= 3`
    # rather than `== 3` so that extra trailing arguments do not silently discard
    # the id and drop into the interactive prompt.
    if len(sys.argv) >= 3:
        provider_id = sys.argv[2]
    else:
        provider_id = input('Enter provider id: ')
    providers_yml = update.get_providers_yaml()
    provider = providers_yml[provider_id].provider
    # `asyncio.run` creates and closes its own event loop; calling
    # `asyncio.get_event_loop()` without a running loop is deprecated.
    provider_prices = asyncio.run(update_get_provider(provider))

    # The result is stored as its own source, named after the provider.
    source_prices.write_source_prices(f'ai-{provider_id}', {provider_id: provider_prices})


async def update_get_provider(provider: types.Provider) -> source_prices.ProvidePrices:
    """Collect prices from every pricing URL of `provider`.

    Each URL is processed in order by `update_get_provider_page`, sharing one
    HTTP client; results are merged into a single mapping, so a model found on
    a later page overwrites the entry from an earlier page. Returns an empty
    mapping (after printing a message) when the provider has no pricing URLs.
    """
    urls = provider.pricing_urls
    if not urls:
        print(f'No pricing URLs found for {provider.name}')
        return {}

    # Ids already known for this provider, passed to the agent as a hint.
    known_ids = [known.id for known in provider.models]
    merged: source_prices.ProvidePrices = {}
    async with httpx.AsyncClient(timeout=30) as http:
        for page_url in urls:
            merged.update(await update_get_provider_page(http, str(page_url), known_ids))

    return merged


async def update_get_provider_page(
    client: httpx.AsyncClient, url: str, model_ids: list[str]
) -> source_prices.ProvidePrices:
    """Extract model prices from a single pricing page.

    The page HTML is obtained through `cache_get`, reduced by `clean_html`
    (restricted to the element whose id matches the URL fragment, when the URL
    has one) and handed to `html_agent`. Models whose prices report
    `is_free()` are left out of the returned mapping.
    """
    raw_html = await cache_get(client, url)

    # Everything after the first '#' in the URL is treated as an element id.
    fragment = re.search('#(.+)', url)
    content_id = fragment.group(1) if fragment else None

    page = clean_html(raw_html, content_id)

    deps = AgentDeps(known_model_ids=model_ids)
    run = await html_agent.run(page, model='anthropic:claude-sonnet-4-0', deps=deps)

    priced: source_prices.ProvidePrices = {}
    for info in run.output.models:
        if info.prices.is_free():
            continue
        priced[info.id] = info.prices
    print(f'{url} found {len(priced)} models')

    return priced


# URLs that `cache_get` requests with a plain GET instead of going through zenrows.
fetch_directly = {
    # this URL is blocked by zenrows but is rendered as HTML anyway
    'https://ai.google.dev/gemini-api/docs/pricing',
}


async def cache_get(client: httpx.AsyncClient, url: str) -> str:
    """Return the HTML of `url`, using an on-disk cache under `.html_cache`.

    The cache file name is the MD5 hex digest of the URL. On a cache miss the
    page is fetched either directly (URLs listed in `fetch_directly`) or via
    the zenrows API with JavaScript rendering enabled, then written to the
    cache before being returned.

    Raises:
        AssertionError: if zenrows is needed but `ZENROWS_API_KEY` is not set.
        ValueError: if the HTTP response is not successful.
    """
    cache_dir = utils.root_dir / '.html_cache'
    cache_dir.mkdir(exist_ok=True)

    cache_file = cache_dir / f'{hashlib.md5(url.encode()).hexdigest()}.html'
    if cache_file.exists():
        # Explicit UTF-8: the locale default encoding is platform dependent and
        # may not be able to represent every character found in scraped pages.
        return cache_file.read_text(encoding='utf-8')

    if url in fetch_directly:
        print(f'getting content from {url} directly...')
        response = await client.get(url)
    else:
        print(f'getting content from {url} with zenrows...')
        assert zenrows_api_key, 'ZENROWS_API_KEY environment variable is not set'
        params = {'url': url, 'apikey': zenrows_api_key, 'js_render': 'true'}
        response = await client.get('https://api.zenrows.com/v1/', params=params)

    if not response.is_success:
        raise ValueError(f'Failed to get content from {url} -> {response.status_code}:\n{response.text}')
    html = response.text
    cache_file.write_text(html, encoding='utf-8')
    return html


# Shared base class for the models in this module.
# `extra='forbid'` makes validation reject unknown fields, and
# `use_attribute_docstrings=True` tells pydantic to use the string literal that
# follows a field as that field's description.
class _Model(BaseModel, extra='forbid', use_attribute_docstrings=True):
    """Custom abstract based model with config"""


# One model extracted from a pricing page by `html_agent`.
# The bare strings under the fields are not stray literals: `_Model` enables
# `use_attribute_docstrings`, so they become the field descriptions.
class ModelInfo(_Model):
    """Information about an LLM model"""

    id: str
    """Primary unique identifier for the model"""
    # The fields below are currently disabled; only `id` and `prices` are part of the model.
    # name: str | None = None
    # """Name of the model"""
    # aliases: list[str] | None = None
    # """Alternative IDs for the model"""
    # context_window: int | None = None
    # """Maximum number of input tokens allowed for this model"""
    prices: types.ModelPrice


# Structured output type of `html_agent`: all models found on one pricing page.
class ProviderPricingPage(_Model):
    """Pricing page for a provider"""

    # Defaults to an empty list, so a page without any models is still valid output.
    models: list[ModelInfo] = Field(default_factory=list)
    """List of models with information"""


# Dependencies passed to `html_agent.run(..., deps=...)`.
class AgentDeps(BaseModel):
    # Model ids already known for the provider; `add_known_model_ids` appends them to the agent instructions.
    known_model_ids: list[str]


# Agent that turns the cleaned HTML of a pricing page into a `ProviderPricingPage`.
# The static instructions end with "...match this format:"; the ids that sentence
# refers to are supplied by the `add_known_model_ids` instructions function.
# NOTE(review): the instructions mention `model_infos` and `model_links`, but the
# output type only has a `models` field - this looks like a leftover from an
# earlier schema; confirm and reword the prompt.
html_agent = Agent(
    output_type=ProviderPricingPage,
    deps_type=AgentDeps,
    instructions="""\
Your job is to inspect the HTML page and extract information about all LLM models included in the page,
either information about the models or links to pages with more details about the models.

If information about a model exists (model_infos), do NOT include it in model_links.

These are the models we already know of, if you find models matching these IDs, make sure
to use these IDs, otherwise use the most appropriate ID for the model which should approximately match this format:
""",
)


@html_agent.instructions
def add_known_model_ids(ctx: RunContext[AgentDeps]) -> str:
    """Return the known model ids from the run's deps, one id per line."""
    known_ids = ctx.deps.known_model_ids
    return '\n'.join(known_ids)


# Attribute names that `clean_html` leaves on tags; every other attribute is deleted.
keep_htmls_attrs = {'id', 'href', 'type', 'src'}


def clean_html(html: str, content_id: str | None = None) -> str:
    """Reduce an HTML page to a compact string for the extraction agent.

    Steps, in order:

    1. Pick the root: the element with id `content_id` when given and present,
       otherwise the document body.
    2. Remove every `script` and `svg` element.
    3. Remove tags without any content and replace tags whose only child is
       another tag with that child.
    4. Delete all attributes not listed in `keep_htmls_attrs`.
    5. Remove comments whose stripped text is at most two characters long.

    The page size before and after cleaning is printed.
    """
    print(f'full page size: {len(html)}')
    document = BeautifulSoup(html, 'html.parser')

    # Step 1: choose the subtree to keep.
    root = None
    if content_id is not None:
        root = document.find(id=content_id)
        if root is not None:
            assert isinstance(root, Tag)
        else:
            print(f'Content with id {content_id} not found, fallback to body')

    if root is None:
        root = document.body
        assert root is not None, 'body not found'

    # Step 2: scripts and inline SVGs are dropped.
    for unwanted in root(['script', 'svg']):
        unwanted.decompose()

    # Step 3: drop empty tags and unwrap single-tag wrappers.
    for node in root.find_all(True):
        assert isinstance(node, Tag)
        children = node.contents
        if not children:
            node.decompose()
            continue

        if len(children) == 1:
            only_child = children[0]
            if isinstance(only_child, Tag):
                node.replace_with(only_child)

    # Step 4: keep only the allow-listed attributes.
    for node in root.find_all(True):
        assert isinstance(node, Tag)
        for attr_name in [name for name in node.attrs if name not in keep_htmls_attrs]:
            del node.attrs[attr_name]

    # Step 5: remove comments that are (nearly) empty once stripped.
    for remark in root.find_all(string=lambda text: isinstance(text, Comment)):
        assert isinstance(remark, Comment)
        if len(remark.strip()) <= 2:
            remark.decompose()

    compact_content = str(root)
    print('size after cleaning:', len(compact_content))

    return compact_content
2 changes: 2 additions & 0 deletions prices/src/prices/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -116,6 +116,8 @@ class ModelInfo(_Model):
"""List of price discrepancies based on external sources."""
prices_checked: date | None = Field(default=None, exclude=True)
"""Date indicating when the prices were last checked for discrepancies."""
prices_checked_ai: date | None = Field(default=None, exclude=True)
"""Date indicating when the prices were last checked or updated by AI."""
collapse: bool = Field(default=True, exclude=True)
"""Flag indicating whether this price should be collapsed into other prices."""

Expand Down
10 changes: 10 additions & 0 deletions prices/src/prices/update.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import re
from dataclasses import dataclass
from datetime import date
from io import StringIO
from operator import itemgetter
from pathlib import Path
Expand Down Expand Up @@ -108,6 +109,15 @@ def set_price_discrepency(self, lookup_id: str, source: str, price: ModelPrice)
else:
yaml_model['price_discrepancies'] = {source: data}

def set_model_field(self, lookup_id: str, key: str, value: date | str | int) -> None:
yaml_model = self._get_model(lookup_id)
if key in yaml_model:
yaml_model[key] = value
else:
# insert key before prices
keys: list[str] = list(yaml_model)
yaml_model.insert(keys.index('prices'), key, value) # pyright: ignore[reportUnknownMemberType]

def add_model(self, model: ModelInfo) -> int:
if next((m for m in self._extra_prices if m.id == model.id), None):
return 0
Expand Down
Loading