From 9064abeaaf87f32fe994062034981c29ba3e4cd7 Mon Sep 17 00:00:00 2001 From: Daofeng Wu Date: Fri, 18 Oct 2024 12:29:13 +0900 Subject: [PATCH] refactor(page_analyzer): use most contentful elements to infer selectors --- npiai/core/browser/_playwright.py | 2 +- npiai/core/tool/_browser.py | 11 +-- .../web/page_analyzer/__test__/full_tests.py | 12 ++- npiai/tools/web/page_analyzer/app.py | 88 ++++++++----------- npiai/tools/web/scraper/app.py | 29 +----- npiai/utils/__init__.py | 2 + npiai/utils/html_to_markdown.py | 30 +++++++ 7 files changed, 83 insertions(+), 91 deletions(-) create mode 100644 npiai/utils/html_to_markdown.py diff --git a/npiai/core/browser/_playwright.py b/npiai/core/browser/_playwright.py index c8e0700e..59d9fb33 100644 --- a/npiai/core/browser/_playwright.py +++ b/npiai/core/browser/_playwright.py @@ -12,7 +12,7 @@ FileChooser, ) -__BROWSER_UTILS_VERSION__ = "0.0.4" +__BROWSER_UTILS_VERSION__ = "0.0.8" def _prepare_browser_utils(): diff --git a/npiai/core/tool/_browser.py b/npiai/core/tool/_browser.py index efb1a6f1..c69a1d31 100644 --- a/npiai/core/tool/_browser.py +++ b/npiai/core/tool/_browser.py @@ -1,20 +1,13 @@ import base64 -from markdownify import MarkdownConverter from playwright.async_api import ElementHandle, Error from npiai.core.browser import PlaywrightContext -from npiai.utils import logger +from npiai.utils import logger, html_to_markdown from ._function import FunctionTool, function -class MdConverter(MarkdownConverter): - # skip