From cc75b0873f46a73ea66c0bcf5a2d8c7a98a7a499 Mon Sep 17 00:00:00 2001
From: benoit74
Date: Fri, 22 Nov 2024 10:46:40 +0000
Subject: [PATCH] New changes for mindtouch

---
 .../src/common/schemas/offliners/mindtouch.py | 23 ++++++++++++-------
 1 file changed, 15 insertions(+), 8 deletions(-)

diff --git a/dispatcher/backend/src/common/schemas/offliners/mindtouch.py b/dispatcher/backend/src/common/schemas/offliners/mindtouch.py
index 638eea18..04cae6f9 100644
--- a/dispatcher/backend/src/common/schemas/offliners/mindtouch.py
+++ b/dispatcher/backend/src/common/schemas/offliners/mindtouch.py
@@ -171,16 +171,23 @@ class Meta:
         metadata={"label": "Debug", "description": "Enable verbose output"},
     )
 
-    html_issues_warn_only = fields.Boolean(
-        truthy=[True],
-        falsy=[False],
+    bad_assets_regex = String(
+        metadata={
+            "label": "Bad assets regex",
+            "description": "Regular expression of asset URLs known to not be available."
+            " Case insensitive.",
+        },
+        data_key="bad-assets-regex",
+    )
+
+    bad_assets_threshold = fields.Integer(
         metadata={
-            "label": "HTML issues warn only",
-            "description": "[dev] Only log a warning when unexpected HTML is "
-            "encountered. Use with caution because activating this option means that "
-            "ZIM HTML will probably lead to online resources without user noticing it.",
+            "label": "Bad assets threshold",
+            "description": "[dev] Number of assets allowed to fail to download before "
+            "failing the scraper. Assets already excluded with --bad-assets-regex are "
+            "not counted for this threshold. Defaults to 10 assets.",
         },
-        data_key="html-issues-warn-only",
+        data_key="bad-assets-threshold",
     )
 
     stats_filename = String(