zytedata · Gallaecio · Dec 16, 2024 · Dec 16, 2024 · Dec 17, 2024
diff --git a/setup.py b/setup.py
@@ -20,6 +20,7 @@
         "formasaurus>=0.10.0",
         "jmespath>=0.9.5",
         "pydantic>=2.1",
+        "python-slugify>=6.0.1",
         "requests>=2.31.0",
         "scrapinghub >= 2.4.0",
         "scrapy>=2.11.0",

diff --git a/tests/__init__.py b/tests/__init__.py
@@ -1,3 +1,5 @@
+import contextlib
+import os
 from typing import Any, Dict, Optional, Type
 
 import pytest
@@ -20,3 +22,15 @@ def get_crawler(
     runner = CrawlerRunner(settings)
     crawler = runner.create_crawler(spider_cls)
     return crawler
+
+
+# https://stackoverflow.com/a/34333710
+@contextlib.contextmanager
+def set_env(**environ):
+    """Temporarily apply *environ* on top of ``os.environ``.
+
+    A snapshot of ``os.environ`` is taken on entry and restored on exit,
+    whether the managed block finishes normally, raises, or applying
+    *environ* itself fails part-way.
+
+    :param environ: environment variable names mapped to the values they
+        should have inside the ``with`` block.
+    """
+    old_environ = dict(os.environ)
+    try:
+        # Apply the overrides inside the ``try`` so that a failure while
+        # setting one of them (e.g. a non-string value) cannot leave the
+        # previously applied ones behind.
+        os.environ.update(environ)
+        yield
+    finally:
+        os.environ.clear()
+        os.environ.update(old_environ)
diff --git a/tests/incremental/__init__.py b/tests/incremental/__init__.py
diff --git a/tests/incremental/test_collection_fp_manager.py b/tests/incremental/test_collection_fp_manager.py
@@ -2,14 +2,20 @@
 from unittest.mock import MagicMock, patch
 
 import pytest
+from scrapy import Spider
 from scrapy.statscollectors import StatsCollector
 from scrapy.utils.request import RequestFingerprinter
+from scrapy.utils.test import get_crawler as _get_crawler
 from twisted.internet.defer import Deferred, inlineCallbacks
 
-from tests import get_crawler
-from zyte_spider_templates._incremental.manager import CollectionsFingerprintsManager
+from zyte_spider_templates._incremental.manager import (
+    CollectionsFingerprintsManager,
+    _get_collection_name,
+)
 from zyte_spider_templates.spiders.article import ArticleSpider
 
+from .. import get_crawler, set_env
+
 
 @pytest.fixture
 def mock_crawler():
@@ -207,3 +213,39 @@ def test_spider_closed(mock_scrapinghub_client):
     fp_manager.save_batch = MagicMock(side_effect=fp_manager.save_batch)  # type: ignore
     fp_manager.spider_closed()
     fp_manager.save_batch.assert_called_once()
+
+
+@pytest.mark.parametrize(
+    "env_vars,settings,spider_name,collection_name",
+    [
+        # Precedence: the INCREMENTAL_CRAWL_COLLECTION_NAME setting wins over
+        # the SHUB_VIRTUAL_SPIDER environment variable, which wins over
+        # Spider.name. The setting is used as is; the other two are
+        # slugified, length-limited, and get an “_incremental” suffix.
+        #
+        # Only Spider.name is available.
+        (
+            {},
+            {},
+            "a A-1.α" + "a" * 2048,
+            "a_A_1_a" + "a" * (2048 - len("a_A_1_a_incremental")) + "_incremental",
+        ),
+        # SHUB_VIRTUAL_SPIDER takes over from Spider.name.
+        (
+            {"SHUB_VIRTUAL_SPIDER": "a A-1.α" + "a" * 2048},
+            {},
+            "foo",
+            "a_A_1_a" + "a" * (2048 - len("a_A_1_a_incremental")) + "_incremental",
+        ),
+        # The setting takes over from both, and is left untouched.
+        (
+            {"SHUB_VIRTUAL_SPIDER": "bar"},
+            {"INCREMENTAL_CRAWL_COLLECTION_NAME": "a A-1.α" + "a" * 2048},
+            "foo",
+            "a A-1.α" + "a" * 2048,
+        ),
+    ],
+)
+def test_collection_name(env_vars, settings, spider_name, collection_name):
+    """Check which source ``_get_collection_name`` uses and how it is normalized."""
+
+    class TestSpider(Spider):
+        name = spider_name
+
+    crawler = _get_crawler(spidercls=TestSpider, settings_dict=settings)
+    crawler.spider = TestSpider()
+    # The environment is only patched while the name is being resolved.
+    with set_env(**env_vars):
+        actual = _get_collection_name(crawler)
+    assert actual == collection_name
diff --git a/tox.ini b/tox.ini
@@ -28,6 +28,7 @@ deps =
     formasaurus==0.10.0
     jmespath==0.9.5
     pydantic==2.1
+    python-slugify==6.0.1
     requests==2.31.0
     scrapinghub==2.4.0
     scrapy==2.11.0

diff --git a/zyte_spider_templates/_incremental/manager.py b/zyte_spider_templates/_incremental/manager.py
@@ -10,6 +10,7 @@
 from scrapy import signals
 from scrapy.crawler import Crawler
 from scrapy.http.request import Request
+from slugify import slugify
 from zyte_common_items import Item
 
 from zyte_spider_templates.utils import (
@@ -22,11 +23,19 @@
 logger = logging.getLogger(__name__)
 
 INCREMENTAL_SUFFIX = "_incremental"
+_MAX_LENGTH = 2048 - len(INCREMENTAL_SUFFIX)
 COLLECTION_API_URL = "https://storage.scrapinghub.com/collections"
 
 THREAD_POOL_EXECUTOR = ThreadPoolExecutor(max_workers=10)
 
 
+def _get_collection_name(crawler: Crawler) -> str:
+    """Return the name of the collection used for the fingerprints of *crawler*.
+
+    A non-empty ``INCREMENTAL_CRAWL_COLLECTION_NAME`` setting is returned
+    as is. Otherwise, the name is built from ``get_spider_name(crawler)``:
+    it is slugified with ``_`` as separator, cut to ``_MAX_LENGTH``
+    characters, stripped of trailing underscores, and extended with
+    ``INCREMENTAL_SUFFIX``, so the result is never longer than 2048
+    characters.
+    """
+    if name := crawler.settings.get("INCREMENTAL_CRAWL_COLLECTION_NAME"):
+        return name
+    # Slugify before truncating: transliteration may turn one character into
+    # several, so truncating first would not guarantee the length limit.
+    slug = slugify(
+        get_spider_name(crawler),
+        separator="_",
+        lowercase=False,
+        regex_pattern=r"[^a-zA-Z0-9_]",
+    )
+    # Trailing underscores (kept by the pattern above, or exposed by the
+    # cut) are dropped so that the suffix is preceded by a single underscore.
+    return slug[:_MAX_LENGTH].rstrip("_") + INCREMENTAL_SUFFIX
+
+
 class CollectionsFingerprintsManager:
     def __init__(self, crawler: Crawler) -> None:
         self.writer = None
@@ -37,7 +46,7 @@ def __init__(self, crawler: Crawler) -> None:
         self.batch_size = crawler.settings.getint("INCREMENTAL_CRAWL_BATCH_SIZE", 50)
 
         project_id = get_project_id(crawler)
-        collection_name = self.get_collection_name(crawler)
+        collection_name = _get_collection_name(crawler)
 
         self.init_collection(project_id, collection_name)
         self.api_url = f"{COLLECTION_API_URL}/{project_id}/s/{collection_name}"
@@ -51,12 +60,6 @@ def __init__(self, crawler: Crawler) -> None:
 
         crawler.signals.connect(self.spider_closed, signal=signals.spider_closed)
 
-    def get_collection_name(self, crawler):
-        return (
-            crawler.settings.get("INCREMENTAL_CRAWL_COLLECTION_NAME")
-            or f"{get_spider_name(crawler)}{INCREMENTAL_SUFFIX}"
-        )
-
     def init_collection(self, project_id, collection_name) -> None:
         client = get_client()
         collection = client.get_project(project_id).collections.get_store(