"Un-parametrize" webcrawler tests #2966

Merged: 5 commits, Sep 24, 2024

Changes from 1 commit
80 changes: 37 additions & 43 deletions tests/integration/web/crawler_test.py
@@ -3,15 +3,12 @@
 The crawler attempts to retrieve any NAV web UI page that can be reached with
 parameterless GET requests, while logged in as an administrator.
 
-We want one test for each such URL, but since generating more tests while
-running existing tests isn't easily supported under pytest (yield tests are
-becoming deprecated under pytest 4), the crawler is the de-facto reachability
-tester. A dummy test will be generated for each seen URL, and the dummy tests
-will assert that the response code of the URL was 200 OK.
-
-In addition, HTML validation tests (using libtidy) will be generated for all
-URLs that report a Content-Type of text/html.
-
+In some respects, it would be preferable to generate 1 named test for each
+reachable page, but the tests need to be generated during the test collection
+phase, which means that a full web server needs to be running before pytest
+runs - and it would also be preferable that the web server is started from a
+fixture. Instead, the webcrawler is itself a fixture that allows iteration over
+all reachable pages.
 """
 
 from collections import namedtuple
@@ -76,6 +73,14 @@

 Page = namedtuple('Page', 'url response content_type content')
 
+if not HOST_URL:
+    pytest.skip(
+        msg="Missing environment variable TARGETURL "
+        "(ADMINUSERNAME, ADMINPASSWORD) , skipping crawler "
+        "tests!",
+        allow_module_level=True,
+    )
+
 
 def normalize_path(url):
     url = urlsplit(url).path.rstrip('/')
@@ -216,32 +221,27 @@ def _quote_url(url):


 #
-# test functions
+# fixtures
 #
 
-# just one big, global crawler instance to ensure it's results are cached
-# throughout all the tests in a single session
-if HOST_URL:
-    crawler = WebCrawler(HOST_URL, USERNAME, PASSWORD)
-else:
-    crawler = Mock()
-    crawler.crawl.return_value = []
+
+@pytest.fixture(scope="session")
+def webcrawler():
+    crawler = WebCrawler(HOST_URL, USERNAME, PASSWORD)
+    yield crawler
 
 
-def page_id(page):
-    """Extracts a URL as a test id from a page"""
-    return normalize_path(page.url)
+#
+# test functions
+#
 
 
-@pytest.mark.skipif(
-    not HOST_URL,
-    reason="Missing environment variable TARGETURL "
-    "(ADMINUSERNAME, ADMINPASSWORD) , skipping crawler "
-    "tests!",
-)
-@pytest.mark.parametrize("page", crawler.crawl(), ids=page_id)
-def test_link_should_be_reachable(page):
-    assert page.response == 200, _content_as_string(page.content)
+def test_all_links_should_be_reachable(webcrawler):
+    for page in webcrawler.crawl():
+        if page.response != 200:
+            # No need to fill up the test report files with contents of OK pages
+            print(_content_as_string(page.content))
+        assert page.response == 200, "{} is not reachable".format(page.url)
Contributor:

My only complaint here is that it stops as soon as the first page is not reachable. Before, you could see all the pages that aren't reachable, which might help when troubleshooting, making it easier to figure out what is broken.

Member Author:
Fixed
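
The follow-up commit is not part of this view ("Changes from 1 commit"). A minimal sketch of how the loop could collect every unreachable page before asserting, assuming the `webcrawler` fixture and `_content_as_string` helper shown above, might look like this:

```python
def test_all_links_should_be_reachable(webcrawler):
    unreachable = []
    for page in webcrawler.crawl():
        if page.response != 200:
            # Only dump page contents for failures, as in the original loop
            print(_content_as_string(page.content))
            unreachable.append("{} (HTTP {})".format(page.url, page.response))

    # Assert only after every page has been visited, so the failure message
    # lists all broken URLs instead of stopping at the first one
    assert not unreachable, "Unreachable pages:\n" + "\n".join(unreachable)
```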



def _content_as_string(content):
@@ -251,23 +251,17 @@ def _content_as_string(content):
     return content.decode('utf-8')
 
 
-@pytest.mark.skipif(
-    not HOST_URL,
-    reason="Missing environment variable TARGETURL "
-    "(ADMINUSERNAME, ADMINPASSWORD) , skipping crawler "
-    "tests!",
-)
-@pytest.mark.parametrize("page", crawler.crawl_only_html(), ids=page_id)
-def test_page_should_be_valid_html(page):
-    if page.response != 200:
-        pytest.skip("not validating non-reachable page")
-    if not page.content:
-        pytest.skip("page has no content")
+def test_page_should_be_valid_html(webcrawler):
+    for page in webcrawler.crawl_only_html():
+        if page.response != 200 or not page.content:
+            continue
 
-    document, errors = tidy_document(page.content, TIDY_OPTIONS)
-    errors = filter_errors(errors)
+        document, errors = tidy_document(page.content, TIDY_OPTIONS)
+        errors = filter_errors(errors)
+        if errors:
+            print(errors)
 
-    assert not errors, "Found following validation errors:\n" + errors
+        assert not errors, "{} did not validate as HTML".format(page.url)
Contributor:

Same comment here as above.

Member Author:
Fixed
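
As with the reachability test, the later fix isn't shown in this commit. A sketch of the validation loop reporting every failing page at once, under the same assumptions (`webcrawler` fixture, `tidy_document`, `filter_errors`, and `TIDY_OPTIONS` from the module):

```python
def test_page_should_be_valid_html(webcrawler):
    invalid = []
    for page in webcrawler.crawl_only_html():
        if page.response != 200 or not page.content:
            continue

        document, errors = tidy_document(page.content, TIDY_OPTIONS)
        errors = filter_errors(errors)
        if errors:
            # Keep the detailed tidy output in the captured test output
            print(errors)
            invalid.append(page.url)

    # One assertion at the end lists every page with validation errors
    assert not invalid, "Pages that did not validate as HTML:\n" + "\n".join(invalid)
```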



def should_validate(page: Page):