Async URL checks #375

Open · wants to merge 8 commits into base: main
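Summary: this PR replaces the synchronous requests + ThreadPoolExecutor URL checking in actions/utils/common_utils.py with aiohttp coroutines (is_url becomes is_url_async, check_links_in_string becomes check_links_in_string_async), adds aiohttp to the pyproject.toml dependencies, and converts tests/test_urls.py to async tests marked with @pytest.mark.asyncio.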
54 changes: 34 additions & 20 deletions actions/utils/common_utils.py
@@ -1,11 +1,10 @@
 # Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

+import asyncio
 import re
-import time
-from concurrent.futures import ThreadPoolExecutor
 from urllib import parse

-import requests
+import aiohttp

 REQUESTS_HEADERS = {
     "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
@@ -70,8 +69,8 @@ def clean_url(url):
     return url


-def is_url(url, session=None, check=True, max_attempts=3, timeout=2):
-    """Check if string is URL and optionally verify it exists."""
+async def is_url_async(url, session, check=True, max_attempts=3, timeout=2):
+    """Asynchronously check if string is URL and optionally verify it exists."""
     try:
         # Check allow list
         if any(x in url for x in URL_IGNORE_LIST):
@@ -84,41 +83,41 @@ def is_url(url, session=None, check=True, max_attempts=3, timeout=2):
             return False

         if check:
-            requester = session or requests
             bad_codes = {404, 410, 500, 502, 503, 504}
-            kwargs = {"timeout": timeout, "allow_redirects": True}
-            if not session:
-                kwargs["headers"] = REQUESTS_HEADERS
+            kwargs = {"timeout": aiohttp.ClientTimeout(total=timeout)}

             for attempt in range(max_attempts):
                 try:
                     # Try HEAD first, then GET if needed
-                    for method in (requester.head, requester.get):
-                        if method(url, stream=method == requester.get, **kwargs).status_code not in bad_codes:
-                            return True
+                    for method in (session.head, session.get):
+                        async with method(url, **kwargs) as response:
+                            if response.status not in bad_codes:
+                                return True
                     return False
-                except Exception:
+                except (aiohttp.ClientError, asyncio.TimeoutError):
                     if attempt == max_attempts - 1:  # last attempt
                         return False
-                    time.sleep(2**attempt)  # exponential backoff
+                    await asyncio.sleep(2**attempt)  # exponential backoff
             return False
         return True
     except Exception:
         return False


-def check_links_in_string(text, verbose=True, return_bad=False):
-    """Process a given text, find unique URLs within it, and check for any 404 errors."""
+async def check_links_in_string_async(text, verbose=True, return_bad=False):
+    """Asynchronously process a given text, find unique URLs within it, and check for any 404 errors."""
     all_urls = []
     for md_text, md_url, plain_url in URL_PATTERN.findall(text):
         url = md_url or plain_url
         if url and parse.urlparse(url).scheme:
             all_urls.append(url)

     urls = set(map(clean_url, all_urls))  # remove extra characters and make unique
-    with requests.Session() as session, ThreadPoolExecutor(max_workers=16) as executor:
-        session.headers.update(REQUESTS_HEADERS)
-        bad_urls = [url for url, valid in zip(urls, executor.map(lambda x: not is_url(x, session), urls)) if valid]
+    async with aiohttp.ClientSession(headers=REQUESTS_HEADERS) as session:
+        tasks = [is_url_async(url, session) for url in urls]
+        results = await asyncio.gather(*tasks)
+        bad_urls = [url for url, valid in zip(urls, results) if not valid]

     passing = not bad_urls
     if verbose and not passing:
@@ -127,5 +126,20 @@ def check_links_in_string(text, verbose=True, return_bad=False):
     return (passing, bad_urls) if return_bad else passing


+async def main():
+    # Example usage
+    passing, bad_urls = await check_links_in_string_async(
+        "Check out https://ultralytics.com/images/bus.jpg and this non-existent link https://ultralytics.com/invalid"
+    )
+    print(f"Passing: {passing}")
+    if not passing:
+        print(f"Bad URLs: {bad_urls}")
+
+    # Test is_url_async directly
+    async with aiohttp.ClientSession() as session:
+        result = await is_url_async("https://ultralytics.com/images/bus.jpg", session)
+        print(f"Is valid URL: {result}")
+
+
 if __name__ == "__main__":
-    print(is_url("https://ultralytics.com/images/bus.jpg"))
+    asyncio.run(main())
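A note on concurrency: the old implementation capped parallel checks at 16 via ThreadPoolExecutor(max_workers=16), while the new asyncio.gather call starts every URL check at once. If a bound is wanted, a minimal sketch with asyncio.Semaphore follows; it assumes is_url_async and REQUESTS_HEADERS are importable from actions.utils.common_utils, and the wrapper name check_urls_bounded and its limit are illustrative, not part of this PR:

import asyncio

import aiohttp

from actions.utils.common_utils import REQUESTS_HEADERS, is_url_async


async def check_urls_bounded(urls, limit=16):
    """Check URLs concurrently with at most `limit` requests in flight (illustrative wrapper)."""
    semaphore = asyncio.Semaphore(limit)  # hypothetical bound, mirroring the old max_workers=16

    async def guarded(url, session):
        async with semaphore:  # waits while `limit` checks are already running
            return await is_url_async(url, session)

    async with aiohttp.ClientSession(headers=REQUESTS_HEADERS) as session:
        return await asyncio.gather(*(guarded(url, session) for url in urls))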
1 change: 1 addition & 0 deletions pyproject.toml
@@ -68,6 +68,7 @@ dependencies = [
"requests>=2.32.3",
"ruff>=0.9.1",
"docformatter>=1.7.5",
"aiohttp>=3.11.11",
]

[project.optional-dependencies]
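Note: the updated tests below rely on the @pytest.mark.asyncio marker, which is provided by the pytest-asyncio plugin rather than pytest itself. This diff adds only aiohttp, so pytest-asyncio is presumably already available in the test environment; if not, a companion entry such as "pytest-asyncio>=0.21.0" (version pin illustrative) would be needed in the dependency list as well.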
67 changes: 40 additions & 27 deletions tests/test_urls.py
@@ -2,9 +2,11 @@

 # Continuous Integration (CI) GitHub Actions tests


+import aiohttp
 import pytest

-from actions.utils.common_utils import check_links_in_string, is_url
+from actions.utils.common_utils import check_links_in_string_async, is_url_async

 URLS = [
     "https://docs.ultralytics.com/help/contributing",
@@ -32,79 +34,90 @@ def verbose():
     return False  # Set False to suppress print statements during tests


-def test_is_url():
-    """Test each URL using is_url function."""
-    for url in URLS:
-        assert is_url(url), f"URL check failed: {url}"
+@pytest.mark.asyncio
+async def test_is_url():
+    """Test each URL using is_url_async function."""
+    async with aiohttp.ClientSession() as session:
+        for url in URLS:
+            assert await is_url_async(url, session), f"URL check failed: {url}"


-def test_html_links(verbose):
+@pytest.mark.asyncio
+async def test_html_links(verbose):
     """Tests the validity of URLs within HTML anchor tags and returns any invalid URLs found."""
     text = "Visit <a href='https://err.com'>our site</a> or <a href=\"http://test.org\">test site</a>"
-    result, urls = check_links_in_string(text, verbose, return_bad=True)
+    result, urls = await check_links_in_string_async(text, verbose, return_bad=True)
     assert result is False
     assert set(urls) == {"https://err.com", "http://test.org"}


-def test_markdown_links(verbose):
-    """Validates URLs in Markdown links within a given text using check_links_in_string."""
+@pytest.mark.asyncio
+async def test_markdown_links(verbose):
+    """Validates URLs in Markdown links within a given text using check_links_in_string_async."""
     text = "Check [Example](https://err.com) or [Test](http://test.org)"
-    result, urls = check_links_in_string(text, verbose, return_bad=True)
+    result, urls = await check_links_in_string_async(text, verbose, return_bad=True)
     assert result is False
     assert set(urls) == {"https://err.com", "http://test.org"}


-def test_mixed_formats(verbose):
-    """Tests URL detection in mixed text formats (HTML, Markdown, plain text) using check_links_in_string."""
+@pytest.mark.asyncio
+async def test_mixed_formats(verbose):
+    """Tests URL detection in mixed text formats (HTML, Markdown, plain text) using check_links_in_string_async."""
     text = "A <a href='https://1.com'>link</a> and [markdown](https://2.org/) and https://3.net"
-    result, urls = check_links_in_string(text, return_bad=True)
+    result, urls = await check_links_in_string_async(text, return_bad=True)
     assert result is False
     assert set(urls) == {"https://1.com", "https://3.net"}


-def test_duplicate_urls(verbose):
-    """Tests detection of duplicate URLs in various text formats using the check_links_in_string function."""
+@pytest.mark.asyncio
+async def test_duplicate_urls(verbose):
+    """Tests detection of duplicate URLs in various text formats using the check_links_in_string_async function."""
     text = "Same URL: https://err.com and <a href='https://err.com'>link</a>"
-    result, urls = check_links_in_string(text, verbose, return_bad=True)
+    result, urls = await check_links_in_string_async(text, verbose, return_bad=True)
     assert result is False
     assert set(urls) == {"https://err.com"}


-def test_no_urls(verbose):
-    """Tests that a string with no URLs returns True when checked using the check_links_in_string function."""
+@pytest.mark.asyncio
+async def test_no_urls(verbose):
+    """Tests that a string with no URLs returns True when checked using the check_links_in_string_async function."""
     text = "This text contains no URLs."
-    result, urls = check_links_in_string(text, verbose, return_bad=True)
+    result, urls = await check_links_in_string_async(text, verbose, return_bad=True)
     assert result is True
     assert not set(urls)


-def test_invalid_urls(verbose):
+@pytest.mark.asyncio
+async def test_invalid_urls(verbose):
     """Test invalid URLs."""
     text = "Invalid URL: http://.com"
-    result, urls = check_links_in_string(text, verbose, return_bad=True)
+    result, urls = await check_links_in_string_async(text, verbose, return_bad=True)
     assert result is False
     assert set(urls) == {"http://.com"}


-def test_urls_with_paths_and_queries(verbose):
+@pytest.mark.asyncio
+async def test_urls_with_paths_and_queries(verbose):
     """Test URLs with paths and query parameters to ensure they are correctly identified and validated."""
     text = "Complex URL: https://err.com/path?query=value#fragment"
-    result, urls = check_links_in_string(text, verbose, return_bad=True)
+    result, urls = await check_links_in_string_async(text, verbose, return_bad=True)
     assert result is False
     assert set(urls) == {"https://err.com/path?query=value#fragment"}


-def test_urls_with_different_tlds(verbose):
+@pytest.mark.asyncio
+async def test_urls_with_different_tlds(verbose):
     """Test URLs with various top-level domains (TLDs) to ensure correct identification and handling."""
     text = "Different TLDs: https://err.ml https://err.org https://err.net https://err.io https://err.ai"
-    result, urls = check_links_in_string(text, verbose, return_bad=True)
+    result, urls = await check_links_in_string_async(text, verbose, return_bad=True)
     assert result is False
     assert set(urls) == {"https://err.ml", "https://err.io", "https://err.ai"}


-def test_case_sensitivity(verbose):
+@pytest.mark.asyncio
+async def test_case_sensitivity(verbose):
     """Tests URL case sensitivity by verifying that URLs with different cases are correctly identified and handled."""
     text = "Case test: HTTPS://err.com and https://err.com"
-    result, urls = check_links_in_string(text, verbose, return_bad=True)
+    result, urls = await check_links_in_string_async(text, verbose, return_bad=True)
     assert result is False
     assert set(urls) == {"https://err.com"}
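For a quick manual check outside the test suite, the new coroutines can be driven with asyncio.run. A minimal sketch, reusing the first entry of URLS above; the module path and function names match this diff, but the script itself is illustrative:

import asyncio

import aiohttp

from actions.utils.common_utils import check_links_in_string_async, is_url_async


async def demo():
    # Single-URL check against a known-good link from URLS
    async with aiohttp.ClientSession() as session:
        print(await is_url_async("https://docs.ultralytics.com/help/contributing", session))

    # Whole-string check, mirroring the tests above
    passing, bad_urls = await check_links_in_string_async(
        "See https://docs.ultralytics.com/help/contributing", return_bad=True
    )
    print(passing, bad_urls)


if __name__ == "__main__":
    asyncio.run(demo())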