Skip to content

Commit

Permalink
v0.0.44 Improved URL check robustness (#372)
Browse files: browse the repository at this point in the history
Signed-off-by: Glenn Jocher <[email protected]>
Signed-off-by: UltralyticsAssistant <[email protected]>
Co-authored-by: UltralyticsAssistant <[email protected]>
  • Loading branch information
glenn-jocher and UltralyticsAssistant authored Jan 18, 2025
1 parent 31f9975 commit 8a24152
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 14 deletions.
2 changes: 1 addition & 1 deletion actions/__init__.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -22,4 +22,4 @@
# ├── test_summarize_pr.py
# └── ...

__version__ = "0.0.43"
__version__ = "0.0.44"
30 changes: 22 additions & 8 deletions actions/utils/common_utils.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -21,7 +21,7 @@ def clean_url(url):


def is_url(url, check=True, max_attempts=3, timeout=2):
"""Check if string is URL and check if URL exists."""
"""Check if string is URL and optionally verify it exists."""
allow_list = (
"localhost",
"127.0.0",
Expand Down Expand Up @@ -56,15 +56,29 @@ def is_url(url, check=True, max_attempts=3, timeout=2):

# Check response
if check:
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"Accept-Language": "en-US,en;q=0.9,es;q=0.8,zh-CN;q=0.7,zh;q=0.6",
"Accept-Encoding": "gzip, deflate, br, zstd",
"sec-ch-ua": '"Chromium";v="132", "Google Chrome";v="132", "Not_A Brand";v="99"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"macOS"',
"Sec-Fetch-Site": "none",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-User": "?1",
"Sec-Fetch-Dest": "document",
"Referer": "https://www.google.com/",
"Origin": "https://www.google.com/",
}
bad_codes = {404, 410, 500, 502, 503, 504}
for attempt in range(max_attempts):
try:
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
"Accept": "*/*", # Wildcard for maximum compatibility
"Accept-Language": "*", # Wildcard for any language
"Accept-Encoding": "*", # Wildcard for any encoding
}
return requests.head(url, headers=headers, timeout=timeout, allow_redirects=True).status_code < 400
response = requests.head(url, headers=headers, timeout=timeout, allow_redirects=True)
if response.status_code not in bad_codes:
return True
response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True, stream=True)
return response.status_code not in bad_codes # Try GET if HEAD fails
except Exception:
if attempt == max_attempts - 1: # last attempt
return False
Expand Down
17 changes: 12 additions & 5 deletions tests/test_urls.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -7,12 +7,19 @@
from actions.utils.common_utils import check_links_in_string, is_url

URLS = [
"https://docs.ultralytics.com/help/CLA/",
"https://docs.ultralytics.com/help/contributing",
"https://docs.ultralytics.com",
"https://ultralytics.com",
"https://ultralytics.com/images/bus.jpg",
"https://github.com/ultralytics/ultralytics",
"https://azure.microsoft.com/",
"https://www.tableau.com/",
"https://openai.com/research/gpt-4",
"https://azure.microsoft.com/en-us/services/machine-learning/",
"https://azure.microsoft.com/en-us/products/storage/blobs",
"https://www.reuters.com/article/idUSKCN1MK08G/",
"https://www.kdnuggets.com/",
"https://www.datacamp.com/tutorial/understanding-logistic-regression-python",
"https://www.statisticshowto.com/probability-and-statistics/find-outliers/",
]


Expand All @@ -37,7 +44,7 @@ def test_html_links(verbose):


def test_markdown_links(verbose):
"""Validates URLs in markdown links within a given text using check_links_in_string."""
"""Validates URLs in Markdown links within a given text using check_links_in_string."""
text = "Check [Example](https://err.com) or [Test](http://test.org)"
result, urls = check_links_in_string(text, verbose, return_bad=True)
assert result is False
Expand All @@ -49,7 +56,7 @@ def test_mixed_formats(verbose):
text = "A <a href='https://1.com'>link</a> and [markdown](https://2.org) and https://3.net"
result, urls = check_links_in_string(text, return_bad=True)
assert result is False
assert set(urls) == {"https://1.com", "https://2.org", "https://3.net"}
assert set(urls) == {"https://1.com", "https://3.net"}


def test_duplicate_urls(verbose):
Expand Down Expand Up @@ -89,7 +96,7 @@ def test_urls_with_different_tlds(verbose):
text = "Different TLDs: https://err.ml https://err.org https://err.net https://err.io https://err.ai"
result, urls = check_links_in_string(text, verbose, return_bad=True)
assert result is False
assert set(urls) == {"https://err.ml", "https://err.org", "https://err.net", "https://err.io", "https://err.ai"}
assert set(urls) == {"https://err.ml", "https://err.io", "https://err.ai"}


def test_case_sensitivity(verbose):
Expand Down

0 comments on commit 8a24152

Please sign in to comment.