v0.0.45 requests HEADERS constant (#374)
Signed-off-by: Glenn Jocher <[email protected]>
Co-authored-by: UltralyticsAssistant <[email protected]>
glenn-jocher and UltralyticsAssistant authored Jan 19, 2025
1 parent 17ec5eb commit 37ce210
Showing 4 changed files with 36 additions and 28 deletions.
2 changes: 1 addition & 1 deletion actions/__init__.py
@@ -22,4 +22,4 @@
# ├── test_summarize_pr.py
# └── ...

__version__ = "0.0.44"
__version__ = "0.0.45"
3 changes: 2 additions & 1 deletion actions/utils/__init__.py
@@ -1,6 +1,6 @@
# Ultralytics 🚀 AGPL-3.0 License - https://ultralytics.com/license

from .common_utils import remove_html_comments
from .common_utils import REQUESTS_HEADERS, remove_html_comments
from .github_utils import (
GITHUB_API_URL,
Action,
@@ -11,6 +11,7 @@

__all__ = (
"GITHUB_API_URL",
"REQUESTS_HEADERS",
"Action",
"check_pypi_version",
"get_completion",
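A minimal import sketch (illustration only, not part of this diff), assuming the package is importable as actions:

from actions.utils import REQUESTS_HEADERS  # newly re-exported at the package root

assert "User-Agent" in REQUESTS_HEADERS  # browser-like Chrome headers defined in common_utils.py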
56 changes: 30 additions & 26 deletions actions/utils/common_utils.py
@@ -7,6 +7,22 @@

import requests

REQUESTS_HEADERS = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"Accept-Language": "en-US,en;q=0.9,es;q=0.8,zh-CN;q=0.7,zh;q=0.6",
"Accept-Encoding": "gzip, deflate, br, zstd",
"sec-ch-ua": '"Chromium";v="132", "Google Chrome";v="132", "Not_A Brand";v="99"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"macOS"',
"Sec-Fetch-Site": "none",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-User": "?1",
"Sec-Fetch-Dest": "document",
"Referer": "https://www.google.com/",
"Origin": "https://www.google.com/",
}


def remove_html_comments(body: str) -> str:
"""Removes HTML comments from a string using regex pattern matching."""
@@ -20,7 +36,7 @@ def clean_url(url):
return url


def is_url(url, check=True, max_attempts=3, timeout=2):
def is_url(url, session=None, check=True, max_attempts=3, timeout=2):
"""Check if string is URL and optionally verify it exists."""
allow_list = (
"localhost",
@@ -36,8 +52,6 @@ def is_url(url, check=True, max_attempts=3, timeout=2):
"example",
"mailto:",
"github.com", # ignore GitHub links that may be private repos
"kaggle.com", # blocks automated header requests
"reddit.com", # blocks automated header requests
"linkedin.com",
"twitter.com",
"x.com",
@@ -54,31 +68,20 @@ def is_url(url, check=True, max_attempts=3, timeout=2):
if not result.scheme or not partition[0] or not partition[2]:
return False

# Check response
if check:
headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36",
"Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
"Accept-Language": "en-US,en;q=0.9,es;q=0.8,zh-CN;q=0.7,zh;q=0.6",
"Accept-Encoding": "gzip, deflate, br, zstd",
"sec-ch-ua": '"Chromium";v="132", "Google Chrome";v="132", "Not_A Brand";v="99"',
"sec-ch-ua-mobile": "?0",
"sec-ch-ua-platform": '"macOS"',
"Sec-Fetch-Site": "none",
"Sec-Fetch-Mode": "navigate",
"Sec-Fetch-User": "?1",
"Sec-Fetch-Dest": "document",
"Referer": "https://www.google.com/",
"Origin": "https://www.google.com/",
}
requester = session if session else requests
bad_codes = {404, 410, 500, 502, 503, 504}
kwargs = {"timeout": timeout, "allow_redirects": True}
if not session:
kwargs["headers"] = REQUESTS_HEADERS

for attempt in range(max_attempts):
try:
response = requests.head(url, headers=headers, timeout=timeout, allow_redirects=True)
if response.status_code not in bad_codes:
return True
response = requests.get(url, headers=headers, timeout=timeout, allow_redirects=True, stream=True)
return response.status_code not in bad_codes # Try GET if HEAD fails
# Try HEAD first, then GET if needed
for method in (requester.head, requester.get):
if method(url, stream=method == requester.get, **kwargs).status_code not in bad_codes:
return True
return False
except Exception:
if attempt == max_attempts - 1: # last attempt
return False
@@ -111,8 +114,9 @@ def check_links_in_string(text, verbose=True, return_bad=False):

urls = set(map(clean_url, all_urls)) # remove extra characters and make unique
# bad_urls = [x for x in urls if not is_url(x, check=True)] # single-thread
with ThreadPoolExecutor(max_workers=16) as executor: # multi-thread
bad_urls = [url for url, valid in zip(urls, executor.map(lambda x: not is_url(x, check=True), urls)) if valid]
with requests.Session() as session, ThreadPoolExecutor(max_workers=16) as executor:
session.headers.update(REQUESTS_HEADERS)
bad_urls = [url for url, valid in zip(urls, executor.map(lambda x: not is_url(x, session), urls)) if valid]

passing = not bad_urls
if verbose and not passing:
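As a usage illustration only (not part of this commit), a sketch of the new session path in is_url, assuming actions.utils.common_utils is importable:

from concurrent.futures import ThreadPoolExecutor

import requests

from actions.utils import REQUESTS_HEADERS
from actions.utils.common_utils import is_url

urls = [
    "https://www.kaggle.com/models/ultralytics/yolo11",
    "https://apps.apple.com/xk/app/ultralytics/id1583935240",
]

# Share one Session carrying the browser-like headers across worker threads,
# mirroring what check_links_in_string() now does internally.
with requests.Session() as session:
    session.headers.update(REQUESTS_HEADERS)
    with ThreadPoolExecutor(max_workers=8) as executor:
        results = dict(zip(urls, executor.map(lambda u: is_url(u, session), urls)))

print(results)  # {url: True/False}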
3 changes: 3 additions & 0 deletions tests/test_urls.py
@@ -20,6 +20,9 @@
"https://www.kdnuggets.com/",
"https://www.datacamp.com/tutorial/understanding-logistic-regression-python",
"https://www.statisticshowto.com/probability-and-statistics/find-outliers/",
"https://www.reddit.com/r/Ultralytics/comments/1fw3605/release_megathread/",
"https://www.kaggle.com/models/ultralytics/yolo11",
"https://apps.apple.com/xk/app/ultralytics/id1583935240",
]


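Two of the URLs added above belong to hosts (kaggle.com, reddit.com) that were just removed from the is_url allow_list, since requests now carry the browser-like REQUESTS_HEADERS. A quick manual check, shown only as an illustrative sketch:

import requests

from actions.utils import REQUESTS_HEADERS

url = "https://www.reddit.com/r/Ultralytics/comments/1fw3605/release_megathread/"
response = requests.head(url, headers=REQUESTS_HEADERS, timeout=2, allow_redirects=True)
print(url, response.status_code)  # any code outside {404, 410, 500, 502, 503, 504} is treated as reachable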
