Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Url check to include redirect #74

Merged
merged 8 commits into from
Jan 24, 2023
Merged
Show file tree
Hide file tree
Changes from 5 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 11 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,17 @@

## [Unreleased]

### Added

- Check to ensure that a topic URL resolves on discourse after allowing for any
redirects.

## Changed

- URL checks are now after allowing for any discourse redirects rather than
before letting topic URLs pass that do not include the slug or do not include
the topic id as long as the URL after redirects is valid.

## [v0.2.1] - 2023-01-20

### Fixed
Expand Down
107 changes: 78 additions & 29 deletions src/discourse.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,11 @@ class _ValidationResultValid(typing.NamedTuple):
Attrs:
value: The validation result, always True.
message: The validation message, always None.
final_url: The topic link after any redirects have been resolved.

"""

final_url: str
value: typing.Literal[True] = True
message: None = None

Expand All @@ -49,11 +51,13 @@ class _ValidationResultInvalid(typing.NamedTuple):
Attrs:
value: The validation result, always False.
message: The validation message as the reason the validation failed.
final_url: Always set to None since the url is not valid.

"""

message: str
value: typing.Literal[False] = False
final_url: None = None


_ValidationResult = _ValidationResultValid | _ValidationResultInvalid
Expand Down Expand Up @@ -83,59 +87,97 @@ def __init__(self, base_path: str, api_username: str, api_key: str, category_id:
self._api_username = api_username
self._api_key = api_key

@staticmethod
def _topic_url_path_components_valid(
path_components: typing.Sequence[str], url: str
) -> str | None:
"""Check whether the path components of a topic URL are valid.

Args:
path_components: The elements of the URL path after splitting on /.
url: The original URL, used for messages describing what is wrong.

Returns:
Whether the message for what is wrong with the path components or None if no issues
were found.
"""
if not len(path_components) == 3:
return (
"Unexpected number of path components, "
f"expected: 3, got: {len(path_components)}, {url=}"
)

if not path_components[0] == "t":
return (
"Unexpected first path component, "
f"expected: {'t'!r}, got: {path_components[0]!r}, {url=}"
)

if not path_components[1]:
return f"Empty second path component topic slug, got: {path_components[1]!r}, {url=}"

if not path_components[2].isnumeric():
return (
"unexpected third path component topic id, "
"expected: a string that can be converted to an integer, "
f"got: {path_components[2]!r}, {url=}"
)

return None

def topic_url_valid(self, url: str) -> _ValidationResult:
"""Check whether a url to a topic is valid. Assume the url is well formatted.

Validations:
1. The URL must start with the base path configured during construction.
2. The URL must have 3 components in its path.
3. The first component in the path must be the literal 't'.
4. The second component in the path must be the slug to the topic which must have at
2. The URL must resolve on a discourse HEAD request.
3. The URL must have 3 components in its path.
4. The first component in the path must be the literal 't'.
5. The second component in the path must be the slug to the topic which must have at
least 1 character.
5. The third component must the the topic id as an integer.
6. The third component must the the topic id as an integer.

Args:
url: The URL to check.

Returns:
Whether the URL is a valid topic URL.

"""
if not url.startswith((self._base_path, _URL_PATH_PREFIX)):
return _ValidationResultInvalid(
"The base path is different to the expected base path, "
f"expected: {self._base_path}, {url=}"
)

parsed_url = parse.urlparse(url=url)
# Remove trailing / and ignore first element which is always empty
path_components = parsed_url.path.rstrip("/").split("/")[1:]

if not len(path_components) == 3:
return _ValidationResultInvalid(
"Unexpected number of path components, "
f"expected: 3, got: {len(path_components)}, {url=}"
try:
response = self._get_requests_session().head(
url if url.startswith(self._base_path) else f"{self._base_path}{url}",
allow_redirects=True,
)

if not path_components[0] == "t":
response.raise_for_status()
url = response.url
except (
requests.HTTPError,
requests.exceptions.ConnectionError,
requests.exceptions.Timeout,
requests.exceptions.RequestException,
) as exc:
return _ValidationResultInvalid(
"Unexpected first path component, "
f"expected: {'t'!r}, got: {path_components[0]!r}, {url=}"
f"The topic URL could not be resolved on discourse, error: {exc}, {url=}"
)

if not path_components[1]:
return _ValidationResultInvalid(
f"Empty second path component topic slug, got: {path_components[1]!r}, {url=}"
)
parsed_url = parse.urlparse(url=url)
# Remove trailing / and ignore first element which is always empty
path_components = parsed_url.path.rstrip("/").split("/")[1:]

if not path_components[2].isnumeric():
return _ValidationResultInvalid(
"unexpected third path component topic id, "
"expected: a string that can be converted to an integer, "
f"got: {path_components[2]!r}, {url=}"
if (
components_message := self._topic_url_path_components_valid(
path_components=path_components, url=url
)
) is not None:
return _ValidationResultInvalid(components_message)

return _ValidationResultValid()
return _ValidationResultValid(final_url=url)

def _url_to_topic_info(self, url: str) -> _DiscourseTopicInfo:
"""Retrieve the topic information from the url to the topic.
Expand All @@ -148,12 +190,14 @@ def _url_to_topic_info(self, url: str) -> _DiscourseTopicInfo:

Raises:
DiscourseError: if the url is not valid.

"""
result = self.topic_url_valid(url=url)
if not result.value:
raise DiscourseError(result.message)

# If the result is valid, the final_url is guaranteed to be a string
url = typing.cast(str, result.final_url)

path_components = parse.urlparse(url=url).path.split("/")
return _DiscourseTopicInfo(slug=path_components[-2], id_=int(path_components[-1]))

Expand Down Expand Up @@ -327,7 +371,12 @@ def retrieve_topic(self, url: str) -> str:
)
try:
response.raise_for_status()
except requests.HTTPError as exc:
except (
requests.HTTPError,
requests.exceptions.ConnectionError,
requests.exceptions.Timeout,
requests.exceptions.RequestException,
) as exc:
raise DiscourseError(f"Error retrieving the topic, {url=!r}") from exc

return response.content.decode("utf-8")
Expand Down
44 changes: 41 additions & 3 deletions tests/integration/test_discourse.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
pytestmark = pytest.mark.discourse


SLUG_REGEX = r"\/t\/[\w-]*\/"
jdkandersson marked this conversation as resolved.
Show resolved Hide resolved


def change_url_slug(url: str, new_slug: str) -> str:
"""Change the slug of a topic URL.

Expand All @@ -27,7 +30,31 @@ def change_url_slug(url: str, new_slug: str) -> str:
Returns:
The URL with the changed slug.
"""
return re.sub(r"\/t\/[\w-]*\/", f"/t/{new_slug}/", url)
return re.sub(SLUG_REGEX, f"/t/{new_slug}/", url)


def remove_url_slug(url: str) -> str:
"""Remove the slug from a topic URL.

Args:
url: The topic URL to change.

Returns:
The URL without the slug.
"""
return re.sub(SLUG_REGEX, "/t/", url)


def remove_url_topic_id(url: str) -> str:
"""Remove the topic id from a topic URL.

Args:
url: The topic URL to change.

Returns:
The URL without the topic id.
"""
return re.sub(r"\/\d+$", "", url)


@pytest.mark.asyncio
Expand Down Expand Up @@ -86,10 +113,11 @@ async def test_create_retrieve_update_delete_topic(


@pytest.mark.asyncio
async def test_retrieve_wrong_slug(discourse_api: Discourse):
async def test_retrieve_wrong_url(discourse_api: Discourse):
"""
arrange: given running discourse server
act: when a topic is created and retrieved with the wrong slug
act: when a topic is created and retrieved with the wrong slug, without the slug or without the
topic id
assert: then the correct content is returned.
"""
title = "title 1 padding so it is long enough test_retrieve_wrong_slug"
Expand All @@ -101,6 +129,16 @@ async def test_retrieve_wrong_slug(discourse_api: Discourse):

assert returned_content == content

url_missing_slug = remove_url_slug(url)
returned_content = discourse_api.retrieve_topic(url=url_missing_slug)

assert returned_content == content

url_missing_topic_id = remove_url_topic_id(url)
returned_content = discourse_api.retrieve_topic(url=url_missing_topic_id)

assert returned_content == content


@pytest.mark.asyncio
async def test_update_wrong_slug(discourse_api: Discourse):
Expand Down
46 changes: 39 additions & 7 deletions tests/unit/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,28 +3,60 @@

"""Fixtures for all unit tests."""

# pylint: disable=redefined-outer-name

from pathlib import Path
from unittest import mock

import pytest
import requests

from src import index
from src.discourse import Discourse

from . import helpers


@pytest.fixture(scope="module")
def base_path() -> str:
@pytest.fixture(scope="module", name="base_path")
def fixture_base_path() -> str:
"""Get the base path for discourse."""
return "http://discourse"
return helpers.get_discourse_base_path()


@pytest.fixture()
def discourse(base_path: str) -> Discourse:
@pytest.fixture(name="discourse")
def fixture_discourse(base_path: str) -> Discourse:
"""Get the discourse client."""
return Discourse(base_path=base_path, api_username="", api_key="", category_id=0)


@pytest.fixture(name="discourse_mocked_get_requests_session")
def fixture_discourse_mocked_get_requests_session(
discourse: Discourse, monkeypatch: pytest.MonkeyPatch
) -> Discourse:
"""Get the mocked get_session."""
# Have to access protected attributes to be able to mock them for tests
# pylint: disable=protected-access
mock_get_requests_session = mock.MagicMock(spec=discourse._get_requests_session)
mocked_session = mock.MagicMock(spec=requests.Session)
mock_get_requests_session.return_value = mocked_session
mocked_get_response = mock.MagicMock(spec=requests.Response)
mocked_session.get.return_value = mocked_get_response
mocked_head_response = mock.MagicMock(spec=requests.Response)
mocked_session.head.return_value = mocked_head_response
monkeypatch.setattr(discourse, "_get_requests_session", mock_get_requests_session)
return discourse


@pytest.fixture(name="topic_url")
def fixture_topic_url(discourse_mocked_get_requests_session: Discourse) -> str:
"""Get the base path for discourse."""
url = helpers.get_discourse_topic_url()
discourse = discourse_mocked_get_requests_session
# Have to access protected attributes to be able to mock them for tests
# pylint: disable=protected-access
# mypy complains that _get_requests_session has no attribute ..., it is actually mocked
discourse._get_requests_session.return_value.head.return_value.url = url # type: ignore
return url


@pytest.fixture()
def index_file_content(tmp_path: Path) -> str:
"""Create index file."""
Expand Down
19 changes: 19 additions & 0 deletions tests/unit/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@
from pathlib import Path

from src import metadata
from src.discourse import _URL_PATH_PREFIX


def create_metadata_yaml(content: str, path: Path) -> None:
Expand Down Expand Up @@ -43,3 +44,21 @@ def path_to_markdown(path: Path) -> Path:
Path with last path being a markdown file.
"""
return Path(f"{path}.md")


def get_discourse_base_path() -> str:
"""Get the base path for discourse.

Returns:
The base path for discourse.
"""
return "http://discourse"


def get_discourse_topic_url() -> str:
"""Get a topic url for discourse.

Returns:
A topic url for discourse.
"""
return f"{get_discourse_base_path()}{_URL_PATH_PREFIX}slug/1"
Loading