RequestUrl and ResponseUrl using yarl #45

Open · wants to merge 9 commits into url-page-inputs
1 change: 1 addition & 0 deletions setup.py
@@ -25,6 +25,7 @@
         'url-matcher',
         'multidict',
         'w3lib >= 1.22.0',
+        'yarl',
     ],
     classifiers=[
         'Development Status :: 2 - Pre-Alpha',
4 changes: 2 additions & 2 deletions tests/test_mixins.py
@@ -16,7 +16,7 @@ def my_page(book_list_html_response):


 def test_url(my_page):
-    assert my_page.url == 'http://books.toscrape.com/index.html'
+    assert str(my_page.url) == 'http://books.toscrape.com/index.html'


 def test_html(my_page, book_list_html):
@@ -56,7 +56,7 @@ def test_custom_baseurl():
     )
     page = MyPage(response=response)

-    assert page.url == 'http://www.example.com/path'
+    assert str(page.url) == 'http://www.example.com/path'
     assert page.base_url == 'http://example.com/foo/'
     assert page.urljoin("bar") == 'http://example.com/foo/bar'
     assert page.urljoin("http://example.com/1") == "http://example.com/1"
81 changes: 75 additions & 6 deletions tests/test_page_inputs.py
@@ -4,8 +4,11 @@
 import pytest
 import requests

+import yarl
 import parsel
 from web_poet.page_inputs import (
+    RequestUrl,
+    ResponseUrl,
     HttpRequest,
     HttpResponse,
     HttpRequestBody,
@@ -16,6 +19,70 @@
 )


@pytest.mark.parametrize("cls", [RequestUrl, ResponseUrl])
def test_url(cls):
url_value = "https://example.com/category/product?query=123&id=xyz#frag1"

url = cls(url_value)

assert str(url) == url_value
assert url.scheme == "https"
assert url.host == "example.com"
assert url.path == "/category/product"
assert url.query_string == "query=123&id=xyz"
assert url.fragment == "frag1"

new_url = cls(url)
assert url == new_url
assert str(url) == str(new_url)


@pytest.mark.parametrize("cls", [RequestUrl, ResponseUrl])
def test_url_init(cls):
# via string
url_value = "https://example.com"
url = cls(url_value)

# via yarl
assert cls(yarl.URL(url_value)) == url

# via _Url subclasses
assert cls(cls(url_value)) == url


@pytest.mark.parametrize("compare_cls", [True, False])
@pytest.mark.parametrize("cls", [RequestUrl, ResponseUrl])
def test_url_equality(compare_cls, cls):
# Trailing / in the base URL
no_trail = cls("https://example.com")
with_trail = "https://example.com/"
if compare_cls:
with_trail = cls(with_trail)
assert no_trail == with_trail
else:
assert no_trail != with_trail
assert str(no_trail) != str(with_trail)

# Trailing / in the path URL
no_trail = cls("https://example.com/foo")
with_trail = "https://example.com/foo/"
if compare_cls:
with_trail = cls(with_trail)
assert no_trail != with_trail # Should not be equal
assert str(no_trail) != str(with_trail)


@pytest.mark.parametrize("cls", [RequestUrl, ResponseUrl])
def test_url_encoding(cls):
url_value = "http://εμπορικόσήμα.eu/путь/這裡"

url = cls(url_value)
str(url) == url_value

url = cls(url_value, encoded=False)
str(url) == "http://xn--jxagkqfkduily1i.eu/%D0%BF%D1%83%D1%82%D1%8C/%E9%80%99%E8%A3%A1"


@pytest.mark.parametrize("body_cls", [HttpRequestBody, HttpResponseBody])
def test_http_body_hashable(body_cls):
http_body = body_cls(b"content")
Expand Down Expand Up @@ -62,17 +129,18 @@ def test_http_response_body_json():


 @pytest.mark.parametrize(
-    ["cls", "body_cls"],
+    ["cls", "body_cls", "url_cls"],
     [
-        (HttpRequest, HttpRequestBody),
-        (HttpResponse, HttpResponseBody),
+        (HttpRequest, HttpRequestBody, RequestUrl),
+        (HttpResponse, HttpResponseBody, ResponseUrl),
     ]
 )
-def test_http_defaults(cls, body_cls):
+def test_http_defaults(cls, body_cls, url_cls):
     http_body = body_cls(b"content")

     obj = cls("url", body=http_body)
-    assert obj.url == "url"
+    assert isinstance(obj.url, url_cls)
+    assert str(obj.url) == "url"
     assert obj.body == b"content"
     assert not obj.headers
     assert obj.headers.get("user-agent") is None
@@ -164,7 +232,8 @@ def test_http_headers_init_dict(cls, headers_cls):

 def test_http_request_init_minimal():
     req = HttpRequest("url")
-    assert req.url == "url"
+    assert isinstance(req.url, RequestUrl)
+    assert str(req.url) == "url"
     assert req.method == "GET"
     assert isinstance(req.method, str)
     assert not req.headers
2 changes: 1 addition & 1 deletion tests/test_pages.py
@@ -34,7 +34,7 @@ class MyWebPage(ItemWebPage):

     def to_item(self) -> dict:
         return {
-            'url': self.url,
+            'url': str(self.url),
             'title': self.css('title::text').get().strip(),
         }

7 changes: 5 additions & 2 deletions tests/test_requests.py
@@ -3,6 +3,7 @@
 import pytest
 from web_poet.exceptions import RequestBackendError, HttpResponseError
 from web_poet.page_inputs import (
+    ResponseUrl,
     HttpClient,
     HttpRequest,
     HttpResponse,
@@ -37,7 +38,8 @@ async def test_perform_request_from_httpclient(async_mock):
     response = await client.get(url)

     # The async downloader implementation should return the HttpResponse
-    assert response.url == url
+    assert isinstance(response.url, ResponseUrl)
+    assert str(response.url) == url
     assert isinstance(response, HttpResponse)


@@ -161,8 +163,9 @@ async def test_http_client_execute(async_mock):
     request = HttpRequest("url-1")
     response = await client.execute(request)

+    assert isinstance(response.url, ResponseUrl)
     assert isinstance(response, HttpResponse)
-    assert response.url == "url-1"
+    assert str(response.url) == "url-1"


 @pytest.mark.asyncio
4 changes: 2 additions & 2 deletions web_poet/__init__.py
@@ -10,8 +10,8 @@
     HttpRequestBody,
     HttpResponseBody,
     Meta,
-    RequestURL,
-    ResponseURL,
+    RequestUrl,
+    ResponseUrl,
 )
 from .overrides import PageObjectRegistry, consume_modules, OverrideRule

2 changes: 1 addition & 1 deletion web_poet/mixins.py
@@ -67,7 +67,7 @@ def base_url(self) -> str:
         # FIXME: move it to HttpResponse
         if self._cached_base_url is None:
             text = self.html[:4096]
-            self._cached_base_url = get_base_url(text, self.url)
+            self._cached_base_url = get_base_url(text, str(self.url))
         return self._cached_base_url

     def urljoin(self, url: str) -> str:
4 changes: 2 additions & 2 deletions web_poet/page_inputs/__init__.py
@@ -7,7 +7,7 @@
     HttpResponseHeaders,
     HttpRequestBody,
     HttpResponseBody,
-    RequestURL,
-    ResponseURL
+    RequestUrl,
+    ResponseUrl
 )
 from .browser import BrowserHtml
64 changes: 58 additions & 6 deletions web_poet/page_inputs/http.py
@@ -9,6 +9,7 @@
     http_content_type_encoding
 )

+import yarl
 from web_poet._base import _HttpHeaders
 from web_poet.utils import memoizemethod_noargs
 from web_poet.mixins import SelectableMixin
@@ -18,13 +19,64 @@
 _AnyStrDict = Dict[AnyStr, Union[AnyStr, List[AnyStr], Tuple[AnyStr, ...]]]


-class ResponseURL(str):
-    """ URL of the response """
+class _Url:
+    def __init__(self, url: Union[str, yarl.URL, '_Url'], encoded=True):
+        self._url = yarl.URL(str(url), encoded=encoded)
+
+    def __str__(self) -> str:
+        return str(self._url)
+
+    def __repr__(self) -> str:
+        return f'{type(self).__name__}({str(self._url)!r})'
+
+    def __eq__(self, other) -> bool:
+        if not isinstance(other, type(self)):
+            return False
+        # A trailing "/" on an otherwise empty path is not significant:
+        # "https://example.com" equals "https://example.com/". Compare the
+        # full URLs (not just the paths) so different hosts stay unequal.
+        if self._url.path == "/" == other.path:
+            return str(self._url).rstrip("/") == str(other).rstrip("/")
+        return str(self._url) == str(other)

Review thread on the diff above:

@BurnzZ (Contributor, Author) commented:

The property methods below could instead be mapped dynamically from yarl's. However, we lose the benefit of defining docstrings within them.

@Gallaecio (Member, Jun 1, 2022) commented:

We would also lose API governance, so +1 to manual definition.

However, I wonder if we should define them at all for the initial implementation. We want to make sure we get the API right encoding-wise, and if we expose a part of the yarl interface as-is, I imagine we are introducing the encoding issue into our implementation, with the caveat of not supporting encoded=True in __init__ to at least prevent yarl from messing things up.

Maybe the initial implementation should use a string internally instead, and we can convert it into yarl later.

@BurnzZ (Contributor, Author, Jun 1, 2022) commented:

A good point about handling the encoding. What do you think about setting encoded=False by default to prevent yarl from messing things up due to incorrect encoding? be37f39. This would be equivalent to having a str internally, aside from the "smart" helper methods.
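To illustrate what the encoded flag toggles (the expected values here follow the strings asserted in test_url_encoding above):

    import yarl

    raw = "http://εμπορικόσήμα.eu/путь"

    # encoded=True: the caller promises the URL is already encoded;
    # yarl stores and prints it untouched.
    print(yarl.URL(raw, encoded=True))   # http://εμπορικόσήμα.eu/путь

    # encoded=False (yarl's own default): yarl IDNA-encodes the host and
    # percent-encodes the path.
    print(yarl.URL(raw, encoded=False))  # http://xn--jxagkqfkduily1i.eu/%D0%BF%D1%83%D1%82%D1%8C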

@Gallaecio (Member, Jun 1, 2022) commented:

The thing is, yarl exposes encoded in __init__ as a workaround for proper encoding handling, which they chose not to implement. But I believe what @kmike has in mind is for us to have a URL class that does proper encoding handling, in which case we should probably not expose encoded at all (maybe encoding instead, defaulting to "utf8").

I would wait for feedback from @kmike before making more API decisions. I am personally not sure of the best approach here, or of which parts of w3lib.url we want to apply and how.

A project member commented:

I think the goal shouldn't be to implement a general URL class; the goal is to implement a URL class useful for web scraping.

If it's hard to use yarl or another library's URL class directly, and we're defining the API anyway, we should probably think about it from the API point of view: what's the API we want, and what are the features commonly used in web scraping? After figuring out how we'd like the API to look, we can see what's the best way to implement it - wrap yarl, wrap w3lib, or do something else.

Based on our previous discussions, I think a scraping-ready URL class should have:

  • a way to manipulate the query string: add/remove/get/update query parameters
  • some kind of urljoin method, probably via the / operation (see the sketch below)
  • probably a way to extract the domain name
  • anything else?

In addition to this, there is a whole bunch of questions about encoding, normalization, converting URLs to ASCII-only encoded strings suitable for downloading, etc. The best API to handle all that might require some thought. I wonder if we can side-step it for now somehow.

At the same time, I'm not sure properties like .scheme are that essential. They're essential for a general-purpose URL class, but do people who write scraping code commonly parse URLs to get their scheme? We can add such methods and properties for sure, but we can do it later. They are probably useful for authors of web scraping frameworks / HTTP clients, but less so for people who write web scraping code.
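A rough sketch of what such a scraping-oriented API could look like (hypothetical names; wrapping yarl is just one possible implementation):

    import yarl

    class Url:
        """Hypothetical scraping-oriented URL class, not the PR's implementation."""

        def __init__(self, url: str):
            self._url = yarl.URL(url)

        def __str__(self) -> str:
            return str(self._url)

        def __truediv__(self, other: str) -> "Url":
            # Join a path segment via the / operator, e.g. url / "bar".
            return Url(str(self._url / other))

        @property
        def domain(self) -> str:
            # Naive domain extraction; a real implementation might use
            # the public-suffix list (e.g. tldextract).
            return self._url.host or ""

        def update_query(self, **params) -> "Url":
            # Add or update query parameters, returning a new Url.
            return Url(str(self._url.update_query(params)))

        def remove_query(self, *names: str) -> "Url":
            # Drop the given query parameters by rebuilding the query.
            query = {k: v for k, v in self._url.query.items() if k not in names}
            return Url(str(self._url.with_query(query)))

Note that even this sketch runs into the open questions: yarl's / operator drops the base URL's query string and fragment, which may or may not be the urljoin semantics wanted here.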

Another member commented, replying to the first comment above:

> The property methods below could instead be mapped dynamically from yarl's. However, we lose the benefit of defining docstrings within them.

It's also about return types - some yarl.URL methods are going to return yarl.URL objects, while here it would make more sense to return _Url objects.
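For instance (hypothetical, not in the PR), any operation that produces a new URL would have to re-wrap yarl's result so that callers keep getting _Url back:

    class _Url:
        # ... __init__, __str__, etc. as in the diff above ...

        def join(self, other: str) -> "_Url":
            # yarl's / operator returns a yarl.URL; re-wrap it so the
            # result is a _Url (or its RequestUrl/ResponseUrl subclass).
            return type(self)(self._url / other)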

+    @property
+    def scheme(self) -> str:
+        return self._url.scheme
+
+    @property
+    def host(self) -> Optional[str]:
+        return self._url.host
+
+    @property
+    def path(self) -> str:
+        return self._url.path
+
+    @property
+    def query_string(self) -> str:
+        return self._url.query_string
+
+    @property
+    def fragment(self) -> str:
+        return self._url.fragment


+class ResponseUrl(_Url):
+    """ URL of the response.
+
+    :param url: a string representation of a URL.
+    :param encoded: when set to ``False``, the given ``url`` is
+        auto-encoded. However, there is no guarantee that the correct
+        encoding is used, so it's recommended to keep the default
+        ``True`` value and pass an already-encoded URL.
+    """
+    pass


-class RequestURL(str):
-    """ URL of the request """
+class RequestUrl(_Url):
+    """ URL of the request.
+
+    :param url: a string representation of a URL.
+    :param encoded: when set to ``False``, the given ``url`` is
+        auto-encoded. However, there is no guarantee that the correct
+        encoding is used, so it's recommended to keep the default
+        ``True`` value and pass an already-encoded URL.
+    """
+    pass


@@ -162,7 +214,7 @@ class HttpRequest:
     **web-poet** like :class:`~.HttpClient`.
     """

-    url: RequestURL = attrs.field(converter=RequestURL)
+    url: RequestUrl = attrs.field(converter=RequestUrl)
     method: str = attrs.field(default="GET", kw_only=True)
     headers: HttpRequestHeaders = attrs.field(
         factory=HttpRequestHeaders, converter=HttpRequestHeaders, kw_only=True
@@ -195,7 +247,7 @@ class HttpResponse(SelectableMixin):
     is auto-detected from headers and body content.
     """

-    url: ResponseURL = attrs.field(converter=ResponseURL)
+    url: ResponseUrl = attrs.field(converter=ResponseUrl)
     body: HttpResponseBody = attrs.field(converter=HttpResponseBody)
     status: Optional[int] = attrs.field(default=None, kw_only=True)
     headers: HttpResponseHeaders = attrs.field(factory=HttpResponseHeaders,
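Taken together, with the changes in this PR an HttpResponse exposes a ResponseUrl instead of a plain str (a usage sketch mirroring the tests above):

    from web_poet.page_inputs import HttpResponse, ResponseUrl

    response = HttpResponse("https://example.com/product?id=123", body=b"<html></html>")

    assert isinstance(response.url, ResponseUrl)  # the attrs converter wraps the string
    assert str(response.url) == "https://example.com/product?id=123"
    assert response.url.query_string == "id=123"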