RequestUrl and ResponseUrl using yarl #45

Open · wants to merge 9 commits into url-page-inputs
1 change: 1 addition & 0 deletions setup.py
@@ -25,6 +25,7 @@
         'url-matcher',
         'multidict',
         'w3lib >= 1.22.0',
+        'yarl',
     ],
     classifiers=[
         'Development Status :: 2 - Pre-Alpha',
4 changes: 2 additions & 2 deletions tests/test_mixins.py
@@ -16,7 +16,7 @@ def my_page(book_list_html_response):


 def test_url(my_page):
-    assert my_page.url == 'http://books.toscrape.com/index.html'
+    assert str(my_page.url) == 'http://books.toscrape.com/index.html'


 def test_html(my_page, book_list_html):
@@ -56,7 +56,7 @@ def test_custom_baseurl():
     )
     page = MyPage(response=response)

-    assert page.url == 'http://www.example.com/path'
+    assert str(page.url) == 'http://www.example.com/path'
     assert page.base_url == 'http://example.com/foo/'
     assert page.urljoin("bar") == 'http://example.com/foo/bar'
     assert page.urljoin("http://example.com/1") == "http://example.com/1"
81 changes: 75 additions & 6 deletions tests/test_page_inputs.py
@@ -4,8 +4,11 @@
 import pytest
 import requests

+import yarl
 import parsel
 from web_poet.page_inputs import (
+    RequestUrl,
+    ResponseUrl,
     HttpRequest,
     HttpResponse,
     HttpRequestBody,
@@ -16,6 +19,70 @@
 )


@pytest.mark.parametrize("cls", [RequestUrl, ResponseUrl])
def test_url(cls):
url_value = "https://example.com/category/product?query=123&id=xyz#frag1"

url = cls(url_value)

assert str(url) == url_value
assert url.scheme == "https"
assert url.host == "example.com"
assert url.path == "/category/product"
assert url.query_string == "query=123&id=xyz"
assert url.fragment == "frag1"

new_url = cls(url)
assert url == new_url
assert str(url) == str(new_url)


@pytest.mark.parametrize("cls", [RequestUrl, ResponseUrl])
def test_url_init(cls):
# via string
url_value = "https://example.com"
url = cls(url_value)

# via yarl
assert cls(yarl.URL(url_value)) == url

# via _Url subclasses
assert cls(cls(url_value)) == url


@pytest.mark.parametrize("compare_cls", [True, False])
@pytest.mark.parametrize("cls", [RequestUrl, ResponseUrl])
def test_url_equality(compare_cls, cls):
# Trailing / in the base URL
no_trail = cls("https://example.com")
with_trail = "https://example.com/"
if compare_cls:
with_trail = cls(with_trail)
assert no_trail == with_trail
else:
assert no_trail != with_trail
assert str(no_trail) != str(with_trail)

# Trailing / in the path URL
no_trail = cls("https://example.com/foo")
with_trail = "https://example.com/foo/"
if compare_cls:
with_trail = cls(with_trail)
assert no_trail != with_trail # Should not be equal
assert str(no_trail) != str(with_trail)


@pytest.mark.parametrize("cls", [RequestUrl, ResponseUrl])
def test_url_encoding(cls):
url_value = "http://εμπορικόσήμα.eu/путь/這裡"

url = cls(url_value)
str(url) == url_value

url = cls(url_value, encoded=False)
str(url) == "http://xn--jxagkqfkduily1i.eu/%D0%BF%D1%83%D1%82%D1%8C/%E9%80%99%E8%A3%A1"


@pytest.mark.parametrize("body_cls", [HttpRequestBody, HttpResponseBody])
def test_http_body_hashable(body_cls):
http_body = body_cls(b"content")
Expand Down Expand Up @@ -62,17 +129,18 @@ def test_http_response_body_json():


 @pytest.mark.parametrize(
-    ["cls", "body_cls"],
+    ["cls", "body_cls", "url_cls"],
     [
-        (HttpRequest, HttpRequestBody),
-        (HttpResponse, HttpResponseBody),
+        (HttpRequest, HttpRequestBody, RequestUrl),
+        (HttpResponse, HttpResponseBody, ResponseUrl),
     ]
 )
-def test_http_defaults(cls, body_cls):
+def test_http_defaults(cls, body_cls, url_cls):
     http_body = body_cls(b"content")

     obj = cls("url", body=http_body)
-    assert obj.url == "url"
+    assert isinstance(obj.url, url_cls)
+    assert str(obj.url) == "url"
     assert obj.body == b"content"
     assert not obj.headers
     assert obj.headers.get("user-agent") is None
@@ -164,7 +232,8 @@ def test_http_headers_init_dict(cls, headers_cls):

 def test_http_request_init_minimal():
     req = HttpRequest("url")
-    assert req.url == "url"
+    assert isinstance(req.url, RequestUrl)
+    assert str(req.url) == "url"
     assert req.method == "GET"
     assert isinstance(req.method, str)
     assert not req.headers
2 changes: 1 addition & 1 deletion tests/test_pages.py
@@ -34,7 +34,7 @@ class MyWebPage(ItemWebPage):

     def to_item(self) -> dict:
         return {
-            'url': self.url,
+            'url': str(self.url),
             'title': self.css('title::text').get().strip(),
         }

7 changes: 5 additions & 2 deletions tests/test_requests.py
@@ -3,6 +3,7 @@
 import pytest
 from web_poet.exceptions import RequestBackendError, HttpResponseError
 from web_poet.page_inputs import (
+    ResponseUrl,
     HttpClient,
     HttpRequest,
     HttpResponse,
@@ -37,7 +38,8 @@ async def test_perform_request_from_httpclient(async_mock):
     response = await client.get(url)

     # The async downloader implementation should return the HttpResponse
-    assert response.url == url
+    assert isinstance(response.url, ResponseUrl)
+    assert str(response.url) == url
     assert isinstance(response, HttpResponse)


@@ -161,8 +163,9 @@ async def test_http_client_execute(async_mock):
     request = HttpRequest("url-1")
     response = await client.execute(request)

+    assert isinstance(response.url, ResponseUrl)
     assert isinstance(response, HttpResponse)
-    assert response.url == "url-1"
+    assert str(response.url) == "url-1"


 @pytest.mark.asyncio
4 changes: 2 additions & 2 deletions web_poet/__init__.py
@@ -10,8 +10,8 @@
     HttpRequestBody,
     HttpResponseBody,
     Meta,
-    RequestURL,
-    ResponseURL,
+    RequestUrl,
+    ResponseUrl,
 )
 from .overrides import PageObjectRegistry, consume_modules, OverrideRule

2 changes: 1 addition & 1 deletion web_poet/mixins.py
@@ -67,7 +67,7 @@ def base_url(self) -> str:
         # FIXME: move it to HttpResponse
         if self._cached_base_url is None:
             text = self.html[:4096]
-            self._cached_base_url = get_base_url(text, self.url)
+            self._cached_base_url = get_base_url(text, str(self.url))
         return self._cached_base_url

     def urljoin(self, url: str) -> str:
4 changes: 2 additions & 2 deletions web_poet/page_inputs/__init__.py
@@ -7,7 +7,7 @@
     HttpResponseHeaders,
     HttpRequestBody,
     HttpResponseBody,
-    RequestURL,
-    ResponseURL
+    RequestUrl,
+    ResponseUrl
 )
 from .browser import BrowserHtml
64 changes: 58 additions & 6 deletions web_poet/page_inputs/http.py
@@ -9,6 +9,7 @@
     http_content_type_encoding
 )

+import yarl
 from web_poet._base import _HttpHeaders
 from web_poet.utils import memoizemethod_noargs
 from web_poet.mixins import SelectableMixin
@@ -18,13 +19,64 @@
 _AnyStrDict = Dict[AnyStr, Union[AnyStr, List[AnyStr], Tuple[AnyStr, ...]]]


-class ResponseURL(str):
-    """ URL of the response """
+class _Url:
+    def __init__(self, url: Union[str, yarl.URL, '_Url'], encoded=True):
+        self._url = yarl.URL(str(url), encoded=encoded)
+
+    def __str__(self) -> str:
+        return str(self._url)
+
+    def __repr__(self) -> str:
+        return f'{type(self).__name__}({str(self._url)!r})'
+
+    def __eq__(self, other) -> bool:
+        if not isinstance(other, type(self)):
+            return False
+        # A trailing "/" on an otherwise empty path is not significant:
+        # "https://example.com" equals "https://example.com/". Compare the
+        # full URLs (not just the paths) so different hosts stay unequal.
+        if self._url.path == "/" == other.path:
+            return str(self._url).rstrip("/") == str(other).rstrip("/")
+        return str(self._url) == str(other)

Review thread on the diff above:

@BurnzZ (Contributor, Author) commented:

The property methods below could instead be mapped dynamically from yarl's. However, we lose the benefit of defining docstrings within them.

@Gallaecio (Member, Jun 1, 2022) commented:

We would also lose API governance, so +1 to manual definition.

However, I wonder if we should define them at all for the initial implementation. We want to make sure we get the API right encoding-wise, and if we expose a part of the yarl interface as-is, I imagine we are introducing the encoding issue into our implementation, with the caveat of not supporting encoded=True in __init__ to at least prevent yarl from messing things up.

Maybe the initial implementation should use a string internally instead, and we can convert it into yarl later.

@BurnzZ (Contributor, Author, Jun 1, 2022) commented:

A good point about handling the encoding. What do you think about setting encoded=False by default to prevent yarl from messing things up due to incorrect encoding? be37f39. This would be equivalent to having a str internally, aside from the "smart" helper methods.
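To illustrate what the encoded flag toggles (the expected values here follow the strings asserted in test_url_encoding above):

    import yarl

    raw = "http://εμπορικόσήμα.eu/путь"

    # encoded=True: the caller promises the URL is already encoded;
    # yarl stores and prints it untouched.
    print(yarl.URL(raw, encoded=True))   # http://εμπορικόσήμα.eu/путь

    # encoded=False (yarl's own default): yarl IDNA-encodes the host and
    # percent-encodes the path.
    print(yarl.URL(raw, encoded=False))  # http://xn--jxagkqfkduily1i.eu/%D0%BF%D1%83%D1%82%D1%8C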

@Gallaecio (Member, Jun 1, 2022) commented:

The thing is, yarl exposes encoded in __init__ as a workaround for proper encoding handling, which they chose not to implement. But I believe what @kmike has in mind is for us to have a URL class that does proper encoding handling, in which case we should probably not expose encoded at all (maybe encoding instead, defaulting to "utf8").

I would wait for feedback from @kmike before making more API decisions. I am personally not sure of the best approach here, or of which parts of w3lib.url we want to apply and how.

A project member commented:

I think the goal shouldn't be to implement a general URL class; the goal is to implement a URL class useful for web scraping.

If it's hard to use yarl or another library's URL class directly, and we're defining the API anyway, we should probably think about it from the API point of view: what's the API we want, and what are the features commonly used in web scraping? After figuring out how we'd like the API to look, we can see what's the best way to implement it - wrap yarl, wrap w3lib, or do something else.

Based on our previous discussions, I think a scraping-ready URL class should have:

  • a way to manipulate the query string: add/remove/get/update query parameters
  • some kind of urljoin method, probably via the / operation (see the sketch below)
  • probably a way to extract the domain name
  • anything else?

In addition to this, there is a whole bunch of questions about encoding, normalization, converting URLs to ASCII-only encoded strings suitable for downloading, etc. The best API to handle all that might require some thought. I wonder if we can side-step it for now somehow.

At the same time, I'm not sure properties like .scheme are that essential. They're essential for a general-purpose URL class, but do people who write scraping code commonly parse URLs to get their scheme? We can add such methods and properties for sure, but we can do it later. They are probably useful for authors of web scraping frameworks / HTTP clients, but less so for people who write web scraping code.
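A rough sketch of what such a scraping-oriented API could look like (hypothetical names; wrapping yarl is just one possible implementation):

    import yarl

    class Url:
        """Hypothetical scraping-oriented URL class, not the PR's implementation."""

        def __init__(self, url: str):
            self._url = yarl.URL(url)

        def __str__(self) -> str:
            return str(self._url)

        def __truediv__(self, other: str) -> "Url":
            # Join a path segment via the / operator, e.g. url / "bar".
            return Url(str(self._url / other))

        @property
        def domain(self) -> str:
            # Naive domain extraction; a real implementation might use
            # the public-suffix list (e.g. tldextract).
            return self._url.host or ""

        def update_query(self, **params) -> "Url":
            # Add or update query parameters, returning a new Url.
            return Url(str(self._url.update_query(params)))

        def remove_query(self, *names: str) -> "Url":
            # Drop the given query parameters by rebuilding the query.
            query = {k: v for k, v in self._url.query.items() if k not in names}
            return Url(str(self._url.with_query(query)))

Note that even this sketch runs into the open questions: yarl's / operator drops the base URL's query string and fragment, which may or may not be the urljoin semantics wanted here.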

Another member commented, replying to the first comment above:

> The property methods below could instead be mapped dynamically from yarl's. However, we lose the benefit of defining docstrings within them.

It's also about return types - some yarl.URL methods are going to return yarl.URL objects, while here it would make more sense to return _Url objects.
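For instance (hypothetical, not in the PR), any operation that produces a new URL would have to re-wrap yarl's result so that callers keep getting _Url back:

    class _Url:
        # ... __init__, __str__, etc. as in the diff above ...

        def join(self, other: str) -> "_Url":
            # yarl's / operator returns a yarl.URL; re-wrap it so the
            # result is a _Url (or its RequestUrl/ResponseUrl subclass).
            return type(self)(self._url / other)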

+    @property
+    def scheme(self) -> str:
+        return self._url.scheme
+
+    @property
+    def host(self) -> Optional[str]:
+        return self._url.host
+
+    @property
+    def path(self) -> str:
+        return self._url.path
+
+    @property
+    def query_string(self) -> str:
+        return self._url.query_string
+
+    @property
+    def fragment(self) -> str:
+        return self._url.fragment


+class ResponseUrl(_Url):
+    """ URL of the response.
+
+    :param url: a string representation of a URL.
+    :param encoded: when set to ``False``, the given ``url`` is
+        auto-encoded. However, there is no guarantee that the correct
+        encoding is used, so it's recommended to keep the default
+        ``True`` value and pass an already-encoded URL.
+    """
+    pass


-class RequestURL(str):
-    """ URL of the request """
+class RequestUrl(_Url):
+    """ URL of the request.
+
+    :param url: a string representation of a URL.
+    :param encoded: when set to ``False``, the given ``url`` is
+        auto-encoded. However, there is no guarantee that the correct
+        encoding is used, so it's recommended to keep the default
+        ``True`` value and pass an already-encoded URL.
+    """
+    pass


@@ -162,7 +214,7 @@ class HttpRequest:
     **web-poet** like :class:`~.HttpClient`.
     """

-    url: RequestURL = attrs.field(converter=RequestURL)
+    url: RequestUrl = attrs.field(converter=RequestUrl)
     method: str = attrs.field(default="GET", kw_only=True)
     headers: HttpRequestHeaders = attrs.field(
         factory=HttpRequestHeaders, converter=HttpRequestHeaders, kw_only=True
@@ -195,7 +247,7 @@ class HttpResponse(SelectableMixin):
     is auto-detected from headers and body content.
     """

-    url: ResponseURL = attrs.field(converter=ResponseURL)
+    url: ResponseUrl = attrs.field(converter=ResponseUrl)
     body: HttpResponseBody = attrs.field(converter=HttpResponseBody)
     status: Optional[int] = attrs.field(default=None, kw_only=True)
     headers: HttpResponseHeaders = attrs.field(factory=HttpResponseHeaders,
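Taken together, with the changes in this PR an HttpResponse exposes a ResponseUrl instead of a plain str (a usage sketch mirroring the tests above):

    from web_poet.page_inputs import HttpResponse, ResponseUrl

    response = HttpResponse("https://example.com/product?id=123", body=b"<html></html>")

    assert isinstance(response.url, ResponseUrl)  # the attrs converter wraps the string
    assert str(response.url) == "https://example.com/product?id=123"
    assert response.url.query_string == "id=123"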