Commit ee59914

Merge pull request #26 from D4Vinci/dev

v0.2.91

2 parents 60df72c + bb39869

File tree

9 files changed: +106 additions, -54 deletions

scrapling/__init__.py

Lines changed: 1 addition & 1 deletion
@@ -5,7 +5,7 @@
 from scrapling.parser import Adaptor, Adaptors
 
 __author__ = "Karim Shoair ([email protected])"
-__version__ = "0.2.9"
+__version__ = "0.2.91"
 __copyright__ = "Copyright (c) 2024 Karim Shoair"

scrapling/core/_types.py

Lines changed: 2 additions & 0 deletions
@@ -5,6 +5,8 @@
 from typing import (TYPE_CHECKING, Any, Callable, Dict, Generator, Iterable,
                     List, Literal, Optional, Pattern, Tuple, Type, Union)
 
+SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]
+
 try:
     from typing import Protocol
 except ImportError:
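
The new `SelectorWaitStates` alias narrows `wait_selector_state` from a plain `str` to the four values Playwright's `wait_for_selector` accepts, so an invalid state is flagged by a type checker instead of surfacing at runtime. A minimal sketch of the pattern (the `wait_for` function below is illustrative, not part of Scrapling):

    from typing import Literal

    SelectorWaitStates = Literal["attached", "detached", "hidden", "visible"]

    def wait_for(selector: str, state: SelectorWaitStates = "attached") -> None:
        # Stand-in for the real engine call; only the annotation matters here.
        print(f"waiting for {selector!r} until it is {state}")

    wait_for("#content", "visible")    # fine
    wait_for("#content", "vissible")   # mypy/pyright: not a valid Literal value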

scrapling/engines/camo.py

Lines changed: 44 additions & 18 deletions
@@ -3,7 +3,7 @@
 from camoufox.sync_api import Camoufox
 
 from scrapling.core._types import (Callable, Dict, List, Literal, Optional,
-                                   Union)
+                                   SelectorWaitStates, Union)
 from scrapling.core.utils import log
 from scrapling.engines.toolbelt import (Response, StatusText,
                                         async_intercept_route,
@@ -18,7 +18,7 @@ def __init__(
             self, headless: Optional[Union[bool, Literal['virtual']]] = True, block_images: Optional[bool] = False, disable_resources: Optional[bool] = False,
             block_webrtc: Optional[bool] = False, allow_webgl: Optional[bool] = True, network_idle: Optional[bool] = False, humanize: Optional[Union[bool, float]] = True,
             timeout: Optional[float] = 30000, page_action: Callable = None, wait_selector: Optional[str] = None, addons: Optional[List[str]] = None,
-            wait_selector_state: str = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
+            wait_selector_state: Optional[SelectorWaitStates] = 'attached', google_search: Optional[bool] = True, extra_headers: Optional[Dict[str, str]] = None,
             proxy: Optional[Union[str, Dict[str, str]]] = None, os_randomize: Optional[bool] = None, disable_ads: Optional[bool] = True,
             geoip: Optional[bool] = False,
             adaptor_arguments: Dict = None,
@@ -84,6 +84,14 @@ def fetch(self, url: str) -> Response:
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         addons = [] if self.disable_ads else [DefaultAddons.UBO]
+        # Store the final response
+        final_response = None
+
+        def handle_response(finished_response):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document":
+                final_response = finished_response
+
         with Camoufox(
             geoip=self.geoip,
             proxy=self.proxy,
@@ -100,13 +108,15 @@ def fetch(self, url: str) -> Response:
             page = browser.new_page()
             page.set_default_navigation_timeout(self.timeout)
             page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)
             if self.disable_resources:
                 page.route("**/*", intercept_route)
 
             if self.extra_headers:
                 page.set_extra_http_headers(self.extra_headers)
 
-            res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+            first_response = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
             page.wait_for_load_state(state="domcontentloaded")
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
@@ -123,21 +133,24 @@ def fetch(self, url: str) -> Response:
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
 
+            response_bytes = final_response.body() if final_response else page.content().encode('utf-8')
+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
-            encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
+            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
             # PlayWright API sometimes give empty status text for some reason!
-            status_text = res.status_text or StatusText.get(res.status)
+            status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=res.url,
+                url=final_response.url,
                 text=page.content(),
-                body=page.content().encode('utf-8'),
-                status=res.status,
+                body=response_bytes,
+                status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
-                headers=res.all_headers(),
-                request_headers=res.request.all_headers(),
+                headers=final_response.all_headers(),
+                request_headers=final_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             page.close()
@@ -151,6 +164,14 @@ async def async_fetch(self, url: str) -> Response:
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
         addons = [] if self.disable_ads else [DefaultAddons.UBO]
+        # Store the final response
+        final_response = None
+
+        async def handle_response(finished_response):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document":
+                final_response = finished_response
+
         async with AsyncCamoufox(
             geoip=self.geoip,
             proxy=self.proxy,
@@ -167,13 +188,15 @@ async def async_fetch(self, url: str) -> Response:
             page = await browser.new_page()
             page.set_default_navigation_timeout(self.timeout)
             page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)
             if self.disable_resources:
                 await page.route("**/*", async_intercept_route)
 
             if self.extra_headers:
                 await page.set_extra_http_headers(self.extra_headers)
 
-            res = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+            first_response = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
             await page.wait_for_load_state(state="domcontentloaded")
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')
@@ -190,21 +213,24 @@ async def async_fetch(self, url: str) -> Response:
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')
 
+            response_bytes = await final_response.body() if final_response else (await page.content()).encode('utf-8')
+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
-            encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
+            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
             # PlayWright API sometimes give empty status text for some reason!
-            status_text = res.status_text or StatusText.get(res.status)
+            status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=res.url,
+                url=final_response.url,
                 text=await page.content(),
-                body=(await page.content()).encode('utf-8'),
-                status=res.status,
+                body=response_bytes,
+                status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
-                headers=await res.all_headers(),
-                request_headers=await res.request.all_headers(),
+                headers=await final_response.all_headers(),
+                request_headers=await final_response.request.all_headers(),
                 **self.adaptor_arguments
            )
             await page.close()
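
The core change in both fetch methods is the same: instead of relying only on the object returned by `page.goto()`, the engine registers a `page.on("response", ...)` listener, keeps the most recent response whose `resource_type` is `"document"` (which also covers navigations that happen after the initial load, e.g. client-side redirects), and falls back to the `goto()` result if none was captured. A rough, self-contained sketch of that pattern against plain Playwright; the URL and all names below are illustrative, not Scrapling API:

    from typing import Optional
    from playwright.sync_api import Response as PlaywrightResponse, sync_playwright

    def fetch_final_document(url: str) -> bytes:
        final_response: Optional[PlaywrightResponse] = None

        def handle_response(finished_response: PlaywrightResponse) -> None:
            nonlocal final_response
            # Keep only top-level document responses; ignore images, scripts, XHR, etc.
            if finished_response.request.resource_type == "document":
                final_response = finished_response

        with sync_playwright() as p:
            browser = p.chromium.launch(headless=True)
            page = browser.new_page()
            page.on("response", handle_response)   # listen before navigating
            first_response = page.goto(url)
            page.wait_for_load_state("domcontentloaded")

            # Prefer the captured document body; fall back to the rendered DOM.
            body = final_response.body() if final_response else page.content().encode("utf-8")
            # Fall back to goto()'s return value if no document response was seen.
            final = final_response or first_response
            if final is not None:
                print(final.status, final.url)
            browser.close()
        return body

    # body = fetch_final_document("https://example.com")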

scrapling/engines/pw.py

Lines changed: 47 additions & 18 deletions
@@ -1,6 +1,7 @@
 import json
 
-from scrapling.core._types import Callable, Dict, Optional, Union
+from scrapling.core._types import (Callable, Dict, Optional,
+                                   SelectorWaitStates, Union)
 from scrapling.core.utils import log, lru_cache
 from scrapling.engines.constants import (DEFAULT_STEALTH_FLAGS,
                                          NSTBROWSER_DEFAULT_QUERY)
@@ -23,7 +24,7 @@ def __init__(
             page_action: Callable = None,
             wait_selector: Optional[str] = None,
             locale: Optional[str] = 'en-US',
-            wait_selector_state: Optional[str] = 'attached',
+            wait_selector_state: SelectorWaitStates = 'attached',
             stealth: Optional[bool] = False,
             real_chrome: Optional[bool] = False,
             hide_canvas: Optional[bool] = False,
@@ -193,12 +194,21 @@ def fetch(self, url: str) -> Response:
         :param url: Target url.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        from playwright.sync_api import Response as PlaywrightResponse
         if not self.stealth or self.real_chrome:
             # Because rebrowser_playwright doesn't play well with real browsers
             from playwright.sync_api import sync_playwright
         else:
             from rebrowser_playwright.sync_api import sync_playwright
 
+        # Store the final response
+        final_response = None
+
+        def handle_response(finished_response: PlaywrightResponse):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document":
+                final_response = finished_response
+
         with sync_playwright() as p:
             # Creating the browser
             if self.cdp_url:
@@ -212,6 +222,8 @@ def fetch(self, url: str) -> Response:
             page = context.new_page()
             page.set_default_navigation_timeout(self.timeout)
             page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)
 
             if self.extra_headers:
                 page.set_extra_http_headers(self.extra_headers)
@@ -223,7 +235,7 @@ def fetch(self, url: str) -> Response:
             for script in self.__stealth_scripts():
                 page.add_init_script(path=script)
 
-            res = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+            first_response = page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
             page.wait_for_load_state(state="domcontentloaded")
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
@@ -240,21 +252,24 @@ def fetch(self, url: str) -> Response:
             if self.network_idle:
                 page.wait_for_load_state('networkidle')
 
+            response_bytes = final_response.body() if final_response else page.content().encode('utf-8')
+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
-            encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
+            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
             # PlayWright API sometimes give empty status text for some reason!
-            status_text = res.status_text or StatusText.get(res.status)
+            status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=res.url,
+                url=final_response.url,
                 text=page.content(),
-                body=page.content().encode('utf-8'),
-                status=res.status,
+                body=response_bytes,
+                status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in page.context.cookies()},
-                headers=res.all_headers(),
-                request_headers=res.request.all_headers(),
+                headers=final_response.all_headers(),
+                request_headers=final_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             page.close()
@@ -266,12 +281,21 @@ async def async_fetch(self, url: str) -> Response:
         :param url: Target url.
         :return: A `Response` object that is the same as `Adaptor` object except it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
         """
+        from playwright.async_api import Response as PlaywrightResponse
         if not self.stealth or self.real_chrome:
             # Because rebrowser_playwright doesn't play well with real browsers
             from playwright.async_api import async_playwright
         else:
             from rebrowser_playwright.async_api import async_playwright
 
+        # Store the final response
+        final_response = None
+
+        async def handle_response(finished_response: PlaywrightResponse):
+            nonlocal final_response
+            if finished_response.request.resource_type == "document":
+                final_response = finished_response
+
         async with async_playwright() as p:
             # Creating the browser
             if self.cdp_url:
@@ -285,6 +309,8 @@ async def async_fetch(self, url: str) -> Response:
             page = await context.new_page()
             page.set_default_navigation_timeout(self.timeout)
             page.set_default_timeout(self.timeout)
+            # Listen for all responses
+            page.on("response", handle_response)
 
             if self.extra_headers:
                 await page.set_extra_http_headers(self.extra_headers)
@@ -296,7 +322,7 @@ async def async_fetch(self, url: str) -> Response:
             for script in self.__stealth_scripts():
                 await page.add_init_script(path=script)
 
-            res = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
+            first_response = await page.goto(url, referer=generate_convincing_referer(url) if self.google_search else None)
             await page.wait_for_load_state(state="domcontentloaded")
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')
@@ -313,21 +339,24 @@ async def async_fetch(self, url: str) -> Response:
             if self.network_idle:
                 await page.wait_for_load_state('networkidle')
 
+            response_bytes = await final_response.body() if final_response else (await page.content()).encode('utf-8')
+            # In case we didn't catch a document type somehow
+            final_response = final_response if final_response else first_response
             # This will be parsed inside `Response`
-            encoding = res.headers.get('content-type', '') or 'utf-8'  # default encoding
+            encoding = final_response.headers.get('content-type', '') or 'utf-8'  # default encoding
             # PlayWright API sometimes give empty status text for some reason!
-            status_text = res.status_text or StatusText.get(res.status)
+            status_text = final_response.status_text or StatusText.get(final_response.status)
 
             response = Response(
-                url=res.url,
+                url=final_response.url,
                 text=await page.content(),
-                body=(await page.content()).encode('utf-8'),
-                status=res.status,
+                body=response_bytes,
+                status=final_response.status,
                 reason=status_text,
                 encoding=encoding,
                 cookies={cookie['name']: cookie['value'] for cookie in await page.context.cookies()},
-                headers=await res.all_headers(),
-                request_headers=await res.request.all_headers(),
+                headers=await final_response.all_headers(),
+                request_headers=await final_response.request.all_headers(),
                 **self.adaptor_arguments
             )
             await page.close()
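
pw.py mirrors the camo.py change, with two small differences: the handler is annotated with Playwright's `Response` type (imported lazily inside the method), and the existing switch between `playwright` and `rebrowser_playwright` is kept, since both packages expose the same API. A minimal sketch of that conditional-import idea, under the assumption that rebrowser-playwright is installed whenever the stealth path is taken; `stealth` and `real_chrome` below are just illustrative parameters:

    # Both packages expose the same sync API, so everything after the import
    # is identical either way.
    def get_sync_playwright(stealth: bool = False, real_chrome: bool = False):
        if not stealth or real_chrome:
            from playwright.sync_api import sync_playwright
        else:
            # Drop-in replacement; requires the rebrowser-playwright package.
            from rebrowser_playwright.sync_api import sync_playwright
        return sync_playwright

    with get_sync_playwright(stealth=False)() as p:
        browser = p.chromium.launch(headless=True)
        print(browser.version)
        browser.close()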

scrapling/engines/toolbelt/custom.py

Lines changed: 1 addition & 5 deletions
@@ -84,8 +84,6 @@ def get_value(cls, content_type: Optional[str], text: Optional[str] = 'test') ->
 class Response(Adaptor):
     """This class is returned by all engines as a way to unify response type between different libraries."""
 
-    _is_response_result_logged = False  # Class-level flag, initialized to False
-
     def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, cookies: Dict, headers: Dict, request_headers: Dict,
                  encoding: str = 'utf-8', method: str = 'GET', **adaptor_arguments: Dict):
         automatch_domain = adaptor_arguments.pop('automatch_domain', None)
@@ -99,9 +97,7 @@ def __init__(self, url: str, text: str, body: bytes, status: int, reason: str, c
         # For back-ward compatibility
         self.adaptor = self
         # For easier debugging while working from a Python shell
-        if not Response._is_response_result_logged:
-            log.info(f'Fetched ({status}) <{method} {url}> (referer: {request_headers.get("referer")})')
-            Response._is_response_result_logged = True
+        log.info(f'Fetched ({status}) <{method} {url}> (referer: {request_headers.get("referer")})')
 
     # def __repr__(self):
     #     return f'<{self.__class__.__name__} [{self.status} {self.reason}]>'
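
Removing the class-level `_is_response_result_logged` flag changes the logging behavior: previously only the first `Response` built in a process emitted the "Fetched ..." INFO line, now every response does. A tiny stand-alone illustration of that difference (the classes and logger below are illustrative, not Scrapling code):

    import logging

    logging.basicConfig(level=logging.INFO)
    log = logging.getLogger("demo")

    class LogOnce:
        _logged = False  # old behavior: class-level guard

        def __init__(self, status: int):
            if not LogOnce._logged:
                log.info("Fetched (%s)", status)  # emitted only for the first instance
                LogOnce._logged = True

    class LogAlways:
        def __init__(self, status: int):
            log.info("Fetched (%s)", status)      # emitted for every instance (new behavior)

    LogOnce(200); LogOnce(404)      # one log line
    LogAlways(200); LogAlways(404)  # two log lines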
