1
1
import json
2
2
3
- from scrapling .core ._types import Callable , Dict , Optional , Union
3
+ from scrapling .core ._types import (Callable , Dict , Optional ,
4
+ SelectorWaitStates , Union )
4
5
from scrapling .core .utils import log , lru_cache
5
6
from scrapling .engines .constants import (DEFAULT_STEALTH_FLAGS ,
6
7
NSTBROWSER_DEFAULT_QUERY )
@@ -23,7 +24,7 @@ def __init__(
23
24
page_action : Callable = None ,
24
25
wait_selector : Optional [str ] = None ,
25
26
locale : Optional [str ] = 'en-US' ,
26
- wait_selector_state : Optional [ str ] = 'attached' ,
27
+ wait_selector_state : SelectorWaitStates = 'attached' ,
27
28
stealth : Optional [bool ] = False ,
28
29
real_chrome : Optional [bool ] = False ,
29
30
hide_canvas : Optional [bool ] = False ,
@@ -193,12 +194,21 @@ def fetch(self, url: str) -> Response:
193
194
:param url: Target url.
194
195
:return: A `Response` object that is the same as an `Adaptor` object, except that it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
195
196
"""
197
+ from playwright .sync_api import Response as PlaywrightResponse
196
198
if not self .stealth or self .real_chrome :
197
199
# Because rebrowser_playwright doesn't play well with real browsers
198
200
from playwright .sync_api import sync_playwright
199
201
else :
200
202
from rebrowser_playwright .sync_api import sync_playwright
201
203
204
+ # Store the final response
205
+ final_response = None
206
+
207
+ def handle_response (finished_response : PlaywrightResponse ):
208
+ nonlocal final_response
209
+ if finished_response .request .resource_type == "document" :
210
+ final_response = finished_response
211
+
202
212
with sync_playwright () as p :
203
213
# Creating the browser
204
214
if self .cdp_url :
@@ -212,6 +222,8 @@ def fetch(self, url: str) -> Response:
212
222
page = context .new_page ()
213
223
page .set_default_navigation_timeout (self .timeout )
214
224
page .set_default_timeout (self .timeout )
225
+ # Listen for all responses
226
+ page .on ("response" , handle_response )
215
227
216
228
if self .extra_headers :
217
229
page .set_extra_http_headers (self .extra_headers )
@@ -223,7 +235,7 @@ def fetch(self, url: str) -> Response:
223
235
for script in self .__stealth_scripts ():
224
236
page .add_init_script (path = script )
225
237
226
- res = page .goto (url , referer = generate_convincing_referer (url ) if self .google_search else None )
238
+ first_response = page .goto (url , referer = generate_convincing_referer (url ) if self .google_search else None )
227
239
page .wait_for_load_state (state = "domcontentloaded" )
228
240
if self .network_idle :
229
241
page .wait_for_load_state ('networkidle' )
@@ -240,21 +252,24 @@ def fetch(self, url: str) -> Response:
240
252
if self .network_idle :
241
253
page .wait_for_load_state ('networkidle' )
242
254
255
+ response_bytes = final_response .body () if final_response else page .content ().encode ('utf-8' )
256
+ # Fall back to the response returned by `page.goto` if no document-type response was captured
257
+ final_response = final_response if final_response else first_response
243
258
# This will be parsed inside `Response`
244
- encoding = res .headers .get ('content-type' , '' ) or 'utf-8' # default encoding
259
+ encoding = final_response .headers .get ('content-type' , '' ) or 'utf-8' # default encoding
245
260
# The Playwright API sometimes gives an empty status text for some reason!
246
- status_text = res .status_text or StatusText .get (res .status )
261
+ status_text = final_response .status_text or StatusText .get (final_response .status )
247
262
248
263
response = Response (
249
- url = res .url ,
264
+ url = final_response .url ,
250
265
text = page .content (),
251
- body = page . content (). encode ( 'utf-8' ) ,
252
- status = res .status ,
266
+ body = response_bytes ,
267
+ status = final_response .status ,
253
268
reason = status_text ,
254
269
encoding = encoding ,
255
270
cookies = {cookie ['name' ]: cookie ['value' ] for cookie in page .context .cookies ()},
256
- headers = res .all_headers (),
257
- request_headers = res .request .all_headers (),
271
+ headers = final_response .all_headers (),
272
+ request_headers = final_response .request .all_headers (),
258
273
** self .adaptor_arguments
259
274
)
260
275
page .close ()
@@ -266,12 +281,21 @@ async def async_fetch(self, url: str) -> Response:
266
281
:param url: Target url.
267
282
:return: A `Response` object that is the same as an `Adaptor` object, except that it has these added attributes: `status`, `reason`, `cookies`, `headers`, and `request_headers`
268
283
"""
284
+ from playwright .async_api import Response as PlaywrightResponse
269
285
if not self .stealth or self .real_chrome :
270
286
# Because rebrowser_playwright doesn't play well with real browsers
271
287
from playwright .async_api import async_playwright
272
288
else :
273
289
from rebrowser_playwright .async_api import async_playwright
274
290
291
+ # Store the final response
292
+ final_response = None
293
+
294
+ async def handle_response (finished_response : PlaywrightResponse ):
295
+ nonlocal final_response
296
+ if finished_response .request .resource_type == "document" :
297
+ final_response = finished_response
298
+
275
299
async with async_playwright () as p :
276
300
# Creating the browser
277
301
if self .cdp_url :
@@ -285,6 +309,8 @@ async def async_fetch(self, url: str) -> Response:
285
309
page = await context .new_page ()
286
310
page .set_default_navigation_timeout (self .timeout )
287
311
page .set_default_timeout (self .timeout )
312
+ # Listen for all responses
313
+ page .on ("response" , handle_response )
288
314
289
315
if self .extra_headers :
290
316
await page .set_extra_http_headers (self .extra_headers )
@@ -296,7 +322,7 @@ async def async_fetch(self, url: str) -> Response:
296
322
for script in self .__stealth_scripts ():
297
323
await page .add_init_script (path = script )
298
324
299
- res = await page .goto (url , referer = generate_convincing_referer (url ) if self .google_search else None )
325
+ first_response = await page .goto (url , referer = generate_convincing_referer (url ) if self .google_search else None )
300
326
await page .wait_for_load_state (state = "domcontentloaded" )
301
327
if self .network_idle :
302
328
await page .wait_for_load_state ('networkidle' )
@@ -313,21 +339,24 @@ async def async_fetch(self, url: str) -> Response:
313
339
if self .network_idle :
314
340
await page .wait_for_load_state ('networkidle' )
315
341
342
+ response_bytes = await final_response .body () if final_response else (await page .content ()).encode ('utf-8' )
343
+ # Fall back to the response returned by `page.goto` if no document-type response was captured
344
+ final_response = final_response if final_response else first_response
316
345
# This will be parsed inside `Response`
317
- encoding = res .headers .get ('content-type' , '' ) or 'utf-8' # default encoding
346
+ encoding = final_response .headers .get ('content-type' , '' ) or 'utf-8' # default encoding
318
347
# The Playwright API sometimes gives an empty status text for some reason!
319
- status_text = res .status_text or StatusText .get (res .status )
348
+ status_text = final_response .status_text or StatusText .get (final_response .status )
320
349
321
350
response = Response (
322
- url = res .url ,
351
+ url = final_response .url ,
323
352
text = await page .content (),
324
- body = ( await page . content ()). encode ( 'utf-8' ) ,
325
- status = res .status ,
353
+ body = response_bytes ,
354
+ status = final_response .status ,
326
355
reason = status_text ,
327
356
encoding = encoding ,
328
357
cookies = {cookie ['name' ]: cookie ['value' ] for cookie in await page .context .cookies ()},
329
- headers = await res .all_headers (),
330
- request_headers = await res .request .all_headers (),
358
+ headers = await final_response .all_headers (),
359
+ request_headers = await final_response .request .all_headers (),
331
360
** self .adaptor_arguments
332
361
)
333
362
await page .close ()
0 commit comments