Skip to content

Commit 23997f2

Browse files
authored
v0.3.1 (#78)
2 parents e7482a2 + 934d8a1 commit 23997f2

File tree

10 files changed

+81
-21
lines changed

10 files changed

+81
-21
lines changed

docs/fetching/dynamic.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ Scrapling provides many options with this fetcher. To make it as simple as possi
7979
| wait | The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object. | ✔️ |
8080
| page_action | Added for automation. Pass a function that takes the `page` object and does the necessary automation, then returns `page` again. | ✔️ |
8181
| wait_selector | Wait for a specific CSS selector to be in a specific state. | ✔️ |
82+
| init_script | An absolute path to a JavaScript file to be executed on page creation for all pages in this session. | ✔️ |
8283
| wait_selector_state | Scrapling will wait for the given state to be fulfilled for the selector given with `wait_selector`. _Default state is `attached`._ | ✔️ |
8384
| google_search | Enabled by default. When enabled, Scrapling will set the referer header as if this request came from a Google search of this website's domain name. | ✔️ |
8485
| extra_headers | A dictionary of extra headers to add to the request. _The referer set by the `google_search` argument takes priority over the referer set here if used together._ | ✔️ |

docs/fetching/stealthy.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ Before jumping to [examples](#examples), here's the full list of arguments
4343
| timeout | The timeout used in all operations and waits through the page. It's in milliseconds, and the default is 30000. | ✔️ |
4444
| wait | The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the `Response` object. | ✔️ |
4545
| wait_selector | Wait for a specific CSS selector to be in a specific state. | ✔️ |
46+
| init_script | An absolute path to a JavaScript file to be executed on page creation for all pages in this session. | ✔️ |
4647
| wait_selector_state | Scrapling will wait for the given state to be fulfilled for the selector given with `wait_selector`. _Default state is `attached`._ | ✔️ |
4748
| proxy | The proxy to be used with requests. It can be a string or a dictionary with the keys 'server', 'username', and 'password' only. | ✔️ |
4849
| additional_args | Additional arguments to be passed to Camoufox as additional settings, and they take higher priority than Scrapling's settings. | ✔️ |

scrapling/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
__author__ = "Karim Shoair ([email protected])"
2-
__version__ = "0.3"
2+
__version__ = "0.3.1"
33
__copyright__ = "Copyright (c) 2024 Karim Shoair"
44

55

scrapling/core/shell.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,6 @@
2020
getLevelName,
2121
)
2222

23-
from IPython.terminal.embed import InteractiveShellEmbed
2423
from orjson import loads as json_loads, JSONDecodeError
2524

2625
from scrapling import __version__
@@ -394,8 +393,7 @@ def convert2fetcher(self, curl_command: Request | str) -> Optional[Response]:
394393

395394
else: # pragma: no cover
396395
log.error("Input must be a valid curl command string or a Request object.")
397-
398-
return None
396+
return None
399397

400398

401399
def show_page_in_browser(page: Selector): # pragma: no cover
@@ -544,6 +542,8 @@ def show_help(self): # pragma: no cover
544542

545543
def start(self): # pragma: no cover
546544
"""Start the interactive shell"""
545+
from IPython.terminal.embed import InteractiveShellEmbed
546+
547547
# Get our namespace with application objects
548548
namespace = self.get_namespace()
549549
ipython_shell = InteractiveShellEmbed(

scrapling/engines/_browsers/_camoufox.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ class StealthySession:
6060
"timeout",
6161
"page_action",
6262
"wait_selector",
63+
"init_script",
6364
"addons",
6465
"wait_selector_state",
6566
"cookies",
@@ -95,6 +96,7 @@ def __init__(
9596
timeout: int | float = 30000,
9697
page_action: Optional[Callable] = None,
9798
wait_selector: Optional[str] = None,
99+
init_script: Optional[str] = None,
98100
addons: Optional[List[str]] = None,
99101
wait_selector_state: SelectorWaitStates = "attached",
100102
cookies: Optional[List[Dict]] = None,
@@ -128,6 +130,7 @@ def __init__(
128130
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
129131
:param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
130132
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
133+
:param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
131134
:param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
132135
It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
133136
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
@@ -153,6 +156,7 @@ def __init__(
153156
"timeout": timeout,
154157
"page_action": page_action,
155158
"wait_selector": wait_selector,
159+
"init_script": init_script,
156160
"addons": addons,
157161
"wait_selector_state": wait_selector_state,
158162
"cookies": cookies,
@@ -180,6 +184,7 @@ def __init__(
180184
self.timeout = config.timeout
181185
self.page_action = config.page_action
182186
self.wait_selector = config.wait_selector
187+
self.init_script = config.init_script
183188
self.addons = config.addons
184189
self.wait_selector_state = config.wait_selector_state
185190
self.cookies = config.cookies
@@ -234,6 +239,9 @@ def __create__(self):
234239
**self.launch_options
235240
)
236241
)
242+
if self.init_script: # pragma: no cover
243+
self.context.add_init_script(path=self.init_script)
244+
237245
if self.cookies: # pragma: no cover
238246
self.context.add_cookies(self.cookies)
239247

@@ -474,6 +482,7 @@ def __init__(
474482
timeout: int | float = 30000,
475483
page_action: Optional[Callable] = None,
476484
wait_selector: Optional[str] = None,
485+
init_script: Optional[str] = None,
477486
addons: Optional[List[str]] = None,
478487
wait_selector_state: SelectorWaitStates = "attached",
479488
cookies: Optional[List[Dict]] = None,
@@ -507,6 +516,7 @@ def __init__(
507516
:param timeout: The timeout in milliseconds that is used in all operations and waits through the page. The default is 30,000
508517
:param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
509518
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
519+
:param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
510520
:param geoip: Recommended to use with proxies; Automatically use IP's longitude, latitude, timezone, country, locale, and spoof the WebRTC IP address.
511521
It will also calculate and spoof the browser's language based on the distribution of language speakers in the target region.
512522
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
@@ -531,6 +541,7 @@ def __init__(
531541
timeout,
532542
page_action,
533543
wait_selector,
544+
init_script,
534545
addons,
535546
wait_selector_state,
536547
cookies,
@@ -557,6 +568,9 @@ async def __create__(self):
557568
**self.launch_options
558569
)
559570
)
571+
if self.init_script: # pragma: no cover
572+
await self.context.add_init_script(path=self.init_script)
573+
560574
if self.cookies:
561575
await self.context.add_cookies(self.cookies)
562576

scrapling/engines/_browsers/_controllers.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -60,6 +60,7 @@ class DynamicSession:
6060
"disable_resources",
6161
"network_idle",
6262
"wait_selector",
63+
"init_script",
6364
"wait_selector_state",
6465
"wait",
6566
"playwright",
@@ -94,6 +95,7 @@ def __init__(
9495
timeout: int | float = 30000,
9596
disable_resources: bool = False,
9697
wait_selector: Optional[str] = None,
98+
init_script: Optional[str] = None,
9799
cookies: Optional[List[Dict]] = None,
98100
network_idle: bool = False,
99101
wait_selector_state: SelectorWaitStates = "attached",
@@ -112,6 +114,7 @@ def __init__(
112114
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
113115
:param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
114116
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
117+
:param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
115118
:param locale: Set the locale for the browser if wanted. The default value is `en-US`.
116119
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
117120
:param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
@@ -143,6 +146,7 @@ def __init__(
143146
"selector_config": selector_config,
144147
"disable_resources": disable_resources,
145148
"wait_selector": wait_selector,
149+
"init_script": init_script,
146150
"cookies": cookies,
147151
"network_idle": network_idle,
148152
"wait_selector_state": wait_selector_state,
@@ -168,6 +172,7 @@ def __init__(
168172
self.cdp_url = config.cdp_url
169173
self.network_idle = config.network_idle
170174
self.wait_selector = config.wait_selector
175+
self.init_script = config.init_script
171176
self.wait_selector_state = config.wait_selector_state
172177

173178
self.playwright: Optional[Playwright] = None
@@ -243,6 +248,9 @@ def __create__(self):
243248
user_data_dir="", **self.launch_options
244249
)
245250

251+
if self.init_script: # pragma: no cover
252+
self.context.add_init_script(path=self.init_script)
253+
246254
if self.cookies: # pragma: no cover
247255
self.context.add_cookies(self.cookies)
248256

@@ -409,6 +417,7 @@ def __init__(
409417
timeout: int | float = 30000,
410418
disable_resources: bool = False,
411419
wait_selector: Optional[str] = None,
420+
init_script: Optional[str] = None,
412421
cookies: Optional[List[Dict]] = None,
413422
network_idle: bool = False,
414423
wait_selector_state: SelectorWaitStates = "attached",
@@ -427,6 +436,7 @@ def __init__(
427436
:param wait: The time (milliseconds) the fetcher will wait after everything finishes before closing the page and returning the ` Response ` object.
428437
:param page_action: Added for automation. A function that takes the `page` object, does the automation you need, then returns `page` again.
429438
:param wait_selector: Wait for a specific CSS selector to be in a specific state.
439+
:param init_script: An absolute path to a JavaScript file to be executed on page creation for all pages in this session.
430440
:param locale: Set the locale for the browser if wanted. The default value is `en-US`.
431441
:param wait_selector_state: The state to wait for the selector given with `wait_selector`. The default state is `attached`.
432442
:param stealth: Enables stealth mode, check the documentation to see what stealth mode does currently.
@@ -459,6 +469,7 @@ def __init__(
459469
timeout,
460470
disable_resources,
461471
wait_selector,
472+
init_script,
462473
cookies,
463474
network_idle,
464475
wait_selector_state,
@@ -494,6 +505,9 @@ async def __create__(self):
494505
)
495506
)
496507

508+
if self.init_script: # pragma: no cover
509+
await self.context.add_init_script(path=self.init_script)
510+
497511
if self.cookies:
498512
await self.context.add_cookies(self.cookies)
499513

scrapling/engines/_browsers/_validators.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ class PlaywrightConfig(Struct, kw_only=True, frozen=False):
3232
extra_headers: Optional[Dict[str, str]] = None
3333
useragent: Optional[str] = None
3434
timeout: int | float = 30000
35+
init_script: Optional[str] = None
3536
disable_resources: bool = False
3637
wait_selector: Optional[str] = None
3738
cookies: Optional[List[Dict]] = None
@@ -58,6 +59,15 @@ def __post_init__(self):
5859
if not self.selector_config:
5960
self.selector_config = {}
6061

62+
if self.init_script is not None:
63+
script_path = Path(self.init_script)
64+
if not script_path.exists():
65+
raise ValueError("Init script path not found")
66+
elif not script_path.is_file():
67+
raise ValueError("Init script is not a file")
68+
elif not script_path.is_absolute():
69+
raise ValueError("Init script is not a absolute path")
70+
6171
@staticmethod
6272
def __validate_cdp(cdp_url):
6373
try:
@@ -90,6 +100,7 @@ class CamoufoxConfig(Struct, kw_only=True, frozen=False):
90100
solve_cloudflare: bool = False
91101
wait: int | float = 0
92102
timeout: int | float = 30000
103+
init_script: Optional[str] = None
93104
page_action: Optional[Callable] = None
94105
wait_selector: Optional[str] = None
95106
addons: Optional[List[str]] = None
@@ -131,6 +142,15 @@ def __post_init__(self):
131142
f"Addon's path is not a folder, you need to pass a folder of the extracted addon: {addon}"
132143
)
133144

145+
if self.init_script is not None:
146+
script_path = Path(self.init_script)
147+
if not script_path.exists():
148+
raise ValueError("Init script path not found")
149+
elif not script_path.is_file():
150+
raise ValueError("Init script is not a file")
151+
elif not script_path.is_absolute():
152+
raise ValueError("Init script is not a absolute path")
153+
134154
if not self.cookies:
135155
self.cookies = []
136156
if self.solve_cloudflare and self.timeout < 60_000:

0 commit comments

Comments
 (0)