quickstart_sync.py
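"""
Quickstart walkthrough for crawl4ai's synchronous WebCrawler.

Each function below demonstrates one feature: basic crawling, screenshots,
cache control (bypass_cache), chunking strategies (RegexChunking,
NlpSentenceChunking), extraction strategies (CosineStrategy,
LLMExtractionStrategy), CSS-selector targeting, JavaScript interaction,
and crawler hooks.

Note: the LLMExtractionStrategy examples assume an OPENAI_API_KEY
environment variable is set.
"""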
import os
import time
import json
import base64
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import *
from crawl4ai.crawler_strategy import *
from rich import print
from rich.console import Console
from functools import lru_cache

console = Console()

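# lru_cache() on a zero-argument function memoizes the first call, so every
# caller below reuses the same warmed-up WebCrawler instance.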
@lru_cache()
def create_crawler():
    crawler = WebCrawler(verbose=True)
    crawler.warmup()
    return crawler

def print_result(result):
    # Print each key on one line, showing only the first 20 characters of its value followed by "..."
    console.print(f"\t[bold]Result:[/bold]")
    for key, value in result.model_dump().items():
        if isinstance(value, str) and value:
            console.print(f"\t{key}: [green]{value[:20]}...[/green]")
    if result.extracted_content:
        items = json.loads(result.extracted_content)
        print(f"\t[bold]{len(items)} blocks extracted![/bold]")

def cprint(message, press_any_key=False):
    console.print(message)
    if press_any_key:
        console.print("Press any key to continue...", style="")
        input()

def basic_usage(crawler):
    cprint("🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]")
    result = crawler.run(url="https://www.nbcnews.com/business", only_text=True)
    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
    print_result(result)

def basic_usage_some_params(crawler):
    cprint("🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]")
    result = crawler.run(url="https://www.nbcnews.com/business", word_count_threshold=1, only_text=True)
    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
    print_result(result)

def screenshot_usage(crawler):
    cprint("\n📸 [bold cyan]Let's take a screenshot of the page![/bold cyan]")
    result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
    cprint("[LOG] 📦 [bold yellow]Screenshot result:[/bold yellow]")
    # Save the screenshot to a file
    with open("screenshot.png", "wb") as f:
        f.write(base64.b64decode(result.screenshot))
    cprint("Screenshot saved to 'screenshot.png'!")
    print_result(result)

def understanding_parameters(crawler):
    cprint("\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]")
    cprint("By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action.")

    # First crawl (caches the result)
    cprint("1️⃣ First crawl (caches the result):", True)
    start_time = time.time()
    result = crawler.run(url="https://www.nbcnews.com/business")
    end_time = time.time()
    cprint(f"[LOG] 📦 [bold yellow]First crawl took {end_time - start_time} seconds and cached the result:[/bold yellow]")
    print_result(result)

    # Force a fresh crawl, bypassing the cache
    cprint("2️⃣ Second crawl (forced to crawl again):", True)
    start_time = time.time()
    result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
    end_time = time.time()
    cprint(f"[LOG] 📦 [bold yellow]Second crawl took {end_time - start_time} seconds and result (forced to crawl):[/bold yellow]")
    print_result(result)

def add_chunking_strategy(crawler):
    # Adding a chunking strategy: RegexChunking
    cprint("\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]", True)
    cprint("RegexChunking is a simple chunking strategy that splits the text based on a given regex pattern. Let's see it in action!")
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        chunking_strategy=RegexChunking(patterns=["\n\n"])
    )
    cprint("[LOG] 📦 [bold yellow]RegexChunking result:[/bold yellow]")
    print_result(result)

    # Adding another chunking strategy: NlpSentenceChunking
    cprint("\n🔍 [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]", True)
    cprint("NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!")
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        chunking_strategy=NlpSentenceChunking()
    )
    cprint("[LOG] 📦 [bold yellow]NlpSentenceChunking result:[/bold yellow]")
    print_result(result)

def add_extraction_strategy(crawler):
    # Adding an extraction strategy: CosineStrategy
    cprint("\n🧠 [bold cyan]Let's get smarter with an extraction strategy: CosineStrategy![/bold cyan]", True)
    cprint("CosineStrategy uses cosine similarity to extract semantically similar blocks of text. Let's see it in action!")
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=CosineStrategy(word_count_threshold=10, max_dist=0.2, linkage_method="ward", top_k=3, sim_threshold=0.3, verbose=True)
    )
    cprint("[LOG] 📦 [bold yellow]CosineStrategy result:[/bold yellow]")
    print_result(result)

    # Using semantic_filter with CosineStrategy
    cprint("You can pass other parameters like 'semantic_filter' to the CosineStrategy to extract semantically similar blocks of text. Let's see it in action!")
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=CosineStrategy(
            semantic_filter="inflation rent prices",
        )
    )
    cprint("[LOG] 📦 [bold yellow]CosineStrategy result with semantic filter:[/bold yellow]")
    print_result(result)

def add_llm_extraction_strategy(crawler):
    # Adding an LLM extraction strategy without instructions
    cprint("\n🤖 [bold cyan]Time to bring in the big guns: LLMExtractionStrategy without instructions![/bold cyan]", True)
    cprint("LLMExtractionStrategy uses a large language model to extract relevant information from the web page. Let's see it in action!")
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY'))
    )
    cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (no instructions) result:[/bold yellow]")
    print_result(result)

    # Adding an LLM extraction strategy with instructions
    cprint("\n📜 [bold cyan]Let's make it even more interesting: LLMExtractionStrategy with instructions![/bold cyan]", True)
    cprint("Let's say we are only interested in financial news. Let's see how LLMExtractionStrategy performs with instructions!")
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(
            provider="openai/gpt-4o",
            api_token=os.getenv('OPENAI_API_KEY'),
            instruction="I am interested in only financial news"
        )
    )
    cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (with instructions) result:[/bold yellow]")
    print_result(result)

    result = crawler.run(
        url="https://www.nbcnews.com/business",
        extraction_strategy=LLMExtractionStrategy(
            provider="openai/gpt-4o",
            api_token=os.getenv('OPENAI_API_KEY'),
            instruction="Extract only content related to technology"
        )
    )
    cprint("[LOG] 📦 [bold yellow]LLMExtractionStrategy (with technology instruction) result:[/bold yellow]")
    print_result(result)

def targeted_extraction(crawler):
    # Using a CSS selector to extract only H2 tags
    cprint("\n🎯 [bold cyan]Targeted extraction: Let's use a CSS selector to extract only H2 tags![/bold cyan]", True)
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        css_selector="h2"
    )
    cprint("[LOG] 📦 [bold yellow]CSS Selector (H2 tags) result:[/bold yellow]")
    print_result(result)

def interactive_extraction(crawler):
    # Passing JavaScript code to interact with the page
    cprint("\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click the 'Load More' button![/bold cyan]", True)
    cprint("In this example we try to click the 'Load More' button on the page using JavaScript code.")
    js_code = """
    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
    loadMoreButton && loadMoreButton.click();
    """
    # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
    # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        js=js_code
    )
    cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
    print_result(result)

def multiple_scripts(crawler):
    # Passing multiple JavaScript snippets to interact with the page
    cprint("\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click the 'Load More' button![/bold cyan]", True)
    cprint("In this example we try to click the 'Load More' button on the page using JavaScript code.")
    js_code = ["""
    const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
    loadMoreButton && loadMoreButton.click();
    """] * 2
    # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
    # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
    result = crawler.run(
        url="https://www.nbcnews.com/business",
        js=js_code
    )
    cprint("[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]")
    print_result(result)

def using_crawler_hooks(crawler):
    # Example usage of the hooks for authentication and setting a cookie
    def on_driver_created(driver):
        print("[HOOK] on_driver_created")
        # Example customization: maximize the window
        driver.maximize_window()
        # Example customization: log in to a hypothetical website
        driver.get('https://example.com/login')
        from selenium.webdriver.support.ui import WebDriverWait
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support import expected_conditions as EC
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.NAME, 'username'))
        )
        driver.find_element(By.NAME, 'username').send_keys('testuser')
        driver.find_element(By.NAME, 'password').send_keys('password123')
        driver.find_element(By.NAME, 'login').click()
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, 'welcome'))
        )
        # Add a custom cookie
        driver.add_cookie({'name': 'test_cookie', 'value': 'cookie_value'})
        return driver

    def before_get_url(driver):
        print("[HOOK] before_get_url")
        # Example customization: add a custom header
        # Enable the Network domain so extra headers can be sent
        driver.execute_cdp_cmd('Network.enable', {})
        # Add a custom header
        driver.execute_cdp_cmd('Network.setExtraHTTPHeaders', {'headers': {'X-Test-Header': 'test'}})
        return driver

    def after_get_url(driver):
        print("[HOOK] after_get_url")
        # Example customization: log the URL
        print(driver.current_url)
        return driver

    def before_return_html(driver, html):
        print("[HOOK] before_return_html")
        # Example customization: log the length of the HTML
        print(len(html))
        return driver

    cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]", True)
    crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
    crawler_strategy.set_hook('on_driver_created', on_driver_created)
    crawler_strategy.set_hook('before_get_url', before_get_url)
    crawler_strategy.set_hook('after_get_url', after_get_url)
    crawler_strategy.set_hook('before_return_html', before_return_html)
    crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
    crawler.warmup()
    result = crawler.run(url="https://example.com")
    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
    print_result(result=result)

def using_crawler_hooks_delay_example(crawler):
    def delay(driver):
        print("Delaying for 5 seconds...")
        time.sleep(5)
        print("Resuming...")

    def create_crawler():
        crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
        crawler_strategy.set_hook('after_get_url', delay)
        crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
        crawler.warmup()
        return crawler

    cprint("\n🔗 [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the URL to make sure the entire page is fetched.[/bold cyan]")
    crawler = create_crawler()
    result = crawler.run(url="https://google.com", bypass_cache=True)
    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
    print_result(result)

def main():
    cprint("🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]")
    cprint("⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]")
    cprint("If this is the first time you're running Crawl4ai, this might take a few seconds to load the required model files.")
    crawler = create_crawler()

    crawler.always_by_pass_cache = True
    basic_usage(crawler)
    # basic_usage_some_params(crawler)
    understanding_parameters(crawler)

    crawler.always_by_pass_cache = True
    screenshot_usage(crawler)
    add_chunking_strategy(crawler)
    add_extraction_strategy(crawler)
    add_llm_extraction_strategy(crawler)
    targeted_extraction(crawler)
    interactive_extraction(crawler)
    multiple_scripts(crawler)

    cprint("\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]")

if __name__ == "__main__":
    main()