crawleeplaywright.py
"""
I wanted to use crawlee directly in my project
- I am running code in a thread (not a main thread)
- crawlee uses asyncio, therefore I also have to use it
- I tried creating my own loop in a thread, and on windows such system works
- on linux raspberry it does not. Asyncio does not allow to define new loop from task
set_wakeup_fd only works in main thread of the main interpreter
- full asyncio http server also does not work, as only first request works, then crawlee
complains that not all async tasks have been completed.
No joke, asyncio http server has not completed, therefore it cannot work together
Therefore crawlee is called from a separate script. We cut off crawlee.
"""
import argparse
import sys
import os
from datetime import timedelta
import json
import webtools
import traceback
import shutil
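
# CRAWLEE_STORAGE_DIR is crawlee's environment variable for its storage root;
# keying it on the pid keeps concurrent crawler processes from sharing request
# queues and datasets, which is presumably why it is set before crawlee is
# imported below.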
os.environ["CRAWLEE_STORAGE_DIR"] = "./storage/{}".format(os.getpid())


def cleanup_storage():
    path = os.environ["CRAWLEE_STORAGE_DIR"]
    # cannot remove it yet, while the program is still running :(
    # shutil.rmtree(path)


crawlee_feature_enabled = True
try:
    import asyncio

    # https://github.com/apify/crawlee-python
    # https://crawlee.dev/python/api
    from crawlee.beautifulsoup_crawler import (
        BeautifulSoupCrawler,
        BeautifulSoupCrawlingContext,
    )
    from crawlee.basic_crawler import BasicCrawler
    from crawlee.basic_crawler.types import BasicCrawlingContext
    from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext
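    # Only PlaywrightCrawler is used below; importing the BeautifulSoup and
    # basic crawler symbols here means a broken or partial crawlee install
    # disables the feature flag as well.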
except Exception as E:
    print(str(E))
    crawlee_feature_enabled = False
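

# Every exit path below funnels through on_close(): it hands the response back
# to the webtools interface, persists it, and triggers storage cleanup.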
def on_close(interface, response, status_code=0):
    interface.response = response
    interface.save_response()
    cleanup_storage()
    # crawlee complains if we kill it like this: sys.exit(0)


async def main() -> None:
    webtools.WebConfig.use_print_logging()

    parser = webtools.ScriptCrawlerParser()
    parser.parse()
    if not parser.is_valid():
        sys.exit(1)
        return

    if not crawlee_feature_enabled:
        print("Python: crawlee package is not available")
        sys.exit(1)
        return

    request = parser.get_request()

    if parser.args.verbose:
        print("Running request:{} with PlaywrightCrawler".format(request))

    interface = webtools.ScriptCrawlerInterface(parser, request)
    response = webtools.PageResponseObject(request.url)

    crawler = PlaywrightCrawler(
        # Limit the crawl to max requests. Remove or increase it for crawling all links.
        max_requests_per_crawl=10,
        request_handler_timeout=timedelta(seconds=request.timeout_s),
    )
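    # request_handler_timeout bounds a single page load; it comes from the
    # caller-supplied request.timeout_s, so the parent process controls how
    # long Playwright may spend on one URL.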

    # Define the default request handler, which will be called for every request.
    @crawler.router.default_handler
    async def request_handler(context: PlaywrightCrawlingContext) -> None:
        print(f"Processing {context.request.url} ...")

        try:
            # maybe we could send header information that we accept text/rss
            headers = {}
            for item in context.response.headers:
                headers[item] = context.response.headers[item]

            response.url = context.request.loaded_url
            response.request_url = request.url
            # result['loaded_url'] = context.page.url
            response.status_code = context.response.status
            response.headers = headers

            if request.ping:
                on_close(interface, response)
                return

            if response.get_content_length() > webtools.PAGE_TOO_BIG_BYTES:
                print("Response too big")
                response.status_code = webtools.HTTP_STATUS_CODE_FILE_TOO_BIG
                on_close(interface, response)
                return

            content_type = response.get_content_type()
            if content_type and not response.is_content_type_supported():
                print("Content not supported")
                response.status_code = webtools.HTTP_STATUS_CODE_PAGE_UNSUPPORTED
                on_close(interface, response)
                return

            response.set_text(await context.page.content())
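            # page.content() returns the DOM after Playwright has rendered the
            # page, so JavaScript-generated markup is captured here, unlike a
            # plain HTTP fetch.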
print(f"Processing {context.request.url} ...DONE")
on_close(interface, response)
return
        except Exception as E:
            print(str(E))
            error_text = traceback.format_exc()
            print(error_text)

            response.status_code = webtools.HTTP_STATUS_CODE_EXCEPTION
            on_close(interface, response, 1)
            return

    try:
        # Run the crawler with the initial list of URLs.
        await crawler.run([parser.args.url])
    except Exception as E:
        print(str(E))
        error_text = traceback.format_exc()
        print(error_text)

        response.status_code = webtools.HTTP_STATUS_CODE_EXCEPTION
        on_close(interface, response, 1)
        return
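

# asyncio.run() creates the event loop in the main thread of this helper
# process, which matches the constraint described in the module docstring.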
if __name__ == "__main__":
    asyncio.run(main())