1
1
import asyncio # noqa: F401
2
+ import json
2
3
import math
3
- from functools import cached_property
4
- from candore .modules .ssh import Session
5
4
import re
5
+ from functools import cached_property
6
+ from pathlib import Path
7
+
6
8
import aiohttp
7
9
10
+ from candore .modules .ssh import Session
11
+
8
12
# Max observed request duration in testing was approximately 888 seconds
9
13
# so we set the timeout to 2000 seconds to be overly safe
10
14
EXTENDED_TIMEOUT = aiohttp .ClientTimeout (total = 2000 , connect = 60 , sock_read = 2000 , sock_connect = 60 )
15
+ RESUME_FILE = Path ("_resume_info.json" )
16
+ PARTIAL_FILE = Path ("_partial_extraction.json" )
11
17
12
18
13
19
class Extractor :
@@ -27,6 +33,12 @@ def __init__(self, settings, apilister=None):
27
33
self .apilister = apilister
28
34
self .full = False
29
35
self .semaphore = asyncio .Semaphore (self .settings .candore .max_connections )
36
+ self ._all_data = {}
37
+ self ._api_endpoints = None
38
+ self ._completed_entities = []
39
+ self ._current_entity = None
40
+ self ._current_endpoint = None
41
+ self ._retry_limit = 3
30
42
31
43
@cached_property
32
44
def dependent_components (self ):
@@ -40,7 +52,9 @@ def ignore_components(self):
40
52
41
53
@cached_property
42
54
def api_endpoints (self ):
43
- return self .apilister .lister_endpoints ()
55
+ if not self ._api_endpoints :
56
+ self ._api_endpoints = self .apilister .lister_endpoints ()
57
+ return self ._api_endpoints
44
58
45
59
async def _start_session (self ):
46
60
if not self .client :
@@ -56,13 +70,37 @@ async def __aenter__(self):
56
70
57
71
async def __aexit__ (self , exc_type , exc_val , exc_tb ):
58
72
await self ._end_session ()
73
+ if exc_val :
74
+ with open ("_partial_extraction.json" , "w" ) as partial_file :
75
+ json .dump (self ._all_data , partial_file )
76
+ with open ("_resume_info.json" , "w" ) as resume_file :
77
+ json .dump (self .to_resume_dict (), resume_file , indent = 4 )
78
+
79
+ async def _retry_get (self , retries = None , ** get_params ):
80
+ if not retries :
81
+ retries = self ._retry_limit
82
+ try :
83
+ async with self .client .get (** get_params ) as response :
84
+ if response .status == 200 :
85
+ json_data = await response .json ()
86
+ return response .status , json_data
87
+ else :
88
+ return response .status , {}
89
+ except aiohttp .ClientError :
90
+ if retries > 0 :
91
+ return await self ._retry_get (retries = retries - 1 , ** get_params )
92
+ else :
93
+ print (
94
+ f"Failed to get data from { get_params .get ('url' )} "
95
+ f"in { self ._retry_limit } retries."
96
+ )
97
+ raise
59
98
60
99
async def paged_results (self , ** get_params ):
61
- async with self .client .get (** get_params , timeout = EXTENDED_TIMEOUT ) as response :
62
- if response .status == 200 :
63
- _paged_results = await response .json ()
64
- _paged_results = _paged_results .get ("results" )
65
- return _paged_results
100
+ status , _paged_results = await self ._retry_get (** get_params , timeout = EXTENDED_TIMEOUT )
101
+ if status == 200 :
102
+ _paged_results = _paged_results .get ("results" )
103
+ return _paged_results
66
104
67
105
async def fetch_page (self , page , _request ):
68
106
async with self .semaphore :
@@ -95,18 +133,17 @@ async def fetch_component_entities(self, **comp_params):
95
133
_request = {"url" : self .base + "/" + endpoint , "params" : {}}
96
134
if data and dependency :
97
135
_request ["params" ].update ({f"{ dependency } _id" : data })
98
- async with self .client .get (** _request ) as response :
99
- if response .status == 200 :
100
- results = await response .json ()
101
- if "results" in results :
102
- entity_data .extend (results .get ("results" ))
103
- else :
104
- # Return an empty directory for endpoints
105
- # like services, api etc
106
- # which does not have results
107
- return entity_data
136
+ status , results = await self ._retry_get (** _request )
137
+ if status == 200 :
138
+ if "results" in results :
139
+ entity_data .extend (results .get ("results" ))
108
140
else :
141
+ # Return an empty directory for endpoints
142
+ # like services, api etc
143
+ # which does not have results
109
144
return entity_data
145
+ else :
146
+ return entity_data
110
147
total_pages = results .get ("total" ) // results .get ("per_page" ) + 1
111
148
if total_pages > 1 :
112
149
print (f"Endpoint { endpoint } has { total_pages } pages." )
@@ -154,11 +191,12 @@ async def component_params(self, component_endpoint):
154
191
155
192
async def process_entities (self , endpoints ):
156
193
"""
157
- endpoints = ['katello/api/actiovationkeys ']
194
+ endpoints = ['katello/api/activationkeys ']
158
195
"""
159
196
comp_data = []
160
197
entities = None
161
198
for endpoint in endpoints :
199
+ self ._current_endpoint = endpoint
162
200
comp_params = await self .component_params (component_endpoint = endpoint )
163
201
if comp_params :
164
202
entities = []
@@ -183,21 +221,40 @@ async def extract_all_entities(self):
183
221
184
222
:return:
185
223
"""
186
- all_data = {}
187
224
for component , endpoints in self .api_endpoints .items ():
188
- if endpoints :
225
+ self ._current_entity = component
226
+ if endpoints and component not in self ._completed_entities :
189
227
comp_entities = await self .process_entities (endpoints = endpoints )
190
- all_data [component ] = comp_entities
191
- return all_data
228
+ self ._all_data [component ] = comp_entities
229
+ self ._completed_entities .append (component )
230
+ return self ._all_data
192
231
193
232
async def extract_all_rpms (self ):
194
233
"""Extracts all installed RPMs from server"""
195
234
with Session (settings = self .settings ) as ssh_client :
196
235
rpms = ssh_client .execute ('rpm -qa' ).stdout
197
236
rpms = rpms .splitlines ()
198
237
name_version_pattern = rf'{ self .settings .rpms .regex_pattern } '
199
- rpms_matches = [
200
- re .compile (name_version_pattern ).match (rpm ) for rpm in rpms
201
- ]
238
+ rpms_matches = [re .compile (name_version_pattern ).match (rpm ) for rpm in rpms ]
202
239
rpms_list = [rpm_match .groups ()[:- 1 ] for rpm_match in rpms_matches if rpm_match ]
203
240
return dict (rpms_list )
241
+
242
+ def to_resume_dict (self ):
243
+ """Exports our latest extraction progress information to a dictionary"""
244
+ return {
245
+ "api_endpoints" : self ._api_endpoints ,
246
+ "completed_entities" : self ._completed_entities ,
247
+ "current_entity" : self ._current_entity ,
248
+ "current_endpoint" : self ._current_endpoint ,
249
+ }
250
+
251
+ def load_resume_info (self ):
252
+ """Resumes our extraction from the last known state"""
253
+ resume_info = json .load (RESUME_FILE .read_text ())
254
+ self ._api_endpoints = resume_info ["api_endpoints" ]
255
+ self ._completed_entities = resume_info ["completed_entities" ]
256
+ self ._current_entity = resume_info ["current_entity" ]
257
+ self ._current_endpoint = resume_info ["current_endpoint" ]
258
+ self ._all_data = json .loads (PARTIAL_FILE .read_text ())
259
+ RESUME_FILE .unlink ()
260
+ PARTIAL_FILE .unlink ()
0 commit comments