
Commit e3e4be9: Merge Rust Engine

2 parents: 5bc491c + 677217d

26 files changed: +2137 -863 lines

ConfigExamples/fullConfig.yaml

+1
@@ -8,6 +8,7 @@ collect: # Settings determining which audit logs to collect and how to do it
     Audit.Exchange: True
     Audit.SharePoint: True
     DLP.All: True
+  rustEngine: True # Use False to revert to the old Python engine. If running from python instead of executable, make sure to install the python wheel in the RustEngineWheels folder
   schedule: 0 1 0 # How often to run in days/hours/minutes. Delete this line to just run once and exit.
   maxThreads: 50 # Maximum number of simultaneous threads retrieving logs
   retries: 3 # Times to retry retrieving a content blob if it fails
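
Note on the default: the collector only falls back to the Python engine when rustEngine is explicitly set to False; leaving the key out behaves like True (see the `not ... is False` check added to run_once() further down). A minimal sketch of that decision, assuming the config lookup returns None for a missing key:

```python
# Sketch only (not part of this commit): how the rustEngine flag is interpreted.
# Assumption: the collector's config lookup returns None when the key is absent.
for flag in (True, None, False):
    use_rust = not flag is False  # mirrors the check added to run_once() below
    print(flag, "->", "Rust engine" if use_rust else "Python engine")
# True -> Rust engine, None -> Rust engine, False -> Python engine
```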

README.md

+16
@@ -1,3 +1,19 @@
+# Announcement:
+
+To hugely boost performance and add reliability, the engine of the log collector has been rewritten in Rust. Consider downloading the newest
+executable to automatically use it.
+
+If you run the Python code directly instead of using the executables, install the RustEngine wheel under
+the "RustEngineWheels" folder in this repo. To turn off the new engine (in case of issues or for whatever reason), use the following
+in your config.yaml:
+
+```
+collect:
+  rustEngine: False
+```
+In my own tests the Rust engine has been at least 10x faster and stable. If you run into any problems, please use the
+above setting to revert to the old engine, and consider creating an issue here on GitHub so I can fix it.
+
 # Office365 audit log collector
 
 Collect/retrieve Office365, Azure and DLP audit logs, optionally filter them, then send them to one or more outputs
Two binary files changed (not shown).
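
If you run from source, a quick way to confirm the wheel from RustEngineWheels is installed is to attempt the same import the collector performs (the `alc` module name comes from the import added to Source/AuditLogCollector.py below; the wheel filename is a placeholder, check the folder for the actual file):

```python
# Sketch: confirm the Rust engine module is importable before relying on it.
# The wheel path below is a placeholder; use the actual .whl file under RustEngineWheels/.
try:
    import alc  # module provided by the RustEngineWheels wheel
    print("Rust engine available:", alc.__name__)
except ImportError:
    print("Rust engine not installed. Either run:")
    print("  pip install RustEngineWheels/<wheel-file>.whl")
    print("or set 'rustEngine: False' under 'collect:' in config.yaml.")
```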

Source/AuditLogCollector.py

+76 -22
@@ -1,5 +1,6 @@
 from Interfaces import AzureOMSInterface, SqlInterface, GraylogInterface, PRTGInterface, FileInterface, \
     AzureTableInterface, AzureBlobInterface, FluentdInterface
+import alc  # Rust based log collector Engine
 import AuditLogSubscriber
 import ApiConnection
 import os
@@ -45,6 +46,7 @@ def __init__(self, config_path, **kwargs):
         self.run_started = None
         self.logs_retrieved = 0
         self.errors_retrieving = 0
+        self.retries = 0
 
     def force_stop(self, *args):
 
@@ -65,13 +67,46 @@ def run_once(self):
         """
         self._prepare_to_run()
         logging.log(level=logging.INFO, msg='Starting run @ {}. Content: {}.'.format(
-            datetime.datetime.now(), self._remaining_content_types))
-        self._start_monitoring()
-        self._get_all_available_content()
-        while self.monitor_thread.is_alive():
-            self.monitor_thread.join(1)
+            datetime.datetime.now(), self.config['collect', 'contentTypes']))
+        if not self.config['collect', 'rustEngine'] is False:
+            self._start_interfaces()
+            self.receive_results_from_rust_engine()
+            self._stop_interfaces(force=False)
+        else:
+            self._start_monitoring()
+            self._get_all_available_content()
+            while self.monitor_thread.is_alive():
+                self.monitor_thread.join(1)
         self._finish_run()
 
+    def receive_results_from_rust_engine(self):
+
+        runs = self._get_needed_runs(content_types=self.config['collect', 'contentTypes'].copy())
+        engine = alc.RustEngine(self.tenant_id, self.client_key, self.secret_key, self.publisher_id or self.tenant_id,
+                                self.config['collect', 'contentTypes'], runs,
+                                self.config['collect', 'maxThreads'] or 50,
+                                self.config['collect', 'retries'] or 3)
+        engine.run_once()
+        last_received = datetime.datetime.now()
+        while True:
+            try:
+                result = engine.get_result()
+            except ValueError:  # RustEngine throws this error when no logs are in the results recv queue
+                now = datetime.datetime.now()
+                if now - last_received > datetime.timedelta(seconds=60):
+                    logging.error("Timed out waiting for results from engine")
+                    break
+                last_received = now
+            except EOFError:  # RustEngine throws this error when all content has been retrieved
+                logging.info("Rust engine finished receiving all content")
+                break
+            else:
+                content_json, content_id, content_expiration, content_type = result
+                self._handle_retrieved_content(content_id=content_id, content_expiration=content_expiration,
+                                               content_type=content_type, results=json.loads(content_json))
+                self.logs_retrieved += 1
+        _, _, self.retries, self.errors_retrieving = engine.stop()
+
     def run_scheduled(self):
         """
         Run according to the schedule set in the config file. Collector will not exit unless manually stopped.
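
The new receive_results_from_rust_engine() drains the engine's result queue and uses exceptions as flow control: ValueError means the queue is currently empty (poll again, give up after 60 seconds), EOFError means all content has been retrieved. A minimal sketch of that consumer pattern against a stand-in engine (the StubEngine below is illustrative and only mimics the behaviour shown in the diff, it is not the real alc.RustEngine API):

```python
import datetime
import json

class StubEngine:
    """Stand-in that mimics the queue behaviour described above (illustrative only)."""

    def __init__(self, blobs):
        self._blobs = list(blobs)

    def get_result(self):
        if not self._blobs:
            raise EOFError  # engine signals that all content has been retrieved
        return self._blobs.pop(0)  # (content_json, content_id, content_expiration, content_type)

    def stop(self):
        return 0, 0, 0, 0  # placeholder for the (.., .., retries, errors) counters

engine = StubEngine([
    ('[{"Id": "1", "Workload": "Exchange"}]', 'blob-1', '2022-01-01T00:00:00', 'Audit.Exchange'),
])
last_received = datetime.datetime.now()
while True:
    try:
        content_json, content_id, content_expiration, content_type = engine.get_result()
    except ValueError:  # empty queue: keep polling until the timeout elapses
        if datetime.datetime.now() - last_received > datetime.timedelta(seconds=60):
            break
    except EOFError:  # engine finished
        break
    else:
        print(content_type, content_id, json.loads(content_json))
        last_received = datetime.datetime.now()
```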
@@ -155,8 +190,8 @@ def _log_statistics(self):
         """
         Write run statistics to log file / console.
         """
-        logging.info("Finished. Total logs retrieved: {}. Total logs with errors: {}. Run time: {}.".format(
-            self.logs_retrieved, self.errors_retrieving, datetime.datetime.now() - self.run_started))
+        logging.info("Finished. Total logs retrieved: {}. Total retries: {}. Total logs with errors: {}. Run time: {}."
+                     .format(self.logs_retrieved, self.retries, self.errors_retrieving, datetime.datetime.now() - self.run_started))
         for interface in self._all_enabled_interfaces:
             logging.info("{} reports: {} successfully sent, {} errors".format(
                 interface.__class__.__name__, interface.successfully_sent, interface.unsuccessfully_sent))
@@ -224,12 +259,15 @@ def _auto_subscribe(self):
             logging.info("Auto subscribing to: {}".format(content_type))
             subscriber.set_sub_status(content_type=content_type, action='start')
 
-    def _get_all_available_content(self):
+    def _get_needed_runs(self, content_types):
         """
-        Start a thread to retrieve available content blobs for each content type to be collected.
+        Return the start- and end times needed to retrieve content for each content type. If the timespan to retrieve
+        logs for exceeds 24 hours, we need to split it up into 24 hour runs (limit by Office API).
         """
+        runs = {}
         end_time = datetime.datetime.now(datetime.timezone.utc)
-        for content_type in self._remaining_content_types.copy():
+        for content_type in content_types:
+            runs[content_type] = []
             if self.config['collect', 'resume'] and content_type in self._last_run_times.keys():
                 start_time = self._last_run_times[content_type]
                 logging.info("{} - resuming from: {}".format(content_type, start_time))
@@ -244,15 +282,29 @@ def _get_all_available_content(self):
                 if end_time - start_time > datetime.timedelta(hours=24):
                     split_start_time = start_time
                     split_end_time = start_time + datetime.timedelta(hours=24)
-                    self._start_get_available_content_thread(
-                        content_type=content_type, start_time=split_start_time, end_time=split_end_time)
+                    formatted_start_time = str(split_start_time).replace(' ', 'T').rsplit('.', maxsplit=1)[0]
+                    formatted_end_time = str(split_end_time).replace(' ', 'T').rsplit('.', maxsplit=1)[0]
+                    runs[content_type].append((formatted_start_time, formatted_end_time))
                     start_time = split_end_time
                     self._remaining_content_types.append(content_type)
                 else:
-                    self._start_get_available_content_thread(
-                        content_type=content_type, start_time=start_time, end_time=end_time)
+                    formatted_start_time = str(start_time).replace(' ', 'T').rsplit('.', maxsplit=1)[0]
+                    formatted_end_time = str(end_time).replace(' ', 'T').rsplit('.', maxsplit=1)[0]
+                    runs[content_type].append((formatted_start_time, formatted_end_time))
                     break
             self._last_run_times[content_type] = end_time.strftime("%Y-%m-%dT%H:%M:%SZ")
+        return runs
+
+    def _get_all_available_content(self):
+        """
+        Start a thread to retrieve available content blobs for each content type to be collected.
+        """
+        runs = self._get_needed_runs(content_types=self._remaining_content_types.copy())
+        for content_type, run_dates in runs.items():
+            for run_date in run_dates:
+                start_time, end_time = run_date
+                self._start_get_available_content_thread(
+                    content_type=content_type, start_time=start_time, end_time=end_time)
 
     def _start_get_available_content_thread(self, content_type, start_time, end_time):
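
_get_needed_runs() now returns plain (start, end) string tuples per content type, splitting any collection window longer than 24 hours into consecutive 24-hour runs and pre-formatting the timestamps. A standalone sketch of that splitting and formatting logic (the helper name and sample dates are illustrative):

```python
import datetime

def split_into_runs(start_time, end_time):
    """Illustrative helper: split a window into consecutive <=24h (start, end) string pairs."""
    def fmt(dt):
        # Same formatting trick as in the diff: '2022-03-01 06:30:00.123456+00:00' -> '2022-03-01T06:30:00'
        return str(dt).replace(' ', 'T').rsplit('.', maxsplit=1)[0]
    runs = []
    while end_time - start_time > datetime.timedelta(hours=24):
        split_end_time = start_time + datetime.timedelta(hours=24)
        runs.append((fmt(start_time), fmt(split_end_time)))
        start_time = split_end_time
    runs.append((fmt(start_time), fmt(end_time)))
    return runs

start = datetime.datetime(2022, 3, 1, 6, 30, 0, 123456, tzinfo=datetime.timezone.utc)
for run in split_into_runs(start, start + datetime.timedelta(hours=60)):
    print(run)
# ('2022-03-01T06:30:00', '2022-03-02T06:30:00')
# ('2022-03-02T06:30:00', '2022-03-03T06:30:00')
# ('2022-03-03T06:30:00', '2022-03-03T18:30:00')
```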

@@ -268,12 +320,10 @@ def _get_available_content(self, content_type, start_time, end_time):
         """
         try:
             logging.log(level=logging.DEBUG, msg='Getting available content for type: "{}"'.format(content_type))
-            formatted_end_time = str(end_time).replace(' ', 'T').rsplit('.', maxsplit=1)[0]
-            formatted_start_time = str(start_time).replace(' ', 'T').rsplit('.', maxsplit=1)[0]
             logging.info("Retrieving {}. Start time: {}. End time: {}.".format(
-                content_type, formatted_start_time, formatted_end_time))
+                content_type, start_time, end_time))
             response = self.make_api_request(url='subscriptions/content?contentType={0}&startTime={1}&endTime={2}'.
-                                             format(content_type, formatted_start_time, formatted_end_time))
+                                             format(content_type, start_time, end_time))
             self.blobs_to_collect[content_type] += response.json()
             while 'NextPageUri' in response.headers.keys() and response.headers['NextPageUri']:
                 logging.log(level=logging.DEBUG, msg='Getting next page of content for type: "{0}"'.
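
Since the timestamps arrive pre-formatted from _get_needed_runs(), _get_available_content() can substitute them straight into the content-listing URL template shown above. For reference, the template expands to something like this (values are illustrative):

```python
# Illustrative expansion of the request URL built in _get_available_content().
url_template = 'subscriptions/content?contentType={0}&startTime={1}&endTime={2}'
print(url_template.format('Audit.Exchange', '2022-03-01T06:30:00', '2022-03-02T06:30:00'))
# subscriptions/content?contentType=Audit.Exchange&startTime=2022-03-01T06:30:00&endTime=2022-03-02T06:30:00
```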
@@ -351,24 +401,28 @@ def _retrieve_content(self, content_json, content_type, retries):
                 return
         except Exception as e:
             if retries:
+                self.retries += 1
                 time.sleep(self.config['collect', 'retryCooldown'] or 3)
                 return self._retrieve_content(content_json=content_json, content_type=content_type, retries=retries - 1)
             else:
                 self.errors_retrieving += 1
                 logging.error("Error retrieving content: {}".format(e))
                 return
         else:
-            self._handle_retrieved_content(content_json=content_json, content_type=content_type, results=results)
+            self._handle_retrieved_content(
+                content_id=content_json['contentId'], content_expiration=content_json['contentExpiration'],
+                content_type=content_type, results=results)
 
-    def _handle_retrieved_content(self, content_json, content_type, results):
+    def _handle_retrieved_content(self, content_id, content_expiration, content_type, results):
         """
         Check known logs, filter results and output what remains.
-        :param content_json: JSON dict of the content blob as retrieved from the API (dict)
+        :param content_id: ID of content blob from API (str)
+        :param content_expiration: date string of expiration of content blob from API (str)
         :param content_type: Type of API being retrieved for, e.g. 'Audit.Exchange' (str)
         :param results: list of JSON
         """
         if self.config['collect', 'skipKnownLogs']:
-            self._known_content[content_json['contentId']] = content_json['contentExpiration']
+            self._known_content[content_id] = content_expiration
         for log in results.copy():
             if self.config['collect', 'skipKnownLogs']:
                 if log['Id'] in self.known_logs: