From ccad610a9487fe200974fe1abe2ba2cb24c21193 Mon Sep 17 00:00:00 2001
From: ppcad <45867125+ppcad@users.noreply.github.com>
Date: Mon, 9 Dec 2024 14:38:45 +0100
Subject: [PATCH] Fix log arrival timezone (#715)

* Add option to TimeParser to get current timestamp with timezone
* Fix timezone in log arrival and delta time
* Add TimeParser and log arrival time changes to changelog

---------

Co-authored-by: dtrai2 <95028228+dtrai2@users.noreply.github.com>
---
 CHANGELOG.md                 |  2 ++
 logprep/abc/input.py         | 17 ++++++++++-------
 logprep/util/time.py         |  9 +++++++--
 tests/unit/util/test_time.py | 19 +++++++++++++++----
 4 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index e82beaadf..7d96ba655 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -26,6 +26,7 @@ the list is now fixed inside the packaged logprep
 * refactored some processors to make use of the new helper methods
 * add `pre-commit` hooks to the repository, install new dev dependency and run `pre-commit install` in the root dir
 * the default `securityContext`for the pod is now configurable
+* allow `TimeParser` to get the current time with a specified timezone instead of always using local time and setting the timezone to UTC
 * remove `tldextract` dependency
 * remove `urlextract` dependency
 
@@ -34,6 +35,7 @@ the list is now fixed inside the packaged logprep
 * fix `confluent_kafka.store_offsets` if `last_valid_record` is `None`, can happen if a rebalancing happens
   before the first message was pulled.
 * fix pseudonymizer cache metrics not updated
+* fix incorrect timezones for log arrival time and delta time in input preprocessing
 * fix `_get_value` in `FilterExpression` so that keys don't match on values
 * fix `auto_rule_tester` to work with `LOGPREP_BYPASS_RULE_TREE` enabled
 
diff --git a/logprep/abc/input.py b/logprep/abc/input.py
index 396d995c0..585a5f4de 100644
--- a/logprep/abc/input.py
+++ b/logprep/abc/input.py
@@ -8,9 +8,10 @@
 import zlib
 from abc import abstractmethod
 from copy import deepcopy
-from functools import partial
+from functools import partial, cached_property
 from hmac import HMAC
 from typing import Optional, Tuple
+from zoneinfo import ZoneInfo
 
 from attrs import define, field, validators
 
@@ -198,6 +199,11 @@ def _add_version_info(self):
         """Check and return if the version info should be added to the event."""
         return bool(self._config.preprocessing.get("version_info_target_field"))
 
+    @cached_property
+    def _log_arrival_timestamp_timezone(self):
+        """Returns the timezone for log arrival timestamps"""
+        return ZoneInfo("UTC")
+
     @property
     def _add_log_arrival_time_information(self):
         """Check and return if the log arrival time info should be added to the event."""
@@ -311,12 +317,9 @@ def _add_env_enrichment_to_event(self, event: dict):
         add_fields_to(event, fields)
 
     def _add_arrival_time_information_to_event(self, event: dict):
-        new_field = {
-            self._config.preprocessing.get(
-                "log_arrival_time_target_field"
-            ): TimeParser.now().isoformat()
-        }
-        add_fields_to(event, new_field)
+        target = self._config.preprocessing.get("log_arrival_time_target_field")
+        time = TimeParser.now(self._log_arrival_timestamp_timezone).isoformat()
+        add_fields_to(event, {target: time})
 
     def _add_arrival_timedelta_information_to_event(self, event: dict):
         log_arrival_timedelta_config = self._config.preprocessing.get("log_arrival_timedelta")
diff --git a/logprep/util/time.py b/logprep/util/time.py
index 66046ca22..75a57dc86 100644
--- a/logprep/util/time.py
+++ b/logprep/util/time.py
@@ -61,15 +61,20 @@ def from_timestamp(cls, timestamp: Union[int, float]) -> datetime:
         return time_object
 
     @classmethod
-    def now(cls) -> datetime:
+    def now(cls, timezone: tzinfo = None) -> datetime:
         """returns the current time
 
+        Parameters
+        ----------
+        timezone : tzinfo
+            the timezone to use for the timestamp; if None, local time is used and labeled as UTC
+
         Returns
         -------
         datetime
             current date and time as datetime
         """
-        time_object = datetime.now()
+        time_object = datetime.now(timezone)
         time_object = cls._set_utc_if_timezone_is_missing(time_object)
         return time_object
 
diff --git a/tests/unit/util/test_time.py b/tests/unit/util/test_time.py
index 4599f6fda..f3d3c876c 100644
--- a/tests/unit/util/test_time.py
+++ b/tests/unit/util/test_time.py
@@ -60,11 +60,22 @@ def test_from_format_returns(self, source, format_str, expected):
         for attribute, value in expected.items():
             assert getattr(timestamp, attribute) == value
 
+    @pytest.mark.parametrize("timezone", [None, ZoneInfo("UTC"), ZoneInfo("Europe/Berlin")])
+    def test_has_utc_if_timezone_was_set(self, timezone):
+        datetime_time = datetime.now(timezone)
+        time_parser_time = TimeParser.now(timezone)
+        assert time_parser_time.second == pytest.approx(datetime_time.second, abs=1)
+        if timezone is None:
+            assert time_parser_time.tzinfo == ZoneInfo("UTC")
+        else:
+            assert time_parser_time.tzinfo == timezone
+
     def test_set_utc_if_timezone_is_missing_sets_timezone(self):
-        time_object = datetime.now()
-        assert time_object.tzinfo is None
-        time_object = TimeParser._set_utc_if_timezone_is_missing(time_object)
-        assert time_object.tzinfo is ZoneInfo("UTC")
+        datetime_time = datetime.now()
+        assert datetime_time.tzinfo is None
+        time_parser_time = TimeParser._set_utc_if_timezone_is_missing(datetime_time)
+        assert time_parser_time.tzinfo is ZoneInfo("UTC")
+        assert time_parser_time.second == pytest.approx(datetime_time.second, abs=1)
 
     @pytest.mark.parametrize(
         "timestamp, source_format, source_timezone, expected_timezone_name, expected",