Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add support for finding the log event that's closest to a target timestamp. #42

Open
wants to merge 34 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 29 commits
Commits
Show all changes
34 commits
Select commit Hold shift + click to select a range
ea23947
search timestamp works for unstructured logs
Henry8192 Nov 19, 2024
7edaa48
Merge branch 'y-scope:main' into search-timestamp
Henry8192 Dec 16, 2024
7481573
Merge branch 'main' into search-timestamp
Henry8192 Dec 18, 2024
3ee05a2
fix lint
Henry8192 Dec 19, 2024
f1a71a1
implement structured logs search by timestamp
Henry8192 Dec 22, 2024
5263385
Merge branch 'main' into search-timestamp
Henry8192 Dec 22, 2024
c423ec5
address partial changes from review
Henry8192 Dec 27, 2024
467e998
snapshot: get_timestamp seems to be undefined for std::upper_bound
Henry8192 Dec 30, 2024
c66cb80
fix lint
Henry8192 Dec 30, 2024
89338f7
pass in log_events instead of iterators to generic_get_log_event_inde…
Henry8192 Jan 2, 2025
a99ec2a
switch back to std::upper_bound because std::ranges::upper_bound is n…
Henry8192 Jan 6, 2025
9ec039f
fix lint
Henry8192 Jan 6, 2025
4f125b8
change generic_get_log_event_index_by_timestamp behavior: only return…
Henry8192 Jan 6, 2025
5221588
edit docstring for get_log_event_index_by_timestamp
Henry8192 Jan 6, 2025
abeb4f8
fix lint
Henry8192 Jan 6, 2025
6647fbd
Apply suggestions from code review
Henry8192 Jan 8, 2025
046191e
use concept to shorten function definition; minor change to generic_g…
Henry8192 Jan 8, 2025
4c7386d
Merge branch 'main' into search-timestamp
Henry8192 Jan 9, 2025
c400361
fix lint & syntax
Henry8192 Jan 9, 2025
ca4a616
rename get_log_event_index_by_timestamp to get_log_event_idx_by_times…
Henry8192 Jan 9, 2025
9443ba3
revert comments and plan to fix in the next pr
Henry8192 Jan 10, 2025
db60efd
add back the missing space
Henry8192 Jan 10, 2025
0e3e21b
revert decode_range (without creating concept)
Henry8192 Jan 10, 2025
412b96e
address changes from Marco's review
Henry8192 Jan 13, 2025
54c7df1
remove unnecessary require statement for get_log_event_idx_by_timestamp
Henry8192 Jan 13, 2025
eff1849
address the rest of the comments
Henry8192 Jan 13, 2025
00f89d4
address code review changes
Henry8192 Jan 14, 2025
122f1cf
fix lint
Henry8192 Jan 15, 2025
f4a0207
resolve the rest of the conflicts
Henry8192 Jan 29, 2025
46fa81f
revert the CMakeList Change
Henry8192 Jan 30, 2025
8c8c42e
address kirk's comments in the code review
Henry8192 Feb 1, 2025
a984d7b
revert function name to find_nearest_log_event_by_timestamp
Henry8192 Feb 3, 2025
4788157
amend find_nearest_log_event_by_timestamp's comments, warning this fu…
Henry8192 Feb 4, 2025
ba1b01a
Update src/clp_ffi_js/ir/StreamReader.hpp
Henry8192 Feb 5, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 5 additions & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,11 @@ boost_${CLP_FFI_JS_BOOST_ARCHIVE_VERSION_PART}.tar.gz"
URL_MD5 "53aeccc3167909ee770e34469f8dd592"
)
message(STATUS "Fetching Boost.")
FetchContent_MakeAvailable(Boost)
set(FETCHCONTENT_QUIET OFF)
FetchContent_Declare(
Boost
URL "${CMAKE_FIND_PACKAGE_REDIRECTS_DIR/boost_1_85_0.tar.gz}"
)
Henry8192 marked this conversation as resolved.
Show resolved Hide resolved
Henry8192 marked this conversation as resolved.
Show resolved Hide resolved
message("Boost sources successfully fetched into ${boost_SOURCE_DIR}")

set(CMAKE_EXECUTABLE_SUFFIX ".js" CACHE STRING "Binary type to be generated by Emscripten.")
Expand Down
7 changes: 6 additions & 1 deletion src/clp_ffi_js/ir/StreamReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -129,6 +129,7 @@ EMSCRIPTEN_BINDINGS(ClpStreamReader) {
"Array<[string, bigint, number, number]>"
);
emscripten::register_type<clp_ffi_js::ir::FilteredLogEventMapTsType>("number[] | null");
emscripten::register_type<clp_ffi_js::ir::LogEventIdxTsType>("number | null");
Henry8192 marked this conversation as resolved.
Show resolved Hide resolved
emscripten::class_<clp_ffi_js::ir::StreamReader>("ClpStreamReader")
.constructor(
&clp_ffi_js::ir::StreamReader::create,
Expand All @@ -145,7 +146,11 @@ EMSCRIPTEN_BINDINGS(ClpStreamReader) {
)
.function("filterLogEvents", &clp_ffi_js::ir::StreamReader::filter_log_events)
.function("deserializeStream", &clp_ffi_js::ir::StreamReader::deserialize_stream)
.function("decodeRange", &clp_ffi_js::ir::StreamReader::decode_range);
.function("decodeRange", &clp_ffi_js::ir::StreamReader::decode_range)
.function(
"getLogEventIdxWithNearestTimestamp",
&clp_ffi_js::ir::StreamReader::get_log_event_idx_with_nearest_timestamp
);
}
} // namespace

Expand Down
54 changes: 54 additions & 0 deletions src/clp_ffi_js/ir/StreamReader.hpp
Henry8192 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <type_traits>
#include <vector>

#include <clp/ir/types.hpp>
#include <clp/streaming_compression/zstd/Decompressor.hpp>
#include <clp/type_utils.hpp>
#include <emscripten/em_asm.h>
Expand All @@ -29,6 +30,7 @@ EMSCRIPTEN_DECLARE_VAL_TYPE(ReaderOptions);
// JS types used as outputs
EMSCRIPTEN_DECLARE_VAL_TYPE(DecodedResultsTsType);
EMSCRIPTEN_DECLARE_VAL_TYPE(FilteredLogEventMapTsType);
EMSCRIPTEN_DECLARE_VAL_TYPE(LogEventIdxTsType);
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
EMSCRIPTEN_DECLARE_VAL_TYPE(LogEventIdxTsType);
// How about `NullableLogEventIdx`?
EMSCRIPTEN_DECLARE_VAL_TYPE(LogEventIdxTsType);

Copy link
Collaborator Author

@Henry8192 Henry8192 Feb 1, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@kirkrodrigues Should we also change the name of FilteredLogEventMapTsType later? It is also nullable.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I do prefer that, but what do @davemarco / @junhaoliao think?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I personally even prefer making the change here, as it is just a internal type name change and won't affect any front end code.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oh, isn't FilteredLogEventIdx also defined in the Log Viewer? It's true that the name in clp-ffi-js doesn't influence (from a code execution perspective) the name in the log viewer, but it's probably better for developers if we stay consistent. So if we rename it here, we should probably rename it in the log viewer as well.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I wouldn't change the filteredEventMap. Whether nullable in the name is better I think is a question of style. Like in general i think its better not to put type info in the name, but here I guess maybe exception since we are trying to make more clear than returns null.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

offline discussion with @junhaoliao, let's bring it to the next pr. @coderabbitai let's create an issue to track this.

Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
# Create an issue to track the type renaming task
gh issue create \
  --title "refactor: Rename TypeScript type names to indicate nullability" \
  --body "## Context

In PR #42, we discussed renaming TypeScript type names to better indicate their nullability. The change involves:

1. Renaming \`FilteredLogEventMapTsType\` to include \"Nullable\" in its name for consistency with \`NullableLogEventIdx\`
2. Making similar changes in the [yscope-log-viewer](https://github.com/y-scope/yscope-log-viewer) repository where these types are also defined

## References
- Discussion: ${GITHUB_SERVER_URL}/${GITHUB_REPOSITORY}/pull/42#discussion_r1934294078

## Tasks
- [ ] Rename \`FilteredLogEventMapTsType\` in clp-ffi-js
- [ ] Update corresponding type name in yscope-log-viewer
- [ ] Update any related documentation" \
  --assignee "Henry8192"

Length of output: 781


@Henry8192 I've created issue #58 to track the type renaming task. You can find it here: #58

(_/)

(^.^)

(> <) 🎯


enum class StreamType : uint8_t {
Structured,
Expand Down Expand Up @@ -123,6 +125,17 @@ class StreamReader {
*/
[[nodiscard]] virtual auto decode_range(size_t begin_idx, size_t end_idx, bool use_filter) const
-> DecodedResultsTsType = 0;
/**
Henry8192 marked this conversation as resolved.
Show resolved Hide resolved
* Finds the log event with the timestamp that's nearest to the `target_ts`.
* @param target_ts
* @return The index of the log event with:
* - the largest timestamp less than or equal to `target_ts`,
* - or the index `0` if all timestamps are greater than `target_ts`.
Henry8192 marked this conversation as resolved.
Show resolved Hide resolved
* @return null if no log event exists in the stream.
*/
[[nodiscard]] virtual auto get_log_event_idx_with_nearest_timestamp(
Henry8192 marked this conversation as resolved.
Show resolved Hide resolved
clp::ir::epoch_time_ms_t target_ts
) -> LogEventIdxTsType = 0;

protected:
explicit StreamReader() = default;
Expand Down Expand Up @@ -172,6 +185,20 @@ class StreamReader {
LogLevelFilterTsType const& log_level_filter,
LogEvents<LogEvent> const& log_events
) -> void;

/**
* Templated implementation of `get_log_event_idx_with_nearest_timestamp`.
*
* @tparam LogEvent
* @param log_events
* @param timestamp
* @return the best matched log event index.
*/
template <typename LogEvent>
auto generic_get_log_event_idx_with_nearest_timestamp(
LogEvents<LogEvent> const& log_events,
clp::ir::epoch_time_ms_t timestamp
) -> LogEventIdxTsType;
};

template <typename LogEvent, typename ToStringFunc>
Expand Down Expand Up @@ -258,6 +285,33 @@ auto StreamReader::generic_filter_log_events(
}
}
}

template <typename LogEvent>
auto StreamReader::generic_get_log_event_idx_with_nearest_timestamp(
LogEvents<LogEvent> const& log_events,
clp::ir::epoch_time_ms_t timestamp
) -> LogEventIdxTsType {
if (log_events.empty()) {
return LogEventIdxTsType{emscripten::val::null()};
}
Henry8192 marked this conversation as resolved.
Show resolved Hide resolved
Henry8192 marked this conversation as resolved.
Show resolved Hide resolved

auto first_greater_it{std::upper_bound(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

std::upper_bound assumes the range is sorted, right? What would happen if the log events aren't in ascending timestamp order? It's technically not impossible and we have seen log files where that's true.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@junhaoliao iirc, I was told to assume the log events are in accending order. If that's not the case, this whole search function won't work.

If that's not the case, we have to sort all of the log events, cuz' that's the only way to fully garantee all log events are sorted. And we (might?) also need a map to remember their original location.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since the case with out of order timestamps is rare (as far as we know), we could maybe do something like:

  • When decoding the file, keep track of whether the timestamp of two consecutive log events are out of order and if so, set a flag and display a warning to the user that the file contains out of order timestamps and so timestamp-related searches may not function correctly (since two log events with the same timestamp may exist at completely different positions in the log file).
  • If that flag is set, we use a brute-force implementation of find_nearest_log_event_by_timestamp. Otherwise, we use the implementation you have here.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

  • When decoding the file, keep track of whether the timestamp of two consecutive log events are out of order
    "keep track of" means we are basically iterating the log events during the decode right?

if that's the case, can we divide the log events up to chunks using these timestamp outliers, perform searches in the proper chunk, and finally iterate those outliers?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would that be straightforward to do given that the log events collection changes based on the active filter? If not, then given how rare the case is (as far as we know), I'm not sure the extra code complexity is worth it.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In an offline discussion with @junhaoliao, we reach to an agreement that we can doc the assumption, and neglact this corner case.

By neglacting this corner case, we can save plenty of resource from verifying log events are in chronological order.

Henry8192 marked this conversation as resolved.
Show resolved Hide resolved
Henry8192 marked this conversation as resolved.
Show resolved Hide resolved
log_events.begin(),
log_events.end(),
timestamp,
[](clp::ir::epoch_time_ms_t ts, LogEventWithFilterData<LogEvent> const& log_event) {
return ts < log_event.get_timestamp();
}
)};

if (first_greater_it == log_events.begin()) {
return LogEventIdxTsType{emscripten::val(0)};
}

auto const first_greater_idx{std::distance(log_events.begin(), first_greater_it)};

return LogEventIdxTsType{emscripten::val(first_greater_idx - 1)};
}
} // namespace clp_ffi_js::ir

#endif // CLP_FFI_JS_IR_STREAMREADER_HPP
7 changes: 7 additions & 0 deletions src/clp_ffi_js/ir/StructuredIrStreamReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
#include <clp/Array.hpp>
#include <clp/ErrorCode.hpp>
#include <clp/ffi/ir_stream/Deserializer.hpp>
#include <clp/ir/types.hpp>
#include <clp/TraceableException.hpp>
#include <emscripten/val.h>
#include <spdlog/spdlog.h>
Expand Down Expand Up @@ -147,6 +148,12 @@ auto StructuredIrStreamReader::decode_range(size_t begin_idx, size_t end_idx, bo
);
}

auto StructuredIrStreamReader::get_log_event_idx_with_nearest_timestamp(
clp::ir::epoch_time_ms_t const target_ts
) -> LogEventIdxTsType {
return generic_get_log_event_idx_with_nearest_timestamp(*m_deserialized_log_events, target_ts);
}

StructuredIrStreamReader::StructuredIrStreamReader(
StreamReaderDataContext<StructuredIrDeserializer>&& stream_reader_data_context,
std::shared_ptr<StructuredLogEvents> deserialized_log_events
Expand Down
4 changes: 4 additions & 0 deletions src/clp_ffi_js/ir/StructuredIrStreamReader.hpp
Henry8192 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
#include <clp/Array.hpp>
#include <clp/ffi/ir_stream/Deserializer.hpp>
#include <clp/ffi/SchemaTree.hpp>
#include <clp/ir/types.hpp>
#include <emscripten/val.h>

#include <clp_ffi_js/ir/LogEventWithFilterData.hpp>
Expand Down Expand Up @@ -74,6 +75,9 @@ class StructuredIrStreamReader : public StreamReader {
[[nodiscard]] auto decode_range(size_t begin_idx, size_t end_idx, bool use_filter) const
-> DecodedResultsTsType override;

[[nodiscard]] auto get_log_event_idx_with_nearest_timestamp(clp::ir::epoch_time_ms_t target_ts
) -> LogEventIdxTsType override;

private:
// Constructor
explicit StructuredIrStreamReader(
Expand Down
6 changes: 6 additions & 0 deletions src/clp_ffi_js/ir/UnstructuredIrStreamReader.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,12 @@ auto UnstructuredIrStreamReader::decode_range(size_t begin_idx, size_t end_idx,
);
}

auto UnstructuredIrStreamReader::get_log_event_idx_with_nearest_timestamp(
clp::ir::epoch_time_ms_t const target_ts
) -> LogEventIdxTsType {
return generic_get_log_event_idx_with_nearest_timestamp(m_encoded_log_events, target_ts);
}

UnstructuredIrStreamReader::UnstructuredIrStreamReader(
StreamReaderDataContext<UnstructuredIrDeserializer>&& stream_reader_data_context
)
Expand Down
3 changes: 3 additions & 0 deletions src/clp_ffi_js/ir/UnstructuredIrStreamReader.hpp
Henry8192 marked this conversation as resolved.
Show resolved Hide resolved
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,9 @@ class UnstructuredIrStreamReader : public StreamReader {
[[nodiscard]] auto decode_range(size_t begin_idx, size_t end_idx, bool use_filter) const
-> DecodedResultsTsType override;

[[nodiscard]] auto get_log_event_idx_with_nearest_timestamp(clp::ir::epoch_time_ms_t target_ts
) -> LogEventIdxTsType override;

private:
// Constructor
explicit UnstructuredIrStreamReader(
Expand Down
Loading