From 7040951584c5388efce9430c1e323ab1a30abfe5 Mon Sep 17 00:00:00 2001 From: Cyril Tovena Date: Fri, 22 Nov 2024 15:10:07 +0100 Subject: [PATCH 1/2] chore: Improve containsLower performance using quick rejection --- pkg/logql/log/filter.go | 60 +++++++++++++--------- pkg/logql/log/filter_test.go | 97 ++++++++++++++++++++++++++++++++++++ 2 files changed, 133 insertions(+), 24 deletions(-) diff --git a/pkg/logql/log/filter.go b/pkg/logql/log/filter.go index dbe5c5e99ce2..54e4a81e972b 100644 --- a/pkg/logql/log/filter.go +++ b/pkg/logql/log/filter.go @@ -440,6 +440,8 @@ func contains(line, substr []byte, caseInsensitive bool) bool { return containsLower(line, substr) } +// containsLower verifies if substr is a substring of line, with case insensitive comparison. +// substr is expected to be in lowercase. func containsLower(line, substr []byte) bool { if len(substr) == 0 { return true @@ -447,35 +449,45 @@ func containsLower(line, substr []byte) bool { if len(substr) > len(line) { return false } - j := 0 - for len(line) > 0 { - // ascii fast case - if c := line[0]; c < utf8.RuneSelf && substr[j] < utf8.RuneSelf { - if c == substr[j] || c+'a'-'A' == substr[j] || c == substr[j]+'a'-'A' { - j++ - if j == len(substr) { - return true + + // Fast path - try to find first byte of substr + firstByte := substr[0] + maxIndex := len(line) - len(substr) + + i := 0 + for i <= maxIndex { + // Find potential first byte match + c := line[i] + if c != firstByte && c+'a'-'A' != firstByte && c != firstByte+'a'-'A' { + i++ + continue + } + + // Found potential match, check rest of substr + matched := true + for j := 1; j < len(substr); j++ { + c = line[i+j] + s := substr[j] + // Fast ASCII comparison + if c < utf8.RuneSelf && s < utf8.RuneSelf { + if c != s && c+'a'-'A' != s && c != s+'a'-'A' { + matched = false + break } - line = line[1:] continue } - line = line[1:] - j = 0 - continue - } - // unicode slow case - lr, lwid := utf8.DecodeRune(line) - mr, mwid := utf8.DecodeRune(substr[j:]) - if lr == mr || mr == unicode.To(unicode.LowerCase, lr) { - j += mwid - if j == len(substr) { - return true + // Slower Unicode path only when needed + lr, _ := utf8.DecodeRune(line[i+j:]) + mr, _ := utf8.DecodeRune(substr[j:]) + if lr != mr && mr != unicode.To(unicode.LowerCase, lr) { + matched = false + break } - line = line[lwid:] - continue } - line = line[lwid:] - j = 0 + if matched { + return true + } + i++ } return false } diff --git a/pkg/logql/log/filter_test.go b/pkg/logql/log/filter_test.go index 9699ca2fcd44..724316f1ca5c 100644 --- a/pkg/logql/log/filter_test.go +++ b/pkg/logql/log/filter_test.go @@ -219,3 +219,100 @@ func benchmarkRegex(b *testing.B, re, line string, match bool) { func Test_rune(t *testing.T) { require.True(t, newContainsFilter([]byte("foo"), true).Filter([]byte("foo"))) } + +func BenchmarkContainsLower(b *testing.B) { + cases := []struct { + name string + line string + substr string + expected bool + }{ + { + name: "short_line_no_match", + line: "this is a short log line", + substr: "missing", + expected: false, + }, + { + name: "short_line_with_match", + line: "this is a short log line", + substr: "SHORT", + expected: true, + }, + { + name: "long_line_no_match", + line: "2023-06-14T12:34:56.789Z INFO [service_name] This is a much longer log line with timestamps, levels and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200", + substr: "nonexistent", + expected: false, + }, + { + name: "long_line_match_start", + line: "2023-06-14T12:34:56.789Z INFO [service_name] This is a much longer log line with timestamps, levels and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200", + substr: "2023", + expected: true, + }, + { + name: "long_line_match_middle", + line: "2023-06-14T12:34:56.789Z INFO [service_name] This is a much longer log line with timestamps, levels and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200", + substr: "LEVELS", + expected: true, + }, + { + name: "long_line_match_end", + line: "2023-06-14T12:34:56.789Z INFO [service_name] This is a much longer log line with timestamps, levels and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200", + substr: "status", + expected: true, + }, + { + name: "short_unicode_line_no_match", + line: "🌟 Unicode line with emojis 🎉 and special chars ñ é ß", + substr: "missing", + expected: false, + }, + { + name: "short_unicode_line_with_match", + line: "🌟 Unicode line with emojis 🎉 and special chars ñ é ß", + substr: "EMOJIS", + expected: true, + }, + { + name: "long_unicode_line_no_match", + line: "2023-06-14T12:34:56.789Z 🚀 [микросервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέλος", + substr: "nonexistent", + expected: false, + }, + { + name: "long_unicode_line_match_start", + line: "2023-06-14T12:34:56.789Z 🚀[МИКРОСервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέλος", + substr: "микросервис", + expected: true, + }, + { + name: "long_unicode_line_match_middle", + line: "2023-06-14T12:34:56.789Z 🚀 [микросервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέλος", + substr: "UNICODE", + expected: true, + }, + { + name: "long_unicode_line_match_end", + line: "2023-06-14T12:34:56.789Z 🚀 [микросервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέλος", + substr: "τέλος", + expected: true, + }, + } + + var m bool + for _, c := range cases { + b.Run(c.name, func(b *testing.B) { + line := []byte(c.line) + substr := []byte(c.substr) + for i := 0; i < b.N; i++ { + m = containsLower(line, substr) + } + if m != c.expected { + b.Fatalf("expected %v but got %v", c.expected, m) + } + }) + } + res = m // Avoid compiler optimization +} From 17f715c8faa4d8c807df2872f990316ffd7df681 Mon Sep 17 00:00:00 2001 From: Cyril Tovena Date: Tue, 26 Nov 2024 15:05:31 +0100 Subject: [PATCH 2/2] fixes unicode matching --- pkg/logql/log/filter.go | 31 ++++++++++++++++++++++++------- pkg/logql/log/filter_test.go | 2 +- 2 files changed, 25 insertions(+), 8 deletions(-) diff --git a/pkg/logql/log/filter.go b/pkg/logql/log/filter.go index 54e4a81e972b..03a53b92b93c 100644 --- a/pkg/logql/log/filter.go +++ b/pkg/logql/log/filter.go @@ -465,26 +465,43 @@ func containsLower(line, substr []byte) bool { // Found potential match, check rest of substr matched := true - for j := 1; j < len(substr); j++ { - c = line[i+j] - s := substr[j] + linePos := i + substrPos := 0 + + for linePos < len(line) && substrPos < len(substr) { + c := line[linePos] + s := substr[substrPos] + // Fast ASCII comparison if c < utf8.RuneSelf && s < utf8.RuneSelf { if c != s && c+'a'-'A' != s && c != s+'a'-'A' { matched = false break } + linePos++ + substrPos++ continue } + // Slower Unicode path only when needed - lr, _ := utf8.DecodeRune(line[i+j:]) - mr, _ := utf8.DecodeRune(substr[j:]) - if lr != mr && mr != unicode.To(unicode.LowerCase, lr) { + lr, lineSize := utf8.DecodeRune(line[linePos:]) + mr, substrSize := utf8.DecodeRune(substr[substrPos:]) + + if lr == utf8.RuneError || mr == utf8.RuneError { + matched = false + break + } + + if unicode.ToLower(lr) != mr { matched = false break } + + linePos += lineSize + substrPos += substrSize } - if matched { + + if matched && substrPos == len(substr) { return true } i++ diff --git a/pkg/logql/log/filter_test.go b/pkg/logql/log/filter_test.go index 724316f1ca5c..3568e92557cb 100644 --- a/pkg/logql/log/filter_test.go +++ b/pkg/logql/log/filter_test.go @@ -112,7 +112,7 @@ func Test_SimplifiedRegex(t *testing.T) { // tests all lines with both filter, they should have the same result. for _, line := range fixtures { l := []byte(line) - require.Equal(t, d.Filter(l), f.Filter(l), "regexp %s failed line: %s", test.re, line) + require.Equal(t, d.Filter(l), f.Filter(l), "regexp %s failed line: %s re:%v simplified:%v", test.re, line, d.Filter(l), f.Filter(l)) } }) }