diff --git a/pkg/logql/log/filter.go b/pkg/logql/log/filter.go index dbe5c5e99ce2..03a53b92b93c 100644 --- a/pkg/logql/log/filter.go +++ b/pkg/logql/log/filter.go @@ -440,6 +440,8 @@ func contains(line, substr []byte, caseInsensitive bool) bool { return containsLower(line, substr) } +// containsLower verifies if substr is a substring of line, with case insensitive comparison. +// substr is expected to be in lowercase. func containsLower(line, substr []byte) bool { if len(substr) == 0 { return true @@ -447,35 +449,62 @@ func containsLower(line, substr []byte) bool { if len(substr) > len(line) { return false } - j := 0 - for len(line) > 0 { - // ascii fast case - if c := line[0]; c < utf8.RuneSelf && substr[j] < utf8.RuneSelf { - if c == substr[j] || c+'a'-'A' == substr[j] || c == substr[j]+'a'-'A' { - j++ - if j == len(substr) { - return true + + // Fast path - try to find first byte of substr + firstByte := substr[0] + maxIndex := len(line) - len(substr) + + i := 0 + for i <= maxIndex { + // Find potential first byte match + c := line[i] + if c != firstByte && c+'a'-'A' != firstByte && c != firstByte+'a'-'A' { + i++ + continue + } + + // Found potential match, check rest of substr + matched := true + linePos := i + substrPos := 0 + + for linePos < len(line) && substrPos < len(substr) { + c := line[linePos] + s := substr[substrPos] + + // Fast ASCII comparison + if c < utf8.RuneSelf && s < utf8.RuneSelf { + if c != s && c+'a'-'A' != s && c != s+'a'-'A' { + matched = false + break } - line = line[1:] + linePos++ + substrPos++ continue } - line = line[1:] - j = 0 - continue - } - // unicode slow case - lr, lwid := utf8.DecodeRune(line) - mr, mwid := utf8.DecodeRune(substr[j:]) - if lr == mr || mr == unicode.To(unicode.LowerCase, lr) { - j += mwid - if j == len(substr) { - return true + + // Slower Unicode path only when needed + lr, lineSize := utf8.DecodeRune(line[linePos:]) + mr, substrSize := utf8.DecodeRune(substr[substrPos:]) + + if lr == utf8.RuneError || mr == utf8.RuneError { + matched = false + break } - line = line[lwid:] - continue + + if unicode.ToLower(lr) != mr { + matched = false + break + } + + linePos += lineSize + substrPos += substrSize } - line = line[lwid:] - j = 0 + + if matched && substrPos == len(substr) { + return true + } + i++ } return false } diff --git a/pkg/logql/log/filter_test.go b/pkg/logql/log/filter_test.go index 9699ca2fcd44..3568e92557cb 100644 --- a/pkg/logql/log/filter_test.go +++ b/pkg/logql/log/filter_test.go @@ -112,7 +112,7 @@ func Test_SimplifiedRegex(t *testing.T) { // tests all lines with both filter, they should have the same result. for _, line := range fixtures { l := []byte(line) - require.Equal(t, d.Filter(l), f.Filter(l), "regexp %s failed line: %s", test.re, line) + require.Equal(t, d.Filter(l), f.Filter(l), "regexp %s failed line: %s re:%v simplified:%v", test.re, line, d.Filter(l), f.Filter(l)) } }) } @@ -219,3 +219,100 @@ func benchmarkRegex(b *testing.B, re, line string, match bool) { func Test_rune(t *testing.T) { require.True(t, newContainsFilter([]byte("foo"), true).Filter([]byte("foo"))) } + +func BenchmarkContainsLower(b *testing.B) { + cases := []struct { + name string + line string + substr string + expected bool + }{ + { + name: "short_line_no_match", + line: "this is a short log line", + substr: "missing", + expected: false, + }, + { + name: "short_line_with_match", + line: "this is a short log line", + substr: "SHORT", + expected: true, + }, + { + name: "long_line_no_match", + line: "2023-06-14T12:34:56.789Z INFO [service_name] This is a much longer log line with timestamps, levels and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200", + substr: "nonexistent", + expected: false, + }, + { + name: "long_line_match_start", + line: "2023-06-14T12:34:56.789Z INFO [service_name] This is a much longer log line with timestamps, levels and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200", + substr: "2023", + expected: true, + }, + { + name: "long_line_match_middle", + line: "2023-06-14T12:34:56.789Z INFO [service_name] This is a much longer log line with timestamps, levels and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200", + substr: "LEVELS", + expected: true, + }, + { + name: "long_line_match_end", + line: "2023-06-14T12:34:56.789Z INFO [service_name] This is a much longer log line with timestamps, levels and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200", + substr: "status", + expected: true, + }, + { + name: "short_unicode_line_no_match", + line: "🌟 Unicode line with emojis 🎉 and special chars ñ é ß", + substr: "missing", + expected: false, + }, + { + name: "short_unicode_line_with_match", + line: "🌟 Unicode line with emojis 🎉 and special chars ñ é ß", + substr: "EMOJIS", + expected: true, + }, + { + name: "long_unicode_line_no_match", + line: "2023-06-14T12:34:56.789Z 🚀 [микросервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέλος", + substr: "nonexistent", + expected: false, + }, + { + name: "long_unicode_line_match_start", + line: "2023-06-14T12:34:56.789Z 🚀[МИКРОСервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέλος", + substr: "микросервис", + expected: true, + }, + { + name: "long_unicode_line_match_middle", + line: "2023-06-14T12:34:56.789Z 🚀 [микросервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέλος", + substr: "UNICODE", + expected: true, + }, + { + name: "long_unicode_line_match_end", + line: "2023-06-14T12:34:56.789Z 🚀 [микросервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέλος", + substr: "τέλος", + expected: true, + }, + } + + var m bool + for _, c := range cases { + b.Run(c.name, func(b *testing.B) { + line := []byte(c.line) + substr := []byte(c.substr) + for i := 0; i < b.N; i++ { + m = containsLower(line, substr) + } + if m != c.expected { + b.Fatalf("expected %v but got %v", c.expected, m) + } + }) + } + res = m // Avoid compiler optimization +}