Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore: Improve containsLower performance using quick rejection #15076

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
77 changes: 53 additions & 24 deletions pkg/logql/log/filter.go
Original file line number Diff line number Diff line change
Expand Up @@ -440,42 +440,71 @@ func contains(line, substr []byte, caseInsensitive bool) bool {
return containsLower(line, substr)
}

// containsLower verifies if substr is a substring of line, with case insensitive comparison.
// substr is expected to be in lowercase.
func containsLower(line, substr []byte) bool {
if len(substr) == 0 {
return true
}
if len(substr) > len(line) {
return false
}
j := 0
for len(line) > 0 {
// ascii fast case
if c := line[0]; c < utf8.RuneSelf && substr[j] < utf8.RuneSelf {
if c == substr[j] || c+'a'-'A' == substr[j] || c == substr[j]+'a'-'A' {
j++
if j == len(substr) {
return true
Comment on lines -450 to -457
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Interesting I've always thought Loki was using strings.Index but I guess one cannot ignore character casing there.

Anyhow, during my work on a SIMD substring search I've found that the underlying Rabin-Karp algorithm is soo much faster. I'm curious if it could be adapted to be case-insensitive 🤷

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We don't for case sensitive yes. What do you think of the PR though that would help me to get a review.


// Fast path - try to find first byte of substr
firstByte := substr[0]
maxIndex := len(line) - len(substr)

i := 0
for i <= maxIndex {
// Find potential first byte match
c := line[i]
if c != firstByte && c+'a'-'A' != firstByte && c != firstByte+'a'-'A' {
i++
continue
}

// Found potential match, check rest of substr
matched := true
linePos := i
substrPos := 0

for linePos < len(line) && substrPos < len(substr) {
c := line[linePos]
s := substr[substrPos]

// Fast ASCII comparison
if c < utf8.RuneSelf && s < utf8.RuneSelf {
if c != s && c+'a'-'A' != s && c != s+'a'-'A' {
matched = false
break
}
line = line[1:]
linePos++
substrPos++
continue
}
line = line[1:]
j = 0
continue
}
// unicode slow case
lr, lwid := utf8.DecodeRune(line)
mr, mwid := utf8.DecodeRune(substr[j:])
if lr == mr || mr == unicode.To(unicode.LowerCase, lr) {
j += mwid
if j == len(substr) {
return true

// Slower Unicode path only when needed
lr, lineSize := utf8.DecodeRune(line[linePos:])
mr, substrSize := utf8.DecodeRune(substr[substrPos:])

if lr == utf8.RuneError || mr == utf8.RuneError {
matched = false
break
}
line = line[lwid:]
continue

if unicode.ToLower(lr) != mr {
matched = false
break
}

linePos += lineSize
substrPos += substrSize
}
line = line[lwid:]
j = 0

if matched && substrPos == len(substr) {
return true
}
i++
}
return false
}
Expand Down
99 changes: 98 additions & 1 deletion pkg/logql/log/filter_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -112,7 +112,7 @@ func Test_SimplifiedRegex(t *testing.T) {
// tests all lines with both filter, they should have the same result.
for _, line := range fixtures {
l := []byte(line)
require.Equal(t, d.Filter(l), f.Filter(l), "regexp %s failed line: %s", test.re, line)
require.Equal(t, d.Filter(l), f.Filter(l), "regexp %s failed line: %s re:%v simplified:%v", test.re, line, d.Filter(l), f.Filter(l))
}
})
}
Expand Down Expand Up @@ -219,3 +219,100 @@ func benchmarkRegex(b *testing.B, re, line string, match bool) {
func Test_rune(t *testing.T) {
require.True(t, newContainsFilter([]byte("foo"), true).Filter([]byte("foo")))
}

func BenchmarkContainsLower(b *testing.B) {
cases := []struct {
name string
line string
substr string
expected bool
}{
{
name: "short_line_no_match",
line: "this is a short log line",
substr: "missing",
expected: false,
},
{
name: "short_line_with_match",
line: "this is a short log line",
substr: "SHORT",
expected: true,
},
{
name: "long_line_no_match",
line: "2023-06-14T12:34:56.789Z INFO [service_name] This is a much longer log line with timestamps, levels and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200",
substr: "nonexistent",
expected: false,
},
{
name: "long_line_match_start",
line: "2023-06-14T12:34:56.789Z INFO [service_name] This is a much longer log line with timestamps, levels and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200",
substr: "2023",
expected: true,
},
{
name: "long_line_match_middle",
line: "2023-06-14T12:34:56.789Z INFO [service_name] This is a much longer log line with timestamps, levels and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200",
substr: "LEVELS",
expected: true,
},
{
name: "long_line_match_end",
line: "2023-06-14T12:34:56.789Z INFO [service_name] This is a much longer log line with timestamps, levels and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200",
substr: "status",
expected: true,
},
{
name: "short_unicode_line_no_match",
line: "🌟 Unicode line with emojis 🎉 and special chars ñ é ß",
substr: "missing",
expected: false,
},
{
name: "short_unicode_line_with_match",
line: "🌟 Unicode line with emojis 🎉 and special chars ñ é ß",
substr: "EMOJIS",
expected: true,
},
{
name: "long_unicode_line_no_match",
line: "2023-06-14T12:34:56.789Z 🚀 [микросервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέλος",
substr: "nonexistent",
expected: false,
},
{
name: "long_unicode_line_match_start",
line: "2023-06-14T12:34:56.789Z 🚀[МИКРОСервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέλος",
substr: "микросервис",
expected: true,
},
{
name: "long_unicode_line_match_middle",
line: "2023-06-14T12:34:56.789Z 🚀 [микросервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέλος",
substr: "UNICODE",
expected: true,
},
{
name: "long_unicode_line_match_end",
line: "2023-06-14T12:34:56.789Z 🚀 [микросервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέλος",
substr: "τέλος",
expected: true,
},
}

var m bool
for _, c := range cases {
b.Run(c.name, func(b *testing.B) {
line := []byte(c.line)
substr := []byte(c.substr)
for i := 0; i < b.N; i++ {
m = containsLower(line, substr)
}
if m != c.expected {
b.Fatalf("expected %v but got %v", c.expected, m)
}
})
}
res = m // Avoid compiler optimization
}
Loading