From 7040951584c5388efce9430c1e323ab1a30abfe5 Mon Sep 17 00:00:00 2001
From: Cyril Tovena <cyril.tovena@gmail.com>
Date: Fri, 22 Nov 2024 15:10:07 +0100
Subject: [PATCH 1/2] chore: Improve containsLower performance using quick
 rejection

---
 pkg/logql/log/filter.go      | 60 +++++++++++++---------
 pkg/logql/log/filter_test.go | 97 ++++++++++++++++++++++++++++++++++++
 2 files changed, 133 insertions(+), 24 deletions(-)

diff --git a/pkg/logql/log/filter.go b/pkg/logql/log/filter.go
index dbe5c5e99ce2..54e4a81e972b 100644
--- a/pkg/logql/log/filter.go
+++ b/pkg/logql/log/filter.go
@@ -440,6 +440,8 @@ func contains(line, substr []byte, caseInsensitive bool) bool {
 	return containsLower(line, substr)
 }
 
+// containsLower reports whether substr occurs in line, using a case-insensitive comparison.
+// substr is expected to already be lowercase.
 func containsLower(line, substr []byte) bool {
 	if len(substr) == 0 {
 		return true
@@ -447,35 +449,45 @@ func containsLower(line, substr []byte) bool {
 	if len(substr) > len(line) {
 		return false
 	}
-	j := 0
-	for len(line) > 0 {
-		// ascii fast case
-		if c := line[0]; c < utf8.RuneSelf && substr[j] < utf8.RuneSelf {
-			if c == substr[j] || c+'a'-'A' == substr[j] || c == substr[j]+'a'-'A' {
-				j++
-				if j == len(substr) {
-					return true
+
+	// Fast path - try to find first byte of substr
+	firstByte := substr[0]
+	maxIndex := len(line) - len(substr)
+
+	i := 0
+	for i <= maxIndex {
+		// Find potential first byte match
+		c := line[i]
+		if c != firstByte && c+'a'-'A' != firstByte && c != firstByte+'a'-'A' {
+			i++
+			continue
+		}
+
+		// Found potential match, check rest of substr
+		matched := true
+		for j := 1; j < len(substr); j++ {
+			c = line[i+j]
+			s := substr[j]
+			// Fast ASCII comparison
+			if c < utf8.RuneSelf && s < utf8.RuneSelf {
+				if c != s && c+'a'-'A' != s && c != s+'a'-'A' {
+					matched = false
+					break
 				}
-				line = line[1:]
 				continue
 			}
-			line = line[1:]
-			j = 0
-			continue
-		}
-		// unicode slow case
-		lr, lwid := utf8.DecodeRune(line)
-		mr, mwid := utf8.DecodeRune(substr[j:])
-		if lr == mr || mr == unicode.To(unicode.LowerCase, lr) {
-			j += mwid
-			if j == len(substr) {
-				return true
+			// Slower Unicode path only when needed
+			lr, _ := utf8.DecodeRune(line[i+j:])
+			mr, _ := utf8.DecodeRune(substr[j:])
+			if lr != mr && mr != unicode.To(unicode.LowerCase, lr) {
+				matched = false
+				break
 			}
-			line = line[lwid:]
-			continue
 		}
-		line = line[lwid:]
-		j = 0
+		if matched {
+			return true
+		}
+		i++
 	}
 	return false
 }
diff --git a/pkg/logql/log/filter_test.go b/pkg/logql/log/filter_test.go
index 9699ca2fcd44..724316f1ca5c 100644
--- a/pkg/logql/log/filter_test.go
+++ b/pkg/logql/log/filter_test.go
@@ -219,3 +219,100 @@ func benchmarkRegex(b *testing.B, re, line string, match bool) {
 func Test_rune(t *testing.T) {
 	require.True(t, newContainsFilter([]byte("foo"), true).Filter([]byte("foo")))
 }
+
+func BenchmarkContainsLower(b *testing.B) {
+	cases := []struct {
+		name     string
+		line     string
+		substr   string
+		expected bool
+	}{
+		{
+			name:     "short_line_no_match",
+			line:     "this is a short log line",
+			substr:   "missing",
+			expected: false,
+		},
+		{
+			name:     "short_line_with_match",
+			line:     "this is a short log line",
+			substr:   "SHORT",
+			expected: true,
+		},
+		{
+			name:     "long_line_no_match",
+			line:     "2023-06-14T12:34:56.789Z INFO  [service_name] This is a much longer log line with timestamps, levels and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200",
+			substr:   "nonexistent",
+			expected: false,
+		},
+		{
+			name:     "long_line_match_start",
+			line:     "2023-06-14T12:34:56.789Z INFO  [service_name] This is a much longer log line with timestamps, levels and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200",
+			substr:   "2023",
+			expected: true,
+		},
+		{
+			name:     "long_line_match_middle",
+			line:     "2023-06-14T12:34:56.789Z INFO  [service_name] This is a much longer log line with timestamps, levels and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200",
+			substr:   "LEVELS",
+			expected: true,
+		},
+		{
+			name:     "long_line_match_end",
+			line:     "2023-06-14T12:34:56.789Z INFO  [service_name] This is a much longer log line with timestamps, levels and other information that typically appears in production logs. RequestID=123456 UserID=789 Action=GetUser Duration=123ms Status=200",
+			substr:   "status",
+			expected: true,
+		},
+		{
+			name:     "short_unicode_line_no_match",
+			line:     "🌟 Unicode line with emojis 🎉 and special chars ñ é ß",
+			substr:   "missing",
+			expected: false,
+		},
+		{
+			name:     "short_unicode_line_with_match",
+			line:     "🌟 Unicode line with emojis 🎉 and special chars ñ é ß",
+			substr:   "EMOJIS",
+			expected: true,
+		},
+		{
+			name:     "long_unicode_line_no_match",
+			line:     "2023-06-14T12:34:56.789Z 🚀 [микросервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέλος",
+			substr:   "nonexistent",
+			expected: false,
+		},
+		{
+			name:     "long_unicode_line_match_start",
+			line:     "2023-06-14T12:34:56.789Z 🚀[МИКРОСервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέλος",
+			substr:   "микросервис",
+			expected: true,
+		},
+		{
+			name:     "long_unicode_line_match_middle",
+			line:     "2023-06-14T12:34:56.789Z 🚀 [микросервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέλος",
+			substr:   "UNICODE",
+			expected: true,
+		},
+		{
+			name:     "long_unicode_line_match_end",
+			line:     "2023-06-14T12:34:56.789Z 🚀 [микросервис] Длинное сообщение с Unicode символами 统一码 が大好き! エラー分析: システムは正常に動作しています。RequestID=123456 状態=良好 Résultat=Succès ß=γ 🎯 τέλος",
+			substr:   "τέλος",
+			expected: true,
+		},
+	}
+
+	var m bool
+	for _, c := range cases {
+		b.Run(c.name, func(b *testing.B) {
+			line := []byte(c.line)
+			substr := []byte(c.substr)
+			for i := 0; i < b.N; i++ {
+				m = containsLower(line, substr)
+			}
+			if m != c.expected {
+				b.Fatalf("expected %v but got %v", c.expected, m)
+			}
+		})
+	}
+	res = m // Avoid compiler optimization
+}

From 17f715c8faa4d8c807df2872f990316ffd7df681 Mon Sep 17 00:00:00 2001
From: Cyril Tovena <cyril.tovena@gmail.com>
Date: Tue, 26 Nov 2024 15:05:31 +0100
Subject: [PATCH 2/2] fix: step by rune width in containsLower Unicode matching

---
 pkg/logql/log/filter.go      | 31 ++++++++++++++++++++++++-------
 pkg/logql/log/filter_test.go |  2 +-
 2 files changed, 25 insertions(+), 8 deletions(-)

diff --git a/pkg/logql/log/filter.go b/pkg/logql/log/filter.go
index 54e4a81e972b..03a53b92b93c 100644
--- a/pkg/logql/log/filter.go
+++ b/pkg/logql/log/filter.go
@@ -465,26 +465,43 @@ func containsLower(line, substr []byte) bool {
 
 		// Found potential match, check rest of substr
 		matched := true
-		for j := 1; j < len(substr); j++ {
-			c = line[i+j]
-			s := substr[j]
+		linePos := i
+		substrPos := 0
+
+		for linePos < len(line) && substrPos < len(substr) {
+			c := line[linePos]
+			s := substr[substrPos]
+
 			// Fast ASCII comparison
 			if c < utf8.RuneSelf && s < utf8.RuneSelf {
 				if c != s && c+'a'-'A' != s && c != s+'a'-'A' {
 					matched = false
 					break
 				}
+				linePos++
+				substrPos++
 				continue
 			}
+
 			// Slower Unicode path only when needed
-			lr, _ := utf8.DecodeRune(line[i+j:])
-			mr, _ := utf8.DecodeRune(substr[j:])
-			if lr != mr && mr != unicode.To(unicode.LowerCase, lr) {
+			lr, lineSize := utf8.DecodeRune(line[linePos:])
+			mr, substrSize := utf8.DecodeRune(substr[substrPos:])
+
+			if lr == utf8.RuneError || mr == utf8.RuneError {
+				matched = false
+				break
+			}
+
+			if unicode.ToLower(lr) != mr {
 				matched = false
 				break
 			}
+
+			linePos += lineSize
+			substrPos += substrSize
 		}
-		if matched {
+
+		if matched && substrPos == len(substr) {
 			return true
 		}
 		i++
diff --git a/pkg/logql/log/filter_test.go b/pkg/logql/log/filter_test.go
index 724316f1ca5c..3568e92557cb 100644
--- a/pkg/logql/log/filter_test.go
+++ b/pkg/logql/log/filter_test.go
@@ -112,7 +112,7 @@ func Test_SimplifiedRegex(t *testing.T) {
 			// tests all lines with both filter, they should have the same result.
 			for _, line := range fixtures {
 				l := []byte(line)
-				require.Equal(t, d.Filter(l), f.Filter(l), "regexp %s failed line: %s", test.re, line)
+				require.Equal(t, d.Filter(l), f.Filter(l), "regexp %s failed line: %s re:%v simplified:%v", test.re, line, d.Filter(l), f.Filter(l))
 			}
 		})
 	}