23
23
24
24
25
25
# The number of calls that make up a sequence.
26
- SEQUENCE_SIZE = 5
26
+ #
27
+ # The larger this is, the more calls are grouped together to match rule logic.
28
+ # This means a longer chain can be recognized; however, it's a bit more expensive.
29
+ SEQUENCE_SIZE = 20
27
30
28
31
29
32
@dataclass
@@ -69,7 +72,8 @@ def find_thread_capabilities(
69
72
ruleset : RuleSet , extractor : DynamicFeatureExtractor , ph : ProcessHandle , th : ThreadHandle
70
73
) -> ThreadCapabilities :
71
74
"""
72
- find matches for the given rules within the given thread.
75
+ find matches for the given rules within the given thread,
76
+ which includes matches for all the sequences and calls within it.
73
77
"""
74
78
# all features found within this thread,
75
79
# includes features found within calls.
@@ -82,8 +86,18 @@ def find_thread_capabilities(
82
86
# matches found at the sequence scope.
83
87
sequence_matches : MatchResults = collections .defaultdict (list )
84
88
89
+ # We match sequences as a sliding window of calls with size SEQUENCE_SIZE.
90
+ #
91
+ # For each call, we consider the window of SEQUENCE_SIZE calls leading up to it,
92
+ # merging all their features and doing a match.
93
+ # Here's the primary data structure: a deque of those features found in the prior calls.
94
+ # We'll append to it, and as it grows larger than SEQUENCE_SIZE, the oldest items are removed.
85
95
sequence : collections .deque [FeatureSet ] = collections .deque (maxlen = SEQUENCE_SIZE )
86
96
97
+ # the names of rules matched at the last sequence,
98
+ # so that we can deduplicate long strings of the same match.
99
+ last_sequence_matches : set [str ] = set ()
100
+
87
101
for ch in extractor .get_calls (ph , th ):
88
102
call_capabilities = find_call_capabilities (ruleset , extractor , ph , th , ch )
89
103
for feature , vas in call_capabilities .features .items ():
@@ -92,16 +106,35 @@ def find_thread_capabilities(
92
106
for rule_name , res in call_capabilities .matches .items ():
93
107
call_matches [rule_name ].extend (res )
94
108
109
+ #
110
+ # sequence scope matching
111
+ #
112
+ # as we add items to the end of the deque, the oldest items will overflow and get dropped.
95
113
sequence .append (call_capabilities .features )
114
+ # collect all the features seen across the last SEQUENCE_SIZE calls,
115
+ # and match against them.
96
116
sequence_features : FeatureSet = collections .defaultdict (set )
97
117
for call in sequence :
98
118
for feature , vas in call .items ():
99
119
sequence_features [feature ].update (vas )
100
120
101
121
_ , smatches = ruleset .match (Scope .SEQUENCE , sequence_features , ch .address )
102
122
for rule_name , res in smatches .items ():
123
+ if rule_name in last_sequence_matches :
124
+ # don't emit match results for rules seen during the immediately preceding sequence.
125
+ #
126
+ # This means that we won't emit duplicate matches when there are multiple sequences
127
+ # that overlap a single matching event.
128
+ # It also handles the case of a tight loop containing matched logic;
129
+ # only the first match will be recorded.
130
+ #
131
+ # In theory, this means the result document doesn't have *every* possible match location,
132
+ # but in practice, humans will only be interested in the first handful anyways.
133
+ continue
103
134
sequence_matches [rule_name ].extend (res )
104
135
136
+ last_sequence_matches = set (smatches .keys ())
137
+
105
138
for feature , va in itertools .chain (extractor .extract_thread_features (ph , th ), extractor .extract_global_features ()):
106
139
features [feature ].add (va )
107
140
0 commit comments