Skip to content

Commit b10d591

Browse files
sequence: only match first overlapping sequence
also, for repeating behavior, match only the first instance.
1 parent 37f6ccb commit b10d591

File tree

2 files changed

+45
-6
lines changed

2 files changed

+45
-6
lines changed

capa/capabilities/dynamic.py

+35-2
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,10 @@
2323

2424

2525
# The number of calls that make up a sequence.
26-
SEQUENCE_SIZE = 5
26+
#
27+
# The larger this is, the more calls are grouped together to match rule logic.
28+
# This means a longer chain can be recognized; however, it's a bit more expensive.
29+
SEQUENCE_SIZE = 20
2730

2831

2932
@dataclass
@@ -69,7 +72,8 @@ def find_thread_capabilities(
6972
ruleset: RuleSet, extractor: DynamicFeatureExtractor, ph: ProcessHandle, th: ThreadHandle
7073
) -> ThreadCapabilities:
7174
"""
72-
find matches for the given rules within the given thread.
75+
find matches for the given rules within the given thread,
76+
which includes matches for all the sequences and calls within it.
7377
"""
7478
# all features found within this thread,
7579
# includes features found within calls.
@@ -82,8 +86,18 @@ def find_thread_capabilities(
8286
# matches found at the sequence scope.
8387
sequence_matches: MatchResults = collections.defaultdict(list)
8488

89+
# We match sequences as a sliding window of calls with size SEQUENCE_SIZE.
90+
#
91+
# For each call, we consider the window of SEQUENCE_SIZE calls leading up to it,
92+
# merging all their features and doing a match.
93+
# Here's the primary data structure: a deque of those features found in the prior calls.
94+
# We'll append to it, and as it grows larger than SEQUENCE_SIZE, the oldest items are removed.
8595
sequence: collections.deque[FeatureSet] = collections.deque(maxlen=SEQUENCE_SIZE)
8696

97+
# the names of rules matched at the last sequence,
98+
# so that we can deduplicate long strings of the same match.
99+
last_sequence_matches: set[str] = set()
100+
87101
for ch in extractor.get_calls(ph, th):
88102
call_capabilities = find_call_capabilities(ruleset, extractor, ph, th, ch)
89103
for feature, vas in call_capabilities.features.items():
@@ -92,16 +106,35 @@ def find_thread_capabilities(
92106
for rule_name, res in call_capabilities.matches.items():
93107
call_matches[rule_name].extend(res)
94108

109+
#
110+
# sequence scope matching
111+
#
112+
# as we add items to the end of the deque, the oldest items will overflow and get dropped.
95113
sequence.append(call_capabilities.features)
114+
# collect all the features seen across the last SEQUENCE_SIZE calls,
115+
# and match against them.
96116
sequence_features: FeatureSet = collections.defaultdict(set)
97117
for call in sequence:
98118
for feature, vas in call.items():
99119
sequence_features[feature].update(vas)
100120

101121
_, smatches = ruleset.match(Scope.SEQUENCE, sequence_features, ch.address)
102122
for rule_name, res in smatches.items():
123+
if rule_name in last_sequence_matches:
124+
# don't emit match results for rules seen during the immediately preceding sequence.
125+
#
126+
# This means that we won't emit duplicate matches when there are multiple sequences
127+
# that overlap a single matching event.
128+
# It also handles the case of a tight loop containing matched logic;
129+
# only the first match will be recorded.
130+
#
131+
# In theory, this means the result document doesn't have *every* possible match location,
132+
# but in practice, humans will only be interested in the first handful anyway.
133+
continue
103134
sequence_matches[rule_name].extend(res)
104135

136+
last_sequence_matches = set(smatches.keys())
137+
105138
for feature, va in itertools.chain(extractor.extract_thread_features(ph, th), extractor.extract_global_features()):
106139
features[feature].add(va)
107140

tests/test_dynamic_sequence_scope.py

+10-4
Original file line numberDiff line numberDiff line change
@@ -134,7 +134,7 @@ def test_dynamic_sequence_scope():
134134
assert 12 in get_call_ids(capabilities.matches[r.name])
135135

136136

137-
# show the sequence is only 5 calls long, and doesn't match beyond that 5-tuple.
137+
# show that when the sequence is only 5 calls long (for example), it doesn't match beyond that 5-tuple.
138138
#
139139
# proc: 0000A65749F5902C4D82.exe (ppid=2456, pid=3052)
140140
# thread: 3064
@@ -168,7 +168,13 @@ def test_dynamic_sequence_scope2():
168168
r = capa.rules.Rule.from_yaml(rule)
169169
ruleset = capa.rules.RuleSet([r])
170170

171-
capabilities = capa.capabilities.dynamic.find_dynamic_capabilities(ruleset, extractor, disable_progress=True)
171+
# patch SEQUENCE_SIZE since we may use a much larger value in the real world.
172+
from pytest import MonkeyPatch
173+
174+
with MonkeyPatch.context() as m:
175+
m.setattr(capa.capabilities.dynamic, "SEQUENCE_SIZE", 5)
176+
capabilities = capa.capabilities.dynamic.find_dynamic_capabilities(ruleset, extractor, disable_progress=True)
177+
172178
assert r.name not in capabilities.matches
173179

174180

@@ -215,7 +221,6 @@ def test_dynamic_sequence_example():
215221

216222

217223
# show how sequences that overlap a single event are handled.
218-
# TODO(williballenthin): but I think we really just want one match for this, not copies of the same thing.
219224
#
220225
# proc: 0000A65749F5902C4D82.exe (ppid=2456, pid=3052)
221226
# thread: 3064
@@ -252,4 +257,5 @@ def test_dynamic_sequence_multiple_sequences_overlapping_single_event():
252257

253258
capabilities = capa.capabilities.dynamic.find_dynamic_capabilities(ruleset, extractor, disable_progress=True)
254259
assert r.name in capabilities.matches
255-
assert [11, 12, 13, 14, 15] == list(get_call_ids(capabilities.matches[r.name]))
260+
# we only match the first overlapping sequence
261+
assert [11] == list(get_call_ids(capabilities.matches[r.name]))

0 commit comments

Comments
 (0)