Skip to content

Commit a8abb16

Browse files
capabilities: use dataclasses to represent complicated return types
1 parent 6d05d3c commit a8abb16

20 files changed

+274
-217
lines changed

capa/capabilities/common.py

+23-7
Original file line numberDiff line numberDiff line change
@@ -9,17 +9,28 @@
99
import logging
1010
import itertools
1111
import collections
12-
from typing import Any
12+
from typing import Optional
13+
from dataclasses import dataclass
1314

1415
from capa.rules import Scope, RuleSet
1516
from capa.engine import FeatureSet, MatchResults
1617
from capa.features.address import NO_ADDRESS
18+
from capa.render.result_document import LibraryFunction, StaticFeatureCounts, DynamicFeatureCounts
1719
from capa.features.extractors.base_extractor import FeatureExtractor, StaticFeatureExtractor, DynamicFeatureExtractor
1820

1921
logger = logging.getLogger(__name__)
2022

2123

22-
def find_file_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, function_features: FeatureSet):
24+
@dataclass
25+
class FileCapabilities:
26+
features: FeatureSet
27+
matches: MatchResults
28+
feature_count: int
29+
30+
31+
def find_file_capabilities(
32+
ruleset: RuleSet, extractor: FeatureExtractor, function_features: FeatureSet
33+
) -> FileCapabilities:
2334
file_features: FeatureSet = collections.defaultdict(set)
2435

2536
for feature, va in itertools.chain(extractor.extract_file_features(), extractor.extract_global_features()):
@@ -36,8 +47,8 @@ def find_file_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, functi
3647

3748
file_features.update(function_features)
3849

39-
_, matches = ruleset.match(Scope.FILE, file_features, NO_ADDRESS)
40-
return matches, len(file_features)
50+
features, matches = ruleset.match(Scope.FILE, file_features, NO_ADDRESS)
51+
return FileCapabilities(features, matches, len(file_features))
4152

4253

4354
def has_file_limitation(rules: RuleSet, capabilities: MatchResults, is_standalone=True) -> bool:
@@ -62,9 +73,14 @@ def has_file_limitation(rules: RuleSet, capabilities: MatchResults, is_standalon
6273
return False
6374

6475

65-
def find_capabilities(
66-
ruleset: RuleSet, extractor: FeatureExtractor, disable_progress=None, **kwargs
67-
) -> tuple[MatchResults, Any]:
76+
@dataclass
77+
class Capabilities:
78+
matches: MatchResults
79+
feature_counts: StaticFeatureCounts | DynamicFeatureCounts
80+
library_functions: Optional[tuple[LibraryFunction, ...]] = None
81+
82+
83+
def find_capabilities(ruleset: RuleSet, extractor: FeatureExtractor, disable_progress=None, **kwargs) -> Capabilities:
6884
from capa.capabilities.static import find_static_capabilities
6985
from capa.capabilities.dynamic import find_dynamic_capabilities
7086

capa/capabilities/dynamic.py

+57-44
Original file line numberDiff line numberDiff line change
@@ -9,14 +9,14 @@
99
import logging
1010
import itertools
1111
import collections
12-
from typing import Any
12+
from dataclasses import dataclass
1313

1414
import capa.perf
1515
import capa.features.freeze as frz
1616
import capa.render.result_document as rdoc
1717
from capa.rules import Scope, RuleSet
1818
from capa.engine import FeatureSet, MatchResults
19-
from capa.capabilities.common import find_file_capabilities
19+
from capa.capabilities.common import Capabilities, find_file_capabilities
2020
from capa.features.extractors.base_extractor import CallHandle, ThreadHandle, ProcessHandle, DynamicFeatureExtractor
2121

2222
logger = logging.getLogger(__name__)
@@ -26,13 +26,17 @@
2626
SEQUENCE_SIZE = 5
2727

2828

29+
@dataclass
30+
class CallCapabilities:
31+
features: FeatureSet
32+
matches: MatchResults
33+
34+
2935
def find_call_capabilities(
3036
ruleset: RuleSet, extractor: DynamicFeatureExtractor, ph: ProcessHandle, th: ThreadHandle, ch: CallHandle
31-
) -> tuple[FeatureSet, MatchResults]:
37+
) -> CallCapabilities:
3238
"""
3339
find matches for the given rules for the given call.
34-
35-
returns: tuple containing (features for call, match results for call)
3640
"""
3741
# all features found for the call.
3842
features: FeatureSet = collections.defaultdict(set)
@@ -50,16 +54,22 @@ def find_call_capabilities(
5054
for addr, _ in res:
5155
capa.engine.index_rule_matches(features, rule, [addr])
5256

53-
return features, matches
57+
return CallCapabilities(features, matches)
58+
59+
60+
@dataclass
61+
class ThreadCapabilities:
62+
features: FeatureSet
63+
thread_matches: MatchResults
64+
sequence_matches: MatchResults
65+
call_matches: MatchResults
5466

5567

5668
def find_thread_capabilities(
5769
ruleset: RuleSet, extractor: DynamicFeatureExtractor, ph: ProcessHandle, th: ThreadHandle
58-
) -> tuple[FeatureSet, MatchResults, MatchResults, MatchResults]:
70+
) -> ThreadCapabilities:
5971
"""
6072
find matches for the given rules within the given thread.
61-
62-
returns: tuple containing (features for thread, match results for thread, match results for sequences, match results for calls)
6373
"""
6474
# all features found within this thread,
6575
# includes features found within calls.
@@ -75,20 +85,20 @@ def find_thread_capabilities(
7585
sequence: collections.deque[FeatureSet] = collections.deque(maxlen=SEQUENCE_SIZE)
7686

7787
for ch in extractor.get_calls(ph, th):
78-
cfeatures, cmatches = find_call_capabilities(ruleset, extractor, ph, th, ch)
79-
for feature, vas in cfeatures.items():
88+
call_capabilities = find_call_capabilities(ruleset, extractor, ph, th, ch)
89+
for feature, vas in call_capabilities.features.items():
8090
features[feature].update(vas)
8191

82-
for rule_name, res in cmatches.items():
92+
for rule_name, res in call_capabilities.matches.items():
8393
call_matches[rule_name].extend(res)
8494

85-
sequence.append(cfeatures)
86-
sfeatures: FeatureSet = collections.defaultdict(set)
95+
sequence.append(call_capabilities.features)
96+
sequence_features: FeatureSet = collections.defaultdict(set)
8797
for call in sequence:
8898
for feature, vas in call.items():
89-
sfeatures[feature].update(vas)
99+
sequence_features[feature].update(vas)
90100

91-
_, smatches = ruleset.match(Scope.SEQUENCE, sfeatures, ch.address)
101+
_, smatches = ruleset.match(Scope.SEQUENCE, sequence_features, ch.address)
92102
for rule_name, res in smatches.items():
93103
sequence_matches[rule_name].extend(res)
94104

@@ -103,16 +113,23 @@ def find_thread_capabilities(
103113
for va, _ in res:
104114
capa.engine.index_rule_matches(features, rule, [va])
105115

106-
return features, matches, sequence_matches, call_matches
116+
return ThreadCapabilities(features, matches, sequence_matches, call_matches)
117+
118+
119+
@dataclass
120+
class ProcessCapabilities:
121+
process_matches: MatchResults
122+
thread_matches: MatchResults
123+
sequence_matches: MatchResults
124+
call_matches: MatchResults
125+
feature_count: int
107126

108127

109128
def find_process_capabilities(
110129
ruleset: RuleSet, extractor: DynamicFeatureExtractor, ph: ProcessHandle
111-
) -> tuple[MatchResults, MatchResults, MatchResults, MatchResults, int]:
130+
) -> ProcessCapabilities:
112131
"""
113132
find matches for the given rules within the given process.
114-
115-
returns: tuple containing (match results for process, match results for threads, match results for calls, number of features)
116133
"""
117134
# all features found within this process,
118135
# includes features found within threads (and calls).
@@ -131,29 +148,29 @@ def find_process_capabilities(
131148
call_matches: MatchResults = collections.defaultdict(list)
132149

133150
for th in extractor.get_threads(ph):
134-
features, tmatches, smatches, cmatches = find_thread_capabilities(ruleset, extractor, ph, th)
135-
for feature, vas in features.items():
151+
thread_capabilities = find_thread_capabilities(ruleset, extractor, ph, th)
152+
for feature, vas in thread_capabilities.features.items():
136153
process_features[feature].update(vas)
137154

138-
for rule_name, res in tmatches.items():
155+
for rule_name, res in thread_capabilities.thread_matches.items():
139156
thread_matches[rule_name].extend(res)
140157

141-
for rule_name, res in smatches.items():
158+
for rule_name, res in thread_capabilities.sequence_matches.items():
142159
sequence_matches[rule_name].extend(res)
143160

144-
for rule_name, res in cmatches.items():
161+
for rule_name, res in thread_capabilities.call_matches.items():
145162
call_matches[rule_name].extend(res)
146163

147164
for feature, va in itertools.chain(extractor.extract_process_features(ph), extractor.extract_global_features()):
148165
process_features[feature].add(va)
149166

150167
_, process_matches = ruleset.match(Scope.PROCESS, process_features, ph.address)
151-
return process_matches, thread_matches, sequence_matches, call_matches, len(process_features)
168+
return ProcessCapabilities(process_matches, thread_matches, sequence_matches, call_matches, len(process_features))
152169

153170

154171
def find_dynamic_capabilities(
155172
ruleset: RuleSet, extractor: DynamicFeatureExtractor, disable_progress=None
156-
) -> tuple[MatchResults, Any]:
173+
) -> Capabilities:
157174
all_process_matches: MatchResults = collections.defaultdict(list)
158175
all_thread_matches: MatchResults = collections.defaultdict(list)
159176
all_sequence_matches: MatchResults = collections.defaultdict(list)
@@ -170,21 +187,21 @@ def find_dynamic_capabilities(
170187
) as pbar:
171188
task = pbar.add_task("matching", total=n_processes, unit="processes")
172189
for p in processes:
173-
process_matches, thread_matches, sequence_matches, call_matches, feature_count = find_process_capabilities(
174-
ruleset, extractor, p
175-
)
190+
process_capabilities = find_process_capabilities(ruleset, extractor, p)
176191
feature_counts.processes += (
177-
rdoc.ProcessFeatureCount(address=frz.Address.from_capa(p.address), count=feature_count),
192+
rdoc.ProcessFeatureCount(
193+
address=frz.Address.from_capa(p.address), count=process_capabilities.feature_count
194+
),
178195
)
179-
logger.debug("analyzed %s and extracted %d features", p.address, feature_count)
196+
logger.debug("analyzed %s and extracted %d features", p.address, process_capabilities.feature_count)
180197

181-
for rule_name, res in process_matches.items():
198+
for rule_name, res in process_capabilities.process_matches.items():
182199
all_process_matches[rule_name].extend(res)
183-
for rule_name, res in thread_matches.items():
200+
for rule_name, res in process_capabilities.thread_matches.items():
184201
all_thread_matches[rule_name].extend(res)
185-
for rule_name, res in sequence_matches.items():
202+
for rule_name, res in process_capabilities.sequence_matches.items():
186203
all_sequence_matches[rule_name].extend(res)
187-
for rule_name, res in call_matches.items():
204+
for rule_name, res in process_capabilities.call_matches.items():
188205
all_call_matches[rule_name].extend(res)
189206

190207
pbar.advance(task)
@@ -199,8 +216,8 @@ def find_dynamic_capabilities(
199216
rule = ruleset[rule_name]
200217
capa.engine.index_rule_matches(process_and_lower_features, rule, locations)
201218

202-
all_file_matches, feature_count = find_file_capabilities(ruleset, extractor, process_and_lower_features)
203-
feature_counts.file = feature_count
219+
all_file_capabilities = find_file_capabilities(ruleset, extractor, process_and_lower_features)
220+
feature_counts.file = all_file_capabilities.feature_count
204221

205222
matches = dict(
206223
itertools.chain(
@@ -211,12 +228,8 @@ def find_dynamic_capabilities(
211228
all_sequence_matches.items(),
212229
all_thread_matches.items(),
213230
all_process_matches.items(),
214-
all_file_matches.items(),
231+
all_file_capabilities.matches.items(),
215232
)
216233
)
217234

218-
meta = {
219-
"feature_counts": feature_counts,
220-
}
221-
222-
return matches, meta
235+
return Capabilities(matches, feature_counts)

0 commit comments

Comments
 (0)