From fe55d3ad371fec56e36268f4fe52ec12a5aafd59 Mon Sep 17 00:00:00 2001 From: Soufiane Fariss Date: Thu, 30 May 2024 04:04:51 +0200 Subject: [PATCH 1/6] display capabilities for file limitations This commit deals with: - displaying the capabilities for files matching a file limitations rule - inform users about potential false positive due to few library functions - wip: report the number of api calls made, and inform the user if the number is low --- capa/capabilities/common.py | 3 +-- capa/capabilities/static.py | 4 ++++ capa/main.py | 17 ++--------------- capa/render/default.py | 10 ++++++++++ 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/capa/capabilities/common.py b/capa/capabilities/common.py index a73f40afe..d71d2a12f 100644 --- a/capa/capabilities/common.py +++ b/capa/capabilities/common.py @@ -52,8 +52,7 @@ def has_file_limitation(rules: RuleSet, capabilities: MatchResults, is_standalon logger.warning(" %s", line) logger.warning(" Identified via rule: %s", file_limitation_rule.name) if is_standalone: - logger.warning(" ") - logger.warning(" Use -v or -vv if you really want to see the capabilities identified by capa.") + pass logger.warning("-" * 80) # bail on first file limitation diff --git a/capa/capabilities/static.py b/capa/capabilities/static.py index 4f3b3b6a1..01bbd4f7d 100644 --- a/capa/capabilities/static.py +++ b/capa/capabilities/static.py @@ -21,6 +21,7 @@ from capa.rules import Scope, RuleSet from capa.engine import FeatureSet, MatchResults from capa.helpers import redirecting_print_to_tqdm +from capa.features.insn import API from capa.capabilities.common import find_file_capabilities from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, StaticFeatureExtractor @@ -118,6 +119,9 @@ def find_code_capabilities( features, bmatches, imatches = find_basic_block_capabilities(ruleset, extractor, fh, bb) for feature, vas in features.items(): function_features[feature].update(vas) + if 
isinstance(feature, API): + # delcare a global variable (a set) and append to it here? + pass for rule_name, res in bmatches.items(): bb_matches[rule_name].extend(res) diff --git a/capa/main.py b/capa/main.py index eb43769d2..8d2beca7c 100644 --- a/capa/main.py +++ b/capa/main.py @@ -75,7 +75,7 @@ FORMAT_RESULT, ) from capa.capabilities.common import find_capabilities, has_file_limitation, find_file_capabilities -from capa.features.extractors.base_extractor import FeatureExtractor, StaticFeatureExtractor, DynamicFeatureExtractor +from capa.features.extractors.base_extractor import FeatureExtractor, DynamicFeatureExtractor RULES_PATH_DEFAULT_STRING = "(embedded rules)" SIGNATURES_PATH_DEFAULT_STRING = "(embedded signatures)" @@ -666,16 +666,9 @@ def find_file_limitations_from_cli(args, rules: RuleSet, file_extractors: List[F except (ELFError, OverflowError) as e: logger.error("Input file '%s' is not a valid ELF file: %s", args.input_file, str(e)) raise ShouldExitError(E_CORRUPT_FILE) from e - # file limitations that rely on non-file scope won't be detected here. # nor on FunctionName features, because pefile doesn't support this. found_file_limitation = has_file_limitation(rules, pure_file_capabilities) - if found_file_limitation: - # bail if capa encountered file limitation e.g. a packed binary - # do show the output in verbose mode, though. 
- if not (args.verbose or args.vverbose or args.json): - logger.debug("file limitation short circuit, won't analyze fully.") - raise ShouldExitError(E_FILE_LIMITATION) return found_file_limitation @@ -804,7 +797,7 @@ def main(argv: Optional[List[str]] = None): input_format = get_input_format_from_cli(args) rules = get_rules_from_cli(args) file_extractors = get_file_extractors_from_cli(args, input_format) - found_file_limitation = find_file_limitations_from_cli(args, rules, file_extractors) + _ = find_file_limitations_from_cli(args, rules, file_extractors) except ShouldExitError as e: return e.status_code @@ -837,12 +830,6 @@ def main(argv: Optional[List[str]] = None): meta = capa.loader.collect_metadata(argv, args.input_file, input_format, os_, args.rules, extractor, counts) meta.analysis.layout = capa.loader.compute_layout(rules, extractor, capabilities) - if isinstance(extractor, StaticFeatureExtractor) and found_file_limitation: - # bail if capa's static feature extractor encountered file limitation e.g. a packed binary - # do show the output in verbose mode, though. 
- if not (args.verbose or args.vverbose or args.json): - return E_FILE_LIMITATION - if args.json: print(capa.render.json.render(meta, rules, capabilities)) elif args.vverbose: diff --git a/capa/render/default.py b/capa/render/default.py index 2e5064740..0c3390980 100644 --- a/capa/render/default.py +++ b/capa/render/default.py @@ -18,6 +18,7 @@ from capa.render.utils import StringIO tabulate.PRESERVE_WHITESPACE = True +MIN_LIBFUNCS_COUNT = 5 def width(s: str, character_count: int) -> str: @@ -29,6 +30,15 @@ def width(s: str, character_count: int) -> str: def render_meta(doc: rd.ResultDocument, ostream: StringIO): + # check if analysis is Static analysis to inform users about + # potential false postive due to low number of library functions + if isinstance(doc.meta.analysis, rd.StaticAnalysis): + n_libs: int = len(doc.meta.analysis.library_functions) + if n_libs <= MIN_LIBFUNCS_COUNT: + ostream.write( + "Few library functions recognized by FLIRT signatures, results may contain false positives\n\n" + ) + rows = [ (width("md5", 22), width(doc.meta.sample.md5, 82)), ("sha1", doc.meta.sample.sha1), From 9a8a18eddf1c9482bc1f240e134ddafe7f9d43f0 Mon Sep 17 00:00:00 2001 From: Soufiane Fariss Date: Thu, 30 May 2024 04:13:23 +0200 Subject: [PATCH 2/6] changelog: display analysis information to user --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 99f92a8bf..6746f80fa 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,7 @@ - add function in capa/helpers to load plain and compressed JSON reports #1883 @Rohit1123 - document Antivirus warnings and VirusTotal false positive detections #2028 @RionEV @mr-tz - replace Halo spinner with Rich #2086 @s-ff +- display analysis information for user #857 @s-ff ### Breaking Changes From 2597c84ffefb36077483f56a0385e0176c97e977 Mon Sep 17 00:00:00 2001 From: Soufiane Fariss Date: Thu, 6 Jun 2024 16:32:08 +0200 Subject: [PATCH 3/6] print few library function found banner --- 
capa/capabilities/static.py | 6 ++---- capa/loader.py | 1 + capa/main.py | 1 + capa/render/default.py | 10 +++++++--- capa/render/result_document.py | 1 + 5 files changed, 12 insertions(+), 7 deletions(-) diff --git a/capa/capabilities/static.py b/capa/capabilities/static.py index 01bbd4f7d..d81988942 100644 --- a/capa/capabilities/static.py +++ b/capa/capabilities/static.py @@ -144,6 +144,7 @@ def find_static_capabilities( all_insn_matches: MatchResults = collections.defaultdict(list) feature_counts = rdoc.StaticFeatureCounts(file=0, functions=()) + n_funcs: int = 0 library_functions: Tuple[rdoc.LibraryFunction, ...] = () assert isinstance(extractor, StaticFeatureExtractor) @@ -242,9 +243,6 @@ def pbar(s, *args, **kwargs): ) ) - meta = { - "feature_counts": feature_counts, - "library_functions": library_functions, - } + meta = {"feature_counts": feature_counts, "library_functions": library_functions, "function_count": n_funcs} return matches, meta diff --git a/capa/loader.py b/capa/loader.py index 8e91fae0f..b970700bd 100644 --- a/capa/loader.py +++ b/capa/loader.py @@ -349,6 +349,7 @@ def get_sample_analysis(format_, arch, os_, extractor, rules_path, counts): # "functions": { 0x401000: { "matched_basic_blocks": [ 0x401000, 0x401005, ... ] }, ... 
} ), feature_counts=counts["feature_counts"], + function_count=counts["function_count"], library_functions=counts["library_functions"], ) elif isinstance(extractor, DynamicFeatureExtractor): diff --git a/capa/main.py b/capa/main.py index 8d2beca7c..6a22d6aa2 100644 --- a/capa/main.py +++ b/capa/main.py @@ -876,6 +876,7 @@ def ida_main(): capabilities, counts = find_capabilities(rules, capa.features.extractors.ida.extractor.IdaFeatureExtractor()) meta.analysis.feature_counts = counts["feature_counts"] + meta.analysis.function_count = counts["function_count"] meta.analysis.library_functions = counts["library_functions"] if has_file_limitation(rules, capabilities, is_standalone=False): diff --git a/capa/render/default.py b/capa/render/default.py index 0c3390980..16cfafc76 100644 --- a/capa/render/default.py +++ b/capa/render/default.py @@ -18,7 +18,7 @@ from capa.render.utils import StringIO tabulate.PRESERVE_WHITESPACE = True -MIN_LIBFUNCS_COUNT = 5 +MIN_LIB_FUNCS_PERCENTAGE = 30 def width(s: str, character_count: int) -> str: @@ -34,9 +34,13 @@ def render_meta(doc: rd.ResultDocument, ostream: StringIO): # potential false postive due to low number of library functions if isinstance(doc.meta.analysis, rd.StaticAnalysis): n_libs: int = len(doc.meta.analysis.library_functions) - if n_libs <= MIN_LIBFUNCS_COUNT: + n_funcs: int = doc.meta.analysis.function_count + lib_percentage = round(100 * (n_libs / n_funcs), 2) + if lib_percentage <= MIN_LIB_FUNCS_PERCENTAGE: ostream.write( - "Few library functions recognized by FLIRT signatures, results may contain false positives\n\n" + rutils.warn( + f"Few library functions (%{lib_percentage} of all functions) recognized by FLIRT signatures, results may contain false positives\n\n", + ) ) rows = [ diff --git a/capa/render/result_document.py b/capa/render/result_document.py index d2de49d73..595c6e4e6 100644 --- a/capa/render/result_document.py +++ b/capa/render/result_document.py @@ -114,6 +114,7 @@ class StaticAnalysis(Model): 
layout: StaticLayout feature_counts: StaticFeatureCounts library_functions: Tuple[LibraryFunction, ...] + function_count: int class DynamicAnalysis(Model): From e8ccd2a8b41704816cb880d6ba6d0040af10c284 Mon Sep 17 00:00:00 2001 From: Soufiane Fariss Date: Thu, 6 Jun 2024 23:29:37 +0200 Subject: [PATCH 4/6] display analysis information to users - Inform users if few library functions were recognized by FLIRT signatures (this should be around ~40-50% of all functions) - Inform users if there are very few API calls(<10), this could indicate that it is packed, corrupted or tiny --- capa/capabilities/static.py | 44 ++++++++++++++++++++++++++++++---- capa/loader.py | 1 - capa/main.py | 1 - capa/render/default.py | 14 ----------- capa/render/result_document.py | 1 - 5 files changed, 40 insertions(+), 21 deletions(-) diff --git a/capa/capabilities/static.py b/capa/capabilities/static.py index d81988942..53d86c826 100644 --- a/capa/capabilities/static.py +++ b/capa/capabilities/static.py @@ -16,6 +16,7 @@ import tqdm.contrib.logging import capa.perf +import capa.render.utils as rutils import capa.features.freeze as frz import capa.render.result_document as rdoc from capa.rules import Scope, RuleSet @@ -27,6 +28,9 @@ logger = logging.getLogger(__name__) +MIN_LIB_FUNCS_RATIO = 0.4 +MIN_API_CALLS = 10 + def find_instruction_capabilities( ruleset: RuleSet, extractor: StaticFeatureExtractor, f: FunctionHandle, bb: BBHandle, insn: InsnHandle @@ -97,7 +101,7 @@ def find_basic_block_capabilities( def find_code_capabilities( ruleset: RuleSet, extractor: StaticFeatureExtractor, fh: FunctionHandle -) -> Tuple[MatchResults, MatchResults, MatchResults, int]: +) -> Tuple[MatchResults, MatchResults, MatchResults, FeatureSet]: """ find matches for the given rules within the given function. 
@@ -133,7 +137,7 @@ def find_code_capabilities( function_features[feature].add(va) _, function_matches = ruleset.match(Scope.FUNCTION, function_features, fh.address) - return function_matches, bb_matches, insn_matches, len(function_features) + return function_matches, bb_matches, insn_matches, function_features def find_static_capabilities( @@ -146,6 +150,7 @@ def find_static_capabilities( feature_counts = rdoc.StaticFeatureCounts(file=0, functions=()) n_funcs: int = 0 library_functions: Tuple[rdoc.LibraryFunction, ...] = () + api_calls: int = 0 assert isinstance(extractor, StaticFeatureExtractor) with redirecting_print_to_tqdm(disable_progress): @@ -185,12 +190,24 @@ def pbar(s, *args, **kwargs): pb.set_postfix_str(f"skipped {n_libs} library functions ({percentage}%)") continue - function_matches, bb_matches, insn_matches, feature_count = find_code_capabilities( + function_matches, bb_matches, insn_matches, function_features = find_code_capabilities( ruleset, extractor, f ) + feature_count = len(function_features) feature_counts.functions += ( rdoc.FunctionFeatureCount(address=frz.Address.from_capa(f.address), count=feature_count), ) + + # for each function, count the number of API features, + # and add it cumulatively to the total count of API calls made + call_addresses = { + addr + for feature, addresses in function_features.items() + if isinstance(feature, API) + for addr in addresses + } + api_calls += len(call_addresses) + t1 = time.time() match_count = 0 @@ -218,6 +235,25 @@ def pbar(s, *args, **kwargs): for rule_name, res in insn_matches.items(): all_insn_matches[rule_name].extend(res) + # inform users if few library functions are recognized + # via FLIRT signatures, results may contain false positives + # from library code + if n_funcs: + lib_ratio = len(library_functions) / n_funcs + if lib_ratio < MIN_LIB_FUNCS_RATIO: + print( + rutils.warn( + "Few library functions (%.2f%% of all functions) recognized by FLIRT signatures, results may contain false
positives" + % (lib_ratio * 100) + ) + ) + + if api_calls < MIN_API_CALLS: + print( + rutils.warn( + "The analyzed sample reports very few API calls, this could indicate that it is packed, corrupted, or tiny" + ) + ) # collection of features that captures the rule matches within function, BB, and instruction scopes. # mapping from feature (matched rule) to set of addresses at which it matched. function_and_lower_features: FeatureSet = collections.defaultdict(set) @@ -243,6 +279,6 @@ def pbar(s, *args, **kwargs): ) ) - meta = {"feature_counts": feature_counts, "library_functions": library_functions, "function_count": n_funcs} + meta = {"feature_counts": feature_counts, "library_functions": library_functions} return matches, meta diff --git a/capa/loader.py b/capa/loader.py index b970700bd..8e91fae0f 100644 --- a/capa/loader.py +++ b/capa/loader.py @@ -349,7 +349,6 @@ def get_sample_analysis(format_, arch, os_, extractor, rules_path, counts): # "functions": { 0x401000: { "matched_basic_blocks": [ 0x401000, 0x401005, ... ] }, ... 
} ), feature_counts=counts["feature_counts"], - function_count=counts["function_count"], library_functions=counts["library_functions"], ) elif isinstance(extractor, DynamicFeatureExtractor): diff --git a/capa/main.py b/capa/main.py index 6a22d6aa2..8d2beca7c 100644 --- a/capa/main.py +++ b/capa/main.py @@ -876,7 +876,6 @@ def ida_main(): capabilities, counts = find_capabilities(rules, capa.features.extractors.ida.extractor.IdaFeatureExtractor()) meta.analysis.feature_counts = counts["feature_counts"] - meta.analysis.function_count = counts["function_count"] meta.analysis.library_functions = counts["library_functions"] if has_file_limitation(rules, capabilities, is_standalone=False): diff --git a/capa/render/default.py b/capa/render/default.py index 16cfafc76..2e5064740 100644 --- a/capa/render/default.py +++ b/capa/render/default.py @@ -18,7 +18,6 @@ from capa.render.utils import StringIO tabulate.PRESERVE_WHITESPACE = True -MIN_LIB_FUNCS_PERCENTAGE = 30 def width(s: str, character_count: int) -> str: @@ -30,19 +29,6 @@ def width(s: str, character_count: int) -> str: def render_meta(doc: rd.ResultDocument, ostream: StringIO): - # check if analysis is Static analysis to inform users about - # potential false postive due to low number of library functions - if isinstance(doc.meta.analysis, rd.StaticAnalysis): - n_libs: int = len(doc.meta.analysis.library_functions) - n_funcs: int = doc.meta.analysis.function_count - lib_percentage = round(100 * (n_libs / n_funcs), 2) - if lib_percentage <= MIN_LIB_FUNCS_PERCENTAGE: - ostream.write( - rutils.warn( - f"Few library functions (%{lib_percentage} of all functions) recognized by FLIRT signatures, results may contain false positives\n\n", - ) - ) - rows = [ (width("md5", 22), width(doc.meta.sample.md5, 82)), ("sha1", doc.meta.sample.sha1), diff --git a/capa/render/result_document.py b/capa/render/result_document.py index 595c6e4e6..d2de49d73 100644 --- a/capa/render/result_document.py +++ b/capa/render/result_document.py @@ 
-114,7 +114,6 @@ class StaticAnalysis(Model): layout: StaticLayout feature_counts: StaticFeatureCounts library_functions: Tuple[LibraryFunction, ...] - function_count: int class DynamicAnalysis(Model): From a5e5167e2e4fef8c765afa9aef4240191e3df449 Mon Sep 17 00:00:00 2001 From: Soufiane Fariss Date: Thu, 6 Jun 2024 23:47:39 +0200 Subject: [PATCH 5/6] code style: do not use % --- capa/capabilities/static.py | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) diff --git a/capa/capabilities/static.py b/capa/capabilities/static.py index 53d86c826..84da874c9 100644 --- a/capa/capabilities/static.py +++ b/capa/capabilities/static.py @@ -16,7 +16,6 @@ import tqdm.contrib.logging import capa.perf -import capa.render.utils as rutils import capa.features.freeze as frz import capa.render.result_document as rdoc from capa.rules import Scope, RuleSet @@ -235,24 +234,17 @@ def pbar(s, *args, **kwargs): for rule_name, res in insn_matches.items(): all_insn_matches[rule_name].extend(res) - # inform users if few library functions are recognized - # via FLIRT signatures, results may contain false positives - # from library code if n_funcs: lib_ratio = len(library_functions) / n_funcs if lib_ratio < MIN_LIB_FUNCS_RATIO: - print( - rutils.warn( - "Few library functions (%.2f%% of all functions) recognized by FLIRT signatures, results may contain false positives" - % (lib_ratio * 100) - ) + logger.info( + "Few library functions (%.2f%% of all functions) recognized by FLIRT signatures, results may contain false positives", + lib_ratio * 100, ) if api_calls < MIN_API_CALLS: - print( - rutils.warn( - "The analyzed sample reports very few API calls, this could indicate that it is packed, corrupted, or tiny" - ) + logger.info( + "The analyzed sample reports very few API calls, this could indicate that it is packed, corrupted, or tiny" ) # collection of features that captures the rule matches within function, BB, and instruction scopes. 
# mapping from feature (matched rule) to set of addresses at which it matched. From ae226e626de118b7152d862b8f46fbdaa77c62bf Mon Sep 17 00:00:00 2001 From: Soufiane Fariss Date: Fri, 7 Jun 2024 00:00:05 +0200 Subject: [PATCH 6/6] start using todos --- capa/capabilities/common.py | 2 ++ capa/capabilities/static.py | 3 --- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/capa/capabilities/common.py b/capa/capabilities/common.py index d71d2a12f..f3833096f 100644 --- a/capa/capabilities/common.py +++ b/capa/capabilities/common.py @@ -51,6 +51,8 @@ def has_file_limitation(rules: RuleSet, capabilities: MatchResults, is_standalon for line in file_limitation_rule.meta.get("description", "").split("\n"): logger.warning(" %s", line) logger.warning(" Identified via rule: %s", file_limitation_rule.name) + # TODO(s-ff): remove is_standalone flag as it is no longer needed + # #2111 if is_standalone: pass logger.warning("-" * 80) diff --git a/capa/capabilities/static.py b/capa/capabilities/static.py index 84da874c9..73dcb7074 100644 --- a/capa/capabilities/static.py +++ b/capa/capabilities/static.py @@ -122,9 +122,6 @@ def find_code_capabilities( features, bmatches, imatches = find_basic_block_capabilities(ruleset, extractor, fh, bb) for feature, vas in features.items(): function_features[feature].update(vas) - if isinstance(feature, API): - # delcare a global variable (a set) and append to it here? - pass for rule_name, res in bmatches.items(): bb_matches[rule_name].extend(res)