From bbd3f70a36c63082d01db796bb696ef441942220 Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Mon, 27 Jun 2022 11:29:21 -0400 Subject: [PATCH 01/51] Added initial capa control flow for scripts in C#. --- capa/features/common.py | 1 + capa/features/extractors/scripts.py | 20 ++++++++++ capa/features/extractors/ts/__init__.py | 0 capa/features/extractors/ts/extractor.py | 47 ++++++++++++++++++++++++ capa/helpers.py | 9 ++++- capa/main.py | 38 ++++++++++++++++++- 6 files changed, 112 insertions(+), 3 deletions(-) create mode 100644 capa/features/extractors/scripts.py create mode 100644 capa/features/extractors/ts/__init__.py create mode 100644 capa/features/extractors/ts/extractor.py diff --git a/capa/features/common.py b/capa/features/common.py index 30a4c0b25..154328bcb 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -414,6 +414,7 @@ def __init__(self, value: str, description=None): FORMAT_SC32 = "sc32" FORMAT_SC64 = "sc64" FORMAT_FREEZE = "freeze" +FORMAT_CS = "script_cs" FORMAT_UNKNOWN = "unknown" diff --git a/capa/features/extractors/scripts.py b/capa/features/extractors/scripts.py new file mode 100644 index 000000000..e37e1daad --- /dev/null +++ b/capa/features/extractors/scripts.py @@ -0,0 +1,20 @@ +from typing import Tuple, Iterator + +from capa.features.common import OS, OS_ANY, ARCH_ANY, FORMAT_CS, Arch, Feature +from capa.features.address import NO_ADDRESS, Address + +LANG_CS = "c_sharp" + + +def extract_arch() -> Iterator[Tuple[Feature, Address]]: + yield Arch(ARCH_ANY), NO_ADDRESS + + +def extract_os() -> Iterator[Tuple[Feature, Address]]: + yield OS(OS_ANY), NO_ADDRESS + + +def get_language_from_format(format_: str) -> str: + if format_ == FORMAT_CS: + return LANG_CS + return "unknown" diff --git a/capa/features/extractors/ts/__init__.py b/capa/features/extractors/ts/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/capa/features/extractors/ts/extractor.py b/capa/features/extractors/ts/extractor.py new file 
mode 100644 index 000000000..139f45045 --- /dev/null +++ b/capa/features/extractors/ts/extractor.py @@ -0,0 +1,47 @@ +from typing import Tuple, Union, Iterator + +import capa.features.extractors.scripts +from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress +from capa.features.extractors.base_extractor import Feature, BBHandle, InsnHandle, FunctionHandle, FeatureExtractor + + +class TreeSitterFeatureExtractor(FeatureExtractor): + def __init__(self, path: str, format_: str): + super().__init__() + self.path = path + self.languages = [capa.features.extractors.scripts.get_language_from_format(format_)] + + def get_base_address(self) -> Union[AbsoluteVirtualAddress, capa.features.address._NoAddress]: + return NO_ADDRESS + + def extract_global_features(self): + raise NotImplementedError("not implemented") + + def extract_file_features(self) -> Iterator[Tuple[Feature, Address]]: + raise NotImplementedError("not implemented") + + def get_functions(self) -> Iterator[FunctionHandle]: + raise NotImplementedError("not implemented") + + def extract_function_features(self, f: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: + raise NotImplementedError("not implemented") + + def get_basic_blocks(self, f: FunctionHandle) -> Iterator[BBHandle]: + raise NotImplementedError("not implemented") + + def extract_basic_block_features(self, f: FunctionHandle, bb: BBHandle) -> Iterator[Tuple[Feature, Address]]: + raise NotImplementedError("not implemented") + + def get_instructions(self, f: FunctionHandle, bb: BBHandle): + raise NotImplementedError("not implemented") + + def extract_insn_features( + self, f: FunctionHandle, bb: BBHandle, insn: InsnHandle + ) -> Iterator[Tuple[Feature, Address]]: + raise NotImplementedError("not implemented") + + def is_library_function(self, addr: Address) -> bool: + raise NotImplementedError("not implemented") + + def get_function_name(self, addr: Address) -> str: + raise NotImplementedError("not implemented") diff --git 
a/capa/helpers.py b/capa/helpers.py index 9c4c285e8..9f96de632 100644 --- a/capa/helpers.py +++ b/capa/helpers.py @@ -10,11 +10,16 @@ from typing import NoReturn from capa.exceptions import UnsupportedFormatError -from capa.features.common import FORMAT_SC32, FORMAT_SC64, FORMAT_UNKNOWN +from capa.features.common import FORMAT_CS, FORMAT_SC32, FORMAT_SC64, FORMAT_UNKNOWN EXTENSIONS_SHELLCODE_32 = ("sc32", "raw32") EXTENSIONS_SHELLCODE_64 = ("sc64", "raw64") +<<<<<<< HEAD EXTENSIONS_ELF = "elf_" +======= +EXTENSION_CS = "cs" + +>>>>>>> Added initial capa control flow for scripts in C#. logger = logging.getLogger("capa") @@ -51,6 +56,8 @@ def get_format_from_extension(sample: str) -> str: return FORMAT_SC32 elif sample.endswith(EXTENSIONS_SHELLCODE_64): return FORMAT_SC64 + elif sample.endswith(EXTENSION_CS): + return FORMAT_CS return FORMAT_UNKNOWN diff --git a/capa/main.py b/capa/main.py index d53221ad9..7179bfb26 100644 --- a/capa/main.py +++ b/capa/main.py @@ -43,6 +43,7 @@ import capa.features.extractors.pefile import capa.features.extractors.dnfile_ import capa.features.extractors.elffile +import capa.features.extractors.scripts import capa.features.extractors.dotnetfile import capa.features.extractors.base_extractor from capa.rules import Rule, Scope, RuleSet @@ -344,6 +345,13 @@ def has_file_limitation(rules: RuleSet, capabilities: MatchResults, is_standalon return False +def is_supported_script(format_: str): + """ + If the script format was recognized, then it is supported. 
+ """ + return format_.startswith("script") + + def is_supported_format(sample: str) -> bool: """ Return if this is a supported file based on magic header values @@ -372,6 +380,14 @@ def get_arch(sample: str) -> str: return "unknown" +def get_script_arch() -> str: + for feature, _ in capa.features.extractors.scripts.extract_arch(): + assert isinstance(feature.value, str) + return feature.value + + return "unknown" + + def is_supported_os(sample: str) -> bool: with open(sample, "rb") as f: buf = f.read() @@ -390,6 +406,14 @@ def get_os(sample: str) -> str: return "unknown" +def get_script_os() -> str: + for feature, _ in capa.features.extractors.scripts.extract_os(): + assert isinstance(feature.value, str) + return feature.value + + return "unknown" + + def get_meta_str(vw): """ Return workspace meta information string @@ -497,6 +521,11 @@ def get_extractor( UnsupportedArchError UnsupportedOSError """ + if is_supported_script(format_): + import capa.features.extractors.ts.extractor + + return capa.features.extractors.ts.extractor.TreeSitterFeatureExtractor(path, format_) + if format_ not in (FORMAT_SC32, FORMAT_SC64): if not is_supported_format(path): raise UnsupportedFormatError() @@ -675,8 +704,13 @@ def collect_metadata( rules_path = [os.path.abspath(os.path.normpath(r)) for r in rules_path] format_ = get_format(sample_path) - arch = get_arch(sample_path) - os_ = get_os(sample_path) + + if is_supported_script(format_): + arch = get_script_arch() + os_ = get_script_os() + else: + arch = get_arch(sample_path) + os_ = get_os(sample_path) return { "timestamp": datetime.datetime.now().isoformat(), From 8173397b2503e69cc0b4145ebc3b90044f21ec3c Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Mon, 27 Jun 2022 16:32:48 -0400 Subject: [PATCH 02/51] Implemented some further basic TreeSitter Extractor-related concepts such as byte-range address. 
--- capa/features/address.py | 20 +++++++++++++++ capa/features/common.py | 6 +++++ capa/features/extractors/scripts.py | 8 ++++-- capa/features/extractors/ts/extractor.py | 32 ++++++++++++++++-------- 4 files changed, 53 insertions(+), 13 deletions(-) diff --git a/capa/features/address.py b/capa/features/address.py index 2033c24ef..413845f1f 100644 --- a/capa/features/address.py +++ b/capa/features/address.py @@ -53,6 +53,26 @@ def __repr__(self): return f"file(0x{self:x})" +class FileOffsetRangeAddress(Address): + """an address range relative to the start of a file""" + + def __init__(self, start_byte, end_byte): + self.start_byte = start_byte + self.end_byte = end_byte + + def __eq__(self, other): + return (self.start_byte, self.end_byte) == (self.end_byte, other.end_byte) + + def __lt__(self, other): + return (self.start_byte, self.end_byte) < (other.start_byte, other.end_byte) + + def __hash__(self): + return hash((self.start_byte, self.end_byte)) + + def __repr__(self): + return f"file(0x{self.start_byte:x}, 0x{self.end_byte:x})" + + class DNTokenAddress(Address): """a .NET token""" diff --git a/capa/features/common.py b/capa/features/common.py index 154328bcb..3baeef7e6 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -405,6 +405,12 @@ def __init__(self, value: str, description=None): self.name = "os" +class ScriptLanguage(Feature): + def __init__(self, value: str, description=None): + super().__init__(value, description=description) + self.name = "script language" + + FORMAT_PE = "pe" FORMAT_ELF = "elf" FORMAT_DOTNET = "dotnet" diff --git a/capa/features/extractors/scripts.py b/capa/features/extractors/scripts.py index e37e1daad..bc5f08b25 100644 --- a/capa/features/extractors/scripts.py +++ b/capa/features/extractors/scripts.py @@ -1,7 +1,7 @@ from typing import Tuple, Iterator -from capa.features.common import OS, OS_ANY, ARCH_ANY, FORMAT_CS, Arch, Feature -from capa.features.address import NO_ADDRESS, Address +from 
capa.features.common import OS, OS_ANY, ARCH_ANY, FORMAT_CS, Arch, Feature, ScriptLanguage +from capa.features.address import NO_ADDRESS, Address, FileOffsetRangeAddress LANG_CS = "c_sharp" @@ -10,6 +10,10 @@ def extract_arch() -> Iterator[Tuple[Feature, Address]]: yield Arch(ARCH_ANY), NO_ADDRESS +def extract_language(language: str, addr: FileOffsetRangeAddress) -> Iterator[Tuple[Feature, Address]]: + yield ScriptLanguage(language), addr + + def extract_os() -> Iterator[Tuple[Feature, Address]]: yield OS(OS_ANY), NO_ADDRESS diff --git a/capa/features/extractors/ts/extractor.py b/capa/features/extractors/ts/extractor.py index 139f45045..16ce25fb9 100644 --- a/capa/features/extractors/ts/extractor.py +++ b/capa/features/extractors/ts/extractor.py @@ -1,7 +1,7 @@ -from typing import Tuple, Union, Iterator +from typing import List, Tuple, Union, Iterator import capa.features.extractors.scripts -from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress +from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress, FileOffsetRangeAddress from capa.features.extractors.base_extractor import Feature, BBHandle, InsnHandle, FunctionHandle, FeatureExtractor @@ -9,13 +9,23 @@ class TreeSitterFeatureExtractor(FeatureExtractor): def __init__(self, path: str, format_: str): super().__init__() self.path = path - self.languages = [capa.features.extractors.scripts.get_language_from_format(format_)] + self.language = capa.features.extractors.scripts.get_language_from_format(format_) + with open(self.path, "rb") as f: + self.buf = f.read() + + # pre-compute these because we'll yield them at *every* scope. 
+ self.global_features: List[Tuple[Feature, Address]] = [] + self.global_features.extend( + capa.features.extractors.scripts.extract_language(self.language, FileOffsetRangeAddress(0, len(self.buf))) + ) + self.global_features.extend(capa.features.extractors.scripts.extract_os()) + self.global_features.extend(capa.features.extractors.scripts.extract_arch()) def get_base_address(self) -> Union[AbsoluteVirtualAddress, capa.features.address._NoAddress]: return NO_ADDRESS def extract_global_features(self): - raise NotImplementedError("not implemented") + yield from self.global_features def extract_file_features(self) -> Iterator[Tuple[Feature, Address]]: raise NotImplementedError("not implemented") @@ -27,21 +37,21 @@ def extract_function_features(self, f: FunctionHandle) -> Iterator[Tuple[Feature raise NotImplementedError("not implemented") def get_basic_blocks(self, f: FunctionHandle) -> Iterator[BBHandle]: - raise NotImplementedError("not implemented") + yield from [] def extract_basic_block_features(self, f: FunctionHandle, bb: BBHandle) -> Iterator[Tuple[Feature, Address]]: raise NotImplementedError("not implemented") - def get_instructions(self, f: FunctionHandle, bb: BBHandle): - raise NotImplementedError("not implemented") + def get_instructions(self, f: FunctionHandle, bb: BBHandle) -> Iterator[InsnHandle]: + yield from [] def extract_insn_features( self, f: FunctionHandle, bb: BBHandle, insn: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: raise NotImplementedError("not implemented") - def is_library_function(self, addr: Address) -> bool: - raise NotImplementedError("not implemented") + def is_library_function(self, addr) -> bool: + return False - def get_function_name(self, addr: Address) -> str: - raise NotImplementedError("not implemented") + def get_function_name(self, addr) -> str: + return self.buf[addr.start_byte : addr.end_byte].decode() From 428f6bc98c73efa39d39446f93dc87a15aa2cef7 Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Tue, 28 Jun 2022 
09:25:33 -0400 Subject: [PATCH 03/51] Modified mypy config file to ignore tree-sitter's missing exports. --- .github/mypy/mypy.ini | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/mypy/mypy.ini b/.github/mypy/mypy.ini index 3d22b05f7..9add4ef5c 100644 --- a/.github/mypy/mypy.ini +++ b/.github/mypy/mypy.ini @@ -76,4 +76,7 @@ ignore_missing_imports = True ignore_missing_imports = True [mypy-dncil.*] +ignore_missing_imports = True + +[mypy-tree_sitter.*] ignore_missing_imports = True \ No newline at end of file From a6d7ba25ac198a446eecf3b30b5b243dedfbac73 Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Tue, 28 Jun 2022 09:29:08 -0400 Subject: [PATCH 04/51] Implemented core tree sitter engine component with C# queries that serves as an interface to the language-specific tree-sitter queries. --- capa/features/extractors/ts/build.py | 4 ++ capa/features/extractors/ts/engine.py | 58 ++++++++++++++++++++++++ capa/features/extractors/ts/extractor.py | 6 ++- capa/features/extractors/ts/query.py | 35 ++++++++++++++ 4 files changed, 102 insertions(+), 1 deletion(-) create mode 100644 capa/features/extractors/ts/build.py create mode 100644 capa/features/extractors/ts/engine.py create mode 100644 capa/features/extractors/ts/query.py diff --git a/capa/features/extractors/ts/build.py b/capa/features/extractors/ts/build.py new file mode 100644 index 000000000..4d1b13184 --- /dev/null +++ b/capa/features/extractors/ts/build.py @@ -0,0 +1,4 @@ +build_dir = "build/my-languages.so" +languages = [ + "vendor/tree-sitter-c-sharp", +] diff --git a/capa/features/extractors/ts/engine.py b/capa/features/extractors/ts/engine.py new file mode 100644 index 000000000..03c7db429 --- /dev/null +++ b/capa/features/extractors/ts/engine.py @@ -0,0 +1,58 @@ +from tree_sitter import Node, Tree, Parser + +import capa.features.extractors.ts.query + + +class TreeSitterExtractorEngine: + def __init__(self, language): + self.query = capa.features.extractors.ts.query.QueryBinding(language) + + 
def get_ts_language(self): + return self.query.language + + def parse(self, source: bytes) -> Tree: + parser = Parser() + parser.set_language(self.get_ts_language()) + return parser.parse(source) + + def get_new_objects(self, node: Node): + return self.query.new_object.captures(node) + + def get_object_id(self, node: Node): + return node.child_by_field_name(self.query.new_object_field_name) + + def get_functions(self, node: Node): + return self.query.function_def.captures(node) + + def get_function_definition_id(self, node: Node): + return node.child_by_field_name(self.query.function_def_field_name) + + def get_function_calls(self, node: Node): + return self.query.function_call.captures(node) + + def get_function_call_id(self, node: Node): + return node.child_by_field_name(self.query.function_call_field_name) + + def extract_string_literals(self, node: Node): + return self.query.string_literal.captures(node) + + def extract_integer_literals(self, node: Node): + return self.query.integer_literal.captures(node) + + def extract_namespaces(self, node: Node): + return self.query.namespace.captures(node) + + def extract_blocks(self, node: Node): + return self.query.block.captures(node) + + def extract_node(self, tree, tgt_node_name): + cursor = tree.walk() + while True: + if cursor.node.type == tgt_node_name: + yield cursor.node + if cursor.goto_first_child() or cursor.goto_next_sibling(): + continue + while cursor.goto_parent() and not cursor.goto_next_sibling(): + continue + if cursor.node.parent is None: + break diff --git a/capa/features/extractors/ts/extractor.py b/capa/features/extractors/ts/extractor.py index 16ce25fb9..d006f953b 100644 --- a/capa/features/extractors/ts/extractor.py +++ b/capa/features/extractors/ts/extractor.py @@ -1,6 +1,7 @@ from typing import List, Tuple, Union, Iterator import capa.features.extractors.scripts +import capa.features.extractors.ts.engine from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress, 
FileOffsetRangeAddress from capa.features.extractors.base_extractor import Feature, BBHandle, InsnHandle, FunctionHandle, FeatureExtractor @@ -12,6 +13,8 @@ def __init__(self, path: str, format_: str): self.language = capa.features.extractors.scripts.get_language_from_format(format_) with open(self.path, "rb") as f: self.buf = f.read() + self.engine = capa.features.extractors.ts.engine.TreeSitterExtractorEngine(self.language) + self.tree = self.engine.parse(self.buf) # pre-compute these because we'll yield them at *every* scope. self.global_features: List[Tuple[Feature, Address]] = [] @@ -31,7 +34,8 @@ def extract_file_features(self) -> Iterator[Tuple[Feature, Address]]: raise NotImplementedError("not implemented") def get_functions(self) -> Iterator[FunctionHandle]: - raise NotImplementedError("not implemented") + for node, _ in self.engine.get_functions(self.tree): + yield FunctionHandle(address=FileOffsetRangeAddress(node.start_byte, node.end_byte), inner=node) def extract_function_features(self, f: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: raise NotImplementedError("not implemented") diff --git a/capa/features/extractors/ts/query.py b/capa/features/extractors/ts/query.py new file mode 100644 index 000000000..d7bdd5e74 --- /dev/null +++ b/capa/features/extractors/ts/query.py @@ -0,0 +1,35 @@ +from dataclasses import dataclass + +from tree_sitter import Language +from tree_sitter.binding import Query + +import capa.features.extractors.ts.build + + +@dataclass +class QueryBinding: + language: Language + new_object: Query + new_object_field_name: str + function_def: Query + function_def_field_name: str + function_call: Query + function_call_field_name: str + string_literal: Query + integer_literal: Query + namespace: Query + + def __init__(self, language: str): + self.language = Language(capa.features.extractors.ts.build.build_dir, language) + if language == "c_sharp": + self.new_object = self.language.query("(object_creation_expression) @object.new") + 
self.new_object_field_name = "type" + self.function_def = self.language.query("(local_function_statement) @function.def") + self.function_def_field_name = "name" + self.function_call = self.language.query("(invocation_expression) @function.call") + self.function_call_field_name = "function" + self.string_literal = self.language.query("(string_literal) @string-literal") + self.integer_literal = self.language.query("(integer_literal) @integer-literal") + self.namespace = self.language.query("(using_directive\n\t(qualified_name) @namespace)") + else: + raise NotImplementedError(f"Tree-sitter queries for {language} are not implemented.") From 80bf78bfbb2d4dd55939527d3a29c2c9eb755ebb Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Tue, 28 Jun 2022 09:59:27 -0400 Subject: [PATCH 05/51] Implemented script global extraction handlers (mostly wrapping existing language-independent extractors). --- .../extractors/{scripts.py => script.py} | 0 capa/features/extractors/ts/extractor.py | 22 +++++------- capa/features/extractors/ts/global_.py | 34 +++++++++++++++++++ capa/main.py | 6 ++-- 4 files changed, 45 insertions(+), 17 deletions(-) rename capa/features/extractors/{scripts.py => script.py} (100%) create mode 100644 capa/features/extractors/ts/global_.py diff --git a/capa/features/extractors/scripts.py b/capa/features/extractors/script.py similarity index 100% rename from capa/features/extractors/scripts.py rename to capa/features/extractors/script.py diff --git a/capa/features/extractors/ts/extractor.py b/capa/features/extractors/ts/extractor.py index d006f953b..81fb600db 100644 --- a/capa/features/extractors/ts/extractor.py +++ b/capa/features/extractors/ts/extractor.py @@ -1,7 +1,8 @@ from typing import List, Tuple, Union, Iterator -import capa.features.extractors.scripts +import capa.features.extractors.script import capa.features.extractors.ts.engine +import capa.features.extractors.ts.global_ from capa.features.address import NO_ADDRESS, Address, 
AbsoluteVirtualAddress, FileOffsetRangeAddress from capa.features.extractors.base_extractor import Feature, BBHandle, InsnHandle, FunctionHandle, FeatureExtractor @@ -10,28 +11,21 @@ class TreeSitterFeatureExtractor(FeatureExtractor): def __init__(self, path: str, format_: str): super().__init__() self.path = path - self.language = capa.features.extractors.scripts.get_language_from_format(format_) + self.language = capa.features.extractors.script.get_language_from_format(format_) with open(self.path, "rb") as f: self.buf = f.read() self.engine = capa.features.extractors.ts.engine.TreeSitterExtractorEngine(self.language) self.tree = self.engine.parse(self.buf) - # pre-compute these because we'll yield them at *every* scope. - self.global_features: List[Tuple[Feature, Address]] = [] - self.global_features.extend( - capa.features.extractors.scripts.extract_language(self.language, FileOffsetRangeAddress(0, len(self.buf))) - ) - self.global_features.extend(capa.features.extractors.scripts.extract_os()) - self.global_features.extend(capa.features.extractors.scripts.extract_arch()) - def get_base_address(self) -> Union[AbsoluteVirtualAddress, capa.features.address._NoAddress]: return NO_ADDRESS - def extract_global_features(self): - yield from self.global_features + def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]: + ctx = capa.features.extractors.ts.global_.GlobalScriptContext(self.language, self.tree) + yield from capa.features.extractors.ts.global_.extract_features(ctx) def extract_file_features(self) -> Iterator[Tuple[Feature, Address]]: - raise NotImplementedError("not implemented") + yield from [] def get_functions(self) -> Iterator[FunctionHandle]: for node, _ in self.engine.get_functions(self.tree): @@ -44,7 +38,7 @@ def get_basic_blocks(self, f: FunctionHandle) -> Iterator[BBHandle]: yield from [] def extract_basic_block_features(self, f: FunctionHandle, bb: BBHandle) -> Iterator[Tuple[Feature, Address]]: - raise NotImplementedError("not 
implemented") + yield from [] def get_instructions(self, f: FunctionHandle, bb: BBHandle) -> Iterator[InsnHandle]: yield from [] diff --git a/capa/features/extractors/ts/global_.py b/capa/features/extractors/ts/global_.py new file mode 100644 index 000000000..80ddb54fd --- /dev/null +++ b/capa/features/extractors/ts/global_.py @@ -0,0 +1,34 @@ +from dataclasses import dataclass + +from tree_sitter import Tree + +import capa.features.extractors.script +from capa.features.address import FileOffsetRangeAddress + + +@dataclass +class GlobalScriptContext: + language: str + tree: Tree + + +def extract_arch(ctx: GlobalScriptContext): + yield from capa.features.extractors.script.extract_arch() + + +def extract_language(ctx: GlobalScriptContext): + node = ctx.tree.root_node + addr = FileOffsetRangeAddress(node.start_byte, node.end_byte) + yield from capa.features.extractors.script.extract_language(ctx.language, addr) + + +def extract_os(ctx: GlobalScriptContext): + yield from capa.features.extractors.script.extract_os() + + +def extract_features(ctx: GlobalScriptContext): + for glob_handler in GLOBAL_HANDLERS: + yield glob_handler(ctx) + + +GLOBAL_HANDLERS = (extract_arch, extract_os, extract_language) diff --git a/capa/main.py b/capa/main.py index 7179bfb26..730778f3a 100644 --- a/capa/main.py +++ b/capa/main.py @@ -41,9 +41,9 @@ import capa.features.extractors import capa.features.extractors.common import capa.features.extractors.pefile +import capa.features.extractors.script import capa.features.extractors.dnfile_ import capa.features.extractors.elffile -import capa.features.extractors.scripts import capa.features.extractors.dotnetfile import capa.features.extractors.base_extractor from capa.rules import Rule, Scope, RuleSet @@ -381,7 +381,7 @@ def get_arch(sample: str) -> str: def get_script_arch() -> str: - for feature, _ in capa.features.extractors.scripts.extract_arch(): + for feature, _ in capa.features.extractors.script.extract_arch(): assert 
isinstance(feature.value, str) return feature.value @@ -407,7 +407,7 @@ def get_os(sample: str) -> str: def get_script_os() -> str: - for feature, _ in capa.features.extractors.scripts.extract_os(): + for feature, _ in capa.features.extractors.script.extract_os(): assert isinstance(feature.value, str) return feature.value From cf3dc7e0c91073d645663bb3b94d36f74f827247 Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Tue, 28 Jun 2022 11:01:35 -0400 Subject: [PATCH 06/51] Reworked format parsing to align better with the rest of capa logic. --- capa/features/common.py | 2 +- capa/features/extractors/script.py | 10 ++++++---- capa/features/extractors/ts/extractor.py | 4 ++-- capa/helpers.py | 12 ++++-------- capa/main.py | 11 ++++++----- 5 files changed, 19 insertions(+), 20 deletions(-) diff --git a/capa/features/common.py b/capa/features/common.py index 3baeef7e6..4327ac1e2 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -420,7 +420,7 @@ def __init__(self, value: str, description=None): FORMAT_SC32 = "sc32" FORMAT_SC64 = "sc64" FORMAT_FREEZE = "freeze" -FORMAT_CS = "script_cs" +FORMAT_SCRIPT = "script" FORMAT_UNKNOWN = "unknown" diff --git a/capa/features/extractors/script.py b/capa/features/extractors/script.py index bc5f08b25..e2842ae2e 100644 --- a/capa/features/extractors/script.py +++ b/capa/features/extractors/script.py @@ -1,6 +1,7 @@ +import os from typing import Tuple, Iterator -from capa.features.common import OS, OS_ANY, ARCH_ANY, FORMAT_CS, Arch, Feature, ScriptLanguage +from capa.features.common import OS, OS_ANY, ARCH_ANY, Arch, Feature, ScriptLanguage from capa.features.address import NO_ADDRESS, Address, FileOffsetRangeAddress LANG_CS = "c_sharp" @@ -18,7 +19,8 @@ def extract_os() -> Iterator[Tuple[Feature, Address]]: yield OS(OS_ANY), NO_ADDRESS -def get_language_from_format(format_: str) -> str: - if format_ == FORMAT_CS: +def get_language_from_ext(path: str): + _, ext = os.path.splitext(path) + if ext == ".cs": return LANG_CS - 
return "unknown" + raise ValueError("{path} has an unrecognized or an unsupported extension.") diff --git a/capa/features/extractors/ts/extractor.py b/capa/features/extractors/ts/extractor.py index 81fb600db..88f041fd8 100644 --- a/capa/features/extractors/ts/extractor.py +++ b/capa/features/extractors/ts/extractor.py @@ -8,10 +8,10 @@ class TreeSitterFeatureExtractor(FeatureExtractor): - def __init__(self, path: str, format_: str): + def __init__(self, path: str): super().__init__() self.path = path - self.language = capa.features.extractors.script.get_language_from_format(format_) + self.language = capa.features.extractors.script.get_language_from_ext(path) with open(self.path, "rb") as f: self.buf = f.read() self.engine = capa.features.extractors.ts.engine.TreeSitterExtractorEngine(self.language) diff --git a/capa/helpers.py b/capa/helpers.py index 9f96de632..d9907cc58 100644 --- a/capa/helpers.py +++ b/capa/helpers.py @@ -10,16 +10,12 @@ from typing import NoReturn from capa.exceptions import UnsupportedFormatError -from capa.features.common import FORMAT_CS, FORMAT_SC32, FORMAT_SC64, FORMAT_UNKNOWN +from capa.features.common import FORMAT_SC32, FORMAT_SC64, FORMAT_SCRIPT, FORMAT_UNKNOWN EXTENSIONS_SHELLCODE_32 = ("sc32", "raw32") EXTENSIONS_SHELLCODE_64 = ("sc64", "raw64") -<<<<<<< HEAD EXTENSIONS_ELF = "elf_" -======= -EXTENSION_CS = "cs" - ->>>>>>> Added initial capa control flow for scripts in C#. 
+EXTENSIONS_SUPPORTED_SCRIPTS = "cs" logger = logging.getLogger("capa") @@ -56,8 +52,8 @@ def get_format_from_extension(sample: str) -> str: return FORMAT_SC32 elif sample.endswith(EXTENSIONS_SHELLCODE_64): return FORMAT_SC64 - elif sample.endswith(EXTENSION_CS): - return FORMAT_CS + elif sample.endswith(EXTENSIONS_SUPPORTED_SCRIPTS): + return FORMAT_SCRIPT return FORMAT_UNKNOWN diff --git a/capa/main.py b/capa/main.py index 730778f3a..ba428ca13 100644 --- a/capa/main.py +++ b/capa/main.py @@ -65,6 +65,7 @@ FORMAT_SC64, FORMAT_DOTNET, FORMAT_FREEZE, + FORMAT_SCRIPT, ) from capa.features.address import NO_ADDRESS from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle, FeatureExtractor @@ -345,11 +346,11 @@ def has_file_limitation(rules: RuleSet, capabilities: MatchResults, is_standalon return False -def is_supported_script(format_: str): +def is_script_format(format_: str): """ If the script format was recognized, then it is supported. """ - return format_.startswith("script") + return format_ == FORMAT_SCRIPT def is_supported_format(sample: str) -> bool: @@ -521,10 +522,10 @@ def get_extractor( UnsupportedArchError UnsupportedOSError """ - if is_supported_script(format_): + if format_ == FORMAT_SCRIPT: import capa.features.extractors.ts.extractor - return capa.features.extractors.ts.extractor.TreeSitterFeatureExtractor(path, format_) + return capa.features.extractors.ts.extractor.TreeSitterFeatureExtractor(path) if format_ not in (FORMAT_SC32, FORMAT_SC64): if not is_supported_format(path): @@ -705,7 +706,7 @@ def collect_metadata( format_ = get_format(sample_path) - if is_supported_script(format_): + if format_ == FORMAT_SCRIPT: arch = get_script_arch() os_ = get_script_os() else: From 9d7f575562eb0255147d4ede01c1609d51728a97 Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Wed, 29 Jun 2022 16:42:06 -0400 Subject: [PATCH 07/51] Implemented a large part of the C# functionality; refactored the TreeSitterEngine class. 
--- capa/features/extractors/script.py | 8 +- capa/features/extractors/ts/engine.py | 108 +++++++++++++----- capa/features/extractors/ts/extractor.py | 28 +++-- capa/features/extractors/ts/file.py | 50 ++++++++ capa/features/extractors/ts/global_.py | 30 ++--- capa/features/extractors/ts/query.py | 3 +- capa/features/extractors/ts/sig.py | 23 ++++ .../extractors/ts/signatures/__init__.py | 0 .../features/extractors/ts/signatures/cs.json | 8 ++ 9 files changed, 189 insertions(+), 69 deletions(-) create mode 100644 capa/features/extractors/ts/file.py create mode 100644 capa/features/extractors/ts/sig.py create mode 100644 capa/features/extractors/ts/signatures/__init__.py create mode 100644 capa/features/extractors/ts/signatures/cs.json diff --git a/capa/features/extractors/script.py b/capa/features/extractors/script.py index e2842ae2e..67145aea9 100644 --- a/capa/features/extractors/script.py +++ b/capa/features/extractors/script.py @@ -1,7 +1,7 @@ import os from typing import Tuple, Iterator -from capa.features.common import OS, OS_ANY, ARCH_ANY, Arch, Feature, ScriptLanguage +from capa.features.common import OS, OS_ANY, ARCH_ANY, FORMAT_SCRIPT, Arch, Format, Feature, ScriptLanguage from capa.features.address import NO_ADDRESS, Address, FileOffsetRangeAddress LANG_CS = "c_sharp" @@ -19,8 +19,12 @@ def extract_os() -> Iterator[Tuple[Feature, Address]]: yield OS(OS_ANY), NO_ADDRESS +def extract_format() -> Iterator[Tuple[Feature, Address]]: + yield Format(FORMAT_SCRIPT), NO_ADDRESS + + def get_language_from_ext(path: str): _, ext = os.path.splitext(path) if ext == ".cs": return LANG_CS - raise ValueError("{path} has an unrecognized or an unsupported extension.") + raise ValueError(f"{path} has an unrecognized or an unsupported extension.") diff --git a/capa/features/extractors/ts/engine.py b/capa/features/extractors/ts/engine.py index 03c7db429..d06ab5011 100644 --- a/capa/features/extractors/ts/engine.py +++ b/capa/features/extractors/ts/engine.py @@ -1,58 +1,106 @@ 
+from typing import List, Tuple + from tree_sitter import Node, Tree, Parser +import capa.features.extractors.ts.sig import capa.features.extractors.ts.query +from capa.features.address import FileOffsetRangeAddress +from capa.features.extractors.ts.query import QueryBinding class TreeSitterExtractorEngine: - def __init__(self, language): + query: QueryBinding + path: str + buf: bytes + tree: Tree + import_signatures: set + + def __init__(self, language: str, path: str): + self.language = language self.query = capa.features.extractors.ts.query.QueryBinding(language) + self.import_signatures = capa.features.extractors.ts.sig.load_import_signatures(language) + self.path = path + with open(self.path, "rb") as f: + self.buf = f.read() + self.parse() - def get_ts_language(self): - return self.query.language + def get_language(self): + return self.language - def parse(self, source: bytes) -> Tree: - parser = Parser() - parser.set_language(self.get_ts_language()) - return parser.parse(source) + def parse(self): + self.parser = Parser() + self.parser.set_language(self.get_ts_language()) + self.tree = self.parser.parse(self.buf) - def get_new_objects(self, node: Node): + def get_new_objects(self, node: Node) -> List[Tuple[Node, str]]: return self.query.new_object.captures(node) - def get_object_id(self, node: Node): + def get_all_new_objects(self) -> List[Tuple[Node, str]]: + return self.get_new_objects(self.tree.root_node) + + def get_object_id(self, node: Node) -> Node: return node.child_by_field_name(self.query.new_object_field_name) - def get_functions(self, node: Node): + def get_all_import_names(self) -> List[Tuple[Node, str]]: + join_names = capa.features.extractors.ts.sig.get_name_joiner(self.language) + import_names = [] + namespaces = set([self.get_range(node) for node, _ in self.get_all_namespaces()]) + for node, _ in self.get_all_new_objects(): + for namespace in namespaces: + name = join_names(namespace, self.get_range(node)) + if name in 
self.import_signatures: + import_names.append(name) + return import_names + + def get_functions(self, node: Node) -> List[Tuple[Node, str]]: return self.query.function_def.captures(node) - def get_function_definition_id(self, node: Node): + def get_all_functions(self) -> List[Tuple[Node, str]]: + return self.get_functions(self.tree.root_node) + + def get_function_definition_id(self, node: Node) -> Node: return node.child_by_field_name(self.query.function_def_field_name) - def get_function_calls(self, node: Node): + def get_function_calls(self, node: Node) -> List[Tuple[Node, str]]: return self.query.function_call.captures(node) - def get_function_call_id(self, node: Node): + def get_all_function_calls(self) -> List[Tuple[Node, str]]: + return self.get_function_calls(self.tree.root_node) + + def get_function_call_id(self, node: Node) -> Node: return node.child_by_field_name(self.query.function_call_field_name) - def extract_string_literals(self, node: Node): + def get_all_function_names(self) -> List[Tuple[Node, str]]: + join_names = capa.features.extractors.ts.sig.get_name_joiner(self.language) + function_names = [] + namespaces = set([self.get_range(node) for node, _ in self.get_all_namespaces()]) + for node, _ in self.get_all_function_calls(): + for namespace in namespaces: + name = join_names(namespace, self.get_range(node)) + if name in self.import_signatures: + function_names.append(name) + return function_names + + def get_string_literals(self, node: Node) -> List[Tuple[Node, str]]: return self.query.string_literal.captures(node) - def extract_integer_literals(self, node: Node): + def get_all_string_literals(self) -> List[Tuple[Node, str]]: + return self.get_string_literals(self.tree.root_node) + + def get_integer_literals(self, node: Node) -> List[Tuple[Node, str]]: return self.query.integer_literal.captures(node) - def extract_namespaces(self, node: Node): + def get_namespaces(self, node: Node) -> List[Tuple[Node, str]]: return 
self.query.namespace.captures(node) - def extract_blocks(self, node: Node): - return self.query.block.captures(node) - - def extract_node(self, tree, tgt_node_name): - cursor = tree.walk() - while True: - if cursor.node.type == tgt_node_name: - yield cursor.node - if cursor.goto_first_child() or cursor.goto_next_sibling(): - continue - while cursor.goto_parent() and not cursor.goto_next_sibling(): - continue - if cursor.node.parent is None: - break + def get_all_namespaces(self) -> List[Tuple[Node, str]]: + return self.get_namespaces(self.tree.root_node) + + def get_range(self, node: Node) -> str: + return self.buf[node.start_byte : node.end_byte].decode() + + def get_address(self, node: Node): + return FileOffsetRangeAddress(node.start_byte, node.end_byte) + + def get_default_address(self): + return self.get_addr(self.tree.root_node) diff --git a/capa/features/extractors/ts/extractor.py b/capa/features/extractors/ts/extractor.py index 88f041fd8..45436f205 100644 --- a/capa/features/extractors/ts/extractor.py +++ b/capa/features/extractors/ts/extractor.py @@ -1,35 +1,33 @@ from typing import List, Tuple, Union, Iterator import capa.features.extractors.script +import capa.features.extractors.ts.file import capa.features.extractors.ts.engine import capa.features.extractors.ts.global_ -from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress, FileOffsetRangeAddress +from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress +from capa.features.extractors.ts.engine import TreeSitterExtractorEngine from capa.features.extractors.base_extractor import Feature, BBHandle, InsnHandle, FunctionHandle, FeatureExtractor class TreeSitterFeatureExtractor(FeatureExtractor): + engine: TreeSitterExtractorEngine + def __init__(self, path: str): super().__init__() - self.path = path - self.language = capa.features.extractors.script.get_language_from_ext(path) - with open(self.path, "rb") as f: - self.buf = f.read() - self.engine = 
capa.features.extractors.ts.engine.TreeSitterExtractorEngine(self.language) - self.tree = self.engine.parse(self.buf) + self.engine = TreeSitterExtractorEngine(capa.features.extractors.script.get_language_from_ext(path), path) def get_base_address(self) -> Union[AbsoluteVirtualAddress, capa.features.address._NoAddress]: return NO_ADDRESS def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]: - ctx = capa.features.extractors.ts.global_.GlobalScriptContext(self.language, self.tree) - yield from capa.features.extractors.ts.global_.extract_features(ctx) + yield from capa.features.extractors.ts.global_.extract_features() def extract_file_features(self) -> Iterator[Tuple[Feature, Address]]: - yield from [] + yield from capa.features.extractors.ts.file.extract_features(self.engine) def get_functions(self) -> Iterator[FunctionHandle]: - for node, _ in self.engine.get_functions(self.tree): - yield FunctionHandle(address=FileOffsetRangeAddress(node.start_byte, node.end_byte), inner=node) + for node, _ in self.engine.get_all_functions(): + yield FunctionHandle(address=self.engine.get_address(node), inner=node) def extract_function_features(self, f: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: raise NotImplementedError("not implemented") @@ -41,15 +39,15 @@ def extract_basic_block_features(self, f: FunctionHandle, bb: BBHandle) -> Itera yield from [] def get_instructions(self, f: FunctionHandle, bb: BBHandle) -> Iterator[InsnHandle]: - yield from [] + yield InsnHandle(address=self.engine.get_default_address(), inner=self.engine.tree.root_node) def extract_insn_features( self, f: FunctionHandle, bb: BBHandle, insn: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: - raise NotImplementedError("not implemented") + yield from [] def is_library_function(self, addr) -> bool: return False def get_function_name(self, addr) -> str: - return self.buf[addr.start_byte : addr.end_byte].decode() + return self.engine.tree.buf[addr.start_byte : 
addr.end_byte].decode() diff --git a/capa/features/extractors/ts/file.py b/capa/features/extractors/ts/file.py new file mode 100644 index 000000000..20e00c038 --- /dev/null +++ b/capa/features/extractors/ts/file.py @@ -0,0 +1,50 @@ +from typing import Tuple, Iterator + +import capa.features.extractors.script +from capa.features.file import Import, FunctionName +from capa.features.common import String, Feature, Namespace +from capa.features.address import Address +from capa.features.extractors.ts.engine import TreeSitterExtractorEngine + + +def extract_file_format(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + yield from capa.features.extractors.script.extract_format() + + +def extract_language(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + yield from capa.features.extractors.script.extract_language(engine.get_language(), engine.get_default_address()) + + +def extract_file_strings(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for node, _ in engine.get_all_string_literals(): + yield String(engine.get_range(node).strip('"')), engine.get_address(node) + + +def extract_namespaces(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for node, _ in engine.get_all_namespaces(): + yield Namespace(engine.get_range(node)), engine.get_address(node) + + +def extract_file_function_names(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for node, name in engine.get_all_function_names(): + yield FunctionName(name), engine.get_address(node) + + +def extract_file_import_names(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for node, name in engine.get_all_import_names(): + yield Import(name), engine.get_address(node) + + +def extract_features(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for file_handler in FILE_HANDLERS: + for feature, addr in file_handler(engine): + yield feature, addr + + 
+FILE_HANDLERS = ( + extract_file_strings, + extract_file_function_names, + extract_file_import_names, + extract_file_format, + extract_language, +) diff --git a/capa/features/extractors/ts/global_.py b/capa/features/extractors/ts/global_.py index 80ddb54fd..23db0cb47 100644 --- a/capa/features/extractors/ts/global_.py +++ b/capa/features/extractors/ts/global_.py @@ -1,34 +1,22 @@ -from dataclasses import dataclass - -from tree_sitter import Tree +from typing import Tuple, Iterator import capa.features.extractors.script -from capa.features.address import FileOffsetRangeAddress - - -@dataclass -class GlobalScriptContext: - language: str - tree: Tree +from capa.features.common import Feature +from capa.features.address import Address -def extract_arch(ctx: GlobalScriptContext): +def extract_arch() -> Iterator[Tuple[Feature, Address]]: yield from capa.features.extractors.script.extract_arch() -def extract_language(ctx: GlobalScriptContext): - node = ctx.tree.root_node - addr = FileOffsetRangeAddress(node.start_byte, node.end_byte) - yield from capa.features.extractors.script.extract_language(ctx.language, addr) - - -def extract_os(ctx: GlobalScriptContext): +def extract_os() -> Iterator[Tuple[Feature, Address]]: yield from capa.features.extractors.script.extract_os() -def extract_features(ctx: GlobalScriptContext): +def extract_features() -> Iterator[Tuple[Feature, Address]]: for glob_handler in GLOBAL_HANDLERS: - yield glob_handler(ctx) + for feature, addr in glob_handler(): + yield feature, addr -GLOBAL_HANDLERS = (extract_arch, extract_os, extract_language) +GLOBAL_HANDLERS = (extract_arch, extract_os) diff --git a/capa/features/extractors/ts/query.py b/capa/features/extractors/ts/query.py index d7bdd5e74..68c1b78af 100644 --- a/capa/features/extractors/ts/query.py +++ b/capa/features/extractors/ts/query.py @@ -4,6 +4,7 @@ from tree_sitter.binding import Query import capa.features.extractors.ts.build +from capa.features.extractors.script import LANG_CS @dataclass 
@@ -21,7 +22,7 @@ class QueryBinding: def __init__(self, language: str): self.language = Language(capa.features.extractors.ts.build.build_dir, language) - if language == "c_sharp": + if language == LANG_CS: self.new_object = self.language.query("(object_creation_expression) @object.new") self.new_object_field_name = "type" self.function_def = self.language.query("(local_function_statement) @function.def") diff --git a/capa/features/extractors/ts/sig.py b/capa/features/extractors/ts/sig.py new file mode 100644 index 000000000..4034fc5ac --- /dev/null +++ b/capa/features/extractors/ts/sig.py @@ -0,0 +1,23 @@ +import json +import importlib.resources +from typing import Callable + +import capa.features.extractors.ts.signatures +from capa.features.extractors.script import LANG_CS + + +def get_sig_file(language: str) -> str: + if language == LANG_CS: + return "cs.json" + raise ValueError("Language {language} does not have an import signature file") + + +def load_import_signatures(language: str) -> set: + sig_file = get_sig_file(language) + return set(json.loads(importlib.resources.read_text(capa.features.extractors.ts.signatures, sig_file))) + + +def get_name_joiner(language: str) -> Callable: + if language == LANG_CS: + return lambda qualified_name, identifier: qualified_name + "." 
+ identifier + raise ValueError("Language {language} does not have a name joiner") diff --git a/capa/features/extractors/ts/signatures/__init__.py b/capa/features/extractors/ts/signatures/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/capa/features/extractors/ts/signatures/cs.json b/capa/features/extractors/ts/signatures/cs.json new file mode 100644 index 000000000..d0d7abe68 --- /dev/null +++ b/capa/features/extractors/ts/signatures/cs.json @@ -0,0 +1,8 @@ +[ + "System.Convert.ToBase64String", + "System.Convert.FromBase64String", + "System.Diagnostics.Process", + "System.Diagnostics.ProcessStartInfo", + "System.Security.Cryptography.RijndaelManaged", + "System.Security.Cryptography.CryptoStream" +] \ No newline at end of file From 3d4b4ec67730f317935c47901f396e69684882f3 Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Thu, 30 Jun 2022 10:34:32 -0400 Subject: [PATCH 08/51] Added function-level feature extraction. --- capa/features/extractors/ts/engine.py | 20 ++++------- capa/features/extractors/ts/extractor.py | 7 ++-- capa/features/extractors/ts/file.py | 27 ++++++++++----- capa/features/extractors/ts/function.py | 44 ++++++++++++++++++++++++ capa/features/extractors/ts/query.py | 2 ++ 5 files changed, 76 insertions(+), 24 deletions(-) create mode 100644 capa/features/extractors/ts/function.py diff --git a/capa/features/extractors/ts/engine.py b/capa/features/extractors/ts/engine.py index d06ab5011..640b7909b 100644 --- a/capa/features/extractors/ts/engine.py +++ b/capa/features/extractors/ts/engine.py @@ -35,17 +35,14 @@ def parse(self): def get_new_objects(self, node: Node) -> List[Tuple[Node, str]]: return self.query.new_object.captures(node) - def get_all_new_objects(self) -> List[Tuple[Node, str]]: - return self.get_new_objects(self.tree.root_node) - def get_object_id(self, node: Node) -> Node: return node.child_by_field_name(self.query.new_object_field_name) - def get_all_import_names(self) -> List[Tuple[Node, str]]: + def 
get_import_names(self, node: Node) -> List[Tuple[Node, str]]: join_names = capa.features.extractors.ts.sig.get_name_joiner(self.language) import_names = [] namespaces = set([self.get_range(node) for node, _ in self.get_all_namespaces()]) - for node, _ in self.get_all_new_objects(): + for node, _ in self.get_new_objects(node): for namespace in namespaces: name = join_names(namespace, self.get_range(node)) if name in self.import_signatures: @@ -64,17 +61,14 @@ def get_function_definition_id(self, node: Node) -> Node: def get_function_calls(self, node: Node) -> List[Tuple[Node, str]]: return self.query.function_call.captures(node) - def get_all_function_calls(self) -> List[Tuple[Node, str]]: - return self.get_function_calls(self.tree.root_node) - def get_function_call_id(self, node: Node) -> Node: return node.child_by_field_name(self.query.function_call_field_name) - def get_all_function_names(self) -> List[Tuple[Node, str]]: + def get_function_names(self, node: Node) -> List[Tuple[Node, str]]: join_names = capa.features.extractors.ts.sig.get_name_joiner(self.language) function_names = [] namespaces = set([self.get_range(node) for node, _ in self.get_all_namespaces()]) - for node, _ in self.get_all_function_calls(): + for node, _ in self.get_function_calls(node): for namespace in namespaces: name = join_names(namespace, self.get_range(node)) if name in self.import_signatures: @@ -84,9 +78,6 @@ def get_all_function_names(self) -> List[Tuple[Node, str]]: def get_string_literals(self, node: Node) -> List[Tuple[Node, str]]: return self.query.string_literal.captures(node) - def get_all_string_literals(self) -> List[Tuple[Node, str]]: - return self.get_string_literals(self.tree.root_node) - def get_integer_literals(self, node: Node) -> List[Tuple[Node, str]]: return self.query.integer_literal.captures(node) @@ -96,6 +87,9 @@ def get_namespaces(self, node: Node) -> List[Tuple[Node, str]]: def get_all_namespaces(self) -> List[Tuple[Node, str]]: return 
self.get_namespaces(self.tree.root_node) + def get_global_statements(self) -> List[Tuple[Node, str]]: + return self.query.global_statement.captures(self.tree.root_node) + def get_range(self, node: Node) -> str: return self.buf[node.start_byte : node.end_byte].decode() diff --git a/capa/features/extractors/ts/extractor.py b/capa/features/extractors/ts/extractor.py index 45436f205..d80cc45fb 100644 --- a/capa/features/extractors/ts/extractor.py +++ b/capa/features/extractors/ts/extractor.py @@ -1,9 +1,10 @@ -from typing import List, Tuple, Union, Iterator +from typing import Tuple, Union, Iterator import capa.features.extractors.script import capa.features.extractors.ts.file import capa.features.extractors.ts.engine import capa.features.extractors.ts.global_ +import capa.features.extractors.ts.function from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress from capa.features.extractors.ts.engine import TreeSitterExtractorEngine from capa.features.extractors.base_extractor import Feature, BBHandle, InsnHandle, FunctionHandle, FeatureExtractor @@ -30,7 +31,7 @@ def get_functions(self) -> Iterator[FunctionHandle]: yield FunctionHandle(address=self.engine.get_address(node), inner=node) def extract_function_features(self, f: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: - raise NotImplementedError("not implemented") + yield from capa.features.extractors.ts.function.extract_features(f, self.engine) def get_basic_blocks(self, f: FunctionHandle) -> Iterator[BBHandle]: yield from [] @@ -39,7 +40,7 @@ def extract_basic_block_features(self, f: FunctionHandle, bb: BBHandle) -> Itera yield from [] def get_instructions(self, f: FunctionHandle, bb: BBHandle) -> Iterator[InsnHandle]: - yield InsnHandle(address=self.engine.get_default_address(), inner=self.engine.tree.root_node) + yield from [] def extract_insn_features( self, f: FunctionHandle, bb: BBHandle, insn: InsnHandle diff --git a/capa/features/extractors/ts/file.py 
b/capa/features/extractors/ts/file.py index 20e00c038..24630084c 100644 --- a/capa/features/extractors/ts/file.py +++ b/capa/features/extractors/ts/file.py @@ -2,6 +2,7 @@ import capa.features.extractors.script from capa.features.file import Import, FunctionName +from capa.features.insn import Number from capa.features.common import String, Feature, Namespace from capa.features.address import Address from capa.features.extractors.ts.engine import TreeSitterExtractorEngine @@ -16,8 +17,15 @@ def extract_language(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Featur def extract_file_strings(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: - for node, _ in engine.get_all_string_literals(): - yield String(engine.get_range(node).strip('"')), engine.get_address(node) + for global_node, _ in engine.get_global_statements(): + for node, _ in engine.get_string_literals(global_node): + yield String(engine.get_range(node).strip('"')), engine.get_address(node) + + +def extract_file_integer_literals(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for global_node, _ in engine.get_global_statements(): + for node, _ in engine.get_integer_literals(global_node): + yield Number(int(engine.get_range(node))), engine.get_address(node) def extract_namespaces(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: @@ -26,13 +34,15 @@ def extract_namespaces(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feat def extract_file_function_names(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: - for node, name in engine.get_all_function_names(): - yield FunctionName(name), engine.get_address(node) + for global_node, _ in engine.get_global_statements(): + for node, name in engine.get_function_names(global_node): + yield FunctionName(name), engine.get_address(node) def extract_file_import_names(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: - for node, name in 
engine.get_all_import_names(): - yield Import(name), engine.get_address(node) + for global_node, _ in engine.get_global_statements(): + for node, name in engine.get_import_names(global_node): + yield Import(name), engine.get_address(node) def extract_features(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: @@ -42,9 +52,10 @@ def extract_features(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Featur FILE_HANDLERS = ( - extract_file_strings, + extract_file_format, extract_file_function_names, extract_file_import_names, - extract_file_format, + extract_file_integer_literals, + extract_file_strings, extract_language, ) diff --git a/capa/features/extractors/ts/function.py b/capa/features/extractors/ts/function.py new file mode 100644 index 000000000..230caf523 --- /dev/null +++ b/capa/features/extractors/ts/function.py @@ -0,0 +1,44 @@ +from typing import Tuple, Iterator + +from capa.features.file import Import, FunctionName +from capa.features.insn import Number +from capa.features.common import String, Feature +from capa.features.address import Address +from capa.features.extractors.ts.engine import TreeSitterExtractorEngine +from capa.features.extractors.base_extractor import FunctionHandle + + +def extract_strings(fh: FunctionHandle, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for node, _ in engine.get_string_literals(fh.inner): + yield String(engine.get_range(node).strip('"')), engine.get_address(node) + + +def extract_integer_literals( + fh: FunctionHandle, engine: TreeSitterExtractorEngine +) -> Iterator[Tuple[Feature, Address]]: + for node, _ in engine.get_integer_literals(fh.inner): + yield Number(int(engine.get_range(node))), engine.get_address(node) + + +def extract_function_names(fh: FunctionHandle, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for node, name in engine.get_function_names(fh.inner): + yield FunctionName(name), engine.get_address(node) + + +def 
extract_import_names(fh: FunctionHandle, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for node, name in engine.get_import_names(fh.inner): + yield Import(name), engine.get_address(node) + + +def extract_features(fh: FunctionHandle, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for file_handler in FUNCTION_HANDLERS: + for feature, addr in file_handler(fh=fh, engine=engine): + yield feature, addr + + +FUNCTION_HANDLERS = ( + extract_function_names, + extract_import_names, + extract_integer_literals, + extract_strings, +) diff --git a/capa/features/extractors/ts/query.py b/capa/features/extractors/ts/query.py index 68c1b78af..a0f15c371 100644 --- a/capa/features/extractors/ts/query.py +++ b/capa/features/extractors/ts/query.py @@ -19,6 +19,7 @@ class QueryBinding: string_literal: Query integer_literal: Query namespace: Query + global_statement: Query def __init__(self, language: str): self.language = Language(capa.features.extractors.ts.build.build_dir, language) @@ -32,5 +33,6 @@ def __init__(self, language: str): self.string_literal = self.language.query("(string_literal) @string-literal") self.integer_literal = self.language.query("(integer_literal) @integer-literal") self.namespace = self.language.query("(using_directive\n\t(qualified_name) @namespace)") + self.global_statement = self.language.query("(global_statement) @global-statement") else: raise NotImplementedError(f"Tree-sitter queries for {language} are not implemented.") From eca7ead6bf87b233718771f55c9441d2b4760944 Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Thu, 30 Jun 2022 16:39:48 -0400 Subject: [PATCH 09/51] Bug fixes and code refactoring of the Tree Sitter extractor. 
--- capa/features/extractors/ts/build.py | 6 +++++ capa/features/extractors/ts/engine.py | 34 +++++++++++++++++------- capa/features/extractors/ts/extractor.py | 2 +- capa/features/extractors/ts/query.py | 8 +++--- 4 files changed, 35 insertions(+), 15 deletions(-) diff --git a/capa/features/extractors/ts/build.py b/capa/features/extractors/ts/build.py index 4d1b13184..f04006bc8 100644 --- a/capa/features/extractors/ts/build.py +++ b/capa/features/extractors/ts/build.py @@ -1,4 +1,10 @@ +from tree_sitter import Language + build_dir = "build/my-languages.so" languages = [ "vendor/tree-sitter-c-sharp", ] + + +def ts_build(): + Language.build_library(build_dir, languages) diff --git a/capa/features/extractors/ts/engine.py b/capa/features/extractors/ts/engine.py index 640b7909b..916113245 100644 --- a/capa/features/extractors/ts/engine.py +++ b/capa/features/extractors/ts/engine.py @@ -1,8 +1,9 @@ -from typing import List, Tuple +from typing import List, Tuple, Iterator from tree_sitter import Node, Tree, Parser import capa.features.extractors.ts.sig +import capa.features.extractors.ts.build import capa.features.extractors.ts.query from capa.features.address import FileOffsetRangeAddress from capa.features.extractors.ts.query import QueryBinding @@ -16,6 +17,7 @@ class TreeSitterExtractorEngine: import_signatures: set def __init__(self, language: str, path: str): + capa.features.extractors.ts.build.ts_build() self.language = language self.query = capa.features.extractors.ts.query.QueryBinding(language) self.import_signatures = capa.features.extractors.ts.sig.load_import_signatures(language) @@ -27,6 +29,9 @@ def __init__(self, language: str, path: str): def get_language(self): return self.language + def get_ts_language(self): + return self.query.language + def parse(self): self.parser = Parser() self.parser.set_language(self.get_ts_language()) @@ -38,25 +43,30 @@ def get_new_objects(self, node: Node) -> List[Tuple[Node, str]]: def get_object_id(self, node: Node) -> 
Node: return node.child_by_field_name(self.query.new_object_field_name) + def get_new_object_ids(self, node: Node) -> Iterator[Node]: + for obj_node, _ in self.get_new_objects(node): + yield self.get_object_id(obj_node) + def get_import_names(self, node: Node) -> List[Tuple[Node, str]]: join_names = capa.features.extractors.ts.sig.get_name_joiner(self.language) import_names = [] namespaces = set([self.get_range(node) for node, _ in self.get_all_namespaces()]) - for node, _ in self.get_new_objects(node): + for node, _ in self.get_new_object_ids(node): for namespace in namespaces: name = join_names(namespace, self.get_range(node)) if name in self.import_signatures: import_names.append(name) return import_names - def get_functions(self, node: Node) -> List[Tuple[Node, str]]: - return self.query.function_def.captures(node) - - def get_all_functions(self) -> List[Tuple[Node, str]]: - return self.get_functions(self.tree.root_node) + def get_function_definitions(self, node: Node = None) -> List[Tuple[Node, str]]: + return self.query.function_definition.captures(node if node is not None else self.tree.root_node) def get_function_definition_id(self, node: Node) -> Node: - return node.child_by_field_name(self.query.function_def_field_name) + return node.child_by_field_name(self.query.function_definition_field_name) + + def get_function_definition_ids(self, node: Node) -> Iterator[Node]: + for fn_node, _ in self.get_function_definitions(node): + yield self.get_function_definition_id(fn_node) def get_function_calls(self, node: Node) -> List[Tuple[Node, str]]: return self.query.function_call.captures(node) @@ -64,11 +74,15 @@ def get_function_calls(self, node: Node) -> List[Tuple[Node, str]]: def get_function_call_id(self, node: Node) -> Node: return node.child_by_field_name(self.query.function_call_field_name) + def get_function_call_ids(self, node: Node) -> Iterator[Node]: + for fn_node, _ in self.get_function_calls(node): + yield self.get_function_call_id(fn_node) + def 
get_function_names(self, node: Node) -> List[Tuple[Node, str]]: join_names = capa.features.extractors.ts.sig.get_name_joiner(self.language) function_names = [] namespaces = set([self.get_range(node) for node, _ in self.get_all_namespaces()]) - for node, _ in self.get_function_calls(node): + for node, _ in self.get_function_call_ids(node): for namespace in namespaces: name = join_names(namespace, self.get_range(node)) if name in self.import_signatures: @@ -97,4 +111,4 @@ def get_address(self, node: Node): return FileOffsetRangeAddress(node.start_byte, node.end_byte) def get_default_address(self): - return self.get_addr(self.tree.root_node) + return self.get_address(self.tree.root_node) diff --git a/capa/features/extractors/ts/extractor.py b/capa/features/extractors/ts/extractor.py index d80cc45fb..75b183097 100644 --- a/capa/features/extractors/ts/extractor.py +++ b/capa/features/extractors/ts/extractor.py @@ -27,7 +27,7 @@ def extract_file_features(self) -> Iterator[Tuple[Feature, Address]]: yield from capa.features.extractors.ts.file.extract_features(self.engine) def get_functions(self) -> Iterator[FunctionHandle]: - for node, _ in self.engine.get_all_functions(): + for node, _ in self.engine.get_function_definitions(): yield FunctionHandle(address=self.engine.get_address(node), inner=node) def extract_function_features(self, f: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: diff --git a/capa/features/extractors/ts/query.py b/capa/features/extractors/ts/query.py index a0f15c371..ddcc5c375 100644 --- a/capa/features/extractors/ts/query.py +++ b/capa/features/extractors/ts/query.py @@ -12,8 +12,8 @@ class QueryBinding: language: Language new_object: Query new_object_field_name: str - function_def: Query - function_def_field_name: str + function_definition: Query + function_definition_field_name: str function_call: Query function_call_field_name: str string_literal: Query @@ -26,8 +26,8 @@ def __init__(self, language: str): if language == LANG_CS: 
self.new_object = self.language.query("(object_creation_expression) @object.new") self.new_object_field_name = "type" - self.function_def = self.language.query("(local_function_statement) @function.def") - self.function_def_field_name = "name" + self.function_definition = self.language.query("(local_function_statement) @function.definition") + self.function_definition_field_name = "name" self.function_call = self.language.query("(invocation_expression) @function.call") self.function_call_field_name = "function" self.string_literal = self.language.query("(string_literal) @string-literal") From 5fd953f22c07307cddd4879daef37467ec1e6277 Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Thu, 30 Jun 2022 16:46:53 -0400 Subject: [PATCH 10/51] Added tree_sitter to requirements in setup.py. --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 560533a8d..2c531fb38 100644 --- a/setup.py +++ b/setup.py @@ -29,6 +29,7 @@ "dnfile==0.11.0", "dncil==1.0.0", "pydantic==1.9.1", + "tree_sitter==0.20.0", ] # this sets __version__ From 1f79db9f982d0fb16478e51efa25c874ffbdf46a Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Fri, 1 Jul 2022 09:27:41 -0400 Subject: [PATCH 11/51] Added tests for TreeSitterExtractorEngine initialization, new object and function definition parsing for a pure C# sample. 
--- tests/fixtures.py | 17 ++++++++ tests/test_ts_engine.py | 93 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 110 insertions(+) create mode 100644 tests/test_ts_engine.py diff --git a/tests/fixtures.py b/tests/fixtures.py index 88a63de19..594a33e35 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -44,6 +44,7 @@ CD = os.path.dirname(__file__) DOTNET_DIR = os.path.join(CD, "data", "dotnet") DNFILE_TESTFILES = os.path.join(DOTNET_DIR, "dnfile-testfiles") +SCRIPT_DIR = os.path.join(CD, "data", "scripts") @contextlib.contextmanager @@ -169,6 +170,13 @@ def get_dnfile_extractor(path): return extractor +@lru_cache(maxsize=1) +def get_ts_extractor_engine(language, path): + import capa.features.extractors.ts.engine + + return capa.features.extractors.ts.engine.TreeSitterExtractorEngine(language, path) + + def extract_global_features(extractor): features = collections.defaultdict(set) for feature, va in extractor.extract_global_features(): @@ -279,6 +287,10 @@ def get_data_path_by_name(name): return os.path.join(CD, "data", "dotnet", "1c444ebeba24dcba8628b7dfe5fec7c6.exe_") elif name.startswith("_692f"): return os.path.join(CD, "data", "dotnet", "692f7fd6d198e804d6af98eb9e390d61.exe_") + elif name.startswith("cs_f397cb"): + return os.path.join(SCRIPT_DIR, "f397cb676353873cdc8fcfbf0e3a317334353cc63946099e5ea22db6d1eebfb8.cs_") + elif name.startswith("aspx_f397cb"): + return os.path.join(SCRIPT_DIR, "f397cb676353873cdc8fcfbf0e3a317334353cc63946099e5ea22db6d1eebfb8.aspx_") else: raise ValueError("unexpected sample fixture: %s" % name) @@ -904,3 +916,8 @@ def _1c444_dotnetfile_extractor(): @pytest.fixture def _692f_dotnetfile_extractor(): return get_dnfile_extractor(get_data_path_by_name("_692f")) + + +@pytest.fixture +def cs_f397cb_extractor_engine(): + return get_ts_extractor_engine("c_sharp", get_data_path_by_name("cs_f397cb")) diff --git a/tests/test_ts_engine.py b/tests/test_ts_engine.py new file mode 100644 index 000000000..f5d717854 --- /dev/null 
+++ b/tests/test_ts_engine.py @@ -0,0 +1,93 @@ +from typing import List, Tuple + +import pytest +from fixtures import * +from tree_sitter import Node, Tree, Parser + +from capa.features.address import FileOffsetRangeAddress +from capa.features.extractors.script import LANG_CS +from capa.features.extractors.ts.query import QueryBinding +from capa.features.extractors.ts.engine import TreeSitterExtractorEngine + + +def do_test_ts_engine_init(engine: TreeSitterExtractorEngine): + assert engine.language == LANG_CS + assert isinstance(engine.query, QueryBinding) + assert isinstance(engine.import_signatures, set) and len(engine.import_signatures) > 0 + assert isinstance(engine.path, str) and len(engine.path) > 0 + assert isinstance(engine.buf, bytes) and len(engine.buf) > 0 + assert isinstance(engine.parser, Parser) + assert isinstance(engine.tree, Tree) + assert isinstance(engine.get_default_address(), FileOffsetRangeAddress) + addr = engine.get_default_address() + assert addr.start_byte == engine.tree.root_node.start_byte and addr.end_byte == engine.tree.root_node.end_byte + + +def do_test_ts_engine_object_parsing(engine: TreeSitterExtractorEngine, expected_list: List[Tuple[str, str]]): + for (node, name), (expected_range, expected_id_range) in zip( + engine.get_new_objects(engine.tree.root_node), expected_list + ): + assert isinstance(node, Node) + assert name == "object.new" + assert engine.get_range(node) == expected_range + assert isinstance(engine.get_address(node), FileOffsetRangeAddress) + addr = engine.get_address(node) + assert addr.start_byte == node.start_byte and addr.end_byte == node.end_byte + assert engine.get_range(engine.get_object_id(node)) == expected_id_range + + for node, (_, expected_id_range) in zip(engine.get_new_object_ids(engine.tree.root_node), expected_list): + assert isinstance(node, Node) + assert engine.get_range(node) == expected_id_range + assert isinstance(engine.get_address(node), FileOffsetRangeAddress) + addr = 
engine.get_address(node) + assert addr.start_byte == node.start_byte and addr.end_byte == node.end_byte + + +def do_test_ts_engine_function_definition_parsing( + engine: TreeSitterExtractorEngine, expected_list: List[Tuple[str, str]] +): + for (node, name), (expected_range, expected_id_range) in zip( + engine.get_function_definitions(engine.tree.root_node), expected_list + ): + assert isinstance(node, Node) + assert name == "function.definition" + assert engine.get_range(node).startswith(expected_range) + assert isinstance(engine.get_address(node), FileOffsetRangeAddress) + addr = engine.get_address(node) + assert addr.start_byte == node.start_byte and addr.end_byte == node.end_byte + assert engine.get_range(engine.get_function_definition_id(node)) == expected_id_range + + for node, (_, expected_id_range) in zip(engine.get_function_definition_ids(engine.tree.root_node), expected_list): + assert isinstance(node, Node) + assert engine.get_range(node) == expected_id_range + assert isinstance(engine.get_address(node), FileOffsetRangeAddress) + addr = engine.get_address(node) + assert addr.start_byte == node.start_byte and addr.end_byte == node.end_byte + + +@parametrize( + "engine_str,expected_dict", + [ + ( + "cs_f397cb_extractor_engine", + { + "global objects": [ + ( + 'new Diagnostics.ProcessStartInfo("cmd", "/c " + Request.Form["c"])', + "Diagnostics.ProcessStartInfo", + ), + ("new System.Diagnostics.Process()", "System.Diagnostics.Process"), + ], + "global function definitions": [ + ("void die()", "die"), + ("void Page_Load(object sender, System.EventArgs e)", "Page_Load"), + ], + }, + ), + ], +) +def test_ts_engine(request: pytest.FixtureRequest, engine_str: str, expected_dict: dict): + engine: TreeSitterExtractorEngine = request.getfixturevalue(engine_str) + do_test_ts_engine_init(engine) + do_test_ts_engine_object_parsing(engine, expected_dict["global objects"]) + do_test_ts_engine_function_definition_parsing(engine, expected_dict["global function 
definitions"]) From a58bc0bc14eec45e8da8abc9e1d7a35d1bfdb5cc Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Fri, 1 Jul 2022 12:07:36 -0400 Subject: [PATCH 12/51] Added more TreeSitterExtractorEngine tests for pure C#. --- tests/test_ts_engine.py | 164 ++++++++++++++++++++++++++++++++-------- 1 file changed, 134 insertions(+), 30 deletions(-) diff --git a/tests/test_ts_engine.py b/tests/test_ts_engine.py index f5d717854..14c7e12d9 100644 --- a/tests/test_ts_engine.py +++ b/tests/test_ts_engine.py @@ -23,46 +23,88 @@ def do_test_ts_engine_init(engine: TreeSitterExtractorEngine): assert addr.start_byte == engine.tree.root_node.start_byte and addr.end_byte == engine.tree.root_node.end_byte -def do_test_ts_engine_object_parsing(engine: TreeSitterExtractorEngine, expected_list: List[Tuple[str, str]]): - for (node, name), (expected_range, expected_id_range) in zip( - engine.get_new_objects(engine.tree.root_node), expected_list - ): +def do_test_range(engine: TreeSitterExtractorEngine, node: Node, expected_range: str, startswith: bool = False): + assert engine.get_range(node).startswith(expected_range) if startswith else engine.get_range(node) == expected_range + + +def do_test_id_range(engine: TreeSitterExtractorEngine, node: Node, expected_id_range: str, startswith: bool = False): + do_test_range(engine, engine.get_object_id(node), expected_id_range, startswith) + + +def do_test_range_address(engine: TreeSitterExtractorEngine, node: Node): + assert isinstance(engine.get_address(node), FileOffsetRangeAddress) + addr = engine.get_address(node) + assert addr.start_byte == node.start_byte and addr.end_byte == node.end_byte + + +def do_test_ts_engine_object_parsing( + engine: TreeSitterExtractorEngine, root_node: Node, expected_list: List[Tuple[str, str]] +): + for (node, name), (expected_range, expected_id_range) in zip(engine.get_new_objects(root_node), expected_list): assert isinstance(node, Node) assert name == "object.new" - assert engine.get_range(node) == 
expected_range - assert isinstance(engine.get_address(node), FileOffsetRangeAddress) - addr = engine.get_address(node) - assert addr.start_byte == node.start_byte and addr.end_byte == node.end_byte - assert engine.get_range(engine.get_object_id(node)) == expected_id_range + do_test_range(engine, node, expected_range) + do_test_range_address(engine, node) + do_test_range(engine, engine.get_object_id(node), expected_id_range) - for node, (_, expected_id_range) in zip(engine.get_new_object_ids(engine.tree.root_node), expected_list): + for node, (_, expected_id_range) in zip(engine.get_new_object_ids(root_node), expected_list): assert isinstance(node, Node) - assert engine.get_range(node) == expected_id_range - assert isinstance(engine.get_address(node), FileOffsetRangeAddress) - addr = engine.get_address(node) - assert addr.start_byte == node.start_byte and addr.end_byte == node.end_byte + do_test_range(engine, node, expected_id_range) + do_test_range_address(engine, node) def do_test_ts_engine_function_definition_parsing( - engine: TreeSitterExtractorEngine, expected_list: List[Tuple[str, str]] + engine: TreeSitterExtractorEngine, root_node: Node, expected_list: List[Tuple[str, str]] ): for (node, name), (expected_range, expected_id_range) in zip( - engine.get_function_definitions(engine.tree.root_node), expected_list + engine.get_function_definitions(root_node), expected_list ): assert isinstance(node, Node) assert name == "function.definition" - assert engine.get_range(node).startswith(expected_range) - assert isinstance(engine.get_address(node), FileOffsetRangeAddress) - addr = engine.get_address(node) - assert addr.start_byte == node.start_byte and addr.end_byte == node.end_byte - assert engine.get_range(engine.get_function_definition_id(node)) == expected_id_range + do_test_range(engine, node, expected_range, startswith=True) + do_test_range_address(engine, node) + do_test_range(engine, engine.get_function_definition_id(node), expected_id_range) + + for node, 
(_, expected_id_range) in zip(engine.get_function_definition_ids(root_node), expected_list): + assert isinstance(node, Node) + do_test_range(engine, node, expected_id_range) + do_test_range_address(engine, node) + + +def do_test_ts_engine_function_call_parsing( + engine: TreeSitterExtractorEngine, root_node: Node, expected_list: List[Tuple[str, str]] +): + for (node, name), (expected_range, expected_id_range) in zip(engine.get_function_calls(root_node), expected_list): + assert isinstance(node, Node) + assert name == "function.call" + do_test_range(engine, node, expected_range) + do_test_range_address(engine, node) + do_test_range(engine, engine.get_function_call_id(node), expected_id_range) + + for node, (_, expected_id_range) in zip(engine.get_function_call_ids(root_node), expected_list): + assert isinstance(node, Node) + do_test_range(engine, node, expected_id_range) + do_test_range_address(engine, node) + + +def do_test_ts_engine_string_literals_parsing( + engine: TreeSitterExtractorEngine, root_node: Node, expected_list: Tuple[str] +): + for (node, name), expected_range in zip(engine.get_string_literals(root_node), expected_list): + assert isinstance(node, Node) + assert name == "string-literal" + do_test_range(engine, node, expected_range) + do_test_range_address(engine, node) - for node, (_, expected_id_range) in zip(engine.get_function_definition_ids(engine.tree.root_node), expected_list): + +def do_test_ts_engine_integer_literals_parsing( + engine: TreeSitterExtractorEngine, root_node: Node, expected_list: Tuple[str] +): + for (node, name), expected_range in zip(engine.get_integer_literals(root_node), expected_list): assert isinstance(node, Node) - assert engine.get_range(node) == expected_id_range - assert isinstance(engine.get_address(node), FileOffsetRangeAddress) - addr = engine.get_address(node) - assert addr.start_byte == node.start_byte and addr.end_byte == node.end_byte + assert name == "integer-literal" + do_test_range(engine, node, 
expected_range) + do_test_range_address(engine, node) @parametrize( @@ -71,17 +113,74 @@ def do_test_ts_engine_function_definition_parsing( ( "cs_f397cb_extractor_engine", { - "global objects": [ + "all objects": [ ( 'new Diagnostics.ProcessStartInfo("cmd", "/c " + Request.Form["c"])', "Diagnostics.ProcessStartInfo", ), ("new System.Diagnostics.Process()", "System.Diagnostics.Process"), ], - "global function definitions": [ + "all function definitions": [ ("void die()", "die"), ("void Page_Load(object sender, System.EventArgs e)", "Page_Load"), ], + "all function calls": [ + ( + 'HttpContext.Current.Response.Write("

404 Not Found

")', + "HttpContext.Current.Response.Write", + ), + ( + "HttpContext.Current.Server.ClearError()", + "HttpContext.Current.Server.ClearError", + ), + ( + "HttpContext.Current.Response.End()", + "HttpContext.Current.Response.End", + ), + ( + "HttpContext.Current.Request.Headers[\"X-Forwarded-For\"].Split(new char[] { ',' })", + 'HttpContext.Current.Request.Headers["X-Forwarded-For"].Split', + ), + ( + "die()", + "die", + ), + ( + "p.Start()", + "p.Start", + ), + ( + "p.StandardOutput.ReadToEnd()", + "p.StandardOutput.ReadToEnd", + ), + ( + "p.StandardError.ReadToEnd()", + "p.StandardError.ReadToEnd", + ), + ( + "Page_Load(sender, e)", + "Page_Load", + ), + ], + "all string literals": ( + '""', + '""', + '"Not Found"', + '"

404 Not Found

"', + '"::1"', + '"192.168.0.1"', + '"127.0.0.1"', + '"X-Forwarded-For"', + '"X-Forwarded-For"', + '"c"', + '"cmd"', + '"/c "', + '"c"', + ), + "all integer literals": ( + "404", + "0", + ), }, ), ], @@ -89,5 +188,10 @@ def do_test_ts_engine_function_definition_parsing( def test_ts_engine(request: pytest.FixtureRequest, engine_str: str, expected_dict: dict): engine: TreeSitterExtractorEngine = request.getfixturevalue(engine_str) do_test_ts_engine_init(engine) - do_test_ts_engine_object_parsing(engine, expected_dict["global objects"]) - do_test_ts_engine_function_definition_parsing(engine, expected_dict["global function definitions"]) + do_test_ts_engine_object_parsing(engine, engine.tree.root_node, expected_dict["all objects"]) + do_test_ts_engine_function_definition_parsing( + engine, engine.tree.root_node, expected_dict["all function definitions"] + ) + do_test_ts_engine_function_call_parsing(engine, engine.tree.root_node, expected_dict["all function calls"]) + do_test_ts_engine_string_literals_parsing(engine, engine.tree.root_node, expected_dict["all string literals"]) + do_test_ts_engine_integer_literals_parsing(engine, engine.tree.root_node, expected_dict["all integer literals"]) From 5ddb8baa0d018ad1d7592b5ac1573751db4f9f62 Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Fri, 1 Jul 2022 17:28:02 -0400 Subject: [PATCH 13/51] Added last remaining tests for the TreeSitterExtractorEngine class and fixed bugs found in the process. 
--- capa/features/extractors/ts/engine.py | 69 ++++++++++----------- capa/features/extractors/ts/file.py | 8 ++- capa/features/extractors/ts/query.py | 4 +- tests/test_ts_engine.py | 88 +++++++++++++++++++++++---- 4 files changed, 117 insertions(+), 52 deletions(-) diff --git a/capa/features/extractors/ts/engine.py b/capa/features/extractors/ts/engine.py index 916113245..fe84e077a 100644 --- a/capa/features/extractors/ts/engine.py +++ b/capa/features/extractors/ts/engine.py @@ -4,38 +4,32 @@ import capa.features.extractors.ts.sig import capa.features.extractors.ts.build -import capa.features.extractors.ts.query from capa.features.address import FileOffsetRangeAddress from capa.features.extractors.ts.query import QueryBinding class TreeSitterExtractorEngine: - query: QueryBinding - path: str buf: bytes - tree: Tree import_signatures: set + language: str + path: str + query: QueryBinding + tree: Tree def __init__(self, language: str, path: str): capa.features.extractors.ts.build.ts_build() self.language = language - self.query = capa.features.extractors.ts.query.QueryBinding(language) + self.query = QueryBinding(language) self.import_signatures = capa.features.extractors.ts.sig.load_import_signatures(language) self.path = path with open(self.path, "rb") as f: self.buf = f.read() - self.parse() - - def get_language(self): - return self.language - - def get_ts_language(self): - return self.query.language + self.tree = self.parse() def parse(self): - self.parser = Parser() - self.parser.set_language(self.get_ts_language()) - self.tree = self.parser.parse(self.buf) + parser = Parser() + parser.set_language(self.query.language) + return parser.parse(self.buf) def get_new_objects(self, node: Node) -> List[Tuple[Node, str]]: return self.query.new_object.captures(node) @@ -47,16 +41,19 @@ def get_new_object_ids(self, node: Node) -> Iterator[Node]: for obj_node, _ in self.get_new_objects(node): yield self.get_object_id(obj_node) - def get_import_names(self, node: Node) -> 
List[Tuple[Node, str]]: + # TODO: move this elsewhere, does not fit this class + def get_import_names(self, node: Node) -> Iterator[Tuple[Node, str]]: join_names = capa.features.extractors.ts.sig.get_name_joiner(self.language) - import_names = [] - namespaces = set([self.get_range(node) for node, _ in self.get_all_namespaces()]) - for node, _ in self.get_new_object_ids(node): + namespaces = set([self.get_range(ns_node) for ns_node, _ in self.get_namespaces()]) + for obj_node in self.get_new_object_ids(node): + obj_name = self.get_range(obj_node) + if obj_name in self.import_signatures: + yield (obj_node, obj_name) + continue for namespace in namespaces: - name = join_names(namespace, self.get_range(node)) - if name in self.import_signatures: - import_names.append(name) - return import_names + obj_name = join_names(namespace, obj_name) + if obj_name in self.import_signatures: + yield (obj_node, obj_name) def get_function_definitions(self, node: Node = None) -> List[Tuple[Node, str]]: return self.query.function_definition.captures(node if node is not None else self.tree.root_node) @@ -78,16 +75,19 @@ def get_function_call_ids(self, node: Node) -> Iterator[Node]: for fn_node, _ in self.get_function_calls(node): yield self.get_function_call_id(fn_node) - def get_function_names(self, node: Node) -> List[Tuple[Node, str]]: + # TODO: move this elsewhere, does not fit this class + def get_function_names(self, node: Node) -> Iterator[Tuple[Node, str]]: join_names = capa.features.extractors.ts.sig.get_name_joiner(self.language) - function_names = [] - namespaces = set([self.get_range(node) for node, _ in self.get_all_namespaces()]) - for node, _ in self.get_function_call_ids(node): + namespaces = set([self.get_range(ns_node) for ns_node, _ in self.get_namespaces()]) + for fn_node in self.get_function_call_ids(node): + fn_name = self.get_range(fn_node) + if fn_name in self.import_signatures: + yield (fn_node, fn_name) + continue for namespace in namespaces: - name = 
join_names(namespace, self.get_range(node)) - if name in self.import_signatures: - function_names.append(name) - return function_names + fn_name = join_names(namespace, fn_name) + if fn_name in self.import_signatures: + yield (fn_node, fn_name) def get_string_literals(self, node: Node) -> List[Tuple[Node, str]]: return self.query.string_literal.captures(node) @@ -95,11 +95,8 @@ def get_string_literals(self, node: Node) -> List[Tuple[Node, str]]: def get_integer_literals(self, node: Node) -> List[Tuple[Node, str]]: return self.query.integer_literal.captures(node) - def get_namespaces(self, node: Node) -> List[Tuple[Node, str]]: - return self.query.namespace.captures(node) - - def get_all_namespaces(self) -> List[Tuple[Node, str]]: - return self.get_namespaces(self.tree.root_node) + def get_namespaces(self, node: Node = None) -> List[Tuple[Node, str]]: + return self.query.namespace.captures(node if node is not None else self.tree.root_node) def get_global_statements(self) -> List[Tuple[Node, str]]: return self.query.global_statement.captures(self.tree.root_node) diff --git a/capa/features/extractors/ts/file.py b/capa/features/extractors/ts/file.py index 24630084c..a092169ab 100644 --- a/capa/features/extractors/ts/file.py +++ b/capa/features/extractors/ts/file.py @@ -13,13 +13,15 @@ def extract_file_format(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Fea def extract_language(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: - yield from capa.features.extractors.script.extract_language(engine.get_language(), engine.get_default_address()) + yield from capa.features.extractors.script.extract_language(engine.language, engine.get_default_address()) def extract_file_strings(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: for global_node, _ in engine.get_global_statements(): for node, _ in engine.get_string_literals(global_node): - yield String(engine.get_range(node).strip('"')), engine.get_address(node) + s = 
engine.get_range(node).strip('"') + if len(s) > 0: + yield String(engine.get_range(node).strip('"')), engine.get_address(node) def extract_file_integer_literals(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: @@ -29,7 +31,7 @@ def extract_file_integer_literals(engine: TreeSitterExtractorEngine) -> Iterator def extract_namespaces(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: - for node, _ in engine.get_all_namespaces(): + for node, _ in engine.get_namespaces(): yield Namespace(engine.get_range(node)), engine.get_address(node) diff --git a/capa/features/extractors/ts/query.py b/capa/features/extractors/ts/query.py index ddcc5c375..25de4c22b 100644 --- a/capa/features/extractors/ts/query.py +++ b/capa/features/extractors/ts/query.py @@ -32,7 +32,9 @@ def __init__(self, language: str): self.function_call_field_name = "function" self.string_literal = self.language.query("(string_literal) @string-literal") self.integer_literal = self.language.query("(integer_literal) @integer-literal") - self.namespace = self.language.query("(using_directive\n\t(qualified_name) @namespace)") + self.namespace = self.language.query( + "(using_directive [(identifier) @namespace (qualified_name) @namespace])" + ) self.global_statement = self.language.query("(global_statement) @global-statement") else: raise NotImplementedError(f"Tree-sitter queries for {language} are not implemented.") diff --git a/tests/test_ts_engine.py b/tests/test_ts_engine.py index 14c7e12d9..8c6c0484d 100644 --- a/tests/test_ts_engine.py +++ b/tests/test_ts_engine.py @@ -2,7 +2,7 @@ import pytest from fixtures import * -from tree_sitter import Node, Tree, Parser +from tree_sitter import Node, Tree from capa.features.address import FileOffsetRangeAddress from capa.features.extractors.script import LANG_CS @@ -16,7 +16,6 @@ def do_test_ts_engine_init(engine: TreeSitterExtractorEngine): assert isinstance(engine.import_signatures, set) and len(engine.import_signatures) > 
0 assert isinstance(engine.path, str) and len(engine.path) > 0 assert isinstance(engine.buf, bytes) and len(engine.buf) > 0 - assert isinstance(engine.parser, Parser) assert isinstance(engine.tree, Tree) assert isinstance(engine.get_default_address(), FileOffsetRangeAddress) addr = engine.get_default_address() @@ -37,9 +36,17 @@ def do_test_range_address(engine: TreeSitterExtractorEngine, node: Node): assert addr.start_byte == node.start_byte and addr.end_byte == node.end_byte +def do_test_ts_engine_default_range_address(engine: TreeSitterExtractorEngine): + assert isinstance(engine.get_default_address(), FileOffsetRangeAddress) + addr1 = engine.get_address(engine.tree.root_node) + addr2 = engine.get_default_address() + assert addr1.start_byte == addr2.start_byte and addr1.end_byte == addr2.end_byte + + def do_test_ts_engine_object_parsing( engine: TreeSitterExtractorEngine, root_node: Node, expected_list: List[Tuple[str, str]] ): + assert len(engine.get_new_objects(root_node)) == len(expected_list) for (node, name), (expected_range, expected_id_range) in zip(engine.get_new_objects(root_node), expected_list): assert isinstance(node, Node) assert name == "object.new" @@ -47,6 +54,7 @@ def do_test_ts_engine_object_parsing( do_test_range_address(engine, node) do_test_range(engine, engine.get_object_id(node), expected_id_range) + assert len(list(engine.get_new_object_ids(root_node))) == len(expected_list) for node, (_, expected_id_range) in zip(engine.get_new_object_ids(root_node), expected_list): assert isinstance(node, Node) do_test_range(engine, node, expected_id_range) @@ -56,6 +64,8 @@ def do_test_ts_engine_object_parsing( def do_test_ts_engine_function_definition_parsing( engine: TreeSitterExtractorEngine, root_node: Node, expected_list: List[Tuple[str, str]] ): + assert engine.get_function_definitions(engine.tree.root_node) == engine.get_function_definitions() + assert len(engine.get_function_definitions(root_node)) == len(expected_list) for (node, name), 
(expected_range, expected_id_range) in zip( engine.get_function_definitions(root_node), expected_list ): @@ -65,6 +75,7 @@ def do_test_ts_engine_function_definition_parsing( do_test_range_address(engine, node) do_test_range(engine, engine.get_function_definition_id(node), expected_id_range) + assert len(list(engine.get_function_definition_ids(root_node))) == len(expected_list) for node, (_, expected_id_range) in zip(engine.get_function_definition_ids(root_node), expected_list): assert isinstance(node, Node) do_test_range(engine, node, expected_id_range) @@ -74,6 +85,7 @@ def do_test_ts_engine_function_definition_parsing( def do_test_ts_engine_function_call_parsing( engine: TreeSitterExtractorEngine, root_node: Node, expected_list: List[Tuple[str, str]] ): + assert len(engine.get_function_calls(root_node)) == len(expected_list) for (node, name), (expected_range, expected_id_range) in zip(engine.get_function_calls(root_node), expected_list): assert isinstance(node, Node) assert name == "function.call" @@ -81,6 +93,7 @@ def do_test_ts_engine_function_call_parsing( do_test_range_address(engine, node) do_test_range(engine, engine.get_function_call_id(node), expected_id_range) + assert len(list(engine.get_function_call_ids(root_node))) == len(expected_list) for node, (_, expected_id_range) in zip(engine.get_function_call_ids(root_node), expected_list): assert isinstance(node, Node) do_test_range(engine, node, expected_id_range) @@ -88,8 +101,9 @@ def do_test_ts_engine_function_call_parsing( def do_test_ts_engine_string_literals_parsing( - engine: TreeSitterExtractorEngine, root_node: Node, expected_list: Tuple[str] + engine: TreeSitterExtractorEngine, root_node: Node, expected_list: List[str] ): + assert len(engine.get_string_literals(root_node)) == len(expected_list) for (node, name), expected_range in zip(engine.get_string_literals(root_node), expected_list): assert isinstance(node, Node) assert name == "string-literal" @@ -98,8 +112,9 @@ def 
do_test_ts_engine_string_literals_parsing( def do_test_ts_engine_integer_literals_parsing( - engine: TreeSitterExtractorEngine, root_node: Node, expected_list: Tuple[str] + engine: TreeSitterExtractorEngine, root_node: Node, expected_list: List[str] ): + assert len(engine.get_integer_literals(root_node)) == len(expected_list) for (node, name), expected_range in zip(engine.get_integer_literals(root_node), expected_list): assert isinstance(node, Node) assert name == "integer-literal" @@ -107,6 +122,45 @@ def do_test_ts_engine_integer_literals_parsing( do_test_range_address(engine, node) +def do_test_ts_engine_namespaces_parsing(engine: TreeSitterExtractorEngine, expected_list: List[str]): + assert engine.get_namespaces(engine.tree.root_node) == engine.get_namespaces() + assert len(engine.get_namespaces()) == len(expected_list) + for (node, name), expected_range in zip(engine.get_namespaces(), expected_list): + assert isinstance(node, Node) + assert name == "namespace" + do_test_range(engine, node, expected_range) + do_test_range_address(engine, node) + + +def do_test_ts_engine_global_statements_parsing(engine: TreeSitterExtractorEngine, expected_list: List[str]): + assert len(engine.get_global_statements()) == len(expected_list) + for (node, name), expected_range in zip(engine.get_global_statements(), expected_list): + assert isinstance(node, Node) + assert name == "global-statement" + do_test_range(engine, node, expected_range, startswith=True) + do_test_range_address(engine, node) + + +def do_test_ts_engine_import_names_parsing( + engine: TreeSitterExtractorEngine, root_node: Node, expected_list: List[str] +): + assert len(list(engine.get_import_names(root_node))) == len(expected_list) + for (node, import_name), expected_import_name in zip(list(engine.get_import_names(root_node)), expected_list): + assert isinstance(node, Node) + assert import_name == expected_import_name + do_test_range_address(engine, node) + + +def do_test_ts_engine_function_names_parsing( + 
engine: TreeSitterExtractorEngine, root_node: Node, expected_list: List[str] +): + assert len(list(engine.get_function_names(root_node))) == len(expected_list) + for (node, function_name), expected_function_name in zip(list(engine.get_function_names(root_node)), expected_list): + assert isinstance(node, Node) + assert function_name == expected_function_name + do_test_range_address(engine, node) + + @parametrize( "engine_str,expected_dict", [ @@ -157,12 +211,8 @@ def do_test_ts_engine_integer_literals_parsing( "p.StandardError.ReadToEnd()", "p.StandardError.ReadToEnd", ), - ( - "Page_Load(sender, e)", - "Page_Load", - ), ], - "all string literals": ( + "all string literals": [ '""', '""', '"Not Found"', @@ -176,11 +226,20 @@ def do_test_ts_engine_integer_literals_parsing( '"cmd"', '"/c "', '"c"', - ), - "all integer literals": ( + ], + "all integer literals": [ "404", "0", - ), + ], + "namespaces": ["System"], + "global statements": [ + 'string stdout = "";', + 'string stderr = "";', + "void die() {", + "void Page_Load(object sender, System.EventArgs e) {", + ], + "all import names": ["System.Diagnostics.ProcessStartInfo", "System.Diagnostics.Process"], + "all function names": [], }, ), ], @@ -195,3 +254,8 @@ def test_ts_engine(request: pytest.FixtureRequest, engine_str: str, expected_dic do_test_ts_engine_function_call_parsing(engine, engine.tree.root_node, expected_dict["all function calls"]) do_test_ts_engine_string_literals_parsing(engine, engine.tree.root_node, expected_dict["all string literals"]) do_test_ts_engine_integer_literals_parsing(engine, engine.tree.root_node, expected_dict["all integer literals"]) + do_test_ts_engine_import_names_parsing(engine, engine.tree.root_node, expected_dict["all import names"]) + do_test_ts_engine_function_names_parsing(engine, engine.tree.root_node, expected_dict["all function names"]) + do_test_ts_engine_global_statements_parsing(engine, expected_dict["global statements"]) + do_test_ts_engine_namespaces_parsing(engine, 
expected_dict["namespaces"]) + do_test_ts_engine_default_range_address(engine) From 31e2fb9dd1c007328bdbe0dbf5ab28ef2fffc32b Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Tue, 5 Jul 2022 12:09:16 -0400 Subject: [PATCH 14/51] Reverted yielding only non-empty strings in order to stay consistent and not introduce unspecified rule-exceptions. --- capa/features/extractors/ts/file.py | 5 ++--- tests/{test_ts_engine.py => test_ts.py} | 0 2 files changed, 2 insertions(+), 3 deletions(-) rename tests/{test_ts_engine.py => test_ts.py} (100%) diff --git a/capa/features/extractors/ts/file.py b/capa/features/extractors/ts/file.py index a092169ab..360291868 100644 --- a/capa/features/extractors/ts/file.py +++ b/capa/features/extractors/ts/file.py @@ -19,9 +19,7 @@ def extract_language(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Featur def extract_file_strings(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: for global_node, _ in engine.get_global_statements(): for node, _ in engine.get_string_literals(global_node): - s = engine.get_range(node).strip('"') - if len(s) > 0: - yield String(engine.get_range(node).strip('"')), engine.get_address(node) + yield String(engine.get_range(node).strip('"')), engine.get_address(node) def extract_file_integer_literals(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: @@ -60,4 +58,5 @@ def extract_features(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Featur extract_file_integer_literals, extract_file_strings, extract_language, + extract_namespaces, ) diff --git a/tests/test_ts_engine.py b/tests/test_ts.py similarity index 100% rename from tests/test_ts_engine.py rename to tests/test_ts.py From 5bf3f188716198e948db04548a2f9a48b0014058 Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Tue, 5 Jul 2022 12:11:11 -0400 Subject: [PATCH 15/51] Removing functions that should not be used in tree-sitter extractor (default to the base extractor level). 
--- capa/features/extractors/ts/extractor.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/capa/features/extractors/ts/extractor.py b/capa/features/extractors/ts/extractor.py index 75b183097..37b24cdf3 100644 --- a/capa/features/extractors/ts/extractor.py +++ b/capa/features/extractors/ts/extractor.py @@ -5,7 +5,7 @@ import capa.features.extractors.ts.engine import capa.features.extractors.ts.global_ import capa.features.extractors.ts.function -from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress +from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress, FileOffsetRangeAddress from capa.features.extractors.ts.engine import TreeSitterExtractorEngine from capa.features.extractors.base_extractor import Feature, BBHandle, InsnHandle, FunctionHandle, FeatureExtractor @@ -46,9 +46,3 @@ def extract_insn_features( self, f: FunctionHandle, bb: BBHandle, insn: InsnHandle ) -> Iterator[Tuple[Feature, Address]]: yield from [] - - def is_library_function(self, addr) -> bool: - return False - - def get_function_name(self, addr) -> str: - return self.engine.tree.buf[addr.start_byte : addr.end_byte].decode() From a4529fc17302ca589a7ad7792c8c1ce3d935b69b Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Tue, 5 Jul 2022 12:23:45 -0400 Subject: [PATCH 16/51] Modifying extraction of global statements to omit local function declarations. 
--- capa/features/extractors/ts/query.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/capa/features/extractors/ts/query.py b/capa/features/extractors/ts/query.py index 25de4c22b..796817e9a 100644 --- a/capa/features/extractors/ts/query.py +++ b/capa/features/extractors/ts/query.py @@ -35,6 +35,8 @@ def __init__(self, language: str): self.namespace = self.language.query( "(using_directive [(identifier) @namespace (qualified_name) @namespace])" ) - self.global_statement = self.language.query("(global_statement) @global-statement") + self.global_statement = self.language.query( + "(global_statement [(expression_statement) @global-statement (local_declaration_statement) @global-statement])" + ) else: raise NotImplementedError(f"Tree-sitter queries for {language} are not implemented.") From d5de9a15e66844e5da023c9a76921eeba0511564 Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Tue, 5 Jul 2022 12:29:05 -0400 Subject: [PATCH 17/51] Added script language feature to freeze. 
--- capa/features/freeze/features.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/capa/features/freeze/features.py b/capa/features/freeze/features.py index 8f8665ca5..633a49a5d 100644 --- a/capa/features/freeze/features.py +++ b/capa/features/freeze/features.py @@ -24,6 +24,9 @@ def to_capa(self) -> capa.features.common.Feature: elif isinstance(self, FormatFeature): return capa.features.common.Format(self.format, description=self.description) + elif isinstance(self, ScriptLanguageFeature): + return capa.features.common.ScriptLanguage(self.language, description=self.description) + elif isinstance(self, MatchFeature): return capa.features.common.MatchedRule(self.match, description=self.description) @@ -106,6 +109,9 @@ def feature_from_capa(f: capa.features.common.Feature) -> "Feature": elif isinstance(f, capa.features.common.Format): return FormatFeature(format=f.value, description=f.description) + elif isinstance(f, capa.features.common.ScriptLanguage): + return ScriptLanguageFeature(language=f.value, description=f.description) + elif isinstance(f, capa.features.common.MatchedRule): return MatchFeature(match=f.value, description=f.description) @@ -189,6 +195,12 @@ class FormatFeature(FeatureModel): description: Optional[str] +class ScriptLanguageFeature(FeatureModel): + type: str = "script language" + language: str + description: Optional[str] + + class MatchFeature(FeatureModel): type: str = "match" match: str @@ -308,6 +320,7 @@ class OperandOffsetFeature(FeatureModel): OSFeature, ArchFeature, FormatFeature, + ScriptLanguageFeature, MatchFeature, CharacteristicFeature, ExportFeature, From 6c10458784c20d6bff0653ca8f7cf13f9bdf61df Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Tue, 5 Jul 2022 12:31:57 -0400 Subject: [PATCH 18/51] Added test cases for TS Extractor. 
--- capa/features/extractors/script.py | 3 +- tests/fixtures.py | 49 ++++++++++++++++++++++++++++-- tests/test_ts.py | 32 ++++++++++++++----- 3 files changed, 72 insertions(+), 12 deletions(-) diff --git a/capa/features/extractors/script.py b/capa/features/extractors/script.py index 67145aea9..4ffa9cf76 100644 --- a/capa/features/extractors/script.py +++ b/capa/features/extractors/script.py @@ -24,7 +24,6 @@ def extract_format() -> Iterator[Tuple[Feature, Address]]: def get_language_from_ext(path: str): - _, ext = os.path.splitext(path) - if ext == ".cs": + if path.endswith((".cs", ".cs_")): return LANG_CS raise ValueError(f"{path} has an unrecognized or an unsupported extension.") diff --git a/tests/fixtures.py b/tests/fixtures.py index 594a33e35..0843a198b 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -13,7 +13,7 @@ import itertools import contextlib import collections -from typing import Set, Dict +from typing import Set, Dict, Union from functools import lru_cache import pytest @@ -38,6 +38,7 @@ Feature, ) from capa.features.address import Address +from capa.features.extractors.ts.extractor import TreeSitterFeatureExtractor from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle from capa.features.extractors.dnfile.extractor import DnfileFeatureExtractor @@ -177,6 +178,13 @@ def get_ts_extractor_engine(language, path): return capa.features.extractors.ts.engine.TreeSitterExtractorEngine(language, path) +@lru_cache(maxsize=1) +def get_ts_extractor(path): + import capa.features.extractors.ts.extractor + + return capa.features.extractors.ts.extractor.TreeSitterFeatureExtractor(path) + + def extract_global_features(extractor): features = collections.defaultdict(set) for feature, va in extractor.extract_global_features(): @@ -359,9 +367,13 @@ def sample(request): return resolve_sample(request.param) -def get_function(extractor, fva: int) -> FunctionHandle: +def get_function(extractor, fva: Union[int, tuple]) -> 
FunctionHandle: + if isinstance(fva, tuple) and not isinstance(extractor, TreeSitterFeatureExtractor): + raise ValueError("invalid fva format") for fh in extractor.get_functions(): - if isinstance(extractor, DnfileFeatureExtractor): + if isinstance(extractor, TreeSitterFeatureExtractor): + addr = (fh.inner.start_byte, fh.inner.end_byte) + elif isinstance(extractor, DnfileFeatureExtractor): addr = fh.inner.offset else: addr = fh.address @@ -475,6 +487,37 @@ def scope(request): return resolve_scope(request.param) +def resolve_scope_ts(scope): + if scope == "global": + inner_fn = lambda extractor: extract_global_features(extractor) + elif scope == "file": + + def inner_fn(extractor): + features = extract_file_features(extractor) + for k, vs in extract_global_features(extractor).items(): + features[k].update(vs) + return features + + elif scope.startswith("function"): + # like `function=(155, 192)` + def inner_fn(extractor): + fh = get_function(extractor, eval(scope.partition("=")[2])) + features = extract_function_features(extractor, fh) + for k, vs in extract_global_features(extractor).items(): + features[k].update(vs) + return features + + else: + raise ValueError("unexpected scope fixture") + inner_fn.__name__ = scope + return inner_fn + + +@pytest.fixture +def scope_ts(request): + return resolve_scope_ts(request.param) + + def make_test_id(values): return "-".join(map(str, values)) diff --git a/tests/test_ts.py b/tests/test_ts.py index 8c6c0484d..a3dc20c3d 100644 --- a/tests/test_ts.py +++ b/tests/test_ts.py @@ -1,10 +1,13 @@ from typing import List, Tuple import pytest +import fixtures from fixtures import * from tree_sitter import Node, Tree -from capa.features.address import FileOffsetRangeAddress +from capa.features.file import Import +from capa.features.common import OS, OS_ANY, ARCH_ANY, FORMAT_SCRIPT, Arch, Format, String, Namespace, ScriptLanguage +from capa.features.address import NO_ADDRESS, FileOffsetRangeAddress from capa.features.extractors.script 
import LANG_CS from capa.features.extractors.ts.query import QueryBinding from capa.features.extractors.ts.engine import TreeSitterExtractorEngine @@ -26,10 +29,6 @@ def do_test_range(engine: TreeSitterExtractorEngine, node: Node, expected_range: assert engine.get_range(node).startswith(expected_range) if startswith else engine.get_range(node) == expected_range -def do_test_id_range(engine: TreeSitterExtractorEngine, node: Node, expected_id_range: str, startswith: bool = False): - do_test_range(engine, engine.get_object_id(node), expected_id_range, startswith) - - def do_test_range_address(engine: TreeSitterExtractorEngine, node: Node): assert isinstance(engine.get_address(node), FileOffsetRangeAddress) addr = engine.get_address(node) @@ -235,8 +234,6 @@ def do_test_ts_engine_function_names_parsing( "global statements": [ 'string stdout = "";', 'string stderr = "";', - "void die() {", - "void Page_Load(object sender, System.EventArgs e) {", ], "all import names": ["System.Diagnostics.ProcessStartInfo", "System.Diagnostics.Process"], "all function names": [], @@ -259,3 +256,24 @@ def test_ts_engine(request: pytest.FixtureRequest, engine_str: str, expected_dic do_test_ts_engine_global_statements_parsing(engine, expected_dict["global statements"]) do_test_ts_engine_namespaces_parsing(engine, expected_dict["namespaces"]) do_test_ts_engine_default_range_address(engine) + + +FEATURE_PRESENCE_TESTS_SCRIPTS = sorted( + [ + ("cs_f397cb", "global", Arch(ARCH_ANY), True), + ("cs_f397cb", "global", OS(OS_ANY), True), + ("cs_f397cb", "file", Format(FORMAT_SCRIPT), True), + ("cs_f397cb", "file", ScriptLanguage(LANG_CS), True), + ("cs_f397cb", "file", Namespace("System"), True), + ("cs_f397cb", "file", String(""), True), + ("cs_f397cb", "function=(0x38,0x16c)", String("Not Found"), True), + ("cs_f397cb", "function=(0x16e,0x7ce)", String("127.0.0.1"), True), + ("cs_f397cb", "function=(0x16e,0x7ce)", Import("System.Diagnostics.ProcessStartInfo"), True), + ("cs_f397cb", 
"function=(0x16e,0x7ce)", Import("System.Diagnostics.Process"), True), + ] +) + + +@parametrize("sample, scope_ts, feature, expected", FEATURE_PRESENCE_TESTS_SCRIPTS, indirect=["sample", "scope_ts"]) +def test_ts_extractor(sample, scope_ts, feature, expected): + fixtures.do_test_feature_presence(fixtures.get_ts_extractor, sample, scope_ts, feature, expected) From 9bd98242539ea6296d2272e814659423b0f0789d Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Wed, 6 Jul 2022 09:19:51 -0400 Subject: [PATCH 19/51] Refactored query bindings. --- capa/features/extractors/script.py | 3 ++ capa/features/extractors/ts/engine.py | 6 +-- capa/features/extractors/ts/extractor.py | 2 +- capa/features/extractors/ts/query.py | 58 ++++++++++++++++-------- 4 files changed, 47 insertions(+), 22 deletions(-) diff --git a/capa/features/extractors/script.py b/capa/features/extractors/script.py index 4ffa9cf76..d905f30d7 100644 --- a/capa/features/extractors/script.py +++ b/capa/features/extractors/script.py @@ -4,6 +4,7 @@ from capa.features.common import OS, OS_ANY, ARCH_ANY, FORMAT_SCRIPT, Arch, Format, Feature, ScriptLanguage from capa.features.address import NO_ADDRESS, Address, FileOffsetRangeAddress +LANG_ASPX = "aspx" LANG_CS = "c_sharp" @@ -24,6 +25,8 @@ def extract_format() -> Iterator[Tuple[Feature, Address]]: def get_language_from_ext(path: str): + if path.endswith((".aspx", "aspx_")): + return LANG_ASPX if path.endswith((".cs", ".cs_")): return LANG_CS raise ValueError(f"{path} has an unrecognized or an unsupported extension.") diff --git a/capa/features/extractors/ts/engine.py b/capa/features/extractors/ts/engine.py index fe84e077a..b787e6741 100644 --- a/capa/features/extractors/ts/engine.py +++ b/capa/features/extractors/ts/engine.py @@ -5,7 +5,7 @@ import capa.features.extractors.ts.sig import capa.features.extractors.ts.build from capa.features.address import FileOffsetRangeAddress -from capa.features.extractors.ts.query import QueryBinding +from 
capa.features.extractors.ts.query import QueryBinding, QueryBindingFactory class TreeSitterExtractorEngine: @@ -19,14 +19,14 @@ class TreeSitterExtractorEngine: def __init__(self, language: str, path: str): capa.features.extractors.ts.build.ts_build() self.language = language - self.query = QueryBinding(language) + self.query = QueryBindingFactory.from_language(language) self.import_signatures = capa.features.extractors.ts.sig.load_import_signatures(language) self.path = path with open(self.path, "rb") as f: self.buf = f.read() self.tree = self.parse() - def parse(self): + def parse(self) -> Tree: parser = Parser() parser.set_language(self.query.language) return parser.parse(self.buf) diff --git a/capa/features/extractors/ts/extractor.py b/capa/features/extractors/ts/extractor.py index 37b24cdf3..285f34bd3 100644 --- a/capa/features/extractors/ts/extractor.py +++ b/capa/features/extractors/ts/extractor.py @@ -5,7 +5,7 @@ import capa.features.extractors.ts.engine import capa.features.extractors.ts.global_ import capa.features.extractors.ts.function -from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress, FileOffsetRangeAddress +from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress from capa.features.extractors.ts.engine import TreeSitterExtractorEngine from capa.features.extractors.base_extractor import Feature, BBHandle, InsnHandle, FunctionHandle, FeatureExtractor diff --git a/capa/features/extractors/ts/query.py b/capa/features/extractors/ts/query.py index 796817e9a..ff176b71b 100644 --- a/capa/features/extractors/ts/query.py +++ b/capa/features/extractors/ts/query.py @@ -6,6 +6,19 @@ import capa.features.extractors.ts.build from capa.features.extractors.script import LANG_CS +CS_BINDING = { + "query": { + "new_object": "(object_creation_expression) @object.new", + "function_definition": "(local_function_statement) @function.definition", + "function_call": "(invocation_expression) @function.call", + "string_literal": 
"(string_literal) @string-literal", + "integer_literal": "(integer_literal) @integer-literal", + "namespace": "(using_directive [(identifier) @namespace (qualified_name) @namespace])", + "global_statement": "(global_statement [(expression_statement) @global-statement (local_declaration_statement) @global-statement])", + }, + "field_name": {"new_object": "type", "function_definition": "name", "function_call": "function"}, +} + @dataclass class QueryBinding: @@ -21,22 +34,31 @@ class QueryBinding: namespace: Query global_statement: Query - def __init__(self, language: str): - self.language = Language(capa.features.extractors.ts.build.build_dir, language) + +class QueryBindingFactory: + @staticmethod + def from_language(language: str) -> QueryBinding: + ts_language = Language(capa.features.extractors.ts.build.build_dir, language) if language == LANG_CS: - self.new_object = self.language.query("(object_creation_expression) @object.new") - self.new_object_field_name = "type" - self.function_definition = self.language.query("(local_function_statement) @function.definition") - self.function_definition_field_name = "name" - self.function_call = self.language.query("(invocation_expression) @function.call") - self.function_call_field_name = "function" - self.string_literal = self.language.query("(string_literal) @string-literal") - self.integer_literal = self.language.query("(integer_literal) @integer-literal") - self.namespace = self.language.query( - "(using_directive [(identifier) @namespace (qualified_name) @namespace])" - ) - self.global_statement = self.language.query( - "(global_statement [(expression_statement) @global-statement (local_declaration_statement) @global-statement])" - ) - else: - raise NotImplementedError(f"Tree-sitter queries for {language} are not implemented.") + return QueryBinding(language=ts_language, **QueryBindingFactory.deserialize(ts_language, CS_BINDING)) + raise NotImplementedError(f"Tree-sitter queries for {language} are not implemented.") + 
+ @staticmethod + def deserialize(language: Language, binding: dict) -> dict: + deserialized_binding = {} + for construct, query in binding["query"].items(): + deserialized_binding[construct] = language.query(query) + for construct, field_name in binding["field_name"].items(): + deserialized_binding[f"{construct}_field_name"] = field_name + return deserialized_binding + + +@dataclass +class EmbeddedQueryBinding: + language: Language + code: Query + + def __init__(self): + self.language = Language(capa.features.extractors.ts.build.build_dir, "embedded_template") + self.content = self.language.query("(content) @content") + self.code = self.language.query("(code) @code") From 2594849588c29eaede2872177671f6936206a28f Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Wed, 6 Jul 2022 10:22:13 -0400 Subject: [PATCH 20/51] Added support for template parsing. --- capa/features/extractors/script.py | 4 +- capa/features/extractors/ts/engine.py | 75 +++++++++++++++++++++++---- capa/features/extractors/ts/query.py | 27 +++++++--- 3 files changed, 86 insertions(+), 20 deletions(-) diff --git a/capa/features/extractors/script.py b/capa/features/extractors/script.py index d905f30d7..0c45a876c 100644 --- a/capa/features/extractors/script.py +++ b/capa/features/extractors/script.py @@ -4,7 +4,7 @@ from capa.features.common import OS, OS_ANY, ARCH_ANY, FORMAT_SCRIPT, Arch, Format, Feature, ScriptLanguage from capa.features.address import NO_ADDRESS, Address, FileOffsetRangeAddress -LANG_ASPX = "aspx" +LANG_TEM = "embedded_template" LANG_CS = "c_sharp" @@ -26,7 +26,7 @@ def extract_format() -> Iterator[Tuple[Feature, Address]]: def get_language_from_ext(path: str): if path.endswith((".aspx", "aspx_")): - return LANG_ASPX + return LANG_TEM if path.endswith((".cs", ".cs_")): return LANG_CS raise ValueError(f"{path} has an unrecognized or an unsupported extension.") diff --git a/capa/features/extractors/ts/engine.py b/capa/features/extractors/ts/engine.py index b787e6741..6c34af45a 100644 
--- a/capa/features/extractors/ts/engine.py +++ b/capa/features/extractors/ts/engine.py @@ -1,16 +1,22 @@ -from typing import List, Tuple, Iterator +import re +from typing import List, Tuple, Union, Iterator +from dataclasses import dataclass from tree_sitter import Node, Tree, Parser import capa.features.extractors.ts.sig import capa.features.extractors.ts.build from capa.features.address import FileOffsetRangeAddress -from capa.features.extractors.ts.query import QueryBinding, QueryBindingFactory +from capa.features.extractors.ts.query import ( + QueryBinding, + ScriptQueryBinding, + QueryBindingFactory, + TemplateQueryBinding, +) -class TreeSitterExtractorEngine: +class TreeSitterBaseEngine: buf: bytes - import_signatures: set language: str path: str query: QueryBinding @@ -31,6 +37,26 @@ def parse(self) -> Tree: parser.set_language(self.query.language) return parser.parse(self.buf) + def get_byte_range(self, node: Node) -> bytes: + return self.buf[node.start_byte : node.end_byte] + + def get_range(self, node: Node) -> str: + return self.get_byte_range(node).decode() + + def get_address(self, node: Node): + return FileOffsetRangeAddress(node.start_byte, node.end_byte) + + def get_default_address(self): + return self.get_address(self.tree.root_node) + + +class TreeSitterExtractorEngine(TreeSitterBaseEngine): + query: ScriptQueryBinding + import_signatures: set + + def __init__(self, language: str, path: str): + super().__init__(language, path) + def get_new_objects(self, node: Node) -> List[Tuple[Node, str]]: return self.query.new_object.captures(node) @@ -101,11 +127,40 @@ def get_namespaces(self, node: Node = None) -> List[Tuple[Node, str]]: def get_global_statements(self) -> List[Tuple[Node, str]]: return self.query.global_statement.captures(self.tree.root_node) - def get_range(self, node: Node) -> str: - return self.buf[node.start_byte : node.end_byte].decode() - def get_address(self, node: Node): - return FileOffsetRangeAddress(node.start_byte, 
node.end_byte) +@dataclass +class ASPXPseudoNode: + start_byte: int + end_byte: int - def get_default_address(self): - return self.get_address(self.tree.root_node) + +class TreeSitterTemplateEngine(TreeSitterBaseEngine): + query: TemplateQueryBinding + + def __init__(self, language: str, path: str): + super().__init__(language, path) + + def get_code_sections(self) -> List[Tuple[Node, str]]: + return self.query.code.captures(self.tree.root_node) + + def get_content_sections(self) -> List[Tuple[Node, str]]: + return self.query.content.captures(self.tree.root_node) + + def get_template_namespaces(self) -> Iterator[ASPXPseudoNode]: + for node, _ in self.get_code_sections(): + if self.is_aspx_import_directive: + ns = self.get_aspx_namespace(node) + if ns is not None: + yield ns + + def is_aspx(self, node: Node) -> bool: + return self.get_byte_range(node).startswith(b"@") + + def is_aspx_import_directive(self, node: Node) -> bool: + return self.get_byte_range(node).startswith(b"@ Import namespace=") + + def get_aspx_namespace(self, node: Node) -> Union[ASPXPseudoNode, None]: + match = re.search(b'@ Import namespace="(.*?)"', self.get_byte_range(node)) + if match is None: + return None + return ASPXPseudoNode(node.start_byte + match.span()[0], node.start_byte + match.span()[1]) diff --git a/capa/features/extractors/ts/query.py b/capa/features/extractors/ts/query.py index ff176b71b..d27752827 100644 --- a/capa/features/extractors/ts/query.py +++ b/capa/features/extractors/ts/query.py @@ -4,7 +4,7 @@ from tree_sitter.binding import Query import capa.features.extractors.ts.build -from capa.features.extractors.script import LANG_CS +from capa.features.extractors.script import LANG_CS, LANG_TEM CS_BINDING = { "query": { @@ -19,10 +19,19 @@ "field_name": {"new_object": "type", "function_definition": "name", "function_call": "function"}, } +TM_BINDING = { + "code": "(code) @code", + "content": "(content) @content", +} + @dataclass class QueryBinding: language: Language + + 
+@dataclass +class ScriptQueryBinding(QueryBinding): new_object: Query new_object_field_name: str function_definition: Query @@ -35,12 +44,20 @@ class QueryBinding: global_statement: Query +@dataclass +class TemplateQueryBinding(QueryBinding): + code: Query + content: Query + + class QueryBindingFactory: @staticmethod def from_language(language: str) -> QueryBinding: ts_language = Language(capa.features.extractors.ts.build.build_dir, language) if language == LANG_CS: - return QueryBinding(language=ts_language, **QueryBindingFactory.deserialize(ts_language, CS_BINDING)) + return ScriptQueryBinding(language=ts_language, **QueryBindingFactory.deserialize(ts_language, CS_BINDING)) + if language in LANG_TEM: + return TemplateQueryBinding(language=ts_language, **TM_BINDING) raise NotImplementedError(f"Tree-sitter queries for {language} are not implemented.") @staticmethod @@ -52,12 +69,6 @@ def deserialize(language: Language, binding: dict) -> dict: deserialized_binding[f"{construct}_field_name"] = field_name return deserialized_binding - -@dataclass -class EmbeddedQueryBinding: - language: Language - code: Query - def __init__(self): self.language = Language(capa.features.extractors.ts.build.build_dir, "embedded_template") self.content = self.language.query("(content) @content") From 619ed94eca2ece4366d4a6f13763f4f27ad49fa0 Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Wed, 6 Jul 2022 12:09:24 -0400 Subject: [PATCH 21/51] Added support for HTML parsing. 
--- capa/features/extractors/script.py | 6 ++++- capa/features/extractors/ts/engine.py | 36 +++++++++++++++++++++++++-- capa/features/extractors/ts/query.py | 19 +++++++++++--- 3 files changed, 55 insertions(+), 6 deletions(-) diff --git a/capa/features/extractors/script.py b/capa/features/extractors/script.py index 0c45a876c..e61783ccb 100644 --- a/capa/features/extractors/script.py +++ b/capa/features/extractors/script.py @@ -4,8 +4,10 @@ from capa.features.common import OS, OS_ANY, ARCH_ANY, FORMAT_SCRIPT, Arch, Format, Feature, ScriptLanguage from capa.features.address import NO_ADDRESS, Address, FileOffsetRangeAddress -LANG_TEM = "embedded_template" LANG_CS = "c_sharp" +LANG_HTML = "html" +LANG_JS = "javascript" +LANG_TEM = "embedded_template" def extract_arch() -> Iterator[Tuple[Feature, Address]]: @@ -29,4 +31,6 @@ def get_language_from_ext(path: str): return LANG_TEM if path.endswith((".cs", ".cs_")): return LANG_CS + if path.endswith(("html", "html_")): + return LANG_HTML raise ValueError(f"{path} has an unrecognized or an unsupported extension.") diff --git a/capa/features/extractors/ts/engine.py b/capa/features/extractors/ts/engine.py index 6c34af45a..9fe601b62 100644 --- a/capa/features/extractors/ts/engine.py +++ b/capa/features/extractors/ts/engine.py @@ -1,5 +1,6 @@ import re -from typing import List, Tuple, Union, Iterator +from typing import Dict, List, Tuple, Union, Iterator +from collections import defaultdict from dataclasses import dataclass from tree_sitter import Node, Tree, Parser @@ -7,8 +8,10 @@ import capa.features.extractors.ts.sig import capa.features.extractors.ts.build from capa.features.address import FileOffsetRangeAddress +from capa.features.extractors.script import LANG_CS, LANG_JS from capa.features.extractors.ts.query import ( QueryBinding, + HTMLQueryBinding, ScriptQueryBinding, QueryBindingFactory, TemplateQueryBinding, @@ -160,7 +163,36 @@ def is_aspx_import_directive(self, node: Node) -> bool: return 
self.get_byte_range(node).startswith(b"@ Import namespace=") def get_aspx_namespace(self, node: Node) -> Union[ASPXPseudoNode, None]: - match = re.search(b'@ Import namespace="(.*?)"', self.get_byte_range(node)) + match = re.search(r'@ Import namespace="(.*?)"'.encode(), self.get_byte_range(node)) if match is None: return None return ASPXPseudoNode(node.start_byte + match.span()[0], node.start_byte + match.span()[1]) + + +class TreeSitterHTMLEngine(TreeSitterBaseEngine): + query: HTMLQueryBinding + + def __init__(self, language: str, path: str): + super().__init__(language, path) + + def get_scripts(self) -> List[Tuple[Node, str]]: + return self.query.script_element.captures(self.tree.root_node) + + def get_attributes(self, node: Node) -> List[Tuple[Node, str]]: + return self.query.attribute.captures(self.tree.root_node) + + def get_code_sections_by_language(self) -> Dict[str, List[Node]]: + code_sections = defaultdict(list) + for script_node, _ in self.get_scripts(): + for attribute_node, _ in self.get_attributes(script_node): + script_language = self.identify_script_language(attribute_node) + code_sections[script_language].append(attribute_node) + return code_sections + + def identify_script_language(self, node: Node) -> str: + if self.is_server_side_c_sharp(node): + return LANG_CS + return LANG_JS + + def is_server_side_c_sharp(self, node: Node) -> bool: + return len(re.findall(r'runat\s*=\s*"server"'.encode(), self.get_byte_range(node))) > 0 diff --git a/capa/features/extractors/ts/query.py b/capa/features/extractors/ts/query.py index d27752827..1f213d0ff 100644 --- a/capa/features/extractors/ts/query.py +++ b/capa/features/extractors/ts/query.py @@ -4,7 +4,7 @@ from tree_sitter.binding import Query import capa.features.extractors.ts.build -from capa.features.extractors.script import LANG_CS, LANG_TEM +from capa.features.extractors.script import LANG_CS, LANG_TEM, LANG_HTML CS_BINDING = { "query": { @@ -19,11 +19,16 @@ "field_name": {"new_object": "type", 
"function_definition": "name", "function_call": "function"}, } -TM_BINDING = { +TEM_BINDING = { "code": "(code) @code", "content": "(content) @content", } +HTML_BINDING = { + "script_element": "(script_element) @script-element", + "attribute": "(attribute) @attribute", +} + @dataclass class QueryBinding: @@ -50,6 +55,12 @@ class TemplateQueryBinding(QueryBinding): content: Query +@dataclass +class HTMLQueryBinding(QueryBinding): + script_element: Query + attribute: Query + + class QueryBindingFactory: @staticmethod def from_language(language: str) -> QueryBinding: @@ -57,7 +68,9 @@ def from_language(language: str) -> QueryBinding: if language == LANG_CS: return ScriptQueryBinding(language=ts_language, **QueryBindingFactory.deserialize(ts_language, CS_BINDING)) if language in LANG_TEM: - return TemplateQueryBinding(language=ts_language, **TM_BINDING) + return TemplateQueryBinding(language=ts_language, **TEM_BINDING) + if language == LANG_HTML: + return HTMLQueryBinding(language=ts_language, **HTML_BINDING) raise NotImplementedError(f"Tree-sitter queries for {language} are not implemented.") @staticmethod From 5e2380234f46c0117ebfff1340e160c61a8dd22e Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Thu, 7 Jul 2022 10:15:52 -0400 Subject: [PATCH 22/51] Implemented the necessary modifications to support embedded templates/html: aspx. 
--- capa/features/extractors/script.py | 2 +- capa/features/extractors/ts/engine.py | 98 +++++++++++++----------- capa/features/extractors/ts/extractor.py | 59 +++++++++++--- capa/features/extractors/ts/file.py | 5 -- capa/features/extractors/ts/global_.py | 6 +- tests/fixtures.py | 8 +- tests/test_ts.py | 1 - 7 files changed, 115 insertions(+), 64 deletions(-) diff --git a/capa/features/extractors/script.py b/capa/features/extractors/script.py index e61783ccb..d8ba79e5c 100644 --- a/capa/features/extractors/script.py +++ b/capa/features/extractors/script.py @@ -26,7 +26,7 @@ def extract_format() -> Iterator[Tuple[Feature, Address]]: yield Format(FORMAT_SCRIPT), NO_ADDRESS -def get_language_from_ext(path: str): +def get_language_from_ext(path: str) -> str: if path.endswith((".aspx", "aspx_")): return LANG_TEM if path.endswith((".cs", ".cs_")): diff --git a/capa/features/extractors/ts/engine.py b/capa/features/extractors/ts/engine.py index 9fe601b62..80b6cc888 100644 --- a/capa/features/extractors/ts/engine.py +++ b/capa/features/extractors/ts/engine.py @@ -1,14 +1,12 @@ import re -from typing import Dict, List, Tuple, Union, Iterator -from collections import defaultdict -from dataclasses import dataclass +from typing import List, Tuple, Iterator, Optional from tree_sitter import Node, Tree, Parser import capa.features.extractors.ts.sig import capa.features.extractors.ts.build from capa.features.address import FileOffsetRangeAddress -from capa.features.extractors.script import LANG_CS, LANG_JS +from capa.features.extractors.script import LANG_CS, LANG_JS, LANG_TEM, LANG_HTML from capa.features.extractors.ts.query import ( QueryBinding, HTMLQueryBinding, @@ -21,18 +19,14 @@ class TreeSitterBaseEngine: buf: bytes language: str - path: str query: QueryBinding tree: Tree - def __init__(self, language: str, path: str): + def __init__(self, language: str, buf: bytes): capa.features.extractors.ts.build.ts_build() self.language = language self.query = 
QueryBindingFactory.from_language(language) - self.import_signatures = capa.features.extractors.ts.sig.load_import_signatures(language) - self.path = path - with open(self.path, "rb") as f: - self.buf = f.read() + self.buf = buf self.tree = self.parse() def parse(self) -> Tree: @@ -46,19 +40,27 @@ def get_byte_range(self, node: Node) -> bytes: def get_range(self, node: Node) -> str: return self.get_byte_range(node).decode() - def get_address(self, node: Node): + def get_address(self, node: Node) -> FileOffsetRangeAddress: return FileOffsetRangeAddress(node.start_byte, node.end_byte) - def get_default_address(self): + def get_default_address(self) -> FileOffsetRangeAddress: return self.get_address(self.tree.root_node) class TreeSitterExtractorEngine(TreeSitterBaseEngine): query: ScriptQueryBinding import_signatures: set + buf_offset: int + namespaces: set[str] + + def __init__(self, language: str, buf: bytes, buf_offset: int = 0, additional_namespaces: set[str] = None): + super().__init__(language, buf) + self.buf_offset = buf_offset + self.import_signatures = capa.features.extractors.ts.sig.load_import_signatures(language) + self.namespaces = additional_namespaces if additional_namespaces is not None else set() - def __init__(self, language: str, path: str): - super().__init__(language, path) + def get_address(self, node: Node) -> FileOffsetRangeAddress: + return FileOffsetRangeAddress(self.buf_offset + node.start_byte, self.buf_offset + node.end_byte) def get_new_objects(self, node: Node) -> List[Tuple[Node, str]]: return self.query.new_object.captures(node) @@ -73,13 +75,13 @@ def get_new_object_ids(self, node: Node) -> Iterator[Node]: # TODO: move this elsewhere, does not fit this class def get_import_names(self, node: Node) -> Iterator[Tuple[Node, str]]: join_names = capa.features.extractors.ts.sig.get_name_joiner(self.language) - namespaces = set([self.get_range(ns_node) for ns_node, _ in self.get_namespaces()]) + self.namespaces = 
self.namespaces.union(set([self.get_range(ns_node) for ns_node, _ in self.get_namespaces()])) for obj_node in self.get_new_object_ids(node): obj_name = self.get_range(obj_node) if obj_name in self.import_signatures: yield (obj_node, obj_name) continue - for namespace in namespaces: + for namespace in self.namespaces: obj_name = join_names(namespace, obj_name) if obj_name in self.import_signatures: yield (obj_node, obj_name) @@ -107,13 +109,13 @@ def get_function_call_ids(self, node: Node) -> Iterator[Node]: # TODO: move this elsewhere, does not fit this class def get_function_names(self, node: Node) -> Iterator[Tuple[Node, str]]: join_names = capa.features.extractors.ts.sig.get_name_joiner(self.language) - namespaces = set([self.get_range(ns_node) for ns_node, _ in self.get_namespaces()]) + self.namespaces = self.namespaces.union(set([self.get_range(ns_node) for ns_node, _ in self.get_namespaces()])) for fn_node in self.get_function_call_ids(node): fn_name = self.get_range(fn_node) if fn_name in self.import_signatures: yield (fn_node, fn_name) continue - for namespace in namespaces: + for namespace in self.namespaces: fn_name = join_names(namespace, fn_name) if fn_name in self.import_signatures: yield (fn_node, fn_name) @@ -131,49 +133,56 @@ def get_global_statements(self) -> List[Tuple[Node, str]]: return self.query.global_statement.captures(self.tree.root_node) -@dataclass -class ASPXPseudoNode: - start_byte: int - end_byte: int - - class TreeSitterTemplateEngine(TreeSitterBaseEngine): query: TemplateQueryBinding - def __init__(self, language: str, path: str): - super().__init__(language, path) + def __init__(self, buf: bytes): + super().__init__(LANG_TEM, buf) def get_code_sections(self) -> List[Tuple[Node, str]]: return self.query.code.captures(self.tree.root_node) + def get_parsed_code_sections(self) -> Iterator[TreeSitterExtractorEngine]: + template_namespaces = set(name for _, name in self.get_template_namespaces()) + for node, _ in self.get_code_sections(): 
+ yield TreeSitterExtractorEngine( + self.identify_language(), self.get_byte_range(node), node.start_byte, template_namespaces + ) + def get_content_sections(self) -> List[Tuple[Node, str]]: return self.query.content.captures(self.tree.root_node) - def get_template_namespaces(self) -> Iterator[ASPXPseudoNode]: + def identify_language(self) -> str: + for node, _ in self.get_code_sections(): + if self.is_c_sharp(node): + return LANG_CS + return LANG_JS + + def get_template_namespaces(self) -> Iterator[Tuple[Node, str]]: for node, _ in self.get_code_sections(): if self.is_aspx_import_directive: - ns = self.get_aspx_namespace(node) - if ns is not None: - yield ns + namespace = self.get_aspx_namespace(node) + if namespace is not None: + yield node, namespace - def is_aspx(self, node: Node) -> bool: - return self.get_byte_range(node).startswith(b"@") + def is_c_sharp(self, node: Node) -> bool: + return len(re.findall(r'@ .*Page Language\s*=\s*"C#".*'.encode(), self.get_byte_range(node))) > 0 def is_aspx_import_directive(self, node: Node) -> bool: return self.get_byte_range(node).startswith(b"@ Import namespace=") - def get_aspx_namespace(self, node: Node) -> Union[ASPXPseudoNode, None]: + def get_aspx_namespace(self, node: Node) -> Optional[str]: match = re.search(r'@ Import namespace="(.*?)"'.encode(), self.get_byte_range(node)) - if match is None: - return None - return ASPXPseudoNode(node.start_byte + match.span()[0], node.start_byte + match.span()[1]) + return match.group().decode() if match is not None else None class TreeSitterHTMLEngine(TreeSitterBaseEngine): query: HTMLQueryBinding + namespaces: set[str] - def __init__(self, language: str, path: str): - super().__init__(language, path) + def __init__(self, buf: bytes, additional_namespaces: set[str] = None): + super().__init__(LANG_HTML, buf) + self.namespaces = additional_namespaces if additional_namespaces is not None else set() def get_scripts(self) -> List[Tuple[Node, str]]: return 
self.query.script_element.captures(self.tree.root_node) @@ -181,15 +190,16 @@ def get_scripts(self) -> List[Tuple[Node, str]]: def get_attributes(self, node: Node) -> List[Tuple[Node, str]]: return self.query.attribute.captures(self.tree.root_node) - def get_code_sections_by_language(self) -> Dict[str, List[Node]]: - code_sections = defaultdict(list) + def get_code_sections(self) -> Iterator[Node]: for script_node, _ in self.get_scripts(): for attribute_node, _ in self.get_attributes(script_node): - script_language = self.identify_script_language(attribute_node) - code_sections[script_language].append(attribute_node) - return code_sections + yield attribute_node + + def get_parsed_code_sections(self) -> Iterator[TreeSitterExtractorEngine]: + for node in self.get_code_sections(): + yield TreeSitterExtractorEngine(self.identify_language(node), self.get_byte_range(node), node.start_byte) - def identify_script_language(self, node: Node) -> str: + def identify_language(self, node: Node) -> str: if self.is_server_side_c_sharp(node): return LANG_CS return LANG_JS diff --git a/capa/features/extractors/ts/extractor.py b/capa/features/extractors/ts/extractor.py index 285f34bd3..02ee3c9a1 100644 --- a/capa/features/extractors/ts/extractor.py +++ b/capa/features/extractors/ts/extractor.py @@ -1,37 +1,78 @@ -from typing import Tuple, Union, Iterator +from typing import List, Tuple, Union, Iterator + +from tree_sitter import Node import capa.features.extractors.script import capa.features.extractors.ts.file import capa.features.extractors.ts.engine import capa.features.extractors.ts.global_ import capa.features.extractors.ts.function -from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress -from capa.features.extractors.ts.engine import TreeSitterExtractorEngine +from capa.features.common import Namespace +from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress, FileOffsetRangeAddress +from capa.features.extractors.script import 
LANG_TEM, LANG_HTML +from capa.features.extractors.ts.engine import TreeSitterHTMLEngine, TreeSitterTemplateEngine, TreeSitterExtractorEngine from capa.features.extractors.base_extractor import Feature, BBHandle, InsnHandle, FunctionHandle, FeatureExtractor class TreeSitterFeatureExtractor(FeatureExtractor): - engine: TreeSitterExtractorEngine + code_sections: List[TreeSitterExtractorEngine] + template_namespaces: set[Tuple[Node, str]] + language: str def __init__(self, path: str): super().__init__() - self.engine = TreeSitterExtractorEngine(capa.features.extractors.script.get_language_from_ext(path), path) + self.path = path + with open(self.path, "rb") as f: + buf = f.read() + + self.language = capa.features.extractors.script.get_language_from_ext(path) + if self.language == LANG_TEM: + self.code_sections, self.template_namespaces = self.extract_code_from_template(buf) + elif self.language == LANG_HTML: + self.code_sections = list(self.extract_code_from_html(buf)) + else: + self.code_sections = [TreeSitterExtractorEngine(self.language, buf)] + + def extract_code_from_template(self, buf: bytes) -> Tuple[List[TreeSitterExtractorEngine], set[Tuple[Node, str]]]: + template_engine = TreeSitterTemplateEngine(buf) + template_namespaces = set(template_engine.get_template_namespaces()) + code_sections = list(template_engine.get_parsed_code_sections()) + + additional_namespaces = set(name for _, name in template_namespaces) + for section in template_engine.get_content_sections(): + section_buf = template_engine.get_byte_range(section) + code_sections.extend(list(self.extract_code_from_html(section_buf, additional_namespaces))) + return code_sections, template_namespaces + + def extract_code_from_html( + self, buf: bytes, additional_namespaces: set[str] = None + ) -> Iterator[TreeSitterExtractorEngine]: + yield from TreeSitterHTMLEngine(buf, additional_namespaces).get_parsed_code_sections() def get_base_address(self) -> Union[AbsoluteVirtualAddress, 
capa.features.address._NoAddress]: return NO_ADDRESS + def extract_template_namespaces(self) -> Iterator[Tuple[Feature, Address]]: + for node, name in self.template_namespaces: + yield Namespace(name), FileOffsetRangeAddress(node.start_byte, node.end_byte) + def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]: yield from capa.features.extractors.ts.global_.extract_features() def extract_file_features(self) -> Iterator[Tuple[Feature, Address]]: - yield from capa.features.extractors.ts.file.extract_features(self.engine) + if self.language == LANG_TEM: + yield from self.extract_template_namespaces() + for engine in self.code_sections: + yield from capa.features.extractors.ts.file.extract_features(engine) def get_functions(self) -> Iterator[FunctionHandle]: - for node, _ in self.engine.get_function_definitions(): - yield FunctionHandle(address=self.engine.get_address(node), inner=node) + for engine in self.code_sections: + for node, _ in engine.get_function_definitions(): + yield FunctionHandle(address=engine.get_address(node), inner=node) def extract_function_features(self, f: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: - yield from capa.features.extractors.ts.function.extract_features(f, self.engine) + for engine in self.code_sections: + yield from capa.features.extractors.ts.function.extract_features(f, engine) def get_basic_blocks(self, f: FunctionHandle) -> Iterator[BBHandle]: yield from [] diff --git a/capa/features/extractors/ts/file.py b/capa/features/extractors/ts/file.py index 360291868..45962d1c5 100644 --- a/capa/features/extractors/ts/file.py +++ b/capa/features/extractors/ts/file.py @@ -8,10 +8,6 @@ from capa.features.extractors.ts.engine import TreeSitterExtractorEngine -def extract_file_format(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: - yield from capa.features.extractors.script.extract_format() - - def extract_language(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: 
yield from capa.features.extractors.script.extract_language(engine.language, engine.get_default_address()) @@ -52,7 +48,6 @@ def extract_features(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Featur FILE_HANDLERS = ( - extract_file_format, extract_file_function_names, extract_file_import_names, extract_file_integer_literals, diff --git a/capa/features/extractors/ts/global_.py b/capa/features/extractors/ts/global_.py index 23db0cb47..c0bab903b 100644 --- a/capa/features/extractors/ts/global_.py +++ b/capa/features/extractors/ts/global_.py @@ -19,4 +19,8 @@ def extract_features() -> Iterator[Tuple[Feature, Address]]: yield feature, addr -GLOBAL_HANDLERS = (extract_arch, extract_os) +def extract_file_format() -> Iterator[Tuple[Feature, Address]]: + yield from capa.features.extractors.script.extract_format() + + +GLOBAL_HANDLERS = (extract_arch, extract_os, extract_file_format) diff --git a/tests/fixtures.py b/tests/fixtures.py index 0843a198b..784dbf83a 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -172,10 +172,10 @@ def get_dnfile_extractor(path): @lru_cache(maxsize=1) -def get_ts_extractor_engine(language, path): +def get_ts_extractor_engine(language, buf): import capa.features.extractors.ts.engine - return capa.features.extractors.ts.engine.TreeSitterExtractorEngine(language, path) + return capa.features.extractors.ts.engine.TreeSitterExtractorEngine(language, buf) @lru_cache(maxsize=1) @@ -963,4 +963,6 @@ def _692f_dotnetfile_extractor(): @pytest.fixture def cs_f397cb_extractor_engine(): - return get_ts_extractor_engine("c_sharp", get_data_path_by_name("cs_f397cb")) + with open(get_data_path_by_name("cs_f397cb"), "rb") as f: + buf = f.read() + return get_ts_extractor_engine("c_sharp", buf) diff --git a/tests/test_ts.py b/tests/test_ts.py index a3dc20c3d..d3470cd8a 100644 --- a/tests/test_ts.py +++ b/tests/test_ts.py @@ -17,7 +17,6 @@ def do_test_ts_engine_init(engine: TreeSitterExtractorEngine): assert engine.language == LANG_CS assert 
isinstance(engine.query, QueryBinding) assert isinstance(engine.import_signatures, set) and len(engine.import_signatures) > 0 - assert isinstance(engine.path, str) and len(engine.path) > 0 assert isinstance(engine.buf, bytes) and len(engine.buf) > 0 assert isinstance(engine.tree, Tree) assert isinstance(engine.get_default_address(), FileOffsetRangeAddress) From 5d83e8df3e83b2cb0b2a8ca7d356c524b9a97bac Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Thu, 7 Jul 2022 14:56:31 -0400 Subject: [PATCH 23/51] Added more buildings to build; minor style improvement. --- capa/features/extractors/ts/build.py | 3 +++ capa/features/extractors/ts/query.py | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/capa/features/extractors/ts/build.py b/capa/features/extractors/ts/build.py index f04006bc8..0e78c2a92 100644 --- a/capa/features/extractors/ts/build.py +++ b/capa/features/extractors/ts/build.py @@ -3,6 +3,9 @@ build_dir = "build/my-languages.so" languages = [ "vendor/tree-sitter-c-sharp", + "vendor/tree-sitter-embedded-template", + "vendor/tree-sitter-html", + "vendor/tree-sitter-javascript", ] diff --git a/capa/features/extractors/ts/query.py b/capa/features/extractors/ts/query.py index 1f213d0ff..e9c3a62d1 100644 --- a/capa/features/extractors/ts/query.py +++ b/capa/features/extractors/ts/query.py @@ -83,6 +83,6 @@ def deserialize(language: Language, binding: dict) -> dict: return deserialized_binding def __init__(self): - self.language = Language(capa.features.extractors.ts.build.build_dir, "embedded_template") + self.language = Language(capa.features.extractors.ts.build.build_dir, LANG_TEM) self.content = self.language.query("(content) @content") self.code = self.language.query("(code) @code") From 95705234df866af3498bf171e4caaa1e0969c348 Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Thu, 7 Jul 2022 17:24:59 -0400 Subject: [PATCH 24/51] Further refactored the Tree-sitter queries and fixed minor template engine bugs. 
--- capa/features/extractors/ts/engine.py | 44 +++++++-- capa/features/extractors/ts/extractor.py | 9 +- capa/features/extractors/ts/query.py | 108 +++++++++++++---------- 3 files changed, 105 insertions(+), 56 deletions(-) diff --git a/capa/features/extractors/ts/engine.py b/capa/features/extractors/ts/engine.py index 80b6cc888..f66e8f773 100644 --- a/capa/features/extractors/ts/engine.py +++ b/capa/features/extractors/ts/engine.py @@ -8,10 +8,10 @@ from capa.features.address import FileOffsetRangeAddress from capa.features.extractors.script import LANG_CS, LANG_JS, LANG_TEM, LANG_HTML from capa.features.extractors.ts.query import ( + BINDINGS, QueryBinding, HTMLQueryBinding, ScriptQueryBinding, - QueryBindingFactory, TemplateQueryBinding, ) @@ -25,7 +25,7 @@ class TreeSitterBaseEngine: def __init__(self, language: str, buf: bytes): capa.features.extractors.ts.build.ts_build() self.language = language - self.query = QueryBindingFactory.from_language(language) + self.query = BINDINGS[language] self.buf = buf self.tree = self.parse() @@ -53,7 +53,13 @@ class TreeSitterExtractorEngine(TreeSitterBaseEngine): buf_offset: int namespaces: set[str] - def __init__(self, language: str, buf: bytes, buf_offset: int = 0, additional_namespaces: set[str] = None): + def __init__( + self, + language: str, + buf: bytes, + buf_offset: int = 0, + additional_namespaces: set[str] = None, + ): super().__init__(language, buf) self.buf_offset = buf_offset self.import_signatures = capa.features.extractors.ts.sig.load_import_signatures(language) @@ -146,7 +152,10 @@ def get_parsed_code_sections(self) -> Iterator[TreeSitterExtractorEngine]: template_namespaces = set(name for _, name in self.get_template_namespaces()) for node, _ in self.get_code_sections(): yield TreeSitterExtractorEngine( - self.identify_language(), self.get_byte_range(node), node.start_byte, template_namespaces + self.identify_language(), + self.get_byte_range(node), + node.start_byte, + template_namespaces, ) def 
get_content_sections(self) -> List[Tuple[Node, str]]: @@ -159,21 +168,38 @@ def identify_language(self) -> str: return LANG_JS def get_template_namespaces(self) -> Iterator[Tuple[Node, str]]: + # raise ValueError([self.get_range(node) for node, _ in self.get_code_sections()]) for node, _ in self.get_code_sections(): - if self.is_aspx_import_directive: + if self.is_aspx_import_directive(node): namespace = self.get_aspx_namespace(node) if namespace is not None: yield node, namespace def is_c_sharp(self, node: Node) -> bool: - return len(re.findall(r'@ .*Page Language\s*=\s*"C#".*'.encode(), self.get_byte_range(node))) > 0 + return bool( + re.match( + r'@ .*Page Language\s*=\s*"C#".*'.encode(), + self.get_byte_range(node), + re.IGNORECASE, + ) + ) def is_aspx_import_directive(self, node: Node) -> bool: - return self.get_byte_range(node).startswith(b"@ Import namespace=") + return bool( + re.match( + r"@ Import Namespace=".encode(), + self.get_byte_range(node), + re.IGNORECASE, + ) + ) def get_aspx_namespace(self, node: Node) -> Optional[str]: - match = re.search(r'@ Import namespace="(.*?)"'.encode(), self.get_byte_range(node)) - return match.group().decode() if match is not None else None + match = re.search( + r'@ Import namespace="(.*?)"'.encode(), + self.get_byte_range(node), + re.IGNORECASE, + ) + return match.group(1).decode() if match is not None else None class TreeSitterHTMLEngine(TreeSitterBaseEngine): diff --git a/capa/features/extractors/ts/extractor.py b/capa/features/extractors/ts/extractor.py index 02ee3c9a1..b62461eb8 100644 --- a/capa/features/extractors/ts/extractor.py +++ b/capa/features/extractors/ts/extractor.py @@ -27,7 +27,10 @@ def __init__(self, path: str): self.language = capa.features.extractors.script.get_language_from_ext(path) if self.language == LANG_TEM: - self.code_sections, self.template_namespaces = self.extract_code_from_template(buf) + ( + self.code_sections, + self.template_namespaces, + ) = self.extract_code_from_template(buf) 
elif self.language == LANG_HTML: self.code_sections = list(self.extract_code_from_html(buf)) else: @@ -49,7 +52,9 @@ def extract_code_from_html( ) -> Iterator[TreeSitterExtractorEngine]: yield from TreeSitterHTMLEngine(buf, additional_namespaces).get_parsed_code_sections() - def get_base_address(self) -> Union[AbsoluteVirtualAddress, capa.features.address._NoAddress]: + def get_base_address( + self, + ) -> Union[AbsoluteVirtualAddress, capa.features.address._NoAddress]: return NO_ADDRESS def extract_template_namespaces(self) -> Iterator[Tuple[Feature, Address]]: diff --git a/capa/features/extractors/ts/query.py b/capa/features/extractors/ts/query.py index e9c3a62d1..70fe046b7 100644 --- a/capa/features/extractors/ts/query.py +++ b/capa/features/extractors/ts/query.py @@ -4,30 +4,7 @@ from tree_sitter.binding import Query import capa.features.extractors.ts.build -from capa.features.extractors.script import LANG_CS, LANG_TEM, LANG_HTML - -CS_BINDING = { - "query": { - "new_object": "(object_creation_expression) @object.new", - "function_definition": "(local_function_statement) @function.definition", - "function_call": "(invocation_expression) @function.call", - "string_literal": "(string_literal) @string-literal", - "integer_literal": "(integer_literal) @integer-literal", - "namespace": "(using_directive [(identifier) @namespace (qualified_name) @namespace])", - "global_statement": "(global_statement [(expression_statement) @global-statement (local_declaration_statement) @global-statement])", - }, - "field_name": {"new_object": "type", "function_definition": "name", "function_call": "function"}, -} - -TEM_BINDING = { - "code": "(code) @code", - "content": "(content) @content", -} - -HTML_BINDING = { - "script_element": "(script_element) @script-element", - "attribute": "(attribute) @attribute", -} +from capa.features.extractors.script import LANG_CS, LANG_JS, LANG_TEM, LANG_HTML @dataclass @@ -61,28 +38,69 @@ class HTMLQueryBinding(QueryBinding): attribute: Query 
-class QueryBindingFactory: - @staticmethod - def from_language(language: str) -> QueryBinding: - ts_language = Language(capa.features.extractors.ts.build.build_dir, language) - if language == LANG_CS: - return ScriptQueryBinding(language=ts_language, **QueryBindingFactory.deserialize(ts_language, CS_BINDING)) - if language in LANG_TEM: - return TemplateQueryBinding(language=ts_language, **TEM_BINDING) - if language == LANG_HTML: - return HTMLQueryBinding(language=ts_language, **HTML_BINDING) - raise NotImplementedError(f"Tree-sitter queries for {language} are not implemented.") - - @staticmethod - def deserialize(language: Language, binding: dict) -> dict: - deserialized_binding = {} +def deserialize(language: str, binding: dict) -> dict: + deserialized_binding = {} + if "query" in binding: for construct, query in binding["query"].items(): - deserialized_binding[construct] = language.query(query) + deserialized_binding[construct] = TS_LANGUAGES[language].query(query) + if "field_name" in binding: for construct, field_name in binding["field_name"].items(): deserialized_binding[f"{construct}_field_name"] = field_name - return deserialized_binding + return deserialized_binding + - def __init__(self): - self.language = Language(capa.features.extractors.ts.build.build_dir, LANG_TEM) - self.content = self.language.query("(content) @content") - self.code = self.language.query("(code) @code") +TS_LANGUAGES: dict[str, Language] = { + LANG_CS: Language(capa.features.extractors.ts.build.build_dir, LANG_CS), + LANG_TEM: Language(capa.features.extractors.ts.build.build_dir, LANG_TEM), + LANG_HTML: Language(capa.features.extractors.ts.build.build_dir, LANG_HTML), + LANG_JS: Language(capa.features.extractors.ts.build.build_dir, LANG_JS), +} + +BINDINGS: dict[str, QueryBinding] = { + LANG_CS: ScriptQueryBinding( + TS_LANGUAGES[LANG_CS], + **deserialize( + LANG_CS, + { + "query": { + "new_object": "(object_creation_expression) @object.new", + "function_definition": 
"(local_function_statement) @function.definition", + "function_call": "(invocation_expression) @function.call", + "string_literal": "(string_literal) @string-literal", + "integer_literal": "(integer_literal) @integer-literal", + "namespace": "(using_directive [(identifier) @namespace (qualified_name) @namespace])", + "global_statement": "(global_statement [(expression_statement) @global-statement (local_declaration_statement) @global-statement])", + }, + "field_name": { + "new_object": "type", + "function_definition": "name", + "function_call": "function", + }, + }, + ), + ), + LANG_TEM: TemplateQueryBinding( + TS_LANGUAGES[LANG_TEM], + **deserialize( + LANG_TEM, + { + "query": { + "code": "(code) @code", + "content": "(content) @content", + }, + }, + ), + ), + LANG_HTML: HTMLQueryBinding( + TS_LANGUAGES[LANG_HTML], + **deserialize( + LANG_HTML, + { + "query": { + "script_element": "(script_element) @script-element", + "attribute": "(attribute) @attribute", + }, + }, + ), + ), +} From 7c5e6e302070489325e9b495e2681ba95d100c52 Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Thu, 7 Jul 2022 17:27:00 -0400 Subject: [PATCH 25/51] Refactored extractor engine tests and began adding new template tests. 
--- tests/data | 2 +- tests/fixtures.py | 33 ++++++-- tests/test_ts.py | 198 ++++++++++++++++++++++++++++++---------------- 3 files changed, 154 insertions(+), 79 deletions(-) diff --git a/tests/data b/tests/data index 2e8257475..b0ba2f632 160000 --- a/tests/data +++ b/tests/data @@ -1 +1 @@ -Subproject commit 2e8257475ebfdc8808d7e180be9a3f94977fcf57 +Subproject commit b0ba2f6328160ef6b34951e280f0210d954ca8bf diff --git a/tests/fixtures.py b/tests/fixtures.py index 784dbf83a..1f4cdf1f1 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -38,6 +38,7 @@ Feature, ) from capa.features.address import Address +from capa.features.extractors.script import LANG_CS, LANG_TEM from capa.features.extractors.ts.extractor import TreeSitterFeatureExtractor from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle from capa.features.extractors.dnfile.extractor import DnfileFeatureExtractor @@ -45,7 +46,9 @@ CD = os.path.dirname(__file__) DOTNET_DIR = os.path.join(CD, "data", "dotnet") DNFILE_TESTFILES = os.path.join(DOTNET_DIR, "dnfile-testfiles") -SCRIPT_DIR = os.path.join(CD, "data", "scripts") +SOURCE_DIR = os.path.join(CD, "data", "source") +ASPX_DIR = os.path.join(SOURCE_DIR, "aspx") +CS_DIR = os.path.join(SOURCE_DIR, "cs") @contextlib.contextmanager @@ -178,6 +181,13 @@ def get_ts_extractor_engine(language, buf): return capa.features.extractors.ts.engine.TreeSitterExtractorEngine(language, buf) +@lru_cache(maxsize=1) +def get_ts_template_engine(language, buf): + import capa.features.extractors.ts.engine + + return capa.features.extractors.ts.engine.TreeSitterTemplateEngine(buf) + + @lru_cache(maxsize=1) def get_ts_extractor(path): import capa.features.extractors.ts.extractor @@ -295,10 +305,10 @@ def get_data_path_by_name(name): return os.path.join(CD, "data", "dotnet", "1c444ebeba24dcba8628b7dfe5fec7c6.exe_") elif name.startswith("_692f"): return os.path.join(CD, "data", "dotnet", "692f7fd6d198e804d6af98eb9e390d61.exe_") - elif 
name.startswith("cs_f397cb"): - return os.path.join(SCRIPT_DIR, "f397cb676353873cdc8fcfbf0e3a317334353cc63946099e5ea22db6d1eebfb8.cs_") - elif name.startswith("aspx_f397cb"): - return os.path.join(SCRIPT_DIR, "f397cb676353873cdc8fcfbf0e3a317334353cc63946099e5ea22db6d1eebfb8.aspx_") + elif name.startswith("cs_138cdc"): + return os.path.join(CS_DIR, "138cdc4b10f3f5ece9c47bb0ec17fde5b70c1f9a90b267794c5e5dfa337fc798.cs_") + elif name.startswith("aspx_675375"): + return os.path.join(ASPX_DIR, "6753759936aaaddb29719010644edf886c0548a69aa06e469b546b5de647deeb.aspx_") else: raise ValueError("unexpected sample fixture: %s" % name) @@ -962,7 +972,14 @@ def _692f_dotnetfile_extractor(): @pytest.fixture -def cs_f397cb_extractor_engine(): - with open(get_data_path_by_name("cs_f397cb"), "rb") as f: +def cs_138cdc_extractor_engine(): + with open(get_data_path_by_name("cs_138cdc"), "rb") as f: + buf = f.read() + return get_ts_extractor_engine(LANG_CS, buf) + + +@pytest.fixture +def aspx_675375_extractor_engine(): + with open(get_data_path_by_name("aspx_675375"), "rb") as f: buf = f.read() - return get_ts_extractor_engine("c_sharp", buf) + return get_ts_template_engine(LANG_TEM, buf) diff --git a/tests/test_ts.py b/tests/test_ts.py index d3470cd8a..295f6ec28 100644 --- a/tests/test_ts.py +++ b/tests/test_ts.py @@ -7,59 +7,73 @@ from capa.features.file import Import from capa.features.common import OS, OS_ANY, ARCH_ANY, FORMAT_SCRIPT, Arch, Format, String, Namespace, ScriptLanguage -from capa.features.address import NO_ADDRESS, FileOffsetRangeAddress -from capa.features.extractors.script import LANG_CS -from capa.features.extractors.ts.query import QueryBinding -from capa.features.extractors.ts.engine import TreeSitterExtractorEngine +from capa.features.address import FileOffsetRangeAddress +from capa.features.extractors.script import LANG_CS, LANG_JS, LANG_TEM, LANG_HTML +from capa.features.extractors.ts.query import QueryBinding, TemplateQueryBinding +from 
capa.features.extractors.ts.engine import TreeSitterBaseEngine, TreeSitterTemplateEngine, TreeSitterExtractorEngine -def do_test_ts_engine_init(engine: TreeSitterExtractorEngine): - assert engine.language == LANG_CS +def do_test_ts_base_engine_init(engine: TreeSitterBaseEngine): + assert engine.language in [LANG_CS, LANG_TEM, LANG_HTML, LANG_JS] assert isinstance(engine.query, QueryBinding) - assert isinstance(engine.import_signatures, set) and len(engine.import_signatures) > 0 assert isinstance(engine.buf, bytes) and len(engine.buf) > 0 assert isinstance(engine.tree, Tree) - assert isinstance(engine.get_default_address(), FileOffsetRangeAddress) - addr = engine.get_default_address() - assert addr.start_byte == engine.tree.root_node.start_byte and addr.end_byte == engine.tree.root_node.end_byte -def do_test_range(engine: TreeSitterExtractorEngine, node: Node, expected_range: str, startswith: bool = False): +def do_test_ts_base_engine_get_range( + engine: TreeSitterBaseEngine, node: Node, expected_range: str, startswith: bool = False +): assert engine.get_range(node).startswith(expected_range) if startswith else engine.get_range(node) == expected_range -def do_test_range_address(engine: TreeSitterExtractorEngine, node: Node): +def do_test_ts_base_engine_get_address(engine: TreeSitterBaseEngine, node: Node): assert isinstance(engine.get_address(node), FileOffsetRangeAddress) addr = engine.get_address(node) assert addr.start_byte == node.start_byte and addr.end_byte == node.end_byte -def do_test_ts_engine_default_range_address(engine: TreeSitterExtractorEngine): +def do_test_ts_base_engine_get_default_address(engine: TreeSitterBaseEngine): assert isinstance(engine.get_default_address(), FileOffsetRangeAddress) addr1 = engine.get_address(engine.tree.root_node) addr2 = engine.get_default_address() assert addr1.start_byte == addr2.start_byte and addr1.end_byte == addr2.end_byte -def do_test_ts_engine_object_parsing( +def do_test_ts_extractor_engine_init(engine: 
TreeSitterExtractorEngine): + assert engine.language == LANG_CS + assert isinstance(engine.query, QueryBinding) + assert isinstance(engine.import_signatures, set) and len(engine.import_signatures) > 0 + assert isinstance(engine.get_default_address(), FileOffsetRangeAddress) + assert isinstance(engine.buf_offset, int) and engine.buf_offset >= 0 + addr = engine.get_default_address() + assert addr.start_byte == engine.tree.root_node.start_byte and addr.end_byte == engine.tree.root_node.end_byte + + +def do_test_ts_extractor_engine_get_address( + engine: TreeSitterExtractorEngine, node: Node, expected_range: str, startswith: bool = False +): + assert engine.get_range(node).startswith(expected_range) if startswith else engine.get_range(node) == expected_range + + +def do_test_ts_extractor_engine_get_new_objects( engine: TreeSitterExtractorEngine, root_node: Node, expected_list: List[Tuple[str, str]] ): assert len(engine.get_new_objects(root_node)) == len(expected_list) for (node, name), (expected_range, expected_id_range) in zip(engine.get_new_objects(root_node), expected_list): assert isinstance(node, Node) assert name == "object.new" - do_test_range(engine, node, expected_range) - do_test_range_address(engine, node) - do_test_range(engine, engine.get_object_id(node), expected_id_range) + do_test_ts_base_engine_get_range(engine, node, expected_range) + do_test_ts_base_engine_get_address(engine, node) + do_test_ts_base_engine_get_range(engine, engine.get_object_id(node), expected_id_range) assert len(list(engine.get_new_object_ids(root_node))) == len(expected_list) for node, (_, expected_id_range) in zip(engine.get_new_object_ids(root_node), expected_list): assert isinstance(node, Node) - do_test_range(engine, node, expected_id_range) - do_test_range_address(engine, node) + do_test_ts_base_engine_get_range(engine, node, expected_id_range) + do_test_ts_base_engine_get_address(engine, node) -def do_test_ts_engine_function_definition_parsing( +def 
do_test_ts_extractor_engine_get_function_definitions( engine: TreeSitterExtractorEngine, root_node: Node, expected_list: List[Tuple[str, str]] ): assert engine.get_function_definitions(engine.tree.root_node) == engine.get_function_definitions() @@ -69,101 +83,101 @@ def do_test_ts_engine_function_definition_parsing( ): assert isinstance(node, Node) assert name == "function.definition" - do_test_range(engine, node, expected_range, startswith=True) - do_test_range_address(engine, node) - do_test_range(engine, engine.get_function_definition_id(node), expected_id_range) + do_test_ts_base_engine_get_range(engine, node, expected_range, startswith=True) + do_test_ts_base_engine_get_address(engine, node) + do_test_ts_base_engine_get_range(engine, engine.get_function_definition_id(node), expected_id_range) assert len(list(engine.get_function_definition_ids(root_node))) == len(expected_list) for node, (_, expected_id_range) in zip(engine.get_function_definition_ids(root_node), expected_list): assert isinstance(node, Node) - do_test_range(engine, node, expected_id_range) - do_test_range_address(engine, node) + do_test_ts_base_engine_get_range(engine, node, expected_id_range) + do_test_ts_base_engine_get_address(engine, node) -def do_test_ts_engine_function_call_parsing( +def do_test_ts_extractor_engine_get_function_calls( engine: TreeSitterExtractorEngine, root_node: Node, expected_list: List[Tuple[str, str]] ): assert len(engine.get_function_calls(root_node)) == len(expected_list) for (node, name), (expected_range, expected_id_range) in zip(engine.get_function_calls(root_node), expected_list): assert isinstance(node, Node) assert name == "function.call" - do_test_range(engine, node, expected_range) - do_test_range_address(engine, node) - do_test_range(engine, engine.get_function_call_id(node), expected_id_range) + do_test_ts_base_engine_get_range(engine, node, expected_range) + do_test_ts_base_engine_get_address(engine, node) + do_test_ts_base_engine_get_range(engine, 
engine.get_function_call_id(node), expected_id_range) assert len(list(engine.get_function_call_ids(root_node))) == len(expected_list) for node, (_, expected_id_range) in zip(engine.get_function_call_ids(root_node), expected_list): assert isinstance(node, Node) - do_test_range(engine, node, expected_id_range) - do_test_range_address(engine, node) + do_test_ts_base_engine_get_range(engine, node, expected_id_range) + do_test_ts_base_engine_get_address(engine, node) -def do_test_ts_engine_string_literals_parsing( +def do_test_ts_extractor_engine_get_string_literals( engine: TreeSitterExtractorEngine, root_node: Node, expected_list: List[str] ): assert len(engine.get_string_literals(root_node)) == len(expected_list) for (node, name), expected_range in zip(engine.get_string_literals(root_node), expected_list): assert isinstance(node, Node) assert name == "string-literal" - do_test_range(engine, node, expected_range) - do_test_range_address(engine, node) + do_test_ts_base_engine_get_range(engine, node, expected_range) + do_test_ts_base_engine_get_address(engine, node) -def do_test_ts_engine_integer_literals_parsing( +def do_test_ts_extractor_engine_get_integer_literals( engine: TreeSitterExtractorEngine, root_node: Node, expected_list: List[str] ): assert len(engine.get_integer_literals(root_node)) == len(expected_list) for (node, name), expected_range in zip(engine.get_integer_literals(root_node), expected_list): assert isinstance(node, Node) assert name == "integer-literal" - do_test_range(engine, node, expected_range) - do_test_range_address(engine, node) + do_test_ts_base_engine_get_range(engine, node, expected_range) + do_test_ts_base_engine_get_address(engine, node) -def do_test_ts_engine_namespaces_parsing(engine: TreeSitterExtractorEngine, expected_list: List[str]): +def do_test_ts_extractor_engine_get_namespaces(engine: TreeSitterExtractorEngine, expected_list: List[str]): assert engine.get_namespaces(engine.tree.root_node) == engine.get_namespaces() assert 
len(engine.get_namespaces()) == len(expected_list) for (node, name), expected_range in zip(engine.get_namespaces(), expected_list): assert isinstance(node, Node) assert name == "namespace" - do_test_range(engine, node, expected_range) - do_test_range_address(engine, node) + do_test_ts_base_engine_get_range(engine, node, expected_range) + do_test_ts_base_engine_get_address(engine, node) -def do_test_ts_engine_global_statements_parsing(engine: TreeSitterExtractorEngine, expected_list: List[str]): +def do_test_ts_extractor_engine_get_global_statements(engine: TreeSitterExtractorEngine, expected_list: List[str]): assert len(engine.get_global_statements()) == len(expected_list) for (node, name), expected_range in zip(engine.get_global_statements(), expected_list): assert isinstance(node, Node) assert name == "global-statement" - do_test_range(engine, node, expected_range, startswith=True) - do_test_range_address(engine, node) + do_test_ts_base_engine_get_range(engine, node, expected_range, startswith=True) + do_test_ts_base_engine_get_address(engine, node) -def do_test_ts_engine_import_names_parsing( +def do_test_ts_extractor_engine_get_import_names( engine: TreeSitterExtractorEngine, root_node: Node, expected_list: List[str] ): assert len(list(engine.get_import_names(root_node))) == len(expected_list) for (node, import_name), expected_import_name in zip(list(engine.get_import_names(root_node)), expected_list): assert isinstance(node, Node) assert import_name == expected_import_name - do_test_range_address(engine, node) + do_test_ts_base_engine_get_address(engine, node) -def do_test_ts_engine_function_names_parsing( +def do_test_ts_extractor_engine_get_function_names( engine: TreeSitterExtractorEngine, root_node: Node, expected_list: List[str] ): assert len(list(engine.get_function_names(root_node))) == len(expected_list) for (node, function_name), expected_function_name in zip(list(engine.get_function_names(root_node)), expected_list): assert isinstance(node, Node) 
assert function_name == expected_function_name - do_test_range_address(engine, node) + do_test_ts_base_engine_get_address(engine, node) @parametrize( "engine_str,expected_dict", [ ( - "cs_f397cb_extractor_engine", + "cs_138cdc_extractor_engine", { "all objects": [ ( @@ -240,35 +254,79 @@ def do_test_ts_engine_function_names_parsing( ), ], ) -def test_ts_engine(request: pytest.FixtureRequest, engine_str: str, expected_dict: dict): +def test_ts_extractor_engine(request: pytest.FixtureRequest, engine_str: str, expected_dict: dict): engine: TreeSitterExtractorEngine = request.getfixturevalue(engine_str) - do_test_ts_engine_init(engine) - do_test_ts_engine_object_parsing(engine, engine.tree.root_node, expected_dict["all objects"]) - do_test_ts_engine_function_definition_parsing( + do_test_ts_extractor_engine_init(engine) + do_test_ts_extractor_engine_get_new_objects(engine, engine.tree.root_node, expected_dict["all objects"]) + do_test_ts_extractor_engine_get_function_definitions( engine, engine.tree.root_node, expected_dict["all function definitions"] ) - do_test_ts_engine_function_call_parsing(engine, engine.tree.root_node, expected_dict["all function calls"]) - do_test_ts_engine_string_literals_parsing(engine, engine.tree.root_node, expected_dict["all string literals"]) - do_test_ts_engine_integer_literals_parsing(engine, engine.tree.root_node, expected_dict["all integer literals"]) - do_test_ts_engine_import_names_parsing(engine, engine.tree.root_node, expected_dict["all import names"]) - do_test_ts_engine_function_names_parsing(engine, engine.tree.root_node, expected_dict["all function names"]) - do_test_ts_engine_global_statements_parsing(engine, expected_dict["global statements"]) - do_test_ts_engine_namespaces_parsing(engine, expected_dict["namespaces"]) - do_test_ts_engine_default_range_address(engine) + do_test_ts_extractor_engine_get_function_calls(engine, engine.tree.root_node, expected_dict["all function calls"]) + 
do_test_ts_extractor_engine_get_string_literals(engine, engine.tree.root_node, expected_dict["all string literals"]) + do_test_ts_extractor_engine_get_integer_literals( + engine, engine.tree.root_node, expected_dict["all integer literals"] + ) + do_test_ts_extractor_engine_get_import_names(engine, engine.tree.root_node, expected_dict["all import names"]) + do_test_ts_extractor_engine_get_function_names(engine, engine.tree.root_node, expected_dict["all function names"]) + do_test_ts_extractor_engine_get_global_statements(engine, expected_dict["global statements"]) + do_test_ts_extractor_engine_get_namespaces(engine, expected_dict["namespaces"]) + do_test_ts_base_engine_get_default_address(engine) + + +def do_test_ts_template_engine_init(engine: TreeSitterTemplateEngine): + assert engine.language == LANG_TEM + assert isinstance(engine.query, QueryBinding) + assert isinstance(engine.buf, bytes) and len(engine.buf) > 0 + assert isinstance(engine.tree, Tree) + assert isinstance(engine.get_default_address(), FileOffsetRangeAddress) + addr = engine.get_default_address() + assert addr.start_byte == engine.tree.root_node.start_byte and addr.end_byte == engine.tree.root_node.end_byte + assert isinstance(engine.query, TemplateQueryBinding) + pass + + +def do_test_ts_template_engine_get_template_namespaces(engine: TreeSitterTemplateEngine, expected_list: List[str]): + assert len(list(engine.get_template_namespaces())) == len(expected_list) + for (node, namespace), expected_namespace in zip(list(engine.get_template_namespaces()), expected_list): + assert isinstance(node, Node) + assert engine.is_aspx_import_directive(node) == True + assert engine.get_aspx_namespace(node) == expected_namespace + assert namespace == expected_namespace + + +@parametrize( + "engine_str,expected_dict", + [ + ( + "aspx_675375_extractor_engine", + { + "language": LANG_CS, + "aspx namespaces": ["System"], + "code sections": [(2, 22), (28, 56), (3977, 4008), (4060, 4091)], + "content sections": [(24, 
26), (58, 3975), (4010, 4058), (4093, 4415)], + }, + ) + ], +) +def test_ts_engine_template(request: pytest.FixtureRequest, engine_str: str, expected_dict: dict): + engine: TreeSitterTemplateEngine = request.getfixturevalue(engine_str) + do_test_ts_template_engine_init(engine) + assert engine.identify_language() == expected_dict["language"] + do_test_ts_template_engine_get_template_namespaces(engine, expected_dict["aspx namespaces"]) FEATURE_PRESENCE_TESTS_SCRIPTS = sorted( [ - ("cs_f397cb", "global", Arch(ARCH_ANY), True), - ("cs_f397cb", "global", OS(OS_ANY), True), - ("cs_f397cb", "file", Format(FORMAT_SCRIPT), True), - ("cs_f397cb", "file", ScriptLanguage(LANG_CS), True), - ("cs_f397cb", "file", Namespace("System"), True), - ("cs_f397cb", "file", String(""), True), - ("cs_f397cb", "function=(0x38,0x16c)", String("Not Found"), True), - ("cs_f397cb", "function=(0x16e,0x7ce)", String("127.0.0.1"), True), - ("cs_f397cb", "function=(0x16e,0x7ce)", Import("System.Diagnostics.ProcessStartInfo"), True), - ("cs_f397cb", "function=(0x16e,0x7ce)", Import("System.Diagnostics.Process"), True), + ("cs_138cdc", "global", Arch(ARCH_ANY), True), + ("cs_138cdc", "global", OS(OS_ANY), True), + ("cs_138cdc", "file", Format(FORMAT_SCRIPT), True), + ("cs_138cdc", "file", ScriptLanguage(LANG_CS), True), + ("cs_138cdc", "file", Namespace("System"), True), + ("cs_138cdc", "file", String(""), True), + ("cs_138cdc", "function=(0x38,0x16c)", String("Not Found"), True), + ("cs_138cdc", "function=(0x16e,0x7ce)", String("127.0.0.1"), True), + ("cs_138cdc", "function=(0x16e,0x7ce)", Import("System.Diagnostics.ProcessStartInfo"), True), + ("cs_138cdc", "function=(0x16e,0x7ce)", Import("System.Diagnostics.Process"), True), ] ) From 1e0326adb63501553d7a612badc2a27b7986ee35 Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Fri, 8 Jul 2022 14:41:00 -0400 Subject: [PATCH 26/51] Added new tests for embedded template testing and refactored a few already implemented extractor engine tests. 
--- capa/features/extractors/ts/engine.py | 5 +- tests/fixtures.py | 131 ++++- tests/test_ts.py | 722 +++++++++++++++++++++++--- 3 files changed, 785 insertions(+), 73 deletions(-) diff --git a/capa/features/extractors/ts/engine.py b/capa/features/extractors/ts/engine.py index f66e8f773..425658856 100644 --- a/capa/features/extractors/ts/engine.py +++ b/capa/features/extractors/ts/engine.py @@ -168,7 +168,6 @@ def identify_language(self) -> str: return LANG_JS def get_template_namespaces(self) -> Iterator[Tuple[Node, str]]: - # raise ValueError([self.get_range(node) for node, _ in self.get_code_sections()]) for node, _ in self.get_code_sections(): if self.is_aspx_import_directive(node): namespace = self.get_aspx_namespace(node) @@ -187,7 +186,7 @@ def is_c_sharp(self, node: Node) -> bool: def is_aspx_import_directive(self, node: Node) -> bool: return bool( re.match( - r"@ Import Namespace=".encode(), + r"@\s*Import Namespace=".encode(), self.get_byte_range(node), re.IGNORECASE, ) @@ -195,7 +194,7 @@ def is_aspx_import_directive(self, node: Node) -> bool: def get_aspx_namespace(self, node: Node) -> Optional[str]: match = re.search( - r'@ Import namespace="(.*?)"'.encode(), + r'@\s*Import namespace="(.*?)"'.encode(), self.get_byte_range(node), re.IGNORECASE, ) diff --git a/tests/fixtures.py b/tests/fixtures.py index 1f4cdf1f1..9ea428a8d 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -182,9 +182,11 @@ def get_ts_extractor_engine(language, buf): @lru_cache(maxsize=1) -def get_ts_template_engine(language, buf): +def get_ts_template_engine(language, path): import capa.features.extractors.ts.engine + with open(path, "rb") as f: + buf = f.read() return capa.features.extractors.ts.engine.TreeSitterTemplateEngine(buf) @@ -307,12 +309,34 @@ def get_data_path_by_name(name): return os.path.join(CD, "data", "dotnet", "692f7fd6d198e804d6af98eb9e390d61.exe_") elif name.startswith("cs_138cdc"): return os.path.join(CS_DIR, 
"138cdc4b10f3f5ece9c47bb0ec17fde5b70c1f9a90b267794c5e5dfa337fc798.cs_") - elif name.startswith("aspx_675375"): - return os.path.join(ASPX_DIR, "6753759936aaaddb29719010644edf886c0548a69aa06e469b546b5de647deeb.aspx_") else: raise ValueError("unexpected sample fixture: %s" % name) +ASPX_DATA_PATH_BY_NAME = { + "aspx_4f6fa6": os.path.join(ASPX_DIR, "4f6fa6a45017397c7e1c9cd5a17235ccb1ff0f5087dfa6b7384552bf507e7fe1.aspx_"), + "aspx_5f959f": os.path.join(ASPX_DIR, "5f959f480a66a33d37d9a0ef6c8f7d0059625ca2a8ae9236b49b194733622655.aspx_"), + "aspx_10162f": os.path.join(ASPX_DIR, "10162feb5f063ea09c6a3d275f31abf0fe8a9e4e36fded0053b1f8e054da8161.aspx_"), + "aspx_2b71dd": os.path.join(ASPX_DIR, "2b71dd245520d9eb5f1e4c633fee61c7d83687591d9f64f9390c26dc95057c3c.aspx_"), + "aspx_f2bf20": os.path.join(ASPX_DIR, "f2bf20e7bb482d27da8f19aa0f8bd4927746a65300929b99166867074a38a4b4.aspx_"), + "aspx_f39dc0": os.path.join(ASPX_DIR, "f39dc0dfd43477d65c1380a7cff89296ad72bfa7fc3afcfd8e294f195632030e.aspx_"), + "aspx_ea2a01": os.path.join(ASPX_DIR, "ea2a01cae57c00df01bff6bb8a72585fdc0abb7a26a869dc1a0131bdff50b400.aspx_"), + "aspx_6f3261": os.path.join(ASPX_DIR, "6f3261eaaabf369bd928d179641b73ffd768184dfd4e00124da462a3075d4239.aspx_"), + "aspx_1f8f40": os.path.join(ASPX_DIR, "1f8f4054932ed1d5d055e9a92aa1e2abba49af3370506674cb1b2c70146ae81a.aspx_"), + "aspx_2e8c7e": os.path.join(ASPX_DIR, "2e8c7eacd739ca3f3dc4112b41a024157035096b8d0c26ba79d8b893136391bc.aspx_"), + "aspx_03bb5c": os.path.join(ASPX_DIR, "03bb5cab46b406bb8613ca6e32991ab3e10b5cd759d5c7813191e9e62868ea73.aspx_"), + "aspx_606dbf": os.path.join(ASPX_DIR, "606dbfebdc7751ecb6cb9a845853ae1905afd4b8a2cb54e1e4a98c932e268712.aspx_"), + "aspx_f397cb": os.path.join(ASPX_DIR, "f397cb676353873cdc8fcfbf0e3a317334353cc63946099e5ea22db6d1eebfb8.aspx_"), + "aspx_b4bb14": os.path.join(ASPX_DIR, "b4bb14aeb692f7afc107ee89f86d096f1cd8f9761b6c50788f626a9dccc8b077.aspx_"), + "aspx_54433d": os.path.join(ASPX_DIR, 
"54433dd57414773098a6d3292d262f91a6812855dfcbf8d421695608d1fad638.aspx_"), + "aspx_a35878": os.path.join(ASPX_DIR, "a35878e74425cd97ad98e3ec4b2583867bb536f4275d821cd8b82bc19380ba1a.aspx_"), + "aspx_a5c893": os.path.join(ASPX_DIR, "a5c8934836f5b36bba3a722eab691a9f1f926c138fefe5bae07e9074e7c49ae3.aspx_"), + "aspx_15eed4": os.path.join(ASPX_DIR, "15eed42e4904205b2ef2ff285ff1ce6c8138296c12cf075a2562c69a5fafd1cb.aspx_"), + "aspx_b75f16": os.path.join(ASPX_DIR, "b75f163ca9b9240bf4b37ad92bc7556b40a17e27c2b8ed5c8991385fe07d17d0.aspx_"), + "aspx_d460ca": os.path.join(ASPX_DIR, "d460cae7d34c51059ef57c5aadb3de099469efbac5fffcf76d0528a511192a28.aspx_"), +} + + def get_sample_md5_by_name(name): """used by IDA tests to ensure the correct IDB is loaded""" if name == "mimikatz": @@ -979,7 +1003,100 @@ def cs_138cdc_extractor_engine(): @pytest.fixture -def aspx_675375_extractor_engine(): - with open(get_data_path_by_name("aspx_675375"), "rb") as f: - buf = f.read() - return get_ts_template_engine(LANG_TEM, buf) +def aspx_4f6fa6_template_engine(): + return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_4f6fa6"]) + + +@pytest.fixture +def aspx_5f959f_template_engine(): + return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_5f959f"]) + + +@pytest.fixture +def aspx_10162f_template_engine(): + return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_10162f"]) + + +@pytest.fixture +def aspx_2b71dd_template_engine(): + return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_2b71dd"]) + + +@pytest.fixture +def aspx_f2bf20_template_engine(): + return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_f2bf20"]) + + +@pytest.fixture +def aspx_f39dc0_template_engine(): + return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_f39dc0"]) + + +@pytest.fixture +def aspx_ea2a01_template_engine(): + return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_ea2a01"]) + + +@pytest.fixture +def 
aspx_6f3261_template_engine(): + return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_6f3261"]) + + +@pytest.fixture +def aspx_1f8f40_template_engine(): + return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_1f8f40"]) + + +@pytest.fixture +def aspx_2e8c7e_template_engine(): + return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_2e8c7e"]) + + +@pytest.fixture +def aspx_03bb5c_template_engine(): + return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_03bb5c"]) + + +@pytest.fixture +def aspx_606dbf_template_engine(): + return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_606dbf"]) + + +@pytest.fixture +def aspx_f397cb_template_engine(): + return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_f397cb"]) + + +@pytest.fixture +def aspx_b4bb14_template_engine(): + return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_b4bb14"]) + + +@pytest.fixture +def aspx_54433d_template_engine(): + return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_54433d"]) + + +@pytest.fixture +def aspx_a35878_template_engine(): + return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_a35878"]) + + +@pytest.fixture +def aspx_a5c893_template_engine(): + return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_a5c893"]) + + +@pytest.fixture +def aspx_15eed4_template_engine(): + return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_15eed4"]) + + +@pytest.fixture +def aspx_b75f16_template_engine(): + return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_b75f16"]) + + +@pytest.fixture +def aspx_d460ca_template_engine(): + return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_d460ca"]) diff --git a/tests/test_ts.py b/tests/test_ts.py index 295f6ec28..64e6fb548 100644 --- a/tests/test_ts.py +++ b/tests/test_ts.py @@ -39,14 +39,17 @@ def do_test_ts_base_engine_get_default_address(engine: TreeSitterBaseEngine): 
assert addr1.start_byte == addr2.start_byte and addr1.end_byte == addr2.end_byte -def do_test_ts_extractor_engine_init(engine: TreeSitterExtractorEngine): - assert engine.language == LANG_CS +def do_test_ts_extractor_engine_init(engine: TreeSitterExtractorEngine, expected_language: str): + assert engine.language == expected_language assert isinstance(engine.query, QueryBinding) assert isinstance(engine.import_signatures, set) and len(engine.import_signatures) > 0 assert isinstance(engine.get_default_address(), FileOffsetRangeAddress) assert isinstance(engine.buf_offset, int) and engine.buf_offset >= 0 addr = engine.get_default_address() - assert addr.start_byte == engine.tree.root_node.start_byte and addr.end_byte == engine.tree.root_node.end_byte + assert ( + addr.start_byte == engine.tree.root_node.start_byte + engine.buf_offset + and addr.end_byte == engine.tree.root_node.end_byte + engine.buf_offset + ) def do_test_ts_extractor_engine_get_address( @@ -56,67 +59,65 @@ def do_test_ts_extractor_engine_get_address( def do_test_ts_extractor_engine_get_new_objects( - engine: TreeSitterExtractorEngine, root_node: Node, expected_list: List[Tuple[str, str]] + engine: TreeSitterExtractorEngine, root_node: Node, expected: List[Tuple[str, str]] ): - assert len(engine.get_new_objects(root_node)) == len(expected_list) - for (node, name), (expected_range, expected_id_range) in zip(engine.get_new_objects(root_node), expected_list): + assert len(engine.get_new_objects(root_node)) == len(expected) + for (node, name), (expected_range, expected_id_range) in zip(engine.get_new_objects(root_node), expected): assert isinstance(node, Node) assert name == "object.new" do_test_ts_base_engine_get_range(engine, node, expected_range) do_test_ts_base_engine_get_address(engine, node) do_test_ts_base_engine_get_range(engine, engine.get_object_id(node), expected_id_range) - assert len(list(engine.get_new_object_ids(root_node))) == len(expected_list) - for node, (_, expected_id_range) in 
zip(engine.get_new_object_ids(root_node), expected_list): + assert len(list(engine.get_new_object_ids(root_node))) == len(expected) + for node, (_, expected_id_range) in zip(engine.get_new_object_ids(root_node), expected): assert isinstance(node, Node) do_test_ts_base_engine_get_range(engine, node, expected_id_range) do_test_ts_base_engine_get_address(engine, node) def do_test_ts_extractor_engine_get_function_definitions( - engine: TreeSitterExtractorEngine, root_node: Node, expected_list: List[Tuple[str, str]] + engine: TreeSitterExtractorEngine, root_node: Node, expected: List[Tuple[str, str]] ): assert engine.get_function_definitions(engine.tree.root_node) == engine.get_function_definitions() - assert len(engine.get_function_definitions(root_node)) == len(expected_list) - for (node, name), (expected_range, expected_id_range) in zip( - engine.get_function_definitions(root_node), expected_list - ): + assert len(engine.get_function_definitions(root_node)) == len(expected) + for (node, name), (expected_range, expected_id_range) in zip(engine.get_function_definitions(root_node), expected): assert isinstance(node, Node) assert name == "function.definition" do_test_ts_base_engine_get_range(engine, node, expected_range, startswith=True) do_test_ts_base_engine_get_address(engine, node) do_test_ts_base_engine_get_range(engine, engine.get_function_definition_id(node), expected_id_range) - assert len(list(engine.get_function_definition_ids(root_node))) == len(expected_list) - for node, (_, expected_id_range) in zip(engine.get_function_definition_ids(root_node), expected_list): + assert len(list(engine.get_function_definition_ids(root_node))) == len(expected) + for node, (_, expected_id_range) in zip(engine.get_function_definition_ids(root_node), expected): assert isinstance(node, Node) do_test_ts_base_engine_get_range(engine, node, expected_id_range) do_test_ts_base_engine_get_address(engine, node) def do_test_ts_extractor_engine_get_function_calls( - engine: 
TreeSitterExtractorEngine, root_node: Node, expected_list: List[Tuple[str, str]] + engine: TreeSitterExtractorEngine, root_node: Node, expected: List[Tuple[str, str]] ): - assert len(engine.get_function_calls(root_node)) == len(expected_list) - for (node, name), (expected_range, expected_id_range) in zip(engine.get_function_calls(root_node), expected_list): + assert len(engine.get_function_calls(root_node)) == len(expected) + for (node, name), (expected_range, expected_id_range) in zip(engine.get_function_calls(root_node), expected): assert isinstance(node, Node) assert name == "function.call" do_test_ts_base_engine_get_range(engine, node, expected_range) do_test_ts_base_engine_get_address(engine, node) do_test_ts_base_engine_get_range(engine, engine.get_function_call_id(node), expected_id_range) - assert len(list(engine.get_function_call_ids(root_node))) == len(expected_list) - for node, (_, expected_id_range) in zip(engine.get_function_call_ids(root_node), expected_list): + assert len(list(engine.get_function_call_ids(root_node))) == len(expected) + for node, (_, expected_id_range) in zip(engine.get_function_call_ids(root_node), expected): assert isinstance(node, Node) do_test_ts_base_engine_get_range(engine, node, expected_id_range) do_test_ts_base_engine_get_address(engine, node) def do_test_ts_extractor_engine_get_string_literals( - engine: TreeSitterExtractorEngine, root_node: Node, expected_list: List[str] + engine: TreeSitterExtractorEngine, root_node: Node, expected: List[str] ): - assert len(engine.get_string_literals(root_node)) == len(expected_list) - for (node, name), expected_range in zip(engine.get_string_literals(root_node), expected_list): + assert len(engine.get_string_literals(root_node)) == len(expected) + for (node, name), expected_range in zip(engine.get_string_literals(root_node), expected): assert isinstance(node, Node) assert name == "string-literal" do_test_ts_base_engine_get_range(engine, node, expected_range) @@ -124,29 +125,29 @@ def 
do_test_ts_extractor_engine_get_string_literals( def do_test_ts_extractor_engine_get_integer_literals( - engine: TreeSitterExtractorEngine, root_node: Node, expected_list: List[str] + engine: TreeSitterExtractorEngine, root_node: Node, expected: List[str] ): - assert len(engine.get_integer_literals(root_node)) == len(expected_list) - for (node, name), expected_range in zip(engine.get_integer_literals(root_node), expected_list): + assert len(engine.get_integer_literals(root_node)) == len(expected) + for (node, name), expected_range in zip(engine.get_integer_literals(root_node), expected): assert isinstance(node, Node) assert name == "integer-literal" do_test_ts_base_engine_get_range(engine, node, expected_range) do_test_ts_base_engine_get_address(engine, node) -def do_test_ts_extractor_engine_get_namespaces(engine: TreeSitterExtractorEngine, expected_list: List[str]): +def do_test_ts_extractor_engine_get_namespaces(engine: TreeSitterExtractorEngine, expected: List[str]): assert engine.get_namespaces(engine.tree.root_node) == engine.get_namespaces() - assert len(engine.get_namespaces()) == len(expected_list) - for (node, name), expected_range in zip(engine.get_namespaces(), expected_list): + assert len(engine.get_namespaces()) == len(expected) + for (node, name), expected_range in zip(engine.get_namespaces(), expected): assert isinstance(node, Node) assert name == "namespace" do_test_ts_base_engine_get_range(engine, node, expected_range) do_test_ts_base_engine_get_address(engine, node) -def do_test_ts_extractor_engine_get_global_statements(engine: TreeSitterExtractorEngine, expected_list: List[str]): - assert len(engine.get_global_statements()) == len(expected_list) - for (node, name), expected_range in zip(engine.get_global_statements(), expected_list): +def do_test_ts_extractor_engine_get_global_statements(engine: TreeSitterExtractorEngine, expected: List[str]): + assert len(engine.get_global_statements()) == len(expected) + for (node, name), expected_range in 
zip(engine.get_global_statements(), expected): assert isinstance(node, Node) assert name == "global-statement" do_test_ts_base_engine_get_range(engine, node, expected_range, startswith=True) @@ -154,31 +155,32 @@ def do_test_ts_extractor_engine_get_global_statements(engine: TreeSitterExtracto def do_test_ts_extractor_engine_get_import_names( - engine: TreeSitterExtractorEngine, root_node: Node, expected_list: List[str] + engine: TreeSitterExtractorEngine, root_node: Node, expected: List[str] ): - assert len(list(engine.get_import_names(root_node))) == len(expected_list) - for (node, import_name), expected_import_name in zip(list(engine.get_import_names(root_node)), expected_list): + assert len(list(engine.get_import_names(root_node))) == len(expected) + for (node, import_name), expected_import_name in zip(list(engine.get_import_names(root_node)), expected): assert isinstance(node, Node) assert import_name == expected_import_name do_test_ts_base_engine_get_address(engine, node) def do_test_ts_extractor_engine_get_function_names( - engine: TreeSitterExtractorEngine, root_node: Node, expected_list: List[str] + engine: TreeSitterExtractorEngine, root_node: Node, expected: List[str] ): - assert len(list(engine.get_function_names(root_node))) == len(expected_list) - for (node, function_name), expected_function_name in zip(list(engine.get_function_names(root_node)), expected_list): + assert len(list(engine.get_function_names(root_node))) == len(expected) + for (node, function_name), expected_function_name in zip(list(engine.get_function_names(root_node)), expected): assert isinstance(node, Node) assert function_name == expected_function_name do_test_ts_base_engine_get_address(engine, node) @parametrize( - "engine_str,expected_dict", + "engine_str,expected", [ ( "cs_138cdc_extractor_engine", { + "language": LANG_CS, "all objects": [ ( 'new Diagnostics.ProcessStartInfo("cmd", "/c " + Request.Form["c"])', @@ -254,22 +256,20 @@ def 
do_test_ts_extractor_engine_get_function_names( ), ], ) -def test_ts_extractor_engine(request: pytest.FixtureRequest, engine_str: str, expected_dict: dict): +def test_ts_extractor_engine(request: pytest.FixtureRequest, engine_str: str, expected: dict): engine: TreeSitterExtractorEngine = request.getfixturevalue(engine_str) - do_test_ts_extractor_engine_init(engine) - do_test_ts_extractor_engine_get_new_objects(engine, engine.tree.root_node, expected_dict["all objects"]) + do_test_ts_extractor_engine_init(engine, expected["language"]) + do_test_ts_extractor_engine_get_new_objects(engine, engine.tree.root_node, expected["all objects"]) do_test_ts_extractor_engine_get_function_definitions( - engine, engine.tree.root_node, expected_dict["all function definitions"] - ) - do_test_ts_extractor_engine_get_function_calls(engine, engine.tree.root_node, expected_dict["all function calls"]) - do_test_ts_extractor_engine_get_string_literals(engine, engine.tree.root_node, expected_dict["all string literals"]) - do_test_ts_extractor_engine_get_integer_literals( - engine, engine.tree.root_node, expected_dict["all integer literals"] + engine, engine.tree.root_node, expected["all function definitions"] ) - do_test_ts_extractor_engine_get_import_names(engine, engine.tree.root_node, expected_dict["all import names"]) - do_test_ts_extractor_engine_get_function_names(engine, engine.tree.root_node, expected_dict["all function names"]) - do_test_ts_extractor_engine_get_global_statements(engine, expected_dict["global statements"]) - do_test_ts_extractor_engine_get_namespaces(engine, expected_dict["namespaces"]) + do_test_ts_extractor_engine_get_function_calls(engine, engine.tree.root_node, expected["all function calls"]) + do_test_ts_extractor_engine_get_string_literals(engine, engine.tree.root_node, expected["all string literals"]) + do_test_ts_extractor_engine_get_integer_literals(engine, engine.tree.root_node, expected["all integer literals"]) + 
do_test_ts_extractor_engine_get_import_names(engine, engine.tree.root_node, expected["all import names"]) + do_test_ts_extractor_engine_get_function_names(engine, engine.tree.root_node, expected["all function names"]) + do_test_ts_extractor_engine_get_global_statements(engine, expected["global statements"]) + do_test_ts_extractor_engine_get_namespaces(engine, expected["namespaces"]) do_test_ts_base_engine_get_default_address(engine) @@ -285,34 +285,630 @@ def do_test_ts_template_engine_init(engine: TreeSitterTemplateEngine): pass -def do_test_ts_template_engine_get_template_namespaces(engine: TreeSitterTemplateEngine, expected_list: List[str]): - assert len(list(engine.get_template_namespaces())) == len(expected_list) - for (node, namespace), expected_namespace in zip(list(engine.get_template_namespaces()), expected_list): +def do_test_ts_template_engine_get_template_namespaces(engine: TreeSitterTemplateEngine, expected: List[str]): + assert len(list(engine.get_template_namespaces())) == len(expected) + for (node, namespace), expected_namespace in zip(list(engine.get_template_namespaces()), expected): assert isinstance(node, Node) assert engine.is_aspx_import_directive(node) == True assert engine.get_aspx_namespace(node) == expected_namespace assert namespace == expected_namespace +def do_test_ts_template_engine_get_code_sections(engine: TreeSitterTemplateEngine, expected: List[Tuple[int, int]]): + assert len(engine.get_code_sections()) == len(expected) + for (node, name), (expected_start_byte, expected_end_byte) in zip(list(engine.get_code_sections()), expected): + assert isinstance(node, Node) + assert name == "code" + assert node.start_byte == expected_start_byte and node.end_byte == expected_end_byte + + +def do_test_ts_template_engine_get_content_sections(engine: TreeSitterTemplateEngine, expected: List[Tuple[int, int]]): + assert len(engine.get_content_sections()) == len(expected) + for (node, name), (expected_start_byte, expected_end_byte) in 
zip(list(engine.get_content_sections()), expected): + assert isinstance(node, Node) + assert name == "content" + assert node.start_byte == expected_start_byte and node.end_byte == expected_end_byte + + +def do_test_ts_template_engine_get_parsed_code_sections( + engine: TreeSitterTemplateEngine, expected_language: str, expected: List[Tuple[int, int]] +): + assert len(list(engine.get_parsed_code_sections())) == len(expected) + addrs = [e.get_default_address() for e in engine.get_parsed_code_sections()] + for extractor_engine, (expected_start_byte, _) in zip(engine.get_parsed_code_sections(), expected): + do_test_ts_extractor_engine_init(extractor_engine, expected_language) + assert extractor_engine.buf_offset == expected_start_byte + root = extractor_engine.tree.root_node + addr = extractor_engine.get_default_address() + assert ( + addr.start_byte == root.start_byte + expected_start_byte + and addr.end_byte == root.end_byte + expected_start_byte + ) + addr = extractor_engine.get_address(extractor_engine.tree.root_node) + assert ( + addr.start_byte == root.start_byte + expected_start_byte + and addr.end_byte == root.end_byte + expected_start_byte + ) + + @parametrize( - "engine_str,expected_dict", + "engine_str,expected", [ ( - "aspx_675375_extractor_engine", + "aspx_1f8f40_template_engine", + { + "language": LANG_CS, + "aspx namespaces": ["System.Reflection"], + "code sections": [(2, 23), (27, 64), (68, 469)], + "content sections": [], + }, + ), + ( + "aspx_2b71dd_template_engine", + { + "language": LANG_CS, + "aspx namespaces": ["System.Diagnostics", "System.IO"], + "code sections": [(2, 50), (55, 95), (100, 131)], + "content sections": [(52, 53), (97, 98), (133, 1273)], + }, + ), + ( + "aspx_2e8c7e_template_engine", + { + "language": LANG_CS, + "aspx namespaces": ["System.Diagnostics", "System.IO"], + "code sections": [(2, 23), (28, 67), (72, 103)], + "content sections": [(25, 26), (69, 70), (105, 2919)], + }, + ), + ( + "aspx_03bb5c_template_engine", + { + 
"language": LANG_CS, + "aspx namespaces": ["System.Web.UI.WebControls", "System.Diagnostics", "System.IO"], + "code sections": [(2, 47), (53, 100), (106, 146), (152, 183), (1659, 7702)], + "content sections": [(49, 51), (102, 104), (148, 150), (185, 1657), (7704, 10790)], + }, + ), + ( + "aspx_4f6fa6_template_engine", + { + "language": LANG_CS, + "aspx namespaces": ["System.Diagnostics", "System.IO", "System.IO.Compression"], + "code sections": [(2, 50), (55, 95), (100, 131), (136, 179), (186, 234)], + "content sections": [(52, 53), (97, 98), (133, 134), (181, 183), (237, 6039)], + }, + ), + ( + "aspx_a35878_template_engine", + { + "language": LANG_CS, + "aspx namespaces": [ + "System.IO", + "System.Diagnostics", + "System.Data", + "System.Management", + "System.Data.OleDb", + "Microsoft.Win32", + "System.Net.Sockets", + "System.Net", + "System.Web.UI", + "System.Runtime.InteropServices", + "System.DirectoryServices", + "System.ServiceProcess", + "System.Text.RegularExpressions", + "System.Threading", + "System.Data.SqlClient", + "Microsoft.VisualBasic", + ], + "code sections": [ + (2, 123), + (128, 158), + (163, 202), + (207, 239), + (244, 282), + (287, 325), + (330, 366), + (371, 411), + (416, 448), + (453, 487), + (492, 543), + (548, 593), + (598, 640), + (645, 696), + (701, 738), + (743, 785), + (790, 832), + (837, 943), + (948, 1047), + (1052, 1155), + (1160, 1266), + ], + "content sections": [ + (125, 126), + (160, 161), + (204, 205), + (241, 242), + (284, 285), + (327, 328), + (368, 369), + (413, 414), + (450, 451), + (489, 490), + (545, 546), + (595, 596), + (642, 643), + (698, 699), + (740, 741), + (787, 788), + (834, 835), + (945, 946), + (1049, 1050), + (1157, 1158), + (1268, 2680), + ], + }, + ), + ( + "aspx_10162f_template_engine", + { + "language": LANG_CS, + "aspx namespaces": ["System.IO"], + "code sections": [ + (2, 71), + (76, 106), + (162, 2122), + (25579, 25596), + (25625, 25642), + (25664, 25700), + (25738, 25747), + (25801, 25822), + (25960, 
25973), + (26002, 26015), + (26092, 26115), + (26153, 26168), + (26278, 26295), + (26324, 26341), + (26402, 26455), + (26472, 26489), + (26550, 26555), + (26593, 26612), + (26752, 26765), + (26794, 26811), + (26863, 26880), + (26941, 26946), + (26995, 27020), + (27037, 27062), + (27123, 27128), + (27166, 27181), + (27291, 27308), + (27337, 27354), + (27456, 27475), + (27686, 27711), + (27740, 27761), + (27854, 27879), + (27896, 27926), + (27992, 28002), + (28040, 28055), + (28167, 28188), + (28271, 28312), + (28374, 28443), + (28511, 28548), + (28610, 28675), + (28699, 28728), + (28789, 28794), + (28813, 28826), + (28871, 28876), + (28921, 28932), + (29044, 29077), + (29141, 29158), + (29220, 29226), + (29264, 29275), + (29359, 29384), + (29446, 29452), + (29490, 29501), + (29585, 29602), + (29664, 29670), + (29708, 29719), + (30163, 30170), + ], + "content sections": [ + (73, 74), + (108, 160), + (2124, 25576), + (25598, 25622), + (25644, 25661), + (25702, 25735), + (25749, 25798), + (25824, 25957), + (25975, 25999), + (26017, 26089), + (26117, 26150), + (26170, 26275), + (26297, 26321), + (26343, 26399), + (26457, 26469), + (26491, 26547), + (26557, 26590), + (26614, 26749), + (26767, 26791), + (26813, 26860), + (26882, 26938), + (26948, 26992), + (27022, 27034), + (27064, 27120), + (27130, 27163), + (27183, 27288), + (27310, 27334), + (27356, 27453), + (27477, 27683), + (27713, 27737), + (27763, 27851), + (27881, 27893), + (27928, 27989), + (28004, 28037), + (28057, 28164), + (28190, 28268), + (28314, 28371), + (28445, 28508), + (28550, 28607), + (28677, 28696), + (28730, 28786), + (28796, 28810), + (28828, 28868), + (28878, 28918), + (28934, 29041), + (29079, 29138), + (29160, 29217), + (29228, 29261), + (29277, 29356), + (29386, 29443), + (29454, 29487), + (29503, 29582), + (29604, 29661), + (29672, 29705), + (29721, 30160), + (30172, 30635), + ], + }, + ), + ( + "aspx_606dbf_template_engine", + { + "language": LANG_CS, + "aspx namespaces": [ + "System", + 
"System.IO", + "System.Web", + "System.Web.SessionState", + "System.Web.UI", + "System.Web.Configuration", + "System.Threading", + "System.Net", + "System.Net.Sockets", + "System.Text", + ], + "code sections": [ + (2, 87), + (93, 121), + (127, 158), + (164, 196), + (202, 247), + (253, 288), + (294, 340), + (346, 384), + (390, 422), + (428, 468), + (474, 507), + ], + "content sections": [ + (89, 91), + (123, 125), + (160, 162), + (198, 200), + (249, 251), + (290, 292), + (342, 344), + (386, 388), + (424, 426), + (470, 472), + (509, 7078), + ], + }, + ), + ( + "aspx_ea2a01_template_engine", + { + "language": LANG_CS, + "aspx namespaces": ["System.Diagnostics", "System.IO", "System.Security.Cryptography", "System"], + "code sections": [(2, 47), (53, 93), (99, 130), (136, 186), (192, 220), (228, 5811)], + "content sections": [(49, 51), (95, 97), (132, 134), (188, 190), (222, 226), (5813, 5818)], + }, + ), + ( + "aspx_a5c893_template_engine", + { + "language": LANG_CS, + "aspx namespaces": ["System.Reflection"], + "code sections": [(2, 23), (27, 64), (68, 469)], + "content sections": [(471, 472)], + }, + ), + ( + "aspx_b75f16_template_engine", + { + "language": LANG_CS, + "aspx namespaces": ["System.IO"], + "code sections": [(2, 123), (127, 157), (303, 587)], + "content sections": [(159, 301), (589, 596)], + }, + ), + ( + "aspx_d460ca_template_engine", + { + "language": LANG_CS, + "aspx namespaces": [ + "System.Reflection", + "Microsoft.CSharp", + "System.CodeDom.Compiler", + "System.IO", + "System.Security.Cryptography", + ], + "code sections": [(2, 22), (27, 65), (70, 107), (112, 156), (161, 191), (196, 245)], + "content sections": [(24, 25), (67, 68), (109, 110), (158, 159), (193, 194), (247, 4866)], + }, + ), + ( + "aspx_b4bb14_template_engine", + { + "language": LANG_CS, + "aspx namespaces": ["System.Diagnostics", "System.IO"], + "code sections": [(2, 50), (55, 95), (100, 131)], + "content sections": [(52, 53), (97, 98), (133, 1398)], + }, + ), + ( + 
"aspx_f2bf20_template_engine", + { + "language": LANG_CS, + "aspx namespaces": [ + "System.IO", + "System.IO.Compression", + "System.Diagnostics", + "System.Data", + "System.Data.OleDb", + "System.Data.Common", + "System.Data.SqlClient", + "System.Management", + "Microsoft.Win32", + "System.Net", + "System.Net.Sockets", + "System.Reflection", + "System.Runtime.InteropServices", + "System.DirectoryServices", + "System.ServiceProcess", + "System.Text.RegularExpressions", + "System.Security", + "System.Security.Permissions", + "System.Threading", + ], + "code sections": [ + (2, 125), + (133, 164), + (170, 213), + (219, 259), + (265, 298), + (304, 343), + (349, 389), + (395, 438), + (444, 483), + (489, 526), + (532, 564), + (570, 610), + (616, 655), + (661, 713), + (719, 765), + (771, 814), + (820, 872), + (878, 915), + (921, 970), + (976, 1014), + (1020, 1127), + (1133, 1233), + (1239, 1343), + (39508, 39563), + (45103, 45113), + (47599, 47609), + (48705, 48712), + ], + "content sections": [ + (127, 131), + (166, 168), + (215, 217), + (261, 263), + (300, 302), + (345, 347), + (391, 393), + (440, 442), + (485, 487), + (528, 530), + (566, 568), + (612, 614), + (657, 659), + (715, 717), + (767, 769), + (816, 818), + (874, 876), + (917, 919), + (972, 974), + (1016, 1018), + (1129, 1131), + (1235, 1237), + (1345, 39505), + (39565, 45100), + (45116, 47596), + (47612, 48702), + (48715, 55896), + ], + }, + ), + ( + "aspx_5f959f_template_engine", + { + "language": LANG_CS, + "aspx namespaces": ["System.Diagnostics", "System.IO"], + "code sections": [(2, 50), (55, 95), (100, 131)], + "content sections": [(52, 53), (97, 98), (133, 1400)], + }, + ), + ( + "aspx_f39dc0_template_engine", + { + "language": LANG_CS, + "aspx namespaces": ["System.Diagnostics", "System.IO", "System.Net"], + "code sections": [(2, 50), (56, 96), (102, 133), (139, 171), (678, 1421)], + "content sections": [(52, 54), (98, 100), (135, 137), (173, 676), (1423, 1441)], + }, + ), + ( + 
"aspx_54433d_template_engine", + { + "language": LANG_CS, + "aspx namespaces": [ + "System.Diagnostics", + "System.IO", + "System.IO.Compression", + "Microsoft.VisualBasic", + ], + "code sections": [(2, 50), (55, 95), (100, 131), (136, 179), (184, 227), (233, 280)], + "content sections": [(52, 53), (97, 98), (133, 134), (181, 182), (229, 230), (283, 10444)], + }, + ), + ( + "aspx_f397cb_template_engine", { "language": LANG_CS, "aspx namespaces": ["System"], - "code sections": [(2, 22), (28, 56), (3977, 4008), (4060, 4091)], - "content sections": [(24, 26), (58, 3975), (4010, 4058), (4093, 4415)], + "code sections": [(2, 22), (28, 56), (3950, 3981), (4033, 4064)], + "content sections": [(24, 26), (58, 3948), (3983, 4031), (4066, 4388)], }, - ) + ), + ( + "aspx_15eed4_template_engine", + { + "language": LANG_CS, + "aspx namespaces": [ + "System.IO", + "System.Diagnostics", + "System.Data", + "System.Management", + "System.Data.OleDb", + "Microsoft.Win32", + "System.Net.Sockets", + "System.Net", + "System.Runtime.InteropServices", + "System.DirectoryServices", + "System.ServiceProcess", + "System.Text.RegularExpressions", + "System.Threading", + "System.Data.SqlClient", + "Microsoft.VisualBasic", + ], + "code sections": [ + (2, 123), + (128, 158), + (163, 202), + (207, 239), + (244, 282), + (287, 325), + (330, 366), + (371, 411), + (416, 448), + (453, 504), + (509, 554), + (559, 601), + (606, 657), + (662, 699), + (704, 746), + (751, 793), + (798, 904), + (909, 1008), + (1013, 1116), + (1121, 1227), + (54081, 54091), + (55610, 55620), + (56304, 56315), + (57500, 57508), + (57995, 58004), + (58531, 58541), + (58984, 58994), + (59512, 59521), + (60014, 60024), + (60284, 60291), + (61559, 61564), + (62217, 62227), + (62711, 62721), + (66897, 66906), + (67954, 67962), + ], + "content sections": [ + (125, 126), + (160, 161), + (204, 205), + (241, 242), + (284, 285), + (327, 328), + (368, 369), + (413, 414), + (450, 451), + (506, 507), + (556, 557), + (603, 604), + (659, 
660), + (701, 702), + (748, 749), + (795, 796), + (906, 907), + (1010, 1011), + (1118, 1119), + (1229, 54078), + (54094, 55607), + (55623, 56301), + (56318, 57497), + (57511, 57992), + (58007, 58528), + (58544, 58981), + (58997, 59509), + (59524, 60011), + (60027, 60281), + (60294, 61556), + (61567, 62214), + (62230, 62708), + (62724, 66894), + (66909, 67951), + (67965, 70053), + ], + }, + ), + ( + "aspx_6f3261_template_engine", + { + "language": LANG_CS, + "aspx namespaces": ["System.Data", "System.Data.SqlClient"], + "code sections": [(2, 23), (28, 60), (65, 107)], + "content sections": [(25, 26), (62, 63), (109, 3303)], + }, + ), ], ) -def test_ts_engine_template(request: pytest.FixtureRequest, engine_str: str, expected_dict: dict): +def test_ts_template_engine(request: pytest.FixtureRequest, engine_str: str, expected: dict): engine: TreeSitterTemplateEngine = request.getfixturevalue(engine_str) do_test_ts_template_engine_init(engine) - assert engine.identify_language() == expected_dict["language"] - do_test_ts_template_engine_get_template_namespaces(engine, expected_dict["aspx namespaces"]) + assert engine.identify_language() == expected["language"] + do_test_ts_template_engine_get_template_namespaces(engine, expected["aspx namespaces"]) + do_test_ts_template_engine_get_code_sections(engine, expected["code sections"]) + do_test_ts_template_engine_get_content_sections(engine, expected["content sections"]) + do_test_ts_template_engine_get_parsed_code_sections(engine, expected["language"], expected["code sections"]) FEATURE_PRESENCE_TESTS_SCRIPTS = sorted( From ca1939f148bafb7ffc6e06791ae0a7d4ed80096e Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Fri, 8 Jul 2022 17:01:22 -0400 Subject: [PATCH 27/51] Bug fixes in extractor and HTML Tree-sitter engine. 
--- capa/features/extractors/ts/engine.py | 38 ++++++++------ capa/features/extractors/ts/extractor.py | 11 ++-- capa/features/extractors/ts/query.py | 2 + tests/fixtures.py | 16 ++++++ tests/test_ts.py | 66 ++++++++++++++++++++---- 5 files changed, 104 insertions(+), 29 deletions(-) diff --git a/capa/features/extractors/ts/engine.py b/capa/features/extractors/ts/engine.py index 425658856..830bb6775 100644 --- a/capa/features/extractors/ts/engine.py +++ b/capa/features/extractors/ts/engine.py @@ -151,12 +151,14 @@ def get_code_sections(self) -> List[Tuple[Node, str]]: def get_parsed_code_sections(self) -> Iterator[TreeSitterExtractorEngine]: template_namespaces = set(name for _, name in self.get_template_namespaces()) for node, _ in self.get_code_sections(): - yield TreeSitterExtractorEngine( - self.identify_language(), - self.get_byte_range(node), - node.start_byte, - template_namespaces, - ) + # TODO: support JS + if self.identify_language() == LANG_CS: + yield TreeSitterExtractorEngine( + self.identify_language(), + self.get_byte_range(node), + node.start_byte, + template_namespaces, + ) def get_content_sections(self) -> List[Tuple[Node, str]]: return self.query.content.captures(self.tree.root_node) @@ -213,20 +215,26 @@ def get_scripts(self) -> List[Tuple[Node, str]]: return self.query.script_element.captures(self.tree.root_node) def get_attributes(self, node: Node) -> List[Tuple[Node, str]]: - return self.query.attribute.captures(self.tree.root_node) + return self.query.attribute.captures(node) - def get_code_sections(self) -> Iterator[Node]: - for script_node, _ in self.get_scripts(): - for attribute_node, _ in self.get_attributes(script_node): - yield attribute_node + def get_identified_scripts(self) -> Iterator[Tuple[Node, str]]: + for node, _ in self.get_scripts(): + for content_node, _ in self.get_script_contents(node): + yield content_node, self.identify_language(node) + + def get_script_contents(self, node: Node) -> Iterator[Tuple[Node, str]]: + return 
self.query.script_content.captures(node) def get_parsed_code_sections(self) -> Iterator[TreeSitterExtractorEngine]: - for node in self.get_code_sections(): - yield TreeSitterExtractorEngine(self.identify_language(node), self.get_byte_range(node), node.start_byte) + for node, language in self.get_identified_scripts(): + # TODO: support JS + if language == LANG_CS: + yield TreeSitterExtractorEngine(language, self.get_byte_range(node), node.start_byte) def identify_language(self, node: Node) -> str: - if self.is_server_side_c_sharp(node): - return LANG_CS + for attribute_node, _ in self.get_attributes(node): + if self.is_server_side_c_sharp(attribute_node): + return LANG_CS return LANG_JS def is_server_side_c_sharp(self, node: Node) -> bool: diff --git a/capa/features/extractors/ts/extractor.py b/capa/features/extractors/ts/extractor.py index b62461eb8..e4b823989 100644 --- a/capa/features/extractors/ts/extractor.py +++ b/capa/features/extractors/ts/extractor.py @@ -16,8 +16,9 @@ class TreeSitterFeatureExtractor(FeatureExtractor): code_sections: List[TreeSitterExtractorEngine] - template_namespaces: set[Tuple[Node, str]] + template_namespaces: List[Tuple[Node, str]] language: str + path: str def __init__(self, path: str): super().__init__() @@ -36,14 +37,14 @@ def __init__(self, path: str): else: self.code_sections = [TreeSitterExtractorEngine(self.language, buf)] - def extract_code_from_template(self, buf: bytes) -> Tuple[List[TreeSitterExtractorEngine], set[Tuple[Node, str]]]: + def extract_code_from_template(self, buf: bytes) -> Tuple[List[TreeSitterExtractorEngine], List[Tuple[Node, str]]]: template_engine = TreeSitterTemplateEngine(buf) - template_namespaces = set(template_engine.get_template_namespaces()) + template_namespaces = list(template_engine.get_template_namespaces()) code_sections = list(template_engine.get_parsed_code_sections()) additional_namespaces = set(name for _, name in template_namespaces) - for section in 
template_engine.get_content_sections(): - section_buf = template_engine.get_byte_range(section) + for node, _ in template_engine.get_content_sections(): + section_buf = template_engine.get_byte_range(node) code_sections.extend(list(self.extract_code_from_html(section_buf, additional_namespaces))) return code_sections, template_namespaces diff --git a/capa/features/extractors/ts/query.py b/capa/features/extractors/ts/query.py index 70fe046b7..169a8c552 100644 --- a/capa/features/extractors/ts/query.py +++ b/capa/features/extractors/ts/query.py @@ -35,6 +35,7 @@ class TemplateQueryBinding(QueryBinding): @dataclass class HTMLQueryBinding(QueryBinding): script_element: Query + script_content: Query attribute: Query @@ -99,6 +100,7 @@ def deserialize(language: str, binding: dict) -> dict: "query": { "script_element": "(script_element) @script-element", "attribute": "(attribute) @attribute", + "script_content": "(raw_text) @script-content", }, }, ), diff --git a/tests/fixtures.py b/tests/fixtures.py index 9ea428a8d..70aa9a44a 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -401,6 +401,22 @@ def sample(request): return resolve_sample(request.param) +def resolve_sample_ts(sample): + if sample.startswith("cs_"): + return get_data_path_by_name(sample) + if sample.startswith("aspx_"): + try: + return ASPX_DATA_PATH_BY_NAME[sample] + except KeyError: + raise ValueError(f"unexpected sample fixture: {sample}") + raise ValueError(f"unexpected sample fixture: {sample}") + + +@pytest.fixture +def sample_ts(request): + return resolve_sample_ts(request.param) + + def get_function(extractor, fva: Union[int, tuple]) -> FunctionHandle: if isinstance(fva, tuple) and not isinstance(extractor, TreeSitterFeatureExtractor): raise ValueError("invalid fva format") diff --git a/tests/test_ts.py b/tests/test_ts.py index 64e6fb548..58e2afc4a 100644 --- a/tests/test_ts.py +++ b/tests/test_ts.py @@ -9,8 +9,13 @@ from capa.features.common import OS, OS_ANY, ARCH_ANY, FORMAT_SCRIPT, Arch, 
Format, String, Namespace, ScriptLanguage from capa.features.address import FileOffsetRangeAddress from capa.features.extractors.script import LANG_CS, LANG_JS, LANG_TEM, LANG_HTML -from capa.features.extractors.ts.query import QueryBinding, TemplateQueryBinding -from capa.features.extractors.ts.engine import TreeSitterBaseEngine, TreeSitterTemplateEngine, TreeSitterExtractorEngine +from capa.features.extractors.ts.query import QueryBinding, HTMLQueryBinding, TemplateQueryBinding +from capa.features.extractors.ts.engine import ( + TreeSitterBaseEngine, + TreeSitterHTMLEngine, + TreeSitterTemplateEngine, + TreeSitterExtractorEngine, +) def do_test_ts_base_engine_init(engine: TreeSitterBaseEngine): @@ -275,14 +280,12 @@ def test_ts_extractor_engine(request: pytest.FixtureRequest, engine_str: str, ex def do_test_ts_template_engine_init(engine: TreeSitterTemplateEngine): assert engine.language == LANG_TEM - assert isinstance(engine.query, QueryBinding) + assert isinstance(engine.query, TemplateQueryBinding) assert isinstance(engine.buf, bytes) and len(engine.buf) > 0 assert isinstance(engine.tree, Tree) assert isinstance(engine.get_default_address(), FileOffsetRangeAddress) addr = engine.get_default_address() assert addr.start_byte == engine.tree.root_node.start_byte and addr.end_byte == engine.tree.root_node.end_byte - assert isinstance(engine.query, TemplateQueryBinding) - pass def do_test_ts_template_engine_get_template_namespaces(engine: TreeSitterTemplateEngine, expected: List[str]): @@ -907,8 +910,24 @@ def test_ts_template_engine(request: pytest.FixtureRequest, engine_str: str, exp assert engine.identify_language() == expected["language"] do_test_ts_template_engine_get_template_namespaces(engine, expected["aspx namespaces"]) do_test_ts_template_engine_get_code_sections(engine, expected["code sections"]) - do_test_ts_template_engine_get_content_sections(engine, expected["content sections"]) do_test_ts_template_engine_get_parsed_code_sections(engine, 
expected["language"], expected["code sections"]) + do_test_ts_template_engine_get_content_sections(engine, expected["content sections"]) + for expected_start_byte, expected_end_byte in expected["content sections"]: + template_namespaces = list(engine.get_template_namespaces()) + additional_namespaces = set(name for _, name in template_namespaces) + html_engine = TreeSitterHTMLEngine(engine.buf[expected_start_byte:expected_end_byte], additional_namespaces) + do_test_ts_html_engine_init(html_engine) + + +def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine): + assert engine.language == LANG_HTML + assert isinstance(engine.query, HTMLQueryBinding) + assert isinstance(engine.buf, bytes) and len(engine.buf) > 0 + assert isinstance(engine.tree, Tree) + assert isinstance(engine.get_default_address(), FileOffsetRangeAddress) + assert isinstance(engine.namespaces, set) + addr = engine.get_default_address() + assert addr.start_byte == engine.tree.root_node.start_byte and addr.end_byte == engine.tree.root_node.end_byte FEATURE_PRESENCE_TESTS_SCRIPTS = sorted( @@ -923,10 +942,39 @@ def test_ts_template_engine(request: pytest.FixtureRequest, engine_str: str, exp ("cs_138cdc", "function=(0x16e,0x7ce)", String("127.0.0.1"), True), ("cs_138cdc", "function=(0x16e,0x7ce)", Import("System.Diagnostics.ProcessStartInfo"), True), ("cs_138cdc", "function=(0x16e,0x7ce)", Import("System.Diagnostics.Process"), True), + ("aspx_4f6fa6", "global", Arch(ARCH_ANY), True), + ("aspx_4f6fa6", "global", OS(OS_ANY), True), + ("aspx_4f6fa6", "file", Format(FORMAT_SCRIPT), True), + ("aspx_4f6fa6", "file", ScriptLanguage(LANG_CS), True), + ("aspx_4f6fa6", "file", Namespace("System.Diagnostics"), True), + ("aspx_4f6fa6", "file", Namespace("System.IO"), True), + ("aspx_4f6fa6", "file", Namespace("System.IO.Compression"), True), + ("aspx_4f6fa6", "function=(0xad, 0x28e)", String("powershell.exe"), True), + ("aspx_5f959f", "global", Arch(ARCH_ANY), True), + ("aspx_10162f", "global", Arch(ARCH_ANY), 
True), + ("aspx_2b71dd", "global", Arch(ARCH_ANY), True), + ("aspx_f2bf20", "global", Arch(ARCH_ANY), True), + ("aspx_f39dc0", "global", Arch(ARCH_ANY), True), + ("aspx_ea2a01", "global", Arch(ARCH_ANY), True), + ("aspx_6f3261", "global", Arch(ARCH_ANY), True), + ("aspx_1f8f40", "global", Arch(ARCH_ANY), True), + ("aspx_2e8c7e", "global", Arch(ARCH_ANY), True), + ("aspx_03bb5c", "global", Arch(ARCH_ANY), True), + ("aspx_606dbf", "global", Arch(ARCH_ANY), True), + ("aspx_f397cb", "global", Arch(ARCH_ANY), True), + ("aspx_b4bb14", "global", Arch(ARCH_ANY), True), + ("aspx_54433d", "global", Arch(ARCH_ANY), True), + ("aspx_a35878", "global", Arch(ARCH_ANY), True), + ("aspx_a5c893", "global", Arch(ARCH_ANY), True), + ("aspx_15eed4", "global", Arch(ARCH_ANY), True), + ("aspx_b75f16", "global", Arch(ARCH_ANY), True), + ("aspx_d460ca", "global", Arch(ARCH_ANY), True), ] ) -@parametrize("sample, scope_ts, feature, expected", FEATURE_PRESENCE_TESTS_SCRIPTS, indirect=["sample", "scope_ts"]) -def test_ts_extractor(sample, scope_ts, feature, expected): - fixtures.do_test_feature_presence(fixtures.get_ts_extractor, sample, scope_ts, feature, expected) +@parametrize( + "sample_ts, scope_ts, feature, expected", FEATURE_PRESENCE_TESTS_SCRIPTS, indirect=["sample_ts", "scope_ts"] +) +def test_ts_extractor(sample_ts, scope_ts, feature, expected): + fixtures.do_test_feature_presence(fixtures.get_ts_extractor, sample_ts, scope_ts, feature, expected) From d7ab2db0e6e7c1f378507795da508745b978feb2 Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Mon, 11 Jul 2022 10:36:56 -0400 Subject: [PATCH 28/51] Fixed important namespace-parsing bugs. 
--- capa/features/extractors/ts/engine.py | 30 +++++++++++++++++---------- 1 file changed, 19 insertions(+), 11 deletions(-) diff --git a/capa/features/extractors/ts/engine.py b/capa/features/extractors/ts/engine.py index 830bb6775..1a5f97526 100644 --- a/capa/features/extractors/ts/engine.py +++ b/capa/features/extractors/ts/engine.py @@ -88,9 +88,9 @@ def get_import_names(self, node: Node) -> Iterator[Tuple[Node, str]]: yield (obj_node, obj_name) continue for namespace in self.namespaces: - obj_name = join_names(namespace, obj_name) - if obj_name in self.import_signatures: - yield (obj_node, obj_name) + joined_obj_name = join_names(namespace, obj_name) + if joined_obj_name in self.import_signatures: + yield (obj_node, joined_obj_name) def get_function_definitions(self, node: Node = None) -> List[Tuple[Node, str]]: return self.query.function_definition.captures(node if node is not None else self.tree.root_node) @@ -122,9 +122,9 @@ def get_function_names(self, node: Node) -> Iterator[Tuple[Node, str]]: yield (fn_node, fn_name) continue for namespace in self.namespaces: - fn_name = join_names(namespace, fn_name) - if fn_name in self.import_signatures: - yield (fn_node, fn_name) + joined_fn_name = join_names(namespace, fn_name) + if joined_fn_name in self.import_signatures: + yield (fn_node, joined_fn_name) def get_string_literals(self, node: Node) -> List[Tuple[Node, str]]: return self.query.string_literal.captures(node) @@ -141,23 +141,25 @@ def get_global_statements(self) -> List[Tuple[Node, str]]: class TreeSitterTemplateEngine(TreeSitterBaseEngine): query: TemplateQueryBinding + embedded_language: str def __init__(self, buf: bytes): super().__init__(LANG_TEM, buf) + self.embedded_language = self.identify_language() + self.template_namespaces = set(name for _, name in self.get_template_namespaces()) def get_code_sections(self) -> List[Tuple[Node, str]]: return self.query.code.captures(self.tree.root_node) def get_parsed_code_sections(self) -> 
Iterator[TreeSitterExtractorEngine]: - template_namespaces = set(name for _, name in self.get_template_namespaces()) for node, _ in self.get_code_sections(): # TODO: support JS - if self.identify_language() == LANG_CS: + if self.embedded_language == LANG_CS: yield TreeSitterExtractorEngine( self.identify_language(), self.get_byte_range(node), node.start_byte, - template_namespaces, + self.template_namespaces, ) def get_content_sections(self) -> List[Tuple[Node, str]]: @@ -169,13 +171,19 @@ def identify_language(self) -> str: return LANG_CS return LANG_JS - def get_template_namespaces(self) -> Iterator[Tuple[Node, str]]: + def get_imported_namespaces(self) -> Iterator[Tuple[Node, str]]: for node, _ in self.get_code_sections(): if self.is_aspx_import_directive(node): namespace = self.get_aspx_namespace(node) if namespace is not None: yield node, namespace + def get_template_namespaces(self) -> Iterator[Tuple[Optional[Node], str]]: + for namespace in capa.features.extractors.ts.sig.get_default_namespaces(self.embedded_language, True): + yield None, namespace + for node, namespace in self.get_imported_namespaces(): + yield node, namespace + def is_c_sharp(self, node: Node) -> bool: return bool( re.match( @@ -229,7 +237,7 @@ def get_parsed_code_sections(self) -> Iterator[TreeSitterExtractorEngine]: for node, language in self.get_identified_scripts(): # TODO: support JS if language == LANG_CS: - yield TreeSitterExtractorEngine(language, self.get_byte_range(node), node.start_byte) + yield TreeSitterExtractorEngine(language, self.get_byte_range(node), node.start_byte, self.namespaces) def identify_language(self, node: Node) -> str: for attribute_node, _ in self.get_attributes(node): From 5cfbecc3dd92ef7b649676c548d141af73f27d92 Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Mon, 11 Jul 2022 10:39:57 -0400 Subject: [PATCH 29/51] Further improvement to namespace parsing, including default namespaces for aspx; introducing TSFunctionInner dataclass to store both the node and 
the name of a function to make test-case creation easier. --- capa/features/extractors/ts/extractor.py | 10 ++++- capa/features/extractors/ts/function.py | 17 +++++++-- capa/features/extractors/ts/sig.py | 22 +++++++++++ .../features/extractors/ts/signatures/cs.json | 1 + tests/fixtures.py | 18 ++++++--- tests/test_ts.py | 37 +++++++++++++++---- 6 files changed, 87 insertions(+), 18 deletions(-) diff --git a/capa/features/extractors/ts/extractor.py b/capa/features/extractors/ts/extractor.py index e4b823989..7bddd0f15 100644 --- a/capa/features/extractors/ts/extractor.py +++ b/capa/features/extractors/ts/extractor.py @@ -1,4 +1,5 @@ from typing import List, Tuple, Union, Iterator +from dataclasses import dataclass from tree_sitter import Node @@ -11,6 +12,7 @@ from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress, FileOffsetRangeAddress from capa.features.extractors.script import LANG_TEM, LANG_HTML from capa.features.extractors.ts.engine import TreeSitterHTMLEngine, TreeSitterTemplateEngine, TreeSitterExtractorEngine +from capa.features.extractors.ts.function import TSFunctionInner from capa.features.extractors.base_extractor import Feature, BBHandle, InsnHandle, FunctionHandle, FeatureExtractor @@ -60,7 +62,10 @@ def get_base_address( def extract_template_namespaces(self) -> Iterator[Tuple[Feature, Address]]: for node, name in self.template_namespaces: - yield Namespace(name), FileOffsetRangeAddress(node.start_byte, node.end_byte) + if node is None: + yield Namespace(name), NO_ADDRESS + else: + yield Namespace(name), FileOffsetRangeAddress(node.start_byte, node.end_byte) def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]: yield from capa.features.extractors.ts.global_.extract_features() @@ -74,7 +79,8 @@ def extract_file_features(self) -> Iterator[Tuple[Feature, Address]]: def get_functions(self) -> Iterator[FunctionHandle]: for engine in self.code_sections: for node, _ in engine.get_function_definitions(): - yield 
FunctionHandle(address=engine.get_address(node), inner=node) + name = engine.get_range(engine.get_function_definition_id(node)) + yield FunctionHandle(address=engine.get_address(node), inner=TSFunctionInner(node, name)) def extract_function_features(self, f: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: for engine in self.code_sections: diff --git a/capa/features/extractors/ts/function.py b/capa/features/extractors/ts/function.py index 230caf523..03c858619 100644 --- a/capa/features/extractors/ts/function.py +++ b/capa/features/extractors/ts/function.py @@ -1,4 +1,7 @@ from typing import Tuple, Iterator +from dataclasses import dataclass + +from tree_sitter import Node from capa.features.file import Import, FunctionName from capa.features.insn import Number @@ -8,25 +11,31 @@ from capa.features.extractors.base_extractor import FunctionHandle +@dataclass +class TSFunctionInner: + node: Node + name: str + + def extract_strings(fh: FunctionHandle, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: - for node, _ in engine.get_string_literals(fh.inner): + for node, _ in engine.get_string_literals(fh.inner.node): yield String(engine.get_range(node).strip('"')), engine.get_address(node) def extract_integer_literals( fh: FunctionHandle, engine: TreeSitterExtractorEngine ) -> Iterator[Tuple[Feature, Address]]: - for node, _ in engine.get_integer_literals(fh.inner): + for node, _ in engine.get_integer_literals(fh.inner.node): yield Number(int(engine.get_range(node))), engine.get_address(node) def extract_function_names(fh: FunctionHandle, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: - for node, name in engine.get_function_names(fh.inner): + for node, name in engine.get_function_names(fh.inner.node): yield FunctionName(name), engine.get_address(node) def extract_import_names(fh: FunctionHandle, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: - for node, name in 
engine.get_import_names(fh.inner): + for node, name in engine.get_import_names(fh.inner.node): yield Import(name), engine.get_address(node) diff --git a/capa/features/extractors/ts/sig.py b/capa/features/extractors/ts/sig.py index 4034fc5ac..6c1a6da3f 100644 --- a/capa/features/extractors/ts/sig.py +++ b/capa/features/extractors/ts/sig.py @@ -21,3 +21,25 @@ def get_name_joiner(language: str) -> Callable: if language == LANG_CS: return lambda qualified_name, identifier: qualified_name + "." + identifier raise ValueError("Language {language} does not have a name joiner") + + +def get_default_namespaces(language: str, embedded: bool) -> set: + if embedded and language == LANG_CS: + return { + "System", + "System.Collections", + "System.Collections.Specialized", + "System.Configuration", + "System.Text", + "System.Text.RegularExpressions", + "System.Web", + "System.Web.Caching", + "System.Web.Profile", + "System.Web.Security", + "System.Web.SessionState", + "System.Web.UI", + "System.Web.UI.HtmlControls", + "System.Web.UI.WebControls", + "System.Web.UI.WebControls.WebParts", + } + return set() diff --git a/capa/features/extractors/ts/signatures/cs.json b/capa/features/extractors/ts/signatures/cs.json index d0d7abe68..160f7a150 100644 --- a/capa/features/extractors/ts/signatures/cs.json +++ b/capa/features/extractors/ts/signatures/cs.json @@ -3,6 +3,7 @@ "System.Convert.FromBase64String", "System.Diagnostics.Process", "System.Diagnostics.ProcessStartInfo", + "System.Diagnostics.Process.Start", "System.Security.Cryptography.RijndaelManaged", "System.Security.Cryptography.CryptoStream" ] \ No newline at end of file diff --git a/tests/fixtures.py b/tests/fixtures.py index 70aa9a44a..15cb449f5 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -417,12 +417,17 @@ def sample_ts(request): return resolve_sample_ts(request.param) -def get_function(extractor, fva: Union[int, tuple]) -> FunctionHandle: - if isinstance(fva, tuple) and not isinstance(extractor, 
TreeSitterFeatureExtractor): +def get_function(extractor, fva: Union[int, tuple, str]) -> FunctionHandle: + if (isinstance(fva, tuple) or isinstance(fva, str)) and not isinstance(extractor, TreeSitterFeatureExtractor): raise ValueError("invalid fva format") for fh in extractor.get_functions(): if isinstance(extractor, TreeSitterFeatureExtractor): - addr = (fh.inner.start_byte, fh.inner.end_byte) + if isinstance(fva, tuple): + addr = (fh.inner.node.start_byte, fh.inner.node.end_byte) + elif isinstance(fva, str): + addr = fh.inner.name + else: + raise ValueError("invalid fva format") elif isinstance(extractor, DnfileFeatureExtractor): addr = fh.inner.offset else: @@ -549,9 +554,12 @@ def inner_fn(extractor): return features elif scope.startswith("function"): - # like `function=(155, 192)` + # like `function=(0xbeef, 0xdead) or function=(123, 456) or function=foo_bar` def inner_fn(extractor): - fh = get_function(extractor, eval(scope.partition("=")[2])) + fn = scope.partition("=")[2] + if fn[0] == "(" and fn[-1] == ")": + fn = tuple(int(x, 16) if x.lstrip().startswith("0x") else int(x) for x in fn[1:-1].split(",")) + fh = get_function(extractor, fn) features = extract_function_features(extractor, fh) for k, vs in extract_global_features(extractor).items(): features[k].update(vs) diff --git a/tests/test_ts.py b/tests/test_ts.py index 58e2afc4a..6f9cd6a0a 100644 --- a/tests/test_ts.py +++ b/tests/test_ts.py @@ -5,8 +5,19 @@ from fixtures import * from tree_sitter import Node, Tree -from capa.features.file import Import -from capa.features.common import OS, OS_ANY, ARCH_ANY, FORMAT_SCRIPT, Arch, Format, String, Namespace, ScriptLanguage +from capa.features.file import Import, FunctionName +from capa.features.common import ( + OS, + OS_ANY, + ARCH_ANY, + FORMAT_SCRIPT, + Arch, + Format, + String, + Namespace, + Substring, + ScriptLanguage, +) from capa.features.address import FileOffsetRangeAddress from capa.features.extractors.script import LANG_CS, LANG_JS, LANG_TEM, 
LANG_HTML from capa.features.extractors.ts.query import QueryBinding, HTMLQueryBinding, TemplateQueryBinding @@ -288,9 +299,14 @@ def do_test_ts_template_engine_init(engine: TreeSitterTemplateEngine): assert addr.start_byte == engine.tree.root_node.start_byte and addr.end_byte == engine.tree.root_node.end_byte -def do_test_ts_template_engine_get_template_namespaces(engine: TreeSitterTemplateEngine, expected: List[str]): - assert len(list(engine.get_template_namespaces())) == len(expected) - for (node, namespace), expected_namespace in zip(list(engine.get_template_namespaces()), expected): +def do_test_ts_template_engine_get_template_namespaces( + engine: TreeSitterTemplateEngine, expected_language: str, expected: List[str] +): + default_namespaces = capa.features.extractors.ts.sig.get_default_namespaces(expected_language, True) + template_namespaces = {name for _, name in engine.get_template_namespaces()} + assert default_namespaces.issubset(template_namespaces) + assert len(list(engine.get_imported_namespaces())) == len(expected) + for (node, namespace), expected_namespace in zip(list(engine.get_imported_namespaces()), expected): assert isinstance(node, Node) assert engine.is_aspx_import_directive(node) == True assert engine.get_aspx_namespace(node) == expected_namespace @@ -908,7 +924,7 @@ def test_ts_template_engine(request: pytest.FixtureRequest, engine_str: str, exp engine: TreeSitterTemplateEngine = request.getfixturevalue(engine_str) do_test_ts_template_engine_init(engine) assert engine.identify_language() == expected["language"] - do_test_ts_template_engine_get_template_namespaces(engine, expected["aspx namespaces"]) + do_test_ts_template_engine_get_template_namespaces(engine, expected["language"], expected["aspx namespaces"]) do_test_ts_template_engine_get_code_sections(engine, expected["code sections"]) do_test_ts_template_engine_get_parsed_code_sections(engine, expected["language"], expected["code sections"]) 
do_test_ts_template_engine_get_content_sections(engine, expected["content sections"]) @@ -949,7 +965,14 @@ def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine): ("aspx_4f6fa6", "file", Namespace("System.Diagnostics"), True), ("aspx_4f6fa6", "file", Namespace("System.IO"), True), ("aspx_4f6fa6", "file", Namespace("System.IO.Compression"), True), - ("aspx_4f6fa6", "function=(0xad, 0x28e)", String("powershell.exe"), True), + ("aspx_4f6fa6", "function=do_ps", String("powershell.exe"), True), + ("aspx_4f6fa6", "function=do_ps", Substring("-executionpolicy bypass"), True), + ("aspx_4f6fa6", "function=do_ps", Import("System.Diagnostics.ProcessStartInfo"), True), + ("aspx_4f6fa6", "function=do_ps", FunctionName("System.Diagnostics.Process.Start"), True), + ("aspx_4f6fa6", "function=ps", String("\\nPS> "), True), + ("aspx_4f6fa6", "function=ps", Substring("PS>"), True), + ("aspx_4f6fa6", "function=downloadbutton_Click", Substring("filename"), True), + ("aspx_4f6fa6", "function=base64encode", FunctionName("System.Convert.ToBase64String"), True), ("aspx_5f959f", "global", Arch(ARCH_ANY), True), ("aspx_10162f", "global", Arch(ARCH_ANY), True), ("aspx_2b71dd", "global", Arch(ARCH_ANY), True), From 26cc1bc3701799c95b7b4fc1daa6243988879baf Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Mon, 11 Jul 2022 15:24:29 -0400 Subject: [PATCH 30/51] Added more tests and a few minor bug fixes. 
--- capa/features/extractors/ts/extractor.py | 1 - capa/features/extractors/ts/file.py | 5 ++- capa/features/extractors/ts/function.py | 5 ++- capa/features/extractors/ts/query.py | 2 +- .../features/extractors/ts/signatures/cs.json | 4 +- tests/fixtures.py | 2 +- tests/test_ts.py | 39 +++++++++++++++++-- 7 files changed, 48 insertions(+), 10 deletions(-) diff --git a/capa/features/extractors/ts/extractor.py b/capa/features/extractors/ts/extractor.py index 7bddd0f15..50deacd2b 100644 --- a/capa/features/extractors/ts/extractor.py +++ b/capa/features/extractors/ts/extractor.py @@ -1,5 +1,4 @@ from typing import List, Tuple, Union, Iterator -from dataclasses import dataclass from tree_sitter import Node diff --git a/capa/features/extractors/ts/file.py b/capa/features/extractors/ts/file.py index 45962d1c5..16f099b92 100644 --- a/capa/features/extractors/ts/file.py +++ b/capa/features/extractors/ts/file.py @@ -1,6 +1,7 @@ from typing import Tuple, Iterator import capa.features.extractors.script +import capa.features.extractors.ts.integer from capa.features.file import Import, FunctionName from capa.features.insn import Number from capa.features.common import String, Feature, Namespace @@ -21,7 +22,9 @@ def extract_file_strings(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Fe def extract_file_integer_literals(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: for global_node, _ in engine.get_global_statements(): for node, _ in engine.get_integer_literals(global_node): - yield Number(int(engine.get_range(node))), engine.get_address(node) + parsed_int = capa.features.extractors.ts.integer.parse_integer(engine.get_range(node), engine.language) + if parsed_int is not None: + yield Number(parsed_int), engine.get_address(node) def extract_namespaces(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: diff --git a/capa/features/extractors/ts/function.py b/capa/features/extractors/ts/function.py index 03c858619..1126d8eac 100644 
--- a/capa/features/extractors/ts/function.py +++ b/capa/features/extractors/ts/function.py @@ -3,6 +3,7 @@ from tree_sitter import Node +import capa.features.extractors.ts.integer from capa.features.file import Import, FunctionName from capa.features.insn import Number from capa.features.common import String, Feature @@ -26,7 +27,9 @@ def extract_integer_literals( fh: FunctionHandle, engine: TreeSitterExtractorEngine ) -> Iterator[Tuple[Feature, Address]]: for node, _ in engine.get_integer_literals(fh.inner.node): - yield Number(int(engine.get_range(node))), engine.get_address(node) + parsed_int = capa.features.extractors.ts.integer.parse_integer(engine.get_range(node), engine.language) + if parsed_int is not None: + yield Number(parsed_int), engine.get_address(node) def extract_function_names(fh: FunctionHandle, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: diff --git a/capa/features/extractors/ts/query.py b/capa/features/extractors/ts/query.py index 169a8c552..935beb61a 100644 --- a/capa/features/extractors/ts/query.py +++ b/capa/features/extractors/ts/query.py @@ -70,7 +70,7 @@ def deserialize(language: str, binding: dict) -> dict: "string_literal": "(string_literal) @string-literal", "integer_literal": "(integer_literal) @integer-literal", "namespace": "(using_directive [(identifier) @namespace (qualified_name) @namespace])", - "global_statement": "(global_statement [(expression_statement) @global-statement (local_declaration_statement) @global-statement])", + "global_statement": "(global_statement [(if_statement) @global-statement (expression_statement) @global-statement (local_declaration_statement) @global-statement])", }, "field_name": { "new_object": "type", diff --git a/capa/features/extractors/ts/signatures/cs.json b/capa/features/extractors/ts/signatures/cs.json index 160f7a150..65601b235 100644 --- a/capa/features/extractors/ts/signatures/cs.json +++ b/capa/features/extractors/ts/signatures/cs.json @@ -1,9 +1,11 @@ [ + 
"System.IO.File.Delete", "System.Convert.ToBase64String", "System.Convert.FromBase64String", "System.Diagnostics.Process", "System.Diagnostics.ProcessStartInfo", "System.Diagnostics.Process.Start", "System.Security.Cryptography.RijndaelManaged", - "System.Security.Cryptography.CryptoStream" + "System.Security.Cryptography.CryptoStream", + "System.Security.Cryptography.SHA256CryptoServiceProvider" ] \ No newline at end of file diff --git a/tests/fixtures.py b/tests/fixtures.py index 15cb449f5..2f0a441ab 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -423,7 +423,7 @@ def get_function(extractor, fva: Union[int, tuple, str]) -> FunctionHandle: for fh in extractor.get_functions(): if isinstance(extractor, TreeSitterFeatureExtractor): if isinstance(fva, tuple): - addr = (fh.inner.node.start_byte, fh.inner.node.end_byte) + addr = (fh.address.start_byte, fh.address.end_byte) elif isinstance(fva, str): addr = fh.inner.name else: diff --git a/tests/test_ts.py b/tests/test_ts.py index 6f9cd6a0a..b624e4e49 100644 --- a/tests/test_ts.py +++ b/tests/test_ts.py @@ -6,6 +6,7 @@ from tree_sitter import Node, Tree from capa.features.file import Import, FunctionName +from capa.features.insn import Number from capa.features.common import ( OS, OS_ANY, @@ -954,10 +955,10 @@ def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine): ("cs_138cdc", "file", ScriptLanguage(LANG_CS), True), ("cs_138cdc", "file", Namespace("System"), True), ("cs_138cdc", "file", String(""), True), - ("cs_138cdc", "function=(0x38,0x16c)", String("Not Found"), True), - ("cs_138cdc", "function=(0x16e,0x7ce)", String("127.0.0.1"), True), - ("cs_138cdc", "function=(0x16e,0x7ce)", Import("System.Diagnostics.ProcessStartInfo"), True), - ("cs_138cdc", "function=(0x16e,0x7ce)", Import("System.Diagnostics.Process"), True), + ("cs_138cdc", "function=die", String("Not Found"), True), + ("cs_138cdc", "function=Page_Load", String("127.0.0.1"), True), + ("cs_138cdc", "function=Page_Load", 
Import("System.Diagnostics.ProcessStartInfo"), True), + ("cs_138cdc", "function=Page_Load", Import("System.Diagnostics.Process"), True), ("aspx_4f6fa6", "global", Arch(ARCH_ANY), True), ("aspx_4f6fa6", "global", OS(OS_ANY), True), ("aspx_4f6fa6", "file", Format(FORMAT_SCRIPT), True), @@ -974,7 +975,37 @@ def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine): ("aspx_4f6fa6", "function=downloadbutton_Click", Substring("filename"), True), ("aspx_4f6fa6", "function=base64encode", FunctionName("System.Convert.ToBase64String"), True), ("aspx_5f959f", "global", Arch(ARCH_ANY), True), + ("aspx_5f959f", "global", OS(OS_ANY), True), + ("aspx_5f959f", "file", Format(FORMAT_SCRIPT), True), + ("aspx_5f959f", "file", ScriptLanguage(LANG_CS), True), + ("aspx_5f959f", "file", Namespace("System.Diagnostics"), True), + ("aspx_5f959f", "file", Namespace("System.IO"), True), + ("aspx_5f959f", "file", Namespace("System.Web.SessionState"), True), + ("aspx_5f959f", "function=ExcuteCmd", Import("System.Diagnostics.ProcessStartInfo"), True), + ("aspx_5f959f", "function=ExcuteCmd", String("cmd.exe"), True), + ("aspx_5f959f", "function=ExcuteCmd", Substring("/c"), True), + ("aspx_5f959f", "function=cmdExe_Click", String("
"), True),
+        ("aspx_5f959f", "function=cmdExe_Click", String("
"), True), ("aspx_10162f", "global", Arch(ARCH_ANY), True), + ("aspx_10162f", "global", OS(OS_ANY), True), + ("aspx_10162f", "file", Format(FORMAT_SCRIPT), True), + ("aspx_10162f", "file", ScriptLanguage(LANG_CS), True), + ("aspx_10162f", "file", Namespace("System.IO"), True), + ("aspx_10162f", "file", Namespace("System.Web.Security"), True), + ("aspx_10162f", "file", String("data"), True), + ("aspx_10162f", "file", String("gsize"), True), + ("aspx_10162f", "file", String("cmd"), True), + ("aspx_10162f", "file", String("ttar"), True), + ("aspx_10162f", "file", String("sdfewq@#$51234234DF@#$!@#$ASDF"), True), + ("aspx_10162f", "function=rm", FunctionName("System.IO.File.Delete"), False), + ("aspx_10162f", "function=(0x564, 0x6af)", FunctionName("System.Convert.ToBase64String"), True), + ("aspx_10162f", "function=(0x564, 0x6af)", String("p"), True), + ("aspx_10162f", "function=exec", Import("System.Diagnostics.Process"), True), + ("aspx_10162f", "function=exec", String("cmd.exe"), True), + ("aspx_10162f", "function=gsize", Substring("error"), True), + ("aspx_10162f", "function=exp", Substring("root"), True), + ("aspx_10162f", "function=exp", Substring("net use"), True), + ("aspx_10162f", "function=exp", Number(2), True), ("aspx_2b71dd", "global", Arch(ARCH_ANY), True), ("aspx_f2bf20", "global", Arch(ARCH_ANY), True), ("aspx_f39dc0", "global", Arch(ARCH_ANY), True), From 2a9e76f0ec2231f0583e9f2173c5c541893c08d7 Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Tue, 12 Jul 2022 11:19:28 -0400 Subject: [PATCH 31/51] Added language-specific integer parsing. 
--- capa/features/extractors/ts/integer.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 capa/features/extractors/ts/integer.py diff --git a/capa/features/extractors/ts/integer.py b/capa/features/extractors/ts/integer.py new file mode 100644 index 000000000..a902833de --- /dev/null +++ b/capa/features/extractors/ts/integer.py @@ -0,0 +1,15 @@ +from typing import Optional + +from capa.features.extractors.script import LANG_CS + + +def parse_integer(integer: str, language: str) -> Optional[int]: + try: + if language == LANG_CS: + if integer.endswith(("u", "l")): + integer = integer[:-1] + if integer.startswith(("0x", "0X")): + return int(integer, 16) + return int(integer) + except: + return None From 672ca71d67864639d03d3d18f52dab9b74b7b083 Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Tue, 12 Jul 2022 11:20:43 -0400 Subject: [PATCH 32/51] Fixed an important bug in FileOffsetRangeAddress comparison method. --- capa/features/address.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/capa/features/address.py b/capa/features/address.py index 413845f1f..350cdd332 100644 --- a/capa/features/address.py +++ b/capa/features/address.py @@ -61,7 +61,7 @@ def __init__(self, start_byte, end_byte): self.end_byte = end_byte def __eq__(self, other): - return (self.start_byte, self.end_byte) == (self.end_byte, other.end_byte) + return (self.start_byte, self.end_byte) == (self.start_byte, other.end_byte) def __lt__(self, other): return (self.start_byte, self.end_byte) < (other.start_byte, other.end_byte) From ca426cab87d297c43b46863fa146d9cda9687604 Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Tue, 12 Jul 2022 11:22:05 -0400 Subject: [PATCH 33/51] Added more ASPX tests. 
--- capa/features/extractors/ts/signatures/cs.json | 3 +++ tests/test_ts.py | 9 +++++++++ 2 files changed, 12 insertions(+) diff --git a/capa/features/extractors/ts/signatures/cs.json b/capa/features/extractors/ts/signatures/cs.json index 65601b235..f0cd2629e 100644 --- a/capa/features/extractors/ts/signatures/cs.json +++ b/capa/features/extractors/ts/signatures/cs.json @@ -1,5 +1,8 @@ [ + "System.IO.DirectoryInfo", "System.IO.File.Delete", + "System.IO.File.Write", + "System.IO.File.GetAttributes", "System.Convert.ToBase64String", "System.Convert.FromBase64String", "System.Diagnostics.Process", diff --git a/tests/test_ts.py b/tests/test_ts.py index b624e4e49..5c871990b 100644 --- a/tests/test_ts.py +++ b/tests/test_ts.py @@ -1006,6 +1006,15 @@ def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine): ("aspx_10162f", "function=exp", Substring("root"), True), ("aspx_10162f", "function=exp", Substring("net use"), True), ("aspx_10162f", "function=exp", Number(2), True), + ("aspx_10162f", "function=exp", Import("System.IO.DirectoryInfo"), True), + ("aspx_10162f", "function=exp", FunctionName("System.IO.File.GetAttributes"), True), + ("aspx_10162f", "function=GetDirSize", Number(0), True), + ("aspx_10162f", "function=createJsonDirectory", String('\\"dir\\":['), True), + ("aspx_10162f", "function=createJsonDirectory", Number(0), True), + ("aspx_10162f", "function=createJsonFile", Substring("file"), True), + ("aspx_10162f", "function=sizeFix", Number(1024), True), + ("aspx_10162f", "function=sizeFix", Number(2), True), + ("aspx_10162f", "function=sizeFix", Substring("GB"), True), ("aspx_2b71dd", "global", Arch(ARCH_ANY), True), ("aspx_f2bf20", "global", Arch(ARCH_ANY), True), ("aspx_f39dc0", "global", Arch(ARCH_ANY), True), From fd80277dc9441e0da861e7de545600fde9a28c0f Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Tue, 12 Jul 2022 11:23:30 -0400 Subject: [PATCH 34/51] Fixed the capa control flow to fully support capa scripts. 
--- capa/features/extractors/script.py | 10 +++++++--- capa/features/freeze/__init__.py | 8 ++++++++ capa/helpers.py | 3 ++- capa/main.py | 3 +-- 4 files changed, 18 insertions(+), 6 deletions(-) diff --git a/capa/features/extractors/script.py b/capa/features/extractors/script.py index d8ba79e5c..7d81cb361 100644 --- a/capa/features/extractors/script.py +++ b/capa/features/extractors/script.py @@ -9,6 +9,10 @@ LANG_JS = "javascript" LANG_TEM = "embedded_template" +EXT_ASPX = (".aspx", "aspx_") +EXT_CS = (".cs", ".cs_") +EXT_HTML = ("html", "html_") + def extract_arch() -> Iterator[Tuple[Feature, Address]]: yield Arch(ARCH_ANY), NO_ADDRESS @@ -27,10 +31,10 @@ def extract_format() -> Iterator[Tuple[Feature, Address]]: def get_language_from_ext(path: str) -> str: - if path.endswith((".aspx", "aspx_")): + if path.endswith(EXT_ASPX): return LANG_TEM - if path.endswith((".cs", ".cs_")): + if path.endswith(EXT_CS): return LANG_CS - if path.endswith(("html", "html_")): + if path.endswith(EXT_HTML): return LANG_HTML raise ValueError(f"{path} has an unrecognized or an unsupported extension.") diff --git a/capa/features/freeze/__init__.py b/capa/features/freeze/__init__.py index 15129a360..24c95af15 100644 --- a/capa/features/freeze/__init__.py +++ b/capa/features/freeze/__init__.py @@ -40,6 +40,7 @@ class AddressType(str, Enum): ABSOLUTE = "absolute" RELATIVE = "relative" FILE = "file" + FILE_RANGE = "file range" DN_TOKEN = "dn token" DN_TOKEN_OFFSET = "dn token offset" NO_ADDRESS = "no address" @@ -60,6 +61,9 @@ def from_capa(cls, a: capa.features.address.Address) -> "Address": elif isinstance(a, capa.features.address.FileOffsetAddress): return cls(type=AddressType.FILE, value=int(a)) + elif isinstance(a, capa.features.address.FileOffsetRangeAddress): + return cls(type=AddressType.FILE_RANGE, value=(a.start_byte, a.end_byte)) + elif isinstance(a, capa.features.address.DNTokenAddress): return cls(type=AddressType.DN_TOKEN, value=a.token.value) @@ -88,6 +92,10 @@ def 
to_capa(self) -> capa.features.address.Address: elif self.type is AddressType.FILE: return capa.features.address.FileOffsetAddress(self.value) + elif self.type is AddressType.FILE_RANGE: + start_byte, end_byte = self.value + return capa.features.address.FileOffsetRangeAddress(start_byte, end_byte) + elif self.type is AddressType.DN_TOKEN: return capa.features.address.DNTokenAddress(dncil.clr.token.Token(self.value)) diff --git a/capa/helpers.py b/capa/helpers.py index d9907cc58..7d1c180b7 100644 --- a/capa/helpers.py +++ b/capa/helpers.py @@ -11,11 +11,12 @@ from capa.exceptions import UnsupportedFormatError from capa.features.common import FORMAT_SC32, FORMAT_SC64, FORMAT_SCRIPT, FORMAT_UNKNOWN +from capa.features.extractors.script import EXT_CS, EXT_ASPX, EXT_HTML EXTENSIONS_SHELLCODE_32 = ("sc32", "raw32") EXTENSIONS_SHELLCODE_64 = ("sc64", "raw64") EXTENSIONS_ELF = "elf_" -EXTENSIONS_SUPPORTED_SCRIPTS = "cs" +EXTENSIONS_SUPPORTED_SCRIPTS = EXT_ASPX + EXT_CS + EXT_HTML logger = logging.getLogger("capa") diff --git a/capa/main.py b/capa/main.py index ba428ca13..604b2a501 100644 --- a/capa/main.py +++ b/capa/main.py @@ -49,7 +49,6 @@ from capa.rules import Rule, Scope, RuleSet from capa.engine import FeatureSet, MatchResults from capa.helpers import ( - get_format, get_file_taste, get_auto_format, log_unsupported_os_error, @@ -704,7 +703,7 @@ def collect_metadata( if rules_path != [RULES_PATH_DEFAULT_STRING]: rules_path = [os.path.abspath(os.path.normpath(r)) for r in rules_path] - format_ = get_format(sample_path) + format_ = get_auto_format(sample_path) if format_ == FORMAT_SCRIPT: arch = get_script_arch() From d0c4acbd0e28cf2c2da62fa1b972747a95158b02 Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Mon, 18 Jul 2022 13:18:40 -0400 Subject: [PATCH 35/51] Major changes: switching imports and function names to properties, streamlining queries, introducing language toolkit class to help move all additional language specific methods to the sample place. 
--- capa/features/extractors/ts/engine.py | 77 ++++-------- capa/features/extractors/ts/extractor.py | 15 ++- capa/features/extractors/ts/file.py | 34 +---- capa/features/extractors/ts/function.py | 91 +++++++++++--- capa/features/extractors/ts/query.py | 18 ++- capa/features/extractors/ts/sig.py | 45 ------- .../features/extractors/ts/signatures/cs.json | 24 ++++ capa/features/extractors/ts/tools.py | 116 ++++++++++++++++++ capa/features/insn.py | 5 + capa/rules.py | 2 + tests/fixtures.py | 52 +++++--- tests/test_ts.py | 107 ++++++---------- 12 files changed, 326 insertions(+), 260 deletions(-) delete mode 100644 capa/features/extractors/ts/sig.py create mode 100644 capa/features/extractors/ts/tools.py diff --git a/capa/features/extractors/ts/engine.py b/capa/features/extractors/ts/engine.py index 1a5f97526..e99f12141 100644 --- a/capa/features/extractors/ts/engine.py +++ b/capa/features/extractors/ts/engine.py @@ -3,7 +3,6 @@ from tree_sitter import Node, Tree, Parser -import capa.features.extractors.ts.sig import capa.features.extractors.ts.build from capa.features.address import FileOffsetRangeAddress from capa.features.extractors.script import LANG_CS, LANG_JS, LANG_TEM, LANG_HTML @@ -14,6 +13,7 @@ ScriptQueryBinding, TemplateQueryBinding, ) +from capa.features.extractors.ts.tools import LANGUAGE_TOOLKITS, LanguageToolkit class TreeSitterBaseEngine: @@ -49,7 +49,7 @@ def get_default_address(self) -> FileOffsetRangeAddress: class TreeSitterExtractorEngine(TreeSitterBaseEngine): query: ScriptQueryBinding - import_signatures: set + language_toolkit: LanguageToolkit buf_offset: int namespaces: set[str] @@ -62,69 +62,32 @@ def __init__( ): super().__init__(language, buf) self.buf_offset = buf_offset - self.import_signatures = capa.features.extractors.ts.sig.load_import_signatures(language) - self.namespaces = additional_namespaces if additional_namespaces is not None else set() + self.language_toolkit = LANGUAGE_TOOLKITS[language] + self.namespaces = 
set(self.get_range(ns_node) for ns_node, _ in self.get_namespaces()) + if additional_namespaces: + self.namespaces = self.namespaces.union(additional_namespaces) def get_address(self, node: Node) -> FileOffsetRangeAddress: return FileOffsetRangeAddress(self.buf_offset + node.start_byte, self.buf_offset + node.end_byte) - def get_new_objects(self, node: Node) -> List[Tuple[Node, str]]: - return self.query.new_object.captures(node) - - def get_object_id(self, node: Node) -> Node: - return node.child_by_field_name(self.query.new_object_field_name) - - def get_new_object_ids(self, node: Node) -> Iterator[Node]: - for obj_node, _ in self.get_new_objects(node): - yield self.get_object_id(obj_node) - - # TODO: move this elsewhere, does not fit this class - def get_import_names(self, node: Node) -> Iterator[Tuple[Node, str]]: - join_names = capa.features.extractors.ts.sig.get_name_joiner(self.language) - self.namespaces = self.namespaces.union(set([self.get_range(ns_node) for ns_node, _ in self.get_namespaces()])) - for obj_node in self.get_new_object_ids(node): - obj_name = self.get_range(obj_node) - if obj_name in self.import_signatures: - yield (obj_node, obj_name) - continue - for namespace in self.namespaces: - joined_obj_name = join_names(namespace, obj_name) - if joined_obj_name in self.import_signatures: - yield (obj_node, joined_obj_name) + def get_new_object_names(self, node: Node) -> List[Tuple[Node, str]]: + return self.query.new_object_name.captures(node) + + def get_assigned_property_names(self, node: Node) -> List[Tuple[Node, str]]: + return self.query.assigned_property_name.captures(node) def get_function_definitions(self, node: Node = None) -> List[Tuple[Node, str]]: return self.query.function_definition.captures(node if node is not None else self.tree.root_node) - def get_function_definition_id(self, node: Node) -> Node: + def get_function_definition_name(self, node: Node) -> Node: return node.child_by_field_name(self.query.function_definition_field_name) 
- def get_function_definition_ids(self, node: Node) -> Iterator[Node]: + def get_function_definition_names(self, node: Node) -> Iterator[Node]: for fn_node, _ in self.get_function_definitions(node): - yield self.get_function_definition_id(fn_node) - - def get_function_calls(self, node: Node) -> List[Tuple[Node, str]]: - return self.query.function_call.captures(node) - - def get_function_call_id(self, node: Node) -> Node: - return node.child_by_field_name(self.query.function_call_field_name) - - def get_function_call_ids(self, node: Node) -> Iterator[Node]: - for fn_node, _ in self.get_function_calls(node): - yield self.get_function_call_id(fn_node) - - # TODO: move this elsewhere, does not fit this class - def get_function_names(self, node: Node) -> Iterator[Tuple[Node, str]]: - join_names = capa.features.extractors.ts.sig.get_name_joiner(self.language) - self.namespaces = self.namespaces.union(set([self.get_range(ns_node) for ns_node, _ in self.get_namespaces()])) - for fn_node in self.get_function_call_ids(node): - fn_name = self.get_range(fn_node) - if fn_name in self.import_signatures: - yield (fn_node, fn_name) - continue - for namespace in self.namespaces: - joined_fn_name = join_names(namespace, fn_name) - if joined_fn_name in self.import_signatures: - yield (fn_node, joined_fn_name) + yield self.get_function_definition_name(fn_node) + + def get_function_call_names(self, node: Node) -> List[Tuple[Node, str]]: + return self.query.function_call_name.captures(node) def get_string_literals(self, node: Node) -> List[Tuple[Node, str]]: return self.query.string_literal.captures(node) @@ -141,11 +104,13 @@ def get_global_statements(self) -> List[Tuple[Node, str]]: class TreeSitterTemplateEngine(TreeSitterBaseEngine): query: TemplateQueryBinding + language_toolkit: LanguageToolkit embedded_language: str def __init__(self, buf: bytes): super().__init__(LANG_TEM, buf) self.embedded_language = self.identify_language() + self.language_toolkit = 
LANGUAGE_TOOLKITS[self.embedded_language] self.template_namespaces = set(name for _, name in self.get_template_namespaces()) def get_code_sections(self) -> List[Tuple[Node, str]]: @@ -156,7 +121,7 @@ def get_parsed_code_sections(self) -> Iterator[TreeSitterExtractorEngine]: # TODO: support JS if self.embedded_language == LANG_CS: yield TreeSitterExtractorEngine( - self.identify_language(), + self.embedded_language, self.get_byte_range(node), node.start_byte, self.template_namespaces, @@ -179,7 +144,7 @@ def get_imported_namespaces(self) -> Iterator[Tuple[Node, str]]: yield node, namespace def get_template_namespaces(self) -> Iterator[Tuple[Optional[Node], str]]: - for namespace in capa.features.extractors.ts.sig.get_default_namespaces(self.embedded_language, True): + for namespace in self.language_toolkit.get_default_namespaces(True): yield None, namespace for node, namespace in self.get_imported_namespaces(): yield node, namespace diff --git a/capa/features/extractors/ts/extractor.py b/capa/features/extractors/ts/extractor.py index 50deacd2b..ed26e95e0 100644 --- a/capa/features/extractors/ts/extractor.py +++ b/capa/features/extractors/ts/extractor.py @@ -11,7 +11,7 @@ from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress, FileOffsetRangeAddress from capa.features.extractors.script import LANG_TEM, LANG_HTML from capa.features.extractors.ts.engine import TreeSitterHTMLEngine, TreeSitterTemplateEngine, TreeSitterExtractorEngine -from capa.features.extractors.ts.function import TSFunctionInner +from capa.features.extractors.ts.function import PSEUDO_MAIN, TSFunctionInner from capa.features.extractors.base_extractor import Feature, BBHandle, InsnHandle, FunctionHandle, FeatureExtractor @@ -75,15 +75,20 @@ def extract_file_features(self) -> Iterator[Tuple[Feature, Address]]: for engine in self.code_sections: yield from capa.features.extractors.ts.file.extract_features(engine) + def get_pseudo_main_function(self, engine: 
TreeSitterExtractorEngine) -> FunctionHandle: + return FunctionHandle( + address=engine.get_default_address(), inner=TSFunctionInner(engine.tree.root_node, PSEUDO_MAIN, engine) + ) + def get_functions(self) -> Iterator[FunctionHandle]: for engine in self.code_sections: + yield self.get_pseudo_main_function(engine) for node, _ in engine.get_function_definitions(): - name = engine.get_range(engine.get_function_definition_id(node)) - yield FunctionHandle(address=engine.get_address(node), inner=TSFunctionInner(node, name)) + name = engine.get_range(engine.get_function_definition_name(node)) + yield FunctionHandle(address=engine.get_address(node), inner=TSFunctionInner(node, name, engine)) def extract_function_features(self, f: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: - for engine in self.code_sections: - yield from capa.features.extractors.ts.function.extract_features(f, engine) + yield from capa.features.extractors.ts.function.extract_features(f, f.inner.engine) def get_basic_blocks(self, f: FunctionHandle) -> Iterator[BBHandle]: yield from [] diff --git a/capa/features/extractors/ts/file.py b/capa/features/extractors/ts/file.py index 16f099b92..c868b16cc 100644 --- a/capa/features/extractors/ts/file.py +++ b/capa/features/extractors/ts/file.py @@ -2,9 +2,7 @@ import capa.features.extractors.script import capa.features.extractors.ts.integer -from capa.features.file import Import, FunctionName -from capa.features.insn import Number -from capa.features.common import String, Feature, Namespace +from capa.features.common import Feature, Namespace from capa.features.address import Address from capa.features.extractors.ts.engine import TreeSitterExtractorEngine @@ -13,37 +11,11 @@ def extract_language(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Featur yield from capa.features.extractors.script.extract_language(engine.language, engine.get_default_address()) -def extract_file_strings(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, 
Address]]: - for global_node, _ in engine.get_global_statements(): - for node, _ in engine.get_string_literals(global_node): - yield String(engine.get_range(node).strip('"')), engine.get_address(node) - - -def extract_file_integer_literals(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: - for global_node, _ in engine.get_global_statements(): - for node, _ in engine.get_integer_literals(global_node): - parsed_int = capa.features.extractors.ts.integer.parse_integer(engine.get_range(node), engine.language) - if parsed_int is not None: - yield Number(parsed_int), engine.get_address(node) - - def extract_namespaces(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: for node, _ in engine.get_namespaces(): yield Namespace(engine.get_range(node)), engine.get_address(node) -def extract_file_function_names(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: - for global_node, _ in engine.get_global_statements(): - for node, name in engine.get_function_names(global_node): - yield FunctionName(name), engine.get_address(node) - - -def extract_file_import_names(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: - for global_node, _ in engine.get_global_statements(): - for node, name in engine.get_import_names(global_node): - yield Import(name), engine.get_address(node) - - def extract_features(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: for file_handler in FILE_HANDLERS: for feature, addr in file_handler(engine): @@ -51,10 +23,6 @@ def extract_features(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Featur FILE_HANDLERS = ( - extract_file_function_names, - extract_file_import_names, - extract_file_integer_literals, - extract_file_strings, extract_language, extract_namespaces, ) diff --git a/capa/features/extractors/ts/function.py b/capa/features/extractors/ts/function.py index 1126d8eac..72aa3896e 100644 --- a/capa/features/extractors/ts/function.py +++ 
b/capa/features/extractors/ts/function.py @@ -1,56 +1,107 @@ -from typing import Tuple, Iterator +import functools +import itertools +from os import extsep +from typing import Tuple, Callable, Iterator from dataclasses import dataclass from tree_sitter import Node import capa.features.extractors.ts.integer -from capa.features.file import Import, FunctionName -from capa.features.insn import Number +from capa.features.insn import API, Number, Property from capa.features.common import String, Feature from capa.features.address import Address from capa.features.extractors.ts.engine import TreeSitterExtractorEngine from capa.features.extractors.base_extractor import FunctionHandle +PSEUDO_MAIN = "PSEUDO MAIN" + @dataclass class TSFunctionInner: node: Node name: str + engine: TreeSitterExtractorEngine + + +def is_pseudo_main_function(fh: FunctionHandle, engine: TreeSitterExtractorEngine) -> bool: + return ( + fh.address == engine.get_default_address() + and fh.inner.node == engine.tree.root_node + and fh.inner.name == PSEUDO_MAIN + ) -def extract_strings(fh: FunctionHandle, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: - for node, _ in engine.get_string_literals(fh.inner.node): +def extract_strings(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for node, _ in engine.get_string_literals(fn_node): yield String(engine.get_range(node).strip('"')), engine.get_address(node) -def extract_integer_literals( - fh: FunctionHandle, engine: TreeSitterExtractorEngine -) -> Iterator[Tuple[Feature, Address]]: - for node, _ in engine.get_integer_literals(fh.inner.node): +def extract_integer_literals(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for node, _ in engine.get_integer_literals(fn_node): parsed_int = capa.features.extractors.ts.integer.parse_integer(engine.get_range(node), engine.language) if parsed_int is not None: yield Number(parsed_int), engine.get_address(node) 
-def extract_function_names(fh: FunctionHandle, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: - for node, name in engine.get_function_names(fh.inner.node): - yield FunctionName(name), engine.get_address(node) +def extract_imports_(name: str, engine: TreeSitterExtractorEngine) -> Iterator[str]: + for namespace in itertools.chain([""], engine.namespaces): + joined_name = engine.language_toolkit.join_names(namespace, name) + if engine.language_toolkit.is_import(joined_name): + yield joined_name -def extract_import_names(fh: FunctionHandle, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: - for node, name in engine.get_import_names(fh.inner.node): - yield Import(name), engine.get_address(node) +def extract_classes(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for node, _ in engine.get_new_object_names(fn_node): + for name in extract_imports_(engine.get_range(node), engine): + yield API(engine.language_toolkit.format_imported_class(name)), engine.get_address(node) -def extract_features(fh: FunctionHandle, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: - for file_handler in FUNCTION_HANDLERS: - for feature, addr in file_handler(fh=fh, engine=engine): +def extract_properties(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for node, _ in engine.get_assigned_property_names(fn_node): + for name in extract_imports_(engine.get_range(node), engine): + yield Property(engine.language_toolkit.format_imported_property(name)), engine.get_address(node) + + +def extract_static_methods_(node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for name in extract_imports_(engine.get_range(node), engine): + yield API(engine.language_toolkit.format_imported_function(name)), engine.get_address(node) + + +def extract_regular_methods_(node: Node, engine: TreeSitterExtractorEngine) -> 
Iterator[Tuple[Feature, Address]]: + qualified_names = engine.language_toolkit.split_name(engine.get_range(node)) + if len(qualified_names) > 1: + for name in extract_imports_(engine.language_toolkit.join_names(*qualified_names[1:]), engine): + yield API(engine.language_toolkit.format_imported_function(name)), engine.get_address(node) + + +def extract_function_calls(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for node, _ in engine.get_function_call_names(fn_node): + yield from extract_static_methods_(node, engine) + yield from extract_regular_methods_(node, engine) + + +def extract_pseudo_main_features(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for node, _ in engine.get_global_statements(): + yield from extract_features_(node, engine) + + +def extract_features_(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for function_handler in FUNCTION_HANDLERS: + for feature, addr in function_handler(fn_node, engine): yield feature, addr +def extract_features(fh: FunctionHandle, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + if is_pseudo_main_function(fh, engine): + yield from extract_pseudo_main_features(engine) + else: + yield from extract_features_(fh.inner.node, engine) + + FUNCTION_HANDLERS = ( - extract_function_names, - extract_import_names, + extract_classes, + extract_properties, + extract_function_calls, extract_integer_literals, extract_strings, ) diff --git a/capa/features/extractors/ts/query.py b/capa/features/extractors/ts/query.py index 935beb61a..c275f0587 100644 --- a/capa/features/extractors/ts/query.py +++ b/capa/features/extractors/ts/query.py @@ -14,16 +14,15 @@ class QueryBinding: @dataclass class ScriptQueryBinding(QueryBinding): - new_object: Query - new_object_field_name: str + new_object_name: Query function_definition: Query function_definition_field_name: str - function_call: Query - 
function_call_field_name: str + function_call_name: Query + assigned_property_name: Query string_literal: Query integer_literal: Query namespace: Query - global_statement: Query + global_statement: Query # except function definitions @dataclass @@ -64,18 +63,17 @@ def deserialize(language: str, binding: dict) -> dict: LANG_CS, { "query": { - "new_object": "(object_creation_expression) @object.new", - "function_definition": "(local_function_statement) @function.definition", - "function_call": "(invocation_expression) @function.call", + "new_object_name": "(object_creation_expression type: [(qualified_name) @new-object (identifier) @new-object])", + "function_definition": "(local_function_statement) @function-definition", + "function_call_name": "(invocation_expression function: [(member_access_expression name: (identifier)) @function-call (identifier) @function-call])", + "assigned_property_name": "(assignment_expression left: (member_access_expression name: (identifier) @member))", "string_literal": "(string_literal) @string-literal", "integer_literal": "(integer_literal) @integer-literal", "namespace": "(using_directive [(identifier) @namespace (qualified_name) @namespace])", "global_statement": "(global_statement [(if_statement) @global-statement (expression_statement) @global-statement (local_declaration_statement) @global-statement])", }, "field_name": { - "new_object": "type", "function_definition": "name", - "function_call": "function", }, }, ), diff --git a/capa/features/extractors/ts/sig.py b/capa/features/extractors/ts/sig.py deleted file mode 100644 index 6c1a6da3f..000000000 --- a/capa/features/extractors/ts/sig.py +++ /dev/null @@ -1,45 +0,0 @@ -import json -import importlib.resources -from typing import Callable - -import capa.features.extractors.ts.signatures -from capa.features.extractors.script import LANG_CS - - -def get_sig_file(language: str) -> str: - if language == LANG_CS: - return "cs.json" - raise ValueError("Language {language} does not 
have an import signature file") - - -def load_import_signatures(language: str) -> set: - sig_file = get_sig_file(language) - return set(json.loads(importlib.resources.read_text(capa.features.extractors.ts.signatures, sig_file))) - - -def get_name_joiner(language: str) -> Callable: - if language == LANG_CS: - return lambda qualified_name, identifier: qualified_name + "." + identifier - raise ValueError("Language {language} does not have a name joiner") - - -def get_default_namespaces(language: str, embedded: bool) -> set: - if embedded and language == LANG_CS: - return { - "System", - "System.Collections", - "System.Collections.Specialized", - "System.Configuration", - "System.Text", - "System.Text.RegularExpressions", - "System.Web", - "System.Web.Caching", - "System.Web.Profile", - "System.Web.Security", - "System.Web.SessionState", - "System.Web.UI", - "System.Web.UI.HtmlControls", - "System.Web.UI.WebControls", - "System.Web.UI.WebControls.WebParts", - } - return set() diff --git a/capa/features/extractors/ts/signatures/cs.json b/capa/features/extractors/ts/signatures/cs.json index f0cd2629e..68ae4c04f 100644 --- a/capa/features/extractors/ts/signatures/cs.json +++ b/capa/features/extractors/ts/signatures/cs.json @@ -1,14 +1,38 @@ [ "System.IO.DirectoryInfo", + "System.IO.Directory.CreateDirectory", "System.IO.File.Delete", "System.IO.File.Write", "System.IO.File.GetAttributes", + "System.IO.File.ReadAllBytes", + "System.IO.File.ReadAllBytes", + "System.IO.File.ReadAllBytesAsync", + "System.IO.File.ReadAllLines", + "System.IO.File.ReadAllLinesAsync", + "System.IO.File.ReadAllText", + "System.IO.File.ReadAllTextAsync", + "System.IO.File.ReadLines", + "System.IO.File.ReadLinesAsync", + "System.IO.File.WriteAllBytes", + "System.IO.File.WriteAllBytes", + "System.IO.File.WriteAllBytesAsync", + "System.IO.File.WriteAllLines", + "System.IO.File.WriteAllLinesAsync", + "System.IO.File.WriteAllText", + "System.IO.File.WriteAllTextAsync", + "System.IO.File.WriteLines", + 
"System.IO.File.WriteLinesAsync", "System.Convert.ToBase64String", "System.Convert.FromBase64String", + "System.Data.SqlClient.SqlCommand", + "System.Data.SqlClient.SqlConnection", "System.Diagnostics.Process", "System.Diagnostics.ProcessStartInfo", "System.Diagnostics.Process.Start", "System.Security.Cryptography.RijndaelManaged", "System.Security.Cryptography.CryptoStream", + "System.Security.Cryptography.SHA1", + "System.Security.Cryptography.SHA1CryptoServiceProvider", + "System.Security.Cryptography.SHA256", "System.Security.Cryptography.SHA256CryptoServiceProvider" ] \ No newline at end of file diff --git a/capa/features/extractors/ts/tools.py b/capa/features/extractors/ts/tools.py new file mode 100644 index 000000000..1e7fc43a0 --- /dev/null +++ b/capa/features/extractors/ts/tools.py @@ -0,0 +1,116 @@ +import abc +import json +import importlib.resources +from typing import List, Optional + +import capa.features.extractors.ts.signatures +from capa.features.extractors.script import LANG_CS + + +class LanguageToolkit: + import_signatures: set + + def __init__(self, signature_file: str): + self.import_signatures = self.load_import_signatures(signature_file) + + def load_import_signatures(self, signature_file: str) -> set: + return set(json.loads(importlib.resources.read_text(capa.features.extractors.ts.signatures, signature_file))) + + def is_import(self, import_: str) -> bool: + return import_ in self.import_signatures + + def join_names(self, *args: str) -> str: + return self.join_names_nonempty(*[arg for arg in args if arg != ""]) + + @abc.abstractmethod + def join_names_nonempty(self, *args: str) -> str: + raise NotImplementedError() + + @abc.abstractmethod + def split_name(self, name: str) -> List[str]: + raise NotImplementedError() + + @abc.abstractmethod + def format_imported_class(self, name: str) -> str: + raise NotImplementedError() + + @abc.abstractmethod + def format_imported_function(self, name: str) -> str: + raise NotImplementedError + + 
@abc.abstractmethod + def format_imported_property(self, name: str) -> str: + raise NotImplementedError + + @abc.abstractmethod + def get_default_namespaces(self, embedded: bool) -> set: + raise NotImplementedError() + + @abc.abstractmethod + def parse_integer(self, integer: str) -> Optional[int]: + raise NotImplementedError() + + +class CSharpToolkit(LanguageToolkit): + def join_names_nonempty(self, *args: str) -> str: + return ".".join(args) + + def split_name(self, name: str) -> List[str]: + return name.split(".") + + def format_imported_class(self, name: str) -> str: + return name + + def format_imported_function(self, name: str) -> str: + qualified_names = self.split_name(name) + if len(qualified_names) < 2: + raise ValueError(f"function {name} does not have an associated class or namespace") + if len(qualified_names) == 2: + classname, functionname = qualified_names[0], qualified_names[1] + return f"{classname}::{functionname}" + namespace, classname, functionname = qualified_names[:-2], qualified_names[-2], qualified_names[-1] + return f"{'.'.join(namespace)}.{classname}::{functionname}" + + def format_imported_property(self, name: str) -> str: + qualified_names = self.split_name(name) + if len(qualified_names) < 2: + raise ValueError(f"property {name} does not have an associated class") + if len(qualified_names) == 2: + classname, propertyname = qualified_names[0], qualified_names[1] + return f"{classname}::{propertyname}" + namespace, classname, propertyname = qualified_names[:-2], qualified_names[-2], qualified_names[-1] + return f"{'.'.join(namespace)}.{classname}::{propertyname}" + + def get_default_namespaces(self, embedded: bool) -> set: + if embedded: + return { + "System", + "System.Collections", + "System.Collections.Specialized", + "System.Configuration", + "System.Text", + "System.Text.RegularExpressions", + "System.Web", + "System.Web.Caching", + "System.Web.Profile", + "System.Web.Security", + "System.Web.SessionState", + "System.Web.UI", + 
"System.Web.UI.HtmlControls", + "System.Web.UI.WebControls", + "System.Web.UI.WebControls.WebParts", + } + return set() + + def parse_integer(self, integer: str) -> Optional[int]: + if integer.endswith(("u", "l")): + integer = integer[:-1] + try: + if integer.startswith(("0x", "0X")): + return int(integer, 16) + return int(integer) + except: + return None + + +LANGUAGE_TOOLKITS: dict[str, LanguageToolkit] = {LANG_CS: CSharpToolkit("cs.json")} diff --git a/capa/features/insn.py b/capa/features/insn.py index c62d3ddf3..cdafaa360 100644 --- a/capa/features/insn.py +++ b/capa/features/insn.py @@ -24,6 +24,11 @@ def __init__(self, name: str, description=None): super(API, self).__init__(name, description=description) +class Property(Feature): + def __init__(self, name: str, description=None): + super().__init__(name, description=description) + + class Number(Feature): def __init__(self, value: Union[int, float], description=None): super(Number, self).__init__(value, description=description) diff --git a/capa/rules.py b/capa/rules.py index 02399d367..928d29b45 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -280,6 +280,8 @@ def parse_feature(key: str): return capa.features.common.MatchedRule elif key == "function-name": return capa.features.file.FunctionName + elif key == "language": + return capa.features.common.ScriptLanguage elif key == "os": return capa.features.common.OS elif key == "format": diff --git a/tests/fixtures.py b/tests/fixtures.py index 2f0a441ab..f57a882d5 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -10,10 +10,9 @@ import os import os.path import binascii -import itertools import contextlib import collections -from typing import Set, Dict, Union +from typing import Set, Dict, Tuple, Union, Iterator from functools import lru_cache import pytest @@ -39,7 +38,6 @@ ) from capa.features.address import Address from capa.features.extractors.script import LANG_CS, LANG_TEM -from capa.features.extractors.ts.extractor import 
TreeSitterFeatureExtractor from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle from capa.features.extractors.dnfile.extractor import DnfileFeatureExtractor @@ -417,18 +415,9 @@ def sample_ts(request): return resolve_sample_ts(request.param) -def get_function(extractor, fva: Union[int, tuple, str]) -> FunctionHandle: - if (isinstance(fva, tuple) or isinstance(fva, str)) and not isinstance(extractor, TreeSitterFeatureExtractor): - raise ValueError("invalid fva format") +def get_function(extractor, fva: int) -> FunctionHandle: for fh in extractor.get_functions(): - if isinstance(extractor, TreeSitterFeatureExtractor): - if isinstance(fva, tuple): - addr = (fh.address.start_byte, fh.address.end_byte) - elif isinstance(fva, str): - addr = fh.inner.name - else: - raise ValueError("invalid fva format") - elif isinstance(extractor, DnfileFeatureExtractor): + if isinstance(extractor, DnfileFeatureExtractor): addr = fh.inner.offset else: addr = fh.address @@ -437,6 +426,19 @@ def get_function(extractor, fva: Union[int, tuple, str]) -> FunctionHandle: raise ValueError("function not found") +def get_function_ts(extractor, fid: Union[Tuple[int], str]) -> Iterator[FunctionHandle]: + for fh in extractor.get_functions(): + if isinstance(fid, tuple): + addr = (fh.address.start_byte, fh.address.end_byte) + elif isinstance(fid, str): + addr = fh.inner.name + else: + raise ValueError("invalid fva format") + + if addr == fid: + yield fh + + def get_function_by_token(extractor, token: int) -> FunctionHandle: for fh in extractor.get_functions(): if fh.address.token.value == token: @@ -542,6 +544,13 @@ def scope(request): return resolve_scope(request.param) +def get_function_id_ts(scope): + fid = scope.partition("=")[2] + if fid[0] == "(" and fid[-1] == ")": + fid = tuple(int(x, 16) if x.lstrip().startswith("0x") else int(x) for x in fid[1:-1].split(",")) + return fid + + def resolve_scope_ts(scope): if scope == "global": inner_fn = lambda extractor: 
extract_global_features(extractor) @@ -556,11 +565,16 @@ def inner_fn(extractor): elif scope.startswith("function"): # like `function=(0xbeef, 0xdead) or function=(123, 456) or function=foo_bar` def inner_fn(extractor): - fn = scope.partition("=")[2] - if fn[0] == "(" and fn[-1] == ")": - fn = tuple(int(x, 16) if x.lstrip().startswith("0x") else int(x) for x in fn[1:-1].split(",")) - fh = get_function(extractor, fn) - features = extract_function_features(extractor, fh) + fid = get_function_id_ts(scope) + fhs = list(get_function_ts(extractor, fid)) + if not fhs: + raise ValueError("function not found") + features = collections.defaultdict(set) + for fh in fhs: + for k, vs in extract_function_features(extractor, fh).items(): + features[k].update(vs) + for k, vs in extract_file_features(extractor).items(): + features[k].update(vs) for k, vs in extract_global_features(extractor).items(): features[k].update(vs) return features diff --git a/tests/test_ts.py b/tests/test_ts.py index 5c871990b..48b97f9af 100644 --- a/tests/test_ts.py +++ b/tests/test_ts.py @@ -5,8 +5,7 @@ from fixtures import * from tree_sitter import Node, Tree -from capa.features.file import Import, FunctionName -from capa.features.insn import Number +from capa.features.insn import API, Number from capa.features.common import ( OS, OS_ANY, @@ -22,6 +21,7 @@ from capa.features.address import FileOffsetRangeAddress from capa.features.extractors.script import LANG_CS, LANG_JS, LANG_TEM, LANG_HTML from capa.features.extractors.ts.query import QueryBinding, HTMLQueryBinding, TemplateQueryBinding +from capa.features.extractors.ts.tools import LANGUAGE_TOOLKITS from capa.features.extractors.ts.engine import ( TreeSitterBaseEngine, TreeSitterHTMLEngine, @@ -59,7 +59,6 @@ def do_test_ts_base_engine_get_default_address(engine: TreeSitterBaseEngine): def do_test_ts_extractor_engine_init(engine: TreeSitterExtractorEngine, expected_language: str): assert engine.language == expected_language assert 
isinstance(engine.query, QueryBinding) - assert isinstance(engine.import_signatures, set) and len(engine.import_signatures) > 0 assert isinstance(engine.get_default_address(), FileOffsetRangeAddress) assert isinstance(engine.buf_offset, int) and engine.buf_offset >= 0 addr = engine.get_default_address() @@ -78,18 +77,11 @@ def do_test_ts_extractor_engine_get_address( def do_test_ts_extractor_engine_get_new_objects( engine: TreeSitterExtractorEngine, root_node: Node, expected: List[Tuple[str, str]] ): - assert len(engine.get_new_objects(root_node)) == len(expected) - for (node, name), (expected_range, expected_id_range) in zip(engine.get_new_objects(root_node), expected): + assert len(list(engine.get_new_object_names(root_node))) == len(expected) + for (node, name), (_, expected_name_range) in zip(engine.get_new_object_names(root_node), expected): assert isinstance(node, Node) - assert name == "object.new" - do_test_ts_base_engine_get_range(engine, node, expected_range) - do_test_ts_base_engine_get_address(engine, node) - do_test_ts_base_engine_get_range(engine, engine.get_object_id(node), expected_id_range) - - assert len(list(engine.get_new_object_ids(root_node))) == len(expected) - for node, (_, expected_id_range) in zip(engine.get_new_object_ids(root_node), expected): - assert isinstance(node, Node) - do_test_ts_base_engine_get_range(engine, node, expected_id_range) + assert name == "new-object" + do_test_ts_base_engine_get_range(engine, node, expected_name_range) do_test_ts_base_engine_get_address(engine, node) @@ -98,34 +90,29 @@ def do_test_ts_extractor_engine_get_function_definitions( ): assert engine.get_function_definitions(engine.tree.root_node) == engine.get_function_definitions() assert len(engine.get_function_definitions(root_node)) == len(expected) - for (node, name), (expected_range, expected_id_range) in zip(engine.get_function_definitions(root_node), expected): + for (node, name), (expected_range, expected_name_range) in zip( + 
engine.get_function_definitions(root_node), expected + ): assert isinstance(node, Node) - assert name == "function.definition" + assert name == "function-definition" do_test_ts_base_engine_get_range(engine, node, expected_range, startswith=True) do_test_ts_base_engine_get_address(engine, node) - do_test_ts_base_engine_get_range(engine, engine.get_function_definition_id(node), expected_id_range) + do_test_ts_base_engine_get_range(engine, engine.get_function_definition_name(node), expected_name_range) - assert len(list(engine.get_function_definition_ids(root_node))) == len(expected) - for node, (_, expected_id_range) in zip(engine.get_function_definition_ids(root_node), expected): + assert len(list(engine.get_function_definition_names(root_node))) == len(expected) + for node, (_, expected_name_range) in zip(engine.get_function_definition_names(root_node), expected): assert isinstance(node, Node) - do_test_ts_base_engine_get_range(engine, node, expected_id_range) + do_test_ts_base_engine_get_range(engine, node, expected_name_range) do_test_ts_base_engine_get_address(engine, node) def do_test_ts_extractor_engine_get_function_calls( engine: TreeSitterExtractorEngine, root_node: Node, expected: List[Tuple[str, str]] ): - assert len(engine.get_function_calls(root_node)) == len(expected) - for (node, name), (expected_range, expected_id_range) in zip(engine.get_function_calls(root_node), expected): - assert isinstance(node, Node) - assert name == "function.call" - do_test_ts_base_engine_get_range(engine, node, expected_range) - do_test_ts_base_engine_get_address(engine, node) - do_test_ts_base_engine_get_range(engine, engine.get_function_call_id(node), expected_id_range) - - assert len(list(engine.get_function_call_ids(root_node))) == len(expected) - for node, (_, expected_id_range) in zip(engine.get_function_call_ids(root_node), expected): + assert len(list(engine.get_function_call_names(root_node))) == len(expected) + for (node, name), (_, expected_id_range) in 
zip(engine.get_function_call_names(root_node), expected): assert isinstance(node, Node) + assert name == "function-call" do_test_ts_base_engine_get_range(engine, node, expected_id_range) do_test_ts_base_engine_get_address(engine, node) @@ -171,26 +158,6 @@ def do_test_ts_extractor_engine_get_global_statements(engine: TreeSitterExtracto do_test_ts_base_engine_get_address(engine, node) -def do_test_ts_extractor_engine_get_import_names( - engine: TreeSitterExtractorEngine, root_node: Node, expected: List[str] -): - assert len(list(engine.get_import_names(root_node))) == len(expected) - for (node, import_name), expected_import_name in zip(list(engine.get_import_names(root_node)), expected): - assert isinstance(node, Node) - assert import_name == expected_import_name - do_test_ts_base_engine_get_address(engine, node) - - -def do_test_ts_extractor_engine_get_function_names( - engine: TreeSitterExtractorEngine, root_node: Node, expected: List[str] -): - assert len(list(engine.get_function_names(root_node))) == len(expected) - for (node, function_name), expected_function_name in zip(list(engine.get_function_names(root_node)), expected): - assert isinstance(node, Node) - assert function_name == expected_function_name - do_test_ts_base_engine_get_address(engine, node) - - @parametrize( "engine_str,expected", [ @@ -267,8 +234,6 @@ def do_test_ts_extractor_engine_get_function_names( 'string stdout = "";', 'string stderr = "";', ], - "all import names": ["System.Diagnostics.ProcessStartInfo", "System.Diagnostics.Process"], - "all function names": [], }, ), ], @@ -283,8 +248,6 @@ def test_ts_extractor_engine(request: pytest.FixtureRequest, engine_str: str, ex do_test_ts_extractor_engine_get_function_calls(engine, engine.tree.root_node, expected["all function calls"]) do_test_ts_extractor_engine_get_string_literals(engine, engine.tree.root_node, expected["all string literals"]) do_test_ts_extractor_engine_get_integer_literals(engine, engine.tree.root_node, expected["all integer 
literals"]) - do_test_ts_extractor_engine_get_import_names(engine, engine.tree.root_node, expected["all import names"]) - do_test_ts_extractor_engine_get_function_names(engine, engine.tree.root_node, expected["all function names"]) do_test_ts_extractor_engine_get_global_statements(engine, expected["global statements"]) do_test_ts_extractor_engine_get_namespaces(engine, expected["namespaces"]) do_test_ts_base_engine_get_default_address(engine) @@ -303,7 +266,7 @@ def do_test_ts_template_engine_init(engine: TreeSitterTemplateEngine): def do_test_ts_template_engine_get_template_namespaces( engine: TreeSitterTemplateEngine, expected_language: str, expected: List[str] ): - default_namespaces = capa.features.extractors.ts.sig.get_default_namespaces(expected_language, True) + default_namespaces = LANGUAGE_TOOLKITS[expected_language].get_default_namespaces(True) template_namespaces = {name for _, name in engine.get_template_namespaces()} assert default_namespaces.issubset(template_namespaces) assert len(list(engine.get_imported_namespaces())) == len(expected) @@ -954,11 +917,11 @@ def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine): ("cs_138cdc", "file", Format(FORMAT_SCRIPT), True), ("cs_138cdc", "file", ScriptLanguage(LANG_CS), True), ("cs_138cdc", "file", Namespace("System"), True), - ("cs_138cdc", "file", String(""), True), + ("cs_138cdc", "function=PSEUDO MAIN", String(""), True), ("cs_138cdc", "function=die", String("Not Found"), True), ("cs_138cdc", "function=Page_Load", String("127.0.0.1"), True), - ("cs_138cdc", "function=Page_Load", Import("System.Diagnostics.ProcessStartInfo"), True), - ("cs_138cdc", "function=Page_Load", Import("System.Diagnostics.Process"), True), + ("cs_138cdc", "function=Page_Load", API("System.Diagnostics.ProcessStartInfo"), True), + ("cs_138cdc", "function=Page_Load", API("System.Diagnostics.Process"), True), ("aspx_4f6fa6", "global", Arch(ARCH_ANY), True), ("aspx_4f6fa6", "global", OS(OS_ANY), True), ("aspx_4f6fa6", "file", 
Format(FORMAT_SCRIPT), True), @@ -968,12 +931,12 @@ def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine): ("aspx_4f6fa6", "file", Namespace("System.IO.Compression"), True), ("aspx_4f6fa6", "function=do_ps", String("powershell.exe"), True), ("aspx_4f6fa6", "function=do_ps", Substring("-executionpolicy bypass"), True), - ("aspx_4f6fa6", "function=do_ps", Import("System.Diagnostics.ProcessStartInfo"), True), - ("aspx_4f6fa6", "function=do_ps", FunctionName("System.Diagnostics.Process.Start"), True), + ("aspx_4f6fa6", "function=do_ps", API("System.Diagnostics.ProcessStartInfo"), True), + ("aspx_4f6fa6", "function=do_ps", API("System.Diagnostics.Process::Start"), True), ("aspx_4f6fa6", "function=ps", String("\\nPS> "), True), ("aspx_4f6fa6", "function=ps", Substring("PS>"), True), ("aspx_4f6fa6", "function=downloadbutton_Click", Substring("filename"), True), - ("aspx_4f6fa6", "function=base64encode", FunctionName("System.Convert.ToBase64String"), True), + ("aspx_4f6fa6", "function=base64encode", API("System.Convert::ToBase64String"), True), ("aspx_5f959f", "global", Arch(ARCH_ANY), True), ("aspx_5f959f", "global", OS(OS_ANY), True), ("aspx_5f959f", "file", Format(FORMAT_SCRIPT), True), @@ -981,7 +944,7 @@ def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine): ("aspx_5f959f", "file", Namespace("System.Diagnostics"), True), ("aspx_5f959f", "file", Namespace("System.IO"), True), ("aspx_5f959f", "file", Namespace("System.Web.SessionState"), True), - ("aspx_5f959f", "function=ExcuteCmd", Import("System.Diagnostics.ProcessStartInfo"), True), + ("aspx_5f959f", "function=ExcuteCmd", API("System.Diagnostics.ProcessStartInfo"), True), ("aspx_5f959f", "function=ExcuteCmd", String("cmd.exe"), True), ("aspx_5f959f", "function=ExcuteCmd", Substring("/c"), True), ("aspx_5f959f", "function=cmdExe_Click", String("
"), True),
@@ -992,22 +955,22 @@ def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine):
         ("aspx_10162f", "file", ScriptLanguage(LANG_CS), True),
         ("aspx_10162f", "file", Namespace("System.IO"), True),
         ("aspx_10162f", "file", Namespace("System.Web.Security"), True),
-        ("aspx_10162f", "file", String("data"), True),
-        ("aspx_10162f", "file", String("gsize"), True),
-        ("aspx_10162f", "file", String("cmd"), True),
-        ("aspx_10162f", "file", String("ttar"), True),
-        ("aspx_10162f", "file", String("sdfewq@#$51234234DF@#$!@#$ASDF"), True),
-        ("aspx_10162f", "function=rm", FunctionName("System.IO.File.Delete"), False),
-        ("aspx_10162f", "function=(0x564, 0x6af)", FunctionName("System.Convert.ToBase64String"), True),
+        ("aspx_10162f", "function=PSEUDO MAIN", String("data"), True),
+        ("aspx_10162f", "function=PSEUDO MAIN", String("gsize"), True),
+        ("aspx_10162f", "function=PSEUDO MAIN", String("cmd"), True),
+        ("aspx_10162f", "function=PSEUDO MAIN", String("ttar"), True),
+        ("aspx_10162f", "function=PSEUDO MAIN", String("sdfewq@#$51234234DF@#$!@#$ASDF"), True),
+        ("aspx_10162f", "function=rm", API("System.IO.File::Delete"), False),
+        ("aspx_10162f", "function=(0x564, 0x6af)", API("System.Convert::ToBase64String"), True),
         ("aspx_10162f", "function=(0x564, 0x6af)", String("p"), True),
-        ("aspx_10162f", "function=exec", Import("System.Diagnostics.Process"), True),
+        ("aspx_10162f", "function=exec", API("System.Diagnostics.Process"), True),
         ("aspx_10162f", "function=exec", String("cmd.exe"), True),
         ("aspx_10162f", "function=gsize", Substring("error"), True),
         ("aspx_10162f", "function=exp", Substring("root"), True),
         ("aspx_10162f", "function=exp", Substring("net use"), True),
         ("aspx_10162f", "function=exp", Number(2), True),
-        ("aspx_10162f", "function=exp", Import("System.IO.DirectoryInfo"), True),
-        ("aspx_10162f", "function=exp", FunctionName("System.IO.File.GetAttributes"), True),
+        ("aspx_10162f", "function=exp", API("System.IO.DirectoryInfo"), True),
+        ("aspx_10162f", "function=exp", API("System.IO.File::GetAttributes"), True),
         ("aspx_10162f", "function=GetDirSize", Number(0), True),
         ("aspx_10162f", "function=createJsonDirectory", String('\\"dir\\":['), True),
         ("aspx_10162f", "function=createJsonDirectory", Number(0), True),

From ad31d83df49c467ed822a41131d11dd805b77e33 Mon Sep 17 00:00:00 2001
From: Adam Storek 
Date: Tue, 19 Jul 2022 07:44:43 -0400
Subject: [PATCH 36/51] Fixed property-extraction bugs.

---
 capa/features/extractors/ts/function.py       | 60 +++++++++++++------
 capa/features/extractors/ts/query.py          |  2 +-
 .../features/extractors/ts/signatures/cs.json | 12 ++++
 capa/features/freeze/features.py              | 13 ++++
 tests/test_ts.py                              | 39 +++++++++++-
 5 files changed, 106 insertions(+), 20 deletions(-)

diff --git a/capa/features/extractors/ts/function.py b/capa/features/extractors/ts/function.py
index 72aa3896e..55662b863 100644
--- a/capa/features/extractors/ts/function.py
+++ b/capa/features/extractors/ts/function.py
@@ -1,7 +1,5 @@
-import functools
 import itertools
-from os import extsep
-from typing import Tuple, Callable, Iterator
+from typing import Tuple, Iterator
 from dataclasses import dataclass
 
 from tree_sitter import Node
@@ -43,41 +41,69 @@ def extract_integer_literals(fn_node: Node, engine: TreeSitterExtractorEngine) -
             yield Number(parsed_int), engine.get_address(node)
 
 
-def extract_imports_(name: str, engine: TreeSitterExtractorEngine) -> Iterator[str]:
-    for namespace in itertools.chain([""], engine.namespaces):
+def get_imports(name: str, namespaces: set[str], engine: TreeSitterExtractorEngine) -> Iterator[str]:
+    for namespace in itertools.chain([""], namespaces):
         joined_name = engine.language_toolkit.join_names(namespace, name)
         if engine.language_toolkit.is_import(joined_name):
             yield joined_name
 
 
-def extract_classes(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]:
+def get_properties(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Node, str]]:
+    for node, _ in engine.get_assigned_property_names(fn_node):
+        qualified_names = engine.language_toolkit.split_name(engine.get_range(node))
+        if len(qualified_names) > 1:
+            yield node, engine.language_toolkit.join_names(*qualified_names[1:])
+
+
+def get_classes(fn_node: Node, engine: TreeSitterExtractorEngine) -> set[str]:
+    return set(
+        name
+        for node, _ in engine.get_new_object_names(fn_node)
+        for name in get_imports(engine.get_range(node), engine.namespaces, engine)
+    )
+
+
+def extract_classes_(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]:
     for node, _ in engine.get_new_object_names(fn_node):
-        for name in extract_imports_(engine.get_range(node), engine):
+        for name in get_imports(engine.get_range(node), engine.namespaces, engine):
             yield API(engine.language_toolkit.format_imported_class(name)), engine.get_address(node)
 
 
-def extract_properties(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]:
-    for node, _ in engine.get_assigned_property_names(fn_node):
-        for name in extract_imports_(engine.get_range(node), engine):
+def extract_properties_(
+    fn_node: Node, classes: set[str], engine: TreeSitterExtractorEngine
+) -> Iterator[Tuple[Feature, Address]]:
+    for node, property_name in get_properties(fn_node, engine):
+        for name in get_imports(property_name, classes, engine):
             yield Property(engine.language_toolkit.format_imported_property(name)), engine.get_address(node)
 
 
 def extract_static_methods_(node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]:
-    for name in extract_imports_(engine.get_range(node), engine):
+    for name in get_imports(engine.get_range(node), engine.namespaces, engine):
         yield API(engine.language_toolkit.format_imported_function(name)), engine.get_address(node)
 
 
-def extract_regular_methods_(node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]:
+def extract_regular_methods_(
+    node: Node, classes: set[str], engine: TreeSitterExtractorEngine
+) -> Iterator[Tuple[Feature, Address]]:
     qualified_names = engine.language_toolkit.split_name(engine.get_range(node))
     if len(qualified_names) > 1:
-        for name in extract_imports_(engine.language_toolkit.join_names(*qualified_names[1:]), engine):
+        for name in get_imports(engine.language_toolkit.join_names(*qualified_names[1:]), classes, engine):
             yield API(engine.language_toolkit.format_imported_function(name)), engine.get_address(node)
 
 
-def extract_function_calls(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]:
+def extract_api(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]:
+    classes = get_classes(fn_node, engine)
+    yield from extract_classes_(fn_node, engine)
+    yield from extract_function_calls_(fn_node, classes, engine)
+    yield from extract_properties_(fn_node, classes, engine)
+
+
+def extract_function_calls_(
+    fn_node: Node, classes: set[str], engine: TreeSitterExtractorEngine
+) -> Iterator[Tuple[Feature, Address]]:
     for node, _ in engine.get_function_call_names(fn_node):
         yield from extract_static_methods_(node, engine)
-        yield from extract_regular_methods_(node, engine)
+        yield from extract_regular_methods_(node, classes, engine)
 
 
 def extract_pseudo_main_features(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]:
@@ -99,9 +125,7 @@ def extract_features(fh: FunctionHandle, engine: TreeSitterExtractorEngine) -> I
 
 
 FUNCTION_HANDLERS = (
-    extract_classes,
-    extract_properties,
-    extract_function_calls,
+    extract_api,
     extract_integer_literals,
     extract_strings,
 )
diff --git a/capa/features/extractors/ts/query.py b/capa/features/extractors/ts/query.py
index c275f0587..6743e78ae 100644
--- a/capa/features/extractors/ts/query.py
+++ b/capa/features/extractors/ts/query.py
@@ -66,7 +66,7 @@ def deserialize(language: str, binding: dict) -> dict:
                     "new_object_name": "(object_creation_expression type: [(qualified_name) @new-object (identifier) @new-object])",
                     "function_definition": "(local_function_statement) @function-definition",
                     "function_call_name": "(invocation_expression function: [(member_access_expression name: (identifier)) @function-call (identifier) @function-call])",
-                    "assigned_property_name": "(assignment_expression left: (member_access_expression name: (identifier) @member))",
+                    "assigned_property_name": "(assignment_expression left: (member_access_expression) @property)",
                     "string_literal": "(string_literal) @string-literal",
                     "integer_literal": "(integer_literal) @integer-literal",
                     "namespace": "(using_directive [(identifier) @namespace (qualified_name) @namespace])",
diff --git a/capa/features/extractors/ts/signatures/cs.json b/capa/features/extractors/ts/signatures/cs.json
index 68ae4c04f..edc3910f2 100644
--- a/capa/features/extractors/ts/signatures/cs.json
+++ b/capa/features/extractors/ts/signatures/cs.json
@@ -28,6 +28,18 @@
     "System.Data.SqlClient.SqlConnection",
     "System.Diagnostics.Process",
     "System.Diagnostics.ProcessStartInfo",
+    "System.Diagnostics.ProcessStartInfo.FileName",
+    "System.Diagnostics.ProcessStartInfo.Arguments",
+    "System.Diagnostics.ProcessStartInfo.RedirectStandardInput",
+    "System.Diagnostics.ProcessStartInfo.RedirectStandardOutput",
+    "System.Diagnostics.ProcessStartInfo.UseShellExecute",
+    "System.Diagnostics.ProcessStartInfo.CreateNoWindow",
+    "System.Diagnostics.Process.StartInfo.FileName",
+    "System.Diagnostics.Process.StartInfo.Arguments",
+    "System.Diagnostics.Process.StartInfo.RedirectStandardInput",
+    "System.Diagnostics.Process.StartInfo.RedirectStandardOutput",
+    "System.Diagnostics.Process.StartInfo.UseShellExecute",
+    "System.Diagnostics.Process.StartInfo.CreateNoWindow",
     "System.Diagnostics.Process.Start",
     "System.Security.Cryptography.RijndaelManaged",
     "System.Security.Cryptography.CryptoStream",
diff --git a/capa/features/freeze/features.py b/capa/features/freeze/features.py
index 633a49a5d..17855987b 100644
--- a/capa/features/freeze/features.py
+++ b/capa/features/freeze/features.py
@@ -69,6 +69,9 @@ def to_capa(self) -> capa.features.common.Feature:
         elif isinstance(self, APIFeature):
             return capa.features.insn.API(self.api, description=self.description)
 
+        elif isinstance(self, PropertyFeature):
+            return capa.features.insn.Property(self.property, description=self.description)
+
         elif isinstance(self, NumberFeature):
             return capa.features.insn.Number(self.number, description=self.description)
 
@@ -153,6 +156,9 @@ def feature_from_capa(f: capa.features.common.Feature) -> "Feature":
     elif isinstance(f, capa.features.insn.API):
         return APIFeature(api=f.value, description=f.description)
 
+    elif isinstance(f, capa.features.insn.Property):
+        return PropertyFeature(property=f.value, description=f.description)
+
     elif isinstance(f, capa.features.insn.Number):
         return NumberFeature(number=f.value, description=f.description)
 
@@ -278,6 +284,12 @@ class APIFeature(FeatureModel):
     description: Optional[str]
 
 
+class PropertyFeature(FeatureModel):
+    type: str = "property"
+    property: str
+    description: Optional[str]
+
+
 class NumberFeature(FeatureModel):
     type: str = "number"
     number: Union[int, float]
@@ -333,6 +345,7 @@ class OperandOffsetFeature(FeatureModel):
     ClassFeature,
     NamespaceFeature,
     APIFeature,
+    PropertyFeature,
     NumberFeature,
     BytesFeature,
     OffsetFeature,
diff --git a/tests/test_ts.py b/tests/test_ts.py
index 48b97f9af..4930ed4d3 100644
--- a/tests/test_ts.py
+++ b/tests/test_ts.py
@@ -5,7 +5,7 @@
 from fixtures import *
 from tree_sitter import Node, Tree
 
-from capa.features.insn import API, Number
+from capa.features.insn import API, Number, Property
 from capa.features.common import (
     OS,
     OS_ANY,
@@ -158,6 +158,17 @@ def do_test_ts_extractor_engine_get_global_statements(engine: TreeSitterExtracto
         do_test_ts_base_engine_get_address(engine, node)
 
 
+def do_test_ts_extractor_engine_get_assigned_property_names(
+    engine: TreeSitterExtractorEngine, root_node: Node, expected: List[str]
+):
+    assert len(list(engine.get_assigned_property_names(root_node))) == len(expected)
+    for (node, name), expected_range in zip(engine.get_assigned_property_names(root_node), expected):
+        assert isinstance(node, Node)
+        assert name == "property"
+        do_test_ts_base_engine_get_range(engine, node, expected_range, startswith=True)
+        do_test_ts_base_engine_get_address(engine, node)
+
+
 @parametrize(
     "engine_str,expected",
     [
@@ -234,6 +245,15 @@ def do_test_ts_extractor_engine_get_global_statements(engine: TreeSitterExtracto
                     'string stdout = "";',
                     'string stderr = "";',
                 ],
+                "properties": [
+                    "HttpContext.Current.Response.StatusCode",
+                    "HttpContext.Current.Response.StatusDescription",
+                    "procStartInfo.RedirectStandardOutput",
+                    "procStartInfo.RedirectStandardError",
+                    "procStartInfo.UseShellExecute",
+                    "procStartInfo.CreateNoWindow",
+                    "p.StartInfo",
+                ],
             },
         ),
     ],
@@ -248,6 +268,7 @@ def test_ts_extractor_engine(request: pytest.FixtureRequest, engine_str: str, ex
     do_test_ts_extractor_engine_get_function_calls(engine, engine.tree.root_node, expected["all function calls"])
     do_test_ts_extractor_engine_get_string_literals(engine, engine.tree.root_node, expected["all string literals"])
     do_test_ts_extractor_engine_get_integer_literals(engine, engine.tree.root_node, expected["all integer literals"])
+    do_test_ts_extractor_engine_get_assigned_property_names(engine, engine.tree.root_node, expected["properties"])
     do_test_ts_extractor_engine_get_global_statements(engine, expected["global statements"])
     do_test_ts_extractor_engine_get_namespaces(engine, expected["namespaces"])
     do_test_ts_base_engine_get_default_address(engine)
@@ -922,6 +943,12 @@ def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine):
         ("cs_138cdc", "function=Page_Load", String("127.0.0.1"), True),
         ("cs_138cdc", "function=Page_Load", API("System.Diagnostics.ProcessStartInfo"), True),
         ("cs_138cdc", "function=Page_Load", API("System.Diagnostics.Process"), True),
+        (
+            "cs_138cdc",
+            "function=Page_Load",
+            Property("System.Diagnostics.ProcessStartInfo::RedirectStandardOutput"),
+            True,
+        ),
         ("aspx_4f6fa6", "global", Arch(ARCH_ANY), True),
         ("aspx_4f6fa6", "global", OS(OS_ANY), True),
         ("aspx_4f6fa6", "file", Format(FORMAT_SCRIPT), True),
@@ -965,6 +992,16 @@ def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine):
         ("aspx_10162f", "function=(0x564, 0x6af)", String("p"), True),
         ("aspx_10162f", "function=exec", API("System.Diagnostics.Process"), True),
         ("aspx_10162f", "function=exec", String("cmd.exe"), True),
+        ("aspx_10162f", "function=exec", Property("System.Diagnostics.Process.StartInfo::FileName"), True),
+        ("aspx_10162f", "function=exec", Property("System.Diagnostics.Process.StartInfo::UseShellExecute"), True),
+        ("aspx_10162f", "function=exec", Property("System.Diagnostics.Process.StartInfo::RedirectStandardInput"), True),
+        (
+            "aspx_10162f",
+            "function=exec",
+            Property("System.Diagnostics.Process.StartInfo::RedirectStandardOutput"),
+            True,
+        ),
+        ("aspx_10162f", "function=exec", Property("System.Diagnostics.Process.StartInfo::CreateNoWindow"), True),
         ("aspx_10162f", "function=gsize", Substring("error"), True),
         ("aspx_10162f", "function=exp", Substring("root"), True),
         ("aspx_10162f", "function=exp", Substring("net use"), True),

From e52a9b34f8ad4ae84759c942be296717648e4ad4 Mon Sep 17 00:00:00 2001
From: Adam Storek 
Date: Tue, 19 Jul 2022 10:29:26 -0400
Subject: [PATCH 37/51] Added a few more test cases.

---
 rules            |  2 +-
 tests/test_ts.py | 11 +++++++++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/rules b/rules
index e88db21de..b2556227f 160000
--- a/rules
+++ b/rules
@@ -1 +1 @@
-Subproject commit e88db21de4d4cf9f7abec9177fab11240075036b
+Subproject commit b2556227fb446f03970e2eea0376a3cd72087d30
diff --git a/tests/test_ts.py b/tests/test_ts.py
index 4930ed4d3..e81040372 100644
--- a/tests/test_ts.py
+++ b/tests/test_ts.py
@@ -974,6 +974,17 @@ def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine):
         ("aspx_5f959f", "function=ExcuteCmd", API("System.Diagnostics.ProcessStartInfo"), True),
         ("aspx_5f959f", "function=ExcuteCmd", String("cmd.exe"), True),
         ("aspx_5f959f", "function=ExcuteCmd", Substring("/c"), True),
+        ("aspx_5f959f", "function=ExcuteCmd", API("System.Diagnostics.Process::Start"), True),
+        ("aspx_5f959f", "function=ExcuteCmd", Property("System.Diagnostics.ProcessStartInfo::FileName"), True),
+        ("aspx_5f959f", "function=ExcuteCmd", Property("System.Diagnostics.ProcessStartInfo::Arguments"), True),
+        ("aspx_5f959f", "function=ExcuteCmd", Property("System.Diagnostics.ProcessStartInfo::UseShellExecute"), True),
+        (
+            "aspx_5f959f",
+            "function=ExcuteCmd",
+            Property("System.Diagnostics.ProcessStartInfo::RedirectStandardOutput"),
+            True,
+        ),
+        ("aspx_5f959f", "function=ExcuteCmd", Property("System.Diagnostics.ProcessStartInfo::UseShellExecute"), True),
         ("aspx_5f959f", "function=cmdExe_Click", String("
"), True),
         ("aspx_5f959f", "function=cmdExe_Click", String("
"), True), ("aspx_10162f", "global", Arch(ARCH_ANY), True), From b27713b22820229e6032af42908ec47a3072f4a9 Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Tue, 19 Jul 2022 12:36:14 -0400 Subject: [PATCH 38/51] Minor style improvements. --- capa/features/extractors/script.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/capa/features/extractors/script.py b/capa/features/extractors/script.py index 7d81cb361..d226c1627 100644 --- a/capa/features/extractors/script.py +++ b/capa/features/extractors/script.py @@ -9,8 +9,8 @@ LANG_JS = "javascript" LANG_TEM = "embedded_template" -EXT_ASPX = (".aspx", "aspx_") -EXT_CS = (".cs", ".cs_") +EXT_ASPX = ("aspx", "aspx_") +EXT_CS = ("cs", "cs_") EXT_HTML = ("html", "html_") From b2df2b0190a66508da6f50a7c41c0a567c6ed7fe Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Tue, 19 Jul 2022 14:11:25 -0400 Subject: [PATCH 39/51] Removed deprecated parse_integer. --- capa/features/extractors/ts/integer.py | 15 --------------- 1 file changed, 15 deletions(-) delete mode 100644 capa/features/extractors/ts/integer.py diff --git a/capa/features/extractors/ts/integer.py b/capa/features/extractors/ts/integer.py deleted file mode 100644 index a902833de..000000000 --- a/capa/features/extractors/ts/integer.py +++ /dev/null @@ -1,15 +0,0 @@ -from typing import Optional - -from capa.features.extractors.script import LANG_CS - - -def parse_integer(integer: str, language: str) -> Optional[int]: - try: - if language == LANG_CS: - if integer.endswith(("u", "l")): - integer = integer[:-1] - if integer.startswith(("0x", "0X")): - return int(integer, 16) - return int(integer) - except: - return None From a0379a60311e4c8673290a071ab0663a97a059ed Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Tue, 19 Jul 2022 15:32:02 -0400 Subject: [PATCH 40/51] Added more tests; fixed integer parsing related bugs. 
--- capa/features/extractors/ts/engine.py | 10 +++++++ capa/features/extractors/ts/file.py | 1 - capa/features/extractors/ts/function.py | 19 +++++++------ capa/features/extractors/ts/query.py | 4 +++ .../features/extractors/ts/signatures/cs.json | 13 ++++++++- rules | 2 +- tests/fixtures.py | 1 + tests/test_ts.py | 28 ++++++++++++++++++- 8 files changed, 66 insertions(+), 12 deletions(-) diff --git a/capa/features/extractors/ts/engine.py b/capa/features/extractors/ts/engine.py index e99f12141..1003574a6 100644 --- a/capa/features/extractors/ts/engine.py +++ b/capa/features/extractors/ts/engine.py @@ -101,6 +101,16 @@ def get_namespaces(self, node: Node = None) -> List[Tuple[Node, str]]: def get_global_statements(self) -> List[Tuple[Node, str]]: return self.query.global_statement.captures(self.tree.root_node) + def get_direct_method_call(self, node: Node) -> Node: + return node.child_by_field_name(self.query.direct_method_call_field_name) + + def is_object_creation_expression(self, node: Node) -> bool: + captures = self.get_new_object_names(node) + if not captures: + return False + new_object_name_node, _ = captures[0] + return new_object_name_node.parent.parent == node + class TreeSitterTemplateEngine(TreeSitterBaseEngine): query: TemplateQueryBinding diff --git a/capa/features/extractors/ts/file.py b/capa/features/extractors/ts/file.py index c868b16cc..20cd3cbd0 100644 --- a/capa/features/extractors/ts/file.py +++ b/capa/features/extractors/ts/file.py @@ -1,7 +1,6 @@ from typing import Tuple, Iterator import capa.features.extractors.script -import capa.features.extractors.ts.integer from capa.features.common import Feature, Namespace from capa.features.address import Address from capa.features.extractors.ts.engine import TreeSitterExtractorEngine diff --git a/capa/features/extractors/ts/function.py b/capa/features/extractors/ts/function.py index 55662b863..59c02eaab 100644 --- a/capa/features/extractors/ts/function.py +++ b/capa/features/extractors/ts/function.py 
@@ -4,14 +4,13 @@ from tree_sitter import Node -import capa.features.extractors.ts.integer from capa.features.insn import API, Number, Property from capa.features.common import String, Feature from capa.features.address import Address from capa.features.extractors.ts.engine import TreeSitterExtractorEngine from capa.features.extractors.base_extractor import FunctionHandle -PSEUDO_MAIN = "PSEUDO MAIN" +PSEUDO_MAIN = "PSEUDO MAIN" # all global statements in one function scope @dataclass @@ -34,9 +33,9 @@ def extract_strings(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterato yield String(engine.get_range(node).strip('"')), engine.get_address(node) -def extract_integer_literals(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: +def extract_integers(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: for node, _ in engine.get_integer_literals(fn_node): - parsed_int = capa.features.extractors.ts.integer.parse_integer(engine.get_range(node), engine.language) + parsed_int = engine.language_toolkit.parse_integer(engine.get_range(node)) if parsed_int is not None: yield Number(parsed_int), engine.get_address(node) @@ -85,10 +84,14 @@ def extract_static_methods_(node: Node, engine: TreeSitterExtractorEngine) -> It def extract_regular_methods_( node: Node, classes: set[str], engine: TreeSitterExtractorEngine ) -> Iterator[Tuple[Feature, Address]]: + if engine.is_object_creation_expression(node): + node = engine.get_direct_method_call(node) qualified_names = engine.language_toolkit.split_name(engine.get_range(node)) - if len(qualified_names) > 1: - for name in get_imports(engine.language_toolkit.join_names(*qualified_names[1:]), classes, engine): - yield API(engine.language_toolkit.format_imported_function(name)), engine.get_address(node) + property_name = ( + qualified_names[0] if len(qualified_names) == 1 else engine.language_toolkit.join_names(*qualified_names[1:]) + ) + for name in 
get_imports(property_name, classes, engine): + yield API(engine.language_toolkit.format_imported_function(name)), engine.get_address(node) def extract_api(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: @@ -126,6 +129,6 @@ def extract_features(fh: FunctionHandle, engine: TreeSitterExtractorEngine) -> I FUNCTION_HANDLERS = ( extract_api, - extract_integer_literals, + extract_integers, extract_strings, ) diff --git a/capa/features/extractors/ts/query.py b/capa/features/extractors/ts/query.py index 6743e78ae..d337a588e 100644 --- a/capa/features/extractors/ts/query.py +++ b/capa/features/extractors/ts/query.py @@ -17,6 +17,8 @@ class ScriptQueryBinding(QueryBinding): new_object_name: Query function_definition: Query function_definition_field_name: str + direct_method_call_field_name: str + object_creation_expression_field_name: str function_call_name: Query assigned_property_name: Query string_literal: Query @@ -74,6 +76,8 @@ def deserialize(language: str, binding: dict) -> dict: }, "field_name": { "function_definition": "name", + "direct_method_call": "name", + "object_creation_expression": "expression", }, }, ), diff --git a/capa/features/extractors/ts/signatures/cs.json b/capa/features/extractors/ts/signatures/cs.json index edc3910f2..67f79ba88 100644 --- a/capa/features/extractors/ts/signatures/cs.json +++ b/capa/features/extractors/ts/signatures/cs.json @@ -4,6 +4,9 @@ "System.IO.File.Delete", "System.IO.File.Write", "System.IO.File.GetAttributes", + "System.IO.File.GetCreationTime", + "System.IO.File.GetLastAccessTime", + "System.IO.File.GetLastWriteTime", "System.IO.File.ReadAllBytes", "System.IO.File.ReadAllBytes", "System.IO.File.ReadAllBytesAsync", @@ -13,6 +16,9 @@ "System.IO.File.ReadAllTextAsync", "System.IO.File.ReadLines", "System.IO.File.ReadLinesAsync", + "System.IO.File.SetCreationTime", + "System.IO.File.SetLastAccessTime", + "System.IO.File.SetLastWriteTime", "System.IO.File.WriteAllBytes", 
"System.IO.File.WriteAllBytes", "System.IO.File.WriteAllBytesAsync", @@ -22,10 +28,13 @@ "System.IO.File.WriteAllTextAsync", "System.IO.File.WriteLines", "System.IO.File.WriteLinesAsync", + "System.IO.Path.GetTempPath", "System.Convert.ToBase64String", "System.Convert.FromBase64String", "System.Data.SqlClient.SqlCommand", "System.Data.SqlClient.SqlConnection", + "System.Data.SqlClient.SqlConnection.Open", + "System.Data.SqlClient.SqlDataAdapter", "System.Diagnostics.Process", "System.Diagnostics.ProcessStartInfo", "System.Diagnostics.ProcessStartInfo.FileName", @@ -45,6 +54,8 @@ "System.Security.Cryptography.CryptoStream", "System.Security.Cryptography.SHA1", "System.Security.Cryptography.SHA1CryptoServiceProvider", + "System.Security.Cryptography.SHA1CryptoServiceProvider.ComputeHash", "System.Security.Cryptography.SHA256", - "System.Security.Cryptography.SHA256CryptoServiceProvider" + "System.Security.Cryptography.SHA256CryptoServiceProvider", + "System.Security.Cryptography.SHA256CryptoServiceProvider.ComputeHash" ] \ No newline at end of file diff --git a/rules b/rules index b2556227f..e88db21de 160000 --- a/rules +++ b/rules @@ -1 +1 @@ -Subproject commit b2556227fb446f03970e2eea0376a3cd72087d30 +Subproject commit e88db21de4d4cf9f7abec9177fab11240075036b diff --git a/tests/fixtures.py b/tests/fixtures.py index f57a882d5..f5777843d 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -572,6 +572,7 @@ def inner_fn(extractor): features = collections.defaultdict(set) for fh in fhs: for k, vs in extract_function_features(extractor, fh).items(): + # print(f"{k}:{vs}") features[k].update(vs) for k, vs in extract_file_features(extractor).items(): features[k].update(vs) diff --git a/tests/test_ts.py b/tests/test_ts.py index e81040372..9c6fcfa1f 100644 --- a/tests/test_ts.py +++ b/tests/test_ts.py @@ -984,7 +984,6 @@ def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine): Property("System.Diagnostics.ProcessStartInfo::RedirectStandardOutput"), True, ), - 
("aspx_5f959f", "function=ExcuteCmd", Property("System.Diagnostics.ProcessStartInfo::UseShellExecute"), True), ("aspx_5f959f", "function=cmdExe_Click", String("
"), True),
         ("aspx_5f959f", "function=cmdExe_Click", String("
"), True), ("aspx_10162f", "global", Arch(ARCH_ANY), True), @@ -1000,7 +999,34 @@ def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine): ("aspx_10162f", "function=PSEUDO MAIN", String("sdfewq@#$51234234DF@#$!@#$ASDF"), True), ("aspx_10162f", "function=rm", API("System.IO.File::Delete"), False), ("aspx_10162f", "function=(0x564, 0x6af)", API("System.Convert::ToBase64String"), True), + ("aspx_10162f", "function=(0x564, 0x6af)", API("System.Convert::ToBase64String"), True), ("aspx_10162f", "function=(0x564, 0x6af)", String("p"), True), + ( + "aspx_10162f", + "function=c", + API("System.Security.Cryptography.SHA256CryptoServiceProvider::ComputeHash"), + True, + ), + ("aspx_10162f", "function=z", API("System.IO.File::ReadAllBytes"), True), + ("aspx_10162f", "function=ti", API("System.IO.File::GetCreationTime"), True), + ("aspx_10162f", "function=ti", API("System.IO.File::GetLastAccessTime"), True), + ("aspx_10162f", "function=ti", API("System.IO.File::GetCreationTime"), True), + ("aspx_10162f", "function=g", API("System.IO.File::GetLastAccessTime"), True), + ("aspx_10162f", "function=g", API("System.IO.File::GetLastWriteTime"), True), + ("aspx_10162f", "function=g", API("System.IO.File::GetLastWriteTime"), True), + ("aspx_10162f", "function=g", API("System.IO.File::SetCreationTime"), True), + ("aspx_10162f", "function=g", API("System.IO.File::SetLastAccessTime"), True), + ("aspx_10162f", "function=g", API("System.IO.File::SetLastWriteTime"), True), + ("aspx_10162f", "function=h", API("System.IO.Path::GetTempPath"), True), + ("aspx_10162f", "function=h", API("System.IO.File::WriteAllBytes"), True), + ("aspx_10162f", "function=h", API("System.Convert::FromBase64String"), True), + ("aspx_10162f", "function=d", API("System.IO.File::Delete"), True), + ("aspx_10162f", "function=d", API("System.IO.File::Delete"), True), + ("aspx_10162f", "function=sq", API("System.Data.SqlClient.SqlConnection"), True), + ("aspx_10162f", "function=sq", 
API("System.Data.SqlClient.SqlConnection"), True), + ("aspx_10162f", "function=sq", API("System.Data.SqlClient.SqlCommand"), True), + ("aspx_10162f", "function=sq", API("System.Data.SqlClient.SqlDataAdapter"), True), + ("aspx_10162f", "function=sq", API("System.Data.SqlClient.SqlConnection::Open"), True), ("aspx_10162f", "function=exec", API("System.Diagnostics.Process"), True), ("aspx_10162f", "function=exec", String("cmd.exe"), True), ("aspx_10162f", "function=exec", Property("System.Diagnostics.Process.StartInfo::FileName"), True), From eeecb63dc99a94561e3c51188fe978a2bd8bf9f3 Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Wed, 20 Jul 2022 13:59:19 -0400 Subject: [PATCH 41/51] Fixing address range bug; refactoring and cleanup. --- capa/features/address.py | 2 +- capa/features/extractors/ts/engine.py | 13 ++--- capa/features/extractors/ts/extractor.py | 69 ++++++++++-------------- capa/features/extractors/ts/function.py | 21 ++++---- capa/features/extractors/ts/tools.py | 14 +++-- tests/test_ts.py | 4 +- 6 files changed, 59 insertions(+), 64 deletions(-) diff --git a/capa/features/address.py b/capa/features/address.py index 350cdd332..0a20b4291 100644 --- a/capa/features/address.py +++ b/capa/features/address.py @@ -61,7 +61,7 @@ def __init__(self, start_byte, end_byte): self.end_byte = end_byte def __eq__(self, other): - return (self.start_byte, self.end_byte) == (self.start_byte, other.end_byte) + return (self.start_byte, self.end_byte) == (other.start_byte, other.end_byte) def __lt__(self, other): return (self.start_byte, self.end_byte) < (other.start_byte, other.end_byte) diff --git a/capa/features/extractors/ts/engine.py b/capa/features/extractors/ts/engine.py index 1003574a6..b8f760b4c 100644 --- a/capa/features/extractors/ts/engine.py +++ b/capa/features/extractors/ts/engine.py @@ -116,12 +116,13 @@ class TreeSitterTemplateEngine(TreeSitterBaseEngine): query: TemplateQueryBinding language_toolkit: LanguageToolkit embedded_language: str + namespaces: 
set[str] def __init__(self, buf: bytes): super().__init__(LANG_TEM, buf) self.embedded_language = self.identify_language() self.language_toolkit = LANGUAGE_TOOLKITS[self.embedded_language] - self.template_namespaces = set(name for _, name in self.get_template_namespaces()) + self.namespaces = set(name for _, name in self.get_namespaces()) def get_code_sections(self) -> List[Tuple[Node, str]]: return self.query.code.captures(self.tree.root_node) @@ -134,7 +135,7 @@ def get_parsed_code_sections(self) -> Iterator[TreeSitterExtractorEngine]: self.embedded_language, self.get_byte_range(node), node.start_byte, - self.template_namespaces, + self.namespaces, ) def get_content_sections(self) -> List[Tuple[Node, str]]: @@ -153,7 +154,7 @@ def get_imported_namespaces(self) -> Iterator[Tuple[Node, str]]: if namespace is not None: yield node, namespace - def get_template_namespaces(self) -> Iterator[Tuple[Optional[Node], str]]: + def get_namespaces(self) -> Iterator[Tuple[Optional[Node], str]]: for namespace in self.language_toolkit.get_default_namespaces(True): yield None, namespace for node, namespace in self.get_imported_namespaces(): @@ -190,9 +191,9 @@ class TreeSitterHTMLEngine(TreeSitterBaseEngine): query: HTMLQueryBinding namespaces: set[str] - def __init__(self, buf: bytes, additional_namespaces: set[str] = None): + def __init__(self, buf: bytes, namespaces: set[str] = set()): super().__init__(LANG_HTML, buf) - self.namespaces = additional_namespaces if additional_namespaces is not None else set() + self.namespaces = namespaces def get_scripts(self) -> List[Tuple[Node, str]]: return self.query.script_element.captures(self.tree.root_node) @@ -221,4 +222,4 @@ def identify_language(self, node: Node) -> str: return LANG_JS def is_server_side_c_sharp(self, node: Node) -> bool: - return len(re.findall(r'runat\s*=\s*"server"'.encode(), self.get_byte_range(node))) > 0 + return bool(re.findall(r'runat\s*=\s*"server"'.encode(), self.get_byte_range(node))) diff --git 
a/capa/features/extractors/ts/extractor.py b/capa/features/extractors/ts/extractor.py index ed26e95e0..a86df8f62 100644 --- a/capa/features/extractors/ts/extractor.py +++ b/capa/features/extractors/ts/extractor.py @@ -1,4 +1,4 @@ -from typing import List, Tuple, Union, Iterator +from typing import List, Tuple, Union, Iterator, Optional from tree_sitter import Node @@ -16,8 +16,8 @@ class TreeSitterFeatureExtractor(FeatureExtractor): - code_sections: List[TreeSitterExtractorEngine] - template_namespaces: List[Tuple[Node, str]] + engines: List[TreeSitterExtractorEngine] + template_engine: TreeSitterTemplateEngine language: str path: str @@ -29,42 +29,30 @@ def __init__(self, path: str): self.language = capa.features.extractors.script.get_language_from_ext(path) if self.language == LANG_TEM: - ( - self.code_sections, - self.template_namespaces, - ) = self.extract_code_from_template(buf) + self.template_engine = TreeSitterTemplateEngine(buf) + self.engines = self.extract_code_from_template() elif self.language == LANG_HTML: - self.code_sections = list(self.extract_code_from_html(buf)) + self.engines = self.extract_code_from_html(buf) else: - self.code_sections = [TreeSitterExtractorEngine(self.language, buf)] - - def extract_code_from_template(self, buf: bytes) -> Tuple[List[TreeSitterExtractorEngine], List[Tuple[Node, str]]]: - template_engine = TreeSitterTemplateEngine(buf) - template_namespaces = list(template_engine.get_template_namespaces()) - code_sections = list(template_engine.get_parsed_code_sections()) - - additional_namespaces = set(name for _, name in template_namespaces) - for node, _ in template_engine.get_content_sections(): - section_buf = template_engine.get_byte_range(node) - code_sections.extend(list(self.extract_code_from_html(section_buf, additional_namespaces))) - return code_sections, template_namespaces - - def extract_code_from_html( - self, buf: bytes, additional_namespaces: set[str] = None - ) -> Iterator[TreeSitterExtractorEngine]: - yield 
from TreeSitterHTMLEngine(buf, additional_namespaces).get_parsed_code_sections() - - def get_base_address( - self, - ) -> Union[AbsoluteVirtualAddress, capa.features.address._NoAddress]: + self.engines = [TreeSitterExtractorEngine(self.language, buf)] + + def extract_code_from_template(self) -> List[TreeSitterExtractorEngine]: + engines = list(self.template_engine.get_parsed_code_sections()) + for node, _ in self.template_engine.get_content_sections(): + section_buf = self.template_engine.get_byte_range(node) + engines.extend(list(self.extract_code_from_html(section_buf, self.template_engine.namespaces))) + return engines + + def extract_code_from_html(self, buf: bytes, namespaces: set[str] = set()) -> List[TreeSitterExtractorEngine]: + return list(TreeSitterHTMLEngine(buf, namespaces).get_parsed_code_sections()) + + def get_base_address(self) -> Union[AbsoluteVirtualAddress, capa.features.address._NoAddress]: return NO_ADDRESS def extract_template_namespaces(self) -> Iterator[Tuple[Feature, Address]]: - for node, name in self.template_namespaces: - if node is None: - yield Namespace(name), NO_ADDRESS - else: - yield Namespace(name), FileOffsetRangeAddress(node.start_byte, node.end_byte) + for node, name in self.template_engine.get_namespaces(): + address = NO_ADDRESS if node is None else FileOffsetRangeAddress(node.start_byte, node.end_byte) + yield Namespace(name), address def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]: yield from capa.features.extractors.ts.global_.extract_features() @@ -72,20 +60,21 @@ def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]: def extract_file_features(self) -> Iterator[Tuple[Feature, Address]]: if self.language == LANG_TEM: yield from self.extract_template_namespaces() - for engine in self.code_sections: + for engine in self.engines: yield from capa.features.extractors.ts.file.extract_features(engine) + def get_pseudo_main_function_inner(self, engine: TreeSitterExtractorEngine) -> 
TSFunctionInner: + return TSFunctionInner(engine.tree.root_node, PSEUDO_MAIN, engine) + def get_pseudo_main_function(self, engine: TreeSitterExtractorEngine) -> FunctionHandle: - return FunctionHandle( - address=engine.get_default_address(), inner=TSFunctionInner(engine.tree.root_node, PSEUDO_MAIN, engine) - ) + return FunctionHandle(engine.get_default_address(), self.get_pseudo_main_function_inner(engine)) def get_functions(self) -> Iterator[FunctionHandle]: - for engine in self.code_sections: + for engine in self.engines: yield self.get_pseudo_main_function(engine) for node, _ in engine.get_function_definitions(): name = engine.get_range(engine.get_function_definition_name(node)) - yield FunctionHandle(address=engine.get_address(node), inner=TSFunctionInner(node, name, engine)) + yield FunctionHandle(engine.get_address(node), TSFunctionInner(node, name, engine)) def extract_function_features(self, f: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: yield from capa.features.extractors.ts.function.extract_features(f, f.inner.engine) diff --git a/capa/features/extractors/ts/function.py b/capa/features/extractors/ts/function.py index 59c02eaab..89ab3c549 100644 --- a/capa/features/extractors/ts/function.py +++ b/capa/features/extractors/ts/function.py @@ -1,4 +1,3 @@ -import itertools from typing import Tuple, Iterator from dataclasses import dataclass @@ -30,7 +29,9 @@ def is_pseudo_main_function(fh: FunctionHandle, engine: TreeSitterExtractorEngin def extract_strings(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: for node, _ in engine.get_string_literals(fn_node): - yield String(engine.get_range(node).strip('"')), engine.get_address(node) + parsed_str = engine.language_toolkit.parse_string(engine.get_range(node)) + if parsed_str is not None: + yield String(parsed_str), engine.get_address(node) def extract_integers(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: @@ -41,7 +42,9 @@ 
def extract_integers(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterat def get_imports(name: str, namespaces: set[str], engine: TreeSitterExtractorEngine) -> Iterator[str]: - for namespace in itertools.chain([""], namespaces): + if engine.language_toolkit.is_import(name): + yield name + for namespace in namespaces: joined_name = engine.language_toolkit.join_names(namespace, name) if engine.language_toolkit.is_import(joined_name): yield joined_name @@ -54,12 +57,10 @@ def get_properties(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator yield node, engine.language_toolkit.join_names(*qualified_names[1:]) -def get_classes(fn_node: Node, engine: TreeSitterExtractorEngine) -> set[str]: - return set( - name - for node, _ in engine.get_new_object_names(fn_node) - for name in get_imports(engine.get_range(node), engine.namespaces, engine) - ) +def get_classes(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[str]: + for node, _ in engine.get_new_object_names(fn_node): + for name in get_imports(engine.get_range(node), engine.namespaces, engine): + yield name def extract_classes_(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: @@ -95,7 +96,7 @@ def extract_regular_methods_( def extract_api(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: - classes = get_classes(fn_node, engine) + classes = set(get_classes(fn_node, engine)) yield from extract_classes_(fn_node, engine) yield from extract_function_calls_(fn_node, classes, engine) yield from extract_properties_(fn_node, classes, engine) diff --git a/capa/features/extractors/ts/tools.py b/capa/features/extractors/ts/tools.py index 1e7fc43a0..5ea48481f 100644 --- a/capa/features/extractors/ts/tools.py +++ b/capa/features/extractors/ts/tools.py @@ -19,11 +19,8 @@ def load_import_signatures(self, signature_file: str) -> set: def is_import(self, import_: str) -> bool: return import_ in self.import_signatures - def 
join_names(self, *args: str) -> str: - return self.join_names_nonempty(*[arg for arg in args if arg != ""]) - @abc.abstractmethod - def join_names_nonempty(self, *args: str) -> str: + def join_names(self, *args: str) -> str: raise NotImplementedError() @abc.abstractmethod @@ -50,9 +47,13 @@ def get_default_namespaces(self, embedded: bool) -> set: def parse_integer(self, integer: str) -> Optional[int]: raise NotImplementedError() + @abc.abstractmethod + def parse_string(self, string: str) -> Optional[str]: + raise NotImplementedError() + class CSharpToolkit(LanguageToolkit): - def join_names_nonempty(self, *args: str) -> str: + def join_names(self, *args: str) -> str: return ".".join(args) def split_name(self, name: str) -> List[str]: @@ -112,5 +113,8 @@ def parse_integer(self, integer: str) -> Optional[int]: except: return None + def parse_string(self, string: str) -> Optional[str]: + return string.strip('"') + LANGUAGE_TOOLKITS: dict[str, LanguageToolkit] = {LANG_CS: CSharpToolkit("cs.json")} diff --git a/tests/test_ts.py b/tests/test_ts.py index 9c6fcfa1f..c2d514e0f 100644 --- a/tests/test_ts.py +++ b/tests/test_ts.py @@ -288,7 +288,7 @@ def do_test_ts_template_engine_get_template_namespaces( engine: TreeSitterTemplateEngine, expected_language: str, expected: List[str] ): default_namespaces = LANGUAGE_TOOLKITS[expected_language].get_default_namespaces(True) - template_namespaces = {name for _, name in engine.get_template_namespaces()} + template_namespaces = {name for _, name in engine.get_namespaces()} assert default_namespaces.issubset(template_namespaces) assert len(list(engine.get_imported_namespaces())) == len(expected) for (node, namespace), expected_namespace in zip(list(engine.get_imported_namespaces()), expected): @@ -914,7 +914,7 @@ def test_ts_template_engine(request: pytest.FixtureRequest, engine_str: str, exp do_test_ts_template_engine_get_parsed_code_sections(engine, expected["language"], expected["code sections"]) 
do_test_ts_template_engine_get_content_sections(engine, expected["content sections"]) for expected_start_byte, expected_end_byte in expected["content sections"]: - template_namespaces = list(engine.get_template_namespaces()) + template_namespaces = list(engine.get_namespaces()) additional_namespaces = set(name for _, name in template_namespaces) html_engine = TreeSitterHTMLEngine(engine.buf[expected_start_byte:expected_end_byte], additional_namespaces) do_test_ts_html_engine_init(html_engine) From cebc5e18eb75fe2e1277d76de668238798f164b1 Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Wed, 20 Jul 2022 14:05:56 -0400 Subject: [PATCH 42/51] Incorporated more tests. --- tests/data | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/data b/tests/data index b0ba2f632..2e8257475 160000 --- a/tests/data +++ b/tests/data @@ -1 +1 @@ -Subproject commit b0ba2f6328160ef6b34951e280f0210d954ca8bf +Subproject commit 2e8257475ebfdc8808d7e180be9a3f94977fcf57 From d7dcc946395fe6c2c966c9574139101706f4a537 Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Tue, 26 Jul 2022 17:08:10 -0400 Subject: [PATCH 43/51] Added support for Python. 
--- capa/features/extractors/script.py | 5 +- capa/features/extractors/ts/build.py | 6 +- capa/features/extractors/ts/engine.py | 51 +++--- capa/features/extractors/ts/extractor.py | 15 +- capa/features/extractors/ts/file.py | 4 +- capa/features/extractors/ts/function.py | 21 ++- capa/features/extractors/ts/query.py | 33 +++- .../features/extractors/ts/signatures/cs.json | 143 ++++++++------- .../features/extractors/ts/signatures/py.json | 20 +++ capa/features/extractors/ts/tools.py | 168 +++++++++++++++--- tests/fixtures.py | 12 ++ tests/test_ts.py | 37 ++-- 12 files changed, 367 insertions(+), 148 deletions(-) create mode 100644 capa/features/extractors/ts/signatures/py.json diff --git a/capa/features/extractors/script.py b/capa/features/extractors/script.py index d226c1627..e5a2bb9d3 100644 --- a/capa/features/extractors/script.py +++ b/capa/features/extractors/script.py @@ -1,4 +1,3 @@ -import os from typing import Tuple, Iterator from capa.features.common import OS, OS_ANY, ARCH_ANY, FORMAT_SCRIPT, Arch, Format, Feature, ScriptLanguage @@ -7,11 +6,13 @@ LANG_CS = "c_sharp" LANG_HTML = "html" LANG_JS = "javascript" +LANG_PY = "python" LANG_TEM = "embedded_template" EXT_ASPX = ("aspx", "aspx_") EXT_CS = ("cs", "cs_") EXT_HTML = ("html", "html_") +EXT_PY = ("py", "py_") def extract_arch() -> Iterator[Tuple[Feature, Address]]: @@ -37,4 +38,6 @@ def get_language_from_ext(path: str) -> str: return LANG_CS if path.endswith(EXT_HTML): return LANG_HTML + if path.endswith(EXT_PY): + return LANG_PY raise ValueError(f"{path} has an unrecognized or an unsupported extension.") diff --git a/capa/features/extractors/ts/build.py b/capa/features/extractors/ts/build.py index 0e78c2a92..2e73eaeea 100644 --- a/capa/features/extractors/ts/build.py +++ b/capa/features/extractors/ts/build.py @@ -6,8 +6,10 @@ "vendor/tree-sitter-embedded-template", "vendor/tree-sitter-html", "vendor/tree-sitter-javascript", + "vendor/tree-sitter-python", ] -def ts_build(): - 
Language.build_library(build_dir, languages) +class TSBuilder: + def __init__(self): + Language.build_library(build_dir, languages) diff --git a/capa/features/extractors/ts/engine.py b/capa/features/extractors/ts/engine.py index b8f760b4c..c4458b160 100644 --- a/capa/features/extractors/ts/engine.py +++ b/capa/features/extractors/ts/engine.py @@ -3,9 +3,8 @@ from tree_sitter import Node, Tree, Parser -import capa.features.extractors.ts.build from capa.features.address import FileOffsetRangeAddress -from capa.features.extractors.script import LANG_CS, LANG_JS, LANG_TEM, LANG_HTML +from capa.features.extractors.script import LANG_CS, LANG_JS, LANG_PY, LANG_TEM, LANG_HTML from capa.features.extractors.ts.query import ( BINDINGS, QueryBinding, @@ -13,7 +12,7 @@ ScriptQueryBinding, TemplateQueryBinding, ) -from capa.features.extractors.ts.tools import LANGUAGE_TOOLKITS, LanguageToolkit +from capa.features.extractors.ts.tools import LANGUAGE_TOOLKITS, BaseNamespace, CSharpNamespace, LanguageToolkit class TreeSitterBaseEngine: @@ -23,7 +22,6 @@ class TreeSitterBaseEngine: tree: Tree def __init__(self, language: str, buf: bytes): - capa.features.extractors.ts.build.ts_build() self.language = language self.query = BINDINGS[language] self.buf = buf @@ -51,21 +49,20 @@ class TreeSitterExtractorEngine(TreeSitterBaseEngine): query: ScriptQueryBinding language_toolkit: LanguageToolkit buf_offset: int - namespaces: set[str] + namespaces: set[BaseNamespace] def __init__( self, language: str, buf: bytes, buf_offset: int = 0, - additional_namespaces: set[str] = None, + additional_namespaces: set[BaseNamespace] = set(), ): super().__init__(language, buf) self.buf_offset = buf_offset self.language_toolkit = LANGUAGE_TOOLKITS[language] - self.namespaces = set(self.get_range(ns_node) for ns_node, _ in self.get_namespaces()) - if additional_namespaces: - self.namespaces = self.namespaces.union(additional_namespaces) + self.namespaces = set(self.get_processed_namespaces()) + 
self.namespaces = self.namespaces.union(additional_namespaces) def get_address(self, node: Node) -> FileOffsetRangeAddress: return FileOffsetRangeAddress(self.buf_offset + node.start_byte, self.buf_offset + node.end_byte) @@ -98,11 +95,19 @@ def get_integer_literals(self, node: Node) -> List[Tuple[Node, str]]: def get_namespaces(self, node: Node = None) -> List[Tuple[Node, str]]: return self.query.namespace.captures(node if node is not None else self.tree.root_node) + def get_processed_namespaces(self, node: Node = None) -> Iterator[BaseNamespace]: + for node, query_name in self.get_namespaces(node): + for namespace in self.language_toolkit.process_namespace(node, query_name, self.get_range): + yield namespace + def get_global_statements(self) -> List[Tuple[Node, str]]: return self.query.global_statement.captures(self.tree.root_node) - def get_direct_method_call(self, node: Node) -> Node: - return node.child_by_field_name(self.query.direct_method_call_field_name) + def get_direct_method_call(self, node: Node) -> Optional[Node]: + captures = self.query.direct_method_call.captures(node) + if captures: + return captures[0][0] + return None def is_object_creation_expression(self, node: Node) -> bool: captures = self.get_new_object_names(node) @@ -116,13 +121,13 @@ class TreeSitterTemplateEngine(TreeSitterBaseEngine): query: TemplateQueryBinding language_toolkit: LanguageToolkit embedded_language: str - namespaces: set[str] + namespaces: set[BaseNamespace] def __init__(self, buf: bytes): super().__init__(LANG_TEM, buf) self.embedded_language = self.identify_language() self.language_toolkit = LANGUAGE_TOOLKITS[self.embedded_language] - self.namespaces = set(name for _, name in self.get_namespaces()) + self.namespaces = set(self.get_namespaces()) def get_code_sections(self) -> List[Tuple[Node, str]]: return self.query.code.captures(self.tree.root_node) @@ -147,18 +152,16 @@ def identify_language(self) -> str: return LANG_CS return LANG_JS - def 
get_imported_namespaces(self) -> Iterator[Tuple[Node, str]]: + def get_imported_namespaces(self) -> Iterator[BaseNamespace]: for node, _ in self.get_code_sections(): if self.is_aspx_import_directive(node): namespace = self.get_aspx_namespace(node) if namespace is not None: - yield node, namespace + yield namespace - def get_namespaces(self) -> Iterator[Tuple[Optional[Node], str]]: - for namespace in self.language_toolkit.get_default_namespaces(True): - yield None, namespace - for node, namespace in self.get_imported_namespaces(): - yield node, namespace + def get_namespaces(self) -> Iterator[BaseNamespace]: + yield from self.language_toolkit.get_default_namespaces(True) + yield from self.get_imported_namespaces() def is_c_sharp(self, node: Node) -> bool: return bool( @@ -178,20 +181,20 @@ def is_aspx_import_directive(self, node: Node) -> bool: ) ) - def get_aspx_namespace(self, node: Node) -> Optional[str]: + def get_aspx_namespace(self, node: Node) -> Optional[BaseNamespace]: match = re.search( r'@\s*Import namespace="(.*?)"'.encode(), self.get_byte_range(node), re.IGNORECASE, ) - return match.group(1).decode() if match is not None else None + return CSharpNamespace(match.group(1).decode(), node) if match is not None else None class TreeSitterHTMLEngine(TreeSitterBaseEngine): query: HTMLQueryBinding - namespaces: set[str] + namespaces: set[BaseNamespace] - def __init__(self, buf: bytes, namespaces: set[str] = set()): + def __init__(self, buf: bytes, namespaces: set[BaseNamespace] = set()): super().__init__(LANG_HTML, buf) self.namespaces = namespaces diff --git a/capa/features/extractors/ts/extractor.py b/capa/features/extractors/ts/extractor.py index a86df8f62..e2a2cb59d 100644 --- a/capa/features/extractors/ts/extractor.py +++ b/capa/features/extractors/ts/extractor.py @@ -1,6 +1,4 @@ -from typing import List, Tuple, Union, Iterator, Optional - -from tree_sitter import Node +from typing import List, Tuple, Union, Iterator import capa.features.extractors.script 
import capa.features.extractors.ts.file @@ -10,6 +8,7 @@ from capa.features.common import Namespace from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress, FileOffsetRangeAddress from capa.features.extractors.script import LANG_TEM, LANG_HTML +from capa.features.extractors.ts.tools import BaseNamespace from capa.features.extractors.ts.engine import TreeSitterHTMLEngine, TreeSitterTemplateEngine, TreeSitterExtractorEngine from capa.features.extractors.ts.function import PSEUDO_MAIN, TSFunctionInner from capa.features.extractors.base_extractor import Feature, BBHandle, InsnHandle, FunctionHandle, FeatureExtractor @@ -43,16 +42,18 @@ def extract_code_from_template(self) -> List[TreeSitterExtractorEngine]: engines.extend(list(self.extract_code_from_html(section_buf, self.template_engine.namespaces))) return engines - def extract_code_from_html(self, buf: bytes, namespaces: set[str] = set()) -> List[TreeSitterExtractorEngine]: + def extract_code_from_html( + self, buf: bytes, namespaces: set[BaseNamespace] = set() + ) -> List[TreeSitterExtractorEngine]: return list(TreeSitterHTMLEngine(buf, namespaces).get_parsed_code_sections()) def get_base_address(self) -> Union[AbsoluteVirtualAddress, capa.features.address._NoAddress]: return NO_ADDRESS def extract_template_namespaces(self) -> Iterator[Tuple[Feature, Address]]: - for node, name in self.template_engine.get_namespaces(): - address = NO_ADDRESS if node is None else FileOffsetRangeAddress(node.start_byte, node.end_byte) - yield Namespace(name), address + for ns in self.template_engine.get_namespaces(): + address = NO_ADDRESS if ns.node is None else FileOffsetRangeAddress(ns.node.start_byte, ns.node.end_byte) + yield Namespace(ns.name), address def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]: yield from capa.features.extractors.ts.global_.extract_features() diff --git a/capa/features/extractors/ts/file.py b/capa/features/extractors/ts/file.py index 20cd3cbd0..ea0504fbe 100644 
--- a/capa/features/extractors/ts/file.py +++ b/capa/features/extractors/ts/file.py @@ -11,8 +11,8 @@ def extract_language(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Featur def extract_namespaces(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: - for node, _ in engine.get_namespaces(): - yield Namespace(engine.get_range(node)), engine.get_address(node) + for namespace in engine.get_processed_namespaces(): + yield Namespace(namespace.name), engine.get_address(namespace.node) def extract_features(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: diff --git a/capa/features/extractors/ts/function.py b/capa/features/extractors/ts/function.py index 89ab3c549..db2ca29ad 100644 --- a/capa/features/extractors/ts/function.py +++ b/capa/features/extractors/ts/function.py @@ -6,6 +6,7 @@ from capa.features.insn import API, Number, Property from capa.features.common import String, Feature from capa.features.address import Address +from capa.features.extractors.ts.tools import BaseNamespace from capa.features.extractors.ts.engine import TreeSitterExtractorEngine from capa.features.extractors.base_extractor import FunctionHandle @@ -41,11 +42,14 @@ def extract_integers(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterat yield Number(parsed_int), engine.get_address(node) -def get_imports(name: str, namespaces: set[str], engine: TreeSitterExtractorEngine) -> Iterator[str]: +def get_imports(name: str, namespaces: set[BaseNamespace], engine: TreeSitterExtractorEngine) -> Iterator[str]: if engine.language_toolkit.is_import(name): yield name for namespace in namespaces: - joined_name = engine.language_toolkit.join_names(namespace, name) + namespace_join_name = namespace.get_join_name() + if not namespace_join_name: + continue + joined_name = engine.language_toolkit.join_names(namespace_join_name, name) if engine.language_toolkit.is_import(joined_name): yield joined_name @@ -70,7 +74,7 @@ def extract_classes_(fn_node: 
Node, engine: TreeSitterExtractorEngine) -> Iterat def extract_properties_( - fn_node: Node, classes: set[str], engine: TreeSitterExtractorEngine + fn_node: Node, classes: set[BaseNamespace], engine: TreeSitterExtractorEngine ) -> Iterator[Tuple[Feature, Address]]: for node, property_name in get_properties(fn_node, engine): for name in get_imports(property_name, classes, engine): @@ -83,10 +87,11 @@ def extract_static_methods_(node: Node, engine: TreeSitterExtractorEngine) -> It def extract_regular_methods_( - node: Node, classes: set[str], engine: TreeSitterExtractorEngine + node: Node, classes: set[BaseNamespace], engine: TreeSitterExtractorEngine ) -> Iterator[Tuple[Feature, Address]]: - if engine.is_object_creation_expression(node): - node = engine.get_direct_method_call(node) + direct_method_call_node = engine.get_direct_method_call(node) + if direct_method_call_node is not None: + node = direct_method_call_node qualified_names = engine.language_toolkit.split_name(engine.get_range(node)) property_name = ( qualified_names[0] if len(qualified_names) == 1 else engine.language_toolkit.join_names(*qualified_names[1:]) @@ -96,14 +101,14 @@ def extract_regular_methods_( def extract_api(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: - classes = set(get_classes(fn_node, engine)) + classes = {engine.language_toolkit.create_namespace(cls) for cls in get_classes(fn_node, engine)} yield from extract_classes_(fn_node, engine) yield from extract_function_calls_(fn_node, classes, engine) yield from extract_properties_(fn_node, classes, engine) def extract_function_calls_( - fn_node: Node, classes: set[str], engine: TreeSitterExtractorEngine + fn_node: Node, classes: set[BaseNamespace], engine: TreeSitterExtractorEngine ) -> Iterator[Tuple[Feature, Address]]: for node, _ in engine.get_function_call_names(fn_node): yield from extract_static_methods_(node, engine) diff --git a/capa/features/extractors/ts/query.py 
b/capa/features/extractors/ts/query.py index d337a588e..c3d8d14cb 100644 --- a/capa/features/extractors/ts/query.py +++ b/capa/features/extractors/ts/query.py @@ -4,7 +4,7 @@ from tree_sitter.binding import Query import capa.features.extractors.ts.build -from capa.features.extractors.script import LANG_CS, LANG_JS, LANG_TEM, LANG_HTML +from capa.features.extractors.script import LANG_CS, LANG_JS, LANG_PY, LANG_TEM, LANG_HTML @dataclass @@ -17,8 +17,7 @@ class ScriptQueryBinding(QueryBinding): new_object_name: Query function_definition: Query function_definition_field_name: str - direct_method_call_field_name: str - object_creation_expression_field_name: str + direct_method_call: Query function_call_name: Query assigned_property_name: Query string_literal: Query @@ -51,8 +50,11 @@ def deserialize(language: str, binding: dict) -> dict: return deserialized_binding +capa.features.extractors.ts.build.TSBuilder() + TS_LANGUAGES: dict[str, Language] = { LANG_CS: Language(capa.features.extractors.ts.build.build_dir, LANG_CS), + LANG_PY: Language(capa.features.extractors.ts.build.build_dir, LANG_PY), LANG_TEM: Language(capa.features.extractors.ts.build.build_dir, LANG_TEM), LANG_HTML: Language(capa.features.extractors.ts.build.build_dir, LANG_HTML), LANG_JS: Language(capa.features.extractors.ts.build.build_dir, LANG_JS), @@ -73,11 +75,32 @@ def deserialize(language: str, binding: dict) -> dict: "integer_literal": "(integer_literal) @integer-literal", "namespace": "(using_directive [(identifier) @namespace (qualified_name) @namespace])", "global_statement": "(global_statement [(if_statement) @global-statement (expression_statement) @global-statement (local_declaration_statement) @global-statement])", + "direct_method_call": "(member_access_expression expression: (object_creation_expression) name: (identifier) @direct-method-call)", + }, + "field_name": { + "function_definition": "name", + }, + }, + ), + ), + LANG_PY: ScriptQueryBinding( + TS_LANGUAGES[LANG_PY], + 
**deserialize( + LANG_PY, + { + "query": { + "new_object_name": "(call function: [(attribute) @new-object (identifier) @new-object])", # Python makes no distinction between new object creation and a function call + "function_definition": "(function_definition) @function-definition", + "function_call_name": "(call function: [(attribute) @function-call (identifier) @function-call])", + "assigned_property_name": "(attribute attribute: (identifier) @property)", + "string_literal": "(string) @string-literal", + "integer_literal": "(integer) @integer-literal", + "namespace": "(import_from_statement) @import_from (import_statement) @import", + "global_statement": "(module [(if_statement) @global-statement (expression_statement) @global-statement])", + "direct_method_call": "(attribute object: (call) attribute: (identifier) @direct-method-call)", }, "field_name": { "function_definition": "name", - "direct_method_call": "name", - "object_creation_expression": "expression", }, }, ), diff --git a/capa/features/extractors/ts/signatures/cs.json b/capa/features/extractors/ts/signatures/cs.json index 67f79ba88..10654a892 100644 --- a/capa/features/extractors/ts/signatures/cs.json +++ b/capa/features/extractors/ts/signatures/cs.json @@ -1,61 +1,82 @@ -[ - "System.IO.DirectoryInfo", - "System.IO.Directory.CreateDirectory", - "System.IO.File.Delete", - "System.IO.File.Write", - "System.IO.File.GetAttributes", - "System.IO.File.GetCreationTime", - "System.IO.File.GetLastAccessTime", - "System.IO.File.GetLastWriteTime", - "System.IO.File.ReadAllBytes", - "System.IO.File.ReadAllBytes", - "System.IO.File.ReadAllBytesAsync", - "System.IO.File.ReadAllLines", - "System.IO.File.ReadAllLinesAsync", - "System.IO.File.ReadAllText", - "System.IO.File.ReadAllTextAsync", - "System.IO.File.ReadLines", - "System.IO.File.ReadLinesAsync", - "System.IO.File.SetCreationTime", - "System.IO.File.SetLastAccessTime", - "System.IO.File.SetLastWriteTime", - "System.IO.File.WriteAllBytes", - 
"System.IO.File.WriteAllBytes", - "System.IO.File.WriteAllBytesAsync", - "System.IO.File.WriteAllLines", - "System.IO.File.WriteAllLinesAsync", - "System.IO.File.WriteAllText", - "System.IO.File.WriteAllTextAsync", - "System.IO.File.WriteLines", - "System.IO.File.WriteLinesAsync", - "System.IO.Path.GetTempPath", - "System.Convert.ToBase64String", - "System.Convert.FromBase64String", - "System.Data.SqlClient.SqlCommand", - "System.Data.SqlClient.SqlConnection", - "System.Data.SqlClient.SqlConnection.Open", - "System.Data.SqlClient.SqlDataAdapter", - "System.Diagnostics.Process", - "System.Diagnostics.ProcessStartInfo", - "System.Diagnostics.ProcessStartInfo.FileName", - "System.Diagnostics.ProcessStartInfo.Arguments", - "System.Diagnostics.ProcessStartInfo.RedirectStandardInput", - "System.Diagnostics.ProcessStartInfo.RedirectStandardOutput", - "System.Diagnostics.ProcessStartInfo.UseShellExecute", - "System.Diagnostics.ProcessStartInfo.CreateNoWindow", - "System.Diagnostics.Process.StartInfo.FileName", - "System.Diagnostics.Process.StartInfo.Arguments", - "System.Diagnostics.Process.StartInfo.RedirectStandardInput", - "System.Diagnostics.Process.StartInfo.RedirectStandardOutput", - "System.Diagnostics.Process.StartInfo.UseShellExecute", - "System.Diagnostics.Process.StartInfo.CreateNoWindow", - "System.Diagnostics.Process.Start", - "System.Security.Cryptography.RijndaelManaged", - "System.Security.Cryptography.CryptoStream", - "System.Security.Cryptography.SHA1", - "System.Security.Cryptography.SHA1CryptoServiceProvider", - "System.Security.Cryptography.SHA1CryptoServiceProvider.ComputeHash", - "System.Security.Cryptography.SHA256", - "System.Security.Cryptography.SHA256CryptoServiceProvider", - "System.Security.Cryptography.SHA256CryptoServiceProvider.ComputeHash" -] \ No newline at end of file +{ + "namespaces": + [ + "System.IO.DirectoryInfo", + "System.IO.Directory.CreateDirectory", + "System.IO.File.Delete", + "System.IO.File.Write", + 
"System.IO.File.GetAttributes", + "System.IO.File.GetCreationTime", + "System.IO.File.GetLastAccessTime", + "System.IO.File.GetLastWriteTime", + "System.IO.File.ReadAllBytes", + "System.IO.File.ReadAllBytes", + "System.IO.File.ReadAllBytesAsync", + "System.IO.File.ReadAllLines", + "System.IO.File.ReadAllLinesAsync", + "System.IO.File.ReadAllText", + "System.IO.File.ReadAllTextAsync", + "System.IO.File.ReadLines", + "System.IO.File.ReadLinesAsync", + "System.IO.File.SetCreationTime", + "System.IO.File.SetLastAccessTime", + "System.IO.File.SetLastWriteTime", + "System.IO.File.WriteAllBytes", + "System.IO.File.WriteAllBytes", + "System.IO.File.WriteAllBytesAsync", + "System.IO.File.WriteAllLines", + "System.IO.File.WriteAllLinesAsync", + "System.IO.File.WriteAllText", + "System.IO.File.WriteAllTextAsync", + "System.IO.File.WriteLines", + "System.IO.File.WriteLinesAsync", + "System.IO.Path.GetTempPath", + "System.Convert.ToBase64String", + "System.Convert.FromBase64String", + "System.Data.SqlClient.SqlCommand", + "System.Data.SqlClient.SqlConnection", + "System.Data.SqlClient.SqlConnection.Open", + "System.Data.SqlClient.SqlDataAdapter", + "System.Diagnostics.Process", + "System.Diagnostics.ProcessStartInfo", + "System.Diagnostics.ProcessStartInfo.FileName", + "System.Diagnostics.ProcessStartInfo.Arguments", + "System.Diagnostics.ProcessStartInfo.RedirectStandardInput", + "System.Diagnostics.ProcessStartInfo.RedirectStandardOutput", + "System.Diagnostics.ProcessStartInfo.UseShellExecute", + "System.Diagnostics.ProcessStartInfo.CreateNoWindow", + "System.Diagnostics.Process.StartInfo.FileName", + "System.Diagnostics.Process.StartInfo.Arguments", + "System.Diagnostics.Process.StartInfo.RedirectStandardInput", + "System.Diagnostics.Process.StartInfo.RedirectStandardOutput", + "System.Diagnostics.Process.StartInfo.UseShellExecute", + "System.Diagnostics.Process.StartInfo.CreateNoWindow", + "System.Diagnostics.Process.Start", + 
"System.Security.Cryptography.RijndaelManaged", + "System.Security.Cryptography.CryptoStream", + "System.Security.Cryptography.SHA1", + "System.Security.Cryptography.SHA1CryptoServiceProvider", + "System.Security.Cryptography.SHA1CryptoServiceProvider.ComputeHash", + "System.Security.Cryptography.SHA256", + "System.Security.Cryptography.SHA256CryptoServiceProvider", + "System.Security.Cryptography.SHA256CryptoServiceProvider.ComputeHash" + ], + "aspx_default_namespaces": + [ + "System", + "System.Collections", + "System.Collections.Specialized", + "System.Configuration", + "System.Text", + "System.Text.RegularExpressions", + "System.Web", + "System.Web.Caching", + "System.Web.Profile", + "System.Web.Security", + "System.Web.SessionState", + "System.Web.UI", + "System.Web.UI.HtmlControls", + "System.Web.UI.WebControls", + "System.Web.UI.WebControls.WebParts" + ] +} \ No newline at end of file diff --git a/capa/features/extractors/ts/signatures/py.json b/capa/features/extractors/ts/signatures/py.json new file mode 100644 index 000000000..da586b0d8 --- /dev/null +++ b/capa/features/extractors/ts/signatures/py.json @@ -0,0 +1,20 @@ + +{ + "namespaces": [ + "socket", + "socket.error", + "subprocess.Popen", + "subprocess.PIPE", + "urllib2.urlopen", + "urllib2.Request", + "base64.encodestring", + "base64.b64encode", + "base64.b64decode", + "os.chmod", + "os.chdir", + "os.path.expanduser", + "os.path.dirname", + "platform.mac_ver", + "time.sleep" + ] +} \ No newline at end of file diff --git a/capa/features/extractors/ts/tools.py b/capa/features/extractors/ts/tools.py index 5ea48481f..9033139b7 100644 --- a/capa/features/extractors/ts/tools.py +++ b/capa/features/extractors/ts/tools.py @@ -1,23 +1,58 @@ import abc import json import importlib.resources -from typing import List, Optional +from typing import Dict, List, Callable, Iterator, Optional +from dataclasses import dataclass + +from tree_sitter import Node import capa.features.extractors.ts.signatures -from 
capa.features.extractors.script import LANG_CS +from capa.features.extractors.script import LANG_CS, LANG_PY + + +@dataclass(frozen=True) +class BaseNamespace(abc.ABC): + name: str + node: Node = None + alias: str = "" + + def __hash__(self): + return hash(self.name) + + def get_join_name(self) -> Optional[str]: + raise NotImplementedError() + + +class CSharpNamespace(BaseNamespace): + def get_join_name(self) -> Optional[str]: + return self.name + + +class PythonNamespace(BaseNamespace): + def get_join_name(self) -> Optional[str]: + toolkit = LANGUAGE_TOOLKITS[LANG_CS] + qualified_names = toolkit.split_name(self.name) + if len(qualified_names) < 2: + return None + return toolkit.join_names(*qualified_names[:-1]) class LanguageToolkit: - import_signatures: set + import_signatures: Dict[str, set[str]] def __init__(self, signature_file: str): self.import_signatures = self.load_import_signatures(signature_file) - def load_import_signatures(self, signature_file: str) -> set: - return set(json.loads(importlib.resources.read_text(capa.features.extractors.ts.signatures, signature_file))) + def load_import_signatures(self, signature_file: str) -> Dict[str, set[str]]: + signatures = json.loads(importlib.resources.read_text(capa.features.extractors.ts.signatures, signature_file)) + return {category: set(namespaces) for category, namespaces in signatures.items()} def is_import(self, import_: str) -> bool: - return import_ in self.import_signatures + return import_ in self.import_signatures["namespaces"] + + @abc.abstractmethod + def create_namespace(self, name: str, node: Node = None, alias: str = "") -> BaseNamespace: + raise NotImplementedError() @abc.abstractmethod def join_names(self, *args: str) -> str: @@ -33,14 +68,18 @@ def format_imported_class(self, name: str) -> str: @abc.abstractmethod def format_imported_function(self, name: str) -> str: - raise NotImplementedError + raise NotImplementedError() @abc.abstractmethod def format_imported_property(self, name: str) -> 
str: - raise NotImplementedError + raise NotImplementedError() @abc.abstractmethod - def get_default_namespaces(self, embedded: bool) -> set: + def process_namespace(self, node: Node, query_name: str, get_range: Callable) -> Iterator[BaseNamespace]: + raise NotImplementedError() + + @abc.abstractmethod + def get_default_namespaces(self, embedded: bool) -> set[BaseNamespace]: raise NotImplementedError() @abc.abstractmethod @@ -62,6 +101,9 @@ def split_name(self, name: str) -> List[str]: def format_imported_class(self, name: str) -> str: return name + def create_namespace(self, name: str, node: Node = None, alias: str = "") -> BaseNamespace: + return CSharpNamespace(name, node, alias) + def format_imported_function(self, name: str) -> str: qualified_names = self.split_name(name) if len(qualified_names) < 2: @@ -82,25 +124,12 @@ def format_imported_property(self, name: str) -> str: namespace, classname, propertyname = qualified_names[:-2], qualified_names[-2], qualified_names[-1] return f"{'.'.join(namespace)}.{classname}::{propertyname}" - def get_default_namespaces(self, embedded: bool) -> set: + def process_namespace(self, node: Node, query_name: str, get_range: Callable) -> Iterator[BaseNamespace]: + yield CSharpNamespace(get_range(node), node, "") + + def get_default_namespaces(self, embedded: bool) -> set[BaseNamespace]: if embedded: - return { - "System", - "System.Collections", - "System.Collections.Specialized", - "System.Configuration", - "System.Text", - "System.Text.RegularExpressions", - "System.Web", - "System.Web.Caching", - "System.Web.Profile", - "System.Web.Security", - "System.Web.SessionState", - "System.Web.UI", - "System.Web.UI.HtmlControls", - "System.Web.UI.WebControls", - "System.Web.UI.WebControls.WebParts", - } + return {CSharpNamespace(name) for name in self.import_signatures["aspx_default_namespaces"]} return set() def parse_integer(self, integer: str) -> Optional[int]: @@ -117,4 +146,87 @@ def parse_string(self, string: str) -> 
Optional[str]: return string.strip('"') -LANGUAGE_TOOLKITS: dict[str, LanguageToolkit] = {LANG_CS: CSharpToolkit("cs.json")} +class PythonToolkit(LanguageToolkit): + def join_names(self, *args: str) -> str: + return ".".join(args) + + def split_name(self, name: str) -> List[str]: + return name.split(".") + + def format_imported_class(self, name: str) -> str: + return name + + def create_namespace(self, name: str, node: Node = None, alias: str = "") -> BaseNamespace: + return PythonNamespace(name, node, alias) + + def format_imported_function(self, name: str) -> str: + qualified_names = self.split_name(name) + if len(qualified_names) < 2: + raise ValueError(f"function {name} does not have an associated class or namespace") + if len(qualified_names) == 2: + classname, functionname = qualified_names[0], qualified_names[1] + return f"{classname}::{functionname}" + namespace, classname, functionname = qualified_names[:-2], qualified_names[-2], qualified_names[-1] + return f"{'.'.join(namespace)}.{classname}::{functionname}" + + def format_imported_property(self, name: str) -> str: + qualified_names = self.split_name(name) + if len(qualified_names) < 2: + raise ValueError(f"property {name} does not have an associated class") + if len(qualified_names) == 2: + classname, propertyname = qualified_names[0], qualified_names[1] + return f"{classname}::{propertyname}" + namespace, classname, propertyname = qualified_names[:-2], qualified_names[-2], qualified_names[-1] + return f"{'.'.join(namespace)}.{classname}::{propertyname}" + + def get_import_name(self, name: str, module_name: Optional[str] = None) -> str: + return self.join_names(module_name, name) if module_name else name + + def process_simple_import( + self, node: Node, get_range: Callable, module_name: Optional[str] = None + ) -> BaseNamespace: + return PythonNamespace(self.get_import_name(get_range(node), module_name), node) + + def process_aliased_import( + self, node: Node, get_range: Callable, module_name: 
Optional[str] = None + ) -> BaseNamespace: + name = self.get_import_name(get_range(node.get_child_by_field_name("name")), module_name) + alias = get_range(node.get_child_by_field_name("alias")) + return PythonNamespace(name, node, alias) + + def process_imports( + self, nodes: List[Node], get_range: Callable, module_name: Optional[str] = None + ) -> Iterator[BaseNamespace]: + for import_node in nodes: + if import_node.type == "dotted_name": + yield self.process_simple_import(import_node, get_range, module_name) + elif import_node.type == "aliased_import": + yield self.process_aliased_import(import_node, get_range, module_name) + + def process_namespace(self, node: Node, query_name: str, get_range: Callable) -> Iterator[BaseNamespace]: + import_nodes = [child_node for child_node in node.children if child_node.is_named] + if query_name == "import_from": + yield from self.process_imports(import_nodes[1:], get_range, get_range(import_nodes[0])) + elif query_name == "import": + yield from self.process_imports(import_nodes, get_range) + + def get_default_namespaces(self, embedded: bool) -> set[BaseNamespace]: + return set() + + def parse_integer(self, integer: str) -> Optional[int]: + try: + if integer.startswith(("0b, 0B")): + return int(integer, 2) + if integer.startswith(("0o, 0O")): + return int(integer, 8) + if integer.startswith(("0x", "0X")): + return int(integer, 16) + return int(integer) + except: + return None + + def parse_string(self, string: str) -> Optional[str]: + return string.strip('"') + + +LANGUAGE_TOOLKITS: dict[str, LanguageToolkit] = {LANG_CS: CSharpToolkit("cs.json"), LANG_PY: PythonToolkit("py.json")} diff --git a/tests/fixtures.py b/tests/fixtures.py index f5777843d..09a5e1d16 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -47,6 +47,7 @@ SOURCE_DIR = os.path.join(CD, "data", "source") ASPX_DIR = os.path.join(SOURCE_DIR, "aspx") CS_DIR = os.path.join(SOURCE_DIR, "cs") +PY_DIR = os.path.join(SOURCE_DIR, "py") @contextlib.contextmanager 
@@ -334,6 +335,10 @@ def get_data_path_by_name(name): "aspx_d460ca": os.path.join(ASPX_DIR, "d460cae7d34c51059ef57c5aadb3de099469efbac5fffcf76d0528a511192a28.aspx_"), } +PY_DATA_PATH_BY_NAME = { + "py_7f9cd1": os.path.join(PY_DIR, "7f9cd1eedf0a9088fc3e07a275d04dceadcf0a5cd425a17e9666b63685d3a37e.py_") +} + def get_sample_md5_by_name(name): """used by IDA tests to ensure the correct IDB is loaded""" @@ -402,6 +407,8 @@ def sample(request): def resolve_sample_ts(sample): if sample.startswith("cs_"): return get_data_path_by_name(sample) + if sample.startswith("py_"): + return PY_DATA_PATH_BY_NAME[sample] if sample.startswith("aspx_"): try: return ASPX_DATA_PATH_BY_NAME[sample] @@ -1139,3 +1146,8 @@ def aspx_b75f16_template_engine(): @pytest.fixture def aspx_d460ca_template_engine(): return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_d460ca"]) + + +@pytest.fixture +def py_7f9cd1_template_engine(): + return get_ts_extractor_engine() diff --git a/tests/test_ts.py b/tests/test_ts.py index c2d514e0f..73a8ca037 100644 --- a/tests/test_ts.py +++ b/tests/test_ts.py @@ -19,7 +19,7 @@ ScriptLanguage, ) from capa.features.address import FileOffsetRangeAddress -from capa.features.extractors.script import LANG_CS, LANG_JS, LANG_TEM, LANG_HTML +from capa.features.extractors.script import LANG_CS, LANG_JS, LANG_PY, LANG_TEM, LANG_HTML from capa.features.extractors.ts.query import QueryBinding, HTMLQueryBinding, TemplateQueryBinding from capa.features.extractors.ts.tools import LANGUAGE_TOOLKITS from capa.features.extractors.ts.engine import ( @@ -288,14 +288,15 @@ def do_test_ts_template_engine_get_template_namespaces( engine: TreeSitterTemplateEngine, expected_language: str, expected: List[str] ): default_namespaces = LANGUAGE_TOOLKITS[expected_language].get_default_namespaces(True) - template_namespaces = {name for _, name in engine.get_namespaces()} + template_namespaces = set(engine.get_namespaces()) assert default_namespaces.issubset(template_namespaces) 
assert len(list(engine.get_imported_namespaces())) == len(expected) - for (node, namespace), expected_namespace in zip(list(engine.get_imported_namespaces()), expected): - assert isinstance(node, Node) - assert engine.is_aspx_import_directive(node) == True - assert engine.get_aspx_namespace(node) == expected_namespace - assert namespace == expected_namespace + for namespace, expected_namespace in zip(list(engine.get_imported_namespaces()), expected): + assert isinstance(namespace.node, Node) + assert engine.is_aspx_import_directive(namespace.node) == True + aspx_namespace = engine.get_aspx_namespace(namespace.node) + assert aspx_namespace is not None and aspx_namespace.name == expected_namespace + assert namespace.name == expected_namespace def do_test_ts_template_engine_get_code_sections(engine: TreeSitterTemplateEngine, expected: List[Tuple[int, int]]): @@ -914,9 +915,9 @@ def test_ts_template_engine(request: pytest.FixtureRequest, engine_str: str, exp do_test_ts_template_engine_get_parsed_code_sections(engine, expected["language"], expected["code sections"]) do_test_ts_template_engine_get_content_sections(engine, expected["content sections"]) for expected_start_byte, expected_end_byte in expected["content sections"]: - template_namespaces = list(engine.get_namespaces()) - additional_namespaces = set(name for _, name in template_namespaces) - html_engine = TreeSitterHTMLEngine(engine.buf[expected_start_byte:expected_end_byte], additional_namespaces) + html_engine = TreeSitterHTMLEngine( + engine.buf[expected_start_byte:expected_end_byte], set(engine.get_namespaces()) + ) do_test_ts_html_engine_init(html_engine) @@ -1069,6 +1070,22 @@ def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine): ("aspx_15eed4", "global", Arch(ARCH_ANY), True), ("aspx_b75f16", "global", Arch(ARCH_ANY), True), ("aspx_d460ca", "global", Arch(ARCH_ANY), True), + ("py_7f9cd1", "global", Arch(ARCH_ANY), True), + ("py_7f9cd1", "global", OS(OS_ANY), True), + ("py_7f9cd1", "file", 
Format(FORMAT_SCRIPT), True), + ("py_7f9cd1", "file", ScriptLanguage(LANG_PY), True), + ("py_7f9cd1", "file", Namespace("socket"), True), + ("py_7f9cd1", "file", Namespace("threading.Timer"), True), + ("py_7f9cd1", "file", Namespace("threading.Timer"), True), + ("py_7f9cd1", "function=icloud_phish", API("subprocess.Popen"), True), + ("py_7f9cd1", "function=icloud_phish", API("urllib2.Request"), True), + ("py_7f9cd1", "function=icloud_phish", API("base64.encodestring"), True), + ("py_7f9cd1", "function=icloud_phish", API("urllib2.urlopen"), True), + ("py_7f9cd1", "function=get_itunes_backups", String("IMEI"), True), + ("py_7f9cd1", "function=PSEUDO MAIN", String("[I] "), True), + ("py_7f9cd1", "function=PSEUDO MAIN", Substring("[!]"), True), + ("py_7f9cd1", "function=get_itunes_backups", Number(0), True), + ("py_7f9cd1", "function=get_itunes_backups", Number(1), True), ] ) From 32dc5ff4216eae77d149304d470a88d8f046d277 Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Fri, 29 Jul 2022 15:32:58 -0400 Subject: [PATCH 44/51] Added more python test cases; fixed a number of python bugs; further refactored language toolkit code; added extraction of global constants. 
--- capa/features/extractors/ts/engine.py | 25 +- capa/features/extractors/ts/function.py | 14 +- capa/features/extractors/ts/query.py | 9 +- .../features/extractors/ts/signatures/cs.json | 3 +- .../features/extractors/ts/signatures/py.json | 22 +- capa/features/extractors/ts/tools.py | 224 +++++++++--------- tests/fixtures.py | 54 +++-- tests/test_ts.py | 49 +++- 8 files changed, 235 insertions(+), 165 deletions(-) diff --git a/capa/features/extractors/ts/engine.py b/capa/features/extractors/ts/engine.py index c4458b160..0908d8823 100644 --- a/capa/features/extractors/ts/engine.py +++ b/capa/features/extractors/ts/engine.py @@ -4,7 +4,7 @@ from tree_sitter import Node, Tree, Parser from capa.features.address import FileOffsetRangeAddress -from capa.features.extractors.script import LANG_CS, LANG_JS, LANG_PY, LANG_TEM, LANG_HTML +from capa.features.extractors.script import LANG_CS, LANG_JS, LANG_TEM, LANG_HTML from capa.features.extractors.ts.query import ( BINDINGS, QueryBinding, @@ -70,8 +70,14 @@ def get_address(self, node: Node) -> FileOffsetRangeAddress: def get_new_object_names(self, node: Node) -> List[Tuple[Node, str]]: return self.query.new_object_name.captures(node) - def get_assigned_property_names(self, node: Node) -> List[Tuple[Node, str]]: - return self.query.assigned_property_name.captures(node) + def get_property_names(self, node: Node) -> List[Tuple[Node, str]]: + return self.query.property_name.captures(node) + + def get_processed_property_names(self, node: Node) -> Iterator[Tuple[Node, str]]: + for pt_node, _ in self.get_property_names(node): + pt_name = self.language_toolkit.process_property(pt_node, self.get_range(pt_node)) + if pt_name: + yield pt_node, pt_name def get_function_definitions(self, node: Node = None) -> List[Tuple[Node, str]]: return self.query.function_definition.captures(node if node is not None else self.tree.root_node) @@ -86,6 +92,15 @@ def get_function_definition_names(self, node: Node) -> Iterator[Node]: def 
get_function_call_names(self, node: Node) -> List[Tuple[Node, str]]: return self.query.function_call_name.captures(node) + def get_imported_constants(self, node: Node) -> List[Tuple[Node, str]]: + return self.query.imported_constant_name.captures(node) + + def get_processed_imported_constants(self, node: Node) -> Iterator[Tuple[Node, str]]: + for const_node, _ in self.get_imported_constants(node): + const_name = self.language_toolkit.process_imported_constant(const_node, self.get_range(const_node)) + if const_name: + yield const_node, const_name + def get_string_literals(self, node: Node) -> List[Tuple[Node, str]]: return self.query.string_literal.captures(node) @@ -96,8 +111,8 @@ def get_namespaces(self, node: Node = None) -> List[Tuple[Node, str]]: return self.query.namespace.captures(node if node is not None else self.tree.root_node) def get_processed_namespaces(self, node: Node = None) -> Iterator[BaseNamespace]: - for node, query_name in self.get_namespaces(node): - for namespace in self.language_toolkit.process_namespace(node, query_name, self.get_range): + for ns_node, query_name in self.get_namespaces(node): + for namespace in self.language_toolkit.process_namespace(ns_node, query_name, self.get_range): yield namespace def get_global_statements(self) -> List[Tuple[Node, str]]: diff --git a/capa/features/extractors/ts/function.py b/capa/features/extractors/ts/function.py index db2ca29ad..e8dd37a0f 100644 --- a/capa/features/extractors/ts/function.py +++ b/capa/features/extractors/ts/function.py @@ -43,6 +43,8 @@ def extract_integers(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterat def get_imports(name: str, namespaces: set[BaseNamespace], engine: TreeSitterExtractorEngine) -> Iterator[str]: + if engine.language_toolkit.is_builtin(name): + yield name if engine.language_toolkit.is_import(name): yield name for namespace in namespaces: @@ -55,10 +57,7 @@ def get_imports(name: str, namespaces: set[BaseNamespace], engine: TreeSitterExt def 
get_properties(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Node, str]]: - for node, _ in engine.get_assigned_property_names(fn_node): - qualified_names = engine.language_toolkit.split_name(engine.get_range(node)) - if len(qualified_names) > 1: - yield node, engine.language_toolkit.join_names(*qualified_names[1:]) + yield from engine.get_processed_property_names(fn_node) def get_classes(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[str]: @@ -103,6 +102,7 @@ def extract_regular_methods_( def extract_api(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: classes = {engine.language_toolkit.create_namespace(cls) for cls in get_classes(fn_node, engine)} yield from extract_classes_(fn_node, engine) + yield from extract_imported_constants_(fn_node, engine) yield from extract_function_calls_(fn_node, classes, engine) yield from extract_properties_(fn_node, classes, engine) @@ -115,6 +115,12 @@ def extract_function_calls_( yield from extract_regular_methods_(node, classes, engine) +def extract_imported_constants_(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for node, _ in engine.get_processed_imported_constants(fn_node): + for name in get_imports(engine.get_range(node), engine.namespaces, engine): + yield API(engine.language_toolkit.format_imported_constant(name)), engine.get_address(node) + + def extract_pseudo_main_features(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: for node, _ in engine.get_global_statements(): yield from extract_features_(node, engine) diff --git a/capa/features/extractors/ts/query.py b/capa/features/extractors/ts/query.py index c3d8d14cb..baf837806 100644 --- a/capa/features/extractors/ts/query.py +++ b/capa/features/extractors/ts/query.py @@ -19,7 +19,8 @@ class ScriptQueryBinding(QueryBinding): function_definition_field_name: str direct_method_call: Query function_call_name: Query - 
assigned_property_name: Query + property_name: Query + imported_constant_name: Query string_literal: Query integer_literal: Query namespace: Query @@ -70,7 +71,8 @@ def deserialize(language: str, binding: dict) -> dict: "new_object_name": "(object_creation_expression type: [(qualified_name) @new-object (identifier) @new-object])", "function_definition": "(local_function_statement) @function-definition", "function_call_name": "(invocation_expression function: [(member_access_expression name: (identifier)) @function-call (identifier) @function-call])", - "assigned_property_name": "(assignment_expression left: (member_access_expression) @property)", + "property_name": "(member_access_expression) @property", + "imported_constant_name": "(member_access_expression) @constant (equals_value_clause (identifier) @constant)", "string_literal": "(string_literal) @string-literal", "integer_literal": "(integer_literal) @integer-literal", "namespace": "(using_directive [(identifier) @namespace (qualified_name) @namespace])", @@ -92,7 +94,8 @@ def deserialize(language: str, binding: dict) -> dict: "new_object_name": "(call function: [(attribute) @new-object (identifier) @new-object])", # Python makes no distinction between new object creation and a function call "function_definition": "(function_definition) @function-definition", "function_call_name": "(call function: [(attribute) @function-call (identifier) @function-call])", - "assigned_property_name": "(attribute attribute: (identifier) @property)", + "property_name": "(attribute) @property", + "imported_constant_name": "(attribute) @constant (expression_statement (assignment right: (identifier) @constant))", "string_literal": "(string) @string-literal", "integer_literal": "(integer) @integer-literal", "namespace": "(import_from_statement) @import_from (import_statement) @import", diff --git a/capa/features/extractors/ts/signatures/cs.json b/capa/features/extractors/ts/signatures/cs.json index 10654a892..f3d5f0530 100644 --- 
a/capa/features/extractors/ts/signatures/cs.json +++ b/capa/features/extractors/ts/signatures/cs.json @@ -1,5 +1,5 @@ { - "namespaces": + "imports": [ "System.IO.DirectoryInfo", "System.IO.Directory.CreateDirectory", @@ -61,6 +61,7 @@ "System.Security.Cryptography.SHA256CryptoServiceProvider", "System.Security.Cryptography.SHA256CryptoServiceProvider.ComputeHash" ], + "builtins": [], "aspx_default_namespaces": [ "System", diff --git a/capa/features/extractors/ts/signatures/py.json b/capa/features/extractors/ts/signatures/py.json index da586b0d8..21324b558 100644 --- a/capa/features/extractors/ts/signatures/py.json +++ b/capa/features/extractors/ts/signatures/py.json @@ -1,6 +1,5 @@ - { - "namespaces": [ + "imports": [ "socket", "socket.error", "subprocess.Popen", @@ -10,11 +9,26 @@ "base64.encodestring", "base64.b64encode", "base64.b64decode", - "os.chmod", "os.chdir", + "os.chmod", + "os.environ", + "os.getcwd", + "os.popen", + "os.remove", "os.path.expanduser", "os.path.dirname", "platform.mac_ver", - "time.sleep" + "shutil.copytree", + "time.sleep", + "win32com.client.Dispatch", + "win32con.FILE_ATTRIBUTE_HIDDEN", + "win32con.FILE_ATTRIBUTE_SYSTEM", + "win32api.SetFileAttributes" + + ], + "builtins": [ + "eval", + "exec", + "open" ] } \ No newline at end of file diff --git a/capa/features/extractors/ts/tools.py b/capa/features/extractors/ts/tools.py index 9033139b7..c461ddbf6 100644 --- a/capa/features/extractors/ts/tools.py +++ b/capa/features/extractors/ts/tools.py @@ -1,7 +1,7 @@ import abc import json import importlib.resources -from typing import Dict, List, Callable, Iterator, Optional +from typing import Dict, List, Tuple, Union, Callable, Iterator, Optional from dataclasses import dataclass from tree_sitter import Node @@ -12,6 +12,8 @@ @dataclass(frozen=True) class BaseNamespace(abc.ABC): + """Abstract class for internal representation of the namespace concept, including aliases.""" + name: str node: Node = None alias: str = "" @@ -25,11 +27,15 @@ def 
get_join_name(self) -> Optional[str]: class CSharpNamespace(BaseNamespace): def get_join_name(self) -> Optional[str]: + """using System; Diagnostics.ProcessStartInfo => System.Diagnostics.ProcessStartInfo""" return self.name -class PythonNamespace(BaseNamespace): +class PythonImport(BaseNamespace): def get_join_name(self) -> Optional[str]: + """import subprocess ; subprocess.Popen => subprocess.Popen + from threading import Timer (threading.Timer) => Timer + """ toolkit = LANGUAGE_TOOLKITS[LANG_CS] qualified_names = toolkit.split_name(self.name) if len(qualified_names) < 2: @@ -38,92 +44,116 @@ def get_join_name(self) -> Optional[str]: class LanguageToolkit: + signature_file: str import_signatures: Dict[str, set[str]] + method_call_query_type: str + property_query_type: str + string_delimiters: str + integer_prefixes: List[ + Tuple[Union[str, Tuple[str, ...]], int] + ] # Tends to indicate a number system, e.g. (("0x", "0X"), 16) + integer_suffixes: Tuple[str, ...] # Tends to indicate unsigned (100u) or long (100l) integer literal - def __init__(self, signature_file: str): - self.import_signatures = self.load_import_signatures(signature_file) + def __init__(self): + self.import_signatures = self.load_import_signatures(self.signature_file) def load_import_signatures(self, signature_file: str) -> Dict[str, set[str]]: signatures = json.loads(importlib.resources.read_text(capa.features.extractors.ts.signatures, signature_file)) return {category: set(namespaces) for category, namespaces in signatures.items()} def is_import(self, import_: str) -> bool: - return import_ in self.import_signatures["namespaces"] + return import_ in self.import_signatures["imports"] - @abc.abstractmethod - def create_namespace(self, name: str, node: Node = None, alias: str = "") -> BaseNamespace: - raise NotImplementedError() + def is_builtin(self, func: str) -> bool: + return func in self.import_signatures["builtins"] - @abc.abstractmethod def join_names(self, *args: str) -> str: - raise 
NotImplementedError() + return ".".join(args) - @abc.abstractmethod def split_name(self, name: str) -> List[str]: - raise NotImplementedError() + return name.split(".") + + def process_property(self, node: Node, name: str) -> Optional[str]: + if self.is_method_call(node): # yield only p.StartInfo but not p.Start() + return None + if self.is_recursive_property(node): # yield only Current.Server.ClearError but not Current.Server and Current + return None + return self.join_names(*self.split_name(name)[1:]) + + def process_imported_constant(self, node: Node, name: str) -> Optional[str]: + if self.is_method_call(node): # yield only ssl.CERT_NONE and not ssl.wrap_socket() + return None + if self.is_recursive_property(node): # yield foo.foo.bar and not foo.bar or bar + return None + return name - @abc.abstractmethod def format_imported_class(self, name: str) -> str: - raise NotImplementedError() + return name + + def format_imported_class_members(self, name: str) -> str: + qualified_names = self.split_name(name) + if len(qualified_names) < 2: + raise ValueError(f"{name} does not have an associated class or namespace") + if len(qualified_names) == 2: + classname, membername = qualified_names[0], qualified_names[1] + return f"{classname}::{membername}" + namespace, classname, membername = qualified_names[:-2], qualified_names[-2], qualified_names[-1] + return f"{'.'.join(namespace)}.{classname}::{membername}" - @abc.abstractmethod def format_imported_function(self, name: str) -> str: - raise NotImplementedError() + return self.format_imported_class_members(name) - @abc.abstractmethod def format_imported_property(self, name: str) -> str: - raise NotImplementedError() + return self.format_imported_class_members(name) - @abc.abstractmethod - def process_namespace(self, node: Node, query_name: str, get_range: Callable) -> Iterator[BaseNamespace]: - raise NotImplementedError() + def format_imported_constant(self, name: str) -> str: + return 
self.format_imported_class_members(name) + + def parse_integer(self, integer: str) -> Optional[int]: + for suffix in self.integer_suffixes: + if integer.endswith(suffix): + integer = integer[:-1] + try: + for prefix, base in self.integer_prefixes: + if integer.startswith(prefix): + return int(integer, base) + return int(integer) + except: + return None + + def parse_string(self, string: str) -> str: + return string.strip(self.string_delimiters) + + def is_method_call(self, node: Node) -> bool: + return node.parent.type == self.method_call_query_type + + def is_recursive_property(self, node: Node) -> bool: + return node.parent.type == self.property_query_type @abc.abstractmethod - def get_default_namespaces(self, embedded: bool) -> set[BaseNamespace]: + def create_namespace(self, name: str, node: Node = None, alias: str = "") -> BaseNamespace: raise NotImplementedError() @abc.abstractmethod - def parse_integer(self, integer: str) -> Optional[int]: + def process_namespace(self, node: Node, query_name: str, get_range: Callable) -> Iterator[BaseNamespace]: raise NotImplementedError() @abc.abstractmethod - def parse_string(self, string: str) -> Optional[str]: + def get_default_namespaces(self, embedded: bool) -> set[BaseNamespace]: raise NotImplementedError() class CSharpToolkit(LanguageToolkit): - def join_names(self, *args: str) -> str: - return ".".join(args) - - def split_name(self, name: str) -> List[str]: - return name.split(".") - - def format_imported_class(self, name: str) -> str: - return name + signature_file: str = "cs.json" + method_call_query_type: str = "invocation_expression" + property_query_type: str = "member_access_expression" + string_delimiters: str = '"' + integer_prefixes: List[Tuple[Union[str, Tuple[str, ...]], int]] = [(("0x", "0X"), 16)] + integer_suffixes: Tuple[str, ...] 
= ("u", "l") def create_namespace(self, name: str, node: Node = None, alias: str = "") -> BaseNamespace: return CSharpNamespace(name, node, alias) - def format_imported_function(self, name: str) -> str: - qualified_names = self.split_name(name) - if len(qualified_names) < 2: - raise ValueError(f"function {name} does not have an associated class or namespace") - if len(qualified_names) == 2: - classname, functionname = qualified_names[0], qualified_names[1] - return f"{classname}::{functionname}" - namespace, classname, functionname = qualified_names[:-2], qualified_names[-2], qualified_names[-1] - return f"{'.'.join(namespace)}.{classname}::{functionname}" - - def format_imported_property(self, name: str) -> str: - qualified_names = self.split_name(name) - if len(qualified_names) < 2: - raise ValueError(f"property {name} does not have an associated class") - if len(qualified_names) == 2: - classname, propertyname = qualified_names[0], qualified_names[1] - return f"{classname}::{propertyname}" - namespace, classname, propertyname = qualified_names[:-2], qualified_names[-2], qualified_names[-1] - return f"{'.'.join(namespace)}.{classname}::{propertyname}" - def process_namespace(self, node: Node, query_name: str, get_range: Callable) -> Iterator[BaseNamespace]: yield CSharpNamespace(get_range(node), node, "") @@ -132,52 +162,21 @@ def get_default_namespaces(self, embedded: bool) -> set[BaseNamespace]: return {CSharpNamespace(name) for name in self.import_signatures["aspx_default_namespaces"]} return set() - def parse_integer(self, integer: str) -> Optional[int]: - if integer.endswith(("u", "l")): - integer = integer[:-1] - try: - if integer.startswith(("0x", "0X")): - return int(integer, 16) - return int(integer) - except: - return None - - def parse_string(self, string: str) -> Optional[str]: - return string.strip('"') - class PythonToolkit(LanguageToolkit): - def join_names(self, *args: str) -> str: - return ".".join(args) - - def split_name(self, name: str) -> 
List[str]: - return name.split(".") - - def format_imported_class(self, name: str) -> str: - return name + signature_file: str = "py.json" + method_call_query_type: str = "call" + property_query_type: str = "attribute" + string_delimiters: str = "\"'" + integer_prefixes: List[Tuple[Union[str, Tuple[str, ...]], int]] = [ + (("0b, 0B"), 2), + (("0o, 0O"), 8), + (("0x", "0X"), 16), + ] + integer_suffixes: Tuple[str, ...] = tuple() def create_namespace(self, name: str, node: Node = None, alias: str = "") -> BaseNamespace: - return PythonNamespace(name, node, alias) - - def format_imported_function(self, name: str) -> str: - qualified_names = self.split_name(name) - if len(qualified_names) < 2: - raise ValueError(f"function {name} does not have an associated class or namespace") - if len(qualified_names) == 2: - classname, functionname = qualified_names[0], qualified_names[1] - return f"{classname}::{functionname}" - namespace, classname, functionname = qualified_names[:-2], qualified_names[-2], qualified_names[-1] - return f"{'.'.join(namespace)}.{classname}::{functionname}" - - def format_imported_property(self, name: str) -> str: - qualified_names = self.split_name(name) - if len(qualified_names) < 2: - raise ValueError(f"property {name} does not have an associated class") - if len(qualified_names) == 2: - classname, propertyname = qualified_names[0], qualified_names[1] - return f"{classname}::{propertyname}" - namespace, classname, propertyname = qualified_names[:-2], qualified_names[-2], qualified_names[-1] - return f"{'.'.join(namespace)}.{classname}::{propertyname}" + return PythonImport(name, node, alias) def get_import_name(self, name: str, module_name: Optional[str] = None) -> str: return self.join_names(module_name, name) if module_name else name @@ -185,14 +184,14 @@ def get_import_name(self, name: str, module_name: Optional[str] = None) -> str: def process_simple_import( self, node: Node, get_range: Callable, module_name: Optional[str] = None ) -> 
BaseNamespace: - return PythonNamespace(self.get_import_name(get_range(node), module_name), node) + return PythonImport(self.get_import_name(get_range(node), module_name), node) def process_aliased_import( self, node: Node, get_range: Callable, module_name: Optional[str] = None ) -> BaseNamespace: name = self.get_import_name(get_range(node.get_child_by_field_name("name")), module_name) alias = get_range(node.get_child_by_field_name("alias")) - return PythonNamespace(name, node, alias) + return PythonImport(name, node, alias) def process_imports( self, nodes: List[Node], get_range: Callable, module_name: Optional[str] = None @@ -203,30 +202,29 @@ def process_imports( elif import_node.type == "aliased_import": yield self.process_aliased_import(import_node, get_range, module_name) + def get_wildcard_import(self, node: Node) -> Optional[Node]: + for child_node in node.children: + if child_node.type == "wildcard_import": + return child_node + return None + + def process_import_from(self, node: Node, import_nodes: List[Node], get_range: Callable) -> Iterator[BaseNamespace]: + module_name, import_nodes = get_range(import_nodes[0]), import_nodes[1:] + wildcard_import = self.get_wildcard_import(node) + if wildcard_import: + yield self.process_simple_import(wildcard_import, get_range, module_name) + else: + yield from self.process_imports(import_nodes, get_range, module_name) + def process_namespace(self, node: Node, query_name: str, get_range: Callable) -> Iterator[BaseNamespace]: import_nodes = [child_node for child_node in node.children if child_node.is_named] if query_name == "import_from": - yield from self.process_imports(import_nodes[1:], get_range, get_range(import_nodes[0])) + yield from self.process_import_from(node, import_nodes, get_range) elif query_name == "import": yield from self.process_imports(import_nodes, get_range) def get_default_namespaces(self, embedded: bool) -> set[BaseNamespace]: return set() - def parse_integer(self, integer: str) -> 
Optional[int]: - try: - if integer.startswith(("0b, 0B")): - return int(integer, 2) - if integer.startswith(("0o, 0O")): - return int(integer, 8) - if integer.startswith(("0x", "0X")): - return int(integer, 16) - return int(integer) - except: - return None - - def parse_string(self, string: str) -> Optional[str]: - return string.strip('"') - -LANGUAGE_TOOLKITS: dict[str, LanguageToolkit] = {LANG_CS: CSharpToolkit("cs.json"), LANG_PY: PythonToolkit("py.json")} +LANGUAGE_TOOLKITS: dict[str, LanguageToolkit] = {LANG_CS: CSharpToolkit(), LANG_PY: PythonToolkit()} diff --git a/tests/fixtures.py b/tests/fixtures.py index 09a5e1d16..6ba493dfe 100644 --- a/tests/fixtures.py +++ b/tests/fixtures.py @@ -37,7 +37,7 @@ Feature, ) from capa.features.address import Address -from capa.features.extractors.script import LANG_CS, LANG_TEM +from capa.features.extractors.script import LANG_CS, LANG_PY from capa.features.extractors.base_extractor import BBHandle, InsnHandle, FunctionHandle from capa.features.extractors.dnfile.extractor import DnfileFeatureExtractor @@ -181,7 +181,7 @@ def get_ts_extractor_engine(language, buf): @lru_cache(maxsize=1) -def get_ts_template_engine(language, path): +def get_ts_template_engine(path): import capa.features.extractors.ts.engine with open(path, "rb") as f: @@ -336,7 +336,8 @@ def get_data_path_by_name(name): } PY_DATA_PATH_BY_NAME = { - "py_7f9cd1": os.path.join(PY_DIR, "7f9cd1eedf0a9088fc3e07a275d04dceadcf0a5cd425a17e9666b63685d3a37e.py_") + "py_7f9cd1": os.path.join(PY_DIR, "7f9cd1eedf0a9088fc3e07a275d04dceadcf0a5cd425a17e9666b63685d3a37e.py_"), + "py_ca0df6": os.path.join(PY_DIR, "ca0df6cccf2a15ce8f781d81959cf230aead64e6297a3283b21457dc74938c89.py_"), } @@ -1050,104 +1051,109 @@ def cs_138cdc_extractor_engine(): @pytest.fixture def aspx_4f6fa6_template_engine(): - return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_4f6fa6"]) + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_4f6fa6"]) @pytest.fixture def 
aspx_5f959f_template_engine(): - return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_5f959f"]) + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_5f959f"]) @pytest.fixture def aspx_10162f_template_engine(): - return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_10162f"]) + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_10162f"]) @pytest.fixture def aspx_2b71dd_template_engine(): - return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_2b71dd"]) + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_2b71dd"]) @pytest.fixture def aspx_f2bf20_template_engine(): - return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_f2bf20"]) + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_f2bf20"]) @pytest.fixture def aspx_f39dc0_template_engine(): - return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_f39dc0"]) + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_f39dc0"]) @pytest.fixture def aspx_ea2a01_template_engine(): - return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_ea2a01"]) + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_ea2a01"]) @pytest.fixture def aspx_6f3261_template_engine(): - return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_6f3261"]) + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_6f3261"]) @pytest.fixture def aspx_1f8f40_template_engine(): - return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_1f8f40"]) + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_1f8f40"]) @pytest.fixture def aspx_2e8c7e_template_engine(): - return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_2e8c7e"]) + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_2e8c7e"]) @pytest.fixture def aspx_03bb5c_template_engine(): - return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_03bb5c"]) + return 
get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_03bb5c"]) @pytest.fixture def aspx_606dbf_template_engine(): - return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_606dbf"]) + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_606dbf"]) @pytest.fixture def aspx_f397cb_template_engine(): - return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_f397cb"]) + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_f397cb"]) @pytest.fixture def aspx_b4bb14_template_engine(): - return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_b4bb14"]) + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_b4bb14"]) @pytest.fixture def aspx_54433d_template_engine(): - return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_54433d"]) + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_54433d"]) @pytest.fixture def aspx_a35878_template_engine(): - return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_a35878"]) + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_a35878"]) @pytest.fixture def aspx_a5c893_template_engine(): - return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_a5c893"]) + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_a5c893"]) @pytest.fixture def aspx_15eed4_template_engine(): - return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_15eed4"]) + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_15eed4"]) @pytest.fixture def aspx_b75f16_template_engine(): - return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_b75f16"]) + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_b75f16"]) @pytest.fixture def aspx_d460ca_template_engine(): - return get_ts_template_engine(LANG_TEM, ASPX_DATA_PATH_BY_NAME["aspx_d460ca"]) + return get_ts_template_engine(ASPX_DATA_PATH_BY_NAME["aspx_d460ca"]) @pytest.fixture def py_7f9cd1_template_engine(): - return get_ts_extractor_engine() + return 
get_ts_extractor_engine(LANG_PY, PY_DATA_PATH_BY_NAME["py_7f9cd1"]) + + +@pytest.fixture +def py_ca0df6_template_engine(): + return get_ts_extractor_engine(LANG_PY, PY_DATA_PATH_BY_NAME["py_ca0df6"]) diff --git a/tests/test_ts.py b/tests/test_ts.py index 73a8ca037..739293fd8 100644 --- a/tests/test_ts.py +++ b/tests/test_ts.py @@ -1,4 +1,5 @@ from typing import List, Tuple +from multiprocessing.sharedctypes import Value import pytest import fixtures @@ -161,11 +162,10 @@ def do_test_ts_extractor_engine_get_global_statements(engine: TreeSitterExtracto def do_test_ts_extractor_engine_get_assigned_property_names( engine: TreeSitterExtractorEngine, root_node: Node, expected: List[str] ): - assert len(list(engine.get_assigned_property_names(root_node))) == len(expected) - for (node, name), expected_range in zip(engine.get_assigned_property_names(root_node), expected): + assert len(list(engine.get_processed_property_names(root_node))) == len(expected) + for (node, name), expected_name in zip(engine.get_processed_property_names(root_node), expected): assert isinstance(node, Node) - assert name == "property" - do_test_ts_base_engine_get_range(engine, node, expected_range, startswith=True) + assert name == expected_name do_test_ts_base_engine_get_address(engine, node) @@ -246,13 +246,18 @@ def do_test_ts_extractor_engine_get_assigned_property_names( 'string stderr = "";', ], "properties": [ - "HttpContext.Current.Response.StatusCode", - "HttpContext.Current.Response.StatusDescription", - "procStartInfo.RedirectStandardOutput", - "procStartInfo.RedirectStandardError", - "procStartInfo.UseShellExecute", - "procStartInfo.CreateNoWindow", - "p.StartInfo", + "Current.Response.StatusCode", + "Current.Response.StatusDescription", + "Current.Request.Headers", + "UserHostAddress", + "Current.Request.Headers", + "Form", + "Form", + "RedirectStandardOutput", + "RedirectStandardError", + "UseShellExecute", + "CreateNoWindow", + "StartInfo", ], }, ), @@ -1086,6 +1091,28 @@ def 
do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine): ("py_7f9cd1", "function=PSEUDO MAIN", Substring("[!]"), True), ("py_7f9cd1", "function=get_itunes_backups", Number(0), True), ("py_7f9cd1", "function=get_itunes_backups", Number(1), True), + ("py_ca0df6", "file", Namespace("win32com.client"), True), + ("py_ca0df6", "file", Namespace("shutil"), True), + ("py_ca0df6", "function=PSEUDO MAIN", API("os::environ"), True), + ("py_ca0df6", "function=yut", API("shutil.copytree"), True), + ("py_ca0df6", "function=yut", API("os.getcwd"), True), + ("py_ca0df6", "function=takk", API("win32com.client.Dispatch"), True), + ("py_ca0df6", "function=takk", String("Schedule.Service"), True), + ("py_ca0df6", "function=takk", Substring("Updatewmplayer.exe"), True), + ("py_ca0df6", "function=llp", API("win32api.SetFileAttributes"), True), + ("py_ca0df6", "function=llp", Substring("KMPlayer"), True), + ("py_ca0df6", "function=fop", API("os.remove"), True), + ("py_ca0df6", "function=fop", Substring("Projec.exe"), True), + ("py_ca0df6", "function=htr", API("time.sleep"), True), + ("py_ca0df6", "function=htr", Number(30), True), + ("py_ca0df6", "function=htr", Number(25), True), + ("py_ca0df6", "function=htr", Number(10), True), + ("py_ca0df6", "function=vul", Number(5), True), + ("py_ca0df6", "function=vul", Number(1), True), + ("py_ca0df6", "function=vul", API("os.popen"), True), + ("py_ca0df6", "function=vul", String("Updatewmplayer"), True), + ("py_ca0df6", "function=vul", Substring("SCHTASKS"), True), + ("py_ca0df6", "function=llp", API("win32con::FILE_ATTRIBUTE_HIDDEN"), True), ] ) From 5e85a6e46f303958d78cdb88767b086056f5e20e Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Tue, 2 Aug 2022 13:19:05 -0400 Subject: [PATCH 45/51] Implemented namespace aliasing; further refactored the codebase. 
--- capa/features/extractors/ts/engine.py | 3 +- capa/features/extractors/ts/extractor.py | 18 ++++++---- capa/features/extractors/ts/function.py | 23 +++++++------ capa/features/extractors/ts/tools.py | 42 ++++++++++++++++-------- tests/test_ts.py | 1 - 5 files changed, 53 insertions(+), 34 deletions(-) diff --git a/capa/features/extractors/ts/engine.py b/capa/features/extractors/ts/engine.py index 0908d8823..bbc29274e 100644 --- a/capa/features/extractors/ts/engine.py +++ b/capa/features/extractors/ts/engine.py @@ -128,8 +128,7 @@ def is_object_creation_expression(self, node: Node) -> bool: captures = self.get_new_object_names(node) if not captures: return False - new_object_name_node, _ = captures[0] - return new_object_name_node.parent.parent == node + return captures[0].parent.parent == node class TreeSitterTemplateEngine(TreeSitterBaseEngine): diff --git a/capa/features/extractors/ts/extractor.py b/capa/features/extractors/ts/extractor.py index e2a2cb59d..1e4ea50ea 100644 --- a/capa/features/extractors/ts/extractor.py +++ b/capa/features/extractors/ts/extractor.py @@ -27,13 +27,19 @@ def __init__(self, path: str): buf = f.read() self.language = capa.features.extractors.script.get_language_from_ext(path) + self.template_engine = self.get_template_engine(buf) + self.engines = self.get_engines(buf) + + def get_template_engine(self, buf: bytes): if self.language == LANG_TEM: - self.template_engine = TreeSitterTemplateEngine(buf) - self.engines = self.extract_code_from_template() - elif self.language == LANG_HTML: - self.engines = self.extract_code_from_html(buf) - else: - self.engines = [TreeSitterExtractorEngine(self.language, buf)] + return TreeSitterTemplateEngine(buf) + + def get_engines(self, buf: bytes) -> List[TreeSitterExtractorEngine]: + if self.language == LANG_TEM and self.template_engine: + return self.extract_code_from_template() + if self.language == LANG_HTML: + return self.extract_code_from_html(buf) + return 
[TreeSitterExtractorEngine(self.language, buf)] def extract_code_from_template(self) -> List[TreeSitterExtractorEngine]: engines = list(self.template_engine.get_parsed_code_sections()) diff --git a/capa/features/extractors/ts/function.py b/capa/features/extractors/ts/function.py index e8dd37a0f..626e17545 100644 --- a/capa/features/extractors/ts/function.py +++ b/capa/features/extractors/ts/function.py @@ -48,12 +48,8 @@ def get_imports(name: str, namespaces: set[BaseNamespace], engine: TreeSitterExt if engine.language_toolkit.is_import(name): yield name for namespace in namespaces: - namespace_join_name = namespace.get_join_name() - if not namespace_join_name: - continue - joined_name = engine.language_toolkit.join_names(namespace_join_name, name) - if engine.language_toolkit.is_import(joined_name): - yield joined_name + if engine.language_toolkit.is_import(name, namespace): + yield namespace.join(name) def get_properties(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Node, str]]: @@ -85,16 +81,19 @@ def extract_static_methods_(node: Node, engine: TreeSitterExtractorEngine) -> It yield API(engine.language_toolkit.format_imported_function(name)), engine.get_address(node) +def get_property_name(node: Node, engine: TreeSitterExtractorEngine) -> str: + qualified_names = engine.language_toolkit.split_name(engine.get_range(node)) + if len(qualified_names) == 1: + return qualified_names[0] + return engine.language_toolkit.join_names(*qualified_names[1:]) + + def extract_regular_methods_( node: Node, classes: set[BaseNamespace], engine: TreeSitterExtractorEngine ) -> Iterator[Tuple[Feature, Address]]: direct_method_call_node = engine.get_direct_method_call(node) - if direct_method_call_node is not None: - node = direct_method_call_node - qualified_names = engine.language_toolkit.split_name(engine.get_range(node)) - property_name = ( - qualified_names[0] if len(qualified_names) == 1 else engine.language_toolkit.join_names(*qualified_names[1:]) - ) + 
node = node if direct_method_call_node is None else direct_method_call_node + property_name = get_property_name(node, engine) for name in get_imports(property_name, classes, engine): yield API(engine.language_toolkit.format_imported_function(name)), engine.get_address(node) diff --git a/capa/features/extractors/ts/tools.py b/capa/features/extractors/ts/tools.py index c461ddbf6..3d5da5125 100644 --- a/capa/features/extractors/ts/tools.py +++ b/capa/features/extractors/ts/tools.py @@ -21,26 +21,26 @@ class BaseNamespace(abc.ABC): def __hash__(self): return hash(self.name) - def get_join_name(self) -> Optional[str]: + def join(self, name: str) -> str: raise NotImplementedError() class CSharpNamespace(BaseNamespace): - def get_join_name(self) -> Optional[str]: + def join(self, name: str) -> str: """using System; Diagnostics.ProcessStartInfo => System.Diagnostics.ProcessStartInfo""" - return self.name + return LANGUAGE_TOOLKITS[LANG_CS].join_names(self.name, name) class PythonImport(BaseNamespace): - def get_join_name(self) -> Optional[str]: + def join(self, name: str) -> str: """import subprocess ; subprocess.Popen => subprocess.Popen from threading import Timer (threading.Timer) => Timer """ toolkit = LANGUAGE_TOOLKITS[LANG_CS] qualified_names = toolkit.split_name(self.name) if len(qualified_names) < 2: - return None - return toolkit.join_names(*qualified_names[:-1]) + return name + return toolkit.join_names(*(qualified_names[:-1] + [name])) class LanguageToolkit: @@ -61,8 +61,8 @@ def load_import_signatures(self, signature_file: str) -> Dict[str, set[str]]: signatures = json.loads(importlib.resources.read_text(capa.features.extractors.ts.signatures, signature_file)) return {category: set(namespaces) for category, namespaces in signatures.items()} - def is_import(self, import_: str) -> bool: - return import_ in self.import_signatures["imports"] + def is_import_(self, name: str) -> bool: + return name in self.import_signatures["imports"] def is_builtin(self, func: str) 
-> bool: return func in self.import_signatures["builtins"] @@ -131,7 +131,11 @@ def is_recursive_property(self, node: Node) -> bool: return node.parent.type == self.property_query_type @abc.abstractmethod - def create_namespace(self, name: str, node: Node = None, alias: str = "") -> BaseNamespace: + def is_import(self, name: str, namespace: BaseNamespace = None) -> bool: + raise NotImplementedError() + + @abc.abstractmethod + def create_namespace(self, name: str) -> BaseNamespace: raise NotImplementedError() @abc.abstractmethod @@ -151,8 +155,13 @@ class CSharpToolkit(LanguageToolkit): integer_prefixes: List[Tuple[Union[str, Tuple[str, ...]], int]] = [(("0x", "0X"), 16)] integer_suffixes: Tuple[str, ...] = ("u", "l") - def create_namespace(self, name: str, node: Node = None, alias: str = "") -> BaseNamespace: - return CSharpNamespace(name, node, alias) + def is_import(self, name: str, namespace: BaseNamespace = None) -> bool: + if namespace: + return self.is_import_(namespace.join(name)) + return self.is_import_(name) + + def create_namespace(self, name: str) -> BaseNamespace: + return CSharpNamespace(name) def process_namespace(self, node: Node, query_name: str, get_range: Callable) -> Iterator[BaseNamespace]: yield CSharpNamespace(get_range(node), node, "") @@ -175,8 +184,15 @@ class PythonToolkit(LanguageToolkit): ] integer_suffixes: Tuple[str, ...] 
= tuple() - def create_namespace(self, name: str, node: Node = None, alias: str = "") -> BaseNamespace: - return PythonImport(name, node, alias) + def is_import(self, name: str, namespace: BaseNamespace = None) -> bool: + if namespace: + if namespace.alias: + return self.is_import_(name.replace(namespace.alias, namespace.name)) + return self.is_import_(namespace.join(name)) + return self.is_import_(name) + + def create_namespace(self, name: str) -> BaseNamespace: + return PythonImport(name) def get_import_name(self, name: str, module_name: Optional[str] = None) -> str: return self.join_names(module_name, name) if module_name else name diff --git a/tests/test_ts.py b/tests/test_ts.py index 739293fd8..b49a93811 100644 --- a/tests/test_ts.py +++ b/tests/test_ts.py @@ -1,5 +1,4 @@ from typing import List, Tuple -from multiprocessing.sharedctypes import Value import pytest import fixtures From 614900fb17974872cdc0e293e53d41def180b23c Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Wed, 3 Aug 2022 11:48:36 -0400 Subject: [PATCH 46/51] Refactored/simplified parts of the codebase to improve readability; addressed most of the GH pull request comments/suggestions. 
--- capa/features/extractors/script.py | 1 + capa/features/extractors/ts/engine.py | 112 +++++++++++++---------- capa/features/extractors/ts/extractor.py | 6 +- capa/features/extractors/ts/function.py | 73 +++++++-------- capa/features/extractors/ts/tools.py | 25 +++-- tests/test_ts.py | 49 ++++------ 6 files changed, 130 insertions(+), 136 deletions(-) diff --git a/capa/features/extractors/script.py b/capa/features/extractors/script.py index e5a2bb9d3..7273959a7 100644 --- a/capa/features/extractors/script.py +++ b/capa/features/extractors/script.py @@ -3,6 +3,7 @@ from capa.features.common import OS, OS_ANY, ARCH_ANY, FORMAT_SCRIPT, Arch, Format, Feature, ScriptLanguage from capa.features.address import NO_ADDRESS, Address, FileOffsetRangeAddress +# Can be used to instantiate tree_sitter Language objects (see ts/query.py) LANG_CS = "c_sharp" LANG_HTML = "html" LANG_JS = "javascript" diff --git a/capa/features/extractors/ts/engine.py b/capa/features/extractors/ts/engine.py index bbc29274e..37877a395 100644 --- a/capa/features/extractors/ts/engine.py +++ b/capa/features/extractors/ts/engine.py @@ -36,7 +36,7 @@ def get_byte_range(self, node: Node) -> bytes: return self.buf[node.start_byte : node.end_byte] def get_range(self, node: Node) -> str: - return self.get_byte_range(node).decode() + return self.get_byte_range(node).decode("utf-8") def get_address(self, node: Node) -> FileOffsetRangeAddress: return FileOffsetRangeAddress(node.start_byte, node.end_byte) @@ -67,56 +67,69 @@ def __init__( def get_address(self, node: Node) -> FileOffsetRangeAddress: return FileOffsetRangeAddress(self.buf_offset + node.start_byte, self.buf_offset + node.end_byte) - def get_new_object_names(self, node: Node) -> List[Tuple[Node, str]]: - return self.query.new_object_name.captures(node) + def get_new_object_names(self, node: Node) -> Iterator[Node]: + for obj_node, _ in self.query.new_object_name.captures(node): + yield obj_node - def get_property_names(self, node: Node) -> 
List[Tuple[Node, str]]: - return self.query.property_name.captures(node) + def get_property_names(self, node: Node) -> Iterator[Node]: + for pt_node, _ in self.query.property_name.captures(node): + yield pt_node def get_processed_property_names(self, node: Node) -> Iterator[Tuple[Node, str]]: - for pt_node, _ in self.get_property_names(node): + """Generates captured property name nodes and their associated proper names (see process_property + for details), e.g.: [(node0, "StartInfo"), (node1, "RedirectStandardOutput")].""" + for pt_node in self.get_property_names(node): pt_name = self.language_toolkit.process_property(pt_node, self.get_range(pt_node)) if pt_name: yield pt_node, pt_name - def get_function_definitions(self, node: Node = None) -> List[Tuple[Node, str]]: - return self.query.function_definition.captures(node if node is not None else self.tree.root_node) + def get_function_definitions(self, node: Node = None) -> Iterator[Node]: + node = self.tree.root_node if node is None else node + for fd_node, _ in self.query.function_definition.captures(node): + yield fd_node def get_function_definition_name(self, node: Node) -> Node: return node.child_by_field_name(self.query.function_definition_field_name) def get_function_definition_names(self, node: Node) -> Iterator[Node]: - for fn_node, _ in self.get_function_definitions(node): - yield self.get_function_definition_name(fn_node) + for fd_node in self.get_function_definitions(node): + yield self.get_function_definition_name(fd_node) - def get_function_call_names(self, node: Node) -> List[Tuple[Node, str]]: - return self.query.function_call_name.captures(node) + def get_function_call_names(self, node: Node) -> Iterator[Node]: + for fcn_node, _ in self.query.function_call_name.captures(node): + yield fcn_node - def get_imported_constants(self, node: Node) -> List[Tuple[Node, str]]: - return self.query.imported_constant_name.captures(node) + def get_imported_constants(self, node: Node) -> Iterator[Node]: + for 
ic_node, _ in self.query.imported_constant_name.captures(node): + yield ic_node def get_processed_imported_constants(self, node: Node) -> Iterator[Tuple[Node, str]]: - for const_node, _ in self.get_imported_constants(node): - const_name = self.language_toolkit.process_imported_constant(const_node, self.get_range(const_node)) - if const_name: - yield const_node, const_name + """Generates captured imported constant nodes and their associated proper names (see process_imported_constant + for details), e.g.: [(node0, "ssl.CERT_NONE"), (node1, "win32con.FILE_ATTRIBUTE_HIDDEN")].""" + for ic_node in self.get_imported_constants(node): + ic_name = self.language_toolkit.process_imported_constant(ic_node, self.get_range(ic_node)) + if ic_name: + yield ic_node, ic_name - def get_string_literals(self, node: Node) -> List[Tuple[Node, str]]: - return self.query.string_literal.captures(node) + def get_string_literals(self, node: Node) -> Iterator[Node]: + for str_node, _ in self.query.string_literal.captures(node): + yield str_node - def get_integer_literals(self, node: Node) -> List[Tuple[Node, str]]: - return self.query.integer_literal.captures(node) + def get_integer_literals(self, node: Node) -> Iterator[Node]: + for int_node, _ in self.query.integer_literal.captures(node): + yield int_node def get_namespaces(self, node: Node = None) -> List[Tuple[Node, str]]: - return self.query.namespace.captures(node if node is not None else self.tree.root_node) + return self.query.namespace.captures(self.tree.root_node if node is None else node) def get_processed_namespaces(self, node: Node = None) -> Iterator[BaseNamespace]: for ns_node, query_name in self.get_namespaces(node): for namespace in self.language_toolkit.process_namespace(ns_node, query_name, self.get_range): yield namespace - def get_global_statements(self) -> List[Tuple[Node, str]]: - return self.query.global_statement.captures(self.tree.root_node) + def get_global_statements(self) -> Iterator[Node]: + for node, _ in 
self.query.global_statement.captures(self.tree.root_node): + yield node def get_direct_method_call(self, node: Node) -> Optional[Node]: captures = self.query.direct_method_call.captures(node) @@ -124,12 +137,6 @@ def get_direct_method_call(self, node: Node) -> Optional[Node]: return captures[0][0] return None - def is_object_creation_expression(self, node: Node) -> bool: - captures = self.get_new_object_names(node) - if not captures: - return False - return captures[0].parent.parent == node - class TreeSitterTemplateEngine(TreeSitterBaseEngine): query: TemplateQueryBinding @@ -143,11 +150,12 @@ def __init__(self, buf: bytes): self.language_toolkit = LANGUAGE_TOOLKITS[self.embedded_language] self.namespaces = set(self.get_namespaces()) - def get_code_sections(self) -> List[Tuple[Node, str]]: - return self.query.code.captures(self.tree.root_node) + def get_code_sections(self) -> Iterator[Node]: + for node, _ in self.query.code.captures(self.tree.root_node): + yield node def get_parsed_code_sections(self) -> Iterator[TreeSitterExtractorEngine]: - for node, _ in self.get_code_sections(): + for node in self.get_code_sections(): # TODO: support JS if self.embedded_language == LANG_CS: yield TreeSitterExtractorEngine( @@ -156,18 +164,21 @@ def get_parsed_code_sections(self) -> Iterator[TreeSitterExtractorEngine]: node.start_byte, self.namespaces, ) + else: + raise ValueError(f"parsing of {self.embedded_language} is not supported") - def get_content_sections(self) -> List[Tuple[Node, str]]: - return self.query.content.captures(self.tree.root_node) + def get_content_sections(self) -> Iterator[Node]: + for node, _ in self.query.content.captures(self.tree.root_node): + yield node def identify_language(self) -> str: - for node, _ in self.get_code_sections(): + for node in self.get_code_sections(): if self.is_c_sharp(node): return LANG_CS return LANG_JS def get_imported_namespaces(self) -> Iterator[BaseNamespace]: - for node, _ in self.get_code_sections(): + for node in 
self.get_code_sections(): if self.is_aspx_import_directive(node): namespace = self.get_aspx_namespace(node) if namespace is not None: @@ -201,7 +212,7 @@ def get_aspx_namespace(self, node: Node) -> Optional[BaseNamespace]: self.get_byte_range(node), re.IGNORECASE, ) - return CSharpNamespace(match.group(1).decode(), node) if match is not None else None + return CSharpNamespace(match.group(1).decode("utf-8"), node) if match is not None else None class TreeSitterHTMLEngine(TreeSitterBaseEngine): @@ -212,19 +223,22 @@ def __init__(self, buf: bytes, namespaces: set[BaseNamespace] = set()): super().__init__(LANG_HTML, buf) self.namespaces = namespaces - def get_scripts(self) -> List[Tuple[Node, str]]: - return self.query.script_element.captures(self.tree.root_node) + def get_scripts(self) -> Iterator[Node]: + for node, _ in self.query.script_element.captures(self.tree.root_node): + yield node - def get_attributes(self, node: Node) -> List[Tuple[Node, str]]: - return self.query.attribute.captures(node) + def get_attributes(self, node: Node) -> Iterator[Node]: + for att_node, _ in self.query.attribute.captures(node): + yield att_node def get_identified_scripts(self) -> Iterator[Tuple[Node, str]]: - for node, _ in self.get_scripts(): - for content_node, _ in self.get_script_contents(node): + for node in self.get_scripts(): + for content_node in self.get_script_contents(node): yield content_node, self.identify_language(node) - def get_script_contents(self, node: Node) -> Iterator[Tuple[Node, str]]: - return self.query.script_content.captures(node) + def get_script_contents(self, node: Node) -> Iterator[Node]: + for sc_node, _ in self.query.script_content.captures(node): + yield sc_node def get_parsed_code_sections(self) -> Iterator[TreeSitterExtractorEngine]: for node, language in self.get_identified_scripts(): @@ -233,8 +247,8 @@ def get_parsed_code_sections(self) -> Iterator[TreeSitterExtractorEngine]: yield TreeSitterExtractorEngine(language, self.get_byte_range(node), 
node.start_byte, self.namespaces) def identify_language(self, node: Node) -> str: - for attribute_node, _ in self.get_attributes(node): - if self.is_server_side_c_sharp(attribute_node): + for att_node in self.get_attributes(node): + if self.is_server_side_c_sharp(att_node): return LANG_CS return LANG_JS diff --git a/capa/features/extractors/ts/extractor.py b/capa/features/extractors/ts/extractor.py index 1e4ea50ea..b611fc206 100644 --- a/capa/features/extractors/ts/extractor.py +++ b/capa/features/extractors/ts/extractor.py @@ -43,9 +43,9 @@ def get_engines(self, buf: bytes) -> List[TreeSitterExtractorEngine]: def extract_code_from_template(self) -> List[TreeSitterExtractorEngine]: engines = list(self.template_engine.get_parsed_code_sections()) - for node, _ in self.template_engine.get_content_sections(): + for node in self.template_engine.get_content_sections(): section_buf = self.template_engine.get_byte_range(node) - engines.extend(list(self.extract_code_from_html(section_buf, self.template_engine.namespaces))) + engines.extend(self.extract_code_from_html(section_buf, self.template_engine.namespaces)) return engines def extract_code_from_html( @@ -79,7 +79,7 @@ def get_pseudo_main_function(self, engine: TreeSitterExtractorEngine) -> Functio def get_functions(self) -> Iterator[FunctionHandle]: for engine in self.engines: yield self.get_pseudo_main_function(engine) - for node, _ in engine.get_function_definitions(): + for node in engine.get_function_definitions(): name = engine.get_range(engine.get_function_definition_name(node)) yield FunctionHandle(engine.get_address(node), TSFunctionInner(node, name, engine)) diff --git a/capa/features/extractors/ts/function.py b/capa/features/extractors/ts/function.py index 626e17545..ff72d24bf 100644 --- a/capa/features/extractors/ts/function.py +++ b/capa/features/extractors/ts/function.py @@ -29,17 +29,16 @@ def is_pseudo_main_function(fh: FunctionHandle, engine: TreeSitterExtractorEngin def extract_strings(fn_node: Node, 
engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: - for node, _ in engine.get_string_literals(fn_node): - parsed_str = engine.language_toolkit.parse_string(engine.get_range(node)) - if parsed_str is not None: - yield String(parsed_str), engine.get_address(node) + for node in engine.get_string_literals(fn_node): + yield String(engine.language_toolkit.parse_string(engine.get_range(node))), engine.get_address(node) def extract_integers(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: - for node, _ in engine.get_integer_literals(fn_node): - parsed_int = engine.language_toolkit.parse_integer(engine.get_range(node)) - if parsed_int is not None: - yield Number(parsed_int), engine.get_address(node) + for node in engine.get_integer_literals(fn_node): + try: + yield Number(engine.language_toolkit.parse_integer(engine.get_range(node))), engine.get_address(node) + except ValueError: + continue def get_imports(name: str, namespaces: set[BaseNamespace], engine: TreeSitterExtractorEngine) -> Iterator[str]: @@ -52,31 +51,27 @@ def get_imports(name: str, namespaces: set[BaseNamespace], engine: TreeSitterExt yield namespace.join(name) -def get_properties(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Node, str]]: - yield from engine.get_processed_property_names(fn_node) - - def get_classes(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[str]: - for node, _ in engine.get_new_object_names(fn_node): + for node in engine.get_new_object_names(fn_node): for name in get_imports(engine.get_range(node), engine.namespaces, engine): yield name -def extract_classes_(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: - for node, _ in engine.get_new_object_names(fn_node): +def _extract_classes(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for node in engine.get_new_object_names(fn_node): for name in 
get_imports(engine.get_range(node), engine.namespaces, engine): yield API(engine.language_toolkit.format_imported_class(name)), engine.get_address(node) -def extract_properties_( +def _extract_properties( fn_node: Node, classes: set[BaseNamespace], engine: TreeSitterExtractorEngine ) -> Iterator[Tuple[Feature, Address]]: - for node, property_name in get_properties(fn_node, engine): - for name in get_imports(property_name, classes, engine): - yield Property(engine.language_toolkit.format_imported_property(name)), engine.get_address(node) + for pt_node, pt_name in engine.get_processed_property_names(fn_node): + for name in get_imports(pt_name, classes, engine): + yield Property(engine.language_toolkit.format_imported_property(name)), engine.get_address(pt_node) -def extract_static_methods_(node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: +def _extract_static_methods(node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: for name in get_imports(engine.get_range(node), engine.namespaces, engine): yield API(engine.language_toolkit.format_imported_function(name)), engine.get_address(node) @@ -88,7 +83,7 @@ def get_property_name(node: Node, engine: TreeSitterExtractorEngine) -> str: return engine.language_toolkit.join_names(*qualified_names[1:]) -def extract_regular_methods_( +def _extract_regular_methods( node: Node, classes: set[BaseNamespace], engine: TreeSitterExtractorEngine ) -> Iterator[Tuple[Feature, Address]]: direct_method_call_node = engine.get_direct_method_call(node) @@ -100,32 +95,32 @@ def extract_regular_methods_( def extract_api(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: classes = {engine.language_toolkit.create_namespace(cls) for cls in get_classes(fn_node, engine)} - yield from extract_classes_(fn_node, engine) - yield from extract_imported_constants_(fn_node, engine) - yield from extract_function_calls_(fn_node, classes, engine) - yield from 
extract_properties_(fn_node, classes, engine) + yield from _extract_classes(fn_node, engine) + yield from _extract_imported_constants(fn_node, engine) + yield from _extract_function_calls(fn_node, classes, engine) + yield from _extract_properties(fn_node, classes, engine) -def extract_function_calls_( +def _extract_function_calls( fn_node: Node, classes: set[BaseNamespace], engine: TreeSitterExtractorEngine ) -> Iterator[Tuple[Feature, Address]]: - for node, _ in engine.get_function_call_names(fn_node): - yield from extract_static_methods_(node, engine) - yield from extract_regular_methods_(node, classes, engine) + for node in engine.get_function_call_names(fn_node): + yield from _extract_static_methods(node, engine) + yield from _extract_regular_methods(node, classes, engine) -def extract_imported_constants_(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: - for node, _ in engine.get_processed_imported_constants(fn_node): - for name in get_imports(engine.get_range(node), engine.namespaces, engine): - yield API(engine.language_toolkit.format_imported_constant(name)), engine.get_address(node) +def _extract_imported_constants(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for ic_node, ic_name in engine.get_processed_imported_constants(fn_node): + for name in get_imports(ic_name, engine.namespaces, engine): + yield API(engine.language_toolkit.format_imported_constant(name)), engine.get_address(ic_node) -def extract_pseudo_main_features(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: - for node, _ in engine.get_global_statements(): - yield from extract_features_(node, engine) +def _extract_pseudo_main_features(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for node in engine.get_global_statements(): + yield from _extract_features(node, engine) -def extract_features_(fn_node: Node, engine: TreeSitterExtractorEngine) -> 
Iterator[Tuple[Feature, Address]]: +def _extract_features(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: for function_handler in FUNCTION_HANDLERS: for feature, addr in function_handler(fn_node, engine): yield feature, addr @@ -133,9 +128,9 @@ def extract_features_(fn_node: Node, engine: TreeSitterExtractorEngine) -> Itera def extract_features(fh: FunctionHandle, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: if is_pseudo_main_function(fh, engine): - yield from extract_pseudo_main_features(engine) + yield from _extract_pseudo_main_features(engine) else: - yield from extract_features_(fh.inner.node, engine) + yield from _extract_features(fh.inner.node, engine) FUNCTION_HANDLERS = ( diff --git a/capa/features/extractors/ts/tools.py b/capa/features/extractors/ts/tools.py index 3d5da5125..f5a5beb5c 100644 --- a/capa/features/extractors/ts/tools.py +++ b/capa/features/extractors/ts/tools.py @@ -61,7 +61,7 @@ def load_import_signatures(self, signature_file: str) -> Dict[str, set[str]]: signatures = json.loads(importlib.resources.read_text(capa.features.extractors.ts.signatures, signature_file)) return {category: set(namespaces) for category, namespaces in signatures.items()} - def is_import_(self, name: str) -> bool: + def _is_import(self, name: str) -> bool: return name in self.import_signatures["imports"] def is_builtin(self, func: str) -> bool: @@ -109,17 +109,14 @@ def format_imported_property(self, name: str) -> str: def format_imported_constant(self, name: str) -> str: return self.format_imported_class_members(name) - def parse_integer(self, integer: str) -> Optional[int]: + def parse_integer(self, integer: str) -> int: for suffix in self.integer_suffixes: if integer.endswith(suffix): integer = integer[:-1] - try: - for prefix, base in self.integer_prefixes: - if integer.startswith(prefix): - return int(integer, base) - return int(integer) - except: - return None + for prefix, base in 
self.integer_prefixes: + if integer.startswith(prefix): + return int(integer, base) + return int(integer) def parse_string(self, string: str) -> str: return string.strip(self.string_delimiters) @@ -157,8 +154,8 @@ class CSharpToolkit(LanguageToolkit): def is_import(self, name: str, namespace: BaseNamespace = None) -> bool: if namespace: - return self.is_import_(namespace.join(name)) - return self.is_import_(name) + return self._is_import(namespace.join(name)) + return self._is_import(name) def create_namespace(self, name: str) -> BaseNamespace: return CSharpNamespace(name) @@ -187,9 +184,9 @@ class PythonToolkit(LanguageToolkit): def is_import(self, name: str, namespace: BaseNamespace = None) -> bool: if namespace: if namespace.alias: - return self.is_import_(name.replace(namespace.alias, namespace.name)) - return self.is_import_(namespace.join(name)) - return self.is_import_(name) + return self._is_import(name.replace(namespace.alias, namespace.name)) + return self._is_import(namespace.join(name)) + return self._is_import(name) def create_namespace(self, name: str) -> BaseNamespace: return PythonImport(name) diff --git a/tests/test_ts.py b/tests/test_ts.py index b49a93811..bfbfb0d04 100644 --- a/tests/test_ts.py +++ b/tests/test_ts.py @@ -78,9 +78,8 @@ def do_test_ts_extractor_engine_get_new_objects( engine: TreeSitterExtractorEngine, root_node: Node, expected: List[Tuple[str, str]] ): assert len(list(engine.get_new_object_names(root_node))) == len(expected) - for (node, name), (_, expected_name_range) in zip(engine.get_new_object_names(root_node), expected): + for node, (_, expected_name_range) in zip(engine.get_new_object_names(root_node), expected): assert isinstance(node, Node) - assert name == "new-object" do_test_ts_base_engine_get_range(engine, node, expected_name_range) do_test_ts_base_engine_get_address(engine, node) @@ -88,13 +87,10 @@ def do_test_ts_extractor_engine_get_new_objects( def do_test_ts_extractor_engine_get_function_definitions( engine: 
TreeSitterExtractorEngine, root_node: Node, expected: List[Tuple[str, str]] ): - assert engine.get_function_definitions(engine.tree.root_node) == engine.get_function_definitions() - assert len(engine.get_function_definitions(root_node)) == len(expected) - for (node, name), (expected_range, expected_name_range) in zip( - engine.get_function_definitions(root_node), expected - ): + assert list(engine.get_function_definitions(engine.tree.root_node)) == list(engine.get_function_definitions()) + assert len(list(engine.get_function_definitions(root_node))) == len(expected) + for node, (expected_range, expected_name_range) in zip(engine.get_function_definitions(root_node), expected): assert isinstance(node, Node) - assert name == "function-definition" do_test_ts_base_engine_get_range(engine, node, expected_range, startswith=True) do_test_ts_base_engine_get_address(engine, node) do_test_ts_base_engine_get_range(engine, engine.get_function_definition_name(node), expected_name_range) @@ -110,9 +106,8 @@ def do_test_ts_extractor_engine_get_function_calls( engine: TreeSitterExtractorEngine, root_node: Node, expected: List[Tuple[str, str]] ): assert len(list(engine.get_function_call_names(root_node))) == len(expected) - for (node, name), (_, expected_id_range) in zip(engine.get_function_call_names(root_node), expected): + for node, (_, expected_id_range) in zip(engine.get_function_call_names(root_node), expected): assert isinstance(node, Node) - assert name == "function-call" do_test_ts_base_engine_get_range(engine, node, expected_id_range) do_test_ts_base_engine_get_address(engine, node) @@ -120,10 +115,9 @@ def do_test_ts_extractor_engine_get_function_calls( def do_test_ts_extractor_engine_get_string_literals( engine: TreeSitterExtractorEngine, root_node: Node, expected: List[str] ): - assert len(engine.get_string_literals(root_node)) == len(expected) - for (node, name), expected_range in zip(engine.get_string_literals(root_node), expected): + assert 
len(list(engine.get_string_literals(root_node))) == len(expected) + for node, expected_range in zip(engine.get_string_literals(root_node), expected): assert isinstance(node, Node) - assert name == "string-literal" do_test_ts_base_engine_get_range(engine, node, expected_range) do_test_ts_base_engine_get_address(engine, node) @@ -131,29 +125,26 @@ def do_test_ts_extractor_engine_get_string_literals( def do_test_ts_extractor_engine_get_integer_literals( engine: TreeSitterExtractorEngine, root_node: Node, expected: List[str] ): - assert len(engine.get_integer_literals(root_node)) == len(expected) - for (node, name), expected_range in zip(engine.get_integer_literals(root_node), expected): + assert len(list(engine.get_integer_literals(root_node))) == len(expected) + for node, expected_range in zip(engine.get_integer_literals(root_node), expected): assert isinstance(node, Node) - assert name == "integer-literal" do_test_ts_base_engine_get_range(engine, node, expected_range) do_test_ts_base_engine_get_address(engine, node) def do_test_ts_extractor_engine_get_namespaces(engine: TreeSitterExtractorEngine, expected: List[str]): - assert engine.get_namespaces(engine.tree.root_node) == engine.get_namespaces() - assert len(engine.get_namespaces()) == len(expected) - for (node, name), expected_range in zip(engine.get_namespaces(), expected): + assert list(engine.get_namespaces(engine.tree.root_node)) == list(engine.get_namespaces()) + assert len(list(engine.get_namespaces())) == len(expected) + for (node, _), expected_range in zip(engine.get_namespaces(), expected): assert isinstance(node, Node) - assert name == "namespace" do_test_ts_base_engine_get_range(engine, node, expected_range) do_test_ts_base_engine_get_address(engine, node) def do_test_ts_extractor_engine_get_global_statements(engine: TreeSitterExtractorEngine, expected: List[str]): - assert len(engine.get_global_statements()) == len(expected) - for (node, name), expected_range in zip(engine.get_global_statements(), 
expected): + assert len(list(engine.get_global_statements())) == len(expected) + for node, expected_range in zip(engine.get_global_statements(), expected): assert isinstance(node, Node) - assert name == "global-statement" do_test_ts_base_engine_get_range(engine, node, expected_range, startswith=True) do_test_ts_base_engine_get_address(engine, node) @@ -164,7 +155,6 @@ def do_test_ts_extractor_engine_get_assigned_property_names( assert len(list(engine.get_processed_property_names(root_node))) == len(expected) for (node, name), expected_name in zip(engine.get_processed_property_names(root_node), expected): assert isinstance(node, Node) - assert name == expected_name do_test_ts_base_engine_get_address(engine, node) @@ -304,18 +294,16 @@ def do_test_ts_template_engine_get_template_namespaces( def do_test_ts_template_engine_get_code_sections(engine: TreeSitterTemplateEngine, expected: List[Tuple[int, int]]): - assert len(engine.get_code_sections()) == len(expected) - for (node, name), (expected_start_byte, expected_end_byte) in zip(list(engine.get_code_sections()), expected): + assert len(list(engine.get_code_sections())) == len(expected) + for node, (expected_start_byte, expected_end_byte) in zip(list(engine.get_code_sections()), expected): assert isinstance(node, Node) - assert name == "code" assert node.start_byte == expected_start_byte and node.end_byte == expected_end_byte def do_test_ts_template_engine_get_content_sections(engine: TreeSitterTemplateEngine, expected: List[Tuple[int, int]]): - assert len(engine.get_content_sections()) == len(expected) - for (node, name), (expected_start_byte, expected_end_byte) in zip(list(engine.get_content_sections()), expected): + assert len(list(engine.get_content_sections())) == len(expected) + for node, (expected_start_byte, expected_end_byte) in zip(list(engine.get_content_sections()), expected): assert isinstance(node, Node) - assert name == "content" assert node.start_byte == expected_start_byte and node.end_byte == 
expected_end_byte @@ -323,7 +311,6 @@ def do_test_ts_template_engine_get_parsed_code_sections( engine: TreeSitterTemplateEngine, expected_language: str, expected: List[Tuple[int, int]] ): assert len(list(engine.get_parsed_code_sections())) == len(expected) - addrs = [e.get_default_address() for e in engine.get_parsed_code_sections()] for extractor_engine, (expected_start_byte, _) in zip(engine.get_parsed_code_sections(), expected): do_test_ts_extractor_engine_init(extractor_engine, expected_language) assert extractor_engine.buf_offset == expected_start_byte From bb08181d6e7472b05d498c045fa02e9840cd1eb8 Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Wed, 3 Aug 2022 16:11:54 -0400 Subject: [PATCH 47/51] Implemented script language auto-detection. --- capa/features/extractors/common.py | 16 ++++++- capa/features/extractors/script.py | 12 ----- capa/features/extractors/ts/autodetect.py | 55 +++++++++++++++++++++++ capa/features/extractors/ts/extractor.py | 3 +- 4 files changed, 72 insertions(+), 14 deletions(-) create mode 100644 capa/features/extractors/ts/autodetect.py diff --git a/capa/features/extractors/common.py b/capa/features/extractors/common.py index 5f56e50d4..cd8be8e15 100644 --- a/capa/features/extractors/common.py +++ b/capa/features/extractors/common.py @@ -9,9 +9,21 @@ import capa.features import capa.features.extractors.elf import capa.features.extractors.pefile -from capa.features.common import OS, FORMAT_PE, FORMAT_ELF, OS_WINDOWS, FORMAT_FREEZE, Arch, Format, String, Feature +from capa.features.common import ( + OS, + FORMAT_PE, + FORMAT_ELF, + OS_WINDOWS, + FORMAT_FREEZE, + FORMAT_SCRIPT, + Arch, + Format, + String, + Feature, +) from capa.features.freeze import is_freeze from capa.features.address import NO_ADDRESS, Address, FileOffsetAddress +from capa.features.extractors.ts.autodetect import is_script logger = logging.getLogger(__name__) @@ -34,6 +46,8 @@ def extract_format(buf) -> Iterator[Tuple[Feature, Address]]: yield Format(FORMAT_ELF), 
NO_ADDRESS elif is_freeze(buf): yield Format(FORMAT_FREEZE), NO_ADDRESS + elif is_script(buf): + yield Format(FORMAT_SCRIPT), NO_ADDRESS else: # we likely end up here: # 1. handling a file format (e.g. macho) diff --git a/capa/features/extractors/script.py b/capa/features/extractors/script.py index 7273959a7..e2425960b 100644 --- a/capa/features/extractors/script.py +++ b/capa/features/extractors/script.py @@ -30,15 +30,3 @@ def extract_os() -> Iterator[Tuple[Feature, Address]]: def extract_format() -> Iterator[Tuple[Feature, Address]]: yield Format(FORMAT_SCRIPT), NO_ADDRESS - - -def get_language_from_ext(path: str) -> str: - if path.endswith(EXT_ASPX): - return LANG_TEM - if path.endswith(EXT_CS): - return LANG_CS - if path.endswith(EXT_HTML): - return LANG_HTML - if path.endswith(EXT_PY): - return LANG_PY - raise ValueError(f"{path} has an unrecognized or an unsupported extension.") diff --git a/capa/features/extractors/ts/autodetect.py b/capa/features/extractors/ts/autodetect.py new file mode 100644 index 000000000..2f1636d80 --- /dev/null +++ b/capa/features/extractors/ts/autodetect.py @@ -0,0 +1,55 @@ +from multiprocessing.sharedctypes import Value + +from tree_sitter import Node, Tree, Parser, Language + +from capa.features.extractors.script import EXT_CS, EXT_PY, LANG_CS, LANG_PY, EXT_ASPX, EXT_HTML, LANG_TEM, LANG_HTML +from capa.features.extractors.ts.query import TS_LANGUAGES + + +def is_script(buf: bytes) -> bool: + try: + return bool(get_language_ts(buf)) + except ValueError: + return False + + +def _parse(ts_language: Language, buf: bytes) -> Tree: + parser = Parser() + parser.set_language(ts_language) + return parser.parse(buf) + + +def _contains_errors(ts_language, node: Node) -> bool: + return ts_language.query("(ERROR) @error").captures(node) + + +def get_language_ts(buf: bytes) -> str: + for language, ts_language in TS_LANGUAGES.items(): + try: + tree = _parse(ts_language, buf) + except ValueError: + continue + if not 
_contains_errors(ts_language, tree.root_node): + return language + raise ValueError("failed to parse the language") + + +def get_language_from_ext(path: str) -> str: + if path.endswith(EXT_ASPX): + return LANG_TEM + if path.endswith(EXT_CS): + return LANG_CS + if path.endswith(EXT_HTML): + return LANG_HTML + if path.endswith(EXT_PY): + return LANG_PY + raise ValueError(f"{path} has an unrecognized or an unsupported extension.") + + +def get_language(path: str) -> str: + try: + with open(path, "rb") as f: + buf = f.read() + return get_language_ts(buf) + except ValueError: + return get_language_from_ext(path) diff --git a/capa/features/extractors/ts/extractor.py b/capa/features/extractors/ts/extractor.py index b611fc206..d1e375ffc 100644 --- a/capa/features/extractors/ts/extractor.py +++ b/capa/features/extractors/ts/extractor.py @@ -5,6 +5,7 @@ import capa.features.extractors.ts.engine import capa.features.extractors.ts.global_ import capa.features.extractors.ts.function +import capa.features.extractors.ts.autodetect from capa.features.common import Namespace from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress, FileOffsetRangeAddress from capa.features.extractors.script import LANG_TEM, LANG_HTML @@ -26,7 +27,7 @@ def __init__(self, path: str): with open(self.path, "rb") as f: buf = f.read() - self.language = capa.features.extractors.script.get_language_from_ext(path) + self.language = capa.features.extractors.ts.autodetect.get_language(path) self.template_engine = self.get_template_engine(buf) self.engines = self.get_engines(buf) From 1fd9d4a7dd485eafbab9a08f1d1cf0ca127a1bcc Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Wed, 3 Aug 2022 16:23:19 -0400 Subject: [PATCH 48/51] Removed a spurious import. 
--- capa/features/extractors/ts/autodetect.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/capa/features/extractors/ts/autodetect.py b/capa/features/extractors/ts/autodetect.py index 2f1636d80..dabe60022 100644 --- a/capa/features/extractors/ts/autodetect.py +++ b/capa/features/extractors/ts/autodetect.py @@ -1,5 +1,3 @@ -from multiprocessing.sharedctypes import Value - from tree_sitter import Node, Tree, Parser, Language from capa.features.extractors.script import EXT_CS, EXT_PY, LANG_CS, LANG_PY, EXT_ASPX, EXT_HTML, LANG_TEM, LANG_HTML From 7ba978f3959c0670240286e8ffb474d6cbf891d1 Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Fri, 5 Aug 2022 09:45:14 -0400 Subject: [PATCH 49/51] Added more test cases; moved script language feature to global features in order to make rules clearer; refactored the codebase to address the latest PR comments/suggestions. --- capa/features/common.py | 4 +- capa/features/extractors/script.py | 11 +- capa/features/extractors/ts/autodetect.py | 20 +-- capa/features/extractors/ts/engine.py | 14 +-- capa/features/extractors/ts/extractor.py | 14 ++- capa/features/extractors/ts/file.py | 10 +- capa/features/extractors/ts/function.py | 16 +-- .../features/extractors/ts/signatures/cs.json | 7 ++ capa/features/extractors/ts/tools.py | 52 ++++---- capa/rules.py | 4 + tests/test_ts.py | 115 +++++++++++++++--- 11 files changed, 181 insertions(+), 86 deletions(-) diff --git a/capa/features/common.py b/capa/features/common.py index 4327ac1e2..b77514f7b 100644 --- a/capa/features/common.py +++ b/capa/features/common.py @@ -414,13 +414,13 @@ def __init__(self, value: str, description=None): FORMAT_PE = "pe" FORMAT_ELF = "elf" FORMAT_DOTNET = "dotnet" -VALID_FORMAT = (FORMAT_PE, FORMAT_ELF, FORMAT_DOTNET) +FORMAT_SCRIPT = "script" +VALID_FORMAT = (FORMAT_PE, FORMAT_ELF, FORMAT_DOTNET, FORMAT_SCRIPT) # internal only, not to be used in rules FORMAT_AUTO = "auto" FORMAT_SC32 = "sc32" FORMAT_SC64 = "sc64" FORMAT_FREEZE = "freeze" -FORMAT_SCRIPT = 
"script" FORMAT_UNKNOWN = "unknown" diff --git a/capa/features/extractors/script.py b/capa/features/extractors/script.py index e2425960b..853623695 100644 --- a/capa/features/extractors/script.py +++ b/capa/features/extractors/script.py @@ -16,12 +16,21 @@ EXT_PY = ("py", "py_") +LANGUAGE_FEATURE_FORMAT = { + LANG_CS: "C#", + LANG_HTML: "HTML", + LANG_JS: "JavaScript", + LANG_PY: "Python", + LANG_TEM: "Embedded Template", +} + + def extract_arch() -> Iterator[Tuple[Feature, Address]]: yield Arch(ARCH_ANY), NO_ADDRESS def extract_language(language: str, addr: FileOffsetRangeAddress) -> Iterator[Tuple[Feature, Address]]: - yield ScriptLanguage(language), addr + yield ScriptLanguage(LANGUAGE_FEATURE_FORMAT[language]), addr def extract_os() -> Iterator[Tuple[Feature, Address]]: diff --git a/capa/features/extractors/ts/autodetect.py b/capa/features/extractors/ts/autodetect.py index dabe60022..a03ee2ff2 100644 --- a/capa/features/extractors/ts/autodetect.py +++ b/capa/features/extractors/ts/autodetect.py @@ -1,3 +1,5 @@ +from typing import Optional + from tree_sitter import Node, Tree, Parser, Language from capa.features.extractors.script import EXT_CS, EXT_PY, LANG_CS, LANG_PY, EXT_ASPX, EXT_HTML, LANG_TEM, LANG_HTML @@ -11,10 +13,13 @@ def is_script(buf: bytes) -> bool: return False -def _parse(ts_language: Language, buf: bytes) -> Tree: - parser = Parser() - parser.set_language(ts_language) - return parser.parse(buf) +def _parse(ts_language: Language, buf: bytes) -> Optional[Tree]: + try: + parser = Parser() + parser.set_language(ts_language) + return parser.parse(buf) + except ValueError: + return None def _contains_errors(ts_language, node: Node) -> bool: @@ -23,11 +28,8 @@ def _contains_errors(ts_language, node: Node) -> bool: def get_language_ts(buf: bytes) -> str: for language, ts_language in TS_LANGUAGES.items(): - try: - tree = _parse(ts_language, buf) - except ValueError: - continue - if not _contains_errors(ts_language, tree.root_node): + tree = 
_parse(ts_language, buf) + if tree and not _contains_errors(ts_language, tree.root_node): return language raise ValueError("failed to parse the language") diff --git a/capa/features/extractors/ts/engine.py b/capa/features/extractors/ts/engine.py index 37877a395..dc93ed2ac 100644 --- a/capa/features/extractors/ts/engine.py +++ b/capa/features/extractors/ts/engine.py @@ -35,7 +35,7 @@ def parse(self) -> Tree: def get_byte_range(self, node: Node) -> bytes: return self.buf[node.start_byte : node.end_byte] - def get_range(self, node: Node) -> str: + def get_str(self, node: Node) -> str: return self.get_byte_range(node).decode("utf-8") def get_address(self, node: Node) -> FileOffsetRangeAddress: @@ -79,11 +79,11 @@ def get_processed_property_names(self, node: Node) -> Iterator[Tuple[Node, str]] """Generates captured property name nodes and their associated proper names (see process_property for details), e.g.: [(node0, "StartInfo"), (node1, "RedirectStandardOutput")].""" for pt_node in self.get_property_names(node): - pt_name = self.language_toolkit.process_property(pt_node, self.get_range(pt_node)) + pt_name = self.language_toolkit.process_property(pt_node, self.get_str(pt_node)) if pt_name: yield pt_node, pt_name - def get_function_definitions(self, node: Node = None) -> Iterator[Node]: + def get_function_definitions(self, node: Optional[Node] = None) -> Iterator[Node]: node = self.tree.root_node if node is None else node for fd_node, _ in self.query.function_definition.captures(node): yield fd_node @@ -107,7 +107,7 @@ def get_processed_imported_constants(self, node: Node) -> Iterator[Tuple[Node, s """Generates captured imported constant nodes and their associated proper names (see process_imported_constant for details), e.g.: [(node0, "ssl.CERT_NONE"), (node1, "win32con.FILE_ATTRIBUTE_HIDDEN")].""" for ic_node in self.get_imported_constants(node): - ic_name = self.language_toolkit.process_imported_constant(ic_node, self.get_range(ic_node)) + ic_name = 
self.language_toolkit.process_imported_constant(ic_node, self.get_str(ic_node)) if ic_name: yield ic_node, ic_name @@ -119,12 +119,12 @@ def get_integer_literals(self, node: Node) -> Iterator[Node]: for int_node, _ in self.query.integer_literal.captures(node): yield int_node - def get_namespaces(self, node: Node = None) -> List[Tuple[Node, str]]: + def get_namespaces(self, node: Optional[Node] = None) -> List[Tuple[Node, str]]: return self.query.namespace.captures(self.tree.root_node if node is None else node) - def get_processed_namespaces(self, node: Node = None) -> Iterator[BaseNamespace]: + def get_processed_namespaces(self, node: Optional[Node] = None) -> Iterator[BaseNamespace]: for ns_node, query_name in self.get_namespaces(node): - for namespace in self.language_toolkit.process_namespace(ns_node, query_name, self.get_range): + for namespace in self.language_toolkit.process_namespace(ns_node, query_name, self.get_str): yield namespace def get_global_statements(self) -> Iterator[Node]: diff --git a/capa/features/extractors/ts/extractor.py b/capa/features/extractors/ts/extractor.py index d1e375ffc..1447659d0 100644 --- a/capa/features/extractors/ts/extractor.py +++ b/capa/features/extractors/ts/extractor.py @@ -6,6 +6,7 @@ import capa.features.extractors.ts.global_ import capa.features.extractors.ts.function import capa.features.extractors.ts.autodetect +from capa.exceptions import UnsupportedFormatError from capa.features.common import Namespace from capa.features.address import NO_ADDRESS, Address, AbsoluteVirtualAddress, FileOffsetRangeAddress from capa.features.extractors.script import LANG_TEM, LANG_HTML @@ -27,9 +28,12 @@ def __init__(self, path: str): with open(self.path, "rb") as f: buf = f.read() - self.language = capa.features.extractors.ts.autodetect.get_language(path) - self.template_engine = self.get_template_engine(buf) - self.engines = self.get_engines(buf) + try: + self.language = capa.features.extractors.ts.autodetect.get_language(path) + 
self.template_engine = self.get_template_engine(buf) + self.engines = self.get_engines(buf) + except ValueError as e: + raise UnsupportedFormatError(e) def get_template_engine(self, buf: bytes): if self.language == LANG_TEM: @@ -63,6 +67,8 @@ def extract_template_namespaces(self) -> Iterator[Tuple[Feature, Address]]: yield Namespace(ns.name), address def extract_global_features(self) -> Iterator[Tuple[Feature, Address]]: + for engine in self.engines: + yield from capa.features.extractors.script.extract_language(engine.language, engine.get_default_address()) yield from capa.features.extractors.ts.global_.extract_features() def extract_file_features(self) -> Iterator[Tuple[Feature, Address]]: @@ -81,7 +87,7 @@ def get_functions(self) -> Iterator[FunctionHandle]: for engine in self.engines: yield self.get_pseudo_main_function(engine) for node in engine.get_function_definitions(): - name = engine.get_range(engine.get_function_definition_name(node)) + name = engine.get_str(engine.get_function_definition_name(node)) yield FunctionHandle(engine.get_address(node), TSFunctionInner(node, name, engine)) def extract_function_features(self, f: FunctionHandle) -> Iterator[Tuple[Feature, Address]]: diff --git a/capa/features/extractors/ts/file.py b/capa/features/extractors/ts/file.py index ea0504fbe..7aedef458 100644 --- a/capa/features/extractors/ts/file.py +++ b/capa/features/extractors/ts/file.py @@ -1,15 +1,10 @@ from typing import Tuple, Iterator -import capa.features.extractors.script from capa.features.common import Feature, Namespace from capa.features.address import Address from capa.features.extractors.ts.engine import TreeSitterExtractorEngine -def extract_language(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: - yield from capa.features.extractors.script.extract_language(engine.language, engine.get_default_address()) - - def extract_namespaces(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: for namespace in 
engine.get_processed_namespaces(): yield Namespace(namespace.name), engine.get_address(namespace.node) @@ -21,7 +16,4 @@ def extract_features(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Featur yield feature, addr -FILE_HANDLERS = ( - extract_language, - extract_namespaces, -) +FILE_HANDLERS = (extract_namespaces,) diff --git a/capa/features/extractors/ts/function.py b/capa/features/extractors/ts/function.py index ff72d24bf..3857c17f7 100644 --- a/capa/features/extractors/ts/function.py +++ b/capa/features/extractors/ts/function.py @@ -30,13 +30,13 @@ def is_pseudo_main_function(fh: FunctionHandle, engine: TreeSitterExtractorEngin def extract_strings(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: for node in engine.get_string_literals(fn_node): - yield String(engine.language_toolkit.parse_string(engine.get_range(node))), engine.get_address(node) + yield String(engine.language_toolkit.parse_string(engine.get_str(node))), engine.get_address(node) def extract_integers(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: for node in engine.get_integer_literals(fn_node): try: - yield Number(engine.language_toolkit.parse_integer(engine.get_range(node))), engine.get_address(node) + yield Number(engine.language_toolkit.parse_integer(engine.get_str(node))), engine.get_address(node) except ValueError: continue @@ -53,13 +53,13 @@ def get_imports(name: str, namespaces: set[BaseNamespace], engine: TreeSitterExt def get_classes(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[str]: for node in engine.get_new_object_names(fn_node): - for name in get_imports(engine.get_range(node), engine.namespaces, engine): + for name in get_imports(engine.get_str(node), engine.namespaces, engine): yield name def _extract_classes(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: for node in engine.get_new_object_names(fn_node): - for name in 
get_imports(engine.get_range(node), engine.namespaces, engine): + for name in get_imports(engine.get_str(node), engine.namespaces, engine): yield API(engine.language_toolkit.format_imported_class(name)), engine.get_address(node) @@ -72,18 +72,18 @@ def _extract_properties( def _extract_static_methods(node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: - for name in get_imports(engine.get_range(node), engine.namespaces, engine): + for name in get_imports(engine.get_str(node), engine.namespaces, engine): yield API(engine.language_toolkit.format_imported_function(name)), engine.get_address(node) def get_property_name(node: Node, engine: TreeSitterExtractorEngine) -> str: - qualified_names = engine.language_toolkit.split_name(engine.get_range(node)) + qualified_names = engine.language_toolkit.split_name(engine.get_str(node)) if len(qualified_names) == 1: return qualified_names[0] return engine.language_toolkit.join_names(*qualified_names[1:]) -def _extract_regular_methods( +def _extract_instance_methods( node: Node, classes: set[BaseNamespace], engine: TreeSitterExtractorEngine ) -> Iterator[Tuple[Feature, Address]]: direct_method_call_node = engine.get_direct_method_call(node) @@ -106,7 +106,7 @@ def _extract_function_calls( ) -> Iterator[Tuple[Feature, Address]]: for node in engine.get_function_call_names(fn_node): yield from _extract_static_methods(node, engine) - yield from _extract_regular_methods(node, classes, engine) + yield from _extract_instance_methods(node, classes, engine) def _extract_imported_constants(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: diff --git a/capa/features/extractors/ts/signatures/cs.json b/capa/features/extractors/ts/signatures/cs.json index f3d5f0530..2f4f0dfca 100644 --- a/capa/features/extractors/ts/signatures/cs.json +++ b/capa/features/extractors/ts/signatures/cs.json @@ -34,6 +34,7 @@ "System.Convert.ToBase64String", "System.Convert.FromBase64String", 
"System.Data.SqlClient.SqlCommand", + "System.Data.SqlClient.SqlCommand.ExecuteReader", "System.Data.SqlClient.SqlConnection", "System.Data.SqlClient.SqlConnection.Open", "System.Data.SqlClient.SqlDataAdapter", @@ -52,7 +53,13 @@ "System.Diagnostics.Process.StartInfo.UseShellExecute", "System.Diagnostics.Process.StartInfo.CreateNoWindow", "System.Diagnostics.Process.Start", + "System.Security.Cryptography.RSACryptoServiceProvider", + "System.Security.Cryptography.RSACryptoServiceProvider.Encrypt", + "System.Security.Cryptography.Rijndael", + "System.Security.Cryptography.Rijndael.Create", "System.Security.Cryptography.RijndaelManaged", + "System.Security.Cryptography.RijndaelManaged.CreateDecryptor", + "System.Security.Cryptography.RijndaelManaged.CreateEncryptor", "System.Security.Cryptography.CryptoStream", "System.Security.Cryptography.SHA1", "System.Security.Cryptography.SHA1CryptoServiceProvider", diff --git a/capa/features/extractors/ts/tools.py b/capa/features/extractors/ts/tools.py index f5a5beb5c..95bc880f6 100644 --- a/capa/features/extractors/ts/tools.py +++ b/capa/features/extractors/ts/tools.py @@ -73,11 +73,11 @@ def join_names(self, *args: str) -> str: def split_name(self, name: str) -> List[str]: return name.split(".") - def process_property(self, node: Node, name: str) -> Optional[str]: + def process_property(self, node: Node, name: str) -> str: if self.is_method_call(node): # yield only p.StartInfo but not p.Start() - return None + return "" if self.is_recursive_property(node): # yield only Current.Server.ClearError but not Current.Server and Current - return None + return "" return self.join_names(*self.split_name(name)[1:]) def process_imported_constant(self, node: Node, name: str) -> Optional[str]: @@ -128,7 +128,7 @@ def is_recursive_property(self, node: Node) -> bool: return node.parent.type == self.property_query_type @abc.abstractmethod - def is_import(self, name: str, namespace: BaseNamespace = None) -> bool: + def is_import(self, name: 
str, namespace: Optional[BaseNamespace] = None) -> bool: raise NotImplementedError() @abc.abstractmethod @@ -136,7 +136,7 @@ def create_namespace(self, name: str) -> BaseNamespace: raise NotImplementedError() @abc.abstractmethod - def process_namespace(self, node: Node, query_name: str, get_range: Callable) -> Iterator[BaseNamespace]: + def process_namespace(self, node: Node, query_name: str, get_str: Callable) -> Iterator[BaseNamespace]: raise NotImplementedError() @abc.abstractmethod @@ -152,7 +152,7 @@ class CSharpToolkit(LanguageToolkit): integer_prefixes: List[Tuple[Union[str, Tuple[str, ...]], int]] = [(("0x", "0X"), 16)] integer_suffixes: Tuple[str, ...] = ("u", "l") - def is_import(self, name: str, namespace: BaseNamespace = None) -> bool: + def is_import(self, name: str, namespace: Optional[BaseNamespace] = None) -> bool: if namespace: return self._is_import(namespace.join(name)) return self._is_import(name) @@ -160,8 +160,8 @@ def is_import(self, name: str, namespace: BaseNamespace = None) -> bool: def create_namespace(self, name: str) -> BaseNamespace: return CSharpNamespace(name) - def process_namespace(self, node: Node, query_name: str, get_range: Callable) -> Iterator[BaseNamespace]: - yield CSharpNamespace(get_range(node), node, "") + def process_namespace(self, node: Node, query_name: str, get_str: Callable) -> Iterator[BaseNamespace]: + yield CSharpNamespace(get_str(node), node, "") def get_default_namespaces(self, embedded: bool) -> set[BaseNamespace]: if embedded: @@ -181,7 +181,7 @@ class PythonToolkit(LanguageToolkit): ] integer_suffixes: Tuple[str, ...] 
= tuple() - def is_import(self, name: str, namespace: BaseNamespace = None) -> bool: + def is_import(self, name: str, namespace: Optional[BaseNamespace] = None) -> bool: if namespace: if namespace.alias: return self._is_import(name.replace(namespace.alias, namespace.name)) @@ -194,26 +194,22 @@ def create_namespace(self, name: str) -> BaseNamespace: def get_import_name(self, name: str, module_name: Optional[str] = None) -> str: return self.join_names(module_name, name) if module_name else name - def process_simple_import( - self, node: Node, get_range: Callable, module_name: Optional[str] = None - ) -> BaseNamespace: - return PythonImport(self.get_import_name(get_range(node), module_name), node) + def process_simple_import(self, node: Node, get_str: Callable, module_name: Optional[str] = None) -> BaseNamespace: + return PythonImport(self.get_import_name(get_str(node), module_name), node) - def process_aliased_import( - self, node: Node, get_range: Callable, module_name: Optional[str] = None - ) -> BaseNamespace: - name = self.get_import_name(get_range(node.get_child_by_field_name("name")), module_name) - alias = get_range(node.get_child_by_field_name("alias")) + def process_aliased_import(self, node: Node, get_str: Callable, module_name: Optional[str] = None) -> BaseNamespace: + name = self.get_import_name(get_str(node.get_child_by_field_name("name")), module_name) + alias = get_str(node.get_child_by_field_name("alias")) return PythonImport(name, node, alias) def process_imports( - self, nodes: List[Node], get_range: Callable, module_name: Optional[str] = None + self, nodes: List[Node], get_str: Callable, module_name: Optional[str] = None ) -> Iterator[BaseNamespace]: for import_node in nodes: if import_node.type == "dotted_name": - yield self.process_simple_import(import_node, get_range, module_name) + yield self.process_simple_import(import_node, get_str, module_name) elif import_node.type == "aliased_import": - yield self.process_aliased_import(import_node, 
get_range, module_name) + yield self.process_aliased_import(import_node, get_str, module_name) def get_wildcard_import(self, node: Node) -> Optional[Node]: for child_node in node.children: @@ -221,20 +217,20 @@ def get_wildcard_import(self, node: Node) -> Optional[Node]: return child_node return None - def process_import_from(self, node: Node, import_nodes: List[Node], get_range: Callable) -> Iterator[BaseNamespace]: - module_name, import_nodes = get_range(import_nodes[0]), import_nodes[1:] + def process_import_from(self, node: Node, import_nodes: List[Node], get_str: Callable) -> Iterator[BaseNamespace]: + module_name, import_nodes = get_str(import_nodes[0]), import_nodes[1:] wildcard_import = self.get_wildcard_import(node) if wildcard_import: - yield self.process_simple_import(wildcard_import, get_range, module_name) + yield self.process_simple_import(wildcard_import, get_str, module_name) else: - yield from self.process_imports(import_nodes, get_range, module_name) + yield from self.process_imports(import_nodes, get_str, module_name) - def process_namespace(self, node: Node, query_name: str, get_range: Callable) -> Iterator[BaseNamespace]: + def process_namespace(self, node: Node, query_name: str, get_str: Callable) -> Iterator[BaseNamespace]: import_nodes = [child_node for child_node in node.children if child_node.is_named] if query_name == "import_from": - yield from self.process_import_from(node, import_nodes, get_range) + yield from self.process_import_from(node, import_nodes, get_str) elif query_name == "import": - yield from self.process_imports(import_nodes, get_range) + yield from self.process_imports(import_nodes, get_str) def get_default_namespaces(self, embedded: bool) -> set[BaseNamespace]: return set() diff --git a/capa/rules.py b/capa/rules.py index 928d29b45..8688c8f06 100644 --- a/capa/rules.py +++ b/capa/rules.py @@ -91,6 +91,7 @@ class Scope(str, Enum): capa.features.common.OS, capa.features.common.Arch, capa.features.common.Format, + 
capa.features.common.ScriptLanguage, }, FILE_SCOPE: { capa.features.common.MatchedRule, @@ -121,6 +122,7 @@ class Scope(str, Enum): }, INSTRUCTION_SCOPE: { capa.features.common.MatchedRule, + capa.features.insn.Property, capa.features.insn.API, capa.features.insn.Number, capa.features.common.String, @@ -254,6 +256,8 @@ def parse_feature(key: str): # keep this in sync with supported features if key == "api": return capa.features.insn.API + if key == "property": + return capa.features.insn.Property elif key == "string": return capa.features.common.StringFactory elif key == "substring": diff --git a/tests/test_ts.py b/tests/test_ts.py index bfbfb0d04..959468c2d 100644 --- a/tests/test_ts.py +++ b/tests/test_ts.py @@ -19,7 +19,7 @@ ScriptLanguage, ) from capa.features.address import FileOffsetRangeAddress -from capa.features.extractors.script import LANG_CS, LANG_JS, LANG_PY, LANG_TEM, LANG_HTML +from capa.features.extractors.script import LANG_CS, LANG_JS, LANG_PY, LANG_TEM, LANG_HTML, LANGUAGE_FEATURE_FORMAT from capa.features.extractors.ts.query import QueryBinding, HTMLQueryBinding, TemplateQueryBinding from capa.features.extractors.ts.tools import LANGUAGE_TOOLKITS from capa.features.extractors.ts.engine import ( @@ -37,10 +37,10 @@ def do_test_ts_base_engine_init(engine: TreeSitterBaseEngine): assert isinstance(engine.tree, Tree) -def do_test_ts_base_engine_get_range( +def do_test_ts_base_engine_get_str( engine: TreeSitterBaseEngine, node: Node, expected_range: str, startswith: bool = False ): - assert engine.get_range(node).startswith(expected_range) if startswith else engine.get_range(node) == expected_range + assert engine.get_str(node).startswith(expected_range) if startswith else engine.get_str(node) == expected_range def do_test_ts_base_engine_get_address(engine: TreeSitterBaseEngine, node: Node): @@ -71,7 +71,7 @@ def do_test_ts_extractor_engine_init(engine: TreeSitterExtractorEngine, expected def do_test_ts_extractor_engine_get_address( engine: 
TreeSitterExtractorEngine, node: Node, expected_range: str, startswith: bool = False ): - assert engine.get_range(node).startswith(expected_range) if startswith else engine.get_range(node) == expected_range + assert engine.get_str(node).startswith(expected_range) if startswith else engine.get_str(node) == expected_range def do_test_ts_extractor_engine_get_new_objects( @@ -80,7 +80,7 @@ def do_test_ts_extractor_engine_get_new_objects( assert len(list(engine.get_new_object_names(root_node))) == len(expected) for node, (_, expected_name_range) in zip(engine.get_new_object_names(root_node), expected): assert isinstance(node, Node) - do_test_ts_base_engine_get_range(engine, node, expected_name_range) + do_test_ts_base_engine_get_str(engine, node, expected_name_range) do_test_ts_base_engine_get_address(engine, node) @@ -91,14 +91,14 @@ def do_test_ts_extractor_engine_get_function_definitions( assert len(list(engine.get_function_definitions(root_node))) == len(expected) for node, (expected_range, expected_name_range) in zip(engine.get_function_definitions(root_node), expected): assert isinstance(node, Node) - do_test_ts_base_engine_get_range(engine, node, expected_range, startswith=True) + do_test_ts_base_engine_get_str(engine, node, expected_range, startswith=True) do_test_ts_base_engine_get_address(engine, node) - do_test_ts_base_engine_get_range(engine, engine.get_function_definition_name(node), expected_name_range) + do_test_ts_base_engine_get_str(engine, engine.get_function_definition_name(node), expected_name_range) assert len(list(engine.get_function_definition_names(root_node))) == len(expected) for node, (_, expected_name_range) in zip(engine.get_function_definition_names(root_node), expected): assert isinstance(node, Node) - do_test_ts_base_engine_get_range(engine, node, expected_name_range) + do_test_ts_base_engine_get_str(engine, node, expected_name_range) do_test_ts_base_engine_get_address(engine, node) @@ -108,7 +108,7 @@ def 
do_test_ts_extractor_engine_get_function_calls( assert len(list(engine.get_function_call_names(root_node))) == len(expected) for node, (_, expected_id_range) in zip(engine.get_function_call_names(root_node), expected): assert isinstance(node, Node) - do_test_ts_base_engine_get_range(engine, node, expected_id_range) + do_test_ts_base_engine_get_str(engine, node, expected_id_range) do_test_ts_base_engine_get_address(engine, node) @@ -118,7 +118,7 @@ def do_test_ts_extractor_engine_get_string_literals( assert len(list(engine.get_string_literals(root_node))) == len(expected) for node, expected_range in zip(engine.get_string_literals(root_node), expected): assert isinstance(node, Node) - do_test_ts_base_engine_get_range(engine, node, expected_range) + do_test_ts_base_engine_get_str(engine, node, expected_range) do_test_ts_base_engine_get_address(engine, node) @@ -128,7 +128,7 @@ def do_test_ts_extractor_engine_get_integer_literals( assert len(list(engine.get_integer_literals(root_node))) == len(expected) for node, expected_range in zip(engine.get_integer_literals(root_node), expected): assert isinstance(node, Node) - do_test_ts_base_engine_get_range(engine, node, expected_range) + do_test_ts_base_engine_get_str(engine, node, expected_range) do_test_ts_base_engine_get_address(engine, node) @@ -137,7 +137,7 @@ def do_test_ts_extractor_engine_get_namespaces(engine: TreeSitterExtractorEngine assert len(list(engine.get_namespaces())) == len(expected) for (node, _), expected_range in zip(engine.get_namespaces(), expected): assert isinstance(node, Node) - do_test_ts_base_engine_get_range(engine, node, expected_range) + do_test_ts_base_engine_get_str(engine, node, expected_range) do_test_ts_base_engine_get_address(engine, node) @@ -145,7 +145,7 @@ def do_test_ts_extractor_engine_get_global_statements(engine: TreeSitterExtracto assert len(list(engine.get_global_statements())) == len(expected) for node, expected_range in zip(engine.get_global_statements(), expected): assert 
isinstance(node, Node) - do_test_ts_base_engine_get_range(engine, node, expected_range, startswith=True) + do_test_ts_base_engine_get_str(engine, node, expected_range, startswith=True) do_test_ts_base_engine_get_address(engine, node) @@ -927,8 +927,8 @@ def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine): [ ("cs_138cdc", "global", Arch(ARCH_ANY), True), ("cs_138cdc", "global", OS(OS_ANY), True), + ("cs_138cdc", "global", ScriptLanguage(LANGUAGE_FEATURE_FORMAT[LANG_CS]), True), ("cs_138cdc", "file", Format(FORMAT_SCRIPT), True), - ("cs_138cdc", "file", ScriptLanguage(LANG_CS), True), ("cs_138cdc", "file", Namespace("System"), True), ("cs_138cdc", "function=PSEUDO MAIN", String(""), True), ("cs_138cdc", "function=die", String("Not Found"), True), @@ -943,8 +943,8 @@ def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine): ), ("aspx_4f6fa6", "global", Arch(ARCH_ANY), True), ("aspx_4f6fa6", "global", OS(OS_ANY), True), + ("aspx_4f6fa6", "global", ScriptLanguage(LANGUAGE_FEATURE_FORMAT[LANG_CS]), True), ("aspx_4f6fa6", "file", Format(FORMAT_SCRIPT), True), - ("aspx_4f6fa6", "file", ScriptLanguage(LANG_CS), True), ("aspx_4f6fa6", "file", Namespace("System.Diagnostics"), True), ("aspx_4f6fa6", "file", Namespace("System.IO"), True), ("aspx_4f6fa6", "file", Namespace("System.IO.Compression"), True), @@ -958,8 +958,8 @@ def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine): ("aspx_4f6fa6", "function=base64encode", API("System.Convert::ToBase64String"), True), ("aspx_5f959f", "global", Arch(ARCH_ANY), True), ("aspx_5f959f", "global", OS(OS_ANY), True), + ("aspx_5f959f", "global", ScriptLanguage(LANGUAGE_FEATURE_FORMAT[LANG_CS]), True), ("aspx_5f959f", "file", Format(FORMAT_SCRIPT), True), - ("aspx_5f959f", "file", ScriptLanguage(LANG_CS), True), ("aspx_5f959f", "file", Namespace("System.Diagnostics"), True), ("aspx_5f959f", "file", Namespace("System.IO"), True), ("aspx_5f959f", "file", Namespace("System.Web.SessionState"), True), @@ -980,8 +980,8 @@ 
def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine): ("aspx_5f959f", "function=cmdExe_Click", String("
"), True), ("aspx_10162f", "global", Arch(ARCH_ANY), True), ("aspx_10162f", "global", OS(OS_ANY), True), + ("aspx_10162f", "global", ScriptLanguage(LANGUAGE_FEATURE_FORMAT[LANG_CS]), True), ("aspx_10162f", "file", Format(FORMAT_SCRIPT), True), - ("aspx_10162f", "file", ScriptLanguage(LANG_CS), True), ("aspx_10162f", "file", Namespace("System.IO"), True), ("aspx_10162f", "file", Namespace("System.Web.Security"), True), ("aspx_10162f", "function=PSEUDO MAIN", String("data"), True), @@ -1045,13 +1045,92 @@ def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine): ("aspx_10162f", "function=sizeFix", Number(2), True), ("aspx_10162f", "function=sizeFix", Substring("GB"), True), ("aspx_2b71dd", "global", Arch(ARCH_ANY), True), + ("aspx_2b71dd", "global", OS(OS_ANY), True), + ("aspx_2b71dd", "global", ScriptLanguage(LANGUAGE_FEATURE_FORMAT[LANG_CS]), True), + ("aspx_2b71dd", "file", Format(FORMAT_SCRIPT), True), + ("aspx_2b71dd", "file", Namespace("System.Diagnostics"), True), + ("aspx_2b71dd", "file", Namespace("System.IO"), True), + ("aspx_2b71dd", "function=ExcuteCmd", API("System.Diagnostics.ProcessStartInfo"), True), + ("aspx_2b71dd", "function=ExcuteCmd", String("cmd.exe"), True), + ("aspx_2b71dd", "function=ExcuteCmd", Substring("/c"), True), + ("aspx_2b71dd", "function=ExcuteCmd", API("System.Diagnostics.Process::Start"), True), + ("aspx_2b71dd", "function=ExcuteCmd", Property("System.Diagnostics.ProcessStartInfo::FileName"), True), + ("aspx_2b71dd", "function=ExcuteCmd", Property("System.Diagnostics.ProcessStartInfo::Arguments"), True), + ("aspx_2b71dd", "function=ExcuteCmd", Property("System.Diagnostics.ProcessStartInfo::UseShellExecute"), True), + ( + "aspx_2b71dd", + "function=ExcuteCmd", + Property("System.Diagnostics.ProcessStartInfo::RedirectStandardOutput"), + True, + ), ("aspx_f2bf20", "global", Arch(ARCH_ANY), True), ("aspx_f39dc0", "global", Arch(ARCH_ANY), True), ("aspx_ea2a01", "global", Arch(ARCH_ANY), True), ("aspx_6f3261", "global", 
Arch(ARCH_ANY), True), + ("aspx_6f3261", "global", OS(OS_ANY), True), + ("aspx_6f3261", "global", ScriptLanguage(LANGUAGE_FEATURE_FORMAT[LANG_CS]), True), + ("aspx_6f3261", "file", Format(FORMAT_SCRIPT), True), + ("aspx_6f3261", "file", Namespace("System.Data"), True), + ("aspx_6f3261", "file", Namespace("System.Data.SqlClient"), True), + ("aspx_6f3261", "function=PSEUDO MAIN", String("woanware"), True), + ("aspx_6f3261", "function=btnExecute_Click", API("System.Data.SqlClient.SqlConnection"), True), + ("aspx_6f3261", "function=btnExecute_Click", API("System.Data.SqlClient.SqlConnection::Open"), True), + ("aspx_6f3261", "function=btnExecute_Click", API("System.Data.SqlClient.SqlCommand"), True), + ("aspx_6f3261", "function=btnExecute_Click", API("System.Data.SqlClient.SqlCommand::ExecuteReader"), True), ("aspx_1f8f40", "global", Arch(ARCH_ANY), True), + ("aspx_1f8f40", "global", OS(OS_ANY), True), + ("aspx_1f8f40", "global", ScriptLanguage(LANGUAGE_FEATURE_FORMAT[LANG_CS]), True), + ("aspx_1f8f40", "file", Format(FORMAT_SCRIPT), True), + ("aspx_1f8f40", "file", Namespace("System.Reflection"), True), + ("aspx_1f8f40", "function=PSEUDO MAIN", API("System.Security.Cryptography.RijndaelManaged"), True), + ( + "aspx_1f8f40", + "function=PSEUDO MAIN", + API("System.Security.Cryptography.RijndaelManaged::CreateDecryptor"), + True, + ), ("aspx_2e8c7e", "global", Arch(ARCH_ANY), True), + ("aspx_2e8c7e", "global", OS(OS_ANY), True), + ("aspx_2e8c7e", "global", ScriptLanguage(LANGUAGE_FEATURE_FORMAT[LANG_CS]), True), + ("aspx_2e8c7e", "file", Format(FORMAT_SCRIPT), True), + ("aspx_2e8c7e", "file", Namespace("System.Diagnostics"), True), + ("aspx_2e8c7e", "file", Namespace("System.IO"), True), + ("aspx_2e8c7e", "function=ExecuteCommand", API("System.Diagnostics.ProcessStartInfo"), True), + ("aspx_2e8c7e", "function=ExecuteCommand", String("cmd.exe"), True), + ("aspx_2e8c7e", "function=ExecuteCommand", Substring("/c"), True), + ("aspx_2e8c7e", "function=ExecuteCommand", 
API("System.Diagnostics.Process::Start"), True), + ("aspx_2e8c7e", "function=ExecuteCommand", Property("System.Diagnostics.ProcessStartInfo::FileName"), True), + ("aspx_2e8c7e", "function=ExecuteCommand", Property("System.Diagnostics.ProcessStartInfo::Arguments"), True), + ( + "aspx_2e8c7e", + "function=ExecuteCommand", + Property("System.Diagnostics.ProcessStartInfo::UseShellExecute"), + True, + ), + ( + "aspx_2e8c7e", + "function=ExecuteCommand", + Property("System.Diagnostics.ProcessStartInfo::RedirectStandardOutput"), + True, + ), ("aspx_03bb5c", "global", Arch(ARCH_ANY), True), + ("aspx_03bb5c", "global", OS(OS_ANY), True), + ("aspx_03bb5c", "global", ScriptLanguage(LANGUAGE_FEATURE_FORMAT[LANG_CS]), True), + ("aspx_03bb5c", "file", Format(FORMAT_SCRIPT), True), + ("aspx_03bb5c", "file", Namespace("System.Diagnostics"), True), + ("aspx_03bb5c", "file", Namespace("System.IO"), True), + ("aspx_03bb5c", "function=PSEUDO MAIN", API("System.Diagnostics.ProcessStartInfo"), True), + ("aspx_03bb5c", "function=PSEUDO MAIN", API("System.Diagnostics.ProcessStartInfo"), True), + ("aspx_03bb5c", "function=PSEUDO MAIN", API("System.Diagnostics.Process::Start"), True), + ("aspx_03bb5c", "function=PSEUDO MAIN", Property("System.Diagnostics.ProcessStartInfo::FileName"), True), + ("aspx_03bb5c", "function=PSEUDO MAIN", Property("System.Diagnostics.ProcessStartInfo::Arguments"), True), + ("aspx_03bb5c", "function=PSEUDO MAIN", Property("System.Diagnostics.ProcessStartInfo::UseShellExecute"), True), + ( + "aspx_03bb5c", + "function=PSEUDO MAIN", + Property("System.Diagnostics.ProcessStartInfo::RedirectStandardOutput"), + True, + ), ("aspx_606dbf", "global", Arch(ARCH_ANY), True), ("aspx_f397cb", "global", Arch(ARCH_ANY), True), ("aspx_b4bb14", "global", Arch(ARCH_ANY), True), @@ -1063,8 +1142,8 @@ def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine): ("aspx_d460ca", "global", Arch(ARCH_ANY), True), ("py_7f9cd1", "global", Arch(ARCH_ANY), True), ("py_7f9cd1", "global", 
OS(OS_ANY), True), + ("py_7f9cd1", "global", ScriptLanguage(LANGUAGE_FEATURE_FORMAT[LANG_PY]), True), ("py_7f9cd1", "file", Format(FORMAT_SCRIPT), True), - ("py_7f9cd1", "file", ScriptLanguage(LANG_PY), True), ("py_7f9cd1", "file", Namespace("socket"), True), ("py_7f9cd1", "file", Namespace("threading.Timer"), True), ("py_7f9cd1", "file", Namespace("threading.Timer"), True), From 25cf09bee73b81af73610f5b7b53727fa3e6e7a1 Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Wed, 10 Aug 2022 10:03:20 -0400 Subject: [PATCH 50/51] Introduced auto-detection to template-script parsing, builtins namespace, and modified tests. --- capa/features/extractors/ts/autodetect.py | 10 ++++++++++ capa/features/extractors/ts/engine.py | 7 ++++++- capa/features/extractors/ts/function.py | 2 +- capa/features/extractors/ts/tools.py | 3 +++ tests/test_ts.py | 22 +++++++++++----------- 5 files changed, 31 insertions(+), 13 deletions(-) diff --git a/capa/features/extractors/ts/autodetect.py b/capa/features/extractors/ts/autodetect.py index a03ee2ff2..2883be9b8 100644 --- a/capa/features/extractors/ts/autodetect.py +++ b/capa/features/extractors/ts/autodetect.py @@ -34,6 +34,16 @@ def get_language_ts(buf: bytes) -> str: raise ValueError("failed to parse the language") +def get_template_language_ts(buf: bytes) -> str: + for language, ts_language in TS_LANGUAGES.items(): + if language in [LANG_TEM, LANG_HTML]: + continue + tree = _parse(ts_language, buf) + if tree and not _contains_errors(ts_language, tree.root_node): + return language + raise ValueError("failed to parse the language") + + def get_language_from_ext(path: str) -> str: if path.endswith(EXT_ASPX): return LANG_TEM diff --git a/capa/features/extractors/ts/engine.py b/capa/features/extractors/ts/engine.py index dc93ed2ac..e1be25330 100644 --- a/capa/features/extractors/ts/engine.py +++ b/capa/features/extractors/ts/engine.py @@ -3,6 +3,7 @@ from tree_sitter import Node, Tree, Parser +import capa.features.extractors.ts.autodetect from 
capa.features.address import FileOffsetRangeAddress from capa.features.extractors.script import LANG_CS, LANG_JS, LANG_TEM, LANG_HTML from capa.features.extractors.ts.query import ( @@ -175,7 +176,11 @@ def identify_language(self) -> str: for node in self.get_code_sections(): if self.is_c_sharp(node): return LANG_CS - return LANG_JS + try: + return capa.features.extractors.ts.autodetect.get_template_language_ts(self.get_byte_range(node)) + except: + continue + raise ValueError(f"failed to identify the template language") def get_imported_namespaces(self) -> Iterator[BaseNamespace]: for node in self.get_code_sections(): diff --git a/capa/features/extractors/ts/function.py b/capa/features/extractors/ts/function.py index 3857c17f7..5ae0a3ba3 100644 --- a/capa/features/extractors/ts/function.py +++ b/capa/features/extractors/ts/function.py @@ -43,7 +43,7 @@ def extract_integers(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterat def get_imports(name: str, namespaces: set[BaseNamespace], engine: TreeSitterExtractorEngine) -> Iterator[str]: if engine.language_toolkit.is_builtin(name): - yield name + yield engine.language_toolkit.get_builtin_name(name) if engine.language_toolkit.is_import(name): yield name for namespace in namespaces: diff --git a/capa/features/extractors/ts/tools.py b/capa/features/extractors/ts/tools.py index 95bc880f6..9d72ee8c0 100644 --- a/capa/features/extractors/ts/tools.py +++ b/capa/features/extractors/ts/tools.py @@ -67,6 +67,9 @@ def _is_import(self, name: str) -> bool: def is_builtin(self, func: str) -> bool: return func in self.import_signatures["builtins"] + def get_builtin_name(self, func: str) -> str: + return self.join_names("builtins", func) + def join_names(self, *args: str) -> str: return ".".join(args) diff --git a/tests/test_ts.py b/tests/test_ts.py index 959468c2d..f4572b836 100644 --- a/tests/test_ts.py +++ b/tests/test_ts.py @@ -1147,10 +1147,10 @@ def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine): 
("py_7f9cd1", "file", Namespace("socket"), True), ("py_7f9cd1", "file", Namespace("threading.Timer"), True), ("py_7f9cd1", "file", Namespace("threading.Timer"), True), - ("py_7f9cd1", "function=icloud_phish", API("subprocess.Popen"), True), - ("py_7f9cd1", "function=icloud_phish", API("urllib2.Request"), True), - ("py_7f9cd1", "function=icloud_phish", API("base64.encodestring"), True), - ("py_7f9cd1", "function=icloud_phish", API("urllib2.urlopen"), True), + ("py_7f9cd1", "function=icloud_phish", API("subprocess::Popen"), True), + ("py_7f9cd1", "function=icloud_phish", API("urllib2::Request"), True), + ("py_7f9cd1", "function=icloud_phish", API("base64::encodestring"), True), + ("py_7f9cd1", "function=icloud_phish", API("urllib2::urlopen"), True), ("py_7f9cd1", "function=get_itunes_backups", String("IMEI"), True), ("py_7f9cd1", "function=PSEUDO MAIN", String("[I] "), True), ("py_7f9cd1", "function=PSEUDO MAIN", Substring("[!]"), True), @@ -1159,22 +1159,22 @@ def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine): ("py_ca0df6", "file", Namespace("win32com.client"), True), ("py_ca0df6", "file", Namespace("shutil"), True), ("py_ca0df6", "function=PSEUDO MAIN", API("os::environ"), True), - ("py_ca0df6", "function=yut", API("shutil.copytree"), True), - ("py_ca0df6", "function=yut", API("os.getcwd"), True), - ("py_ca0df6", "function=takk", API("win32com.client.Dispatch"), True), + ("py_ca0df6", "function=yut", API("shutil::copytree"), True), + ("py_ca0df6", "function=yut", API("os::getcwd"), True), + ("py_ca0df6", "function=takk", API("win32com.client::Dispatch"), True), ("py_ca0df6", "function=takk", String("Schedule.Service"), True), ("py_ca0df6", "function=takk", Substring("Updatewmplayer.exe"), True), - ("py_ca0df6", "function=llp", API("win32api.SetFileAttributes"), True), + ("py_ca0df6", "function=llp", API("win32api::SetFileAttributes"), True), ("py_ca0df6", "function=llp", Substring("KMPlayer"), True), - ("py_ca0df6", "function=fop", API("os.remove"), 
True), + ("py_ca0df6", "function=fop", API("os::remove"), True), ("py_ca0df6", "function=fop", Substring("Projec.exe"), True), - ("py_ca0df6", "function=htr", API("time.sleep"), True), + ("py_ca0df6", "function=htr", API("time::sleep"), True), ("py_ca0df6", "function=htr", Number(30), True), ("py_ca0df6", "function=htr", Number(25), True), ("py_ca0df6", "function=htr", Number(10), True), ("py_ca0df6", "function=vul", Number(5), True), ("py_ca0df6", "function=vul", Number(1), True), - ("py_ca0df6", "function=vul", API("os.popen"), True), + ("py_ca0df6", "function=vul", API("os::popen"), True), ("py_ca0df6", "function=vul", String("Updatewmplayer"), True), ("py_ca0df6", "function=vul", Substring("SCHTASKS"), True), ("py_ca0df6", "function=llp", API("win32con::FILE_ATTRIBUTE_HIDDEN"), True), From e69357303c52a81df3ecece25b3344d17693277f Mon Sep 17 00:00:00 2001 From: Adam Storek Date: Fri, 12 Aug 2022 12:09:45 -0400 Subject: [PATCH 51/51] Attempted to implement the class extraction as specified last Friday (passes all test cases but by no means perfect); further clean up, especially of the signatures; synced with new Python test cases. 
--- capa/features/extractors/ts/function.py | 121 ++++++++++++------ capa/features/extractors/ts/global_.py | 8 +- .../features/extractors/ts/signatures/cs.json | 71 +++++----- .../features/extractors/ts/signatures/py.json | 29 +++-- capa/features/extractors/ts/tools.py | 59 ++++++--- tests/data | 2 +- tests/test_ts.py | 50 +++++--- 7 files changed, 217 insertions(+), 123 deletions(-) diff --git a/capa/features/extractors/ts/function.py b/capa/features/extractors/ts/function.py index 5ae0a3ba3..854f116af 100644 --- a/capa/features/extractors/ts/function.py +++ b/capa/features/extractors/ts/function.py @@ -1,10 +1,11 @@ +import itertools from typing import Tuple, Iterator from dataclasses import dataclass from tree_sitter import Node from capa.features.insn import API, Number, Property -from capa.features.common import String, Feature +from capa.features.common import Class, String, Feature, Namespace from capa.features.address import Address from capa.features.extractors.ts.tools import BaseNamespace from capa.features.extractors.ts.engine import TreeSitterExtractorEngine @@ -41,64 +42,100 @@ def extract_integers(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterat continue -def get_imports(name: str, namespaces: set[BaseNamespace], engine: TreeSitterExtractorEngine) -> Iterator[str]: - if engine.language_toolkit.is_builtin(name): - yield engine.language_toolkit.get_builtin_name(name) - if engine.language_toolkit.is_import(name): - yield name +def get_possible_full_names(name: str, namespaces: set[BaseNamespace]) -> Iterator[str]: + yield name for namespace in namespaces: - if engine.language_toolkit.is_import(name, namespace): - yield namespace.join(name) + yield namespace.join(name) + + +def get_default_constructor(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[str]: + for name_node in engine.get_new_object_names(fn_node): + for full_name in get_possible_full_names(engine.get_str(name_node), engine.namespaces): + if 
engine.language_toolkit.is_imported_class(full_name): + yield full_name + + +def get_custom_constructor(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[str]: + for name_node in engine.get_function_call_names(fn_node): + for full_name in get_possible_full_names(engine.get_str(name_node), engine.namespaces): + if engine.language_toolkit.is_imported_constructor(full_name): + yield full_name def get_classes(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[str]: - for node in engine.get_new_object_names(fn_node): - for name in get_imports(engine.get_str(node), engine.namespaces, engine): - yield name + yield from get_default_constructor(fn_node, engine) + yield from get_custom_constructor(fn_node, engine) + + +def _extract_default_constructor(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for name_node in engine.get_new_object_names(fn_node): + for full_name in get_possible_full_names(engine.get_str(name_node), engine.namespaces): + if engine.language_toolkit.is_imported_class(full_name): + yield Namespace(full_name), engine.get_address(name_node) + yield Class(engine.language_toolkit.format_imported_class(full_name)), engine.get_address(name_node) + yield API(engine.language_toolkit.format_imported_default_constructor(full_name)), engine.get_address( + name_node + ) + + +def _extract_custom_constructor(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for name_node in engine.get_function_call_names(fn_node): + for full_name in get_possible_full_names(engine.get_str(name_node), engine.namespaces): + if engine.language_toolkit.is_imported_constructor(full_name): + yield Namespace(full_name), engine.get_address(name_node) + yield Class(engine.language_toolkit.format_imported_class(full_name)), engine.get_address(name_node) + yield API(engine.language_toolkit.format_imported_custom_constructor(full_name)), engine.get_address( + name_node + ) def 
_extract_classes(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: - for node in engine.get_new_object_names(fn_node): - for name in get_imports(engine.get_str(node), engine.namespaces, engine): - yield API(engine.language_toolkit.format_imported_class(name)), engine.get_address(node) + yield from _extract_default_constructor(fn_node, engine) + yield from _extract_custom_constructor(fn_node, engine) + + +def _extract_constants(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + for ic_node, ic_name in engine.get_processed_imported_constants(fn_node): + for full_name in get_possible_full_names(ic_name, engine.namespaces): + if engine.language_toolkit.is_imported_constant(full_name): + yield API(engine.language_toolkit.format_imported_constant(full_name)), engine.get_address(ic_node) def _extract_properties( fn_node: Node, classes: set[BaseNamespace], engine: TreeSitterExtractorEngine ) -> Iterator[Tuple[Feature, Address]]: for pt_node, pt_name in engine.get_processed_property_names(fn_node): - for name in get_imports(pt_name, classes, engine): - yield Property(engine.language_toolkit.format_imported_property(name)), engine.get_address(pt_node) + for full_name in get_possible_full_names(pt_name, classes): + if engine.language_toolkit.is_imported_property(full_name): + yield Property(engine.language_toolkit.format_imported_property(full_name)), engine.get_address(pt_node) def _extract_static_methods(node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: - for name in get_imports(engine.get_str(node), engine.namespaces, engine): - yield API(engine.language_toolkit.format_imported_function(name)), engine.get_address(node) + if engine.language_toolkit.is_builtin(engine.get_str(node)): + yield API(engine.language_toolkit.get_builtin_name(engine.get_str(node))), engine.get_address(node) + for full_name in get_possible_full_names(engine.get_str(node), 
engine.namespaces): + if engine.language_toolkit.is_imported_function(full_name): + yield API(engine.language_toolkit.format_imported_function(full_name)), engine.get_address(node) -def get_property_name(node: Node, engine: TreeSitterExtractorEngine) -> str: - qualified_names = engine.language_toolkit.split_name(engine.get_str(node)) - if len(qualified_names) == 1: - return qualified_names[0] - return engine.language_toolkit.join_names(*qualified_names[1:]) +def _do_extract_instance_methods( + node: Node, classes: set[BaseNamespace], engine: TreeSitterExtractorEngine +) -> Iterator[Tuple[Feature, Address]]: + for full_name in get_possible_full_names( + engine.language_toolkit.get_member_from_name(engine.get_str(node)), classes + ): + if engine.language_toolkit.is_imported_function(full_name): + yield API(engine.language_toolkit.format_imported_function(full_name)), engine.get_address(node) def _extract_instance_methods( node: Node, classes: set[BaseNamespace], engine: TreeSitterExtractorEngine ) -> Iterator[Tuple[Feature, Address]]: - direct_method_call_node = engine.get_direct_method_call(node) - node = node if direct_method_call_node is None else direct_method_call_node - property_name = get_property_name(node, engine) - for name in get_imports(property_name, classes, engine): - yield API(engine.language_toolkit.format_imported_function(name)), engine.get_address(node) - - -def extract_api(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: - classes = {engine.language_toolkit.create_namespace(cls) for cls in get_classes(fn_node, engine)} - yield from _extract_classes(fn_node, engine) - yield from _extract_imported_constants(fn_node, engine) - yield from _extract_function_calls(fn_node, classes, engine) - yield from _extract_properties(fn_node, classes, engine) + direct_method_call_node = engine.get_direct_method_call(node) # eg new Foo.Bar().direct_method_call(x, y, 3) + if direct_method_call_node: + yield from 
_do_extract_instance_methods(direct_method_call_node, classes, engine) + else: + yield from _do_extract_instance_methods(node, classes, engine) def _extract_function_calls( @@ -109,10 +146,12 @@ def _extract_function_calls( yield from _extract_instance_methods(node, classes, engine) -def _extract_imported_constants(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: - for ic_node, ic_name in engine.get_processed_imported_constants(fn_node): - for name in get_imports(ic_name, engine.namespaces, engine): - yield API(engine.language_toolkit.format_imported_constant(name)), engine.get_address(ic_node) +def extract_imports(fn_node: Node, engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: + classes = {engine.language_toolkit.create_namespace(cls) for cls in get_classes(fn_node, engine)} + yield from _extract_classes(fn_node, engine) + yield from _extract_constants(fn_node, engine) + yield from _extract_properties(fn_node, classes, engine) + yield from _extract_function_calls(fn_node, classes, engine) def _extract_pseudo_main_features(engine: TreeSitterExtractorEngine) -> Iterator[Tuple[Feature, Address]]: @@ -134,7 +173,7 @@ def extract_features(fh: FunctionHandle, engine: TreeSitterExtractorEngine) -> I FUNCTION_HANDLERS = ( - extract_api, + extract_imports, extract_integers, extract_strings, ) diff --git a/capa/features/extractors/ts/global_.py b/capa/features/extractors/ts/global_.py index c0bab903b..3ea55879d 100644 --- a/capa/features/extractors/ts/global_.py +++ b/capa/features/extractors/ts/global_.py @@ -13,14 +13,14 @@ def extract_os() -> Iterator[Tuple[Feature, Address]]: yield from capa.features.extractors.script.extract_os() +def extract_file_format() -> Iterator[Tuple[Feature, Address]]: + yield from capa.features.extractors.script.extract_format() + + def extract_features() -> Iterator[Tuple[Feature, Address]]: for glob_handler in GLOBAL_HANDLERS: for feature, addr in glob_handler(): yield feature, 
addr -def extract_file_format() -> Iterator[Tuple[Feature, Address]]: - yield from capa.features.extractors.script.extract_format() - - GLOBAL_HANDLERS = (extract_arch, extract_os, extract_file_format) diff --git a/capa/features/extractors/ts/signatures/cs.json b/capa/features/extractors/ts/signatures/cs.json index 2f4f0dfca..07ce4ee3c 100644 --- a/capa/features/extractors/ts/signatures/cs.json +++ b/capa/features/extractors/ts/signatures/cs.json @@ -1,7 +1,30 @@ { - "imports": - [ + "classes" : [ + "System.Data.SqlClient.SqlCommand", + "System.Data.SqlClient.SqlConnection", + "System.Data.SqlClient.SqlDataAdapter", + "System.Diagnostics.Process", + "System.Diagnostics.ProcessStartInfo", "System.IO.DirectoryInfo", + "System.Security.Cryptography.CryptoStream", + "System.Security.Cryptography.Rijndael", + "System.Security.Cryptography.RijndaelManaged", + "System.Security.Cryptography.RSACryptoServiceProvider", + "System.Security.Cryptography.SHA1", + "System.Security.Cryptography.SHA1CryptoServiceProvider", + "System.Security.Cryptography.SHA256", + "System.Security.Cryptography.SHA256CryptoServiceProvider" + ], + "constructors" : [ + "System.Security.Cryptography.Rijndael.Create" + ], + "functions": + [ + "System.Convert.ToBase64String", + "System.Convert.FromBase64String", + "System.Data.SqlClient.SqlCommand.ExecuteReader", + "System.Data.SqlClient.SqlConnection.Open", + "System.Diagnostics.Process.Start", "System.IO.Directory.CreateDirectory", "System.IO.File.Delete", "System.IO.File.Write", @@ -31,43 +54,27 @@ "System.IO.File.WriteLines", "System.IO.File.WriteLinesAsync", "System.IO.Path.GetTempPath", - "System.Convert.ToBase64String", - "System.Convert.FromBase64String", - "System.Data.SqlClient.SqlCommand", - "System.Data.SqlClient.SqlCommand.ExecuteReader", - "System.Data.SqlClient.SqlConnection", - "System.Data.SqlClient.SqlConnection.Open", - "System.Data.SqlClient.SqlDataAdapter", - "System.Diagnostics.Process", - "System.Diagnostics.ProcessStartInfo", - 
"System.Diagnostics.ProcessStartInfo.FileName", - "System.Diagnostics.ProcessStartInfo.Arguments", - "System.Diagnostics.ProcessStartInfo.RedirectStandardInput", - "System.Diagnostics.ProcessStartInfo.RedirectStandardOutput", - "System.Diagnostics.ProcessStartInfo.UseShellExecute", - "System.Diagnostics.ProcessStartInfo.CreateNoWindow", + "System.Security.Cryptography.RijndaelManaged.CreateDecryptor", + "System.Security.Cryptography.RijndaelManaged.CreateEncryptor", + "System.Security.Cryptography.RSACryptoServiceProvider.Encrypt", + "System.Security.Cryptography.SHA1CryptoServiceProvider.ComputeHash", + "System.Security.Cryptography.SHA256CryptoServiceProvider.ComputeHash" + ], + "properties": [ "System.Diagnostics.Process.StartInfo.FileName", "System.Diagnostics.Process.StartInfo.Arguments", "System.Diagnostics.Process.StartInfo.RedirectStandardInput", "System.Diagnostics.Process.StartInfo.RedirectStandardOutput", "System.Diagnostics.Process.StartInfo.UseShellExecute", "System.Diagnostics.Process.StartInfo.CreateNoWindow", - "System.Diagnostics.Process.Start", - "System.Security.Cryptography.RSACryptoServiceProvider", - "System.Security.Cryptography.RSACryptoServiceProvider.Encrypt", - "System.Security.Cryptography.Rijndael", - "System.Security.Cryptography.Rijndael.Create", - "System.Security.Cryptography.RijndaelManaged", - "System.Security.Cryptography.RijndaelManaged.CreateDecryptor", - "System.Security.Cryptography.RijndaelManaged.CreateEncryptor", - "System.Security.Cryptography.CryptoStream", - "System.Security.Cryptography.SHA1", - "System.Security.Cryptography.SHA1CryptoServiceProvider", - "System.Security.Cryptography.SHA1CryptoServiceProvider.ComputeHash", - "System.Security.Cryptography.SHA256", - "System.Security.Cryptography.SHA256CryptoServiceProvider", - "System.Security.Cryptography.SHA256CryptoServiceProvider.ComputeHash" + "System.Diagnostics.ProcessStartInfo.FileName", + "System.Diagnostics.ProcessStartInfo.Arguments", + 
"System.Diagnostics.ProcessStartInfo.RedirectStandardInput", + "System.Diagnostics.ProcessStartInfo.RedirectStandardOutput", + "System.Diagnostics.ProcessStartInfo.UseShellExecute", + "System.Diagnostics.ProcessStartInfo.CreateNoWindow" ], + "constants": [], "builtins": [], "aspx_default_namespaces": [ diff --git a/capa/features/extractors/ts/signatures/py.json b/capa/features/extractors/ts/signatures/py.json index 21324b558..667ffe1db 100644 --- a/capa/features/extractors/ts/signatures/py.json +++ b/capa/features/extractors/ts/signatures/py.json @@ -1,17 +1,22 @@ { - "imports": [ - "socket", + "classes": [ + "socket.socket", "socket.error", + "urllib2.Request" + ], + "constructors": [ + "ssl.wrap_socket", + "win32com.client.Dispatch" + ], + "functions": [ "subprocess.Popen", "subprocess.PIPE", "urllib2.urlopen", - "urllib2.Request", "base64.encodestring", "base64.b64encode", "base64.b64decode", "os.chdir", "os.chmod", - "os.environ", "os.getcwd", "os.popen", "os.remove", @@ -20,12 +25,20 @@ "platform.mac_ver", "shutil.copytree", "time.sleep", - "win32com.client.Dispatch", - "win32con.FILE_ATTRIBUTE_HIDDEN", - "win32con.FILE_ATTRIBUTE_SYSTEM", "win32api.SetFileAttributes" - ], + "constants": [ + "os.environ", + "socket.AF_INET", + "socket.SOCK_STREAM", + "socket.SQL_SOCKET", + "socket.SO_REUSEADDR", + "ssl.PROTOCOL_TLSv1", + "ssl.CERT_NONE", + "win32con.FILE_ATTRIBUTE_HIDDEN", + "win32con.FILE_ATTRIBUTE_SYSTEM" + ], + "properties": [], "builtins": [ "eval", "exec", diff --git a/capa/features/extractors/ts/tools.py b/capa/features/extractors/ts/tools.py index 9d72ee8c0..1c1d9e48f 100644 --- a/capa/features/extractors/ts/tools.py +++ b/capa/features/extractors/ts/tools.py @@ -59,10 +59,29 @@ def __init__(self): def load_import_signatures(self, signature_file: str) -> Dict[str, set[str]]: signatures = json.loads(importlib.resources.read_text(capa.features.extractors.ts.signatures, signature_file)) - return {category: set(namespaces) for category, namespaces in 
signatures.items()} + return {category: set(names) for category, names in signatures.items()} - def _is_import(self, name: str) -> bool: - return name in self.import_signatures["imports"] + def get_full_name(self, name: str, namespace: Optional[BaseNamespace] = None) -> str: + if namespace: + if namespace.alias: + return name.replace(namespace.alias, namespace.name) + return namespace.join(name) + return name + + def is_imported_function(self, name: str, namespace: Optional[BaseNamespace] = None) -> bool: + return self.get_full_name(name, namespace) in self.import_signatures["functions"] + + def is_imported_class(self, name: str, namespace: Optional[BaseNamespace] = None) -> bool: + return self.get_full_name(name, namespace) in self.import_signatures["classes"] + + def is_imported_constructor(self, name: str, namespace: Optional[BaseNamespace] = None) -> bool: + return self.get_full_name(name, namespace) in self.import_signatures["constructors"] + + def is_imported_property(self, name: str, namespace: Optional[BaseNamespace] = None) -> bool: + return self.get_full_name(name, namespace) in self.import_signatures["properties"] + + def is_imported_constant(self, name: str, namespace: Optional[BaseNamespace] = None) -> bool: + return self.get_full_name(name, namespace) in self.import_signatures["constants"] def is_builtin(self, func: str) -> bool: return func in self.import_signatures["builtins"] @@ -90,6 +109,18 @@ def process_imported_constant(self, node: Node, name: str) -> Optional[str]: return None return name + def get_namespace_from_name(self, name: str) -> str: + qualified_names = self.split_name(name) + if len(qualified_names) < 2: + return "" + return self.join_names(*qualified_names[:-1]) + + def get_member_from_name(self, name: str) -> str: + qualified_names = self.split_name(name) + if len(qualified_names) < 2: + return qualified_names[0] + return self.join_names(*qualified_names[1:]) + def format_imported_class(self, name: str) -> str: return name @@ 
-106,6 +137,12 @@ def format_imported_class_members(self, name: str) -> str: def format_imported_function(self, name: str) -> str: return self.format_imported_class_members(name) + def format_imported_custom_constructor(self, name: str) -> str: + return self.format_imported_class_members(name) + + def format_imported_default_constructor(self, name: str) -> str: + return self.format_imported_function(self.join_names(name, "ctor")) + def format_imported_property(self, name: str) -> str: return self.format_imported_class_members(name) @@ -130,10 +167,6 @@ def is_method_call(self, node: Node) -> bool: def is_recursive_property(self, node: Node) -> bool: return node.parent.type == self.property_query_type - @abc.abstractmethod - def is_import(self, name: str, namespace: Optional[BaseNamespace] = None) -> bool: - raise NotImplementedError() - @abc.abstractmethod def create_namespace(self, name: str) -> BaseNamespace: raise NotImplementedError() @@ -155,11 +188,6 @@ class CSharpToolkit(LanguageToolkit): integer_prefixes: List[Tuple[Union[str, Tuple[str, ...]], int]] = [(("0x", "0X"), 16)] integer_suffixes: Tuple[str, ...] = ("u", "l") - def is_import(self, name: str, namespace: Optional[BaseNamespace] = None) -> bool: - if namespace: - return self._is_import(namespace.join(name)) - return self._is_import(name) - def create_namespace(self, name: str) -> BaseNamespace: return CSharpNamespace(name) @@ -184,13 +212,6 @@ class PythonToolkit(LanguageToolkit): ] integer_suffixes: Tuple[str, ...] 
= tuple() - def is_import(self, name: str, namespace: Optional[BaseNamespace] = None) -> bool: - if namespace: - if namespace.alias: - return self._is_import(name.replace(namespace.alias, namespace.name)) - return self._is_import(namespace.join(name)) - return self._is_import(name) - def create_namespace(self, name: str) -> BaseNamespace: return PythonImport(name) diff --git a/tests/data b/tests/data index 2e8257475..f032303b5 160000 --- a/tests/data +++ b/tests/data @@ -1 +1 @@ -Subproject commit 2e8257475ebfdc8808d7e180be9a3f94977fcf57 +Subproject commit f032303b50d0a4225fc436d35f0d8b215751f9aa diff --git a/tests/test_ts.py b/tests/test_ts.py index f4572b836..c29f93f18 100644 --- a/tests/test_ts.py +++ b/tests/test_ts.py @@ -12,6 +12,7 @@ ARCH_ANY, FORMAT_SCRIPT, Arch, + Class, Format, String, Namespace, @@ -933,8 +934,10 @@ def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine): ("cs_138cdc", "function=PSEUDO MAIN", String(""), True), ("cs_138cdc", "function=die", String("Not Found"), True), ("cs_138cdc", "function=Page_Load", String("127.0.0.1"), True), - ("cs_138cdc", "function=Page_Load", API("System.Diagnostics.ProcessStartInfo"), True), - ("cs_138cdc", "function=Page_Load", API("System.Diagnostics.Process"), True), + ("cs_138cdc", "function=Page_Load", Class("System.Diagnostics.ProcessStartInfo"), True), + ("cs_138cdc", "function=Page_Load", API("System.Diagnostics.ProcessStartInfo::ctor"), True), + ("cs_138cdc", "function=Page_Load", Class("System.Diagnostics.Process"), True), + ("cs_138cdc", "function=Page_Load", API("System.Diagnostics.Process::ctor"), True), ( "cs_138cdc", "function=Page_Load", @@ -950,7 +953,8 @@ def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine): ("aspx_4f6fa6", "file", Namespace("System.IO.Compression"), True), ("aspx_4f6fa6", "function=do_ps", String("powershell.exe"), True), ("aspx_4f6fa6", "function=do_ps", Substring("-executionpolicy bypass"), True), - ("aspx_4f6fa6", "function=do_ps", 
API("System.Diagnostics.ProcessStartInfo"), True), + ("aspx_4f6fa6", "function=do_ps", Class("System.Diagnostics.ProcessStartInfo"), True), + ("aspx_4f6fa6", "function=do_ps", API("System.Diagnostics.ProcessStartInfo::ctor"), True), ("aspx_4f6fa6", "function=do_ps", API("System.Diagnostics.Process::Start"), True), ("aspx_4f6fa6", "function=ps", String("\\nPS> "), True), ("aspx_4f6fa6", "function=ps", Substring("PS>"), True), @@ -963,7 +967,8 @@ def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine): ("aspx_5f959f", "file", Namespace("System.Diagnostics"), True), ("aspx_5f959f", "file", Namespace("System.IO"), True), ("aspx_5f959f", "file", Namespace("System.Web.SessionState"), True), - ("aspx_5f959f", "function=ExcuteCmd", API("System.Diagnostics.ProcessStartInfo"), True), + ("aspx_5f959f", "function=ExcuteCmd", Class("System.Diagnostics.ProcessStartInfo"), True), + ("aspx_5f959f", "function=ExcuteCmd", API("System.Diagnostics.ProcessStartInfo::ctor"), True), ("aspx_5f959f", "function=ExcuteCmd", String("cmd.exe"), True), ("aspx_5f959f", "function=ExcuteCmd", Substring("/c"), True), ("aspx_5f959f", "function=ExcuteCmd", API("System.Diagnostics.Process::Start"), True), @@ -1014,12 +1019,15 @@ def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine): ("aspx_10162f", "function=h", API("System.Convert::FromBase64String"), True), ("aspx_10162f", "function=d", API("System.IO.File::Delete"), True), ("aspx_10162f", "function=d", API("System.IO.File::Delete"), True), - ("aspx_10162f", "function=sq", API("System.Data.SqlClient.SqlConnection"), True), - ("aspx_10162f", "function=sq", API("System.Data.SqlClient.SqlConnection"), True), - ("aspx_10162f", "function=sq", API("System.Data.SqlClient.SqlCommand"), True), - ("aspx_10162f", "function=sq", API("System.Data.SqlClient.SqlDataAdapter"), True), + ("aspx_10162f", "function=sq", Class("System.Data.SqlClient.SqlConnection"), True), + ("aspx_10162f", "function=sq", API("System.Data.SqlClient.SqlConnection::ctor"), 
True), + ("aspx_10162f", "function=sq", Class("System.Data.SqlClient.SqlCommand"), True), + ("aspx_10162f", "function=sq", API("System.Data.SqlClient.SqlCommand::ctor"), True), + ("aspx_10162f", "function=sq", Class("System.Data.SqlClient.SqlDataAdapter"), True), + ("aspx_10162f", "function=sq", API("System.Data.SqlClient.SqlDataAdapter::ctor"), True), ("aspx_10162f", "function=sq", API("System.Data.SqlClient.SqlConnection::Open"), True), - ("aspx_10162f", "function=exec", API("System.Diagnostics.Process"), True), + ("aspx_10162f", "function=exec", Class("System.Diagnostics.Process"), True), + ("aspx_10162f", "function=exec", API("System.Diagnostics.Process::ctor"), True), ("aspx_10162f", "function=exec", String("cmd.exe"), True), ("aspx_10162f", "function=exec", Property("System.Diagnostics.Process.StartInfo::FileName"), True), ("aspx_10162f", "function=exec", Property("System.Diagnostics.Process.StartInfo::UseShellExecute"), True), @@ -1035,7 +1043,8 @@ def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine): ("aspx_10162f", "function=exp", Substring("root"), True), ("aspx_10162f", "function=exp", Substring("net use"), True), ("aspx_10162f", "function=exp", Number(2), True), - ("aspx_10162f", "function=exp", API("System.IO.DirectoryInfo"), True), + ("aspx_10162f", "function=exp", Class("System.IO.DirectoryInfo"), True), + ("aspx_10162f", "function=exp", API("System.IO.DirectoryInfo::ctor"), True), ("aspx_10162f", "function=exp", API("System.IO.File::GetAttributes"), True), ("aspx_10162f", "function=GetDirSize", Number(0), True), ("aspx_10162f", "function=createJsonDirectory", String('\\"dir\\":['), True), @@ -1050,7 +1059,8 @@ def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine): ("aspx_2b71dd", "file", Format(FORMAT_SCRIPT), True), ("aspx_2b71dd", "file", Namespace("System.Diagnostics"), True), ("aspx_2b71dd", "file", Namespace("System.IO"), True), - ("aspx_2b71dd", "function=ExcuteCmd", API("System.Diagnostics.ProcessStartInfo"), True), + 
("aspx_2b71dd", "function=ExcuteCmd", Class("System.Diagnostics.ProcessStartInfo"), True), + ("aspx_2b71dd", "function=ExcuteCmd", API("System.Diagnostics.ProcessStartInfo::ctor"), True), ("aspx_2b71dd", "function=ExcuteCmd", String("cmd.exe"), True), ("aspx_2b71dd", "function=ExcuteCmd", Substring("/c"), True), ("aspx_2b71dd", "function=ExcuteCmd", API("System.Diagnostics.Process::Start"), True), @@ -1073,16 +1083,19 @@ def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine): ("aspx_6f3261", "file", Namespace("System.Data"), True), ("aspx_6f3261", "file", Namespace("System.Data.SqlClient"), True), ("aspx_6f3261", "function=PSEUDO MAIN", String("woanware"), True), - ("aspx_6f3261", "function=btnExecute_Click", API("System.Data.SqlClient.SqlConnection"), True), + ("aspx_6f3261", "function=btnExecute_Click", Class("System.Data.SqlClient.SqlConnection"), True), + ("aspx_6f3261", "function=btnExecute_Click", API("System.Data.SqlClient.SqlConnection::ctor"), True), ("aspx_6f3261", "function=btnExecute_Click", API("System.Data.SqlClient.SqlConnection::Open"), True), - ("aspx_6f3261", "function=btnExecute_Click", API("System.Data.SqlClient.SqlCommand"), True), + ("aspx_6f3261", "function=btnExecute_Click", Class("System.Data.SqlClient.SqlCommand"), True), + ("aspx_6f3261", "function=btnExecute_Click", API("System.Data.SqlClient.SqlCommand::ctor"), True), ("aspx_6f3261", "function=btnExecute_Click", API("System.Data.SqlClient.SqlCommand::ExecuteReader"), True), ("aspx_1f8f40", "global", Arch(ARCH_ANY), True), ("aspx_1f8f40", "global", OS(OS_ANY), True), ("aspx_1f8f40", "global", ScriptLanguage(LANGUAGE_FEATURE_FORMAT[LANG_CS]), True), ("aspx_1f8f40", "file", Format(FORMAT_SCRIPT), True), ("aspx_1f8f40", "file", Namespace("System.Reflection"), True), - ("aspx_1f8f40", "function=PSEUDO MAIN", API("System.Security.Cryptography.RijndaelManaged"), True), + ("aspx_1f8f40", "function=PSEUDO MAIN", Class("System.Security.Cryptography.RijndaelManaged"), True), + 
("aspx_1f8f40", "function=PSEUDO MAIN", API("System.Security.Cryptography.RijndaelManaged::ctor"), True), ( "aspx_1f8f40", "function=PSEUDO MAIN", @@ -1095,7 +1108,8 @@ def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine): ("aspx_2e8c7e", "file", Format(FORMAT_SCRIPT), True), ("aspx_2e8c7e", "file", Namespace("System.Diagnostics"), True), ("aspx_2e8c7e", "file", Namespace("System.IO"), True), - ("aspx_2e8c7e", "function=ExecuteCommand", API("System.Diagnostics.ProcessStartInfo"), True), + ("aspx_2e8c7e", "function=ExecuteCommand", Class("System.Diagnostics.ProcessStartInfo"), True), + ("aspx_2e8c7e", "function=ExecuteCommand", API("System.Diagnostics.ProcessStartInfo::ctor"), True), ("aspx_2e8c7e", "function=ExecuteCommand", String("cmd.exe"), True), ("aspx_2e8c7e", "function=ExecuteCommand", Substring("/c"), True), ("aspx_2e8c7e", "function=ExecuteCommand", API("System.Diagnostics.Process::Start"), True), @@ -1119,8 +1133,8 @@ def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine): ("aspx_03bb5c", "file", Format(FORMAT_SCRIPT), True), ("aspx_03bb5c", "file", Namespace("System.Diagnostics"), True), ("aspx_03bb5c", "file", Namespace("System.IO"), True), - ("aspx_03bb5c", "function=PSEUDO MAIN", API("System.Diagnostics.ProcessStartInfo"), True), - ("aspx_03bb5c", "function=PSEUDO MAIN", API("System.Diagnostics.ProcessStartInfo"), True), + ("aspx_03bb5c", "function=PSEUDO MAIN", Class("System.Diagnostics.ProcessStartInfo"), True), + ("aspx_03bb5c", "function=PSEUDO MAIN", API("System.Diagnostics.ProcessStartInfo::ctor"), True), ("aspx_03bb5c", "function=PSEUDO MAIN", API("System.Diagnostics.Process::Start"), True), ("aspx_03bb5c", "function=PSEUDO MAIN", Property("System.Diagnostics.ProcessStartInfo::FileName"), True), ("aspx_03bb5c", "function=PSEUDO MAIN", Property("System.Diagnostics.ProcessStartInfo::Arguments"), True), @@ -1148,7 +1162,7 @@ def do_test_ts_html_engine_init(engine: TreeSitterHTMLEngine): ("py_7f9cd1", "file", 
Namespace("threading.Timer"), True), ("py_7f9cd1", "file", Namespace("threading.Timer"), True), ("py_7f9cd1", "function=icloud_phish", API("subprocess::Popen"), True), - ("py_7f9cd1", "function=icloud_phish", API("urllib2::Request"), True), + ("py_7f9cd1", "function=icloud_phish", Class("urllib2.Request"), True), ("py_7f9cd1", "function=icloud_phish", API("base64::encodestring"), True), ("py_7f9cd1", "function=icloud_phish", API("urllib2::urlopen"), True), ("py_7f9cd1", "function=get_itunes_backups", String("IMEI"), True),