Skip to content

Commit

Permalink
sequence: add sequence address
Browse files Browse the repository at this point in the history
contains the call ids for all the calls within the sequence, so we know
where to look for related matches.
  • Loading branch information
williballenthin committed Dec 18, 2024
1 parent 8fe6cb2 commit 6dde963
Show file tree
Hide file tree
Showing 11 changed files with 521 additions and 162 deletions.
31 changes: 28 additions & 3 deletions capa/capabilities/dynamic.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
import capa.render.result_document as rdoc
from capa.rules import Scope, RuleSet
from capa.engine import FeatureSet, MatchResults
from capa.features.address import _NoAddress
from capa.features.address import DynamicCallAddress, DynamicSequenceAddress, _NoAddress
from capa.capabilities.common import Capabilities, find_file_capabilities
from capa.features.extractors.base_extractor import CallHandle, ThreadHandle, ProcessHandle, DynamicFeatureExtractor

Expand Down Expand Up @@ -101,14 +101,17 @@ def find_thread_capabilities(
# With this approach, our algorithm performance is independent of SEQUENCE_SIZE.
# The naive algorithm, of merging all the trailing feature sets at each call, is dependent upon SEQUENCE_SIZE
# (that is, runtime gets slower the larger SEQUENCE_SIZE is).
sequence_call_addresses: collections.deque[DynamicCallAddress] = collections.deque(maxlen=SEQUENCE_SIZE)
sequence_feature_sets: collections.deque[FeatureSet] = collections.deque(maxlen=SEQUENCE_SIZE)
sequence_features: FeatureSet = collections.defaultdict(set)

# the names of rules matched at the last sequence,
# so that we can deduplicate long strings of the same match.
last_sequence_matches: set[str] = set()

call_count = 0
for ch in extractor.get_calls(ph, th):
call_count += 1
call_capabilities = find_call_capabilities(ruleset, extractor, ph, th, ch)
for feature, vas in call_capabilities.features.items():
features[feature].update(vas)
Expand All @@ -119,6 +122,12 @@ def find_thread_capabilities(
#
# sequence scope matching
#
sequence_call_addresses.append(ch.address)
# TODO: it would be nice to create this only when needed, since it generates garbage.
sequence_address = DynamicSequenceAddress(
th.address, id=ch.address.id, calls=tuple(address.id for address in sequence_call_addresses)
)

# As we add items to the end of the deque, overflow and drop the oldest items (at the left end).
# While we could rely on `deque.append` with `maxlen` set (which we provide above),
# we want to use the dropped item first, to remove the old features, so we manually pop it here.
Expand All @@ -144,8 +153,10 @@ def find_thread_capabilities(
for feature, vas in latest_features.items():
sequence_features[feature].update(vas)

_, smatches = ruleset.match(Scope.SEQUENCE, sequence_features, ch.address)
_, smatches = ruleset.match(Scope.SEQUENCE, sequence_features, sequence_address)
# TODO: if smatches: create the sequence location
for rule_name, res in smatches.items():
# TODO: maybe just garbage collect here better.
if rule_name in last_sequence_matches:
# don't emit match results for rules seen during the immediately preceding sequence.
#
Expand All @@ -172,6 +183,14 @@ def find_thread_capabilities(
for va, _ in res:
capa.engine.index_rule_matches(features, rule, [va])

logger.debug(
"analyzed thread %d[%d] with %d events, %d features, and %d matches",
th.address.process.pid,
th.address.tid,
call_count,
len(features),
len(matches) + len(sequence_matches) + len(call_matches),
)
return ThreadCapabilities(features, matches, sequence_matches, call_matches)


Expand Down Expand Up @@ -224,6 +243,13 @@ def find_process_capabilities(
process_features[feature].add(va)

_, process_matches = ruleset.match(Scope.PROCESS, process_features, ph.address)

logger.debug(
"analyzed process %d and extracted %d features with %d matches",
ph.address.pid,
len(process_features),
len(process_matches),
)
return ProcessCapabilities(process_matches, thread_matches, sequence_matches, call_matches, len(process_features))


Expand Down Expand Up @@ -252,7 +278,6 @@ def find_dynamic_capabilities(
address=frz.Address.from_capa(p.address), count=process_capabilities.feature_count
),
)
logger.debug("analyzed %s and extracted %d features", p.address, process_capabilities.feature_count)

for rule_name, res in process_capabilities.process_matches.items():
all_process_matches[rule_name].extend(res)
Expand Down
30 changes: 28 additions & 2 deletions capa/features/address.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,11 +104,37 @@ def __repr__(self):
return f"{self.thread}, call(id: {self.id})"

def __hash__(self):
return hash((self.thread, self.id))
return hash(("call", self.thread, self.id))

def __eq__(self, other):
return isinstance(other, DynamicCallAddress) and (self.thread, self.id) == (other.thread, other.id)

def __lt__(self, other):
assert isinstance(other, DynamicCallAddress)
return (self.thread, self.id) == (other.thread, other.id)
return (self.thread, self.id) < (other.thread, other.id)


class DynamicSequenceAddress(Address):
"""addresses a sequence in a dynamic execution trace"""

def __init__(self, thread: ThreadAddress, id: int, calls: tuple[int, ...]):
assert id >= 0
self.thread = thread
# ID of the call that identifies this sequence
self.id = id
# tuple of call IDs contained within this sequence.
# not required for identity, because the id + SEQUENCE_SIZE will dictate this.
self.calls = calls

def __repr__(self):
return f"{self.thread}, sequence(id: {self.id})"

def __hash__(self):
# calls not required for identity, because the id + SEQUENCE_SIZE will be sufficient.
return hash(("sequence", self.thread, self.id))

def __eq__(self, other):
return isinstance(other, DynamicSequenceAddress) and (self.thread, self.id) == (other.thread, other.id)

def __lt__(self, other):
assert isinstance(other, DynamicCallAddress)
Expand Down
29 changes: 28 additions & 1 deletion capa/features/freeze/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,13 +52,23 @@ class AddressType(str, Enum):
DN_TOKEN_OFFSET = "dn token offset"
PROCESS = "process"
THREAD = "thread"
SEQUENCE = "sequence"
CALL = "call"
NO_ADDRESS = "no address"


class Address(HashableModel):
type: AddressType
value: Union[int, tuple[int, ...], None] = None # None default value to support deserialization of NO_ADDRESS
value: Union[
# for absolute, relative, file
int,
# for DNToken, Process, Thread, Call
tuple[int, ...],
# for sequence
tuple[int, int, int, int, tuple[int, ...]],
# for NO_ADDRESS,
None,
] = None # None default value to support deserialization of NO_ADDRESS

@classmethod
def from_capa(cls, a: capa.features.address.Address) -> "Address":
Expand Down Expand Up @@ -86,6 +96,12 @@ def from_capa(cls, a: capa.features.address.Address) -> "Address":
elif isinstance(a, capa.features.address.DynamicCallAddress):
return cls(type=AddressType.CALL, value=(a.thread.process.ppid, a.thread.process.pid, a.thread.tid, a.id))

elif isinstance(a, capa.features.address.DynamicSequenceAddress):
return cls(
type=AddressType.SEQUENCE,
value=(a.thread.process.ppid, a.thread.process.pid, a.thread.tid, a.id, a.calls),
)

elif a == capa.features.address.NO_ADDRESS or isinstance(a, capa.features.address._NoAddress):
return cls(type=AddressType.NO_ADDRESS, value=None)

Expand Down Expand Up @@ -149,6 +165,17 @@ def to_capa(self) -> capa.features.address.Address:
id=id_,
)

elif self.type is AddressType.SEQUENCE:
assert isinstance(self.value, tuple)
ppid, pid, tid, id_, calls = self.value
return capa.features.address.DynamicSequenceAddress(
thread=capa.features.address.ThreadAddress(
process=capa.features.address.ProcessAddress(ppid=ppid, pid=pid), tid=tid
),
id=id_,
calls=calls,
)

elif self.type is AddressType.NO_ADDRESS:
return capa.features.address.NO_ADDRESS

Expand Down
27 changes: 27 additions & 0 deletions capa/render/proto/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,6 +135,25 @@ def addr_to_pb2(addr: frz.Address) -> capa_pb2.Address:
),
)

elif addr.type is AddressType.SEQUENCE:
assert isinstance(addr.value, tuple)
ppid, pid, tid, id_, calls = addr.value
assert isinstance(ppid, int)
assert isinstance(pid, int)
assert isinstance(tid, int)
assert isinstance(id_, int)
assert isinstance(calls, tuple)
return capa_pb2.Address(
type=capa_pb2.AddressType.ADDRESSTYPE_SEQUENCE,
ppid_pid_tid_id_calls=capa_pb2.Ppid_Pid_Tid_Id_Calls(
ppid=int_to_pb2(ppid),
pid=int_to_pb2(pid),
tid=int_to_pb2(tid),
id=int_to_pb2(id_),
calls=tuple(int_to_pb2(i) for i in calls),
),
)

elif addr.type is AddressType.NO_ADDRESS:
# value == None, so only set type
return capa_pb2.Address(type=capa_pb2.AddressType.ADDRESSTYPE_NO_ADDRESS)
Expand Down Expand Up @@ -630,6 +649,14 @@ def addr_from_pb2(addr: capa_pb2.Address) -> frz.Address:
id_ = int_from_pb2(addr.ppid_pid_tid_id.id)
return frz.Address(type=frz.AddressType.CALL, value=(ppid, pid, tid, id_))

elif addr.type == capa_pb2.AddressType.ADDRESSTYPE_SEQUENCE:
ppid = int_from_pb2(addr.ppid_pid_tid_id_calls.ppid)
pid = int_from_pb2(addr.ppid_pid_tid_id_calls.pid)
tid = int_from_pb2(addr.ppid_pid_tid_id_calls.tid)
id_ = int_from_pb2(addr.ppid_pid_tid_id_calls.id)
calls = tuple(int_from_pb2(i) for i in addr.ppid_pid_tid_id_calls.calls)
return frz.Address(type=frz.AddressType.SEQUENCE, value=(ppid, pid, tid, id_, calls))

elif addr.type == capa_pb2.AddressType.ADDRESSTYPE_NO_ADDRESS:
return frz.Address(type=frz.AddressType.NO_ADDRESS, value=None)

Expand Down
10 changes: 10 additions & 0 deletions capa/render/proto/capa.proto
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ message Address {
Ppid_Pid ppid_pid = 4;
Ppid_Pid_Tid ppid_pid_tid = 5;
Ppid_Pid_Tid_Id ppid_pid_tid_id = 6;
Ppid_Pid_Tid_Id_Calls ppid_pid_tid_id_calls = 7;
};
}

Expand All @@ -30,6 +31,7 @@ enum AddressType {
ADDRESSTYPE_PROCESS = 7;
ADDRESSTYPE_THREAD = 8;
ADDRESSTYPE_CALL = 9;
ADDRESSTYPE_SEQUENCE = 10;
}

message Analysis {
Expand Down Expand Up @@ -473,6 +475,14 @@ message Ppid_Pid_Tid_Id {
Integer id = 4;
}

// Address of a sequence in a dynamic execution trace (ADDRESSTYPE_SEQUENCE).
// Carries the process (ppid, pid) and thread (tid) the sequence belongs to,
// the id of the call that identifies the sequence, and the ids of the calls
// contained within the sequence.
message Ppid_Pid_Tid_Id_Calls {
Integer ppid = 1;
Integer pid = 2;
Integer tid = 3;
// id of the call that identifies this sequence
Integer id = 4;
// ids of the calls contained within this sequence
repeated Integer calls = 5;
}

message Integer { oneof value { uint64 u = 1; sint64 i = 2; } } // unsigned or signed int

message Number { oneof value { uint64 u = 1; sint64 i = 2; double f = 3; } }
301 changes: 157 additions & 144 deletions capa/render/proto/capa_pb2.py

Large diffs are not rendered by default.

45 changes: 42 additions & 3 deletions capa/render/proto/capa_pb2.pyi
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ class _AddressTypeEnumTypeWrapper(google.protobuf.internal.enum_type_wrapper._En
ADDRESSTYPE_PROCESS: _AddressType.ValueType # 7
ADDRESSTYPE_THREAD: _AddressType.ValueType # 8
ADDRESSTYPE_CALL: _AddressType.ValueType # 9
ADDRESSTYPE_SEQUENCE: _AddressType.ValueType # 10

class AddressType(_AddressType, metaclass=_AddressTypeEnumTypeWrapper): ...

Expand All @@ -48,6 +49,7 @@ ADDRESSTYPE_NO_ADDRESS: AddressType.ValueType # 6
ADDRESSTYPE_PROCESS: AddressType.ValueType # 7
ADDRESSTYPE_THREAD: AddressType.ValueType # 8
ADDRESSTYPE_CALL: AddressType.ValueType # 9
ADDRESSTYPE_SEQUENCE: AddressType.ValueType # 10
global___AddressType = AddressType

class _Flavor:
Expand Down Expand Up @@ -129,6 +131,7 @@ class Address(google.protobuf.message.Message):
PPID_PID_FIELD_NUMBER: builtins.int
PPID_PID_TID_FIELD_NUMBER: builtins.int
PPID_PID_TID_ID_FIELD_NUMBER: builtins.int
PPID_PID_TID_ID_CALLS_FIELD_NUMBER: builtins.int
type: global___AddressType.ValueType
@property
def v(self) -> global___Integer: ...
Expand All @@ -140,6 +143,8 @@ class Address(google.protobuf.message.Message):
def ppid_pid_tid(self) -> global___Ppid_Pid_Tid: ...
@property
def ppid_pid_tid_id(self) -> global___Ppid_Pid_Tid_Id: ...
@property
def ppid_pid_tid_id_calls(self) -> global___Ppid_Pid_Tid_Id_Calls: ...
def __init__(
self,
*,
Expand All @@ -149,10 +154,11 @@ class Address(google.protobuf.message.Message):
ppid_pid: global___Ppid_Pid | None = ...,
ppid_pid_tid: global___Ppid_Pid_Tid | None = ...,
ppid_pid_tid_id: global___Ppid_Pid_Tid_Id | None = ...,
ppid_pid_tid_id_calls: global___Ppid_Pid_Tid_Id_Calls | None = ...,
) -> None: ...
def HasField(self, field_name: typing.Literal["ppid_pid", b"ppid_pid", "ppid_pid_tid", b"ppid_pid_tid", "ppid_pid_tid_id", b"ppid_pid_tid_id", "token_offset", b"token_offset", "v", b"v", "value", b"value"]) -> builtins.bool: ...
def ClearField(self, field_name: typing.Literal["ppid_pid", b"ppid_pid", "ppid_pid_tid", b"ppid_pid_tid", "ppid_pid_tid_id", b"ppid_pid_tid_id", "token_offset", b"token_offset", "type", b"type", "v", b"v", "value", b"value"]) -> None: ...
def WhichOneof(self, oneof_group: typing.Literal["value", b"value"]) -> typing.Literal["v", "token_offset", "ppid_pid", "ppid_pid_tid", "ppid_pid_tid_id"] | None: ...
def HasField(self, field_name: typing.Literal["ppid_pid", b"ppid_pid", "ppid_pid_tid", b"ppid_pid_tid", "ppid_pid_tid_id", b"ppid_pid_tid_id", "ppid_pid_tid_id_calls", b"ppid_pid_tid_id_calls", "token_offset", b"token_offset", "v", b"v", "value", b"value"]) -> builtins.bool: ...
def ClearField(self, field_name: typing.Literal["ppid_pid", b"ppid_pid", "ppid_pid_tid", b"ppid_pid_tid", "ppid_pid_tid_id", b"ppid_pid_tid_id", "ppid_pid_tid_id_calls", b"ppid_pid_tid_id_calls", "token_offset", b"token_offset", "type", b"type", "v", b"v", "value", b"value"]) -> None: ...
def WhichOneof(self, oneof_group: typing.Literal["value", b"value"]) -> typing.Literal["v", "token_offset", "ppid_pid", "ppid_pid_tid", "ppid_pid_tid_id", "ppid_pid_tid_id_calls"] | None: ...

global___Address = Address

Expand Down Expand Up @@ -1817,6 +1823,39 @@ class Ppid_Pid_Tid_Id(google.protobuf.message.Message):

global___Ppid_Pid_Tid_Id = Ppid_Pid_Tid_Id

@typing.final
class Ppid_Pid_Tid_Id_Calls(google.protobuf.message.Message):
DESCRIPTOR: google.protobuf.descriptor.Descriptor

PPID_FIELD_NUMBER: builtins.int
PID_FIELD_NUMBER: builtins.int
TID_FIELD_NUMBER: builtins.int
ID_FIELD_NUMBER: builtins.int
CALLS_FIELD_NUMBER: builtins.int
@property
def ppid(self) -> global___Integer: ...
@property
def pid(self) -> global___Integer: ...
@property
def tid(self) -> global___Integer: ...
@property
def id(self) -> global___Integer: ...
@property
def calls(self) -> google.protobuf.internal.containers.RepeatedCompositeFieldContainer[global___Integer]: ...
def __init__(
self,
*,
ppid: global___Integer | None = ...,
pid: global___Integer | None = ...,
tid: global___Integer | None = ...,
id: global___Integer | None = ...,
calls: collections.abc.Iterable[global___Integer] | None = ...,
) -> None: ...
def HasField(self, field_name: typing.Literal["id", b"id", "pid", b"pid", "ppid", b"ppid", "tid", b"tid"]) -> builtins.bool: ...
def ClearField(self, field_name: typing.Literal["calls", b"calls", "id", b"id", "pid", b"pid", "ppid", b"ppid", "tid", b"tid"]) -> None: ...

global___Ppid_Pid_Tid_Id_Calls = Ppid_Pid_Tid_Id_Calls

@typing.final
class Integer(google.protobuf.message.Message):
DESCRIPTOR: google.protobuf.descriptor.Descriptor
Expand Down
Loading

0 comments on commit 6dde963

Please sign in to comment.