Skip to content

Commit a4eb8ca

Browse files
committed
feat(reporting): report meta-data information about chunks.
Allow handlers to provide a dict value through the metadata attribute of a ValidChunk. That dictionary can contain any metadata the handler considers relevant, but we advise handler writers to report parsed information such as header values. The metadata dict is later included in our ChunkReports and is available in the JSON report file if the user requested one. The idea is to expose metadata to further analysis steps through the unblob report. For example, a binary analysis toolkit could read the load address and architecture from a uImage chunk in order to analyze the file extracted from that chunk with the right settings. A note on the 'as_dict' implementation: the initial idea was to implement it in dissect.cstruct (see fox-it/dissect.cstruct#29), but because of expected changes in that project's API I chose to implement it in unblob, so that we do not depend on another project.
1 parent c5837b7 commit a4eb8ca

File tree

5 files changed

+75
-50
lines changed

5 files changed

+75
-50
lines changed

tests/test_models.py

+15-1
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import pytest
22

33
from unblob.file_utils import InvalidInputFormat
4-
from unblob.models import Chunk, UnknownChunk
4+
from unblob.models import Chunk, UnknownChunk, ValidChunk
55

66

77
class TestChunk:
@@ -47,3 +47,17 @@ def test_contains_offset(self, chunk, offset, expected):
4747
def test_validation(self, start_offset, end_offset):
4848
with pytest.raises(InvalidInputFormat):
4949
Chunk(start_offset, end_offset)
50+
51+
@pytest.mark.parametrize(
    "metadata",
    [1, 0.2, True, [1, 2], (1, 2)],
    ids=[
        "metadata_int",
        "metadata_float",
        "metadata_bool",
        "metadata_list",
        "metadata_tuple",
    ],
)
def test_invalid_metadata(self, metadata):
    """Check that ValidChunk rejects metadata that is neither a dict nor an Instance."""
    # Building the chunk is expected to raise; the message is matched
    # against the ValueError text.
    with pytest.raises(ValueError, match="Can only convert dict or Instance"):
        ValidChunk(start_offset=0, end_offset=100, metadata=metadata)

tests/test_report.py

+40-43
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ def test_simple_conversion(self):
8686
end_offset=384,
8787
size=384,
8888
is_encrypted=False,
89+
metadata={},
8990
extraction_reports=[],
9091
)
9192
)
@@ -135,6 +136,7 @@ def test_simple_conversion(self):
135136
"handler_name": "zip",
136137
"chunk_id": "test_basic_conversion:id",
137138
"is_encrypted": False,
139+
"metadata": {},
138140
"size": 384,
139141
"start_offset": 0,
140142
},
@@ -180,63 +182,58 @@ def test_exotic_command_output(self):
180182
json_text = ProcessResult(results=[task_result]).to_json()
181183

182184
decoded_report = json.loads(json_text)
183-
184185
assert decoded_report == [
185186
{
186-
"__typename__": "TaskResult",
187+
"task": {
188+
"path": "/nonexistent",
189+
"depth": 0,
190+
"chunk_id": "",
191+
"__typename__": "Task",
192+
},
187193
"reports": [
188194
{
189-
"__typename__": "ChunkReport",
195+
"chunk_id": "test",
196+
"handler_name": "fail",
197+
"start_offset": 0,
190198
"end_offset": 256,
199+
"size": 256,
200+
"is_encrypted": False,
201+
"metadata": {},
191202
"extraction_reports": [
192203
{
193-
"__typename__": "ExtractCommandFailedReport",
194-
"command": "dump all bytes",
195-
"exit_code": 1,
196204
"severity": "WARNING",
205+
"command": "dump all bytes",
206+
"stdout": "\x00\x01\x02\x03\x04\x05\x06\x07\x08"
207+
"\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15"
208+
'\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !"#$'
209+
"%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQR"
210+
"STUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~"
211+
"\x7f\udc80\udc81\udc82\udc83\udc84\udc85\udc86"
212+
"\udc87\udc88\udc89\udc8a\udc8b\udc8c\udc8d\udc8e"
213+
"\udc8f\udc90\udc91\udc92\udc93\udc94\udc95\udc96"
214+
"\udc97\udc98\udc99\udc9a\udc9b\udc9c\udc9d\udc9e"
215+
"\udc9f\udca0\udca1\udca2\udca3\udca4\udca5\udca6"
216+
"\udca7\udca8\udca9\udcaa\udcab\udcac\udcad\udcae"
217+
"\udcaf\udcb0\udcb1\udcb2\udcb3\udcb4\udcb5\udcb6"
218+
"\udcb7\udcb8\udcb9\udcba\udcbb\udcbc\udcbd\udcbe"
219+
"\udcbf\udcc0\udcc1\udcc2\udcc3\udcc4\udcc5\udcc6"
220+
"\udcc7\udcc8\udcc9\udcca\udccb\udccc\udccd\udcce"
221+
"\udccf\udcd0\udcd1\udcd2\udcd3\udcd4\udcd5\udcd6"
222+
"\udcd7\udcd8\udcd9\udcda\udcdb\udcdc\udcdd\udcde"
223+
"\udcdf\udce0\udce1\udce2\udce3\udce4\udce5\udce6"
224+
"\udce7\udce8\udce9\udcea\udceb\udcec\udced\udcee"
225+
"\udcef\udcf0\udcf1\udcf2\udcf3\udcf4\udcf5\udcf6"
226+
"\udcf7\udcf8\udcf9\udcfa\udcfb\udcfc\udcfd\udcfe\udcff",
197227
"stderr": "stdout is pretty strange ;)",
198-
"stdout": (
199-
"b'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07"
200-
"\\x08\\t\\n\\x0b\\x0c\\r\\x0e\\x0f"
201-
"\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17"
202-
'\\x18\\x19\\x1a\\x1b\\x1c\\x1d\\x1e\\x1f !"#'
203-
"$%&\\'()*+,-./0123456789:;<=>?@AB"
204-
"CDEFGHIJKLMNOPQRSTUVWXYZ[\\\\]^_`a"
205-
"bcdefghijklmnopqrstuvwxyz{|}~\\x7f"
206-
"\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87"
207-
"\\x88\\x89\\x8a\\x8b\\x8c\\x8d\\x8e\\x8f"
208-
"\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97"
209-
"\\x98\\x99\\x9a\\x9b\\x9c\\x9d\\x9e\\x9f"
210-
"\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7"
211-
"\\xa8\\xa9\\xaa\\xab\\xac\\xad\\xae\\xaf"
212-
"\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
213-
"\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf"
214-
"\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5\\xc6\\xc7"
215-
"\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf"
216-
"\\xd0\\xd1\\xd2\\xd3\\xd4\\xd5\\xd6\\xd7"
217-
"\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf"
218-
"\\xe0\\xe1\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7"
219-
"\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
220-
"\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7"
221-
"\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd\\xfe\\xff"
222-
"'"
223-
),
228+
"exit_code": 1,
229+
"__typename__": "ExtractCommandFailedReport",
224230
}
225231
],
226-
"handler_name": "fail",
227-
"chunk_id": "test",
228-
"is_encrypted": False,
229-
"size": 256,
230-
"start_offset": 0,
232+
"__typename__": "ChunkReport",
231233
}
232234
],
233235
"subtasks": [],
234-
"task": {
235-
"__typename__": "Task",
236-
"chunk_id": "",
237-
"depth": 0,
238-
"path": "/nonexistent",
239-
},
236+
"__typename__": "TaskResult",
240237
}
241238
]
242239

unblob/handlers/archive/sevenzip.py

+3-1
Original file line numberDiff line numberDiff line change
@@ -70,4 +70,6 @@ def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]
7070
# We read the signature header here to get the offset to the header database
7171
first_db_header = start_offset + len(header) + header.next_header_offset
7272
end_offset = first_db_header + header.next_header_size
73-
return ValidChunk(start_offset=start_offset, end_offset=end_offset)
73+
return ValidChunk(
74+
start_offset=start_offset, end_offset=end_offset, metadata=header
75+
)

unblob/models.py

+16-5
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,10 @@
33
import json
44
from enum import Enum
55
from pathlib import Path
6-
from typing import List, Optional, Tuple, Type
6+
from typing import Dict, List, Optional, Tuple, Type, Union
77

88
import attr
9+
from dissect.cstruct import Instance
910
from structlog import get_logger
1011

1112
from .file_utils import Endian, File, InvalidInputFormat, StructParser
@@ -21,6 +22,17 @@
2122
#
2223

2324

25+
def metadata_converter(obj: Union[Dict, Instance]) -> dict:
    """Turn chunk metadata into a plain dict.

    A dict is returned as-is (the very same object, not a copy). An
    ``Instance`` is flattened into a new dict holding the key/value pairs
    of its ``_values`` mapping.

    Raises:
        ValueError: if ``obj`` is neither a dict nor an ``Instance``.
    """
    if isinstance(obj, dict):
        return obj
    if isinstance(obj, Instance):
        # _values is a private attribute of Instance; it is read directly
        # here, hence the lint suppression.
        return {name: value for name, value in obj._values.items()}  # noqa: SLF001
    raise ValueError("Can only convert dict or Instance")
34+
35+
2436
@attr.define(frozen=True)
2537
class Task:
2638
path: Path
@@ -88,6 +100,7 @@ class ValidChunk(Chunk):
88100

89101
handler: "Handler" = attr.ib(init=False, eq=False)
90102
is_encrypted: bool = attr.ib(default=False)
103+
metadata: dict = attr.ib(factory=dict, converter=metadata_converter)
91104

92105
def extract(self, inpath: Path, outdir: Path):
93106
if self.is_encrypted:
@@ -108,6 +121,7 @@ def as_report(self, extraction_reports: List[Report]) -> ChunkReport:
108121
size=self.size,
109122
handler_name=self.handler.NAME,
110123
is_encrypted=self.is_encrypted,
124+
metadata=self.metadata,
111125
extraction_reports=extraction_reports,
112126
)
113127

@@ -187,10 +201,7 @@ def default(self, obj):
187201
return str(obj)
188202

189203
if isinstance(obj, bytes):
190-
try:
191-
return obj.decode()
192-
except UnicodeDecodeError:
193-
return str(obj)
204+
return obj.decode("utf-8", errors="surrogateescape")
194205

195206
logger.error("JSONEncoder met a non-JSON encodable value", obj=obj)
196207
# the usual fail path of custom JSONEncoders is to call the parent and let it fail

unblob/report.py

+1
Original file line numberDiff line numberDiff line change
@@ -181,6 +181,7 @@ class ChunkReport(Report):
181181
end_offset: int
182182
size: int
183183
is_encrypted: bool
184+
metadata: dict = attr.ib(factory=dict)
184185
extraction_reports: List[Report]
185186

186187

0 commit comments

Comments
 (0)