Commit 0f5d9f2
feat(reporting): report metadata about chunks.

Allow handlers to provide a dict as the ValidChunk metadata attribute. That dictionary can contain any metadata the handler considers relevant, but handler writers are advised to report parsed information such as header values. The metadata dict is carried into our ChunkReports and is available in the JSON report file if the user requested one. The goal is to expose metadata to further analysis steps through the unblob report; for example, a binary analysis toolkit could read the load address and architecture from a uImage chunk and analyze the file extracted from that chunk with the right settings.

A note on the 'as_dict' implementation: the initial idea was to implement it in dissect.cstruct (see fox-it/dissect.cstruct#29), but due to expected changes in that project's API it is implemented in unblob, so we do not depend on another project.
1 parent 10263b5 commit 0f5d9f2

File tree
5 files changed (+280, -60 lines)
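Before the per-file diffs, here is a minimal sketch of the handler-side usage described in the commit message. It mirrors the tests and the sevenzip change below; the uImage-style field names and values are hypothetical and not part of this commit:

    from unblob.models import ValidChunk

    # Handlers may attach a dict of parsed values; it ends up under the
    # "metadata" key of the chunk's ChunkReport in the JSON report.
    # Non-dict values are rejected (see test_invalid_metadata below).
    chunk = ValidChunk(
        start_offset=0,
        end_offset=0x4000,
        metadata={"load_address": 0x80000000, "architecture": "arm"},  # illustrative
    )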

tests/test_models.py (+71, -53)
@@ -4,7 +4,15 @@
 import pytest

 from unblob.file_utils import InvalidInputFormat
-from unblob.models import Chunk, ProcessResult, Task, TaskResult, UnknownChunk, to_json
+from unblob.models import (
+    Chunk,
+    ProcessResult,
+    Task,
+    TaskResult,
+    UnknownChunk,
+    ValidChunk,
+    to_json,
+)
 from unblob.report import (
     ChunkReport,
     ExtractCommandFailedReport,
@@ -153,56 +161,57 @@ def test_process_result_conversion(self):
         decoded_report = json.loads(json_text)
         assert decoded_report == [
             {
-                "__typename__": "TaskResult",
+                "task": {
+                    "path": "/nonexistent",
+                    "depth": 0,
+                    "blob_id": "",
+                    "is_multi_file": False,
+                    "__typename__": "Task",
+                },
                 "reports": [
                     {
-                        "__typename__": "StatReport",
+                        "path": "/nonexistent",
+                        "size": 384,
                         "is_dir": False,
                         "is_file": True,
                         "is_link": False,
                         "link_target": None,
-                        "path": "/nonexistent",
-                        "size": 384,
+                        "__typename__": "StatReport",
                     },
                     {
-                        "__typename__": "FileMagicReport",
                         "magic": "Zip archive data, at least v2.0 to extract",
                         "mime_type": "application/zip",
+                        "__typename__": "FileMagicReport",
                     },
                     {
-                        "__typename__": "HashReport",
                         "md5": "9019fcece2433ad7f12c077e84537a74",
                         "sha1": "36998218d8f43b69ef3adcadf2e8979e81eed166",
                         "sha256": "7d7ca7e1410b702b0f85d18257aebb964ac34f7fad0a0328d72e765bfcb21118",
+                        "__typename__": "HashReport",
                     },
                     {
-                        "__typename__": "ChunkReport",
-                        "end_offset": 384,
-                        "extraction_reports": [],
-                        "handler_name": "zip",
                         "id": "test_basic_conversion:id",
-                        "is_encrypted": False,
-                        "size": 384,
+                        "handler_name": "zip",
                         "start_offset": 0,
+                        "end_offset": 384,
+                        "size": 384,
+                        "is_encrypted": False,
+                        "metadata": {},
+                        "extraction_reports": [],
+                        "__typename__": "ChunkReport",
                     },
                 ],
                 "subtasks": [
                     {
-                        "__typename__": "Task",
-                        "blob_id": "test_basic_conversion:id",
+                        "path": "/extractions/nonexistent_extract",
                         "depth": 314,
+                        "blob_id": "test_basic_conversion:id",
                         "is_multi_file": False,
-                        "path": "/extractions/nonexistent_extract",
+                        "__typename__": "Task",
                     }
                 ],
-                "task": {
-                    "__typename__": "Task",
-                    "blob_id": "",
-                    "depth": 0,
-                    "is_multi_file": False,
-                    "path": "/nonexistent",
-                },
-            },
+                "__typename__": "TaskResult",
+            }
         ]

     def test_exotic_command_output(self):
@@ -218,35 +227,44 @@ def test_exotic_command_output(self):
         decoded_report = json.loads(json_text)

         assert decoded_report == {
-            "__typename__": "ExtractCommandFailedReport",
-            "command": "dump all bytes",
-            "exit_code": 1,
             "severity": "WARNING",
+            "command": "dump all bytes",
+            "stdout": "\x00\x01\x02\x03\x04\x05\x06\x07\x08"
+            "\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16"
+            "\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,"
+            "-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]"
+            "^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\udc80\udc81"
+            "\udc82\udc83\udc84\udc85\udc86\udc87\udc88\udc89"
+            "\udc8a\udc8b\udc8c\udc8d\udc8e\udc8f\udc90\udc91"
+            "\udc92\udc93\udc94\udc95\udc96\udc97\udc98\udc99"
+            "\udc9a\udc9b\udc9c\udc9d\udc9e\udc9f\udca0\udca1"
+            "\udca2\udca3\udca4\udca5\udca6\udca7\udca8\udca9"
+            "\udcaa\udcab\udcac\udcad\udcae\udcaf\udcb0\udcb1"
+            "\udcb2\udcb3\udcb4\udcb5\udcb6\udcb7\udcb8\udcb9"
+            "\udcba\udcbb\udcbc\udcbd\udcbe\udcbf\udcc0\udcc1"
+            "\udcc2\udcc3\udcc4\udcc5\udcc6\udcc7\udcc8\udcc9"
+            "\udcca\udccb\udccc\udccd\udcce\udccf\udcd0\udcd1"
+            "\udcd2\udcd3\udcd4\udcd5\udcd6\udcd7\udcd8\udcd9"
+            "\udcda\udcdb\udcdc\udcdd\udcde\udcdf\udce0\udce1"
+            "\udce2\udce3\udce4\udce5\udce6\udce7\udce8\udce9"
+            "\udcea\udceb\udcec\udced\udcee\udcef\udcf0\udcf1"
+            "\udcf2\udcf3\udcf4\udcf5\udcf6\udcf7\udcf8\udcf9"
+            "\udcfa\udcfb\udcfc\udcfd\udcfe\udcff",
             "stderr": "stdout is pretty strange ;)",
-            "stdout": (
-                "b'\\x00\\x01\\x02\\x03\\x04\\x05\\x06\\x07"
-                "\\x08\\t\\n\\x0b\\x0c\\r\\x0e\\x0f"
-                "\\x10\\x11\\x12\\x13\\x14\\x15\\x16\\x17"
-                '\\x18\\x19\\x1a\\x1b\\x1c\\x1d\\x1e\\x1f !"#'
-                "$%&\\'()*+,-./0123456789:;<=>?@AB"
-                "CDEFGHIJKLMNOPQRSTUVWXYZ[\\\\]^_`a"
-                "bcdefghijklmnopqrstuvwxyz{|}~\\x7f"
-                "\\x80\\x81\\x82\\x83\\x84\\x85\\x86\\x87"
-                "\\x88\\x89\\x8a\\x8b\\x8c\\x8d\\x8e\\x8f"
-                "\\x90\\x91\\x92\\x93\\x94\\x95\\x96\\x97"
-                "\\x98\\x99\\x9a\\x9b\\x9c\\x9d\\x9e\\x9f"
-                "\\xa0\\xa1\\xa2\\xa3\\xa4\\xa5\\xa6\\xa7"
-                "\\xa8\\xa9\\xaa\\xab\\xac\\xad\\xae\\xaf"
-                "\\xb0\\xb1\\xb2\\xb3\\xb4\\xb5\\xb6\\xb7"
-                "\\xb8\\xb9\\xba\\xbb\\xbc\\xbd\\xbe\\xbf"
-                "\\xc0\\xc1\\xc2\\xc3\\xc4\\xc5\\xc6\\xc7"
-                "\\xc8\\xc9\\xca\\xcb\\xcc\\xcd\\xce\\xcf"
-                "\\xd0\\xd1\\xd2\\xd3\\xd4\\xd5\\xd6\\xd7"
-                "\\xd8\\xd9\\xda\\xdb\\xdc\\xdd\\xde\\xdf"
-                "\\xe0\\xe1\\xe2\\xe3\\xe4\\xe5\\xe6\\xe7"
-                "\\xe8\\xe9\\xea\\xeb\\xec\\xed\\xee\\xef"
-                "\\xf0\\xf1\\xf2\\xf3\\xf4\\xf5\\xf6\\xf7"
-                "\\xf8\\xf9\\xfa\\xfb\\xfc\\xfd\\xfe\\xff"
-                "'"
-            ),
+            "exit_code": 1,
+            "__typename__": "ExtractCommandFailedReport",
         }
+
+    @pytest.mark.parametrize(
+        "metadata",
+        [
+            pytest.param(1, id="metadata_int"),
+            pytest.param(0.2, id="metadata_float"),
+            pytest.param(True, id="metadata_bool"),
+            pytest.param([1, 2], id="metadata_list"),
+            pytest.param((1, 2), id="metadata_tuple"),
+        ],
+    )
+    def test_invalid_metadata(self, metadata):
+        with pytest.raises(ValueError, match="Can only convert dict or Instance"):
+            ValidChunk(start_offset=0, end_offset=100, metadata=metadata)
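The change to unblob/models.py itself is not part of this excerpt, but the error message exercised by test_invalid_metadata outlines the conversion helper the commit message calls 'as_dict'. A hedged sketch of that behaviour follows; the function name is made up here, and the dissect.cstruct Instance branch is left out because its implementation is not shown in this diff:

    def as_dict_sketch(metadata):
        # Plain dicts pass through untouched.
        if isinstance(metadata, dict):
            return metadata
        # The real helper also converts a dissect.cstruct Instance (a parsed
        # header struct) into a plain dict; anything else is rejected.
        raise ValueError("Can only convert dict or Instance")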

tests/test_report.py (+173)
@@ -11,6 +11,7 @@
 from unblob.processing import ExtractionConfig, process_file
 from unblob.report import (
     ChunkReport,
+    ExtractCommandFailedReport,
     FileMagicReport,
     HashReport,
     StatReport,
@@ -48,6 +49,178 @@ def test_process_file_report_output_is_valid_json(
     assert len(report)


+class Test_ProcessResult_to_json:  # noqa: N801
+    def test_simple_conversion(self):
+        task = Task(path=Path("/nonexistent"), depth=0, blob_id="")
+        task_result = TaskResult(task)
+        chunk_id = "test_basic_conversion:id"
+
+        task_result.add_report(
+            StatReport(
+                path=task.path,
+                size=384,
+                is_dir=False,
+                is_file=True,
+                is_link=False,
+                link_target=None,
+            )
+        )
+        task_result.add_report(
+            FileMagicReport(
+                magic="Zip archive data, at least v2.0 to extract",
+                mime_type="application/zip",
+            )
+        )
+        task_result.add_report(
+            HashReport(
+                md5="9019fcece2433ad7f12c077e84537a74",
+                sha1="36998218d8f43b69ef3adcadf2e8979e81eed166",
+                sha256="7d7ca7e1410b702b0f85d18257aebb964ac34f7fad0a0328d72e765bfcb21118",
+            )
+        )
+        task_result.add_report(
+            ChunkReport(
+                id=chunk_id,
+                handler_name="zip",
+                start_offset=0,
+                end_offset=384,
+                size=384,
+                is_encrypted=False,
+                metadata={},
+                extraction_reports=[],
+            )
+        )
+        task_result.add_subtask(
+            Task(
+                path=Path("/extractions/nonexistent_extract"),
+                depth=314,
+                blob_id=chunk_id,
+            )
+        )
+
+        json_text = ProcessResult(results=[task_result]).to_json()
+
+        # output must be a valid json string
+        assert isinstance(json_text, str)
+
+        # that can be loaded back
+        decoded_report = json.loads(json_text)
+        assert decoded_report == [
+            {
+                "task": {
+                    "path": "/nonexistent",
+                    "depth": 0,
+                    "blob_id": "",
+                    "is_multi_file": False,
+                    "__typename__": "Task",
+                },
+                "reports": [
+                    {
+                        "path": "/nonexistent",
+                        "size": 384,
+                        "is_dir": False,
+                        "is_file": True,
+                        "is_link": False,
+                        "link_target": None,
+                        "__typename__": "StatReport",
+                    },
+                    {
+                        "magic": "Zip archive data, at least v2.0 to extract",
+                        "mime_type": "application/zip",
+                        "__typename__": "FileMagicReport",
+                    },
+                    {
+                        "md5": "9019fcece2433ad7f12c077e84537a74",
+                        "sha1": "36998218d8f43b69ef3adcadf2e8979e81eed166",
+                        "sha256": "7d7ca7e1410b702b0f85d18257aebb964ac34f7fad0a0328d72e765bfcb21118",
+                        "__typename__": "HashReport",
+                    },
+                    {
+                        "id": "test_basic_conversion:id",
+                        "handler_name": "zip",
+                        "start_offset": 0,
+                        "end_offset": 384,
+                        "size": 384,
+                        "is_encrypted": False,
+                        "metadata": {},
+                        "extraction_reports": [],
+                        "__typename__": "ChunkReport",
+                    },
+                ],
+                "subtasks": [
+                    {
+                        "path": "/extractions/nonexistent_extract",
+                        "depth": 314,
+                        "blob_id": "test_basic_conversion:id",
+                        "is_multi_file": False,
+                        "__typename__": "Task",
+                    }
+                ],
+                "__typename__": "TaskResult",
+            }
+        ]
+
+    def test_exotic_command_output(self):
+        task = Task(path=Path("/nonexistent"), depth=0, blob_id="")
+        task_result = TaskResult(task)
+        report = ExtractCommandFailedReport(
+            command="dump all bytes",
+            stdout=bytes(range(256)),
+            stderr=b"stdout is pretty strange ;)",
+            exit_code=1,
+        )
+
+        task_result.add_report(
+            ChunkReport(
+                id="test",
+                handler_name="fail",
+                start_offset=0,
+                end_offset=256,
+                size=256,
+                is_encrypted=False,
+                extraction_reports=[report],
+            )
+        )
+        json_text = ProcessResult(results=[task_result]).to_json()
+
+        decoded_report = json.loads(json_text)
+        assert decoded_report == [
+            {
+                "task": {
+                    "path": "/nonexistent",
+                    "depth": 0,
+                    "blob_id": "",
+                    "is_multi_file": False,
+                    "__typename__": "Task",
+                },
+                "reports": [
+                    {
+                        "id": "test",
+                        "handler_name": "fail",
+                        "start_offset": 0,
+                        "end_offset": 256,
+                        "size": 256,
+                        "is_encrypted": False,
+                        "metadata": {},
+                        "extraction_reports": [
+                            {
+                                "severity": "WARNING",
+                                "command": "dump all bytes",
+                                "stdout": "\x00\x01\x02\x03\x04\x05\x06\x07\x08\t\n\x0b\x0c\r\x0e\x0f\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_`abcdefghijklmnopqrstuvwxyz{|}~\x7f\udc80\udc81\udc82\udc83\udc84\udc85\udc86\udc87\udc88\udc89\udc8a\udc8b\udc8c\udc8d\udc8e\udc8f\udc90\udc91\udc92\udc93\udc94\udc95\udc96\udc97\udc98\udc99\udc9a\udc9b\udc9c\udc9d\udc9e\udc9f\udca0\udca1\udca2\udca3\udca4\udca5\udca6\udca7\udca8\udca9\udcaa\udcab\udcac\udcad\udcae\udcaf\udcb0\udcb1\udcb2\udcb3\udcb4\udcb5\udcb6\udcb7\udcb8\udcb9\udcba\udcbb\udcbc\udcbd\udcbe\udcbf\udcc0\udcc1\udcc2\udcc3\udcc4\udcc5\udcc6\udcc7\udcc8\udcc9\udcca\udccb\udccc\udccd\udcce\udccf\udcd0\udcd1\udcd2\udcd3\udcd4\udcd5\udcd6\udcd7\udcd8\udcd9\udcda\udcdb\udcdc\udcdd\udcde\udcdf\udce0\udce1\udce2\udce3\udce4\udce5\udce6\udce7\udce8\udce9\udcea\udceb\udcec\udced\udcee\udcef\udcf0\udcf1\udcf2\udcf3\udcf4\udcf5\udcf6\udcf7\udcf8\udcf9\udcfa\udcfb\udcfc\udcfd\udcfe\udcff",
+                                "stderr": "stdout is pretty strange ;)",
+                                "exit_code": 1,
+                                "__typename__": "ExtractCommandFailedReport",
+                            }
+                        ],
+                        "__typename__": "ChunkReport",
+                    }
+                ],
+                "subtasks": [],
+                "__typename__": "TaskResult",
+            }
+        ]
+
+
 @pytest.fixture
 def hello_kitty(tmp_path: Path) -> Path:
     """Generate an input file with 3 unknown chunks and 2 zip files."""

unblob/handlers/archive/sevenzip.py (+10, -2)
@@ -19,8 +19,9 @@
 """
 import binascii
 from pathlib import Path
-from typing import Optional
+from typing import Dict, Optional

+from dissect.cstruct import Instance
 from structlog import get_logger

 from unblob.extractors import Command
@@ -89,14 +90,21 @@ class SevenZipHandler(StructHandler):
     HEADER_STRUCT = HEADER_STRUCT
     EXTRACTOR = Command("7z", "x", "-p", "-y", "{inpath}", "-o{outdir}")

+    def get_metadata(self, header: Instance) -> Dict:
+        return {"version_maj": header.version_maj, "version_min": header.version_min}
+
     def calculate_chunk(self, file: File, start_offset: int) -> Optional[ValidChunk]:
         header = self.parse_header(file)

         check_header_crc(header)

         size = calculate_sevenzip_size(header)

-        return ValidChunk(start_offset=start_offset, end_offset=start_offset + size)
+        metadata = self.get_metadata(header)
+
+        return ValidChunk(
+            start_offset=start_offset, end_offset=start_offset + size, metadata=metadata
+        )


 class MultiVolumeSevenZipHandler(DirectoryHandler):
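With this change, a 7-Zip chunk entry in the JSON report should carry the parsed header version, roughly as in the fragment below (values illustrative, the handler_name value assumed, other ChunkReport fields omitted):

    {
        "handler_name": "sevenzip",
        "metadata": {"version_maj": 0, "version_min": 4},
        "__typename__": "ChunkReport",
    }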
