Skip to content

Commit a0d80c3

Browse files
committed
simd-doc: verify newlines and proper end-of-endpoint in fallbacks
We weren't properly verifying that the full deserialization buffer was consumed through to the newline pivot in fallback handlers. Do so, and improve test coverage slightly. Also update simdjson usage to verify that a document extent ends in a newline. This is to catch cases like `{"doc":1} {"doc":2}\n` which are allowed by simd-json normally, but which are reject by fallback parsing. Note this will cause the fallback handler to be used without causing an explicit error if leading whitespace appears in a buffer, such as `{"doc":1}\n {"doc":2}\n". Other updates to the garden-path simdjson flow are not required because we give simdjson only complete chunks of documents with a trailing newline: it parses all documents of the input as a single operation and errors out if *any* documents fail to parse, triggering fallback handling.
1 parent f6e758a commit a0d80c3

File tree

3 files changed

+125
-77
lines changed

3 files changed

+125
-77
lines changed

crates/simd-doc/src/ffi/simd-doc.hpp

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -454,7 +454,7 @@ void parse_node(const Allocator &alloc, dom::element_type typ, dom::element elem
454454
class Parser
455455
{
456456
public:
457-
Parser(size_t capacity) : parser(capacity){};
457+
Parser(size_t capacity) : parser(capacity) {};
458458
void parse(const rust::Slice<const uint8_t> input, int64_t offset, const Allocator &alloc, HeapNode &node, Parsed &output);
459459
void transcode(const rust::Slice<const uint8_t> input, Transcoded &output);
460460

@@ -478,7 +478,13 @@ inline void Parser::parse(const rust::Slice<const uint8_t> input, int64_t offset
478478
{
479479
dom::element elem = *it;
480480
parse_node(alloc, elem.type(), elem, node);
481-
++it; // Step to next document.
481+
482+
++it; // Step to the next document and verify trailing newline.
483+
if (input.data()[it.current_index() - 1] != '\n')
484+
{
485+
throw std::out_of_range("missing trailing newline");
486+
}
487+
482488
complete(output, node, offset + static_cast<int64_t>(it.current_index()));
483489
}
484490

@@ -511,7 +517,12 @@ inline void Parser::transcode(const rust::Slice<const uint8_t> input, Transcoded
511517
dom::element elem = *it;
512518
pnode root = transcode_node(buf, elem.type(), elem);
513519
place_array(buf, &root, 1);
514-
++it; // Step to next document.
520+
521+
++it; // Step to the next document and verify trailing newline.
522+
if (input.data()[it.current_index() - 1] != '\n')
523+
{
524+
throw std::out_of_range("missing trailing newline");
525+
}
515526

516527
// Update and re-write header now that we know the next offset and length.
517528
header.u32.l = static_cast<uint32_t>(it.current_index());

crates/simd-doc/src/lib.rs

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -72,13 +72,15 @@ impl Parser {
7272
);
7373
let mut buf = std::mem::take(&mut self.whole);
7474
buf.extend_from_slice(input);
75+
buf.push(b'\n');
7576

7677
if let Err(err) = parse_simd(&mut buf, 0, alloc, &mut self.parsed, &mut self.ffi) {
7778
self.parsed.clear(); // Clear a partial simd parsing.
7879
tracing::debug!(%err, "simdjson JSON parse-one failed; using fallback");
7980

8081
let mut de = serde_json::Deserializer::from_slice(&buf);
8182
let node = doc::HeapNode::from_serde(&mut de, &alloc)?;
83+
() = de.end()?;
8284
self.parsed.push((node, 0));
8385
}
8486
let mut parsed = self.parsed.drain(..);
@@ -292,7 +294,10 @@ fn parse_fallback<'a>(
292294
let pivot = memchr::memchr(b'\n', &input).expect("input always ends with newline") + 1;
293295

294296
let mut de = serde_json::Deserializer::from_slice(&input[..pivot]);
295-
match doc::HeapNode::from_serde(&mut de, &alloc) {
297+
match doc::HeapNode::from_serde(&mut de, &alloc).and_then(|node| {
298+
() = de.end()?;
299+
Ok(node)
300+
}) {
296301
Ok(node) => {
297302
input = &input[pivot..];
298303
consumed += pivot;
@@ -327,7 +332,10 @@ fn transcode_fallback(
327332
let pivot = memchr::memchr(b'\n', &input).expect("input always ends with newline") + 1;
328333

329334
let mut de = serde_json::Deserializer::from_slice(&input[..pivot]);
330-
match doc::HeapNode::from_serde(&mut de, &alloc) {
335+
match doc::HeapNode::from_serde(&mut de, &alloc).and_then(|node| {
336+
() = de.end()?;
337+
Ok(node)
338+
}) {
331339
Ok(node) => {
332340
input = &input[pivot..];
333341
consumed += pivot;

crates/simd-doc/src/tests/fixtures.rs

Lines changed: 101 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -108,7 +108,8 @@ fn test_incomplete_row_parsing() {
108108
let mut inputs = Vec::new();
109109
inputs.push(r###"st": {"this": "is", "a": "doc"}}"###.as_bytes());
110110
inputs.push(r###""test": {"this": "is", "a": "doc"}}"###.as_bytes());
111-
inputs.push(r###"{"test": 5}, {"test": 6"}]"###.as_bytes());
111+
inputs.push(r###"{"test": 5}, {"test": 6}]"###.as_bytes());
112+
inputs.push(r###"{"test": 7} {"test": 8}"###.as_bytes());
112113
inputs.push(r###"{"real": "object"}"###.as_bytes());
113114

114115
let real = r###"{"this": "is a real doc"}"###;
@@ -166,76 +167,91 @@ fn test_incomplete_row_parsing() {
166167
}
167168
}
168169

169-
debug_assert_eq!(parse_snap, transcode_snap);
170+
assert_eq!(parse_snap, transcode_snap);
170171

171172
snaps.push((String::from_utf8(real_input).unwrap(), parse_snap));
172173
}
173174

174175
insta::assert_debug_snapshot!(snaps, @r###"
175-
[
176-
(
177-
"st\": {\"this\": \"is\", \"a\": \"doc\"}}\n{\"this\": \"is a real doc\"}\n",
178-
[
179-
(
180-
-1,
181-
String("expected value at line 1 column 1 @ 0..33"),
182-
),
183-
(
184-
59,
185-
Object {
186-
"this": String("is a real doc"),
187-
},
188-
),
189-
],
190-
),
191-
(
192-
"\"test\": {\"this\": \"is\", \"a\": \"doc\"}}\n{\"this\": \"is a real doc\"}\n",
193-
[
194-
(
195-
-1,
196-
String("incomplete row @ 0..45"),
197-
),
198-
(
199-
62,
200-
Object {
201-
"this": String("is a real doc"),
202-
},
203-
),
204-
],
205-
),
206-
(
207-
"{\"test\": 5}, {\"test\": 6\"}]\n{\"this\": \"is a real doc\"}\n",
208-
[
209-
(
210-
-1,
211-
String("incomplete row @ 0..33"),
212-
),
213-
(
214-
53,
215-
Object {
216-
"this": String("is a real doc"),
217-
},
218-
),
219-
],
220-
),
221-
(
222-
"{\"real\": \"object\"}\n{\"this\": \"is a real doc\"}\n",
223-
[
224-
(
225-
19,
226-
Object {
227-
"real": String("object"),
228-
},
229-
),
230-
(
231-
45,
232-
Object {
233-
"this": String("is a real doc"),
234-
},
235-
),
236-
],
237-
),
238-
]
176+
[
177+
(
178+
"st\": {\"this\": \"is\", \"a\": \"doc\"}}\n{\"this\": \"is a real doc\"}\n",
179+
[
180+
(
181+
-1,
182+
String("expected value at line 1 column 1 @ 0..33"),
183+
),
184+
(
185+
59,
186+
Object {
187+
"this": String("is a real doc"),
188+
},
189+
),
190+
],
191+
),
192+
(
193+
"\"test\": {\"this\": \"is\", \"a\": \"doc\"}}\n{\"this\": \"is a real doc\"}\n",
194+
[
195+
(
196+
-1,
197+
String("trailing characters at line 1 column 7 @ 0..36"),
198+
),
199+
(
200+
62,
201+
Object {
202+
"this": String("is a real doc"),
203+
},
204+
),
205+
],
206+
),
207+
(
208+
"{\"test\": 5}, {\"test\": 6}]\n{\"this\": \"is a real doc\"}\n",
209+
[
210+
(
211+
-1,
212+
String("trailing characters at line 1 column 12 @ 0..26"),
213+
),
214+
(
215+
52,
216+
Object {
217+
"this": String("is a real doc"),
218+
},
219+
),
220+
],
221+
),
222+
(
223+
"{\"test\": 7} {\"test\": 8}\n{\"this\": \"is a real doc\"}\n",
224+
[
225+
(
226+
-1,
227+
String("trailing characters at line 1 column 14 @ 0..25"),
228+
),
229+
(
230+
51,
231+
Object {
232+
"this": String("is a real doc"),
233+
},
234+
),
235+
],
236+
),
237+
(
238+
"{\"real\": \"object\"}\n{\"this\": \"is a real doc\"}\n",
239+
[
240+
(
241+
19,
242+
Object {
243+
"real": String("object"),
244+
},
245+
),
246+
(
247+
45,
248+
Object {
249+
"this": String("is a real doc"),
250+
},
251+
),
252+
],
253+
),
254+
]
239255
"###);
240256
}
241257

@@ -352,13 +368,26 @@ fn test_basic_parser_apis() {
352368
let doc = parser.parse_one(input.as_bytes(), &alloc).unwrap();
353369
snap.push((0, doc.to_debug_json_value()));
354370

355-
let input = input.repeat(3);
356-
insta::assert_debug_snapshot!(parser.parse_one(input.as_bytes(), &alloc).unwrap_err(), @r###"
357-
Custom {
358-
kind: InvalidData,
359-
error: "expected one document, but parsed 3",
371+
{
372+
let input = vec![input.as_str(), input.as_str(), input.as_str()].join("\n");
373+
let input = input.as_bytes();
374+
375+
// Verify error handling of multiple valid documents in the input.
376+
insta::assert_debug_snapshot!(parser.parse_one(input, &alloc).unwrap_err(), @r###"
377+
Custom {
378+
kind: InvalidData,
379+
error: "expected one document, but parsed 3",
380+
}
381+
"###);
382+
383+
// Verify error handling of a trailing partial document.
384+
insta::assert_debug_snapshot!(parser.parse_one(&input[..input.len()/2], &alloc).unwrap_err(), @r###"
385+
Custom {
386+
kind: InvalidData,
387+
error: Error("trailing characters", line: 11, column: 1),
388+
}
389+
"###);
360390
}
361-
"###);
362391

363392
insta::assert_json_snapshot!(snap, @r###"
364393
[

0 commit comments

Comments
 (0)