Skip to content

Commit

Permalink
Properly normalize attribute values
Browse files Browse the repository at this point in the history
closes #371
  • Loading branch information
dralley committed Jul 4, 2022
1 parent 0febc2b commit 08c0eea
Show file tree
Hide file tree
Showing 6 changed files with 286 additions and 99 deletions.
3 changes: 1 addition & 2 deletions benches/macrobenches.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,14 +17,13 @@ static SAMPLE_NS: &[u8] = include_bytes!("../tests/documents/sample_ns.xml");
static PLAYERS: &[u8] = include_bytes!("../tests/documents/players.xml");

// TODO: read the namespaces too
// TODO: use fully normalized attribute values
fn parse_document(doc: &[u8]) -> XmlResult<()> {
let mut r = Reader::from_reader(doc);
loop {
match r.read_event_unbuffered()? {
Event::Start(e) | Event::Empty(e) => {
for attr in e.attributes() {
criterion::black_box(attr?.unescaped_value()?);
criterion::black_box(attr?.normalized_value()?);
}
}
Event::Text(e) => {
Expand Down
205 changes: 125 additions & 80 deletions benches/microbenches.rs
Original file line number Diff line number Diff line change
Expand Up @@ -125,86 +125,6 @@ fn read_namespaced_event(c: &mut Criterion) {
group.finish();
}

/// Benchmarks the `BytesText::unescaped()` method (includes time of `read_event`
/// benchmark)
fn bytes_text_unescaped(c: &mut Criterion) {
let mut group = c.benchmark_group("BytesText::unescaped");
group.bench_function("trim_text = false", |b| {
b.iter(|| {
let mut buf = Vec::new();
let mut r = Reader::from_reader(SAMPLE);
r.check_end_names(false).check_comments(false);
let mut count = criterion::black_box(0);
let mut nbtxt = criterion::black_box(0);
loop {
match r.read_event(&mut buf) {
Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
Ok(Event::Eof) => break,
_ => (),
}
buf.clear();
}
assert_eq!(
count, 1550,
"Overall tag count in ./tests/documents/sample_rss.xml"
);

// Windows has \r\n instead of \n
#[cfg(windows)]
assert_eq!(
nbtxt, 67661,
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
);

#[cfg(not(windows))]
assert_eq!(
nbtxt, 66277,
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
);
});
});

group.bench_function("trim_text = true", |b| {
b.iter(|| {
let mut buf = Vec::new();
let mut r = Reader::from_reader(SAMPLE);
r.check_end_names(false)
.check_comments(false)
.trim_text(true);
let mut count = criterion::black_box(0);
let mut nbtxt = criterion::black_box(0);
loop {
match r.read_event(&mut buf) {
Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
Ok(Event::Eof) => break,
_ => (),
}
buf.clear();
}
assert_eq!(
count, 1550,
"Overall tag count in ./tests/documents/sample_rss.xml"
);

// Windows has \r\n instead of \n
#[cfg(windows)]
assert_eq!(
nbtxt, 50334,
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
);

#[cfg(not(windows))]
assert_eq!(
nbtxt, 50261,
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
);
});
});
group.finish();
}

/// Benchmarks, how fast individual event parsed
fn one_event(c: &mut Criterion) {
let mut group = c.benchmark_group("One event");
Expand Down Expand Up @@ -364,6 +284,130 @@ fn attributes(c: &mut Criterion) {
assert_eq!(count, 150);
})
});

group.finish();
}

/// Benchmarks normalizing attribute values
fn attribute_value_normalization(c: &mut Criterion) {
let mut group = c.benchmark_group("attribute_value_normalization");

group.bench_function("noop_short", |b| {
b.iter(|| {
criterion::black_box(unescape(b"foobar")).unwrap();
})
});

group.bench_function("noop_long", |b| {
b.iter(|| {
criterion::black_box(unescape(b"just a bit of text without any entities")).unwrap();
})
});

group.bench_function("replacement_chars", |b| {
b.iter(|| {
criterion::black_box(unescape(b"just a bit\n of text without\tany entities")).unwrap();
})
});

group.bench_function("char_reference", |b| {
b.iter(|| {
let text = b"prefix &#34;some stuff&#34;,&#x22;more stuff&#x22;";
criterion::black_box(unescape(text)).unwrap();
let text = b"&#38;&#60;";
criterion::black_box(unescape(text)).unwrap();
})
});

group.bench_function("entity_reference", |b| {
b.iter(|| {
let text = b"age &gt; 72 &amp;&amp; age &lt; 21";
criterion::black_box(unescape(text)).unwrap();
let text = b"&quot;what&apos;s that?&quot;";
criterion::black_box(unescape(text)).unwrap();
})
});

group.finish();
}

/// Benchmarks the `BytesText::unescaped()` method (includes time of `read_event`
/// benchmark)
fn bytes_text_unescaped(c: &mut Criterion) {
let mut group = c.benchmark_group("BytesText::unescaped");
group.bench_function("trim_text = false", |b| {
b.iter(|| {
let mut buf = Vec::new();
let mut r = Reader::from_reader(SAMPLE);
r.check_end_names(false).check_comments(false);
let mut count = criterion::black_box(0);
let mut nbtxt = criterion::black_box(0);
loop {
match r.read_event(&mut buf) {
Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
Ok(Event::Eof) => break,
_ => (),
}
buf.clear();
}
assert_eq!(
count, 1550,
"Overall tag count in ./tests/documents/sample_rss.xml"
);

// Windows has \r\n instead of \n
#[cfg(windows)]
assert_eq!(
nbtxt, 67661,
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
);

#[cfg(not(windows))]
assert_eq!(
nbtxt, 66277,
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
);
});
});

group.bench_function("trim_text = true", |b| {
b.iter(|| {
let mut buf = Vec::new();
let mut r = Reader::from_reader(SAMPLE);
r.check_end_names(false)
.check_comments(false)
.trim_text(true);
let mut count = criterion::black_box(0);
let mut nbtxt = criterion::black_box(0);
loop {
match r.read_event(&mut buf) {
Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
Ok(Event::Eof) => break,
_ => (),
}
buf.clear();
}
assert_eq!(
count, 1550,
"Overall tag count in ./tests/documents/sample_rss.xml"
);

// Windows has \r\n instead of \n
#[cfg(windows)]
assert_eq!(
nbtxt, 50334,
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
);

#[cfg(not(windows))]
assert_eq!(
nbtxt, 50261,
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
);
});
});
group.finish();
}

Expand Down Expand Up @@ -477,6 +521,7 @@ criterion_group!(
read_namespaced_event,
one_event,
attributes,
attribute_value_normalization,
escaping,
unescaping,
);
1 change: 1 addition & 0 deletions src/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,7 @@ impl From<EscapeError> for Error {
}

impl From<AttrError> for Error {
/// Creates a new `Error::InvalidAttr` from the given error
#[inline]
fn from(error: AttrError) -> Self {
Error::InvalidAttr(error)
Expand Down
20 changes: 11 additions & 9 deletions src/escapei.rs
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ use std::ops::Range;
use pretty_assertions::assert_eq;

/// Error for XML escape/unescqpe.
#[derive(Debug)]
#[derive(Debug, PartialEq)]
pub enum EscapeError {
/// Entity with Null character
EntityWithNull(::std::ops::Range<usize>),
Expand Down Expand Up @@ -134,7 +134,7 @@ pub fn unescape(raw: &[u8]) -> Result<Cow<[u8]>, EscapeError> {
}

/// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
/// value, using a dictionnary of custom entities.
/// value, using a dictionary of custom entities.
///
/// # Pre-condition
///
Expand Down Expand Up @@ -174,7 +174,9 @@ pub fn do_unescape<'a>(
if let Some(s) = named_entity(pat) {
unescaped.extend_from_slice(s.as_bytes());
} else if pat.starts_with(b"#") {
push_utf8(unescaped, parse_number(&pat[1..], start..end)?);
let entity = &pat[1..]; // starts after the #
let codepoint = parse_number(entity, start..end)?;
push_utf8(unescaped, codepoint);
} else if let Some(value) = custom_entities.and_then(|hm| hm.get(pat)) {
unescaped.extend_from_slice(&value);
} else {
Expand All @@ -201,7 +203,7 @@ pub fn do_unescape<'a>(
}

#[cfg(not(feature = "escape-html"))]
const fn named_entity(name: &[u8]) -> Option<&str> {
pub(crate) const fn named_entity(name: &[u8]) -> Option<&str> {
let s = match name {
b"lt" => "<",
b"gt" => ">",
Expand All @@ -213,7 +215,7 @@ const fn named_entity(name: &[u8]) -> Option<&str> {
Some(s)
}
#[cfg(feature = "escape-html")]
const fn named_entity(name: &[u8]) -> Option<&str> {
pub(crate) const fn named_entity(name: &[u8]) -> Option<&str> {
// imported from https://dev.w3.org/html5/html-author/charref
let s = match name {
b"Tab" => "\u{09}",
Expand Down Expand Up @@ -1675,12 +1677,12 @@ const fn named_entity(name: &[u8]) -> Option<&str> {
Some(s)
}

fn push_utf8(out: &mut Vec<u8>, code: char) {
pub(crate) fn push_utf8(out: &mut Vec<u8>, code: char) {
let mut buf = [0u8; 4];
out.extend_from_slice(code.encode_utf8(&mut buf).as_bytes());
}

fn parse_number(bytes: &[u8], range: Range<usize>) -> Result<char, EscapeError> {
pub(crate) fn parse_number(bytes: &[u8], range: Range<usize>) -> Result<char, EscapeError> {
let code = if bytes.starts_with(b"x") {
parse_hexadecimal(&bytes[1..])
} else {
Expand All @@ -1695,7 +1697,7 @@ fn parse_number(bytes: &[u8], range: Range<usize>) -> Result<char, EscapeError>
}
}

fn parse_hexadecimal(bytes: &[u8]) -> Result<u32, EscapeError> {
pub(crate) fn parse_hexadecimal(bytes: &[u8]) -> Result<u32, EscapeError> {
// maximum code is 0x10FFFF => 6 characters
if bytes.len() > 6 {
return Err(EscapeError::TooLongHexadecimal);
Expand All @@ -1713,7 +1715,7 @@ fn parse_hexadecimal(bytes: &[u8]) -> Result<u32, EscapeError> {
Ok(code)
}

fn parse_decimal(bytes: &[u8]) -> Result<u32, EscapeError> {
pub(crate) fn parse_decimal(bytes: &[u8]) -> Result<u32, EscapeError> {
// maximum code is 0x10FFFF = 1114111 => 7 characters
if bytes.len() > 7 {
return Err(EscapeError::TooLongDecimal);
Expand Down
Loading

0 comments on commit 08c0eea

Please sign in to comment.