From 08c0eea76c36c6ddcf753c045fd5ff5ab78aba80 Mon Sep 17 00:00:00 2001 From: Daniel Alley Date: Sat, 2 Apr 2022 21:14:52 -0400 Subject: [PATCH] Properly normalize attribute values closes #371 --- benches/macrobenches.rs | 3 +- benches/microbenches.rs | 205 ++++++++++++++++++++++++--------------- src/errors.rs | 1 + src/escapei.rs | 20 ++-- src/events/attributes.rs | 151 +++++++++++++++++++++++++++- src/reader.rs | 5 +- 6 files changed, 286 insertions(+), 99 deletions(-) diff --git a/benches/macrobenches.rs b/benches/macrobenches.rs index 14f28cb9..54e56628 100644 --- a/benches/macrobenches.rs +++ b/benches/macrobenches.rs @@ -17,14 +17,13 @@ static SAMPLE_NS: &[u8] = include_bytes!("../tests/documents/sample_ns.xml"); static PLAYERS: &[u8] = include_bytes!("../tests/documents/players.xml"); // TODO: read the namespaces too -// TODO: use fully normalized attribute values fn parse_document(doc: &[u8]) -> XmlResult<()> { let mut r = Reader::from_reader(doc); loop { match r.read_event_unbuffered()? { Event::Start(e) | Event::Empty(e) => { for attr in e.attributes() { - criterion::black_box(attr?.unescaped_value()?); + criterion::black_box(attr?.normalized_value()?); } } Event::Text(e) => { diff --git a/benches/microbenches.rs b/benches/microbenches.rs index 9d701c05..443ebace 100644 --- a/benches/microbenches.rs +++ b/benches/microbenches.rs @@ -125,86 +125,6 @@ fn read_namespaced_event(c: &mut Criterion) { group.finish(); } -/// Benchmarks the `BytesText::unescaped()` method (includes time of `read_event` -/// benchmark) -fn bytes_text_unescaped(c: &mut Criterion) { - let mut group = c.benchmark_group("BytesText::unescaped"); - group.bench_function("trim_text = false", |b| { - b.iter(|| { - let mut buf = Vec::new(); - let mut r = Reader::from_reader(SAMPLE); - r.check_end_names(false).check_comments(false); - let mut count = criterion::black_box(0); - let mut nbtxt = criterion::black_box(0); - loop { - match r.read_event(&mut buf) { - Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1, - Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(), - Ok(Event::Eof) => break, - _ => (), - } - buf.clear(); - } - assert_eq!( - count, 1550, - "Overall tag count in ./tests/documents/sample_rss.xml" - ); - - // Windows has \r\n instead of \n - #[cfg(windows)] - assert_eq!( - nbtxt, 67661, - "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml" - ); - - #[cfg(not(windows))] - assert_eq!( - nbtxt, 66277, - "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml" - ); - }); - }); - - group.bench_function("trim_text = true", |b| { - b.iter(|| { - let mut buf = Vec::new(); - let mut r = Reader::from_reader(SAMPLE); - r.check_end_names(false) - .check_comments(false) - .trim_text(true); - let mut count = criterion::black_box(0); - let mut nbtxt = criterion::black_box(0); - loop { - match r.read_event(&mut buf) { - Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1, - Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(), - Ok(Event::Eof) => break, - _ => (), - } - buf.clear(); - } - assert_eq!( - count, 1550, - "Overall tag count in ./tests/documents/sample_rss.xml" - ); - - // Windows has \r\n instead of \n - #[cfg(windows)] - assert_eq!( - nbtxt, 50334, - "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml" - ); - - #[cfg(not(windows))] - assert_eq!( - nbtxt, 50261, - "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml" - ); - }); - }); - group.finish(); -} - /// Benchmarks, how fast individual event parsed fn one_event(c: &mut Criterion) { let mut group = c.benchmark_group("One event"); @@ -364,6 +284,130 @@ fn attributes(c: &mut Criterion) { assert_eq!(count, 150); }) }); + + group.finish(); +} + +/// Benchmarks normalizing attribute values +fn attribute_value_normalization(c: &mut Criterion) { + let mut group = c.benchmark_group("attribute_value_normalization"); + + group.bench_function("noop_short", |b| { + b.iter(|| { + criterion::black_box(unescape(b"foobar")).unwrap(); + }) + }); + + group.bench_function("noop_long", |b| { + b.iter(|| { + criterion::black_box(unescape(b"just a bit of text without any entities")).unwrap(); + }) + }); + + group.bench_function("replacement_chars", |b| { + b.iter(|| { + criterion::black_box(unescape(b"just a bit\n of text without\tany entities")).unwrap(); + }) + }); + + group.bench_function("char_reference", |b| { + b.iter(|| { + let text = b"prefix "some stuff","more stuff""; + criterion::black_box(unescape(text)).unwrap(); + let text = b"&<"; + criterion::black_box(unescape(text)).unwrap(); + }) + }); + + group.bench_function("entity_reference", |b| { + b.iter(|| { + let text = b"age > 72 && age < 21"; + criterion::black_box(unescape(text)).unwrap(); + let text = b""what's that?""; + criterion::black_box(unescape(text)).unwrap(); + }) + }); + + group.finish(); +} + +/// Benchmarks the `BytesText::unescaped()` method (includes time of `read_event` +/// benchmark) +fn bytes_text_unescaped(c: &mut Criterion) { + let mut group = c.benchmark_group("BytesText::unescaped"); + group.bench_function("trim_text = false", |b| { + b.iter(|| { + let mut buf = Vec::new(); + let mut r = Reader::from_reader(SAMPLE); + r.check_end_names(false).check_comments(false); + let mut count = criterion::black_box(0); + let mut nbtxt = criterion::black_box(0); + loop { + match r.read_event(&mut buf) { + Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1, + Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(), + Ok(Event::Eof) => break, + _ => (), + } + buf.clear(); + } + assert_eq!( + count, 1550, + "Overall tag count in ./tests/documents/sample_rss.xml" + ); + + // Windows has \r\n instead of \n + #[cfg(windows)] + assert_eq!( + nbtxt, 67661, + "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml" + ); + + #[cfg(not(windows))] + assert_eq!( + nbtxt, 66277, + "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml" + ); + }); + }); + + group.bench_function("trim_text = true", |b| { + b.iter(|| { + let mut buf = Vec::new(); + let mut r = Reader::from_reader(SAMPLE); + r.check_end_names(false) + .check_comments(false) + .trim_text(true); + let mut count = criterion::black_box(0); + let mut nbtxt = criterion::black_box(0); + loop { + match r.read_event(&mut buf) { + Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1, + Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(), + Ok(Event::Eof) => break, + _ => (), + } + buf.clear(); + } + assert_eq!( + count, 1550, + "Overall tag count in ./tests/documents/sample_rss.xml" + ); + + // Windows has \r\n instead of \n + #[cfg(windows)] + assert_eq!( + nbtxt, 50334, + "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml" + ); + + #[cfg(not(windows))] + assert_eq!( + nbtxt, 50261, + "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml" + ); + }); + }); group.finish(); } @@ -477,6 +521,7 @@ criterion_group!( read_namespaced_event, one_event, attributes, + attribute_value_normalization, escaping, unescaping, ); diff --git a/src/errors.rs b/src/errors.rs index c1af08af..5f41b68e 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -72,6 +72,7 @@ impl From for Error { } impl From for Error { + /// Creates a new `Error::InvalidAttr` from the given error #[inline] fn from(error: AttrError) -> Self { Error::InvalidAttr(error) diff --git a/src/escapei.rs b/src/escapei.rs index 64749c27..851d1070 100644 --- a/src/escapei.rs +++ b/src/escapei.rs @@ -9,7 +9,7 @@ use std::ops::Range; use pretty_assertions::assert_eq; /// Error for XML escape/unescqpe. -#[derive(Debug)] +#[derive(Debug, PartialEq)] pub enum EscapeError { /// Entity with Null character EntityWithNull(::std::ops::Range), @@ -134,7 +134,7 @@ pub fn unescape(raw: &[u8]) -> Result, EscapeError> { } /// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding -/// value, using a dictionnary of custom entities. +/// value, using a dictionary of custom entities. /// /// # Pre-condition /// @@ -174,7 +174,9 @@ pub fn do_unescape<'a>( if let Some(s) = named_entity(pat) { unescaped.extend_from_slice(s.as_bytes()); } else if pat.starts_with(b"#") { - push_utf8(unescaped, parse_number(&pat[1..], start..end)?); + let entity = &pat[1..]; // starts after the # + let codepoint = parse_number(entity, start..end)?; + push_utf8(unescaped, codepoint); } else if let Some(value) = custom_entities.and_then(|hm| hm.get(pat)) { unescaped.extend_from_slice(&value); } else { @@ -201,7 +203,7 @@ pub fn do_unescape<'a>( } #[cfg(not(feature = "escape-html"))] -const fn named_entity(name: &[u8]) -> Option<&str> { +pub(crate) const fn named_entity(name: &[u8]) -> Option<&str> { let s = match name { b"lt" => "<", b"gt" => ">", @@ -213,7 +215,7 @@ const fn named_entity(name: &[u8]) -> Option<&str> { Some(s) } #[cfg(feature = "escape-html")] -const fn named_entity(name: &[u8]) -> Option<&str> { +pub(crate) const fn named_entity(name: &[u8]) -> Option<&str> { // imported from https://dev.w3.org/html5/html-author/charref let s = match name { b"Tab" => "\u{09}", @@ -1675,12 +1677,12 @@ const fn named_entity(name: &[u8]) -> Option<&str> { Some(s) } -fn push_utf8(out: &mut Vec, code: char) { +pub(crate) fn push_utf8(out: &mut Vec, code: char) { let mut buf = [0u8; 4]; out.extend_from_slice(code.encode_utf8(&mut buf).as_bytes()); } -fn parse_number(bytes: &[u8], range: Range) -> Result { +pub(crate) fn parse_number(bytes: &[u8], range: Range) -> Result { let code = if bytes.starts_with(b"x") { parse_hexadecimal(&bytes[1..]) } else { @@ -1695,7 +1697,7 @@ fn parse_number(bytes: &[u8], range: Range) -> Result } } -fn parse_hexadecimal(bytes: &[u8]) -> Result { +pub(crate) fn parse_hexadecimal(bytes: &[u8]) -> Result { // maximum code is 0x10FFFF => 6 characters if bytes.len() > 6 { return Err(EscapeError::TooLongHexadecimal); @@ -1713,7 +1715,7 @@ fn parse_hexadecimal(bytes: &[u8]) -> Result { Ok(code) } -fn parse_decimal(bytes: &[u8]) -> Result { +pub(crate) fn parse_decimal(bytes: &[u8]) -> Result { // maximum code is 0x10FFFF = 1114111 => 7 characters if bytes.len() > 7 { return Err(EscapeError::TooLongDecimal); diff --git a/src/events/attributes.rs b/src/events/attributes.rs index d6331bc7..226d6d9f 100644 --- a/src/events/attributes.rs +++ b/src/events/attributes.rs @@ -4,6 +4,7 @@ use crate::errors::{Error, Result as XmlResult}; use crate::escape::{do_unescape, escape}; +use crate::escapei::{self, EscapeError}; use crate::name::QName; use crate::reader::{is_whitespace, Reader}; use crate::utils::{write_byte_string, write_cow_string, Bytes}; @@ -11,6 +12,8 @@ use std::fmt::{self, Debug, Display, Formatter}; use std::iter::FusedIterator; use std::{borrow::Cow, collections::HashMap, io::BufRead, ops::Range}; +type CustomEntityMap = HashMap, Vec>; + /// A struct representing a key/value XML attribute. /// /// Field `value` stores raw bytes, possibly containing escape-sequences. Most users will likely @@ -32,6 +35,95 @@ pub struct Attribute<'a> { } impl<'a> Attribute<'a> { + /// Returns the attribute value normalized as per the XML specification. + /// + /// https://www.w3.org/TR/xml/#AVNormalize + /// + /// Do not use this method with HTML attributes. + /// + /// This is normally the value you are interested in. Escape sequences such as `>` are + /// replaced with their unescaped equivalents such as `>`. + /// + /// This will allocate if the value contains any escape sequences. + /// + /// See also [`normalized_value_with_custom_entities()`](#method.normalized_value_with_custom_entities) + pub fn normalized_value(&'a self) -> Result, EscapeError> { + self.make_normalized_value(None) + } + + /// Returns the attribute value normalized as per the XML specification, using custom entities. + /// + /// https://www.w3.org/TR/xml/#AVNormalize + /// + /// Do not use this method with HTML attributes. + /// + /// This is normally the value you are interested in. Escape sequences such as `>` are + /// replaced with their unescaped equivalents such as `>`. + /// Additional entities can be provided in `custom_entities`. + /// + /// This will allocate if the value contains any escape sequences. + /// + /// See also [`normalized_value()`](#method.normalized_value) + /// + /// # Pre-condition + /// + /// The keys and values of `custom_entities`, if any, must be valid UTF-8. + pub fn normalized_value_with_custom_entities( + &'a self, + custom_entities: &CustomEntityMap, + ) -> Result, EscapeError> { + self.make_normalized_value(Some(custom_entities)) + } + + /// Normalize the attribute value according to xml specification section 3.3.3 + /// + /// https://www.w3.org/TR/xml/#AVNormalize + fn make_normalized_value( + &'a self, + custom_entities: Option<&CustomEntityMap>, + ) -> Result, EscapeError> { + let mut normalized: Vec = Vec::with_capacity(self.value.len()); + + // Perform a single pass over the trimmed attribute value. If we encounter a character / entity reference + // or whitespace-like characters that need to be substituted, copy everything processed thus far to a new + // buffer and continue using this buffer. + let attr = self.value.as_ref(); + let mut attr_iter = attr.iter().enumerate(); + + while let Some((idx, ch)) = attr_iter.next() { + match ch { + b' ' | b'\n' | b'\r' | b'\t' => normalized.push(b' '), + b'&' => { + let end = idx + + 1 + + attr_iter + .position(|(_, c)| *c == b';') + .ok_or_else(|| EscapeError::UnterminatedEntity(idx..attr.len()))?; + let entity = &attr[idx + 1..end]; // starts after the & + + if let Some(s) = escapei::named_entity(entity) { + normalized.extend_from_slice(s.as_bytes()); + } else if entity.starts_with(b"#") { + let entity = &entity[1..]; // starts after the # + let codepoint = escapei::parse_number(entity, idx..end)?; + escapei::push_utf8(&mut normalized, codepoint); + } else if let Some(value) = custom_entities.and_then(|hm| hm.get(entity)) { + // TODO: recursively apply entity substitution + normalized.extend_from_slice(&value); + } else { + return Err(EscapeError::UnrecognizedSymbol( + idx + 1..end, + String::from_utf8(entity.to_vec()), + )); + } + } + _ => normalized.push(*ch), + } + } + + Ok(Cow::Owned(normalized)) + } + /// Returns the unescaped value. /// /// This is normally the value you are interested in. Escape sequences such as `>` are @@ -59,14 +151,14 @@ impl<'a> Attribute<'a> { /// The keys and values of `custom_entities`, if any, must be valid UTF-8. pub fn unescaped_value_with_custom_entities( &self, - custom_entities: &HashMap, Vec>, + custom_entities: &CustomEntityMap, ) -> XmlResult> { self.make_unescaped_value(Some(custom_entities)) } fn make_unescaped_value( &self, - custom_entities: Option<&HashMap, Vec>>, + custom_entities: Option<&CustomEntityMap>, ) -> XmlResult> { do_unescape(&*self.value, custom_entities).map_err(Error::EscapeError) } @@ -102,7 +194,7 @@ impl<'a> Attribute<'a> { pub fn unescape_and_decode_value_with_custom_entities( &self, reader: &Reader, - custom_entities: &HashMap, Vec>, + custom_entities: &CustomEntityMap, ) -> XmlResult { self.do_unescape_and_decode_value(reader, Some(custom_entities)) } @@ -111,7 +203,7 @@ impl<'a> Attribute<'a> { fn do_unescape_and_decode_value( &self, reader: &Reader, - custom_entities: Option<&HashMap, Vec>>, + custom_entities: Option<&CustomEntityMap>, ) -> XmlResult { let decoded = reader.decoder().decode(&*self.value)?; @@ -798,6 +890,57 @@ mod xml { use super::*; use pretty_assertions::assert_eq; + #[test] + fn attribute_value_normalization() { + // empty value + let raw_value = "".as_bytes(); + let output = "".as_bytes().to_vec(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!(attr.normalized_value(), Ok(Cow::Owned::<[u8]>(output))); + + // return, tab, and newline characters (0xD, 0x9, 0xA) must be substituted with a space character + let raw_value = "\r\nfoo\rbar\tbaz\n\ndelta\n".as_bytes(); + let output = " foo bar baz delta ".as_bytes().to_vec(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!(attr.normalized_value(), Ok(Cow::Owned::<[u8]>(output))); + + // entities must be terminated + let raw_value = "abc"def".as_bytes(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!( + attr.normalized_value(), + Err(EscapeError::UnterminatedEntity(3..11)) + ); + + // unknown entities raise error + let raw_value = "abc&unkn;def".as_bytes(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!( + attr.normalized_value(), + Err(EscapeError::UnrecognizedSymbol(4..8, Ok("unkn".to_owned()))) // TODO: is this divergence between range behavior of UnterminatedEntity and UnrecognizedSymbol appropriate. shared with unescape code + ); + + // // custom entity replacement works, entity replacement text processed recursively + // let raw_value = "&d;&d;A&a; &a;B&da;".as_bytes(); + // let output = b" A B ".to_vec(); + // let attr = Attribute::from(("foo".as_bytes(), raw_value)); + // let mut custom_entities = HashMap::new(); + // custom_entities.insert(b"d".to_vec(), b" ".to_vec()); + // custom_entities.insert(b"a".to_vec(), b" ".to_vec()); + // custom_entities.insert(b"da".to_vec(), b" ".to_vec()); + // dbg!(std::str::from_utf8(attr.normalized_value_with_custom_entities(&custom_entities).unwrap().as_ref()).unwrap()); + // assert_eq!( + // attr.normalized_value_with_custom_entities(&custom_entities), + // Ok(Cow::Owned::<[u8]>(output)) + // ); + + // character literal references are substituted without being replaced by spaces + let raw_value = " A B ".as_bytes(); + let output = "\r\rA\n\nB\r\n".as_bytes().to_vec(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!(attr.normalized_value(), Ok(Cow::Owned::<[u8]>(output))); + } + /// Checked attribute is the single attribute mod single { use super::*; diff --git a/src/reader.rs b/src/reader.rs index a1909255..18c00200 100644 --- a/src/reader.rs +++ b/src/reader.rs @@ -1489,10 +1489,7 @@ impl ReadElementState { /// A function to check whether the byte is a whitespace (blank, new line, carriage return or tab) #[inline] pub(crate) fn is_whitespace(b: u8) -> bool { - match b { - b' ' | b'\r' | b'\n' | b'\t' => true, - _ => false, - } + matches!(b, b' ' | b'\r' | b'\n' | b'\t') } ////////////////////////////////////////////////////////////////////////////////////////////////////