From 34c2f720cb57873ecfa1b3567e173e104733f7ec Mon Sep 17 00:00:00 2001 From: Daniel Alley Date: Sat, 2 Apr 2022 21:14:52 -0400 Subject: [PATCH] More correctly normalize attribute values --- Changelog.md | 11 +++ benches/macrobenches.rs | 3 +- benches/microbenches.rs | 45 ++++++++++ src/errors.rs | 1 + src/escapei.rs | 71 +++++++++++++-- src/events/attributes.rs | 186 ++++++++++++++++++++++++++++++++++++++- src/lib.rs | 3 +- 7 files changed, 308 insertions(+), 12 deletions(-) diff --git a/Changelog.md b/Changelog.md index d22f4962..a0622396 100644 --- a/Changelog.md +++ b/Changelog.md @@ -14,6 +14,10 @@ - [#541]: Deserialize specially named `$text` enum variant in [externally tagged] enums from textual content +- [#379]: Improved compliance with the XML attribute value normalization process by + adding `Attribute::normalized_value()` and `Attribute::normalized_value_with()`, + which ought to be used in place of `Attribute::unescape_value()` and + `Attribute::unescape_value_with()` ### Bug Fixes @@ -22,10 +26,15 @@ ### Misc Changes +### New Tests + +- [#379]: Added tests for attribute value normalization + [externally tagged]: https://serde.rs/enum-representations.html#externally-tagged [#490]: https://github.com/tafia/quick-xml/pull/490 [#537]: https://github.com/tafia/quick-xml/issues/537 [#541]: https://github.com/tafia/quick-xml/pull/541 +[#379]: https://github.com/tafia/quick-xml/pull/379 ## 0.27.1 -- 2022-12-28 @@ -120,6 +129,8 @@ - [#489]: Reduced the size of the package uploaded into the crates.io by excluding tests, examples, and benchmarks. +### New Tests + [#481]: https://github.com/tafia/quick-xml/pull/481 [#489]: https://github.com/tafia/quick-xml/pull/489 diff --git a/benches/macrobenches.rs b/benches/macrobenches.rs index a8cbbd53..67e51f1f 100644 --- a/benches/macrobenches.rs +++ b/benches/macrobenches.rs @@ -43,14 +43,13 @@ static INPUTS: &[(&str, &str)] = &[ ("players.xml", PLAYERS), ]; -// TODO: use fully normalized attribute values fn parse_document_from_str(doc: &str) -> XmlResult<()> { let mut r = Reader::from_str(doc); loop { match criterion::black_box(r.read_event()?) { Event::Start(e) | Event::Empty(e) => { for attr in e.attributes() { - criterion::black_box(attr?.decode_and_unescape_value(&r)?); + criterion::black_box(attr?.decode_and_normalize_value(&r)?); } } Event::Text(e) => { diff --git a/benches/microbenches.rs b/benches/microbenches.rs index aa5c8b70..341ad42d 100644 --- a/benches/microbenches.rs +++ b/benches/microbenches.rs @@ -242,6 +242,50 @@ fn attributes(c: &mut Criterion) { assert_eq!(count, 150); }) }); + + group.finish(); +} + +/// Benchmarks normalizing attribute values +fn attribute_value_normalization(c: &mut Criterion) { + let mut group = c.benchmark_group("attribute_value_normalization"); + + group.bench_function("noop_short", |b| { + b.iter(|| { + criterion::black_box(unescape("foobar")).unwrap(); + }) + }); + + group.bench_function("noop_long", |b| { + b.iter(|| { + criterion::black_box(unescape("just a bit of text without any entities")).unwrap(); + }) + }); + + group.bench_function("replacement_chars", |b| { + b.iter(|| { + criterion::black_box(unescape("just a bit\n of text without\tany entities")).unwrap(); + }) + }); + + group.bench_function("char_reference", |b| { + b.iter(|| { + let text = "prefix "some stuff","more stuff""; + criterion::black_box(unescape(text)).unwrap(); + let text = "&<"; + criterion::black_box(unescape(text)).unwrap(); + }) + }); + + group.bench_function("entity_reference", |b| { + b.iter(|| { + let text = "age > 72 && age < 21"; + criterion::black_box(unescape(text)).unwrap(); + let text = ""what's that?""; + criterion::black_box(unescape(text)).unwrap(); + }) + }); + group.finish(); } @@ -354,6 +398,7 @@ criterion_group!( read_resolved_event_into, one_event, attributes, + attribute_value_normalization, escaping, unescaping, ); diff --git a/src/errors.rs b/src/errors.rs index 0afd5a02..f0be93ef 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -77,6 +77,7 @@ impl From for Error { } impl From for Error { + /// Creates a new `Error::InvalidAttr` from the given error #[inline] fn from(error: AttrError) -> Self { Error::InvalidAttr(error) diff --git a/src/escapei.rs b/src/escapei.rs index f537233a..b711b0f0 100644 --- a/src/escapei.rs +++ b/src/escapei.rs @@ -8,7 +8,7 @@ use std::ops::Range; use pretty_assertions::assert_eq; /// Error for XML escape / unescape. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, PartialEq)] pub enum EscapeError { /// Entity with Null character EntityWithNull(Range), @@ -212,8 +212,66 @@ where } } +/// Returns the attribute value normalized as per the XML specification, using a custom +/// entity resolver. +/// +/// https://www.w3.org/TR/xml/#AVNormalize +/// +/// Do not use this method with HTML attributes. +/// +/// Escape sequences such as `>` are replaced with their unescaped equivalents such as `>` +/// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. A function +/// for resolving entities can be provided as `resolve_entity`. Builtin entities will still +/// take precedence. +/// +/// This will allocate unless the raw attribute value does not require normalization. +pub(crate) fn normalize_attribute_value_with<'a, 'entity>( + value: &'a str, + resolve_entity: impl Fn(&str) -> Option<&'entity str>, +) -> Result, EscapeError> { + // TODO: avoid allocation when not needed + let mut normalized = String::with_capacity(value.len()); + + let attr = value.as_bytes(); + let mut attr_iter = attr.iter().enumerate(); + + while let Some((idx, ch)) = attr_iter.next() { + match ch { + b' ' | b'\n' | b'\r' | b'\t' => normalized.push(' '), + b'&' => { + let end = idx + + 1 + + attr_iter + .position(|(_, c)| *c == b';') + .ok_or_else(|| EscapeError::UnterminatedEntity(idx..attr.len()))?; + let entity = &attr[idx + 1..end]; // starts after the & + let entity_str = std::str::from_utf8(entity).expect("failed UTF-8 check"); + + if entity.starts_with(b"#") { + let entity = &entity_str[1..]; // starts after the # + let codepoint = parse_number(entity, idx..end)?; + normalized.push_str(codepoint.encode_utf8(&mut [0u8; 4])); + } else if let Some(s) = named_entity(entity_str) { + normalized.push_str(s); + } else if let Some(value) = resolve_entity(entity_str) { + // TODO: recursively apply entity substitution + normalized.push_str(&value); + } else { + return Err(EscapeError::UnrecognizedSymbol( + idx + 1..end, + String::from_utf8(entity.to_vec()).expect("failed UTF-8 check"), + )); + } + } + _ => normalized.push(*ch as char), + } + } + + Ok(Cow::Owned(normalized)) +} + #[cfg(not(feature = "escape-html"))] -fn named_entity(name: &str) -> Option<&str> { +pub(crate) fn named_entity(name: &str) -> Option<&str> { // match over strings are not allowed in const functions let s = match name.as_bytes() { b"lt" => "<", @@ -226,9 +284,8 @@ fn named_entity(name: &str) -> Option<&str> { Some(s) } #[cfg(feature = "escape-html")] -fn named_entity(name: &str) -> Option<&str> { +pub(crate) fn named_entity(name: &str) -> Option<&str> { // imported from https://dev.w3.org/html5/html-author/charref - // match over strings are not allowed in const functions //TODO: automate up-to-dating using https://html.spec.whatwg.org/entities.json let s = match name.as_bytes() { b"Tab" => "\u{09}", @@ -1690,7 +1747,7 @@ fn named_entity(name: &str) -> Option<&str> { Some(s) } -fn parse_number(bytes: &str, range: Range) -> Result { +pub fn parse_number(bytes: &str, range: Range) -> Result { let code = if bytes.starts_with('x') { parse_hexadecimal(&bytes[1..]) } else { @@ -1705,7 +1762,7 @@ fn parse_number(bytes: &str, range: Range) -> Result { } } -fn parse_hexadecimal(bytes: &str) -> Result { +pub fn parse_hexadecimal(bytes: &str) -> Result { // maximum code is 0x10FFFF => 6 characters if bytes.len() > 6 { return Err(EscapeError::TooLongHexadecimal); @@ -1723,7 +1780,7 @@ fn parse_hexadecimal(bytes: &str) -> Result { Ok(code) } -fn parse_decimal(bytes: &str) -> Result { +pub fn parse_decimal(bytes: &str) -> Result { // maximum code is 0x10FFFF = 1114111 => 7 characters if bytes.len() > 7 { return Err(EscapeError::TooLongDecimal); diff --git a/src/events/attributes.rs b/src/events/attributes.rs index e2cf8c62..6e12c133 100644 --- a/src/events/attributes.rs +++ b/src/events/attributes.rs @@ -4,6 +4,9 @@ use crate::errors::Result as XmlResult; use crate::escape::{escape, unescape_with}; +use crate::escapei; +#[cfg(any(doc, not(feature = "encoding")))] +use crate::escapei::EscapeError; use crate::name::QName; use crate::reader::{is_whitespace, Reader}; use crate::utils::{write_byte_string, write_cow_string, Bytes}; @@ -30,7 +33,100 @@ pub struct Attribute<'a> { } impl<'a> Attribute<'a> { - /// Decodes using UTF-8 then unescapes the value. + /// Returns the attribute value normalized as per the XML specification. + /// + /// https://www.w3.org/TR/xml/#AVNormalize + /// + /// Do not use this method with HTML attributes. + /// + /// Escape sequences such as `>` are replaced with their unescaped equivalents such as `>` + /// and the characters \t, \r, \n are replaced with whitespace characters. + /// + /// This will allocate unless the raw attribute value does not require normalization. + /// + /// See also [`normalized_value_with()`](#method.normalized_value_with) + #[cfg(any(doc, not(feature = "encoding")))] + pub fn normalized_value(&'a self) -> Result, EscapeError> { + self.normalized_value_with(|_| None) + } + + /// Returns the attribute value normalized as per the XML specification, using a custom + /// entity resolver. + /// + /// https://www.w3.org/TR/xml/#AVNormalize + /// + /// Do not use this method with HTML attributes. + /// + /// Escape sequences such as `>` are replaced with their unescaped equivalents such as `>` + /// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. A function + /// for resolving entities can be provided as `resolve_entity`. Builtin entities will still + /// take precedence. + /// + /// This will allocate unless the raw attribute value does not require normalization. + /// + /// See also [`normalized_value()`](#method.normalized_value) + #[cfg(any(doc, not(feature = "encoding")))] + pub fn normalized_value_with<'entity>( + &'a self, + resolve_entity: impl Fn(&str) -> Option<&'entity str>, + ) -> Result, EscapeError> { + escapei::normalize_attribute_value_with( + std::str::from_utf8(self.value.as_ref()).expect("unable to decode as utf-8"), + resolve_entity, + ) + } + + /// Decodes using a provided reader and returns the attribute value normalized + /// as per the XML specification. + /// + /// https://www.w3.org/TR/xml/#AVNormalize + /// + /// Do not use this method with HTML attributes. + /// + /// Escape sequences such as `>` are replaced with their unescaped equivalents such as `>` + /// and the characters \t, \r, \n are replaced with whitespace characters. + /// + /// This will allocate unless the raw attribute value does not require normalization. + /// + /// See also [`decode_and_normalize_value_with()`](#method.decode_and_normalize_value_with) + pub fn decode_and_normalize_value(&self, reader: &Reader) -> XmlResult> { + self.decode_and_normalize_value_with(reader, |_| None) + } + + /// Decodes using a provided reader and returns the attribute value normalized + /// as per the XML specification, using a custom entity resolver. + /// + /// https://www.w3.org/TR/xml/#AVNormalize + /// + /// Do not use this method with HTML attributes. + /// + /// Escape sequences such as `>` are replaced with their unescaped equivalents such as `>` + /// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. A function + /// for resolving entities can be provided as `resolve_entity`. Builtin entities will still + /// take precedence. + /// + /// This will allocate unless the raw attribute value does not require normalization. + /// + /// See also [`decode_and_normalize_value()`](#method.decode_and_normalize_value) + pub fn decode_and_normalize_value_with<'entity, B>( + &self, + reader: &Reader, + resolve_entity: impl Fn(&str) -> Option<&'entity str>, + ) -> XmlResult> { + let decoded = match &self.value { + Cow::Borrowed(bytes) => reader.decoder().decode(bytes)?, + // Convert to owned, because otherwise Cow will be bound with wrong lifetime + Cow::Owned(bytes) => reader.decoder().decode(bytes)?.into_owned().into(), + }; + + match escapei::normalize_attribute_value_with(&decoded, resolve_entity)? { + // Because result is borrowed, no replacements was done and we can use original string + Cow::Borrowed(_) => Ok(decoded), + Cow::Owned(s) => Ok(s.into()), + } + } + + /// Returns the unescaped value. /// /// This is normally the value you are interested in. Escape sequences such as `>` are /// replaced with their unescaped equivalents such as `>`. @@ -791,6 +887,94 @@ mod xml { use super::*; use pretty_assertions::assert_eq; + #[cfg(any(doc, not(feature = "encoding")))] + mod attribute_value_normalization { + + use super::*; + use pretty_assertions::assert_eq; + + /// Empty values returned are unchanged + #[test] + fn test_empty() { + let raw_value = "".as_bytes(); + let output = "".to_owned(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!(attr.normalized_value(), Ok(Cow::Owned(output))); // TODO: this should not allocate + } + + /// Already normalized values are returned unchanged + #[test] + fn test_already_normalized() { + let raw_value = "foobar123".as_bytes(); + let output = "foobar123".to_owned(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!(attr.normalized_value(), Ok(Cow::Owned(output))); // TODO: this should not allocate + } + + /// Return, tab, and newline characters (0xD, 0x9, 0xA) must be substituted with + /// a space character + #[test] + fn test_space_replacement() { + let raw_value = "\r\nfoo\rbar\tbaz\n\ndelta\n".as_bytes(); + let output = " foo bar baz delta ".to_owned(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!(attr.normalized_value(), Ok(Cow::Owned(output))); + } + + /// Entities must be terminated + #[test] + fn test_unterminated_entity() { + let raw_value = "abc"def".as_bytes(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!( + attr.normalized_value(), + Err(EscapeError::UnterminatedEntity(3..11)) + ); + } + + /// Unknown entities raise error + #[test] + fn test_unrecognized_entity() { + let raw_value = "abc&unkn;def".as_bytes(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!( + attr.normalized_value(), + Err(EscapeError::UnrecognizedSymbol(4..8, "unkn".to_owned())) // TODO: is this divergence between range behavior of UnterminatedEntity and UnrecognizedSymbol appropriate? existing unescape code behaves the same. (see: start index) + ); + } + + /// custom entity replacement works, entity replacement text processed recursively + #[test] + #[ignore = "Algorithm not completely implemented"] + fn test_entity_replacement() { + let raw_value = "&d;&d;A&a; &a;B&da;".as_bytes(); + let output = " A B ".to_owned(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + fn custom_resolver(ent: &str) -> Option<&'static str> { + match ent { + "d" => Some(" "), + "a" => Some(" "), + "da" => Some(" "), + _ => None, + } + } + // dbg!(attr.normalized_value_with(&custom_resolver)); + assert_eq!( + attr.normalized_value_with(&custom_resolver), + Ok(Cow::Owned(output)) + ); + } + + #[test] + fn test_char_references() { + // character literal references are substituted without being replaced by spaces + let raw_value = " A B ".as_bytes(); + let output = "\r\rA\n\nB\r\n".to_owned(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!(attr.normalized_value(), Ok(Cow::Owned(output))); + } + } + /// Checked attribute is the single attribute mod single { use super::*; diff --git a/src/lib.rs b/src/lib.rs index 5e8a20c2..992928fa 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -51,8 +51,7 @@ mod errors; mod escapei; pub mod escape { //! Manage xml character escapes - pub(crate) use crate::escapei::EscapeError; - pub use crate::escapei::{escape, partial_escape, unescape, unescape_with}; + pub use crate::escapei::{escape, partial_escape, unescape, unescape_with, EscapeError}; } pub mod events; pub mod name;