From 34c2f720cb57873ecfa1b3567e173e104733f7ec Mon Sep 17 00:00:00 2001
From: Daniel Alley <dalley@redhat.com>
Date: Sat, 2 Apr 2022 21:14:52 -0400
Subject: [PATCH] More correctly normalize attribute values

---
 Changelog.md             |  11 +++
 benches/macrobenches.rs  |   3 +-
 benches/microbenches.rs  |  45 ++++++++++
 src/errors.rs            |   1 +
 src/escapei.rs           |  71 +++++++++++++--
 src/events/attributes.rs | 186 ++++++++++++++++++++++++++++++++++++++-
 src/lib.rs               |   3 +-
 7 files changed, 308 insertions(+), 12 deletions(-)

diff --git a/Changelog.md b/Changelog.md
index d22f4962..a0622396 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -14,6 +14,10 @@
 
 - [#541]: Deserialize specially named `$text` enum variant in [externally tagged]
   enums from textual content
+- [#379]: Improved compliance with the XML attribute value normalization process by
+  adding `Attribute::normalized_value()` and `Attribute::normalized_value_with()`,
+  which ought to be used in place of `Attribute::unescape_value()` and
+  `Attribute::unescape_value_with()`
 
 ### Bug Fixes
 
@@ -22,10 +26,15 @@
 
 ### Misc Changes
 
+### New Tests
+
+- [#379]: Added tests for attribute value normalization
+
 [externally tagged]: https://serde.rs/enum-representations.html#externally-tagged
 [#490]: https://github.com/tafia/quick-xml/pull/490
 [#537]: https://github.com/tafia/quick-xml/issues/537
 [#541]: https://github.com/tafia/quick-xml/pull/541
+[#379]: https://github.com/tafia/quick-xml/pull/379
 
 ## 0.27.1 -- 2022-12-28
 
@@ -120,6 +129,8 @@
 - [#489]: Reduced the size of the package uploaded into the crates.io by excluding
   tests, examples, and benchmarks.
 
+### New Tests
+
 [#481]: https://github.com/tafia/quick-xml/pull/481
 [#489]: https://github.com/tafia/quick-xml/pull/489
 
diff --git a/benches/macrobenches.rs b/benches/macrobenches.rs
index a8cbbd53..67e51f1f 100644
--- a/benches/macrobenches.rs
+++ b/benches/macrobenches.rs
@@ -43,14 +43,13 @@ static INPUTS: &[(&str, &str)] = &[
     ("players.xml", PLAYERS),
 ];
 
-// TODO: use fully normalized attribute values
 fn parse_document_from_str(doc: &str) -> XmlResult<()> {
     let mut r = Reader::from_str(doc);
     loop {
         match criterion::black_box(r.read_event()?) {
             Event::Start(e) | Event::Empty(e) => {
                 for attr in e.attributes() {
-                    criterion::black_box(attr?.decode_and_unescape_value(&r)?);
+                    criterion::black_box(attr?.decode_and_normalize_value(&r)?);
                 }
             }
             Event::Text(e) => {
diff --git a/benches/microbenches.rs b/benches/microbenches.rs
index aa5c8b70..341ad42d 100644
--- a/benches/microbenches.rs
+++ b/benches/microbenches.rs
@@ -242,6 +242,50 @@ fn attributes(c: &mut Criterion) {
             assert_eq!(count, 150);
         })
     });
+
+    group.finish();
+}
+
+/// Benchmarks normalizing attribute values
+fn attribute_value_normalization(c: &mut Criterion) {
+    let mut group = c.benchmark_group("attribute_value_normalization");
+
+    group.bench_function("noop_short", |b| {
+        b.iter(|| {
+            criterion::black_box(unescape("foobar")).unwrap();
+        })
+    });
+
+    group.bench_function("noop_long", |b| {
+        b.iter(|| {
+            criterion::black_box(unescape("just a bit of text without any entities")).unwrap();
+        })
+    });
+
+    group.bench_function("replacement_chars", |b| {
+        b.iter(|| {
+            criterion::black_box(unescape("just a bit\n of text without\tany entities")).unwrap();
+        })
+    });
+
+    group.bench_function("char_reference", |b| {
+        b.iter(|| {
+            let text = "prefix &#34;some stuff&#34;,&#x22;more stuff&#x22;";
+            criterion::black_box(unescape(text)).unwrap();
+            let text = "&#38;&#60;";
+            criterion::black_box(unescape(text)).unwrap();
+        })
+    });
+
+    group.bench_function("entity_reference", |b| {
+        b.iter(|| {
+            let text = "age &gt; 72 &amp;&amp; age &lt; 21";
+            criterion::black_box(unescape(text)).unwrap();
+            let text = "&quot;what&apos;s that?&quot;";
+            criterion::black_box(unescape(text)).unwrap();
+        })
+    });
+
     group.finish();
 }
 
@@ -354,6 +398,7 @@ criterion_group!(
     read_resolved_event_into,
     one_event,
     attributes,
+    attribute_value_normalization,
     escaping,
     unescaping,
 );
diff --git a/src/errors.rs b/src/errors.rs
index 0afd5a02..f0be93ef 100644
--- a/src/errors.rs
+++ b/src/errors.rs
@@ -77,6 +77,7 @@ impl From<EscapeError> for Error {
 }
 
 impl From<AttrError> for Error {
+    /// Creates a new `Error::InvalidAttr` from the given error
     #[inline]
     fn from(error: AttrError) -> Self {
         Error::InvalidAttr(error)
diff --git a/src/escapei.rs b/src/escapei.rs
index f537233a..b711b0f0 100644
--- a/src/escapei.rs
+++ b/src/escapei.rs
@@ -8,7 +8,7 @@ use std::ops::Range;
 use pretty_assertions::assert_eq;
 
 /// Error for XML escape / unescape.
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, PartialEq)]
 pub enum EscapeError {
     /// Entity with Null character
     EntityWithNull(Range<usize>),
@@ -212,8 +212,66 @@ where
     }
 }
 
+/// Returns the attribute value normalized as per the XML specification, using a custom
+/// entity resolver.
+///
+/// https://www.w3.org/TR/xml/#AVNormalize
+///
+/// Do not use this method with HTML attributes.
+///
+/// Escape sequences such as `&gt;` are replaced with their unescaped equivalents such as `>`
+/// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. A function
+/// for resolving entities can be provided as `resolve_entity`. Builtin entities will still
+/// take precedence.
+///
+/// This will allocate unless the raw attribute value does not require normalization.
+pub(crate) fn normalize_attribute_value_with<'a, 'entity>(
+    value: &'a str,
+    resolve_entity: impl Fn(&str) -> Option<&'entity str>,
+) -> Result<Cow<'a, str>, EscapeError> {
+    // TODO: avoid allocation when not needed
+    let mut normalized = String::with_capacity(value.len());
+
+    let attr = value.as_bytes();
+    let mut attr_iter = attr.iter().enumerate();
+
+    while let Some((idx, ch)) = attr_iter.next() {
+        match ch {
+            b' ' | b'\n' | b'\r' | b'\t' => normalized.push(' '),
+            b'&' => {
+                let end = idx
+                    + 1
+                    + attr_iter
+                        .position(|(_, c)| *c == b';')
+                        .ok_or_else(|| EscapeError::UnterminatedEntity(idx..attr.len()))?;
+                let entity = &attr[idx + 1..end]; // starts after the &
+                let entity_str = std::str::from_utf8(entity).expect("failed UTF-8 check");
+
+                if entity.starts_with(b"#") {
+                    let entity = &entity_str[1..]; // starts after the #
+                    let codepoint = parse_number(entity, idx..end)?;
+                    normalized.push_str(codepoint.encode_utf8(&mut [0u8; 4]));
+                } else if let Some(s) = named_entity(entity_str) {
+                    normalized.push_str(s);
+                } else if let Some(value) = resolve_entity(entity_str) {
+                    // TODO: recursively apply entity substitution
+                    normalized.push_str(&value);
+                } else {
+                    return Err(EscapeError::UnrecognizedSymbol(
+                        idx + 1..end,
+                        String::from_utf8(entity.to_vec()).expect("failed UTF-8 check"),
+                    ));
+                }
+            }
+            _ => normalized.push(*ch as char),
+        }
+    }
+
+    Ok(Cow::Owned(normalized))
+}
+
 #[cfg(not(feature = "escape-html"))]
-fn named_entity(name: &str) -> Option<&str> {
+pub(crate) fn named_entity(name: &str) -> Option<&str> {
     // match over strings are not allowed in const functions
     let s = match name.as_bytes() {
         b"lt" => "<",
@@ -226,9 +284,8 @@ fn named_entity(name: &str) -> Option<&str> {
     Some(s)
 }
 #[cfg(feature = "escape-html")]
-fn named_entity(name: &str) -> Option<&str> {
+pub(crate) fn named_entity(name: &str) -> Option<&str> {
     // imported from https://dev.w3.org/html5/html-author/charref
-    // match over strings are not allowed in const functions
     //TODO: automate up-to-dating using https://html.spec.whatwg.org/entities.json
     let s = match name.as_bytes() {
         b"Tab" => "\u{09}",
@@ -1690,7 +1747,7 @@ fn named_entity(name: &str) -> Option<&str> {
     Some(s)
 }
 
-fn parse_number(bytes: &str, range: Range<usize>) -> Result<char, EscapeError> {
+pub fn parse_number(bytes: &str, range: Range<usize>) -> Result<char, EscapeError> {
     let code = if bytes.starts_with('x') {
         parse_hexadecimal(&bytes[1..])
     } else {
@@ -1705,7 +1762,7 @@ fn parse_number(bytes: &str, range: Range<usize>) -> Result<char, EscapeError> {
     }
 }
 
-fn parse_hexadecimal(bytes: &str) -> Result<u32, EscapeError> {
+pub fn parse_hexadecimal(bytes: &str) -> Result<u32, EscapeError> {
     // maximum code is 0x10FFFF => 6 characters
     if bytes.len() > 6 {
         return Err(EscapeError::TooLongHexadecimal);
@@ -1723,7 +1780,7 @@ fn parse_hexadecimal(bytes: &str) -> Result<u32, EscapeError> {
     Ok(code)
 }
 
-fn parse_decimal(bytes: &str) -> Result<u32, EscapeError> {
+pub fn parse_decimal(bytes: &str) -> Result<u32, EscapeError> {
     // maximum code is 0x10FFFF = 1114111 => 7 characters
     if bytes.len() > 7 {
         return Err(EscapeError::TooLongDecimal);
diff --git a/src/events/attributes.rs b/src/events/attributes.rs
index e2cf8c62..6e12c133 100644
--- a/src/events/attributes.rs
+++ b/src/events/attributes.rs
@@ -4,6 +4,9 @@
 
 use crate::errors::Result as XmlResult;
 use crate::escape::{escape, unescape_with};
+use crate::escapei;
+#[cfg(any(doc, not(feature = "encoding")))]
+use crate::escapei::EscapeError;
 use crate::name::QName;
 use crate::reader::{is_whitespace, Reader};
 use crate::utils::{write_byte_string, write_cow_string, Bytes};
@@ -30,7 +33,100 @@ pub struct Attribute<'a> {
 }
 
 impl<'a> Attribute<'a> {
-    /// Decodes using UTF-8 then unescapes the value.
+    /// Returns the attribute value normalized as per the XML specification.
+    ///
+    /// https://www.w3.org/TR/xml/#AVNormalize
+    ///
+    /// Do not use this method with HTML attributes.
+    ///
+    /// Escape sequences such as `&gt;` are replaced with their unescaped equivalents such as `>`
+    /// and the characters \t, \r, \n are replaced with whitespace characters.
+    ///
+    /// This will allocate unless the raw attribute value does not require normalization.
+    ///
+    /// See also [`normalized_value_with()`](#method.normalized_value_with)
+    #[cfg(any(doc, not(feature = "encoding")))]
+    pub fn normalized_value(&'a self) -> Result<Cow<'a, str>, EscapeError> {
+        self.normalized_value_with(|_| None)
+    }
+
+    /// Returns the attribute value normalized as per the XML specification, using a custom
+    /// entity resolver.
+    ///
+    /// https://www.w3.org/TR/xml/#AVNormalize
+    ///
+    /// Do not use this method with HTML attributes.
+    ///
+    /// Escape sequences such as `&gt;` are replaced with their unescaped equivalents such as `>`
+    /// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. A function
+    /// for resolving entities can be provided as `resolve_entity`. Builtin entities will still
+    /// take precedence.
+    ///
+    /// This will allocate unless the raw attribute value does not require normalization.
+    ///
+    /// See also [`normalized_value()`](#method.normalized_value)
+    #[cfg(any(doc, not(feature = "encoding")))]
+    pub fn normalized_value_with<'entity>(
+        &'a self,
+        resolve_entity: impl Fn(&str) -> Option<&'entity str>,
+    ) -> Result<Cow<'a, str>, EscapeError> {
+        escapei::normalize_attribute_value_with(
+            std::str::from_utf8(self.value.as_ref()).expect("unable to decode as utf-8"),
+            resolve_entity,
+        )
+    }
+
+    /// Decodes using a provided reader and returns the attribute value normalized
+    /// as per the XML specification.
+    ///
+    /// https://www.w3.org/TR/xml/#AVNormalize
+    ///
+    /// Do not use this method with HTML attributes.
+    ///
+    /// Escape sequences such as `&gt;` are replaced with their unescaped equivalents such as `>`
+    /// and the characters \t, \r, \n are replaced with whitespace characters.
+    ///
+    /// This will allocate unless the raw attribute value does not require normalization.
+    ///
+    /// See also [`decode_and_normalize_value_with()`](#method.decode_and_normalize_value_with)
+    pub fn decode_and_normalize_value<B>(&self, reader: &Reader<B>) -> XmlResult<Cow<'a, str>> {
+        self.decode_and_normalize_value_with(reader, |_| None)
+    }
+
+    /// Decodes using a provided reader and returns the attribute value normalized
+    /// as per the XML specification, using a custom entity resolver.
+    ///
+    /// https://www.w3.org/TR/xml/#AVNormalize
+    ///
+    /// Do not use this method with HTML attributes.
+    ///
+    /// Escape sequences such as `&gt;` are replaced with their unescaped equivalents such as `>`
+    /// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. A function
+    /// for resolving entities can be provided as `resolve_entity`. Builtin entities will still
+    /// take precedence.
+    ///
+    /// This will allocate unless the raw attribute value does not require normalization.
+    ///
+    /// See also [`decode_and_normalize_value()`](#method.decode_and_normalize_value)
+    pub fn decode_and_normalize_value_with<'entity, B>(
+        &self,
+        reader: &Reader<B>,
+        resolve_entity: impl Fn(&str) -> Option<&'entity str>,
+    ) -> XmlResult<Cow<'a, str>> {
+        let decoded = match &self.value {
+            Cow::Borrowed(bytes) => reader.decoder().decode(bytes)?,
+            // Convert to owned, because otherwise Cow will be bound with wrong lifetime
+            Cow::Owned(bytes) => reader.decoder().decode(bytes)?.into_owned().into(),
+        };
+
+        match escapei::normalize_attribute_value_with(&decoded, resolve_entity)? {
+            // Because result is borrowed, no replacements was done and we can use original string
+            Cow::Borrowed(_) => Ok(decoded),
+            Cow::Owned(s) => Ok(s.into()),
+        }
+    }
+
+    /// Returns the unescaped value.
     ///
     /// This is normally the value you are interested in. Escape sequences such as `&gt;` are
     /// replaced with their unescaped equivalents such as `>`.
@@ -791,6 +887,94 @@ mod xml {
     use super::*;
     use pretty_assertions::assert_eq;
 
+    #[cfg(any(doc, not(feature = "encoding")))]
+    mod attribute_value_normalization {
+
+        use super::*;
+        use pretty_assertions::assert_eq;
+
+        /// Empty values returned are unchanged
+        #[test]
+        fn test_empty() {
+            let raw_value = "".as_bytes();
+            let output = "".to_owned();
+            let attr = Attribute::from(("foo".as_bytes(), raw_value));
+            assert_eq!(attr.normalized_value(), Ok(Cow::Owned(output))); // TODO: this should not allocate
+        }
+
+        /// Already normalized values are returned unchanged
+        #[test]
+        fn test_already_normalized() {
+            let raw_value = "foobar123".as_bytes();
+            let output = "foobar123".to_owned();
+            let attr = Attribute::from(("foo".as_bytes(), raw_value));
+            assert_eq!(attr.normalized_value(), Ok(Cow::Owned(output))); // TODO: this should not allocate
+        }
+
+        /// Return, tab, and newline characters (0xD, 0x9, 0xA) must be substituted with
+        /// a space character
+        #[test]
+        fn test_space_replacement() {
+            let raw_value = "\r\nfoo\rbar\tbaz\n\ndelta\n".as_bytes();
+            let output = "  foo bar baz  delta ".to_owned();
+            let attr = Attribute::from(("foo".as_bytes(), raw_value));
+            assert_eq!(attr.normalized_value(), Ok(Cow::Owned(output)));
+        }
+
+        /// Entities must be terminated
+        #[test]
+        fn test_unterminated_entity() {
+            let raw_value = "abc&quotdef".as_bytes();
+            let attr = Attribute::from(("foo".as_bytes(), raw_value));
+            assert_eq!(
+                attr.normalized_value(),
+                Err(EscapeError::UnterminatedEntity(3..11))
+            );
+        }
+
+        /// Unknown entities raise error
+        #[test]
+        fn test_unrecognized_entity() {
+            let raw_value = "abc&unkn;def".as_bytes();
+            let attr = Attribute::from(("foo".as_bytes(), raw_value));
+            assert_eq!(
+                attr.normalized_value(),
+                Err(EscapeError::UnrecognizedSymbol(4..8, "unkn".to_owned())) // TODO: is this divergence between range behavior of UnterminatedEntity and UnrecognizedSymbol appropriate? existing unescape code behaves the same.  (see: start index)
+            );
+        }
+
+        /// custom entity replacement works, entity replacement text processed recursively
+        #[test]
+        #[ignore = "Algorithm not completely implemented"]
+        fn test_entity_replacement() {
+            let raw_value = "&d;&d;A&a;&#x20;&a;B&da;".as_bytes();
+            let output = "  A   B  ".to_owned();
+            let attr = Attribute::from(("foo".as_bytes(), raw_value));
+            fn custom_resolver(ent: &str) -> Option<&'static str> {
+                match ent {
+                    "d" => Some("&#xD;"),
+                    "a" => Some("&#xA;"),
+                    "da" => Some("&#xD;&#xA;"),
+                    _ => None,
+                }
+            }
+            // dbg!(attr.normalized_value_with(&custom_resolver));
+            assert_eq!(
+                attr.normalized_value_with(&custom_resolver),
+                Ok(Cow::Owned(output))
+            );
+        }
+
+        #[test]
+        fn test_char_references() {
+            // character literal references are substituted without being replaced by spaces
+            let raw_value = "&#xd;&#xd;A&#xa;&#xa;B&#xd;&#xa;".as_bytes();
+            let output = "\r\rA\n\nB\r\n".to_owned();
+            let attr = Attribute::from(("foo".as_bytes(), raw_value));
+            assert_eq!(attr.normalized_value(), Ok(Cow::Owned(output)));
+        }
+    }
+
     /// Checked attribute is the single attribute
     mod single {
         use super::*;
diff --git a/src/lib.rs b/src/lib.rs
index 5e8a20c2..992928fa 100644
--- a/src/lib.rs
+++ b/src/lib.rs
@@ -51,8 +51,7 @@ mod errors;
 mod escapei;
 pub mod escape {
     //! Manage xml character escapes
-    pub(crate) use crate::escapei::EscapeError;
-    pub use crate::escapei::{escape, partial_escape, unescape, unescape_with};
+    pub use crate::escapei::{escape, partial_escape, unescape, unescape_with, EscapeError};
 }
 pub mod events;
 pub mod name;