From ac12797782ff3511c12ed0e4263370363a09b89c Mon Sep 17 00:00:00 2001
From: Daniel Alley <dalley@redhat.com>
Date: Sat, 2 Apr 2022 21:14:52 -0400
Subject: [PATCH 1/2] Add functions for attribute value normalization

closes #371
---
 Changelog.md             |  15 ++-
 benches/macrobenches.rs  |  12 +-
 benches/microbenches.rs  |  72 +++++++++++
 src/errors.rs            |   1 +
 src/escape.rs            | 261 ++++++++++++++++++++++++++++++++++++++-
 src/events/attributes.rs | 185 ++++++++++++++++++++++++++-
 src/events/mod.rs        |   2 +-
 7 files changed, 532 insertions(+), 16 deletions(-)

diff --git a/Changelog.md b/Changelog.md
index 0ba5a368..89385b69 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -12,12 +12,19 @@
 
 ### New Features
 
+- [#379]: Improved compliance with the XML attribute value normalization process by
+  adding `Attribute::normalized_value()` and `Attribute::normalized_value_with()`,
+  which ought to be used in place of `Attribute::unescape_value()` and
+  `Attribute::unescape_value_with()`
+
 ### Bug Fixes
 
 ### Misc Changes
 
+- [#379]: Added tests for attribute value normalization
 - [#650]: Change the type of `Event::PI` to a new dedicated `BytesPI` type.
 
+[#379]: https://github.com/tafia/quick-xml/pull/379
 [#650]: https://github.com/tafia/quick-xml/issues/650
 
 
@@ -57,9 +64,6 @@ resolve predefined entities.
   - `quick_xml::escape::resolve_html5_entity`
 - [#753]: Added parser for processing instructions: `quick_xml::reader::PiParser`.
 - [#754]: Added parser for elements: `quick_xml::reader::ElementParser`.
-
-### Bug Fixes
-
 - [#622]: Fix wrong disregarding of not closed markup, such as lone `<`.
 - [#684]: Fix incorrect position reported for `Error::IllFormed(DoubleHyphenInComment)`.
 - [#684]: Fix incorrect position reported for `Error::IllFormed(MissingDoctypeName)`.
@@ -120,7 +124,6 @@ resolve predefined entities.
 [`PayloadEvent`]: https://docs.rs/quick-xml/latest/quick_xml/de/enum.PayloadEvent.html
 [`Text`]: https://docs.rs/quick-xml/latest/quick_xml/de/struct.Text.html
 
-
 ## 0.31.0 -- 2023-10-22
 
 MSRV bumped to 1.56! Crate now uses Rust 2021 edition.
@@ -291,6 +294,10 @@ serde >= 1.0.181
 - [#565]: Fix compilation error when build with serde <1.0.139
 
 
+### New Tests
+
+- [#379]: Added tests for attribute value normalization
+
 [externally tagged]: https://serde.rs/enum-representations.html#externally-tagged
 [#490]: https://github.com/tafia/quick-xml/pull/490
 [#510]: https://github.com/tafia/quick-xml/issues/510
diff --git a/benches/macrobenches.rs b/benches/macrobenches.rs
index a8cbbd53..8864e74b 100644
--- a/benches/macrobenches.rs
+++ b/benches/macrobenches.rs
@@ -43,14 +43,13 @@ static INPUTS: &[(&str, &str)] = &[
     ("players.xml", PLAYERS),
 ];
 
-// TODO: use fully normalized attribute values
 fn parse_document_from_str(doc: &str) -> XmlResult<()> {
     let mut r = Reader::from_str(doc);
     loop {
         match criterion::black_box(r.read_event()?) {
             Event::Start(e) | Event::Empty(e) => {
                 for attr in e.attributes() {
-                    criterion::black_box(attr?.decode_and_unescape_value(&r)?);
+                    criterion::black_box(attr?.decode_and_normalize_value(&r)?);
                 }
             }
             Event::Text(e) => {
@@ -67,7 +66,6 @@ fn parse_document_from_str(doc: &str) -> XmlResult<()> {
     Ok(())
 }
 
-// TODO: use fully normalized attribute values
 fn parse_document_from_bytes(doc: &[u8]) -> XmlResult<()> {
     let mut r = Reader::from_reader(doc);
     let mut buf = Vec::new();
@@ -75,7 +73,7 @@ fn parse_document_from_bytes(doc: &[u8]) -> XmlResult<()> {
         match criterion::black_box(r.read_event_into(&mut buf)?) {
             Event::Start(e) | Event::Empty(e) => {
                 for attr in e.attributes() {
-                    criterion::black_box(attr?.decode_and_unescape_value(&r)?);
+                    criterion::black_box(attr?.decode_and_normalize_value(&r)?);
                 }
             }
             Event::Text(e) => {
@@ -93,7 +91,6 @@ fn parse_document_from_bytes(doc: &[u8]) -> XmlResult<()> {
     Ok(())
 }
 
-// TODO: use fully normalized attribute values
 fn parse_document_from_str_with_namespaces(doc: &str) -> XmlResult<()> {
     let mut r = NsReader::from_str(doc);
     loop {
@@ -101,7 +98,7 @@ fn parse_document_from_str_with_namespaces(doc: &str) -> XmlResult<()> {
             (resolved_ns, Event::Start(e) | Event::Empty(e)) => {
                 criterion::black_box(resolved_ns);
                 for attr in e.attributes() {
-                    criterion::black_box(attr?.decode_and_unescape_value(&r)?);
+                    criterion::black_box(attr?.decode_and_normalize_value(&r)?);
                 }
             }
             (resolved_ns, Event::Text(e)) => {
@@ -120,7 +117,6 @@ fn parse_document_from_str_with_namespaces(doc: &str) -> XmlResult<()> {
     Ok(())
 }
 
-// TODO: use fully normalized attribute values
 fn parse_document_from_bytes_with_namespaces(doc: &[u8]) -> XmlResult<()> {
     let mut r = NsReader::from_reader(doc);
     let mut buf = Vec::new();
@@ -129,7 +125,7 @@ fn parse_document_from_bytes_with_namespaces(doc: &[u8]) -> XmlResult<()> {
             (resolved_ns, Event::Start(e) | Event::Empty(e)) => {
                 criterion::black_box(resolved_ns);
                 for attr in e.attributes() {
-                    criterion::black_box(attr?.decode_and_unescape_value(&r)?);
+                    criterion::black_box(attr?.decode_and_normalize_value(&r)?);
                 }
             }
             (resolved_ns, Event::Text(e)) => {
diff --git a/benches/microbenches.rs b/benches/microbenches.rs
index 2f4ece04..70b84568 100644
--- a/benches/microbenches.rs
+++ b/benches/microbenches.rs
@@ -1,6 +1,9 @@
+use std::borrow::Cow;
+
 use criterion::{self, criterion_group, criterion_main, Criterion};
 use pretty_assertions::assert_eq;
 use quick_xml::escape::{escape, unescape};
+use quick_xml::events::attributes::Attribute;
 use quick_xml::events::Event;
 use quick_xml::name::QName;
 use quick_xml::reader::{NsReader, Reader};
@@ -242,6 +245,74 @@ fn attributes(c: &mut Criterion) {
             assert_eq!(count, 150);
         })
     });
+
+    group.finish();
+}
+
+/// Benchmarks normalizing attribute values
+fn attribute_value_normalization(c: &mut Criterion) {
+    let mut group = c.benchmark_group("attribute_value_normalization");
+
+    group.bench_function("noop_short", |b| {
+        let attr = Attribute {
+            key: QName(b"foo"),
+            value: Cow::Borrowed(b"foobar"),
+        };
+        b.iter(|| {
+            criterion::black_box(attr.normalized_value()).unwrap();
+        })
+    });
+
+    group.bench_function("noop_long", |b| {
+        let attr = Attribute {
+            key: QName(b"foo"),
+            value: Cow::Borrowed(LOREM_IPSUM_TEXT.as_bytes()),
+        };
+        b.iter(|| {
+            criterion::black_box(attr.normalized_value()).unwrap();
+        })
+    });
+
+    group.bench_function("replacement_chars", |b| {
+        let attr = Attribute {
+            key: QName(b"foo"),
+            value: Cow::Borrowed(b"just a bit\n of text without\tany entities"),
+        };
+        b.iter(|| {
+            criterion::black_box(attr.normalized_value()).unwrap();
+        })
+    });
+
+    group.bench_function("char_reference", |b| {
+        let attr1 = Attribute {
+            key: QName(b"foo"),
+            value: Cow::Borrowed(b"prefix &#34;some stuff&#34;,&#x22;more stuff&#x22;"),
+        };
+        let attr2 = Attribute {
+            key: QName(b"foo"),
+            value: Cow::Borrowed(b"&#38;&#60;"),
+        };
+        b.iter(|| {
+            criterion::black_box(attr1.normalized_value()).unwrap();
+            criterion::black_box(attr2.normalized_value()).unwrap();
+        })
+    });
+
+    group.bench_function("entity_reference", |b| {
+        let attr1 = Attribute {
+            key: QName(b"foo"),
+            value: Cow::Borrowed(b"age &gt; 72 &amp;&amp; age &lt; 21"),
+        };
+        let attr2 = Attribute {
+            key: QName(b"foo"),
+            value: Cow::Borrowed(b"&quot;what&apos;s that?&quot;"),
+        };
+        b.iter(|| {
+            criterion::black_box(attr1.normalized_value()).unwrap();
+            criterion::black_box(attr2.normalized_value()).unwrap();
+        })
+    });
+
     group.finish();
 }
 
@@ -354,6 +425,7 @@ criterion_group!(
     read_resolved_event_into,
     one_event,
     attributes,
+    attribute_value_normalization,
     escaping,
     unescaping,
 );
diff --git a/src/errors.rs b/src/errors.rs
index 0c5c46e3..e0394aaf 100644
--- a/src/errors.rs
+++ b/src/errors.rs
@@ -254,6 +254,7 @@ impl From<EscapeError> for Error {
 }
 
 impl From<AttrError> for Error {
+    /// Creates a new `Error::InvalidAttr` from the given error
     #[inline]
     fn from(error: AttrError) -> Self {
         Error::InvalidAttr(error)
diff --git a/src/escape.rs b/src/escape.rs
index 54972624..3d65dacc 100644
--- a/src/escape.rs
+++ b/src/escape.rs
@@ -3,12 +3,13 @@
 use memchr::memchr2_iter;
 use std::borrow::Cow;
 use std::ops::Range;
+use std::slice::Iter;
 
 #[cfg(test)]
 use pretty_assertions::assert_eq;
 
 /// Error for XML escape / unescape.
-#[derive(Clone, Debug)]
+#[derive(Clone, Debug, PartialEq)]
 pub enum EscapeError {
     /// Entity with Null character
     EntityWithNull(Range<usize>),
@@ -278,6 +279,163 @@ where
     }
 }
 
+const fn is_normalization_char(b: &u8) -> bool {
+    matches!(*b, b'\t' | b'\r' | b'\n' | b' ' | b'&')
+}
+
+/// Returns the attribute value normalized as per [the XML specification],
+/// using a custom entity resolver.
+///
+/// Do not use this method with HTML attributes.
+///
+/// Escape sequences such as `&gt;` are replaced with their unescaped equivalents such as `>`
+/// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. A function
+/// for resolving entities can be provided as `resolve_entity`. Builtin entities will still
+/// take precedence.
+///
+/// This will allocate unless the raw attribute value does not require normalization.
+///
+/// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize
+pub(crate) fn normalize_attribute_value<'input, 'entity, F>(
+    value: &'input str,
+    resolve_entity: F,
+) -> Result<Cow<'input, str>, EscapeError>
+where
+    // the lifetime of the output comes from a capture or is `'static`
+    F: Fn(&str) -> Option<&'entity str>,
+{
+    const DEPTH: usize = 128;
+
+    let bytes = value.as_bytes();
+    let mut iter = bytes.iter();
+
+    if let Some(i) = iter.position(is_normalization_char) {
+        let mut normalized = String::new();
+        let pos = normalize_step(
+            &mut normalized,
+            &mut iter,
+            value,
+            0,
+            i,
+            DEPTH,
+            &resolve_entity,
+        )?;
+
+        normalize_steps(
+            &mut normalized,
+            &mut iter,
+            value,
+            pos,
+            DEPTH,
+            &resolve_entity,
+        )?;
+        return Ok(normalized.into());
+    }
+    Ok(Cow::Borrowed(value))
+}
+
+fn normalize_steps<'entity, F>(
+    normalized: &mut String,
+    iter: &mut Iter<u8>,
+    input: &str,
+    mut pos: usize,
+    depth: usize,
+    resolve_entity: &F,
+) -> Result<(), EscapeError>
+where
+    // the lifetime of the output comes from a capture or is `'static`
+    F: Fn(&str) -> Option<&'entity str>,
+{
+    while let Some(i) = iter.position(is_normalization_char) {
+        pos = normalize_step(normalized, iter, input, pos, pos + i, depth, resolve_entity)?;
+    }
+    if let Some(rest) = input.get(pos..) {
+        normalized.push_str(rest);
+    }
+    Ok(())
+}
+
+/// Performs one step of the [normalization algorithm] (but with recursive part):
+///
+/// 1. For a character reference, append the referenced character
+///    to the normalized value.
+/// 2. For an entity reference, recursively apply this algorithm
+///    to the replacement text of the entity.
+/// 3. For a white space character (#x20, #xD, #xA, #x9), append
+///    a space character (#x20) to the normalized value.
+/// 4. For another character, append the character to the normalized value.
+///
+/// # Parameters
+///
+/// - `normalized`: Output of the algorithm. Normalized value will be placed here
+/// - `iter`: Iterator over bytes of `input`
+/// - `input`: Original non-normalized value
+/// - `last_pos`: Index of the last byte in `input` that was processed
+/// - `index`: Index of the byte in `input` that should be processed now
+/// - `depth`: Current recursion depth. Too deep recursion will interrupt the algorithm
+/// - `resolve_entity`: Resolver of entities. Returns `None` for unknown entities
+///
+/// [normalization algorithm]: https://www.w3.org/TR/xml11/#AVNormalize
+fn normalize_step<'entity, F>(
+    normalized: &mut String,
+    iter: &mut Iter<u8>,
+    input: &str,
+    last_pos: usize,
+    index: usize,
+    depth: usize,
+    resolve_entity: &F,
+) -> Result<usize, EscapeError>
+where
+    // the lifetime of the output comes from a capture or is `'static`
+    F: Fn(&str) -> Option<&'entity str>,
+{
+    // 4. For another character, append the character to the normalized value.
+    normalized.push_str(&input[last_pos..index]);
+
+    match input.as_bytes()[index] {
+        b'&' => {
+            let start = index + 1; // +1 - skip `&`
+            let end = start
+                + match iter.position(|&b| b == b';') {
+                    Some(end) => end,
+                    None => return Err(EscapeError::UnterminatedEntity(index..input.len())),
+                };
+
+            // Content between & and ; - &pat;
+            let pat = &input[start..end];
+            // 1. For a character reference, append the referenced character
+            //    to the normalized value.
+            if pat.starts_with('#') {
+                let entity = &pat[1..]; // starts after the #
+                let codepoint = parse_number(entity, start..end)?;
+                normalized.push_str(codepoint.encode_utf8(&mut [0u8; 4]));
+            // 2. For an entity reference, recursively apply this algorithm
+            //    to the replacement text of the entity.
+            } else if let Some(value) = resolve_entity(pat) {
+                normalize_steps(
+                    normalized,
+                    &mut value.as_bytes().iter(),
+                    value,
+                    0,
+                    depth.saturating_sub(1),
+                    resolve_entity,
+                )?;
+            } else {
+                return Err(EscapeError::UnrecognizedSymbol(start..end, pat.to_string()));
+            }
+            Ok(end + 1) // +1 - skip `;`
+        }
+        // 3. For a white space character (#x20, #xD, #xA, #x9), append
+        //    a space character (#x20) to the normalized value.
+        b'\t' | b'\n' | b'\r' | b' ' => {
+            normalized.push(' ');
+            Ok(index + 1) // +1 - skip character
+        }
+
+        _ => unreachable!("Only '\\t', '\\n', '\\r', ' ', and '&' are possible here"),
+    }
+}
+
 /// Resolves predefined XML entities or all HTML5 entities depending on the feature
 /// [`escape-html`](https://docs.rs/quick-xml/latest/quick_xml/#escape-html).
 ///
@@ -1927,3 +2085,104 @@ fn test_minimal_escape() {
         "prefix_\"a\"b&amp;&lt;>c"
     );
 }
+
+#[cfg(test)]
+mod normalization {
+    use super::*;
+    use pretty_assertions::assert_eq;
+
+    #[test]
+    fn empty() {
+        assert_eq!(normalize_attribute_value("", |_| { None }), Ok("".into()));
+    }
+
+    #[test]
+    fn only_spaces() {
+        assert_eq!(
+            normalize_attribute_value("   ", |_| { None }),
+            Ok("   ".into())
+        );
+        assert_eq!(
+            normalize_attribute_value("\t\t\t", |_| { None }),
+            Ok("   ".into())
+        );
+        assert_eq!(
+            normalize_attribute_value("\r\r\r", |_| { None }),
+            Ok("   ".into())
+        );
+        assert_eq!(
+            normalize_attribute_value("\n\n\n", |_| { None }),
+            Ok("   ".into())
+        );
+    }
+
+    #[test]
+    fn already_normalized() {
+        assert_eq!(
+            normalize_attribute_value("already normalized", |_| { None }),
+            Ok("already normalized".into())
+        );
+    }
+
+    #[test]
+    fn characters() {
+        assert_eq!(
+            normalize_attribute_value("string with &#32; character", |_| { None }),
+            Ok("string with   character".into())
+        );
+        assert_eq!(
+            normalize_attribute_value("string with &#x20; character", |_| { None }),
+            Ok("string with   character".into())
+        );
+    }
+
+    #[test]
+    fn entities() {
+        assert_eq!(
+            normalize_attribute_value("string with &entity; reference", |_| {
+                Some("replacement")
+            }),
+            Ok("string with replacement reference".into())
+        );
+        assert_eq!(
+            normalize_attribute_value("string with &entity-1; reference", |entity| {
+                match entity {
+                    "entity-1" => Some("recursive &entity-2;"),
+                    "entity-2" => Some("entity&#32;2"),
+                    _ => None,
+                }
+            }),
+            Ok("string with recursive entity 2 reference".into())
+        );
+    }
+
+    #[test]
+    fn unclosed_entity() {
+        assert_eq!(
+            normalize_attribute_value("string with unclosed &entity reference", |_| {
+                //     0                    ^ = 21           ^ = 38
+                Some("replacement")
+            }),
+            Err(EscapeError::UnterminatedEntity(21..38))
+        );
+        assert_eq!(
+            normalize_attribute_value("string with unclosed &#32 (character) reference", |_| {
+                //     0                    ^ = 21                    ^ = 47
+                None
+            }),
+            Err(EscapeError::UnterminatedEntity(21..47))
+        );
+    }
+
+    #[test]
+    fn unknown_entity() {
+        assert_eq!(
+            normalize_attribute_value("string with unknown &entity; reference", |_| { None }),
+            //         0                    ^     ^ = 21..27
+            Err(EscapeError::UnrecognizedSymbol(
+                21..27,
+                "entity".to_string()
+            ))
+        );
+    }
+}
diff --git a/src/events/attributes.rs b/src/events/attributes.rs
index e08303d7..274107da 100644
--- a/src/events/attributes.rs
+++ b/src/events/attributes.rs
@@ -3,7 +3,9 @@
 //! Provides an iterator over attributes key/value pairs
 
 use crate::errors::Result as XmlResult;
-use crate::escape::{escape, resolve_predefined_entity, unescape_with};
+use crate::escape::{self, escape, resolve_predefined_entity, unescape_with};
+#[cfg(any(doc, not(feature = "encoding")))]
+use crate::escape::EscapeError;
 use crate::name::QName;
 use crate::reader::{is_whitespace, Reader};
 use crate::utils::{write_byte_string, write_cow_string, Bytes};
@@ -30,7 +32,100 @@ pub struct Attribute<'a> {
 }
 
 impl<'a> Attribute<'a> {
-    /// Decodes using UTF-8 then unescapes the value.
+    /// Returns the attribute value normalized as per [the XML specification].
+    ///
+    /// Do not use this method with HTML attributes.
+    ///
+    /// Escape sequences such as `&gt;` are replaced with their unescaped equivalents such as `>`
+    /// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters.
+    ///
+    /// This will allocate unless the raw attribute value does not require normalization.
+    ///
+    /// See also [`normalized_value_with()`](#method.Self::normalized_value_with)
+    ///
+    /// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize
+    #[cfg(any(doc, not(feature = "encoding")))]
+    pub fn normalized_value(&'a self) -> Result<Cow<'a, str>, EscapeError> {
+        self.normalized_value_with(|_| None)
+    }
+
+    /// Returns the attribute value normalized as per [the XML specification],
+    /// using a custom entity resolver.
+    ///
+    /// Do not use this method with HTML attributes.
+    ///
+    /// Escape sequences such as `&gt;` are replaced with their unescaped equivalents such as `>`
+    /// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. A function
+    /// for resolving entities can be provided as `resolve_entity`. Builtin entities will still
+    /// take precedence.
+    ///
+    /// This will allocate unless the raw attribute value does not require normalization.
+    ///
+    /// See also [`normalized_value()`](#method.normalized_value)
+    ///
+    /// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize
+    #[cfg(any(doc, not(feature = "encoding")))]
+    pub fn normalized_value_with<'entity>(
+        &'a self,
+        resolve_entity: impl Fn(&str) -> Option<&'entity str>,
+    ) -> Result<Cow<'a, str>, EscapeError> {
+        escape::normalize_attribute_value(
+            std::str::from_utf8(self.value.as_ref()).expect("unable to decode as utf-8"),
+            resolve_entity,
+        )
+    }
+
+    /// Decodes using a provided reader and returns the attribute value normalized
+    /// as per [the XML specification].
+    ///
+    /// Do not use this method with HTML attributes.
+    ///
+    /// Escape sequences such as `&gt;` are replaced with their unescaped equivalents such as `>`
+    /// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters.
+    ///
+    /// This will allocate unless the raw attribute value does not require normalization.
+    ///
+    /// See also [`decode_and_normalize_value_with()`](#method.decode_and_normalize_value_with)
+    ///
+    /// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize
+    pub fn decode_and_normalize_value<B>(&self, reader: &Reader<B>) -> XmlResult<Cow<'a, str>> {
+        self.decode_and_normalize_value_with(reader, |_| None)
+    }
+
+    /// Decodes using a provided reader and returns the attribute value normalized
+    /// as per [the XML specification], using a custom entity resolver.
+    ///
+    /// Do not use this method with HTML attributes.
+    ///
+    /// Escape sequences such as `&gt;` are replaced with their unescaped equivalents such as `>`
+    /// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. A function
+    /// for resolving entities can be provided as `resolve_entity`. Builtin entities will still
+    /// take precedence.
+    ///
+    /// This will allocate unless the raw attribute value does not require normalization.
+    ///
+    /// See also [`decode_and_normalize_value()`](#method.decode_and_normalize_value)
+    ///
+    /// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize
+    pub fn decode_and_normalize_value_with<'entity, B>(
+        &self,
+        reader: &Reader<B>,
+        resolve_entity: impl Fn(&str) -> Option<&'entity str>,
+    ) -> XmlResult<Cow<'a, str>> {
+        let decoded = match &self.value {
+            Cow::Borrowed(bytes) => reader.decoder().decode(bytes)?,
+            // Convert to owned, because otherwise Cow will be bound with wrong lifetime
+            Cow::Owned(bytes) => reader.decoder().decode(bytes)?.into_owned().into(),
+        };
+
+        match escape::normalize_attribute_value(&decoded, resolve_entity)? {
+            // Because result is borrowed, no replacements was done and we can use original string
+            Cow::Borrowed(_) => Ok(decoded),
+            Cow::Owned(s) => Ok(s.into()),
+        }
+    }
+
+    /// Returns the unescaped value.
     ///
     /// This is normally the value you are interested in. Escape sequences such as `&gt;` are
     /// replaced with their unescaped equivalents such as `>`.
@@ -796,6 +891,92 @@ mod xml {
     use super::*;
     use pretty_assertions::assert_eq;
 
+    #[cfg(any(doc, not(feature = "encoding")))]
+    mod attribute_value_normalization {
+
+        use super::*;
+        use pretty_assertions::assert_eq;
+
+        /// Empty values returned are unchanged
+        #[test]
+        fn test_empty() {
+            let raw_value = "".as_bytes();
+            let attr = Attribute::from(("foo".as_bytes(), raw_value));
+            assert_eq!(attr.normalized_value(), Ok(Cow::Borrowed("")));
+        }
+
+        /// Already normalized values are returned unchanged
+        #[test]
+        fn test_already_normalized() {
+            let raw_value = "foobar123".as_bytes();
+            let output = "foobar123";
+            let attr = Attribute::from(("foo".as_bytes(), raw_value));
+            assert_eq!(attr.normalized_value(), Ok(Cow::Borrowed(output)));
+        }
+
+        /// Return, tab, and newline characters (0xD, 0x9, 0xA) must be substituted with
+        /// a space character
+        #[test]
+        fn test_space_replacement() {
+            let raw_value = "\r\nfoo\rbar\tbaz\n\ndelta\n".as_bytes();
+            let output = "  foo bar baz  delta ".to_owned();
+            let attr = Attribute::from(("foo".as_bytes(), raw_value));
+            assert_eq!(attr.normalized_value(), Ok(Cow::Owned(output)));
+        }
+
+        /// Entities must be terminated
+        #[test]
+        fn test_unterminated_entity() {
+            let raw_value = "abc&quotdef".as_bytes();
+            let attr = Attribute::from(("foo".as_bytes(), raw_value));
+            assert_eq!(
+                attr.normalized_value(),
+                Err(EscapeError::UnterminatedEntity(3..11))
+            );
+        }
+
+        /// Unknown entities raise error
+        #[test]
+        fn test_unrecognized_entity() {
+            let raw_value = "abc&unkn;def".as_bytes();
+            let attr = Attribute::from(("foo".as_bytes(), raw_value));
+            assert_eq!(
+                attr.normalized_value(),
+                Err(EscapeError::UnrecognizedSymbol(4..8, "unkn".to_owned()))
+            );
+        }
+
+        /// custom entity replacement works, entity replacement text processed recursively
+        #[test]
+        #[ignore]
+        fn test_entity_replacement() {
+            let raw_value = "&d;&d;A&a;&#x20;&a;B&da;".as_bytes();
+            let output = "  A   B  ".to_owned();
+            let attr = Attribute::from(("foo".as_bytes(), raw_value));
+            fn custom_resolver(ent: &str) -> Option<&'static str> {
+                match ent {
+                    "d" => Some("&#xD;"),
+                    "a" => Some("&#xA;"),
+                    "da" => Some("&#xD;&#xA;"),
+                    _ => None,
+                }
+            }
+            assert_eq!(
+                attr.normalized_value_with(&custom_resolver),
+                Ok(Cow::Owned(output))
+            );
+        }
+
+        #[test]
+        fn test_char_references() {
+            // character literal references are substituted without being replaced by spaces
+            let raw_value = "&#xd;&#xd;A&#xa;&#xa;B&#xd;&#xa;".as_bytes();
+            let output = "\r\rA\n\nB\r\n".to_owned();
+            let attr = Attribute::from(("foo".as_bytes(), raw_value));
+            assert_eq!(attr.normalized_value(), Ok(Cow::Owned(output)));
+        }
+    }
+
     /// Checked attribute is the single attribute
     mod single {
         use super::*;
diff --git a/src/events/mod.rs b/src/events/mod.rs
index 9d5d311f..dbe2f058 100644
--- a/src/events/mod.rs
+++ b/src/events/mod.rs
@@ -512,7 +512,7 @@ impl<'a> BytesDecl<'a> {
     /// Although according to the [grammar] standalone flag must appear after `"version"`
     /// and `"encoding"`, this method does not check that. The first occurrence of the
     /// attribute will be returned even if there are several. Also, method does not
-    /// restrict symbols that can forming the value, so the returned flag name may not
+    /// restrict symbols that can form the value, so the returned flag name may not
     /// correspond to the grammar.
     ///
     /// # Examples

From 5c0d5d6eddcf99225394f221af082c5909aa093b Mon Sep 17 00:00:00 2001
From: Daniel Alley <dalley@redhat.com>
Date: Fri, 14 Jun 2024 14:32:52 -0400
Subject: [PATCH 2/2] temp

---
 Changelog.md             | 13 +++++++------
 src/escape.rs            | 27 ++++++++++++++++++--------
 src/events/attributes.rs | 41 +++++++++++++++++++++++++++++++++++++++-
 3 files changed, 66 insertions(+), 15 deletions(-)

diff --git a/Changelog.md b/Changelog.md
index 89385b69..059db5ee 100644
--- a/Changelog.md
+++ b/Changelog.md
@@ -21,9 +21,15 @@
 
 ### Misc Changes
 
-- [#379]: Added tests for attribute value normalization
+- [#379]: Fixed error messages for some variants of `EscapeError`
+  which were incorrectly described as occurring during escaping (rather than unescaping).
 - [#650]: Change the type of `Event::PI` to a new dedicated `BytesPI` type.
 
+### New Tests
+
+- [#379]: Added tests for attribute value normalization
+
+
 [#379]: https://github.com/tafia/quick-xml/pull/379
 [#650]: https://github.com/tafia/quick-xml/issues/650
 
@@ -293,11 +299,6 @@ serde >= 1.0.181
 - [#565]: Correctly set minimum required version of tokio dependency to 1.10
 - [#565]: Fix compilation error when build with serde <1.0.139
 
-
-### New Tests
-
-- [#379]: Added tests for attribute value normalization
-
 [externally tagged]: https://serde.rs/enum-representations.html#externally-tagged
 [#490]: https://github.com/tafia/quick-xml/pull/490
 [#510]: https://github.com/tafia/quick-xml/issues/510
diff --git a/src/escape.rs b/src/escape.rs
index 3d65dacc..6f1305e4 100644
--- a/src/escape.rs
+++ b/src/escape.rs
@@ -27,6 +27,12 @@ pub enum EscapeError {
     InvalidDecimal(char),
     /// Not a valid unicode codepoint
     InvalidCodepoint(u32),
+    /// The recursion limit was reached while attempting to unescape XML entities,
+    /// or normalize an attribute value. This could indicate an entity loop.
+    ///
+    /// Limiting recursion prevents Denial-of-Service type attacks
+    /// such as the "billion laughs" [attack](https://en.wikipedia.org/wiki/Billion_laughs_attack).
+    ReachedRecursionLimit(Range<usize>),
 }
 
 impl std::fmt::Display for EscapeError {
@@ -34,17 +40,17 @@ impl std::fmt::Display for EscapeError {
         match self {
             EscapeError::EntityWithNull(e) => write!(
                 f,
-                "Error while escaping character at range {:?}: Null character entity not allowed",
+                "Error while unescaping character at range {:?}: Null character entity not allowed",
                 e
             ),
             EscapeError::UnrecognizedSymbol(rge, res) => write!(
                 f,
-                "Error while escaping character at range {:?}: Unrecognized escape symbol: {:?}",
+                "Error while unescaping character at range {:?}: Unrecognized escape symbol: {:?}",
                 rge, res
             ),
             EscapeError::UnterminatedEntity(e) => write!(
                 f,
-                "Error while escaping character at range {:?}: Cannot find ';' after '&'",
+                "Error while unescaping character at range {:?}: Cannot find ';' after '&'",
                 e
             ),
             EscapeError::TooLongHexadecimal => write!(f, "Cannot convert hexadecimal to utf8"),
@@ -54,6 +60,10 @@ impl std::fmt::Display for EscapeError {
             EscapeError::TooLongDecimal => write!(f, "Cannot convert decimal to utf8"),
             EscapeError::InvalidDecimal(e) => write!(f, "'{}' is not a valid decimal character", e),
             EscapeError::InvalidCodepoint(n) => write!(f, "'{}' is not a valid codepoint", n),
+            EscapeError::ReachedRecursionLimit(e) => write!(
+                f,
+                "Error while unescaping entity at range {:?}: Recursion limit reached"
+            ),
         }
     }
 }
@@ -310,7 +320,7 @@ where
     let mut iter = bytes.iter();
 
     if let Some(i) = iter.position(is_normalization_char) {
-        let mut normalized = String::new();
+        let mut normalized = String::with_capacity(value.len());
         let pos = normalize_step(
             &mut normalized,
             &mut iter,
@@ -405,7 +415,7 @@ where
             let pat = &input[start..end];
             // 1. For a character reference, append the referenced character
             //    to the normalized value.
-            if pat.starts_with('#') {
+            if let Some(entity) = pat.strip_prefix('#') {
                 let entity = &pat[1..]; // starts after the #
                 let codepoint = parse_number(entity, start..end)?;
                 normalized.push_str(codepoint.encode_utf8(&mut [0u8; 4]));
@@ -432,6 +442,7 @@ where
             Ok(index + 1) // +1 - skip character
         }
 
+        // SAFETY: We call normalize_step only when is_normalization_char() return true
         _ => unreachable!("Only '\\t', '\\n', '\\r', ' ', and '&' are possible here"),
     }
 }
@@ -2160,14 +2171,14 @@ mod normalization {
     fn unclosed_entity() {
         assert_eq!(
             normalize_attribute_value("string with unclosed &entity reference", |_| {
-                //     0                    ^ = 21           ^ = 38
+                //                     0                    ^ = 21           ^ = 38
                 Some("replacement")
             }),
             Err(EscapeError::UnterminatedEntity(21..38))
         );
         assert_eq!(
             normalize_attribute_value("string with unclosed &#32 (character) reference", |_| {
-                //     0                    ^ = 21                    ^ = 47
+                //                     0                    ^ = 21                    ^ = 47
                 None
             }),
             Err(EscapeError::UnterminatedEntity(21..47))
@@ -2178,7 +2189,7 @@ mod normalization {
     fn unknown_entity() {
         assert_eq!(
             normalize_attribute_value("string with unknown &entity; reference", |_| { None }),
-            //         0                    ^     ^ = 21..27
+            //                         0                    ^ 21  ^ 27
             Err(EscapeError::UnrecognizedSymbol(
                 21..27,
                 "entity".to_string()
diff --git a/src/events/attributes.rs b/src/events/attributes.rs
index 274107da..8516e6d0 100644
--- a/src/events/attributes.rs
+++ b/src/events/attributes.rs
@@ -3,9 +3,9 @@
 //! Provides an iterator over attributes key/value pairs
 
 use crate::errors::Result as XmlResult;
-use crate::escape::{self, escape, resolve_predefined_entity, unescape_with};
 #[cfg(any(doc, not(feature = "encoding")))]
 use crate::escape::EscapeError;
+use crate::escape::{self, escape, resolve_predefined_entity, unescape_with};
 use crate::name::QName;
 use crate::reader::{is_whitespace, Reader};
 use crate::utils::{write_byte_string, write_cow_string, Bytes};
@@ -967,6 +967,45 @@ mod xml {
             );
         }
 
+        /// An obvious loop should be detected immediately
+        #[test]
+        #[ignore]
+        fn test_entity_loop() {
+            let raw_value = "&d;".as_bytes();
+            let attr = Attribute::from(("foo".as_bytes(), raw_value));
+            fn custom_resolver(ent: &str) -> Option<&'static str> {
+                match ent {
+                    "d" => Some("&d;"),
+                }
+            }
+            assert_eq!(
+                attr.normalized_value_with(&custom_resolver),
+                Err(EscapeError::ReachedRecursionLimit(1..2))
+            );
+        }
+
+        /// A more complex entity loop should hit the recursion limit.
+        #[test]
+        #[ignore]
+        fn test_recursion_limit() {
+            let raw_value = "&a;".as_bytes();
+            let attr = Attribute::from(("foo".as_bytes(), raw_value));
+            fn custom_resolver(ent: &str) -> Option<&'static str> {
+                match ent {
+                    "a" => Some("&b;"),
+                    "b" => Some("&c;"),
+                    "c" => Some("&d;"),
+                    "d" => Some("&e;"),
+                    "e" => Some("&f;"),
+                    "f" => Some("&a;"),
+                }
+            }
+            assert_eq!(
+                attr.normalized_value_with(&custom_resolver),
+                Err(EscapeError::ReachedRecursionLimit(1..2))
+            );
+        }
+
         #[test]
         fn test_char_references() {
             // character literal references are substituted without being replaced by spaces