From 08c0eea76c36c6ddcf753c045fd5ff5ab78aba80 Mon Sep 17 00:00:00 2001
From: Daniel Alley <dalley@redhat.com>
Date: Sat, 2 Apr 2022 21:14:52 -0400
Subject: [PATCH] Properly normalize attribute values

closes #371
---
 benches/macrobenches.rs  |   3 +-
 benches/microbenches.rs  | 205 ++++++++++++++++++++++++---------------
 src/errors.rs            |   1 +
 src/escapei.rs           |  20 ++--
 src/events/attributes.rs | 151 +++++++++++++++++++++++++++-
 src/reader.rs            |   5 +-
 6 files changed, 286 insertions(+), 99 deletions(-)

diff --git a/benches/macrobenches.rs b/benches/macrobenches.rs
index 14f28cb9..54e56628 100644
--- a/benches/macrobenches.rs
+++ b/benches/macrobenches.rs
@@ -17,14 +17,13 @@ static SAMPLE_NS: &[u8] = include_bytes!("../tests/documents/sample_ns.xml");
 static PLAYERS: &[u8] = include_bytes!("../tests/documents/players.xml");
 
 // TODO: read the namespaces too
-// TODO: use fully normalized attribute values
 fn parse_document(doc: &[u8]) -> XmlResult<()> {
     let mut r = Reader::from_reader(doc);
     loop {
         match r.read_event_unbuffered()? {
             Event::Start(e) | Event::Empty(e) => {
                 for attr in e.attributes() {
-                    criterion::black_box(attr?.unescaped_value()?);
+                    criterion::black_box(attr?.normalized_value()?);
                 }
             }
             Event::Text(e) => {
diff --git a/benches/microbenches.rs b/benches/microbenches.rs
index 9d701c05..443ebace 100644
--- a/benches/microbenches.rs
+++ b/benches/microbenches.rs
@@ -125,86 +125,6 @@ fn read_namespaced_event(c: &mut Criterion) {
     group.finish();
 }
 
-/// Benchmarks the `BytesText::unescaped()` method (includes time of `read_event`
-/// benchmark)
-fn bytes_text_unescaped(c: &mut Criterion) {
-    let mut group = c.benchmark_group("BytesText::unescaped");
-    group.bench_function("trim_text = false", |b| {
-        b.iter(|| {
-            let mut buf = Vec::new();
-            let mut r = Reader::from_reader(SAMPLE);
-            r.check_end_names(false).check_comments(false);
-            let mut count = criterion::black_box(0);
-            let mut nbtxt = criterion::black_box(0);
-            loop {
-                match r.read_event(&mut buf) {
-                    Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
-                    Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
-                    Ok(Event::Eof) => break,
-                    _ => (),
-                }
-                buf.clear();
-            }
-            assert_eq!(
-                count, 1550,
-                "Overall tag count in ./tests/documents/sample_rss.xml"
-            );
-
-            // Windows has \r\n instead of \n
-            #[cfg(windows)]
-            assert_eq!(
-                nbtxt, 67661,
-                "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
-            );
-
-            #[cfg(not(windows))]
-            assert_eq!(
-                nbtxt, 66277,
-                "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
-            );
-        });
-    });
-
-    group.bench_function("trim_text = true", |b| {
-        b.iter(|| {
-            let mut buf = Vec::new();
-            let mut r = Reader::from_reader(SAMPLE);
-            r.check_end_names(false)
-                .check_comments(false)
-                .trim_text(true);
-            let mut count = criterion::black_box(0);
-            let mut nbtxt = criterion::black_box(0);
-            loop {
-                match r.read_event(&mut buf) {
-                    Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
-                    Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
-                    Ok(Event::Eof) => break,
-                    _ => (),
-                }
-                buf.clear();
-            }
-            assert_eq!(
-                count, 1550,
-                "Overall tag count in ./tests/documents/sample_rss.xml"
-            );
-
-            // Windows has \r\n instead of \n
-            #[cfg(windows)]
-            assert_eq!(
-                nbtxt, 50334,
-                "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
-            );
-
-            #[cfg(not(windows))]
-            assert_eq!(
-                nbtxt, 50261,
-                "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
-            );
-        });
-    });
-    group.finish();
-}
-
 /// Benchmarks, how fast individual event parsed
 fn one_event(c: &mut Criterion) {
     let mut group = c.benchmark_group("One event");
@@ -364,6 +284,130 @@ fn attributes(c: &mut Criterion) {
             assert_eq!(count, 150);
         })
     });
+
+    group.finish();
+}
+
+/// Benchmarks normalizing attribute values
+fn attribute_value_normalization(c: &mut Criterion) {
+    let mut group = c.benchmark_group("attribute_value_normalization");
+
+    group.bench_function("noop_short", |b| {
+        b.iter(|| {
+            criterion::black_box(unescape(b"foobar")).unwrap();
+        })
+    });
+
+    group.bench_function("noop_long", |b| {
+        b.iter(|| {
+            criterion::black_box(unescape(b"just a bit of text without any entities")).unwrap();
+        })
+    });
+
+    group.bench_function("replacement_chars", |b| {
+        b.iter(|| {
+            criterion::black_box(unescape(b"just a bit\n of text without\tany entities")).unwrap();
+        })
+    });
+
+    group.bench_function("char_reference", |b| {
+        b.iter(|| {
+            let text = b"prefix &#34;some stuff&#34;,&#x22;more stuff&#x22;";
+            criterion::black_box(unescape(text)).unwrap();
+            let text = b"&#38;&#60;";
+            criterion::black_box(unescape(text)).unwrap();
+        })
+    });
+
+    group.bench_function("entity_reference", |b| {
+        b.iter(|| {
+            let text = b"age &gt; 72 &amp;&amp; age &lt; 21";
+            criterion::black_box(unescape(text)).unwrap();
+            let text = b"&quot;what&apos;s that?&quot;";
+            criterion::black_box(unescape(text)).unwrap();
+        })
+    });
+
+    group.finish();
+}
+
+/// Benchmarks the `BytesText::unescaped()` method (includes time of `read_event`
+/// benchmark)
+fn bytes_text_unescaped(c: &mut Criterion) {
+    let mut group = c.benchmark_group("BytesText::unescaped");
+    group.bench_function("trim_text = false", |b| {
+        b.iter(|| {
+            let mut buf = Vec::new();
+            let mut r = Reader::from_reader(SAMPLE);
+            r.check_end_names(false).check_comments(false);
+            let mut count = criterion::black_box(0);
+            let mut nbtxt = criterion::black_box(0);
+            loop {
+                match r.read_event(&mut buf) {
+                    Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
+                    Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
+                    Ok(Event::Eof) => break,
+                    _ => (),
+                }
+                buf.clear();
+            }
+            assert_eq!(
+                count, 1550,
+                "Overall tag count in ./tests/documents/sample_rss.xml"
+            );
+
+            // Windows has \r\n instead of \n
+            #[cfg(windows)]
+            assert_eq!(
+                nbtxt, 67661,
+                "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
+            );
+
+            #[cfg(not(windows))]
+            assert_eq!(
+                nbtxt, 66277,
+                "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
+            );
+        });
+    });
+
+    group.bench_function("trim_text = true", |b| {
+        b.iter(|| {
+            let mut buf = Vec::new();
+            let mut r = Reader::from_reader(SAMPLE);
+            r.check_end_names(false)
+                .check_comments(false)
+                .trim_text(true);
+            let mut count = criterion::black_box(0);
+            let mut nbtxt = criterion::black_box(0);
+            loop {
+                match r.read_event(&mut buf) {
+                    Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
+                    Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
+                    Ok(Event::Eof) => break,
+                    _ => (),
+                }
+                buf.clear();
+            }
+            assert_eq!(
+                count, 1550,
+                "Overall tag count in ./tests/documents/sample_rss.xml"
+            );
+
+            // Windows has \r\n instead of \n
+            #[cfg(windows)]
+            assert_eq!(
+                nbtxt, 50334,
+                "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
+            );
+
+            #[cfg(not(windows))]
+            assert_eq!(
+                nbtxt, 50261,
+                "Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
+            );
+        });
+    });
     group.finish();
 }
 
@@ -477,6 +521,7 @@ criterion_group!(
     read_namespaced_event,
     one_event,
     attributes,
+    attribute_value_normalization,
     escaping,
     unescaping,
 );
diff --git a/src/errors.rs b/src/errors.rs
index c1af08af..5f41b68e 100644
--- a/src/errors.rs
+++ b/src/errors.rs
@@ -72,6 +72,7 @@ impl From<EscapeError> for Error {
 }
 
 impl From<AttrError> for Error {
+    /// Creates a new `Error::InvalidAttr` from the given error
     #[inline]
     fn from(error: AttrError) -> Self {
         Error::InvalidAttr(error)
diff --git a/src/escapei.rs b/src/escapei.rs
index 64749c27..851d1070 100644
--- a/src/escapei.rs
+++ b/src/escapei.rs
@@ -9,7 +9,7 @@ use std::ops::Range;
 use pretty_assertions::assert_eq;
 
 /// Error for XML escape/unescqpe.
-#[derive(Debug)]
+#[derive(Debug, PartialEq)]
 pub enum EscapeError {
     /// Entity with Null character
     EntityWithNull(::std::ops::Range<usize>),
@@ -134,7 +134,7 @@ pub fn unescape(raw: &[u8]) -> Result<Cow<[u8]>, EscapeError> {
 }
 
 /// Unescape a `&[u8]` and replaces all xml escaped characters ('&...;') into their corresponding
-/// value, using a dictionnary of custom entities.
+/// value, using a dictionary of custom entities.
 ///
 /// # Pre-condition
 ///
@@ -174,7 +174,9 @@ pub fn do_unescape<'a>(
                 if let Some(s) = named_entity(pat) {
                     unescaped.extend_from_slice(s.as_bytes());
                 } else if pat.starts_with(b"#") {
-                    push_utf8(unescaped, parse_number(&pat[1..], start..end)?);
+                    let entity = &pat[1..]; // starts after the #
+                    let codepoint = parse_number(entity, start..end)?;
+                    push_utf8(unescaped, codepoint);
                 } else if let Some(value) = custom_entities.and_then(|hm| hm.get(pat)) {
                     unescaped.extend_from_slice(&value);
                 } else {
@@ -201,7 +203,7 @@ pub fn do_unescape<'a>(
 }
 
 #[cfg(not(feature = "escape-html"))]
-const fn named_entity(name: &[u8]) -> Option<&str> {
+pub(crate) const fn named_entity(name: &[u8]) -> Option<&str> {
     let s = match name {
         b"lt" => "<",
         b"gt" => ">",
@@ -213,7 +215,7 @@ const fn named_entity(name: &[u8]) -> Option<&str> {
     Some(s)
 }
 #[cfg(feature = "escape-html")]
-const fn named_entity(name: &[u8]) -> Option<&str> {
+pub(crate) const fn named_entity(name: &[u8]) -> Option<&str> {
     // imported from https://dev.w3.org/html5/html-author/charref
     let s = match name {
         b"Tab" => "\u{09}",
@@ -1675,12 +1677,12 @@ const fn named_entity(name: &[u8]) -> Option<&str> {
     Some(s)
 }
 
-fn push_utf8(out: &mut Vec<u8>, code: char) {
+pub(crate) fn push_utf8(out: &mut Vec<u8>, code: char) {
     let mut buf = [0u8; 4];
     out.extend_from_slice(code.encode_utf8(&mut buf).as_bytes());
 }
 
-fn parse_number(bytes: &[u8], range: Range<usize>) -> Result<char, EscapeError> {
+pub(crate) fn parse_number(bytes: &[u8], range: Range<usize>) -> Result<char, EscapeError> {
     let code = if bytes.starts_with(b"x") {
         parse_hexadecimal(&bytes[1..])
     } else {
@@ -1695,7 +1697,7 @@ fn parse_number(bytes: &[u8], range: Range<usize>) -> Result<char, EscapeError>
     }
 }
 
-fn parse_hexadecimal(bytes: &[u8]) -> Result<u32, EscapeError> {
+pub(crate) fn parse_hexadecimal(bytes: &[u8]) -> Result<u32, EscapeError> {
     // maximum code is 0x10FFFF => 6 characters
     if bytes.len() > 6 {
         return Err(EscapeError::TooLongHexadecimal);
@@ -1713,7 +1715,7 @@ fn parse_hexadecimal(bytes: &[u8]) -> Result<u32, EscapeError> {
     Ok(code)
 }
 
-fn parse_decimal(bytes: &[u8]) -> Result<u32, EscapeError> {
+pub(crate) fn parse_decimal(bytes: &[u8]) -> Result<u32, EscapeError> {
     // maximum code is 0x10FFFF = 1114111 => 7 characters
     if bytes.len() > 7 {
         return Err(EscapeError::TooLongDecimal);
diff --git a/src/events/attributes.rs b/src/events/attributes.rs
index d6331bc7..226d6d9f 100644
--- a/src/events/attributes.rs
+++ b/src/events/attributes.rs
@@ -4,6 +4,7 @@
 
 use crate::errors::{Error, Result as XmlResult};
 use crate::escape::{do_unescape, escape};
+use crate::escapei::{self, EscapeError};
 use crate::name::QName;
 use crate::reader::{is_whitespace, Reader};
 use crate::utils::{write_byte_string, write_cow_string, Bytes};
@@ -11,6 +12,8 @@ use std::fmt::{self, Debug, Display, Formatter};
 use std::iter::FusedIterator;
 use std::{borrow::Cow, collections::HashMap, io::BufRead, ops::Range};
 
+type CustomEntityMap = HashMap<Vec<u8>, Vec<u8>>;
+
 /// A struct representing a key/value XML attribute.
 ///
 /// Field `value` stores raw bytes, possibly containing escape-sequences. Most users will likely
@@ -32,6 +35,95 @@ pub struct Attribute<'a> {
 }
 
 impl<'a> Attribute<'a> {
+    /// Returns the attribute value normalized as per the XML specification.
+    ///
+    /// https://www.w3.org/TR/xml/#AVNormalize
+    ///
+    /// Do not use this method with HTML attributes.
+    ///
+    /// This is normally the value you are interested in. Escape sequences such as `&gt;` are
+    /// replaced with their unescaped equivalents such as `>`.
+    ///
+    /// This will allocate if the value contains any escape sequences.
+    ///
+    /// See also [`normalized_value_with_custom_entities()`](#method.normalized_value_with_custom_entities)
+    pub fn normalized_value(&'a self) -> Result<Cow<'a, [u8]>, EscapeError> {
+        self.make_normalized_value(None)
+    }
+
+    /// Returns the attribute value normalized as per the XML specification, using custom entities.
+    ///
+    /// https://www.w3.org/TR/xml/#AVNormalize
+    ///
+    /// Do not use this method with HTML attributes.
+    ///
+    /// This is normally the value you are interested in. Escape sequences such as `&gt;` are
+    /// replaced with their unescaped equivalents such as `>`.
+    /// Additional entities can be provided in `custom_entities`.
+    ///
+    /// This will allocate if the value contains any escape sequences.
+    ///
+    /// See also [`normalized_value()`](#method.normalized_value)
+    ///
+    /// # Pre-condition
+    ///
+    /// The keys and values of `custom_entities`, if any, must be valid UTF-8.
+    pub fn normalized_value_with_custom_entities(
+        &'a self,
+        custom_entities: &CustomEntityMap,
+    ) -> Result<Cow<'a, [u8]>, EscapeError> {
+        self.make_normalized_value(Some(custom_entities))
+    }
+
+    /// Normalize the attribute value according to xml specification section 3.3.3
+    ///
+    /// https://www.w3.org/TR/xml/#AVNormalize
+    fn make_normalized_value(
+        &'a self,
+        custom_entities: Option<&CustomEntityMap>,
+    ) -> Result<Cow<'a, [u8]>, EscapeError> {
+        let mut normalized: Vec<u8> = Vec::with_capacity(self.value.len());
+
+        // Perform a single pass over the trimmed attribute value. If we encounter a character / entity reference
+        // or whitespace-like characters that need to be substituted, copy everything processed thus far to a new
+        // buffer and continue using this buffer.
+        let attr = self.value.as_ref();
+        let mut attr_iter = attr.iter().enumerate();
+
+        while let Some((idx, ch)) = attr_iter.next() {
+            match ch {
+                b' ' | b'\n' | b'\r' | b'\t' => normalized.push(b' '),
+                b'&' => {
+                    let end = idx
+                        + 1
+                        + attr_iter
+                            .position(|(_, c)| *c == b';')
+                            .ok_or_else(|| EscapeError::UnterminatedEntity(idx..attr.len()))?;
+                    let entity = &attr[idx + 1..end]; // starts after the &
+
+                    if let Some(s) = escapei::named_entity(entity) {
+                        normalized.extend_from_slice(s.as_bytes());
+                    } else if entity.starts_with(b"#") {
+                        let entity = &entity[1..]; // starts after the #
+                        let codepoint = escapei::parse_number(entity, idx..end)?;
+                        escapei::push_utf8(&mut normalized, codepoint);
+                    } else if let Some(value) = custom_entities.and_then(|hm| hm.get(entity)) {
+                        // TODO: recursively apply entity substitution
+                        normalized.extend_from_slice(&value);
+                    } else {
+                        return Err(EscapeError::UnrecognizedSymbol(
+                            idx + 1..end,
+                            String::from_utf8(entity.to_vec()),
+                        ));
+                    }
+                }
+                _ => normalized.push(*ch),
+            }
+        }
+
+        Ok(Cow::Owned(normalized))
+    }
+
     /// Returns the unescaped value.
     ///
     /// This is normally the value you are interested in. Escape sequences such as `&gt;` are
@@ -59,14 +151,14 @@ impl<'a> Attribute<'a> {
     /// The keys and values of `custom_entities`, if any, must be valid UTF-8.
     pub fn unescaped_value_with_custom_entities(
         &self,
-        custom_entities: &HashMap<Vec<u8>, Vec<u8>>,
+        custom_entities: &CustomEntityMap,
     ) -> XmlResult<Cow<[u8]>> {
         self.make_unescaped_value(Some(custom_entities))
     }
 
     fn make_unescaped_value(
         &self,
-        custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
+        custom_entities: Option<&CustomEntityMap>,
     ) -> XmlResult<Cow<[u8]>> {
         do_unescape(&*self.value, custom_entities).map_err(Error::EscapeError)
     }
@@ -102,7 +194,7 @@ impl<'a> Attribute<'a> {
     pub fn unescape_and_decode_value_with_custom_entities<B: BufRead>(
         &self,
         reader: &Reader<B>,
-        custom_entities: &HashMap<Vec<u8>, Vec<u8>>,
+        custom_entities: &CustomEntityMap,
     ) -> XmlResult<String> {
         self.do_unescape_and_decode_value(reader, Some(custom_entities))
     }
@@ -111,7 +203,7 @@ impl<'a> Attribute<'a> {
     fn do_unescape_and_decode_value<B: BufRead>(
         &self,
         reader: &Reader<B>,
-        custom_entities: Option<&HashMap<Vec<u8>, Vec<u8>>>,
+        custom_entities: Option<&CustomEntityMap>,
     ) -> XmlResult<String> {
         let decoded = reader.decoder().decode(&*self.value)?;
 
@@ -798,6 +890,57 @@ mod xml {
     use super::*;
     use pretty_assertions::assert_eq;
 
+    #[test]
+    fn attribute_value_normalization() {
+        // empty value
+        let raw_value = "".as_bytes();
+        let output = "".as_bytes().to_vec();
+        let attr = Attribute::from(("foo".as_bytes(), raw_value));
+        assert_eq!(attr.normalized_value(), Ok(Cow::Owned::<[u8]>(output)));
+
+        // return, tab, and newline characters (0xD, 0x9, 0xA) must be substituted with a space character
+        let raw_value = "\r\nfoo\rbar\tbaz\n\ndelta\n".as_bytes();
+        let output = "  foo bar baz  delta ".as_bytes().to_vec();
+        let attr = Attribute::from(("foo".as_bytes(), raw_value));
+        assert_eq!(attr.normalized_value(), Ok(Cow::Owned::<[u8]>(output)));
+
+        // entities must be terminated
+        let raw_value = "abc&quotdef".as_bytes();
+        let attr = Attribute::from(("foo".as_bytes(), raw_value));
+        assert_eq!(
+            attr.normalized_value(),
+            Err(EscapeError::UnterminatedEntity(3..11))
+        );
+
+        // unknown entities raise error
+        let raw_value = "abc&unkn;def".as_bytes();
+        let attr = Attribute::from(("foo".as_bytes(), raw_value));
+        assert_eq!(
+            attr.normalized_value(),
+            Err(EscapeError::UnrecognizedSymbol(4..8, Ok("unkn".to_owned()))) // TODO: is this divergence between range behavior of UnterminatedEntity and UnrecognizedSymbol appropriate. shared with unescape code
+        );
+
+        // // custom entity replacement works, entity replacement text processed recursively
+        // let raw_value = "&d;&d;A&a;&#x20;&a;B&da;".as_bytes();
+        // let output = b"  A   B  ".to_vec();
+        // let attr = Attribute::from(("foo".as_bytes(), raw_value));
+        // let mut custom_entities = HashMap::new();
+        // custom_entities.insert(b"d".to_vec(), b"&#xD;".to_vec());
+        // custom_entities.insert(b"a".to_vec(), b"&#xA;".to_vec());
+        // custom_entities.insert(b"da".to_vec(), b"&#xD;&#xA;".to_vec());
+        // dbg!(std::str::from_utf8(attr.normalized_value_with_custom_entities(&custom_entities).unwrap().as_ref()).unwrap());
+        // assert_eq!(
+        //     attr.normalized_value_with_custom_entities(&custom_entities),
+        //     Ok(Cow::Owned::<[u8]>(output))
+        // );
+
+        // character literal references are substituted without being replaced by spaces
+        let raw_value = "&#xd;&#xd;A&#xa;&#xa;B&#xd;&#xa;".as_bytes();
+        let output = "\r\rA\n\nB\r\n".as_bytes().to_vec();
+        let attr = Attribute::from(("foo".as_bytes(), raw_value));
+        assert_eq!(attr.normalized_value(), Ok(Cow::Owned::<[u8]>(output)));
+    }
+
     /// Checked attribute is the single attribute
     mod single {
         use super::*;
diff --git a/src/reader.rs b/src/reader.rs
index a1909255..18c00200 100644
--- a/src/reader.rs
+++ b/src/reader.rs
@@ -1489,10 +1489,7 @@ impl ReadElementState {
 /// A function to check whether the byte is a whitespace (blank, new line, carriage return or tab)
 #[inline]
 pub(crate) fn is_whitespace(b: u8) -> bool {
-    match b {
-        b' ' | b'\r' | b'\n' | b'\t' => true,
-        _ => false,
-    }
+    matches!(b, b' ' | b'\r' | b'\n' | b'\t')
 }
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////