From ac12797782ff3511c12ed0e4263370363a09b89c Mon Sep 17 00:00:00 2001 From: Daniel Alley Date: Sat, 2 Apr 2022 21:14:52 -0400 Subject: [PATCH 1/2] Add functions for attribute value normalization closes #371 --- Changelog.md | 15 ++- benches/macrobenches.rs | 12 +- benches/microbenches.rs | 72 +++++++++++ src/errors.rs | 1 + src/escape.rs | 261 ++++++++++++++++++++++++++++++++++++++- src/events/attributes.rs | 185 ++++++++++++++++++++++++++- src/events/mod.rs | 2 +- 7 files changed, 532 insertions(+), 16 deletions(-) diff --git a/Changelog.md b/Changelog.md index 0ba5a368..89385b69 100644 --- a/Changelog.md +++ b/Changelog.md @@ -12,12 +12,19 @@ ### New Features +- [#379]: Improved compliance with the XML attribute value normalization process by + adding `Attribute::normalized_value()` and `Attribute::normalized_value_with()`, + which ought to be used in place of `Attribute::unescape_value()` and + `Attribute::unescape_value_with()` + ### Bug Fixes ### Misc Changes +- [#379]: Added tests for attribute value normalization - [#650]: Change the type of `Event::PI` to a new dedicated `BytesPI` type. +[#379]: https://github.com/tafia/quick-xml/pull/379 [#650]: https://github.com/tafia/quick-xml/issues/650 @@ -57,9 +64,6 @@ resolve predefined entities. - `quick_xml::escape::resolve_html5_entity` - [#753]: Added parser for processing instructions: `quick_xml::reader::PiParser`. - [#754]: Added parser for elements: `quick_xml::reader::ElementParser`. - -### Bug Fixes - - [#622]: Fix wrong disregarding of not closed markup, such as lone `<`. - [#684]: Fix incorrect position reported for `Error::IllFormed(DoubleHyphenInComment)`. - [#684]: Fix incorrect position reported for `Error::IllFormed(MissingDoctypeName)`. @@ -120,7 +124,6 @@ resolve predefined entities. [`PayloadEvent`]: https://docs.rs/quick-xml/latest/quick_xml/de/enum.PayloadEvent.html [`Text`]: https://docs.rs/quick-xml/latest/quick_xml/de/struct.Text.html - ## 0.31.0 -- 2023-10-22 MSRV bumped to 1.56! Crate now uses Rust 2021 edition. @@ -291,6 +294,10 @@ serde >= 1.0.181 - [#565]: Fix compilation error when build with serde <1.0.139 +### New Tests + +- [#379]: Added tests for attribute value normalization + [externally tagged]: https://serde.rs/enum-representations.html#externally-tagged [#490]: https://github.com/tafia/quick-xml/pull/490 [#510]: https://github.com/tafia/quick-xml/issues/510 diff --git a/benches/macrobenches.rs b/benches/macrobenches.rs index a8cbbd53..8864e74b 100644 --- a/benches/macrobenches.rs +++ b/benches/macrobenches.rs @@ -43,14 +43,13 @@ static INPUTS: &[(&str, &str)] = &[ ("players.xml", PLAYERS), ]; -// TODO: use fully normalized attribute values fn parse_document_from_str(doc: &str) -> XmlResult<()> { let mut r = Reader::from_str(doc); loop { match criterion::black_box(r.read_event()?) { Event::Start(e) | Event::Empty(e) => { for attr in e.attributes() { - criterion::black_box(attr?.decode_and_unescape_value(&r)?); + criterion::black_box(attr?.decode_and_normalize_value(&r)?); } } Event::Text(e) => { @@ -67,7 +66,6 @@ fn parse_document_from_str(doc: &str) -> XmlResult<()> { Ok(()) } -// TODO: use fully normalized attribute values fn parse_document_from_bytes(doc: &[u8]) -> XmlResult<()> { let mut r = Reader::from_reader(doc); let mut buf = Vec::new(); @@ -75,7 +73,7 @@ fn parse_document_from_bytes(doc: &[u8]) -> XmlResult<()> { match criterion::black_box(r.read_event_into(&mut buf)?) { Event::Start(e) | Event::Empty(e) => { for attr in e.attributes() { - criterion::black_box(attr?.decode_and_unescape_value(&r)?); + criterion::black_box(attr?.decode_and_normalize_value(&r)?); } } Event::Text(e) => { @@ -93,7 +91,6 @@ fn parse_document_from_bytes(doc: &[u8]) -> XmlResult<()> { Ok(()) } -// TODO: use fully normalized attribute values fn parse_document_from_str_with_namespaces(doc: &str) -> XmlResult<()> { let mut r = NsReader::from_str(doc); loop { @@ -101,7 +98,7 @@ fn parse_document_from_str_with_namespaces(doc: &str) -> XmlResult<()> { (resolved_ns, Event::Start(e) | Event::Empty(e)) => { criterion::black_box(resolved_ns); for attr in e.attributes() { - criterion::black_box(attr?.decode_and_unescape_value(&r)?); + criterion::black_box(attr?.decode_and_normalize_value(&r)?); } } (resolved_ns, Event::Text(e)) => { @@ -120,7 +117,6 @@ fn parse_document_from_str_with_namespaces(doc: &str) -> XmlResult<()> { Ok(()) } -// TODO: use fully normalized attribute values fn parse_document_from_bytes_with_namespaces(doc: &[u8]) -> XmlResult<()> { let mut r = NsReader::from_reader(doc); let mut buf = Vec::new(); @@ -129,7 +125,7 @@ fn parse_document_from_bytes_with_namespaces(doc: &[u8]) -> XmlResult<()> { (resolved_ns, Event::Start(e) | Event::Empty(e)) => { criterion::black_box(resolved_ns); for attr in e.attributes() { - criterion::black_box(attr?.decode_and_unescape_value(&r)?); + criterion::black_box(attr?.decode_and_normalize_value(&r)?); } } (resolved_ns, Event::Text(e)) => { diff --git a/benches/microbenches.rs b/benches/microbenches.rs index 2f4ece04..70b84568 100644 --- a/benches/microbenches.rs +++ b/benches/microbenches.rs @@ -1,6 +1,9 @@ +use std::borrow::Cow; + use criterion::{self, criterion_group, criterion_main, Criterion}; use pretty_assertions::assert_eq; use quick_xml::escape::{escape, unescape}; +use quick_xml::events::attributes::Attribute; use quick_xml::events::Event; use quick_xml::name::QName; use quick_xml::reader::{NsReader, Reader}; @@ -242,6 +245,74 @@ fn attributes(c: &mut Criterion) { assert_eq!(count, 150); }) }); + + group.finish(); +} + +/// Benchmarks normalizing attribute values +fn attribute_value_normalization(c: &mut Criterion) { + let mut group = c.benchmark_group("attribute_value_normalization"); + + group.bench_function("noop_short", |b| { + let attr = Attribute { + key: QName(b"foo"), + value: Cow::Borrowed(b"foobar"), + }; + b.iter(|| { + criterion::black_box(attr.normalized_value()).unwrap(); + }) + }); + + group.bench_function("noop_long", |b| { + let attr = Attribute { + key: QName(b"foo"), + value: Cow::Borrowed(LOREM_IPSUM_TEXT.as_bytes()), + }; + b.iter(|| { + criterion::black_box(attr.normalized_value()).unwrap(); + }) + }); + + group.bench_function("replacement_chars", |b| { + let attr = Attribute { + key: QName(b"foo"), + value: Cow::Borrowed(b"just a bit\n of text without\tany entities"), + }; + b.iter(|| { + criterion::black_box(attr.normalized_value()).unwrap(); + }) + }); + + group.bench_function("char_reference", |b| { + let attr1 = Attribute { + key: QName(b"foo"), + value: Cow::Borrowed(b"prefix "some stuff","more stuff""), + }; + let attr2 = Attribute { + key: QName(b"foo"), + value: Cow::Borrowed(b"&<"), + }; + b.iter(|| { + criterion::black_box(attr1.normalized_value()).unwrap(); + criterion::black_box(attr2.normalized_value()).unwrap(); + }) + }); + + group.bench_function("entity_reference", |b| { + let attr1 = Attribute { + key: QName(b"foo"), + value: Cow::Borrowed(b"age > 72 && age < 21"), + }; + let attr2 = Attribute { + key: QName(b"foo"), + value: Cow::Borrowed(b""what's that?""), + }; + b.iter(|| { + criterion::black_box(attr1.normalized_value()).unwrap(); + criterion::black_box(attr2.normalized_value()).unwrap(); + }) + }); + group.finish(); } @@ -354,6 +425,7 @@ criterion_group!( read_resolved_event_into, one_event, attributes, + attribute_value_normalization, escaping, unescaping, ); diff --git a/src/errors.rs b/src/errors.rs index 0c5c46e3..e0394aaf 100644 --- a/src/errors.rs +++ b/src/errors.rs @@ -254,6 +254,7 @@ impl From for Error { } impl From for Error { + /// Creates a new `Error::InvalidAttr` from the given error #[inline] fn from(error: AttrError) -> Self { Error::InvalidAttr(error) diff --git a/src/escape.rs b/src/escape.rs index 54972624..3d65dacc 100644 --- a/src/escape.rs +++ b/src/escape.rs @@ -3,12 +3,13 @@ use memchr::memchr2_iter; use std::borrow::Cow; use std::ops::Range; +use std::slice::Iter; #[cfg(test)] use pretty_assertions::assert_eq; /// Error for XML escape / unescape. -#[derive(Clone, Debug)] +#[derive(Clone, Debug, PartialEq)] pub enum EscapeError { /// Entity with Null character EntityWithNull(Range), @@ -278,6 +279,163 @@ where } } +const fn is_normalization_char(b: &u8) -> bool { + matches!(*b, b'\t' | b'\r' | b'\n' | b' ' | b'&') +} + +/// Returns the attribute value normalized as per [the XML specification], +/// using a custom entity resolver. +/// +/// Do not use this method with HTML attributes. +/// +/// Escape sequences such as `>` are replaced with their unescaped equivalents such as `>` +/// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. A function +/// for resolving entities can be provided as `resolve_entity`. Builtin entities will still +/// take precedence. +/// +/// This will allocate unless the raw attribute value does not require normalization. +/// +/// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize +pub(crate) fn normalize_attribute_value<'input, 'entity, F>( + value: &'input str, + resolve_entity: F, +) -> Result, EscapeError> +where + // the lifetime of the output comes from a capture or is `'static` + F: Fn(&str) -> Option<&'entity str>, +{ + const DEPTH: usize = 128; + + let bytes = value.as_bytes(); + let mut iter = bytes.iter(); + + if let Some(i) = iter.position(is_normalization_char) { + let mut normalized = String::new(); + let pos = normalize_step( + &mut normalized, + &mut iter, + value, + 0, + i, + DEPTH, + &resolve_entity, + )?; + + normalize_steps( + &mut normalized, + &mut iter, + value, + pos, + DEPTH, + &resolve_entity, + )?; + return Ok(normalized.into()); + } + Ok(Cow::Borrowed(value)) +} + +fn normalize_steps<'entity, F>( + normalized: &mut String, + iter: &mut Iter, + input: &str, + mut pos: usize, + depth: usize, + resolve_entity: &F, +) -> Result<(), EscapeError> +where + // the lifetime of the output comes from a capture or is `'static` + F: Fn(&str) -> Option<&'entity str>, +{ + while let Some(i) = iter.position(is_normalization_char) { + pos = normalize_step(normalized, iter, input, pos, pos + i, depth, resolve_entity)?; + } + if let Some(rest) = input.get(pos..) { + normalized.push_str(rest); + } + Ok(()) +} + +/// Performs one step of the [normalization algorithm] (but with recursive part): +/// +/// 1. For a character reference, append the referenced character +/// to the normalized value. +/// 2. For an entity reference, recursively apply this algorithm +/// to the replacement text of the entity. +/// 3. For a white space character (#x20, #xD, #xA, #x9), append +/// a space character (#x20) to the normalized value. +/// 4. For another character, append the character to the normalized value. +/// +/// # Parameters +/// +/// - `normalized`: Output of the algorithm. Normalized value will be placed here +/// - `iter`: Iterator over bytes of `input` +/// - `input`: Original non-normalized value +/// - `last_pos`: Index of the last byte in `input` that was processed +/// - `index`: Index of the byte in `input` that should be processed now +/// - `depth`: Current recursion depth. Too deep recursion will interrupt the algorithm +/// - `resolve_entity`: Resolver of entities. Returns `None` for unknown entities +/// +/// [normalization algorithm]: https://www.w3.org/TR/xml11/#AVNormalize +fn normalize_step<'entity, F>( + normalized: &mut String, + iter: &mut Iter, + input: &str, + last_pos: usize, + index: usize, + depth: usize, + resolve_entity: &F, +) -> Result +where + // the lifetime of the output comes from a capture or is `'static` + F: Fn(&str) -> Option<&'entity str>, +{ + // 4. For another character, append the character to the normalized value. + normalized.push_str(&input[last_pos..index]); + + match input.as_bytes()[index] { + b'&' => { + let start = index + 1; // +1 - skip `&` + let end = start + + match iter.position(|&b| b == b';') { + Some(end) => end, + None => return Err(EscapeError::UnterminatedEntity(index..input.len())), + }; + + // Content between & and ; - &pat; + let pat = &input[start..end]; + // 1. For a character reference, append the referenced character + // to the normalized value. + if pat.starts_with('#') { + let entity = &pat[1..]; // starts after the # + let codepoint = parse_number(entity, start..end)?; + normalized.push_str(codepoint.encode_utf8(&mut [0u8; 4])); + // 2. For an entity reference, recursively apply this algorithm + // to the replacement text of the entity. + } else if let Some(value) = resolve_entity(pat) { + normalize_steps( + normalized, + &mut value.as_bytes().iter(), + value, + 0, + depth.saturating_sub(1), + resolve_entity, + )?; + } else { + return Err(EscapeError::UnrecognizedSymbol(start..end, pat.to_string())); + } + Ok(end + 1) // +1 - skip `;` + } + // 3. For a white space character (#x20, #xD, #xA, #x9), append + // a space character (#x20) to the normalized value. + b'\t' | b'\n' | b'\r' | b' ' => { + normalized.push(' '); + Ok(index + 1) // +1 - skip character + } + + _ => unreachable!("Only '\\t', '\\n', '\\r', ' ', and '&' are possible here"), + } +} + /// Resolves predefined XML entities or all HTML5 entities depending on the feature /// [`escape-html`](https://docs.rs/quick-xml/latest/quick_xml/#escape-html). /// @@ -1927,3 +2085,104 @@ fn test_minimal_escape() { "prefix_\"a\"b&<>c" ); } + +#[cfg(test)] +mod normalization { + use super::*; + use pretty_assertions::assert_eq; + + #[test] + fn empty() { + assert_eq!(normalize_attribute_value("", |_| { None }), Ok("".into())); + } + + #[test] + fn only_spaces() { + assert_eq!( + normalize_attribute_value(" ", |_| { None }), + Ok(" ".into()) + ); + assert_eq!( + normalize_attribute_value("\t\t\t", |_| { None }), + Ok(" ".into()) + ); + assert_eq!( + normalize_attribute_value("\r\r\r", |_| { None }), + Ok(" ".into()) + ); + assert_eq!( + normalize_attribute_value("\n\n\n", |_| { None }), + Ok(" ".into()) + ); + } + + #[test] + fn already_normalized() { + assert_eq!( + normalize_attribute_value("already normalized", |_| { None }), + Ok("already normalized".into()) + ); + } + + #[test] + fn characters() { + assert_eq!( + normalize_attribute_value("string with character", |_| { None }), + Ok("string with character".into()) + ); + assert_eq!( + normalize_attribute_value("string with character", |_| { None }), + Ok("string with character".into()) + ); + } + + #[test] + fn entities() { + assert_eq!( + normalize_attribute_value("string with &entity; reference", |_| { + Some("replacement") + }), + Ok("string with replacement reference".into()) + ); + assert_eq!( + normalize_attribute_value("string with &entity-1; reference", |entity| { + match entity { + "entity-1" => Some("recursive &entity-2;"), + "entity-2" => Some("entity 2"), + _ => None, + } + }), + Ok("string with recursive entity 2 reference".into()) + ); + } + + #[test] + fn unclosed_entity() { + assert_eq!( + normalize_attribute_value("string with unclosed &entity reference", |_| { + // 0 ^ = 21 ^ = 38 + Some("replacement") + }), + Err(EscapeError::UnterminatedEntity(21..38)) + ); + assert_eq!( + normalize_attribute_value("string with unclosed (character) reference", |_| { + // 0 ^ = 21 ^ = 47 + None + }), + Err(EscapeError::UnterminatedEntity(21..47)) + ); + } + + #[test] + fn unknown_entity() { + assert_eq!( + normalize_attribute_value("string with unknown &entity; reference", |_| { None }), + // 0 ^ ^ = 21..27 + Err(EscapeError::UnrecognizedSymbol( + 21..27, + "entity".to_string() + )) + ); + } +} diff --git a/src/events/attributes.rs b/src/events/attributes.rs index e08303d7..274107da 100644 --- a/src/events/attributes.rs +++ b/src/events/attributes.rs @@ -3,7 +3,9 @@ //! Provides an iterator over attributes key/value pairs use crate::errors::Result as XmlResult; -use crate::escape::{escape, resolve_predefined_entity, unescape_with}; +use crate::escape::{self, escape, resolve_predefined_entity, unescape_with}; +#[cfg(any(doc, not(feature = "encoding")))] +use crate::escape::EscapeError; use crate::name::QName; use crate::reader::{is_whitespace, Reader}; use crate::utils::{write_byte_string, write_cow_string, Bytes}; @@ -30,7 +32,100 @@ pub struct Attribute<'a> { } impl<'a> Attribute<'a> { - /// Decodes using UTF-8 then unescapes the value. + /// Returns the attribute value normalized as per [the XML specification]. + /// + /// Do not use this method with HTML attributes. + /// + /// Escape sequences such as `>` are replaced with their unescaped equivalents such as `>` + /// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. + /// + /// This will allocate unless the raw attribute value does not require normalization. + /// + /// See also [`normalized_value_with()`](#method.Self::normalized_value_with) + /// + /// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize + #[cfg(any(doc, not(feature = "encoding")))] + pub fn normalized_value(&'a self) -> Result, EscapeError> { + self.normalized_value_with(|_| None) + } + + /// Returns the attribute value normalized as per [the XML specification], + /// using a custom entity resolver. + /// + /// Do not use this method with HTML attributes. + /// + /// Escape sequences such as `>` are replaced with their unescaped equivalents such as `>` + /// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. A function + /// for resolving entities can be provided as `resolve_entity`. Builtin entities will still + /// take precedence. + /// + /// This will allocate unless the raw attribute value does not require normalization. + /// + /// See also [`normalized_value()`](#method.normalized_value) + /// + /// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize + #[cfg(any(doc, not(feature = "encoding")))] + pub fn normalized_value_with<'entity>( + &'a self, + resolve_entity: impl Fn(&str) -> Option<&'entity str>, + ) -> Result, EscapeError> { + escape::normalize_attribute_value( + std::str::from_utf8(self.value.as_ref()).expect("unable to decode as utf-8"), + resolve_entity, + ) + } + + /// Decodes using a provided reader and returns the attribute value normalized + /// as per [the XML specification]. + /// + /// Do not use this method with HTML attributes. + /// + /// Escape sequences such as `>` are replaced with their unescaped equivalents such as `>` + /// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. + /// + /// This will allocate unless the raw attribute value does not require normalization. + /// + /// See also [`decode_and_normalize_value_with()`](#method.decode_and_normalize_value_with) + /// + /// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize + pub fn decode_and_normalize_value(&self, reader: &Reader) -> XmlResult> { + self.decode_and_normalize_value_with(reader, |_| None) + } + + /// Decodes using a provided reader and returns the attribute value normalized + /// as per [the XML specification], using a custom entity resolver. + /// + /// Do not use this method with HTML attributes. + /// + /// Escape sequences such as `>` are replaced with their unescaped equivalents such as `>` + /// and the characters `\t`, `\r`, `\n` are replaced with whitespace characters. A function + /// for resolving entities can be provided as `resolve_entity`. Builtin entities will still + /// take precedence. + /// + /// This will allocate unless the raw attribute value does not require normalization. + /// + /// See also [`decode_and_normalize_value()`](#method.decode_and_normalize_value) + /// + /// [the XML specification]: https://www.w3.org/TR/xml11/#AVNormalize + pub fn decode_and_normalize_value_with<'entity, B>( + &self, + reader: &Reader, + resolve_entity: impl Fn(&str) -> Option<&'entity str>, + ) -> XmlResult> { + let decoded = match &self.value { + Cow::Borrowed(bytes) => reader.decoder().decode(bytes)?, + // Convert to owned, because otherwise Cow will be bound with wrong lifetime + Cow::Owned(bytes) => reader.decoder().decode(bytes)?.into_owned().into(), + }; + + match escape::normalize_attribute_value(&decoded, resolve_entity)? { + // Because result is borrowed, no replacements was done and we can use original string + Cow::Borrowed(_) => Ok(decoded), + Cow::Owned(s) => Ok(s.into()), + } + } + + /// Returns the unescaped value. /// /// This is normally the value you are interested in. Escape sequences such as `>` are /// replaced with their unescaped equivalents such as `>`. @@ -796,6 +891,92 @@ mod xml { use super::*; use pretty_assertions::assert_eq; + #[cfg(any(doc, not(feature = "encoding")))] + mod attribute_value_normalization { + + use super::*; + use pretty_assertions::assert_eq; + + /// Empty values returned are unchanged + #[test] + fn test_empty() { + let raw_value = "".as_bytes(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!(attr.normalized_value(), Ok(Cow::Borrowed(""))); + } + + /// Already normalized values are returned unchanged + #[test] + fn test_already_normalized() { + let raw_value = "foobar123".as_bytes(); + let output = "foobar123"; + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!(attr.normalized_value(), Ok(Cow::Borrowed(output))); + } + + /// Return, tab, and newline characters (0xD, 0x9, 0xA) must be substituted with + /// a space character + #[test] + fn test_space_replacement() { + let raw_value = "\r\nfoo\rbar\tbaz\n\ndelta\n".as_bytes(); + let output = " foo bar baz delta ".to_owned(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!(attr.normalized_value(), Ok(Cow::Owned(output))); + } + + /// Entities must be terminated + #[test] + fn test_unterminated_entity() { + let raw_value = "abc"def".as_bytes(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!( + attr.normalized_value(), + Err(EscapeError::UnterminatedEntity(3..11)) + ); + } + + /// Unknown entities raise error + #[test] + fn test_unrecognized_entity() { + let raw_value = "abc&unkn;def".as_bytes(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!( + attr.normalized_value(), + Err(EscapeError::UnrecognizedSymbol(4..8, "unkn".to_owned())) + ); + } + + /// custom entity replacement works, entity replacement text processed recursively + #[test] + #[ignore] + fn test_entity_replacement() { + let raw_value = "&d;&d;A&a; &a;B&da;".as_bytes(); + let output = " A B ".to_owned(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + fn custom_resolver(ent: &str) -> Option<&'static str> { + match ent { + "d" => Some(" "), + "a" => Some(" "), + "da" => Some(" "), + _ => None, + } + } + assert_eq!( + attr.normalized_value_with(&custom_resolver), + Ok(Cow::Owned(output)) + ); + } + + #[test] + fn test_char_references() { + // character literal references are substituted without being replaced by spaces + let raw_value = " A B ".as_bytes(); + let output = "\r\rA\n\nB\r\n".to_owned(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + assert_eq!(attr.normalized_value(), Ok(Cow::Owned(output))); + } + } + /// Checked attribute is the single attribute mod single { use super::*; diff --git a/src/events/mod.rs b/src/events/mod.rs index 9d5d311f..dbe2f058 100644 --- a/src/events/mod.rs +++ b/src/events/mod.rs @@ -512,7 +512,7 @@ impl<'a> BytesDecl<'a> { /// Although according to the [grammar] standalone flag must appear after `"version"` /// and `"encoding"`, this method does not check that. The first occurrence of the /// attribute will be returned even if there are several. Also, method does not - /// restrict symbols that can forming the value, so the returned flag name may not + /// restrict symbols that can form the value, so the returned flag name may not /// correspond to the grammar. /// /// # Examples From 5c0d5d6eddcf99225394f221af082c5909aa093b Mon Sep 17 00:00:00 2001 From: Daniel Alley Date: Fri, 14 Jun 2024 14:32:52 -0400 Subject: [PATCH 2/2] temp --- Changelog.md | 13 +++++++------ src/escape.rs | 27 ++++++++++++++++++-------- src/events/attributes.rs | 41 +++++++++++++++++++++++++++++++++++++++- 3 files changed, 66 insertions(+), 15 deletions(-) diff --git a/Changelog.md b/Changelog.md index 89385b69..059db5ee 100644 --- a/Changelog.md +++ b/Changelog.md @@ -21,9 +21,15 @@ ### Misc Changes -- [#379]: Added tests for attribute value normalization +- [#379]: Fixed error messages for some variants of `EscapeError` + which were incorrectly described as occurring during escaping (rather than unescaping). - [#650]: Change the type of `Event::PI` to a new dedicated `BytesPI` type. +### New Tests + +- [#379]: Added tests for attribute value normalization + + [#379]: https://github.com/tafia/quick-xml/pull/379 [#650]: https://github.com/tafia/quick-xml/issues/650 @@ -293,11 +299,6 @@ serde >= 1.0.181 - [#565]: Correctly set minimum required version of tokio dependency to 1.10 - [#565]: Fix compilation error when build with serde <1.0.139 - -### New Tests - -- [#379]: Added tests for attribute value normalization - [externally tagged]: https://serde.rs/enum-representations.html#externally-tagged [#490]: https://github.com/tafia/quick-xml/pull/490 [#510]: https://github.com/tafia/quick-xml/issues/510 diff --git a/src/escape.rs b/src/escape.rs index 3d65dacc..6f1305e4 100644 --- a/src/escape.rs +++ b/src/escape.rs @@ -27,6 +27,12 @@ pub enum EscapeError { InvalidDecimal(char), /// Not a valid unicode codepoint InvalidCodepoint(u32), + /// The recursion limit was reached while attempting to unescape XML entities, + /// or normalize an attribute value. This could indicate an entity loop. + /// + /// Limiting recursion prevents Denial-of-Service type attacks + /// such as the "billion laughs" [attack](https://en.wikipedia.org/wiki/Billion_laughs_attack). + ReachedRecursionLimit(Range), } impl std::fmt::Display for EscapeError { @@ -34,17 +40,17 @@ impl std::fmt::Display for EscapeError { match self { EscapeError::EntityWithNull(e) => write!( f, - "Error while escaping character at range {:?}: Null character entity not allowed", + "Error while unescaping character at range {:?}: Null character entity not allowed", e ), EscapeError::UnrecognizedSymbol(rge, res) => write!( f, - "Error while escaping character at range {:?}: Unrecognized escape symbol: {:?}", + "Error while unescaping character at range {:?}: Unrecognized escape symbol: {:?}", rge, res ), EscapeError::UnterminatedEntity(e) => write!( f, - "Error while escaping character at range {:?}: Cannot find ';' after '&'", + "Error while unescaping character at range {:?}: Cannot find ';' after '&'", e ), EscapeError::TooLongHexadecimal => write!(f, "Cannot convert hexadecimal to utf8"), @@ -54,6 +60,10 @@ impl std::fmt::Display for EscapeError { EscapeError::TooLongDecimal => write!(f, "Cannot convert decimal to utf8"), EscapeError::InvalidDecimal(e) => write!(f, "'{}' is not a valid decimal character", e), EscapeError::InvalidCodepoint(n) => write!(f, "'{}' is not a valid codepoint", n), + EscapeError::ReachedRecursionLimit(e) => write!( + f, + "Error while unescaping entity at range {:?}: Recursion limit reached" + ), } } } @@ -310,7 +320,7 @@ where let mut iter = bytes.iter(); if let Some(i) = iter.position(is_normalization_char) { - let mut normalized = String::new(); + let mut normalized = String::with_capacity(value.len()); let pos = normalize_step( &mut normalized, &mut iter, @@ -405,7 +415,7 @@ where let pat = &input[start..end]; // 1. For a character reference, append the referenced character // to the normalized value. - if pat.starts_with('#') { + if let Some(entity) = pat.strip_prefix('#') { let entity = &pat[1..]; // starts after the # let codepoint = parse_number(entity, start..end)?; normalized.push_str(codepoint.encode_utf8(&mut [0u8; 4])); @@ -432,6 +442,7 @@ where Ok(index + 1) // +1 - skip character } + // SAFETY: We call normalize_step only when is_normalization_char() return true _ => unreachable!("Only '\\t', '\\n', '\\r', ' ', and '&' are possible here"), } } @@ -2160,14 +2171,14 @@ mod normalization { fn unclosed_entity() { assert_eq!( normalize_attribute_value("string with unclosed &entity reference", |_| { - // 0 ^ = 21 ^ = 38 + // 0 ^ = 21 ^ = 38 Some("replacement") }), Err(EscapeError::UnterminatedEntity(21..38)) ); assert_eq!( normalize_attribute_value("string with unclosed (character) reference", |_| { - // 0 ^ = 21 ^ = 47 + // 0 ^ = 21 ^ = 47 None }), Err(EscapeError::UnterminatedEntity(21..47)) @@ -2178,7 +2189,7 @@ mod normalization { fn unknown_entity() { assert_eq!( normalize_attribute_value("string with unknown &entity; reference", |_| { None }), - // 0 ^ ^ = 21..27 + // 0 ^ 21 ^ 27 Err(EscapeError::UnrecognizedSymbol( 21..27, "entity".to_string() diff --git a/src/events/attributes.rs b/src/events/attributes.rs index 274107da..8516e6d0 100644 --- a/src/events/attributes.rs +++ b/src/events/attributes.rs @@ -3,9 +3,9 @@ //! Provides an iterator over attributes key/value pairs use crate::errors::Result as XmlResult; -use crate::escape::{self, escape, resolve_predefined_entity, unescape_with}; #[cfg(any(doc, not(feature = "encoding")))] use crate::escape::EscapeError; +use crate::escape::{self, escape, resolve_predefined_entity, unescape_with}; use crate::name::QName; use crate::reader::{is_whitespace, Reader}; use crate::utils::{write_byte_string, write_cow_string, Bytes}; @@ -967,6 +967,45 @@ mod xml { ); } + /// An obvious loop should be detected immediately + #[test] + #[ignore] + fn test_entity_loop() { + let raw_value = "&d;".as_bytes(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + fn custom_resolver(ent: &str) -> Option<&'static str> { + match ent { + "d" => Some("&d;"), + } + } + assert_eq!( + attr.normalized_value_with(&custom_resolver), + Err(EscapeError::ReachedRecursionLimit(1..2)) + ); + } + + /// A more complex entity loop should hit the recursion limit. + #[test] + #[ignore] + fn test_recursion_limit() { + let raw_value = "&a;".as_bytes(); + let attr = Attribute::from(("foo".as_bytes(), raw_value)); + fn custom_resolver(ent: &str) -> Option<&'static str> { + match ent { + "a" => Some("&b;"), + "b" => Some("&c;"), + "c" => Some("&d;"), + "d" => Some("&e;"), + "e" => Some("&f;"), + "f" => Some("&a;"), + } + } + assert_eq!( + attr.normalized_value_with(&custom_resolver), + Err(EscapeError::ReachedRecursionLimit(1..2)) + ); + } + #[test] fn test_char_references() { // character literal references are substituted without being replaced by spaces