Skip to content

Commit

Permalink
Properly normalize attribute values
Browse files Browse the repository at this point in the history
closes tafia#371
  • Loading branch information
dralley committed Jan 24, 2023
1 parent add7406 commit 57b9878
Show file tree
Hide file tree
Showing 7 changed files with 275 additions and 13 deletions.
9 changes: 9 additions & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@

- [#541]: Deserialize specially named `$text` enum variant in [externally tagged]
enums from textual content
- [#379]: Improved support for the XML attribute value normalization process

### Bug Fixes

Expand Down Expand Up @@ -120,6 +121,11 @@
- [#489]: Reduced the size of the package uploaded into the crates.io by excluding
tests, examples, and benchmarks.

### New Tests

- [#379]: Added tests for attribute value normalization

[#379]: https://github.com/tafia/quick-xml/pull/379
[#481]: https://github.com/tafia/quick-xml/pull/481
[#489]: https://github.com/tafia/quick-xml/pull/489

Expand Down Expand Up @@ -160,6 +166,9 @@
- [#395]: Add support for XML Schema `xs:list`
- [#324]: `Reader::from_str` / `Deserializer::from_str` / `from_str` now ignore
the XML declared encoding and always use UTF-8
- [#379]: Add full support for XML attribute value normalization via
`Attribute::normalized_value()` and
`Attribute::normalized_value_with_custom_entities()`
- [#416]: Add `borrow()` methods in all event structs which allows to get
a borrowed version of any event
- [#437]: Split out namespace reading functionality to a dedicated `NsReader`, namely:
Expand Down
3 changes: 1 addition & 2 deletions benches/macrobenches.rs
Original file line number Diff line number Diff line change
Expand Up @@ -43,14 +43,13 @@ static INPUTS: &[(&str, &str)] = &[
("players.xml", PLAYERS),
];

// TODO: use fully normalized attribute values
fn parse_document_from_str(doc: &str) -> XmlResult<()> {
let mut r = Reader::from_str(doc);
loop {
match criterion::black_box(r.read_event()?) {
Event::Start(e) | Event::Empty(e) => {
for attr in e.attributes() {
criterion::black_box(attr?.decode_and_unescape_value(&r)?);
criterion::black_box(attr?.normalized_value()?);
}
}
Event::Text(e) => {
Expand Down
125 changes: 125 additions & 0 deletions benches/microbenches.rs
Original file line number Diff line number Diff line change
Expand Up @@ -242,6 +242,130 @@ fn attributes(c: &mut Criterion) {
assert_eq!(count, 150);
})
});

group.finish();
}

/// Benchmarks normalizing attribute values
fn attribute_value_normalization(c: &mut Criterion) {
let mut group = c.benchmark_group("attribute_value_normalization");

group.bench_function("noop_short", |b| {
b.iter(|| {
criterion::black_box(unescape(b"foobar")).unwrap();
})
});

group.bench_function("noop_long", |b| {
b.iter(|| {
criterion::black_box(unescape(b"just a bit of text without any entities")).unwrap();
})
});

group.bench_function("replacement_chars", |b| {
b.iter(|| {
criterion::black_box(unescape(b"just a bit\n of text without\tany entities")).unwrap();
})
});

group.bench_function("char_reference", |b| {
b.iter(|| {
let text = b"prefix &#34;some stuff&#34;,&#x22;more stuff&#x22;";
criterion::black_box(unescape(text)).unwrap();
let text = b"&#38;&#60;";
criterion::black_box(unescape(text)).unwrap();
})
});

group.bench_function("entity_reference", |b| {
b.iter(|| {
let text = b"age &gt; 72 &amp;&amp; age &lt; 21";
criterion::black_box(unescape(text)).unwrap();
let text = b"&quot;what&apos;s that?&quot;";
criterion::black_box(unescape(text)).unwrap();
})
});

group.finish();
}

/// Benchmarks the `BytesText::unescaped()` method (includes time of `read_event`
/// benchmark)
fn bytes_text_unescaped(c: &mut Criterion) {
let mut group = c.benchmark_group("BytesText::unescaped");
group.bench_function("trim_text = false", |b| {
b.iter(|| {
let mut buf = Vec::new();
let mut r = Reader::from_reader(SAMPLE);
r.check_end_names(false).check_comments(false);
let mut count = criterion::black_box(0);
let mut nbtxt = criterion::black_box(0);
loop {
match r.read_event(&mut buf) {
Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
Ok(Event::Eof) => break,
_ => (),
}
buf.clear();
}
assert_eq!(
count, 1550,
"Overall tag count in ./tests/documents/sample_rss.xml"
);

// Windows has \r\n instead of \n
#[cfg(windows)]
assert_eq!(
nbtxt, 67661,
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
);

#[cfg(not(windows))]
assert_eq!(
nbtxt, 66277,
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
);
});
});

group.bench_function("trim_text = true", |b| {
b.iter(|| {
let mut buf = Vec::new();
let mut r = Reader::from_reader(SAMPLE);
r.check_end_names(false)
.check_comments(false)
.trim_text(true);
let mut count = criterion::black_box(0);
let mut nbtxt = criterion::black_box(0);
loop {
match r.read_event(&mut buf) {
Ok(Event::Start(_)) | Ok(Event::Empty(_)) => count += 1,
Ok(Event::Text(ref e)) => nbtxt += e.unescaped().unwrap().len(),
Ok(Event::Eof) => break,
_ => (),
}
buf.clear();
}
assert_eq!(
count, 1550,
"Overall tag count in ./tests/documents/sample_rss.xml"
);

// Windows has \r\n instead of \n
#[cfg(windows)]
assert_eq!(
nbtxt, 50334,
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
);

#[cfg(not(windows))]
assert_eq!(
nbtxt, 50261,
"Overall length (in bytes) of all text contents of ./tests/documents/sample_rss.xml"
);
});
});
group.finish();
}

Expand Down Expand Up @@ -354,6 +478,7 @@ criterion_group!(
read_resolved_event_into,
one_event,
attributes,
attribute_value_normalization,
escaping,
unescaping,
);
Expand Down
1 change: 1 addition & 0 deletions src/errors.rs
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ impl From<EscapeError> for Error {
}

impl From<AttrError> for Error {
/// Creates a new `Error::InvalidAttr` from the given error
#[inline]
fn from(error: AttrError) -> Self {
Error::InvalidAttr(error)
Expand Down
15 changes: 7 additions & 8 deletions src/escapei.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ use std::ops::Range;
use pretty_assertions::assert_eq;

/// Error for XML escape / unescape.
#[derive(Clone, Debug)]
#[derive(Clone, Debug, PartialEq)]
pub enum EscapeError {
/// Entity with Null character
EntityWithNull(Range<usize>),
Expand Down Expand Up @@ -213,7 +213,7 @@ where
}

#[cfg(not(feature = "escape-html"))]
fn named_entity(name: &str) -> Option<&str> {
pub(crate) fn named_entity(name: &str) -> Option<&str> {
// match over strings are not allowed in const functions
let s = match name.as_bytes() {
b"lt" => "<",
Expand All @@ -226,11 +226,10 @@ fn named_entity(name: &str) -> Option<&str> {
Some(s)
}
#[cfg(feature = "escape-html")]
fn named_entity(name: &str) -> Option<&str> {
pub(crate) fn named_entity(name: &str) -> Option<&str> {
// imported from https://dev.w3.org/html5/html-author/charref
// match over strings are not allowed in const functions
//TODO: automate up-to-dating using https://html.spec.whatwg.org/entities.json
let s = match name.as_bytes() {
match name.as_bytes() {
b"Tab" => "\u{09}",
b"NewLine" => "\u{0A}",
b"excl" => "\u{21}",
Expand Down Expand Up @@ -1690,7 +1689,7 @@ fn named_entity(name: &str) -> Option<&str> {
Some(s)
}

fn parse_number(bytes: &str, range: Range<usize>) -> Result<char, EscapeError> {
pub(crate) fn parse_number(bytes: &str, range: Range<usize>) -> Result<char, EscapeError> {
let code = if bytes.starts_with('x') {
parse_hexadecimal(&bytes[1..])
} else {
Expand All @@ -1705,7 +1704,7 @@ fn parse_number(bytes: &str, range: Range<usize>) -> Result<char, EscapeError> {
}
}

fn parse_hexadecimal(bytes: &str) -> Result<u32, EscapeError> {
pub(crate) fn parse_hexadecimal(bytes: &str) -> Result<u32, EscapeError> {
// maximum code is 0x10FFFF => 6 characters
if bytes.len() > 6 {
return Err(EscapeError::TooLongHexadecimal);
Expand All @@ -1723,7 +1722,7 @@ fn parse_hexadecimal(bytes: &str) -> Result<u32, EscapeError> {
Ok(code)
}

fn parse_decimal(bytes: &str) -> Result<u32, EscapeError> {
pub(crate) fn parse_decimal(bytes: &str) -> Result<u32, EscapeError> {
// maximum code is 0x10FFFF = 1114111 => 7 characters
if bytes.len() > 7 {
return Err(EscapeError::TooLongDecimal);
Expand Down
Loading

0 comments on commit 57b9878

Please sign in to comment.