Skip to content

Commit

Permalink
Add dependency on memchr and use it to optimize CDATA normalization.
Browse files Browse the repository at this point in the history
  • Loading branch information
adamreichold committed Jan 13, 2025
1 parent 231c7c5 commit 506e927
Show file tree
Hide file tree
Showing 3 changed files with 32 additions and 15 deletions.
3 changes: 3 additions & 0 deletions .github/workflows/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,9 @@ jobs:
toolchain: ${{ matrix.rust }}
override: true

- name: Use MSRV-compatible version of memchr
run: cargo update --package memchr --precise 2.6.2

- name: Run tests
run: cargo test

Expand Down
5 changes: 4 additions & 1 deletion Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,11 @@ exclude = ["testing-tools"]

[features]
default = ["std", "positions"]
std = []
std = ["memchr/std"]
# Enables preserving the positions of Nodes and Attributes in the original document.
# Increases memory usage by `Range<usize>` for each Node.
# Increases memory usage by `Range<usize>` + `u16` + `u8` for each Attribute.
positions = []

[dependencies]
memchr = { version = "2", default-features = false }
39 changes: 25 additions & 14 deletions src/parse.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use alloc::string::{String, ToString};
use alloc::{vec, vec::Vec};
use core::ops::Range;
use memchr::{memchr, memchr_iter};

use crate::{
AttributeData, Document, ExpandedNameIndexed, NamespaceIdx, Namespaces, NodeData, NodeId,
Expand Down Expand Up @@ -555,8 +556,8 @@ impl<'input> Context<'input> {

fn parse(text: &str, opt: ParsingOptions) -> Result<Document> {
// Roughly estimate the number of nodes and attributes.
let nodes_capacity = text.bytes().filter(|c| *c == b'<').count();
let attributes_capacity = text.bytes().filter(|c| *c == b'=').count();
let nodes_capacity = memchr_iter(b'<', text.as_bytes()).count();
let attributes_capacity = memchr_iter(b'=', text.as_bytes()).count();

// Init document.
let mut doc = Document {
Expand Down Expand Up @@ -1002,30 +1003,40 @@ fn process_text<'input>(
// While the whole purpose of CDATA is to indicate to an XML library that this text
// has to be stored as is, a carriage return (`\r`) still has to be replaced with `\n`.
fn process_cdata<'input>(
text: &'input str,
mut text: &'input str,
range: Range<usize>,
ctx: &mut Context<'input>,
) -> Result<()> {
let mut pos = memchr(b'\r', text.as_bytes());

// Add text as is if it has only valid characters.
if !text.as_bytes().contains(&b'\r') {
if pos.is_none() {
append_text(StringStorage::Borrowed(text), range, ctx)?;
ctx.after_text = true;
return Ok(());
}

let mut text_buffer = TextBuffer::new();
let count = text.chars().count();
for (i, c) in text.chars().enumerate() {
for b in CharToBytes::new(c) {
text_buffer.push_from_text(b, i + 1 == count);
}
}
let mut buf = String::new();

if !text_buffer.is_empty() {
append_text(StringStorage::new_owned(text_buffer.finish()), range, ctx)?;
ctx.after_text = true;
while let Some(pos1) = pos {
let (line, rest) = text.split_at(pos1);

buf.push_str(line);
buf.push('\n');

text = if rest.as_bytes().get(1) == Some(&b'\n') {
&rest[2..]
} else {
&rest[1..]
};

pos = memchr(b'\r', text.as_bytes());
}

buf.push_str(text);

append_text(StringStorage::new_owned(buf), range, ctx)?;
ctx.after_text = true;
Ok(())
}

Expand Down

0 comments on commit 506e927

Please sign in to comment.