Add dependency on memchr and use it to optimize CDATA normalization.

RazrFalcon · Jan 13, 2025 · 506e927 · 506e927
1 parent 231c7c5
commit 506e927
Show file tree

Hide file tree

Showing 3 changed files with 32 additions and 15 deletions.
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
@@ -24,6 +24,9 @@ jobs:
         toolchain: ${{ matrix.rust }}
         override: true
 
+    - name: Use MSRV-compatible version of memchr
+      run: cargo update --package memchr --precise 2.6.2
+
     - name: Run tests
       run: cargo test
 

diff --git a/Cargo.toml b/Cargo.toml
@@ -18,8 +18,11 @@ exclude = ["testing-tools"]
 
 [features]
 default = ["std", "positions"]
-std = []
+std = ["memchr/std"]
 # Enables preserving the positions of Nodes and Attributes in the original document.
 # Increases memory usage by `Range<usize>` for each Node.
 # Increases memory usage by `Range<usize>` + `u16` + `u8` for each Attribute.
 positions = []
+
+[dependencies]
+memchr = { version = "2", default-features = false }
diff --git a/src/parse.rs b/src/parse.rs
@@ -1,6 +1,7 @@
 use alloc::string::{String, ToString};
 use alloc::{vec, vec::Vec};
 use core::ops::Range;
+use memchr::{memchr, memchr_iter};
 
 use crate::{
     AttributeData, Document, ExpandedNameIndexed, NamespaceIdx, Namespaces, NodeData, NodeId,
@@ -555,8 +556,8 @@ impl<'input> Context<'input> {
 
 fn parse(text: &str, opt: ParsingOptions) -> Result<Document> {
     // Roughly estimate the number of nodes and attributes.
-    let nodes_capacity = text.bytes().filter(|c| *c == b'<').count();
-    let attributes_capacity = text.bytes().filter(|c| *c == b'=').count();
+    let nodes_capacity = memchr_iter(b'<', text.as_bytes()).count();
+    let attributes_capacity = memchr_iter(b'=', text.as_bytes()).count();
 
     // Init document.
     let mut doc = Document {
@@ -1002,30 +1003,40 @@ fn process_text<'input>(
 // Passes the content of a CDATA section to `append_text`, normalizing line endings.
 //
 // While the whole purpose of CDATA is to tell an XML library that this text has to be
 // stored as is, carriage returns (`\r`) still have to be replaced with `\n`:
 // a `\r\n` pair becomes a single `\n`, and a lone `\r` becomes `\n`.
 //
 // Text without any `\r` is passed on borrowed from the input; otherwise a normalized
 // owned copy is built first. Errors returned by `append_text` are propagated, and
 // `ctx.after_text` is set once the text has been appended.
 fn process_cdata<'input>(
-    text: &'input str,
+    mut text: &'input str,
     range: Range<usize>,
     ctx: &mut Context<'input>,
 ) -> Result<()> {
+    let mut pos = memchr(b'\r', text.as_bytes());
+
     // Add text as is, borrowed from the input, if it contains no `\r`.
-    if !text.as_bytes().contains(&b'\r') {
+    if pos.is_none() {
         append_text(StringStorage::Borrowed(text), range, ctx)?;
         ctx.after_text = true;
         return Ok(());
     }
 
-    let mut text_buffer = TextBuffer::new();
-    let count = text.chars().count();
-    for (i, c) in text.chars().enumerate() {
-        for b in CharToBytes::new(c) {
-            text_buffer.push_from_text(b, i + 1 == count);
-        }
-    }
+    let mut buf = String::new();
 
-    if !text_buffer.is_empty() {
-        append_text(StringStorage::new_owned(text_buffer.finish()), range, ctx)?;
-        ctx.after_text = true;
+    while let Some(pos1) = pos {
+        let (line, rest) = text.split_at(pos1); // `rest` starts at the `\r` that was found
+
+        buf.push_str(line);
+        buf.push('\n');
+
+        text = if rest.as_bytes().get(1) == Some(&b'\n') { // `\r\n`: skip both bytes
+            &rest[2..]
+        } else {
+            &rest[1..]
+        };
+
+        pos = memchr(b'\r', text.as_bytes());
     }
 
+    buf.push_str(text); // remaining tail after the last `\r`
+
+    append_text(StringStorage::new_owned(buf), range, ctx)?;
+    ctx.after_text = true;
     Ok(())
 }