From ee66ee8d23f82eae453f00b507321e2f4819fc50 Mon Sep 17 00:00:00 2001 From: Adam Reichold Date: Mon, 2 Dec 2024 08:34:17 +0100 Subject: [PATCH] Drop hash table for per-element attributes for more compact sorted vector. --- Cargo.lock | 72 +++++++++++++---------------------- scraper/Cargo.toml | 3 +- scraper/src/html/tree_sink.rs | 11 ++++++ scraper/src/node.rs | 41 ++++++++++++-------- 4 files changed, 64 insertions(+), 63 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index d0ca7422..70b6588d 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2,24 +2,11 @@ # It is not intended for manual editing. version = 4 -[[package]] -name = "ahash" -version = "0.8.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e89da841a80418a9b391ebaea17f5c112ffaaa96f621d2c285b5174da76b9011" -dependencies = [ - "cfg-if", - "getrandom", - "once_cell", - "version_check", - "zerocopy", -] - [[package]] name = "autocfg" -version = "1.3.0" +version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0c4b4d0bd25bd0b74681c0ad21497610ce1b7c91b1022cd21c80c6fbdd9476b0" +checksum = "ace50bade8e6234aa140d9a2f552bbee1db4d353f69b8217bc503490fc1a9f26" [[package]] name = "bitflags" @@ -141,9 +128,9 @@ dependencies = [ [[package]] name = "hashbrown" -version = "0.15.0" +version = "0.15.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb" +checksum = "bf151400ff0baff5465007dd2f3e717f3fe502074ca563069ce3a6629d07b289" [[package]] name = "html5ever" @@ -161,9 +148,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.6.0" +version = "2.7.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "707907fe3c25f5424cce2cb7e1cbcafee6bdbe735ca90ef77c29e84591e5b9da" +checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f" dependencies = [ "equivalent", "hashbrown", @@ -171,15 +158,15 @@ dependencies = [ [[package]] name = "itoa" -version = "1.0.11" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" +checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674" [[package]] name = "libc" -version = "0.2.158" +version = "0.2.167" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439" +checksum = "09d6582e104315a817dff97f75133544b2e094ee22447d2acf4a74e189ba06fc" [[package]] name = "lock_api" @@ -225,9 +212,9 @@ checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" [[package]] name = "once_cell" -version = "1.19.0" +version = "1.20.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" +checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775" [[package]] name = "parking_lot" @@ -340,9 +327,9 @@ checksum = "925383efa346730478fb4838dbe9137d2a47675ad789c546d150a6e1dd4ab31c" [[package]] name = "proc-macro2" -version = "1.0.86" +version = "1.0.92" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5e719e8df665df0d1c8fbfd238015744736151d4445ec0836b8e628aae103b77" +checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0" dependencies = [ "unicode-ident", ] @@ -388,9 +375,9 @@ dependencies = [ [[package]] name = "redox_syscall" -version = "0.5.3" +version = "0.5.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2a908a6e00f1fdd0dfd9c0eb08ce85126f6d8bbda50017e74bc4a4b7d4a926a4" +checksum = "9b6dfecf2c74bce2466cabf93f6664d6998a69eb21e39f4207930065b27b771f" dependencies = [ "bitflags", ] @@ -405,7 +392,6 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" name = "scraper" version = "0.21.0" dependencies = [ - "ahash", "cssparser", "ego-tree", "getopts", @@ -437,18 +423,18 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.209" +version = "1.0.215" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "99fce0ffe7310761ca6bf9faf5115afbc19688edd00171d81b1bb1b116c63e09" +checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.209" +version = "1.0.215" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a5831b979fd7b5439637af1752d535ff49f4860c0f341d1baeb6faf0f4242170" +checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" dependencies = [ "proc-macro2", "quote", @@ -510,9 +496,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.76" +version = "2.0.90" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "578e081a14e0cefc3279b0472138c513f37b41a08d5a3cca9b6e4e8ceb6cd525" +checksum = "919d3b74a5dd0ccd15aeb8f93e7006bd9e14c295087c9896a110f490752bcf31" dependencies = [ "proc-macro2", "quote", @@ -532,15 +518,15 @@ dependencies = [ [[package]] name = "unicode-ident" -version = "1.0.12" +version = "1.0.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" +checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83" [[package]] name = "unicode-width" -version = "0.1.13" +version = "0.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d" +checksum = "7dd6e30e90baa6f72411720665d41d89b9a3d039dc45b8faea1ddd07f617f6af" [[package]] name = "utf-8" @@ -548,12 +534,6 @@ version = "0.7.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09cc8ee72d2a9becf2f2febe0205bbed8fc6615b7cb429ad062dc7b7ddd036a9" -[[package]] -name = "version_check" -version = "0.9.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" - [[package]] name = "wasi" version = "0.11.0+wasi-snapshot-preview1" diff --git a/scraper/Cargo.toml b/scraper/Cargo.toml index 7a0549e3..fe11f8c0 100644 --- a/scraper/Cargo.toml +++ b/scraper/Cargo.toml @@ -13,11 +13,10 @@ repository = "https://github.com/causal-agent/scraper" readme = "README.md" [dependencies] -ahash = "0.8.0" cssparser = "0.34.0" ego-tree = "0.9.0" html5ever = "0.29.0" -indexmap = { version = "2.6.0", optional = true } +indexmap = { version = "2.7.0", optional = true } precomputed-hash = "0.1.1" selectors = "0.26.0" tendril = "0.4.3" diff --git a/scraper/src/html/tree_sink.rs b/scraper/src/html/tree_sink.rs index 02d43eb6..49b30b49 100644 --- a/scraper/src/html/tree_sink.rs +++ b/scraper/src/html/tree_sink.rs @@ -223,6 +223,17 @@ impl TreeSink for HtmlTreeSink { }; for attr in attrs { + #[cfg(not(feature = "deterministic"))] + if let Err(idx) = element + .attrs + .binary_search_by(|(name, _)| name.cmp(&attr.name)) + { + element + .attrs + .insert(idx, (attr.name, make_tendril(attr.value))); + } + + #[cfg(feature = "deterministic")] element .attrs .entry(attr.name) diff --git a/scraper/src/node.rs b/scraper/src/node.rs index f2390c3e..4f900857 100644 --- a/scraper/src/node.rs +++ b/scraper/src/node.rs @@ -1,9 +1,5 @@ //! HTML nodes. -#[cfg(not(feature = "deterministic"))] -use ahash::AHashMap as HashMap; -#[cfg(not(feature = "deterministic"))] -use std::collections::hash_map; use std::fmt; use std::ops::Deref; use std::slice::Iter as SliceIter; @@ -219,7 +215,7 @@ pub type Attributes = indexmap::IndexMap; /// Please enable the `deterministic` feature for order-preserving /// (de)serialization. #[cfg(not(feature = "deterministic"))] -pub type Attributes = HashMap; +pub type Attributes = Vec<(QualName, StrTendril)>; /// An HTML element. #[derive(Clone, PartialEq, Eq)] @@ -232,16 +228,20 @@ pub struct Element { id: OnceCell>, - classes: OnceCell>, + classes: OnceCell>, } impl Element { #[doc(hidden)] pub fn new(name: QualName, attributes: Vec) -> Self { - let attrs = attributes + #[allow(unused_mut)] + let mut attrs = attributes .into_iter() - .map(|a| (a.name, crate::tendril_util::make(a.value))) - .collect(); + .map(|attr| (attr.name, crate::tendril_util::make(attr.value))) + .collect::(); + + #[cfg(not(feature = "deterministic"))] + attrs.sort_unstable_by(|lhs, rhs| lhs.0.cmp(&rhs.0)); Element { attrs, @@ -277,17 +277,17 @@ impl Element { /// Returns an iterator over the element's classes. pub fn classes(&self) -> Classes { let classes = self.classes.get_or_init(|| { - let mut classes: Vec = self + let mut classes = self .attrs .iter() .filter(|(name, _)| name.local.as_ref() == "class") - .flat_map(|(_, value)| value.split_whitespace().map(LocalName::from)) - .collect(); + .flat_map(|(_, value)| value.split_ascii_whitespace().map(LocalName::from)) + .collect::>(); classes.sort_unstable(); classes.dedup(); - classes + classes.into_boxed_slice() }); Classes { @@ -298,7 +298,18 @@ impl Element { /// Returns the value of an attribute. pub fn attr(&self, attr: &str) -> Option<&str> { let qualname = QualName::new(None, ns!(), LocalName::from(attr)); - self.attrs.get(&qualname).map(Deref::deref) + + #[cfg(not(feature = "deterministic"))] + let value = self + .attrs + .binary_search_by(|attr| attr.0.cmp(&qualname)) + .ok() + .map(|idx| &*self.attrs[idx].1); + + #[cfg(feature = "deterministic")] + let value = self.attrs.get(&qualname).map(Deref::deref); + + value } /// Returns an iterator over the element's attributes. @@ -330,7 +341,7 @@ pub type AttributesIter<'a> = indexmap::map::Iter<'a, QualName, StrTendril>; /// An iterator over a node's attributes. #[cfg(not(feature = "deterministic"))] -pub type AttributesIter<'a> = hash_map::Iter<'a, QualName, StrTendril>; +pub type AttributesIter<'a> = SliceIter<'a, (QualName, StrTendril)>; /// Iterator over attributes. #[allow(missing_debug_implementations)]