diff --git a/Cargo.lock b/Cargo.lock index 35b950f7..e8e241d5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1,6 +1,6 @@ # This file is automatically @generated by Cargo. # It is not intended for manual editing. -version = 3 +version = 4 [[package]] name = "ahash" @@ -41,14 +41,14 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" [[package]] name = "cssparser" -version = "0.31.2" +version = "0.34.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b3df4f93e5fbbe73ec01ec8d3f68bba73107993a5b1e7519273c32db9b0d5be" +checksum = "b7c66d1cd8ed61bf80b38432613a7a2f09401ab8d0501110655f8b341484a3e3" dependencies = [ "cssparser-macros", "dtoa-short", "itoa", - "phf 0.11.2", + "phf", "smallvec", ] @@ -147,9 +147,9 @@ checksum = "1e087f84d4f86bf4b218b927129862374b72199ae7d8657835f1e89000eea4fb" [[package]] name = "html5ever" -version = "0.27.0" +version = "0.29.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c13771afe0e6e846f1e67d038d4cb29998a6779f93c809212e4e9c32efd244d4" +checksum = "2e15626aaf9c351bc696217cbe29cb9b5e86c43f8a46b5e2f5c6c5cf7cb904ce" dependencies = [ "log", "mac", @@ -205,13 +205,13 @@ checksum = "c41e0c4fef86961ac6d6f8a82609f55f31b05e4fce149ac5710e439df7619ba4" [[package]] name = "markup5ever" -version = "0.12.1" +version = "0.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "16ce3abbeba692c8b8441d036ef91aea6df8da2c6b6e21c7e14d3c18e526be45" +checksum = "82c88c6129bd24319e62a0359cb6b958fa7e8be6e19bb1663bc396b90883aca5" dependencies = [ "log", - "phf 0.11.2", - "phf_codegen 0.11.2", + "phf", + "phf_codegen", "string_cache", "string_cache_codegen", "tendril", @@ -252,15 +252,6 @@ dependencies = [ "windows-targets", ] -[[package]] -name = "phf" -version = "0.10.1" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fabbf1ead8a5bcbc20f5f8b939ee3f5b0f6f281b6ad3468b84656b658b455259" -dependencies = [ - "phf_shared 
0.10.0", -] - [[package]] name = "phf" version = "0.11.2" @@ -271,16 +262,6 @@ dependencies = [ "phf_shared 0.11.2", ] -[[package]] -name = "phf_codegen" -version = "0.10.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4fb1c3a8bc4dd4e5cfce29b44ffc14bedd2ee294559a294e2a4d4c9e9a6a13cd" -dependencies = [ - "phf_generator 0.10.0", - "phf_shared 0.10.0", -] - [[package]] name = "phf_codegen" version = "0.11.2" @@ -430,15 +411,16 @@ dependencies = [ "getopts", "html5ever", "indexmap", + "precomputed-hash", "selectors", "tendril", ] [[package]] name = "selectors" -version = "0.25.0" +version = "0.26.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4eb30575f3638fc8f6815f448d50cb1a2e255b0897985c8c59f4d37b72a07b06" +checksum = "fd568a4c9bb598e291a08244a5c1f5a8a6650bee243b5b0f8dbb3d9cc1d87fe8" dependencies = [ "bitflags", "cssparser", @@ -446,8 +428,8 @@ dependencies = [ "fxhash", "log", "new_debug_unreachable", - "phf 0.10.1", - "phf_codegen 0.10.0", + "phf", + "phf_codegen", "precomputed-hash", "servo_arc", "smallvec", @@ -475,9 +457,9 @@ dependencies = [ [[package]] name = "servo_arc" -version = "0.3.0" +version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d036d71a959e00c77a63538b90a6c2390969f9772b096ea837205c6bd0491a44" +checksum = "ae65c4249478a2647db249fb43e23cec56a2c8974a427e7bd8cb5a1d0964921a" dependencies = [ "stable_deref_trait", ] diff --git a/scraper/Cargo.toml b/scraper/Cargo.toml index 00d51956..e7fabfd6 100644 --- a/scraper/Cargo.toml +++ b/scraper/Cargo.toml @@ -13,13 +13,14 @@ repository = "https://github.com/causal-agent/scraper" readme = "README.md" [dependencies] -cssparser = "0.31.0" +ahash = "0.8.0" +cssparser = "0.34.0" ego-tree = "0.9.0" -html5ever = "0.27" -selectors = "0.25.0" -tendril = "0.4.3" -ahash = "0.8" +html5ever = "0.29.0" indexmap = { version = "2.6.0", optional = true } +precomputed-hash = "0.1.1" +selectors = "0.26.0" +tendril = 
"0.4.3" [dependencies.getopts] version = "0.2.21" diff --git a/scraper/src/element_ref/element.rs b/scraper/src/element_ref/element.rs index 77804223..e804d81e 100644 --- a/scraper/src/element_ref/element.rs +++ b/scraper/src/element_ref/element.rs @@ -1,6 +1,7 @@ use html5ever::Namespace; use selectors::{ attr::{AttrSelectorOperation, CaseSensitivity, NamespaceConstraint}, + bloom::BloomFilter, matching, Element, OpaqueElement, }; @@ -122,6 +123,10 @@ impl<'a> Element for ElementRef<'a> { self.value().has_class(&name.0, case_sensitivity) } + fn has_custom_state(&self, _name: &CssLocalName) -> bool { + false + } + fn is_empty(&self) -> bool { !self .children() @@ -134,6 +139,11 @@ impl<'a> Element for ElementRef<'a> { } fn apply_selector_flags(&self, _flags: matching::ElementSelectorFlags) {} + + fn add_element_unique_hashes(&self, _filter: &mut BloomFilter) -> bool { + // FIXME: Do we want to add `self.node.id()` here? + false + } } #[cfg(test)] diff --git a/scraper/src/element_ref/mod.rs b/scraper/src/element_ref/mod.rs index b0fc09a6..4e8500e2 100644 --- a/scraper/src/element_ref/mod.rs +++ b/scraper/src/element_ref/mod.rs @@ -7,7 +7,7 @@ use std::ops::Deref; use ego_tree::iter::{Edge, Traverse}; use ego_tree::NodeRef; use html5ever::serialize::{serialize, SerializeOpts, TraversalScope}; -use selectors::NthIndexCache; +use selectors::matching::SelectorCaches; use crate::node::Element; use crate::{Node, Selector}; @@ -49,7 +49,7 @@ impl<'a> ElementRef<'a> { scope: *self, inner, selector, - nth_index_cache: NthIndexCache::default(), + caches: Default::default(), } } @@ -135,7 +135,7 @@ pub struct Select<'a, 'b> { scope: ElementRef<'a>, inner: Traverse<'a, Node>, selector: &'b Selector, - nth_index_cache: NthIndexCache, + caches: SelectorCaches, } impl Debug for Select<'_, '_> { @@ -144,7 +144,7 @@ impl Debug for Select<'_, '_> { .field("scope", &self.scope) .field("inner", &self.inner) .field("selector", &self.selector) - .field("nth_index_cache", &"..") + 
.field("caches", &"..") .finish() } } @@ -155,7 +155,7 @@ impl Clone for Select<'_, '_> { scope: self.scope, inner: self.inner.clone(), selector: self.selector, - nth_index_cache: NthIndexCache::default(), + caches: Default::default(), } } } @@ -170,7 +170,7 @@ impl<'a, 'b> Iterator for Select<'a, 'b> { if self.selector.matches_with_scope_and_cache( &element, Some(self.scope), - &mut self.nth_index_cache, + &mut self.caches, ) { return Some(element); } diff --git a/scraper/src/html/mod.rs b/scraper/src/html/mod.rs index f7058164..39ad74cf 100644 --- a/scraper/src/html/mod.rs +++ b/scraper/src/html/mod.rs @@ -10,12 +10,14 @@ use ego_tree::Tree; use html5ever::serialize::SerializeOpts; use html5ever::tree_builder::QuirksMode; use html5ever::{driver, serialize, QualName}; -use selectors::NthIndexCache; +use selectors::matching::SelectorCaches; use tendril::TendrilSink; use crate::selector::Selector; use crate::{ElementRef, Node}; +pub use tree_sink::HtmlTreeSink; + /// An HTML tree. /// /// Parsing does not fail hard. Instead, the `quirks_mode` is set and errors are added to the @@ -67,22 +69,23 @@ impl Html { /// # fn main() { /// # let document = ""; /// use html5ever::driver::{self, ParseOpts}; - /// use scraper::Html; + /// use scraper::{Html, HtmlTreeSink}; /// use tendril::TendrilSink; /// - /// let parser = driver::parse_document(Html::new_document(), ParseOpts::default()); + /// let parser = driver::parse_document(HtmlTreeSink::new(Html::new_document()), ParseOpts::default()); /// let html = parser.one(document); /// # } /// ``` pub fn parse_document(document: &str) -> Self { - let parser = driver::parse_document(Self::new_document(), Default::default()); + let parser = + driver::parse_document(HtmlTreeSink::new(Self::new_document()), Default::default()); parser.one(document) } /// Parses a string of HTML as a fragment. 
pub fn parse_fragment(fragment: &str) -> Self { let parser = driver::parse_fragment( - Self::new_fragment(), + HtmlTreeSink::new(Self::new_fragment()), Default::default(), QualName::new(None, ns!(html), local_name!("body")), Vec::new(), @@ -95,7 +98,7 @@ impl Html { Select { inner: self.tree.nodes(), selector, - nth_index_cache: NthIndexCache::default(), + caches: Default::default(), } } @@ -127,7 +130,7 @@ impl Html { pub struct Select<'a, 'b> { inner: Nodes<'a, Node>, selector: &'b Selector, - nth_index_cache: NthIndexCache, + caches: SelectorCaches, } impl fmt::Debug for Select<'_, '_> { @@ -135,7 +138,7 @@ impl fmt::Debug for Select<'_, '_> { fmt.debug_struct("Select") .field("inner", &self.inner) .field("selector", &self.selector) - .field("nth_index_cache", &"..") + .field("caches", &"..") .finish() } } @@ -145,7 +148,7 @@ impl Clone for Select<'_, '_> { Self { inner: self.inner.clone(), selector: self.selector, - nth_index_cache: NthIndexCache::default(), + caches: Default::default(), } } } @@ -157,11 +160,9 @@ impl<'a, 'b> Iterator for Select<'a, 'b> { for node in self.inner.by_ref() { if let Some(element) = ElementRef::wrap(node) { if element.parent().is_some() - && self.selector.matches_with_scope_and_cache( - &element, - None, - &mut self.nth_index_cache, - ) + && self + .selector + .matches_with_scope_and_cache(&element, None, &mut self.caches) { return Some(element); } @@ -182,11 +183,9 @@ impl<'a, 'b> DoubleEndedIterator for Select<'a, 'b> { for node in self.inner.by_ref().rev() { if let Some(element) = ElementRef::wrap(node) { if element.parent().is_some() - && self.selector.matches_with_scope_and_cache( - &element, - None, - &mut self.nth_index_cache, - ) + && self + .selector + .matches_with_scope_and_cache(&element, None, &mut self.caches) { return Some(element); } diff --git a/scraper/src/html/tree_sink.rs b/scraper/src/html/tree_sink.rs index af253765..f9e18720 100644 --- a/scraper/src/html/tree_sink.rs +++ b/scraper/src/html/tree_sink.rs @@ 
-5,34 +5,47 @@ use ego_tree::NodeId; use html5ever::tendril::StrTendril; use html5ever::tree_builder::{ElementFlags, NodeOrText, QuirksMode, TreeSink}; use html5ever::Attribute; -use html5ever::{ExpandedName, QualName}; +use html5ever::QualName; use std::borrow::Cow; +use std::cell::{Ref, RefCell}; + +/// Wraps `Html` instances as sinks to drive parsing +#[derive(Debug)] +pub struct HtmlTreeSink(RefCell<Html>); + +impl HtmlTreeSink { + /// Wrap an `Html` instance as a sink to drive parsing + pub fn new(html: Html) -> Self { + Self(RefCell::new(html)) + } +} /// Note: does not support the `