Skip to content

Commit

Permalink
Merge pull request #214 from rust-scraper/bump-selectors
Browse files (browse the repository at this point in the history)
Bump selectors, cssparser and html5ever
  • Loading branch information
cfvescovo authored Oct 24, 2024
2 parents e0d4ea7 + fddd90e commit 2ede12e
Show file tree
Hide file tree
Showing 8 changed files with 160 additions and 120 deletions.
52 changes: 17 additions & 35 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 6 additions & 5 deletions scraper/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,14 @@ repository = "https://github.com/causal-agent/scraper"
readme = "README.md"

[dependencies]
cssparser = "0.31.0"
ahash = "0.8.0"
cssparser = "0.34.0"
ego-tree = "0.9.0"
html5ever = "0.27"
selectors = "0.25.0"
tendril = "0.4.3"
ahash = "0.8"
html5ever = "0.29.0"
indexmap = { version = "2.6.0", optional = true }
precomputed-hash = "0.1.1"
selectors = "0.26.0"
tendril = "0.4.3"

[dependencies.getopts]
version = "0.2.21"
Expand Down
10 changes: 10 additions & 0 deletions scraper/src/element_ref/element.rs
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
use html5ever::Namespace;
use selectors::{
attr::{AttrSelectorOperation, CaseSensitivity, NamespaceConstraint},
bloom::BloomFilter,
matching, Element, OpaqueElement,
};

Expand Down Expand Up @@ -122,6 +123,10 @@ impl<'a> Element for ElementRef<'a> {
self.value().has_class(&name.0, case_sensitivity)
}

/// Custom-state check used during selector matching.
///
/// The `_name` argument is ignored and the result is always `false`, so no
/// element is ever reported as being in a custom state.
// NOTE(review): presumably this backs the `:state()` pseudo-class introduced by
// the `selectors` bump in this commit — confirm against the `Element` trait docs.
fn has_custom_state(&self, _name: &CssLocalName) -> bool {
false
}

fn is_empty(&self) -> bool {
!self
.children()
Expand All @@ -134,6 +139,11 @@ impl<'a> Element for ElementRef<'a> {
}

/// No-op: the supplied selector flags are ignored and nothing is recorded.
fn apply_selector_flags(&self, _flags: matching::ElementSelectorFlags) {}

/// Hook for contributing hashes that identify this element to `_filter`.
///
/// This implementation leaves the bloom filter untouched and always returns
/// `false`.
// NOTE(review): presumably `false` signals to `selectors` that no hashes were
// added — confirm against the `Element::add_element_unique_hashes` docs.
fn add_element_unique_hashes(&self, _filter: &mut BloomFilter) -> bool {
// FIXME: Do we want to add `self.node.id()` here?
false
}
}

#[cfg(test)]
Expand Down
12 changes: 6 additions & 6 deletions scraper/src/element_ref/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ use std::ops::Deref;
use ego_tree::iter::{Edge, Traverse};
use ego_tree::NodeRef;
use html5ever::serialize::{serialize, SerializeOpts, TraversalScope};
use selectors::NthIndexCache;
use selectors::matching::SelectorCaches;

use crate::node::Element;
use crate::{Node, Selector};
Expand Down Expand Up @@ -49,7 +49,7 @@ impl<'a> ElementRef<'a> {
scope: *self,
inner,
selector,
nth_index_cache: NthIndexCache::default(),
caches: Default::default(),
}
}

Expand Down Expand Up @@ -135,7 +135,7 @@ pub struct Select<'a, 'b> {
scope: ElementRef<'a>,
inner: Traverse<'a, Node>,
selector: &'b Selector,
nth_index_cache: NthIndexCache,
caches: SelectorCaches,
}

impl Debug for Select<'_, '_> {
Expand All @@ -144,7 +144,7 @@ impl Debug for Select<'_, '_> {
.field("scope", &self.scope)
.field("inner", &self.inner)
.field("selector", &self.selector)
.field("nth_index_cache", &"..")
.field("caches", &"..")
.finish()
}
}
Expand All @@ -155,7 +155,7 @@ impl Clone for Select<'_, '_> {
scope: self.scope,
inner: self.inner.clone(),
selector: self.selector,
nth_index_cache: NthIndexCache::default(),
caches: Default::default(),
}
}
}
Expand All @@ -170,7 +170,7 @@ impl<'a, 'b> Iterator for Select<'a, 'b> {
if self.selector.matches_with_scope_and_cache(
&element,
Some(self.scope),
&mut self.nth_index_cache,
&mut self.caches,
) {
return Some(element);
}
Expand Down
37 changes: 18 additions & 19 deletions scraper/src/html/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,12 +10,14 @@ use ego_tree::Tree;
use html5ever::serialize::SerializeOpts;
use html5ever::tree_builder::QuirksMode;
use html5ever::{driver, serialize, QualName};
use selectors::NthIndexCache;
use selectors::matching::SelectorCaches;
use tendril::TendrilSink;

use crate::selector::Selector;
use crate::{ElementRef, Node};

pub use tree_sink::HtmlTreeSink;

/// An HTML tree.
///
/// Parsing does not fail hard. Instead, the `quirks_mode` is set and errors are added to the
Expand Down Expand Up @@ -67,22 +69,23 @@ impl Html {
/// # fn main() {
/// # let document = "";
/// use html5ever::driver::{self, ParseOpts};
/// use scraper::Html;
/// use scraper::{Html, HtmlTreeSink};
/// use tendril::TendrilSink;
///
/// let parser = driver::parse_document(Html::new_document(), ParseOpts::default());
/// let parser = driver::parse_document(HtmlTreeSink::new(Html::new_document()), ParseOpts::default());
/// let html = parser.one(document);
/// # }
/// ```
pub fn parse_document(document: &str) -> Self {
let parser = driver::parse_document(Self::new_document(), Default::default());
let parser =
driver::parse_document(HtmlTreeSink::new(Self::new_document()), Default::default());
parser.one(document)
}

/// Parses a string of HTML as a fragment.
pub fn parse_fragment(fragment: &str) -> Self {
let parser = driver::parse_fragment(
Self::new_fragment(),
HtmlTreeSink::new(Self::new_fragment()),
Default::default(),
QualName::new(None, ns!(html), local_name!("body")),
Vec::new(),
Expand All @@ -95,7 +98,7 @@ impl Html {
Select {
inner: self.tree.nodes(),
selector,
nth_index_cache: NthIndexCache::default(),
caches: Default::default(),
}
}

Expand Down Expand Up @@ -127,15 +130,15 @@ impl Html {
pub struct Select<'a, 'b> {
inner: Nodes<'a, Node>,
selector: &'b Selector,
nth_index_cache: NthIndexCache,
caches: SelectorCaches,
}

impl fmt::Debug for Select<'_, '_> {
fn fmt(&self, fmt: &mut fmt::Formatter<'_>) -> fmt::Result {
fmt.debug_struct("Select")
.field("inner", &self.inner)
.field("selector", &self.selector)
.field("nth_index_cache", &"..")
.field("caches", &"..")
.finish()
}
}
Expand All @@ -145,7 +148,7 @@ impl Clone for Select<'_, '_> {
Self {
inner: self.inner.clone(),
selector: self.selector,
nth_index_cache: NthIndexCache::default(),
caches: Default::default(),
}
}
}
Expand All @@ -157,11 +160,9 @@ impl<'a, 'b> Iterator for Select<'a, 'b> {
for node in self.inner.by_ref() {
if let Some(element) = ElementRef::wrap(node) {
if element.parent().is_some()
&& self.selector.matches_with_scope_and_cache(
&element,
None,
&mut self.nth_index_cache,
)
&& self
.selector
.matches_with_scope_and_cache(&element, None, &mut self.caches)
{
return Some(element);
}
Expand All @@ -182,11 +183,9 @@ impl<'a, 'b> DoubleEndedIterator for Select<'a, 'b> {
for node in self.inner.by_ref().rev() {
if let Some(element) = ElementRef::wrap(node) {
if element.parent().is_some()
&& self.selector.matches_with_scope_and_cache(
&element,
None,
&mut self.nth_index_cache,
)
&& self
.selector
.matches_with_scope_and_cache(&element, None, &mut self.caches)
{
return Some(element);
}
Expand Down
Loading

0 comments on commit 2ede12e

Please sign in to comment.