diff --git a/Cargo.lock b/Cargo.lock index 5937ff36..50df5256 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -148,9 +148,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.7.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f" +checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652" dependencies = [ "equivalent", "hashbrown", @@ -390,7 +390,7 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "scraper" -version = "0.22.0" +version = "0.23.1" dependencies = [ "cssparser", "ego-tree", @@ -399,6 +399,7 @@ dependencies = [ "indexmap", "precomputed-hash", "selectors", + "serde", "tendril", ] @@ -423,18 +424,18 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.215" +version = "1.0.218" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f" +checksum = "e8dfc9d19bdbf6d17e22319da49161d5d0108e4188e8b680aef6299eed22df60" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.215" +version = "1.0.218" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" +checksum = "f09503e191f4e797cb8aac08e9a4a4695c5edf6a2e70e376d961ddd5c969f82b" dependencies = [ "proc-macro2", "quote", diff --git a/LICENSE b/LICENSE index 3c787528..bb793d8a 100644 --- a/LICENSE +++ b/LICENSE @@ -1,5 +1,6 @@ Copyright © 2016, June McEnroe Copyright © 2017, Vivek Kushwaha +Copyright © 2024-2025, rust-scraper Contributors Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above diff --git a/README.md b/README.md deleted file mode 100644 index 39450ec2..00000000 --- a/README.md +++ /dev/null @@ -1,152 +0,0 @@ -# 
scraper - -[![crates.io](https://img.shields.io/crates/v/scraper?color=dark-green)][crate] -[![downloads](https://img.shields.io/crates/d/scraper)][crate] -[![test](https://github.com/causal-agent/scraper/actions/workflows/test.yml/badge.svg)][tests] - -HTML parsing and querying with CSS selectors. - -`scraper` is on [Crates.io][crate] and [GitHub][github]. - -[crate]: https://crates.io/crates/scraper -[github]: https://github.com/causal-agent/scraper -[tests]: https://github.com/causal-agent/scraper/actions/workflows/test.yml - -Scraper provides an interface to Servo's `html5ever` and `selectors` crates, for browser-grade parsing and querying. - -## Examples - -### Parsing a document - -```rust -use scraper::Html; - -let html = r#" - - - Hello, world! -
-    <h1 class="foo">Hello, <i>world!</i></h1>
-"#; - -let document = Html::parse_document(html); -``` - -### Parsing a fragment - -```rust -use scraper::Html; -let fragment = Html::parse_fragment("
<h1>Hello, <i>world!</i></h1>
"); -``` - -### Parsing a selector - -```rust -use scraper::Selector; -let selector = Selector::parse("h1.foo").unwrap(); -``` - -### Selecting elements - -```rust -use scraper::{Html, Selector}; - -let html = r#" - -"#; - -let fragment = Html::parse_fragment(html); -let selector = Selector::parse("li").unwrap(); - -for element in fragment.select(&selector) { - assert_eq!("li", element.value().name()); -} -``` - -### Selecting descendent elements - -```rust -use scraper::{Html, Selector}; - -let html = r#" - -"#; - -let fragment = Html::parse_fragment(html); -let ul_selector = Selector::parse("ul").unwrap(); -let li_selector = Selector::parse("li").unwrap(); - -let ul = fragment.select(&ul_selector).next().unwrap(); -for element in ul.select(&li_selector) { - assert_eq!("li", element.value().name()); -} -``` - -### Accessing element attributes - -```rust -use scraper::{Html, Selector}; - -let fragment = Html::parse_fragment(r#""#); -let selector = Selector::parse(r#"input[name="foo"]"#).unwrap(); - -let input = fragment.select(&selector).next().unwrap(); -assert_eq!(Some("bar"), input.value().attr("value")); -``` - -### Serializing HTML and inner HTML - -```rust -use scraper::{Html, Selector}; - -let fragment = Html::parse_fragment("
<h1>Hello, <i>world!</i></h1>
"); -let selector = Selector::parse("h1").unwrap(); - -let h1 = fragment.select(&selector).next().unwrap(); - -assert_eq!("
<h1>Hello, <i>world!</i></h1>
", h1.html()); -assert_eq!("Hello, world!", h1.inner_html()); -``` - -### Accessing descendent text - -```rust -use scraper::{Html, Selector}; - -let fragment = Html::parse_fragment("
<h1>Hello, <i>world!</i></h1>
"); -let selector = Selector::parse("h1").unwrap(); - -let h1 = fragment.select(&selector).next().unwrap(); -let text = h1.text().collect::>(); - -assert_eq!(vec!["Hello, ", "world!"], text); -``` - -### Manipulating the DOM - -```rust -use html5ever::tree_builder::TreeSink; -use scraper::{Html, Selector}; - -let html = "hello
<p class=\"hello\">REMOVE ME</p></body></html>
"; -let selector = Selector::parse(".hello").unwrap(); -let mut document = Html::parse_document(html); -let node_ids: Vec<_> = document.select(&selector).map(|x| x.id()).collect(); -for id in node_ids { - document.remove_from_parent(&id); -} -assert_eq!(document.html(), "hello"); -``` - -## Contributing - -Please feel free to open pull requests. If you're planning on implementing -something big (i.e. not fixing a typo, a small bug fix, minor refactor, etc) -then please open an issue first. diff --git a/README.md b/README.md new file mode 120000 index 00000000..a6541ddb --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +scraper/README.md \ No newline at end of file diff --git a/scraper/Cargo.toml b/scraper/Cargo.toml index 0144f3a0..b88d6f49 100644 --- a/scraper/Cargo.toml +++ b/scraper/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "scraper" -version = "0.22.0" +version = "0.23.1" edition = "2021" description = "HTML parsing and querying with CSS selectors" @@ -16,9 +16,10 @@ readme = "README.md" cssparser = "0.34.0" ego-tree = "0.10.0" html5ever = "0.29.0" -indexmap = { version = "2.7.0", optional = true } +indexmap = { version = "2.7.1", optional = true } precomputed-hash = "0.1.1" selectors = "0.26.0" +serde = { version = "1.0.218", optional = true } tendril = "0.4.3" [dependencies.getopts] @@ -31,6 +32,7 @@ deterministic = ["indexmap"] main = ["getopts"] atomic = [] errors = [] +serde = ["dep:serde"] [[bin]] name = "scraper" diff --git a/scraper/README.md b/scraper/README.md index 3dfb7d79..39450ec2 120000 --- a/scraper/README.md +++ b/scraper/README.md @@ -1 +1,152 @@ -./../README.md \ No newline at end of file +# scraper + +[![crates.io](https://img.shields.io/crates/v/scraper?color=dark-green)][crate] +[![downloads](https://img.shields.io/crates/d/scraper)][crate] +[![test](https://github.com/causal-agent/scraper/actions/workflows/test.yml/badge.svg)][tests] + +HTML parsing and querying with CSS selectors. + +`scraper` is on [Crates.io][crate] and [GitHub][github]. 
+ +[crate]: https://crates.io/crates/scraper +[github]: https://github.com/causal-agent/scraper +[tests]: https://github.com/causal-agent/scraper/actions/workflows/test.yml + +Scraper provides an interface to Servo's `html5ever` and `selectors` crates, for browser-grade parsing and querying. + +## Examples + +### Parsing a document + +```rust +use scraper::Html; + +let html = r#" + + + Hello, world! +
+    <h1 class="foo">Hello, <i>world!</i></h1>
+"#; + +let document = Html::parse_document(html); +``` + +### Parsing a fragment + +```rust +use scraper::Html; +let fragment = Html::parse_fragment("
<h1>Hello, <i>world!</i></h1>
"); +``` + +### Parsing a selector + +```rust +use scraper::Selector; +let selector = Selector::parse("h1.foo").unwrap(); +``` + +### Selecting elements + +```rust +use scraper::{Html, Selector}; + +let html = r#" +
+    <ul>
+        <li>Foo</li>
+        <li>Bar</li>
+        <li>Baz</li>
+    </ul>
+"#; + +let fragment = Html::parse_fragment(html); +let selector = Selector::parse("li").unwrap(); + +for element in fragment.select(&selector) { + assert_eq!("li", element.value().name()); +} +``` + +### Selecting descendent elements + +```rust +use scraper::{Html, Selector}; + +let html = r#" +
+    <ul>
+        <li>Foo</li>
+        <li>Bar</li>
+        <li>Baz</li>
+    </ul>
+"#; + +let fragment = Html::parse_fragment(html); +let ul_selector = Selector::parse("ul").unwrap(); +let li_selector = Selector::parse("li").unwrap(); + +let ul = fragment.select(&ul_selector).next().unwrap(); +for element in ul.select(&li_selector) { + assert_eq!("li", element.value().name()); +} +``` + +### Accessing element attributes + +```rust +use scraper::{Html, Selector}; + +let fragment = Html::parse_fragment(r#""#); +let selector = Selector::parse(r#"input[name="foo"]"#).unwrap(); + +let input = fragment.select(&selector).next().unwrap(); +assert_eq!(Some("bar"), input.value().attr("value")); +``` + +### Serializing HTML and inner HTML + +```rust +use scraper::{Html, Selector}; + +let fragment = Html::parse_fragment("
<h1>Hello, <i>world!</i></h1>
"); +let selector = Selector::parse("h1").unwrap(); + +let h1 = fragment.select(&selector).next().unwrap(); + +assert_eq!("
<h1>Hello, <i>world!</i></h1>
", h1.html()); +assert_eq!("Hello, world!", h1.inner_html()); +``` + +### Accessing descendent text + +```rust +use scraper::{Html, Selector}; + +let fragment = Html::parse_fragment("
<h1>Hello, <i>world!</i></h1>
"); +let selector = Selector::parse("h1").unwrap(); + +let h1 = fragment.select(&selector).next().unwrap(); +let text = h1.text().collect::>(); + +assert_eq!(vec!["Hello, ", "world!"], text); +``` + +### Manipulating the DOM + +```rust +use html5ever::tree_builder::TreeSink; +use scraper::{Html, Selector}; + +let html = "hello
<p class=\"hello\">REMOVE ME</p></body></html>
";
+let selector = Selector::parse(".hello").unwrap();
+let mut document = Html::parse_document(html);
+let node_ids: Vec<_> = document.select(&selector).map(|x| x.id()).collect();
+for id in node_ids {
+    document.remove_from_parent(&id);
+}
+assert_eq!(document.html(), "<html><head></head><body>hello</body></html>");
+```
+
+## Contributing
+
+Please feel free to open pull requests. If you're planning on implementing
+something big (i.e. not fixing a typo, a small bug fix, minor refactor, etc)
+then please open an issue first.
diff --git a/scraper/src/html/tree_sink.rs b/scraper/src/html/tree_sink.rs
index 49b30b49..8af46156 100644
--- a/scraper/src/html/tree_sink.rs
+++ b/scraper/src/html/tree_sink.rs
@@ -11,7 +11,7 @@ use std::cell::{Ref, RefCell};
 
 /// Wraps `Html` instances as sinks to drive parsing
 #[derive(Debug)]
-pub struct HtmlTreeSink(RefCell<Html>);
+pub struct HtmlTreeSink(pub RefCell<Html>);
 
 impl HtmlTreeSink {
     /// Wrap a `Html`instance as a sink to drive parsing
diff --git a/scraper/src/selector.rs b/scraper/src/selector.rs
index 4f76c2b8..a8f23e46 100644
--- a/scraper/src/selector.rs
+++ b/scraper/src/selector.rs
@@ -11,6 +11,9 @@ use selectors::{
     parser::{self, ParseRelative, SelectorList, SelectorParseErrorKind},
 };
 
+#[cfg(feature = "serde")]
+use serde::{de::Visitor, Deserialize, Serialize};
+
 use crate::error::SelectorErrorKind;
 use crate::ElementRef;
 
@@ -80,6 +83,36 @@ impl ToCss for Selector {
     }
 }
 
+#[cfg(feature = "serde")]
+impl Serialize for Selector {
+    fn serialize<S: serde::Serializer>(&self, serializer: S) -> Result<S::Ok, S::Error> {
+        serializer.serialize_str(&self.to_css_string())
+    }
+}
+
+#[cfg(feature = "serde")]
+impl<'de> Deserialize<'de> for Selector {
+    fn deserialize<D: serde::Deserializer<'de>>(deserializer: D) -> Result<Self, D::Error> {
+        deserializer.deserialize_str(SelectorVisitor)
+    }
+}
+
+#[cfg(feature = "serde")]
+struct SelectorVisitor;
+
+#[cfg(feature = "serde")]
+impl Visitor<'_> for SelectorVisitor {
+    type Value = Selector;
+
+    fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result {
+        write!(formatter, "a css selector string")
+    }
+
+    fn visit_str<E: serde::de::Error>(self, v: &str) -> Result<Self::Value, E> {
+        Selector::parse(v).map_err(serde::de::Error::custom)
+    }
+}
+
 /// An implementation of `Parser` for `selectors`
 #[derive(Clone, Copy, Debug)]
 pub struct Parser;