From d4d1a9957e36dc617cef59532f4b315d44caf514 Mon Sep 17 00:00:00 2001 From: hwup <145976670+hwup@users.noreply.github.com> Date: Fri, 20 Dec 2024 14:32:03 +0200 Subject: [PATCH 01/10] Fix README symlink --- scraper/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scraper/README.md b/scraper/README.md index 3dfb7d79..32d46ee8 120000 --- a/scraper/README.md +++ b/scraper/README.md @@ -1 +1 @@ -./../README.md \ No newline at end of file +../README.md \ No newline at end of file From 15de83a94fb5caaa88fc6853e8d0ca52119590a4 Mon Sep 17 00:00:00 2001 From: Max Heller Date: Wed, 1 Jan 2025 12:34:48 -0500 Subject: [PATCH 02/10] Make `HtmlTreeSink`'s field public --- scraper/src/html/tree_sink.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scraper/src/html/tree_sink.rs b/scraper/src/html/tree_sink.rs index 49b30b49..8af46156 100644 --- a/scraper/src/html/tree_sink.rs +++ b/scraper/src/html/tree_sink.rs @@ -11,7 +11,7 @@ use std::cell::{Ref, RefCell}; /// Wraps `Html` instances as sinks to drive parsing #[derive(Debug)] -pub struct HtmlTreeSink(RefCell); +pub struct HtmlTreeSink(pub RefCell); impl HtmlTreeSink { /// Wrap a `Html`instance as a sink to drive parsing From cbe480a4e9745e5584dbe0d6587d3ef9c5f26bd6 Mon Sep 17 00:00:00 2001 From: Carlo Federico Vescovo <26970569+cfvescovo@users.noreply.github.com> Date: Sat, 11 Jan 2025 11:41:37 +0100 Subject: [PATCH 03/10] Update LICENSE info --- LICENSE | 1 + 1 file changed, 1 insertion(+) diff --git a/LICENSE b/LICENSE index 3c787528..bb793d8a 100644 --- a/LICENSE +++ b/LICENSE @@ -1,5 +1,6 @@ Copyright © 2016, June McEnroe Copyright © 2017, Vivek Kushwaha +Copyright © 2024-2025, rust-scraper Contributors Permission to use, copy, modify, and/or distribute this software for any purpose with or without fee is hereby granted, provided that the above From 638d731f55d729b0d1e89213635f47625014ab7d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 20 Jan 2025 06:29:21 +0000 Subject: [PATCH 04/10] Bump indexmap from 2.7.0 to 2.7.1 Bumps [indexmap](https://github.com/indexmap-rs/indexmap) from 2.7.0 to 2.7.1. - [Changelog](https://github.com/indexmap-rs/indexmap/blob/master/RELEASES.md) - [Commits](https://github.com/indexmap-rs/indexmap/compare/2.7.0...2.7.1) --- updated-dependencies: - dependency-name: indexmap dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 4 ++-- scraper/Cargo.toml | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 5937ff36..0b1eafcc 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -148,9 +148,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.7.0" +version = "2.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "62f822373a4fe84d4bb149bf54e584a7f4abec90e072ed49cda0edea5b95471f" +checksum = "8c9c992b02b5b4c94ea26e32fe5bccb7aa7d9f390ab5c1221ff895bc7ea8b652" dependencies = [ "equivalent", "hashbrown", diff --git a/scraper/Cargo.toml b/scraper/Cargo.toml index 0144f3a0..2b3392a6 100644 --- a/scraper/Cargo.toml +++ b/scraper/Cargo.toml @@ -16,7 +16,7 @@ readme = "README.md" cssparser = "0.34.0" ego-tree = "0.10.0" html5ever = "0.29.0" -indexmap = { version = "2.7.0", optional = true } +indexmap = { version = "2.7.1", optional = true } precomputed-hash = "0.1.1" selectors = "0.26.0" tendril = "0.4.3" From 1a5533780655eeecca4c7c6e37ea146390c86ae8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jakub=20Slab=C3=BD?= <76100262+jakubslaby09@users.noreply.github.com> Date: Sun, 2 Feb 2025 09:16:29 +0100 Subject: [PATCH 05/10] Add a serde feature for (de)serializing Selectors (#227) Add a serde feature for serializing Selectors using CSS syntax --- Cargo.lock | 1 + scraper/Cargo.toml | 2 ++ scraper/src/selector.rs | 33 +++++++++++++++++++++++++++++++++ 3 files changed, 36 insertions(+) diff --git a/Cargo.lock b/Cargo.lock index 0b1eafcc..21fc2d34 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -399,6 +399,7 @@ dependencies = [ "indexmap", "precomputed-hash", "selectors", + "serde", "tendril", ] diff --git a/scraper/Cargo.toml b/scraper/Cargo.toml index 2b3392a6..39e8d2fa 100644 --- a/scraper/Cargo.toml +++ b/scraper/Cargo.toml @@ -19,6 +19,7 @@ html5ever = "0.29.0" indexmap = { version = "2.7.1", optional = true } precomputed-hash = "0.1.1" selectors = "0.26.0" +serde = { version = "1.0.215", optional = true } tendril = "0.4.3" [dependencies.getopts] @@ -31,6 +32,7 @@ deterministic = ["indexmap"] main = ["getopts"] atomic = [] errors = [] +serde = ["dep:serde"] [[bin]] name = "scraper" diff --git a/scraper/src/selector.rs b/scraper/src/selector.rs index 4f76c2b8..a8f23e46 100644 --- a/scraper/src/selector.rs +++ b/scraper/src/selector.rs @@ -11,6 +11,9 @@ use selectors::{ parser::{self, ParseRelative, SelectorList, SelectorParseErrorKind}, }; +#[cfg(feature = "serde")] +use serde::{de::Visitor, Deserialize, Serialize}; + use crate::error::SelectorErrorKind; use crate::ElementRef; @@ -80,6 +83,36 @@ impl ToCss for Selector { } } +#[cfg(feature = "serde")] +impl Serialize for Selector { + fn serialize(&self, serializer: S) -> Result { + serializer.serialize_str(&self.to_css_string()) + } +} + +#[cfg(feature = "serde")] +impl<'de> Deserialize<'de> for Selector { + fn deserialize>(deserializer: D) -> Result { + deserializer.deserialize_str(SelectorVisitor) + } +} + +#[cfg(feature = "serde")] +struct SelectorVisitor; + +#[cfg(feature = "serde")] +impl Visitor<'_> for SelectorVisitor { + type Value = Selector; + + fn expecting(&self, formatter: &mut fmt::Formatter) -> fmt::Result { + write!(formatter, "a css selector string") + } + + fn visit_str(self, v: &str) -> Result { + Selector::parse(v).map_err(serde::de::Error::custom) + } +} + /// An implementation of `Parser` for `selectors` #[derive(Clone, Copy, Debug)] pub struct Parser; From 11cca40454e3e7bfb8cb99a1f9ac1a1c838884e3 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 3 Feb 2025 06:43:24 +0000 Subject: [PATCH 06/10] Bump serde from 1.0.215 to 1.0.217 Bumps [serde](https://github.com/serde-rs/serde) from 1.0.215 to 1.0.217. - [Release notes](https://github.com/serde-rs/serde/releases) - [Commits](https://github.com/serde-rs/serde/compare/v1.0.215...v1.0.217) --- updated-dependencies: - dependency-name: serde dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 8 ++++---- scraper/Cargo.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 21fc2d34..a16161ce 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -424,18 +424,18 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.215" +version = "1.0.217" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6513c1ad0b11a9376da888e3e0baa0077f1aed55c17f50e7b2397136129fb88f" +checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.215" +version = "1.0.217" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad1e866f866923f252f05c889987993144fb74e722403468a4ebd70c3cd756c0" +checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" dependencies = [ "proc-macro2", "quote", diff --git a/scraper/Cargo.toml b/scraper/Cargo.toml index 39e8d2fa..e1b5b8a1 100644 --- a/scraper/Cargo.toml +++ b/scraper/Cargo.toml @@ -19,7 +19,7 @@ html5ever = "0.29.0" indexmap = { version = "2.7.1", optional = true } precomputed-hash = "0.1.1" selectors = "0.26.0" -serde = { version = "1.0.215", optional = true } +serde = { version = "1.0.217", optional = true } tendril = "0.4.3" [dependencies.getopts] From 97a987e31a11f209ee67853b78152eb6b6f0059d Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Mon, 24 Feb 2025 06:12:21 +0000 Subject: [PATCH 07/10] Bump serde from 1.0.217 to 1.0.218 Bumps [serde](https://github.com/serde-rs/serde) from 1.0.217 to 1.0.218. - [Release notes](https://github.com/serde-rs/serde/releases) - [Commits](https://github.com/serde-rs/serde/compare/v1.0.217...v1.0.218) --- updated-dependencies: - dependency-name: serde dependency-type: direct:production update-type: version-update:semver-patch ... Signed-off-by: dependabot[bot] --- Cargo.lock | 8 ++++---- scraper/Cargo.toml | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index a16161ce..e7106949 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -424,18 +424,18 @@ dependencies = [ [[package]] name = "serde" -version = "1.0.217" +version = "1.0.218" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70" +checksum = "e8dfc9d19bdbf6d17e22319da49161d5d0108e4188e8b680aef6299eed22df60" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.217" +version = "1.0.218" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0" +checksum = "f09503e191f4e797cb8aac08e9a4a4695c5edf6a2e70e376d961ddd5c969f82b" dependencies = [ "proc-macro2", "quote", diff --git a/scraper/Cargo.toml b/scraper/Cargo.toml index e1b5b8a1..55d937d9 100644 --- a/scraper/Cargo.toml +++ b/scraper/Cargo.toml @@ -19,7 +19,7 @@ html5ever = "0.29.0" indexmap = { version = "2.7.1", optional = true } precomputed-hash = "0.1.1" selectors = "0.26.0" -serde = { version = "1.0.217", optional = true } +serde = { version = "1.0.218", optional = true } tendril = "0.4.3" [dependencies.getopts] From 08afce2b3fc6bbd2cb50047b0a74861edffee17c Mon Sep 17 00:00:00 2001 From: Carlo Federico Vescovo Date: Mon, 24 Feb 2025 10:06:26 +0100 Subject: [PATCH 08/10] Version 0.23.0 --- Cargo.lock | 2 +- scraper/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index e7106949..b854ef2b 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -390,7 +390,7 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "scraper" -version = "0.22.0" +version = "0.23.0" dependencies = [ "cssparser", "ego-tree", diff --git a/scraper/Cargo.toml b/scraper/Cargo.toml index 55d937d9..6c53b45d 100644 --- a/scraper/Cargo.toml +++ b/scraper/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "scraper" -version = "0.22.0" +version = "0.23.0" edition = "2021" description = "HTML parsing and querying with CSS selectors" From 9bd68689a918c36f0e18daa63be56a2373b08650 Mon Sep 17 00:00:00 2001 From: Carlo Federico Vescovo Date: Mon, 24 Feb 2025 10:14:57 +0100 Subject: [PATCH 09/10] Fix README.md for crates.io --- README.md | 153 +--------------------------------------------- scraper/README.md | 153 +++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 153 insertions(+), 153 deletions(-) mode change 100644 => 120000 README.md diff --git a/README.md b/README.md deleted file mode 100644 index 39450ec2..00000000 --- a/README.md +++ /dev/null @@ -1,152 +0,0 @@ -# scraper - -[![crates.io](https://img.shields.io/crates/v/scraper?color=dark-green)][crate] -[![downloads](https://img.shields.io/crates/d/scraper)][crate] -[![test](https://github.com/causal-agent/scraper/actions/workflows/test.yml/badge.svg)][tests] - -HTML parsing and querying with CSS selectors. - -`scraper` is on [Crates.io][crate] and [GitHub][github]. - -[crate]: https://crates.io/crates/scraper -[github]: https://github.com/causal-agent/scraper -[tests]: https://github.com/causal-agent/scraper/actions/workflows/test.yml - -Scraper provides an interface to Servo's `html5ever` and `selectors` crates, for browser-grade parsing and querying. - -## Examples - -### Parsing a document - -```rust -use scraper::Html; - -let html = r#" - - - Hello, world! -

Hello, world!

-"#; - -let document = Html::parse_document(html); -``` - -### Parsing a fragment - -```rust -use scraper::Html; -let fragment = Html::parse_fragment("

Hello, world!

"); -``` - -### Parsing a selector - -```rust -use scraper::Selector; -let selector = Selector::parse("h1.foo").unwrap(); -``` - -### Selecting elements - -```rust -use scraper::{Html, Selector}; - -let html = r#" -
    -
  • Foo
  • -
  • Bar
  • -
  • Baz
  • -
-"#; - -let fragment = Html::parse_fragment(html); -let selector = Selector::parse("li").unwrap(); - -for element in fragment.select(&selector) { - assert_eq!("li", element.value().name()); -} -``` - -### Selecting descendent elements - -```rust -use scraper::{Html, Selector}; - -let html = r#" -
    -
  • Foo
  • -
  • Bar
  • -
  • Baz
  • -
-"#; - -let fragment = Html::parse_fragment(html); -let ul_selector = Selector::parse("ul").unwrap(); -let li_selector = Selector::parse("li").unwrap(); - -let ul = fragment.select(&ul_selector).next().unwrap(); -for element in ul.select(&li_selector) { - assert_eq!("li", element.value().name()); -} -``` - -### Accessing element attributes - -```rust -use scraper::{Html, Selector}; - -let fragment = Html::parse_fragment(r#""#); -let selector = Selector::parse(r#"input[name="foo"]"#).unwrap(); - -let input = fragment.select(&selector).next().unwrap(); -assert_eq!(Some("bar"), input.value().attr("value")); -``` - -### Serializing HTML and inner HTML - -```rust -use scraper::{Html, Selector}; - -let fragment = Html::parse_fragment("

Hello, world!

"); -let selector = Selector::parse("h1").unwrap(); - -let h1 = fragment.select(&selector).next().unwrap(); - -assert_eq!("

Hello, world!

", h1.html()); -assert_eq!("Hello, world!", h1.inner_html()); -``` - -### Accessing descendent text - -```rust -use scraper::{Html, Selector}; - -let fragment = Html::parse_fragment("

Hello, world!

"); -let selector = Selector::parse("h1").unwrap(); - -let h1 = fragment.select(&selector).next().unwrap(); -let text = h1.text().collect::>(); - -assert_eq!(vec!["Hello, ", "world!"], text); -``` - -### Manipulating the DOM - -```rust -use html5ever::tree_builder::TreeSink; -use scraper::{Html, Selector}; - -let html = "hello

REMOVE ME

"; -let selector = Selector::parse(".hello").unwrap(); -let mut document = Html::parse_document(html); -let node_ids: Vec<_> = document.select(&selector).map(|x| x.id()).collect(); -for id in node_ids { - document.remove_from_parent(&id); -} -assert_eq!(document.html(), "hello"); -``` - -## Contributing - -Please feel free to open pull requests. If you're planning on implementing -something big (i.e. not fixing a typo, a small bug fix, minor refactor, etc) -then please open an issue first. diff --git a/README.md b/README.md new file mode 120000 index 00000000..a6541ddb --- /dev/null +++ b/README.md @@ -0,0 +1 @@ +scraper/README.md \ No newline at end of file diff --git a/scraper/README.md b/scraper/README.md index 32d46ee8..39450ec2 120000 --- a/scraper/README.md +++ b/scraper/README.md @@ -1 +1,152 @@ -../README.md \ No newline at end of file +# scraper + +[![crates.io](https://img.shields.io/crates/v/scraper?color=dark-green)][crate] +[![downloads](https://img.shields.io/crates/d/scraper)][crate] +[![test](https://github.com/causal-agent/scraper/actions/workflows/test.yml/badge.svg)][tests] + +HTML parsing and querying with CSS selectors. + +`scraper` is on [Crates.io][crate] and [GitHub][github]. + +[crate]: https://crates.io/crates/scraper +[github]: https://github.com/causal-agent/scraper +[tests]: https://github.com/causal-agent/scraper/actions/workflows/test.yml + +Scraper provides an interface to Servo's `html5ever` and `selectors` crates, for browser-grade parsing and querying. + +## Examples + +### Parsing a document + +```rust +use scraper::Html; + +let html = r#" + + + Hello, world! +

Hello, world!

+"#; + +let document = Html::parse_document(html); +``` + +### Parsing a fragment + +```rust +use scraper::Html; +let fragment = Html::parse_fragment("

Hello, world!

"); +``` + +### Parsing a selector + +```rust +use scraper::Selector; +let selector = Selector::parse("h1.foo").unwrap(); +``` + +### Selecting elements + +```rust +use scraper::{Html, Selector}; + +let html = r#" +
    +
  • Foo
  • +
  • Bar
  • +
  • Baz
  • +
+"#; + +let fragment = Html::parse_fragment(html); +let selector = Selector::parse("li").unwrap(); + +for element in fragment.select(&selector) { + assert_eq!("li", element.value().name()); +} +``` + +### Selecting descendent elements + +```rust +use scraper::{Html, Selector}; + +let html = r#" +
    +
  • Foo
  • +
  • Bar
  • +
  • Baz
  • +
+"#; + +let fragment = Html::parse_fragment(html); +let ul_selector = Selector::parse("ul").unwrap(); +let li_selector = Selector::parse("li").unwrap(); + +let ul = fragment.select(&ul_selector).next().unwrap(); +for element in ul.select(&li_selector) { + assert_eq!("li", element.value().name()); +} +``` + +### Accessing element attributes + +```rust +use scraper::{Html, Selector}; + +let fragment = Html::parse_fragment(r#""#); +let selector = Selector::parse(r#"input[name="foo"]"#).unwrap(); + +let input = fragment.select(&selector).next().unwrap(); +assert_eq!(Some("bar"), input.value().attr("value")); +``` + +### Serializing HTML and inner HTML + +```rust +use scraper::{Html, Selector}; + +let fragment = Html::parse_fragment("

Hello, world!

"); +let selector = Selector::parse("h1").unwrap(); + +let h1 = fragment.select(&selector).next().unwrap(); + +assert_eq!("

Hello, world!

", h1.html()); +assert_eq!("Hello, world!", h1.inner_html()); +``` + +### Accessing descendent text + +```rust +use scraper::{Html, Selector}; + +let fragment = Html::parse_fragment("

Hello, world!

"); +let selector = Selector::parse("h1").unwrap(); + +let h1 = fragment.select(&selector).next().unwrap(); +let text = h1.text().collect::>(); + +assert_eq!(vec!["Hello, ", "world!"], text); +``` + +### Manipulating the DOM + +```rust +use html5ever::tree_builder::TreeSink; +use scraper::{Html, Selector}; + +let html = "hello

REMOVE ME

"; +let selector = Selector::parse(".hello").unwrap(); +let mut document = Html::parse_document(html); +let node_ids: Vec<_> = document.select(&selector).map(|x| x.id()).collect(); +for id in node_ids { + document.remove_from_parent(&id); +} +assert_eq!(document.html(), "hello"); +``` + +## Contributing + +Please feel free to open pull requests. If you're planning on implementing +something big (i.e. not fixing a typo, a small bug fix, minor refactor, etc) +then please open an issue first. From d4a9eaf6f102c03e88c92bb2e2481a61ad7d2502 Mon Sep 17 00:00:00 2001 From: Carlo Federico Vescovo Date: Mon, 24 Feb 2025 10:15:22 +0100 Subject: [PATCH 10/10] Version 0.23.1 --- Cargo.lock | 2 +- scraper/Cargo.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index b854ef2b..50df5256 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -390,7 +390,7 @@ checksum = "94143f37725109f92c262ed2cf5e59bce7498c01bcc1502d7b9afe439a4e9f49" [[package]] name = "scraper" -version = "0.23.0" +version = "0.23.1" dependencies = [ "cssparser", "ego-tree", diff --git a/scraper/Cargo.toml b/scraper/Cargo.toml index 6c53b45d..b88d6f49 100644 --- a/scraper/Cargo.toml +++ b/scraper/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "scraper" -version = "0.23.0" +version = "0.23.1" edition = "2021" description = "HTML parsing and querying with CSS selectors"