soup_kuchiki/
lib.rs

//! Inspired by the Python library "BeautifulSoup," `soup` is a layer on top of
//! `html5ever` that aims to provide a slightly different API for querying &
//! manipulating HTML.
4//!
5//! # Examples (inspired by bs4's docs)
6//!
7//! Here is the HTML document we will be using for the rest of the examples:
8//!
9//! ```
10//! const THREE_SISTERS: &'static str = r#"
11//! <html><head><title>The Dormouse's story</title></head>
12//! <body>
13//! <p class="title"><b>The Dormouse's story</b></p>
14//!
15//! <p class="story">Once upon a time there were three little sisters; and their names were
16//! <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
17//! <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
18//! <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
19//! and they lived at the bottom of a well.</p>
20//!
21//! <p class="story">...</p>
22//! "#;
23//! # fn main() {}
24//! ```
25//!
26//! First let's try searching for a tag with a specific name:
27//!
28//! ```
29//! # extern crate soup;
30//! # const THREE_SISTERS: &'static str = r#"
31//! # <html><head><title>The Dormouse's story</title></head>
32//! # <body>
33//! # <p class="title"><b>The Dormouse's story</b></p>
34//! #
35//! # <p class="story">Once upon a time there were three little sisters; and their names were
36//! # <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
37//! # <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
38//! # <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
39//! # and they lived at the bottom of a well.</p>
40//! #
41//! # <p class="story">...</p>
42//! # "#;
43//! # fn main() {
44//! use soup::prelude::*;
45//!
46//! let soup = Soup::new(THREE_SISTERS);
47//!
48//! let title = soup.tag("title").find().expect("Couldn't find tag 'title'");
49//! assert_eq!(title.display(), "<title>The Dormouse's story</title>");
50//! assert_eq!(title.name(), "title");
51//! assert_eq!(title.text(), "The Dormouse's story".to_string());
52//! assert_eq!(title.parent().expect("Couldn't find parent of 'title'").name(), "head");
53//!
54//! let p = soup.tag("p").find().expect("Couldn't find tag 'p'");
55//! assert_eq!(
56//!     p.display(),
57//!     r#"<p class="title"><b>The Dormouse's story</b></p>"#
58//! );
59//! assert_eq!(p.get("class"), Some("title".to_string()));
60//! # }
61//! ```
62//!
63//! So we see that `.find` will give us the first element that matches the
64//! query, and we've seen some of the methods that we can call on the results.
65//! But what if we want to retrieve more than one element with the query? For
66//! that, we'll use `.find_all`:
67//!
68//! ```
69//! # extern crate soup;
70//! # use soup::prelude::*;
71//! # const THREE_SISTERS: &'static str = r#"
72//! # <html><head><title>The Dormouse's story</title></head>
73//! # <body>
74//! # <p class="title"><b>The Dormouse's story</b></p>
75//! #
76//! # <p class="story">Once upon a time there were three little sisters; and their names were
77//! # <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
78//! # <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
79//! # <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
80//! # and they lived at the bottom of a well.</p>
81//! #
82//! # <p class="story">...</p>
83//! # "#;
84//! # fn main() {
85//! # let soup = Soup::new(THREE_SISTERS);
86//! // .find returns only the first 'a' tag
87//! let a = soup.tag("a").find().expect("Couldn't find tag 'a'");
88//! assert_eq!(
89//!     a.display(),
90//!     r#"<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>"#
91//! );
92//! // but .find_all will return _all_ of them:
93//! let a_s = soup.tag("a").find_all();
94//! assert_eq!(
95//!     a_s.map(|a| a.display())
96//!        .collect::<Vec<_>>()
97//!        .join("\n"),
98//!     r#"<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
99//! <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
100//! <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>"#
101//! );
102//! # }
103//! ```
104//!
105//! Since `.find_all` returns an iterator, you can use it with all the methods
106//! you would use with other iterators:
107//!
108//! ```
109//! # extern crate soup;
110//! # use soup::prelude::*;
111//! # const THREE_SISTERS: &'static str = r#"
112//! # <html><head><title>The Dormouse's story</title></head>
113//! # <body>
114//! # <p class="title"><b>The Dormouse's story</b></p>
115//! #
116//! # <p class="story">Once upon a time there were three little sisters; and their names were
117//! # <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
118//! # <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
119//! # <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
120//! # and they lived at the bottom of a well.</p>
121//! #
122//! # <p class="story">...</p>
123//! # "#;
124//! # fn main() {
125//! # let soup = Soup::new(THREE_SISTERS);
126//! let expected = [
127//!     "http://example.com/elsie",
128//!     "http://example.com/lacie",
129//!     "http://example.com/tillie",
130//! ];
131//!
132//! for (i, link) in soup.tag("a").find_all().enumerate() {
133//!     let href = link.get("href").expect("Couldn't find link with 'href' attribute");
134//!     assert_eq!(href, expected[i].to_string());
135//! }
136//! # }
137//! ```
138//!
139//! The top-level structure we've been working with here, `soup`, implements the
140//! same methods that the query results do, so you can call the same methods on
141//! it and it will delegate the calls to the root node:
142//!
143//! ```
144//! # extern crate soup;
145//! # use soup::prelude::*;
146//! # const THREE_SISTERS: &'static str = r#"
147//! # <html><head><title>The Dormouse's story</title></head>
148//! # <body>
149//! # <p class="title"><b>The Dormouse's story</b></p>
150//! #
151//! # <p class="story">Once upon a time there were three little sisters; and their names were
152//! # <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
153//! # <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
154//! # <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
155//! # and they lived at the bottom of a well.</p>
156//! #
157//! # <p class="story">...</p>
158//! # "#;
159//! # fn main() {
160//! # let soup = Soup::new(THREE_SISTERS);
161//! let text = soup.text();
162//! assert_eq!(
163//!     text,
164//!     r#"The Dormouse's story
165//!
166//! The Dormouse's story
167//!
168//! Once upon a time there were three little sisters; and their names were
169//! Elsie,
170//! Lacie and
171//! Tillie;
172//! and they lived at the bottom of a well.
173//!
174//! ...
175//! "#
176//! );
177//! # }
178//! ```
179//!
180//! You can use more than just strings to search for results, such as Regex:
181//!
182//! ```rust
183//! # extern crate regex;
184//! # extern crate soup;
185//! # use soup::prelude::*;
186//! # use std::error::Error;
187//! use regex::Regex;
//! # fn main() -> Result<(), Box<dyn Error>> {
189//!
190//! let soup = Soup::new(r#"<body><p>some text, <b>Some bold text</b></p></body>"#);
191//! let results = soup.tag(Regex::new("^b")?)
192//!                   .find_all()
193//!                   .map(|tag| tag.name().to_string())
194//!                   .collect::<Vec<_>>();
195//! assert_eq!(results, vec!["body".to_string(), "b".to_string()]);
196//! #   Ok(())
197//! # }
198//! ```
199//!
200//! Passing `true` will match everything:
201//!
202//! ```rust
203//! # extern crate soup;
204//! # use soup::prelude::*;
205//! # use std::error::Error;
//! # fn main() -> Result<(), Box<dyn Error>> {
207//!
208//! let soup = Soup::new(r#"<body><p>some text, <b>Some bold text</b></p></body>"#);
209//! let results = soup.tag(true)
210//!                   .find_all()
211//!                   .map(|tag| tag.name().to_string())
212//!                   .collect::<Vec<_>>();
213//! assert_eq!(results, vec![
214//!     "html".to_string(),
215//!     "head".to_string(),
216//!     "body".to_string(),
217//!     "p".to_string(),
218//!     "b".to_string(),
219//! ]);
220//! #   Ok(())
221//! # }
222//! ```
223//!
224//! (also, passing `false` will always return no results, though if that is
225//! useful to you, please let me know)
226//!
227//! So what can you do once you get the result of a query? Well, for one thing,
228//! you can traverse the tree a few different ways. You can ascend the tree:
229//!
230//! ```rust
231//! # extern crate soup;
232//! # use soup::prelude::*;
233//! # use std::error::Error;
//! # fn main() -> Result<(), Box<dyn Error>> {
235//!
236//! let soup = Soup::new(r#"<body><p>some text, <b>Some bold text</b></p></body>"#);
237//! let b = soup.tag("b")
238//!             .find()
239//!             .expect("Couldn't find tag 'b'");
240//! let p = b.parent()
241//!          .expect("Couldn't find parent of 'b'");
242//! assert_eq!(p.name(), "p".to_string());
243//! let body = p.parent()
244//!             .expect("Couldn't find parent of 'p'");
245//! assert_eq!(body.name(), "body".to_string());
246//! #   Ok(())
247//! # }
248//! ```
249//!
250//! Or you can descend it:
251//!
252//! ```rust
253//! # extern crate soup;
254//! # use soup::prelude::*;
255//! # use std::error::Error;
//! # fn main() -> Result<(), Box<dyn Error>> {
257//!
258//! let soup = Soup::new(r#"<body><ul><li>ONE</li><li>TWO</li><li>THREE</li></ul></body>"#);
259//! let ul = soup.tag("ul")
260//!             .find()
261//!             .expect("Couldn't find tag 'ul'");
262//! let mut li_tags = ul.children().filter(|child| child.is_element());
263//! assert_eq!(li_tags.next().map(|tag| tag.text().to_string()), Some("ONE".to_string()));
264//! assert_eq!(li_tags.next().map(|tag| tag.text().to_string()), Some("TWO".to_string()));
265//! assert_eq!(li_tags.next().map(|tag| tag.text().to_string()), Some("THREE".to_string()));
266//! assert!(li_tags.next().is_none());
267//! #   Ok(())
268//! # }
269//! ```
270//!
271//! Or ascend it with an iterator:
272//!
273//! ```rust
274//! # extern crate soup;
275//! # use soup::prelude::*;
276//! # use std::error::Error;
//! # fn main() -> Result<(), Box<dyn Error>> {
278//!
279//! let soup = Soup::new(r#"<body><ul><li>ONE</li><li>TWO</li><li>THREE</li></ul></body>"#);
280//! let li = soup.tag("li").find().expect("Couldn't find tag 'li'");
281//! let mut parents = li.parents();
282//! assert_eq!(parents.next().map(|tag| tag.name().to_string()), Some("ul".to_string()));
283//! assert_eq!(parents.next().map(|tag| tag.name().to_string()), Some("body".to_string()));
284//! assert_eq!(parents.next().map(|tag| tag.name().to_string()), Some("html".to_string()));
285//! assert_eq!(parents.next().map(|tag| tag.name().to_string()), Some("[document]".to_string()));
286//! #   Ok(())
287//! # }
288//! ```
289#![deny(
290    missing_docs,
291    missing_debug_implementations,
292    missing_copy_implementations,
293    trivial_casts,
294    trivial_numeric_casts,
295    unsafe_code,
296    unstable_features,
297    unused_import_braces,
298    unused_qualifications,
299    rust_2018_compatibility,
300    rust_2018_idioms
301)]
302extern crate html5ever;
303extern crate kuchiki;
304#[cfg(feature = "regex")]
305extern crate regex;
306
307use html5ever::tendril::TendrilSink;
308type RcDom = kuchiki::NodeRef;
309
310/// The type of a DOM node
311pub type Handle = kuchiki::NodeRef;
312
313use std::{
314    fmt,
315    io::{self, Read},
316};
317
318/// This module exports all the important types & traits to use `soup`
319/// effectively
320pub mod prelude {
321    pub use crate::{
322        node_ext::{AttributeExt, NodeExt},
323        qb_ext::QueryBuilderExt,
324        Handle,
325        Soup,
326    };
327}
328
329pub use crate::{find::QueryBuilder, node_ext::NodeExt, qb_ext::QueryBuilderExt};
330
331mod attribute;
332mod find;
333mod node_ext;
334pub mod pattern;
335mod qb_ext;
336
337/// Parses HTML & provides methods to query & manipulate the document
338pub struct Soup {
339    handle: RcDom,
340}
341
342impl Soup {
343    /// Create a new `Soup` instance from a string slice
344    ///
345    /// # Example
346    ///
347    /// ```rust
348    /// # extern crate soup;
349    /// # use soup::prelude::*;
350    /// # use std::error::Error;
351    /// # fn main() -> Result<(), Box<Error>> {
352    /// let html = r#"
353    /// <!doctype html>
354    /// <html>
355    ///   <head>
356    ///     <title>page title</title>
357    ///   </head>
358    ///   <body>
359    ///     <h1>Heading</h1>
360    ///     <p>Some text</p>
361    ///     <p>Some more text</p>
362    ///   </body>
363    /// </html>
364    /// "#;
365    ///
366    /// let soup = Soup::new(html);
367    /// #   Ok(())
368    /// # }
369    /// ```
370    pub fn new(html: &str) -> Soup {
371        let dom = kuchiki::parse_html().from_utf8().one(html.as_bytes());
372        Soup {
373            handle: dom,
374        }
375    }
376
377    /// Create a new `Soup` instance from something that implements `Read`
378    ///
379    /// This is good for parsing the output of an HTTP response, for example.
380    ///
381    /// ```rust,no_run
382    /// # extern crate reqwest;
383    /// # extern crate soup;
384    /// # use std::error::Error;
385    /// use soup::prelude::*;
386    ///
387    /// # fn main() -> Result<(), Box<Error>> {
388    /// let response = reqwest::get("https://docs.rs/soup")?;
389    /// let soup = Soup::from_reader(response)?;
390    /// #   Ok(())
391    /// # }
392    /// ```
393    pub fn from_reader<R: Read>(mut reader: R) -> io::Result<Soup> {
394        let dom = kuchiki::parse_html().from_utf8().read_from(&mut reader)?;
395        Ok(Soup {
396            handle: dom,
397        })
398    }
399
400    /// Extracts all text from the HTML
401    pub fn text(&self) -> String {
402        self.handle.text()
403    }
404}
405
406impl From<RcDom> for Soup {
407    fn from(rc: RcDom) -> Soup {
408        Soup {
409            handle: rc,
410        }
411    }
412}
413
414impl fmt::Debug for Soup {
415    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
416        write!(f, "{}", self.handle.text())
417    }
418}
419
#[cfg(test)]
mod tests {
    use super::*;

    // Small fixture document: a title plus two paragraphs to query against.
    // (`'static` is implied for consts, so it is not spelled out.)
    const TEST_HTML_STRING: &str = r#"
<!doctype html>
<html>
  <head>
    <title>foo</title>
  </head>
  <body>
    <p>One</p>
    <p>Two</p>
  </body>
</html>
"#;

    // `find` should yield the first `<p>` of the fixture.
    #[test]
    fn find() {
        let soup = Soup::new(TEST_HTML_STRING);
        let result = soup.tag("p").find().expect("Couldn't find tag 'p'");
        // `String` compares directly against `&str`; no allocation needed.
        assert_eq!(result.text(), "One");
    }

    // `find_all` should yield every `<p>` of the fixture, in document order.
    #[test]
    fn find_all() {
        let soup = Soup::new(TEST_HTML_STRING);
        let result: Vec<_> = soup.tag("p").find_all().map(|p| p.text()).collect();
        assert_eq!(result, ["One", "Two"]);
    }
}
454}