soup_kuchiki/lib.rs
1//! Inspired by the Python library "BeautifulSoup," `soup` is a layer on top of
2//! `html5ever` that aims to provide a slightly different API for querying &
3//! manipulating HTML
4//!
5//! # Examples (inspired by bs4's docs)
6//!
7//! Here is the HTML document we will be using for the rest of the examples:
8//!
9//! ```
10//! const THREE_SISTERS: &'static str = r#"
11//! <html><head><title>The Dormouse's story</title></head>
12//! <body>
13//! <p class="title"><b>The Dormouse's story</b></p>
14//!
15//! <p class="story">Once upon a time there were three little sisters; and their names were
16//! <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
17//! <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
18//! <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
19//! and they lived at the bottom of a well.</p>
20//!
21//! <p class="story">...</p>
22//! "#;
23//! # fn main() {}
24//! ```
25//!
26//! First let's try searching for a tag with a specific name:
27//!
28//! ```
29//! # extern crate soup;
30//! # const THREE_SISTERS: &'static str = r#"
31//! # <html><head><title>The Dormouse's story</title></head>
32//! # <body>
33//! # <p class="title"><b>The Dormouse's story</b></p>
34//! #
35//! # <p class="story">Once upon a time there were three little sisters; and their names were
36//! # <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
37//! # <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
38//! # <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
39//! # and they lived at the bottom of a well.</p>
40//! #
41//! # <p class="story">...</p>
42//! # "#;
43//! # fn main() {
44//! use soup::prelude::*;
45//!
46//! let soup = Soup::new(THREE_SISTERS);
47//!
48//! let title = soup.tag("title").find().expect("Couldn't find tag 'title'");
49//! assert_eq!(title.display(), "<title>The Dormouse's story</title>");
50//! assert_eq!(title.name(), "title");
51//! assert_eq!(title.text(), "The Dormouse's story".to_string());
52//! assert_eq!(title.parent().expect("Couldn't find parent of 'title'").name(), "head");
53//!
54//! let p = soup.tag("p").find().expect("Couldn't find tag 'p'");
55//! assert_eq!(
56//! p.display(),
57//! r#"<p class="title"><b>The Dormouse's story</b></p>"#
58//! );
59//! assert_eq!(p.get("class"), Some("title".to_string()));
60//! # }
61//! ```
62//!
63//! So we see that `.find` will give us the first element that matches the
64//! query, and we've seen some of the methods that we can call on the results.
65//! But what if we want to retrieve more than one element with the query? For
66//! that, we'll use `.find_all`:
67//!
68//! ```
69//! # extern crate soup;
70//! # use soup::prelude::*;
71//! # const THREE_SISTERS: &'static str = r#"
72//! # <html><head><title>The Dormouse's story</title></head>
73//! # <body>
74//! # <p class="title"><b>The Dormouse's story</b></p>
75//! #
76//! # <p class="story">Once upon a time there were three little sisters; and their names were
77//! # <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
78//! # <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
79//! # <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
80//! # and they lived at the bottom of a well.</p>
81//! #
82//! # <p class="story">...</p>
83//! # "#;
84//! # fn main() {
85//! # let soup = Soup::new(THREE_SISTERS);
86//! // .find returns only the first 'a' tag
87//! let a = soup.tag("a").find().expect("Couldn't find tag 'a'");
88//! assert_eq!(
89//! a.display(),
90//! r#"<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>"#
91//! );
92//! // but .find_all will return _all_ of them:
93//! let a_s = soup.tag("a").find_all();
94//! assert_eq!(
95//! a_s.map(|a| a.display())
96//! .collect::<Vec<_>>()
97//! .join("\n"),
98//! r#"<a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>
99//! <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>
100//! <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>"#
101//! );
102//! # }
103//! ```
104//!
105//! Since `.find_all` returns an iterator, you can use it with all the methods
106//! you would use with other iterators:
107//!
108//! ```
109//! # extern crate soup;
110//! # use soup::prelude::*;
111//! # const THREE_SISTERS: &'static str = r#"
112//! # <html><head><title>The Dormouse's story</title></head>
113//! # <body>
114//! # <p class="title"><b>The Dormouse's story</b></p>
115//! #
116//! # <p class="story">Once upon a time there were three little sisters; and their names were
117//! # <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
118//! # <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
119//! # <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
120//! # and they lived at the bottom of a well.</p>
121//! #
122//! # <p class="story">...</p>
123//! # "#;
124//! # fn main() {
125//! # let soup = Soup::new(THREE_SISTERS);
126//! let expected = [
127//! "http://example.com/elsie",
128//! "http://example.com/lacie",
129//! "http://example.com/tillie",
130//! ];
131//!
132//! for (i, link) in soup.tag("a").find_all().enumerate() {
133//! let href = link.get("href").expect("Couldn't find link with 'href' attribute");
134//! assert_eq!(href, expected[i].to_string());
135//! }
136//! # }
137//! ```
138//!
139//! The top-level structure we've been working with here, `soup`, implements the
140//! same methods that the query results do, so you can call the same methods on
141//! it and it will delegate the calls to the root node:
142//!
143//! ```
144//! # extern crate soup;
145//! # use soup::prelude::*;
146//! # const THREE_SISTERS: &'static str = r#"
147//! # <html><head><title>The Dormouse's story</title></head>
148//! # <body>
149//! # <p class="title"><b>The Dormouse's story</b></p>
150//! #
151//! # <p class="story">Once upon a time there were three little sisters; and their names were
152//! # <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
153//! # <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
154//! # <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
155//! # and they lived at the bottom of a well.</p>
156//! #
157//! # <p class="story">...</p>
158//! # "#;
159//! # fn main() {
160//! # let soup = Soup::new(THREE_SISTERS);
161//! let text = soup.text();
162//! assert_eq!(
163//! text,
164//! r#"The Dormouse's story
165//!
166//! The Dormouse's story
167//!
168//! Once upon a time there were three little sisters; and their names were
169//! Elsie,
170//! Lacie and
171//! Tillie;
172//! and they lived at the bottom of a well.
173//!
174//! ...
175//! "#
176//! );
177//! # }
178//! ```
179//!
180//! You can use more than just strings to search for results, such as Regex:
181//!
182//! ```rust
183//! # extern crate regex;
184//! # extern crate soup;
185//! # use soup::prelude::*;
186//! # use std::error::Error;
187//! use regex::Regex;
//! # fn main() -> Result<(), Box<dyn Error>> {
189//!
190//! let soup = Soup::new(r#"<body><p>some text, <b>Some bold text</b></p></body>"#);
191//! let results = soup.tag(Regex::new("^b")?)
192//! .find_all()
193//! .map(|tag| tag.name().to_string())
194//! .collect::<Vec<_>>();
195//! assert_eq!(results, vec!["body".to_string(), "b".to_string()]);
196//! # Ok(())
197//! # }
198//! ```
199//!
200//! Passing `true` will match everything:
201//!
202//! ```rust
203//! # extern crate soup;
204//! # use soup::prelude::*;
205//! # use std::error::Error;
//! # fn main() -> Result<(), Box<dyn Error>> {
207//!
208//! let soup = Soup::new(r#"<body><p>some text, <b>Some bold text</b></p></body>"#);
209//! let results = soup.tag(true)
210//! .find_all()
211//! .map(|tag| tag.name().to_string())
212//! .collect::<Vec<_>>();
213//! assert_eq!(results, vec![
214//! "html".to_string(),
215//! "head".to_string(),
216//! "body".to_string(),
217//! "p".to_string(),
218//! "b".to_string(),
219//! ]);
220//! # Ok(())
221//! # }
222//! ```
223//!
224//! (also, passing `false` will always return no results, though if that is
225//! useful to you, please let me know)
226//!
227//! So what can you do once you get the result of a query? Well, for one thing,
228//! you can traverse the tree a few different ways. You can ascend the tree:
229//!
230//! ```rust
231//! # extern crate soup;
232//! # use soup::prelude::*;
233//! # use std::error::Error;
//! # fn main() -> Result<(), Box<dyn Error>> {
235//!
236//! let soup = Soup::new(r#"<body><p>some text, <b>Some bold text</b></p></body>"#);
237//! let b = soup.tag("b")
238//! .find()
239//! .expect("Couldn't find tag 'b'");
240//! let p = b.parent()
241//! .expect("Couldn't find parent of 'b'");
242//! assert_eq!(p.name(), "p".to_string());
243//! let body = p.parent()
244//! .expect("Couldn't find parent of 'p'");
245//! assert_eq!(body.name(), "body".to_string());
246//! # Ok(())
247//! # }
248//! ```
249//!
250//! Or you can descend it:
251//!
252//! ```rust
253//! # extern crate soup;
254//! # use soup::prelude::*;
255//! # use std::error::Error;
//! # fn main() -> Result<(), Box<dyn Error>> {
257//!
258//! let soup = Soup::new(r#"<body><ul><li>ONE</li><li>TWO</li><li>THREE</li></ul></body>"#);
259//! let ul = soup.tag("ul")
260//! .find()
261//! .expect("Couldn't find tag 'ul'");
262//! let mut li_tags = ul.children().filter(|child| child.is_element());
263//! assert_eq!(li_tags.next().map(|tag| tag.text().to_string()), Some("ONE".to_string()));
264//! assert_eq!(li_tags.next().map(|tag| tag.text().to_string()), Some("TWO".to_string()));
265//! assert_eq!(li_tags.next().map(|tag| tag.text().to_string()), Some("THREE".to_string()));
266//! assert!(li_tags.next().is_none());
267//! # Ok(())
268//! # }
269//! ```
270//!
271//! Or ascend it with an iterator:
272//!
273//! ```rust
274//! # extern crate soup;
275//! # use soup::prelude::*;
276//! # use std::error::Error;
//! # fn main() -> Result<(), Box<dyn Error>> {
278//!
279//! let soup = Soup::new(r#"<body><ul><li>ONE</li><li>TWO</li><li>THREE</li></ul></body>"#);
280//! let li = soup.tag("li").find().expect("Couldn't find tag 'li'");
281//! let mut parents = li.parents();
282//! assert_eq!(parents.next().map(|tag| tag.name().to_string()), Some("ul".to_string()));
283//! assert_eq!(parents.next().map(|tag| tag.name().to_string()), Some("body".to_string()));
284//! assert_eq!(parents.next().map(|tag| tag.name().to_string()), Some("html".to_string()));
285//! assert_eq!(parents.next().map(|tag| tag.name().to_string()), Some("[document]".to_string()));
286//! # Ok(())
287//! # }
288//! ```
289#![deny(
290 missing_docs,
291 missing_debug_implementations,
292 missing_copy_implementations,
293 trivial_casts,
294 trivial_numeric_casts,
295 unsafe_code,
296 unstable_features,
297 unused_import_braces,
298 unused_qualifications,
299 rust_2018_compatibility,
300 rust_2018_idioms
301)]
302extern crate html5ever;
303extern crate kuchiki;
304#[cfg(feature = "regex")]
305extern crate regex;
306
307use html5ever::tendril::TendrilSink;
// Internal alias for the parsed document root. NOTE(review): the name `RcDom`
// appears to be carried over from html5ever's DOM type of the same name; with
// the kuchiki backend it is simply a reference-counted node handle.
type RcDom = kuchiki::NodeRef;

/// The type of a DOM node
pub type Handle = kuchiki::NodeRef;
312
313use std::{
314 fmt,
315 io::{self, Read},
316};
317
318/// This module exports all the important types & traits to use `soup`
319/// effectively
320pub mod prelude {
321 pub use crate::{
322 node_ext::{AttributeExt, NodeExt},
323 qb_ext::QueryBuilderExt,
324 Handle,
325 Soup,
326 };
327}
328
329pub use crate::{find::QueryBuilder, node_ext::NodeExt, qb_ext::QueryBuilderExt};
330
331mod attribute;
332mod find;
333mod node_ext;
334pub mod pattern;
335mod qb_ext;
336
/// Parses HTML & provides methods to query & manipulate the document
pub struct Soup {
    // Root node of the parsed document; every query starts from here.
    handle: RcDom,
}
341
342impl Soup {
343 /// Create a new `Soup` instance from a string slice
344 ///
345 /// # Example
346 ///
347 /// ```rust
348 /// # extern crate soup;
349 /// # use soup::prelude::*;
350 /// # use std::error::Error;
351 /// # fn main() -> Result<(), Box<Error>> {
352 /// let html = r#"
353 /// <!doctype html>
354 /// <html>
355 /// <head>
356 /// <title>page title</title>
357 /// </head>
358 /// <body>
359 /// <h1>Heading</h1>
360 /// <p>Some text</p>
361 /// <p>Some more text</p>
362 /// </body>
363 /// </html>
364 /// "#;
365 ///
366 /// let soup = Soup::new(html);
367 /// # Ok(())
368 /// # }
369 /// ```
370 pub fn new(html: &str) -> Soup {
371 let dom = kuchiki::parse_html().from_utf8().one(html.as_bytes());
372 Soup {
373 handle: dom,
374 }
375 }
376
377 /// Create a new `Soup` instance from something that implements `Read`
378 ///
379 /// This is good for parsing the output of an HTTP response, for example.
380 ///
381 /// ```rust,no_run
382 /// # extern crate reqwest;
383 /// # extern crate soup;
384 /// # use std::error::Error;
385 /// use soup::prelude::*;
386 ///
387 /// # fn main() -> Result<(), Box<Error>> {
388 /// let response = reqwest::get("https://docs.rs/soup")?;
389 /// let soup = Soup::from_reader(response)?;
390 /// # Ok(())
391 /// # }
392 /// ```
393 pub fn from_reader<R: Read>(mut reader: R) -> io::Result<Soup> {
394 let dom = kuchiki::parse_html().from_utf8().read_from(&mut reader)?;
395 Ok(Soup {
396 handle: dom,
397 })
398 }
399
400 /// Extracts all text from the HTML
401 pub fn text(&self) -> String {
402 self.handle.text()
403 }
404}
405
406impl From<RcDom> for Soup {
407 fn from(rc: RcDom) -> Soup {
408 Soup {
409 handle: rc,
410 }
411 }
412}
413
414impl fmt::Debug for Soup {
415 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
416 write!(f, "{}", self.handle.text())
417 }
418}
419
#[cfg(test)]
mod tests {
    use super::*;

    const TEST_HTML_STRING: &'static str = r#"
<!doctype html>
<html>
  <head>
    <title>foo</title>
  </head>
  <body>
    <p>One</p>
    <p>Two</p>
  </body>
</html>
"#;

    // `find` should yield only the first matching element.
    #[test]
    fn find() {
        let soup = Soup::new(TEST_HTML_STRING);
        let first_p = soup.tag("p").find().expect("Couldn't find tag 'p'");
        assert_eq!(first_p.text(), String::from("One"));
    }

    // `find_all` should yield every matching element, in document order.
    #[test]
    fn find_all() {
        let soup = Soup::new(TEST_HTML_STRING);
        let texts: Vec<String> = soup.tag("p").find_all().map(|p| p.text()).collect();
        assert_eq!(texts, vec![String::from("One"), String::from("Two")]);
    }
}
454}