html_extractor/
lib.rs

1#![allow(clippy::needless_doctest_main)]
2//! This crate provides an easy way to extract data from HTML.
3//!
4//! [`HtmlExtractor`] is neither a parser nor a deserializer.
5//! It picks up only the desired data from HTML.
6//!
7//! [`html_extractor!`](macro.html_extractor.html) will help to implement [`HtmlExtractor`].
8//!
9//! # Examples
10//! ## Extracting a simple value from HTML
11//! ```
12//! use html_extractor::{html_extractor, HtmlExtractor};
13//! html_extractor! {
14//!     #[derive(Debug, PartialEq)]
15//!     Foo {
16//!         foo: usize = (text of "#foo"),
17//!     }
18//! }
19//!
20//! fn main() {
21//!     let input = r#"
22//!         <div id="foo">1</div>
23//!     "#;
24//!     let foo = Foo::extract_from_str(input).unwrap();
25//!     assert_eq!(foo, Foo { foo: 1 });
26//! }
27//! ```
28//!
29//! ## Extracting a collection from HTML
30//! ```
31//! use html_extractor::{html_extractor, HtmlExtractor};
32//! html_extractor! {
33//!     #[derive(Debug, PartialEq)]
34//!     Foo {
35//!         foo: Vec<usize> = (text of ".foo", collect),
36//!     }
37//! }
38//!
39//! fn main() {
40//!     let input = r#"
41//!         <div class="foo">1</div>
42//!         <div class="foo">2</div>
43//!         <div class="foo">3</div>
44//!         <div class="foo">4</div>
45//!     "#;
46//!     let foo = Foo::extract_from_str(input).unwrap();
47//!     assert_eq!(foo, Foo { foo: vec![1, 2, 3, 4] });
48//! }
49//! ```
50//!
51//! ## Extracting with regex
52//! ```
53//! use html_extractor::{html_extractor, HtmlExtractor};
54//! html_extractor! {
55//!     #[derive(Debug, PartialEq)]
56//!     Foo {
57//!         (foo: usize,) = (text of "#foo", capture with "^foo=(.*)$"),
58//!     }
59//! }
60//!
61//! fn main() {
62//!     let input = r#"
63//!         <div id="foo">foo=1</div>
64//!     "#;
65//!     let foo = Foo::extract_from_str(input).unwrap();
66//!     assert_eq!(foo, Foo { foo: 1 });
67//! }
68//! ```
69
70#[doc(hidden)]
71pub extern crate lazy_static;
72#[doc(hidden)]
73pub extern crate regex;
74#[doc(hidden)]
75pub extern crate scraper;
76pub use error::Error;
77pub mod error;
78
79/// Generates structures that implement [`HtmlExtractor`].
80///
81/// # Syntax
82///
83/// ## Defining structures
84/// In this macro, zero or more structures can be defined.
85///
86/// Attributes can be attached to the structures, but currently attributes that may remove the structures (like `#[cfg]`) will not work.
87/// ```no_run
88/// # use html_extractor::html_extractor;
89/// # fn main() {}
90/// html_extractor! {
91///     //private structure
92///     Foo {
93///         //fields...
94///     }
95///     //any visibilities and some attributes can be used
96///     #[derive(Debug, Clone)]
97///     pub(crate) Bar {
98///         //fields...
99///     }
100/// }
101/// ```
102///
103/// ## Defining fields in structures
104/// There are two types of fields, "single field" and "tuple field".
105/// Tuple fields are used to [capture data with regex](#capture-specifier).
106///
107/// Each field definition has a declaration part and an [extractor](#extractor-part-of-field-definitions) part.
108///
109/// Attributes can be attached to the fields, but currently attributes that may remove the fields (like `#[cfg]`) will not work.
110/// ```no_run
111/// # use html_extractor::html_extractor;
112/// # fn main() {}
113/// html_extractor! {
114///     Foo {
115///         //single field
116///         pub foo: usize = (text of "#foo"),
117///         //^^^^^^^^^^^^   ^^^^^^^^^^^^^^^^
118///         // declaration   extractor
119///
120///         //tuple field
121///         (pub bar: usize, pub baz: usize) = (text of "#bar-baz", capture with "bar=(.*),baz=(.*)"),
122///         //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
123///         //                   declaration   extractor
124///     }
125/// }
126/// ```
127///
128/// ## Extractor part of field definitions
129/// The extractor part of field definitions specifies how to extract data from HTML.
130/// Extractor consists of [Target](#target-specifier), [Capture](#capture-specifier), [Collector](#collector-specifier) and [Parser](#parser-specifier) specifier.
131///
132/// The order of specifiers does not matter. If the same specifier is written multiple times, the one given later applies.
133/// ### Target specifier
134/// Target specifier specifies a selector to select an element (or elements) and what of the selected element is extracted.
135///
136/// If the specified selector is invalid, it will be a compile error.  
137/// If `text of ..` or `attr[..] of ..` is used, the type of field must implement [`FromStr`](std::str::FromStr).  
138/// If `elem of ..` is used, the type of field must implement [`HtmlExtractor`].  
/// If `text of ..` is used, leading and trailing whitespace is removed from the extracted string.  
/// If `presence of ..` is used, the type must be `bool`, and no other specifier can be used.
141/// ```
142/// use html_extractor::{html_extractor, HtmlExtractor};
143/// html_extractor! {
144///     #[derive(Debug, PartialEq)]
145///     Foo {
146///         // extracts the first text node in the element that first matched the selector "#foo"
147///         foo: usize = (text of "#foo"),
148///         // extracts the third text node in the element that first matched the selector "#bar"
149///         bar: usize = (text[2] of "#bar"),
150///         // extracts attribute "data-baz" in the element that first matched the selector "#baz"
151///         baz: usize = (attr["data-baz"] of "#baz"),
152///         // extracts an element that first matched the selector "#qux" and parse it with `HtmlExtractor::extract()`
153///         qux: Qux = (elem of "#qux"),
///         // extracts inner HTML of the element that first matched the selector "#grault".
155///         grault: String = (inner_html of "#grault"),
156///         // stores if the elements that matches the selector "#garply" exist.
157///         garply: bool = (presence of "#garply"),
158///     }
159///     #[derive(Debug, PartialEq)]
160///     Qux {
161///         corge: usize = (text of "#corge"),
162///     }
163/// }
164///
165/// fn main() {
166///     let input = r#"
167///         <div id="foo">1</div>
168///         <div id="bar">ignore first<br>ignore second<br>2</div>
169///         <div id="baz" data-baz="3"></div>
170///         <div id="qux">
171///             <div id="corge">4</div>
172///         </div>
173///         <div id="grault">
174///             inner<br>html
175///         </div>
176///     "#;
177///     let foo = Foo::extract_from_str(input).unwrap();
178///     assert_eq!(foo, Foo {
179///         foo: 1,
180///         bar: 2,
181///         baz: 3,
182///         qux: Qux { corge: 4 },
183///         grault: "inner<br>html".to_owned(),
184///         garply: false,
185///     });
186/// }
187/// ```
188/// ### Capture specifier
/// Capture specifier specifies a regex that is used to capture desired data from the string that is extracted with the target specifier.
190///
191/// The number of captures and the number of tuple elements must be the same.
192///
193/// If the specified regex is invalid, it will be a compile error.
194///
195/// It cannot be used with target specifier `elem of ..`.
196///
/// If it is used without the [collector specifier](#collector-specifier), the field must be a [tuple field](#defining-fields-in-structures).
/// If it is used with the [collector specifier](#collector-specifier), the type of the field must be [`FromIterator`](std::iter::FromIterator) of tuples.
199/// ```
200/// use html_extractor::{html_extractor, HtmlExtractor};
201/// html_extractor! {
202///     #[derive(Debug, PartialEq)]
203///     Foo {
204///         // extracts a string from the first text node in the element that matches the selector "#foo-bar",
205///         // and captures two data from the string with the regex "foo=(.*), bar=(.*)"
206///         (foo: usize, bar: usize) = (text of "#foo-bar", capture with "foo=(.*), bar=(.*)"),
207///         
208///         // extracts strings from the first text node in all elements that matches the selector ".baz-qux-corge",
209///         // captures three data from each string with the regex "baz=(.*), qux=(.*), corge=(.*)" ,
210///         // and collects into `Vec<(usize, usize, usize)>`
211///         baz_qux_corge: Vec<(usize, usize, usize)> = (text of ".baz-qux-corge", capture with "baz=(.*), qux=(.*), corge=(.*)", collect),
212///     }
213/// }
214///
215/// fn main() {
216///     let input = r#"
217///         <div id="foo-bar">foo=1, bar=2</div>
218///
219///         <div class="baz-qux-corge">baz=1, qux=2, corge=3</div>
220///         <div class="baz-qux-corge">baz=4, qux=5, corge=6</div>
221///         <div class="baz-qux-corge">baz=7, qux=8, corge=9</div>
222///         <div class="baz-qux-corge">baz=10, qux=11, corge=12</div>
223///     "#;
224///     let foo = Foo::extract_from_str(input).unwrap();
225///     assert_eq!(foo, Foo {
226///         foo: 1,
227///         bar: 2,
228///         baz_qux_corge: vec![(1, 2, 3), (4, 5, 6), (7, 8, 9), (10, 11, 12)],
229///     });
230/// }
231/// ```
232///
233/// ### Collector specifier
234/// Collector specifier specifies how to collect HTML elements.  
235/// The default collector is "first", which collects only the first matched element.  
/// The "collect" collector collects all the elements into the type that implements [`FromIterator`](std::iter::FromIterator).  
237/// The "optional" collector collects the first element if it exists. If not, it emits `None`.
238/// ```
239/// use html_extractor::{html_extractor, HtmlExtractor};
240/// html_extractor! {
241///     #[derive(Debug, PartialEq)]
242///     Foo {
243///         // extracts the first text node from each element that matches the selector ".foo", and collect them into `Vec<usize>`.
244///         foo: Vec<usize> = (text of ".foo", collect),
245///
///         // extracts all the elements that match the selector "#bar",
247///         // parses them with `HtmlExtractor::extract()`,
248///         // and collects into `Vec<Bar>`.
249///         bar: Vec<Bar> = (elem of "#bar", collect),
250///         
251///         // extracts strings from the first text node in all elements that matches the selector ".baz-qux-corge",
252///         // captures three data from each string with the regex "baz=(.*), qux=(.*), corge=(.*)" ,
253///         // and collects into `Vec<(usize, usize, usize)>`
254///         baz_qux_corge: Vec<(usize, usize, usize)> = (text of ".baz-qux-corge", capture with "baz=(.*), qux=(.*), corge=(.*)", collect),
255///
256///         // optionally extracts the first text node in the first element that matches the selector ".grault".
257///         grault: Option<usize> = (text of ".grault", optional),
258///     }
259///     #[derive(Debug, PartialEq)]
260///     Bar {
261///         bar: usize = (text of ".bar-data"),
262///     }
263/// }
264///
265/// fn main() {
266///     let input = r#"
267///         <div class="foo">1</div>
268///         <div class="foo">2</div>
269///         <div class="foo">3</div>
270///         <div class="foo">4</div>
271///
272///         <div id="bar"><div class="bar-data">1</div></div>
273///         <div id="bar"><div class="bar-data">2</div></div>
274///         <div id="bar"><div class="bar-data">3</div></div>
275///         <div id="bar"><div class="bar-data">4</div></div>
276///
277///         <div class="baz-qux-corge">baz=1, qux=2, corge=3</div>
278///         <div class="baz-qux-corge">baz=4, qux=5, corge=6</div>
279///         <div class="baz-qux-corge">baz=7, qux=8, corge=9</div>
280///         <div class="baz-qux-corge">baz=10, qux=11, corge=12</div>
281///     "#;
282///     let foo = Foo::extract_from_str(input).unwrap();
283///     assert_eq!(foo, Foo {
284///         foo: vec![1, 2, 3, 4],
285///         bar: vec![
286///             Bar { bar: 1 },
287///             Bar { bar: 2 },
288///             Bar { bar: 3 },
289///             Bar { bar: 4 },
290///         ],
291///         baz_qux_corge: vec![(1, 2, 3), (4, 5, 6), (7, 8, 9), (10, 11, 12)],
292///         grault: None,
293///     });
294/// }
295/// ```
296/// ### Parser specifier
297/// Parser specifier specifies the parser used to parse the extracted string.  
298/// The default parser is [`::std::str::FromStr::from_str`].  
299/// The parser must be `Fn(&str) -> Result<_, T> where T: std::fmt::Debug`
300/// ```
301/// use html_extractor::{html_extractor, HtmlExtractor};
302/// html_extractor! {
303///     #[derive(Debug, PartialEq)]
304///     Foo {
305///         // extracts using a custom parser.
306///         foo: usize = (text of "#foo", parse with custom_parser),
307///     }
308/// }
309/// fn custom_parser(input: &str) -> Result<usize, std::num::ParseIntError> {
310///     input.replace(",", "").parse()
311/// }
312///
313/// fn main() {
314///     let input = r#"
315///         <div id="foo">1,000,000,000</div>
316///     "#;
317///     let foo = Foo::extract_from_str(input).unwrap();
318///     assert_eq!(foo, Foo {
319///         foo: 1000000000,
320///     });
321/// }
322/// ```
323///
324/// # Usage of the generated structures
325/// The generated structures implement trait [`HtmlExtractor`].
326/// See the document of the trait.
327pub use html_extractor_macros::html_extractor;
328
329/// A trait for extracting data from HTML documents.
330///
331/// It is recommended to use [`html_extractor!`](macro.html_extractor.html) to implement `HtmlExtractor`.
332pub trait HtmlExtractor
333where
334    Self: Sized,
335{
336    /// Extracts data from [`scraper::element_ref::ElementRef`].
337    fn extract(elem: &scraper::ElementRef) -> Result<Self, Error>;
338    /// Parses HTML string and extracts data from it.
339    fn extract_from_str(html_str: &str) -> Result<Self, Error> {
340        let html = scraper::Html::parse_document(html_str);
341        HtmlExtractor::extract(&html.root_element())
342    }
343}
344
345#[cfg(test)]
346mod test;