html_extractor/lib.rs
1#![allow(clippy::needless_doctest_main)]
2//! This crate provides an easy way to extract data from HTML.
3//!
4//! [`HtmlExtractor`] is neither a parser nor a deserializer.
5//! It picks up only the desired data from HTML.
6//!
7//! [`html_extractor!`](macro.html_extractor.html) will help to implement [`HtmlExtractor`].
8//!
9//! # Examples
10//! ## Extracting a simple value from HTML
11//! ```
12//! use html_extractor::{html_extractor, HtmlExtractor};
13//! html_extractor! {
14//! #[derive(Debug, PartialEq)]
15//! Foo {
16//! foo: usize = (text of "#foo"),
17//! }
18//! }
19//!
20//! fn main() {
21//! let input = r#"
22//! <div id="foo">1</div>
23//! "#;
24//! let foo = Foo::extract_from_str(input).unwrap();
25//! assert_eq!(foo, Foo { foo: 1 });
26//! }
27//! ```
28//!
29//! ## Extracting a collection from HTML
30//! ```
31//! use html_extractor::{html_extractor, HtmlExtractor};
32//! html_extractor! {
33//! #[derive(Debug, PartialEq)]
34//! Foo {
35//! foo: Vec<usize> = (text of ".foo", collect),
36//! }
37//! }
38//!
39//! fn main() {
40//! let input = r#"
41//! <div class="foo">1</div>
42//! <div class="foo">2</div>
43//! <div class="foo">3</div>
44//! <div class="foo">4</div>
45//! "#;
46//! let foo = Foo::extract_from_str(input).unwrap();
47//! assert_eq!(foo, Foo { foo: vec![1, 2, 3, 4] });
48//! }
49//! ```
50//!
51//! ## Extracting with regex
52//! ```
53//! use html_extractor::{html_extractor, HtmlExtractor};
54//! html_extractor! {
55//! #[derive(Debug, PartialEq)]
56//! Foo {
57//! (foo: usize,) = (text of "#foo", capture with "^foo=(.*)$"),
58//! }
59//! }
60//!
61//! fn main() {
62//! let input = r#"
63//! <div id="foo">foo=1</div>
64//! "#;
65//! let foo = Foo::extract_from_str(input).unwrap();
66//! assert_eq!(foo, Foo { foo: 1 });
67//! }
68//! ```
69
70#[doc(hidden)]
71pub extern crate lazy_static;
72#[doc(hidden)]
73pub extern crate regex;
74#[doc(hidden)]
75pub extern crate scraper;
76pub use error::Error;
77pub mod error;
78
79/// Generates structures that implement [`HtmlExtractor`].
80///
81/// # Syntax
82///
83/// ## Defining structures
84/// In this macro, zero or more structures can be defined.
85///
86/// Attributes can be attached to the structures, but currently attributes that may remove the structures (like `#[cfg]`) will not work.
87/// ```no_run
88/// # use html_extractor::html_extractor;
89/// # fn main() {}
90/// html_extractor! {
91/// //private structure
92/// Foo {
93/// //fields...
94/// }
95/// //any visibilities and some attributes can be used
96/// #[derive(Debug, Clone)]
97/// pub(crate) Bar {
98/// //fields...
99/// }
100/// }
101/// ```
102///
103/// ## Defining fields in structures
104/// There are two types of fields, "single field" and "tuple field".
105/// Tuple fields are used to [capture data with regex](#capture-specifier).
106///
107/// Each field definition has a declaration part and an [extractor](#extractor-part-of-field-definitions) part.
108///
109/// Attributes can be attached to the fields, but currently attributes that may remove the fields (like `#[cfg]`) will not work.
110/// ```no_run
111/// # use html_extractor::html_extractor;
112/// # fn main() {}
113/// html_extractor! {
114/// Foo {
115/// //single field
116/// pub foo: usize = (text of "#foo"),
117/// //^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^
118/// // declaration extractor
119///
120/// //tuple field
121/// (pub bar: usize, pub baz: usize) = (text of "#bar-baz", capture with "bar=(.*),baz=(.*)"),
122/// //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
123/// // declaration extractor
124/// }
125/// }
126/// ```
127///
128/// ## Extractor part of field definitions
129/// The extractor part of field definitions specifies how to extract data from HTML.
/// An extractor consists of [Target](#target-specifier), [Capture](#capture-specifier), [Collector](#collector-specifier) and [Parser](#parser-specifier) specifiers.
131///
132/// The order of specifiers does not matter. If the same specifier is written multiple times, the one given later applies.
133/// ### Target specifier
/// Target specifier specifies a selector to select an element (or elements) and which part of the selected element is extracted.
135///
136/// If the specified selector is invalid, it will be a compile error.
137/// If `text of ..` or `attr[..] of ..` is used, the type of field must implement [`FromStr`](std::str::FromStr).
138/// If `elem of ..` is used, the type of field must implement [`HtmlExtractor`].
/// If `text of ..` is used, leading and trailing whitespace is removed from the extracted string.
/// If `presence of ..` is used, the type must be `bool` and no other specifier can be used.
141/// ```
142/// use html_extractor::{html_extractor, HtmlExtractor};
143/// html_extractor! {
144/// #[derive(Debug, PartialEq)]
145/// Foo {
146/// // extracts the first text node in the element that first matched the selector "#foo"
147/// foo: usize = (text of "#foo"),
148/// // extracts the third text node in the element that first matched the selector "#bar"
149/// bar: usize = (text[2] of "#bar"),
150/// // extracts attribute "data-baz" in the element that first matched the selector "#baz"
151/// baz: usize = (attr["data-baz"] of "#baz"),
/// // extracts the element that first matched the selector "#qux" and parses it with `HtmlExtractor::extract()`
153/// qux: Qux = (elem of "#qux"),
154/// // extracts inner HTML of the element that first matched the selector "#grault",
155/// grault: String = (inner_html of "#grault"),
/// // stores whether any element that matches the selector "#garply" exists.
157/// garply: bool = (presence of "#garply"),
158/// }
159/// #[derive(Debug, PartialEq)]
160/// Qux {
161/// corge: usize = (text of "#corge"),
162/// }
163/// }
164///
165/// fn main() {
166/// let input = r#"
167/// <div id="foo">1</div>
168/// <div id="bar">ignore first<br>ignore second<br>2</div>
169/// <div id="baz" data-baz="3"></div>
170/// <div id="qux">
171/// <div id="corge">4</div>
172/// </div>
173/// <div id="grault">
174/// inner<br>html
175/// </div>
176/// "#;
177/// let foo = Foo::extract_from_str(input).unwrap();
178/// assert_eq!(foo, Foo {
179/// foo: 1,
180/// bar: 2,
181/// baz: 3,
182/// qux: Qux { corge: 4 },
183/// grault: "inner<br>html".to_owned(),
184/// garply: false,
185/// });
186/// }
187/// ```
188/// ### Capture specifier
/// Capture specifier specifies a regex that is used to capture desired data from the string that is extracted with target specifier.
190///
191/// The number of captures and the number of tuple elements must be the same.
192///
193/// If the specified regex is invalid, it will be a compile error.
194///
195/// It cannot be used with target specifier `elem of ..`.
196///
/// If it is used without the [`collect` specifier](#collector-specifier), the field must be a [tuple field](#defining-fields-in-structures).
/// If it is used with the [`collect` specifier](#collector-specifier), the type of the field must implement [`FromIterator`](std::iter::FromIterator) of tuples.
199/// ```
200/// use html_extractor::{html_extractor, HtmlExtractor};
201/// html_extractor! {
202/// #[derive(Debug, PartialEq)]
203/// Foo {
204/// // extracts a string from the first text node in the element that matches the selector "#foo-bar",
205/// // and captures two data from the string with the regex "foo=(.*), bar=(.*)"
206/// (foo: usize, bar: usize) = (text of "#foo-bar", capture with "foo=(.*), bar=(.*)"),
207///
/// // extracts strings from the first text node in all elements that match the selector ".baz-qux-corge",
209/// // captures three data from each string with the regex "baz=(.*), qux=(.*), corge=(.*)" ,
210/// // and collects into `Vec<(usize, usize, usize)>`
211/// baz_qux_corge: Vec<(usize, usize, usize)> = (text of ".baz-qux-corge", capture with "baz=(.*), qux=(.*), corge=(.*)", collect),
212/// }
213/// }
214///
215/// fn main() {
216/// let input = r#"
217/// <div id="foo-bar">foo=1, bar=2</div>
218///
219/// <div class="baz-qux-corge">baz=1, qux=2, corge=3</div>
220/// <div class="baz-qux-corge">baz=4, qux=5, corge=6</div>
221/// <div class="baz-qux-corge">baz=7, qux=8, corge=9</div>
222/// <div class="baz-qux-corge">baz=10, qux=11, corge=12</div>
223/// "#;
224/// let foo = Foo::extract_from_str(input).unwrap();
225/// assert_eq!(foo, Foo {
226/// foo: 1,
227/// bar: 2,
228/// baz_qux_corge: vec![(1, 2, 3), (4, 5, 6), (7, 8, 9), (10, 11, 12)],
229/// });
230/// }
231/// ```
232///
233/// ### Collector specifier
234/// Collector specifier specifies how to collect HTML elements.
235/// The default collector is "first", which collects only the first matched element.
/// The "collect" collector collects all the elements into the type that implements [`FromIterator`](std::iter::FromIterator).
237/// The "optional" collector collects the first element if it exists. If not, it emits `None`.
238/// ```
239/// use html_extractor::{html_extractor, HtmlExtractor};
240/// html_extractor! {
241/// #[derive(Debug, PartialEq)]
242/// Foo {
/// // extracts the first text node from each element that matches the selector ".foo", and collects them into `Vec<usize>`.
244/// foo: Vec<usize> = (text of ".foo", collect),
245///
/// // extracts all the elements that match the selector "#bar",
247/// // parses them with `HtmlExtractor::extract()`,
248/// // and collects into `Vec<Bar>`.
249/// bar: Vec<Bar> = (elem of "#bar", collect),
250///
/// // extracts strings from the first text node in all elements that match the selector ".baz-qux-corge",
252/// // captures three data from each string with the regex "baz=(.*), qux=(.*), corge=(.*)" ,
253/// // and collects into `Vec<(usize, usize, usize)>`
254/// baz_qux_corge: Vec<(usize, usize, usize)> = (text of ".baz-qux-corge", capture with "baz=(.*), qux=(.*), corge=(.*)", collect),
255///
256/// // optionally extracts the first text node in the first element that matches the selector ".grault".
257/// grault: Option<usize> = (text of ".grault", optional),
258/// }
259/// #[derive(Debug, PartialEq)]
260/// Bar {
261/// bar: usize = (text of ".bar-data"),
262/// }
263/// }
264///
265/// fn main() {
266/// let input = r#"
267/// <div class="foo">1</div>
268/// <div class="foo">2</div>
269/// <div class="foo">3</div>
270/// <div class="foo">4</div>
271///
272/// <div id="bar"><div class="bar-data">1</div></div>
273/// <div id="bar"><div class="bar-data">2</div></div>
274/// <div id="bar"><div class="bar-data">3</div></div>
275/// <div id="bar"><div class="bar-data">4</div></div>
276///
277/// <div class="baz-qux-corge">baz=1, qux=2, corge=3</div>
278/// <div class="baz-qux-corge">baz=4, qux=5, corge=6</div>
279/// <div class="baz-qux-corge">baz=7, qux=8, corge=9</div>
280/// <div class="baz-qux-corge">baz=10, qux=11, corge=12</div>
281/// "#;
282/// let foo = Foo::extract_from_str(input).unwrap();
283/// assert_eq!(foo, Foo {
284/// foo: vec![1, 2, 3, 4],
285/// bar: vec![
286/// Bar { bar: 1 },
287/// Bar { bar: 2 },
288/// Bar { bar: 3 },
289/// Bar { bar: 4 },
290/// ],
291/// baz_qux_corge: vec![(1, 2, 3), (4, 5, 6), (7, 8, 9), (10, 11, 12)],
292/// grault: None,
293/// });
294/// }
295/// ```
296/// ### Parser specifier
297/// Parser specifier specifies the parser used to parse the extracted string.
298/// The default parser is [`::std::str::FromStr::from_str`].
/// The parser must be `Fn(&str) -> Result<_, T> where T: std::fmt::Debug`.
300/// ```
301/// use html_extractor::{html_extractor, HtmlExtractor};
302/// html_extractor! {
303/// #[derive(Debug, PartialEq)]
304/// Foo {
305/// // extracts using a custom parser.
306/// foo: usize = (text of "#foo", parse with custom_parser),
307/// }
308/// }
309/// fn custom_parser(input: &str) -> Result<usize, std::num::ParseIntError> {
310/// input.replace(",", "").parse()
311/// }
312///
313/// fn main() {
314/// let input = r#"
315/// <div id="foo">1,000,000,000</div>
316/// "#;
317/// let foo = Foo::extract_from_str(input).unwrap();
318/// assert_eq!(foo, Foo {
319/// foo: 1000000000,
320/// });
321/// }
322/// ```
323///
324/// # Usage of the generated structures
325/// The generated structures implement trait [`HtmlExtractor`].
326/// See the document of the trait.
327pub use html_extractor_macros::html_extractor;
328
329/// A trait for extracting data from HTML documents.
330///
331/// It is recommended to use [`html_extractor!`](macro.html_extractor.html) to implement `HtmlExtractor`.
332pub trait HtmlExtractor
333where
334 Self: Sized,
335{
336 /// Extracts data from [`scraper::element_ref::ElementRef`].
337 fn extract(elem: &scraper::ElementRef) -> Result<Self, Error>;
338 /// Parses HTML string and extracts data from it.
339 fn extract_from_str(html_str: &str) -> Result<Self, Error> {
340 let html = scraper::Html::parse_document(html_str);
341 HtmlExtractor::extract(&html.root_element())
342 }
343}
344
345#[cfg(test)]
346mod test;