1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
//! This crate provides an easy way to extract data from HTML.
//!
//! [`HtmlExtractor`] is neither a parser nor a deserializer.
//! It picks up only the desired data from HTML.
//!
//! [`html_extractor!`](macro.html_extractor.html) will help to implement [`HtmlExtractor`].
//!
//! # Examples
//! ## Extracting a simple value from HTML
//! ```
//! use html_extractor::{html_extractor, HtmlExtractor};
//! html_extractor! {
//! #[derive(Debug, PartialEq)]
//! Foo {
//! foo: usize = (text of "#foo"),
//! }
//! }
//!
//! fn main() {
//! let input = r#"
//! <div id="foo">1</div>
//! "#;
//! let foo = Foo::extract_from_str(input).unwrap();
//! assert_eq!(foo, Foo { foo: 1 });
//! }
//! ```
//!
//! ## Extracting a collection from HTML
//! ```
//! use html_extractor::{html_extractor, HtmlExtractor};
//! html_extractor! {
//! #[derive(Debug, PartialEq)]
//! Foo {
//! foo: Vec<usize> = (text of ".foo", collect),
//! }
//! }
//!
//! fn main() {
//! let input = r#"
//! <div class="foo">1</div>
//! <div class="foo">2</div>
//! <div class="foo">3</div>
//! <div class="foo">4</div>
//! "#;
//! let foo = Foo::extract_from_str(input).unwrap();
//! assert_eq!(foo, Foo { foo: vec![1, 2, 3, 4] });
//! }
//! ```
//!
//! ## Extracting with regex
//! ```
//! use html_extractor::{html_extractor, HtmlExtractor};
//! html_extractor! {
//! #[derive(Debug, PartialEq)]
//! Foo {
//! (foo: usize,) = (text of "#foo", capture with "^foo=(.*)$"),
//! }
//! }
//!
//! fn main() {
//! let input = r#"
//! <div id="foo">foo=1</div>
//! "#;
//! let foo = Foo::extract_from_str(input).unwrap();
//! assert_eq!(foo, Foo { foo: 1 });
//! }
//! ```
pub extern crate lazy_static;
pub extern crate regex;
pub extern crate scraper;
pub use Error;
/// Generates structures that implement [`HtmlExtractor`].
///
/// # Syntax
///
/// ## Defining structures
/// In this macro, zero or more structures can be defined.
///
/// Attributes can be attached to the structures, but currently attributes that may remove the structures (like `#[cfg]`) will not work.
/// ```no_run
/// # use html_extractor::html_extractor;
/// # fn main() {}
/// html_extractor! {
/// //private structure
/// Foo {
/// //fields...
/// }
/// //any visibilities and some attributes can be used
/// #[derive(Debug, Clone)]
/// pub(crate) Bar {
/// //fields...
/// }
/// }
/// ```
///
/// ## Defining fields in structures
/// There are two types of fields, "single field" and "tuple field".
/// Tuple fields are used to [capture data with regex](#capture-specifier).
///
/// Each field definition has a declaration part and an [extractor](#extractor-part-of-field-definitions) part.
///
/// Attributes can be attached to the fields, but currently attributes that may remove the fields (like `#[cfg]`) will not work.
/// ```no_run
/// # use html_extractor::html_extractor;
/// # fn main() {}
/// html_extractor! {
/// Foo {
/// //single field
/// pub foo: usize = (text of "#foo"),
/// //^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^
/// // declaration extractor
///
/// //tuple field
/// (pub bar: usize, pub baz: usize) = (text of "#bar-baz", capture with "bar=(.*),baz=(.*)"),
/// //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
/// // declaration extractor
/// }
/// }
/// ```
///
/// ## Extractor part of field definitions
/// The extractor part of field definitions specifies how to extract data from HTML.
/// An extractor consists of [Target](#target-specifier), [Capture](#capture-specifier), [Collector](#collector-specifier) and [Parser](#parser-specifier) specifiers.
///
/// The order of specifiers does not matter. If the same specifier is written multiple times, the one given later applies.
/// ### Target specifier
/// The target specifier specifies a selector to select an element (or elements) and which part of the selected element is extracted.
///
/// If the specified selector is invalid, it will be a compile error.
/// If `text of ..` or `attr[..] of ..` is used, the type of field must implement [`FromStr`](std::str::FromStr).
/// If `elem of ..` is used, the type of field must implement [`HtmlExtractor`].
/// If `text of ..` is used, leading and trailing whitespace is removed from the extracted string.
/// If `presence of ..` is used, the type must be `bool` and no other specifier can be used.
/// ```
/// use html_extractor::{html_extractor, HtmlExtractor};
/// html_extractor! {
/// #[derive(Debug, PartialEq)]
/// Foo {
/// // extracts the first text node in the element that first matched the selector "#foo"
/// foo: usize = (text of "#foo"),
/// // extracts the third text node in the element that first matched the selector "#bar"
/// bar: usize = (text[2] of "#bar"),
/// // extracts attribute "data-baz" in the element that first matched the selector "#baz"
/// baz: usize = (attr["data-baz"] of "#baz"),
/// // extracts an element that first matched the selector "#qux" and parse it with `HtmlExtractor::extract()`
/// qux: Qux = (elem of "#qux"),
/// // extracts inner HTML of the element that first matched the selector "#grault"
/// grault: String = (inner_html of "#grault"),
/// // stores whether an element that matches the selector "#garply" exists.
/// garply: bool = (presence of "#garply"),
/// }
/// #[derive(Debug, PartialEq)]
/// Qux {
/// corge: usize = (text of "#corge"),
/// }
/// }
///
/// fn main() {
/// let input = r#"
/// <div id="foo">1</div>
/// <div id="bar">ignore first<br>ignore second<br>2</div>
/// <div id="baz" data-baz="3"></div>
/// <div id="qux">
/// <div id="corge">4</div>
/// </div>
/// <div id="grault">
/// inner<br>html
/// </div>
/// "#;
/// let foo = Foo::extract_from_str(input).unwrap();
/// assert_eq!(foo, Foo {
/// foo: 1,
/// bar: 2,
/// baz: 3,
/// qux: Qux { corge: 4 },
/// grault: "inner<br>html".to_owned(),
/// garply: false,
/// });
/// }
/// ```
/// ### Capture specifier
/// The capture specifier specifies a regex that is used to capture the desired data from the string that is extracted with the target specifier.
///
/// The number of captures and the number of tuple elements must be the same.
///
/// If the specified regex is invalid, it will be a compile error.
///
/// It cannot be used with target specifier `elem of ..`.
///
/// If it is used without the [collector specifier](#collector-specifier), the field must be a [tuple field](#defining-fields-in-structures).
/// If it is used with the [collector specifier](#collector-specifier), the type of the field must implement [`FromIterator`](std::iter::FromIterator) of tuples.
/// ```
/// use html_extractor::{html_extractor, HtmlExtractor};
/// html_extractor! {
/// #[derive(Debug, PartialEq)]
/// Foo {
/// // extracts a string from the first text node in the element that matches the selector "#foo-bar",
/// // and captures two values from the string with the regex "foo=(.*), bar=(.*)"
/// (foo: usize, bar: usize) = (text of "#foo-bar", capture with "foo=(.*), bar=(.*)"),
///
/// // extracts strings from the first text node in all elements that match the selector ".baz-qux-corge",
/// // captures three values from each string with the regex "baz=(.*), qux=(.*), corge=(.*)",
/// // and collects them into `Vec<(usize, usize, usize)>`
/// baz_qux_corge: Vec<(usize, usize, usize)> = (text of ".baz-qux-corge", capture with "baz=(.*), qux=(.*), corge=(.*)", collect),
/// }
/// }
///
/// fn main() {
/// let input = r#"
/// <div id="foo-bar">foo=1, bar=2</div>
///
/// <div class="baz-qux-corge">baz=1, qux=2, corge=3</div>
/// <div class="baz-qux-corge">baz=4, qux=5, corge=6</div>
/// <div class="baz-qux-corge">baz=7, qux=8, corge=9</div>
/// <div class="baz-qux-corge">baz=10, qux=11, corge=12</div>
/// "#;
/// let foo = Foo::extract_from_str(input).unwrap();
/// assert_eq!(foo, Foo {
/// foo: 1,
/// bar: 2,
/// baz_qux_corge: vec![(1, 2, 3), (4, 5, 6), (7, 8, 9), (10, 11, 12)],
/// });
/// }
/// ```
///
/// ### Collector specifier
/// Collector specifier specifies how to collect HTML elements.
/// The default collector is "first", which collects only the first matched element.
/// The "collect" collector collects all the elements into a type that implements [`FromIterator`](std::iter::FromIterator).
/// The "optional" collector collects the first element if it exists. If not, it emits `None`.
/// ```
/// use html_extractor::{html_extractor, HtmlExtractor};
/// html_extractor! {
/// #[derive(Debug, PartialEq)]
/// Foo {
/// // extracts the first text node from each element that matches the selector ".foo", and collects them into `Vec<usize>`.
/// foo: Vec<usize> = (text of ".foo", collect),
///
/// // extracts all the elements that match the selector "#bar",
/// // parses them with `HtmlExtractor::extract()`,
/// // and collects into `Vec<Bar>`.
/// bar: Vec<Bar> = (elem of "#bar", collect),
///
/// // extracts strings from the first text node in all elements that match the selector ".baz-qux-corge",
/// // captures three values from each string with the regex "baz=(.*), qux=(.*), corge=(.*)",
/// // and collects them into `Vec<(usize, usize, usize)>`
/// baz_qux_corge: Vec<(usize, usize, usize)> = (text of ".baz-qux-corge", capture with "baz=(.*), qux=(.*), corge=(.*)", collect),
///
/// // optionally extracts the first text node in the first element that matches the selector ".grault".
/// grault: Option<usize> = (text of ".grault", optional),
/// }
/// #[derive(Debug, PartialEq)]
/// Bar {
/// bar: usize = (text of ".bar-data"),
/// }
/// }
///
/// fn main() {
/// let input = r#"
/// <div class="foo">1</div>
/// <div class="foo">2</div>
/// <div class="foo">3</div>
/// <div class="foo">4</div>
///
/// <div id="bar"><div class="bar-data">1</div></div>
/// <div id="bar"><div class="bar-data">2</div></div>
/// <div id="bar"><div class="bar-data">3</div></div>
/// <div id="bar"><div class="bar-data">4</div></div>
///
/// <div class="baz-qux-corge">baz=1, qux=2, corge=3</div>
/// <div class="baz-qux-corge">baz=4, qux=5, corge=6</div>
/// <div class="baz-qux-corge">baz=7, qux=8, corge=9</div>
/// <div class="baz-qux-corge">baz=10, qux=11, corge=12</div>
/// "#;
/// let foo = Foo::extract_from_str(input).unwrap();
/// assert_eq!(foo, Foo {
/// foo: vec![1, 2, 3, 4],
/// bar: vec![
/// Bar { bar: 1 },
/// Bar { bar: 2 },
/// Bar { bar: 3 },
/// Bar { bar: 4 },
/// ],
/// baz_qux_corge: vec![(1, 2, 3), (4, 5, 6), (7, 8, 9), (10, 11, 12)],
/// grault: None,
/// });
/// }
/// ```
/// ### Parser specifier
/// Parser specifier specifies the parser used to parse the extracted string.
/// The default parser is [`::std::str::FromStr::from_str`].
/// The parser must be `Fn(&str) -> Result<_, T> where T: std::fmt::Debug`.
/// ```
/// use html_extractor::{html_extractor, HtmlExtractor};
/// html_extractor! {
/// #[derive(Debug, PartialEq)]
/// Foo {
/// // extracts using a custom parser.
/// foo: usize = (text of "#foo", parse with custom_parser),
/// }
/// }
/// fn custom_parser(input: &str) -> Result<usize, std::num::ParseIntError> {
/// input.replace(",", "").parse()
/// }
///
/// fn main() {
/// let input = r#"
/// <div id="foo">1,000,000,000</div>
/// "#;
/// let foo = Foo::extract_from_str(input).unwrap();
/// assert_eq!(foo, Foo {
/// foo: 1000000000,
/// });
/// }
/// ```
///
/// # Usage of the generated structures
/// The generated structures implement trait [`HtmlExtractor`].
/// See the documentation of the trait.
pub use html_extractor;
/// A trait for extracting data from HTML documents.
///
/// It is recommended to use [`html_extractor!`](macro.html_extractor.html) to implement `HtmlExtractor`.