1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346
#![allow(clippy::needless_doctest_main)] //! This crate provides an easy way to extract data from HTML. //! //! [`HtmlExtractor`] is neither a parser nor a deserializer. //! It picks up only the desired data from HTML. //! //! [`html_extractor!`](macro.html_extractor.html) will help to implement [`HtmlExtractor`]. //! //! # Examples //! ## Extracting a simple value from HTML //! ``` //! use html_extractor::{html_extractor, HtmlExtractor}; //! html_extractor! { //! #[derive(Debug, PartialEq)] //! Foo { //! foo: usize = (text of "#foo"), //! } //! } //! //! fn main() { //! let input = r#" //! <div id="foo">1</div> //! "#; //! let foo = Foo::extract_from_str(input).unwrap(); //! assert_eq!(foo, Foo { foo: 1 }); //! } //! ``` //! //! ## Extracting a collection from HTML //! ``` //! use html_extractor::{html_extractor, HtmlExtractor}; //! html_extractor! { //! #[derive(Debug, PartialEq)] //! Foo { //! foo: Vec<usize> = (text of ".foo", collect), //! } //! } //! //! fn main() { //! let input = r#" //! <div class="foo">1</div> //! <div class="foo">2</div> //! <div class="foo">3</div> //! <div class="foo">4</div> //! "#; //! let foo = Foo::extract_from_str(input).unwrap(); //! assert_eq!(foo, Foo { foo: vec![1, 2, 3, 4] }); //! } //! ``` //! //! ## Extracting with regex //! ``` //! use html_extractor::{html_extractor, HtmlExtractor}; //! html_extractor! { //! #[derive(Debug, PartialEq)] //! Foo { //! (foo: usize,) = (text of "#foo", capture with "^foo=(.*)$"), //! } //! } //! //! fn main() { //! let input = r#" //! <div id="foo">foo=1</div> //! "#; //! let foo = Foo::extract_from_str(input).unwrap(); //! assert_eq!(foo, Foo { foo: 1 }); //! } //! ``` #[doc(hidden)] pub extern crate lazy_static; #[doc(hidden)] pub extern crate regex; #[doc(hidden)] pub extern crate scraper; pub use error::Error; pub mod error; /// Generates structures that implement [`HtmlExtractor`]. /// /// # Syntax /// /// ## Defining structures /// In this macro, zero or more structures can be defined. 
/// /// Attributes can be attached to the structures, but currently attributes that may remove the structures (like `#[cfg]`) will not work. /// ```no_run /// # use html_extractor::html_extractor; /// # fn main() {} /// html_extractor! { /// //private structure /// Foo { /// //fields... /// } /// //any visibilities and some attributes can be used /// #[derive(Debug, Clone)] /// pub(crate) Bar { /// //fields... /// } /// } /// ``` /// /// ## Defining fields in structures /// There are two types of fields, "single field" and "tuple field". /// Tuple fields are used to [capture data with regex](#capture-specifier). /// /// Each field definition has a declaration part and an [extractor](#extractor-part-of-field-definitions) part. /// /// Attributes can be attached to the fields, but currently attributes that may remove the fields (like `#[cfg]`) will not work. /// ```no_run /// # use html_extractor::html_extractor; /// # fn main() {} /// html_extractor! { /// Foo { /// //single field /// pub foo: usize = (text of "#foo"), /// //^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^ /// // declaration extractor /// /// //tuple field /// (pub bar: usize, pub baz: usize) = (text of "#bar-baz", capture with "bar=(.*),baz=(.*)"), /// //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ /// // declaration extractor /// } /// } /// ``` /// /// ## Extractor part of field definitions /// The extractor part of field definitions specifies how to extract data from HTML. /// Extractor consists of [Target](#target-specifier), [Capture](#capture-specifier), [Collector](#collector-specifier) and [Parser](#parser-specifier) specifier. /// /// The order of specifiers does not matter. If the same specifier is written multiple times, the one given later applies. /// ### Target specifier /// Target specifier specifies a selector to select an element (or elements) and what of the selected element is extracted. 
/// /// If the specified selector is invalid, it will be a compile error. /// If `text of ..` or `attr[..] of ..` is used, the type of field must implement [`FromStr`](std::str::FromStr). /// If `elem of ..` is used, the type of field must implement [`HtmlExtractor`]. /// If `text of ..` is used, leading and trailing whitespace is removed from the extracted string. /// If `presence of ..` is used, the type must be `bool` and no other specifier can be used. /// ``` /// use html_extractor::{html_extractor, HtmlExtractor}; /// html_extractor! { /// #[derive(Debug, PartialEq)] /// Foo { /// // extracts the first text node in the element that first matched the selector "#foo" /// foo: usize = (text of "#foo"), /// // extracts the third text node in the element that first matched the selector "#bar" /// bar: usize = (text[2] of "#bar"), /// // extracts attribute "data-baz" in the element that first matched the selector "#baz" /// baz: usize = (attr["data-baz"] of "#baz"), /// // extracts an element that first matched the selector "#qux" and parses it with `HtmlExtractor::extract()` /// qux: Qux = (elem of "#qux"), /// // extracts inner HTML of the element that first matched the selector "#grault" /// grault: String = (inner_html of "#grault"), /// // stores whether any element that matches the selector "#garply" exists.
/// garply: bool = (presence of "#garply"), /// } /// #[derive(Debug, PartialEq)] /// Qux { /// corge: usize = (text of "#corge"), /// } /// } /// /// fn main() { /// let input = r#" /// <div id="foo">1</div> /// <div id="bar">ignore first<br>ignore second<br>2</div> /// <div id="baz" data-baz="3"></div> /// <div id="qux"> /// <div id="corge">4</div> /// </div> /// <div id="grault"> /// inner<br>html /// </div> /// "#; /// let foo = Foo::extract_from_str(input).unwrap(); /// assert_eq!(foo, Foo { /// foo: 1, /// bar: 2, /// baz: 3, /// qux: Qux { corge: 4 }, /// grault: "inner<br>html".to_owned(), /// garply: false, /// }); /// } /// ``` /// ### Capture specifier /// Capture specifier specifies a regex that is used to capture desired data from the string that is extracted with target specifier. /// /// The number of captures and the number of tuple elements must be the same. /// /// If the specified regex is invalid, it will be a compile error. /// /// It cannot be used with target specifier `elem of ..`. /// /// If it is used without [collector specifier](#collector-specifier), the field must be a [tuple field](#defining-fields-in-structures). /// If it is used with [collector specifier](#collector-specifier), the type of the field must implement [`FromIterator`](std::iter::FromIterator) of tuples. /// ``` /// use html_extractor::{html_extractor, HtmlExtractor}; /// html_extractor! 
{ /// #[derive(Debug, PartialEq)] /// Foo { /// // extracts a string from the first text node in the element that matches the selector "#foo-bar", /// // and captures two values from the string with the regex "foo=(.*), bar=(.*)" /// (foo: usize, bar: usize) = (text of "#foo-bar", capture with "foo=(.*), bar=(.*)"), /// /// // extracts strings from the first text node in all elements that match the selector ".baz-qux-corge", /// // captures three values from each string with the regex "baz=(.*), qux=(.*), corge=(.*)", /// // and collects into `Vec<(usize, usize, usize)>` /// baz_qux_corge: Vec<(usize, usize, usize)> = (text of ".baz-qux-corge", capture with "baz=(.*), qux=(.*), corge=(.*)", collect), /// } /// } /// /// fn main() { /// let input = r#" /// <div id="foo-bar">foo=1, bar=2</div> /// /// <div class="baz-qux-corge">baz=1, qux=2, corge=3</div> /// <div class="baz-qux-corge">baz=4, qux=5, corge=6</div> /// <div class="baz-qux-corge">baz=7, qux=8, corge=9</div> /// <div class="baz-qux-corge">baz=10, qux=11, corge=12</div> /// "#; /// let foo = Foo::extract_from_str(input).unwrap(); /// assert_eq!(foo, Foo { /// foo: 1, /// bar: 2, /// baz_qux_corge: vec![(1, 2, 3), (4, 5, 6), (7, 8, 9), (10, 11, 12)], /// }); /// } /// ``` /// /// ### Collector specifier /// Collector specifier specifies how to collect HTML elements. /// The default collector is "first", which collects only the first matched element. /// The "collect" collector collects all the elements into the type that implements [`FromIterator`](std::iter::FromIterator). /// The "optional" collector collects the first element if it exists. If not, it emits `None`. /// ``` /// use html_extractor::{html_extractor, HtmlExtractor}; /// html_extractor! { /// #[derive(Debug, PartialEq)] /// Foo { /// // extracts the first text node from each element that matches the selector ".foo", and collects them into `Vec<usize>`.
/// foo: Vec<usize> = (text of ".foo", collect), /// /// // extracts all the elements that match the selector "#bar", /// // parses them with `HtmlExtractor::extract()`, /// // and collects into `Vec<Bar>`. /// bar: Vec<Bar> = (elem of "#bar", collect), /// /// // extracts strings from the first text node in all elements that match the selector ".baz-qux-corge", /// // captures three values from each string with the regex "baz=(.*), qux=(.*), corge=(.*)", /// // and collects into `Vec<(usize, usize, usize)>` /// baz_qux_corge: Vec<(usize, usize, usize)> = (text of ".baz-qux-corge", capture with "baz=(.*), qux=(.*), corge=(.*)", collect), /// /// // optionally extracts the first text node in the first element that matches the selector ".grault". /// grault: Option<usize> = (text of ".grault", optional), /// } /// #[derive(Debug, PartialEq)] /// Bar { /// bar: usize = (text of ".bar-data"), /// } /// } /// /// fn main() { /// let input = r#" /// <div class="foo">1</div> /// <div class="foo">2</div> /// <div class="foo">3</div> /// <div class="foo">4</div> /// /// <div id="bar"><div class="bar-data">1</div></div> /// <div id="bar"><div class="bar-data">2</div></div> /// <div id="bar"><div class="bar-data">3</div></div> /// <div id="bar"><div class="bar-data">4</div></div> /// /// <div class="baz-qux-corge">baz=1, qux=2, corge=3</div> /// <div class="baz-qux-corge">baz=4, qux=5, corge=6</div> /// <div class="baz-qux-corge">baz=7, qux=8, corge=9</div> /// <div class="baz-qux-corge">baz=10, qux=11, corge=12</div> /// "#; /// let foo = Foo::extract_from_str(input).unwrap(); /// assert_eq!(foo, Foo { /// foo: vec![1, 2, 3, 4], /// bar: vec![ /// Bar { bar: 1 }, /// Bar { bar: 2 }, /// Bar { bar: 3 }, /// Bar { bar: 4 }, /// ], /// baz_qux_corge: vec![(1, 2, 3), (4, 5, 6), (7, 8, 9), (10, 11, 12)], /// grault: None, /// }); /// } /// ``` /// ### Parser specifier /// Parser specifier specifies the parser used to parse the extracted string.
/// The default parser is [`::std::str::FromStr::from_str`]. /// The parser must be `Fn(&str) -> Result<_, T> where T: std::fmt::Debug` /// ``` /// use html_extractor::{html_extractor, HtmlExtractor}; /// html_extractor! { /// #[derive(Debug, PartialEq)] /// Foo { /// // extracts using a custom parser. /// foo: usize = (text of "#foo", parse with custom_parser), /// } /// } /// fn custom_parser(input: &str) -> Result<usize, std::num::ParseIntError> { /// input.replace(",", "").parse() /// } /// /// fn main() { /// let input = r#" /// <div id="foo">1,000,000,000</div> /// "#; /// let foo = Foo::extract_from_str(input).unwrap(); /// assert_eq!(foo, Foo { /// foo: 1000000000, /// }); /// } /// ``` /// /// # Usage of the generated structures /// The generated structures implement trait [`HtmlExtractor`]. /// See the document of the trait. pub use html_extractor_macros::html_extractor; /// A trait for extracting data from HTML documents. /// /// It is recommended to use [`html_extractor!`](macro.html_extractor.html) to implement `HtmlExtractor`. pub trait HtmlExtractor where Self: Sized, { /// Extracts data from [`scraper::element_ref::ElementRef`]. fn extract(elem: &scraper::ElementRef) -> Result<Self, Error>; /// Parses HTML string and extracts data from it. fn extract_from_str(html_str: &str) -> Result<Self, Error> { let html = scraper::Html::parse_document(html_str); HtmlExtractor::extract(&html.root_element()) } } #[cfg(test)] mod test;