mlscraper_rust/
lib.rs

1//! Tool for scraping structured data from webpages automatically.
2//!
3//! This project is inspired by the python package [mlscraper](https://github.com/lorey/mlscraper).
4//! See README.md for a comparison with the python version and example code.
5//!
6//! Quick example:
7//!
8//! ```
9//! # use mlscraper_rust::search::AttributeBuilder;
10//! let html = reqwest::blocking::get("http://quotes.toscrape.com/author/Albert-Einstein/")
11//!     .expect("request") // Scrappy error handling for demonstration purposes
12//!     .text()
13//!     .expect("text");
14//!
15//! let result = mlscraper_rust::train(
16//!     vec![html.as_str()],
17//!     vec![
18//!         AttributeBuilder::new("name")
19//!             .values(&[Some("Albert Einstein")])
20//!             .build(),
21//!
22//!         AttributeBuilder::new("born")
23//!             .values(&[Some("March 14, 1879")])
24//!             .build(),
25//!     ],
26//!     Default::default(),
27//!     1
28//! ).expect("training");
29//!
30//! // Prints `{"born": .author-born-date, "name": h3}`
31//! println!("{:?}", result.selectors());
32//! ```
33
34extern crate tl;
35
36pub mod search;
37pub mod selectors;
38pub mod util;
39
40use crate::search::*;
41use anyhow::Result;
42use rand::rngs::SmallRng;
43use rand::Rng;
44use rand::SeedableRng;
45
46/// Find suitable selectors for `attributes` in HTML documents `documents`.
47///
48/// The number of `iterations`
49/// is the number of generations the fuzzing algorithm should produce.
50/// In our experience, a very low number (1-3) of iterations should be
51/// sufficient for most input HTML documents. If a document has a very
52/// deep, nested structure, a higher number of iterations may be necessary.
53///
54/// Further settings can be adjusted with [`FuzzerSettings`]. If the generated
55/// selectors are not satisfactory, you can experiment with increasing the
56/// `random_generation_count`, `random_generation_retries` and other settings.
57/// Note that this may impact the training time.
58///
59/// The returned `TrainingResult` can be used to retrieve the generated
60/// selectors or to automatically extract information from previously
61/// unseen documents.
62pub fn train<'a, S: Into<&'a str>>(
63    documents: Vec<S>,
64    attributes: Vec<Attribute<'a>>,
65    settings: FuzzerSettings,
66    iterations: usize,
67) -> Result<TrainingResult> {
68    let mut rng = SmallRng::from_entropy();
69
70    train_with_rng(documents, attributes, settings, iterations, &mut rng)
71}
72
73/// Same as [`train`], but with a custom random number generator ([`Rng`]).
74pub fn train_with_rng<'a, R: Rng, S: Into<&'a str>>(
75    mut documents: Vec<S>,
76    attributes: Vec<Attribute<'a>>,
77    settings: FuzzerSettings,
78    iterations: usize,
79    rng: &mut R,
80) -> Result<TrainingResult> {
81    let doms = documents
82        .drain(..)
83        .map(|doc| {
84            tl::parse(doc.into(), tl::ParserOptions::default()).expect("HTML parsing failed")
85        })
86        .collect();
87    let mut training = Training::with_settings(doms, attributes, settings)?;
88
89    for _ in 0..iterations {
90        training.do_one_fuzzing_round(rng);
91    }
92
93    Ok(training.to_result())
94}