mlscraper_rust/lib.rs
1//! Tool for scraping structured data from webpages automatically.
2//!
3//! This project is inspired by the python package [mlscraper](https://github.com/lorey/mlscraper).
4//! See README.md for a comparison with the python version and example code.
5//!
6//! Quick example:
7//!
8//! ```
9//! # use mlscraper_rust::search::AttributeBuilder;
10//! let html = reqwest::blocking::get("http://quotes.toscrape.com/author/Albert-Einstein/")
11//! .expect("request") // Scrappy error handling for demonstration purposes
12//! .text()
13//! .expect("text");
14//!
15//! let result = mlscraper_rust::train(
16//! vec![html.as_str()],
17//! vec![
18//! AttributeBuilder::new("name")
19//! .values(&[Some("Albert Einstein")])
20//! .build(),
21//!
22//! AttributeBuilder::new("born")
23//! .values(&[Some("March 14, 1879")])
24//! .build(),
25//! ],
26//! Default::default(),
27//! 1
28//! ).expect("training");
29//!
30//! // Prints `{"born": .author-born-date, "name": h3}`
31//! println!("{:?}", result.selectors());
32//! ```
33
34extern crate tl;
35
36pub mod search;
37pub mod selectors;
38pub mod util;
39
40use crate::search::*;
41use anyhow::Result;
42use rand::rngs::SmallRng;
43use rand::Rng;
44use rand::SeedableRng;
45
46/// Find suitable selectors for `attributes` in HTML documents `documents`.
47///
48/// The number of `iterations`
49/// is the number of generations the fuzzing algorithm should produce.
50/// In our experience, a very low number (1-3) of iterations should be
51/// sufficient for most input HTML documents. If a document has a very
52/// deep, nested structure, a higher number of iterations may be necessary.
53///
54/// Further settings can be adjusted with [`FuzzerSettings`]. If the generated
55/// selectors are not satisfactory, you can experiment with increasing the
56/// `random_generation_count`, `random_generation_retries` and other settings.
57/// Note that this may impact the training time.
58///
59/// The returned `TrainingResult` can be used to retrieve the generated
60/// selectors or to automatically extract information from previously
61/// unseen documents.
62pub fn train<'a, S: Into<&'a str>>(
63 documents: Vec<S>,
64 attributes: Vec<Attribute<'a>>,
65 settings: FuzzerSettings,
66 iterations: usize,
67) -> Result<TrainingResult> {
68 let mut rng = SmallRng::from_entropy();
69
70 train_with_rng(documents, attributes, settings, iterations, &mut rng)
71}
72
73/// Same as [`train`], but with a custom random number generator ([`Rng`]).
74pub fn train_with_rng<'a, R: Rng, S: Into<&'a str>>(
75 mut documents: Vec<S>,
76 attributes: Vec<Attribute<'a>>,
77 settings: FuzzerSettings,
78 iterations: usize,
79 rng: &mut R,
80) -> Result<TrainingResult> {
81 let doms = documents
82 .drain(..)
83 .map(|doc| {
84 tl::parse(doc.into(), tl::ParserOptions::default()).expect("HTML parsing failed")
85 })
86 .collect();
87 let mut training = Training::with_settings(doms, attributes, settings)?;
88
89 for _ in 0..iterations {
90 training.do_one_fuzzing_round(rng);
91 }
92
93 Ok(training.to_result())
94}