gar_crawl/
crawler_builder.rs

1use crate::absolute_url;
2
3use super::crawler::*;
4use super::handler::*;
5use anyhow::Result;
6use reqwest::{Client, Url};
7use std::collections::HashMap;
8use std::fs::File;
9use std::io::Read;
10use std::marker::Send;
11
12/// Builder object for Crawler, fields are left public
13pub struct CrawlerBuilder<'a> {
14    pub client_builder: reqwest::ClientBuilder,
15    pub handlers: HashMap<HandlerEvent, Vec<Handler<'a>>>,
16    pub propagators: HashMap<HandlerEvent, Vec<Propagator<'a>>>,
17    pub depth: usize,
18    pub workers: usize,
19    pub blacklist: Vec<String>,
20    pub whitelist: Vec<String>,
21    pub revisit: bool,
22}
23
24impl<'a> CrawlerBuilder<'a> {
25    pub fn new() -> Self {
26        Self {
27            client_builder: Client::builder(),
28            handlers: HashMap::new(),
29            propagators: HashMap::new(),
30            depth: 2,
31            workers: 40,
32            whitelist: vec![],
33            blacklist: vec![],
34            revisit: false,
35        }
36    }
37
38    /// Consume the Builder and produce a Crawler
39    pub fn build(self) -> Result<Crawler<'a>> {
40        Crawler::from_builder(self)
41    }
42
43    /// Don't crawl a Url containing `expr`
44    pub fn blacklist(mut self, expr: &str) -> Self {
45        self.blacklist.push(expr.to_string());
46        self
47    }
48
49    /// Only crawl Urls containing `expr`
50    pub fn whitelist(mut self, expr: &str) -> Self {
51        self.whitelist.push(expr.to_string());
52        self
53    }
54
55    /// Set the crawl depth ( default: 2 )
56    pub fn depth(mut self, depth: usize) -> Self {
57        self.depth = depth;
58        self
59    }
60
61    /// Revisit pages ( default: false )
62    pub fn revisit(mut self, revisit: bool) -> Self {
63        self.revisit = revisit;
64        self
65    }
66
67    /// Set the concurrency limit ( default: 40 )
68    pub fn workers(mut self, limit: usize) -> Self {
69        self.workers = limit;
70        self
71    }
72    /// Set the user agent
73    pub fn user_agent(mut self, user_agent: &'a str) -> Self {
74        self.client_builder = self.client_builder.user_agent(user_agent.to_string());
75        self
76    }
77
78    /// Set an https proxy with a cacert.der file
79    pub fn proxy(mut self, proxy_str: &str, ca_cert: &str) -> Result<Self> {
80        let mut buf = Vec::new();
81        File::open(ca_cert)?.read_to_end(&mut buf)?;
82        let cert = reqwest::Certificate::from_der(&buf)?;
83
84        let proxy = reqwest::Proxy::all(proxy_str)?;
85
86        self.client_builder = self.client_builder.add_root_certificate(cert).proxy(proxy);
87        Ok(self)
88    }
89
90    /// Set the request timeout
91    pub fn timeout(mut self, seconds: u64, nanoseconds: u32) -> Self {
92        self.client_builder = self
93            .client_builder
94            .timeout(std::time::Duration::new(seconds, nanoseconds));
95        self
96    }
97
98    /// Add a handler  
99    /// Closure type: `FnMut(&HandlerArgs)`  
100    pub fn on_page<F>(mut self, closure: F) -> Self
101    where
102        F: FnMut(&HandlerArgs) + Send + Sync + 'a,
103    {
104        let closure: Box<dyn FnMut(&HandlerArgs) + Send + Sync + 'a> = Box::new(closure);
105        if let Some(handlers) = self.handlers.get_mut(&HandlerEvent::OnPage) {
106            handlers.push(closure)
107        } else {
108            self.handlers.insert(HandlerEvent::OnPage, vec![closure]);
109        }
110        self
111    }
112
113    /// Add a propagator  
114    /// Closure type: `FnMut(&HandlerArgs) -> Vec<Url>`  
115    pub fn on_page_propagator<F>(mut self, closure: F) -> Self
116    where
117        F: FnMut(&HandlerArgs) -> Vec<Url> + Send + Sync + 'a,
118    {
119        let closure: Propagator = Box::new(closure);
120        if let Some(propagators) = self.propagators.get_mut(&HandlerEvent::OnPage) {
121            propagators.push(closure)
122        } else {
123            self.propagators.insert(HandlerEvent::OnPage, vec![closure]);
124        }
125        self
126    }
127
128    /// Add a handler  
129    /// Closure type: `FnMut(&HandlerArgs)`  
130    pub fn add_handler<F>(mut self, sel: &str, closure: F) -> Self
131    where
132        F: FnMut(&HandlerArgs) + Send + Sync + 'a,
133    {
134        let sel = sel.to_string();
135        let closure: Box<dyn FnMut(&HandlerArgs) + Send + Sync + 'a> = Box::new(closure);
136        if let Some(handlers) = self
137            .handlers
138            .get_mut(&HandlerEvent::OnSelector(sel.clone()))
139        {
140            handlers.push(closure)
141        } else {
142            self.handlers
143                .insert(HandlerEvent::OnSelector(sel.clone()), vec![closure]);
144        }
145        self
146    }
147
148    /// Add a propagator  
149    /// Closure type: `FnMut(&HandlerArgs) -> Vec<Url>`
150    pub fn add_propagator<F>(mut self, sel: &str, closure: F) -> Self
151    where
152        F: FnMut(&HandlerArgs) -> Vec<Url> + 'a + Send + Sync,
153    {
154        let sel = sel.to_string();
155        let closure: Box<dyn FnMut(&HandlerArgs) -> Vec<Url> + Send + Sync + 'a> =
156            Box::new(closure);
157        if let Some(propagators) = self
158            .propagators
159            .get_mut(&HandlerEvent::OnSelector(sel.clone()))
160        {
161            propagators.push(closure)
162        } else {
163            self.propagators
164                .insert(HandlerEvent::OnSelector(sel), vec![closure]);
165        }
166        self
167    }
168
169    /// Propagate on all href and src attributes  
170    /// NOTE: "scheme://domain.tld/path" and "scheme://domain.tld/path/" may behave differently,  
171    /// see <https://docs.rs/reqwest/0.10.8/reqwest/struct.Url.html#method.join> for info.
172    pub fn add_default_propagators(mut self) -> Self {
173        let href_prop = |args: &HandlerArgs| -> Vec<Url> {
174            if let Some(href) = args.element.unwrap().value().attr("href") {
175                if let Ok(url) = absolute_url(&args.page.url, href) {
176                    return vec![url];
177                }
178            }
179            vec![]
180        };
181
182        let src_prop = |args: &HandlerArgs| -> Vec<Url> {
183            if let Some(href) = args.element.unwrap().value().attr("src") {
184                if let Ok(url) = absolute_url(&args.page.url, href) {
185                    return vec![url];
186                }
187            }
188            vec![]
189        };
190
191        self = self.add_propagator("*[href]", href_prop);
192        self = self.add_propagator("*[src]", src_prop);
193
194        self
195    }
196}