gar_crawl/
crawler_builder.rs1use crate::absolute_url;
2
3use super::crawler::*;
4use super::handler::*;
5use anyhow::Result;
6use reqwest::{Client, Url};
7use std::collections::HashMap;
8use std::fs::File;
9use std::io::Read;
10use std::marker::Send;
11
12pub struct CrawlerBuilder<'a> {
14 pub client_builder: reqwest::ClientBuilder,
15 pub handlers: HashMap<HandlerEvent, Vec<Handler<'a>>>,
16 pub propagators: HashMap<HandlerEvent, Vec<Propagator<'a>>>,
17 pub depth: usize,
18 pub workers: usize,
19 pub blacklist: Vec<String>,
20 pub whitelist: Vec<String>,
21 pub revisit: bool,
22}
23
24impl<'a> CrawlerBuilder<'a> {
25 pub fn new() -> Self {
26 Self {
27 client_builder: Client::builder(),
28 handlers: HashMap::new(),
29 propagators: HashMap::new(),
30 depth: 2,
31 workers: 40,
32 whitelist: vec![],
33 blacklist: vec![],
34 revisit: false,
35 }
36 }
37
38 pub fn build(self) -> Result<Crawler<'a>> {
40 Crawler::from_builder(self)
41 }
42
43 pub fn blacklist(mut self, expr: &str) -> Self {
45 self.blacklist.push(expr.to_string());
46 self
47 }
48
49 pub fn whitelist(mut self, expr: &str) -> Self {
51 self.whitelist.push(expr.to_string());
52 self
53 }
54
55 pub fn depth(mut self, depth: usize) -> Self {
57 self.depth = depth;
58 self
59 }
60
61 pub fn revisit(mut self, revisit: bool) -> Self {
63 self.revisit = revisit;
64 self
65 }
66
67 pub fn workers(mut self, limit: usize) -> Self {
69 self.workers = limit;
70 self
71 }
72 pub fn user_agent(mut self, user_agent: &'a str) -> Self {
74 self.client_builder = self.client_builder.user_agent(user_agent.to_string());
75 self
76 }
77
78 pub fn proxy(mut self, proxy_str: &str, ca_cert: &str) -> Result<Self> {
80 let mut buf = Vec::new();
81 File::open(ca_cert)?.read_to_end(&mut buf)?;
82 let cert = reqwest::Certificate::from_der(&buf)?;
83
84 let proxy = reqwest::Proxy::all(proxy_str)?;
85
86 self.client_builder = self.client_builder.add_root_certificate(cert).proxy(proxy);
87 Ok(self)
88 }
89
90 pub fn timeout(mut self, seconds: u64, nanoseconds: u32) -> Self {
92 self.client_builder = self
93 .client_builder
94 .timeout(std::time::Duration::new(seconds, nanoseconds));
95 self
96 }
97
98 pub fn on_page<F>(mut self, closure: F) -> Self
101 where
102 F: FnMut(&HandlerArgs) + Send + Sync + 'a,
103 {
104 let closure: Box<dyn FnMut(&HandlerArgs) + Send + Sync + 'a> = Box::new(closure);
105 if let Some(handlers) = self.handlers.get_mut(&HandlerEvent::OnPage) {
106 handlers.push(closure)
107 } else {
108 self.handlers.insert(HandlerEvent::OnPage, vec![closure]);
109 }
110 self
111 }
112
113 pub fn on_page_propagator<F>(mut self, closure: F) -> Self
116 where
117 F: FnMut(&HandlerArgs) -> Vec<Url> + Send + Sync + 'a,
118 {
119 let closure: Propagator = Box::new(closure);
120 if let Some(propagators) = self.propagators.get_mut(&HandlerEvent::OnPage) {
121 propagators.push(closure)
122 } else {
123 self.propagators.insert(HandlerEvent::OnPage, vec![closure]);
124 }
125 self
126 }
127
128 pub fn add_handler<F>(mut self, sel: &str, closure: F) -> Self
131 where
132 F: FnMut(&HandlerArgs) + Send + Sync + 'a,
133 {
134 let sel = sel.to_string();
135 let closure: Box<dyn FnMut(&HandlerArgs) + Send + Sync + 'a> = Box::new(closure);
136 if let Some(handlers) = self
137 .handlers
138 .get_mut(&HandlerEvent::OnSelector(sel.clone()))
139 {
140 handlers.push(closure)
141 } else {
142 self.handlers
143 .insert(HandlerEvent::OnSelector(sel.clone()), vec![closure]);
144 }
145 self
146 }
147
148 pub fn add_propagator<F>(mut self, sel: &str, closure: F) -> Self
151 where
152 F: FnMut(&HandlerArgs) -> Vec<Url> + 'a + Send + Sync,
153 {
154 let sel = sel.to_string();
155 let closure: Box<dyn FnMut(&HandlerArgs) -> Vec<Url> + Send + Sync + 'a> =
156 Box::new(closure);
157 if let Some(propagators) = self
158 .propagators
159 .get_mut(&HandlerEvent::OnSelector(sel.clone()))
160 {
161 propagators.push(closure)
162 } else {
163 self.propagators
164 .insert(HandlerEvent::OnSelector(sel), vec![closure]);
165 }
166 self
167 }
168
169 pub fn add_default_propagators(mut self) -> Self {
173 let href_prop = |args: &HandlerArgs| -> Vec<Url> {
174 if let Some(href) = args.element.unwrap().value().attr("href") {
175 if let Ok(url) = absolute_url(&args.page.url, href) {
176 return vec![url];
177 }
178 }
179 vec![]
180 };
181
182 let src_prop = |args: &HandlerArgs| -> Vec<Url> {
183 if let Some(href) = args.element.unwrap().value().attr("src") {
184 if let Ok(url) = absolute_url(&args.page.url, href) {
185 return vec![url];
186 }
187 }
188 vec![]
189 };
190
191 self = self.add_propagator("*[href]", href_prop);
192 self = self.add_propagator("*[src]", src_prop);
193
194 self
195 }
196}