crusty_core/
config.rs

1use ipnet::Ipv4Net;
2use serde::{de, Deserialize, Deserializer};
3
4use crate::{
5	_prelude::*,
6	resolver::{AsyncTrustDnsResolver, Resolver, RESERVED_V4_SUBNETS, RESERVED_V6_SUBNETS},
7	types,
8};
9
10type Result<T> = types::Result<T>;
11
12#[derive(Clone, Debug)]
13pub struct CLevel(pub Level);
14
15impl Deref for CLevel {
16	type Target = Level;
17
18	fn deref(&self) -> &Self::Target {
19		&self.0
20	}
21}
22
23impl<'de> Deserialize<'de> for CLevel {
24	fn deserialize<D: Deserializer<'de>>(deserializer: D) -> std::result::Result<CLevel, D::Error> {
25		let s: String = Deserialize::deserialize(deserializer)?;
26		Level::from_str(&s).map(CLevel).map_err(de::Error::custom)
27	}
28}
29
/// Newtype around [`Ipv4Addr`] so that an IPv4 address can be deserialized
/// from its textual form in configuration.
#[derive(Debug, Clone)]
pub struct CIP4Addr(pub Ipv4Addr);

impl Deref for CIP4Addr {
	type Target = Ipv4Addr;

	/// Borrows the wrapped [`Ipv4Addr`].
	fn deref(&self) -> &Ipv4Addr {
		let Self(addr) = self;
		addr
	}
}
40
41impl<'de> Deserialize<'de> for CIP4Addr {
42	fn deserialize<D: Deserializer<'de>>(deserializer: D) -> std::result::Result<CIP4Addr, D::Error> {
43		let s: String = Deserialize::deserialize(deserializer)?;
44		let addr: Ipv4Addr = s.parse().map_err(de::Error::custom)?;
45		Ok(CIP4Addr(addr))
46	}
47}
48
/// Newtype around [`Ipv6Addr`] so that an IPv6 address can be deserialized
/// from its textual form in configuration.
#[derive(Debug, Clone)]
pub struct CIP6Addr(pub Ipv6Addr);

impl Deref for CIP6Addr {
	type Target = Ipv6Addr;

	/// Borrows the wrapped [`Ipv6Addr`].
	fn deref(&self) -> &Ipv6Addr {
		let Self(addr) = self;
		addr
	}
}
59
60impl<'de> Deserialize<'de> for CIP6Addr {
61	fn deserialize<D: Deserializer<'de>>(deserializer: D) -> std::result::Result<CIP6Addr, D::Error> {
62		let s: String = Deserialize::deserialize(deserializer)?;
63		let addr: Ipv6Addr = s.parse().map_err(de::Error::custom)?;
64		Ok(CIP6Addr(addr))
65	}
66}
67
68#[derive(Clone, Debug)]
69pub struct CIpv4Net(pub Ipv4Net);
70
71impl Deref for CIpv4Net {
72	type Target = Ipv4Net;
73
74	fn deref(&self) -> &Self::Target {
75		&self.0
76	}
77}
78
79impl<'de> Deserialize<'de> for CIpv4Net {
80	fn deserialize<D: Deserializer<'de>>(deserializer: D) -> std::result::Result<CIpv4Net, D::Error> {
81		let s: String = Deserialize::deserialize(deserializer)?;
82		s.parse::<Ipv4Net>().map(CIpv4Net).map_err(de::Error::custom)
83	}
84}
85
/// Newtype around a byte count (`usize`) so that sizes can be deserialized
/// from a human-readable string in configuration.
#[derive(Debug, Clone)]
pub struct CBytes(pub usize);

impl Deref for CBytes {
	type Target = usize;

	/// Borrows the wrapped byte count.
	fn deref(&self) -> &usize {
		let Self(size) = self;
		size
	}
}
96
97impl<'de> Deserialize<'de> for CBytes {
98	fn deserialize<D: Deserializer<'de>>(deserializer: D) -> std::result::Result<CBytes, D::Error> {
99		let s: String = Deserialize::deserialize(deserializer)?;
100		let s = s.replace("_", "");
101		let v = s.parse::<humanize_rs::bytes::Bytes>();
102		let r = v.map_err(de::Error::custom)?;
103		Ok(CBytes(r.size()))
104	}
105}
106
/// Newtype around [`Duration`] so that time spans can be deserialized from a
/// human-readable string in configuration.
///
/// The inner field is private; build values with [`CDuration::from_secs`] or
/// [`CDuration::from_millis`] and read them through `Deref`.
#[derive(Debug, Clone)]
pub struct CDuration(Duration);

impl Deref for CDuration {
	type Target = Duration;

	/// Borrows the wrapped [`Duration`].
	fn deref(&self) -> &Duration {
		let Self(span) = self;
		span
	}
}

impl CDuration {
	/// Wraps a duration of `secs` whole seconds.
	pub fn from_secs(secs: u64) -> Self {
		let span = Duration::from_secs(secs);
		Self(span)
	}

	/// Wraps a duration of `millis` milliseconds.
	pub fn from_millis(millis: u64) -> Self {
		let span = Duration::from_millis(millis);
		Self(span)
	}
}
127
128impl<'de> Deserialize<'de> for CDuration {
129	fn deserialize<D: Deserializer<'de>>(deserializer: D) -> std::result::Result<CDuration, D::Error> {
130		let s: String = Deserialize::deserialize(deserializer)?;
131		let s = s.replace("_", "");
132		let v = humanize_rs::duration::parse(&s);
133		let r = v.map_err(de::Error::custom)?;
134		Ok(CDuration(r))
135	}
136}
137
138pub type AsyncTrustDnsResolverConfig = trust_dns_resolver::config::ResolverConfig;
139pub struct AsyncTrustDnsResolverOptsMapping(trust_dns_resolver::config::ResolverOpts);
140
141impl Deref for AsyncTrustDnsResolverOptsMapping {
142	type Target = trust_dns_resolver::config::ResolverOpts;
143
144	fn deref(&self) -> &Self::Target {
145		&self.0
146	}
147}
148
149#[derive(Clone, Debug, Deserialize)]
150#[serde(default)]
151#[serde(deny_unknown_fields)]
152pub struct AsyncTrustDnsResolverOpts {
153	pub ndots:                  usize,
154	pub timeout:                CDuration,
155	pub attempts:               usize,
156	pub rotate:                 bool,
157	pub check_names:            bool,
158	pub edns0:                  bool,
159	pub validate:               bool,
160	pub ip_strategy:            trust_dns_resolver::config::LookupIpStrategy,
161	pub cache_size:             usize,
162	pub use_hosts_file:         bool,
163	pub positive_min_ttl:       Option<CDuration>,
164	pub negative_min_ttl:       Option<CDuration>,
165	pub positive_max_ttl:       Option<CDuration>,
166	pub negative_max_ttl:       Option<CDuration>,
167	pub num_concurrent_reqs:    usize,
168	pub preserve_intermediates: bool,
169}
170
171impl From<AsyncTrustDnsResolverOpts> for AsyncTrustDnsResolverOptsMapping {
172	fn from(p: AsyncTrustDnsResolverOpts) -> AsyncTrustDnsResolverOptsMapping {
173		AsyncTrustDnsResolverOptsMapping(trust_dns_resolver::config::ResolverOpts {
174			ndots:                  p.ndots,
175			timeout:                *p.timeout,
176			attempts:               p.attempts,
177			rotate:                 p.rotate,
178			check_names:            p.check_names,
179			edns0:                  p.edns0,
180			validate:               p.validate,
181			ip_strategy:            p.ip_strategy,
182			cache_size:             p.cache_size,
183			use_hosts_file:         p.use_hosts_file,
184			positive_min_ttl:       p.positive_min_ttl.map(|v| *v),
185			negative_min_ttl:       p.negative_min_ttl.map(|v| *v),
186			positive_max_ttl:       p.positive_max_ttl.map(|v| *v),
187			negative_max_ttl:       p.negative_max_ttl.map(|v| *v),
188			num_concurrent_reqs:    p.num_concurrent_reqs,
189			preserve_intermediates: p.preserve_intermediates,
190		})
191	}
192}
193
194impl From<AsyncTrustDnsResolverOptsMapping> for AsyncTrustDnsResolverOpts {
195	fn from(p: AsyncTrustDnsResolverOptsMapping) -> AsyncTrustDnsResolverOpts {
196		AsyncTrustDnsResolverOpts {
197			ndots:                  p.ndots,
198			timeout:                CDuration(p.timeout),
199			attempts:               p.attempts,
200			rotate:                 p.rotate,
201			check_names:            p.check_names,
202			edns0:                  p.edns0,
203			validate:               p.validate,
204			ip_strategy:            p.ip_strategy,
205			cache_size:             p.cache_size,
206			use_hosts_file:         p.use_hosts_file,
207			positive_min_ttl:       p.positive_min_ttl.map(CDuration),
208			negative_min_ttl:       p.negative_min_ttl.map(CDuration),
209			positive_max_ttl:       p.positive_max_ttl.map(CDuration),
210			negative_max_ttl:       p.negative_max_ttl.map(CDuration),
211			num_concurrent_reqs:    p.num_concurrent_reqs,
212			preserve_intermediates: p.preserve_intermediates,
213		}
214	}
215}
216
217impl Default for AsyncTrustDnsResolverOpts {
218	fn default() -> Self {
219		AsyncTrustDnsResolverOptsMapping(trust_dns_resolver::config::ResolverOpts::default()).into()
220	}
221}
222
223#[derive(Clone, Debug, Deserialize)]
224#[serde(deny_unknown_fields, default)]
225pub struct ParserProfile {
226	pub concurrency: usize,
227	pub pin:         usize,
228	pub stack_size:  Option<CBytes>,
229}
230
231impl Default for ParserProfile {
232	fn default() -> Self {
233		let physical_cores = num_cpus::get_physical();
234		Self { concurrency: physical_cores, pin: 0, stack_size: Some(CBytes(128 * 1024 * 1024)) }
235	}
236}
237
238#[derive(Clone, Debug, Deserialize)]
239#[serde(deny_unknown_fields)]
240pub struct ConcurrencyProfile {
241	pub domain_concurrency: usize,
242}
243
244impl Default for ConcurrencyProfile {
245	fn default() -> Self {
246		let physical_cores = num_cpus::get_physical();
247		Self { domain_concurrency: physical_cores * 40 }
248	}
249}
250
251impl ConcurrencyProfile {
252	pub fn transit_buffer_size(&self) -> usize {
253		self.domain_concurrency * 10
254	}
255
256	pub fn job_tx_buffer_size(&self) -> usize {
257		self.domain_concurrency
258	}
259
260	pub fn job_update_buffer_size(&self) -> usize {
261		self.domain_concurrency * 2
262	}
263}
264
265#[derive(Clone, Debug, Deserialize)]
266#[serde(deny_unknown_fields)]
267pub struct NetworkingProfileValues {
268	pub connect_timeout:                Option<CDuration>,
269	pub socket_read_buffer_size:        Option<CBytes>,
270	pub socket_write_buffer_size:       Option<CBytes>,
271	pub static_binding_within_the_task: bool,
272	pub bind_local_ipv4:                Vec<CIP4Addr>,
273	pub bind_local_ipv6:                Vec<CIP6Addr>,
274}
275
276impl NetworkingProfileValues {
277	pub fn rand_bind_local_ipv4(&self) -> Option<Ipv4Addr> {
278		if !self.bind_local_ipv4.is_empty() {
279			if self.bind_local_ipv4.len() == 1 {
280				Some(*self.bind_local_ipv4[0])
281			} else {
282				let mut rng = thread_rng();
283				Some(*self.bind_local_ipv4[rng.gen_range(0..self.bind_local_ipv4.len())])
284			}
285		} else {
286			None
287		}
288	}
289
290	pub fn rand_bind_local_ipv6(&self) -> Option<Ipv6Addr> {
291		if !self.bind_local_ipv6.is_empty() {
292			if self.bind_local_ipv6.len() == 1 {
293				Some(*self.bind_local_ipv6[0])
294			} else {
295				let mut rng = thread_rng();
296				Some(*self.bind_local_ipv6[rng.gen_range(0..self.bind_local_ipv6.len())])
297			}
298		} else {
299			None
300		}
301	}
302}
303
304impl Default for NetworkingProfileValues {
305	fn default() -> Self {
306		Self {
307			connect_timeout:                Some(CDuration::from_secs(5)),
308			socket_write_buffer_size:       Some(CBytes(32 * 1024)),
309			socket_read_buffer_size:        Some(CBytes(32 * 1024)),
310			static_binding_within_the_task: true,
311			bind_local_ipv4:                vec![],
312			bind_local_ipv6:                vec![],
313		}
314	}
315}
316
317#[derive(Clone, Debug, Default, Deserialize)]
318#[serde(deny_unknown_fields)]
319pub struct ResolverConfig {
320	#[serde(default)]
321	config:  Option<AsyncTrustDnsResolverConfig>,
322	#[serde(default)]
323	options: Option<AsyncTrustDnsResolverOpts>,
324}
325
326#[derive(Clone, Debug, Deserialize, Default)]
327#[serde(deny_unknown_fields)]
328pub struct NetworkingProfile {
329	pub profile:          NetworkingProfileValues,
330	pub resolver:         ResolverConfig,
331	pub net_v4_blacklist: Vec<CIpv4Net>,
332}
333
334impl NetworkingProfile {
335	pub fn resolve(self) -> Result<ResolvedNetworkingProfile> {
336		ResolvedNetworkingProfile::new(self)
337	}
338
339	pub fn resolve_custom_resolver(self, resolver: Box<dyn Resolver>) -> Result<ResolvedNetworkingProfile> {
340		Ok(ResolvedNetworkingProfile::new_with_resolver(self, resolver))
341	}
342}
343
344#[derive(Clone, Debug)]
345pub struct ResolvedNetworkingProfile {
346	pub profile: NetworkingProfileValues,
347
348	pub resolver: Arc<Box<dyn Resolver>>,
349}
350
351impl ResolvedNetworkingProfile {
352	fn new_with_resolver(p: NetworkingProfile, resolver: Box<dyn Resolver>) -> Self {
353		Self { profile: p.profile, resolver: Arc::new(resolver) }
354	}
355
356	fn new(p: NetworkingProfile) -> Result<Self> {
357		let (config, options) = if let (Some(config), Some(options)) = (p.resolver.config, p.resolver.options) {
358			(config, options.into())
359		} else {
360			let (config, options) = trust_dns_resolver::system_conf::read_system_conf()
361				.context("cannot read resolver config settings from system")?;
362			(config, AsyncTrustDnsResolverOptsMapping(options))
363		};
364
365		let mut resolver = AsyncTrustDnsResolver::new(config, *options).context("cannot create default resolver")?;
366
367		let reserved_v4 = RESERVED_V4_SUBNETS
368			.clone()
369			.into_iter()
370			.chain(p.net_v4_blacklist.into_iter().map(|a| *a))
371			.collect::<Vec<_>>();
372		resolver.with_net_v4_blacklist(reserved_v4);
373		resolver.with_net_v6_blacklist(RESERVED_V6_SUBNETS.to_vec());
374		Ok(Self { profile: p.profile, resolver: Arc::new(Box::new(resolver)) })
375	}
376}
377
378#[derive(Clone, Debug, Deserialize)]
379#[serde(deny_unknown_fields)]
380pub struct CrawlingSettings {
381	pub internal_read_buffer_size: CBytes,
382	pub concurrency:               usize,
383	pub max_response_size:         CBytes,
384	pub delay:                     CDuration,
385	pub delay_jitter:              CDuration,
386	pub status_timeout:            CDuration,
387	pub load_timeout:              CDuration,
388	pub job_soft_timeout:          CDuration,
389	pub job_hard_timeout:          CDuration,
390	pub job_hard_timeout_jitter:   CDuration,
391	pub custom_headers:            HashMap<String, Vec<String>>,
392	pub user_agent:                Option<String>,
393	pub compression:               bool,
394}
395
396impl fmt::Display for CrawlingSettings {
397	fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
398		write!(
399			f,
400			"concurrency: {}, delay: {:?}, job hard timeout: {:?}+~{:?}, job soft timeout: {:?}, irbs: {:?}, load timeout: {:?}, max_response_size: {:?}, custom headers: {:?}",
401			self.concurrency, self.delay, self.job_hard_timeout,  self.job_hard_timeout_jitter, self.job_soft_timeout, self.internal_read_buffer_size, self.load_timeout, self.max_response_size, self.custom_headers,
402		)
403	}
404}
405
406impl Default for CrawlingSettings {
407	fn default() -> Self {
408		Self {
409			concurrency:               2,
410			internal_read_buffer_size: CBytes(32 * 1024),
411			delay:                     CDuration::from_secs(1),
412			delay_jitter:              CDuration::from_secs(1),
413			job_hard_timeout:          CDuration::from_secs(360),
414			job_hard_timeout_jitter:   CDuration::from_secs(72),
415			job_soft_timeout:          CDuration::from_secs(240),
416			status_timeout:            CDuration::from_secs(5),
417			load_timeout:              CDuration::from_secs(10),
418			user_agent:                Some(String::from("crusty-core/0.82.0")),
419			compression:               true,
420			custom_headers:            HashMap::new(),
421			max_response_size:         CBytes(1024 * 1024 * 2),
422		}
423		.build_headers()
424	}
425}
426
427impl CrawlingSettings {
428	pub fn build_headers(mut self) -> Self {
429		if let Some(user_agent) = &self.user_agent {
430			self.custom_headers.insert(http::header::USER_AGENT.to_string(), vec![user_agent.clone()]);
431		}
432		if self.compression {
433			self.custom_headers.insert(http::header::ACCEPT_ENCODING.to_string(), vec!["gzip, deflate".into()]);
434		}
435		self.custom_headers.insert(
436			http::header::ACCEPT.to_string(),
437			vec!["text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9".into()],
438		);
439		self
440	}
441}