1use ipnet::Ipv4Net;
2use serde::{de, Deserialize, Deserializer};
3
4use crate::{
5 _prelude::*,
6 resolver::{AsyncTrustDnsResolver, Resolver, RESERVED_V4_SUBNETS, RESERVED_V6_SUBNETS},
7 types,
8};
9
/// Module-local shorthand for the crate's own result type, `types::Result`.
type Result<T> = types::Result<T>;
11
12#[derive(Clone, Debug)]
13pub struct CLevel(pub Level);
14
15impl Deref for CLevel {
16 type Target = Level;
17
18 fn deref(&self) -> &Self::Target {
19 &self.0
20 }
21}
22
23impl<'de> Deserialize<'de> for CLevel {
24 fn deserialize<D: Deserializer<'de>>(deserializer: D) -> std::result::Result<CLevel, D::Error> {
25 let s: String = Deserialize::deserialize(deserializer)?;
26 Level::from_str(&s).map(CLevel).map_err(de::Error::custom)
27 }
28}
29
/// Newtype around [`Ipv4Addr`] so that an IPv4 address can be read from a configuration string.
#[derive(Clone, Debug)]
pub struct CIP4Addr(pub Ipv4Addr);

impl Deref for CIP4Addr {
    type Target = Ipv4Addr;

    /// Borrows the wrapped [`Ipv4Addr`].
    fn deref(&self) -> &Ipv4Addr {
        let Self(addr) = self;
        addr
    }
}
40
41impl<'de> Deserialize<'de> for CIP4Addr {
42 fn deserialize<D: Deserializer<'de>>(deserializer: D) -> std::result::Result<CIP4Addr, D::Error> {
43 let s: String = Deserialize::deserialize(deserializer)?;
44 let addr: Ipv4Addr = s.parse().map_err(de::Error::custom)?;
45 Ok(CIP4Addr(addr))
46 }
47}
48
/// Newtype around [`Ipv6Addr`] so that an IPv6 address can be read from a configuration string.
#[derive(Clone, Debug)]
pub struct CIP6Addr(pub Ipv6Addr);

impl Deref for CIP6Addr {
    type Target = Ipv6Addr;

    /// Borrows the wrapped [`Ipv6Addr`].
    fn deref(&self) -> &Ipv6Addr {
        let Self(addr) = self;
        addr
    }
}
59
60impl<'de> Deserialize<'de> for CIP6Addr {
61 fn deserialize<D: Deserializer<'de>>(deserializer: D) -> std::result::Result<CIP6Addr, D::Error> {
62 let s: String = Deserialize::deserialize(deserializer)?;
63 let addr: Ipv6Addr = s.parse().map_err(de::Error::custom)?;
64 Ok(CIP6Addr(addr))
65 }
66}
67
68#[derive(Clone, Debug)]
69pub struct CIpv4Net(pub Ipv4Net);
70
71impl Deref for CIpv4Net {
72 type Target = Ipv4Net;
73
74 fn deref(&self) -> &Self::Target {
75 &self.0
76 }
77}
78
79impl<'de> Deserialize<'de> for CIpv4Net {
80 fn deserialize<D: Deserializer<'de>>(deserializer: D) -> std::result::Result<CIpv4Net, D::Error> {
81 let s: String = Deserialize::deserialize(deserializer)?;
82 s.parse::<Ipv4Net>().map(CIpv4Net).map_err(de::Error::custom)
83 }
84}
85
/// Newtype around a `usize` byte count that is read from a human-friendly size string.
#[derive(Clone, Debug)]
pub struct CBytes(pub usize);

impl Deref for CBytes {
    type Target = usize;

    /// Borrows the wrapped byte count.
    fn deref(&self) -> &usize {
        let Self(size) = self;
        size
    }
}
96
97impl<'de> Deserialize<'de> for CBytes {
98 fn deserialize<D: Deserializer<'de>>(deserializer: D) -> std::result::Result<CBytes, D::Error> {
99 let s: String = Deserialize::deserialize(deserializer)?;
100 let s = s.replace("_", "");
101 let v = s.parse::<humanize_rs::bytes::Bytes>();
102 let r = v.map_err(de::Error::custom)?;
103 Ok(CBytes(r.size()))
104 }
105}
106
/// Newtype around [`Duration`] that is read from a human-friendly duration string.
#[derive(Clone, Debug)]
pub struct CDuration(Duration);

impl Deref for CDuration {
    type Target = Duration;

    /// Borrows the wrapped [`Duration`].
    fn deref(&self) -> &Duration {
        let Self(inner) = self;
        inner
    }
}

impl CDuration {
    /// Builds a value spanning `secs` whole seconds.
    pub fn from_secs(secs: u64) -> Self {
        let inner = Duration::from_secs(secs);
        Self(inner)
    }

    /// Builds a value spanning `millis` milliseconds.
    pub fn from_millis(millis: u64) -> Self {
        let inner = Duration::from_millis(millis);
        Self(inner)
    }
}
127
128impl<'de> Deserialize<'de> for CDuration {
129 fn deserialize<D: Deserializer<'de>>(deserializer: D) -> std::result::Result<CDuration, D::Error> {
130 let s: String = Deserialize::deserialize(deserializer)?;
131 let s = s.replace("_", "");
132 let v = humanize_rs::duration::parse(&s);
133 let r = v.map_err(de::Error::custom)?;
134 Ok(CDuration(r))
135 }
136}
137
138pub type AsyncTrustDnsResolverConfig = trust_dns_resolver::config::ResolverConfig;
139pub struct AsyncTrustDnsResolverOptsMapping(trust_dns_resolver::config::ResolverOpts);
140
141impl Deref for AsyncTrustDnsResolverOptsMapping {
142 type Target = trust_dns_resolver::config::ResolverOpts;
143
144 fn deref(&self) -> &Self::Target {
145 &self.0
146 }
147}
148
/// Deserializable counterpart of trust-dns' `ResolverOpts`.
///
/// Each field maps one-to-one onto the `ResolverOpts` field of the same name (see the `From`
/// impls in this module); durations are carried as [`CDuration`] so they can be written as
/// human-friendly strings. Because of `#[serde(default)]`, fields missing from the input take
/// their value from this type's `Default` impl; unknown fields are rejected.
#[derive(Clone, Debug, Deserialize)]
#[serde(default)]
#[serde(deny_unknown_fields)]
pub struct AsyncTrustDnsResolverOpts {
    pub ndots: usize,
    pub timeout: CDuration,
    pub attempts: usize,
    pub rotate: bool,
    pub check_names: bool,
    pub edns0: bool,
    pub validate: bool,
    pub ip_strategy: trust_dns_resolver::config::LookupIpStrategy,
    pub cache_size: usize,
    pub use_hosts_file: bool,
    // The four TTL bounds are optional; `None` is passed through to `ResolverOpts` unchanged.
    pub positive_min_ttl: Option<CDuration>,
    pub negative_min_ttl: Option<CDuration>,
    pub positive_max_ttl: Option<CDuration>,
    pub negative_max_ttl: Option<CDuration>,
    pub num_concurrent_reqs: usize,
    pub preserve_intermediates: bool,
}
170
171impl From<AsyncTrustDnsResolverOpts> for AsyncTrustDnsResolverOptsMapping {
172 fn from(p: AsyncTrustDnsResolverOpts) -> AsyncTrustDnsResolverOptsMapping {
173 AsyncTrustDnsResolverOptsMapping(trust_dns_resolver::config::ResolverOpts {
174 ndots: p.ndots,
175 timeout: *p.timeout,
176 attempts: p.attempts,
177 rotate: p.rotate,
178 check_names: p.check_names,
179 edns0: p.edns0,
180 validate: p.validate,
181 ip_strategy: p.ip_strategy,
182 cache_size: p.cache_size,
183 use_hosts_file: p.use_hosts_file,
184 positive_min_ttl: p.positive_min_ttl.map(|v| *v),
185 negative_min_ttl: p.negative_min_ttl.map(|v| *v),
186 positive_max_ttl: p.positive_max_ttl.map(|v| *v),
187 negative_max_ttl: p.negative_max_ttl.map(|v| *v),
188 num_concurrent_reqs: p.num_concurrent_reqs,
189 preserve_intermediates: p.preserve_intermediates,
190 })
191 }
192}
193
194impl From<AsyncTrustDnsResolverOptsMapping> for AsyncTrustDnsResolverOpts {
195 fn from(p: AsyncTrustDnsResolverOptsMapping) -> AsyncTrustDnsResolverOpts {
196 AsyncTrustDnsResolverOpts {
197 ndots: p.ndots,
198 timeout: CDuration(p.timeout),
199 attempts: p.attempts,
200 rotate: p.rotate,
201 check_names: p.check_names,
202 edns0: p.edns0,
203 validate: p.validate,
204 ip_strategy: p.ip_strategy,
205 cache_size: p.cache_size,
206 use_hosts_file: p.use_hosts_file,
207 positive_min_ttl: p.positive_min_ttl.map(CDuration),
208 negative_min_ttl: p.negative_min_ttl.map(CDuration),
209 positive_max_ttl: p.positive_max_ttl.map(CDuration),
210 negative_max_ttl: p.negative_max_ttl.map(CDuration),
211 num_concurrent_reqs: p.num_concurrent_reqs,
212 preserve_intermediates: p.preserve_intermediates,
213 }
214 }
215}
216
217impl Default for AsyncTrustDnsResolverOpts {
218 fn default() -> Self {
219 AsyncTrustDnsResolverOptsMapping(trust_dns_resolver::config::ResolverOpts::default()).into()
220 }
221}
222
223#[derive(Clone, Debug, Deserialize)]
224#[serde(deny_unknown_fields, default)]
225pub struct ParserProfile {
226 pub concurrency: usize,
227 pub pin: usize,
228 pub stack_size: Option<CBytes>,
229}
230
231impl Default for ParserProfile {
232 fn default() -> Self {
233 let physical_cores = num_cpus::get_physical();
234 Self { concurrency: physical_cores, pin: 0, stack_size: Some(CBytes(128 * 1024 * 1024)) }
235 }
236}
237
238#[derive(Clone, Debug, Deserialize)]
239#[serde(deny_unknown_fields)]
240pub struct ConcurrencyProfile {
241 pub domain_concurrency: usize,
242}
243
244impl Default for ConcurrencyProfile {
245 fn default() -> Self {
246 let physical_cores = num_cpus::get_physical();
247 Self { domain_concurrency: physical_cores * 40 }
248 }
249}
250
251impl ConcurrencyProfile {
252 pub fn transit_buffer_size(&self) -> usize {
253 self.domain_concurrency * 10
254 }
255
256 pub fn job_tx_buffer_size(&self) -> usize {
257 self.domain_concurrency
258 }
259
260 pub fn job_update_buffer_size(&self) -> usize {
261 self.domain_concurrency * 2
262 }
263}
264
265#[derive(Clone, Debug, Deserialize)]
266#[serde(deny_unknown_fields)]
267pub struct NetworkingProfileValues {
268 pub connect_timeout: Option<CDuration>,
269 pub socket_read_buffer_size: Option<CBytes>,
270 pub socket_write_buffer_size: Option<CBytes>,
271 pub static_binding_within_the_task: bool,
272 pub bind_local_ipv4: Vec<CIP4Addr>,
273 pub bind_local_ipv6: Vec<CIP6Addr>,
274}
275
276impl NetworkingProfileValues {
277 pub fn rand_bind_local_ipv4(&self) -> Option<Ipv4Addr> {
278 if !self.bind_local_ipv4.is_empty() {
279 if self.bind_local_ipv4.len() == 1 {
280 Some(*self.bind_local_ipv4[0])
281 } else {
282 let mut rng = thread_rng();
283 Some(*self.bind_local_ipv4[rng.gen_range(0..self.bind_local_ipv4.len())])
284 }
285 } else {
286 None
287 }
288 }
289
290 pub fn rand_bind_local_ipv6(&self) -> Option<Ipv6Addr> {
291 if !self.bind_local_ipv6.is_empty() {
292 if self.bind_local_ipv6.len() == 1 {
293 Some(*self.bind_local_ipv6[0])
294 } else {
295 let mut rng = thread_rng();
296 Some(*self.bind_local_ipv6[rng.gen_range(0..self.bind_local_ipv6.len())])
297 }
298 } else {
299 None
300 }
301 }
302}
303
304impl Default for NetworkingProfileValues {
305 fn default() -> Self {
306 Self {
307 connect_timeout: Some(CDuration::from_secs(5)),
308 socket_write_buffer_size: Some(CBytes(32 * 1024)),
309 socket_read_buffer_size: Some(CBytes(32 * 1024)),
310 static_binding_within_the_task: true,
311 bind_local_ipv4: vec![],
312 bind_local_ipv6: vec![],
313 }
314 }
315}
316
/// Optional DNS resolver settings from the configuration.
///
/// Both fields default to `None` when absent from the input; unknown fields are rejected.
/// The fields are private and are read by `ResolvedNetworkingProfile::new` in this module.
#[derive(Clone, Debug, Default, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct ResolverConfig {
    /// Resolver configuration in trust-dns' `ResolverConfig` format.
    #[serde(default)]
    config: Option<AsyncTrustDnsResolverConfig>,
    /// Resolver options; see [`AsyncTrustDnsResolverOpts`].
    #[serde(default)]
    options: Option<AsyncTrustDnsResolverOpts>,
}
325
326#[derive(Clone, Debug, Deserialize, Default)]
327#[serde(deny_unknown_fields)]
328pub struct NetworkingProfile {
329 pub profile: NetworkingProfileValues,
330 pub resolver: ResolverConfig,
331 pub net_v4_blacklist: Vec<CIpv4Net>,
332}
333
334impl NetworkingProfile {
335 pub fn resolve(self) -> Result<ResolvedNetworkingProfile> {
336 ResolvedNetworkingProfile::new(self)
337 }
338
339 pub fn resolve_custom_resolver(self, resolver: Box<dyn Resolver>) -> Result<ResolvedNetworkingProfile> {
340 Ok(ResolvedNetworkingProfile::new_with_resolver(self, resolver))
341 }
342}
343
344#[derive(Clone, Debug)]
345pub struct ResolvedNetworkingProfile {
346 pub profile: NetworkingProfileValues,
347
348 pub resolver: Arc<Box<dyn Resolver>>,
349}
350
351impl ResolvedNetworkingProfile {
352 fn new_with_resolver(p: NetworkingProfile, resolver: Box<dyn Resolver>) -> Self {
353 Self { profile: p.profile, resolver: Arc::new(resolver) }
354 }
355
356 fn new(p: NetworkingProfile) -> Result<Self> {
357 let (config, options) = if let (Some(config), Some(options)) = (p.resolver.config, p.resolver.options) {
358 (config, options.into())
359 } else {
360 let (config, options) = trust_dns_resolver::system_conf::read_system_conf()
361 .context("cannot read resolver config settings from system")?;
362 (config, AsyncTrustDnsResolverOptsMapping(options))
363 };
364
365 let mut resolver = AsyncTrustDnsResolver::new(config, *options).context("cannot create default resolver")?;
366
367 let reserved_v4 = RESERVED_V4_SUBNETS
368 .clone()
369 .into_iter()
370 .chain(p.net_v4_blacklist.into_iter().map(|a| *a))
371 .collect::<Vec<_>>();
372 resolver.with_net_v4_blacklist(reserved_v4);
373 resolver.with_net_v6_blacklist(RESERVED_V6_SUBNETS.to_vec());
374 Ok(Self { profile: p.profile, resolver: Arc::new(Box::new(resolver)) })
375 }
376}
377
/// Crawl settings: buffer and response size limits, delays, timeouts and request headers.
///
/// Unknown fields are rejected on deserialization. See the `Default` impl for the built-in
/// values.
#[derive(Clone, Debug, Deserialize)]
#[serde(deny_unknown_fields)]
pub struct CrawlingSettings {
    pub internal_read_buffer_size: CBytes,
    pub concurrency: usize,
    pub max_response_size: CBytes,
    pub delay: CDuration,
    pub delay_jitter: CDuration,
    pub status_timeout: CDuration,
    pub load_timeout: CDuration,
    pub job_soft_timeout: CDuration,
    pub job_hard_timeout: CDuration,
    pub job_hard_timeout_jitter: CDuration,
    /// Header name mapped to its values. `build_headers` writes the `User-Agent`,
    /// `Accept-Encoding` and `Accept` entries into this map.
    pub custom_headers: HashMap<String, Vec<String>>,
    /// When set, `build_headers` stores it under the `User-Agent` header.
    pub user_agent: Option<String>,
    /// When `true`, `build_headers` sets `Accept-Encoding` to `gzip, deflate`.
    pub compression: bool,
}
395
396impl fmt::Display for CrawlingSettings {
397 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
398 write!(
399 f,
400 "concurrency: {}, delay: {:?}, job hard timeout: {:?}+~{:?}, job soft timeout: {:?}, irbs: {:?}, load timeout: {:?}, max_response_size: {:?}, custom headers: {:?}",
401 self.concurrency, self.delay, self.job_hard_timeout, self.job_hard_timeout_jitter, self.job_soft_timeout, self.internal_read_buffer_size, self.load_timeout, self.max_response_size, self.custom_headers,
402 )
403 }
404}
405
406impl Default for CrawlingSettings {
407 fn default() -> Self {
408 Self {
409 concurrency: 2,
410 internal_read_buffer_size: CBytes(32 * 1024),
411 delay: CDuration::from_secs(1),
412 delay_jitter: CDuration::from_secs(1),
413 job_hard_timeout: CDuration::from_secs(360),
414 job_hard_timeout_jitter: CDuration::from_secs(72),
415 job_soft_timeout: CDuration::from_secs(240),
416 status_timeout: CDuration::from_secs(5),
417 load_timeout: CDuration::from_secs(10),
418 user_agent: Some(String::from("crusty-core/0.82.0")),
419 compression: true,
420 custom_headers: HashMap::new(),
421 max_response_size: CBytes(1024 * 1024 * 2),
422 }
423 .build_headers()
424 }
425}
426
427impl CrawlingSettings {
428 pub fn build_headers(mut self) -> Self {
429 if let Some(user_agent) = &self.user_agent {
430 self.custom_headers.insert(http::header::USER_AGENT.to_string(), vec![user_agent.clone()]);
431 }
432 if self.compression {
433 self.custom_headers.insert(http::header::ACCEPT_ENCODING.to_string(), vec!["gzip, deflate".into()]);
434 }
435 self.custom_headers.insert(
436 http::header::ACCEPT.to_string(),
437 vec!["text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9".into()],
438 );
439 self
440 }
441}