//! Configuration types used by the crawler runtime.
//!
//! Most users touch these settings indirectly through [`crate::CrawlerBuilder`],
//! but they are public because they are also useful for explicit configuration
//! and inspection.
//!
//! ## Example
//!
//! ```rust,ignore
//! use spider_core::config::{CrawlerConfig, CheckpointConfig};
//! use std::time::Duration;
//!
//! let crawler_config = CrawlerConfig::default()
//!     .with_max_concurrent_downloads(10)
//!     .with_parser_workers(4)
//!     .with_max_concurrent_pipelines(8)
//!     .with_channel_capacity(2000);
//!
//! let checkpoint_config = CheckpointConfig::builder()
//!     .path("./crawl.checkpoint")
//!     .interval(Duration::from_secs(60))
//!     .build();
//! ```

use std::path::{Path, PathBuf};
use std::time::Duration;

use spider_util::response::{LinkExtractOptions, LinkType};
use url::Url;

/// Runtime discovery mode applied to each downloaded response.
///
/// The mode determines which follow-up work the runtime derives from a
/// response; see [`DiscoveryConfig::effective_link_extract_options_for`]
/// for how it interacts with link extraction options.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DiscoveryMode {
    /// Disable framework-managed discovery.
    Disabled,
    /// Discover navigational HTML links only (link types default to pages).
    HtmlLinks,
    /// Discover navigational HTML links and inject page metadata into response metadata.
    HtmlAndMetadata,
    /// Discover all supported resource types from HTML plus optional metadata.
    FullResources,
    /// Only process sitemap responses for follow-up URLs.
    SitemapOnly,
}

/// Rule-like configuration for runtime-managed discovery.
///
/// Filtering semantics (see `matches_response`): an empty allow-list imposes
/// no restriction, while deny-lists always apply.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DiscoveryRule {
    /// Stable rule name injected into request/response metadata when matched.
    pub name: String,
    /// Glob-style URL patterns that the source response must match (empty = any).
    pub allow_patterns: Vec<String>,
    /// Glob-style URL patterns that exclude the source response.
    pub deny_patterns: Vec<String>,
    /// Domains or subdomains allowed for the source response (empty = any).
    pub allow_domains: Vec<String>,
    /// Domains or subdomains denied for the source response.
    pub deny_domains: Vec<String>,
    /// Path prefixes allowed for the source response (empty = any).
    pub allow_path_prefixes: Vec<String>,
    /// Path prefixes denied for the source response.
    pub deny_path_prefixes: Vec<String>,
    /// Link extraction behavior used when the rule matches.
    pub link_extract_options: LinkExtractOptions,
}

67impl DiscoveryRule {
68    /// Creates a new discovery rule with the provided name.
69    pub fn new(name: impl Into<String>) -> Self {
70        Self {
71            name: name.into(),
72            allow_patterns: Vec::new(),
73            deny_patterns: Vec::new(),
74            allow_domains: Vec::new(),
75            deny_domains: Vec::new(),
76            allow_path_prefixes: Vec::new(),
77            deny_path_prefixes: Vec::new(),
78            link_extract_options: LinkExtractOptions::default(),
79        }
80    }
81
82    /// Replaces the link extraction options used by this rule.
83    pub fn with_link_extract_options(mut self, options: LinkExtractOptions) -> Self {
84        self.link_extract_options = options;
85        self
86    }
87
88    /// Restricts this rule to source response URLs that match at least one pattern.
89    pub fn with_allow_patterns(
90        mut self,
91        patterns: impl IntoIterator<Item = impl Into<String>>,
92    ) -> Self {
93        self.allow_patterns = patterns.into_iter().map(Into::into).collect();
94        self
95    }
96
97    /// Excludes this rule for source response URLs that match any pattern.
98    pub fn with_deny_patterns(
99        mut self,
100        patterns: impl IntoIterator<Item = impl Into<String>>,
101    ) -> Self {
102        self.deny_patterns = patterns.into_iter().map(Into::into).collect();
103        self
104    }
105
106    /// Restricts this rule to source response domains or subdomains.
107    pub fn with_allow_domains(
108        mut self,
109        domains: impl IntoIterator<Item = impl Into<String>>,
110    ) -> Self {
111        self.allow_domains = domains.into_iter().map(normalize_domain_filter).collect();
112        self
113    }
114
115    /// Excludes this rule for source response domains or subdomains.
116    pub fn with_deny_domains(
117        mut self,
118        domains: impl IntoIterator<Item = impl Into<String>>,
119    ) -> Self {
120        self.deny_domains = domains.into_iter().map(normalize_domain_filter).collect();
121        self
122    }
123
124    /// Restricts this rule to source response paths with one of the provided prefixes.
125    pub fn with_allow_path_prefixes(
126        mut self,
127        prefixes: impl IntoIterator<Item = impl Into<String>>,
128    ) -> Self {
129        self.allow_path_prefixes = prefixes.into_iter().map(normalize_path_prefix).collect();
130        self
131    }
132
133    /// Excludes this rule for source response paths with one of the provided prefixes.
134    pub fn with_deny_path_prefixes(
135        mut self,
136        prefixes: impl IntoIterator<Item = impl Into<String>>,
137    ) -> Self {
138        self.deny_path_prefixes = prefixes.into_iter().map(normalize_path_prefix).collect();
139        self
140    }
141
142    /// Sets whether only same-site links should be extracted for matching responses.
143    pub fn with_same_site_only(mut self, enabled: bool) -> Self {
144        self.link_extract_options.same_site_only = enabled;
145        self
146    }
147
148    /// Sets whether text content should be scanned for plain-text URLs.
149    pub fn with_text_links(mut self, enabled: bool) -> Self {
150        self.link_extract_options.include_text_links = enabled;
151        self
152    }
153
154    /// Restricts discovered follow-up links to matching patterns.
155    pub fn with_follow_allow_patterns(
156        mut self,
157        patterns: impl IntoIterator<Item = impl Into<String>>,
158    ) -> Self {
159        self.link_extract_options = self.link_extract_options.with_allow_patterns(patterns);
160        self
161    }
162
163    /// Excludes discovered follow-up links that match the given patterns.
164    pub fn with_follow_deny_patterns(
165        mut self,
166        patterns: impl IntoIterator<Item = impl Into<String>>,
167    ) -> Self {
168        self.link_extract_options = self.link_extract_options.with_deny_patterns(patterns);
169        self
170    }
171
172    /// Restricts discovered follow-up links to the given domains or subdomains.
173    pub fn with_follow_allow_domains(
174        mut self,
175        domains: impl IntoIterator<Item = impl Into<String>>,
176    ) -> Self {
177        self.link_extract_options = self.link_extract_options.with_allow_domains(domains);
178        self
179    }
180
181    /// Excludes discovered follow-up links for the given domains or subdomains.
182    pub fn with_follow_deny_domains(
183        mut self,
184        domains: impl IntoIterator<Item = impl Into<String>>,
185    ) -> Self {
186        self.link_extract_options = self.link_extract_options.with_deny_domains(domains);
187        self
188    }
189
190    /// Restricts discovered follow-up links to the provided path prefixes.
191    pub fn with_follow_allow_path_prefixes(
192        mut self,
193        prefixes: impl IntoIterator<Item = impl Into<String>>,
194    ) -> Self {
195        self.link_extract_options = self.link_extract_options.with_allow_path_prefixes(prefixes);
196        self
197    }
198
199    /// Excludes discovered follow-up links for the provided path prefixes.
200    pub fn with_follow_deny_path_prefixes(
201        mut self,
202        prefixes: impl IntoIterator<Item = impl Into<String>>,
203    ) -> Self {
204        self.link_extract_options = self.link_extract_options.with_deny_path_prefixes(prefixes);
205        self
206    }
207
208    /// Restricts attribute extraction to specific HTML tags for matching responses.
209    pub fn with_allowed_tags(mut self, tags: impl IntoIterator<Item = impl Into<String>>) -> Self {
210        self.link_extract_options = self.link_extract_options.with_allowed_tags(tags);
211        self
212    }
213
214    /// Restricts attribute extraction to specific HTML attributes for matching responses.
215    pub fn with_allowed_attributes(
216        mut self,
217        attributes: impl IntoIterator<Item = impl Into<String>>,
218    ) -> Self {
219        self.link_extract_options = self
220            .link_extract_options
221            .with_allowed_attributes(attributes);
222        self
223    }
224
225    /// Restricts discovered follow-up links to the provided link types.
226    pub fn with_allowed_link_types(
227        mut self,
228        link_types: impl IntoIterator<Item = LinkType>,
229    ) -> Self {
230        self.link_extract_options = self
231            .link_extract_options
232            .with_allowed_link_types(link_types);
233        self
234    }
235
236    /// Excludes the provided link types from discovered follow-up links.
237    pub fn with_denied_link_types(
238        mut self,
239        link_types: impl IntoIterator<Item = LinkType>,
240    ) -> Self {
241        self.link_extract_options = self.link_extract_options.with_denied_link_types(link_types);
242        self
243    }
244
245    pub(crate) fn matches_response(&self, url: &Url) -> bool {
246        let absolute_url = url.as_str();
247        if !self.allow_patterns.is_empty()
248            && !self
249                .allow_patterns
250                .iter()
251                .any(|pattern| glob_matches(pattern, absolute_url))
252        {
253            return false;
254        }
255
256        if self
257            .deny_patterns
258            .iter()
259            .any(|pattern| glob_matches(pattern, absolute_url))
260        {
261            return false;
262        }
263
264        let host = url.host_str().unwrap_or_default();
265        if !self.allow_domains.is_empty()
266            && !self
267                .allow_domains
268                .iter()
269                .any(|domain| domain_matches(host, domain))
270        {
271            return false;
272        }
273
274        if self
275            .deny_domains
276            .iter()
277            .any(|domain| domain_matches(host, domain))
278        {
279            return false;
280        }
281
282        let path = url.path();
283        if !self.allow_path_prefixes.is_empty()
284            && !self
285                .allow_path_prefixes
286                .iter()
287                .any(|prefix| path.starts_with(prefix))
288        {
289            return false;
290        }
291
292        if self
293            .deny_path_prefixes
294            .iter()
295            .any(|prefix| path.starts_with(prefix))
296        {
297            return false;
298        }
299
300        true
301    }
302}
303
/// Discovery-specific runtime configuration.
///
/// The default configuration disables all discovery; see
/// [`DiscoveryMode`] for the available behaviors.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DiscoveryConfig {
    /// How the runtime should discover follow-up work from responses.
    pub mode: DiscoveryMode,
    /// Whether sitemap XML should be parsed into follow-up requests.
    pub discover_sitemaps: bool,
    /// Maximum recursion depth for nested sitemap indexes (default: 4).
    pub max_sitemap_depth: usize,
    /// Whether page metadata should be extracted and attached to response metadata.
    pub extract_page_metadata: bool,
    /// Base link extraction options used for HTML discovery.
    pub link_extract_options: LinkExtractOptions,
    /// Optional rule-like link discovery behavior matched against source responses.
    pub rules: Vec<DiscoveryRule>,
}

321impl Default for DiscoveryConfig {
322    fn default() -> Self {
323        Self {
324            mode: DiscoveryMode::Disabled,
325            discover_sitemaps: false,
326            max_sitemap_depth: 4,
327            extract_page_metadata: false,
328            link_extract_options: LinkExtractOptions::default(),
329            rules: Vec::new(),
330        }
331    }
332}
333
334impl DiscoveryConfig {
335    /// Creates a new discovery config with default values.
336    pub fn new() -> Self {
337        Self::default()
338    }
339
340    /// Sets the discovery mode.
341    pub fn with_mode(mut self, mode: DiscoveryMode) -> Self {
342        self.mode = mode;
343        self
344    }
345
346    /// Enables or disables sitemap parsing.
347    pub fn with_sitemaps(mut self, enabled: bool) -> Self {
348        self.discover_sitemaps = enabled;
349        self
350    }
351
352    /// Sets the maximum nested sitemap depth.
353    pub fn with_max_sitemap_depth(mut self, depth: usize) -> Self {
354        self.max_sitemap_depth = depth;
355        self
356    }
357
358    /// Enables or disables page metadata extraction.
359    pub fn with_page_metadata(mut self, enabled: bool) -> Self {
360        self.extract_page_metadata = enabled;
361        self
362    }
363
364    /// Replaces the base link extraction options.
365    pub fn with_link_extract_options(mut self, options: LinkExtractOptions) -> Self {
366        self.link_extract_options = options;
367        self
368    }
369
370    /// Replaces the configured discovery rules.
371    pub fn with_rules(mut self, rules: impl IntoIterator<Item = DiscoveryRule>) -> Self {
372        self.rules = rules.into_iter().collect();
373        self
374    }
375
376    /// Adds a single discovery rule.
377    pub fn with_rule(mut self, rule: DiscoveryRule) -> Self {
378        self.rules.push(rule);
379        self
380    }
381
382    /// Sets whether only same-site links should be discovered.
383    pub fn with_same_site_only(mut self, enabled: bool) -> Self {
384        self.link_extract_options.same_site_only = enabled;
385        self
386    }
387
388    /// Sets whether text content should be scanned for plain-text URLs.
389    pub fn with_text_links(mut self, enabled: bool) -> Self {
390        self.link_extract_options.include_text_links = enabled;
391        self
392    }
393
394    /// Restricts discovery to URLs that match at least one glob-style pattern.
395    pub fn with_allow_patterns(
396        mut self,
397        patterns: impl IntoIterator<Item = impl Into<String>>,
398    ) -> Self {
399        self.link_extract_options = self.link_extract_options.with_allow_patterns(patterns);
400        self
401    }
402
403    /// Excludes URLs that match any glob-style pattern.
404    pub fn with_deny_patterns(
405        mut self,
406        patterns: impl IntoIterator<Item = impl Into<String>>,
407    ) -> Self {
408        self.link_extract_options = self.link_extract_options.with_deny_patterns(patterns);
409        self
410    }
411
412    /// Restricts discovery to the given domains or subdomains.
413    pub fn with_allow_domains(
414        mut self,
415        domains: impl IntoIterator<Item = impl Into<String>>,
416    ) -> Self {
417        self.link_extract_options = self.link_extract_options.with_allow_domains(domains);
418        self
419    }
420
421    /// Excludes discovery for the given domains or subdomains.
422    pub fn with_deny_domains(
423        mut self,
424        domains: impl IntoIterator<Item = impl Into<String>>,
425    ) -> Self {
426        self.link_extract_options = self.link_extract_options.with_deny_domains(domains);
427        self
428    }
429
430    /// Restricts discovery to URL paths with one of the provided prefixes.
431    pub fn with_allow_path_prefixes(
432        mut self,
433        prefixes: impl IntoIterator<Item = impl Into<String>>,
434    ) -> Self {
435        self.link_extract_options = self.link_extract_options.with_allow_path_prefixes(prefixes);
436        self
437    }
438
439    /// Excludes URL paths with one of the provided prefixes.
440    pub fn with_deny_path_prefixes(
441        mut self,
442        prefixes: impl IntoIterator<Item = impl Into<String>>,
443    ) -> Self {
444        self.link_extract_options = self.link_extract_options.with_deny_path_prefixes(prefixes);
445        self
446    }
447
448    /// Restricts attribute extraction to specific HTML tags.
449    pub fn with_allowed_tags(mut self, tags: impl IntoIterator<Item = impl Into<String>>) -> Self {
450        self.link_extract_options = self.link_extract_options.with_allowed_tags(tags);
451        self
452    }
453
454    /// Restricts attribute extraction to specific attributes.
455    pub fn with_allowed_attributes(
456        mut self,
457        attributes: impl IntoIterator<Item = impl Into<String>>,
458    ) -> Self {
459        self.link_extract_options = self
460            .link_extract_options
461            .with_allowed_attributes(attributes);
462        self
463    }
464
465    /// Restricts discovery to the provided link types.
466    pub fn with_allowed_link_types(
467        mut self,
468        link_types: impl IntoIterator<Item = LinkType>,
469    ) -> Self {
470        self.link_extract_options = self
471            .link_extract_options
472            .with_allowed_link_types(link_types);
473        self
474    }
475
476    /// Excludes the provided link types from discovery.
477    pub fn with_denied_link_types(
478        mut self,
479        link_types: impl IntoIterator<Item = LinkType>,
480    ) -> Self {
481        self.link_extract_options = self.link_extract_options.with_denied_link_types(link_types);
482        self
483    }
484
485    /// Returns the effective link extraction options for the configured mode.
486    pub fn effective_link_extract_options(&self) -> Option<LinkExtractOptions> {
487        self.effective_link_extract_options_for(self.link_extract_options.clone())
488    }
489
490    /// Returns the effective link extraction options for a specific rule or override.
491    pub fn effective_link_extract_options_for(
492        &self,
493        mut options: LinkExtractOptions,
494    ) -> Option<LinkExtractOptions> {
495        match self.mode {
496            DiscoveryMode::Disabled | DiscoveryMode::SitemapOnly => None,
497            DiscoveryMode::HtmlLinks | DiscoveryMode::HtmlAndMetadata => {
498                if options.allowed_link_types.is_none() {
499                    options.allowed_link_types = Some(vec![LinkType::Page]);
500                }
501                Some(options)
502            }
503            DiscoveryMode::FullResources => Some(options),
504        }
505    }
506
507    /// Returns `true` when metadata extraction should run.
508    pub fn should_extract_metadata(&self) -> bool {
509        self.extract_page_metadata || matches!(self.mode, DiscoveryMode::HtmlAndMetadata)
510    }
511}
512
/// Normalizes a user-supplied domain filter: trims surrounding whitespace,
/// removes any leading dots, and lowercases ASCII so comparisons in
/// [`domain_matches`] are case-insensitive.
fn normalize_domain_filter(domain: impl Into<String>) -> String {
    let owned: String = domain.into();
    let stripped = owned.trim().trim_start_matches('.');
    stripped.to_ascii_lowercase()
}

/// Normalizes a path-prefix filter so it always begins with `/`.
///
/// Whitespace is trimmed first; an empty filter (or a bare `/`) collapses to
/// the root prefix `"/"`, and any value without a leading slash has one
/// prepended.
fn normalize_path_prefix(prefix: impl Into<String>) -> String {
    let raw = prefix.into();
    match raw.trim() {
        "" | "/" => String::from("/"),
        trimmed if trimmed.starts_with('/') => trimmed.to_string(),
        trimmed => format!("/{trimmed}"),
    }
}

/// Returns `true` when `host` equals `filter` or is a subdomain of it.
///
/// Comparison is ASCII case-insensitive. A subdomain match requires a `.`
/// immediately before the filter suffix, so `"notexample.com"` does NOT
/// match the filter `"example.com"` while `"a.example.com"` does.
fn domain_matches(host: &str, filter: &str) -> bool {
    let host = host.to_ascii_lowercase();
    let filter = filter.to_ascii_lowercase();
    // `strip_suffix` + explicit dot check replaces the previous
    // `ends_with(&format!(".{filter}"))`, avoiding a heap allocation on
    // every call along this hot matching path. Behavior is unchanged.
    host == filter
        || host
            .strip_suffix(filter.as_str())
            .map_or(false, |prefix| prefix.ends_with('.'))
}

/// Glob matcher supporting `*` (any run of bytes, including empty) and `?`
/// (exactly one byte).
///
/// Matching is byte-wise over the UTF-8 encoding and uses iterative greedy
/// `*` matching with backtracking instead of recursion, so pattern length
/// cannot overflow the stack.
fn glob_matches(pattern: &str, input: &str) -> bool {
    let pat = pattern.as_bytes();
    let text = input.as_bytes();
    let mut pat_idx = 0usize;
    let mut text_idx = 0usize;
    // Position of the most recent `*` in the pattern, plus the input offset
    // matched against it; used to backtrack when a later comparison fails.
    let mut star_idx: Option<usize> = None;
    let mut star_text_idx = 0usize;

    while text_idx < text.len() {
        let next = pat.get(pat_idx).copied();
        if next == Some(b'?') || next == Some(text[text_idx]) {
            // Single-byte match: advance both cursors.
            pat_idx += 1;
            text_idx += 1;
        } else if next == Some(b'*') {
            // Tentatively let `*` match the empty string; remember where to
            // resume if that choice later proves wrong.
            star_idx = Some(pat_idx);
            star_text_idx = text_idx;
            pat_idx += 1;
        } else if let Some(star) = star_idx {
            // Backtrack: extend the last `*` by one more input byte.
            pat_idx = star + 1;
            star_text_idx += 1;
            text_idx = star_text_idx;
        } else {
            // Mismatch with no `*` to fall back on.
            return false;
        }
    }

    // The input is exhausted; only trailing `*`s may remain in the pattern.
    while pat.get(pat_idx) == Some(&b'*') {
        pat_idx += 1;
    }
    pat_idx == pat.len()
}

/// Guided runtime presets for common crawl shapes.
///
/// Used with [`CrawlerConfig::with_crawl_shape_preset`] to derive a coherent
/// set of concurrency and backpressure defaults from the host CPU count.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CrawlShapePreset {
    /// Wide crawls: the highest download concurrency and largest
    /// pending-request window of the presets.
    Broad,
    /// Depth-first crawls: moderate concurrency with a tighter pending
    /// window and smaller channels.
    Deep,
    /// Sitemap-driven crawls: download-heavy, with fewer parser workers and
    /// large output batches.
    Sitemap,
    /// API-style crawls: modest download/parse concurrency but extra
    /// pipeline capacity for item processing.
    ApiHeavy,
}

/// Core runtime configuration for the crawler.
///
/// Defaults scale with the host CPU count (see the `Default` impl); all
/// numeric settings can be overridden through the `with_*` builder methods
/// and checked with [`CrawlerConfig::validate`].
#[derive(Debug, Clone)]
pub struct CrawlerConfig {
    /// The maximum number of concurrent downloads.
    pub max_concurrent_downloads: usize,
    /// The maximum number of outstanding requests tracked by the scheduler.
    pub max_pending_requests: usize,
    /// The number of workers dedicated to parsing responses.
    pub parser_workers: usize,
    /// The maximum number of concurrent item processing pipelines.
    pub max_concurrent_pipelines: usize,
    /// The capacity of communication channels between components.
    pub channel_capacity: usize,
    /// Number of requests/items processed per parser output batch.
    pub output_batch_size: usize,
    /// Downloader backpressure threshold for the response channel.
    pub response_backpressure_threshold: usize,
    /// Parser backpressure threshold for the item channel.
    pub item_backpressure_threshold: usize,
    /// When enabled, retries are scheduled outside the downloader permit path.
    pub retry_release_permit: bool,
    /// Enables balanced browser-like default headers for the built-in reqwest downloader.
    pub browser_like_headers: bool,
    /// Enables in-place live statistics updates on terminal stdout.
    pub live_stats: bool,
    /// Refresh interval for live statistics output (default: 50 ms).
    pub live_stats_interval: Duration,
    /// Optional item fields to show in live-stats preview instead of full JSON.
    /// When `Some`, the list must be non-empty (enforced by `validate`).
    pub live_stats_preview_fields: Option<Vec<String>>,
    /// Maximum time to wait for a graceful shutdown before forcing task abort.
    pub shutdown_grace_period: Duration,
    /// Maximum number of scraped items to process before stopping the crawl.
    /// `None` means unlimited.
    pub item_limit: Option<usize>,
    /// Response discovery behavior such as sitemap parsing and HTML link extraction.
    pub discovery: DiscoveryConfig,
}

impl Default for CrawlerConfig {
    /// Derives concurrency defaults from the host CPU count, clamping each
    /// value to a floor/ceiling so both small and very large machines get
    /// workable settings.
    fn default() -> Self {
        let cpu_count = num_cpus::get();
        // 4 downloads per core, bounded to [8, 128].
        let max_concurrent_downloads = (cpu_count * 4).clamp(8, 128);
        // Scheduler window: 8x the download concurrency, bounded to [64, 4096].
        let max_pending_requests = (max_concurrent_downloads * 8).clamp(64, 4096);
        let parser_workers = (cpu_count * 2).clamp(4, 32);
        let max_concurrent_pipelines = (cpu_count * 2).clamp(4, 16);
        // Channels hold half the pending window, bounded to [512, 4096].
        let channel_capacity = (max_pending_requests / 2).clamp(512, 4096);
        CrawlerConfig {
            max_concurrent_downloads,
            max_pending_requests,
            parser_workers,
            max_concurrent_pipelines,
            channel_capacity,
            output_batch_size: 64,
            // Backpressure engages before the channels are actually full,
            // and is never set above the channel capacity itself.
            response_backpressure_threshold: (max_concurrent_downloads * 6).min(channel_capacity),
            item_backpressure_threshold: (parser_workers * 6).min(channel_capacity),
            retry_release_permit: true,
            browser_like_headers: true,
            live_stats: false,
            live_stats_interval: Duration::from_millis(50),
            live_stats_preview_fields: None,
            shutdown_grace_period: Duration::from_secs(5),
            item_limit: None,
            discovery: DiscoveryConfig::default(),
        }
    }
}

645impl CrawlerConfig {
646    /// Creates a new `CrawlerConfig` with default settings.
647    pub fn new() -> Self {
648        Self::default()
649    }
650
651    /// Sets the maximum number of concurrent downloads.
652    pub fn with_max_concurrent_downloads(mut self, limit: usize) -> Self {
653        self.max_concurrent_downloads = limit;
654        self
655    }
656
657    /// Sets the maximum number of outstanding requests tracked by the scheduler.
658    pub fn with_max_pending_requests(mut self, limit: usize) -> Self {
659        self.max_pending_requests = limit;
660        self
661    }
662
663    /// Sets the number of parser workers.
664    pub fn with_parser_workers(mut self, count: usize) -> Self {
665        self.parser_workers = count;
666        self
667    }
668
669    /// Sets the maximum number of concurrent pipelines.
670    pub fn with_max_concurrent_pipelines(mut self, limit: usize) -> Self {
671        self.max_concurrent_pipelines = limit;
672        self
673    }
674
675    /// Sets the channel capacity.
676    pub fn with_channel_capacity(mut self, capacity: usize) -> Self {
677        self.channel_capacity = capacity;
678        self
679    }
680
681    /// Sets the parser output batch size.
682    pub fn with_output_batch_size(mut self, batch_size: usize) -> Self {
683        self.output_batch_size = batch_size;
684        self
685    }
686
687    /// Sets the downloader response-channel backpressure threshold.
688    pub fn with_response_backpressure_threshold(mut self, threshold: usize) -> Self {
689        self.response_backpressure_threshold = threshold;
690        self
691    }
692
693    /// Sets the parser item-channel backpressure threshold.
694    pub fn with_item_backpressure_threshold(mut self, threshold: usize) -> Self {
695        self.item_backpressure_threshold = threshold;
696        self
697    }
698
699    /// Controls whether retry delays release the downloader permit immediately.
700    pub fn with_retry_release_permit(mut self, enabled: bool) -> Self {
701        self.retry_release_permit = enabled;
702        self
703    }
704
705    /// Enables or disables balanced browser-like default headers for the built-in reqwest downloader.
706    pub fn with_browser_like_headers(mut self, enabled: bool) -> Self {
707        self.browser_like_headers = enabled;
708        self
709    }
710
711    /// Enables or disables in-place live stats updates on stdout.
712    pub fn with_live_stats(mut self, enabled: bool) -> Self {
713        self.live_stats = enabled;
714        self
715    }
716
717    /// Sets the refresh interval used by live stats mode.
718    pub fn with_live_stats_interval(mut self, interval: Duration) -> Self {
719        self.live_stats_interval = interval;
720        self
721    }
722
723    /// Sets which item fields should be shown in live stats preview output.
724    ///
725    /// Field names support dot notation for nested JSON objects, for example:
726    /// `title`, `source_url`, or `metadata.Japanese`.
727    ///
728    /// You can also set aliases with `label=path`, for example:
729    /// `url=source_url` or `jp=metadata.Japanese`.
730    pub fn with_live_stats_preview_fields(
731        mut self,
732        fields: impl IntoIterator<Item = impl Into<String>>,
733    ) -> Self {
734        self.live_stats_preview_fields = Some(fields.into_iter().map(Into::into).collect());
735        self
736    }
737
738    /// Sets the maximum grace period for crawler shutdown.
739    pub fn with_shutdown_grace_period(mut self, grace_period: Duration) -> Self {
740        self.shutdown_grace_period = grace_period;
741        self
742    }
743
744    /// Sets the maximum number of scraped items to process before stopping the crawl.
745    pub fn with_item_limit(mut self, limit: usize) -> Self {
746        self.item_limit = Some(limit);
747        self
748    }
749
750    /// Applies a guided set of concurrency defaults for a common crawl shape.
751    pub fn with_crawl_shape_preset(mut self, preset: CrawlShapePreset) -> Self {
752        let cpu_count = num_cpus::get();
753        match preset {
754            CrawlShapePreset::Broad => {
755                self.max_concurrent_downloads = (cpu_count * 6).clamp(12, 192);
756                self.max_pending_requests = (self.max_concurrent_downloads * 10).clamp(128, 8192);
757                self.parser_workers = (cpu_count * 2).clamp(4, 32);
758                self.max_concurrent_pipelines = (cpu_count * 2).clamp(4, 16);
759                self.channel_capacity = (self.max_pending_requests / 2).clamp(512, 4096);
760                self.output_batch_size = 32;
761                self.response_backpressure_threshold =
762                    (self.max_concurrent_downloads * 4).min(self.channel_capacity);
763                self.item_backpressure_threshold =
764                    (self.parser_workers * 6).min(self.channel_capacity);
765            }
766            CrawlShapePreset::Deep => {
767                self.max_concurrent_downloads = (cpu_count * 3).clamp(8, 96);
768                self.max_pending_requests = (self.max_concurrent_downloads * 4).clamp(64, 2048);
769                self.parser_workers = (cpu_count * 2).clamp(4, 24);
770                self.max_concurrent_pipelines = (cpu_count * 2).clamp(4, 16);
771                self.channel_capacity = (self.max_pending_requests / 2).clamp(256, 2048);
772                self.output_batch_size = 24;
773                self.response_backpressure_threshold =
774                    (self.max_concurrent_downloads * 5).min(self.channel_capacity);
775                self.item_backpressure_threshold =
776                    (self.parser_workers * 5).min(self.channel_capacity);
777            }
778            CrawlShapePreset::Sitemap => {
779                self.max_concurrent_downloads = (cpu_count * 5).clamp(10, 160);
780                self.max_pending_requests = (self.max_concurrent_downloads * 8).clamp(128, 8192);
781                self.parser_workers = cpu_count.clamp(2, 12);
782                self.max_concurrent_pipelines = (cpu_count * 2).clamp(4, 12);
783                self.channel_capacity = (self.max_pending_requests / 2).clamp(512, 4096);
784                self.output_batch_size = 96;
785                self.response_backpressure_threshold =
786                    (self.max_concurrent_downloads * 6).min(self.channel_capacity);
787                self.item_backpressure_threshold =
788                    (self.parser_workers * 8).min(self.channel_capacity);
789            }
790            CrawlShapePreset::ApiHeavy => {
791                self.max_concurrent_downloads = (cpu_count * 3).clamp(8, 96);
792                self.max_pending_requests = (self.max_concurrent_downloads * 6).clamp(96, 4096);
793                self.parser_workers = cpu_count.clamp(2, 8);
794                self.max_concurrent_pipelines = (cpu_count * 3).clamp(4, 24);
795                self.channel_capacity = (self.max_pending_requests / 2).clamp(256, 4096);
796                self.output_batch_size = 48;
797                self.response_backpressure_threshold =
798                    (self.max_concurrent_downloads * 5).min(self.channel_capacity);
799                self.item_backpressure_threshold =
800                    (self.max_concurrent_pipelines * 4).min(self.channel_capacity);
801            }
802        }
803        self
804    }
805
806    /// Sets the discovery configuration.
807    pub fn with_discovery(mut self, discovery: DiscoveryConfig) -> Self {
808        self.discovery = discovery;
809        self
810    }
811
812    /// Validates the configuration.
813    pub fn validate(&self) -> Result<(), String> {
814        if self.max_concurrent_downloads == 0 {
815            return Err("max_concurrent_downloads must be greater than 0".to_string());
816        }
817        if self.max_pending_requests == 0 {
818            return Err("max_pending_requests must be greater than 0".to_string());
819        }
820        if self.parser_workers == 0 {
821            return Err("parser_workers must be greater than 0".to_string());
822        }
823        if self.max_concurrent_pipelines == 0 {
824            return Err("max_concurrent_pipelines must be greater than 0".to_string());
825        }
826        if self.output_batch_size == 0 {
827            return Err("output_batch_size must be greater than 0".to_string());
828        }
829        if self.response_backpressure_threshold == 0 {
830            return Err("response_backpressure_threshold must be greater than 0".to_string());
831        }
832        if self.item_backpressure_threshold == 0 {
833            return Err("item_backpressure_threshold must be greater than 0".to_string());
834        }
835        if self.live_stats_interval.is_zero() {
836            return Err("live_stats_interval must be greater than 0".to_string());
837        }
838        if matches!(self.live_stats_preview_fields.as_ref(), Some(fields) if fields.is_empty()) {
839            return Err("live_stats_preview_fields must not be empty".to_string());
840        }
841        if self.shutdown_grace_period.is_zero() {
842            return Err("shutdown_grace_period must be greater than 0".to_string());
843        }
844        if matches!(self.item_limit, Some(0)) {
845            return Err("item_limit must be greater than 0".to_string());
846        }
847        if self.discovery.max_sitemap_depth == 0 {
848            return Err("discovery.max_sitemap_depth must be greater than 0".to_string());
849        }
850        Ok(())
851    }
852}
853
/// Settings controlling checkpoint persistence.
///
/// A checkpoint lets an interrupted crawl resume from saved state.
/// Configuring a `path` turns the feature on; `interval` controls how
/// often automatic saves happen.
#[derive(Debug, Clone, Default)]
pub struct CheckpointConfig {
    /// Location checkpoints are written to and loaded from, if enabled.
    pub path: Option<PathBuf>,
    /// Spacing between automatic checkpoint saves, if periodic saving is wanted.
    pub interval: Option<Duration>,
}

impl CheckpointConfig {
    /// Creates a disabled configuration (no path, no interval).
    pub fn new() -> Self {
        Self::default()
    }

    /// Returns a fluent builder for this type.
    pub fn builder() -> CheckpointConfigBuilder {
        CheckpointConfigBuilder::default()
    }

    /// Returns a copy with the checkpoint file location set.
    pub fn with_path<P: AsRef<Path>>(self, path: P) -> Self {
        Self {
            path: Some(path.as_ref().to_path_buf()),
            ..self
        }
    }

    /// Returns a copy with the automatic save interval set.
    pub fn with_interval(self, interval: Duration) -> Self {
        Self {
            interval: Some(interval),
            ..self
        }
    }

    /// Reports whether checkpointing is active, i.e. a path is configured.
    pub fn is_enabled(&self) -> bool {
        matches!(self.path, Some(_))
    }
}

/// Fluent builder producing a [`CheckpointConfig`].
#[derive(Debug, Default)]
pub struct CheckpointConfigBuilder {
    path: Option<PathBuf>,
    interval: Option<Duration>,
}

impl CheckpointConfigBuilder {
    /// Starts an empty builder; identical to `Self::default()`.
    pub fn new() -> Self {
        Self::default()
    }

    /// Records the checkpoint file location.
    pub fn path<P: AsRef<Path>>(mut self, path: P) -> Self {
        self.path = Some(PathBuf::from(path.as_ref()));
        self
    }

    /// Records the automatic save interval.
    pub fn interval(mut self, interval: Duration) -> Self {
        self.interval = Some(interval);
        self
    }

    /// Finalizes the builder into a `CheckpointConfig`.
    pub fn build(self) -> CheckpointConfig {
        let Self { path, interval } = self;
        CheckpointConfig { path, interval }
    }
}
928
929/// Configuration for the parser workers.
930///
931/// This struct holds settings specific to the response parsing subsystem.
932#[derive(Debug, Clone)]
933pub struct ParserConfig {
934    /// The number of parser worker tasks to spawn.
935    pub worker_count: usize,
936    /// The capacity of the internal parse queue per worker.
937    pub queue_capacity: usize,
938}
939
940impl Default for ParserConfig {
941    fn default() -> Self {
942        ParserConfig {
943            worker_count: num_cpus::get().clamp(4, 16),
944            queue_capacity: 100,
945        }
946    }
947}
948
949impl ParserConfig {
950    /// Creates a new `ParserConfig` with default settings.
951    pub fn new() -> Self {
952        Self::default()
953    }
954
955    /// Sets the number of parser workers.
956    pub fn with_worker_count(mut self, count: usize) -> Self {
957        self.worker_count = count;
958        self
959    }
960
961    /// Sets the internal queue capacity per worker.
962    pub fn with_queue_capacity(mut self, capacity: usize) -> Self {
963        self.queue_capacity = capacity;
964        self
965    }
966}
967
968/// Configuration for the downloader.
969///
970/// This struct holds settings specific to the HTTP download subsystem.
971#[derive(Debug, Clone)]
972pub struct DownloaderConfig {
973    /// The maximum number of concurrent downloads.
974    pub max_concurrent: usize,
975    /// The backpressure threshold for response channel occupancy.
976    pub backpressure_threshold: usize,
977}
978
979impl Default for DownloaderConfig {
980    fn default() -> Self {
981        let max_concurrent = num_cpus::get().max(16);
982        DownloaderConfig {
983            max_concurrent,
984            backpressure_threshold: max_concurrent * 2,
985        }
986    }
987}
988
989impl DownloaderConfig {
990    /// Creates a new `DownloaderConfig` with default settings.
991    pub fn new() -> Self {
992        Self::default()
993    }
994
995    /// Sets the maximum number of concurrent downloads.
996    pub fn with_max_concurrent(mut self, limit: usize) -> Self {
997        self.max_concurrent = limit;
998        self
999    }
1000
1001    /// Sets the backpressure threshold.
1002    pub fn with_backpressure_threshold(mut self, threshold: usize) -> Self {
1003        self.backpressure_threshold = threshold;
1004        self
1005    }
1006}
1007
1008/// Configuration for the item processor.
1009///
1010/// This struct holds settings specific to the item processing pipeline.
1011#[derive(Debug, Clone)]
1012pub struct ItemProcessorConfig {
1013    /// The maximum number of concurrent pipeline processors.
1014    pub max_concurrent: usize,
1015}
1016
1017impl Default for ItemProcessorConfig {
1018    fn default() -> Self {
1019        ItemProcessorConfig {
1020            max_concurrent: num_cpus::get().min(8),
1021        }
1022    }
1023}
1024
1025impl ItemProcessorConfig {
1026    /// Creates a new `ItemProcessorConfig` with default settings.
1027    pub fn new() -> Self {
1028        Self::default()
1029    }
1030
1031    /// Sets the maximum number of concurrent processors.
1032    pub fn with_max_concurrent(mut self, limit: usize) -> Self {
1033        self.max_concurrent = limit;
1034        self
1035    }
1036}