1use std::path::{Path, PathBuf};
26use std::time::Duration;
27
28use spider_util::response::{LinkExtractOptions, LinkType};
29use url::Url;
30
/// Controls how new URLs are discovered from fetched responses.
///
/// The mode decides whether link extraction runs at all and what default
/// link types it follows (see `DiscoveryConfig::effective_link_extract_options_for`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum DiscoveryMode {
    /// No discovery: `effective_link_extract_options_for` returns `None`.
    Disabled,
    /// Extract HTML links; `allowed_link_types` defaults to `LinkType::Page`
    /// when the caller has not set it.
    HtmlLinks,
    /// Same link extraction as `HtmlLinks`, and `should_extract_metadata`
    /// additionally reports `true`.
    HtmlAndMetadata,
    /// Extract links using the configured options unchanged — no
    /// pages-only default is applied.
    FullResources,
    /// Link extraction from page bodies is disabled (`None` options);
    /// URLs are presumably discovered via sitemaps instead — TODO confirm
    /// against the sitemap handling code.
    SitemapOnly,
}
45
/// A named set of URL filters plus the link-extraction options to use for
/// responses that match.
///
/// Empty allow lists act as wildcards and deny lists always exclude
/// (see `matches_response`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DiscoveryRule {
    /// Identifier for this rule; not consulted by `matches_response`.
    pub name: String,
    /// Glob patterns the full URL must match (empty = match any URL).
    pub allow_patterns: Vec<String>,
    /// Glob patterns that exclude a URL when any of them matches.
    pub deny_patterns: Vec<String>,
    /// Hosts (exact or parent domains) the URL host must belong to
    /// (empty = any host). Stored normalized (lowercase, no leading dots).
    pub allow_domains: Vec<String>,
    /// Hosts (exact or parent domains) that exclude a URL.
    pub deny_domains: Vec<String>,
    /// Prefixes the URL path must start with (empty = any path).
    /// Stored normalized to begin with '/'.
    pub allow_path_prefixes: Vec<String>,
    /// Path prefixes that exclude a URL.
    pub deny_path_prefixes: Vec<String>,
    /// Link-extraction options applied when this rule matches a response.
    pub link_extract_options: LinkExtractOptions,
}
66
67impl DiscoveryRule {
68 pub fn new(name: impl Into<String>) -> Self {
70 Self {
71 name: name.into(),
72 allow_patterns: Vec::new(),
73 deny_patterns: Vec::new(),
74 allow_domains: Vec::new(),
75 deny_domains: Vec::new(),
76 allow_path_prefixes: Vec::new(),
77 deny_path_prefixes: Vec::new(),
78 link_extract_options: LinkExtractOptions::default(),
79 }
80 }
81
82 pub fn with_link_extract_options(mut self, options: LinkExtractOptions) -> Self {
84 self.link_extract_options = options;
85 self
86 }
87
88 pub fn with_allow_patterns(
90 mut self,
91 patterns: impl IntoIterator<Item = impl Into<String>>,
92 ) -> Self {
93 self.allow_patterns = patterns.into_iter().map(Into::into).collect();
94 self
95 }
96
97 pub fn with_deny_patterns(
99 mut self,
100 patterns: impl IntoIterator<Item = impl Into<String>>,
101 ) -> Self {
102 self.deny_patterns = patterns.into_iter().map(Into::into).collect();
103 self
104 }
105
106 pub fn with_allow_domains(
108 mut self,
109 domains: impl IntoIterator<Item = impl Into<String>>,
110 ) -> Self {
111 self.allow_domains = domains.into_iter().map(normalize_domain_filter).collect();
112 self
113 }
114
115 pub fn with_deny_domains(
117 mut self,
118 domains: impl IntoIterator<Item = impl Into<String>>,
119 ) -> Self {
120 self.deny_domains = domains.into_iter().map(normalize_domain_filter).collect();
121 self
122 }
123
124 pub fn with_allow_path_prefixes(
126 mut self,
127 prefixes: impl IntoIterator<Item = impl Into<String>>,
128 ) -> Self {
129 self.allow_path_prefixes = prefixes.into_iter().map(normalize_path_prefix).collect();
130 self
131 }
132
133 pub fn with_deny_path_prefixes(
135 mut self,
136 prefixes: impl IntoIterator<Item = impl Into<String>>,
137 ) -> Self {
138 self.deny_path_prefixes = prefixes.into_iter().map(normalize_path_prefix).collect();
139 self
140 }
141
142 pub fn with_same_site_only(mut self, enabled: bool) -> Self {
144 self.link_extract_options.same_site_only = enabled;
145 self
146 }
147
148 pub fn with_text_links(mut self, enabled: bool) -> Self {
150 self.link_extract_options.include_text_links = enabled;
151 self
152 }
153
154 pub fn with_follow_allow_patterns(
156 mut self,
157 patterns: impl IntoIterator<Item = impl Into<String>>,
158 ) -> Self {
159 self.link_extract_options = self.link_extract_options.with_allow_patterns(patterns);
160 self
161 }
162
163 pub fn with_follow_deny_patterns(
165 mut self,
166 patterns: impl IntoIterator<Item = impl Into<String>>,
167 ) -> Self {
168 self.link_extract_options = self.link_extract_options.with_deny_patterns(patterns);
169 self
170 }
171
172 pub fn with_follow_allow_domains(
174 mut self,
175 domains: impl IntoIterator<Item = impl Into<String>>,
176 ) -> Self {
177 self.link_extract_options = self.link_extract_options.with_allow_domains(domains);
178 self
179 }
180
181 pub fn with_follow_deny_domains(
183 mut self,
184 domains: impl IntoIterator<Item = impl Into<String>>,
185 ) -> Self {
186 self.link_extract_options = self.link_extract_options.with_deny_domains(domains);
187 self
188 }
189
190 pub fn with_follow_allow_path_prefixes(
192 mut self,
193 prefixes: impl IntoIterator<Item = impl Into<String>>,
194 ) -> Self {
195 self.link_extract_options = self.link_extract_options.with_allow_path_prefixes(prefixes);
196 self
197 }
198
199 pub fn with_follow_deny_path_prefixes(
201 mut self,
202 prefixes: impl IntoIterator<Item = impl Into<String>>,
203 ) -> Self {
204 self.link_extract_options = self.link_extract_options.with_deny_path_prefixes(prefixes);
205 self
206 }
207
208 pub fn with_allowed_tags(mut self, tags: impl IntoIterator<Item = impl Into<String>>) -> Self {
210 self.link_extract_options = self.link_extract_options.with_allowed_tags(tags);
211 self
212 }
213
214 pub fn with_allowed_attributes(
216 mut self,
217 attributes: impl IntoIterator<Item = impl Into<String>>,
218 ) -> Self {
219 self.link_extract_options = self
220 .link_extract_options
221 .with_allowed_attributes(attributes);
222 self
223 }
224
225 pub fn with_allowed_link_types(
227 mut self,
228 link_types: impl IntoIterator<Item = LinkType>,
229 ) -> Self {
230 self.link_extract_options = self
231 .link_extract_options
232 .with_allowed_link_types(link_types);
233 self
234 }
235
236 pub fn with_denied_link_types(
238 mut self,
239 link_types: impl IntoIterator<Item = LinkType>,
240 ) -> Self {
241 self.link_extract_options = self.link_extract_options.with_denied_link_types(link_types);
242 self
243 }
244
245 pub(crate) fn matches_response(&self, url: &Url) -> bool {
246 let absolute_url = url.as_str();
247 if !self.allow_patterns.is_empty()
248 && !self
249 .allow_patterns
250 .iter()
251 .any(|pattern| glob_matches(pattern, absolute_url))
252 {
253 return false;
254 }
255
256 if self
257 .deny_patterns
258 .iter()
259 .any(|pattern| glob_matches(pattern, absolute_url))
260 {
261 return false;
262 }
263
264 let host = url.host_str().unwrap_or_default();
265 if !self.allow_domains.is_empty()
266 && !self
267 .allow_domains
268 .iter()
269 .any(|domain| domain_matches(host, domain))
270 {
271 return false;
272 }
273
274 if self
275 .deny_domains
276 .iter()
277 .any(|domain| domain_matches(host, domain))
278 {
279 return false;
280 }
281
282 let path = url.path();
283 if !self.allow_path_prefixes.is_empty()
284 && !self
285 .allow_path_prefixes
286 .iter()
287 .any(|prefix| path.starts_with(prefix))
288 {
289 return false;
290 }
291
292 if self
293 .deny_path_prefixes
294 .iter()
295 .any(|prefix| path.starts_with(prefix))
296 {
297 return false;
298 }
299
300 true
301 }
302}
303
/// Top-level configuration for link and sitemap discovery.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DiscoveryConfig {
    /// Overall discovery strategy; see `DiscoveryMode`.
    pub mode: DiscoveryMode,
    /// Whether sitemaps should be discovered and followed.
    pub discover_sitemaps: bool,
    /// Maximum nesting depth when following sitemaps (must be > 0;
    /// enforced by `CrawlerConfig::validate`). Default is 4.
    pub max_sitemap_depth: usize,
    /// Forces page-metadata extraction even when `mode` alone would not
    /// imply it (see `should_extract_metadata`).
    pub extract_page_metadata: bool,
    /// Base link-extraction options; refined per mode by
    /// `effective_link_extract_options_for`.
    pub link_extract_options: LinkExtractOptions,
    /// Per-URL discovery rules; see `DiscoveryRule`.
    pub rules: Vec<DiscoveryRule>,
}
320
321impl Default for DiscoveryConfig {
322 fn default() -> Self {
323 Self {
324 mode: DiscoveryMode::Disabled,
325 discover_sitemaps: false,
326 max_sitemap_depth: 4,
327 extract_page_metadata: false,
328 link_extract_options: LinkExtractOptions::default(),
329 rules: Vec::new(),
330 }
331 }
332}
333
334impl DiscoveryConfig {
335 pub fn new() -> Self {
337 Self::default()
338 }
339
340 pub fn with_mode(mut self, mode: DiscoveryMode) -> Self {
342 self.mode = mode;
343 self
344 }
345
346 pub fn with_sitemaps(mut self, enabled: bool) -> Self {
348 self.discover_sitemaps = enabled;
349 self
350 }
351
352 pub fn with_max_sitemap_depth(mut self, depth: usize) -> Self {
354 self.max_sitemap_depth = depth;
355 self
356 }
357
358 pub fn with_page_metadata(mut self, enabled: bool) -> Self {
360 self.extract_page_metadata = enabled;
361 self
362 }
363
364 pub fn with_link_extract_options(mut self, options: LinkExtractOptions) -> Self {
366 self.link_extract_options = options;
367 self
368 }
369
370 pub fn with_rules(mut self, rules: impl IntoIterator<Item = DiscoveryRule>) -> Self {
372 self.rules = rules.into_iter().collect();
373 self
374 }
375
376 pub fn with_rule(mut self, rule: DiscoveryRule) -> Self {
378 self.rules.push(rule);
379 self
380 }
381
382 pub fn with_same_site_only(mut self, enabled: bool) -> Self {
384 self.link_extract_options.same_site_only = enabled;
385 self
386 }
387
388 pub fn with_text_links(mut self, enabled: bool) -> Self {
390 self.link_extract_options.include_text_links = enabled;
391 self
392 }
393
394 pub fn with_allow_patterns(
396 mut self,
397 patterns: impl IntoIterator<Item = impl Into<String>>,
398 ) -> Self {
399 self.link_extract_options = self.link_extract_options.with_allow_patterns(patterns);
400 self
401 }
402
403 pub fn with_deny_patterns(
405 mut self,
406 patterns: impl IntoIterator<Item = impl Into<String>>,
407 ) -> Self {
408 self.link_extract_options = self.link_extract_options.with_deny_patterns(patterns);
409 self
410 }
411
412 pub fn with_allow_domains(
414 mut self,
415 domains: impl IntoIterator<Item = impl Into<String>>,
416 ) -> Self {
417 self.link_extract_options = self.link_extract_options.with_allow_domains(domains);
418 self
419 }
420
421 pub fn with_deny_domains(
423 mut self,
424 domains: impl IntoIterator<Item = impl Into<String>>,
425 ) -> Self {
426 self.link_extract_options = self.link_extract_options.with_deny_domains(domains);
427 self
428 }
429
430 pub fn with_allow_path_prefixes(
432 mut self,
433 prefixes: impl IntoIterator<Item = impl Into<String>>,
434 ) -> Self {
435 self.link_extract_options = self.link_extract_options.with_allow_path_prefixes(prefixes);
436 self
437 }
438
439 pub fn with_deny_path_prefixes(
441 mut self,
442 prefixes: impl IntoIterator<Item = impl Into<String>>,
443 ) -> Self {
444 self.link_extract_options = self.link_extract_options.with_deny_path_prefixes(prefixes);
445 self
446 }
447
448 pub fn with_allowed_tags(mut self, tags: impl IntoIterator<Item = impl Into<String>>) -> Self {
450 self.link_extract_options = self.link_extract_options.with_allowed_tags(tags);
451 self
452 }
453
454 pub fn with_allowed_attributes(
456 mut self,
457 attributes: impl IntoIterator<Item = impl Into<String>>,
458 ) -> Self {
459 self.link_extract_options = self
460 .link_extract_options
461 .with_allowed_attributes(attributes);
462 self
463 }
464
465 pub fn with_allowed_link_types(
467 mut self,
468 link_types: impl IntoIterator<Item = LinkType>,
469 ) -> Self {
470 self.link_extract_options = self
471 .link_extract_options
472 .with_allowed_link_types(link_types);
473 self
474 }
475
476 pub fn with_denied_link_types(
478 mut self,
479 link_types: impl IntoIterator<Item = LinkType>,
480 ) -> Self {
481 self.link_extract_options = self.link_extract_options.with_denied_link_types(link_types);
482 self
483 }
484
485 pub fn effective_link_extract_options(&self) -> Option<LinkExtractOptions> {
487 self.effective_link_extract_options_for(self.link_extract_options.clone())
488 }
489
490 pub fn effective_link_extract_options_for(
492 &self,
493 mut options: LinkExtractOptions,
494 ) -> Option<LinkExtractOptions> {
495 match self.mode {
496 DiscoveryMode::Disabled | DiscoveryMode::SitemapOnly => None,
497 DiscoveryMode::HtmlLinks | DiscoveryMode::HtmlAndMetadata => {
498 if options.allowed_link_types.is_none() {
499 options.allowed_link_types = Some(vec![LinkType::Page]);
500 }
501 Some(options)
502 }
503 DiscoveryMode::FullResources => Some(options),
504 }
505 }
506
507 pub fn should_extract_metadata(&self) -> bool {
509 self.extract_page_metadata || matches!(self.mode, DiscoveryMode::HtmlAndMetadata)
510 }
511}
512
/// Canonicalizes a domain filter: trims surrounding whitespace, strips any
/// leading dots, and lowercases ASCII so matching is case-insensitive.
fn normalize_domain_filter(domain: impl Into<String>) -> String {
    let raw = domain.into();
    let stripped = raw.trim().trim_start_matches('.');
    stripped.to_ascii_lowercase()
}
520
/// Canonicalizes a path prefix so it always starts with '/'.
///
/// Empty or whitespace-only input (and a bare "/") becomes "/";
/// anything else gets a leading slash prepended if missing.
fn normalize_path_prefix(prefix: impl Into<String>) -> String {
    let raw = prefix.into();
    match raw.trim() {
        "" | "/" => String::from("/"),
        p if p.starts_with('/') => p.to_string(),
        p => format!("/{p}"),
    }
}
532
/// Case-insensitive host match: `filter` matches the host itself or any
/// subdomain of it (e.g. "example.com" matches "www.example.com" but not
/// "notexample.com").
fn domain_matches(host: &str, filter: &str) -> bool {
    let host = host.to_ascii_lowercase();
    let filter = filter.to_ascii_lowercase();
    if host == filter {
        return true;
    }
    // Subdomain check without allocating a ".{filter}" string per call:
    // the host must end with the filter, and the character immediately
    // before it must be a '.' separator.
    host.strip_suffix(filter.as_str())
        .map_or(false, |prefix| prefix.ends_with('.'))
}
538
/// Byte-wise glob match supporting '*' (any run of bytes, including empty)
/// and '?' (exactly one byte); every other byte matches literally.
///
/// Uses the classic iterative two-pointer algorithm with backtracking to
/// the most recent '*', so it runs in O(|pattern| * |input|) worst case
/// without recursion or allocation.
fn glob_matches(pattern: &str, input: &str) -> bool {
    let pat = pattern.as_bytes();
    let text = input.as_bytes();
    let mut pi = 0usize;
    let mut ti = 0usize;
    // Most recent '*' seen: (its index in `pat`, the text index it is
    // currently assumed to have consumed up to).
    let mut star: Option<(usize, usize)> = None;

    while ti < text.len() {
        match pat.get(pi) {
            // Literal byte or single-byte wildcard advances both cursors.
            Some(&c) if c == b'?' || c == text[ti] => {
                pi += 1;
                ti += 1;
            }
            // Record the '*' and tentatively let it match nothing.
            Some(&b'*') => {
                star = Some((pi, ti));
                pi += 1;
            }
            // Mismatch: backtrack and let the last '*' swallow one more byte,
            // or fail if there is no '*' to extend.
            _ => match star {
                Some((star_pi, star_ti)) => {
                    star = Some((star_pi, star_ti + 1));
                    pi = star_pi + 1;
                    ti = star_ti + 1;
                }
                None => return false,
            },
        }
    }

    // Any trailing '*'s can match the empty remainder of the input.
    while pat.get(pi) == Some(&b'*') {
        pi += 1;
    }
    pi == pat.len()
}
569
/// Named tuning presets applied by `CrawlerConfig::with_crawl_shape_preset`.
///
/// Each preset rescales concurrency, queue, and batch settings from the
/// CPU count; the exact formulas live in `with_crawl_shape_preset`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum CrawlShapePreset {
    /// Highest download concurrency and largest pending-request queues.
    Broad,
    /// Moderate concurrency with smaller queues and batches.
    Deep,
    /// High download throughput with fewer parser workers and the largest
    /// output batches.
    Sitemap,
    /// Fewer parser workers, more concurrent pipelines; item backpressure
    /// keyed to pipeline count instead of parser count.
    ApiHeavy,
}
578
/// Runtime tuning for the crawler engine.
///
/// Defaults are derived from the CPU count (see the `Default` impl); all
/// numeric fields must be non-zero to pass `validate`.
#[derive(Debug, Clone)]
pub struct CrawlerConfig {
    /// Maximum simultaneous downloads (must be > 0).
    pub max_concurrent_downloads: usize,
    /// Maximum pending requests (must be > 0).
    pub max_pending_requests: usize,
    /// Number of parser workers (must be > 0).
    pub parser_workers: usize,
    /// Maximum item pipelines running concurrently (must be > 0).
    pub max_concurrent_pipelines: usize,
    /// Capacity of internal channels; sized from `max_pending_requests`
    /// by default.
    pub channel_capacity: usize,
    /// Number of items emitted per output batch (must be > 0).
    pub output_batch_size: usize,
    /// Backpressure threshold for the response queue (must be > 0;
    /// derived from download concurrency by default).
    pub response_backpressure_threshold: usize,
    /// Backpressure threshold for the item queue (must be > 0;
    /// derived from parser count by default).
    pub item_backpressure_threshold: usize,
    /// Whether a retried request releases its permit — assumed to refer to
    /// the download permit; TODO confirm against the retry handling code.
    pub retry_release_permit: bool,
    /// Whether to send browser-like default headers with requests.
    pub browser_like_headers: bool,
    /// Whether to emit periodic live statistics.
    pub live_stats: bool,
    /// Interval between live-stats updates (must be non-zero).
    pub live_stats_interval: Duration,
    /// Item fields to preview in live stats; when `Some`, the list must be
    /// non-empty (enforced by `validate`).
    pub live_stats_preview_fields: Option<Vec<String>>,
    /// Grace period allowed during shutdown (must be non-zero).
    pub shutdown_grace_period: Duration,
    /// Optional cap on produced items; `Some(0)` is rejected by `validate`.
    pub item_limit: Option<usize>,
    /// Link/sitemap discovery configuration.
    pub discovery: DiscoveryConfig,
}
615
616impl Default for CrawlerConfig {
617 fn default() -> Self {
618 let cpu_count = num_cpus::get();
619 let max_concurrent_downloads = (cpu_count * 4).clamp(8, 128);
620 let max_pending_requests = (max_concurrent_downloads * 8).clamp(64, 4096);
621 let parser_workers = (cpu_count * 2).clamp(4, 32);
622 let max_concurrent_pipelines = (cpu_count * 2).clamp(4, 16);
623 let channel_capacity = (max_pending_requests / 2).clamp(512, 4096);
624 CrawlerConfig {
625 max_concurrent_downloads,
626 max_pending_requests,
627 parser_workers,
628 max_concurrent_pipelines,
629 channel_capacity,
630 output_batch_size: 64,
631 response_backpressure_threshold: (max_concurrent_downloads * 6).min(channel_capacity),
632 item_backpressure_threshold: (parser_workers * 6).min(channel_capacity),
633 retry_release_permit: true,
634 browser_like_headers: true,
635 live_stats: false,
636 live_stats_interval: Duration::from_millis(50),
637 live_stats_preview_fields: None,
638 shutdown_grace_period: Duration::from_secs(5),
639 item_limit: None,
640 discovery: DiscoveryConfig::default(),
641 }
642 }
643}
644
645impl CrawlerConfig {
646 pub fn new() -> Self {
648 Self::default()
649 }
650
651 pub fn with_max_concurrent_downloads(mut self, limit: usize) -> Self {
653 self.max_concurrent_downloads = limit;
654 self
655 }
656
657 pub fn with_max_pending_requests(mut self, limit: usize) -> Self {
659 self.max_pending_requests = limit;
660 self
661 }
662
663 pub fn with_parser_workers(mut self, count: usize) -> Self {
665 self.parser_workers = count;
666 self
667 }
668
669 pub fn with_max_concurrent_pipelines(mut self, limit: usize) -> Self {
671 self.max_concurrent_pipelines = limit;
672 self
673 }
674
675 pub fn with_channel_capacity(mut self, capacity: usize) -> Self {
677 self.channel_capacity = capacity;
678 self
679 }
680
681 pub fn with_output_batch_size(mut self, batch_size: usize) -> Self {
683 self.output_batch_size = batch_size;
684 self
685 }
686
687 pub fn with_response_backpressure_threshold(mut self, threshold: usize) -> Self {
689 self.response_backpressure_threshold = threshold;
690 self
691 }
692
693 pub fn with_item_backpressure_threshold(mut self, threshold: usize) -> Self {
695 self.item_backpressure_threshold = threshold;
696 self
697 }
698
699 pub fn with_retry_release_permit(mut self, enabled: bool) -> Self {
701 self.retry_release_permit = enabled;
702 self
703 }
704
705 pub fn with_browser_like_headers(mut self, enabled: bool) -> Self {
707 self.browser_like_headers = enabled;
708 self
709 }
710
711 pub fn with_live_stats(mut self, enabled: bool) -> Self {
713 self.live_stats = enabled;
714 self
715 }
716
717 pub fn with_live_stats_interval(mut self, interval: Duration) -> Self {
719 self.live_stats_interval = interval;
720 self
721 }
722
723 pub fn with_live_stats_preview_fields(
731 mut self,
732 fields: impl IntoIterator<Item = impl Into<String>>,
733 ) -> Self {
734 self.live_stats_preview_fields = Some(fields.into_iter().map(Into::into).collect());
735 self
736 }
737
738 pub fn with_shutdown_grace_period(mut self, grace_period: Duration) -> Self {
740 self.shutdown_grace_period = grace_period;
741 self
742 }
743
744 pub fn with_item_limit(mut self, limit: usize) -> Self {
746 self.item_limit = Some(limit);
747 self
748 }
749
750 pub fn with_crawl_shape_preset(mut self, preset: CrawlShapePreset) -> Self {
752 let cpu_count = num_cpus::get();
753 match preset {
754 CrawlShapePreset::Broad => {
755 self.max_concurrent_downloads = (cpu_count * 6).clamp(12, 192);
756 self.max_pending_requests = (self.max_concurrent_downloads * 10).clamp(128, 8192);
757 self.parser_workers = (cpu_count * 2).clamp(4, 32);
758 self.max_concurrent_pipelines = (cpu_count * 2).clamp(4, 16);
759 self.channel_capacity = (self.max_pending_requests / 2).clamp(512, 4096);
760 self.output_batch_size = 32;
761 self.response_backpressure_threshold =
762 (self.max_concurrent_downloads * 4).min(self.channel_capacity);
763 self.item_backpressure_threshold =
764 (self.parser_workers * 6).min(self.channel_capacity);
765 }
766 CrawlShapePreset::Deep => {
767 self.max_concurrent_downloads = (cpu_count * 3).clamp(8, 96);
768 self.max_pending_requests = (self.max_concurrent_downloads * 4).clamp(64, 2048);
769 self.parser_workers = (cpu_count * 2).clamp(4, 24);
770 self.max_concurrent_pipelines = (cpu_count * 2).clamp(4, 16);
771 self.channel_capacity = (self.max_pending_requests / 2).clamp(256, 2048);
772 self.output_batch_size = 24;
773 self.response_backpressure_threshold =
774 (self.max_concurrent_downloads * 5).min(self.channel_capacity);
775 self.item_backpressure_threshold =
776 (self.parser_workers * 5).min(self.channel_capacity);
777 }
778 CrawlShapePreset::Sitemap => {
779 self.max_concurrent_downloads = (cpu_count * 5).clamp(10, 160);
780 self.max_pending_requests = (self.max_concurrent_downloads * 8).clamp(128, 8192);
781 self.parser_workers = cpu_count.clamp(2, 12);
782 self.max_concurrent_pipelines = (cpu_count * 2).clamp(4, 12);
783 self.channel_capacity = (self.max_pending_requests / 2).clamp(512, 4096);
784 self.output_batch_size = 96;
785 self.response_backpressure_threshold =
786 (self.max_concurrent_downloads * 6).min(self.channel_capacity);
787 self.item_backpressure_threshold =
788 (self.parser_workers * 8).min(self.channel_capacity);
789 }
790 CrawlShapePreset::ApiHeavy => {
791 self.max_concurrent_downloads = (cpu_count * 3).clamp(8, 96);
792 self.max_pending_requests = (self.max_concurrent_downloads * 6).clamp(96, 4096);
793 self.parser_workers = cpu_count.clamp(2, 8);
794 self.max_concurrent_pipelines = (cpu_count * 3).clamp(4, 24);
795 self.channel_capacity = (self.max_pending_requests / 2).clamp(256, 4096);
796 self.output_batch_size = 48;
797 self.response_backpressure_threshold =
798 (self.max_concurrent_downloads * 5).min(self.channel_capacity);
799 self.item_backpressure_threshold =
800 (self.max_concurrent_pipelines * 4).min(self.channel_capacity);
801 }
802 }
803 self
804 }
805
806 pub fn with_discovery(mut self, discovery: DiscoveryConfig) -> Self {
808 self.discovery = discovery;
809 self
810 }
811
812 pub fn validate(&self) -> Result<(), String> {
814 if self.max_concurrent_downloads == 0 {
815 return Err("max_concurrent_downloads must be greater than 0".to_string());
816 }
817 if self.max_pending_requests == 0 {
818 return Err("max_pending_requests must be greater than 0".to_string());
819 }
820 if self.parser_workers == 0 {
821 return Err("parser_workers must be greater than 0".to_string());
822 }
823 if self.max_concurrent_pipelines == 0 {
824 return Err("max_concurrent_pipelines must be greater than 0".to_string());
825 }
826 if self.output_batch_size == 0 {
827 return Err("output_batch_size must be greater than 0".to_string());
828 }
829 if self.response_backpressure_threshold == 0 {
830 return Err("response_backpressure_threshold must be greater than 0".to_string());
831 }
832 if self.item_backpressure_threshold == 0 {
833 return Err("item_backpressure_threshold must be greater than 0".to_string());
834 }
835 if self.live_stats_interval.is_zero() {
836 return Err("live_stats_interval must be greater than 0".to_string());
837 }
838 if matches!(self.live_stats_preview_fields.as_ref(), Some(fields) if fields.is_empty()) {
839 return Err("live_stats_preview_fields must not be empty".to_string());
840 }
841 if self.shutdown_grace_period.is_zero() {
842 return Err("shutdown_grace_period must be greater than 0".to_string());
843 }
844 if matches!(self.item_limit, Some(0)) {
845 return Err("item_limit must be greater than 0".to_string());
846 }
847 if self.discovery.max_sitemap_depth == 0 {
848 return Err("discovery.max_sitemap_depth must be greater than 0".to_string());
849 }
850 Ok(())
851 }
852}
853
/// Configuration for crawl-state checkpointing.
///
/// Checkpointing is considered enabled only when `path` is set
/// (see `is_enabled`).
#[derive(Debug, Clone, Default)]
pub struct CheckpointConfig {
    /// Location of the checkpoint file; `None` disables checkpointing.
    pub path: Option<PathBuf>,
    /// How often checkpoints are written; `None` leaves the interval unset.
    pub interval: Option<Duration>,
}
865
866impl CheckpointConfig {
867 pub fn new() -> Self {
869 Self::default()
870 }
871
872 pub fn builder() -> CheckpointConfigBuilder {
874 CheckpointConfigBuilder::default()
875 }
876
877 pub fn with_path<P: AsRef<Path>>(mut self, path: P) -> Self {
879 self.path = Some(path.as_ref().to_path_buf());
880 self
881 }
882
883 pub fn with_interval(mut self, interval: Duration) -> Self {
885 self.interval = Some(interval);
886 self
887 }
888
889 pub fn is_enabled(&self) -> bool {
891 self.path.is_some()
892 }
893}
894
/// Step-by-step builder for `CheckpointConfig`; finish with `build`.
#[derive(Debug, Default)]
pub struct CheckpointConfigBuilder {
    // Mirrors `CheckpointConfig::path`.
    path: Option<PathBuf>,
    // Mirrors `CheckpointConfig::interval`.
    interval: Option<Duration>,
}
901
902impl CheckpointConfigBuilder {
903 pub fn new() -> Self {
905 Self::default()
906 }
907
908 pub fn path<P: AsRef<Path>>(mut self, path: P) -> Self {
910 self.path = Some(path.as_ref().to_path_buf());
911 self
912 }
913
914 pub fn interval(mut self, interval: Duration) -> Self {
916 self.interval = Some(interval);
917 self
918 }
919
920 pub fn build(self) -> CheckpointConfig {
922 CheckpointConfig {
923 path: self.path,
924 interval: self.interval,
925 }
926 }
927}
928
/// Tuning for the response-parsing stage.
#[derive(Debug, Clone)]
pub struct ParserConfig {
    /// Number of parser workers; defaults to the CPU count clamped to 4..=16.
    pub worker_count: usize,
    /// Capacity of the parser input queue; defaults to 100.
    pub queue_capacity: usize,
}
939
940impl Default for ParserConfig {
941 fn default() -> Self {
942 ParserConfig {
943 worker_count: num_cpus::get().clamp(4, 16),
944 queue_capacity: 100,
945 }
946 }
947}
948
949impl ParserConfig {
950 pub fn new() -> Self {
952 Self::default()
953 }
954
955 pub fn with_worker_count(mut self, count: usize) -> Self {
957 self.worker_count = count;
958 self
959 }
960
961 pub fn with_queue_capacity(mut self, capacity: usize) -> Self {
963 self.queue_capacity = capacity;
964 self
965 }
966}
967
/// Tuning for the download stage.
#[derive(Debug, Clone)]
pub struct DownloaderConfig {
    /// Maximum simultaneous downloads; defaults to `max(cpu_count, 16)`.
    pub max_concurrent: usize,
    /// Backpressure threshold; defaults to twice `max_concurrent`.
    pub backpressure_threshold: usize,
}
978
979impl Default for DownloaderConfig {
980 fn default() -> Self {
981 let max_concurrent = num_cpus::get().max(16);
982 DownloaderConfig {
983 max_concurrent,
984 backpressure_threshold: max_concurrent * 2,
985 }
986 }
987}
988
989impl DownloaderConfig {
990 pub fn new() -> Self {
992 Self::default()
993 }
994
995 pub fn with_max_concurrent(mut self, limit: usize) -> Self {
997 self.max_concurrent = limit;
998 self
999 }
1000
1001 pub fn with_backpressure_threshold(mut self, threshold: usize) -> Self {
1003 self.backpressure_threshold = threshold;
1004 self
1005 }
1006}
1007
/// Tuning for the item-processing stage.
#[derive(Debug, Clone)]
pub struct ItemProcessorConfig {
    /// Maximum items processed concurrently; defaults to
    /// `min(cpu_count, 8)`.
    pub max_concurrent: usize,
}
1016
1017impl Default for ItemProcessorConfig {
1018 fn default() -> Self {
1019 ItemProcessorConfig {
1020 max_concurrent: num_cpus::get().min(8),
1021 }
1022 }
1023}
1024
1025impl ItemProcessorConfig {
1026 pub fn new() -> Self {
1028 Self::default()
1029 }
1030
1031 pub fn with_max_concurrent(mut self, limit: usize) -> Self {
1033 self.max_concurrent = limit;
1034 self
1035 }
1036}