// spider_core/config.rs
1//! # Configuration Module
2//!
3//! Provides configuration structs for the crawler and its components.
4//!
5//! ## Overview
6//!
7//! This module defines configuration structs that group related parameters
8//! together, following the Parameter Object pattern. Each configuration struct
9//! implements a builder pattern for ergonomic construction.
10//!
11//! ## Key Structs
12//!
13//! - [`CrawlerConfig`]: Core crawler concurrency and channel settings
14//! - [`CheckpointConfig`]: Checkpoint save/load configuration
15//! - [`ParserConfig`]: Parser worker configuration
16//! - [`DownloaderConfig`]: Downloader concurrency configuration
17//! - [`ItemProcessorConfig`]: Item processing pipeline configuration
18//!
19//! ## Example
20//!
21//! ```rust
22//! use spider_core::config::{CrawlerConfig, CheckpointConfig};
23//! use std::time::Duration;
24//!
25//! let crawler_config = CrawlerConfig::default()
26//!     .with_max_concurrent_downloads(10)
27//!     .with_parser_workers(4)
28//!     .with_max_concurrent_pipelines(8)
29//!     .with_channel_capacity(2000);
30//!
31//! let checkpoint_config = CheckpointConfig::builder()
32//!     .path("./crawl.checkpoint")
33//!     .interval(Duration::from_secs(60))
34//!     .build();
35//! ```
36
37use std::path::{Path, PathBuf};
38use std::time::Duration;
39
/// Core configuration for the crawler's concurrency settings.
///
/// This struct holds tunable parameters that control the parallelism
/// and throughput of the crawler.
#[derive(Debug, Clone)]
pub struct CrawlerConfig {
    /// The maximum number of concurrent downloads.
    pub max_concurrent_downloads: usize,
    /// The number of workers dedicated to parsing responses.
    pub parser_workers: usize,
    /// The maximum number of concurrent item processing pipelines.
    pub max_concurrent_pipelines: usize,
    /// The capacity of communication channels between components.
    pub channel_capacity: usize,
}

impl Default for CrawlerConfig {
    fn default() -> Self {
        // Logical CPU count from the standard library (replaces the external
        // `num_cpus` crate); falls back to 1 if the count cannot be determined.
        let cpus = std::thread::available_parallelism()
            .map(|n| n.get())
            .unwrap_or(1);
        CrawlerConfig {
            // Downloads are I/O-bound: allow at least 16 slots even on small machines.
            max_concurrent_downloads: cpus.max(16),
            // Parsing is CPU-bound: keep worker count within [4, 16].
            parser_workers: cpus.clamp(4, 16),
            // Pipelines are capped at 8 to bound downstream resource usage.
            max_concurrent_pipelines: cpus.min(8),
            channel_capacity: 1000,
        }
    }
}

impl CrawlerConfig {
    /// Creates a new `CrawlerConfig` with default settings.
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets the maximum number of concurrent downloads.
    pub fn with_max_concurrent_downloads(mut self, limit: usize) -> Self {
        self.max_concurrent_downloads = limit;
        self
    }

    /// Sets the number of parser workers.
    pub fn with_parser_workers(mut self, count: usize) -> Self {
        self.parser_workers = count;
        self
    }

    /// Sets the maximum number of concurrent pipelines.
    pub fn with_max_concurrent_pipelines(mut self, limit: usize) -> Self {
        self.max_concurrent_pipelines = limit;
        self
    }

    /// Sets the channel capacity.
    pub fn with_channel_capacity(mut self, capacity: usize) -> Self {
        self.channel_capacity = capacity;
        self
    }

    /// Validates the configuration.
    ///
    /// # Errors
    ///
    /// Returns a descriptive message if any tunable is zero; every setting
    /// must be at least 1 for the crawler to make progress.
    pub fn validate(&self) -> Result<(), String> {
        if self.max_concurrent_downloads == 0 {
            return Err("max_concurrent_downloads must be greater than 0".to_string());
        }
        if self.parser_workers == 0 {
            return Err("parser_workers must be greater than 0".to_string());
        }
        if self.max_concurrent_pipelines == 0 {
            return Err("max_concurrent_pipelines must be greater than 0".to_string());
        }
        // A zero-capacity channel cannot buffer any messages and panics at
        // construction time in some channel implementations (e.g. tokio mpsc),
        // so reject it here alongside the other zero checks.
        if self.channel_capacity == 0 {
            return Err("channel_capacity must be greater than 0".to_string());
        }
        Ok(())
    }
}
111
/// Configuration for checkpoint save/load operations.
///
/// Holds the settings that control automatic checkpoint persistence so an
/// interrupted crawl can be resumed later.
#[derive(Debug, Clone, Default)]
pub struct CheckpointConfig {
    /// Optional path for saving and loading checkpoints.
    pub path: Option<PathBuf>,
    /// Optional interval between automatic checkpoint saves.
    pub interval: Option<Duration>,
}

impl CheckpointConfig {
    /// Creates a `CheckpointConfig` with neither a path nor an interval set.
    pub fn new() -> Self {
        CheckpointConfig::default()
    }

    /// Returns a fresh [`CheckpointConfigBuilder`] for fluent construction.
    pub fn builder() -> CheckpointConfigBuilder {
        CheckpointConfigBuilder::new()
    }

    /// Returns this config with the checkpoint path set.
    pub fn with_path<P: AsRef<Path>>(self, path: P) -> Self {
        CheckpointConfig {
            path: Some(PathBuf::from(path.as_ref())),
            ..self
        }
    }

    /// Returns this config with the checkpoint interval set.
    pub fn with_interval(self, interval: Duration) -> Self {
        CheckpointConfig {
            interval: Some(interval),
            ..self
        }
    }

    /// Reports whether checkpointing is enabled, i.e. a path is configured.
    pub fn is_enabled(&self) -> bool {
        self.path.is_some()
    }
}

/// Builder for [`CheckpointConfig`].
#[derive(Debug, Default)]
pub struct CheckpointConfigBuilder {
    path: Option<PathBuf>,
    interval: Option<Duration>,
}

impl CheckpointConfigBuilder {
    /// Creates a builder with no settings applied.
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets the checkpoint path.
    pub fn path<P: AsRef<Path>>(self, path: P) -> Self {
        Self {
            path: Some(PathBuf::from(path.as_ref())),
            ..self
        }
    }

    /// Sets the checkpoint interval.
    pub fn interval(self, interval: Duration) -> Self {
        Self {
            interval: Some(interval),
            ..self
        }
    }

    /// Consumes the builder and produces the finished `CheckpointConfig`.
    pub fn build(self) -> CheckpointConfig {
        let CheckpointConfigBuilder { path, interval } = self;
        CheckpointConfig { path, interval }
    }
}
186
/// Configuration for the parser workers.
///
/// This struct holds settings specific to the response parsing subsystem.
#[derive(Debug, Clone)]
pub struct ParserConfig {
    /// The number of parser worker tasks to spawn.
    pub worker_count: usize,
    /// The capacity of the internal parse queue per worker.
    pub queue_capacity: usize,
}

impl Default for ParserConfig {
    fn default() -> Self {
        // Parsing is CPU-bound, so size the pool from the logical CPU count
        // (stdlib replacement for the external `num_cpus` crate), clamped to
        // [4, 16]; fall back to 1 CPU when the count is unknown.
        let cpus = std::thread::available_parallelism()
            .map(|n| n.get())
            .unwrap_or(1);
        ParserConfig {
            worker_count: cpus.clamp(4, 16),
            queue_capacity: 100,
        }
    }
}

impl ParserConfig {
    /// Creates a new `ParserConfig` with default settings.
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets the number of parser workers.
    pub fn with_worker_count(mut self, count: usize) -> Self {
        self.worker_count = count;
        self
    }

    /// Sets the internal queue capacity per worker.
    pub fn with_queue_capacity(mut self, capacity: usize) -> Self {
        self.queue_capacity = capacity;
        self
    }
}
225
/// Configuration for the downloader.
///
/// This struct holds settings specific to the HTTP download subsystem.
#[derive(Debug, Clone)]
pub struct DownloaderConfig {
    /// The maximum number of concurrent downloads.
    pub max_concurrent: usize,
    /// The backpressure threshold for response channel occupancy.
    pub backpressure_threshold: usize,
}

impl Default for DownloaderConfig {
    fn default() -> Self {
        // Logical CPU count via the standard library (stdlib replacement for
        // the external `num_cpus` crate); falls back to 1 when unknown.
        // Downloads are I/O-bound, so allow at least 16 in flight.
        let max_concurrent = std::thread::available_parallelism()
            .map(|n| n.get())
            .unwrap_or(1)
            .max(16);
        DownloaderConfig {
            max_concurrent,
            // Start shedding load once the response channel holds twice as
            // many items as we can download concurrently.
            backpressure_threshold: max_concurrent * 2,
        }
    }
}

impl DownloaderConfig {
    /// Creates a new `DownloaderConfig` with default settings.
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets the maximum number of concurrent downloads.
    pub fn with_max_concurrent(mut self, limit: usize) -> Self {
        self.max_concurrent = limit;
        self
    }

    /// Sets the backpressure threshold.
    pub fn with_backpressure_threshold(mut self, threshold: usize) -> Self {
        self.backpressure_threshold = threshold;
        self
    }
}
265
/// Configuration for the item processor.
///
/// This struct holds settings specific to the item processing pipeline.
#[derive(Debug, Clone)]
pub struct ItemProcessorConfig {
    /// The maximum number of concurrent pipeline processors.
    pub max_concurrent: usize,
}

impl Default for ItemProcessorConfig {
    fn default() -> Self {
        // Logical CPU count via the standard library (stdlib replacement for
        // the external `num_cpus` crate); falls back to 1 when unknown.
        // Capped at 8 to bound downstream resource usage.
        let cpus = std::thread::available_parallelism()
            .map(|n| n.get())
            .unwrap_or(1);
        ItemProcessorConfig {
            max_concurrent: cpus.min(8),
        }
    }
}

impl ItemProcessorConfig {
    /// Creates a new `ItemProcessorConfig` with default settings.
    pub fn new() -> Self {
        Self::default()
    }

    /// Sets the maximum number of concurrent processors.
    pub fn with_max_concurrent(mut self, limit: usize) -> Self {
        self.max_concurrent = limit;
        self
    }
}
295
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_crawler_config_default() {
        let cfg = CrawlerConfig::default();
        // Every concurrency tunable must default to a usable (non-zero) value.
        for value in [
            cfg.max_concurrent_downloads,
            cfg.parser_workers,
            cfg.max_concurrent_pipelines,
            cfg.channel_capacity,
        ] {
            assert!(value > 0);
        }
    }

    #[test]
    fn test_crawler_config_builder() {
        let cfg = CrawlerConfig::new()
            .with_max_concurrent_downloads(20)
            .with_parser_workers(8)
            .with_max_concurrent_pipelines(4)
            .with_channel_capacity(500);

        let observed = (
            cfg.max_concurrent_downloads,
            cfg.parser_workers,
            cfg.max_concurrent_pipelines,
            cfg.channel_capacity,
        );
        assert_eq!(observed, (20, 8, 4, 500));
    }

    #[test]
    fn test_crawler_config_validation() {
        // Defaults are always valid.
        assert!(CrawlerConfig::default().validate().is_ok());
        // Zeroing a concurrency limit must be rejected.
        let broken = CrawlerConfig::new().with_max_concurrent_downloads(0);
        assert!(broken.validate().is_err());
    }

    #[test]
    fn test_checkpoint_config_builder() {
        let cfg = CheckpointConfig::builder()
            .path("./test.checkpoint")
            .interval(Duration::from_secs(30))
            .build();

        assert!(cfg.path.is_some());
        assert!(cfg.interval.is_some());
        assert!(cfg.is_enabled());
    }

    #[test]
    fn test_checkpoint_config_disabled() {
        // Without a path, checkpointing is off.
        assert!(!CheckpointConfig::new().is_enabled());
    }
}