Skip to main content

spider_core/
config.rs

1//! # Configuration Module
2//!
3//! Provides configuration structs for the crawler and its components.
4//!
5//! ## Overview
6//!
7//! This module defines configuration structs that group related parameters
8//! together, following the Parameter Object pattern. Each configuration struct
9//! implements a builder pattern for ergonomic construction.
10//!
11//! ## Key Structs
12//!
13//! - [`CrawlerConfig`]: Core crawler concurrency and channel settings
14//! - [`CheckpointConfig`]: Checkpoint save/load configuration
15//! - [`ParserConfig`]: Parser worker configuration
16//! - [`DownloaderConfig`]: Downloader concurrency configuration
17//! - [`ItemProcessorConfig`]: Item processing pipeline configuration
18//!
19//! ## Example
20//!
21//! ```rust
22//! use spider_core::config::{CrawlerConfig, CheckpointConfig};
23//! use std::time::Duration;
24//!
25//! let crawler_config = CrawlerConfig::default()
26//!     .with_max_concurrent_downloads(10)
27//!     .with_parser_workers(4)
28//!     .with_max_concurrent_pipelines(8)
29//!     .with_channel_capacity(2000);
30//!
31//! let checkpoint_config = CheckpointConfig::builder()
32//!     .path("./crawl.checkpoint")
33//!     .interval(Duration::from_secs(60))
34//!     .build();
35//! ```
36
37use std::path::{Path, PathBuf};
38use std::time::Duration;
39
40/// Core configuration for the crawler's concurrency settings.
41///
42/// This struct holds tunable parameters that control the parallelism
43/// and throughput of the crawler.
44#[derive(Debug, Clone)]
45pub struct CrawlerConfig {
46    /// The maximum number of concurrent downloads.
47    pub max_concurrent_downloads: usize,
48    /// The number of workers dedicated to parsing responses.
49    pub parser_workers: usize,
50    /// The maximum number of concurrent item processing pipelines.
51    pub max_concurrent_pipelines: usize,
52    /// The capacity of communication channels between components.
53    pub channel_capacity: usize,
54    /// Enables in-place live statistics updates on terminal stdout.
55    pub live_stats: bool,
56    /// Refresh interval for live statistics output.
57    pub live_stats_interval: Duration,
58}
59
60impl Default for CrawlerConfig {
61    fn default() -> Self {
62        CrawlerConfig {
63            max_concurrent_downloads: num_cpus::get().max(16),
64            parser_workers: num_cpus::get().clamp(4, 16),
65            max_concurrent_pipelines: num_cpus::get().min(8),
66            channel_capacity: 1000,
67            live_stats: false,
68            live_stats_interval: Duration::from_millis(50),
69        }
70    }
71}
72
73impl CrawlerConfig {
74    /// Creates a new `CrawlerConfig` with default settings.
75    pub fn new() -> Self {
76        Self::default()
77    }
78
79    /// Sets the maximum number of concurrent downloads.
80    pub fn with_max_concurrent_downloads(mut self, limit: usize) -> Self {
81        self.max_concurrent_downloads = limit;
82        self
83    }
84
85    /// Sets the number of parser workers.
86    pub fn with_parser_workers(mut self, count: usize) -> Self {
87        self.parser_workers = count;
88        self
89    }
90
91    /// Sets the maximum number of concurrent pipelines.
92    pub fn with_max_concurrent_pipelines(mut self, limit: usize) -> Self {
93        self.max_concurrent_pipelines = limit;
94        self
95    }
96
97    /// Sets the channel capacity.
98    pub fn with_channel_capacity(mut self, capacity: usize) -> Self {
99        self.channel_capacity = capacity;
100        self
101    }
102
103    /// Enables or disables in-place live stats updates on stdout.
104    pub fn with_live_stats(mut self, enabled: bool) -> Self {
105        self.live_stats = enabled;
106        self
107    }
108
109    /// Sets the refresh interval used by live stats mode.
110    pub fn with_live_stats_interval(mut self, interval: Duration) -> Self {
111        self.live_stats_interval = interval;
112        self
113    }
114
115    /// Validates the configuration.
116    pub fn validate(&self) -> Result<(), String> {
117        if self.max_concurrent_downloads == 0 {
118            return Err("max_concurrent_downloads must be greater than 0".to_string());
119        }
120        if self.parser_workers == 0 {
121            return Err("parser_workers must be greater than 0".to_string());
122        }
123        if self.max_concurrent_pipelines == 0 {
124            return Err("max_concurrent_pipelines must be greater than 0".to_string());
125        }
126        if self.live_stats_interval.is_zero() {
127            return Err("live_stats_interval must be greater than 0".to_string());
128        }
129        Ok(())
130    }
131}
132
133/// Configuration for checkpoint save/load operations.
134///
135/// This struct holds settings for automatic checkpoint persistence,
136/// allowing crawls to be resumed after interruption.
137#[derive(Debug, Clone, Default)]
138pub struct CheckpointConfig {
139    /// Optional path for saving and loading checkpoints.
140    pub path: Option<PathBuf>,
141    /// Optional interval between automatic checkpoint saves.
142    pub interval: Option<Duration>,
143}
144
145impl CheckpointConfig {
146    /// Creates a new `CheckpointConfig` with no path or interval.
147    pub fn new() -> Self {
148        Self::default()
149    }
150
151    /// Creates a new `CheckpointConfigBuilder` for fluent construction.
152    pub fn builder() -> CheckpointConfigBuilder {
153        CheckpointConfigBuilder::default()
154    }
155
156    /// Sets the checkpoint path.
157    pub fn with_path<P: AsRef<Path>>(mut self, path: P) -> Self {
158        self.path = Some(path.as_ref().to_path_buf());
159        self
160    }
161
162    /// Sets the checkpoint interval.
163    pub fn with_interval(mut self, interval: Duration) -> Self {
164        self.interval = Some(interval);
165        self
166    }
167
168    /// Returns true if checkpointing is enabled.
169    pub fn is_enabled(&self) -> bool {
170        self.path.is_some()
171    }
172}
173
174/// Builder for `CheckpointConfig`.
175#[derive(Debug, Default)]
176pub struct CheckpointConfigBuilder {
177    path: Option<PathBuf>,
178    interval: Option<Duration>,
179}
180
181impl CheckpointConfigBuilder {
182    /// Creates a new builder with default settings.
183    pub fn new() -> Self {
184        Self::default()
185    }
186
187    /// Sets the checkpoint path.
188    pub fn path<P: AsRef<Path>>(mut self, path: P) -> Self {
189        self.path = Some(path.as_ref().to_path_buf());
190        self
191    }
192
193    /// Sets the checkpoint interval.
194    pub fn interval(mut self, interval: Duration) -> Self {
195        self.interval = Some(interval);
196        self
197    }
198
199    /// Builds the `CheckpointConfig`.
200    pub fn build(self) -> CheckpointConfig {
201        CheckpointConfig {
202            path: self.path,
203            interval: self.interval,
204        }
205    }
206}
207
208/// Configuration for the parser workers.
209///
210/// This struct holds settings specific to the response parsing subsystem.
211#[derive(Debug, Clone)]
212pub struct ParserConfig {
213    /// The number of parser worker tasks to spawn.
214    pub worker_count: usize,
215    /// The capacity of the internal parse queue per worker.
216    pub queue_capacity: usize,
217}
218
219impl Default for ParserConfig {
220    fn default() -> Self {
221        ParserConfig {
222            worker_count: num_cpus::get().clamp(4, 16),
223            queue_capacity: 100,
224        }
225    }
226}
227
228impl ParserConfig {
229    /// Creates a new `ParserConfig` with default settings.
230    pub fn new() -> Self {
231        Self::default()
232    }
233
234    /// Sets the number of parser workers.
235    pub fn with_worker_count(mut self, count: usize) -> Self {
236        self.worker_count = count;
237        self
238    }
239
240    /// Sets the internal queue capacity per worker.
241    pub fn with_queue_capacity(mut self, capacity: usize) -> Self {
242        self.queue_capacity = capacity;
243        self
244    }
245}
246
247/// Configuration for the downloader.
248///
249/// This struct holds settings specific to the HTTP download subsystem.
250#[derive(Debug, Clone)]
251pub struct DownloaderConfig {
252    /// The maximum number of concurrent downloads.
253    pub max_concurrent: usize,
254    /// The backpressure threshold for response channel occupancy.
255    pub backpressure_threshold: usize,
256}
257
258impl Default for DownloaderConfig {
259    fn default() -> Self {
260        let max_concurrent = num_cpus::get().max(16);
261        DownloaderConfig {
262            max_concurrent,
263            backpressure_threshold: max_concurrent * 2,
264        }
265    }
266}
267
268impl DownloaderConfig {
269    /// Creates a new `DownloaderConfig` with default settings.
270    pub fn new() -> Self {
271        Self::default()
272    }
273
274    /// Sets the maximum number of concurrent downloads.
275    pub fn with_max_concurrent(mut self, limit: usize) -> Self {
276        self.max_concurrent = limit;
277        self
278    }
279
280    /// Sets the backpressure threshold.
281    pub fn with_backpressure_threshold(mut self, threshold: usize) -> Self {
282        self.backpressure_threshold = threshold;
283        self
284    }
285}
286
287/// Configuration for the item processor.
288///
289/// This struct holds settings specific to the item processing pipeline.
290#[derive(Debug, Clone)]
291pub struct ItemProcessorConfig {
292    /// The maximum number of concurrent pipeline processors.
293    pub max_concurrent: usize,
294}
295
296impl Default for ItemProcessorConfig {
297    fn default() -> Self {
298        ItemProcessorConfig {
299            max_concurrent: num_cpus::get().min(8),
300        }
301    }
302}
303
304impl ItemProcessorConfig {
305    /// Creates a new `ItemProcessorConfig` with default settings.
306    pub fn new() -> Self {
307        Self::default()
308    }
309
310    /// Sets the maximum number of concurrent processors.
311    pub fn with_max_concurrent(mut self, limit: usize) -> Self {
312        self.max_concurrent = limit;
313        self
314    }
315}
316
#[cfg(test)]
mod tests {
    use super::*;

    /// The defaults must be non-zero so they pass `validate` untouched, and
    /// live stats must start disabled.
    #[test]
    fn test_crawler_config_default() {
        let config = CrawlerConfig::default();
        assert!(config.max_concurrent_downloads > 0);
        assert!(config.parser_workers > 0);
        assert!(config.max_concurrent_pipelines > 0);
        assert!(config.channel_capacity > 0);
        assert!(!config.live_stats);
        assert!(!config.live_stats_interval.is_zero());
    }

    /// Every `with_*` setter must store exactly the value it was given.
    #[test]
    fn test_crawler_config_builder() {
        let config = CrawlerConfig::new()
            .with_max_concurrent_downloads(20)
            .with_parser_workers(8)
            .with_max_concurrent_pipelines(4)
            .with_channel_capacity(500)
            .with_live_stats(true)
            .with_live_stats_interval(Duration::from_secs(1));

        assert_eq!(config.max_concurrent_downloads, 20);
        assert_eq!(config.parser_workers, 8);
        assert_eq!(config.max_concurrent_pipelines, 4);
        assert_eq!(config.channel_capacity, 500);
        assert!(config.live_stats);
        assert_eq!(config.live_stats_interval, Duration::from_secs(1));
    }

    #[test]
    fn test_crawler_config_validation() {
        let valid_config = CrawlerConfig::default();
        assert!(valid_config.validate().is_ok());

        let invalid_config = CrawlerConfig::new().with_max_concurrent_downloads(0);
        assert!(invalid_config.validate().is_err());
    }

    /// Each setting that `validate` checks must be rejected on its own, with
    /// the message that names it.
    #[test]
    fn test_crawler_config_validation_rejects_each_zero_field() {
        let cases = [
            (
                CrawlerConfig::new().with_max_concurrent_downloads(0),
                "max_concurrent_downloads must be greater than 0",
            ),
            (
                CrawlerConfig::new().with_parser_workers(0),
                "parser_workers must be greater than 0",
            ),
            (
                CrawlerConfig::new().with_max_concurrent_pipelines(0),
                "max_concurrent_pipelines must be greater than 0",
            ),
            (
                CrawlerConfig::new().with_live_stats_interval(Duration::from_secs(0)),
                "live_stats_interval must be greater than 0",
            ),
        ];

        for (config, expected) in cases.iter() {
            assert_eq!(config.validate(), Err((*expected).to_string()));
        }
    }

    /// The builder must hand the exact path and interval over to the config.
    #[test]
    fn test_checkpoint_config_builder() {
        let config = CheckpointConfig::builder()
            .path("./test.checkpoint")
            .interval(Duration::from_secs(30))
            .build();

        assert_eq!(config.path.as_deref(), Some(Path::new("./test.checkpoint")));
        assert_eq!(config.interval, Some(Duration::from_secs(30)));
        assert!(config.is_enabled());
    }

    #[test]
    fn test_checkpoint_config_disabled() {
        let config = CheckpointConfig::new();
        assert!(!config.is_enabled());
    }

    /// Only a path enables checkpointing; an interval on its own does not.
    #[test]
    fn test_checkpoint_config_enabled_depends_on_path_only() {
        let interval_only = CheckpointConfig::new().with_interval(Duration::from_secs(30));
        assert!(!interval_only.is_enabled());

        let path_only = CheckpointConfig::new().with_path("./test.checkpoint");
        assert!(path_only.is_enabled());
        assert!(path_only.interval.is_none());
    }
}
370}