Skip to main content

spider_core/
config.rs

1//! Configuration types used by the crawler runtime.
2//!
3//! Most users touch these settings indirectly through [`crate::CrawlerBuilder`],
4//! but they are public because they are also useful for explicit configuration
5//! and inspection.
6//!
7//! ## Example
8//!
9//! ```rust,ignore
10//! use spider_core::config::{CrawlerConfig, CheckpointConfig};
11//! use std::time::Duration;
12//!
13//! let crawler_config = CrawlerConfig::default()
14//!     .with_max_concurrent_downloads(10)
15//!     .with_parser_workers(4)
16//!     .with_max_concurrent_pipelines(8)
17//!     .with_channel_capacity(2000);
18//!
19//! let checkpoint_config = CheckpointConfig::builder()
20//!     .path("./crawl.checkpoint")
21//!     .interval(Duration::from_secs(60))
22//!     .build();
23//! ```
24
25use std::path::{Path, PathBuf};
26use std::time::Duration;
27
/// Core runtime configuration for the crawler.
///
/// Every field is public, so a value can be assembled with the `with_*`
/// methods or by assigning fields directly. The type derives `PartialEq` and
/// `Eq`, which allows two configurations to be compared field by field (for
/// example in tests, or to detect that a setting changed).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct CrawlerConfig {
    /// The maximum number of concurrent downloads.
    pub max_concurrent_downloads: usize,
    /// The maximum number of outstanding requests tracked by the scheduler.
    pub max_pending_requests: usize,
    /// The number of workers dedicated to parsing responses.
    pub parser_workers: usize,
    /// The maximum number of concurrent item processing pipelines.
    pub max_concurrent_pipelines: usize,
    /// The capacity of communication channels between components.
    pub channel_capacity: usize,
    /// Number of requests/items processed per parser output batch.
    pub output_batch_size: usize,
    /// Downloader backpressure threshold for the response channel.
    pub response_backpressure_threshold: usize,
    /// Parser backpressure threshold for the item channel.
    pub item_backpressure_threshold: usize,
    /// When enabled, retries are scheduled outside the downloader permit path.
    pub retry_release_permit: bool,
    /// Enables in-place live statistics updates on terminal stdout.
    pub live_stats: bool,
    /// Refresh interval for live statistics output.
    pub live_stats_interval: Duration,
    /// Optional item fields to show in live-stats preview instead of full JSON.
    ///
    /// `None` means no field selection was configured.
    pub live_stats_preview_fields: Option<Vec<String>>,
    /// Maximum time to wait for a graceful shutdown before forcing task abort.
    pub shutdown_grace_period: Duration,
    /// Maximum number of scraped items to process before stopping the crawl.
    ///
    /// `None` means no limit was configured.
    pub item_limit: Option<usize>,
}
60
61impl Default for CrawlerConfig {
62    fn default() -> Self {
63        let cpu_count = num_cpus::get();
64        let max_concurrent_downloads = (cpu_count * 4).clamp(8, 128);
65        let max_pending_requests = (max_concurrent_downloads * 8).clamp(64, 4096);
66        let parser_workers = (cpu_count * 2).clamp(4, 32);
67        let max_concurrent_pipelines = (cpu_count * 2).clamp(4, 16);
68        let channel_capacity = (max_pending_requests / 2).clamp(512, 4096);
69        CrawlerConfig {
70            max_concurrent_downloads,
71            max_pending_requests,
72            parser_workers,
73            max_concurrent_pipelines,
74            channel_capacity,
75            output_batch_size: 64,
76            response_backpressure_threshold: (max_concurrent_downloads * 6).min(channel_capacity),
77            item_backpressure_threshold: (parser_workers * 6).min(channel_capacity),
78            retry_release_permit: true,
79            live_stats: false,
80            live_stats_interval: Duration::from_millis(50),
81            live_stats_preview_fields: None,
82            shutdown_grace_period: Duration::from_secs(5),
83            item_limit: None,
84        }
85    }
86}
87
88impl CrawlerConfig {
89    /// Creates a new `CrawlerConfig` with default settings.
90    pub fn new() -> Self {
91        Self::default()
92    }
93
94    /// Sets the maximum number of concurrent downloads.
95    pub fn with_max_concurrent_downloads(mut self, limit: usize) -> Self {
96        self.max_concurrent_downloads = limit;
97        self
98    }
99
100    /// Sets the maximum number of outstanding requests tracked by the scheduler.
101    pub fn with_max_pending_requests(mut self, limit: usize) -> Self {
102        self.max_pending_requests = limit;
103        self
104    }
105
106    /// Sets the number of parser workers.
107    pub fn with_parser_workers(mut self, count: usize) -> Self {
108        self.parser_workers = count;
109        self
110    }
111
112    /// Sets the maximum number of concurrent pipelines.
113    pub fn with_max_concurrent_pipelines(mut self, limit: usize) -> Self {
114        self.max_concurrent_pipelines = limit;
115        self
116    }
117
118    /// Sets the channel capacity.
119    pub fn with_channel_capacity(mut self, capacity: usize) -> Self {
120        self.channel_capacity = capacity;
121        self
122    }
123
124    /// Sets the parser output batch size.
125    pub fn with_output_batch_size(mut self, batch_size: usize) -> Self {
126        self.output_batch_size = batch_size;
127        self
128    }
129
130    /// Sets the downloader response-channel backpressure threshold.
131    pub fn with_response_backpressure_threshold(mut self, threshold: usize) -> Self {
132        self.response_backpressure_threshold = threshold;
133        self
134    }
135
136    /// Sets the parser item-channel backpressure threshold.
137    pub fn with_item_backpressure_threshold(mut self, threshold: usize) -> Self {
138        self.item_backpressure_threshold = threshold;
139        self
140    }
141
142    /// Controls whether retry delays release the downloader permit immediately.
143    pub fn with_retry_release_permit(mut self, enabled: bool) -> Self {
144        self.retry_release_permit = enabled;
145        self
146    }
147
148    /// Enables or disables in-place live stats updates on stdout.
149    pub fn with_live_stats(mut self, enabled: bool) -> Self {
150        self.live_stats = enabled;
151        self
152    }
153
154    /// Sets the refresh interval used by live stats mode.
155    pub fn with_live_stats_interval(mut self, interval: Duration) -> Self {
156        self.live_stats_interval = interval;
157        self
158    }
159
160    /// Sets which item fields should be shown in live stats preview output.
161    ///
162    /// Field names support dot notation for nested JSON objects, for example:
163    /// `title`, `source_url`, or `metadata.Japanese`.
164    ///
165    /// You can also set aliases with `label=path`, for example:
166    /// `url=source_url` or `jp=metadata.Japanese`.
167    pub fn with_live_stats_preview_fields(
168        mut self,
169        fields: impl IntoIterator<Item = impl Into<String>>,
170    ) -> Self {
171        self.live_stats_preview_fields = Some(fields.into_iter().map(Into::into).collect());
172        self
173    }
174
175    /// Sets the maximum grace period for crawler shutdown.
176    pub fn with_shutdown_grace_period(mut self, grace_period: Duration) -> Self {
177        self.shutdown_grace_period = grace_period;
178        self
179    }
180
181    /// Sets the maximum number of scraped items to process before stopping the crawl.
182    pub fn with_item_limit(mut self, limit: usize) -> Self {
183        self.item_limit = Some(limit);
184        self
185    }
186
187    /// Validates the configuration.
188    pub fn validate(&self) -> Result<(), String> {
189        if self.max_concurrent_downloads == 0 {
190            return Err("max_concurrent_downloads must be greater than 0".to_string());
191        }
192        if self.max_pending_requests == 0 {
193            return Err("max_pending_requests must be greater than 0".to_string());
194        }
195        if self.parser_workers == 0 {
196            return Err("parser_workers must be greater than 0".to_string());
197        }
198        if self.max_concurrent_pipelines == 0 {
199            return Err("max_concurrent_pipelines must be greater than 0".to_string());
200        }
201        if self.output_batch_size == 0 {
202            return Err("output_batch_size must be greater than 0".to_string());
203        }
204        if self.response_backpressure_threshold == 0 {
205            return Err("response_backpressure_threshold must be greater than 0".to_string());
206        }
207        if self.item_backpressure_threshold == 0 {
208            return Err("item_backpressure_threshold must be greater than 0".to_string());
209        }
210        if self.live_stats_interval.is_zero() {
211            return Err("live_stats_interval must be greater than 0".to_string());
212        }
213        if matches!(self.live_stats_preview_fields.as_ref(), Some(fields) if fields.is_empty()) {
214            return Err("live_stats_preview_fields must not be empty".to_string());
215        }
216        if self.shutdown_grace_period.is_zero() {
217            return Err("shutdown_grace_period must be greater than 0".to_string());
218        }
219        if matches!(self.item_limit, Some(0)) {
220            return Err("item_limit must be greater than 0".to_string());
221        }
222        Ok(())
223    }
224}
225
/// Configuration for checkpoint save/load operations.
///
/// This struct holds settings for automatic checkpoint persistence,
/// allowing crawls to be resumed after interruption.
///
/// The default value has neither a path nor an interval. The type derives
/// `PartialEq` and `Eq` so two checkpoint configurations can be compared.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct CheckpointConfig {
    /// Optional path for saving and loading checkpoints.
    pub path: Option<PathBuf>,
    /// Optional interval between automatic checkpoint saves.
    pub interval: Option<Duration>,
}
237
238impl CheckpointConfig {
239    /// Creates a new `CheckpointConfig` with no path or interval.
240    pub fn new() -> Self {
241        Self::default()
242    }
243
244    /// Creates a new `CheckpointConfigBuilder` for fluent construction.
245    pub fn builder() -> CheckpointConfigBuilder {
246        CheckpointConfigBuilder::default()
247    }
248
249    /// Sets the checkpoint path.
250    pub fn with_path<P: AsRef<Path>>(mut self, path: P) -> Self {
251        self.path = Some(path.as_ref().to_path_buf());
252        self
253    }
254
255    /// Sets the checkpoint interval.
256    pub fn with_interval(mut self, interval: Duration) -> Self {
257        self.interval = Some(interval);
258        self
259    }
260
261    /// Returns true if checkpointing is enabled.
262    pub fn is_enabled(&self) -> bool {
263        self.path.is_some()
264    }
265}
266
/// Builder for `CheckpointConfig`.
///
/// Both settings start out unset. The builder derives `Clone`, `PartialEq`
/// and `Eq`, so a partially filled builder can be duplicated and compared.
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct CheckpointConfigBuilder {
    /// Checkpoint path collected so far, if any.
    path: Option<PathBuf>,
    /// Checkpoint interval collected so far, if any.
    interval: Option<Duration>,
}
273
274impl CheckpointConfigBuilder {
275    /// Creates a new builder with default settings.
276    pub fn new() -> Self {
277        Self::default()
278    }
279
280    /// Sets the checkpoint path.
281    pub fn path<P: AsRef<Path>>(mut self, path: P) -> Self {
282        self.path = Some(path.as_ref().to_path_buf());
283        self
284    }
285
286    /// Sets the checkpoint interval.
287    pub fn interval(mut self, interval: Duration) -> Self {
288        self.interval = Some(interval);
289        self
290    }
291
292    /// Builds the `CheckpointConfig`.
293    pub fn build(self) -> CheckpointConfig {
294        CheckpointConfig {
295            path: self.path,
296            interval: self.interval,
297        }
298    }
299}
300
/// Configuration for the parser workers.
///
/// This struct holds settings specific to the response parsing subsystem.
/// It derives `PartialEq` and `Eq` so two parser configurations can be
/// compared.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ParserConfig {
    /// The number of parser worker tasks to spawn.
    pub worker_count: usize,
    /// The capacity of the internal parse queue per worker.
    pub queue_capacity: usize,
}
311
312impl Default for ParserConfig {
313    fn default() -> Self {
314        ParserConfig {
315            worker_count: num_cpus::get().clamp(4, 16),
316            queue_capacity: 100,
317        }
318    }
319}
320
321impl ParserConfig {
322    /// Creates a new `ParserConfig` with default settings.
323    pub fn new() -> Self {
324        Self::default()
325    }
326
327    /// Sets the number of parser workers.
328    pub fn with_worker_count(mut self, count: usize) -> Self {
329        self.worker_count = count;
330        self
331    }
332
333    /// Sets the internal queue capacity per worker.
334    pub fn with_queue_capacity(mut self, capacity: usize) -> Self {
335        self.queue_capacity = capacity;
336        self
337    }
338}
339
/// Configuration for the downloader.
///
/// This struct holds settings specific to the HTTP download subsystem.
/// It derives `PartialEq` and `Eq` so two downloader configurations can be
/// compared.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct DownloaderConfig {
    /// The maximum number of concurrent downloads.
    pub max_concurrent: usize,
    /// The backpressure threshold for response channel occupancy.
    pub backpressure_threshold: usize,
}
350
351impl Default for DownloaderConfig {
352    fn default() -> Self {
353        let max_concurrent = num_cpus::get().max(16);
354        DownloaderConfig {
355            max_concurrent,
356            backpressure_threshold: max_concurrent * 2,
357        }
358    }
359}
360
361impl DownloaderConfig {
362    /// Creates a new `DownloaderConfig` with default settings.
363    pub fn new() -> Self {
364        Self::default()
365    }
366
367    /// Sets the maximum number of concurrent downloads.
368    pub fn with_max_concurrent(mut self, limit: usize) -> Self {
369        self.max_concurrent = limit;
370        self
371    }
372
373    /// Sets the backpressure threshold.
374    pub fn with_backpressure_threshold(mut self, threshold: usize) -> Self {
375        self.backpressure_threshold = threshold;
376        self
377    }
378}
379
/// Configuration for the item processor.
///
/// This struct holds settings specific to the item processing pipeline.
/// It derives `PartialEq` and `Eq` so two item-processor configurations can
/// be compared.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ItemProcessorConfig {
    /// The maximum number of concurrent pipeline processors.
    pub max_concurrent: usize,
}
388
389impl Default for ItemProcessorConfig {
390    fn default() -> Self {
391        ItemProcessorConfig {
392            max_concurrent: num_cpus::get().min(8),
393        }
394    }
395}
396
397impl ItemProcessorConfig {
398    /// Creates a new `ItemProcessorConfig` with default settings.
399    pub fn new() -> Self {
400        Self::default()
401    }
402
403    /// Sets the maximum number of concurrent processors.
404    pub fn with_max_concurrent(mut self, limit: usize) -> Self {
405        self.max_concurrent = limit;
406        self
407    }
408}