1use std::path::{Path, PathBuf};
38use std::time::Duration;
39
40#[derive(Debug, Clone)]
45pub struct CrawlerConfig {
46 pub max_concurrent_downloads: usize,
48 pub parser_workers: usize,
50 pub max_concurrent_pipelines: usize,
52 pub channel_capacity: usize,
54}
55
56impl Default for CrawlerConfig {
57 fn default() -> Self {
58 CrawlerConfig {
59 max_concurrent_downloads: num_cpus::get().max(16),
60 parser_workers: num_cpus::get().clamp(4, 16),
61 max_concurrent_pipelines: num_cpus::get().min(8),
62 channel_capacity: 1000,
63 }
64 }
65}
66
67impl CrawlerConfig {
68 pub fn new() -> Self {
70 Self::default()
71 }
72
73 pub fn with_max_concurrent_downloads(mut self, limit: usize) -> Self {
75 self.max_concurrent_downloads = limit;
76 self
77 }
78
79 pub fn with_parser_workers(mut self, count: usize) -> Self {
81 self.parser_workers = count;
82 self
83 }
84
85 pub fn with_max_concurrent_pipelines(mut self, limit: usize) -> Self {
87 self.max_concurrent_pipelines = limit;
88 self
89 }
90
91 pub fn with_channel_capacity(mut self, capacity: usize) -> Self {
93 self.channel_capacity = capacity;
94 self
95 }
96
97 pub fn validate(&self) -> Result<(), String> {
99 if self.max_concurrent_downloads == 0 {
100 return Err("max_concurrent_downloads must be greater than 0".to_string());
101 }
102 if self.parser_workers == 0 {
103 return Err("parser_workers must be greater than 0".to_string());
104 }
105 if self.max_concurrent_pipelines == 0 {
106 return Err("max_concurrent_pipelines must be greater than 0".to_string());
107 }
108 Ok(())
109 }
110}
111
/// Checkpoint settings: an optional file path and an optional interval.
///
/// Both fields start as `None`. Only `path` decides whether checkpointing counts as enabled;
/// see [`CheckpointConfig::is_enabled`].
#[derive(Debug, Clone, Default)]
pub struct CheckpointConfig {
    /// Checkpoint path, or `None` when no path has been configured.
    pub path: Option<PathBuf>,
    /// Checkpoint interval, or `None` when no interval has been configured.
    pub interval: Option<Duration>,
}

impl CheckpointConfig {
    /// Returns a configuration with neither a path nor an interval set.
    pub fn new() -> Self {
        Default::default()
    }

    /// Returns an empty [`CheckpointConfigBuilder`].
    pub fn builder() -> CheckpointConfigBuilder {
        CheckpointConfigBuilder::new()
    }

    /// Stores an owned copy of `path` and returns the configuration.
    pub fn with_path<P>(self, path: P) -> Self
    where
        P: AsRef<Path>,
    {
        Self {
            path: Some(PathBuf::from(path.as_ref())),
            ..self
        }
    }

    /// Stores `interval` and returns the configuration.
    pub fn with_interval(self, interval: Duration) -> Self {
        Self {
            interval: Some(interval),
            ..self
        }
    }

    /// Reports whether a path is set; the interval is not consulted.
    pub fn is_enabled(&self) -> bool {
        self.path.is_some()
    }
}

/// Step-by-step builder that produces a [`CheckpointConfig`].
#[derive(Debug, Default)]
pub struct CheckpointConfigBuilder {
    path: Option<PathBuf>,
    interval: Option<Duration>,
}

impl CheckpointConfigBuilder {
    /// Returns a builder with nothing set.
    pub fn new() -> Self {
        Default::default()
    }

    /// Records an owned copy of `path` for the configuration being built.
    pub fn path<P>(self, path: P) -> Self
    where
        P: AsRef<Path>,
    {
        Self {
            path: Some(PathBuf::from(path.as_ref())),
            ..self
        }
    }

    /// Records `interval` for the configuration being built.
    pub fn interval(self, interval: Duration) -> Self {
        Self {
            interval: Some(interval),
            ..self
        }
    }

    /// Consumes the builder and moves its values into a [`CheckpointConfig`].
    pub fn build(self) -> CheckpointConfig {
        let CheckpointConfigBuilder { path, interval } = self;
        CheckpointConfig { path, interval }
    }
}
186
187#[derive(Debug, Clone)]
191pub struct ParserConfig {
192 pub worker_count: usize,
194 pub queue_capacity: usize,
196}
197
198impl Default for ParserConfig {
199 fn default() -> Self {
200 ParserConfig {
201 worker_count: num_cpus::get().clamp(4, 16),
202 queue_capacity: 100,
203 }
204 }
205}
206
207impl ParserConfig {
208 pub fn new() -> Self {
210 Self::default()
211 }
212
213 pub fn with_worker_count(mut self, count: usize) -> Self {
215 self.worker_count = count;
216 self
217 }
218
219 pub fn with_queue_capacity(mut self, capacity: usize) -> Self {
221 self.queue_capacity = capacity;
222 self
223 }
224}
225
226#[derive(Debug, Clone)]
230pub struct DownloaderConfig {
231 pub max_concurrent: usize,
233 pub backpressure_threshold: usize,
235}
236
237impl Default for DownloaderConfig {
238 fn default() -> Self {
239 let max_concurrent = num_cpus::get().max(16);
240 DownloaderConfig {
241 max_concurrent,
242 backpressure_threshold: max_concurrent * 2,
243 }
244 }
245}
246
247impl DownloaderConfig {
248 pub fn new() -> Self {
250 Self::default()
251 }
252
253 pub fn with_max_concurrent(mut self, limit: usize) -> Self {
255 self.max_concurrent = limit;
256 self
257 }
258
259 pub fn with_backpressure_threshold(mut self, threshold: usize) -> Self {
261 self.backpressure_threshold = threshold;
262 self
263 }
264}
265
266#[derive(Debug, Clone)]
270pub struct ItemProcessorConfig {
271 pub max_concurrent: usize,
273}
274
275impl Default for ItemProcessorConfig {
276 fn default() -> Self {
277 ItemProcessorConfig {
278 max_concurrent: num_cpus::get().min(8),
279 }
280 }
281}
282
283impl ItemProcessorConfig {
284 pub fn new() -> Self {
286 Self::default()
287 }
288
289 pub fn with_max_concurrent(mut self, limit: usize) -> Self {
291 self.max_concurrent = limit;
292 self
293 }
294}
295
#[cfg(test)]
mod tests {
    use super::*;

    /// Every default in `CrawlerConfig` must be non-zero.
    #[test]
    fn test_crawler_config_default() {
        let defaults = CrawlerConfig::default();
        let fields = [
            ("max_concurrent_downloads", defaults.max_concurrent_downloads),
            ("parser_workers", defaults.parser_workers),
            ("max_concurrent_pipelines", defaults.max_concurrent_pipelines),
            ("channel_capacity", defaults.channel_capacity),
        ];

        for &(name, value) in &fields {
            assert!(value > 0, "default {} should be positive", name);
        }
    }

    /// Each `with_*` setter stores exactly the value it was given.
    #[test]
    fn test_crawler_config_builder() {
        let tuned = CrawlerConfig::new()
            .with_max_concurrent_downloads(20)
            .with_parser_workers(8)
            .with_max_concurrent_pipelines(4)
            .with_channel_capacity(500);

        assert_eq!(
            (
                tuned.max_concurrent_downloads,
                tuned.parser_workers,
                tuned.max_concurrent_pipelines,
                tuned.channel_capacity,
            ),
            (20, 8, 4, 500)
        );
    }

    /// Defaults validate; a zero download limit does not.
    #[test]
    fn test_crawler_config_validation() {
        assert!(CrawlerConfig::default().validate().is_ok());

        let zero_downloads = CrawlerConfig::new().with_max_concurrent_downloads(0);
        assert!(zero_downloads.validate().is_err());
    }

    /// The builder carries both values through to the built configuration.
    #[test]
    fn test_checkpoint_config_builder() {
        let every_thirty_seconds = Duration::from_secs(30);
        let built = CheckpointConfig::builder()
            .path("./test.checkpoint")
            .interval(every_thirty_seconds)
            .build();

        assert!(built.path.is_some());
        assert!(built.interval.is_some());
        assert!(built.is_enabled());
    }

    /// A fresh configuration has no path and therefore is not enabled.
    #[test]
    fn test_checkpoint_config_disabled() {
        assert!(!CheckpointConfig::new().is_enabled());
    }
}