Skip to main content

essence/crawler/
config.rs

1use serde::{Deserialize, Serialize};
2use std::sync::{Arc, Mutex};
3use std::collections::HashMap;
4use tracing::{debug, warn};
5use crate::error::Result;
6
7/// Configuration for crawler with memory and queue limits
8#[derive(Debug, Clone, Serialize, Deserialize)]
9pub struct CrawlerConfig {
10    /// Maximum queue size (default: 1000)
11    #[serde(default = "default_max_queue_size")]
12    pub max_queue_size: usize,
13
14    /// Maximum memory usage in MB (default: 500MB)
15    #[serde(default = "default_max_memory_mb")]
16    pub max_memory_mb: usize,
17
18    /// Maximum concurrent requests (default: 10)
19    #[serde(default = "default_max_concurrent_requests")]
20    pub max_concurrent_requests: usize,
21
22    /// Circuit breaker failure threshold (default: 3)
23    #[serde(default = "default_circuit_breaker_threshold")]
24    pub circuit_breaker_threshold: usize,
25
26    /// Backpressure threshold as percentage of max_queue_size (default: 80)
27    #[serde(default = "default_backpressure_threshold")]
28    pub backpressure_threshold: u8,
29
30    /// Enable memory monitoring (default: true)
31    #[serde(default = "default_true")]
32    pub enable_memory_monitoring: bool,
33
34    /// Enable circuit breaker (default: true)
35    #[serde(default = "default_true")]
36    pub enable_circuit_breaker: bool,
37}
38
39impl Default for CrawlerConfig {
40    fn default() -> Self {
41        Self {
42            max_queue_size: default_max_queue_size(),
43            max_memory_mb: default_max_memory_mb(),
44            max_concurrent_requests: default_max_concurrent_requests(),
45            circuit_breaker_threshold: default_circuit_breaker_threshold(),
46            backpressure_threshold: default_backpressure_threshold(),
47            enable_memory_monitoring: true,
48            enable_circuit_breaker: true,
49        }
50    }
51}
52
/// Fallback for `CrawlerConfig::max_queue_size`: 1000 entries.
fn default_max_queue_size() -> usize {
    const MAX_QUEUE_SIZE: usize = 1_000;
    MAX_QUEUE_SIZE
}
56
/// Fallback for `CrawlerConfig::max_memory_mb`: 500 MB.
fn default_max_memory_mb() -> usize {
    const MAX_MEMORY_MB: usize = 500;
    MAX_MEMORY_MB
}
60
/// Fallback for `CrawlerConfig::max_concurrent_requests`: 10 requests.
fn default_max_concurrent_requests() -> usize {
    const MAX_CONCURRENT_REQUESTS: usize = 10;
    MAX_CONCURRENT_REQUESTS
}
64
/// Fallback for `CrawlerConfig::circuit_breaker_threshold`: 3 failures.
fn default_circuit_breaker_threshold() -> usize {
    const CIRCUIT_BREAKER_THRESHOLD: usize = 3;
    CIRCUIT_BREAKER_THRESHOLD
}
68
/// Fallback for `CrawlerConfig::backpressure_threshold`: 80 percent.
fn default_backpressure_threshold() -> u8 {
    const BACKPRESSURE_PERCENT: u8 = 80;
    BACKPRESSURE_PERCENT
}
72
/// Fallback for the boolean switches in `CrawlerConfig`: enabled.
fn default_true() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
76
/// Per-domain failure counter used to stop hitting domains that keep failing.
///
/// Cloning is cheap: the failure table sits behind an `Arc`, so every clone
/// reads and updates the same counts.
#[derive(Clone, Debug)]
pub struct CircuitBreaker {
    /// Failure count for each domain, shared between clones and guarded by a
    /// mutex.
    failures: Arc<Mutex<HashMap<String, usize>>>,
    /// Failure count at or above which a domain is treated as tripped.
    threshold: usize,
}
83
84impl CircuitBreaker {
85    pub fn new(threshold: usize) -> Self {
86        Self {
87            failures: Arc::new(Mutex::new(HashMap::new())),
88            threshold,
89        }
90    }
91
92    /// Record a failure for a domain
93    pub fn record_failure(&self, domain: &str) {
94        let mut failures = self.failures.lock().unwrap();
95        let count = failures.entry(domain.to_string()).or_insert(0);
96        *count += 1;
97
98        if *count >= self.threshold {
99            warn!("Circuit breaker triggered for domain: {} ({} failures)", domain, count);
100        }
101    }
102
103    /// Record a success for a domain (resets failure count)
104    pub fn record_success(&self, domain: &str) {
105        let mut failures = self.failures.lock().unwrap();
106        failures.remove(domain);
107        debug!("Circuit breaker reset for domain: {}", domain);
108    }
109
110    /// Check if a domain should be skipped due to too many failures
111    pub fn should_skip(&self, domain: &str) -> bool {
112        let failures = self.failures.lock().unwrap();
113        failures.get(domain).unwrap_or(&0) >= &self.threshold
114    }
115
116    /// Get the current failure count for a domain
117    pub fn get_failure_count(&self, domain: &str) -> usize {
118        let failures = self.failures.lock().unwrap();
119        *failures.get(domain).unwrap_or(&0)
120    }
121
122    /// Get total number of domains in failure state
123    pub fn get_total_failures(&self) -> usize {
124        let failures = self.failures.lock().unwrap();
125        failures.values().filter(|&&count| count >= self.threshold).count()
126    }
127}
128
/// Holder for the crawler's memory limit and the switch that enables checks.
///
/// This is a lightweight placeholder that avoids a dependency on the
/// `sysinfo` crate: it stores the configured limit but the methods visible in
/// this file do not measure process memory. In production, you may want to
/// integrate with platform-specific memory APIs.
#[derive(Debug, Clone)]
pub struct MemoryMonitor {
    /// Configured memory limit in megabytes.
    max_memory_mb: usize,
    /// Whether memory checks are active.
    enabled: bool,
}
138
139impl MemoryMonitor {
140    pub fn new(max_memory_mb: usize, enabled: bool) -> Self {
141        Self {
142            max_memory_mb,
143            enabled,
144        }
145    }
146
147    /// Check if current memory usage exceeds the limit
148    ///
149    /// This is a lightweight check. For precise memory monitoring,
150    /// consider enabling platform-specific memory tracking.
151    pub fn check_memory_limit(&self) -> Result<()> {
152        if !self.enabled {
153            return Ok(());
154        }
155
156        // Lightweight heuristic: we don't block on memory by default.
157        // The actual enforcement happens through queue size limits and
158        // concurrency controls in the crawler config.
159        debug!("Memory check: limit is {}MB (lightweight mode)", self.max_memory_mb);
160        Ok(())
161    }
162
163    /// Get current memory usage in MB (returns 0 in lightweight mode)
164    pub fn get_current_memory_mb(&self) -> u64 {
165        0
166    }
167
168    /// Get memory usage percentage (returns 0.0 in lightweight mode)
169    pub fn get_memory_percentage(&self) -> f64 {
170        0.0
171    }
172}
173
#[cfg(test)]
mod tests {
    use super::*;

    /// Every field of the default config carries its documented value.
    #[test]
    fn test_crawler_config_defaults() {
        let CrawlerConfig {
            max_queue_size,
            max_memory_mb,
            max_concurrent_requests,
            circuit_breaker_threshold,
            backpressure_threshold,
            enable_memory_monitoring,
            enable_circuit_breaker,
        } = CrawlerConfig::default();

        assert_eq!(max_queue_size, 1000);
        assert_eq!(max_memory_mb, 500);
        assert_eq!(max_concurrent_requests, 10);
        assert_eq!(circuit_breaker_threshold, 3);
        assert_eq!(backpressure_threshold, 80);
        assert!(enable_memory_monitoring);
        assert!(enable_circuit_breaker);
    }

    /// A single domain trips exactly at the threshold and resets on success.
    #[test]
    fn test_circuit_breaker() {
        const DOMAIN: &str = "example.com";
        const THRESHOLD: usize = 3;

        let breaker = CircuitBreaker::new(THRESHOLD);

        // Nothing recorded yet, so the domain is not skipped.
        assert!(!breaker.should_skip(DOMAIN));

        // Each failure bumps the count; only the last one trips the breaker.
        for recorded in 1..=THRESHOLD {
            breaker.record_failure(DOMAIN);
            assert_eq!(breaker.get_failure_count(DOMAIN), recorded);
            assert_eq!(breaker.should_skip(DOMAIN), recorded >= THRESHOLD);
        }

        // A success clears the domain entirely.
        breaker.record_success(DOMAIN);
        assert!(!breaker.should_skip(DOMAIN));
        assert_eq!(breaker.get_failure_count(DOMAIN), 0);
    }

    /// Domains are tracked independently of one another.
    #[test]
    fn test_circuit_breaker_multiple_domains() {
        let domains = ["domain1.com", "domain2.com"];
        let breaker = CircuitBreaker::new(2);

        for &domain in &domains {
            breaker.record_failure(domain);
            breaker.record_failure(domain);
        }

        for &domain in &domains {
            assert!(breaker.should_skip(domain));
        }
        assert_eq!(breaker.get_total_failures(), 2);

        // Clearing the first domain leaves the second one tripped.
        breaker.record_success(domains[0]);
        assert!(!breaker.should_skip(domains[0]));
        assert!(breaker.should_skip(domains[1]));
        assert_eq!(breaker.get_total_failures(), 1);
    }

    /// A disabled monitor never fails and reports zero usage.
    #[test]
    fn test_memory_monitor_disabled() {
        let monitor = MemoryMonitor::new(100, false);
        let outcome = monitor.check_memory_limit();

        assert!(outcome.is_ok());
        assert_eq!(monitor.get_current_memory_mb(), 0);
        assert_eq!(monitor.get_memory_percentage(), 0.0);
    }

    /// An enabled monitor still succeeds, since lightweight mode never blocks.
    #[test]
    fn test_memory_monitor_enabled() {
        let monitor = MemoryMonitor::new(100, true);
        let outcome = monitor.check_memory_limit();

        assert!(outcome.is_ok());
    }
}