essence/crawler/
config.rs1use serde::{Deserialize, Serialize};
2use std::sync::{Arc, Mutex};
3use std::collections::HashMap;
4use tracing::{debug, warn};
5use crate::error::Result;
6
/// Tunable limits and feature switches for the crawler.
///
/// Every field carries a `#[serde(default = "...")]` attribute, so any field that is
/// missing from the serialized input is filled in by the named fallback function.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct CrawlerConfig {
    /// Maximum queue size (fallback: 1000).
    // NOTE(review): presumably the pending-URL queue — confirm against the consumer.
    #[serde(default = "default_max_queue_size")]
    pub max_queue_size: usize,

    /// Memory limit in megabytes (fallback: 500).
    #[serde(default = "default_max_memory_mb")]
    pub max_memory_mb: usize,

    /// Maximum number of concurrent requests (fallback: 10).
    #[serde(default = "default_max_concurrent_requests")]
    pub max_concurrent_requests: usize,

    /// Failure count used as the circuit-breaker threshold (fallback: 3).
    #[serde(default = "default_circuit_breaker_threshold")]
    pub circuit_breaker_threshold: usize,

    /// Backpressure threshold (fallback: 80).
    // NOTE(review): the `u8` type and the value 80 suggest a percentage — confirm.
    #[serde(default = "default_backpressure_threshold")]
    pub backpressure_threshold: u8,

    /// Whether memory monitoring is enabled (fallback: `true`).
    #[serde(default = "default_true")]
    pub enable_memory_monitoring: bool,

    /// Whether the circuit breaker is enabled (fallback: `true`).
    #[serde(default = "default_true")]
    pub enable_circuit_breaker: bool,
}
38
39impl Default for CrawlerConfig {
40 fn default() -> Self {
41 Self {
42 max_queue_size: default_max_queue_size(),
43 max_memory_mb: default_max_memory_mb(),
44 max_concurrent_requests: default_max_concurrent_requests(),
45 circuit_breaker_threshold: default_circuit_breaker_threshold(),
46 backpressure_threshold: default_backpressure_threshold(),
47 enable_memory_monitoring: true,
48 enable_circuit_breaker: true,
49 }
50 }
51}
52
/// Fallback for `CrawlerConfig::max_queue_size`; referenced by that field's
/// `#[serde(default = ...)]` attribute and by `CrawlerConfig::default`.
fn default_max_queue_size() -> usize {
    const MAX_QUEUE_SIZE: usize = 1_000;
    MAX_QUEUE_SIZE
}
56
/// Fallback for `CrawlerConfig::max_memory_mb`; referenced by that field's
/// `#[serde(default = ...)]` attribute and by `CrawlerConfig::default`.
fn default_max_memory_mb() -> usize {
    const MAX_MEMORY_MB: usize = 500;
    MAX_MEMORY_MB
}
60
/// Fallback for `CrawlerConfig::max_concurrent_requests`; referenced by that field's
/// `#[serde(default = ...)]` attribute and by `CrawlerConfig::default`.
fn default_max_concurrent_requests() -> usize {
    const MAX_CONCURRENT_REQUESTS: usize = 10;
    MAX_CONCURRENT_REQUESTS
}
64
/// Fallback for `CrawlerConfig::circuit_breaker_threshold`; referenced by that field's
/// `#[serde(default = ...)]` attribute and by `CrawlerConfig::default`.
fn default_circuit_breaker_threshold() -> usize {
    const CIRCUIT_BREAKER_THRESHOLD: usize = 3;
    CIRCUIT_BREAKER_THRESHOLD
}
68
/// Fallback for `CrawlerConfig::backpressure_threshold`; referenced by that field's
/// `#[serde(default = ...)]` attribute and by `CrawlerConfig::default`.
fn default_backpressure_threshold() -> u8 {
    const BACKPRESSURE_THRESHOLD: u8 = 80;
    BACKPRESSURE_THRESHOLD
}
72
/// Serde fallback for boolean switches that should be on when omitted; referenced by
/// the `enable_memory_monitoring` and `enable_circuit_breaker` fields of `CrawlerConfig`.
fn default_true() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
76
77#[derive(Debug, Clone)]
79pub struct CircuitBreaker {
80 failures: Arc<Mutex<HashMap<String, usize>>>,
81 threshold: usize,
82}
83
84impl CircuitBreaker {
85 pub fn new(threshold: usize) -> Self {
86 Self {
87 failures: Arc::new(Mutex::new(HashMap::new())),
88 threshold,
89 }
90 }
91
92 pub fn record_failure(&self, domain: &str) {
94 let mut failures = self.failures.lock().unwrap();
95 let count = failures.entry(domain.to_string()).or_insert(0);
96 *count += 1;
97
98 if *count >= self.threshold {
99 warn!("Circuit breaker triggered for domain: {} ({} failures)", domain, count);
100 }
101 }
102
103 pub fn record_success(&self, domain: &str) {
105 let mut failures = self.failures.lock().unwrap();
106 failures.remove(domain);
107 debug!("Circuit breaker reset for domain: {}", domain);
108 }
109
110 pub fn should_skip(&self, domain: &str) -> bool {
112 let failures = self.failures.lock().unwrap();
113 failures.get(domain).unwrap_or(&0) >= &self.threshold
114 }
115
116 pub fn get_failure_count(&self, domain: &str) -> usize {
118 let failures = self.failures.lock().unwrap();
119 *failures.get(domain).unwrap_or(&0)
120 }
121
122 pub fn get_total_failures(&self) -> usize {
124 let failures = self.failures.lock().unwrap();
125 failures.values().filter(|&&count| count >= self.threshold).count()
126 }
127}
128
129pub struct MemoryMonitor {
135 max_memory_mb: usize,
136 enabled: bool,
137}
138
139impl MemoryMonitor {
140 pub fn new(max_memory_mb: usize, enabled: bool) -> Self {
141 Self {
142 max_memory_mb,
143 enabled,
144 }
145 }
146
147 pub fn check_memory_limit(&self) -> Result<()> {
152 if !self.enabled {
153 return Ok(());
154 }
155
156 debug!("Memory check: limit is {}MB (lightweight mode)", self.max_memory_mb);
160 Ok(())
161 }
162
163 pub fn get_current_memory_mb(&self) -> u64 {
165 0
166 }
167
168 pub fn get_memory_percentage(&self) -> f64 {
170 0.0
171 }
172}
173
#[cfg(test)]
mod tests {
    use super::*;

    /// `CrawlerConfig::default()` yields the fallback value for every field.
    #[test]
    fn test_crawler_config_defaults() {
        let CrawlerConfig {
            max_queue_size,
            max_memory_mb,
            max_concurrent_requests,
            circuit_breaker_threshold,
            backpressure_threshold,
            enable_memory_monitoring,
            enable_circuit_breaker,
        } = CrawlerConfig::default();

        assert_eq!(max_queue_size, 1000);
        assert_eq!(max_memory_mb, 500);
        assert_eq!(max_concurrent_requests, 10);
        assert_eq!(circuit_breaker_threshold, 3);
        assert_eq!(backpressure_threshold, 80);
        assert!(enable_memory_monitoring);
        assert!(enable_circuit_breaker);
    }

    /// A domain trips exactly when its failure count reaches the threshold, and a
    /// recorded success clears it again.
    #[test]
    fn test_circuit_breaker() {
        const DOMAIN: &str = "example.com";
        const THRESHOLD: usize = 3;

        let breaker = CircuitBreaker::new(THRESHOLD);

        // A domain with no recorded failures is never skipped.
        assert!(!breaker.should_skip(DOMAIN));

        for expected_failures in 1..=THRESHOLD {
            breaker.record_failure(DOMAIN);
            assert_eq!(breaker.get_failure_count(DOMAIN), expected_failures);
            assert_eq!(
                breaker.should_skip(DOMAIN),
                expected_failures >= THRESHOLD
            );
        }

        // A success wipes the counter.
        breaker.record_success(DOMAIN);
        assert!(!breaker.should_skip(DOMAIN));
        assert_eq!(breaker.get_failure_count(DOMAIN), 0);
    }

    /// Counters are tracked independently per domain.
    #[test]
    fn test_circuit_breaker_multiple_domains() {
        let first = "domain1.com";
        let second = "domain2.com";
        let breaker = CircuitBreaker::new(2);

        for domain in &[first, second] {
            breaker.record_failure(domain);
            breaker.record_failure(domain);
        }

        assert!(breaker.should_skip(first));
        assert!(breaker.should_skip(second));
        assert_eq!(breaker.get_total_failures(), 2);

        // Resetting one domain leaves the other tripped.
        breaker.record_success(first);
        assert!(!breaker.should_skip(first));
        assert!(breaker.should_skip(second));
        assert_eq!(breaker.get_total_failures(), 1);
    }

    /// A disabled monitor passes the check and reports zero usage.
    #[test]
    fn test_memory_monitor_disabled() {
        let disabled = MemoryMonitor::new(100, false);

        assert!(disabled.check_memory_limit().is_ok());
        assert_eq!(disabled.get_current_memory_mb(), 0);
        assert_eq!(disabled.get_memory_percentage(), 0.0);
    }

    /// An enabled monitor passes the check.
    #[test]
    fn test_memory_monitor_enabled() {
        let enabled = MemoryMonitor::new(100, true);

        assert!(enabled.check_memory_limit().is_ok());
    }
}