1use std::collections::HashMap;
28use std::sync::RwLock;
29
30use swarm_engine_core::actions::ParamResolver;
31use swarm_engine_core::agent::WorkResult;
32use swarm_engine_core::environment::Environment;
33use swarm_engine_core::types::{Action, WorkerId};
34
/// Error codes that the simulated SwarmEngine can report.
///
/// Every variant maps to one `SW-XXXX` code and, through the accessor methods
/// on this type, to a description, the configuration key at fault (relative
/// to the `swarm.` namespace) and a value that resolves the error.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SwarmErrorCode {
    /// `SW-1001`: the routing configuration is invalid.
    Sw1001RoutingInvalid,
    /// `SW-1002`: the failover target service was not found.
    Sw1002FailoverNotFound,
    /// `SW-1003`: the circuit breaker was triggered by a high failure rate.
    Sw1003CircuitBreakerTriggered,
    /// `SW-2001`: the worker pool is exhausted.
    Sw2001WorkerPoolExhausted,
    /// `SW-2002`: a manager decision timed out.
    Sw2002ManagerTimeout,
    /// `SW-3001`: manager and workers disagree on the strategy.
    Sw3001StrategyMismatch,
    /// `SW-3002`: the exploration depth limit was reached.
    Sw3002ExplorationDepthLimit,
}

impl SwarmErrorCode {
    /// Returns the textual error code, e.g. `"SW-1001"`.
    ///
    /// The string is a literal, so it is returned as `&'static str` and does
    /// not keep `self` borrowed.
    fn code(&self) -> &'static str {
        match self {
            Self::Sw1001RoutingInvalid => "SW-1001",
            Self::Sw1002FailoverNotFound => "SW-1002",
            Self::Sw1003CircuitBreakerTriggered => "SW-1003",
            Self::Sw2001WorkerPoolExhausted => "SW-2001",
            Self::Sw2002ManagerTimeout => "SW-2002",
            Self::Sw3001StrategyMismatch => "SW-3001",
            Self::Sw3002ExplorationDepthLimit => "SW-3002",
        }
    }

    /// Returns the human-readable message that is written to the logs for
    /// this error.
    fn description(&self) -> &'static str {
        match self {
            Self::Sw1001RoutingInvalid => {
                "Routing configuration is invalid. Check swarm.routing.targets format."
            }
            Self::Sw1002FailoverNotFound => {
                "Failover target service not found. Verify swarm.failover.target exists."
            }
            Self::Sw1003CircuitBreakerTriggered => {
                "Circuit breaker triggered due to high failure rate."
            }
            Self::Sw2001WorkerPoolExhausted => {
                "Worker pool exhausted. Increase swarm.workers.max_count."
            }
            Self::Sw2002ManagerTimeout => {
                "Manager decision timeout. Check swarm.manager.timeout_ms."
            }
            Self::Sw3001StrategyMismatch => {
                "Strategy mismatch between manager and workers. Align swarm.strategy.type."
            }
            Self::Sw3002ExplorationDepthLimit => {
                "Exploration depth limit reached. Adjust swarm.exploration.max_depth."
            }
        }
    }

    /// Returns the configuration key to change, without the leading
    /// `swarm.` namespace (e.g. `"routing.targets"`).
    fn fix_hint(&self) -> &'static str {
        match self {
            Self::Sw1001RoutingInvalid => "routing.targets",
            Self::Sw1002FailoverNotFound => "failover.target",
            Self::Sw1003CircuitBreakerTriggered => "circuit_breaker.threshold",
            Self::Sw2001WorkerPoolExhausted => "workers.max_count",
            Self::Sw2002ManagerTimeout => "manager.timeout_ms",
            Self::Sw3001StrategyMismatch => "strategy.type",
            Self::Sw3002ExplorationDepthLimit => "exploration.max_depth",
        }
    }

    /// Returns the recommended value for the key named by [`Self::fix_hint`].
    fn correct_value(&self) -> &'static str {
        match self {
            Self::Sw1001RoutingInvalid => "[\"worker-1\", \"worker-2\"]",
            Self::Sw1002FailoverNotFound => "backup-service",
            Self::Sw1003CircuitBreakerTriggered => "0.7",
            Self::Sw2001WorkerPoolExhausted => "16",
            Self::Sw2002ManagerTimeout => "5000",
            Self::Sw3001StrategyMismatch => "ucb1",
            Self::Sw3002ExplorationDepthLimit => "10",
        }
    }
}
121
122#[derive(Debug, Clone)]
128pub struct ConfigProblem {
129 pub error_code: SwarmErrorCode,
131 pub config_key: String,
133 pub current_value: String,
135 pub logs: Vec<String>,
137}
138
139impl ConfigProblem {
140 fn new(error_code: SwarmErrorCode, config_key: &str, current_value: &str) -> Self {
141 let logs = Self::generate_logs(&error_code);
142 Self {
143 error_code,
144 config_key: config_key.to_string(),
145 current_value: current_value.to_string(),
146 logs,
147 }
148 }
149
150 fn generate_logs(error_code: &SwarmErrorCode) -> Vec<String> {
151 let timestamp = "2024-01-15T10:30:45.123Z";
152 let code = error_code.code();
153
154 vec![
155 format!("[{}] ERROR [{}] {}", timestamp, code, error_code.description()),
156 format!("[{}] WARN [{}] Attempting recovery...", timestamp, code),
157 format!("[{}] ERROR [{}] Recovery failed, escalating", timestamp, code),
158 format!("[{}] INFO {{\"error_code\":\"{}\",\"component\":\"swarm-core\",\"action\":\"shutdown\"}}", timestamp, code),
159 ]
160 }
161}
162
/// Simulated environment in which workers diagnose one configuration problem
/// by parsing the configuration, analyzing logs, tracing the error and
/// applying a fix.
pub struct InternalDiagnosisEnvironment {
    /// The problem that has to be diagnosed and fixed.
    problem: ConfigProblem,
    /// Flat key/value configuration: the defaults, with the problem's key
    /// overridden by the problem's current value.
    config: HashMap<String, String>,
    /// Diagnosis progress; behind a lock because the handlers update it
    /// through `&self`.
    state: RwLock<DiagnosisState>,
}
176
/// Tracks which diagnosis steps have been carried out so far.
#[derive(Debug, Default)]
struct DiagnosisState {
    /// Set when a ParseConfig action has been handled.
    parsed_config: bool,
    /// Set when an AnalyzeLog action has been handled after the configuration
    /// was parsed.
    analyzed_logs: bool,
    /// The error code confirmed by a successful TraceError action, if any.
    traced_error: Option<String>,
    /// Workers whose ApplyFix action resolved the problem.
    completed: Vec<WorkerId>,
}
188
189impl InternalDiagnosisEnvironment {
190 pub fn new(problem: ConfigProblem) -> Self {
192 let mut config = Self::default_config();
193 config.insert(problem.config_key.clone(), problem.current_value.clone());
195
196 Self {
197 problem,
198 config,
199 state: RwLock::new(DiagnosisState::default()),
200 }
201 }
202
203 fn default_config() -> HashMap<String, String> {
205 let mut config = HashMap::new();
206 config.insert(
208 "swarm.routing.targets".into(),
209 "[\"worker-1\", \"worker-2\", \"worker-3\"]".into(),
210 );
211 config.insert("swarm.failover.target".into(), "backup-service".into());
212 config.insert("swarm.failover.enabled".into(), "true".into());
213 config.insert("swarm.circuit_breaker.threshold".into(), "0.5".into());
214 config.insert("swarm.circuit_breaker.window_ms".into(), "10000".into());
215 config.insert("swarm.workers.max_count".into(), "8".into());
216 config.insert("swarm.workers.min_count".into(), "2".into());
217 config.insert("swarm.manager.timeout_ms".into(), "3000".into());
218 config.insert("swarm.manager.retry_count".into(), "3".into());
219 config.insert("swarm.strategy.type".into(), "ucb1".into());
220 config.insert("swarm.strategy.exploration_c".into(), "1.414".into());
221 config.insert("swarm.exploration.max_depth".into(), "5".into());
222 config.insert("swarm.exploration.pruning".into(), "true".into());
223 config
224 }
225
226 pub fn routing_error_scenario() -> Self {
232 let problem = ConfigProblem::new(
233 SwarmErrorCode::Sw1001RoutingInvalid,
234 "swarm.routing.targets",
235 "invalid-format", );
237 Self::new(problem)
238 }
239
240 pub fn failover_error_scenario() -> Self {
242 let problem = ConfigProblem::new(
243 SwarmErrorCode::Sw1002FailoverNotFound,
244 "swarm.failover.target",
245 "nonexistent-service",
246 );
247 Self::new(problem)
248 }
249
250 pub fn worker_pool_scenario() -> Self {
252 let problem = ConfigProblem::new(
253 SwarmErrorCode::Sw2001WorkerPoolExhausted,
254 "swarm.workers.max_count",
255 "2", );
257 Self::new(problem)
258 }
259
260 pub fn strategy_mismatch_scenario() -> Self {
262 let problem = ConfigProblem::new(
263 SwarmErrorCode::Sw3001StrategyMismatch,
264 "swarm.strategy.type",
265 "unknown_strategy",
266 );
267 Self::new(problem)
268 }
269
270 pub fn exploration_depth_scenario() -> Self {
272 let problem = ConfigProblem::new(
273 SwarmErrorCode::Sw3002ExplorationDepthLimit,
274 "swarm.exploration.max_depth",
275 "1", );
277 Self::new(problem)
278 }
279
280 pub fn complex_scenario(seed: u64) -> Self {
282 let scenarios = [
284 Self::routing_error_scenario,
285 Self::failover_error_scenario,
286 Self::worker_pool_scenario,
287 Self::strategy_mismatch_scenario,
288 Self::exploration_depth_scenario,
289 ];
290 let idx = (seed as usize) % scenarios.len();
291 scenarios[idx]()
292 }
293
294 fn handle_parse_config(&self, _worker_id: WorkerId, action: &Action) -> WorkResult {
299 let section = action
300 .params
301 .args
302 .get("section")
303 .or(action.params.target.as_ref())
304 .cloned()
305 .filter(|s| !s.is_empty());
306
307 let mut state = self.state.write().unwrap();
308 state.parsed_config = true;
309
310 match section {
311 Some(section) => {
312 let prefix = format!("swarm.{}.", section);
314 let matching: Vec<_> = self
315 .config
316 .iter()
317 .filter(|(k, _)| k.starts_with(&prefix))
318 .collect();
319
320 if matching.is_empty() {
321 let mut output = format!(
323 "Section '{}' not found. Showing all configuration:\n\n=== SwarmEngine Configuration ===\n\n",
324 section
325 );
326
327 let sections = [
328 "routing",
329 "failover",
330 "circuit_breaker",
331 "workers",
332 "manager",
333 "strategy",
334 "exploration",
335 ];
336 for sec in sections {
337 let sec_prefix = format!("swarm.{}.", sec);
338 let sec_matching: Vec<_> = self
339 .config
340 .iter()
341 .filter(|(k, _)| k.starts_with(&sec_prefix))
342 .collect();
343
344 if !sec_matching.is_empty() {
345 output.push_str(&format!("[{}]\n", sec));
346 for (key, value) in sec_matching {
347 let short_key = key.strip_prefix(&sec_prefix).unwrap_or(key);
348 output.push_str(&format!(" {}: {}\n", short_key, value));
349 }
350 output.push('\n');
351 }
352 }
353
354 WorkResult::env_success(output)
355 } else {
356 let mut output = format!("=== Configuration: swarm.{} ===\n", section);
357 for (key, value) in matching {
358 let short_key = key.strip_prefix("swarm.").unwrap_or(key);
359 output.push_str(&format!("{}: {}\n", short_key, value));
360 }
361
362 if self.problem.config_key.starts_with(&prefix) {
364 output.push_str("\n[!] Potential issue detected in this section");
365 }
366
367 WorkResult::env_success(output)
368 }
369 }
370 None => {
371 let mut output = String::from("=== SwarmEngine Configuration ===\n\n");
373
374 let sections = [
376 "routing",
377 "failover",
378 "circuit_breaker",
379 "workers",
380 "manager",
381 "strategy",
382 "exploration",
383 ];
384 for section in sections {
385 let prefix = format!("swarm.{}.", section);
386 let matching: Vec<_> = self
387 .config
388 .iter()
389 .filter(|(k, _)| k.starts_with(&prefix))
390 .collect();
391
392 if !matching.is_empty() {
393 output.push_str(&format!("[{}]\n", section));
394 for (key, value) in matching {
395 let short_key = key.strip_prefix(&prefix).unwrap_or(key);
396 output.push_str(&format!(" {}: {}\n", short_key, value));
397 }
398 output.push('\n');
399 }
400 }
401
402 WorkResult::env_success(output)
403 }
404 }
405 }
406
407 fn handle_analyze_log(&self, _worker_id: WorkerId, action: &Action) -> WorkResult {
408 let resolver = ParamResolver::new(action);
409 let filter = resolver.get("filter").unwrap_or("");
410
411 let mut state = self.state.write().unwrap();
412
413 if !state.parsed_config {
415 return WorkResult::env_failure(
416 "Cannot analyze logs without parsing configuration first. Run ParseConfig first.",
417 );
418 }
419
420 state.analyzed_logs = true;
421
422 let logs: Vec<_> = if filter.is_empty() {
423 self.problem.logs.clone()
424 } else {
425 self.problem
426 .logs
427 .iter()
428 .filter(|log| log.contains(filter))
429 .cloned()
430 .collect()
431 };
432
433 if logs.is_empty() {
434 WorkResult::env_success("=== Log Analysis ===\nNo matching logs found.")
435 } else {
436 let error_code = self.problem.error_code.code();
437 let mut output = String::from("=== Log Analysis ===\n\n");
438 for log in &logs {
439 output.push_str(log);
440 output.push('\n');
441 }
442 output.push_str(&format!("\n[!] Detected error code: {}\n", error_code));
443 output.push_str("Use TraceError to investigate this error code.");
444
445 WorkResult::env_success(output)
446 }
447 }
448
449 fn handle_trace_error(&self, _worker_id: WorkerId, action: &Action) -> WorkResult {
450 let resolver = ParamResolver::new(action);
451 let error_code_input = resolver.get("code").unwrap_or("");
452
453 let mut state = self.state.write().unwrap();
454
455 if !state.analyzed_logs {
457 return WorkResult::env_failure(
458 "Cannot trace error without analyzing logs first. Run AnalyzeLog first.",
459 );
460 }
461
462 let expected_code = self.problem.error_code.code();
463
464 let error_code: String =
466 if error_code_input.starts_with("SW-") && error_code_input.len() == 7 {
467 error_code_input.to_string()
468 } else if error_code_input.is_empty() {
469 return WorkResult::env_failure(
470 "TraceError requires an error code. Usage: TraceError(SW-XXXX)",
471 );
472 } else {
473 expected_code.to_string()
475 };
476
477 if error_code == expected_code {
478 state.traced_error = Some(error_code.clone());
479
480 let desc = self.problem.error_code.description();
481 let fix_hint = self.problem.error_code.fix_hint();
482 let correct_value = self.problem.error_code.correct_value();
483
484 let output = format!(
485 "=== Error Trace: {} ===\n\n\
486 Description: {}\n\n\
487 Root Cause:\n\
488 Configuration key: swarm.{}\n\
489 Current value: {}\n\
490 Expected format: {}\n\n\
491 Recommended Fix:\n\
492 ApplyFix(key=\"{}\", value=\"{}\")",
493 error_code,
494 desc,
495 fix_hint,
496 self.problem.current_value,
497 correct_value,
498 fix_hint,
499 correct_value
500 );
501
502 WorkResult::env_success(output)
503 } else {
504 WorkResult::env_failure(format!(
505 "Error code {} not found in current logs.\nHint: Check the error code from AnalyzeLog output.",
506 error_code
507 ))
508 }
509 }
510
511 fn handle_apply_fix(&self, worker_id: WorkerId, action: &Action) -> WorkResult {
512 let resolver = ParamResolver::new(action);
513 let key_input = resolver.get("key").unwrap_or("");
514 let value = resolver.arg("value").unwrap_or("");
515
516 let mut state = self.state.write().unwrap();
517
518 if state.traced_error.is_none() {
520 return WorkResult::env_failure(
521 "Cannot apply fix without tracing the error first. Run TraceError first.",
522 );
523 }
524
525 let fix_hint = self.problem.error_code.fix_hint();
526 let correct_value = self.problem.error_code.correct_value();
527
528 let key: String = if key_input.is_empty() {
530 return WorkResult::env_failure(
531 "ApplyFix requires a configuration key. Usage: ApplyFix(key=\"...\", value=\"...\")"
532 );
533 } else if key_input.contains('.') || key_input.contains('_') {
534 key_input.to_string()
536 } else {
537 fix_hint.to_string()
539 };
540
541 let full_key = if key.starts_with("swarm.") {
543 key.clone()
544 } else {
545 format!("swarm.{}", key)
546 };
547
548 let expected_key = format!("swarm.{}", fix_hint);
549
550 if full_key != expected_key {
551 return WorkResult::env_failure(format!(
552 "Incorrect configuration key: {}\nHint: The issue is in swarm.{}",
553 key, fix_hint
554 ));
555 }
556
557 let value_is_correct = if value.is_empty() {
559 false
560 } else {
561 match (value.parse::<f64>(), correct_value.parse::<f64>()) {
563 (Ok(v), Ok(c)) => (v - c).abs() < 0.001 || v >= c,
564 _ => value.contains(correct_value) || correct_value.contains(value),
565 }
566 };
567
568 if !value_is_correct && !value.is_empty() {
569 return WorkResult::env_success(format!(
571 "=== Configuration Updated ===\n\
572 Key: {}\n\
573 Value: {}\n\n\
574 [!] Warning: Value may not fully resolve the issue.\n\
575 Recommended value: {}",
576 full_key, value, correct_value
577 ));
578 }
579
580 if !state.completed.contains(&worker_id) {
582 state.completed.push(worker_id);
583 }
584
585 WorkResult::done_success(format!(
586 "=== Configuration Fixed ===\n\
587 Key: {}\n\
588 Value: {}\n\n\
589 [OK] Error {} resolved.\n\
590 SwarmEngine is now running normally.",
591 full_key,
592 if value.is_empty() {
593 correct_value
594 } else {
595 &value
596 },
597 self.problem.error_code.code()
598 ))
599 }
600}
601
602impl Environment for InternalDiagnosisEnvironment {
603 fn step(&self, worker_id: WorkerId, action: &Action) -> WorkResult {
604 match action.name.to_lowercase().as_str() {
605 "parseconfig" | "parse_config" | "config" => {
606 self.handle_parse_config(worker_id, action)
607 }
608 "analyzelog" | "analyze_log" | "logs" | "log" => {
609 self.handle_analyze_log(worker_id, action)
610 }
611 "traceerror" | "trace_error" | "trace" => self.handle_trace_error(worker_id, action),
612 "applyfix" | "apply_fix" | "fix" => self.handle_apply_fix(worker_id, action),
613 "continue" => WorkResult::env_success("Continuing..."),
614 _ => WorkResult::unsupported(&action.name),
615 }
616 }
617
618 fn reset(&self) {
619 let mut state = self.state.write().unwrap();
620 state.parsed_config = false;
621 state.analyzed_logs = false;
622 state.traced_error = None;
623 state.completed.clear();
624 }
625
626 fn name(&self) -> &str {
627 "InternalDiagnosisEnvironment"
628 }
629}
630
#[cfg(test)]
mod tests {
    use super::*;
    use swarm_engine_core::types::ActionParams;

    /// Whether `result` reports success (for both acted and done results).
    fn is_success(result: &WorkResult) -> bool {
        match result {
            WorkResult::Done { success, .. } => *success,
            WorkResult::Acted { action_result, .. } => action_result.success,
            _ => false,
        }
    }

    /// Whether `result` ends the episode.
    fn is_done(result: &WorkResult) -> bool {
        match result {
            WorkResult::Done { .. } => true,
            _ => false,
        }
    }

    /// Assembles an action from its name, optional target and arguments.
    fn build_action(name: &str, target: Option<&str>, args: HashMap<String, String>) -> Action {
        let params = ActionParams {
            target: target.map(|t| t.into()),
            args,
            data: Vec::new(),
        };
        Action {
            name: name.into(),
            params,
        }
    }

    /// An action with an optional target and no arguments.
    fn action(name: &str, target: Option<&str>) -> Action {
        build_action(name, target, HashMap::new())
    }

    /// An action with arguments and no target.
    fn action_with_args(name: &str, args: HashMap<String, String>) -> Action {
        build_action(name, None, args)
    }

    /// An ApplyFix action carrying the given `key` and `value` arguments.
    fn apply_fix(key: &str, value: &str) -> Action {
        let mut args = HashMap::new();
        args.insert("key".to_string(), key.to_string());
        args.insert("value".to_string(), value.to_string());
        action_with_args("ApplyFix", args)
    }

    /// Runs ParseConfig followed by AnalyzeLog, ignoring their results.
    fn parse_and_analyze(env: &InternalDiagnosisEnvironment, worker: WorkerId) {
        let _ = env.step(worker, &action("ParseConfig", None));
        let _ = env.step(worker, &action("AnalyzeLog", None));
    }

    #[test]
    fn test_parse_config_all() {
        let env = InternalDiagnosisEnvironment::routing_error_scenario();

        let result = env.step(WorkerId(0), &action("ParseConfig", None));

        assert!(is_success(&result));
    }

    #[test]
    fn test_parse_config_section() {
        let env = InternalDiagnosisEnvironment::routing_error_scenario();

        let result = env.step(WorkerId(0), &action("ParseConfig", Some("routing")));

        assert!(is_success(&result));
    }

    #[test]
    fn test_analyze_log_requires_parse() {
        let env = InternalDiagnosisEnvironment::routing_error_scenario();

        // No ParseConfig beforehand, so the analysis must be refused.
        let result = env.step(WorkerId(0), &action("AnalyzeLog", None));

        assert!(!is_success(&result));
    }

    #[test]
    fn test_trace_error_requires_analyze() {
        let env = InternalDiagnosisEnvironment::routing_error_scenario();
        let worker = WorkerId(0);
        let _ = env.step(worker, &action("ParseConfig", None));

        // AnalyzeLog was skipped, so tracing must be refused.
        let result = env.step(worker, &action("TraceError", Some("SW-1001")));

        assert!(!is_success(&result));
    }

    #[test]
    fn test_full_diagnosis_flow() {
        let env = InternalDiagnosisEnvironment::routing_error_scenario();
        let worker = WorkerId(0);

        // The first three steps succeed without finishing the episode.
        let intermediate_steps = [
            action("ParseConfig", None),
            action("AnalyzeLog", None),
            action("TraceError", Some("SW-1001")),
        ];
        for step in intermediate_steps.iter() {
            let result = env.step(worker, step);
            assert!(is_success(&result));
            assert!(!is_done(&result));
        }

        // Applying the recommended fix finishes it.
        let fix = apply_fix("routing.targets", "[\"worker-1\", \"worker-2\"]");
        let result = env.step(worker, &fix);
        assert!(is_success(&result));
        assert!(is_done(&result));
    }

    #[test]
    fn test_wrong_error_code_fails() {
        let env = InternalDiagnosisEnvironment::routing_error_scenario();
        let worker = WorkerId(0);
        parse_and_analyze(&env, worker);

        let result = env.step(worker, &action("TraceError", Some("SW-9999")));

        assert!(!is_success(&result));
    }

    #[test]
    fn test_wrong_fix_key_fails() {
        let env = InternalDiagnosisEnvironment::routing_error_scenario();
        let worker = WorkerId(0);
        parse_and_analyze(&env, worker);
        let _ = env.step(worker, &action("TraceError", Some("SW-1001")));

        let result = env.step(worker, &apply_fix("wrong.key", "some-value"));

        assert!(!is_success(&result));
    }

    #[test]
    fn test_worker_pool_scenario() {
        let env = InternalDiagnosisEnvironment::worker_pool_scenario();
        let worker = WorkerId(0);
        parse_and_analyze(&env, worker);

        let traced = env.step(worker, &action("TraceError", Some("SW-2001")));
        assert!(is_success(&traced));

        let fixed = env.step(worker, &apply_fix("workers.max_count", "16"));
        assert!(is_success(&fixed));
        assert!(is_done(&fixed));
    }
}