1use std::collections::HashMap;
28use std::sync::RwLock;
29
30use swarm_engine_core::agent::WorkResult;
31use swarm_engine_core::environment::Environment;
32use swarm_engine_core::types::{Action, WorkerId};
33
/// Error codes the simulated SwarmEngine can report.
///
/// Each variant name embeds its numeric code (for example `Sw1001…` is rendered as
/// `SW-1001`). The enum is fieldless, so it is cheap to copy and can be used as a
/// key in hashed collections.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SwarmErrorCode {
    /// `SW-1001`: the routing configuration is invalid.
    Sw1001RoutingInvalid,
    /// `SW-1002`: the failover target service was not found.
    Sw1002FailoverNotFound,
    /// `SW-1003`: the circuit breaker was triggered by a high failure rate.
    Sw1003CircuitBreakerTriggered,
    /// `SW-2001`: the worker pool is exhausted.
    Sw2001WorkerPoolExhausted,
    /// `SW-2002`: the manager decision timed out.
    Sw2002ManagerTimeout,
    /// `SW-3001`: manager and workers disagree on the strategy.
    Sw3001StrategyMismatch,
    /// `SW-3002`: the exploration depth limit was reached.
    Sw3002ExplorationDepthLimit,
}
56
57impl SwarmErrorCode {
58 fn code(&self) -> &str {
59 match self {
60 Self::Sw1001RoutingInvalid => "SW-1001",
61 Self::Sw1002FailoverNotFound => "SW-1002",
62 Self::Sw1003CircuitBreakerTriggered => "SW-1003",
63 Self::Sw2001WorkerPoolExhausted => "SW-2001",
64 Self::Sw2002ManagerTimeout => "SW-2002",
65 Self::Sw3001StrategyMismatch => "SW-3001",
66 Self::Sw3002ExplorationDepthLimit => "SW-3002",
67 }
68 }
69
70 fn description(&self) -> &str {
71 match self {
72 Self::Sw1001RoutingInvalid => {
73 "Routing configuration is invalid. Check swarm.routing.targets format."
74 }
75 Self::Sw1002FailoverNotFound => {
76 "Failover target service not found. Verify swarm.failover.target exists."
77 }
78 Self::Sw1003CircuitBreakerTriggered => {
79 "Circuit breaker triggered due to high failure rate."
80 }
81 Self::Sw2001WorkerPoolExhausted => {
82 "Worker pool exhausted. Increase swarm.workers.max_count."
83 }
84 Self::Sw2002ManagerTimeout => {
85 "Manager decision timeout. Check swarm.manager.timeout_ms."
86 }
87 Self::Sw3001StrategyMismatch => {
88 "Strategy mismatch between manager and workers. Align swarm.strategy.type."
89 }
90 Self::Sw3002ExplorationDepthLimit => {
91 "Exploration depth limit reached. Adjust swarm.exploration.max_depth."
92 }
93 }
94 }
95
96 fn fix_hint(&self) -> &str {
97 match self {
98 Self::Sw1001RoutingInvalid => "routing.targets",
99 Self::Sw1002FailoverNotFound => "failover.target",
100 Self::Sw1003CircuitBreakerTriggered => "circuit_breaker.threshold",
101 Self::Sw2001WorkerPoolExhausted => "workers.max_count",
102 Self::Sw2002ManagerTimeout => "manager.timeout_ms",
103 Self::Sw3001StrategyMismatch => "strategy.type",
104 Self::Sw3002ExplorationDepthLimit => "exploration.max_depth",
105 }
106 }
107
108 fn correct_value(&self) -> &str {
109 match self {
110 Self::Sw1001RoutingInvalid => "[\"worker-1\", \"worker-2\"]",
111 Self::Sw1002FailoverNotFound => "backup-service",
112 Self::Sw1003CircuitBreakerTriggered => "0.7",
113 Self::Sw2001WorkerPoolExhausted => "16",
114 Self::Sw2002ManagerTimeout => "5000",
115 Self::Sw3001StrategyMismatch => "ucb1",
116 Self::Sw3002ExplorationDepthLimit => "10",
117 }
118 }
119}
120
/// A single configuration fault together with the log lines it produces.
#[derive(Debug, Clone)]
pub struct ConfigProblem {
    /// Error code associated with this fault.
    pub error_code: SwarmErrorCode,
    /// Full configuration key that holds the faulty value.
    pub config_key: String,
    /// The faulty value; `InternalDiagnosisEnvironment::new` stores it in the
    /// configuration under `config_key`.
    pub current_value: String,
    /// Log lines for this fault, produced by `ConfigProblem::generate_logs`.
    pub logs: Vec<String>,
}
137
138impl ConfigProblem {
139 fn new(error_code: SwarmErrorCode, config_key: &str, current_value: &str) -> Self {
140 let logs = Self::generate_logs(&error_code);
141 Self {
142 error_code,
143 config_key: config_key.to_string(),
144 current_value: current_value.to_string(),
145 logs,
146 }
147 }
148
149 fn generate_logs(error_code: &SwarmErrorCode) -> Vec<String> {
150 let timestamp = "2024-01-15T10:30:45.123Z";
151 let code = error_code.code();
152
153 vec![
154 format!("[{}] ERROR [{}] {}", timestamp, code, error_code.description()),
155 format!("[{}] WARN [{}] Attempting recovery...", timestamp, code),
156 format!("[{}] ERROR [{}] Recovery failed, escalating", timestamp, code),
157 format!("[{}] INFO {{\"error_code\":\"{}\",\"component\":\"swarm-core\",\"action\":\"shutdown\"}}", timestamp, code),
158 ]
159 }
160}
161
/// Environment in which workers diagnose one injected configuration fault by
/// stepping through ParseConfig, AnalyzeLog, TraceError and ApplyFix actions.
pub struct InternalDiagnosisEnvironment {
    /// The fault this environment instance is built around.
    problem: ConfigProblem,
    /// Default configuration with the faulty value written over its key.
    config: HashMap<String, String>,
    /// Progress flags, behind a lock because the handlers take `&self`.
    state: RwLock<DiagnosisState>,
}
175
/// Mutable progress of a diagnosis run; every field starts out empty/false.
#[derive(Debug, Default)]
struct DiagnosisState {
    /// Set once a ParseConfig action has been handled; AnalyzeLog requires it.
    parsed_config: bool,
    /// Set once an AnalyzeLog action has passed its precondition; TraceError requires it.
    analyzed_logs: bool,
    /// The error code that was successfully traced, if any; ApplyFix requires it.
    traced_error: Option<String>,
    /// Workers that have applied an accepted fix (each listed at most once).
    completed: Vec<WorkerId>,
}
187
188impl InternalDiagnosisEnvironment {
189 pub fn new(problem: ConfigProblem) -> Self {
191 let mut config = Self::default_config();
192 config.insert(problem.config_key.clone(), problem.current_value.clone());
194
195 Self {
196 problem,
197 config,
198 state: RwLock::new(DiagnosisState::default()),
199 }
200 }
201
202 fn default_config() -> HashMap<String, String> {
204 let mut config = HashMap::new();
205 config.insert(
207 "swarm.routing.targets".into(),
208 "[\"worker-1\", \"worker-2\", \"worker-3\"]".into(),
209 );
210 config.insert("swarm.failover.target".into(), "backup-service".into());
211 config.insert("swarm.failover.enabled".into(), "true".into());
212 config.insert("swarm.circuit_breaker.threshold".into(), "0.5".into());
213 config.insert("swarm.circuit_breaker.window_ms".into(), "10000".into());
214 config.insert("swarm.workers.max_count".into(), "8".into());
215 config.insert("swarm.workers.min_count".into(), "2".into());
216 config.insert("swarm.manager.timeout_ms".into(), "3000".into());
217 config.insert("swarm.manager.retry_count".into(), "3".into());
218 config.insert("swarm.strategy.type".into(), "ucb1".into());
219 config.insert("swarm.strategy.exploration_c".into(), "1.414".into());
220 config.insert("swarm.exploration.max_depth".into(), "5".into());
221 config.insert("swarm.exploration.pruning".into(), "true".into());
222 config
223 }
224
225 pub fn routing_error_scenario() -> Self {
231 let problem = ConfigProblem::new(
232 SwarmErrorCode::Sw1001RoutingInvalid,
233 "swarm.routing.targets",
234 "invalid-format", );
236 Self::new(problem)
237 }
238
239 pub fn failover_error_scenario() -> Self {
241 let problem = ConfigProblem::new(
242 SwarmErrorCode::Sw1002FailoverNotFound,
243 "swarm.failover.target",
244 "nonexistent-service",
245 );
246 Self::new(problem)
247 }
248
249 pub fn worker_pool_scenario() -> Self {
251 let problem = ConfigProblem::new(
252 SwarmErrorCode::Sw2001WorkerPoolExhausted,
253 "swarm.workers.max_count",
254 "2", );
256 Self::new(problem)
257 }
258
259 pub fn strategy_mismatch_scenario() -> Self {
261 let problem = ConfigProblem::new(
262 SwarmErrorCode::Sw3001StrategyMismatch,
263 "swarm.strategy.type",
264 "unknown_strategy",
265 );
266 Self::new(problem)
267 }
268
269 pub fn exploration_depth_scenario() -> Self {
271 let problem = ConfigProblem::new(
272 SwarmErrorCode::Sw3002ExplorationDepthLimit,
273 "swarm.exploration.max_depth",
274 "1", );
276 Self::new(problem)
277 }
278
279 pub fn complex_scenario(seed: u64) -> Self {
281 let scenarios = [
283 Self::routing_error_scenario,
284 Self::failover_error_scenario,
285 Self::worker_pool_scenario,
286 Self::strategy_mismatch_scenario,
287 Self::exploration_depth_scenario,
288 ];
289 let idx = (seed as usize) % scenarios.len();
290 scenarios[idx]()
291 }
292
293 fn handle_parse_config(&self, _worker_id: WorkerId, action: &Action) -> WorkResult {
298 let section = action
299 .params
300 .args
301 .get("section")
302 .or(action.params.target.as_ref())
303 .cloned()
304 .filter(|s| !s.is_empty());
305
306 let mut state = self.state.write().unwrap();
307 state.parsed_config = true;
308
309 match section {
310 Some(section) => {
311 let prefix = format!("swarm.{}.", section);
313 let matching: Vec<_> = self
314 .config
315 .iter()
316 .filter(|(k, _)| k.starts_with(&prefix))
317 .collect();
318
319 if matching.is_empty() {
320 let mut output = format!(
322 "Section '{}' not found. Showing all configuration:\n\n=== SwarmEngine Configuration ===\n\n",
323 section
324 );
325
326 let sections = [
327 "routing",
328 "failover",
329 "circuit_breaker",
330 "workers",
331 "manager",
332 "strategy",
333 "exploration",
334 ];
335 for sec in sections {
336 let sec_prefix = format!("swarm.{}.", sec);
337 let sec_matching: Vec<_> = self
338 .config
339 .iter()
340 .filter(|(k, _)| k.starts_with(&sec_prefix))
341 .collect();
342
343 if !sec_matching.is_empty() {
344 output.push_str(&format!("[{}]\n", sec));
345 for (key, value) in sec_matching {
346 let short_key = key.strip_prefix(&sec_prefix).unwrap_or(key);
347 output.push_str(&format!(" {}: {}\n", short_key, value));
348 }
349 output.push('\n');
350 }
351 }
352
353 WorkResult::env_success(output)
354 } else {
355 let mut output = format!("=== Configuration: swarm.{} ===\n", section);
356 for (key, value) in matching {
357 let short_key = key.strip_prefix("swarm.").unwrap_or(key);
358 output.push_str(&format!("{}: {}\n", short_key, value));
359 }
360
361 if self.problem.config_key.starts_with(&prefix) {
363 output.push_str("\n[!] Potential issue detected in this section");
364 }
365
366 WorkResult::env_success(output)
367 }
368 }
369 None => {
370 let mut output = String::from("=== SwarmEngine Configuration ===\n\n");
372
373 let sections = [
375 "routing",
376 "failover",
377 "circuit_breaker",
378 "workers",
379 "manager",
380 "strategy",
381 "exploration",
382 ];
383 for section in sections {
384 let prefix = format!("swarm.{}.", section);
385 let matching: Vec<_> = self
386 .config
387 .iter()
388 .filter(|(k, _)| k.starts_with(&prefix))
389 .collect();
390
391 if !matching.is_empty() {
392 output.push_str(&format!("[{}]\n", section));
393 for (key, value) in matching {
394 let short_key = key.strip_prefix(&prefix).unwrap_or(key);
395 output.push_str(&format!(" {}: {}\n", short_key, value));
396 }
397 output.push('\n');
398 }
399 }
400
401 WorkResult::env_success(output)
402 }
403 }
404 }
405
406 fn handle_analyze_log(&self, _worker_id: WorkerId, action: &Action) -> WorkResult {
407 let filter = action
408 .params
409 .args
410 .get("filter")
411 .or(action.params.target.as_ref())
412 .cloned()
413 .unwrap_or_default();
414
415 let mut state = self.state.write().unwrap();
416
417 if !state.parsed_config {
419 return WorkResult::env_failure(
420 "Cannot analyze logs without parsing configuration first. Run ParseConfig first.",
421 );
422 }
423
424 state.analyzed_logs = true;
425
426 let logs: Vec<_> = if filter.is_empty() {
427 self.problem.logs.clone()
428 } else {
429 self.problem
430 .logs
431 .iter()
432 .filter(|log| log.contains(&filter))
433 .cloned()
434 .collect()
435 };
436
437 if logs.is_empty() {
438 WorkResult::env_success("=== Log Analysis ===\nNo matching logs found.")
439 } else {
440 let error_code = self.problem.error_code.code();
441 let mut output = String::from("=== Log Analysis ===\n\n");
442 for log in &logs {
443 output.push_str(log);
444 output.push('\n');
445 }
446 output.push_str(&format!("\n[!] Detected error code: {}\n", error_code));
447 output.push_str("Use TraceError to investigate this error code.");
448
449 WorkResult::env_success(output)
450 }
451 }
452
453 fn handle_trace_error(&self, _worker_id: WorkerId, action: &Action) -> WorkResult {
454 let error_code_input = action
455 .params
456 .args
457 .get("code")
458 .or(action.params.target.as_ref())
459 .cloned()
460 .unwrap_or_default();
461
462 let mut state = self.state.write().unwrap();
463
464 if !state.analyzed_logs {
466 return WorkResult::env_failure(
467 "Cannot trace error without analyzing logs first. Run AnalyzeLog first.",
468 );
469 }
470
471 let expected_code = self.problem.error_code.code();
472
473 let error_code = if error_code_input.starts_with("SW-") && error_code_input.len() == 7 {
475 error_code_input
476 } else if error_code_input.is_empty() {
477 return WorkResult::env_failure(
478 "TraceError requires an error code. Usage: TraceError(SW-XXXX)",
479 );
480 } else {
481 expected_code.to_string()
483 };
484
485 if error_code == expected_code {
486 state.traced_error = Some(error_code.clone());
487
488 let desc = self.problem.error_code.description();
489 let fix_hint = self.problem.error_code.fix_hint();
490 let correct_value = self.problem.error_code.correct_value();
491
492 let output = format!(
493 "=== Error Trace: {} ===\n\n\
494 Description: {}\n\n\
495 Root Cause:\n\
496 Configuration key: swarm.{}\n\
497 Current value: {}\n\
498 Expected format: {}\n\n\
499 Recommended Fix:\n\
500 ApplyFix(key=\"{}\", value=\"{}\")",
501 error_code,
502 desc,
503 fix_hint,
504 self.problem.current_value,
505 correct_value,
506 fix_hint,
507 correct_value
508 );
509
510 WorkResult::env_success(output)
511 } else {
512 WorkResult::env_failure(format!(
513 "Error code {} not found in current logs.\nHint: Check the error code from AnalyzeLog output.",
514 error_code
515 ))
516 }
517 }
518
519 fn handle_apply_fix(&self, worker_id: WorkerId, action: &Action) -> WorkResult {
520 let key_input = action
521 .params
522 .args
523 .get("key")
524 .or(action.params.target.as_ref())
525 .cloned()
526 .unwrap_or_default();
527
528 let value = action.params.args.get("value").cloned().unwrap_or_default();
529
530 let mut state = self.state.write().unwrap();
531
532 if state.traced_error.is_none() {
534 return WorkResult::env_failure(
535 "Cannot apply fix without tracing the error first. Run TraceError first.",
536 );
537 }
538
539 let fix_hint = self.problem.error_code.fix_hint();
540 let correct_value = self.problem.error_code.correct_value();
541
542 let key = if key_input.is_empty() {
544 return WorkResult::env_failure(
545 "ApplyFix requires a configuration key. Usage: ApplyFix(key=\"...\", value=\"...\")"
546 );
547 } else if key_input.contains('.') || key_input.contains('_') {
548 key_input
550 } else {
551 fix_hint.to_string()
553 };
554
555 let full_key = if key.starts_with("swarm.") {
557 key.clone()
558 } else {
559 format!("swarm.{}", key)
560 };
561
562 let expected_key = format!("swarm.{}", fix_hint);
563
564 if full_key != expected_key {
565 return WorkResult::env_failure(format!(
566 "Incorrect configuration key: {}\nHint: The issue is in swarm.{}",
567 key, fix_hint
568 ));
569 }
570
571 let value_is_correct = if value.is_empty() {
573 false
574 } else {
575 match (value.parse::<f64>(), correct_value.parse::<f64>()) {
577 (Ok(v), Ok(c)) => (v - c).abs() < 0.001 || v >= c,
578 _ => value.contains(correct_value) || correct_value.contains(&value),
579 }
580 };
581
582 if !value_is_correct && !value.is_empty() {
583 return WorkResult::env_success(format!(
585 "=== Configuration Updated ===\n\
586 Key: {}\n\
587 Value: {}\n\n\
588 [!] Warning: Value may not fully resolve the issue.\n\
589 Recommended value: {}",
590 full_key, value, correct_value
591 ));
592 }
593
594 if !state.completed.contains(&worker_id) {
596 state.completed.push(worker_id);
597 }
598
599 WorkResult::done_success(format!(
600 "=== Configuration Fixed ===\n\
601 Key: {}\n\
602 Value: {}\n\n\
603 [OK] Error {} resolved.\n\
604 SwarmEngine is now running normally.",
605 full_key,
606 if value.is_empty() {
607 correct_value
608 } else {
609 &value
610 },
611 self.problem.error_code.code()
612 ))
613 }
614}
615
616impl Environment for InternalDiagnosisEnvironment {
617 fn step(&self, worker_id: WorkerId, action: &Action) -> WorkResult {
618 match action.name.to_lowercase().as_str() {
619 "parseconfig" | "parse_config" | "config" => {
620 self.handle_parse_config(worker_id, action)
621 }
622 "analyzelog" | "analyze_log" | "logs" | "log" => {
623 self.handle_analyze_log(worker_id, action)
624 }
625 "traceerror" | "trace_error" | "trace" => self.handle_trace_error(worker_id, action),
626 "applyfix" | "apply_fix" | "fix" => self.handle_apply_fix(worker_id, action),
627 "continue" => WorkResult::env_success("Continuing..."),
628 _ => WorkResult::unsupported(&action.name),
629 }
630 }
631
632 fn reset(&self) {
633 let mut state = self.state.write().unwrap();
634 state.parsed_config = false;
635 state.analyzed_logs = false;
636 state.traced_error = None;
637 state.completed.clear();
638 }
639
640 fn name(&self) -> &str {
641 "InternalDiagnosisEnvironment"
642 }
643}
644
#[cfg(test)]
mod tests {
    use super::*;
    use swarm_engine_core::types::ActionParams;

    /// Reports whether `outcome` is a successful `Acted` or `Done` result.
    fn is_success(outcome: &WorkResult) -> bool {
        match outcome {
            WorkResult::Done { success, .. } => *success,
            WorkResult::Acted { action_result, .. } => action_result.success,
            _ => false,
        }
    }

    /// Reports whether `outcome` is a `Done` result.
    fn is_done(outcome: &WorkResult) -> bool {
        matches!(outcome, WorkResult::Done { .. })
    }

    /// Builds an action that carries only named arguments.
    fn action_with_args(name: &str, args: HashMap<String, String>) -> Action {
        let params = ActionParams {
            target: None,
            args,
            data: vec![],
        };
        Action {
            name: name.into(),
            params,
        }
    }

    /// Builds an action that carries only an optional target.
    fn action(name: &str, target: Option<&str>) -> Action {
        let mut built = action_with_args(name, HashMap::new());
        built.params.target = target.map(Into::into);
        built
    }

    /// Builds an `ApplyFix` action with the given `key` and `value` arguments.
    fn fix_action(key: &str, value: &str) -> Action {
        let args = [("key", key), ("value", value)]
            .iter()
            .map(|&(name, text)| (name.to_string(), text.to_string()))
            .collect();
        action_with_args("ApplyFix", args)
    }

    /// Runs `steps` in order, discarding their results.
    fn run_steps(
        env: &InternalDiagnosisEnvironment,
        worker: WorkerId,
        steps: &[(&str, Option<&str>)],
    ) {
        for &(name, target) in steps {
            env.step(worker, &action(name, target));
        }
    }

    #[test]
    fn test_parse_config_all() {
        let env = InternalDiagnosisEnvironment::routing_error_scenario();

        let outcome = env.step(WorkerId(0), &action("ParseConfig", None));
        assert!(is_success(&outcome));
    }

    #[test]
    fn test_parse_config_section() {
        let env = InternalDiagnosisEnvironment::routing_error_scenario();

        let outcome = env.step(WorkerId(0), &action("ParseConfig", Some("routing")));
        assert!(is_success(&outcome));
    }

    #[test]
    fn test_analyze_log_requires_parse() {
        let env = InternalDiagnosisEnvironment::routing_error_scenario();

        // No ParseConfig has been run, so the precondition must reject this.
        let outcome = env.step(WorkerId(0), &action("AnalyzeLog", None));
        assert!(!is_success(&outcome));
    }

    #[test]
    fn test_trace_error_requires_analyze() {
        let env = InternalDiagnosisEnvironment::routing_error_scenario();
        let worker = WorkerId(0);

        run_steps(&env, worker, &[("ParseConfig", None)]);

        // AnalyzeLog was skipped, so tracing must be rejected.
        let outcome = env.step(worker, &action("TraceError", Some("SW-1001")));
        assert!(!is_success(&outcome));
    }

    #[test]
    fn test_full_diagnosis_flow() {
        let env = InternalDiagnosisEnvironment::routing_error_scenario();
        let worker = WorkerId(0);

        // Each investigative step succeeds without finishing the task.
        let investigation = [
            ("ParseConfig", None),
            ("AnalyzeLog", None),
            ("TraceError", Some("SW-1001")),
        ];
        for &(name, target) in investigation.iter() {
            let outcome = env.step(worker, &action(name, target));
            assert!(is_success(&outcome));
            assert!(!is_done(&outcome));
        }

        // Only the correct fix completes the task.
        let outcome = env.step(
            worker,
            &fix_action("routing.targets", "[\"worker-1\", \"worker-2\"]"),
        );
        assert!(is_success(&outcome));
        assert!(is_done(&outcome));
    }

    #[test]
    fn test_wrong_error_code_fails() {
        let env = InternalDiagnosisEnvironment::routing_error_scenario();
        let worker = WorkerId(0);

        run_steps(&env, worker, &[("ParseConfig", None), ("AnalyzeLog", None)]);

        // A well-formed code that does not match the problem is rejected.
        let outcome = env.step(worker, &action("TraceError", Some("SW-9999")));
        assert!(!is_success(&outcome));
    }

    #[test]
    fn test_wrong_fix_key_fails() {
        let env = InternalDiagnosisEnvironment::routing_error_scenario();
        let worker = WorkerId(0);

        run_steps(
            &env,
            worker,
            &[
                ("ParseConfig", None),
                ("AnalyzeLog", None),
                ("TraceError", Some("SW-1001")),
            ],
        );

        let outcome = env.step(worker, &fix_action("wrong.key", "some-value"));
        assert!(!is_success(&outcome));
    }

    #[test]
    fn test_worker_pool_scenario() {
        let env = InternalDiagnosisEnvironment::worker_pool_scenario();
        let worker = WorkerId(0);

        run_steps(&env, worker, &[("ParseConfig", None), ("AnalyzeLog", None)]);

        let outcome = env.step(worker, &action("TraceError", Some("SW-2001")));
        assert!(is_success(&outcome));

        let outcome = env.step(worker, &fix_action("workers.max_count", "16"));
        assert!(is_success(&outcome));
        assert!(is_done(&outcome));
    }
}