use std::collections::HashMap;
use std::sync::RwLock;
use swarm_engine_core::actions::ParamResolver;
use swarm_engine_core::agent::WorkResult;
use swarm_engine_core::environment::Environment;
use swarm_engine_core::types::{Action, WorkerId};
/// Error codes that the simulated SwarmEngine can report.
///
/// Each variant maps to a textual `SW-XXXX` code. The enum carries no data,
/// so it is cheap to copy and can be used as a hash-map or hash-set key.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum SwarmErrorCode {
    /// `SW-1001`: the routing configuration is invalid.
    Sw1001RoutingInvalid,
    /// `SW-1002`: the failover target service was not found.
    Sw1002FailoverNotFound,
    /// `SW-1003`: the circuit breaker was triggered.
    Sw1003CircuitBreakerTriggered,
    /// `SW-2001`: the worker pool is exhausted.
    Sw2001WorkerPoolExhausted,
    /// `SW-2002`: the manager decision timed out.
    Sw2002ManagerTimeout,
    /// `SW-3001`: manager and workers disagree on the strategy.
    Sw3001StrategyMismatch,
    /// `SW-3002`: the exploration depth limit was reached.
    Sw3002ExplorationDepthLimit,
}
impl SwarmErrorCode {
    /// Returns the textual `SW-XXXX` identifier of this error.
    ///
    /// The string is a literal, so it is `'static` and may outlive the enum
    /// value it was obtained from.
    fn code(&self) -> &'static str {
        match self {
            Self::Sw1001RoutingInvalid => "SW-1001",
            Self::Sw1002FailoverNotFound => "SW-1002",
            Self::Sw1003CircuitBreakerTriggered => "SW-1003",
            Self::Sw2001WorkerPoolExhausted => "SW-2001",
            Self::Sw2002ManagerTimeout => "SW-2002",
            Self::Sw3001StrategyMismatch => "SW-3001",
            Self::Sw3002ExplorationDepthLimit => "SW-3002",
        }
    }

    /// Returns the human-readable description used in logs and error traces.
    fn description(&self) -> &'static str {
        match self {
            Self::Sw1001RoutingInvalid => {
                "Routing configuration is invalid. Check swarm.routing.targets format."
            }
            Self::Sw1002FailoverNotFound => {
                "Failover target service not found. Verify swarm.failover.target exists."
            }
            Self::Sw1003CircuitBreakerTriggered => {
                "Circuit breaker triggered due to high failure rate."
            }
            Self::Sw2001WorkerPoolExhausted => {
                "Worker pool exhausted. Increase swarm.workers.max_count."
            }
            Self::Sw2002ManagerTimeout => {
                "Manager decision timeout. Check swarm.manager.timeout_ms."
            }
            Self::Sw3001StrategyMismatch => {
                "Strategy mismatch between manager and workers. Align swarm.strategy.type."
            }
            Self::Sw3002ExplorationDepthLimit => {
                "Exploration depth limit reached. Adjust swarm.exploration.max_depth."
            }
        }
    }

    /// Returns the configuration key (without the `swarm.` prefix) that has
    /// to be changed to resolve this error.
    fn fix_hint(&self) -> &'static str {
        match self {
            Self::Sw1001RoutingInvalid => "routing.targets",
            Self::Sw1002FailoverNotFound => "failover.target",
            Self::Sw1003CircuitBreakerTriggered => "circuit_breaker.threshold",
            Self::Sw2001WorkerPoolExhausted => "workers.max_count",
            Self::Sw2002ManagerTimeout => "manager.timeout_ms",
            Self::Sw3001StrategyMismatch => "strategy.type",
            Self::Sw3002ExplorationDepthLimit => "exploration.max_depth",
        }
    }

    /// Returns the recommended value for the key named by [`Self::fix_hint`].
    fn correct_value(&self) -> &'static str {
        match self {
            Self::Sw1001RoutingInvalid => "[\"worker-1\", \"worker-2\"]",
            Self::Sw1002FailoverNotFound => "backup-service",
            Self::Sw1003CircuitBreakerTriggered => "0.7",
            Self::Sw2001WorkerPoolExhausted => "16",
            Self::Sw2002ManagerTimeout => "5000",
            Self::Sw3001StrategyMismatch => "ucb1",
            Self::Sw3002ExplorationDepthLimit => "10",
        }
    }
}
/// A single injected misconfiguration together with the log lines it produces.
#[derive(Debug, Clone)]
pub struct ConfigProblem {
/// Error code reported for this problem.
pub error_code: SwarmErrorCode,
/// Full configuration key (including the `swarm.` prefix) holding the bad value.
pub config_key: String,
/// The faulty value stored under `config_key`.
pub current_value: String,
/// Log lines mentioning `error_code`; filled in by `ConfigProblem::new`.
pub logs: Vec<String>,
}
impl ConfigProblem {
    /// Creates a problem for `error_code`, recording the offending key and
    /// value and pre-generating the matching log lines.
    fn new(error_code: SwarmErrorCode, config_key: &str, current_value: &str) -> Self {
        Self {
            // Fields are evaluated in order, so the borrow here ends before
            // `error_code` is moved into the struct on the next line.
            logs: Self::generate_logs(&error_code),
            error_code,
            config_key: config_key.to_owned(),
            current_value: current_value.to_owned(),
        }
    }

    /// Produces the four fixed log lines (error, recovery attempt, recovery
    /// failure, structured shutdown record) for `error_code`.
    ///
    /// All lines share one hard-coded timestamp.
    fn generate_logs(error_code: &SwarmErrorCode) -> Vec<String> {
        const TIMESTAMP: &str = "2024-01-15T10:30:45.123Z";
        let code = error_code.code();

        let mut lines = Vec::with_capacity(4);
        lines.push(format!(
            "[{}] ERROR [{}] {}",
            TIMESTAMP,
            code,
            error_code.description()
        ));
        lines.push(format!("[{}] WARN [{}] Attempting recovery...", TIMESTAMP, code));
        lines.push(format!("[{}] ERROR [{}] Recovery failed, escalating", TIMESTAMP, code));
        lines.push(format!(
            "[{}] INFO {{\"error_code\":\"{}\",\"component\":\"swarm-core\",\"action\":\"shutdown\"}}",
            TIMESTAMP, code
        ));
        lines
    }
}
/// Simulated environment in which workers diagnose one injected
/// configuration problem by parsing config, analyzing logs, tracing the
/// error code and applying a fix.
pub struct InternalDiagnosisEnvironment {
/// The injected problem the workers have to find and fix.
problem: ConfigProblem,
/// Default configuration with the problem's key overridden by its faulty value.
config: HashMap<String, String>,
/// Diagnosis progress, behind a lock because the handlers only receive `&self`.
state: RwLock<DiagnosisState>,
}
/// Progress flags of the diagnosis workflow; every field starts out empty/false.
#[derive(Debug, Default)]
struct DiagnosisState {
/// Set once the configuration has been parsed; required before analyzing logs.
parsed_config: bool,
/// Set once logs have been analyzed; required before tracing an error.
analyzed_logs: bool,
/// The error code that was successfully traced; required before applying a fix.
traced_error: Option<String>,
/// Workers that have applied an accepted fix.
completed: Vec<WorkerId>,
}
impl InternalDiagnosisEnvironment {
/// Creates an environment for `problem`.
///
/// The configuration starts from [`Self::default_config`] and the entry at
/// `problem.config_key` is set to `problem.current_value` (replacing the
/// default, or adding the key if it is not part of the defaults).
pub fn new(problem: ConfigProblem) -> Self {
    let config = {
        let mut merged = Self::default_config();
        merged.insert(problem.config_key.clone(), problem.current_value.clone());
        merged
    };
    let state = RwLock::new(DiagnosisState::default());
    Self {
        problem,
        config,
        state,
    }
}
/// Returns the healthy baseline configuration as a map from full
/// `swarm.<section>.<name>` keys to their string values.
fn default_config() -> HashMap<String, String> {
    const DEFAULTS: [(&str, &str); 13] = [
        (
            "swarm.routing.targets",
            "[\"worker-1\", \"worker-2\", \"worker-3\"]",
        ),
        ("swarm.failover.target", "backup-service"),
        ("swarm.failover.enabled", "true"),
        ("swarm.circuit_breaker.threshold", "0.5"),
        ("swarm.circuit_breaker.window_ms", "10000"),
        ("swarm.workers.max_count", "8"),
        ("swarm.workers.min_count", "2"),
        ("swarm.manager.timeout_ms", "3000"),
        ("swarm.manager.retry_count", "3"),
        ("swarm.strategy.type", "ucb1"),
        ("swarm.strategy.exploration_c", "1.414"),
        ("swarm.exploration.max_depth", "5"),
        ("swarm.exploration.pruning", "true"),
    ];

    DEFAULTS
        .iter()
        .map(|&(key, value)| (key.to_string(), value.to_string()))
        .collect()
}
/// Scenario for `SW-1001`: `swarm.routing.targets` holds the value
/// `invalid-format`.
pub fn routing_error_scenario() -> Self {
    Self::new(ConfigProblem::new(
        SwarmErrorCode::Sw1001RoutingInvalid,
        "swarm.routing.targets",
        "invalid-format",
    ))
}
/// Scenario for `SW-1002`: `swarm.failover.target` holds the value
/// `nonexistent-service`.
pub fn failover_error_scenario() -> Self {
    Self::new(ConfigProblem::new(
        SwarmErrorCode::Sw1002FailoverNotFound,
        "swarm.failover.target",
        "nonexistent-service",
    ))
}
/// Scenario for `SW-2001`: `swarm.workers.max_count` holds the value `2`.
pub fn worker_pool_scenario() -> Self {
    Self::new(ConfigProblem::new(
        SwarmErrorCode::Sw2001WorkerPoolExhausted,
        "swarm.workers.max_count",
        "2",
    ))
}
/// Scenario for `SW-3001`: `swarm.strategy.type` holds the value
/// `unknown_strategy`.
pub fn strategy_mismatch_scenario() -> Self {
    Self::new(ConfigProblem::new(
        SwarmErrorCode::Sw3001StrategyMismatch,
        "swarm.strategy.type",
        "unknown_strategy",
    ))
}
/// Scenario for `SW-3002`: `swarm.exploration.max_depth` holds the value `1`.
pub fn exploration_depth_scenario() -> Self {
    Self::new(ConfigProblem::new(
        SwarmErrorCode::Sw3002ExplorationDepthLimit,
        "swarm.exploration.max_depth",
        "1",
    ))
}
/// Picks one of the five predefined scenarios from `seed`.
///
/// The scenario index is `seed` modulo the number of scenarios, so the same
/// seed always selects the same scenario, in this order: routing, failover,
/// worker pool, strategy mismatch, exploration depth.
pub fn complex_scenario(seed: u64) -> Self {
    let scenarios: [fn() -> Self; 5] = [
        Self::routing_error_scenario,
        Self::failover_error_scenario,
        Self::worker_pool_scenario,
        Self::strategy_mismatch_scenario,
        Self::exploration_depth_scenario,
    ];
    // Reduce in `u64` *before* narrowing: casting the seed to `usize` first
    // would drop its high bits on 32-bit targets and make the selected
    // scenario depend on the platform. The remainder is < 5, so the final
    // cast is lossless.
    let idx = (seed % scenarios.len() as u64) as usize;
    scenarios[idx]()
}
/// Handles the `ParseConfig` action and marks the configuration as parsed.
///
/// The section name is read from the `section` argument or, if that argument
/// is absent, from the action target; an empty name counts as "no section".
///
/// * No section: every known section is listed.
/// * Known section: only the keys under `swarm.<section>.` are listed, plus a
///   notice when the injected problem's key lives in that section.
/// * Unknown section: a "not found" notice followed by the full listing.
///
/// Keys are emitted in sorted order so the output is identical from run to
/// run (plain `HashMap` iteration order is arbitrary).
fn handle_parse_config(&self, _worker_id: WorkerId, action: &Action) -> WorkResult {
    let section = action
        .params
        .args
        .get("section")
        .or(action.params.target.as_ref())
        .filter(|s| !s.is_empty());

    // The guard is a temporary, so the lock is released before any formatting.
    self.state.write().unwrap().parsed_config = true;

    match section {
        None => WorkResult::env_success(
            self.render_full_config(String::from("=== SwarmEngine Configuration ===\n\n")),
        ),
        Some(section) => {
            let prefix = format!("swarm.{}.", section);
            let entries = self.sorted_entries_with_prefix(&prefix);
            if entries.is_empty() {
                let header = format!(
                    "Section '{}' not found. Showing all configuration:\n\n=== SwarmEngine Configuration ===\n\n",
                    section
                );
                return WorkResult::env_success(self.render_full_config(header));
            }

            let mut output = format!("=== Configuration: swarm.{} ===\n", section);
            for (key, value) in entries {
                // The single-section view keeps the section name in the key.
                let short_key = key.strip_prefix("swarm.").unwrap_or(key);
                output.push_str(&format!("{}: {}\n", short_key, value));
            }
            if self.problem.config_key.starts_with(&prefix) {
                output.push_str("\n[!] Potential issue detected in this section");
            }
            WorkResult::env_success(output)
        }
    }
}

/// Returns all configuration entries whose key starts with `prefix`, sorted by key.
fn sorted_entries_with_prefix(&self, prefix: &str) -> Vec<(&String, &String)> {
    let mut entries: Vec<_> = self
        .config
        .iter()
        .filter(|(key, _)| key.starts_with(prefix))
        .collect();
    entries.sort_unstable_by(|a, b| a.0.cmp(b.0));
    entries
}

/// Appends the listing of every known, non-empty section to `output` and returns it.
fn render_full_config(&self, mut output: String) -> String {
    const SECTIONS: [&str; 7] = [
        "routing",
        "failover",
        "circuit_breaker",
        "workers",
        "manager",
        "strategy",
        "exploration",
    ];
    for section in SECTIONS {
        let prefix = format!("swarm.{}.", section);
        let entries = self.sorted_entries_with_prefix(&prefix);
        if entries.is_empty() {
            continue;
        }
        output.push_str(&format!("[{}]\n", section));
        for (key, value) in entries {
            let short_key = key.strip_prefix(prefix.as_str()).unwrap_or(key);
            output.push_str(&format!(" {}: {}\n", short_key, value));
        }
        output.push('\n');
    }
    output
}
/// Handles the `AnalyzeLog` action.
///
/// Fails unless the configuration has been parsed first. Otherwise marks the
/// logs as analyzed and returns the problem's log lines, optionally limited
/// to those containing the `filter` parameter. When at least one line
/// matches, the problem's error code is appended as a hint.
fn handle_analyze_log(&self, _worker_id: WorkerId, action: &Action) -> WorkResult {
    let resolver = ParamResolver::new(action);
    let needle = resolver.get("filter").unwrap_or("");

    let mut state = self.state.write().unwrap();
    if !state.parsed_config {
        return WorkResult::env_failure(
            "Cannot analyze logs without parsing configuration first. Run ParseConfig first.",
        );
    }
    state.analyzed_logs = true;

    // An empty filter selects every line.
    let selected: Vec<&String> = self
        .problem
        .logs
        .iter()
        .filter(|line| needle.is_empty() || line.contains(needle))
        .collect();
    if selected.is_empty() {
        return WorkResult::env_success("=== Log Analysis ===\nNo matching logs found.");
    }

    let mut report = String::from("=== Log Analysis ===\n\n");
    for line in selected {
        report.push_str(line);
        report.push('\n');
    }
    report.push_str(&format!(
        "\n[!] Detected error code: {}\n",
        self.problem.error_code.code()
    ));
    report.push_str("Use TraceError to investigate this error code.");
    WorkResult::env_success(report)
}
/// Handles the `TraceError` action.
///
/// Fails unless logs have been analyzed first, or when no `code` parameter is
/// given. A parameter of the form `SW-` plus four more bytes is compared
/// against the problem's error code and rejected if it differs. Any other
/// non-empty input is treated leniently as if the expected code had been
/// given. On success the traced code is recorded and a report with the root
/// cause and the recommended `ApplyFix` call is returned.
fn handle_trace_error(&self, _worker_id: WorkerId, action: &Action) -> WorkResult {
    let resolver = ParamResolver::new(action);
    let requested = resolver.get("code").unwrap_or("");

    let mut state = self.state.write().unwrap();
    if !state.analyzed_logs {
        return WorkResult::env_failure(
            "Cannot trace error without analyzing logs first. Run AnalyzeLog first.",
        );
    }
    if requested.is_empty() {
        return WorkResult::env_failure(
            "TraceError requires an error code. Usage: TraceError(SW-XXXX)",
        );
    }

    let problem_code = &self.problem.error_code;
    let expected = problem_code.code();
    // Only well-formed codes are checked; anything else falls back to the
    // expected code.
    let well_formed = requested.len() == 7 && requested.starts_with("SW-");
    let traced = if well_formed { requested } else { expected };

    if traced != expected {
        return WorkResult::env_failure(format!(
            "Error code {} not found in current logs.\nHint: Check the error code from AnalyzeLog output.",
            traced
        ));
    }

    state.traced_error = Some(traced.to_string());
    let hint = problem_code.fix_hint();
    let recommended = problem_code.correct_value();
    WorkResult::env_success(format!(
        "=== Error Trace: {} ===\n\n\
         Description: {}\n\n\
         Root Cause:\n\
         Configuration key: swarm.{}\n\
         Current value: {}\n\
         Expected format: {}\n\n\
         Recommended Fix:\n\
         ApplyFix(key=\"{}\", value=\"{}\")",
        traced,
        problem_code.description(),
        hint,
        self.problem.current_value,
        recommended,
        hint,
        recommended
    ))
}
/// Handles the `ApplyFix` action.
///
/// Fails unless an error has been traced first, when no `key` is given, or
/// when the key does not resolve to the problem's configuration key. A key
/// without `.` or `_` is replaced by the expected key, and a missing
/// `swarm.` prefix is added before comparing.
///
/// A non-empty `value` is accepted when both it and the recommended value
/// parse as numbers and it is at least the recommended value (or within
/// 0.001 of it); otherwise when either string contains the other. A rejected
/// value yields a non-final success with a warning. An accepted or empty
/// value records the worker as completed and returns a final success; for an
/// empty value the recommended value is reported.
///
/// This method only reports the outcome; `self.config` is not modified.
fn handle_apply_fix(&self, worker_id: WorkerId, action: &Action) -> WorkResult {
    let resolver = ParamResolver::new(action);
    let requested_key = resolver.get("key").unwrap_or("");
    let value = resolver.arg("value").unwrap_or("");

    let mut state = self.state.write().unwrap();
    if state.traced_error.is_none() {
        return WorkResult::env_failure(
            "Cannot apply fix without tracing the error first. Run TraceError first.",
        );
    }
    if requested_key.is_empty() {
        return WorkResult::env_failure(
            "ApplyFix requires a configuration key. Usage: ApplyFix(key=\"...\", value=\"...\")",
        );
    }

    let hint = self.problem.error_code.fix_hint();
    let recommended = self.problem.error_code.correct_value();

    // Input that does not look like a config key is replaced by the expected key.
    let looks_like_key = requested_key.contains(|c: char| c == '.' || c == '_');
    let key = if looks_like_key { requested_key } else { hint };
    let full_key = if key.starts_with("swarm.") {
        key.to_string()
    } else {
        format!("swarm.{}", key)
    };
    if full_key != format!("swarm.{}", hint) {
        return WorkResult::env_failure(format!(
            "Incorrect configuration key: {}\nHint: The issue is in swarm.{}",
            key, hint
        ));
    }

    if !value.is_empty() {
        let accepted = match (value.parse::<f64>(), recommended.parse::<f64>()) {
            (Ok(given), Ok(target)) => (given - target).abs() < 0.001 || given >= target,
            _ => value.contains(recommended) || recommended.contains(value),
        };
        if !accepted {
            return WorkResult::env_success(format!(
                "=== Configuration Updated ===\n\
                 Key: {}\n\
                 Value: {}\n\n\
                 [!] Warning: Value may not fully resolve the issue.\n\
                 Recommended value: {}",
                full_key, value, recommended
            ));
        }
    }

    if !state.completed.contains(&worker_id) {
        state.completed.push(worker_id);
    }
    let applied = if value.is_empty() { recommended } else { value };
    WorkResult::done_success(format!(
        "=== Configuration Fixed ===\n\
         Key: {}\n\
         Value: {}\n\n\
         [OK] Error {} resolved.\n\
         SwarmEngine is now running normally.",
        full_key,
        applied,
        self.problem.error_code.code()
    ))
}
}
impl Environment for InternalDiagnosisEnvironment {
fn step(&self, worker_id: WorkerId, action: &Action) -> WorkResult {
match action.name.to_lowercase().as_str() {
"parseconfig" | "parse_config" | "config" => {
self.handle_parse_config(worker_id, action)
}
"analyzelog" | "analyze_log" | "logs" | "log" => {
self.handle_analyze_log(worker_id, action)
}
"traceerror" | "trace_error" | "trace" => self.handle_trace_error(worker_id, action),
"applyfix" | "apply_fix" | "fix" => self.handle_apply_fix(worker_id, action),
"continue" => WorkResult::env_success("Continuing..."),
_ => WorkResult::unsupported(&action.name),
}
}
fn reset(&self) {
let mut state = self.state.write().unwrap();
state.parsed_config = false;
state.analyzed_logs = false;
state.traced_error = None;
state.completed.clear();
}
fn name(&self) -> &str {
"InternalDiagnosisEnvironment"
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use swarm_engine_core::types::ActionParams;

    /// Worker identity shared by every test.
    const WORKER: WorkerId = WorkerId(0);

    /// True when the result reports success, whether final or not.
    fn is_success(result: &WorkResult) -> bool {
        match result {
            WorkResult::Done { success, .. } => *success,
            WorkResult::Acted { action_result, .. } => action_result.success,
            _ => false,
        }
    }

    /// True when the result is final.
    fn is_done(result: &WorkResult) -> bool {
        matches!(result, WorkResult::Done { .. })
    }

    /// Shared constructor behind `action` and `action_with_args`.
    fn build_action(name: &str, target: Option<&str>, args: HashMap<String, String>) -> Action {
        Action {
            name: name.into(),
            params: ActionParams {
                target: target.map(Into::into),
                args,
                data: vec![],
            },
        }
    }

    /// Action with an optional target and no arguments.
    fn action(name: &str, target: Option<&str>) -> Action {
        build_action(name, target, HashMap::new())
    }

    /// Action with arguments and no target.
    fn action_with_args(name: &str, args: HashMap<String, String>) -> Action {
        build_action(name, None, args)
    }

    /// Argument map for an `ApplyFix` call.
    fn fix_args(key: &str, value: &str) -> HashMap<String, String> {
        let mut args = HashMap::new();
        args.insert("key".to_string(), key.to_string());
        args.insert("value".to_string(), value.to_string());
        args
    }

    #[test]
    fn test_parse_config_all() {
        let env = InternalDiagnosisEnvironment::routing_error_scenario();
        let outcome = env.step(WORKER, &action("ParseConfig", None));
        assert!(is_success(&outcome));
    }

    #[test]
    fn test_parse_config_section() {
        let env = InternalDiagnosisEnvironment::routing_error_scenario();
        let outcome = env.step(WORKER, &action("ParseConfig", Some("routing")));
        assert!(is_success(&outcome));
    }

    #[test]
    fn test_analyze_log_requires_parse() {
        let env = InternalDiagnosisEnvironment::routing_error_scenario();
        let outcome = env.step(WORKER, &action("AnalyzeLog", None));
        assert!(!is_success(&outcome));
    }

    #[test]
    fn test_trace_error_requires_analyze() {
        let env = InternalDiagnosisEnvironment::routing_error_scenario();
        env.step(WORKER, &action("ParseConfig", None));
        let outcome = env.step(WORKER, &action("TraceError", Some("SW-1001")));
        assert!(!is_success(&outcome));
    }

    #[test]
    fn test_full_diagnosis_flow() {
        let env = InternalDiagnosisEnvironment::routing_error_scenario();

        // Every step before the fix must succeed without finishing.
        let investigation = [
            action("ParseConfig", None),
            action("AnalyzeLog", None),
            action("TraceError", Some("SW-1001")),
        ];
        for step in &investigation {
            let outcome = env.step(WORKER, step);
            assert!(is_success(&outcome));
            assert!(!is_done(&outcome));
        }

        let fix = action_with_args(
            "ApplyFix",
            fix_args("routing.targets", "[\"worker-1\", \"worker-2\"]"),
        );
        let outcome = env.step(WORKER, &fix);
        assert!(is_success(&outcome));
        assert!(is_done(&outcome));
    }

    #[test]
    fn test_wrong_error_code_fails() {
        let env = InternalDiagnosisEnvironment::routing_error_scenario();
        env.step(WORKER, &action("ParseConfig", None));
        env.step(WORKER, &action("AnalyzeLog", None));
        let outcome = env.step(WORKER, &action("TraceError", Some("SW-9999")));
        assert!(!is_success(&outcome));
    }

    #[test]
    fn test_wrong_fix_key_fails() {
        let env = InternalDiagnosisEnvironment::routing_error_scenario();
        env.step(WORKER, &action("ParseConfig", None));
        env.step(WORKER, &action("AnalyzeLog", None));
        env.step(WORKER, &action("TraceError", Some("SW-1001")));

        let fix = action_with_args("ApplyFix", fix_args("wrong.key", "some-value"));
        let outcome = env.step(WORKER, &fix);
        assert!(!is_success(&outcome));
    }

    #[test]
    fn test_worker_pool_scenario() {
        let env = InternalDiagnosisEnvironment::worker_pool_scenario();
        env.step(WORKER, &action("ParseConfig", None));
        env.step(WORKER, &action("AnalyzeLog", None));
        let traced = env.step(WORKER, &action("TraceError", Some("SW-2001")));
        assert!(is_success(&traced));

        let fix = action_with_args("ApplyFix", fix_args("workers.max_count", "16"));
        let outcome = env.step(WORKER, &fix);
        assert!(is_success(&outcome));
        assert!(is_done(&outcome));
    }
}