aof_core/
fleet.rs

//! AgentFleet - Multi-agent coordination and orchestration
//!
//! AgentFleet provides Kubernetes-style configuration for managing groups
//! of agents with different coordination modes:
//! - Hierarchical: Manager agent coordinates workers
//! - Peer: All agents coordinate as equals
//! - Swarm: Dynamic self-organizing coordination
//! - Pipeline: Sequential handoff between agents
//! - Tiered: Tier-based parallel execution with consensus (for multi-model RCA)
//! - Deep: Iterative planning and execution loop with re-planning based on findings
10
11use serde::{Deserialize, Serialize};
12use std::collections::HashMap;
13
14/// AgentFleet configuration (K8s-style)
15#[derive(Debug, Clone, Serialize, Deserialize)]
16pub struct AgentFleet {
17    /// API version (e.g., "aof.dev/v1")
18    #[serde(rename = "apiVersion")]
19    pub api_version: String,
20
21    /// Resource kind (always "AgentFleet")
22    pub kind: String,
23
24    /// Fleet metadata
25    pub metadata: FleetMetadata,
26
27    /// Fleet specification
28    pub spec: FleetSpec,
29}
30
31/// Fleet metadata
32#[derive(Debug, Clone, Serialize, Deserialize)]
33pub struct FleetMetadata {
34    /// Fleet name
35    pub name: String,
36
37    /// Namespace
38    #[serde(default)]
39    pub namespace: Option<String>,
40
41    /// Labels for filtering
42    #[serde(default)]
43    pub labels: HashMap<String, String>,
44
45    /// Annotations for metadata
46    #[serde(default)]
47    pub annotations: HashMap<String, String>,
48}
49
50/// Fleet specification
51#[derive(Debug, Clone, Serialize, Deserialize)]
52pub struct FleetSpec {
53    /// Agent definitions in the fleet
54    pub agents: Vec<FleetAgent>,
55
56    /// Coordination configuration
57    #[serde(default)]
58    pub coordination: CoordinationConfig,
59
60    /// Shared resources across agents
61    #[serde(default)]
62    pub shared: Option<SharedResources>,
63
64    /// Communication patterns
65    #[serde(default)]
66    pub communication: Option<CommunicationConfig>,
67
68    /// Scaling configuration
69    #[serde(default)]
70    pub scaling: Option<ScalingConfig>,
71}
72
73/// Agent definition within a fleet
74#[derive(Debug, Clone, Serialize, Deserialize)]
75pub struct FleetAgent {
76    /// Agent name (unique within fleet)
77    pub name: String,
78
79    /// Path to agent configuration file
80    #[serde(default)]
81    pub config: Option<String>,
82
83    /// Inline agent configuration
84    #[serde(default)]
85    pub spec: Option<FleetAgentSpec>,
86
87    /// Number of agent replicas
88    #[serde(default = "default_replicas")]
89    pub replicas: u32,
90
91    /// Role in the fleet
92    #[serde(default)]
93    pub role: AgentRole,
94
95    /// Agent-specific labels
96    #[serde(default)]
97    pub labels: HashMap<String, String>,
98
99    /// Tier number for tiered coordination mode (1 = first tier, 2 = second, etc.)
100    /// Tier 1 agents run first, their results feed into tier 2, etc.
101    /// If not specified, defaults to 1.
102    #[serde(default)]
103    pub tier: Option<u32>,
104
105    /// Weight for weighted consensus voting (default: 1.0)
106    #[serde(default)]
107    pub weight: Option<f32>,
108}
109
/// Replica count used when an agent entry does not specify `replicas`.
fn default_replicas() -> u32 {
    const DEFAULT_REPLICAS: u32 = 1;
    DEFAULT_REPLICAS
}
113
114/// Inline agent specification (simplified)
115#[derive(Debug, Clone, Serialize, Deserialize)]
116pub struct FleetAgentSpec {
117    /// Model to use
118    pub model: String,
119
120    /// Agent instructions/system prompt
121    #[serde(default)]
122    pub instructions: Option<String>,
123
124    /// Tools available to this agent
125    /// Supports both simple strings and qualified specs
126    #[serde(default)]
127    pub tools: Vec<crate::agent::ToolSpec>,
128
129    /// MCP servers for this agent
130    #[serde(default)]
131    pub mcp_servers: Vec<crate::McpServerConfig>,
132
133    /// Maximum iterations
134    #[serde(default)]
135    pub max_iterations: Option<u32>,
136
137    /// Temperature for model
138    #[serde(default)]
139    pub temperature: Option<f32>,
140}
141
142/// Agent role within the fleet
143#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default, PartialEq, Eq)]
144#[serde(rename_all = "lowercase")]
145pub enum AgentRole {
146    /// Regular worker agent
147    #[default]
148    Worker,
149    /// Manager/coordinator agent
150    Manager,
151    /// Specialist agent for specific tasks
152    Specialist,
153    /// Validator/reviewer agent
154    Validator,
155}
156
157/// Coordination configuration
158#[derive(Debug, Clone, Serialize, Deserialize)]
159pub struct CoordinationConfig {
160    /// Coordination mode
161    #[serde(default)]
162    pub mode: CoordinationMode,
163
164    /// Manager agent name (for hierarchical mode)
165    #[serde(default)]
166    pub manager: Option<String>,
167
168    /// Task distribution strategy
169    #[serde(default)]
170    pub distribution: TaskDistribution,
171
172    /// Consensus configuration (for peer/tiered mode)
173    #[serde(default)]
174    pub consensus: Option<ConsensusConfig>,
175
176    /// Result aggregation strategy (for peer mode with complementary specialists)
177    /// When set, peer mode will collect and merge ALL agent results instead of using consensus.
178    /// Use this when agents provide complementary information (e.g., security + quality reviews)
179    /// rather than competing perspectives.
180    #[serde(default)]
181    pub aggregation: Option<FinalAggregation>,
182
183    /// Tiered execution configuration (for tiered mode)
184    #[serde(default)]
185    pub tiered: Option<TieredConfig>,
186
187    /// Deep execution configuration (for deep mode)
188    #[serde(default)]
189    pub deep: Option<DeepConfig>,
190}
191
192/// Configuration for tiered coordination mode
193#[derive(Debug, Clone, Serialize, Deserialize, Default)]
194pub struct TieredConfig {
195    /// Per-tier consensus configuration
196    /// Key: tier number (as string), Value: consensus config for that tier
197    #[serde(default)]
198    pub tier_consensus: HashMap<String, ConsensusConfig>,
199
200    /// Whether to pass all tier results to next tier or just the consensus result
201    #[serde(default)]
202    pub pass_all_results: bool,
203
204    /// Final tier aggregation strategy
205    #[serde(default)]
206    pub final_aggregation: FinalAggregation,
207}
208
209/// Configuration for deep coordination mode (iterative planning + execution)
210///
211/// # Example
212/// ```yaml
213/// coordination:
214///   mode: deep
215///   deep:
216///     max_iterations: 10
217///     planning: true
218///     memory: true
219/// ```
220#[derive(Debug, Clone, Serialize, Deserialize)]
221pub struct DeepConfig {
222    /// Maximum iterations before stopping (safety limit)
223    #[serde(default = "default_max_iterations")]
224    pub max_iterations: u32,
225
226    /// Enable planning phase (LLM generates investigation steps)
227    #[serde(default = "default_true")]
228    pub planning: bool,
229
230    /// Enable memory persistence across iterations
231    #[serde(default = "default_true")]
232    pub memory: bool,
233
234    /// Model to use for planning (defaults to fleet's model)
235    #[serde(default)]
236    pub planner_model: Option<String>,
237
238    /// System prompt for the planning phase
239    #[serde(default)]
240    pub planner_prompt: Option<String>,
241
242    /// System prompt for the synthesis phase
243    #[serde(default)]
244    pub synthesizer_prompt: Option<String>,
245}
246
/// Iteration limit used when `DeepConfig::max_iterations` is not specified.
fn default_max_iterations() -> u32 {
    const DEFAULT_MAX_ITERATIONS: u32 = 10;
    DEFAULT_MAX_ITERATIONS
}
250
/// Serde default helper for boolean flags that are enabled unless turned off.
fn default_true() -> bool {
    const ENABLED: bool = true;
    ENABLED
}
254
255impl Default for DeepConfig {
256    fn default() -> Self {
257        Self {
258            max_iterations: default_max_iterations(),
259            planning: default_true(),
260            memory: default_true(),
261            planner_model: None,
262            planner_prompt: None,
263            synthesizer_prompt: None,
264        }
265    }
266}
267
268/// How to aggregate results from the final tier
269#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default, PartialEq, Eq)]
270#[serde(rename_all = "snake_case")]
271pub enum FinalAggregation {
272    /// Use consensus algorithm to pick best result
273    #[default]
274    Consensus,
275    /// Merge all results into combined output
276    Merge,
277    /// Use manager agent to synthesize final result
278    ManagerSynthesis,
279}
280
281impl Default for CoordinationConfig {
282    fn default() -> Self {
283        Self {
284            mode: CoordinationMode::Peer,
285            manager: None,
286            distribution: TaskDistribution::RoundRobin,
287            consensus: None,
288            aggregation: None,
289            tiered: None,
290            deep: None,
291        }
292    }
293}
294
295/// Coordination mode for the fleet
296///
297/// Choose the mode that best fits your use case:
298/// - **Peer**: All agents work on the same task in parallel. Results can be combined via
299///   consensus (competing perspectives) or aggregation (complementary specialists).
300///   Best for code review, multi-expert analysis, RCA.
301/// - **Hierarchical**: Manager delegates to workers, aggregates results. Best for
302///   complex multi-step tasks with oversight.
303/// - **Pipeline**: Sequential handoff between agents. Best for workflows where
304///   each stage transforms data for the next.
305/// - **Swarm**: Self-organizing dynamic coordination. Best for large-scale
306///   parallel processing with adaptive load balancing.
307/// - **Tiered**: Tier-based parallel execution with consensus. Best for multi-model
308///   RCA where cheap data-collectors feed reasoning models.
309/// - **Deep**: Iterative planning and execution loop. Best for complex investigations
310///   that require multi-step reasoning with re-planning based on findings.
311#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default, PartialEq, Eq)]
312#[serde(rename_all = "lowercase")]
313pub enum CoordinationMode {
314    /// Hierarchical: Manager coordinates workers
315    Hierarchical,
316    /// Peer-to-peer: Agents coordinate as equals
317    #[default]
318    Peer,
319    /// Swarm: Self-organizing dynamic coordination
320    Swarm,
321    /// Pipeline: Sequential handoff between agents
322    Pipeline,
323    /// Tiered: Tier-based parallel execution with consensus
324    /// Agents are grouped by tier (e.g., tier 1 = data collectors, tier 2 = reasoners)
325    /// Each tier runs in parallel, results flow to next tier
326    Tiered,
327    /// Deep: Iterative planning and execution loop (like Claude Code)
328    /// Plans investigation steps, executes iteratively, re-plans based on findings
329    Deep,
330}
331
332/// Task distribution strategy
333#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default, PartialEq, Eq)]
334#[serde(rename_all = "kebab-case")]
335pub enum TaskDistribution {
336    /// Round-robin distribution
337    #[default]
338    RoundRobin,
339    /// Least-loaded agent gets the task
340    LeastLoaded,
341    /// Random distribution
342    Random,
343    /// Skill-based routing (agents with matching skills)
344    SkillBased,
345    /// Sticky routing (same task types go to same agent)
346    Sticky,
347}
348
349/// Consensus configuration for peer/tiered coordination
350///
351/// # Example
352/// ```yaml
353/// consensus:
354///   algorithm: weighted
355///   min_votes: 2
356///   timeout_ms: 60000
357///   allow_partial: true
358///   weights:
359///     senior-reviewer: 2.0
360///     junior-reviewer: 1.0
361///   min_confidence: 0.7
362/// ```
363#[derive(Debug, Clone, Serialize, Deserialize)]
364pub struct ConsensusConfig {
365    /// Consensus algorithm
366    #[serde(default)]
367    pub algorithm: ConsensusAlgorithm,
368
369    /// Minimum votes required for consensus
370    #[serde(default)]
371    pub min_votes: Option<u32>,
372
373    /// Timeout for reaching consensus (milliseconds)
374    #[serde(default)]
375    pub timeout_ms: Option<u64>,
376
377    /// Allow partial consensus if timeout reached
378    #[serde(default)]
379    pub allow_partial: bool,
380
381    /// Per-agent weights for weighted voting
382    /// Key: agent name, Value: weight (default 1.0)
383    #[serde(default)]
384    pub weights: HashMap<String, f32>,
385
386    /// Minimum confidence threshold (0.0-1.0)
387    /// Below this threshold, result is flagged for human review
388    #[serde(default)]
389    pub min_confidence: Option<f32>,
390}
391
392/// Consensus algorithm type
393///
394/// Choose based on your requirements:
395/// - **Majority**: >50% agreement wins. Fast, tolerates outliers.
396/// - **Unanimous**: 100% agreement required. High confidence, may timeout.
397/// - **Weighted**: Per-agent weights (senior reviewers count more). Balanced expertise.
398/// - **FirstWins**: First response wins. Fastest, no consensus overhead.
399/// - **HumanReview**: Flags for human operator decision. High-stakes scenarios.
400#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default, PartialEq, Eq)]
401#[serde(rename_all = "snake_case")]
402pub enum ConsensusAlgorithm {
403    /// Simple majority voting (>50% agreement)
404    #[default]
405    Majority,
406    /// Unanimous agreement required (100%)
407    Unanimous,
408    /// Weighted voting based on agent weights
409    Weighted,
410    /// First response wins (no consensus)
411    FirstWins,
412    /// Flags result for human operator review
413    HumanReview,
414}
415
416/// Shared resources configuration
417#[derive(Debug, Clone, Serialize, Deserialize, Default)]
418pub struct SharedResources {
419    /// Shared memory configuration
420    #[serde(default)]
421    pub memory: Option<SharedMemoryConfig>,
422
423    /// Shared tools available to all agents
424    #[serde(default)]
425    pub tools: Vec<SharedToolConfig>,
426
427    /// Shared knowledge base
428    #[serde(default)]
429    pub knowledge: Option<SharedKnowledgeConfig>,
430}
431
432/// Shared memory configuration
433#[derive(Debug, Clone, Serialize, Deserialize)]
434pub struct SharedMemoryConfig {
435    /// Memory backend type
436    #[serde(rename = "type")]
437    pub memory_type: SharedMemoryType,
438
439    /// Connection URL
440    #[serde(default)]
441    pub url: Option<String>,
442
443    /// Memory namespace/prefix
444    #[serde(default)]
445    pub namespace: Option<String>,
446
447    /// TTL for memory entries (seconds)
448    #[serde(default)]
449    pub ttl: Option<u64>,
450}
451
452/// Shared memory backend type
453#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default, PartialEq, Eq)]
454#[serde(rename_all = "lowercase")]
455pub enum SharedMemoryType {
456    /// In-memory (single process)
457    #[default]
458    InMemory,
459    /// Redis backend
460    Redis,
461    /// SQLite backend
462    Sqlite,
463    /// PostgreSQL backend
464    Postgres,
465}
466
467/// Shared tool configuration
468#[derive(Debug, Clone, Serialize, Deserialize)]
469pub struct SharedToolConfig {
470    /// MCP server reference
471    #[serde(rename = "mcp-server")]
472    pub mcp_server: Option<String>,
473
474    /// Built-in tool name
475    pub tool: Option<String>,
476
477    /// Tool configuration
478    #[serde(default)]
479    pub config: HashMap<String, serde_json::Value>,
480}
481
482/// Shared knowledge configuration
483#[derive(Debug, Clone, Serialize, Deserialize)]
484pub struct SharedKnowledgeConfig {
485    /// Knowledge base type
486    #[serde(rename = "type")]
487    pub kb_type: String,
488
489    /// Source path or URL
490    pub source: String,
491
492    /// Additional configuration
493    #[serde(default)]
494    pub config: HashMap<String, serde_json::Value>,
495}
496
497/// Communication configuration
498#[derive(Debug, Clone, Serialize, Deserialize)]
499pub struct CommunicationConfig {
500    /// Message passing pattern
501    #[serde(default)]
502    pub pattern: MessagePattern,
503
504    /// Message queue configuration
505    #[serde(default)]
506    pub queue: Option<QueueConfig>,
507
508    /// Broadcast configuration
509    #[serde(default)]
510    pub broadcast: Option<BroadcastConfig>,
511}
512
513/// Message passing pattern
514#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default, PartialEq, Eq)]
515#[serde(rename_all = "kebab-case")]
516pub enum MessagePattern {
517    /// Direct point-to-point messaging
518    #[default]
519    Direct,
520    /// Publish-subscribe pattern
521    PubSub,
522    /// Request-reply pattern
523    RequestReply,
524    /// Broadcast to all agents
525    Broadcast,
526}
527
528/// Queue configuration
529#[derive(Debug, Clone, Serialize, Deserialize)]
530pub struct QueueConfig {
531    /// Queue type
532    #[serde(rename = "type")]
533    pub queue_type: String,
534
535    /// Connection URL
536    pub url: String,
537
538    /// Queue name
539    pub name: String,
540}
541
542/// Broadcast configuration
543#[derive(Debug, Clone, Serialize, Deserialize)]
544pub struct BroadcastConfig {
545    /// Broadcast channel name
546    pub channel: String,
547
548    /// Include sender in broadcast
549    #[serde(default)]
550    pub include_sender: bool,
551}
552
553/// Scaling configuration
554#[derive(Debug, Clone, Serialize, Deserialize)]
555pub struct ScalingConfig {
556    /// Minimum replicas
557    #[serde(default)]
558    pub min_replicas: Option<u32>,
559
560    /// Maximum replicas
561    #[serde(default)]
562    pub max_replicas: Option<u32>,
563
564    /// Auto-scaling enabled
565    #[serde(default)]
566    pub auto_scale: bool,
567
568    /// Scaling metrics
569    #[serde(default)]
570    pub metrics: Vec<ScalingMetric>,
571}
572
573/// Scaling metric
574#[derive(Debug, Clone, Serialize, Deserialize)]
575pub struct ScalingMetric {
576    /// Metric name
577    pub name: String,
578
579    /// Target value
580    pub target: f64,
581
582    /// Metric type
583    #[serde(rename = "type")]
584    pub metric_type: String,
585}
586
587/// Fleet runtime state
588#[derive(Debug, Clone, Serialize, Deserialize)]
589pub struct FleetState {
590    /// Fleet name
591    pub fleet_name: String,
592
593    /// Fleet status
594    pub status: FleetStatus,
595
596    /// Agent instances
597    pub agents: HashMap<String, AgentInstanceState>,
598
599    /// Active tasks
600    pub active_tasks: Vec<FleetTask>,
601
602    /// Completed tasks
603    pub completed_tasks: Vec<FleetTask>,
604
605    /// Fleet metrics
606    pub metrics: FleetMetrics,
607
608    /// Start time
609    pub started_at: Option<chrono::DateTime<chrono::Utc>>,
610}
611
612impl FleetState {
613    /// Create new fleet state
614    pub fn new(fleet_name: &str) -> Self {
615        Self {
616            fleet_name: fleet_name.to_string(),
617            status: FleetStatus::Initializing,
618            agents: HashMap::new(),
619            active_tasks: Vec::new(),
620            completed_tasks: Vec::new(),
621            metrics: FleetMetrics::default(),
622            started_at: None,
623        }
624    }
625}
626
627/// Fleet status
628#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default, PartialEq, Eq)]
629#[serde(rename_all = "lowercase")]
630pub enum FleetStatus {
631    /// Fleet is initializing
632    #[default]
633    Initializing,
634    /// Fleet is ready and idle
635    Ready,
636    /// Fleet is actively processing tasks
637    Active,
638    /// Fleet is paused
639    Paused,
640    /// Fleet has failed
641    Failed,
642    /// Fleet is shutting down
643    ShuttingDown,
644}
645
646/// Agent instance state
647#[derive(Debug, Clone, Serialize, Deserialize)]
648pub struct AgentInstanceState {
649    /// Instance ID
650    pub instance_id: String,
651
652    /// Agent name from fleet config
653    pub agent_name: String,
654
655    /// Replica index
656    pub replica_index: u32,
657
658    /// Instance status
659    pub status: AgentInstanceStatus,
660
661    /// Current task (if any)
662    pub current_task: Option<String>,
663
664    /// Tasks processed count
665    pub tasks_processed: u64,
666
667    /// Last activity timestamp
668    pub last_activity: Option<chrono::DateTime<chrono::Utc>>,
669}
670
671/// Agent instance status
672#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default, PartialEq, Eq)]
673#[serde(rename_all = "lowercase")]
674pub enum AgentInstanceStatus {
675    /// Instance is starting
676    #[default]
677    Starting,
678    /// Instance is idle and ready
679    Idle,
680    /// Instance is processing a task
681    Busy,
682    /// Instance has failed
683    Failed,
684    /// Instance is stopped
685    Stopped,
686}
687
688/// Fleet task
689#[derive(Debug, Clone, Serialize, Deserialize)]
690pub struct FleetTask {
691    /// Task ID
692    pub task_id: String,
693
694    /// Task input
695    pub input: serde_json::Value,
696
697    /// Assigned agent instance
698    pub assigned_to: Option<String>,
699
700    /// Task status
701    pub status: FleetTaskStatus,
702
703    /// Task result (if completed)
704    pub result: Option<serde_json::Value>,
705
706    /// Error (if failed)
707    pub error: Option<String>,
708
709    /// Created timestamp
710    pub created_at: chrono::DateTime<chrono::Utc>,
711
712    /// Started timestamp
713    pub started_at: Option<chrono::DateTime<chrono::Utc>>,
714
715    /// Completed timestamp
716    pub completed_at: Option<chrono::DateTime<chrono::Utc>>,
717}
718
719/// Fleet task status
720#[derive(Debug, Clone, Copy, Serialize, Deserialize, Default, PartialEq, Eq)]
721#[serde(rename_all = "lowercase")]
722pub enum FleetTaskStatus {
723    /// Task is pending assignment
724    #[default]
725    Pending,
726    /// Task is assigned to an agent
727    Assigned,
728    /// Task is being processed
729    Running,
730    /// Task completed successfully
731    Completed,
732    /// Task failed
733    Failed,
734    /// Task was cancelled
735    Cancelled,
736}
737
738/// Fleet metrics
739#[derive(Debug, Clone, Serialize, Deserialize, Default)]
740pub struct FleetMetrics {
741    /// Total tasks submitted
742    pub total_tasks: u64,
743
744    /// Completed tasks
745    pub completed_tasks: u64,
746
747    /// Failed tasks
748    pub failed_tasks: u64,
749
750    /// Average task duration (ms)
751    pub avg_task_duration_ms: f64,
752
753    /// Active agent count
754    pub active_agents: u32,
755
756    /// Total agent count
757    pub total_agents: u32,
758
759    /// Messages exchanged between agents
760    pub messages_exchanged: u64,
761
762    /// Consensus rounds (for peer mode)
763    pub consensus_rounds: u64,
764}
765
766impl AgentFleet {
767    /// Load fleet from YAML file
768    pub fn from_yaml(yaml: &str) -> Result<Self, crate::AofError> {
769        serde_yaml::from_str(yaml).map_err(|e| crate::AofError::config(format!("Failed to parse fleet YAML: {}", e)))
770    }
771
772    /// Load fleet from file
773    pub fn from_file(path: &str) -> Result<Self, crate::AofError> {
774        let content = std::fs::read_to_string(path)
775            .map_err(|e| crate::AofError::config(format!("Failed to read fleet file: {}", e)))?;
776        Self::from_yaml(&content)
777    }
778
779    /// Get agent by name
780    pub fn get_agent(&self, name: &str) -> Option<&FleetAgent> {
781        self.spec.agents.iter().find(|a| a.name == name)
782    }
783
784    /// Get all agents with a specific role
785    pub fn get_agents_by_role(&self, role: AgentRole) -> Vec<&FleetAgent> {
786        self.spec.agents.iter().filter(|a| a.role == role).collect()
787    }
788
789    /// Get the manager agent (for hierarchical mode)
790    pub fn get_manager(&self) -> Option<&FleetAgent> {
791        if let Some(ref manager_name) = self.spec.coordination.manager {
792            self.get_agent(manager_name)
793        } else {
794            self.get_agents_by_role(AgentRole::Manager).first().copied()
795        }
796    }
797
798    /// Total replica count across all agents
799    pub fn total_replicas(&self) -> u32 {
800        self.spec.agents.iter().map(|a| a.replicas).sum()
801    }
802
803    /// Validate fleet configuration
804    pub fn validate(&self) -> Result<(), crate::AofError> {
805        // Check for duplicate agent names
806        let mut names = std::collections::HashSet::new();
807        for agent in &self.spec.agents {
808            if !names.insert(&agent.name) {
809                return Err(crate::AofError::config(format!(
810                    "Duplicate agent name in fleet: {}",
811                    agent.name
812                )));
813            }
814        }
815
816        // Validate hierarchical mode has a manager
817        if self.spec.coordination.mode == CoordinationMode::Hierarchical {
818            if self.get_manager().is_none() {
819                return Err(crate::AofError::config(
820                    "Hierarchical mode requires a manager agent".to_string(),
821                ));
822            }
823        }
824
825        // Validate tiered mode has agents with tier assignments
826        if self.spec.coordination.mode == CoordinationMode::Tiered {
827            let tiers = self.get_tiers();
828            if tiers.is_empty() {
829                return Err(crate::AofError::config(
830                    "Tiered mode requires at least one agent with a tier assignment".to_string(),
831                ));
832            }
833            // Ensure we have at least 2 tiers for meaningful tiered execution
834            if tiers.len() < 2 {
835                // This is a warning, not an error - single tier still works
836                tracing::warn!(
837                    "Tiered mode with only one tier ({}) - consider using peer mode instead",
838                    tiers[0]
839                );
840            }
841        }
842
843        // Validate agent configurations
844        for agent in &self.spec.agents {
845            if agent.config.is_none() && agent.spec.is_none() {
846                return Err(crate::AofError::config(format!(
847                    "Agent '{}' must have either 'config' or 'spec' defined",
848                    agent.name
849                )));
850            }
851        }
852
853        Ok(())
854    }
855
856    /// Get unique tiers in the fleet (sorted ascending)
857    pub fn get_tiers(&self) -> Vec<u32> {
858        let mut tiers: Vec<u32> = self
859            .spec
860            .agents
861            .iter()
862            .map(|a| a.tier.unwrap_or(1))
863            .collect::<std::collections::HashSet<_>>()
864            .into_iter()
865            .collect();
866        tiers.sort();
867        tiers
868    }
869
870    /// Get agents for a specific tier
871    pub fn get_agents_by_tier(&self, tier: u32) -> Vec<&FleetAgent> {
872        self.spec
873            .agents
874            .iter()
875            .filter(|a| a.tier.unwrap_or(1) == tier)
876            .collect()
877    }
878
879    /// Get agent weight (for weighted consensus)
880    pub fn get_agent_weight(&self, agent_name: &str) -> f32 {
881        // First check agent-level weight
882        if let Some(agent) = self.get_agent(agent_name) {
883            if let Some(weight) = agent.weight {
884                return weight;
885            }
886        }
887        // Then check consensus config weights
888        if let Some(ref consensus) = self.spec.coordination.consensus {
889            if let Some(weight) = consensus.weights.get(agent_name) {
890                return *weight;
891            }
892        }
893        // Default weight
894        1.0
895    }
896}
897
898#[cfg(test)]
899mod tests {
900    use super::*;
901
902    #[test]
903    fn test_parse_fleet_yaml() {
904        let yaml = r#"
905apiVersion: aof.dev/v1
906kind: AgentFleet
907metadata:
908  name: incident-team
909  labels:
910    team: sre
911spec:
912  agents:
913    - name: detector
914      config: ./agents/detector.yaml
915      replicas: 2
916      role: worker
917    - name: analyzer
918      config: ./agents/analyzer.yaml
919      replicas: 1
920      role: specialist
921    - name: coordinator
922      config: ./agents/coordinator.yaml
923      replicas: 1
924      role: manager
925  coordination:
926    mode: hierarchical
927    manager: coordinator
928    distribution: skill-based
929  shared:
930    memory:
931      type: redis
932      url: redis://localhost:6379
933    tools:
934      - mcp-server: kubectl-ai
935"#;
936
937        let fleet = AgentFleet::from_yaml(yaml).unwrap();
938        assert_eq!(fleet.metadata.name, "incident-team");
939        assert_eq!(fleet.spec.agents.len(), 3);
940        assert_eq!(fleet.spec.coordination.mode, CoordinationMode::Hierarchical);
941        assert_eq!(fleet.total_replicas(), 4);
942
943        let manager = fleet.get_manager().unwrap();
944        assert_eq!(manager.name, "coordinator");
945    }
946
947    #[test]
948    fn test_peer_mode_fleet() {
949        let yaml = r#"
950apiVersion: aof.dev/v1
951kind: AgentFleet
952metadata:
953  name: review-team
954spec:
955  agents:
956    - name: reviewer-1
957      config: ./reviewer.yaml
958    - name: reviewer-2
959      config: ./reviewer.yaml
960    - name: reviewer-3
961      config: ./reviewer.yaml
962  coordination:
963    mode: peer
964    distribution: round-robin
965    consensus:
966      algorithm: majority
967      minVotes: 2
968"#;
969
970        let fleet = AgentFleet::from_yaml(yaml).unwrap();
971        assert_eq!(fleet.spec.coordination.mode, CoordinationMode::Peer);
972        assert!(fleet.spec.coordination.consensus.is_some());
973    }
974
975    #[test]
976    fn test_fleet_validation() {
977        // Valid fleet
978        let yaml = r#"
979apiVersion: aof.dev/v1
980kind: AgentFleet
981metadata:
982  name: test-fleet
983spec:
984  agents:
985    - name: agent-1
986      config: ./agent.yaml
987"#;
988        let fleet = AgentFleet::from_yaml(yaml).unwrap();
989        assert!(fleet.validate().is_ok());
990
991        // Invalid: hierarchical without manager
992        let yaml = r#"
993apiVersion: aof.dev/v1
994kind: AgentFleet
995metadata:
996  name: test-fleet
997spec:
998  agents:
999    - name: agent-1
1000      config: ./agent.yaml
1001  coordination:
1002    mode: hierarchical
1003"#;
1004        let fleet = AgentFleet::from_yaml(yaml).unwrap();
1005        assert!(fleet.validate().is_err());
1006    }
1007
1008    #[test]
1009    fn test_fleet_state() {
1010        let mut state = FleetState::new("test-fleet");
1011        assert_eq!(state.status, FleetStatus::Initializing);
1012
1013        state.status = FleetStatus::Ready;
1014        state.metrics.total_agents = 3;
1015        state.metrics.active_agents = 3;
1016
1017        assert_eq!(state.metrics.total_agents, 3);
1018    }
1019
/// Parses a three-tier manifest and checks the tier helpers, the per-agent
/// weights and validation.
#[test]
fn test_tiered_mode_fleet() {
    let manifest = r#"
apiVersion: aof.dev/v1
kind: AgentFleet
metadata:
  name: rca-team
spec:
  agents:
    # Tier 1: Data collectors (cheap models)
    - name: loki-collector
      config: ./agents/loki.yaml
      tier: 1
    - name: prometheus-collector
      config: ./agents/prometheus.yaml
      tier: 1
    - name: k8s-collector
      config: ./agents/k8s.yaml
      tier: 1
    # Tier 2: Reasoning models
    - name: claude-analyzer
      config: ./agents/claude.yaml
      tier: 2
      weight: 2.0
    - name: gemini-analyzer
      config: ./agents/gemini.yaml
      tier: 2
    # Tier 3: Synthesizer
    - name: rca-coordinator
      config: ./agents/coordinator.yaml
      tier: 3
      role: manager
  coordination:
    mode: tiered
    consensus:
      algorithm: weighted
      min_votes: 2
      min_confidence: 0.7
    tiered:
      pass_all_results: true
      final_aggregation: manager_synthesis
"#;

    let fleet = AgentFleet::from_yaml(manifest).unwrap();
    assert_eq!(fleet.metadata.name, "rca-team");
    assert_eq!(fleet.spec.coordination.mode, CoordinationMode::Tiered);

    // The manifest declares tiers 1, 2 and 3.
    assert_eq!(fleet.get_tiers(), vec![1, 2, 3]);

    // Three agents in tier 1, two in tier 2, one in tier 3.
    for (tier, expected_count) in [(1, 3), (2, 2), (3, 1)] {
        assert_eq!(fleet.get_agents_by_tier(tier).len(), expected_count);
    }

    // `claude-analyzer` sets `weight: 2.0`; `gemini-analyzer` sets none and
    // is expected to report 1.0.
    assert_eq!(fleet.get_agent_weight("claude-analyzer"), 2.0);
    assert_eq!(fleet.get_agent_weight("gemini-analyzer"), 1.0);

    // The complete configuration must validate.
    assert!(fleet.validate().is_ok());
}
1088
/// Each `algorithm` spelling accepted in YAML must parse to the matching
/// `ConsensusAlgorithm` variant.
#[test]
fn test_consensus_algorithms() {
    // (YAML spelling, expected variant)
    let cases = [
        ("majority", ConsensusAlgorithm::Majority),
        ("unanimous", ConsensusAlgorithm::Unanimous),
        ("weighted", ConsensusAlgorithm::Weighted),
        ("first_wins", ConsensusAlgorithm::FirstWins),
        ("human_review", ConsensusAlgorithm::HumanReview),
    ];

    for (spelling, expected) in cases {
        let manifest = format!(r#"
apiVersion: aof.dev/v1
kind: AgentFleet
metadata:
  name: test
spec:
  agents:
    - name: agent-1
      config: ./agent.yaml
  coordination:
    mode: peer
    consensus:
      algorithm: {}
"#, spelling);

        let fleet = AgentFleet::from_yaml(&manifest).unwrap();
        let consensus = fleet.spec.coordination.consensus.as_ref().unwrap();
        assert_eq!(
            consensus.algorithm,
            expected,
            "Failed for algorithm: {}",
            spelling
        );
    }
}
1125
/// A weighted consensus section must expose its algorithm, thresholds and
/// per-agent weight table after parsing.
#[test]
fn test_weighted_consensus_config() {
    let manifest = r#"
apiVersion: aof.dev/v1
kind: AgentFleet
metadata:
  name: weighted-team
spec:
  agents:
    - name: senior-reviewer
      config: ./reviewer.yaml
      weight: 2.0
    - name: junior-reviewer
      config: ./reviewer.yaml
  coordination:
    mode: peer
    consensus:
      algorithm: weighted
      min_votes: 2
      min_confidence: 0.8
      weights:
        senior-reviewer: 2.0
        junior-reviewer: 1.0
"#;

    let fleet = AgentFleet::from_yaml(manifest).unwrap();
    let cfg = fleet.spec.coordination.consensus.as_ref().unwrap();

    assert_eq!(cfg.algorithm, ConsensusAlgorithm::Weighted);
    assert_eq!(cfg.min_votes, Some(2));
    assert_eq!(cfg.min_confidence, Some(0.8));

    // Entries of the `weights` map, exactly as written in the manifest.
    for (agent, expected) in [("senior-reviewer", 2.0), ("junior-reviewer", 1.0)] {
        assert_eq!(cfg.weights.get(agent), Some(&expected));
    }

    // The fleet-level lookup must report the same weight for the senior reviewer.
    assert_eq!(fleet.get_agent_weight("senior-reviewer"), 2.0);
}
1163
/// A `human_review` consensus section must keep its timeout and confidence
/// threshold through parsing.
#[test]
fn test_human_review_consensus() {
    let manifest = r#"
apiVersion: aof.dev/v1
kind: AgentFleet
metadata:
  name: critical-review
spec:
  agents:
    - name: analyzer-1
      config: ./analyzer.yaml
    - name: analyzer-2
      config: ./analyzer.yaml
  coordination:
    mode: peer
    consensus:
      algorithm: human_review
      timeout_ms: 300000
      min_confidence: 0.9
"#;

    let fleet = AgentFleet::from_yaml(manifest).unwrap();
    let cfg = fleet.spec.coordination.consensus.as_ref().unwrap();

    // Each field set in the manifest is read back unchanged.
    assert_eq!(cfg.algorithm, ConsensusAlgorithm::HumanReview);
    assert_eq!(cfg.timeout_ms, Some(300000));
    assert_eq!(cfg.min_confidence, Some(0.9));
}
1192
/// Each `mode` spelling accepted in YAML must parse to the matching
/// `CoordinationMode` variant.
#[test]
fn test_all_coordination_modes() {
    // (YAML spelling, expected variant)
    let cases = [
        ("peer", CoordinationMode::Peer),
        ("hierarchical", CoordinationMode::Hierarchical),
        ("pipeline", CoordinationMode::Pipeline),
        ("swarm", CoordinationMode::Swarm),
        ("tiered", CoordinationMode::Tiered),
        ("deep", CoordinationMode::Deep),
    ];

    for (spelling, expected) in cases {
        // The same manifest (manager role, tiers, `manager` key) is reused
        // for every mode; only the `mode` value is substituted.
        let manifest = format!(r#"
apiVersion: aof.dev/v1
kind: AgentFleet
metadata:
  name: test
spec:
  agents:
    - name: agent-1
      config: ./agent.yaml
      role: manager
      tier: 1
    - name: agent-2
      config: ./agent.yaml
      tier: 2
  coordination:
    mode: {}
    manager: agent-1
"#, spelling);

        let fleet = AgentFleet::from_yaml(&manifest).unwrap();
        let parsed_mode = fleet.spec.coordination.mode;
        assert_eq!(parsed_mode, expected, "Failed for mode: {}", spelling);
    }
}
1233
/// The `tiered` section must expose its flags, aggregation mode and the
/// per-tier consensus overrides after parsing.
#[test]
fn test_tiered_config() {
    let manifest = r#"
apiVersion: aof.dev/v1
kind: AgentFleet
metadata:
  name: tiered-team
spec:
  agents:
    - name: collector
      config: ./collector.yaml
      tier: 1
    - name: reasoner
      config: ./reasoner.yaml
      tier: 2
  coordination:
    mode: tiered
    tiered:
      pass_all_results: true
      final_aggregation: merge
      tier_consensus:
        "1":
          algorithm: first_wins
        "2":
          algorithm: majority
          min_votes: 1
"#;

    let fleet = AgentFleet::from_yaml(manifest).unwrap();
    let tiered_cfg = fleet.spec.coordination.tiered.as_ref().unwrap();

    assert!(tiered_cfg.pass_all_results);
    assert_eq!(tiered_cfg.final_aggregation, FinalAggregation::Merge);

    // Overrides are looked up by the quoted tier keys used in the manifest.
    let per_tier = [
        ("1", ConsensusAlgorithm::FirstWins),
        ("2", ConsensusAlgorithm::Majority),
    ];
    for (tier_key, expected) in per_tier {
        let tier_cfg = tiered_cfg.tier_consensus.get(tier_key).unwrap();
        assert_eq!(tier_cfg.algorithm, expected);
    }
}
1274
/// Each `final_aggregation` spelling accepted in YAML must parse to the
/// matching `FinalAggregation` variant.
#[test]
fn test_final_aggregation_modes() {
    // (YAML spelling, expected variant)
    let cases = [
        ("consensus", FinalAggregation::Consensus),
        ("merge", FinalAggregation::Merge),
        ("manager_synthesis", FinalAggregation::ManagerSynthesis),
    ];

    for (spelling, expected) in cases {
        let manifest = format!(r#"
apiVersion: aof.dev/v1
kind: AgentFleet
metadata:
  name: test
spec:
  agents:
    - name: agent-1
      config: ./agent.yaml
      tier: 1
    - name: agent-2
      config: ./agent.yaml
      tier: 2
  coordination:
    mode: tiered
    tiered:
      final_aggregation: {}
"#, spelling);

        let fleet = AgentFleet::from_yaml(&manifest).unwrap();
        let tiered_cfg = fleet.spec.coordination.tiered.as_ref().unwrap();
        assert_eq!(
            tiered_cfg.final_aggregation,
            expected,
            "Failed for aggregation: {}",
            spelling
        );
    }
}
1313
/// Regression guard: a minimal manifest with no `coordination` section must
/// still parse, report `Peer` mode and validate.
#[test]
fn test_existing_simple_fleet_unchanged() {
    let manifest = r#"
apiVersion: aof.dev/v1
kind: AgentFleet
metadata:
  name: simple-fleet
spec:
  agents:
    - name: worker
      config: ./worker.yaml
"#;

    let fleet = AgentFleet::from_yaml(manifest).unwrap();

    // No mode is given in the manifest, so this is the defaulted value.
    let mode = fleet.spec.coordination.mode;
    assert_eq!(mode, CoordinationMode::Peer);
    assert!(fleet.validate().is_ok());
}
1332
/// Regression guard: a three-stage pipeline manifest must parse as
/// `Pipeline` mode and validate.
#[test]
fn test_pipeline_mode_unchanged() {
    let manifest = r#"
apiVersion: aof.dev/v1
kind: AgentFleet
metadata:
  name: pipeline-fleet
spec:
  agents:
    - name: stage1
      config: ./stage1.yaml
    - name: stage2
      config: ./stage2.yaml
    - name: stage3
      config: ./stage3.yaml
  coordination:
    mode: pipeline
"#;

    let fleet = AgentFleet::from_yaml(manifest).unwrap();

    let mode = fleet.spec.coordination.mode;
    assert_eq!(mode, CoordinationMode::Pipeline);
    assert!(fleet.validate().is_ok());
}
1356
/// A fully specified `deep` section must keep every field through parsing.
#[test]
fn test_deep_mode_config() {
    let manifest = r#"
apiVersion: aof.dev/v1
kind: AgentFleet
metadata:
  name: deep-fleet
spec:
  agents:
    - name: investigator
      spec:
        model: openai:gpt-4
        instructions: "Deep investigator"
        tools: []
  coordination:
    mode: deep
    deep:
      max_iterations: 15
      planning: true
      memory: true
      planner_model: anthropic:claude-sonnet-4
      planner_prompt: "Generate investigation steps."
      synthesizer_prompt: "Synthesize findings into a report."
"#;

    let fleet = AgentFleet::from_yaml(manifest).unwrap();
    assert_eq!(fleet.spec.coordination.mode, CoordinationMode::Deep);

    let deep = fleet.spec.coordination.deep.as_ref().unwrap();

    // Scalar settings, exactly as written in the manifest.
    assert_eq!(deep.max_iterations, 15);
    assert!(deep.planning);
    assert!(deep.memory);

    // Optional settings: the model name is compared, the prompts only need
    // to be present.
    assert_eq!(deep.planner_model.as_deref(), Some("anthropic:claude-sonnet-4"));
    assert!(deep.planner_prompt.is_some());
    assert!(deep.synthesizer_prompt.is_some());
}
1393
/// With `mode: deep` and no `deep` section, the resulting deep settings must
/// carry the documented defaults.
#[test]
fn test_deep_mode_defaults() {
    let manifest = r#"
apiVersion: aof.dev/v1
kind: AgentFleet
metadata:
  name: deep-fleet-defaults
spec:
  agents:
    - name: agent
      config: ./agent.yaml
  coordination:
    mode: deep
"#;

    let fleet = AgentFleet::from_yaml(manifest).unwrap();
    assert_eq!(fleet.spec.coordination.mode, CoordinationMode::Deep);

    // Take the parsed `deep` value or, failing that, its `Default`.
    let deep = fleet.spec.coordination.deep.unwrap_or_default();

    // Expected defaults: 10 iterations, planning and memory both enabled.
    assert_eq!(deep.max_iterations, 10);
    assert!(deep.planning);
    assert!(deep.memory);

    // No planner model or prompt overrides are expected.
    assert!(deep.planner_model.is_none());
    assert!(deep.planner_prompt.is_none());
    assert!(deep.synthesizer_prompt.is_none());
}
1421}