swarm_engine_eval/lib.rs
1//! swarm-engine-eval - Evaluation Framework for SwarmEngine
2//!
3//! Evaluation framework for multi-agent systems.
4//! Builds and executes SwarmApp from TOML scenario definitions for reproducible evaluation.
5//!
6//! ## Design Philosophy
7//!
8//! Inspired by Python evaluation frameworks (lm-evaluation-harness, RAGAS, etc.),
9//! providing a comprehensive evaluation foundation for Rust/SwarmEngine:
10//!
11//! 1. **Declarative Scenario Definition**: Describe evaluation conditions and success criteria in TOML
12//! 2. **Reproducibility**: Deterministic evaluation execution through seed management
13//! 3. **Extensibility**: Plugin architecture via Registry pattern
14//! 4. **Statistical Analysis**: Aggregation of N runs, pass@k, confidence intervals
15//!
16//! ## Architecture
17//!
18//! ```text
19//! ┌────────────────────────────────────────────────────────────────┐
20//! │ scenarios/*.toml │
21//! │ ┌────────────┐ ┌─────────────┐ ┌────────────┐ ┌─────────┐ │
22//! │ │ meta │ │ app_config │ │ agents │ │conditions│ │
23//! │ └────────────┘ └─────────────┘ └────────────┘ └─────────┘ │
24//! └────────────────────────────────────────────────────────────────┘
25//! │
26//! ▼
27//! ┌────────────────────────────────────────────────────────────────┐
28//! │ ScenarioRunner │
29//! │ ┌─────────────────┐ ┌──────────────────┐ ┌───────────────┐ │
30//! │ │ SwarmRegistry │ │EnvironmentRegistry│ │ Seed Manager │ │
31//! │ │ (Agent Factory)│ │ (Fixture Factory)│ │(Reproducibility)│
32//! │ └─────────────────┘ └──────────────────┘ └───────────────┘ │
33//! │ │ │
34//! │ ▼ │
35//! │ ┌──────────────────────────────────────────────────────────┐ │
36//! │ │ SwarmApp │ │
37//! │ │ Workers + Manager + Hooks → Orchestrator → Outcome │ │
38//! │ └──────────────────────────────────────────────────────────┘ │
39//! │ │ │
40//! │ ▼ │
41//! │ ┌──────────────────────────────────────────────────────────┐ │
42//! │ │ Condition Evaluation Engine │ │
43//! │ │ success / failure conditions / timeout / milestone │ │
44//! │ └──────────────────────────────────────────────────────────┘ │
45//! └────────────────────────────────────────────────────────────────┘
46//! │
47//! ▼
48//! ┌────────────────────────────────────────────────────────────────┐
49//! │ EvalReport │
50//! │ ConfigSummary + runs[] + AggregatedResults + Assertions │
51//! │ ↓ JSON output │
52//! │ report.json │
53//! └────────────────────────────────────────────────────────────────┘
54//! ```
55//!
56//! ## Features
57//!
58//! - **N-Run Statistical Processing**: Mean, standard deviation, 95% confidence interval
59//! - **pass@k Calculation**: Success probability accounting for non-determinism
60//! - **Tick-Level Metrics**: latency p95/p99, jitter, miss rate
61//! - **Coordination Metrics**: Manager intervention rate, delegation efficiency
62//! - **Condition Evaluation**: success/failure conditions, timeout handling, milestones
63//! - **Fault Injection**: Effect-based declarative fault injection (TODO)
64//!
65//! ## Key Components
66//!
67//! | Module | Description |
68//! |--------|-------------|
69//! | [`runner`] | ScenarioRunner - Integrated evaluation framework |
70//! | [`scenario`] | EvalScenario, Conditions, Milestone definitions |
71//! | [`environment`] | EnvironmentRegistry - Evaluation environment factory |
72//! | [`swarms`] | SwarmRegistry - Agent generation factory |
73//! | [`metrics`] | Task, Coordination, Performance metrics |
74//! | [`aggregator`] | Statistical aggregation (pass@k, confidence intervals) |
75//! | [`reporter`] | EvalReport, JSON output |
76//!
77//! ## Usage Examples
78//!
79//! ### Running Evaluation from TOML Scenario
80//!
81//! ```ignore
82//! use swarm_engine_eval::prelude::*;
83//! use swarm_engine_eval::runner::ScenarioRunner;
84//!
85//! // Load scenario from TOML file
86//! let content = std::fs::read_to_string("scenarios/simple_task.toml")?;
87//! let scenario: EvalScenario = toml::from_str(&content)?;
88//!
89//! // Execute evaluation with ScenarioRunner
90//! let runner = ScenarioRunner::new(scenario, runtime.handle().clone())
91//! .with_runs(5) // Run 5 times
92//! .with_seed(42); // Fix seed for reproducibility
93//!
94//! let report = runner.run()?;
95//!
96//! // Display results
97//! println!("Success rate: {:.1}%", report.aggregated.success_rate * 100.0);
98//! println!("Pass@1: {:.1}%", report.aggregated.pass_at_1 * 100.0);
99//!
100//! // JSON output
101//! report.to_json_file("report.json")?;
102//! ```
103//!
104//! ### Programmatic Scenario Definition
105//!
106//! ```ignore
107//! use swarm_engine_eval::scenario::*;
108//!
109//! let scenario = EvalScenario {
110//! meta: ScenarioMeta {
111//! name: "My Evaluation".to_string(),
112//! id: ScenarioId::new("my:eval:v1"),
113//! // ...
114//! },
115//! app_config: AppConfigTemplate::default(),
116//! agents: AgentsConfig {
117//! workers: vec![WorkerTemplate {
118//! id_pattern: "worker_{i}".to_string(),
119//! count: 4,
120//! role: "counter".to_string(),
121//! config: serde_json::json!({"total_tasks": 10}),
122//! }],
123//! managers: vec![],
124//! },
125//! conditions: EvalConditions {
126//! success: vec![Condition::new("done", "task.completed_count", CompareOp::Gte, 40)],
127//! failure: vec![],
128//! on_timeout: TimeoutBehavior::Fail,
129//! },
130//! // ...
131//! };
132//! ```
133//!
134//! ## Future Extensions
135//!
136//! - [ ] SwarmRegistry: Manager support
137//! - [ ] EnvironmentRegistry: task_queue, shared_workspace, etc.
138//! - [ ] Comparator: Multi-scenario/configuration comparison
139//! - [ ] FaultInjector: Fault injection framework
140
141pub mod aggregator;
142pub mod config;
143pub mod environment;
144pub mod environments;
145pub mod error;
146pub mod metrics;
147pub mod reporter;
148pub mod run;
149pub mod runner;
150pub mod runtime;
151pub mod scenario;
152pub mod validation;
153
154/// Prelude - commonly used types for convenient import
155pub mod prelude {
156 pub use crate::aggregator::Aggregator;
157 pub use crate::config::{AssertionConfig, EvalConfig, EvalSettings, FaultConfig, FaultType};
158 pub use crate::environment::{EnvironmentRegistry, EnvironmentType};
159 pub use crate::error::{EvalError, Result};
160 pub use crate::metrics::{
161 CoordinationMetrics, PerformanceMetrics, RobustnessMetrics, RunMetrics, TaskMetrics,
162 };
163 pub use crate::reporter::{EvalReport, JsonReporter, Reporter};
164 pub use crate::run::{EvalRun, TerminationReason};
165 pub use crate::runner::{EvalRunner, EvalSeed};
166 pub use crate::runtime::RuntimeTaskSpec;
167 pub use crate::scenario::{
168 EvalConditions, EvalScenario, KpiCalculator, KpiScore, Milestone, ScenarioId,
169 ScenarioRegistry,
170 };
171 pub use swarm_engine_core::environment::Environment;
172
173 // Re-export core types
174 pub use swarm_engine_core::prelude::*;
175}
176
177// Re-exports
178pub use error::{EvalError, Result};