// swarm-engine-eval 0.1.6
//
// Evaluation framework for SwarmEngine
// Documentation:
//! swarm-engine-eval - Evaluation Framework for SwarmEngine
//!
//! Evaluation framework for multi-agent systems.
//! Builds and executes SwarmApp from TOML scenario definitions for reproducible evaluation.
//!
//! ## Design Philosophy
//!
//! Inspired by Python evaluation frameworks (lm-evaluation-harness, RAGAS, etc.),
//! providing a comprehensive evaluation foundation for Rust/SwarmEngine:
//!
//! 1. **Declarative Scenario Definition**: Describe evaluation conditions and success criteria in TOML
//! 2. **Reproducibility**: Deterministic evaluation execution through seed management
//! 3. **Extensibility**: Plugin architecture via Registry pattern
//! 4. **Statistical Analysis**: Aggregation of N runs, pass@k, confidence intervals
//!
//! ## Architecture
//!
//! ```text
//! ┌────────────────────────────────────────────────────────────────┐
//! │                    scenarios/*.toml                            │
//! │  ┌────────────┐  ┌─────────────┐  ┌────────────┐  ┌─────────┐ │
//! │  │    meta    │  │  app_config │  │   agents   │  │conditions│ │
//! │  └────────────┘  └─────────────┘  └────────────┘  └─────────┘ │
//! └────────────────────────────────────────────────────────────────┘
//!
//! ┌────────────────────────────────────────────────────────────────┐
//! │                      ScenarioRunner                            │
//! │  ┌─────────────────┐  ┌──────────────────┐  ┌───────────────┐ │
//! │  │  SwarmRegistry  │  │EnvironmentRegistry│  │ Seed Manager │ │
//! │  │  (Agent Factory)│  │ (Fixture Factory)│  │(Reproducibility)│
//! │  └─────────────────┘  └──────────────────┘  └───────────────┘ │
//! │                              │                                 │
//! │                              ▼                                 │
//! │  ┌──────────────────────────────────────────────────────────┐ │
//! │  │                      SwarmApp                            │ │
//! │  │   Workers + Manager + Hooks → Orchestrator → Outcome     │ │
//! │  └──────────────────────────────────────────────────────────┘ │
//! │                              │                                 │
//! │                              ▼                                 │
//! │  ┌──────────────────────────────────────────────────────────┐ │
//! │  │                Condition Evaluation Engine               │ │
//! │  │   success / failure conditions / timeout / milestone     │ │
//! │  └──────────────────────────────────────────────────────────┘ │
//! └────────────────────────────────────────────────────────────────┘
//!
//! ┌────────────────────────────────────────────────────────────────┐
//! │                        EvalReport                              │
//! │   ConfigSummary + runs[] + AggregatedResults + Assertions      │
//! │                      ↓ JSON output                             │
//! │                 report.json                                    │
//! └────────────────────────────────────────────────────────────────┘
//! ```
//!
//! ## Features
//!
//! - **N-Run Statistical Processing**: Mean, standard deviation, 95% confidence interval
//! - **pass@k Calculation**: Success probability accounting for non-determinism
//! - **Tick-Level Metrics**: latency p95/p99, jitter, miss rate
//! - **Coordination Metrics**: Manager intervention rate, delegation efficiency
//! - **Condition Evaluation**: success/failure conditions, timeout handling, milestones
//! - **Fault Injection**: Effect-based declarative fault injection (TODO)
//!
//! ## Key Components
//!
//! | Module | Description |
//! |--------|-------------|
//! | [`runner`] | ScenarioRunner - Integrated evaluation framework |
//! | [`scenario`] | EvalScenario, Conditions, Milestone definitions |
//! | [`environment`] | EnvironmentRegistry - Evaluation environment factory |
//! | [`swarms`] | SwarmRegistry - Agent generation factory |
//! | [`metrics`] | Task, Coordination, Performance metrics |
//! | [`aggregator`] | Statistical aggregation (pass@k, confidence intervals) |
//! | [`reporter`] | EvalReport, JSON output |
//!
//! ## Usage Examples
//!
//! ### Running Evaluation from TOML Scenario
//!
//! ```ignore
//! use swarm_engine_eval::prelude::*;
//! use swarm_engine_eval::runner::ScenarioRunner;
//!
//! // Load scenario from TOML file
//! let content = std::fs::read_to_string("scenarios/simple_task.toml")?;
//! let scenario: EvalScenario = toml::from_str(&content)?;
//!
//! // Execute evaluation with ScenarioRunner
//! let runner = ScenarioRunner::new(scenario, runtime.handle().clone())
//!     .with_runs(5)      // Run 5 times
//!     .with_seed(42);    // Fix seed for reproducibility
//!
//! let report = runner.run()?;
//!
//! // Display results
//! println!("Success rate: {:.1}%", report.aggregated.success_rate * 100.0);
//! println!("Pass@1: {:.1}%", report.aggregated.pass_at_1 * 100.0);
//!
//! // JSON output
//! report.to_json_file("report.json")?;
//! ```
//!
//! ### Programmatic Scenario Definition
//!
//! ```ignore
//! use swarm_engine_eval::scenario::*;
//!
//! let scenario = EvalScenario {
//!     meta: ScenarioMeta {
//!         name: "My Evaluation".to_string(),
//!         id: ScenarioId::new("my:eval:v1"),
//!         // ...
//!     },
//!     app_config: AppConfigTemplate::default(),
//!     agents: AgentsConfig {
//!         workers: vec![WorkerTemplate {
//!             id_pattern: "worker_{i}".to_string(),
//!             count: 4,
//!             role: "counter".to_string(),
//!             config: serde_json::json!({"total_tasks": 10}),
//!         }],
//!         managers: vec![],
//!     },
//!     conditions: EvalConditions {
//!         success: vec![Condition::new("done", "task.completed_count", CompareOp::Gte, 40)],
//!         failure: vec![],
//!         on_timeout: TimeoutBehavior::Fail,
//!     },
//!     // ...
//! };
//! ```
//!
//! ## Future Extensions
//!
//! - [ ] SwarmRegistry: Manager support
//! - [ ] EnvironmentRegistry: task_queue, shared_workspace, etc.
//! - [ ] Comparator: Multi-scenario/configuration comparison
//! - [ ] FaultInjector: Fault injection framework

// Public modules — see the module table in the crate-level docs above for
// a one-line description of each.
pub mod aggregator;
pub mod config;
pub mod environment;
pub mod environments;
pub mod error;
pub mod metrics;
pub mod reporter;
pub mod run;
pub mod runner;
pub mod runtime;
pub mod scenario;
pub mod validation;

/// Prelude - commonly used types for convenient import
pub mod prelude {
    pub use crate::aggregator::Aggregator;
    pub use crate::config::{AssertionConfig, EvalConfig, EvalSettings, FaultConfig, FaultType};
    pub use crate::environment::{EnvironmentRegistry, EnvironmentType};
    pub use crate::error::{EvalError, Result};
    pub use crate::metrics::{
        CoordinationMetrics, PerformanceMetrics, RobustnessMetrics, RunMetrics, TaskMetrics,
    };
    pub use crate::reporter::{EvalReport, JsonReporter, Reporter};
    pub use crate::run::{EvalRun, TerminationReason};
    pub use crate::runner::{EvalRunner, EvalSeed};
    pub use crate::runtime::RuntimeTaskSpec;
    pub use crate::scenario::{
        EvalConditions, EvalScenario, KpiCalculator, KpiScore, Milestone, ScenarioId,
        ScenarioRegistry,
    };
    pub use swarm_engine_core::environment::Environment;

    // Re-export core types
    pub use swarm_engine_core::prelude::*;
}

// Re-exports: make the error types available directly at the crate root
// (`swarm_engine_eval::EvalError` / `swarm_engine_eval::Result`) without
// going through the `error` module or the prelude.
pub use error::{EvalError, Result};