swarm_engine_eval/
runner.rs

1//! EvalRunner - LearnableSwarm を使用した評価実行
2
3use std::time::Duration;
4
5use tokio::runtime::Handle;
6
7use swarm_engine_core::actions::ActionDef;
8use swarm_engine_core::agent::{
9    BatchInvoker, DefaultBatchManagerAgent, GenericWorker, ManagementStrategy, ManagerAgent,
10    ManagerId, WorkerAgent,
11};
12use swarm_engine_core::environment::EnvironmentBox;
13use swarm_engine_core::events::TraceSubscriber;
14use swarm_engine_core::exploration::{LearnedDependencyProvider, NodeRules, OperatorProvider};
15use swarm_engine_core::extensions::Extensions;
16use swarm_engine_core::learn::{
17    profile_to_offline_model, CountTrigger, LearnableSwarmBuilder, LearningStore, OfflineModel,
18    ScenarioProfile, TrainTrigger,
19};
20use swarm_engine_core::orchestrator::SwarmConfig;
21use swarm_engine_core::types::{GroupId, SwarmTask};
22
23use crate::environments::{
24    CodeEnvironment, DeepSearchEnvironment, InternalDiagnosisEnvironment, MazeEnvironment,
25    SearchEnvironment, TroubleshootingEnvironment,
26};
27
28use crate::aggregator::Aggregator;
29use crate::error::Result;
30use crate::metrics::RunMetrics;
31use crate::reporter::{ConfigSummary, EvalReport, SeedInfo};
32use crate::run::{EvalRun, TerminationReason};
33use crate::scenario::conditions::{ConditionValue, TimeoutBehavior};
34use crate::scenario::{EvalScenario, ManagementStrategyConfig};
35use crate::validation::{ScenarioValidator, WarningSeverity};
36
37/// Evaluation seed for reproducibility
38///
39/// Stored in Extensions to allow Environment and other components to access it.
40#[derive(Debug, Clone, Copy)]
41pub struct EvalSeed(pub u64);
42
43/// Factory for creating ManagerAgent
44pub type ManagerFactory = Box<dyn Fn() -> Box<dyn ManagerAgent> + Send + Sync>;
45
46/// Factory for creating BatchInvoker
47pub type BatchInvokerFactory = Box<dyn Fn() -> Box<dyn BatchInvoker> + Send + Sync>;
48
49/// Factory for creating OperatorProvider
50pub type OperatorProviderFactory =
51    Box<dyn Fn() -> Box<dyn OperatorProvider<NodeRules>> + Send + Sync>;
52
53/// Evaluation runner using Orchestrator directly
54///
55/// # Example
56///
57/// ```ignore
58/// let runner = EvalRunner::new(scenario, runtime.handle().clone())
59///     .with_runs(5)
60///     .with_seed(42)
61///     .with_task(SwarmTask::new("Find the auth handler"))
62///     .with_manager_factory(|| Box::new(MyManager::new()))
63///     .with_batch_invoker_factory(|| Box::new(MyInvoker::new()));
64///
65/// let report = runner.run()?;
66/// ```
67pub struct EvalRunner {
68    scenario: EvalScenario,
69    runtime: Handle,
70    runs: usize,
71    seed: u64,
72    /// Task to execute (optional)
73    task: Option<SwarmTask>,
74    /// Manager factory (creates new instance per run)
75    manager_factory: Option<ManagerFactory>,
76    /// BatchInvoker factory (creates new instance per run)
77    batch_invoker_factory: Option<BatchInvokerFactory>,
78    /// Extensions factory (creates new instance per run)
79    extensions_factory: Option<Box<dyn Fn() -> Extensions + Send + Sync>>,
80    /// OperatorProvider factory (creates new instance per run)
81    operator_provider_factory: Option<OperatorProviderFactory>,
82    /// Verbose output (print tick snapshots)
83    verbose: bool,
84    /// Enable ExplorationSpaceV2 tracking
85    enable_exploration: bool,
86    /// Dependency graph for action sequencing
87    dependency_graph: Option<swarm_engine_core::exploration::DependencyGraph>,
88    /// LearningStore for cross-session learning
89    learning_store: Option<LearningStore>,
90    /// TrainTrigger for Learning (default: run after every eval)
91    train_trigger: Option<std::sync::Arc<dyn TrainTrigger>>,
92    /// Skip learned action order from offline model
93    skip_learned_action_order: bool,
94    /// Trace subscriber for ActionEvent output
95    trace_subscriber: Option<std::sync::Arc<dyn TraceSubscriber>>,
96    /// ScenarioProfile for applying learned parameters
97    scenario_profile: Option<ScenarioProfile>,
98    /// Cached OfflineModel from ScenarioProfile (to avoid repeated conversion)
99    offline_model_from_profile: Option<OfflineModel>,
100}
101
102impl EvalRunner {
103    pub fn new(scenario: EvalScenario, runtime: Handle) -> Self {
104        Self {
105            scenario,
106            runtime,
107            runs: 1,
108            seed: 42,
109            task: None,
110            manager_factory: None,
111            batch_invoker_factory: None,
112            extensions_factory: None,
113            operator_provider_factory: None,
114            verbose: false,
115            enable_exploration: false,
116            dependency_graph: None,
117            learning_store: None,
118            train_trigger: None,
119            skip_learned_action_order: false,
120            trace_subscriber: None,
121            scenario_profile: None,
122            offline_model_from_profile: None,
123        }
124    }
125
126    /// Enable verbose output (print tick snapshots)
127    pub fn with_verbose(mut self, verbose: bool) -> Self {
128        self.verbose = verbose;
129        self
130    }
131
132    /// Enable ExplorationSpace tracking
133    pub fn with_exploration(mut self, enable: bool) -> Self {
134        self.enable_exploration = enable;
135        self
136    }
137
138    /// Set dependency graph for action sequencing
139    ///
140    /// The graph defines valid action transitions and terminal conditions.
141    /// When set, actions will be filtered based on the graph structure.
142    pub fn with_dependency_graph(
143        mut self,
144        graph: swarm_engine_core::exploration::DependencyGraph,
145    ) -> Self {
146        self.dependency_graph = Some(graph);
147        self
148    }
149
150    /// Enable LearningStore for cross-session learning
151    ///
152    /// When enabled, statistics will be saved after each run and loaded as prior
153    /// for subsequent runs. This enables incremental learning across sessions.
154    ///
155    /// # Example
156    ///
157    /// ```ignore
158    /// runner.with_learning_store("~/.swarm-engine/learning")
159    /// ```
160    pub fn with_learning_store(mut self, path: impl AsRef<std::path::Path>) -> Self {
161        match LearningStore::new(path) {
162            Ok(store) => {
163                // prior_snapshot, offline_model loading is delegated to LearnableSwarmBuilder
164                self.learning_store = Some(store);
165            }
166            Err(e) => {
167                eprintln!("Warning: Failed to create LearningStore: {}", e);
168            }
169        }
170        self
171    }
172
173    /// Set TrainTrigger for Learning
174    ///
175    /// Controls when offline learning is executed after eval runs.
176    /// Default: Learn after every run (if learning_store is configured).
177    ///
178    /// # Example
179    ///
180    /// ```ignore
181    /// use swarm_engine_core::learn::CountTrigger;
182    ///
183    /// // Run learning after every 5 eval iterations
184    /// runner.with_train_trigger(Arc::new(CountTrigger::new(5)))
185    /// ```
186    pub fn with_train_trigger(mut self, trigger: std::sync::Arc<dyn TrainTrigger>) -> Self {
187        self.train_trigger = Some(trigger);
188        self
189    }
190
191    /// Skip learned action order from offline model
192    ///
193    /// When enabled, the learned dependency graph (action_order) from the offline
194    /// model will not be applied. This is useful for testing without learned priors.
195    pub fn skip_learned_action_order(mut self, skip: bool) -> Self {
196        self.skip_learned_action_order = skip;
197        self
198    }
199
200    /// Set trace subscriber for ActionEvent output
201    ///
202    /// The subscriber will receive all ActionEvents during evaluation.
203    /// Use InMemoryTraceSubscriber to collect events and dump them after evaluation,
204    /// or JsonlTraceSubscriber for real-time output.
205    ///
206    /// # Example
207    ///
208    /// ```ignore
209    /// use std::sync::Arc;
210    /// use swarm_engine_core::events::InMemoryTraceSubscriber;
211    ///
212    /// let trace = Arc::new(InMemoryTraceSubscriber::new());
213    /// runner.with_trace_subscriber(trace.clone());
214    /// // ... run evaluation ...
215    /// trace.dump_to_file("trace.jsonl")?;
216    /// ```
217    pub fn with_trace_subscriber(
218        mut self,
219        subscriber: std::sync::Arc<dyn TraceSubscriber>,
220    ) -> Self {
221        self.trace_subscriber = Some(subscriber);
222        self
223    }
224
225    /// Apply a ScenarioProfile to use learned exploration parameters and strategies
226    ///
227    /// This converts the profile to an OfflineModel and applies it during execution.
228    /// When set, the profile's learned components (exploration params, strategy config,
229    /// action order) will be used instead of defaults.
230    ///
231    /// # Example
232    ///
233    /// ```ignore
234    /// let store = ProfileStore::new("~/.swarm-engine/profiles")?;
235    /// let profile = store.load("troubleshooting")?;
236    /// runner.with_scenario_profile(profile);
237    /// ```
238    pub fn with_scenario_profile(mut self, profile: ScenarioProfile) -> Self {
239        let offline_model = profile_to_offline_model(&profile);
240        self.offline_model_from_profile = Some(offline_model);
241        self.scenario_profile = Some(profile);
242        self
243    }
244
245    pub fn with_runs(mut self, runs: usize) -> Self {
246        self.runs = runs;
247        self
248    }
249
250    pub fn with_seed(mut self, seed: u64) -> Self {
251        self.seed = seed;
252        self
253    }
254
255    /// Set the task to execute
256    pub fn with_task(mut self, task: SwarmTask) -> Self {
257        self.task = Some(task);
258        self
259    }
260
261    /// Set manager factory (creates new Manager for each run)
262    pub fn with_manager_factory<F>(mut self, factory: F) -> Self
263    where
264        F: Fn() -> Box<dyn ManagerAgent> + Send + Sync + 'static,
265    {
266        self.manager_factory = Some(Box::new(factory));
267        self
268    }
269
270    /// Set batch invoker factory (creates new BatchInvoker for each run)
271    pub fn with_batch_invoker_factory<F>(mut self, factory: F) -> Self
272    where
273        F: Fn() -> Box<dyn BatchInvoker> + Send + Sync + 'static,
274    {
275        self.batch_invoker_factory = Some(Box::new(factory));
276        self
277    }
278
279    /// Set extensions factory (creates new Extensions for each run)
280    pub fn with_extensions_factory<F>(mut self, factory: F) -> Self
281    where
282        F: Fn() -> Extensions + Send + Sync + 'static,
283    {
284        self.extensions_factory = Some(Box::new(factory));
285        self
286    }
287
288    /// Set OperatorProvider factory (creates new provider for each run)
289    ///
290    /// Use this to configure the Selection strategy for exploration.
291    /// Default is `AdaptiveProvider` if not specified.
292    ///
293    /// # Example
294    ///
295    /// ```ignore
296    /// use swarm_engine_core::exploration::{HybridLlmProvider, ReviewPolicy};
297    /// use swarm_engine_llm::LlmStrategyAdvisor;
298    ///
299    /// runner.with_operator_provider_factory(|| {
300    ///     let advisor = LlmStrategyAdvisor::new(decider.clone(), handle.clone());
301    ///     let policy = ReviewPolicy::default();
302    ///     Box::new(HybridLlmProvider::new(advisor, policy))
303    /// })
304    /// ```
305    pub fn with_operator_provider_factory<F>(mut self, factory: F) -> Self
306    where
307        F: Fn() -> Box<dyn OperatorProvider<NodeRules>> + Send + Sync + 'static,
308    {
309        self.operator_provider_factory = Some(Box::new(factory));
310        self
311    }
312
313    pub fn run(&self) -> Result<EvalReport> {
314        // Validate scenario before running
315        let warnings = ScenarioValidator::validate_scenario(&self.scenario);
316        for warning in &warnings {
317            match warning.severity() {
318                WarningSeverity::High => {
319                    tracing::warn!(
320                        severity = %warning.severity(),
321                        "Scenario validation: {}",
322                        warning
323                    );
324                }
325                WarningSeverity::Medium => {
326                    tracing::info!(
327                        severity = %warning.severity(),
328                        "Scenario validation: {}",
329                        warning
330                    );
331                }
332                _ => {
333                    tracing::debug!(
334                        severity = %warning.severity(),
335                        "Scenario validation: {}",
336                        warning
337                    );
338                }
339            }
340        }
341
342        let mut eval_runs = Vec::with_capacity(self.runs);
343        let mut run_seeds = Vec::with_capacity(self.runs);
344
345        // Generate GroupId for this eval run (all iterations share the same GroupId)
346        // This enables DPO learning to compare multiple executions under the same conditions
347        let group_id = GroupId::new();
348
349        for i in 0..self.runs {
350            let run_seed = self.seed.wrapping_add(i as u64);
351            run_seeds.push(run_seed);
352
353            let result = self.run_single(i, run_seed, group_id)?;
354            eval_runs.push(result);
355        }
356
357        let aggregated = Aggregator::aggregate(&eval_runs);
358
359        // Note: action_order learning is handled by `learn` command, not eval
360        // If specific behavior is needed, use MockProvider via LearnableSwarmBuilder
361
362        Ok(EvalReport {
363            config_summary: ConfigSummary {
364                scenario_name: self.scenario.meta.name.clone(),
365                scenario_id: self.scenario.meta.id.to_string(),
366                worker_count: self.scenario.agents.workers.iter().map(|w| w.count).sum(),
367                max_ticks: self.scenario.app_config.max_ticks,
368                run_count: self.runs,
369            },
370            seed_info: SeedInfo {
371                base_seed: self.seed,
372                run_seeds,
373            },
374            runs: eval_runs,
375            aggregated,
376            assertion_results: vec![],
377        })
378    }
379
380    /// Run a single evaluation iteration
381    ///
382    /// # Arguments
383    /// * `index` - The iteration index (0-based)
384    /// * `seed` - The random seed for this iteration
385    /// * `group_id` - The group ID shared by all iterations in this eval run.
386    ///   Used for DPO learning to compare multiple executions.
387    fn run_single(&self, index: usize, seed: u64, group_id: GroupId) -> Result<EvalRun> {
388        // ========================================================================
389        // LearnableSwarmBuilder で Swarm を構築
390        // ========================================================================
391        let workers = self.build_workers();
392        let management_strategy = self.build_management_strategy();
393
394        let swarm_config = SwarmConfig {
395            tick_duration: Duration::from_millis(self.scenario.app_config.tick_duration_ms),
396            max_ticks: self.scenario.app_config.max_ticks,
397            management_strategy,
398        };
399
400        // Extensions（LlmConfig, ActionsConfig, EvalSeed 等）
401        let extensions = self.build_extensions_from_scenario(seed);
402
403        // LearnableSwarmBuilder で構築開始
404        let scenario_key = self.scenario.meta.id.learning_key();
405        let mut builder = LearnableSwarmBuilder::new(self.runtime.clone())
406            .scenario(&scenario_key)
407            .swarm_config(swarm_config)
408            .workers(workers)
409            .extensions(extensions)
410            .enable_exploration(
411                self.enable_exploration || self.scenario.app_config.enable_exploration,
412            );
413
414        // Managers: from factory if provided, otherwise from scenario templates
415        if let Some(factory) = &self.manager_factory {
416            let manager = factory();
417            builder = builder.add_manager(Box::new(DynManagerWrapper(manager)));
418        } else {
419            let managers = self.build_managers();
420            for manager in managers {
421                builder = builder.add_manager(Box::new(manager));
422            }
423        }
424
425        // BatchInvoker if factory provided
426        if let Some(factory) = &self.batch_invoker_factory {
427            let invoker = factory();
428            builder = builder.batch_invoker(Box::new(DynBatchInvokerWrapper(invoker)));
429        }
430
431        // OperatorProvider factory
432        if let Some(factory) = &self.operator_provider_factory {
433            let provider = factory();
434            builder = builder.operator_provider(Box::new(DynOperatorProviderWrapper(provider)));
435        }
436
437        // ScenarioProfile の OfflineModel を適用（LearningStore より優先）
438        if let Some(ref model) = self.offline_model_from_profile {
439            builder = builder.offline_model(model.clone());
440
441            // Offline model 適用の通知
442            if self.operator_provider_factory.is_none() {
443                println!(
444                    "Profile offline model applied: ucb1_c={:.3}, maturity={}, strategy={}",
445                    model.parameters.ucb1_c,
446                    model.strategy_config.maturity_threshold,
447                    model.strategy_config.initial_strategy
448                );
449            }
450
451            // Learned action order（DependencyGraph 自動生成をスキップ）
452            if !self.skip_learned_action_order {
453                if let Some(ref action_order) = model.action_order {
454                    let provider = LearnedDependencyProvider::new(action_order.clone());
455                    builder = builder.dependency_provider(Box::new(provider));
456                    println!(
457                        "Learned action order applied: discover={:?}, not_discover={:?}",
458                        action_order.discover, action_order.not_discover
459                    );
460                }
461            } else if model.action_order.is_some() {
462                println!("Learned action order skipped (--no-dep-graph)");
463            }
464        }
465
466        // Learning 設定（LearningStore がある場合）
467        // with_learning_store で prior_snapshot, offline_model, data_dir, learning_enabled を自動設定
468        if let Some(ref store) = self.learning_store {
469            builder = builder.with_learning_store(store.clone());
470
471            // ScenarioProfile が設定されていない場合のみ、LearningStore の OfflineModel を適用
472            if self.offline_model_from_profile.is_none() {
473                // Offline model の情報を取得（println! と action_order 処理用）
474                let offline_model_opt = builder.offline_model_ref().cloned();
475                if let Some(ref model) = offline_model_opt {
476                    // Offline model 適用の通知（OperatorProvider factory がない場合のみ）
477                    if self.operator_provider_factory.is_none() {
478                        println!(
479                            "Offline model applied: ucb1_c={:.3}, maturity={}, strategy={}",
480                            model.parameters.ucb1_c,
481                            model.strategy_config.maturity_threshold,
482                            model.strategy_config.initial_strategy
483                        );
484                    }
485
486                    // Learned action order（DependencyGraph 自動生成をスキップ）
487                    if !self.skip_learned_action_order {
488                        if let Some(ref action_order) = model.action_order {
489                            let provider = LearnedDependencyProvider::new(action_order.clone());
490                            builder = builder.dependency_provider(Box::new(provider));
491                            println!(
492                                "Learned action order applied: discover={:?}, not_discover={:?}",
493                                action_order.discover, action_order.not_discover
494                            );
495                        }
496                    } else if model.action_order.is_some() {
497                        println!("Learned action order skipped (--no-dep-graph)");
498                    }
499                }
500            }
501
502            // Train trigger
503            if let Some(ref trigger) = self.train_trigger {
504                builder = builder.train_trigger(std::sync::Arc::clone(trigger));
505            } else {
506                builder = builder.train_trigger(std::sync::Arc::new(CountTrigger::new(self.runs)));
507            }
508        }
509
510        // Trace subscriber
511        if let Some(ref subscriber) = self.trace_subscriber {
512            builder = builder.with_trace_subscriber(std::sync::Arc::clone(subscriber));
513        }
514
515        // ========================================================================
516        // LearnableSwarm を構築・実行
517        // ========================================================================
518        let mut swarm = builder.build()?;
519
520        // Enable partitioning when multiple managers are configured
521        let manager_count = self
522            .scenario
523            .agents
524            .managers
525            .iter()
526            .map(|t| t.count)
527            .sum::<usize>();
528        if manager_count > 1 {
529            swarm.orchestrator_mut().enable_partitioning();
530        }
531
532        // Determine task
533        let task_to_run = self
534            .task
535            .clone()
536            .or_else(|| self.build_task_from_scenario())
537            .map(|task| task.with_group_id(group_id));
538
539        // Run
540        let result = if let Some(task) = task_to_run {
541            swarm.run_task(task)?
542        } else {
543            swarm.run()
544        };
545
546        // ========================================================================
547        // メトリクス収集
548        // ========================================================================
549        let state = swarm.orchestrator().state();
550        let timed_out = result.total_ticks >= self.scenario.app_config.max_ticks;
551        let environment_done = state.shared.is_environment_done();
552        let total_actions = state.shared.stats.total_visits() as u64;
553        let successful_actions = state.shared.stats.total_successes() as u64;
554        let llm_invocations = state.shared.llm_invocations();
555        let llm_invoke_errors = state.shared.llm_errors();
556
557        let metrics = RunMetrics {
558            task: crate::metrics::TaskMetrics {
559                total_ticks: result.total_ticks,
560                total_tasks: 0,
561                completed_tasks: 0,
562                total_actions,
563                successful_actions,
564                success_rate: state.shared.stats.success_rate(),
565            },
566            coordination: crate::metrics::CoordinationMetrics {
567                manager_activations: llm_invocations,
568                manager_intervention_rate: if result.total_ticks > 0 {
569                    llm_invocations as f64 / result.total_ticks as f64
570                } else {
571                    0.0
572                },
573                ..Default::default()
574            },
575            performance: {
576                let llm_error_rate = if llm_invocations > 0 {
577                    llm_invoke_errors as f64 / llm_invocations as f64
578                } else {
579                    0.0
580                };
581                crate::metrics::PerformanceMetrics {
582                    total_duration_ms: result.total_duration.as_millis() as f64,
583                    avg_tick_latency_ms: if result.total_ticks > 0 {
584                        result.total_duration.as_millis() as f64 / result.total_ticks as f64
585                    } else {
586                        0.0
587                    },
588                    raw_throughput_per_sec: if result.total_duration.as_secs_f64() > 0.0 {
589                        total_actions as f64 / result.total_duration.as_secs_f64()
590                    } else {
591                        0.0
592                    },
593                    effective_throughput_per_sec: if result.total_duration.as_secs_f64() > 0.0 {
594                        successful_actions as f64 / result.total_duration.as_secs_f64()
595                    } else {
596                        0.0
597                    },
598                    llm_invocations,
599                    llm_invoke_errors,
600                    llm_error_rate,
601                    ..Default::default()
602                }
603            },
604            robustness: Default::default(),
605        };
606
607        // Evaluate success/failure conditions
608        let (success, termination_reason) = if !result.completed {
609            (false, TerminationReason::Stopped)
610        } else {
611            self.evaluate_conditions(&metrics, environment_done, timed_out)
612        };
613
614        // Send shutdown signal to LearningDaemon (non-blocking)
615        // NOTE: block_on() inside tokio runtime causes deadlock,
616        //       so we only send shutdown signal and let Drop handle cleanup
617        if swarm.is_learning_enabled() {
618            // Emit stats snapshot before shutdown (required for learning data to be saved)
619            swarm.emit_stats_snapshot();
620
621            // Wait for LearningEventSubscriber to flush the event to LearningDaemon
622            // Default flush interval is 1000ms, but we need to ensure the event is processed
623            std::thread::sleep(std::time::Duration::from_millis(150));
624
625            if let Some(tx) = swarm.take_shutdown_tx() {
626                // Fire-and-forget shutdown signal
627                let _ = tx.try_send(());
628            }
629        }
630
631        Ok(EvalRun::new(
632            index,
633            seed,
634            success,
635            termination_reason,
636            metrics,
637        ))
638    }
639
640    fn build_workers(&self) -> Vec<Box<dyn WorkerAgent>> {
641        let mut workers: Vec<Box<dyn WorkerAgent>> = Vec::new();
642
643        for template in &self.scenario.agents.workers {
644            for i in 0..template.count {
645                let id = workers.len();
646                let name = template.id_pattern.replace("{i}", &i.to_string());
647
648                let worker = GenericWorker::new(id)
649                    .with_name(name)
650                    .with_require_guidance(true);
651
652                workers.push(Box::new(worker));
653            }
654        }
655
656        workers
657    }
658
659    fn build_managers(&self) -> Vec<DefaultBatchManagerAgent> {
660        let mut managers = Vec::new();
661        let mut manager_index = 0;
662
663        for template in &self.scenario.agents.managers {
664            let ids = template.generate_ids();
665            for name in ids {
666                let manager = DefaultBatchManagerAgent::new(ManagerId(manager_index))
667                    .with_name(name)
668                    .with_interval(self.scenario.manager.process_interval_ticks);
669
670                managers.push(manager);
671                manager_index += 1;
672            }
673        }
674
675        // デフォルト: Manager テンプレートがない場合は 1 つ作成
676        if managers.is_empty() {
677            managers.push(
678                DefaultBatchManagerAgent::new(ManagerId(0))
679                    .with_name("default_manager")
680                    .with_interval(self.scenario.manager.process_interval_ticks),
681            );
682        }
683
684        managers
685    }
686
687    fn build_management_strategy(&self) -> ManagementStrategy {
688        match &self.scenario.app_config.management_strategy {
689            ManagementStrategyConfig::EveryTick {} => ManagementStrategy::EveryTick,
690            ManagementStrategyConfig::IntervalBased { max_interval } => {
691                ManagementStrategy::FixedInterval {
692                    interval: *max_interval,
693                }
694            }
695            ManagementStrategyConfig::EventDriven { triggers: _ } => {
696                // Event-driven maps to completion-based
697                ManagementStrategy::CompletionBased { max_wait_ticks: 50 }
698            }
699            ManagementStrategyConfig::Hybrid {
700                max_interval,
701                triggers: _,
702            } => ManagementStrategy::Hybrid {
703                preferred_interval: *max_interval,
704                force_after_ticks: max_interval * 2,
705            },
706            ManagementStrategyConfig::Disabled {} => {
707                // Disabled = very large interval (effectively never)
708                ManagementStrategy::FixedInterval { interval: u64::MAX }
709            }
710        }
711    }
712
713    /// Build SwarmTask from scenario task config
714    ///
715    /// Returns None if task goal is empty
716    fn build_task_from_scenario(&self) -> Option<SwarmTask> {
717        let task_config = &self.scenario.task;
718
719        if task_config.goal.is_empty() {
720            return None;
721        }
722
723        // Build context JSON object
724        let mut context = serde_json::Map::new();
725
726        if let Some(target_path) = &task_config.context.target_path {
727            context.insert(
728                "target_path".to_string(),
729                serde_json::Value::String(target_path.clone()),
730            );
731        }
732        if let Some(working_dir) = &task_config.context.working_dir {
733            context.insert(
734                "working_dir".to_string(),
735                serde_json::Value::String(working_dir.clone()),
736            );
737        }
738        if let Some(max_depth) = task_config.context.max_depth {
739            context.insert(
740                "max_depth".to_string(),
741                serde_json::Value::Number(serde_json::Number::from(max_depth)),
742            );
743        }
744
745        // Add extra context (convert toml::Value to serde_json::Value)
746        for (key, value) in &task_config.context.extra {
747            if let Ok(json_value) = serde_json::to_value(value) {
748                context.insert(key.clone(), json_value);
749            }
750        }
751
752        let task =
753            SwarmTask::new(&task_config.goal).with_context(serde_json::Value::Object(context));
754
755        Some(task)
756    }
757
758    /// Build Extensions with LlmConfig, ActionsConfig, and EvalSeed from scenario
759    fn build_extensions_from_scenario(&self, seed: u64) -> Extensions {
760        let mut extensions = if let Some(factory) = &self.extensions_factory {
761            factory()
762        } else {
763            Extensions::new()
764        };
765
766        // Insert EvalSeed for reproducibility
767        extensions.insert(EvalSeed(seed));
768
769        // Insert LlmConfig for BatchInvoker/Manager to use
770        extensions.insert(self.scenario.llm.clone());
771
772        // Insert LoRA config if specified (for BatchInvoker to use)
773        if let Some(ref lora) = self.scenario.llm.lora {
774            extensions.insert(lora.clone());
775        }
776
777        // Insert ManagerConfig for Manager to use
778        extensions.insert(self.scenario.manager.clone());
779
780        // Insert BatchProcessorConfig for BatchInvoker to use
781        extensions.insert(self.scenario.batch_processor.clone());
782
783        // Convert EvalActionsConfig to Core ActionsConfig for Worker/Manager to use
784        let core_actions_config = self.scenario.actions.to_core_config();
785        extensions.insert(core_actions_config);
786
787        // Create and insert Environment based on env_type
788        let env_type = self.scenario.environment.env_type.as_str();
789        let env_params = &self.scenario.environment.params;
790
791        let env_box: Option<EnvironmentBox> = match env_type {
792            "maze" => {
793                let map = env_params.get("map").and_then(|v| v.as_str()).unwrap_or("");
794                let worker_count = env_params
795                    .get("worker_count")
796                    .and_then(|v| v.as_u64())
797                    .unwrap_or(1) as usize;
798                Some(Box::new(MazeEnvironment::from_str(map, worker_count)))
799            }
800            "code" => {
801                // Currently only "auth" scenario is supported, default to it
802                Some(Box::new(CodeEnvironment::auth_scenario()))
803            }
804            "troubleshooting" => {
805                let scenario_name = env_params
806                    .get("scenario")
807                    .and_then(|v| v.as_str())
808                    .unwrap_or("memory_leak");
809                let env = match scenario_name {
810                    "memory_leak" => TroubleshootingEnvironment::memory_leak_scenario(),
811                    "cpu_spike" => TroubleshootingEnvironment::cpu_spike_scenario(),
812                    "network_timeout" => TroubleshootingEnvironment::network_timeout_scenario(),
813                    "medium" => TroubleshootingEnvironment::complex_scenario(15, 3, 2, seed),
814                    "high" => TroubleshootingEnvironment::complex_scenario(30, 8, 3, seed),
815                    "extreme" => TroubleshootingEnvironment::complex_scenario(50, 15, 4, seed),
816                    "complex" => {
817                        let total_services = env_params
818                            .get("total_services")
819                            .and_then(|v| v.as_u64())
820                            .unwrap_or(15) as usize;
821                        let noise_services = env_params
822                            .get("noise_services")
823                            .and_then(|v| v.as_u64())
824                            .unwrap_or(3) as usize;
825                        let cascade_depth = env_params
826                            .get("cascade_depth")
827                            .and_then(|v| v.as_u64())
828                            .unwrap_or(2) as usize;
829                        TroubleshootingEnvironment::complex_scenario(
830                            total_services,
831                            noise_services,
832                            cascade_depth,
833                            seed,
834                        )
835                    }
836                    _ => TroubleshootingEnvironment::memory_leak_scenario(),
837                };
838                Some(Box::new(env))
839            }
840            "search" => {
841                let scenario_name = env_params
842                    .get("scenario")
843                    .and_then(|v| v.as_str())
844                    .unwrap_or("basic");
845                let env = match scenario_name {
846                    "basic" => SearchEnvironment::basic_scenario(),
847                    "medium" => SearchEnvironment::medium_scenario(),
848                    "large" => SearchEnvironment::large_scenario(),
849                    "custom" => {
850                        let file_count = env_params
851                            .get("file_count")
852                            .and_then(|v| v.as_u64())
853                            .unwrap_or(5) as usize;
854                        let target_index = env_params
855                            .get("target_index")
856                            .and_then(|v| v.as_u64())
857                            .unwrap_or(2) as usize;
858                        SearchEnvironment::custom_scenario(file_count, target_index, seed)
859                    }
860                    _ => SearchEnvironment::basic_scenario(),
861                };
862                Some(Box::new(env))
863            }
864            "internal_diagnosis" => {
865                let scenario_name = env_params
866                    .get("scenario")
867                    .and_then(|v| v.as_str())
868                    .unwrap_or("routing");
869                let env = match scenario_name {
870                    "routing" => InternalDiagnosisEnvironment::routing_error_scenario(),
871                    "failover" => InternalDiagnosisEnvironment::failover_error_scenario(),
872                    "worker_pool" => InternalDiagnosisEnvironment::worker_pool_scenario(),
873                    "strategy" => InternalDiagnosisEnvironment::strategy_mismatch_scenario(),
874                    "exploration" => InternalDiagnosisEnvironment::exploration_depth_scenario(),
875                    "complex" => InternalDiagnosisEnvironment::complex_scenario(seed),
876                    _ => InternalDiagnosisEnvironment::routing_error_scenario(),
877                };
878                Some(Box::new(env))
879            }
880            "deep_search" => {
881                // TODO: Support multiple deep_search scenarios
882                let _scenario_name = env_params
883                    .get("scenario")
884                    .and_then(|v| v.as_str())
885                    .unwrap_or("tech_question");
886                let env = DeepSearchEnvironment::tech_question_scenario();
887                Some(Box::new(env))
888            }
889            // Real filesystem environment (Bash, Read, Write, Grep, Glob)
890            "default" | "realworld" => {
891                use swarm_engine_core::environment::DefaultEnvironment;
892                let working_dir = env_params
893                    .get("working_dir")
894                    .and_then(|v| v.as_str())
895                    .map(std::path::PathBuf::from);
896                let env = if let Some(dir) = working_dir {
897                    DefaultEnvironment::with_working_dir(dir)
898                } else {
899                    DefaultEnvironment::new()
900                };
901                Some(Box::new(env))
902            }
903            _ => None, // Unknown env_type - no Environment inserted
904        };
905
906        if let Some(env) = env_box {
907            extensions.insert(env);
908        }
909
910        // Insert DependencyGraph if specified (explicit or from scenario)
911        let graph = self.dependency_graph.clone().or_else(|| {
912            self.scenario.dependency_graph.as_ref().and_then(|cfg| {
913                let action_names = self.scenario.actions.action_names();
914                cfg.to_core_graph(&action_names)
915            })
916        });
917        if let Some(g) = graph {
918            extensions.insert(g);
919        }
920
921        // Note: prior_snapshot is inserted by LearnableSwarmBuilder.with_learning_store()
922
923        extensions
924    }
925
926    /// Evaluate scenario conditions to determine success/failure
927    ///
928    /// Returns (success, termination_reason)
929    fn evaluate_conditions(
930        &self,
931        metrics: &RunMetrics,
932        environment_done: bool,
933        timed_out: bool,
934    ) -> (bool, TerminationReason) {
935        let conditions = &self.scenario.conditions;
936
937        // 1. Check failure conditions first (any match = fail)
938        for condition in &conditions.failure {
939            if let Some(actual) =
940                self.get_metric_value(&condition.metric, metrics, environment_done)
941            {
942                if condition.evaluate(&actual) {
943                    return (false, TerminationReason::Failure);
944                }
945            }
946        }
947
948        // 2. Handle timeout
949        if timed_out {
950            return match conditions.on_timeout {
951                TimeoutBehavior::Fail => (false, TerminationReason::Timeout),
952                TimeoutBehavior::PartialSuccess => {
953                    // Check if success conditions are met
954                    let success = self.check_success_conditions(metrics, environment_done);
955                    (success, TerminationReason::Timeout)
956                }
957                TimeoutBehavior::MilestoneScore => {
958                    // TODO: Implement milestone scoring
959                    (false, TerminationReason::Timeout)
960                }
961            };
962        }
963
964        // 3. Check success conditions (all must match)
965        let success = self.check_success_conditions(metrics, environment_done);
966        if success {
967            (true, TerminationReason::Success)
968        } else {
969            // Not yet successful, but no failure conditions met either
970            // This shouldn't happen if called after completion
971            (false, TerminationReason::Stopped)
972        }
973    }
974
975    /// Check if all success conditions are met
976    fn check_success_conditions(&self, metrics: &RunMetrics, environment_done: bool) -> bool {
977        let conditions = &self.scenario.conditions;
978
979        // If no success conditions defined, consider successful
980        if conditions.success.is_empty() {
981            return true;
982        }
983
984        // All conditions must pass
985        conditions.success.iter().all(|condition| {
986            self.get_metric_value(&condition.metric, metrics, environment_done)
987                .map(|actual| condition.evaluate(&actual))
988                .unwrap_or(false)
989        })
990    }
991
992    /// Get metric value by path (e.g., "environment.done", "task.success_rate")
993    fn get_metric_value(
994        &self,
995        path: &str,
996        metrics: &RunMetrics,
997        environment_done: bool,
998    ) -> Option<ConditionValue> {
999        match path {
1000            // Environment metrics
1001            "environment.done" => Some(ConditionValue::Bool(environment_done)),
1002
1003            // Task metrics
1004            "task.total_ticks" | "total_ticks" => {
1005                Some(ConditionValue::Integer(metrics.task.total_ticks as i64))
1006            }
1007            "task.success_rate" | "success_rate" => {
1008                Some(ConditionValue::Float(metrics.task.success_rate))
1009            }
1010            "task.total_actions" | "total_actions" => {
1011                Some(ConditionValue::Integer(metrics.task.total_actions as i64))
1012            }
1013            "task.successful_actions" | "successful_actions" => Some(ConditionValue::Integer(
1014                metrics.task.successful_actions as i64,
1015            )),
1016
1017            // Performance metrics
1018            "performance.llm_error_rate" | "llm_error_rate" => {
1019                Some(ConditionValue::Float(metrics.performance.llm_error_rate))
1020            }
1021            "performance.llm_invocations" | "llm_invocations" => Some(ConditionValue::Integer(
1022                metrics.performance.llm_invocations as i64,
1023            )),
1024
1025            // Coordination metrics
1026            "coordination.manager_activations" | "manager_activations" => Some(
1027                ConditionValue::Integer(metrics.coordination.manager_activations as i64),
1028            ),
1029
1030            // Error metrics (from failed_actions)
1031            "errors.count" => {
1032                let failed = metrics
1033                    .task
1034                    .total_actions
1035                    .saturating_sub(metrics.task.successful_actions);
1036                Some(ConditionValue::Integer(failed as i64))
1037            }
1038
1039            // Unknown metric
1040            _ => None,
1041        }
1042    }
1043}
1044
1045// Wrapper to convert Box<dyn ManagerAgent> to impl ManagerAgent
1046struct DynManagerWrapper(Box<dyn ManagerAgent>);
1047
1048impl ManagerAgent for DynManagerWrapper {
1049    fn prepare(
1050        &self,
1051        context: &swarm_engine_core::agent::TaskContext,
1052    ) -> swarm_engine_core::agent::BatchDecisionRequest {
1053        self.0.prepare(context)
1054    }
1055
1056    fn finalize(
1057        &self,
1058        context: &swarm_engine_core::agent::TaskContext,
1059        responses: Vec<(
1060            swarm_engine_core::types::WorkerId,
1061            swarm_engine_core::agent::DecisionResponse,
1062        )>,
1063    ) -> swarm_engine_core::agent::ManagementDecision {
1064        self.0.finalize(context, responses)
1065    }
1066
1067    fn id(&self) -> swarm_engine_core::agent::ManagerId {
1068        self.0.id()
1069    }
1070
1071    fn name(&self) -> &str {
1072        self.0.name()
1073    }
1074}
1075
1076// Wrapper to convert Box<dyn BatchInvoker> to impl BatchInvoker
1077struct DynBatchInvokerWrapper(Box<dyn BatchInvoker>);
1078
1079impl BatchInvoker for DynBatchInvokerWrapper {
1080    fn invoke(
1081        &self,
1082        request: swarm_engine_core::agent::BatchDecisionRequest,
1083        extensions: &swarm_engine_core::extensions::Extensions,
1084    ) -> swarm_engine_core::agent::BatchInvokeResult {
1085        self.0.invoke(request, extensions)
1086    }
1087
1088    fn plan_dependencies(
1089        &self,
1090        task: &str,
1091        actions: &[ActionDef],
1092    ) -> Option<swarm_engine_core::exploration::DependencyGraph> {
1093        self.0.plan_dependencies(task, actions)
1094    }
1095
1096    fn name(&self) -> &str {
1097        self.0.name()
1098    }
1099}
1100
1101// Wrapper to convert Box<dyn OperatorProvider<NodeRules>> to impl OperatorProvider<NodeRules>
1102struct DynOperatorProviderWrapper(Box<dyn OperatorProvider<NodeRules>>);
1103
1104impl OperatorProvider<NodeRules> for DynOperatorProviderWrapper {
1105    fn provide(
1106        &self,
1107        rules: NodeRules,
1108        context: Option<
1109            &swarm_engine_core::exploration::ProviderContext<
1110                '_,
1111                swarm_engine_core::exploration::ActionNodeData,
1112                String,
1113                swarm_engine_core::exploration::MapNodeState,
1114            >,
1115        >,
1116    ) -> swarm_engine_core::exploration::ConfigurableOperator<NodeRules> {
1117        self.0.provide(rules, context)
1118    }
1119
1120    fn reevaluate(
1121        &self,
1122        operator: &mut swarm_engine_core::exploration::ConfigurableOperator<NodeRules>,
1123        ctx: &swarm_engine_core::exploration::ProviderContext<
1124            '_,
1125            swarm_engine_core::exploration::ActionNodeData,
1126            String,
1127            swarm_engine_core::exploration::MapNodeState,
1128        >,
1129    ) {
1130        self.0.reevaluate(operator, ctx)
1131    }
1132
1133    fn name(&self) -> &str {
1134        self.0.name()
1135    }
1136}
swarm_engine_eval/runner.rs

swarm_engine_eval/
runner.rs