swarm_engine_eval/
runner.rs

1//! EvalRunner - LearnableSwarm を使用した評価実行
2
3use std::time::Duration;
4
5use tokio::runtime::Handle;
6
7use swarm_engine_core::actions::ActionDef;
8use swarm_engine_core::agent::{
9    BatchInvoker, DefaultBatchManagerAgent, GenericWorker, ManagementStrategy, ManagerAgent,
10    ManagerId, WorkerAgent,
11};
12use swarm_engine_core::environment::EnvironmentBox;
13use swarm_engine_core::events::TraceSubscriber;
14use swarm_engine_core::exploration::{LearnedDependencyProvider, NodeRules, OperatorProvider};
15use swarm_engine_core::extensions::Extensions;
16use swarm_engine_core::learn::{
17    profile_to_offline_model, CountTrigger, LearnableSwarmBuilder, LearningStore, OfflineModel,
18    ScenarioProfile, TrainTrigger,
19};
20use swarm_engine_core::orchestrator::SwarmConfig;
21use swarm_engine_core::types::{GroupId, SwarmTask};
22
23use crate::environments::{
24    CodeEnvironment, DeepSearchEnvironment, InternalDiagnosisEnvironment, MazeEnvironment,
25    SearchEnvironment, TroubleshootingEnvironment,
26};
27
28use crate::aggregator::Aggregator;
29use crate::error::Result;
30use crate::metrics::RunMetrics;
31use crate::reporter::{ConfigSummary, EvalReport, SeedInfo};
32use crate::run::{EvalRun, TerminationReason};
33use crate::scenario::conditions::{ConditionValue, TimeoutBehavior};
34use crate::scenario::{EvalScenario, ManagementStrategyConfig};
35
36/// Evaluation seed for reproducibility
37///
38/// Stored in Extensions to allow Environment and other components to access it.
39#[derive(Debug, Clone, Copy)]
40pub struct EvalSeed(pub u64);
41
42/// Factory for creating ManagerAgent
43pub type ManagerFactory = Box<dyn Fn() -> Box<dyn ManagerAgent> + Send + Sync>;
44
45/// Factory for creating BatchInvoker
46pub type BatchInvokerFactory = Box<dyn Fn() -> Box<dyn BatchInvoker> + Send + Sync>;
47
48/// Factory for creating OperatorProvider
49pub type OperatorProviderFactory =
50    Box<dyn Fn() -> Box<dyn OperatorProvider<NodeRules>> + Send + Sync>;
51
52/// Evaluation runner using Orchestrator directly
53///
54/// # Example
55///
56/// ```ignore
57/// let runner = EvalRunner::new(scenario, runtime.handle().clone())
58///     .with_runs(5)
59///     .with_seed(42)
60///     .with_task(SwarmTask::new("Find the auth handler"))
61///     .with_manager_factory(|| Box::new(MyManager::new()))
62///     .with_batch_invoker_factory(|| Box::new(MyInvoker::new()));
63///
64/// let report = runner.run()?;
65/// ```
66pub struct EvalRunner {
67    scenario: EvalScenario,
68    runtime: Handle,
69    runs: usize,
70    seed: u64,
71    /// Task to execute (optional)
72    task: Option<SwarmTask>,
73    /// Manager factory (creates new instance per run)
74    manager_factory: Option<ManagerFactory>,
75    /// BatchInvoker factory (creates new instance per run)
76    batch_invoker_factory: Option<BatchInvokerFactory>,
77    /// Extensions factory (creates new instance per run)
78    extensions_factory: Option<Box<dyn Fn() -> Extensions + Send + Sync>>,
79    /// OperatorProvider factory (creates new instance per run)
80    operator_provider_factory: Option<OperatorProviderFactory>,
81    /// Verbose output (print tick snapshots)
82    verbose: bool,
83    /// Enable ExplorationSpaceV2 tracking
84    enable_exploration: bool,
85    /// Dependency graph for action sequencing
86    dependency_graph: Option<swarm_engine_core::exploration::DependencyGraph>,
87    /// LearningStore for cross-session learning
88    learning_store: Option<LearningStore>,
89    /// TrainTrigger for Learning (default: run after every eval)
90    train_trigger: Option<std::sync::Arc<dyn TrainTrigger>>,
91    /// Skip learned action order from offline model
92    skip_learned_action_order: bool,
93    /// Trace subscriber for ActionEvent output
94    trace_subscriber: Option<std::sync::Arc<dyn TraceSubscriber>>,
95    /// ScenarioProfile for applying learned parameters
96    scenario_profile: Option<ScenarioProfile>,
97    /// Cached OfflineModel from ScenarioProfile (to avoid repeated conversion)
98    offline_model_from_profile: Option<OfflineModel>,
99}
100
101impl EvalRunner {
102    pub fn new(scenario: EvalScenario, runtime: Handle) -> Self {
103        Self {
104            scenario,
105            runtime,
106            runs: 1,
107            seed: 42,
108            task: None,
109            manager_factory: None,
110            batch_invoker_factory: None,
111            extensions_factory: None,
112            operator_provider_factory: None,
113            verbose: false,
114            enable_exploration: false,
115            dependency_graph: None,
116            learning_store: None,
117            train_trigger: None,
118            skip_learned_action_order: false,
119            trace_subscriber: None,
120            scenario_profile: None,
121            offline_model_from_profile: None,
122        }
123    }
124
125    /// Enable verbose output (print tick snapshots)
126    pub fn with_verbose(mut self, verbose: bool) -> Self {
127        self.verbose = verbose;
128        self
129    }
130
131    /// Enable ExplorationSpace tracking
132    pub fn with_exploration(mut self, enable: bool) -> Self {
133        self.enable_exploration = enable;
134        self
135    }
136
137    /// Set dependency graph for action sequencing
138    ///
139    /// The graph defines valid action transitions and terminal conditions.
140    /// When set, actions will be filtered based on the graph structure.
141    pub fn with_dependency_graph(
142        mut self,
143        graph: swarm_engine_core::exploration::DependencyGraph,
144    ) -> Self {
145        self.dependency_graph = Some(graph);
146        self
147    }
148
149    /// Enable LearningStore for cross-session learning
150    ///
151    /// When enabled, statistics will be saved after each run and loaded as prior
152    /// for subsequent runs. This enables incremental learning across sessions.
153    ///
154    /// # Example
155    ///
156    /// ```ignore
157    /// runner.with_learning_store("~/.swarm-engine/learning")
158    /// ```
159    pub fn with_learning_store(mut self, path: impl AsRef<std::path::Path>) -> Self {
160        match LearningStore::new(path) {
161            Ok(store) => {
162                // prior_snapshot, offline_model loading is delegated to LearnableSwarmBuilder
163                self.learning_store = Some(store);
164            }
165            Err(e) => {
166                eprintln!("Warning: Failed to create LearningStore: {}", e);
167            }
168        }
169        self
170    }
171
172    /// Set TrainTrigger for Learning
173    ///
174    /// Controls when offline learning is executed after eval runs.
175    /// Default: Learn after every run (if learning_store is configured).
176    ///
177    /// # Example
178    ///
179    /// ```ignore
180    /// use swarm_engine_core::learn::CountTrigger;
181    ///
182    /// // Run learning after every 5 eval iterations
183    /// runner.with_train_trigger(Arc::new(CountTrigger::new(5)))
184    /// ```
185    pub fn with_train_trigger(mut self, trigger: std::sync::Arc<dyn TrainTrigger>) -> Self {
186        self.train_trigger = Some(trigger);
187        self
188    }
189
190    /// Skip learned action order from offline model
191    ///
192    /// When enabled, the learned dependency graph (action_order) from the offline
193    /// model will not be applied. This is useful for testing without learned priors.
194    pub fn skip_learned_action_order(mut self, skip: bool) -> Self {
195        self.skip_learned_action_order = skip;
196        self
197    }
198
199    /// Set trace subscriber for ActionEvent output
200    ///
201    /// The subscriber will receive all ActionEvents during evaluation.
202    /// Use InMemoryTraceSubscriber to collect events and dump them after evaluation,
203    /// or JsonlTraceSubscriber for real-time output.
204    ///
205    /// # Example
206    ///
207    /// ```ignore
208    /// use std::sync::Arc;
209    /// use swarm_engine_core::events::InMemoryTraceSubscriber;
210    ///
211    /// let trace = Arc::new(InMemoryTraceSubscriber::new());
212    /// runner.with_trace_subscriber(trace.clone());
213    /// // ... run evaluation ...
214    /// trace.dump_to_file("trace.jsonl")?;
215    /// ```
216    pub fn with_trace_subscriber(
217        mut self,
218        subscriber: std::sync::Arc<dyn TraceSubscriber>,
219    ) -> Self {
220        self.trace_subscriber = Some(subscriber);
221        self
222    }
223
224    /// Apply a ScenarioProfile to use learned exploration parameters and strategies
225    ///
226    /// This converts the profile to an OfflineModel and applies it during execution.
227    /// When set, the profile's learned components (exploration params, strategy config,
228    /// action order) will be used instead of defaults.
229    ///
230    /// # Example
231    ///
232    /// ```ignore
233    /// let store = ProfileStore::new("~/.swarm-engine/profiles")?;
234    /// let profile = store.load("troubleshooting")?;
235    /// runner.with_scenario_profile(profile);
236    /// ```
237    pub fn with_scenario_profile(mut self, profile: ScenarioProfile) -> Self {
238        let offline_model = profile_to_offline_model(&profile);
239        self.offline_model_from_profile = Some(offline_model);
240        self.scenario_profile = Some(profile);
241        self
242    }
243
244    pub fn with_runs(mut self, runs: usize) -> Self {
245        self.runs = runs;
246        self
247    }
248
249    pub fn with_seed(mut self, seed: u64) -> Self {
250        self.seed = seed;
251        self
252    }
253
254    /// Set the task to execute
255    pub fn with_task(mut self, task: SwarmTask) -> Self {
256        self.task = Some(task);
257        self
258    }
259
260    /// Set manager factory (creates new Manager for each run)
261    pub fn with_manager_factory<F>(mut self, factory: F) -> Self
262    where
263        F: Fn() -> Box<dyn ManagerAgent> + Send + Sync + 'static,
264    {
265        self.manager_factory = Some(Box::new(factory));
266        self
267    }
268
269    /// Set batch invoker factory (creates new BatchInvoker for each run)
270    pub fn with_batch_invoker_factory<F>(mut self, factory: F) -> Self
271    where
272        F: Fn() -> Box<dyn BatchInvoker> + Send + Sync + 'static,
273    {
274        self.batch_invoker_factory = Some(Box::new(factory));
275        self
276    }
277
278    /// Set extensions factory (creates new Extensions for each run)
279    pub fn with_extensions_factory<F>(mut self, factory: F) -> Self
280    where
281        F: Fn() -> Extensions + Send + Sync + 'static,
282    {
283        self.extensions_factory = Some(Box::new(factory));
284        self
285    }
286
287    /// Set OperatorProvider factory (creates new provider for each run)
288    ///
289    /// Use this to configure the Selection strategy for exploration.
290    /// Default is `AdaptiveProvider` if not specified.
291    ///
292    /// # Example
293    ///
294    /// ```ignore
295    /// use swarm_engine_core::exploration::{HybridLlmProvider, ReviewPolicy};
296    /// use swarm_engine_llm::LlmStrategyAdvisor;
297    ///
298    /// runner.with_operator_provider_factory(|| {
299    ///     let advisor = LlmStrategyAdvisor::new(decider.clone(), handle.clone());
300    ///     let policy = ReviewPolicy::default();
301    ///     Box::new(HybridLlmProvider::new(advisor, policy))
302    /// })
303    /// ```
304    pub fn with_operator_provider_factory<F>(mut self, factory: F) -> Self
305    where
306        F: Fn() -> Box<dyn OperatorProvider<NodeRules>> + Send + Sync + 'static,
307    {
308        self.operator_provider_factory = Some(Box::new(factory));
309        self
310    }
311
312    pub fn run(&self) -> Result<EvalReport> {
313        let mut eval_runs = Vec::with_capacity(self.runs);
314        let mut run_seeds = Vec::with_capacity(self.runs);
315
316        // Generate GroupId for this eval run (all iterations share the same GroupId)
317        // This enables DPO learning to compare multiple executions under the same conditions
318        let group_id = GroupId::new();
319
320        for i in 0..self.runs {
321            let run_seed = self.seed.wrapping_add(i as u64);
322            run_seeds.push(run_seed);
323
324            let result = self.run_single(i, run_seed, group_id)?;
325            eval_runs.push(result);
326        }
327
328        let aggregated = Aggregator::aggregate(&eval_runs);
329
330        // Note: action_order learning is handled by `learn` command, not eval
331        // If specific behavior is needed, use MockProvider via LearnableSwarmBuilder
332
333        Ok(EvalReport {
334            config_summary: ConfigSummary {
335                scenario_name: self.scenario.meta.name.clone(),
336                scenario_id: self.scenario.meta.id.to_string(),
337                worker_count: self.scenario.agents.workers.iter().map(|w| w.count).sum(),
338                max_ticks: self.scenario.app_config.max_ticks,
339                run_count: self.runs,
340            },
341            seed_info: SeedInfo {
342                base_seed: self.seed,
343                run_seeds,
344            },
345            runs: eval_runs,
346            aggregated,
347            assertion_results: vec![],
348        })
349    }
350
351    /// Run a single evaluation iteration
352    ///
353    /// # Arguments
354    /// * `index` - The iteration index (0-based)
355    /// * `seed` - The random seed for this iteration
356    /// * `group_id` - The group ID shared by all iterations in this eval run.
357    ///   Used for DPO learning to compare multiple executions.
358    fn run_single(&self, index: usize, seed: u64, group_id: GroupId) -> Result<EvalRun> {
359        // ========================================================================
360        // LearnableSwarmBuilder で Swarm を構築
361        // ========================================================================
362        let workers = self.build_workers();
363        let management_strategy = self.build_management_strategy();
364
365        let swarm_config = SwarmConfig {
366            tick_duration: Duration::from_millis(self.scenario.app_config.tick_duration_ms),
367            max_ticks: self.scenario.app_config.max_ticks,
368            management_strategy,
369        };
370
371        // Extensions（LlmConfig, ActionsConfig, EvalSeed 等）
372        let extensions = self.build_extensions_from_scenario(seed);
373
374        // LearnableSwarmBuilder で構築開始
375        let scenario_key = self.scenario.meta.id.learning_key();
376        let mut builder = LearnableSwarmBuilder::new(self.runtime.clone())
377            .scenario(&scenario_key)
378            .swarm_config(swarm_config)
379            .workers(workers)
380            .extensions(extensions)
381            .enable_exploration(
382                self.enable_exploration || self.scenario.app_config.enable_exploration,
383            );
384
385        // Managers: from factory if provided, otherwise from scenario templates
386        if let Some(factory) = &self.manager_factory {
387            let manager = factory();
388            builder = builder.add_manager(Box::new(DynManagerWrapper(manager)));
389        } else {
390            let managers = self.build_managers();
391            for manager in managers {
392                builder = builder.add_manager(Box::new(manager));
393            }
394        }
395
396        // BatchInvoker if factory provided
397        if let Some(factory) = &self.batch_invoker_factory {
398            let invoker = factory();
399            builder = builder.batch_invoker(Box::new(DynBatchInvokerWrapper(invoker)));
400        }
401
402        // OperatorProvider factory
403        if let Some(factory) = &self.operator_provider_factory {
404            let provider = factory();
405            builder = builder.operator_provider(Box::new(DynOperatorProviderWrapper(provider)));
406        }
407
408        // ScenarioProfile の OfflineModel を適用（LearningStore より優先）
409        if let Some(ref model) = self.offline_model_from_profile {
410            builder = builder.offline_model(model.clone());
411
412            // Offline model 適用の通知
413            if self.operator_provider_factory.is_none() {
414                println!(
415                    "Profile offline model applied: ucb1_c={:.3}, maturity={}, strategy={}",
416                    model.parameters.ucb1_c,
417                    model.strategy_config.maturity_threshold,
418                    model.strategy_config.initial_strategy
419                );
420            }
421
422            // Learned action order（DependencyGraph 自動生成をスキップ）
423            if !self.skip_learned_action_order {
424                if let Some(ref action_order) = model.action_order {
425                    let provider = LearnedDependencyProvider::new(action_order.clone());
426                    builder = builder.dependency_provider(Box::new(provider));
427                    println!(
428                        "Learned action order applied: discover={:?}, not_discover={:?}",
429                        action_order.discover, action_order.not_discover
430                    );
431                }
432            } else if model.action_order.is_some() {
433                println!("Learned action order skipped (--no-dep-graph)");
434            }
435        }
436
437        // Learning 設定（LearningStore がある場合）
438        // with_learning_store で prior_snapshot, offline_model, data_dir, learning_enabled を自動設定
439        if let Some(ref store) = self.learning_store {
440            builder = builder.with_learning_store(store.clone());
441
442            // ScenarioProfile が設定されていない場合のみ、LearningStore の OfflineModel を適用
443            if self.offline_model_from_profile.is_none() {
444                // Offline model の情報を取得（println! と action_order 処理用）
445                let offline_model_opt = builder.offline_model_ref().cloned();
446                if let Some(ref model) = offline_model_opt {
447                    // Offline model 適用の通知（OperatorProvider factory がない場合のみ）
448                    if self.operator_provider_factory.is_none() {
449                        println!(
450                            "Offline model applied: ucb1_c={:.3}, maturity={}, strategy={}",
451                            model.parameters.ucb1_c,
452                            model.strategy_config.maturity_threshold,
453                            model.strategy_config.initial_strategy
454                        );
455                    }
456
457                    // Learned action order（DependencyGraph 自動生成をスキップ）
458                    if !self.skip_learned_action_order {
459                        if let Some(ref action_order) = model.action_order {
460                            let provider = LearnedDependencyProvider::new(action_order.clone());
461                            builder = builder.dependency_provider(Box::new(provider));
462                            println!(
463                                "Learned action order applied: discover={:?}, not_discover={:?}",
464                                action_order.discover, action_order.not_discover
465                            );
466                        }
467                    } else if model.action_order.is_some() {
468                        println!("Learned action order skipped (--no-dep-graph)");
469                    }
470                }
471            }
472
473            // Train trigger
474            if let Some(ref trigger) = self.train_trigger {
475                builder = builder.train_trigger(std::sync::Arc::clone(trigger));
476            } else {
477                builder = builder.train_trigger(std::sync::Arc::new(CountTrigger::new(self.runs)));
478            }
479        }
480
481        // Trace subscriber
482        if let Some(ref subscriber) = self.trace_subscriber {
483            builder = builder.with_trace_subscriber(std::sync::Arc::clone(subscriber));
484        }
485
486        // ========================================================================
487        // LearnableSwarm を構築・実行
488        // ========================================================================
489        let mut swarm = builder.build()?;
490
491        // Enable partitioning when multiple managers are configured
492        let manager_count = self
493            .scenario
494            .agents
495            .managers
496            .iter()
497            .map(|t| t.count)
498            .sum::<usize>();
499        if manager_count > 1 {
500            swarm.orchestrator_mut().enable_partitioning();
501        }
502
503        // Determine task
504        let task_to_run = self
505            .task
506            .clone()
507            .or_else(|| self.build_task_from_scenario())
508            .map(|task| task.with_group_id(group_id));
509
510        // Run
511        let result = if let Some(task) = task_to_run {
512            swarm.run_task(task)?
513        } else {
514            swarm.run()
515        };
516
517        // ========================================================================
518        // メトリクス収集
519        // ========================================================================
520        let state = swarm.orchestrator().state();
521        let timed_out = result.total_ticks >= self.scenario.app_config.max_ticks;
522        let environment_done = state.shared.is_environment_done();
523        let total_actions = state.shared.stats.total_visits() as u64;
524        let successful_actions = state.shared.stats.total_successes() as u64;
525        let llm_invocations = state.shared.llm_invocations();
526        let llm_invoke_errors = state.shared.llm_errors();
527
528        let metrics = RunMetrics {
529            task: crate::metrics::TaskMetrics {
530                total_ticks: result.total_ticks,
531                total_tasks: 0,
532                completed_tasks: 0,
533                total_actions,
534                successful_actions,
535                success_rate: state.shared.stats.success_rate(),
536            },
537            coordination: crate::metrics::CoordinationMetrics {
538                manager_activations: llm_invocations,
539                manager_intervention_rate: if result.total_ticks > 0 {
540                    llm_invocations as f64 / result.total_ticks as f64
541                } else {
542                    0.0
543                },
544                ..Default::default()
545            },
546            performance: {
547                let llm_error_rate = if llm_invocations > 0 {
548                    llm_invoke_errors as f64 / llm_invocations as f64
549                } else {
550                    0.0
551                };
552                crate::metrics::PerformanceMetrics {
553                    total_duration_ms: result.total_duration.as_millis() as f64,
554                    avg_tick_latency_ms: if result.total_ticks > 0 {
555                        result.total_duration.as_millis() as f64 / result.total_ticks as f64
556                    } else {
557                        0.0
558                    },
559                    raw_throughput_per_sec: if result.total_duration.as_secs_f64() > 0.0 {
560                        total_actions as f64 / result.total_duration.as_secs_f64()
561                    } else {
562                        0.0
563                    },
564                    effective_throughput_per_sec: if result.total_duration.as_secs_f64() > 0.0 {
565                        successful_actions as f64 / result.total_duration.as_secs_f64()
566                    } else {
567                        0.0
568                    },
569                    llm_invocations,
570                    llm_invoke_errors,
571                    llm_error_rate,
572                    ..Default::default()
573                }
574            },
575            robustness: Default::default(),
576        };
577
578        // Evaluate success/failure conditions
579        let (success, termination_reason) = if !result.completed {
580            (false, TerminationReason::Stopped)
581        } else {
582            self.evaluate_conditions(&metrics, environment_done, timed_out)
583        };
584
585        // Send shutdown signal to LearningDaemon (non-blocking)
586        // NOTE: block_on() inside tokio runtime causes deadlock,
587        //       so we only send shutdown signal and let Drop handle cleanup
588        if swarm.is_learning_enabled() {
589            if let Some(tx) = swarm.take_shutdown_tx() {
590                // Fire-and-forget shutdown signal
591                let _ = tx.try_send(());
592            }
593        }
594
595        Ok(EvalRun::new(
596            index,
597            seed,
598            success,
599            termination_reason,
600            metrics,
601        ))
602    }
603
604    fn build_workers(&self) -> Vec<Box<dyn WorkerAgent>> {
605        let mut workers: Vec<Box<dyn WorkerAgent>> = Vec::new();
606
607        for template in &self.scenario.agents.workers {
608            for i in 0..template.count {
609                let id = workers.len();
610                let name = template.id_pattern.replace("{i}", &i.to_string());
611
612                let worker = GenericWorker::new(id)
613                    .with_name(name)
614                    .with_require_guidance(true);
615
616                workers.push(Box::new(worker));
617            }
618        }
619
620        workers
621    }
622
623    fn build_managers(&self) -> Vec<DefaultBatchManagerAgent> {
624        let mut managers = Vec::new();
625        let mut manager_index = 0;
626
627        for template in &self.scenario.agents.managers {
628            let ids = template.generate_ids();
629            for name in ids {
630                let manager = DefaultBatchManagerAgent::new(ManagerId(manager_index))
631                    .with_name(name)
632                    .with_interval(self.scenario.manager.process_interval_ticks);
633
634                managers.push(manager);
635                manager_index += 1;
636            }
637        }
638
639        // デフォルト: Manager テンプレートがない場合は 1 つ作成
640        if managers.is_empty() {
641            managers.push(
642                DefaultBatchManagerAgent::new(ManagerId(0))
643                    .with_name("default_manager")
644                    .with_interval(self.scenario.manager.process_interval_ticks),
645            );
646        }
647
648        managers
649    }
650
651    fn build_management_strategy(&self) -> ManagementStrategy {
652        match &self.scenario.app_config.management_strategy {
653            ManagementStrategyConfig::EveryTick {} => ManagementStrategy::EveryTick,
654            ManagementStrategyConfig::IntervalBased { max_interval } => {
655                ManagementStrategy::FixedInterval {
656                    interval: *max_interval,
657                }
658            }
659            ManagementStrategyConfig::EventDriven { triggers: _ } => {
660                // Event-driven maps to completion-based
661                ManagementStrategy::CompletionBased { max_wait_ticks: 50 }
662            }
663            ManagementStrategyConfig::Hybrid {
664                max_interval,
665                triggers: _,
666            } => ManagementStrategy::Hybrid {
667                preferred_interval: *max_interval,
668                force_after_ticks: max_interval * 2,
669            },
670            ManagementStrategyConfig::Disabled {} => {
671                // Disabled = very large interval (effectively never)
672                ManagementStrategy::FixedInterval { interval: u64::MAX }
673            }
674        }
675    }
676
677    /// Build SwarmTask from scenario task config
678    ///
679    /// Returns None if task goal is empty
680    fn build_task_from_scenario(&self) -> Option<SwarmTask> {
681        let task_config = &self.scenario.task;
682
683        if task_config.goal.is_empty() {
684            return None;
685        }
686
687        // Build context JSON object
688        let mut context = serde_json::Map::new();
689
690        if let Some(target_path) = &task_config.context.target_path {
691            context.insert(
692                "target_path".to_string(),
693                serde_json::Value::String(target_path.clone()),
694            );
695        }
696        if let Some(working_dir) = &task_config.context.working_dir {
697            context.insert(
698                "working_dir".to_string(),
699                serde_json::Value::String(working_dir.clone()),
700            );
701        }
702        if let Some(max_depth) = task_config.context.max_depth {
703            context.insert(
704                "max_depth".to_string(),
705                serde_json::Value::Number(serde_json::Number::from(max_depth)),
706            );
707        }
708
709        // Add extra context (convert toml::Value to serde_json::Value)
710        for (key, value) in &task_config.context.extra {
711            if let Ok(json_value) = serde_json::to_value(value) {
712                context.insert(key.clone(), json_value);
713            }
714        }
715
716        let task =
717            SwarmTask::new(&task_config.goal).with_context(serde_json::Value::Object(context));
718
719        Some(task)
720    }
721
722    /// Build Extensions with LlmConfig, ActionsConfig, and EvalSeed from scenario
723    fn build_extensions_from_scenario(&self, seed: u64) -> Extensions {
724        let mut extensions = if let Some(factory) = &self.extensions_factory {
725            factory()
726        } else {
727            Extensions::new()
728        };
729
730        // Insert EvalSeed for reproducibility
731        extensions.insert(EvalSeed(seed));
732
733        // Insert LlmConfig for BatchInvoker/Manager to use
734        extensions.insert(self.scenario.llm.clone());
735
736        // Insert LoRA config if specified (for BatchInvoker to use)
737        if let Some(ref lora) = self.scenario.llm.lora {
738            extensions.insert(lora.clone());
739        }
740
741        // Insert ManagerConfig for Manager to use
742        extensions.insert(self.scenario.manager.clone());
743
744        // Insert BatchProcessorConfig for BatchInvoker to use
745        extensions.insert(self.scenario.batch_processor.clone());
746
747        // Convert EvalActionsConfig to Core ActionsConfig for Worker/Manager to use
748        let core_actions_config = self.scenario.actions.to_core_config();
749        extensions.insert(core_actions_config);
750
751        // Create and insert Environment based on env_type
752        let env_type = self.scenario.environment.env_type.as_str();
753        let env_params = &self.scenario.environment.params;
754
755        let env_box: Option<EnvironmentBox> = match env_type {
756            "maze" => {
757                let map = env_params.get("map").and_then(|v| v.as_str()).unwrap_or("");
758                let worker_count = env_params
759                    .get("worker_count")
760                    .and_then(|v| v.as_u64())
761                    .unwrap_or(1) as usize;
762                Some(Box::new(MazeEnvironment::from_str(map, worker_count)))
763            }
764            "code" => {
765                // Currently only "auth" scenario is supported, default to it
766                Some(Box::new(CodeEnvironment::auth_scenario()))
767            }
768            "troubleshooting" => {
769                let scenario_name = env_params
770                    .get("scenario")
771                    .and_then(|v| v.as_str())
772                    .unwrap_or("memory_leak");
773                let env = match scenario_name {
774                    "memory_leak" => TroubleshootingEnvironment::memory_leak_scenario(),
775                    "cpu_spike" => TroubleshootingEnvironment::cpu_spike_scenario(),
776                    "network_timeout" => TroubleshootingEnvironment::network_timeout_scenario(),
777                    "medium" => TroubleshootingEnvironment::complex_scenario(15, 3, 2, seed),
778                    "high" => TroubleshootingEnvironment::complex_scenario(30, 8, 3, seed),
779                    "extreme" => TroubleshootingEnvironment::complex_scenario(50, 15, 4, seed),
780                    "complex" => {
781                        let total_services = env_params
782                            .get("total_services")
783                            .and_then(|v| v.as_u64())
784                            .unwrap_or(15) as usize;
785                        let noise_services = env_params
786                            .get("noise_services")
787                            .and_then(|v| v.as_u64())
788                            .unwrap_or(3) as usize;
789                        let cascade_depth = env_params
790                            .get("cascade_depth")
791                            .and_then(|v| v.as_u64())
792                            .unwrap_or(2) as usize;
793                        TroubleshootingEnvironment::complex_scenario(
794                            total_services,
795                            noise_services,
796                            cascade_depth,
797                            seed,
798                        )
799                    }
800                    _ => TroubleshootingEnvironment::memory_leak_scenario(),
801                };
802                Some(Box::new(env))
803            }
804            "search" => {
805                let scenario_name = env_params
806                    .get("scenario")
807                    .and_then(|v| v.as_str())
808                    .unwrap_or("basic");
809                let env = match scenario_name {
810                    "basic" => SearchEnvironment::basic_scenario(),
811                    "medium" => SearchEnvironment::medium_scenario(),
812                    "large" => SearchEnvironment::large_scenario(),
813                    "custom" => {
814                        let file_count = env_params
815                            .get("file_count")
816                            .and_then(|v| v.as_u64())
817                            .unwrap_or(5) as usize;
818                        let target_index = env_params
819                            .get("target_index")
820                            .and_then(|v| v.as_u64())
821                            .unwrap_or(2) as usize;
822                        SearchEnvironment::custom_scenario(file_count, target_index, seed)
823                    }
824                    _ => SearchEnvironment::basic_scenario(),
825                };
826                Some(Box::new(env))
827            }
828            "internal_diagnosis" => {
829                let scenario_name = env_params
830                    .get("scenario")
831                    .and_then(|v| v.as_str())
832                    .unwrap_or("routing");
833                let env = match scenario_name {
834                    "routing" => InternalDiagnosisEnvironment::routing_error_scenario(),
835                    "failover" => InternalDiagnosisEnvironment::failover_error_scenario(),
836                    "worker_pool" => InternalDiagnosisEnvironment::worker_pool_scenario(),
837                    "strategy" => InternalDiagnosisEnvironment::strategy_mismatch_scenario(),
838                    "exploration" => InternalDiagnosisEnvironment::exploration_depth_scenario(),
839                    "complex" => InternalDiagnosisEnvironment::complex_scenario(seed),
840                    _ => InternalDiagnosisEnvironment::routing_error_scenario(),
841                };
842                Some(Box::new(env))
843            }
844            "deep_search" => {
845                // TODO: Support multiple deep_search scenarios
846                let _scenario_name = env_params
847                    .get("scenario")
848                    .and_then(|v| v.as_str())
849                    .unwrap_or("tech_question");
850                let env = DeepSearchEnvironment::tech_question_scenario();
851                Some(Box::new(env))
852            }
853            // Real filesystem environment (Bash, Read, Write, Grep, Glob)
854            "default" | "realworld" => {
855                use swarm_engine_core::environment::DefaultEnvironment;
856                let working_dir = env_params
857                    .get("working_dir")
858                    .and_then(|v| v.as_str())
859                    .map(std::path::PathBuf::from);
860                let env = if let Some(dir) = working_dir {
861                    DefaultEnvironment::with_working_dir(dir)
862                } else {
863                    DefaultEnvironment::new()
864                };
865                Some(Box::new(env))
866            }
867            _ => None, // Unknown env_type - no Environment inserted
868        };
869
870        if let Some(env) = env_box {
871            extensions.insert(env);
872        }
873
874        // Insert DependencyGraph if specified (explicit or from scenario)
875        let graph = self.dependency_graph.clone().or_else(|| {
876            self.scenario.dependency_graph.as_ref().and_then(|cfg| {
877                let action_names = self.scenario.actions.action_names();
878                cfg.to_core_graph(&action_names)
879            })
880        });
881        if let Some(g) = graph {
882            extensions.insert(g);
883        }
884
885        // Note: prior_snapshot is inserted by LearnableSwarmBuilder.with_learning_store()
886
887        extensions
888    }
889
890    /// Evaluate scenario conditions to determine success/failure
891    ///
892    /// Returns (success, termination_reason)
893    fn evaluate_conditions(
894        &self,
895        metrics: &RunMetrics,
896        environment_done: bool,
897        timed_out: bool,
898    ) -> (bool, TerminationReason) {
899        let conditions = &self.scenario.conditions;
900
901        // 1. Check failure conditions first (any match = fail)
902        for condition in &conditions.failure {
903            if let Some(actual) =
904                self.get_metric_value(&condition.metric, metrics, environment_done)
905            {
906                if condition.evaluate(&actual) {
907                    return (false, TerminationReason::Failure);
908                }
909            }
910        }
911
912        // 2. Handle timeout
913        if timed_out {
914            return match conditions.on_timeout {
915                TimeoutBehavior::Fail => (false, TerminationReason::Timeout),
916                TimeoutBehavior::PartialSuccess => {
917                    // Check if success conditions are met
918                    let success = self.check_success_conditions(metrics, environment_done);
919                    (success, TerminationReason::Timeout)
920                }
921                TimeoutBehavior::MilestoneScore => {
922                    // TODO: Implement milestone scoring
923                    (false, TerminationReason::Timeout)
924                }
925            };
926        }
927
928        // 3. Check success conditions (all must match)
929        let success = self.check_success_conditions(metrics, environment_done);
930        if success {
931            (true, TerminationReason::Success)
932        } else {
933            // Not yet successful, but no failure conditions met either
934            // This shouldn't happen if called after completion
935            (false, TerminationReason::Stopped)
936        }
937    }
938
939    /// Check if all success conditions are met
940    fn check_success_conditions(&self, metrics: &RunMetrics, environment_done: bool) -> bool {
941        let conditions = &self.scenario.conditions;
942
943        // If no success conditions defined, consider successful
944        if conditions.success.is_empty() {
945            return true;
946        }
947
948        // All conditions must pass
949        conditions.success.iter().all(|condition| {
950            self.get_metric_value(&condition.metric, metrics, environment_done)
951                .map(|actual| condition.evaluate(&actual))
952                .unwrap_or(false)
953        })
954    }
955
956    /// Get metric value by path (e.g., "environment.done", "task.success_rate")
957    fn get_metric_value(
958        &self,
959        path: &str,
960        metrics: &RunMetrics,
961        environment_done: bool,
962    ) -> Option<ConditionValue> {
963        match path {
964            // Environment metrics
965            "environment.done" => Some(ConditionValue::Bool(environment_done)),
966
967            // Task metrics
968            "task.total_ticks" | "total_ticks" => {
969                Some(ConditionValue::Integer(metrics.task.total_ticks as i64))
970            }
971            "task.success_rate" | "success_rate" => {
972                Some(ConditionValue::Float(metrics.task.success_rate))
973            }
974            "task.total_actions" | "total_actions" => {
975                Some(ConditionValue::Integer(metrics.task.total_actions as i64))
976            }
977            "task.successful_actions" | "successful_actions" => Some(ConditionValue::Integer(
978                metrics.task.successful_actions as i64,
979            )),
980
981            // Performance metrics
982            "performance.llm_error_rate" | "llm_error_rate" => {
983                Some(ConditionValue::Float(metrics.performance.llm_error_rate))
984            }
985            "performance.llm_invocations" | "llm_invocations" => Some(ConditionValue::Integer(
986                metrics.performance.llm_invocations as i64,
987            )),
988
989            // Coordination metrics
990            "coordination.manager_activations" | "manager_activations" => Some(
991                ConditionValue::Integer(metrics.coordination.manager_activations as i64),
992            ),
993
994            // Error metrics (from failed_actions)
995            "errors.count" => {
996                let failed = metrics
997                    .task
998                    .total_actions
999                    .saturating_sub(metrics.task.successful_actions);
1000                Some(ConditionValue::Integer(failed as i64))
1001            }
1002
1003            // Unknown metric
1004            _ => None,
1005        }
1006    }
1007}
1008
1009// Wrapper to convert Box<dyn ManagerAgent> to impl ManagerAgent
1010struct DynManagerWrapper(Box<dyn ManagerAgent>);
1011
1012impl ManagerAgent for DynManagerWrapper {
1013    fn prepare(
1014        &self,
1015        context: &swarm_engine_core::agent::TaskContext,
1016    ) -> swarm_engine_core::agent::BatchDecisionRequest {
1017        self.0.prepare(context)
1018    }
1019
1020    fn finalize(
1021        &self,
1022        context: &swarm_engine_core::agent::TaskContext,
1023        responses: Vec<(
1024            swarm_engine_core::types::WorkerId,
1025            swarm_engine_core::agent::DecisionResponse,
1026        )>,
1027    ) -> swarm_engine_core::agent::ManagementDecision {
1028        self.0.finalize(context, responses)
1029    }
1030
1031    fn id(&self) -> swarm_engine_core::agent::ManagerId {
1032        self.0.id()
1033    }
1034
1035    fn name(&self) -> &str {
1036        self.0.name()
1037    }
1038}
1039
1040// Wrapper to convert Box<dyn BatchInvoker> to impl BatchInvoker
1041struct DynBatchInvokerWrapper(Box<dyn BatchInvoker>);
1042
1043impl BatchInvoker for DynBatchInvokerWrapper {
1044    fn invoke(
1045        &self,
1046        request: swarm_engine_core::agent::BatchDecisionRequest,
1047        extensions: &swarm_engine_core::extensions::Extensions,
1048    ) -> swarm_engine_core::agent::BatchInvokeResult {
1049        self.0.invoke(request, extensions)
1050    }
1051
1052    fn plan_dependencies(
1053        &self,
1054        task: &str,
1055        actions: &[ActionDef],
1056    ) -> Option<swarm_engine_core::exploration::DependencyGraph> {
1057        self.0.plan_dependencies(task, actions)
1058    }
1059
1060    fn name(&self) -> &str {
1061        self.0.name()
1062    }
1063}
1064
1065// Wrapper to convert Box<dyn OperatorProvider<NodeRules>> to impl OperatorProvider<NodeRules>
1066struct DynOperatorProviderWrapper(Box<dyn OperatorProvider<NodeRules>>);
1067
1068impl OperatorProvider<NodeRules> for DynOperatorProviderWrapper {
1069    fn provide(
1070        &self,
1071        rules: NodeRules,
1072        context: Option<
1073            &swarm_engine_core::exploration::ProviderContext<
1074                '_,
1075                swarm_engine_core::exploration::ActionNodeData,
1076                String,
1077                swarm_engine_core::exploration::MapNodeState,
1078            >,
1079        >,
1080    ) -> swarm_engine_core::exploration::ConfigurableOperator<NodeRules> {
1081        self.0.provide(rules, context)
1082    }
1083
1084    fn reevaluate(
1085        &self,
1086        operator: &mut swarm_engine_core::exploration::ConfigurableOperator<NodeRules>,
1087        ctx: &swarm_engine_core::exploration::ProviderContext<
1088            '_,
1089            swarm_engine_core::exploration::ActionNodeData,
1090            String,
1091            swarm_engine_core::exploration::MapNodeState,
1092        >,
1093    ) {
1094        self.0.reevaluate(operator, ctx)
1095    }
1096
1097    fn name(&self) -> &str {
1098        self.0.name()
1099    }
1100}
swarm_engine_eval/runner.rs

swarm_engine_eval/
runner.rs