Skip to main content

zeph_experiments/
engine.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! Experiment engine — core async loop for autonomous parameter tuning.
5//!
6//! [`ExperimentEngine`] orchestrates baseline evaluation, variation generation,
7//! candidate scoring, acceptance decisions, and optional SQLite persistence.
8//! Cancellation is supported via [`tokio_util::sync::CancellationToken`].
9//!
10//! # Loop Summary
11//!
12//! 1. Evaluate the baseline configuration once to establish `initial_baseline_score`.
13//! 2. Ask the [`VariationGenerator`] for the next untested variation.
14//! 3. Clone the subject provider with generation overrides from the candidate snapshot.
15//! 4. Evaluate the candidate; accept if `delta >= config.min_improvement`.
16//! 5. On acceptance, update the progressive baseline (greedy hill-climbing).
17//! 6. Optionally persist the result to SQLite.
18//! 7. Repeat until: max experiments, wall-time limit, search exhaustion, or cancellation.
19//!
20//! [`VariationGenerator`]: crate::VariationGenerator
21
22use std::collections::HashSet;
23use std::sync::Arc;
24use std::time::Instant;
25
26use serde::{Deserialize, Serialize};
27use tokio_util::sync::CancellationToken;
28use zeph_common::SessionId;
29use zeph_llm::any::AnyProvider;
30use zeph_memory::semantic::SemanticMemory;
31use zeph_memory::store::experiments::NewExperimentResult;
32
33use super::error::EvalError;
34use super::evaluator::Evaluator;
35use super::generator::VariationGenerator;
36use super::snapshot::ConfigSnapshot;
37use super::types::{ExperimentResult, ExperimentSource, Variation};
38use zeph_config::ExperimentConfig;
39
/// Final report produced by [`ExperimentEngine::run`].
///
/// `total_improvement` is `final_score - baseline_score`. It is `0.0` when the
/// session was cancelled before the baseline evaluation completed, and can be
/// negative only when `min_improvement` is configured below zero (greedy
/// acceptance never lowers the progressive best score otherwise). A `NaN`
/// baseline evaluation never reaches a report: it causes an early
/// [`EvalError`] instead.
///
/// [`EvalError`]: crate::EvalError
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExperimentSessionReport {
    /// Session ID generated at [`ExperimentEngine`] construction time.
    pub session_id: SessionId,
    /// All variation results recorded in this session (both accepted and rejected).
    pub results: Vec<ExperimentResult>,
    /// The best-known config snapshot at session end (may equal the initial baseline).
    pub best_config: ConfigSnapshot,
    /// Baseline mean score captured before the variation loop started.
    ///
    /// `NaN` when the session was cancelled before the baseline evaluation completed.
    pub baseline_score: f64,
    /// Mean score of the best-found configuration at session end.
    ///
    /// `NaN` when the session was cancelled before the baseline evaluation completed.
    pub final_score: f64,
    /// `final_score - baseline_score` (positive means improvement over baseline);
    /// `0.0` when cancelled before the baseline evaluation completed.
    pub total_improvement: f64,
    /// Total wall-clock time for the session in milliseconds.
    pub wall_time_ms: u64,
    /// `true` when the session was stopped via [`ExperimentEngine::stop`] or a
    /// [`CancellationToken`] before the variation loop completed naturally.
    ///
    /// [`CancellationToken`]: tokio_util::sync::CancellationToken
    pub cancelled: bool,
}
73
/// Autonomous parameter-tuning engine.
///
/// The engine evaluates a baseline configuration, then generates and tests
/// parameter variations one at a time. Accepted variations update the progressive
/// baseline (greedy hill-climbing). The loop terminates on budget exhaustion,
/// search-space exhaustion, wall-time limit, or cancellation.
///
/// # Storage
///
/// When `memory` is `Some`, each result is persisted to `SQLite` via
/// [`SemanticMemory::sqlite`]. When `None`, results are kept only in the
/// in-memory `results` vec of the final report.
///
/// # Budget ownership
///
/// The `Evaluator` is passed pre-built by the caller. The caller is responsible
/// for constructing it with the desired `budget_tokens` (typically
/// `config.eval_budget_tokens`). The `eval_budget_tokens` field in
/// [`ExperimentConfig`] is a hint for the caller — the engine itself does not
/// construct the evaluator.
pub struct ExperimentEngine {
    /// Scores the baseline and every candidate configuration.
    evaluator: Evaluator,
    /// Proposes the next untested variation relative to the current best snapshot.
    generator: Box<dyn VariationGenerator>,
    /// Provider under test; cloned with generation overrides for each candidate.
    subject: Arc<AnyProvider>,
    /// Caller-supplied starting configuration (the initial progressive baseline).
    baseline: ConfigSnapshot,
    /// Budgets and acceptance threshold (`max_experiments`, `max_wall_time_secs`,
    /// `min_improvement`).
    config: ExperimentConfig,
    /// Optional `SQLite`-backed persistence; `None` keeps results in memory only.
    memory: Option<Arc<SemanticMemory>>,
    /// Fresh UUID generated in [`ExperimentEngine::new`].
    session_id: SessionId,
    /// Cooperative-shutdown token; see [`ExperimentEngine::stop`].
    cancel: CancellationToken,
    /// Origin of this run; defaults to [`ExperimentSource::Manual`].
    source: ExperimentSource,
}
105
/// Maximum number of consecutive NaN-scored evaluations before the loop breaks.
/// Prevents unbounded spinning when the evaluator consistently returns degenerate reports.
/// Enforced in `run_loop`; the counter resets to zero after any non-NaN candidate score.
const MAX_CONSECUTIVE_NAN: u32 = 3;
109
impl ExperimentEngine {
    /// Create a new `ExperimentEngine`.
    ///
    /// A fresh UUID session ID is generated at construction time.
    /// The `evaluator` should already be configured with the desired token budget
    /// (typically `config.eval_budget_tokens`).
    ///
    /// # Contract
    ///
    /// The caller must ensure `config` is valid before constructing the engine.
    /// Call [`ExperimentConfig::validate`] during bootstrap — passing invalid config
    /// (e.g., `max_experiments=0`, `max_wall_time_secs=0`) results in unspecified
    /// loop behaviour (immediate exit or no effective budget enforcement).
    pub fn new(
        evaluator: Evaluator,
        generator: Box<dyn VariationGenerator>,
        subject: Arc<AnyProvider>,
        baseline: ConfigSnapshot,
        config: ExperimentConfig,
        memory: Option<Arc<SemanticMemory>>,
    ) -> Self {
        Self {
            evaluator,
            generator,
            subject,
            baseline,
            config,
            memory,
            session_id: SessionId::generate(),
            cancel: CancellationToken::new(),
            source: ExperimentSource::Manual,
        }
    }

    /// Set the [`ExperimentSource`] for this session.
    ///
    /// Defaults to [`ExperimentSource::Manual`]. Use [`ExperimentSource::Scheduled`]
    /// for runs triggered by the scheduler.
    #[must_use]
    pub fn with_source(mut self, source: ExperimentSource) -> Self {
        self.source = source;
        self
    }

    /// Return a clone of the internal [`CancellationToken`].
    ///
    /// External callers (CLI, TUI, scheduler) can hold a token handle and call
    /// `.cancel()` to trigger graceful shutdown. See also [`Self::stop`].
    #[must_use]
    pub fn cancel_token(&self) -> CancellationToken {
        self.cancel.clone()
    }

    /// Stop the engine by cancelling the internal [`CancellationToken`].
    ///
    /// The current evaluation call will complete; the loop exits after it returns.
    pub fn stop(&self) {
        self.cancel.cancel();
    }

    /// Run the experiment loop and return a session report.
    ///
    /// The loop:
    /// 1. Evaluates the baseline once to obtain `initial_baseline_score`.
    /// 2. Generates variations via the [`VariationGenerator`].
    /// 3. Evaluates each variation with a clone of `subject` patched with generation overrides
    ///    derived from the candidate `ConfigSnapshot` via `AnyProvider::with_generation_overrides`.
    /// 4. Accepts the variation if `delta >= config.min_improvement`.
    /// 5. On acceptance, updates the progressive baseline (greedy hill-climbing).
    ///    **Known limitation (S1):** single-sample acceptance has no statistical
    ///    confidence check. Noise in the evaluator can cause gradual score drift.
    ///    Phase 5 should add repeated trials or a confidence margin derived from
    ///    per-case variance before promoting a variation.
    /// 6. Optionally persists results to `SQLite` when `memory` is `Some`.
    /// 7. Breaks on: max experiments, wall-time, search exhaustion, or cancellation.
    ///
    /// # Errors
    ///
    /// Returns [`EvalError`] if the baseline evaluation or any subject LLM call fails.
    /// `SQLite` persistence failures are returned as [`EvalError::Storage`].
    pub async fn run(&mut self) -> Result<ExperimentSessionReport, EvalError> {
        let start = Instant::now();
        // Progressive-best tracker, seeded from the caller-supplied baseline snapshot.
        let best_snapshot = self.baseline.clone();

        // Step 0: evaluate baseline once, with cancellation support.
        // Issue #4: wrapped in select! so a cancel during a slow baseline evaluation is honoured.
        // `biased` makes select! poll the cancellation branch first, so a token that
        // was cancelled before run() wins even against an instantly-ready evaluation.
        let baseline_report = tokio::select! {
            biased;
            () = self.cancel.cancelled() => {
                tracing::info!(session_id = %self.session_id, "cancelled before baseline");
                #[allow(clippy::cast_possible_truncation)]
                return Ok(ExperimentSessionReport {
                    session_id: self.session_id.clone(),
                    results: vec![],
                    best_config: best_snapshot,
                    // No baseline was measured: scores are NaN, improvement is zero.
                    baseline_score: f64::NAN,
                    final_score: f64::NAN,
                    total_improvement: 0.0,
                    wall_time_ms: start.elapsed().as_millis() as u64,
                    cancelled: true,
                });
            }
            report = self.evaluator.evaluate(&self.subject) => report?,
        };

        // Bug #3: if baseline produces NaN, there is no meaningful anchor — fail fast.
        let initial_baseline_score = baseline_report.mean_score;
        if initial_baseline_score.is_nan() {
            return Err(EvalError::Storage(
                "baseline evaluation produced NaN mean score; \
                 check evaluator budget and judge responses"
                    .into(),
            ));
        }
        tracing::info!(
            session_id = %self.session_id,
            baseline_score = initial_baseline_score,
            "experiment session started"
        );
        self.run_loop(start, initial_baseline_score, best_snapshot)
            .await
    }

    /// Inner experiment loop — runs after a successful baseline evaluation.
    ///
    /// # Errors
    ///
    /// Returns [`EvalError`] if any LLM call or `SQLite` persist fails.
    #[allow(clippy::too_many_lines)] // experiment loop with inherent complexity: variation→evaluate→compare
    async fn run_loop(
        &mut self,
        start: Instant,
        initial_baseline_score: f64,
        mut best_snapshot: ConfigSnapshot,
    ) -> Result<ExperimentSessionReport, EvalError> {
        let wall_limit = std::time::Duration::from_secs(self.config.max_wall_time_secs);
        let mut results: Vec<ExperimentResult> = Vec::new();
        // Variations already tested this session; passed to the generator for dedupe.
        let mut visited: HashSet<Variation> = HashSet::new();
        // `counter` doubles as a synthetic result row ID when persistence is disabled.
        let (mut best_score, mut counter, mut consecutive_nan) =
            (initial_baseline_score, 0i64, 0u32);

        'main: loop {
            if results.len() >= self.config.max_experiments as usize {
                tracing::info!(session_id = %self.session_id, "budget exhausted");
                break;
            }
            if start.elapsed() >= wall_limit {
                tracing::info!(session_id = %self.session_id, "wall-time limit reached");
                break;
            }
            let Some(variation) = self.generator.next(&best_snapshot, &visited) else {
                tracing::info!(session_id = %self.session_id, "search space exhausted");
                break;
            };
            visited.insert(variation.clone());
            let candidate_snapshot = best_snapshot.apply(&variation);
            // Clone the subject and patch it with the candidate's generation overrides.
            let patched = (*self.subject)
                .clone()
                .with_generation_overrides(candidate_snapshot.to_generation_overrides());
            let candidate_report = tokio::select! {
                biased;
                () = self.cancel.cancelled() => {
                    tracing::info!(session_id = %self.session_id, "experiment cancelled");
                    break 'main;
                }
                report = self.evaluator.evaluate(&patched) => report?,
            };
            if candidate_report.mean_score.is_nan() {
                // NaN candidates are skipped without a result row, so they do not
                // consume the `max_experiments` budget — only the consecutive-NaN cap.
                consecutive_nan += 1;
                tracing::warn!(
                    session_id = %self.session_id, param = %variation.parameter,
                    is_partial = candidate_report.is_partial, consecutive_nan,
                    "NaN mean score — skipping variation"
                );
                if consecutive_nan >= MAX_CONSECUTIVE_NAN {
                    tracing::warn!(session_id = %self.session_id, "consecutive NaN cap reached");
                    break;
                }
                continue;
            }
            consecutive_nan = 0;
            let candidate_score = candidate_report.mean_score;
            let delta = candidate_score - best_score;
            let accepted = delta >= self.config.min_improvement;
            let result_id = self
                .persist_result(
                    &variation,
                    best_score,
                    candidate_score,
                    delta,
                    accepted,
                    candidate_report.p50_latency_ms,
                    candidate_report.total_tokens,
                    counter,
                )
                .await?;
            counter += 1;
            // Capture the score the candidate was compared against, before any
            // acceptance below mutates `best_score` — stored in the result row.
            let pre_accept_baseline = best_score;
            self.log_outcome(&variation, delta, accepted, best_score);
            if accepted {
                best_snapshot = candidate_snapshot;
                best_score = candidate_score;
            }
            results.push(ExperimentResult {
                id: result_id,
                session_id: self.session_id.clone(),
                variation,
                baseline_score: pre_accept_baseline,
                candidate_score,
                delta,
                latency_ms: candidate_report.p50_latency_ms,
                tokens_used: candidate_report.total_tokens,
                accepted,
                source: self.source.clone(),
                created_at: chrono_now_utc(),
            });
        }

        #[allow(clippy::cast_possible_truncation)]
        let wall_time_ms = start.elapsed().as_millis() as u64;
        let total_improvement = best_score - initial_baseline_score;
        tracing::info!(
            session_id = %self.session_id, total = results.len(),
            baseline_score = initial_baseline_score, final_score = best_score,
            total_improvement, wall_time_ms, cancelled = self.cancel.is_cancelled(),
            "experiment session complete"
        );
        Ok(ExperimentSessionReport {
            session_id: self.session_id.clone(),
            results,
            best_config: best_snapshot,
            baseline_score: initial_baseline_score,
            final_score: best_score,
            total_improvement,
            wall_time_ms,
            cancelled: self.cancel.is_cancelled(),
        })
    }

    /// Persist a single experiment result to `SQLite` when memory is configured.
    ///
    /// Returns the row ID from `SQLite`, or a synthetic monotonic counter when
    /// persistence is disabled (`memory` is `None`).
    ///
    /// # Errors
    ///
    /// Returns [`EvalError::Storage`] if the `SQLite` insert fails.
    #[allow(clippy::too_many_arguments)]
    async fn persist_result(
        &self,
        variation: &Variation,
        baseline_score: f64,
        candidate_score: f64,
        delta: f64,
        accepted: bool,
        p50_latency_ms: u64,
        total_tokens: u64,
        counter: i64,
    ) -> Result<i64, EvalError> {
        let Some(mem) = &self.memory else {
            // Persistence disabled: hand back the loop counter as a synthetic ID.
            return Ok(counter);
        };
        let value_json = serde_json::to_string(&variation.value)
            .map_err(|e| EvalError::Storage(e.to_string()))?;
        #[allow(clippy::cast_possible_wrap)]
        let new_result = NewExperimentResult {
            session_id: self.session_id.as_str(),
            parameter: variation.parameter.as_str(),
            value_json: &value_json,
            baseline_score,
            candidate_score,
            delta,
            latency_ms: p50_latency_ms as i64,
            tokens_used: total_tokens as i64,
            accepted,
            source: self.source.as_str(),
        };
        mem.sqlite()
            .insert_experiment_result(&new_result)
            .await
            .map_err(|e: zeph_memory::error::MemoryError| EvalError::Storage(e.to_string()))
    }

    /// Log the accept/reject decision for a single variation at `info` level.
    /// `new_score` is the progressive baseline at the time of the decision.
    fn log_outcome(&self, variation: &Variation, delta: f64, accepted: bool, new_score: f64) {
        if accepted {
            tracing::info!(
                session_id = %self.session_id,
                param = %variation.parameter,
                value = %variation.value,
                delta,
                new_best_score = new_score,
                "variation accepted — new baseline"
            );
        } else {
            tracing::info!(
                session_id = %self.session_id,
                param = %variation.parameter,
                value = %variation.value,
                delta,
                "variation rejected"
            );
        }
    }
}
414
415/// Return a UTC timestamp string in `YYYY-MM-DD HH:MM:SS` format.
416#[allow(clippy::many_single_char_names)]
417fn chrono_now_utc() -> String {
418    use std::time::{SystemTime, UNIX_EPOCH};
419    let secs = SystemTime::now()
420        .duration_since(UNIX_EPOCH)
421        .unwrap_or_default()
422        .as_secs();
423    // Simple UTC formatter — no external date dependency.
424    let s = secs % 60;
425    let m = (secs / 60) % 60;
426    let h = (secs / 3600) % 24;
427    let days = secs / 86400;
428    // Days since 1970-01-01
429    let (y, mo, d) = days_to_ymd(days);
430    format!("{y:04}-{mo:02}-{d:02} {h:02}:{m:02}:{s:02}")
431}
432
/// Convert days since Unix epoch to (year, month, day).
fn days_to_ymd(mut days: u64) -> (u64, u64, u64) {
    // Gregorian calendar: walk forward from 1970, consuming whole years.
    let mut year = 1970u64;
    let mut year_len = if is_leap(year) { 366 } else { 365 };
    while days >= year_len {
        days -= year_len;
        year += 1;
        year_len = if is_leap(year) { 366 } else { 365 };
    }
    // Consume whole months within the final year.
    let feb: u64 = if is_leap(year) { 29 } else { 28 };
    let month_days: [u64; 12] = [31, feb, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31];
    let mut month = 1u64;
    for len in month_days {
        if days < len {
            break;
        }
        days -= len;
        month += 1;
    }
    // Remaining days are zero-based within the month; day-of-month is one-based.
    (year, month, days + 1)
}

/// Gregorian leap-year rule: divisible by 4, except centuries, except
/// centuries divisible by 400.
fn is_leap(y: u64) -> bool {
    if y % 400 == 0 {
        true
    } else if y % 100 == 0 {
        false
    } else {
        y % 4 == 0
    }
}
466
467#[cfg(test)]
468mod tests {
469    #![allow(clippy::doc_markdown)]
470
471    use super::*;
472    use crate::benchmark::{BenchmarkCase, BenchmarkSet};
473    use crate::evaluator::Evaluator;
474    use crate::generator::VariationGenerator;
475    use crate::snapshot::ConfigSnapshot;
476    use crate::types::{ParameterKind, Variation, VariationValue};
477    use ordered_float::OrderedFloat;
478    use std::sync::Arc;
479    use zeph_config::ExperimentConfig;
480
481    fn make_benchmark() -> BenchmarkSet {
482        BenchmarkSet {
483            cases: vec![BenchmarkCase {
484                prompt: "What is 2+2?".into(),
485                context: None,
486                reference: None,
487                tags: None,
488            }],
489        }
490    }
491
492    fn default_config() -> ExperimentConfig {
493        ExperimentConfig {
494            max_experiments: 10,
495            max_wall_time_secs: 3600,
496            min_improvement: 0.0,
497            ..Default::default()
498        }
499    }
500
    /// Generates exactly N variations and then exhausts.
    struct NVariationGenerator {
        /// Pre-built temperature variations, handed out in order.
        variations: Vec<Variation>,
        /// Index of the next variation to emit.
        pos: usize,
    }
506
507    impl NVariationGenerator {
508        fn new(n: usize) -> Self {
509            let variations = (0..n)
510                .map(|i| Variation {
511                    parameter: ParameterKind::Temperature,
512                    #[allow(clippy::cast_precision_loss)]
513                    value: VariationValue::Float(OrderedFloat(0.5 + i as f64 * 0.1)),
514                })
515                .collect();
516            Self { variations, pos: 0 }
517        }
518    }
519
520    impl VariationGenerator for NVariationGenerator {
521        fn next(
522            &mut self,
523            _baseline: &ConfigSnapshot,
524            visited: &HashSet<Variation>,
525        ) -> Option<Variation> {
526            while self.pos < self.variations.len() {
527                let v = self.variations[self.pos].clone();
528                self.pos += 1;
529                if !visited.contains(&v) {
530                    return Some(v);
531                }
532            }
533            None
534        }
535
536        fn name(&self) -> &'static str {
537            "n_variation"
538        }
539    }
540
541    #[cfg(test)]
542    fn make_subject_mock(n_responses: usize) -> zeph_llm::any::AnyProvider {
543        use zeph_llm::any::AnyProvider;
544        use zeph_llm::mock::MockProvider;
545
546        // Each evaluate() call runs 1 subject call + 1 judge call per benchmark case.
547        // With 1 case: 1 subject + 1 judge response per evaluate() invocation.
548        // We need n_responses pairs (subject + judge) for n variations + 1 baseline.
549        let responses: Vec<String> = (0..n_responses).map(|_| "Four".to_string()).collect();
550        AnyProvider::Mock(MockProvider::with_responses(responses))
551    }
552
553    #[cfg(test)]
554    fn make_judge_mock(n_responses: usize) -> zeph_llm::any::AnyProvider {
555        use zeph_llm::any::AnyProvider;
556        use zeph_llm::mock::MockProvider;
557
558        let responses: Vec<String> = (0..n_responses)
559            .map(|_| r#"{"score": 8.0, "reason": "correct"}"#.to_string())
560            .collect();
561        AnyProvider::Mock(MockProvider::with_responses(responses))
562    }
563
564    #[cfg(test)]
565    #[tokio::test]
566    async fn engine_completes_with_no_accepted_variations() {
567        // min_improvement very high so nothing is accepted.
568        let config = ExperimentConfig {
569            max_experiments: 10,
570            max_wall_time_secs: 3600,
571            min_improvement: 100.0,
572            ..Default::default()
573        };
574        // 1 variation + 1 baseline = 2 evaluate() calls (2 subject + 2 judge responses).
575        let subject = make_subject_mock(2);
576        let judge = make_judge_mock(2);
577        let evaluator = Evaluator::new(Arc::new(judge), make_benchmark(), 1_000_000).unwrap();
578
579        let mut engine = ExperimentEngine::new(
580            evaluator,
581            Box::new(NVariationGenerator::new(1)),
582            Arc::new(subject),
583            ConfigSnapshot::default(),
584            config,
585            None,
586        );
587
588        let report = engine.run().await.unwrap();
589        assert_eq!(report.results.len(), 1);
590        assert!(!report.results[0].accepted);
591        assert!(!report.session_id.is_empty());
592        assert!(!report.cancelled);
593    }
594
595    #[cfg(test)]
596    #[tokio::test]
597    async fn engine_respects_max_experiments() {
598        let config = ExperimentConfig {
599            max_experiments: 3,
600            max_wall_time_secs: 3600,
601            min_improvement: 0.0,
602            ..Default::default()
603        };
604        // 5 variations available but max_experiments=3.
605        // 1 baseline + 3 candidate evaluate() calls = 4 calls, each needing 1 subject + 1 judge.
606        let subject = make_subject_mock(4);
607        let judge = make_judge_mock(4);
608        let evaluator = Evaluator::new(Arc::new(judge), make_benchmark(), 1_000_000).unwrap();
609
610        let mut engine = ExperimentEngine::new(
611            evaluator,
612            Box::new(NVariationGenerator::new(5)),
613            Arc::new(subject),
614            ConfigSnapshot::default(),
615            config,
616            None,
617        );
618
619        let report = engine.run().await.unwrap();
620        assert_eq!(report.results.len(), 3);
621        assert!(!report.cancelled);
622    }
623
624    #[cfg(test)]
625    #[tokio::test]
626    async fn engine_cancellation_before_baseline() {
627        // Pre-cancel: cancel token fires during baseline evaluation select!.
628        let config = ExperimentConfig {
629            max_experiments: 100,
630            max_wall_time_secs: 3600,
631            min_improvement: 0.0,
632            ..Default::default()
633        };
634        let subject = make_subject_mock(2);
635        let judge = make_judge_mock(2);
636        let evaluator = Evaluator::new(Arc::new(judge), make_benchmark(), 1_000_000).unwrap();
637
638        let mut engine = ExperimentEngine::new(
639            evaluator,
640            Box::new(NVariationGenerator::new(100)),
641            Arc::new(subject),
642            ConfigSnapshot::default(),
643            config,
644            None,
645        );
646        engine.stop(); // cancel before any evaluation
647        let report = engine.run().await.unwrap();
648        assert!(report.cancelled);
649        assert!(report.results.is_empty());
650    }
651
652    #[cfg(test)]
653    #[tokio::test]
654    async fn engine_cancellation_stops_loop() {
655        // Verify the loop-level select! path: cancel token pre-fired, baseline completes
656        // (because biased baseline select! checks cancel FIRST — fires immediately), then
657        // run() returns early. Since MockProvider is instantaneous, we test the cancel
658        // token semantics via stop() called between construction and run().
659        //
660        // NOTE: engine_cancellation_before_baseline covers the biased baseline path.
661        // This test verifies that cancelling after construction but before run() sets
662        // cancelled=true in the report regardless of results count.
663        let config = ExperimentConfig {
664            max_experiments: 10,
665            max_wall_time_secs: 3600,
666            min_improvement: 0.0,
667            ..Default::default()
668        };
669        let subject = make_subject_mock(2);
670        let judge = make_judge_mock(2);
671        let evaluator = Evaluator::new(Arc::new(judge), make_benchmark(), 1_000_000).unwrap();
672
673        let mut engine = ExperimentEngine::new(
674            evaluator,
675            Box::new(NVariationGenerator::new(10)),
676            Arc::new(subject),
677            ConfigSnapshot::default(),
678            config,
679            None,
680        );
681
682        // Verify cancel_token() gives an independent handle that controls the same token.
683        let external_token = engine.cancel_token();
684        assert!(!external_token.is_cancelled());
685        engine.stop();
686        assert!(
687            external_token.is_cancelled(),
688            "cancel_token() must share the same token"
689        );
690
691        let report = engine.run().await.unwrap();
692        assert!(report.cancelled);
693    }
694
695    #[cfg(test)]
696    #[tokio::test]
697    async fn engine_progressive_baseline_updates() {
698        // One variation applied via NVariationGenerator generates temperature=0.5.
699        // min_improvement=0.0 so it is accepted, updating best_config.
700        let config = ExperimentConfig {
701            max_experiments: 1,
702            max_wall_time_secs: 3600,
703            min_improvement: 0.0,
704            ..Default::default()
705        };
706        // 1 baseline + 1 candidate = 2 evaluate() calls.
707        let subject = make_subject_mock(2);
708        let judge = make_judge_mock(2);
709        let evaluator = Evaluator::new(Arc::new(judge), make_benchmark(), 1_000_000).unwrap();
710
711        let initial_baseline = ConfigSnapshot::default();
712        let mut engine = ExperimentEngine::new(
713            evaluator,
714            Box::new(NVariationGenerator::new(1)),
715            Arc::new(subject),
716            initial_baseline.clone(),
717            config,
718            None,
719        );
720
721        let report = engine.run().await.unwrap();
722        assert_eq!(report.results.len(), 1);
723        assert!(report.results[0].accepted, "variation should be accepted");
724        // best_config should differ from the initial baseline (temperature changed to 0.5).
725        assert!(
726            (report.best_config.temperature - initial_baseline.temperature).abs() > 1e-9,
727            "best_config.temperature should have changed after accepted variation"
728        );
729        assert!(!report.baseline_score.is_nan());
730        assert!(!report.final_score.is_nan());
731        // Bug #1 regression: baseline_score in result must be the PRE-acceptance score.
732        assert!(
733            (report.results[0].baseline_score - report.baseline_score).abs() < 1e-9,
734            "result.baseline_score must equal initial baseline_score (pre-acceptance)"
735        );
736    }
737
738    #[cfg(test)]
739    #[tokio::test]
740    async fn engine_handles_search_space_exhaustion() {
741        let config = default_config();
742        // Generator returns None immediately (0 variations).
743        // Only the baseline evaluate() call is needed.
744        let subject = make_subject_mock(1);
745        let judge = make_judge_mock(1);
746        let evaluator = Evaluator::new(Arc::new(judge), make_benchmark(), 1_000_000).unwrap();
747
748        let mut engine = ExperimentEngine::new(
749            evaluator,
750            Box::new(NVariationGenerator::new(0)),
751            Arc::new(subject),
752            ConfigSnapshot::default(),
753            config,
754            None,
755        );
756
757        let report = engine.run().await.unwrap();
758        assert!(report.results.is_empty());
759        assert!(!report.cancelled);
760    }
761
762    #[cfg(test)]
763    #[tokio::test]
764    async fn engine_skips_nan_scores() {
765        use zeph_llm::any::AnyProvider;
766        use zeph_llm::mock::MockProvider;
767
768        // Candidate evaluations produce NaN (empty judge responses → error → 0 scored → NaN mean).
769        // Baseline uses a sufficient budget to succeed; candidate budget is tiny.
770        // We use two separate Evaluator instances: one for baseline (high budget),
771        // one for candidates (zero budget). Since ExperimentEngine uses a single Evaluator,
772        // we use a judge mock with 1 valid response (baseline) and no more (candidates fail).
773        let config = ExperimentConfig {
774            max_experiments: 5,
775            max_wall_time_secs: 3600,
776            min_improvement: 0.0,
777            ..Default::default()
778        };
779        // Subject: baseline + 3 candidate subject calls (3 NaN iterations before cap).
780        let subject = AnyProvider::Mock(MockProvider::with_responses(vec![
781            "A".into(),
782            "A".into(),
783            "A".into(),
784            "A".into(),
785        ]));
786        // Judge: 1 valid response for baseline, then errors for candidates (mock exhausted).
787        let judge = AnyProvider::Mock(MockProvider::with_responses(vec![
788            r#"{"score": 8.0, "reason": "ok"}"#.into(),
789        ]));
790        // Use large budget — judge errors (not budget) produce NaN via 0 cases scored.
791        let evaluator = Evaluator::new(Arc::new(judge), make_benchmark(), 1_000_000).unwrap();
792
793        let mut engine = ExperimentEngine::new(
794            evaluator,
795            Box::new(NVariationGenerator::new(5)),
796            Arc::new(subject),
797            ConfigSnapshot::default(),
798            config,
799            None,
800        );
801
802        // Should not panic — NaN scores are skipped; loop breaks after MAX_CONSECUTIVE_NAN.
803        let report = engine.run().await.unwrap();
804        // No variations accepted (all NaN); loop stopped at consecutive NaN limit.
805        assert!(
806            report.results.is_empty(),
807            "all NaN iterations should be skipped"
808        );
809        assert!(!report.cancelled);
810    }
811
812    #[cfg(test)]
813    #[tokio::test]
814    async fn engine_nan_baseline_returns_error() {
815        use zeph_llm::any::AnyProvider;
816        use zeph_llm::mock::MockProvider;
817
818        // Budget=0 and no judge responses → baseline evaluation returns NaN mean → engine errors.
819        let config = ExperimentConfig {
820            max_experiments: 5,
821            max_wall_time_secs: 3600,
822            min_improvement: 0.0,
823            ..Default::default()
824        };
825        // Subject responds for baseline subject call.
826        let subject = AnyProvider::Mock(MockProvider::with_responses(vec!["A".into()]));
827        // Judge has no responses — all judge calls error, 0 cases scored, NaN mean.
828        let judge = AnyProvider::Mock(MockProvider::with_responses(vec![]));
829        let evaluator = Evaluator::new(Arc::new(judge), make_benchmark(), 1_000_000).unwrap();
830
831        let mut engine = ExperimentEngine::new(
832            evaluator,
833            Box::new(NVariationGenerator::new(5)),
834            Arc::new(subject),
835            ConfigSnapshot::default(),
836            config,
837            None,
838        );
839
840        let result = engine.run().await;
841        assert!(result.is_err(), "NaN baseline should return an error");
842        let err = result.unwrap_err();
843        assert!(
844            matches!(err, EvalError::Storage(_)),
845            "expected EvalError::Storage, got: {err:?}"
846        );
847    }
848
849    #[cfg(test)]
850    #[tokio::test]
851    async fn engine_persists_results_to_sqlite() {
852        use zeph_memory::testing::mock_semantic_memory;
853
854        let memory = mock_semantic_memory().await.unwrap();
855        let config = ExperimentConfig {
856            max_experiments: 1,
857            max_wall_time_secs: 3600,
858            min_improvement: 0.0,
859            ..Default::default()
860        };
861        // 1 baseline + 1 candidate = 2 evaluate() calls.
862        let subject = make_subject_mock(2);
863        let judge = make_judge_mock(2);
864        let evaluator = Evaluator::new(Arc::new(judge), make_benchmark(), 1_000_000).unwrap();
865
866        let session_id = {
867            let mut engine = ExperimentEngine::new(
868                evaluator,
869                Box::new(NVariationGenerator::new(1)),
870                Arc::new(subject),
871                ConfigSnapshot::default(),
872                config,
873                Some(Arc::clone(&memory)),
874            );
875            engine.run().await.unwrap();
876            engine.session_id.clone()
877        };
878
879        let rows = memory
880            .sqlite()
881            .list_experiment_results(Some(&session_id), 10)
882            .await
883            .unwrap();
884        assert_eq!(rows.len(), 1, "expected one persisted result");
885        assert_eq!(rows[0].session_id, session_id.as_str());
886    }
887
888    #[test]
889    fn session_report_serde_roundtrip() {
890        let report = ExperimentSessionReport {
891            session_id: SessionId::new("test-session"),
892            results: vec![],
893            best_config: ConfigSnapshot::default(),
894            baseline_score: 7.5,
895            final_score: 8.0,
896            total_improvement: 0.5,
897            wall_time_ms: 1_234,
898            cancelled: false,
899        };
900        let json = serde_json::to_string(&report).expect("serialize");
901        let report2: ExperimentSessionReport = serde_json::from_str(&json).expect("deserialize");
902        assert_eq!(report2.session_id, report.session_id);
903        assert!((report2.baseline_score - report.baseline_score).abs() < f64::EPSILON);
904        assert!((report2.final_score - report.final_score).abs() < f64::EPSILON);
905        assert_eq!(report2.wall_time_ms, report.wall_time_ms);
906        assert!(!report2.cancelled);
907    }
908
909    #[test]
910    fn chrono_now_utc_format() {
911        let s = chrono_now_utc();
912        assert_eq!(s.len(), 19, "timestamp must be 19 chars: {s}");
913        assert_eq!(&s[4..5], "-");
914        assert_eq!(&s[7..8], "-");
915        assert_eq!(&s[10..11], " ");
916        assert_eq!(&s[13..14], ":");
917        assert_eq!(&s[16..17], ":");
918    }
919
920    /// Verify that `days_to_ymd` correctly handles a known date including a leap year.
921    /// 2024-02-29 (leap day) = 19782 days since 1970-01-01.
922    #[test]
923    fn chrono_known_timestamp_leap_year() {
924        // 2024-02-29 00:00:00 UTC = 1_709_164_800 seconds since epoch.
925        // Verified via: date -d "2024-02-29 00:00:00 UTC" +%s
926        let secs: u64 = 1_709_164_800;
927        let second = secs % 60;
928        let minute = (secs / 60) % 60;
929        let hour = (secs / 3600) % 24;
930        let days = secs / 86400;
931        let (year, month, day) = days_to_ymd(days);
932        assert_eq!(year, 2024);
933        assert_eq!(month, 2);
934        assert_eq!(day, 29);
935        assert_eq!(second, 0);
936        assert_eq!(minute, 0);
937        assert_eq!(hour, 0);
938    }
939
940    /// `ExperimentEngine` must be Send to be used with `tokio::spawn`.
941    #[test]
942    fn experiment_engine_is_send() {
943        fn assert_send<T: Send>() {}
944        // This is a compile-time check — if ExperimentEngine is not Send, this fails to compile.
945        // We cannot instantiate the engine here without providers, so we use a fn pointer trick.
946        let _ = assert_send::<ExperimentEngine>;
947    }
948
949    #[tokio::test]
950    async fn engine_with_source_scheduled_propagates_to_results() {
951        let config = ExperimentConfig {
952            max_experiments: 1,
953            max_wall_time_secs: 3600,
954            min_improvement: 0.0,
955            ..Default::default()
956        };
957        let subject = make_subject_mock(2);
958        let judge = make_judge_mock(2);
959        let evaluator = Evaluator::new(Arc::new(judge), make_benchmark(), 1_000_000).unwrap();
960
961        let mut engine = ExperimentEngine::new(
962            evaluator,
963            Box::new(NVariationGenerator::new(1)),
964            Arc::new(subject),
965            ConfigSnapshot::default(),
966            config,
967            None,
968        )
969        .with_source(ExperimentSource::Scheduled);
970
971        let report = engine.run().await.unwrap();
972        assert_eq!(report.results.len(), 1);
973        assert_eq!(
974            report.results[0].source,
975            ExperimentSource::Scheduled,
976            "with_source(Scheduled) must propagate to ExperimentResult"
977        );
978    }
979}