// zeph_experiments/engine.rs

1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! Experiment engine — core async loop for autonomous parameter tuning.
5//!
6//! [`ExperimentEngine`] orchestrates baseline evaluation, variation generation,
7//! candidate scoring, acceptance decisions, and optional `SQLite` persistence.
8//! Cancellation is supported via [`tokio_util::sync::CancellationToken`].
9
10use std::collections::HashSet;
11use std::sync::Arc;
12use std::time::Instant;
13
14use serde::{Deserialize, Serialize};
15use tokio_util::sync::CancellationToken;
16use zeph_llm::any::AnyProvider;
17use zeph_memory::semantic::SemanticMemory;
18use zeph_memory::store::experiments::NewExperimentResult;
19
20use super::error::EvalError;
21use super::evaluator::Evaluator;
22use super::generator::VariationGenerator;
23use super::snapshot::ConfigSnapshot;
24use super::types::{ExperimentResult, ExperimentSource, Variation};
25use zeph_config::ExperimentConfig;
26
/// Final report produced by [`ExperimentEngine::run`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ExperimentSessionReport {
    /// UUID identifying this experiment session.
    pub session_id: String,
    /// All experiment results recorded in this session (accepted and rejected).
    pub results: Vec<ExperimentResult>,
    /// The best-known config snapshot at session end (progressive baseline winner).
    pub best_config: ConfigSnapshot,
    /// Baseline mean score captured before the loop started.
    /// `NaN` when the session was cancelled before the baseline evaluation completed.
    pub baseline_score: f64,
    /// Final best-known mean score at session end.
    /// `NaN` when the session was cancelled before the baseline evaluation completed.
    pub final_score: f64,
    /// `final_score - baseline_score` (positive means improvement).
    pub total_improvement: f64,
    /// Wall-clock time for the full session in milliseconds.
    pub wall_time_ms: u64,
    /// Whether the session was stopped via [`ExperimentEngine::stop`].
    pub cancelled: bool,
}
47
/// Autonomous parameter-tuning engine.
///
/// The engine evaluates a baseline configuration, then generates and tests
/// parameter variations one at a time. Accepted variations update the progressive
/// baseline (greedy hill-climbing). The loop terminates on budget exhaustion,
/// search-space exhaustion, wall-time limit, or cancellation.
///
/// # Storage
///
/// When `memory` is `Some`, each result is persisted to `SQLite` via
/// [`SemanticMemory::sqlite`]. When `None`, results are kept only in the
/// in-memory `results` vec of the final report.
///
/// # Budget ownership
///
/// The `Evaluator` is passed pre-built by the caller. The caller is responsible
/// for constructing it with the desired `budget_tokens` (typically
/// `config.eval_budget_tokens`). The `eval_budget_tokens` field in
/// [`ExperimentConfig`] is a hint for the caller — the engine itself does not
/// construct the evaluator.
pub struct ExperimentEngine {
    // Scores a candidate provider against the benchmark set.
    evaluator: Evaluator,
    // Yields the next untried variation; `None` signals search-space exhaustion.
    generator: Box<dyn VariationGenerator>,
    // Provider under test; cloned and patched with overrides per candidate.
    subject: Arc<AnyProvider>,
    // Initial configuration snapshot the hill-climb starts from.
    baseline: ConfigSnapshot,
    // Loop budgets and acceptance threshold (max_experiments, min_improvement, …).
    config: ExperimentConfig,
    // Optional SQLite-backed persistence; `None` keeps results in-memory only.
    memory: Option<Arc<SemanticMemory>>,
    // Fresh UUID generated in `new`; tags every result and log line.
    session_id: String,
    // Cooperative cancellation handle — see `stop` / `cancel_token`.
    cancel: CancellationToken,
    // Provenance tag recorded with each result (defaults to Manual).
    source: ExperimentSource,
}
79
/// Maximum number of consecutive NaN-scored evaluations before the loop breaks.
/// Prevents unbounded spinning when the evaluator consistently returns degenerate
/// reports (0 cases scored → NaN mean). The counter resets on any valid score.
const MAX_CONSECUTIVE_NAN: u32 = 3;
83
84impl ExperimentEngine {
85    /// Create a new `ExperimentEngine`.
86    ///
87    /// A fresh UUID session ID is generated at construction time.
88    /// The `evaluator` should already be configured with the desired token budget
89    /// (typically `config.eval_budget_tokens`).
90    ///
91    /// # Contract
92    ///
93    /// The caller must ensure `config` is valid before constructing the engine.
94    /// Call [`ExperimentConfig::validate`] during bootstrap — passing invalid config
95    /// (e.g., `max_experiments=0`, `max_wall_time_secs=0`) results in unspecified
96    /// loop behaviour (immediate exit or no effective budget enforcement).
97    pub fn new(
98        evaluator: Evaluator,
99        generator: Box<dyn VariationGenerator>,
100        subject: Arc<AnyProvider>,
101        baseline: ConfigSnapshot,
102        config: ExperimentConfig,
103        memory: Option<Arc<SemanticMemory>>,
104    ) -> Self {
105        Self {
106            evaluator,
107            generator,
108            subject,
109            baseline,
110            config,
111            memory,
112            session_id: uuid::Uuid::new_v4().to_string(),
113            cancel: CancellationToken::new(),
114            source: ExperimentSource::Manual,
115        }
116    }
117
118    /// Set the [`ExperimentSource`] for this session.
119    ///
120    /// Defaults to [`ExperimentSource::Manual`]. Use [`ExperimentSource::Scheduled`]
121    /// for runs triggered by the scheduler.
122    #[must_use]
123    pub fn with_source(mut self, source: ExperimentSource) -> Self {
124        self.source = source;
125        self
126    }
127
128    /// Return a clone of the internal [`CancellationToken`].
129    ///
130    /// External callers (CLI, TUI, scheduler) can hold a token handle and call
131    /// `.cancel()` to trigger graceful shutdown. See also [`Self::stop`].
132    #[must_use]
133    pub fn cancel_token(&self) -> CancellationToken {
134        self.cancel.clone()
135    }
136
137    /// Stop the engine by cancelling the internal [`CancellationToken`].
138    ///
139    /// The current evaluation call will complete; the loop exits after it returns.
140    pub fn stop(&self) {
141        self.cancel.cancel();
142    }
143
144    /// Run the experiment loop and return a session report.
145    ///
146    /// The loop:
147    /// 1. Evaluates the baseline once to obtain `initial_baseline_score`.
148    /// 2. Generates variations via the [`VariationGenerator`].
149    /// 3. Evaluates each variation with a clone of `subject` patched with generation overrides
150    ///    derived from the candidate `ConfigSnapshot` via `AnyProvider::with_generation_overrides`.
151    /// 4. Accepts the variation if `delta >= config.min_improvement`.
152    /// 5. On acceptance, updates the progressive baseline (greedy hill-climbing).
153    ///    **Known limitation (S1):** single-sample acceptance has no statistical
154    ///    confidence check. Noise in the evaluator can cause gradual score drift.
155    ///    Phase 5 should add repeated trials or a confidence margin derived from
156    ///    per-case variance before promoting a variation.
157    /// 6. Optionally persists results to `SQLite` when `memory` is `Some`.
158    /// 7. Breaks on: max experiments, wall-time, search exhaustion, or cancellation.
159    ///
160    /// # Errors
161    ///
162    /// Returns [`EvalError`] if the baseline evaluation or any subject LLM call fails.
163    /// `SQLite` persistence failures are returned as [`EvalError::Storage`].
164    pub async fn run(&mut self) -> Result<ExperimentSessionReport, EvalError> {
165        let start = Instant::now();
166        let best_snapshot = self.baseline.clone();
167
168        // Step 0: evaluate baseline once, with cancellation support.
169        // Issue #4: wrapped in select! so a cancel during a slow baseline evaluation is honoured.
170        let baseline_report = tokio::select! {
171            biased;
172            () = self.cancel.cancelled() => {
173                tracing::info!(session_id = %self.session_id, "cancelled before baseline");
174                #[allow(clippy::cast_possible_truncation)]
175                return Ok(ExperimentSessionReport {
176                    session_id: self.session_id.clone(),
177                    results: vec![],
178                    best_config: best_snapshot,
179                    baseline_score: f64::NAN,
180                    final_score: f64::NAN,
181                    total_improvement: 0.0,
182                    wall_time_ms: start.elapsed().as_millis() as u64,
183                    cancelled: true,
184                });
185            }
186            report = self.evaluator.evaluate(&self.subject) => report?,
187        };
188
189        // Bug #3: if baseline produces NaN, there is no meaningful anchor — fail fast.
190        let initial_baseline_score = baseline_report.mean_score;
191        if initial_baseline_score.is_nan() {
192            return Err(EvalError::Storage(
193                "baseline evaluation produced NaN mean score; \
194                 check evaluator budget and judge responses"
195                    .into(),
196            ));
197        }
198        tracing::info!(
199            session_id = %self.session_id,
200            baseline_score = initial_baseline_score,
201            "experiment session started"
202        );
203        self.run_loop(start, initial_baseline_score, best_snapshot)
204            .await
205    }
206
207    /// Inner experiment loop — runs after a successful baseline evaluation.
208    ///
209    /// # Errors
210    ///
211    /// Returns [`EvalError`] if any LLM call or `SQLite` persist fails.
212    #[allow(clippy::too_many_lines)] // experiment loop with inherent complexity: variation→evaluate→compare
213    async fn run_loop(
214        &mut self,
215        start: Instant,
216        initial_baseline_score: f64,
217        mut best_snapshot: ConfigSnapshot,
218    ) -> Result<ExperimentSessionReport, EvalError> {
219        let wall_limit = std::time::Duration::from_secs(self.config.max_wall_time_secs);
220        let mut results: Vec<ExperimentResult> = Vec::new();
221        let mut visited: HashSet<Variation> = HashSet::new();
222        let (mut best_score, mut counter, mut consecutive_nan) =
223            (initial_baseline_score, 0i64, 0u32);
224
225        'main: loop {
226            if results.len() >= self.config.max_experiments as usize {
227                tracing::info!(session_id = %self.session_id, "budget exhausted");
228                break;
229            }
230            if start.elapsed() >= wall_limit {
231                tracing::info!(session_id = %self.session_id, "wall-time limit reached");
232                break;
233            }
234            let Some(variation) = self.generator.next(&best_snapshot, &visited) else {
235                tracing::info!(session_id = %self.session_id, "search space exhausted");
236                break;
237            };
238            visited.insert(variation.clone());
239            let candidate_snapshot = best_snapshot.apply(&variation);
240            let patched = (*self.subject)
241                .clone()
242                .with_generation_overrides(candidate_snapshot.to_generation_overrides());
243            let candidate_report = tokio::select! {
244                biased;
245                () = self.cancel.cancelled() => {
246                    tracing::info!(session_id = %self.session_id, "experiment cancelled");
247                    break 'main;
248                }
249                report = self.evaluator.evaluate(&patched) => report?,
250            };
251            if candidate_report.mean_score.is_nan() {
252                consecutive_nan += 1;
253                tracing::warn!(
254                    session_id = %self.session_id, param = %variation.parameter,
255                    is_partial = candidate_report.is_partial, consecutive_nan,
256                    "NaN mean score — skipping variation"
257                );
258                if consecutive_nan >= MAX_CONSECUTIVE_NAN {
259                    tracing::warn!(session_id = %self.session_id, "consecutive NaN cap reached");
260                    break;
261                }
262                continue;
263            }
264            consecutive_nan = 0;
265            let candidate_score = candidate_report.mean_score;
266            let delta = candidate_score - best_score;
267            let accepted = delta >= self.config.min_improvement;
268            let result_id = self
269                .persist_result(
270                    &variation,
271                    best_score,
272                    candidate_score,
273                    delta,
274                    accepted,
275                    candidate_report.p50_latency_ms,
276                    candidate_report.total_tokens,
277                    counter,
278                )
279                .await?;
280            counter += 1;
281            let pre_accept_baseline = best_score;
282            self.log_outcome(&variation, delta, accepted, best_score);
283            if accepted {
284                best_snapshot = candidate_snapshot;
285                best_score = candidate_score;
286            }
287            results.push(ExperimentResult {
288                id: result_id,
289                session_id: self.session_id.clone(),
290                variation,
291                baseline_score: pre_accept_baseline,
292                candidate_score,
293                delta,
294                latency_ms: candidate_report.p50_latency_ms,
295                tokens_used: candidate_report.total_tokens,
296                accepted,
297                source: self.source.clone(),
298                created_at: chrono_now_utc(),
299            });
300        }
301
302        #[allow(clippy::cast_possible_truncation)]
303        let wall_time_ms = start.elapsed().as_millis() as u64;
304        let total_improvement = best_score - initial_baseline_score;
305        tracing::info!(
306            session_id = %self.session_id, total = results.len(),
307            baseline_score = initial_baseline_score, final_score = best_score,
308            total_improvement, wall_time_ms, cancelled = self.cancel.is_cancelled(),
309            "experiment session complete"
310        );
311        Ok(ExperimentSessionReport {
312            session_id: self.session_id.clone(),
313            results,
314            best_config: best_snapshot,
315            baseline_score: initial_baseline_score,
316            final_score: best_score,
317            total_improvement,
318            wall_time_ms,
319            cancelled: self.cancel.is_cancelled(),
320        })
321    }
322
323    /// Persist a single experiment result to `SQLite` when memory is configured.
324    ///
325    /// Returns the row ID from `SQLite`, or a synthetic monotonic counter when
326    /// persistence is disabled (`memory` is `None`).
327    ///
328    /// # Errors
329    ///
330    /// Returns [`EvalError::Storage`] if the `SQLite` insert fails.
331    #[allow(clippy::too_many_arguments)]
332    async fn persist_result(
333        &self,
334        variation: &Variation,
335        baseline_score: f64,
336        candidate_score: f64,
337        delta: f64,
338        accepted: bool,
339        p50_latency_ms: u64,
340        total_tokens: u64,
341        counter: i64,
342    ) -> Result<i64, EvalError> {
343        let Some(mem) = &self.memory else {
344            return Ok(counter);
345        };
346        let value_json = serde_json::to_string(&variation.value)
347            .map_err(|e| EvalError::Storage(e.to_string()))?;
348        #[allow(clippy::cast_possible_wrap)]
349        let new_result = NewExperimentResult {
350            session_id: &self.session_id,
351            parameter: variation.parameter.as_str(),
352            value_json: &value_json,
353            baseline_score,
354            candidate_score,
355            delta,
356            latency_ms: p50_latency_ms as i64,
357            tokens_used: total_tokens as i64,
358            accepted,
359            source: self.source.as_str(),
360        };
361        mem.sqlite()
362            .insert_experiment_result(&new_result)
363            .await
364            .map_err(|e: zeph_memory::error::MemoryError| EvalError::Storage(e.to_string()))
365    }
366
367    fn log_outcome(&self, variation: &Variation, delta: f64, accepted: bool, new_score: f64) {
368        if accepted {
369            tracing::info!(
370                session_id = %self.session_id,
371                param = %variation.parameter,
372                value = %variation.value,
373                delta,
374                new_best_score = new_score,
375                "variation accepted — new baseline"
376            );
377        } else {
378            tracing::info!(
379                session_id = %self.session_id,
380                param = %variation.parameter,
381                value = %variation.value,
382                delta,
383                "variation rejected"
384            );
385        }
386    }
387}
388
389/// Return a UTC timestamp string in `YYYY-MM-DD HH:MM:SS` format.
390#[allow(clippy::many_single_char_names)]
391fn chrono_now_utc() -> String {
392    use std::time::{SystemTime, UNIX_EPOCH};
393    let secs = SystemTime::now()
394        .duration_since(UNIX_EPOCH)
395        .unwrap_or_default()
396        .as_secs();
397    // Simple UTC formatter — no external date dependency.
398    let s = secs % 60;
399    let m = (secs / 60) % 60;
400    let h = (secs / 3600) % 24;
401    let days = secs / 86400;
402    // Days since 1970-01-01
403    let (y, mo, d) = days_to_ymd(days);
404    format!("{y:04}-{mo:02}-{d:02} {h:02}:{m:02}:{s:02}")
405}
406
407/// Convert days since Unix epoch to (year, month, day).
408fn days_to_ymd(mut days: u64) -> (u64, u64, u64) {
409    // Gregorian calendar algorithm.
410    let mut year = 1970u64;
411    loop {
412        let leap = is_leap(year);
413        let dy = if leap { 366 } else { 365 };
414        if days < dy {
415            break;
416        }
417        days -= dy;
418        year += 1;
419    }
420    let leap = is_leap(year);
421    let month_days: [u64; 12] = if leap {
422        [31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
423    } else {
424        [31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]
425    };
426    let mut month = 1u64;
427    for md in &month_days {
428        if days < *md {
429            break;
430        }
431        days -= md;
432        month += 1;
433    }
434    (year, month, days + 1)
435}
436
/// True when `y` is a Gregorian leap year (divisible by 4, except
/// centuries, which must be divisible by 400).
fn is_leap(y: u64) -> bool {
    if y % 400 == 0 {
        return true;
    }
    y % 4 == 0 && y % 100 != 0
}
440
441#[cfg(test)]
442mod tests {
443    #![allow(clippy::doc_markdown)]
444
445    use super::*;
446    use crate::benchmark::{BenchmarkCase, BenchmarkSet};
447    use crate::evaluator::Evaluator;
448    use crate::generator::VariationGenerator;
449    use crate::snapshot::ConfigSnapshot;
450    use crate::types::{ParameterKind, Variation, VariationValue};
451    use ordered_float::OrderedFloat;
452    use std::sync::Arc;
453    use zeph_config::ExperimentConfig;
454
455    fn make_benchmark() -> BenchmarkSet {
456        BenchmarkSet {
457            cases: vec![BenchmarkCase {
458                prompt: "What is 2+2?".into(),
459                context: None,
460                reference: None,
461                tags: None,
462            }],
463        }
464    }
465
466    fn default_config() -> ExperimentConfig {
467        ExperimentConfig {
468            max_experiments: 10,
469            max_wall_time_secs: 3600,
470            min_improvement: 0.0,
471            ..Default::default()
472        }
473    }
474
475    /// Generates exactly N variations and then exhausts.
476    struct NVariationGenerator {
477        variations: Vec<Variation>,
478        pos: usize,
479    }
480
481    impl NVariationGenerator {
482        fn new(n: usize) -> Self {
483            let variations = (0..n)
484                .map(|i| Variation {
485                    parameter: ParameterKind::Temperature,
486                    #[allow(clippy::cast_precision_loss)]
487                    value: VariationValue::Float(OrderedFloat(0.5 + i as f64 * 0.1)),
488                })
489                .collect();
490            Self { variations, pos: 0 }
491        }
492    }
493
494    impl VariationGenerator for NVariationGenerator {
495        fn next(
496            &mut self,
497            _baseline: &ConfigSnapshot,
498            visited: &HashSet<Variation>,
499        ) -> Option<Variation> {
500            while self.pos < self.variations.len() {
501                let v = self.variations[self.pos].clone();
502                self.pos += 1;
503                if !visited.contains(&v) {
504                    return Some(v);
505                }
506            }
507            None
508        }
509
510        fn name(&self) -> &'static str {
511            "n_variation"
512        }
513    }
514
515    #[cfg(test)]
516    fn make_subject_mock(n_responses: usize) -> zeph_llm::any::AnyProvider {
517        use zeph_llm::any::AnyProvider;
518        use zeph_llm::mock::MockProvider;
519
520        // Each evaluate() call runs 1 subject call + 1 judge call per benchmark case.
521        // With 1 case: 1 subject + 1 judge response per evaluate() invocation.
522        // We need n_responses pairs (subject + judge) for n variations + 1 baseline.
523        let responses: Vec<String> = (0..n_responses).map(|_| "Four".to_string()).collect();
524        AnyProvider::Mock(MockProvider::with_responses(responses))
525    }
526
527    #[cfg(test)]
528    fn make_judge_mock(n_responses: usize) -> zeph_llm::any::AnyProvider {
529        use zeph_llm::any::AnyProvider;
530        use zeph_llm::mock::MockProvider;
531
532        let responses: Vec<String> = (0..n_responses)
533            .map(|_| r#"{"score": 8.0, "reason": "correct"}"#.to_string())
534            .collect();
535        AnyProvider::Mock(MockProvider::with_responses(responses))
536    }
537
538    #[cfg(test)]
539    #[tokio::test]
540    async fn engine_completes_with_no_accepted_variations() {
541        // min_improvement very high so nothing is accepted.
542        let config = ExperimentConfig {
543            max_experiments: 10,
544            max_wall_time_secs: 3600,
545            min_improvement: 100.0,
546            ..Default::default()
547        };
548        // 1 variation + 1 baseline = 2 evaluate() calls (2 subject + 2 judge responses).
549        let subject = make_subject_mock(2);
550        let judge = make_judge_mock(2);
551        let evaluator = Evaluator::new(Arc::new(judge), make_benchmark(), 1_000_000).unwrap();
552
553        let mut engine = ExperimentEngine::new(
554            evaluator,
555            Box::new(NVariationGenerator::new(1)),
556            Arc::new(subject),
557            ConfigSnapshot::default(),
558            config,
559            None,
560        );
561
562        let report = engine.run().await.unwrap();
563        assert_eq!(report.results.len(), 1);
564        assert!(!report.results[0].accepted);
565        assert!(!report.session_id.is_empty());
566        assert!(!report.cancelled);
567    }
568
569    #[cfg(test)]
570    #[tokio::test]
571    async fn engine_respects_max_experiments() {
572        let config = ExperimentConfig {
573            max_experiments: 3,
574            max_wall_time_secs: 3600,
575            min_improvement: 0.0,
576            ..Default::default()
577        };
578        // 5 variations available but max_experiments=3.
579        // 1 baseline + 3 candidate evaluate() calls = 4 calls, each needing 1 subject + 1 judge.
580        let subject = make_subject_mock(4);
581        let judge = make_judge_mock(4);
582        let evaluator = Evaluator::new(Arc::new(judge), make_benchmark(), 1_000_000).unwrap();
583
584        let mut engine = ExperimentEngine::new(
585            evaluator,
586            Box::new(NVariationGenerator::new(5)),
587            Arc::new(subject),
588            ConfigSnapshot::default(),
589            config,
590            None,
591        );
592
593        let report = engine.run().await.unwrap();
594        assert_eq!(report.results.len(), 3);
595        assert!(!report.cancelled);
596    }
597
598    #[cfg(test)]
599    #[tokio::test]
600    async fn engine_cancellation_before_baseline() {
601        // Pre-cancel: cancel token fires during baseline evaluation select!.
602        let config = ExperimentConfig {
603            max_experiments: 100,
604            max_wall_time_secs: 3600,
605            min_improvement: 0.0,
606            ..Default::default()
607        };
608        let subject = make_subject_mock(2);
609        let judge = make_judge_mock(2);
610        let evaluator = Evaluator::new(Arc::new(judge), make_benchmark(), 1_000_000).unwrap();
611
612        let mut engine = ExperimentEngine::new(
613            evaluator,
614            Box::new(NVariationGenerator::new(100)),
615            Arc::new(subject),
616            ConfigSnapshot::default(),
617            config,
618            None,
619        );
620        engine.stop(); // cancel before any evaluation
621        let report = engine.run().await.unwrap();
622        assert!(report.cancelled);
623        assert!(report.results.is_empty());
624    }
625
626    #[cfg(test)]
627    #[tokio::test]
628    async fn engine_cancellation_stops_loop() {
629        // Verify the loop-level select! path: cancel token pre-fired, baseline completes
630        // (because biased baseline select! checks cancel FIRST — fires immediately), then
631        // run() returns early. Since MockProvider is instantaneous, we test the cancel
632        // token semantics via stop() called between construction and run().
633        //
634        // NOTE: engine_cancellation_before_baseline covers the biased baseline path.
635        // This test verifies that cancelling after construction but before run() sets
636        // cancelled=true in the report regardless of results count.
637        let config = ExperimentConfig {
638            max_experiments: 10,
639            max_wall_time_secs: 3600,
640            min_improvement: 0.0,
641            ..Default::default()
642        };
643        let subject = make_subject_mock(2);
644        let judge = make_judge_mock(2);
645        let evaluator = Evaluator::new(Arc::new(judge), make_benchmark(), 1_000_000).unwrap();
646
647        let mut engine = ExperimentEngine::new(
648            evaluator,
649            Box::new(NVariationGenerator::new(10)),
650            Arc::new(subject),
651            ConfigSnapshot::default(),
652            config,
653            None,
654        );
655
656        // Verify cancel_token() gives an independent handle that controls the same token.
657        let external_token = engine.cancel_token();
658        assert!(!external_token.is_cancelled());
659        engine.stop();
660        assert!(
661            external_token.is_cancelled(),
662            "cancel_token() must share the same token"
663        );
664
665        let report = engine.run().await.unwrap();
666        assert!(report.cancelled);
667    }
668
669    #[cfg(test)]
670    #[tokio::test]
671    async fn engine_progressive_baseline_updates() {
672        // One variation applied via NVariationGenerator generates temperature=0.5.
673        // min_improvement=0.0 so it is accepted, updating best_config.
674        let config = ExperimentConfig {
675            max_experiments: 1,
676            max_wall_time_secs: 3600,
677            min_improvement: 0.0,
678            ..Default::default()
679        };
680        // 1 baseline + 1 candidate = 2 evaluate() calls.
681        let subject = make_subject_mock(2);
682        let judge = make_judge_mock(2);
683        let evaluator = Evaluator::new(Arc::new(judge), make_benchmark(), 1_000_000).unwrap();
684
685        let initial_baseline = ConfigSnapshot::default();
686        let mut engine = ExperimentEngine::new(
687            evaluator,
688            Box::new(NVariationGenerator::new(1)),
689            Arc::new(subject),
690            initial_baseline.clone(),
691            config,
692            None,
693        );
694
695        let report = engine.run().await.unwrap();
696        assert_eq!(report.results.len(), 1);
697        assert!(report.results[0].accepted, "variation should be accepted");
698        // best_config should differ from the initial baseline (temperature changed to 0.5).
699        assert!(
700            (report.best_config.temperature - initial_baseline.temperature).abs() > 1e-9,
701            "best_config.temperature should have changed after accepted variation"
702        );
703        assert!(!report.baseline_score.is_nan());
704        assert!(!report.final_score.is_nan());
705        // Bug #1 regression: baseline_score in result must be the PRE-acceptance score.
706        assert!(
707            (report.results[0].baseline_score - report.baseline_score).abs() < 1e-9,
708            "result.baseline_score must equal initial baseline_score (pre-acceptance)"
709        );
710    }
711
712    #[cfg(test)]
713    #[tokio::test]
714    async fn engine_handles_search_space_exhaustion() {
715        let config = default_config();
716        // Generator returns None immediately (0 variations).
717        // Only the baseline evaluate() call is needed.
718        let subject = make_subject_mock(1);
719        let judge = make_judge_mock(1);
720        let evaluator = Evaluator::new(Arc::new(judge), make_benchmark(), 1_000_000).unwrap();
721
722        let mut engine = ExperimentEngine::new(
723            evaluator,
724            Box::new(NVariationGenerator::new(0)),
725            Arc::new(subject),
726            ConfigSnapshot::default(),
727            config,
728            None,
729        );
730
731        let report = engine.run().await.unwrap();
732        assert!(report.results.is_empty());
733        assert!(!report.cancelled);
734    }
735
736    #[cfg(test)]
737    #[tokio::test]
738    async fn engine_skips_nan_scores() {
739        use zeph_llm::any::AnyProvider;
740        use zeph_llm::mock::MockProvider;
741
742        // Candidate evaluations produce NaN (empty judge responses → error → 0 scored → NaN mean).
743        // Baseline uses a sufficient budget to succeed; candidate budget is tiny.
744        // We use two separate Evaluator instances: one for baseline (high budget),
745        // one for candidates (zero budget). Since ExperimentEngine uses a single Evaluator,
746        // we use a judge mock with 1 valid response (baseline) and no more (candidates fail).
747        let config = ExperimentConfig {
748            max_experiments: 5,
749            max_wall_time_secs: 3600,
750            min_improvement: 0.0,
751            ..Default::default()
752        };
753        // Subject: baseline + 3 candidate subject calls (3 NaN iterations before cap).
754        let subject = AnyProvider::Mock(MockProvider::with_responses(vec![
755            "A".into(),
756            "A".into(),
757            "A".into(),
758            "A".into(),
759        ]));
760        // Judge: 1 valid response for baseline, then errors for candidates (mock exhausted).
761        let judge = AnyProvider::Mock(MockProvider::with_responses(vec![
762            r#"{"score": 8.0, "reason": "ok"}"#.into(),
763        ]));
764        // Use large budget — judge errors (not budget) produce NaN via 0 cases scored.
765        let evaluator = Evaluator::new(Arc::new(judge), make_benchmark(), 1_000_000).unwrap();
766
767        let mut engine = ExperimentEngine::new(
768            evaluator,
769            Box::new(NVariationGenerator::new(5)),
770            Arc::new(subject),
771            ConfigSnapshot::default(),
772            config,
773            None,
774        );
775
776        // Should not panic — NaN scores are skipped; loop breaks after MAX_CONSECUTIVE_NAN.
777        let report = engine.run().await.unwrap();
778        // No variations accepted (all NaN); loop stopped at consecutive NaN limit.
779        assert!(
780            report.results.is_empty(),
781            "all NaN iterations should be skipped"
782        );
783        assert!(!report.cancelled);
784    }
785
786    #[cfg(test)]
787    #[tokio::test]
788    async fn engine_nan_baseline_returns_error() {
789        use zeph_llm::any::AnyProvider;
790        use zeph_llm::mock::MockProvider;
791
792        // Budget=0 and no judge responses → baseline evaluation returns NaN mean → engine errors.
793        let config = ExperimentConfig {
794            max_experiments: 5,
795            max_wall_time_secs: 3600,
796            min_improvement: 0.0,
797            ..Default::default()
798        };
799        // Subject responds for baseline subject call.
800        let subject = AnyProvider::Mock(MockProvider::with_responses(vec!["A".into()]));
801        // Judge has no responses — all judge calls error, 0 cases scored, NaN mean.
802        let judge = AnyProvider::Mock(MockProvider::with_responses(vec![]));
803        let evaluator = Evaluator::new(Arc::new(judge), make_benchmark(), 1_000_000).unwrap();
804
805        let mut engine = ExperimentEngine::new(
806            evaluator,
807            Box::new(NVariationGenerator::new(5)),
808            Arc::new(subject),
809            ConfigSnapshot::default(),
810            config,
811            None,
812        );
813
814        let result = engine.run().await;
815        assert!(result.is_err(), "NaN baseline should return an error");
816        let err = result.unwrap_err();
817        assert!(
818            matches!(err, EvalError::Storage(_)),
819            "expected EvalError::Storage, got: {err:?}"
820        );
821    }
822
823    #[cfg(test)]
824    #[tokio::test]
825    async fn engine_persists_results_to_sqlite() {
826        use zeph_memory::testing::mock_semantic_memory;
827
828        let memory = mock_semantic_memory().await.unwrap();
829        let config = ExperimentConfig {
830            max_experiments: 1,
831            max_wall_time_secs: 3600,
832            min_improvement: 0.0,
833            ..Default::default()
834        };
835        // 1 baseline + 1 candidate = 2 evaluate() calls.
836        let subject = make_subject_mock(2);
837        let judge = make_judge_mock(2);
838        let evaluator = Evaluator::new(Arc::new(judge), make_benchmark(), 1_000_000).unwrap();
839
840        let session_id = {
841            let mut engine = ExperimentEngine::new(
842                evaluator,
843                Box::new(NVariationGenerator::new(1)),
844                Arc::new(subject),
845                ConfigSnapshot::default(),
846                config,
847                Some(Arc::clone(&memory)),
848            );
849            engine.run().await.unwrap();
850            engine.session_id.clone()
851        };
852
853        let rows = memory
854            .sqlite()
855            .list_experiment_results(Some(&session_id), 10)
856            .await
857            .unwrap();
858        assert_eq!(rows.len(), 1, "expected one persisted result");
859        assert_eq!(rows[0].session_id, session_id);
860    }
861
862    #[test]
863    fn session_report_serde_roundtrip() {
864        let report = ExperimentSessionReport {
865            session_id: "test-session".to_string(),
866            results: vec![],
867            best_config: ConfigSnapshot::default(),
868            baseline_score: 7.5,
869            final_score: 8.0,
870            total_improvement: 0.5,
871            wall_time_ms: 1_234,
872            cancelled: false,
873        };
874        let json = serde_json::to_string(&report).expect("serialize");
875        let report2: ExperimentSessionReport = serde_json::from_str(&json).expect("deserialize");
876        assert_eq!(report2.session_id, report.session_id);
877        assert!((report2.baseline_score - report.baseline_score).abs() < f64::EPSILON);
878        assert!((report2.final_score - report.final_score).abs() < f64::EPSILON);
879        assert_eq!(report2.wall_time_ms, report.wall_time_ms);
880        assert!(!report2.cancelled);
881    }
882
883    #[test]
884    fn chrono_now_utc_format() {
885        let s = chrono_now_utc();
886        assert_eq!(s.len(), 19, "timestamp must be 19 chars: {s}");
887        assert_eq!(&s[4..5], "-");
888        assert_eq!(&s[7..8], "-");
889        assert_eq!(&s[10..11], " ");
890        assert_eq!(&s[13..14], ":");
891        assert_eq!(&s[16..17], ":");
892    }
893
894    /// Verify that `days_to_ymd` correctly handles a known date including a leap year.
895    /// 2024-02-29 (leap day) = 19782 days since 1970-01-01.
896    #[test]
897    fn chrono_known_timestamp_leap_year() {
898        // 2024-02-29 00:00:00 UTC = 1_709_164_800 seconds since epoch.
899        // Verified via: date -d "2024-02-29 00:00:00 UTC" +%s
900        let secs: u64 = 1_709_164_800;
901        let second = secs % 60;
902        let minute = (secs / 60) % 60;
903        let hour = (secs / 3600) % 24;
904        let days = secs / 86400;
905        let (year, month, day) = days_to_ymd(days);
906        assert_eq!(year, 2024);
907        assert_eq!(month, 2);
908        assert_eq!(day, 29);
909        assert_eq!(second, 0);
910        assert_eq!(minute, 0);
911        assert_eq!(hour, 0);
912    }
913
914    /// `ExperimentEngine` must be Send to be used with `tokio::spawn`.
915    #[test]
916    fn experiment_engine_is_send() {
917        fn assert_send<T: Send>() {}
918        // This is a compile-time check — if ExperimentEngine is not Send, this fails to compile.
919        // We cannot instantiate the engine here without providers, so we use a fn pointer trick.
920        let _ = assert_send::<ExperimentEngine>;
921    }
922
923    #[tokio::test]
924    async fn engine_with_source_scheduled_propagates_to_results() {
925        let config = ExperimentConfig {
926            max_experiments: 1,
927            max_wall_time_secs: 3600,
928            min_improvement: 0.0,
929            ..Default::default()
930        };
931        let subject = make_subject_mock(2);
932        let judge = make_judge_mock(2);
933        let evaluator = Evaluator::new(Arc::new(judge), make_benchmark(), 1_000_000).unwrap();
934
935        let mut engine = ExperimentEngine::new(
936            evaluator,
937            Box::new(NVariationGenerator::new(1)),
938            Arc::new(subject),
939            ConfigSnapshot::default(),
940            config,
941            None,
942        )
943        .with_source(ExperimentSource::Scheduled);
944
945        let report = engine.run().await.unwrap();
946        assert_eq!(report.results.len(), 1);
947        assert_eq!(
948            report.results[0].source,
949            ExperimentSource::Scheduled,
950            "with_source(Scheduled) must propagate to ExperimentResult"
951        );
952    }
953}