datasynth_eval/calibration/
loop_runner.rs

1//! C3 Piece 2 — iteration controller for the adversarial calibration
2//! loop.
3//!
4//! Drives the generate → eval → propose → accept/reject cycle.
5//! Wires:
6//!   - [`super::CalibrationObjective`] (what we minimise)
7//!   - [`super::CalibrationKnob`] (parameter space)
8//!   - [`Proposer`] (who suggests the next step)
9//!   - [`Evaluator`] (who runs generation + BF eval; abstract so
10//!     unit tests can inject a deterministic mock without the
11//!     orchestrator).
12//!
13//! See `docs/design/2026-05-27-c3-adversarial-calibration-design.md`
14//! for the broader plan. This module ships the iteration body +
15//! convergence + rollback logic; the AutoTuner-driven proposer and
16//! the real generator-backed evaluator are Pieces 2.5 / 4.
17
18use std::collections::BTreeMap;
19
20use serde::{Deserialize, Serialize};
21
22use crate::behavioral_fidelity::report::BehavioralFidelityReport;
23
24use super::knob::{CalibrationKnob, KnobValue};
25use super::objective::CalibrationObjective;
26
27// ── Public types ──────────────────────────────────────────────────────────────
28
/// Policy applied when a step fails to lower the multi-seed mean
/// loss, i.e. the mean measured after the patch is not below the
/// mean measured before it within the same step.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
pub enum RollbackPolicy {
    /// Restore the knob to its pre-step value. The step is recorded
    /// as [`StepOutcome::Reverted`]. This is the default policy.
    #[default]
    Revert,
    /// Keep the worse θ — the optimizer accepts noise; useful for
    /// pure gradient-descent-style runs that should walk uphill
    /// occasionally to escape plateaus. The step is recorded as
    /// [`StepOutcome::AcceptedNoNoiseFloorBeat`].
    Keep,
    /// Halve the loop's effective damping and restore the knob to
    /// its pre-step value; the step is recorded as
    /// [`StepOutcome::Reverted`]. The loop itself does not re-issue
    /// the patch — retrying the same direction (now with the smaller
    /// damping) is up to the [`Proposer`]. Simulated-annealing
    /// flavour.
    HalveDamping,
}
46
/// Loop-level config consumed by [`CalibrationLoop`].
///
/// `Default` yields 20 iterations, 3 seeds, patience 3,
/// `min_improvement` 1.0, damping 0.5 and [`RollbackPolicy::Revert`].
#[derive(Debug, Clone)]
pub struct CalibrationConfig {
    /// Maximum iterations before the loop gives up. Default 20.
    pub max_iterations: usize,
    /// Seeds evaluated per iteration to amortise the T3 single-shard
    /// composite CV ≈ 25 %. Default 3 — matches the v5.31 T3
    /// methodology finding (memory note `project_v5_31_overnight_loop`).
    /// The loop evaluates seeds `0..seeds_per_iteration` on every
    /// measurement.
    pub seeds_per_iteration: usize,
    /// Convergence patience — `CalibrationLoop::run` stops once the
    /// loop's steps-since-improvement counter reaches this value.
    /// Default 3.
    pub patience: usize,
    /// Minimum relative improvement (in units of σ_loss) needed to
    /// credit a step as "better". Default 1.0 — i.e. > 1 standard
    /// deviation of the aggregate. σ is the std of the pre-step
    /// measurement.
    pub min_improvement: f64,
    /// Damping applied to proposed Δ before clipping. Default 0.5
    /// (take half the proposed step to avoid oscillation). This is
    /// the loop's initial effective damping;
    /// [`RollbackPolicy::HalveDamping`] may shrink it during a run.
    pub damping: f64,
    /// How to handle a step that worsens the loss.
    pub rollback: RollbackPolicy,
}
70
71impl Default for CalibrationConfig {
72    fn default() -> Self {
73        Self {
74            max_iterations: 20,
75            seeds_per_iteration: 3,
76            patience: 3,
77            min_improvement: 1.0,
78            damping: 0.5,
79            rollback: RollbackPolicy::default(),
80        }
81    }
82}
83
/// One step's outcome. Appended to [`CalibrationLoop::history`] by
/// every step; serializable so the history can be persisted (e.g.
/// so a long-running loop can resume after interruption).
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct StepReport {
    /// 0-indexed step number (the history length when the step began).
    pub iter: usize,
    /// Multi-seed mean loss BEFORE this step's patch was applied.
    pub loss_before_mean: f64,
    /// Multi-seed standard deviation BEFORE (the noise floor used
    /// for accept/reject).
    pub loss_before_std: f64,
    /// Patch that was tried. `None` when no patch was evaluated:
    /// the target was already met, the proposer gave up, or this is
    /// the synthetic patience-exhausted entry. When `Some`, the
    /// stored `proposed_value` is the value AFTER damping (and before
    /// the knob's own clipping), not the proposer's raw suggestion.
    pub proposed_patch: Option<ProposedPatch>,
    /// Multi-seed mean loss AFTER applying the patch (and any
    /// clipping). `None` when `proposed_patch == None`.
    pub loss_after_mean: Option<f64>,
    /// Multi-seed standard deviation AFTER applying the patch.
    /// `None` when `proposed_patch == None`.
    pub loss_after_std: Option<f64>,
    /// Snapshot of every knob (keyed by knob path) at the END of the
    /// step, i.e. after any rollback has been applied.
    pub knob_values: BTreeMap<String, KnobValue>,
    /// Whether the step was accepted, rolled back, or otherwise
    /// resolved. See [`StepOutcome`].
    pub outcome: StepOutcome,
}
108
/// What the loop did with the proposed step.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Serialize, Deserialize)]
pub enum StepOutcome {
    /// Patch reduced loss enough to credit as improvement.
    Improved,
    /// Patch was applied and kept without a credited improvement:
    /// either the loss dropped but did not beat the noise floor, or
    /// the loss did not drop and the policy is
    /// [`RollbackPolicy::Keep`].
    AcceptedNoNoiseFloorBeat,
    /// Patch did not lower the loss and the knob was restored to its
    /// pre-step value ([`RollbackPolicy::Revert`], or
    /// [`RollbackPolicy::HalveDamping`], which additionally halves
    /// the effective damping).
    Reverted,
    /// Proposer returned None — nothing left to try.
    ProposerExhausted,
    /// Convergence target met before this step ran (the pre-step
    /// mean loss was already at or below the objective's target).
    TargetMet,
    /// Patience exhausted (best loss flat for `patience` iterations).
    /// Recorded as a synthetic terminal history entry by the run
    /// loop rather than by an individual step.
    PatienceExhausted,
}
126
/// What a proposer suggests: a knob to change and what value to try.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ProposedPatch {
    /// Index into the knobs slice handed to [`Proposer::propose`].
    pub knob_index: usize,
    /// Proposed new value (before clipping). When a patch is stored
    /// in a [`StepReport`], this field holds the damped value the
    /// loop actually tried to apply.
    pub proposed_value: KnobValue,
    /// Free-form rationale for the logs (may be empty).
    pub rationale: String,
}
137
/// Strategy for choosing the next knob + value to try.
///
/// Implementations receive the current knob state, the multi-seed
/// loss observation, and the history. They return `None` when they
/// have nothing more to propose (the loop then stops). The default
/// [`BoundsScanProposer`] cycles through knobs proposing
/// `current ± max_step` in the direction that improved the loss
/// most recently; an AutoTuner-driven proposer is a Piece 2.5 follow-up.
pub trait Proposer {
    /// Suggest the next patch.
    ///
    /// * `knobs` — the loop's knobs in their current state;
    ///   `ProposedPatch::knob_index` indexes into this slice.
    /// * `current_loss` — `(mean, std)` of the multi-seed loss just
    ///   measured at the current knob state.
    /// * `history` — reports of all previously recorded steps.
    ///
    /// Returns `None` to signal that nothing is left to try.
    fn propose(
        &mut self,
        knobs: &[CalibrationKnob],
        current_loss: (f64, f64),
        history: &[StepReport],
    ) -> Option<ProposedPatch>;
}
154
/// What runs a generation + eval for a given knob state + seed.
///
/// Real implementations call into the orchestrator + emit a
/// `BehavioralFidelityReport`. Unit tests inject a deterministic
/// mock that maps `(knobs, seed)` to a hand-crafted report so the
/// loop's accept/reject logic can be exercised without running the
/// engine.
pub trait Evaluator {
    /// Produce one fidelity report for the given knob state and seed.
    ///
    /// The loop calls this once per seed for every loss measurement
    /// and propagates any returned [`EvaluatorError`] to its caller.
    fn evaluate(
        &self,
        knobs: &[CalibrationKnob],
        seed: u64,
    ) -> Result<BehavioralFidelityReport, EvaluatorError>;
}
169
/// String-backed error used by the iteration loop, so engine and IO
/// failures can be propagated without tying the loop to any one
/// concrete error type.
#[derive(Debug)]
pub struct EvaluatorError(pub String);

impl std::fmt::Display for EvaluatorError {
    /// Renders as `evaluator error: <message>`.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        let Self(message) = self;
        f.write_str("evaluator error: ")?;
        f.write_str(message)
    }
}

impl std::error::Error for EvaluatorError {}
183
184// ── The loop ─────────────────────────────────────────────────────────────────
185
/// The iteration controller. Owns the knobs, objective, config, and
/// the accumulated history.
pub struct CalibrationLoop {
    /// Objective that aggregates per-seed reports into a loss and
    /// optionally carries the convergence target.
    pub objective: CalibrationObjective,
    /// Knobs being tuned; mutated in place as steps are applied or
    /// rolled back.
    pub knobs: Vec<CalibrationKnob>,
    /// Loop-level settings (iteration cap, seeds, patience, …).
    pub config: CalibrationConfig,
    /// One [`StepReport`] per recorded step, in execution order.
    pub history: Vec<StepReport>,
    /// Best (mean, std) seen so far. `None` until the first
    /// measurement.
    pub best_loss: Option<(f64, f64)>,
    /// Knob values when `best_loss` was observed.
    pub best_knob_values: BTreeMap<String, KnobValue>,
    /// Effective damping (may shrink under HalveDamping policy).
    /// Starts at `config.damping`.
    effective_damping: f64,
    /// Steps since last improvement — counts toward patience.
    steps_since_improvement: usize,
}
202
203impl CalibrationLoop {
204    pub fn new(
205        objective: CalibrationObjective,
206        knobs: Vec<CalibrationKnob>,
207        config: CalibrationConfig,
208    ) -> Self {
209        let damping = config.damping;
210        Self {
211            objective,
212            knobs,
213            config,
214            history: Vec::new(),
215            best_loss: None,
216            best_knob_values: BTreeMap::new(),
217            effective_damping: damping,
218            steps_since_improvement: 0,
219        }
220    }
221
222    /// Run one step. Returns the resulting [`StepReport`] which is
223    /// also appended to `self.history`.
224    pub fn step<E: Evaluator, P: Proposer>(
225        &mut self,
226        evaluator: &E,
227        proposer: &mut P,
228    ) -> Result<&StepReport, EvaluatorError> {
229        let iter = self.history.len();
230
231        // 1. Measure current loss at the existing knob state (multi-seed).
232        let (mean_before, std_before) = self.measure_loss(evaluator)?;
233
234        // 2. Check convergence target.
235        if let Some(target) = self.objective.target {
236            if mean_before <= target {
237                return Ok(self.record(StepReport {
238                    iter,
239                    loss_before_mean: mean_before,
240                    loss_before_std: std_before,
241                    proposed_patch: None,
242                    loss_after_mean: None,
243                    loss_after_std: None,
244                    knob_values: self.snapshot_knobs(),
245                    outcome: StepOutcome::TargetMet,
246                }));
247            }
248        }
249
250        // 3. Update best-seen tracker BEFORE proposing a new step
251        //    (matters on iteration 0 when history is empty).
252        if self.best_loss.map(|(m, _)| mean_before < m).unwrap_or(true) {
253            self.best_loss = Some((mean_before, std_before));
254            self.best_knob_values = self.snapshot_knobs();
255            self.steps_since_improvement = 0;
256        }
257
258        // 4. Ask the proposer for a patch.
259        let raw_patch = proposer.propose(&self.knobs, (mean_before, std_before), &self.history);
260        let Some(patch) = raw_patch else {
261            return Ok(self.record(StepReport {
262                iter,
263                loss_before_mean: mean_before,
264                loss_before_std: std_before,
265                proposed_patch: None,
266                loss_after_mean: None,
267                loss_after_std: None,
268                knob_values: self.snapshot_knobs(),
269                outcome: StepOutcome::ProposerExhausted,
270            }));
271        };
272
273        // 5. Apply damping to Δ before letting the knob clip.
274        let damped_value = damp_value(
275            self.knobs[patch.knob_index].current,
276            patch.proposed_value,
277            self.effective_damping,
278        );
279
280        // 6. Save the pre-step value for rollback.
281        let pre_value = self.knobs[patch.knob_index].current;
282        let _clip_result = self.knobs[patch.knob_index].apply(damped_value);
283
284        // 7. Re-measure loss at the new state.
285        let (mean_after, std_after) = self.measure_loss(evaluator)?;
286
287        // 8. Decide outcome.
288        let outcome = self.decide_outcome(
289            mean_before,
290            std_before,
291            mean_after,
292            patch.knob_index,
293            pre_value,
294        );
295
296        // 9. Patience accounting.
297        match outcome {
298            StepOutcome::Improved => {
299                self.steps_since_improvement = 0;
300            }
301            _ => {
302                self.steps_since_improvement += 1;
303            }
304        }
305
306        // 10. Persist + return.
307        Ok(self.record(StepReport {
308            iter,
309            loss_before_mean: mean_before,
310            loss_before_std: std_before,
311            proposed_patch: Some(ProposedPatch {
312                knob_index: patch.knob_index,
313                proposed_value: damped_value,
314                rationale: patch.rationale,
315            }),
316            loss_after_mean: Some(mean_after),
317            loss_after_std: Some(std_after),
318            knob_values: self.snapshot_knobs(),
319            outcome,
320        }))
321    }
322
323    /// Run until convergence, patience exhaustion, max-iter, or
324    /// proposer exhaustion. Returns the final history.
325    pub fn run<E: Evaluator, P: Proposer>(
326        &mut self,
327        evaluator: &E,
328        proposer: &mut P,
329    ) -> Result<&[StepReport], EvaluatorError> {
330        for _ in 0..self.config.max_iterations {
331            let outcome = self.step(evaluator, proposer)?.outcome;
332            if matches!(
333                outcome,
334                StepOutcome::TargetMet
335                    | StepOutcome::ProposerExhausted
336                    | StepOutcome::PatienceExhausted
337            ) {
338                break;
339            }
340            if self.steps_since_improvement >= self.config.patience {
341                // Synthesise a terminal entry so the caller sees why we stopped.
342                let last = self.history.last().expect("history non-empty after step");
343                let mut term = last.clone();
344                term.iter = self.history.len();
345                term.outcome = StepOutcome::PatienceExhausted;
346                term.proposed_patch = None;
347                term.loss_after_mean = None;
348                term.loss_after_std = None;
349                self.history.push(term);
350                break;
351            }
352        }
353        Ok(&self.history)
354    }
355
356    // ── Internals ────────────────────────────────────────────────────────────
357
358    /// Measure (mean, std) loss across `seeds_per_iteration` seeds.
359    fn measure_loss<E: Evaluator>(&self, evaluator: &E) -> Result<(f64, f64), EvaluatorError> {
360        let mut reports = Vec::with_capacity(self.config.seeds_per_iteration);
361        for seed in 0..self.config.seeds_per_iteration as u64 {
362            reports.push(evaluator.evaluate(&self.knobs, seed)?);
363        }
364        self.objective.aggregate(&reports).ok_or_else(|| {
365            EvaluatorError("objective returned None from non-empty report set".into())
366        })
367    }
368
369    fn snapshot_knobs(&self) -> BTreeMap<String, KnobValue> {
370        self.knobs
371            .iter()
372            .map(|k| (k.path.clone(), k.current))
373            .collect()
374    }
375
376    fn decide_outcome(
377        &mut self,
378        mean_before: f64,
379        std_before: f64,
380        mean_after: f64,
381        knob_idx: usize,
382        pre_value: KnobValue,
383    ) -> StepOutcome {
384        let beat_noise_floor = std_before > 0.0
385            && (mean_before - mean_after) > self.config.min_improvement * std_before;
386        if mean_after < mean_before && beat_noise_floor {
387            // Real improvement.
388            self.best_loss = Some((mean_after, 0.0));
389            self.best_knob_values = self.snapshot_knobs();
390            return StepOutcome::Improved;
391        }
392
393        if mean_after >= mean_before {
394            // Worse than baseline.
395            match self.config.rollback {
396                RollbackPolicy::Revert => {
397                    self.knobs[knob_idx].current = pre_value;
398                    StepOutcome::Reverted
399                }
400                RollbackPolicy::Keep => StepOutcome::AcceptedNoNoiseFloorBeat,
401                RollbackPolicy::HalveDamping => {
402                    self.effective_damping *= 0.5;
403                    self.knobs[knob_idx].current = pre_value;
404                    StepOutcome::Reverted
405                }
406            }
407        } else {
408            // Better numerically but didn't beat noise floor.
409            StepOutcome::AcceptedNoNoiseFloorBeat
410        }
411    }
412
413    fn record(&mut self, report: StepReport) -> &StepReport {
414        self.history.push(report);
415        self.history.last().expect("just pushed")
416    }
417}
418
419/// Apply `damping ∈ [0, 1]` to the (proposed - current) delta and
420/// return the damped value. Numeric only — type taken from
421/// `proposed`'s variant.
422fn damp_value(current: KnobValue, proposed: KnobValue, damping: f64) -> KnobValue {
423    let cur = current.as_f64();
424    let prop = proposed.as_f64();
425    let damped = cur + (prop - cur) * damping;
426    match proposed {
427        KnobValue::F64(_) => KnobValue::F64(damped),
428        KnobValue::Usize(_) => KnobValue::Usize(damped.round().max(0.0) as usize),
429    }
430}
431
432// ── Tests ────────────────────────────────────────────────────────────────────
433
#[cfg(test)]
mod tests {
    use super::*;
    use crate::behavioral_fidelity::report::{
        BaselineValues, BehavioralFidelityReport, CorpusSummary, EntityMetrics, GateResult,
        PerMetric,
    };
    use chrono::Utc;

    // ── Fixtures ─────────────────────────────────────────────────────────────

    /// A metric with every number zeroed and both flags cleared.
    fn empty_per_metric() -> PerMetric {
        PerMetric {
            raw: 0.0,
            baseline: 0.0,
            dr: 0.0,
            is_degenerate_baseline: false,
            is_volume_bounded: false,
        }
    }

    /// Entity metrics for column "t" built entirely from empty metrics.
    fn empty_em() -> EntityMetrics {
        EntityMetrics {
            entity_column: "t".into(),
            p1_ietd: empty_per_metric(),
            p1_autocorr: empty_per_metric(),
            p2_active_lifetime: empty_per_metric(),
            p2_burst_len_by_threshold: BTreeMap::new(),
            p2_je_line_burst: empty_per_metric(),
            p3_fanout_by_attr: BTreeMap::new(),
            p3_clustering: empty_per_metric(),
            p3_triangle_log_ratio: empty_per_metric(),
            p4_rule_results: vec![],
            p4_mean_gap: empty_per_metric(),
        }
    }

    /// Placeholder corpus summary: zero rows, no entities, no period.
    fn empty_corpus() -> CorpusSummary {
        CorpusSummary {
            path: "/dev/null".into(),
            n_rows: 0,
            n_entities_primary: 0,
            n_entities_secondary: 0,
            period_start: None,
            period_end: None,
        }
    }

    /// Minimal report whose three composite fields all equal `composite`.
    fn make_report(composite: f64) -> BehavioralFidelityReport {
        let per_entity = {
            let mut entities = BTreeMap::new();
            entities.insert("t".into(), empty_em());
            entities
        };
        let noise_floor = BaselineValues {
            p1_ietd_w1_days: 0.0,
            p1_autocorr_gap: 0.0,
            p2_active_lifetime_w1: 0.0,
            p2_burst_len_by_threshold: BTreeMap::new(),
            p2_je_line_burst_w1: 0.0,
            p3_fanout_by_attr: BTreeMap::new(),
            p3_clustering_gap: 0.0,
            p3_triangle_log_ratio: 0.0,
            p4_mean_gap: 0.0,
        };
        let gates = GateResult {
            fail_if_dr_above: 100.0,
            fail_if_composite_above: 100.0,
            passed: true,
            failures: vec![],
        };
        BehavioralFidelityReport {
            profile: "t".into(),
            generator_id: "t".into(),
            generator_version: "v5.x".into(),
            seed: 0,
            generated_at: Utc::now(),
            reference_corpus: empty_corpus(),
            synthetic: empty_corpus(),
            noise_floor,
            per_entity,
            composite_bf_score: composite,
            composite_bf_median: composite,
            n_metrics_aggregated: 1,
            n_metrics_excluded_degenerate: 0,
            composite_bf_volume_corrected: composite,
            n_metrics_excluded_volume: 0,
            intraday_structural: None,
            gates,
        }
    }

    /// Mock evaluator: the composite is the distance of knob 0 from
    /// `optimum`, plus a seed-dependent offset of `noise × (seed − 1)`
    /// so the multi-seed aggregator sees a deterministic spread.
    struct LinearMockEvaluator {
        optimum: f64,
        noise: f64,
    }

    impl Evaluator for LinearMockEvaluator {
        fn evaluate(
            &self,
            knobs: &[CalibrationKnob],
            seed: u64,
        ) -> Result<BehavioralFidelityReport, EvaluatorError> {
            let position = knobs[0].current.as_f64();
            // Offsets are -noise, 0, +noise for seeds 0, 1, 2.
            let offset = self.noise * (seed as f64 - 1.0);
            let distance = (position - self.optimum).abs();
            Ok(make_report(distance + offset))
        }
    }

    /// Proposer that always nudges knob 0 one `max_step` toward
    /// `target`, and gives up once the knob is within 1e-9 of it.
    struct StepTowardProposer {
        target: f64,
    }

    impl Proposer for StepTowardProposer {
        fn propose(
            &mut self,
            knobs: &[CalibrationKnob],
            _current_loss: (f64, f64),
            _history: &[StepReport],
        ) -> Option<ProposedPatch> {
            let knob = &knobs[0];
            let here = knob.current.as_f64();
            let gap = self.target - here;
            if gap.abs() < 1e-9 {
                return None;
            }
            let stride = gap.signum() * knob.max_step;
            Some(ProposedPatch {
                knob_index: 0,
                proposed_value: KnobValue::F64(here + stride),
                rationale: format!("step toward {target}", target = self.target),
            })
        }
    }

    // ── Tests ────────────────────────────────────────────────────────────────

    #[test]
    fn step_reduces_loss_when_moving_toward_optimum() {
        let config = CalibrationConfig {
            max_iterations: 1,
            seeds_per_iteration: 1,
            damping: 1.0,         // undamped, so the knob lands on the proposal
            min_improvement: 0.0, // zero threshold for the deterministic loss
            ..CalibrationConfig::default()
        };
        let mut calib = CalibrationLoop::new(
            CalibrationObjective::bf_composite(),
            vec![CalibrationKnob::new_f64("test.rate", 0.10, 0.0, 1.0, 0.05)],
            config,
        );
        let evaluator = LinearMockEvaluator {
            optimum: 0.02,
            noise: 0.0,
        };
        let mut proposer = StepTowardProposer { target: 0.02 };

        let report = calib
            .step(&evaluator, &mut proposer)
            .expect("step ok")
            .clone();

        assert!(report.loss_before_mean > 0.0);
        let after = report.loss_after_mean.unwrap();
        assert!(
            after < report.loss_before_mean,
            "step should reduce loss (before={}, after={})",
            report.loss_before_mean,
            after
        );
    }

    #[test]
    fn run_converges_to_optimum_within_max_iter() {
        // From 0.10 down to the optimum 0.02 in strides of 0.02: four steps.
        let config = CalibrationConfig {
            max_iterations: 10,
            seeds_per_iteration: 1,
            patience: 20, // larger than max_iterations, so patience never fires
            damping: 1.0,
            min_improvement: 0.0,
            ..CalibrationConfig::default()
        };
        let mut calib = CalibrationLoop::new(
            CalibrationObjective::bf_composite().with_target(0.001),
            vec![CalibrationKnob::new_f64("test.rate", 0.10, 0.0, 1.0, 0.02)],
            config,
        );
        let evaluator = LinearMockEvaluator {
            optimum: 0.02,
            noise: 0.0,
        };
        let mut proposer = StepTowardProposer { target: 0.02 };

        let history = calib.run(&evaluator, &mut proposer).unwrap().to_vec();

        assert!(!history.is_empty());
        let final_value = calib.knobs[0].current.as_f64();
        assert!(
            (final_value - 0.02).abs() < 1e-6,
            "final knob value should converge to optimum: got {final_value}"
        );
        let target_met = history
            .iter()
            .any(|s| matches!(s.outcome, StepOutcome::TargetMet));
        assert!(
            target_met,
            "convergence target should have been met before max_iter"
        );
    }

    #[test]
    fn proposer_exhaustion_stops_the_loop() {
        // The knob already sits on the proposer's target, so the very
        // first proposal is `None`.
        let config = CalibrationConfig {
            max_iterations: 10,
            seeds_per_iteration: 1,
            ..CalibrationConfig::default()
        };
        let mut calib = CalibrationLoop::new(
            CalibrationObjective::bf_composite(),
            vec![CalibrationKnob::new_f64("test.rate", 0.02, 0.0, 1.0, 0.02)],
            config,
        );
        let evaluator = LinearMockEvaluator {
            optimum: 0.02,
            noise: 0.0,
        };
        let mut proposer = StepTowardProposer { target: 0.02 };

        let history = calib.run(&evaluator, &mut proposer).unwrap().to_vec();
        assert_eq!(history.len(), 1);
        assert!(matches!(history[0].outcome, StepOutcome::ProposerExhausted));
    }

    #[test]
    fn rollback_revert_restores_pre_step_value() {
        // The knob starts on the optimum (0.02) while the proposer
        // pushes AWAY from it (toward 0.5), so every step worsens the
        // loss and must be rolled back under the Revert policy.
        let config = CalibrationConfig {
            max_iterations: 3,
            seeds_per_iteration: 1,
            patience: 20,
            min_improvement: 0.0,
            damping: 1.0,
            rollback: RollbackPolicy::Revert,
        };
        let mut calib = CalibrationLoop::new(
            CalibrationObjective::bf_composite(),
            vec![CalibrationKnob::new_f64("test.rate", 0.02, 0.0, 1.0, 0.05)],
            config,
        );
        let evaluator = LinearMockEvaluator {
            optimum: 0.02,
            noise: 0.0,
        };
        let mut proposer = StepTowardProposer { target: 0.5 };

        calib.run(&evaluator, &mut proposer).unwrap();

        let final_value = calib.knobs[0].current.as_f64();
        assert!(
            (final_value - 0.02).abs() < 1e-9,
            "Revert policy should restore the starting value; got {final_value}"
        );
        let any_reverted = calib
            .history
            .iter()
            .any(|s| matches!(s.outcome, StepOutcome::Reverted));
        assert!(any_reverted, "at least one step should have been Reverted");
    }

    #[test]
    fn multi_seed_aggregate_produces_std() {
        // With noise = 0.01 the three seeds shift the base loss
        // (0.10 - 0.02 = 0.08) by {-0.01, 0.0, +0.01}, giving
        // composites {0.07, 0.08, 0.09}. Mean is 0.08 and the
        // population std is sqrt((0.01² + 0 + 0.01²) / 3)
        // = sqrt(0.0002 / 3) ≈ 0.00816.
        let config = CalibrationConfig {
            seeds_per_iteration: 3,
            ..CalibrationConfig::default()
        };
        let calib = CalibrationLoop::new(
            CalibrationObjective::bf_composite(),
            vec![CalibrationKnob::new_f64("test.rate", 0.10, 0.0, 1.0, 0.05)],
            config,
        );
        let evaluator = LinearMockEvaluator {
            optimum: 0.02,
            noise: 0.01,
        };

        let (mean, std) = calib.measure_loss(&evaluator).unwrap();

        assert!((mean - 0.08).abs() < 1e-9, "expected mean 0.08, got {mean}");
        let expected_std = (2.0_f64 / 30000.0).sqrt();
        assert!(
            (std - expected_std).abs() < 1e-9,
            "expected std ≈ 0.00816, got {std}"
        );
    }
}
datasynth_eval/calibration/loop_runner.rs

datasynth_eval/calibration/
loop_runner.rs