1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
// SPDX-License-Identifier: MIT OR Apache-2.0
//! Experiment engine for adaptive agent behavior testing and hyperparameter tuning.
//!
//! `zeph-experiments` provides the infrastructure for running autonomous A/B experiments
//! over Zeph's tunable parameters (temperature, top-p, retrieval depth, etc.) using an
//! LLM-as-judge evaluation loop.
//!
//! # Architecture
//!
//! The crate is organized around three main concerns:
//!
//! 1. **Benchmark datasets** — [`BenchmarkSet`] / [`BenchmarkCase`]: TOML-loaded prompt/reference
//!    pairs that define *what* to measure.
//! 2. **Evaluation** — [`Evaluator`]: runs cases against a subject model and scores responses
//!    with a judge model, producing an [`EvalReport`].
//! 3. **Search strategies** — [`VariationGenerator`] implementations ([`GridStep`], [`Random`],
//!    [`Neighborhood`]) that decide *which* parameter to try next.
//!
//! [`ExperimentEngine`] ties all three together: it evaluates a baseline, iterates over
//! variations produced by the generator, accepts improvements (greedy hill-climbing), and
//! optionally persists results to SQLite.
//!
//! # Quick Start
//!
//! ```rust,no_run
//! use std::sync::Arc;
//! use zeph_experiments::{
//!     BenchmarkCase, BenchmarkSet, ConfigSnapshot, EvalError, Evaluator, ExperimentEngine,
//!     GridStep, SearchSpace,
//! };
//! # use zeph_llm::any::AnyProvider;
//! # use zeph_llm::mock::MockProvider;
//! # use zeph_config::ExperimentConfig;
//!
//! # async fn example() -> Result<(), EvalError> {
//! let benchmark = BenchmarkSet {
//!     cases: vec![BenchmarkCase {
//!         prompt: "What is the capital of France?".into(),
//!         context: None,
//!         reference: Some("Paris".into()),
//!         tags: None,
//!     }],
//! };
//!
//! // Use a mock provider for the judge in tests; real providers in production.
//! let judge = Arc::new(AnyProvider::Mock(MockProvider::with_responses(vec![
//!     r#"{"score": 9.0, "reason": "correct"}"#.into(),
//! ])));
//! let subject = Arc::new(AnyProvider::Mock(MockProvider::with_responses(vec![
//!     "Paris".into(),
//! ])));
//!
//! let evaluator = Evaluator::new(Arc::clone(&judge), benchmark, 100_000)?;
//! let generator = Box::new(GridStep::new(SearchSpace::default()));
//! let baseline = ConfigSnapshot::default();
//! let config = ExperimentConfig::default();
//!
//! let mut engine = ExperimentEngine::new(evaluator, generator, subject, baseline, config, None);
//! let report = engine.run().await?;
//! println!("baseline={:.2} final={:.2}", report.baseline_score, report.final_score);
//! # Ok(())
//! # }
//! ```
pub use ;
pub use ;
pub use EvalError;
pub use ;
pub use VariationGenerator;
pub use GridStep;
pub use Neighborhood;
pub use Random;
pub use ;
pub use ;
pub use ;