// zeph_experiments/lib.rs
1// SPDX-FileCopyrightText: 2026 Andrei G <bug-ops>
2// SPDX-License-Identifier: MIT OR Apache-2.0
3
4//! Experiment engine for adaptive agent behavior testing and hyperparameter tuning.
5//!
6//! `zeph-experiments` provides the infrastructure for running autonomous A/B experiments
7//! over Zeph's tunable parameters (temperature, top-p, retrieval depth, etc.) using an
8//! LLM-as-judge evaluation loop.
9//!
10//! # Architecture
11//!
//! The crate is organized around three main concerns:
//!
//! 1. **Benchmark datasets** — [`BenchmarkSet`] / [`BenchmarkCase`]: TOML-loaded prompt/reference
//!    pairs that define *what* to measure.
//! 2. **Evaluation** — [`Evaluator`]: runs cases against a subject model and scores responses
//!    with a judge model, producing an [`EvalReport`].
//! 3. **Search strategies** — [`VariationGenerator`] implementations ([`GridStep`], [`Random`],
//!    [`Neighborhood`]) that decide *which* parameter to try next.
20//!
21//! [`ExperimentEngine`] ties all three together: it evaluates a baseline, iterates over
22//! variations produced by the generator, accepts improvements (greedy hill-climbing), and
23//! optionally persists results to SQLite.
24//!
25//! # Quick Start
26//!
//! ```rust,no_run
//! use std::sync::Arc;
//! use zeph_experiments::{
//!     BenchmarkCase, BenchmarkSet, ConfigSnapshot, EvalError, Evaluator, ExperimentEngine,
//!     GridStep, SearchSpace,
//! };
//! # use zeph_llm::any::AnyProvider;
//! # use zeph_llm::mock::MockProvider;
//! # use zeph_config::ExperimentConfig;
//!
//! # async fn example() -> Result<(), EvalError> {
//! let benchmark = BenchmarkSet {
//!     cases: vec![BenchmarkCase {
//!         prompt: "What is the capital of France?".into(),
//!         context: None,
//!         reference: Some("Paris".into()),
//!         tags: None,
//!     }],
//! };
//!
//! // Use a mock provider for the judge in tests; real providers in production.
//! let judge = Arc::new(AnyProvider::Mock(MockProvider::with_responses(vec![
//!     r#"{"score": 9.0, "reason": "correct"}"#.into(),
//! ])));
//! let subject = Arc::new(AnyProvider::Mock(MockProvider::with_responses(vec![
//!     "Paris".into(),
//! ])));
//!
//! let evaluator = Evaluator::new(Arc::clone(&judge), benchmark, 100_000)?;
//! let generator = Box::new(GridStep::new(SearchSpace::default()));
//! let baseline = ConfigSnapshot::default();
//! let config = ExperimentConfig::default();
//!
//! let mut engine = ExperimentEngine::new(evaluator, generator, subject, baseline, config, None);
//! let report = engine.run().await?;
//! println!("baseline={:.2} final={:.2}", report.baseline_score, report.final_score);
//! # Ok(())
//! # }
//! ```
// Submodules. Each concern from the architecture overview lives in its own
// module: benchmark data (`benchmark`), orchestration (`engine`), error type
// (`error`), scoring (`evaluator`), search strategies (`generator`, `grid`,
// `neighborhood`, `random`, `search_space`), and shared data types
// (`snapshot`, `types`).
pub mod benchmark;
pub mod engine;
pub mod error;
pub mod evaluator;
pub mod generator;
pub mod grid;
pub mod neighborhood;
pub mod random;
pub mod search_space;
pub mod snapshot;
pub mod types;
// Root-level re-exports: flatten the primary API so callers can write
// `zeph_experiments::Evaluator` instead of `zeph_experiments::evaluator::Evaluator`.
// Keep this list in sync with the items referenced in the crate docs above.
pub use benchmark::{BenchmarkCase, BenchmarkSet};
pub use engine::{ExperimentEngine, ExperimentSessionReport};
pub use error::EvalError;
pub use evaluator::{CaseScore, EvalReport, Evaluator, JudgeOutput};
pub use generator::VariationGenerator;
pub use grid::GridStep;
pub use neighborhood::Neighborhood;
pub use random::Random;
pub use search_space::{ParameterRange, SearchSpace};
pub use snapshot::{ConfigSnapshot, GenerationOverrides};
pub use types::{ExperimentResult, ExperimentSource, ParameterKind, Variation, VariationValue};