gepa 0.1.0

GEPA: Reflective Prompt Evolution — a Rust implementation of the genetic-Pareto prompt optimizer
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
/// `GEPAResult` — immutable snapshot returned at the end of optimisation.
///
/// Captures everything worth knowing about a completed GEPA run: all explored
/// candidates, their validation scores, the Pareto frontier, lineage, and
/// optional raw best outputs.  It is the sole public-facing artefact of the
/// optimisation engine.
///
/// Mirrors the Python `gepa.core.result` module.
use std::collections::{HashMap, HashSet};

use serde::{Deserialize, Serialize};

use crate::core::adapter::Candidate;
use crate::core::data_loader::DataId;
use crate::core::state::{GEPAState, ProgramIdx};
use crate::error::Result;

// ---------------------------------------------------------------------------
// Serde helpers — shared with state.rs; serialise HashMap<Id,V> as Vec pairs
// to avoid requiring Id to be a JSON string key.
// ---------------------------------------------------------------------------

use crate::core::serde_helpers::{serde_map_set_as_vec, serde_vec_of_maps};

// ---------------------------------------------------------------------------
// GEPAResult
// ---------------------------------------------------------------------------

/// Immutable snapshot of a completed (or in-progress) GEPA optimisation run.
///
/// ### Serialisation
/// [`GEPAResult`] implements `serde::Serialize` / `Deserialize`.  Round-trip
/// via `to_json()` / `from_json()`.
///
/// ### Construction
/// Prefer [`GEPAResult::from_state`] rather than constructing directly.
#[derive(Debug, Clone, Serialize, Deserialize)]
#[serde(bound(deserialize = "Id: DataId"))]
pub struct GEPAResult<Id: DataId> {
    // ---- Core data -------------------------------------------------------
    /// All explored candidates in discovery order.
    pub candidates: Vec<Candidate>,

    /// Lineage — `parents[i]` lists parent indices for candidate `i`.
    /// The seed candidate has `[None]` in its parent list.
    pub parents: Vec<Vec<Option<ProgramIdx>>>,

    /// Per-candidate average validation score.
    /// `f64::NEG_INFINITY` for unseen candidates.
    pub val_aggregate_scores: Vec<f64>,

    /// Per-candidate dict of `{val_id: score}` for each validated example.
    #[serde(
        serialize_with = "serde_vec_of_maps::serialize",
        deserialize_with = "serde_vec_of_maps::deserialize"
    )]
    pub val_subscores: Vec<HashMap<Id, f64>>,

    /// Instance-level Pareto frontier — for each validation example, the set
    /// of candidate indices that achieved the best score on that example.
    #[serde(
        serialize_with = "serde_map_set_as_vec::serialize",
        deserialize_with = "serde_map_set_as_vec::deserialize"
    )]
    pub per_val_instance_best_candidates: HashMap<Id, HashSet<ProgramIdx>>,

    /// Number of per-example metric evaluations consumed to discover each candidate.
    pub discovery_eval_counts: Vec<usize>,

    // ---- Optional multi-objective data -----------------------------------
    /// Per-candidate per-objective aggregate scores.
    /// `None` when no objective scores are tracked.
    pub val_aggregate_subscores: Option<Vec<HashMap<String, f64>>>,

    /// Per-objective Pareto frontier.
    /// `None` when no objective scores are tracked.
    pub per_objective_best_candidates: Option<HashMap<String, HashSet<ProgramIdx>>>,

    /// Per-objective best aggregate score achieved across all candidates.
    pub objective_pareto_front: Option<HashMap<String, f64>>,

    // ---- Run metadata ----------------------------------------------------
    /// Total number of per-example metric evaluations consumed by the full run.
    pub total_metric_calls: Option<usize>,

    /// Number of full validation-set evaluations performed.
    pub num_full_val_evals: Option<usize>,

    /// Directory where run artefacts were saved, or `None`.
    pub run_dir: Option<String>,

    /// RNG seed used for the run, or `None`.
    pub seed: Option<u64>,

    /// Internal: when `Some`, `best_candidate()` unwraps the dict and returns
    /// the plain `String` under this key.
    pub str_candidate_key: Option<String>,

    /// Schema version for forward-compatibility detection.
    pub validation_schema_version: u32,
}

impl<Id: DataId> GEPAResult<Id> {
    /// Current serialisation schema version.
    pub const SCHEMA_VERSION: u32 = 2;

    // ------------------------------------------------------------------
    // Convenience properties
    // ------------------------------------------------------------------

    /// Total number of candidates explored during optimisation.
    pub fn num_candidates(&self) -> usize {
        self.candidates.len()
    }

    /// Number of validation examples tracked in the instance Pareto front.
    pub fn num_val_instances(&self) -> usize {
        self.per_val_instance_best_candidates.len()
    }

    /// Index of the highest-scoring candidate (by average validation score).
    ///
    /// In case of ties, the candidate with the smaller index (discovered
    /// earlier) wins.
    ///
    /// # Errors
    /// Returns `Err` when there are no candidates.
    ///
    /// # Panics
    /// Cannot panic: the early-return guard ensures the iterator is non-empty.
    pub fn best_idx(&self) -> Result<ProgramIdx> {
        if self.val_aggregate_scores.is_empty() {
            return Err(crate::error::GEPAError::NoCandidates);
        }
        let best = self
            .val_aggregate_scores
            .iter()
            .copied()
            .enumerate()
            .max_by(|(i_a, a), (i_b, b)| {
                a.partial_cmp(b)
                    .unwrap_or(std::cmp::Ordering::Equal)
                    .then(i_b.cmp(i_a)) // prefer smaller index on tie
            })
            .map(|(i, _)| i)
            .unwrap();
        Ok(best)
    }

    /// The highest-scoring candidate as a map.
    ///
    /// # Errors
    /// Returns `Err` when there are no candidates.
    pub fn best_candidate(&self) -> Result<&Candidate> {
        let idx = self.best_idx()?;
        Ok(&self.candidates[idx])
    }

    /// The highest-scoring candidate as a plain string.
    ///
    /// When `str_candidate_key` is set and that key is present in the
    /// best candidate, returns `Some(&str)`.  Mirrors the Python behaviour
    /// where `best_candidate` unwraps the dict when `str_candidate_key` is
    /// configured.
    ///
    /// Returns `None` when `str_candidate_key` is not set, the key is absent
    /// from the candidate, or there are no candidates at all.
    pub fn best_candidate_str(&self) -> Option<&str> {
        let key = self.str_candidate_key.as_deref()?;
        let idx = self.best_idx().ok()?;
        self.candidates[idx].get(key).map(String::as_str)
    }

    // ------------------------------------------------------------------
    // Construction from live state
    // ------------------------------------------------------------------

    /// Construct a [`GEPAResult`] snapshot from the engine's live state.
    ///
    /// This is the primary factory used by the engine at the end of
    /// optimisation (or on-demand for intermediate snapshots).
    ///
    /// # Arguments
    /// * `state`             — the live `GEPAState` to snapshot.
    /// * `run_dir`           — directory where artefacts were persisted.
    /// * `seed`              — RNG seed that was used, if any.
    /// * `str_candidate_key` — when set, `best_candidate` unwraps the internal
    ///   dict and returns the plain `String` value stored under this key.
    pub fn from_state(
        state: &GEPAState<Id>,
        run_dir: Option<String>,
        seed: Option<u64>,
        str_candidate_key: Option<String>,
    ) -> Self {
        let objective_scores_list: Vec<HashMap<String, f64>> =
            state.prog_candidate_objective_scores.clone();
        let has_objective_scores = objective_scores_list.iter().any(|m| !m.is_empty());

        let per_objective_best: HashMap<String, HashSet<ProgramIdx>> = state
            .program_at_pareto_front_objectives
            .iter()
            .map(|(k, v)| (k.clone(), v.clone()))
            .collect();

        let objective_front: HashMap<String, f64> = state.objective_pareto_front.clone();

        Self {
            candidates: state.program_candidates.clone(),
            parents: state.parent_program_for_candidate.clone(),
            val_aggregate_scores: state.program_full_scores_val_set(),
            val_subscores: state.prog_candidate_val_subscores.clone(),
            per_val_instance_best_candidates: state.program_at_pareto_front_valset.clone(),
            discovery_eval_counts: state.num_metric_calls_by_discovery.clone(),
            val_aggregate_subscores: if has_objective_scores {
                Some(objective_scores_list)
            } else {
                None
            },
            per_objective_best_candidates: if per_objective_best.is_empty() {
                None
            } else {
                Some(per_objective_best)
            },
            objective_pareto_front: if objective_front.is_empty() {
                None
            } else {
                Some(objective_front)
            },
            total_metric_calls: Some(state.total_num_evals),
            num_full_val_evals: Some(state.num_full_ds_evals),
            run_dir,
            seed,
            str_candidate_key,
            validation_schema_version: Self::SCHEMA_VERSION,
        }
    }

    // ------------------------------------------------------------------
    // Serialisation
    // ------------------------------------------------------------------

    /// Serialise to a JSON string.
    ///
    /// # Errors
    /// Returns `Err` if serialisation fails.
    pub fn to_json(&self) -> Result<String> {
        Ok(serde_json::to_string_pretty(self)?)
    }

    /// Deserialise from a JSON string produced by `to_json()`.
    ///
    /// # Errors
    /// Returns `Err` if the JSON is malformed or the schema version is newer
    /// than what this code supports.
    pub fn from_json(s: &str) -> Result<Self> {
        let v: serde_json::Value = serde_json::from_str(s)?;
        let version = u32::try_from(
            v.get("validation_schema_version")
                .and_then(serde_json::Value::as_u64)
                .unwrap_or(0),
        )
        .unwrap_or(u32::MAX);
        if version > Self::SCHEMA_VERSION {
            return Err(crate::error::GEPAError::Config(format!(
                "Unsupported GEPAResult schema version {version}; \
                 maximum supported is {}",
                Self::SCHEMA_VERSION
            )));
        }
        Ok(serde_json::from_value(v)?)
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::core::state::{FrontierType, ValsetEvaluation};

    fn make_state() -> GEPAState<usize> {
        let mut seed = Candidate::new();
        seed.insert("instructions".into(), "test".into());

        let outputs = (0..3).map(|i| serde_json::json!(i)).collect::<Vec<_>>();
        let eval = ValsetEvaluation::from_vecs(vec![0, 1, 2], outputs, vec![0.3, 0.6, 0.9], None);
        GEPAState::new(seed, eval, FrontierType::Instance, None).unwrap()
    }

    #[test]
    fn from_state_produces_valid_result() {
        let state = make_state();
        let result = GEPAResult::from_state(&state, None, None, None);

        assert_eq!(result.num_candidates(), 1);
        assert_eq!(result.num_val_instances(), 3);
        assert!(result.best_idx().is_ok());
    }

    #[test]
    fn best_idx_with_multiple_candidates() {
        let mut state = make_state();

        // Add a better program (score ~0.95 average).
        let mut new_prog = Candidate::new();
        new_prog.insert("instructions".into(), "improved".into());
        let new_eval = ValsetEvaluation::from_vecs(
            vec![0, 1, 2],
            vec![
                serde_json::json!("a"),
                serde_json::json!("b"),
                serde_json::json!("c"),
            ],
            vec![0.95, 0.95, 0.95],
            None,
        );
        state
            .update_state_with_new_program(vec![0], new_prog, new_eval, 3)
            .unwrap();

        let result = GEPAResult::from_state(&state, None, Some(42), None);
        assert_eq!(
            result.best_idx().unwrap(),
            1,
            "improved candidate should be best"
        );
        assert_eq!(result.seed, Some(42));
    }

    #[test]
    fn json_round_trip() {
        let state = make_state();
        let result = GEPAResult::from_state(&state, Some("/tmp/run".into()), None, None);

        let json = result.to_json().expect("serialisation should succeed");
        let restored: GEPAResult<usize> =
            GEPAResult::from_json(&json).expect("deserialisation should succeed");

        assert_eq!(restored.num_candidates(), result.num_candidates());
        assert_eq!(restored.val_aggregate_scores, result.val_aggregate_scores);
        assert_eq!(restored.run_dir, Some("/tmp/run".into()));
    }

    #[test]
    fn from_json_rejects_future_schema_version() {
        let json = serde_json::json!({
            "validation_schema_version": 9999,
            "candidates": [],
            "parents": [],
            "val_aggregate_scores": [],
            "val_subscores": [],
            "per_val_instance_best_candidates": {},
            "discovery_eval_counts": []
        })
        .to_string();

        let result = GEPAResult::<usize>::from_json(&json);
        assert!(result.is_err(), "should reject future schema version");
    }

    // -------------------------------------------------------------------------
    // Gap 59 — best_idx returns the smaller index when scores are tied
    // -------------------------------------------------------------------------

    #[test]
    fn test_best_idx_tie_prefers_smaller_index() {
        // Build a state with two candidates whose average scores are identical.
        let mut state = make_state(); // seed score is avg(0.3, 0.6, 0.9) = 0.6

        let tied_eval = ValsetEvaluation::from_vecs(
            vec![0, 1, 2],
            vec![
                serde_json::json!("a"),
                serde_json::json!("b"),
                serde_json::json!("c"),
            ],
            vec![0.3, 0.6, 0.9], // same average 0.6
            None,
        );
        let mut tied_candidate = Candidate::new();
        tied_candidate.insert("instructions".into(), "tied".into());
        state
            .update_state_with_new_program(vec![0], tied_candidate, tied_eval, 3)
            .unwrap();

        let result = GEPAResult::from_state(&state, None, None, None);
        assert_eq!(result.num_candidates(), 2);

        // Both candidates have the same average score; the smaller index wins.
        assert_eq!(
            result.best_idx().expect("should have candidates"),
            0,
            "smaller index should win on tie"
        );
    }

    // -------------------------------------------------------------------------
    // Gap 60a — best_candidate_str returns the string when key is present
    // -------------------------------------------------------------------------

    #[test]
    fn test_best_candidate_str_returns_value() {
        let state = make_state();
        let result = GEPAResult::from_state(&state, None, None, Some("instructions".into()));

        let s = result.best_candidate_str();
        assert!(
            s.is_some(),
            "best_candidate_str should return Some when key is present"
        );
        assert_eq!(s.unwrap(), "test");
    }

    // -------------------------------------------------------------------------
    // Gap 60b — best_candidate_str returns None when key is absent
    // -------------------------------------------------------------------------

    #[test]
    fn test_best_candidate_str_missing_key_returns_none() {
        let state = make_state();
        // Set str_candidate_key to a key that does not exist in the candidate.
        let result = GEPAResult::from_state(&state, None, None, Some("nonexistent_key".into()));

        assert!(
            result.best_candidate_str().is_none(),
            "best_candidate_str should return None when the key is absent from the candidate"
        );
    }
}