1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
//! Comparison designs for evaluation reports.
use serde::{Deserialize, Serialize};
use agent_sdk_core::EntityRef;
use crate::EvaluationScope;
#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)]
#[serde(tag = "design", rename_all = "snake_case")]
/// How an evaluation should compare an observed outcome.
///
/// Serialized by serde as an internally tagged enum: the variant name is
/// written in snake_case under the `design` key, next to the variant's own
/// fields. [`ComparisonDesign::ObservedOnly`] is the `Default` value.
pub enum ComparisonDesign {
/// Judge the observed run or turn only.
///
/// Carries no comparison data; this is the default design.
#[default]
ObservedOnly,
/// Judge the observed result against expected criteria only.
///
/// Carries no comparison data.
ExpectedOutcome,
/// Compare the observed result with a baseline run.
BaselineRun {
/// Baseline run or trace ref used as comparison evidence.
baseline_ref: EntityRef,
},
/// Compare two recorded runs.
///
/// Both sides of the comparison are identified by entity refs.
PairedRuns {
/// Observed run ref.
observed_ref: EntityRef,
/// Comparison run ref.
comparison_ref: EntityRef,
},
/// Compare two durable evaluation scopes without inventing entity refs.
///
/// Both sides are described by scopes, so this variant holds no
/// `EntityRef` values.
PairedScopes {
/// Observed scope.
observed_scope: EvaluationScope,
/// Comparison scope.
comparison_scope: EvaluationScope,
},
/// Compare the observed result with one or more evidence refs removed.
Ablation {
/// Evidence refs removed for the ablation comparison.
///
/// The type itself does not enforce that this list is non-empty.
removed_refs: Vec<EntityRef>,
},
/// Ask for a counterfactual judgment without claiming measurement.
///
/// Holds only a text summary; no entity refs or scopes are attached.
Counterfactual {
/// Safe summary of the counterfactual condition.
redacted_summary: String,
},
/// Compare a cohort of repeated experiments.
RepeatedExperiment {
/// Cohort or experiment ref used as comparison evidence.
cohort_ref: EntityRef,
},
}
impl ComparisonDesign {
    /// Returns true when this design can support measured confidence if metric
    /// deltas are also present.
    ///
    /// Only the variant is inspected, never its payload: `BaselineRun`,
    /// `PairedRuns`, `PairedScopes`, `Ablation` and `RepeatedExperiment`
    /// return `true`; `ObservedOnly`, `ExpectedOutcome` and `Counterfactual`
    /// return `false`. In particular an `Ablation` with an empty
    /// `removed_refs` list still returns `true` here.
    pub fn supports_measured_confidence(&self) -> bool {
        matches!(
            self,
            Self::BaselineRun { .. }
                | Self::PairedRuns { .. }
                | Self::PairedScopes { .. }
                | Self::Ablation { .. }
                | Self::RepeatedExperiment { .. }
        )
    }

    /// Returns comparison refs available for validation and report evidence.
    ///
    /// The refs are cloned out of the design, in this order:
    ///
    /// * `BaselineRun`: the baseline ref.
    /// * `PairedRuns`: the observed ref followed by the comparison ref.
    /// * `Ablation`: the removed refs, in their stored order.
    /// * `RepeatedExperiment`: the cohort ref.
    ///
    /// Every other variant yields an empty vector. That includes
    /// `PairedScopes`, which describes its two sides with scopes rather than
    /// entity refs.
    pub fn comparison_refs(&self) -> Vec<EntityRef> {
        match self {
            Self::ObservedOnly
            | Self::ExpectedOutcome
            | Self::Counterfactual { .. }
            | Self::PairedScopes { .. } => Vec::new(),
            Self::BaselineRun { baseline_ref } => vec![baseline_ref.clone()],
            Self::PairedRuns {
                observed_ref,
                comparison_ref,
            } => vec![observed_ref.clone(), comparison_ref.clone()],
            Self::Ablation { removed_refs } => removed_refs.clone(),
            Self::RepeatedExperiment { cohort_ref } => vec![cohort_ref.clone()],
        }
    }

    /// Returns true when this design carries comparison evidence.
    ///
    /// A design carries evidence when it is `PairedScopes` (whose evidence is
    /// the pair of scopes) or when [`Self::comparison_refs`] would be
    /// non-empty. An `Ablation` therefore only counts when at least one ref
    /// was removed.
    pub fn has_comparison_evidence(&self) -> bool {
        // Decide from the variant directly instead of building the ref list:
        // this avoids allocating a vector and cloning every ref just to test
        // for emptiness. The match is exhaustive on purpose, so adding a
        // variant forces this predicate to be revisited.
        match self {
            Self::ObservedOnly | Self::ExpectedOutcome | Self::Counterfactual { .. } => false,
            Self::BaselineRun { .. }
            | Self::PairedRuns { .. }
            | Self::PairedScopes { .. }
            | Self::RepeatedExperiment { .. } => true,
            Self::Ablation { removed_refs } => !removed_refs.is_empty(),
        }
    }
}