Skip to main content

agent_sdk_eval/
comparison.rs

1//! Comparison designs for evaluation reports.
2
3use serde::{Deserialize, Serialize};
4
5use agent_sdk_core::EntityRef;
6
7use crate::EvaluationScope;
8
9#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)]
10#[serde(tag = "design", rename_all = "snake_case")]
11/// How an evaluation should compare an observed outcome.
12pub enum ComparisonDesign {
13    /// Judge the observed run or turn only.
14    #[default]
15    ObservedOnly,
16    /// Judge the observed result against expected criteria only.
17    ExpectedOutcome,
18    /// Compare the observed result with a baseline run.
19    BaselineRun {
20        /// Baseline run or trace ref used as comparison evidence.
21        baseline_ref: EntityRef,
22    },
23    /// Compare two recorded runs.
24    PairedRuns {
25        /// Observed run ref.
26        observed_ref: EntityRef,
27        /// Comparison run ref.
28        comparison_ref: EntityRef,
29    },
30    /// Compare two durable evaluation scopes without inventing entity refs.
31    PairedScopes {
32        /// Observed scope.
33        observed_scope: EvaluationScope,
34        /// Comparison scope.
35        comparison_scope: EvaluationScope,
36    },
37    /// Compare the observed result with one or more evidence refs removed.
38    Ablation {
39        /// Evidence refs removed for the ablation comparison.
40        removed_refs: Vec<EntityRef>,
41    },
42    /// Ask for a counterfactual judgment without claiming measurement.
43    Counterfactual {
44        /// Safe summary of the counterfactual condition.
45        redacted_summary: String,
46    },
47    /// Compare a cohort of repeated experiments.
48    RepeatedExperiment {
49        /// Cohort or experiment ref used as comparison evidence.
50        cohort_ref: EntityRef,
51    },
52}
53
54impl ComparisonDesign {
55    /// Returns true when this design can support measured confidence if metric
56    /// deltas are also present.
57    pub fn supports_measured_confidence(&self) -> bool {
58        matches!(
59            self,
60            Self::BaselineRun { .. }
61                | Self::PairedRuns { .. }
62                | Self::PairedScopes { .. }
63                | Self::Ablation { .. }
64                | Self::RepeatedExperiment { .. }
65        )
66    }
67
68    /// Returns comparison refs available for validation and report evidence.
69    pub fn comparison_refs(&self) -> Vec<EntityRef> {
70        match self {
71            Self::ObservedOnly | Self::ExpectedOutcome | Self::Counterfactual { .. } => Vec::new(),
72            Self::BaselineRun { baseline_ref } => vec![baseline_ref.clone()],
73            Self::PairedRuns {
74                observed_ref,
75                comparison_ref,
76            } => vec![observed_ref.clone(), comparison_ref.clone()],
77            Self::PairedScopes { .. } => Vec::new(),
78            Self::Ablation { removed_refs } => removed_refs.clone(),
79            Self::RepeatedExperiment { cohort_ref } => vec![cohort_ref.clone()],
80        }
81    }
82
83    /// Returns true when this design carries comparison evidence.
84    pub fn has_comparison_evidence(&self) -> bool {
85        matches!(self, Self::PairedScopes { .. }) || !self.comparison_refs().is_empty()
86    }
87}