Skip to main content

agent_sdk_eval/
report.rs

1//! Evaluation report records and confidence validation.
2
3use serde::{Deserialize, Serialize};
4
5use agent_sdk_core::{AgentError, EntityRef};
6
7use crate::{ComparisonDesign, EvaluationId, EvaluationRequest, EvaluationScope, EvaluationUsage};
8
9#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
10#[serde(rename_all = "snake_case")]
11/// Confidence level for an evaluation result.
12pub enum EvaluationConfidence {
13    /// Evidence was available, but no evaluator judgment or cited support exists.
14    Available,
15    /// The evaluator cited refs that were validated against the evidence bundle.
16    Cited,
17    /// The evaluator judged the result without measured comparison evidence.
18    Judged,
19    /// A comparison, baseline, ablation, or repeated experiment produced a metric delta.
20    Measured,
21    /// Repeated experiments produced statistical evidence.
22    Statistical,
23}
24
25impl EvaluationConfidence {
26    /// Returns true when this confidence claims measured impact.
27    pub fn is_measured(&self) -> bool {
28        matches!(self, Self::Measured | Self::Statistical)
29    }
30}
31
32#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
33#[serde(rename_all = "snake_case")]
34/// Top-level evaluation verdict.
35pub enum EvaluationVerdict {
36    /// Expected outcome passed.
37    Passed,
38    /// Expected outcome failed.
39    Failed,
40    /// Evidence was insufficient or ambiguous.
41    Inconclusive,
42}
43
44#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
45/// Optional metric delta for measured evaluations.
46pub struct EvaluationMetricDelta {
47    /// Stable metric reference owned by the host or eval fixture.
48    pub metric_ref: String,
49    /// Baseline or comparison artifact.
50    pub baseline_ref: Option<EntityRef>,
51    /// Delta value encoded as a string so the SDK stays metric-neutral.
52    pub delta_value: String,
53    /// Bounded summary of how the metric was computed.
54    pub redacted_summary: String,
55}
56
57impl EvaluationMetricDelta {
58    /// Creates a metric delta.
59    pub fn new(
60        metric_ref: impl Into<String>,
61        delta_value: impl Into<String>,
62        redacted_summary: impl Into<String>,
63    ) -> Self {
64        Self {
65            metric_ref: metric_ref.into(),
66            baseline_ref: None,
67            delta_value: delta_value.into(),
68            redacted_summary: redacted_summary.into(),
69        }
70    }
71
72    /// Returns this metric delta with a baseline ref attached.
73    pub fn with_baseline_ref(mut self, baseline_ref: EntityRef) -> Self {
74        self.baseline_ref = Some(baseline_ref);
75        self
76    }
77}
78
79#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
80/// Evaluator judgment for one criterion or subject.
81pub struct EvaluatorJudgment {
82    /// Optional judgment id owned by the evaluator or test fixture.
83    pub judgment_id: Option<String>,
84    /// Subject this judgment is about.
85    pub subject_ref: EntityRef,
86    /// Optional criterion id this judgment answers.
87    pub criterion_id: Option<String>,
88    /// Judgment verdict.
89    pub verdict: EvaluationVerdict,
90    /// Optional score encoded as a string so the SDK stays rubric-neutral.
91    pub score: Option<String>,
92    /// Validated support refs.
93    pub support_refs: Vec<EntityRef>,
94    /// Refs cited by the evaluator but not present in the evidence bundle.
95    pub rejected_support_refs: Vec<EntityRef>,
96    /// Confidence level for this judgment.
97    pub confidence: EvaluationConfidence,
98    /// Bounded summary safe for logs and prompts.
99    pub redacted_summary: String,
100    /// Limitations or validation notes.
101    pub limitations: Vec<String>,
102}
103
104impl EvaluatorJudgment {
105    /// Creates a judgment with no cited refs.
106    pub fn new(
107        subject_ref: EntityRef,
108        verdict: EvaluationVerdict,
109        confidence: EvaluationConfidence,
110        redacted_summary: impl Into<String>,
111    ) -> Self {
112        Self {
113            judgment_id: None,
114            subject_ref,
115            criterion_id: None,
116            verdict,
117            score: None,
118            support_refs: Vec::new(),
119            rejected_support_refs: Vec::new(),
120            confidence,
121            redacted_summary: redacted_summary.into(),
122            limitations: Vec::new(),
123        }
124    }
125}
126
127#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
128/// Top-level report returned by an evaluator.
129pub struct EvaluationReport {
130    /// Stable evaluation id.
131    pub evaluation_id: EvaluationId,
132    /// Scope this report evaluates.
133    pub scope: EvaluationScope,
134    /// Comparison design actually used.
135    pub comparison: ComparisonDesign,
136    /// Top-level verdict.
137    pub verdict: EvaluationVerdict,
138    /// Optional top-level score.
139    pub score: Option<String>,
140    /// Top-level confidence.
141    pub confidence: EvaluationConfidence,
142    /// Per-subject or per-criterion judgments.
143    pub judgments: Vec<EvaluatorJudgment>,
144    /// Metric deltas for measured evaluations.
145    pub metric_deltas: Vec<EvaluationMetricDelta>,
146    /// Evidence refs used by this report.
147    pub evidence_refs: Vec<EntityRef>,
148    /// Usage captured during evaluation.
149    pub usage: EvaluationUsage,
150    /// Bounded report summary.
151    pub redacted_summary: String,
152    /// Limitations or validation notes.
153    pub limitations: Vec<String>,
154}
155
156impl EvaluationReport {
157    /// Creates a report with no metric deltas.
158    pub fn new(
159        evaluation_id: EvaluationId,
160        scope: EvaluationScope,
161        comparison: ComparisonDesign,
162        verdict: EvaluationVerdict,
163        confidence: EvaluationConfidence,
164        redacted_summary: impl Into<String>,
165    ) -> Self {
166        Self {
167            evaluation_id,
168            scope,
169            comparison,
170            verdict,
171            score: None,
172            confidence,
173            judgments: Vec::new(),
174            metric_deltas: Vec::new(),
175            evidence_refs: Vec::new(),
176            usage: EvaluationUsage::default(),
177            redacted_summary: redacted_summary.into(),
178            limitations: Vec::new(),
179        }
180    }
181
182    /// Returns this report with usage attached.
183    pub fn with_usage(mut self, usage: EvaluationUsage) -> Self {
184        self.usage = usage;
185        self
186    }
187
188    /// Returns this report with one judgment appended.
189    pub fn with_judgment(mut self, judgment: EvaluatorJudgment) -> Self {
190        self.judgments.push(judgment);
191        self
192    }
193
194    /// Returns this report with one metric delta appended.
195    pub fn with_metric_delta(mut self, metric_delta: EvaluationMetricDelta) -> Self {
196        self.metric_deltas.push(metric_delta);
197        self
198    }
199
200    /// Validates that measured confidence is backed by comparison evidence and
201    /// metric deltas.
202    pub fn validate_confidence_contract(&self) -> Result<(), AgentError> {
203        self.validate_measured_confidence(&self.comparison, &self.metric_deltas)
204    }
205
206    /// Validates measured confidence against request-owned metric deltas.
207    pub fn validate_confidence_contract_for_request(
208        &self,
209        request: &EvaluationRequest,
210    ) -> Result<(), AgentError> {
211        self.validate_measured_confidence(&request.comparison, &request.metric_deltas)?;
212        let claims_measured = self.confidence.is_measured()
213            || self
214                .judgments
215                .iter()
216                .any(|judgment| judgment.confidence.is_measured());
217        if claims_measured && self.comparison != request.comparison {
218            return Err(AgentError::contract_violation(
219                "measured evaluation comparison must match the evaluation request",
220            ));
221        }
222        if claims_measured && self.metric_deltas != request.metric_deltas {
223            return Err(AgentError::contract_violation(
224                "measured evaluation metric deltas must come from the evaluation request",
225            ));
226        }
227        Ok(())
228    }
229
230    fn validate_measured_confidence(
231        &self,
232        comparison: &ComparisonDesign,
233        metric_deltas: &[EvaluationMetricDelta],
234    ) -> Result<(), AgentError> {
235        let claims_measured = self.confidence.is_measured()
236            || self
237                .judgments
238                .iter()
239                .any(|judgment| judgment.confidence.is_measured());
240        if !claims_measured {
241            return Ok(());
242        }
243        if !comparison.supports_measured_confidence() {
244            return Err(AgentError::contract_violation(
245                "measured evaluation confidence requires a comparison design",
246            ));
247        }
248        if !comparison.has_comparison_evidence() {
249            return Err(AgentError::contract_violation(
250                "measured evaluation confidence requires comparison evidence refs",
251            ));
252        }
253        if metric_deltas.is_empty() {
254            return Err(AgentError::contract_violation(
255                "measured evaluation confidence requires at least one metric delta",
256            ));
257        }
258        Ok(())
259    }
260}