1use serde::{Deserialize, Serialize};
4
5use agent_sdk_core::{AgentError, EntityRef};
6
7use crate::{ComparisonDesign, EvaluationId, EvaluationRequest, EvaluationScope, EvaluationUsage};
8
/// Kind of confidence attached to an evaluation report or to a single judgment.
///
/// Variants serialize as snake_case strings (`"available"`, `"cited"`, ...)
/// because of `#[serde(rename_all = "snake_case")]`.
///
/// Only [`EvaluationConfidence::Measured`] and
/// [`EvaluationConfidence::Statistical`] count as measured claims; see
/// [`EvaluationConfidence::is_measured`].
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum EvaluationConfidence {
    /// Serialized as `"available"`. Not a measured claim.
    Available,
    /// Serialized as `"cited"`. Not a measured claim.
    Cited,
    /// Serialized as `"judged"`. Not a measured claim.
    Judged,
    /// Serialized as `"measured"`. Counts as a measured claim.
    Measured,
    /// Serialized as `"statistical"`. Counts as a measured claim.
    Statistical,
}
24
25impl EvaluationConfidence {
26 pub fn is_measured(&self) -> bool {
28 matches!(self, Self::Measured | Self::Statistical)
29 }
30}
31
/// Outcome recorded on an evaluation report or on a single judgment.
///
/// Variants serialize as snake_case strings because of
/// `#[serde(rename_all = "snake_case")]`.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum EvaluationVerdict {
    /// Serialized as `"passed"`.
    Passed,
    /// Serialized as `"failed"`.
    Failed,
    /// Serialized as `"inconclusive"`.
    Inconclusive,
}
43
/// A change in one metric, as carried in `EvaluationReport::metric_deltas`.
///
/// All values are stored verbatim; nothing in this type parses or validates
/// them.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct EvaluationMetricDelta {
    /// Name or identifier of the metric this delta belongs to (free-form
    /// string).
    pub metric_ref: String,
    /// Optional reference to a baseline entity; `None` unless set through
    /// [`EvaluationMetricDelta::with_baseline_ref`].
    // NOTE(review): presumably the entity the delta was computed against —
    // confirm with the producer of these values.
    pub baseline_ref: Option<EntityRef>,
    /// The delta itself, kept as a string rather than a numeric type.
    pub delta_value: String,
    /// Human-readable summary. By its name it is expected to be already
    /// redacted; the visible code does not enforce that.
    pub redacted_summary: String,
}
56
57impl EvaluationMetricDelta {
58 pub fn new(
60 metric_ref: impl Into<String>,
61 delta_value: impl Into<String>,
62 redacted_summary: impl Into<String>,
63 ) -> Self {
64 Self {
65 metric_ref: metric_ref.into(),
66 baseline_ref: None,
67 delta_value: delta_value.into(),
68 redacted_summary: redacted_summary.into(),
69 }
70 }
71
72 pub fn with_baseline_ref(mut self, baseline_ref: EntityRef) -> Self {
74 self.baseline_ref = Some(baseline_ref);
75 self
76 }
77}
78
/// A single evaluator judgment about one subject, as carried in
/// `EvaluationReport::judgments`.
///
/// [`EvaluatorJudgment::new`] fills the required fields and leaves every
/// optional field as `None` and every list empty.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct EvaluatorJudgment {
    /// Optional identifier for this judgment.
    pub judgment_id: Option<String>,
    /// Reference to the entity being judged.
    pub subject_ref: EntityRef,
    /// Optional identifier of the criterion the judgment was made against.
    pub criterion_id: Option<String>,
    /// Outcome of the judgment.
    pub verdict: EvaluationVerdict,
    /// Optional score, kept as a string rather than a numeric type.
    pub score: Option<String>,
    /// References offered in support of the judgment.
    pub support_refs: Vec<EntityRef>,
    /// Support references that were rejected.
    // NOTE(review): who rejects them, and why, is not visible in this file.
    pub rejected_support_refs: Vec<EntityRef>,
    /// Confidence of this judgment. A measured confidence here makes the
    /// enclosing report subject to the measured-confidence checks in
    /// `EvaluationReport::validate_confidence_contract`.
    pub confidence: EvaluationConfidence,
    /// Human-readable summary. By its name it is expected to be already
    /// redacted; the visible code does not enforce that.
    pub redacted_summary: String,
    /// Free-form limitation notes.
    pub limitations: Vec<String>,
}
103
104impl EvaluatorJudgment {
105 pub fn new(
107 subject_ref: EntityRef,
108 verdict: EvaluationVerdict,
109 confidence: EvaluationConfidence,
110 redacted_summary: impl Into<String>,
111 ) -> Self {
112 Self {
113 judgment_id: None,
114 subject_ref,
115 criterion_id: None,
116 verdict,
117 score: None,
118 support_refs: Vec::new(),
119 rejected_support_refs: Vec::new(),
120 confidence,
121 redacted_summary: redacted_summary.into(),
122 limitations: Vec::new(),
123 }
124 }
125}
126
/// Result of an evaluation: the overall verdict and confidence plus the
/// judgments, metric deltas and evidence attached to it.
///
/// [`EvaluationReport::new`] fills the required fields; the `with_*` builder
/// methods attach usage, judgments and metric deltas. Use
/// [`EvaluationReport::validate_confidence_contract`] or
/// [`EvaluationReport::validate_confidence_contract_for_request`] to check
/// that a measured confidence is backed by a comparison and metric deltas.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct EvaluationReport {
    /// Identifier of the evaluation this report belongs to.
    pub evaluation_id: EvaluationId,
    /// Scope of the evaluation.
    pub scope: EvaluationScope,
    /// Comparison design the report relies on; checked when the report claims
    /// measured confidence.
    pub comparison: ComparisonDesign,
    /// Overall outcome.
    pub verdict: EvaluationVerdict,
    /// Optional overall score, kept as a string rather than a numeric type.
    pub score: Option<String>,
    /// Overall confidence of the report.
    pub confidence: EvaluationConfidence,
    /// Individual judgments; a measured confidence on any of them makes the
    /// whole report count as claiming measured confidence.
    pub judgments: Vec<EvaluatorJudgment>,
    /// Metric deltas; at least one is required when measured confidence is
    /// claimed.
    pub metric_deltas: Vec<EvaluationMetricDelta>,
    /// References to supporting evidence.
    pub evidence_refs: Vec<EntityRef>,
    /// Usage recorded for the evaluation; defaults to
    /// `EvaluationUsage::default()`.
    pub usage: EvaluationUsage,
    /// Human-readable summary. By its name it is expected to be already
    /// redacted; the visible code does not enforce that.
    pub redacted_summary: String,
    /// Free-form limitation notes.
    pub limitations: Vec<String>,
}
155
156impl EvaluationReport {
157 pub fn new(
159 evaluation_id: EvaluationId,
160 scope: EvaluationScope,
161 comparison: ComparisonDesign,
162 verdict: EvaluationVerdict,
163 confidence: EvaluationConfidence,
164 redacted_summary: impl Into<String>,
165 ) -> Self {
166 Self {
167 evaluation_id,
168 scope,
169 comparison,
170 verdict,
171 score: None,
172 confidence,
173 judgments: Vec::new(),
174 metric_deltas: Vec::new(),
175 evidence_refs: Vec::new(),
176 usage: EvaluationUsage::default(),
177 redacted_summary: redacted_summary.into(),
178 limitations: Vec::new(),
179 }
180 }
181
182 pub fn with_usage(mut self, usage: EvaluationUsage) -> Self {
184 self.usage = usage;
185 self
186 }
187
188 pub fn with_judgment(mut self, judgment: EvaluatorJudgment) -> Self {
190 self.judgments.push(judgment);
191 self
192 }
193
194 pub fn with_metric_delta(mut self, metric_delta: EvaluationMetricDelta) -> Self {
196 self.metric_deltas.push(metric_delta);
197 self
198 }
199
200 pub fn validate_confidence_contract(&self) -> Result<(), AgentError> {
203 self.validate_measured_confidence(&self.comparison, &self.metric_deltas)
204 }
205
206 pub fn validate_confidence_contract_for_request(
208 &self,
209 request: &EvaluationRequest,
210 ) -> Result<(), AgentError> {
211 self.validate_measured_confidence(&request.comparison, &request.metric_deltas)?;
212 let claims_measured = self.confidence.is_measured()
213 || self
214 .judgments
215 .iter()
216 .any(|judgment| judgment.confidence.is_measured());
217 if claims_measured && self.comparison != request.comparison {
218 return Err(AgentError::contract_violation(
219 "measured evaluation comparison must match the evaluation request",
220 ));
221 }
222 if claims_measured && self.metric_deltas != request.metric_deltas {
223 return Err(AgentError::contract_violation(
224 "measured evaluation metric deltas must come from the evaluation request",
225 ));
226 }
227 Ok(())
228 }
229
230 fn validate_measured_confidence(
231 &self,
232 comparison: &ComparisonDesign,
233 metric_deltas: &[EvaluationMetricDelta],
234 ) -> Result<(), AgentError> {
235 let claims_measured = self.confidence.is_measured()
236 || self
237 .judgments
238 .iter()
239 .any(|judgment| judgment.confidence.is_measured());
240 if !claims_measured {
241 return Ok(());
242 }
243 if !comparison.supports_measured_confidence() {
244 return Err(AgentError::contract_violation(
245 "measured evaluation confidence requires a comparison design",
246 ));
247 }
248 if !comparison.has_comparison_evidence() {
249 return Err(AgentError::contract_violation(
250 "measured evaluation confidence requires comparison evidence refs",
251 ));
252 }
253 if metric_deltas.is_empty() {
254 return Err(AgentError::contract_violation(
255 "measured evaluation confidence requires at least one metric delta",
256 ));
257 }
258 Ok(())
259 }
260}