use serde::{Deserialize, Serialize};
use agent_sdk_core::{AgentError, EntityRef};
use crate::{ComparisonDesign, EvaluationId, EvaluationRequest, EvaluationScope, EvaluationUsage};
/// Confidence tier attached to an [`EvaluationReport`] or an [`EvaluatorJudgment`].
///
/// Variants are serialized under their snake_case names (for example `measured`).
/// Only [`EvaluationConfidence::Measured`] and [`EvaluationConfidence::Statistical`]
/// are treated as measured by [`EvaluationConfidence::is_measured`]; the precise
/// meaning of each tier is not defined in this file.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum EvaluationConfidence {
/// Serialized as `available`; not counted as measured.
Available,
/// Serialized as `cited`; not counted as measured.
Cited,
/// Serialized as `judged`; not counted as measured.
Judged,
/// Serialized as `measured`; counted as measured.
Measured,
/// Serialized as `statistical`; counted as measured.
Statistical,
}
impl EvaluationConfidence {
    /// Reports whether this tier counts as measured confidence.
    ///
    /// Returns `true` for `Measured` and `Statistical`, and `false` for every
    /// other variant.
    pub fn is_measured(&self) -> bool {
        // Spelled out exhaustively so that adding a variant forces a decision here.
        match self {
            Self::Measured | Self::Statistical => true,
            Self::Available | Self::Cited | Self::Judged => false,
        }
    }
}
/// Outcome recorded on an [`EvaluationReport`] or an [`EvaluatorJudgment`].
///
/// Variants are serialized under their snake_case names.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
#[serde(rename_all = "snake_case")]
pub enum EvaluationVerdict {
/// Serialized as `passed`.
Passed,
/// Serialized as `failed`.
Failed,
/// Serialized as `inconclusive`.
Inconclusive,
}
/// A single metric change reported alongside an evaluation.
///
/// [`EvaluationReport`] stores these in its `metric_deltas` list, and the
/// measured-confidence validation on the report requires at least one entry.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct EvaluationMetricDelta {
/// Reference to the metric this delta describes, stored as a plain string.
pub metric_ref: String,
/// Optional baseline reference; `None` unless set through `with_baseline_ref`
/// or populated directly. Presumably the entity the delta was computed
/// against -- confirm with the producer.
pub baseline_ref: Option<EntityRef>,
/// The delta value, kept as a string; nothing in this file parses or validates it.
pub delta_value: String,
/// Summary text. The name suggests it was redacted before reaching this
/// type; this type does not enforce that.
pub redacted_summary: String,
}
impl EvaluationMetricDelta {
    /// Builds a metric delta that has no baseline reference attached.
    ///
    /// Every argument is converted into an owned `String`; `baseline_ref`
    /// starts out as `None`.
    pub fn new(
        metric_ref: impl Into<String>,
        delta_value: impl Into<String>,
        redacted_summary: impl Into<String>,
    ) -> Self {
        let metric_ref = metric_ref.into();
        let delta_value = delta_value.into();
        let redacted_summary = redacted_summary.into();

        Self {
            metric_ref,
            baseline_ref: None,
            delta_value,
            redacted_summary,
        }
    }

    /// Consumes the delta and returns it with `baseline_ref` set to the given
    /// reference, replacing any previous value.
    pub fn with_baseline_ref(self, baseline_ref: EntityRef) -> Self {
        Self {
            baseline_ref: Some(baseline_ref),
            ..self
        }
    }
}
/// One evaluator judgment about a single subject.
///
/// Judgments are collected in [`EvaluationReport`]'s `judgments` list; a
/// judgment whose `confidence` is measured makes the whole report subject to
/// the measured-confidence validation.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct EvaluatorJudgment {
/// Optional identifier for this judgment; `None` when built via `new`.
pub judgment_id: Option<String>,
/// Reference to the entity being judged.
pub subject_ref: EntityRef,
/// Optional identifier of the criterion applied; `None` when built via `new`.
pub criterion_id: Option<String>,
/// Outcome of the judgment.
pub verdict: EvaluationVerdict,
/// Optional score, kept as a string; nothing in this file parses or validates it.
pub score: Option<String>,
/// References offered in support of the judgment; empty when built via `new`.
pub support_refs: Vec<EntityRef>,
/// References that were rejected as support (presumably considered and
/// discarded by the evaluator -- confirm); empty when built via `new`.
pub rejected_support_refs: Vec<EntityRef>,
/// Confidence tier claimed for this judgment.
pub confidence: EvaluationConfidence,
/// Summary text. The name suggests it was redacted before reaching this
/// type; this type does not enforce that.
pub redacted_summary: String,
/// Free-form limitation notes; empty when built via `new`.
pub limitations: Vec<String>,
}
impl EvaluatorJudgment {
    /// Builds a judgment from its required parts.
    ///
    /// `redacted_summary` is converted into an owned `String`. All optional
    /// identifiers and the score start as `None`, and every list starts empty.
    pub fn new(
        subject_ref: EntityRef,
        verdict: EvaluationVerdict,
        confidence: EvaluationConfidence,
        redacted_summary: impl Into<String>,
    ) -> Self {
        let redacted_summary = redacted_summary.into();

        Self {
            // Caller-supplied values.
            subject_ref,
            verdict,
            confidence,
            redacted_summary,
            // Everything else starts unset or empty.
            judgment_id: None,
            criterion_id: None,
            score: None,
            support_refs: Vec::new(),
            rejected_support_refs: Vec::new(),
            limitations: Vec::new(),
        }
    }
}
/// The result of an evaluation: overall verdict and confidence plus the
/// judgments, metric deltas and usage that accompany them.
///
/// When the report or any of its judgments claims a measured confidence tier,
/// the `validate_confidence_contract*` methods require a suitable comparison
/// design, comparison evidence and at least one metric delta.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct EvaluationReport {
/// Identifier of the evaluation this report belongs to.
pub evaluation_id: EvaluationId,
/// Scope of the evaluation (type defined elsewhere in the crate).
pub scope: EvaluationScope,
/// Comparison design; inspected by the measured-confidence validation.
pub comparison: ComparisonDesign,
/// Overall outcome of the evaluation.
pub verdict: EvaluationVerdict,
/// Optional overall score, kept as a string; `None` when built via `new`.
pub score: Option<String>,
/// Overall confidence tier claimed by the report.
pub confidence: EvaluationConfidence,
/// Individual judgments; appended to by `with_judgment`.
pub judgments: Vec<EvaluatorJudgment>,
/// Metric deltas; appended to by `with_metric_delta`.
pub metric_deltas: Vec<EvaluationMetricDelta>,
/// Evidence references; empty when built via `new`.
pub evidence_refs: Vec<EntityRef>,
/// Usage record; `EvaluationUsage::default()` unless replaced by `with_usage`.
pub usage: EvaluationUsage,
/// Summary text. The name suggests it was redacted before reaching this
/// type; this type does not enforce that.
pub redacted_summary: String,
/// Free-form limitation notes; empty when built via `new`.
pub limitations: Vec<String>,
}
impl EvaluationReport {
    /// Builds a report from its required parts.
    ///
    /// The score starts as `None`, every list starts empty, and `usage` is
    /// `EvaluationUsage::default()`. `redacted_summary` is converted into an
    /// owned `String`.
    pub fn new(
        evaluation_id: EvaluationId,
        scope: EvaluationScope,
        comparison: ComparisonDesign,
        verdict: EvaluationVerdict,
        confidence: EvaluationConfidence,
        redacted_summary: impl Into<String>,
    ) -> Self {
        let usage = EvaluationUsage::default();
        let redacted_summary = redacted_summary.into();

        Self {
            // Caller-supplied values.
            evaluation_id,
            scope,
            comparison,
            verdict,
            confidence,
            redacted_summary,
            // Everything else starts unset, empty or defaulted.
            score: None,
            judgments: Vec::new(),
            metric_deltas: Vec::new(),
            evidence_refs: Vec::new(),
            usage,
            limitations: Vec::new(),
        }
    }

    /// Consumes the report and returns it with `usage` replaced.
    pub fn with_usage(self, usage: EvaluationUsage) -> Self {
        Self { usage, ..self }
    }

    /// Consumes the report and returns it with `judgment` appended to `judgments`.
    pub fn with_judgment(mut self, judgment: EvaluatorJudgment) -> Self {
        self.judgments.push(judgment);
        self
    }

    /// Consumes the report and returns it with `metric_delta` appended to
    /// `metric_deltas`.
    pub fn with_metric_delta(mut self, metric_delta: EvaluationMetricDelta) -> Self {
        self.metric_deltas.push(metric_delta);
        self
    }

    /// Checks the measured-confidence contract against the report's own
    /// comparison design and metric deltas.
    ///
    /// # Errors
    ///
    /// Returns a contract-violation [`AgentError`] when measured confidence is
    /// claimed but the comparison design does not support it, carries no
    /// comparison evidence, or no metric delta is present.
    pub fn validate_confidence_contract(&self) -> Result<(), AgentError> {
        let own_comparison = &self.comparison;
        let own_deltas = &self.metric_deltas;
        self.validate_measured_confidence(own_comparison, own_deltas)
    }

    /// Checks the measured-confidence contract against the originating request.
    ///
    /// The request's comparison design and metric deltas are validated first.
    /// If measured confidence is claimed, the report's comparison and metric
    /// deltas must additionally be equal to the request's.
    ///
    /// # Errors
    ///
    /// Returns a contract-violation [`AgentError`] when measured confidence is
    /// claimed and either the request data fails the measured-confidence
    /// requirements or the report's comparison / metric deltas differ from the
    /// request's.
    pub fn validate_confidence_contract_for_request(
        &self,
        request: &EvaluationRequest,
    ) -> Result<(), AgentError> {
        self.validate_measured_confidence(&request.comparison, &request.metric_deltas)?;

        // Without a measured claim there is nothing to tie back to the request.
        if !self.claims_measured_confidence() {
            return Ok(());
        }

        if self.comparison != request.comparison {
            return Err(AgentError::contract_violation(
                "measured evaluation comparison must match the evaluation request",
            ));
        }
        if self.metric_deltas != request.metric_deltas {
            return Err(AgentError::contract_violation(
                "measured evaluation metric deltas must come from the evaluation request",
            ));
        }
        Ok(())
    }

    /// Returns `true` when the report's own confidence, or the confidence of
    /// any attached judgment, is a measured tier.
    fn claims_measured_confidence(&self) -> bool {
        self.confidence.is_measured()
            || self
                .judgments
                .iter()
                .any(|entry| entry.confidence.is_measured())
    }

    /// Shared measured-confidence check over the supplied comparison design
    /// and metric deltas.
    ///
    /// Passes immediately when no measured confidence is claimed. Otherwise the
    /// checks run in order -- design support, comparison evidence, non-empty
    /// metric deltas -- and the first failure is reported.
    fn validate_measured_confidence(
        &self,
        comparison: &ComparisonDesign,
        metric_deltas: &[EvaluationMetricDelta],
    ) -> Result<(), AgentError> {
        if !self.claims_measured_confidence() {
            return Ok(());
        }

        let violation = if !comparison.supports_measured_confidence() {
            "measured evaluation confidence requires a comparison design"
        } else if !comparison.has_comparison_evidence() {
            "measured evaluation confidence requires comparison evidence refs"
        } else if metric_deltas.is_empty() {
            "measured evaluation confidence requires at least one metric delta"
        } else {
            return Ok(());
        };

        Err(AgentError::contract_violation(violation))
    }
}