Skip to main content

agent_sdk_eval/
scope.rs

1//! Evaluation scopes, subjects, and expected outcomes.
2
3use serde::{Deserialize, Serialize};
4
5use agent_sdk_core::{EntityRef, RunId, SessionId, TurnId};
6
7#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
8#[serde(tag = "scope", rename_all = "snake_case")]
9/// The durable boundary an evaluation is about.
10pub enum EvaluationScope {
11    /// Evaluate one run.
12    Run {
13        /// Run identifier used for lineage, filtering, replay, and dedupe.
14        run_id: RunId,
15    },
16    /// Evaluate one turn, optionally grouped by session.
17    Turn {
18        /// Optional host-provided session identifier for grouping related turns.
19        session_id: Option<SessionId>,
20        /// Turn identifier for one loop turn within a run.
21        turn_id: TurnId,
22    },
23    /// Evaluate one session timeline.
24    Session {
25        /// Session identifier for grouping related turns.
26        session_id: SessionId,
27    },
28    /// Evaluate a host-defined scope represented by an entity ref.
29    Custom {
30        /// Scope ref owned by the host or optional integration layer.
31        scope_ref: EntityRef,
32    },
33}
34
35#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
36#[serde(rename_all = "snake_case")]
37/// Role a subject plays in an evaluation.
38pub enum EvaluationSubjectRole {
39    /// Main thing being evaluated.
40    Primary,
41    /// Candidate evidence that may have helped the outcome.
42    CandidateEvidence,
43    /// Baseline subject used for comparison.
44    Baseline,
45    /// Comparator subject used in paired or ablation evals.
46    Comparator,
47    /// Constraint that shaped the expected result.
48    Constraint,
49}
50
51#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
52/// One entity being evaluated or compared.
53pub struct EvaluationSubject {
54    /// Entity ref for the subject.
55    pub subject_ref: EntityRef,
56    /// Subject role in this evaluation.
57    pub role: EvaluationSubjectRole,
58    /// Bounded summary safe for logs, journals, events, and telemetry.
59    pub redacted_summary: Option<String>,
60}
61
62impl EvaluationSubject {
63    /// Creates a primary evaluation subject.
64    pub fn primary(subject_ref: EntityRef) -> Self {
65        Self {
66            subject_ref,
67            role: EvaluationSubjectRole::Primary,
68            redacted_summary: None,
69        }
70    }
71
72    /// Returns this subject with a safe summary attached.
73    pub fn with_redacted_summary(mut self, redacted_summary: impl Into<String>) -> Self {
74        self.redacted_summary = Some(redacted_summary.into());
75        self
76    }
77}
78
79#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
80/// One expected-outcome criterion.
81pub struct EvaluationCriterion {
82    /// Stable criterion id owned by the host or eval fixture.
83    pub criterion_id: String,
84    /// Bounded summary of the condition to check.
85    pub redacted_summary: String,
86    /// Optional weight encoded as a string so the SDK stays metric-neutral.
87    pub weight: Option<String>,
88}
89
90impl EvaluationCriterion {
91    /// Creates a criterion from a stable id and safe summary.
92    pub fn new(criterion_id: impl Into<String>, redacted_summary: impl Into<String>) -> Self {
93        Self {
94            criterion_id: criterion_id.into(),
95            redacted_summary: redacted_summary.into(),
96            weight: None,
97        }
98    }
99
100    /// Returns this criterion with a host-defined weight attached.
101    pub fn with_weight(mut self, weight: impl Into<String>) -> Self {
102        self.weight = Some(weight.into());
103        self
104    }
105}
106
107#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
108/// Expected result supplied by a test, host, human reviewer, or eval fixture.
109pub struct ExpectedOutcome {
110    /// Optional outcome entity being checked.
111    pub outcome_ref: Option<EntityRef>,
112    /// Criteria the evaluator should judge.
113    pub criteria: Vec<EvaluationCriterion>,
114    /// Bounded summary safe for provider prompts and test output.
115    pub redacted_summary: String,
116}
117
118impl ExpectedOutcome {
119    /// Creates an expected outcome from a safe summary.
120    pub fn new(redacted_summary: impl Into<String>) -> Self {
121        Self {
122            outcome_ref: None,
123            criteria: Vec::new(),
124            redacted_summary: redacted_summary.into(),
125        }
126    }
127
128    /// Creates a common completion expectation.
129    pub fn completed() -> Self {
130        Self::new("agent completed the requested task").with_criterion(EvaluationCriterion::new(
131            "criterion.completed",
132            "run completed",
133        ))
134    }
135
136    /// Returns this expected outcome with an outcome ref attached.
137    pub fn with_outcome_ref(mut self, outcome_ref: EntityRef) -> Self {
138        self.outcome_ref = Some(outcome_ref);
139        self
140    }
141
142    /// Returns this expected outcome with one criterion appended.
143    pub fn with_criterion(mut self, criterion: EvaluationCriterion) -> Self {
144        self.criteria.push(criterion);
145        self
146    }
147}