1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
//! Evaluation request, budget, and usage records.
use serde::{Deserialize, Serialize};
use agent_sdk_core::{AgentError, ProviderUsage};
use crate::{
ComparisonDesign, EvaluationId, EvaluationMetricDelta, EvaluationScope, EvaluationSubject,
ExpectedOutcome,
};
/// Limits on the work an evaluator may perform for one evaluation.
///
/// Provider-backed evaluators should reject a request that cannot fit within
/// these limits instead of silently making extra calls.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct EvaluationBudget {
    /// Upper bound on provider calls an evaluator may make for this evaluation.
    pub max_provider_calls: u32,
    /// Upper bound on prompt characters sent to a provider-backed evaluator.
    pub max_prompt_chars: usize,
    /// Upper bound on cited support refs accepted from evaluator output.
    pub max_support_refs: usize,
}
impl Default for EvaluationBudget {
    /// Builds the default budget: one provider call, 4,096 prompt characters,
    /// and eight support refs.
    fn default() -> Self {
        const PROVIDER_CALLS: u32 = 1;
        const PROMPT_CHARS: usize = 4_096;
        const SUPPORT_REFS: usize = 8;

        Self {
            max_provider_calls: PROVIDER_CALLS,
            max_prompt_chars: PROMPT_CHARS,
            max_support_refs: SUPPORT_REFS,
        }
    }
}
impl EvaluationBudget {
    /// Checks that this budget permits a provider-backed evaluator to spend
    /// one provider call.
    ///
    /// # Errors
    ///
    /// Returns a contract-violation [`AgentError`] when `max_provider_calls`
    /// is zero.
    pub fn require_provider_call(&self) -> Result<(), AgentError> {
        match self.max_provider_calls {
            0 => Err(AgentError::contract_violation(
                "evaluation budget allows zero provider calls",
            )),
            _ => Ok(()),
        }
    }
}
/// Usage recorded for a single evaluator run.
#[derive(Clone, Debug, Default, Deserialize, Eq, PartialEq, Serialize)]
pub struct EvaluationUsage {
    /// How many provider calls this evaluator made.
    pub provider_calls: u32,
    /// Provider usage accounting, present when the adapter reports it.
    pub provider_usage: Option<ProviderUsage>,
}
/// Input handed to an evaluator.
#[derive(Clone, Debug, Deserialize, Eq, PartialEq, Serialize)]
pub struct EvaluationRequest {
    /// Stable id for this evaluation, used for lineage and test output.
    pub evaluation_id: EvaluationId,
    /// Durable scope under evaluation.
    pub scope: EvaluationScope,
    /// Subjects to evaluate or compare.
    pub subjects: Vec<EvaluationSubject>,
    /// Expected outcome provided by a test, host, or reviewer.
    pub expected_outcome: ExpectedOutcome,
    /// Comparison design applied to this evaluation.
    pub comparison: ComparisonDesign,
    /// Deterministic metric deltas provided by local evaluators or tests.
    pub metric_deltas: Vec<EvaluationMetricDelta>,
    /// Budget constraining evaluator work.
    pub budget: EvaluationBudget,
    /// Bounded summary of the request, safe for logs and prompts.
    pub redacted_summary: String,
}
impl EvaluationRequest {
    /// Creates an evaluation request with observed-only comparison defaults.
    ///
    /// The new request has no subjects and no metric deltas, uses
    /// [`ComparisonDesign::ObservedOnly`] and the default
    /// [`EvaluationBudget`], and copies its `redacted_summary` from
    /// `expected_outcome.redacted_summary`.
    #[must_use]
    pub fn new(
        evaluation_id: EvaluationId,
        scope: EvaluationScope,
        expected_outcome: ExpectedOutcome,
    ) -> Self {
        // Copy the summary before `expected_outcome` is moved into the struct.
        let redacted_summary = expected_outcome.redacted_summary.clone();
        Self {
            evaluation_id,
            scope,
            subjects: Vec::new(),
            expected_outcome,
            comparison: ComparisonDesign::ObservedOnly,
            metric_deltas: Vec::new(),
            budget: EvaluationBudget::default(),
            redacted_summary,
        }
    }

    /// Returns this request with one subject appended.
    ///
    /// The request is consumed; dropping the return value discards it.
    #[must_use]
    pub fn with_subject(mut self, subject: EvaluationSubject) -> Self {
        self.subjects.push(subject);
        self
    }

    /// Returns this request with its comparison design replaced.
    ///
    /// The request is consumed; dropping the return value discards it.
    #[must_use]
    pub fn with_comparison(mut self, comparison: ComparisonDesign) -> Self {
        self.comparison = comparison;
        self
    }

    /// Returns this request with one deterministic metric delta appended.
    ///
    /// The request is consumed; dropping the return value discards it.
    #[must_use]
    pub fn with_metric_delta(mut self, metric_delta: EvaluationMetricDelta) -> Self {
        self.metric_deltas.push(metric_delta);
        self
    }

    /// Returns this request with deterministic metric deltas appended.
    ///
    /// Existing deltas are kept; the new ones are added after them in
    /// iteration order. The request is consumed; dropping the return value
    /// discards it.
    #[must_use]
    pub fn with_metric_deltas(
        mut self,
        metric_deltas: impl IntoIterator<Item = EvaluationMetricDelta>,
    ) -> Self {
        self.metric_deltas.extend(metric_deltas);
        self
    }

    /// Returns this request with its budget replaced.
    ///
    /// The request is consumed; dropping the return value discards it.
    #[must_use]
    pub fn with_budget(mut self, budget: EvaluationBudget) -> Self {
        self.budget = budget;
        self
    }
}