swink_agent_eval/evaluators/
rag.rs1#![forbid(unsafe_code)]
24#![cfg(feature = "evaluator-rag")]
25
26use std::sync::Arc;
27
28use crate::evaluator::Evaluator;
29use crate::score::Score;
30use crate::types::{EvalCase, EvalMetricResult, Invocation};
31
32use super::{JudgeEvaluatorConfig, build_prompt_context, evaluate_with_builtin};
33
34fn has_final_response(invocation: &Invocation) -> bool {
35 invocation
36 .final_response
37 .as_deref()
38 .is_some_and(|s| !s.trim().is_empty())
39}
40
41fn has_user_prompt(case: &EvalCase) -> bool {
42 !case.user_messages.is_empty()
43}
44
45fn has_retrieved_context(case: &EvalCase) -> bool {
46 !case.few_shot_examples.is_empty()
47}
48
/// Generates a judge-backed evaluator type.
///
/// Arguments, in order:
/// 1. zero or more attributes (typically doc comments) attached to the
///    generated struct,
/// 2. the struct name,
/// 3. the string literal returned by `Evaluator::name` and passed to
///    `evaluate_with_builtin`,
/// 4. the string literal identifying the built-in prompt template,
/// 5. a non-capturing closure `|case, invocation| -> bool`; when it returns
///    `false` the evaluator yields `None` instead of a metric result.
macro_rules! rag_evaluator {
    (
        $(#[$attr:meta])*
        $ty:ident, $metric:literal, $template:literal, $applies:expr
    ) => {
        $(#[$attr])*
        pub struct $ty {
            config: JudgeEvaluatorConfig,
        }

        impl $ty {
            /// Creates the evaluator around an existing judge configuration.
            #[must_use]
            pub const fn new(config: JudgeEvaluatorConfig) -> Self {
                Self { config }
            }

            /// Returns the evaluator with `template` applied through the
            /// configuration's `with_prompt`.
            #[must_use]
            pub fn with_prompt(
                self,
                template: Arc<dyn crate::prompt::JudgePromptTemplate>,
            ) -> Self {
                Self {
                    config: self.config.with_prompt(template),
                }
            }

            /// Returns the evaluator with `examples` applied through the
            /// configuration's `with_few_shot`.
            #[must_use]
            pub fn with_few_shot(self, examples: Vec<crate::types::FewShotExample>) -> Self {
                Self {
                    config: self.config.with_few_shot(examples),
                }
            }

            /// Returns the evaluator with `prompt` applied through the
            /// configuration's `with_system_prompt`.
            #[must_use]
            pub fn with_system_prompt(self, prompt: impl Into<String>) -> Self {
                Self {
                    config: self.config.with_system_prompt(prompt),
                }
            }

            /// Returns the evaluator with `schema` applied through the
            /// configuration's `with_output_schema`.
            #[must_use]
            pub fn with_output_schema(self, schema: serde_json::Value) -> Self {
                Self {
                    config: self.config.with_output_schema(schema),
                }
            }

            /// Returns the evaluator with `flag` applied through the
            /// configuration's `with_use_reasoning`.
            #[must_use]
            pub fn with_use_reasoning(self, flag: bool) -> Self {
                Self {
                    config: self.config.with_use_reasoning(flag),
                }
            }

            /// Returns the evaluator with `key` applied through the
            /// configuration's `with_feedback_key`.
            #[must_use]
            pub fn with_feedback_key(self, key: impl Into<String>) -> Self {
                Self {
                    config: self.config.with_feedback_key(key),
                }
            }

            /// Borrows the judge configuration held by this evaluator.
            #[must_use]
            pub const fn config(&self) -> &JudgeEvaluatorConfig {
                &self.config
            }
        }

        impl $crate::evaluators::JudgeEvaluatorBuilder for $ty {
            fn judge_config_mut(&mut self) -> &mut JudgeEvaluatorConfig {
                &mut self.config
            }
        }

        impl Evaluator for $ty {
            fn name(&self) -> &'static str {
                $metric
            }

            fn evaluate(
                &self,
                case: &EvalCase,
                invocation: &Invocation,
            ) -> Option<EvalMetricResult> {
                // Coercing to a plain `fn` pointer lets the closure's
                // parameter types be inferred and rejects capturing closures.
                let applies: fn(&EvalCase, &Invocation) -> bool = $applies;

                // The prompt context is only built when the criterion holds.
                applies(case, invocation).then(|| {
                    let context = build_prompt_context(&self.config, case, invocation);
                    evaluate_with_builtin($metric, $template, &self.config, &context)
                })
            }
        }
    };
}
151
152rag_evaluator! {
153 RAGGroundednessEvaluator,
160 "rag_groundedness",
161 "rag_groundedness_v0",
162 |case, invocation| has_retrieved_context(case)
163 && has_user_prompt(case)
164 && has_final_response(invocation)
165}
166
167rag_evaluator! {
168 RAGRetrievalRelevanceEvaluator,
175 "rag_retrieval_relevance",
176 "rag_retrieval_relevance_v0",
177 |case, _invocation| has_retrieved_context(case) && has_user_prompt(case)
178}
179
180rag_evaluator! {
181 RAGHelpfulnessEvaluator,
187 "rag_helpfulness",
188 "rag_helpfulness_v0",
189 |case, invocation| has_retrieved_context(case)
190 && has_user_prompt(case)
191 && has_final_response(invocation)
192}
193
194#[derive(Debug, thiserror::Error)]
203pub enum EmbedderError {
204 #[error("invalid input: {reason}")]
206 InvalidInput {
207 reason: String,
209 },
210 #[error("embedder backend error: {reason}")]
212 Backend {
213 reason: String,
215 },
216 #[error("dimension mismatch: response={response_dim} reference={reference_dim}")]
218 DimensionMismatch {
219 response_dim: usize,
221 reference_dim: usize,
223 },
224}
225
226pub trait Embedder: Send + Sync {
232 fn embed(&self, text: &str) -> Result<Vec<f32>, EmbedderError>;
234}
235
/// Computes the cosine similarity of `a` and `b`, clamped to `[-1.0, 1.0]`.
///
/// Arithmetic is carried out in `f64` to limit accumulated rounding error.
///
/// Degenerate inputs have no meaningful angle and yield `0.0`:
/// - the slices differ in length or are empty,
/// - either vector has zero magnitude,
/// - any component is NaN or infinite (the quotient would be NaN, which
///   `clamp` would otherwise pass through to the caller).
fn cosine_similarity(a: &[f32], b: &[f32]) -> f64 {
    if a.len() != b.len() || a.is_empty() {
        return 0.0;
    }

    let mut dot = 0.0_f64;
    let mut norm_a_sq = 0.0_f64;
    let mut norm_b_sq = 0.0_f64;
    for (&x, &y) in a.iter().zip(b) {
        let (x, y) = (f64::from(x), f64::from(y));
        dot += x * y;
        norm_a_sq += x * x;
        norm_b_sq += y * y;
    }

    if norm_a_sq == 0.0 || norm_b_sq == 0.0 {
        return 0.0;
    }

    let similarity = dot / (norm_a_sq.sqrt() * norm_b_sq.sqrt());
    if similarity.is_finite() {
        // Rounding can push the quotient marginally outside the valid range.
        similarity.clamp(-1.0, 1.0)
    } else {
        // NaN/infinite components: report "no similarity" instead of NaN.
        0.0
    }
}
259
/// Threshold that [`EmbeddingSimilarityEvaluator::new`] installs by default.
///
/// It is handed to `Score::new` together with the similarity after that value
/// has been remapped into `[0.0, 1.0]`.
pub const DEFAULT_EMBEDDING_SIMILARITY_THRESHOLD: f64 = 0.8;
262
263pub struct EmbeddingSimilarityEvaluator {
278 name: &'static str,
279 reference: String,
280 threshold: f64,
281 embedder: Arc<dyn Embedder>,
282}
283
284impl EmbeddingSimilarityEvaluator {
285 #[must_use]
287 pub fn new(reference: impl Into<String>, embedder: Arc<dyn Embedder>) -> Self {
288 Self {
289 name: "embedding_similarity",
290 reference: reference.into(),
291 threshold: DEFAULT_EMBEDDING_SIMILARITY_THRESHOLD,
292 embedder,
293 }
294 }
295
296 #[must_use]
298 pub const fn with_name(mut self, name: &'static str) -> Self {
299 self.name = name;
300 self
301 }
302
303 #[must_use]
310 pub const fn with_threshold(mut self, threshold: f64) -> Self {
311 self.threshold = threshold;
312 self
313 }
314
315 #[must_use]
317 pub fn reference(&self) -> &str {
318 &self.reference
319 }
320
321 #[must_use]
323 pub const fn threshold(&self) -> f64 {
324 self.threshold
325 }
326}
327
328impl Evaluator for EmbeddingSimilarityEvaluator {
329 fn name(&self) -> &'static str {
330 self.name
331 }
332
333 fn evaluate(&self, _case: &EvalCase, invocation: &Invocation) -> Option<EvalMetricResult> {
334 let actual = invocation.final_response.as_deref()?;
336 if actual.trim().is_empty() {
337 return None;
338 }
339
340 let name = self.name.to_string();
341 let a = match self.embedder.embed(actual) {
342 Ok(v) => v,
343 Err(err) => {
344 return Some(EvalMetricResult {
345 evaluator_name: name,
346 score: Score::fail(),
347 details: Some(format!("embed_response: {err}")),
348 });
349 }
350 };
351 let b = match self.embedder.embed(&self.reference) {
352 Ok(v) => v,
353 Err(err) => {
354 return Some(EvalMetricResult {
355 evaluator_name: name,
356 score: Score::fail(),
357 details: Some(format!("embed_reference: {err}")),
358 });
359 }
360 };
361 if a.len() != b.len() {
362 let err = EmbedderError::DimensionMismatch {
363 response_dim: a.len(),
364 reference_dim: b.len(),
365 };
366 return Some(EvalMetricResult {
367 evaluator_name: name,
368 score: Score::fail(),
369 details: Some(err.to_string()),
370 });
371 }
372
373 let raw = cosine_similarity(&a, &b);
374 let remapped = f64::midpoint(raw, 1.0).clamp(0.0, 1.0);
377 let score = Score::new(remapped, self.threshold);
378 Some(EvalMetricResult {
379 evaluator_name: name,
380 score,
381 details: Some(format!(
382 "cosine_similarity={raw:.4} remapped={remapped:.4} threshold={:.4}",
383 self.threshold
384 )),
385 })
386 }
387}
388
#[cfg(test)]
mod tests {
    use super::*;

    /// Absolute tolerance used when comparing cosine values.
    const EPS: f64 = 1e-9;

    /// Asserts that `cosine_similarity(a, b)` is within `EPS` of `expected`.
    fn assert_cosine(a: &[f32], b: &[f32], expected: f64) {
        let got = cosine_similarity(a, b);
        assert!((got - expected).abs() < EPS);
    }

    #[test]
    fn cosine_similarity_of_identical_vectors_is_one() {
        let v = [1.0_f32, 0.0, 0.0];
        assert_cosine(&v, &v, 1.0);
    }

    #[test]
    fn cosine_similarity_of_opposite_vectors_is_minus_one() {
        assert_cosine(&[1.0, 0.0], &[-1.0, 0.0], -1.0);
    }

    #[test]
    fn cosine_similarity_orthogonal_vectors_is_zero() {
        assert_cosine(&[1.0, 0.0], &[0.0, 1.0], 0.0);
    }

    #[test]
    fn cosine_similarity_mismatched_dims_is_zero() {
        assert_cosine(&[1.0, 0.0], &[1.0], 0.0);
    }

    #[test]
    fn cosine_similarity_empty_vectors_is_zero() {
        assert_cosine(&[], &[], 0.0);
    }
}