swink_agent_eval/evaluators/
rag.rs

//! RAG-family evaluators (T066, T067).
//!
//! Three judge-backed evaluators score retrieval-augmented generation:
//!
//! * [`RAGGroundednessEvaluator`] — every claim in the response is supported
//!   by the retrieved context (prompt: `rag_groundedness_v0`).
//! * [`RAGRetrievalRelevanceEvaluator`] — the retrieved context is relevant
//!   to the user's prompt (prompt: `rag_retrieval_relevance_v0`).
//! * [`RAGHelpfulnessEvaluator`] — the response leverages the retrieved
//!   context to help the user (prompt: `rag_helpfulness_v0`).
//!
//! All three consume the retrieved context from `EvalCase::few_shot_examples`
//! — spec 043's canonical retrieval surface (see Quality family's
//! `FaithfulnessEvaluator` for the same convention).
//!
//! This module also ships the deterministic [`EmbeddingSimilarityEvaluator`]
//! and the [`Embedder`] trait + [`EmbedderError`] enum. The similarity
//! evaluator does NOT call a judge: it embeds the response and the reference
//! text via a caller-supplied [`Embedder`] implementation, computes their
//! cosine similarity, remaps it from `[-1.0, 1.0]` into `[0.0, 1.0]` via
//! `(sim + 1) / 2`, and scores the remapped value against the configured
//! threshold (default `0.8`, i.e. a raw cosine similarity of `0.6`).
22
23#![forbid(unsafe_code)]
24#![cfg(feature = "evaluator-rag")]
25
26use std::sync::Arc;
27
28use crate::evaluator::Evaluator;
29use crate::score::Score;
30use crate::types::{EvalCase, EvalMetricResult, Invocation};
31
32use super::{JudgeEvaluatorConfig, build_prompt_context, evaluate_with_builtin};
33
34fn has_final_response(invocation: &Invocation) -> bool {
35    invocation
36        .final_response
37        .as_deref()
38        .is_some_and(|s| !s.trim().is_empty())
39}
40
41fn has_user_prompt(case: &EvalCase) -> bool {
42    !case.user_messages.is_empty()
43}
44
45fn has_retrieved_context(case: &EvalCase) -> bool {
46    !case.few_shot_examples.is_empty()
47}
48
/// Generates a judge-backed, single-rubric RAG evaluator.
///
/// Arguments, in order: optional attributes/doc comments for the generated
/// struct, the struct name, the evaluator name literal, the built-in prompt
/// template literal, and the FR-020 criterion — a non-capturing closure
/// `(&EvalCase, &Invocation) -> bool` deciding whether the evaluator applies.
///
/// The expansion contains the struct (a thin wrapper around
/// [`JudgeEvaluatorConfig`]), its builder-style inherent methods, a
/// `JudgeEvaluatorBuilder` impl, and an [`Evaluator`] impl whose `evaluate`
/// delegates to [`evaluate_with_builtin`].
macro_rules! rag_evaluator {
    (
        $(#[$meta:meta])*
        $name:ident, $eval_name:literal, $template:literal, $criterion:expr
    ) => {
        $(#[$meta])*
        pub struct $name {
            config: JudgeEvaluatorConfig,
        }

        impl $name {
            /// Wrap the supplied judge config in a new evaluator.
            #[must_use]
            pub const fn new(config: JudgeEvaluatorConfig) -> Self {
                Self { config }
            }

            /// Replace the prompt template used by this evaluator.
            #[must_use]
            pub fn with_prompt(self, template: Arc<dyn crate::prompt::JudgePromptTemplate>) -> Self {
                Self {
                    config: self.config.with_prompt(template),
                }
            }

            /// Attach evaluator-level few-shot examples; these render ahead
            /// of any case-level examples.
            #[must_use]
            pub fn with_few_shot(self, examples: Vec<crate::types::FewShotExample>) -> Self {
                Self {
                    config: self.config.with_few_shot(examples),
                }
            }

            /// Replace the system prompt visible to the template render.
            #[must_use]
            pub fn with_system_prompt(self, prompt: impl Into<String>) -> Self {
                Self {
                    config: self.config.with_system_prompt(prompt),
                }
            }

            /// Attach an output schema for custom prompt templates.
            #[must_use]
            pub fn with_output_schema(self, schema: serde_json::Value) -> Self {
                Self {
                    config: self.config.with_output_schema(schema),
                }
            }

            /// Enable or disable judge reasoning capture.
            #[must_use]
            pub fn with_use_reasoning(self, flag: bool) -> Self {
                Self {
                    config: self.config.with_use_reasoning(flag),
                }
            }

            /// Replace the feedback key used by downstream exporters.
            #[must_use]
            pub fn with_feedback_key(self, key: impl Into<String>) -> Self {
                Self {
                    config: self.config.with_feedback_key(key),
                }
            }

            /// Borrow the wrapped config (e.g., to inspect the judge
            /// registry or feedback key).
            #[must_use]
            pub const fn config(&self) -> &JudgeEvaluatorConfig {
                &self.config
            }
        }

        impl $crate::evaluators::JudgeEvaluatorBuilder for $name {
            fn judge_config_mut(&mut self) -> &mut JudgeEvaluatorConfig {
                &mut self.config
            }
        }

        impl Evaluator for $name {
            fn name(&self) -> &'static str {
                $eval_name
            }

            fn evaluate(
                &self,
                case: &EvalCase,
                invocation: &Invocation,
            ) -> Option<EvalMetricResult> {
                // FR-020: yield `None` when the criterion is not met. The
                // prompt context is only built once the criterion passes.
                let criterion: fn(&EvalCase, &Invocation) -> bool = $criterion;
                criterion(case, invocation).then(|| {
                    let context = build_prompt_context(&self.config, case, invocation);
                    evaluate_with_builtin($eval_name, $template, &self.config, &context)
                })
            }
        }
    };
}
151
152rag_evaluator! {
153    /// Groundedness of the response against the retrieved context
154    /// (prompt: `rag_groundedness_v0`).
155    ///
156    /// Criterion: the case must carry retrieved context
157    /// (`few_shot_examples` non-empty), a user prompt, and a non-empty
158    /// final response.
159    RAGGroundednessEvaluator,
160    "rag_groundedness",
161    "rag_groundedness_v0",
162    |case, invocation| has_retrieved_context(case)
163        && has_user_prompt(case)
164        && has_final_response(invocation)
165}
166
167rag_evaluator! {
168    /// Relevance of the retrieved context to the user prompt
169    /// (prompt: `rag_retrieval_relevance_v0`).
170    ///
171    /// Criterion: the case must carry retrieved context and a user prompt.
172    /// A final response is not required — this rubric scores retrieval
173    /// quality, not generation quality.
174    RAGRetrievalRelevanceEvaluator,
175    "rag_retrieval_relevance",
176    "rag_retrieval_relevance_v0",
177    |case, _invocation| has_retrieved_context(case) && has_user_prompt(case)
178}
179
180rag_evaluator! {
181    /// Helpfulness of the response with respect to the retrieved context
182    /// (prompt: `rag_helpfulness_v0`).
183    ///
184    /// Criterion: retrieved context, a user prompt, and a non-empty final
185    /// response must all be present.
186    RAGHelpfulnessEvaluator,
187    "rag_helpfulness",
188    "rag_helpfulness_v0",
189    |case, invocation| has_retrieved_context(case)
190        && has_user_prompt(case)
191        && has_final_response(invocation)
192}
193
194// ─── Embedding similarity (deterministic, no judge) ─────────────────────────
195
196/// Errors reported by an [`Embedder`] implementation.
197///
198/// Surfaced verbatim in [`EvalMetricResult::details`] when the
199/// [`EmbeddingSimilarityEvaluator`] folds an embedding failure into
200/// `Score::fail()` (FR-021 — the evaluator never crashes on a transport
201/// hiccup).
202#[derive(Debug, thiserror::Error)]
203pub enum EmbedderError {
204    /// Input text was rejected by the embedder (empty, too long, etc.).
205    #[error("invalid input: {reason}")]
206    InvalidInput {
207        /// Human-readable explanation.
208        reason: String,
209    },
210    /// The embedding backend failed (network, auth, quota, etc.).
211    #[error("embedder backend error: {reason}")]
212    Backend {
213        /// Human-readable explanation.
214        reason: String,
215    },
216    /// The returned vectors had mismatched dimensions.
217    #[error("dimension mismatch: response={response_dim} reference={reference_dim}")]
218    DimensionMismatch {
219        /// Dimensionality of the response embedding.
220        response_dim: usize,
221        /// Dimensionality of the reference embedding.
222        reference_dim: usize,
223    },
224}
225
226/// Pluggable embedding backend used by [`EmbeddingSimilarityEvaluator`].
227///
228/// Implementations map a string to a dense vector. The evaluator only
229/// consumes the trait — concrete backends (OpenAI, local model, stub) live
230/// outside this crate.
231pub trait Embedder: Send + Sync {
232    /// Embed a single text into a dense vector.
233    fn embed(&self, text: &str) -> Result<Vec<f32>, EmbedderError>;
234}
235
/// Cosine similarity between two vectors, clamped into `[-1.0, 1.0]`.
///
/// Components are widened to `f64` before accumulation so the dot product
/// and the squared norms do not lose precision.
///
/// Returns `0.0` for every degenerate input instead of propagating an
/// undefined value:
///
/// * the slices differ in length or are empty,
/// * either vector has zero magnitude,
/// * the result is not finite (a component is NaN or infinite) — `clamp`
///   leaves NaN untouched, so without this guard NaN would escape the
///   documented range.
fn cosine_similarity(a: &[f32], b: &[f32]) -> f64 {
    if a.len() != b.len() || a.is_empty() {
        return 0.0;
    }

    // Single pass: (dot product, |a|², |b|²), accumulated in index order.
    let (dot, norm_a_sq, norm_b_sq) = a.iter().zip(b).fold(
        (0.0_f64, 0.0_f64, 0.0_f64),
        |(dot, norm_a_sq, norm_b_sq), (&x, &y)| {
            let (x, y) = (f64::from(x), f64::from(y));
            (dot + x * y, norm_a_sq + x * x, norm_b_sq + y * y)
        },
    );

    if norm_a_sq == 0.0 || norm_b_sq == 0.0 {
        return 0.0;
    }

    let sim = dot / (norm_a_sq.sqrt() * norm_b_sq.sqrt());
    if sim.is_finite() {
        // Rounding can push the quotient marginally outside [-1, 1].
        sim.clamp(-1.0, 1.0)
    } else {
        0.0
    }
}
259
/// Default pass threshold for [`EmbeddingSimilarityEvaluator`].
///
/// The threshold is compared against the remapped similarity in
/// `[0.0, 1.0]` (`(cosine + 1) / 2`), so `0.8` corresponds to a raw cosine
/// similarity of `0.6`.
pub const DEFAULT_EMBEDDING_SIMILARITY_THRESHOLD: f64 = 0.8;
262
263/// Deterministic cosine-similarity evaluator (T067).
264///
265/// Given a caller-supplied reference text and a caller-supplied [`Embedder`],
266/// the evaluator:
267///
268/// 1. Returns `None` when the invocation has no final response (FR-020).
269/// 2. Embeds both strings via the configured embedder.
270/// 3. Computes cosine similarity, remaps it into `[0.0, 1.0]` via
271///    `(sim + 1) / 2`, and emits a [`Score`] with the configured threshold.
272///
273/// Embedder failures fold into `Score::fail()` with the error message
274/// recorded in `details` — panics are not possible from this codepath
275/// because no user code runs synchronously beyond the trait call, which is
276/// wrapped by the registry's `isolate_panic` guard.
277pub struct EmbeddingSimilarityEvaluator {
278    name: &'static str,
279    reference: String,
280    threshold: f64,
281    embedder: Arc<dyn Embedder>,
282}
283
284impl EmbeddingSimilarityEvaluator {
285    /// Construct with the given reference text and embedder.
286    #[must_use]
287    pub fn new(reference: impl Into<String>, embedder: Arc<dyn Embedder>) -> Self {
288        Self {
289            name: "embedding_similarity",
290            reference: reference.into(),
291            threshold: DEFAULT_EMBEDDING_SIMILARITY_THRESHOLD,
292            embedder,
293        }
294    }
295
296    /// Override the evaluator's reported name.
297    #[must_use]
298    pub const fn with_name(mut self, name: &'static str) -> Self {
299        self.name = name;
300        self
301    }
302
303    /// Override the pass threshold applied to the remapped similarity score.
304    ///
305    /// The threshold is interpreted in `[0.0, 1.0]` — the evaluator remaps
306    /// raw cosine similarity from `[-1.0, 1.0]` into `[0.0, 1.0]` before
307    /// comparing against the threshold. Default is
308    /// [`DEFAULT_EMBEDDING_SIMILARITY_THRESHOLD`] (`0.8`).
309    #[must_use]
310    pub const fn with_threshold(mut self, threshold: f64) -> Self {
311        self.threshold = threshold;
312        self
313    }
314
315    /// Borrow the reference text.
316    #[must_use]
317    pub fn reference(&self) -> &str {
318        &self.reference
319    }
320
321    /// The configured pass threshold.
322    #[must_use]
323    pub const fn threshold(&self) -> f64 {
324        self.threshold
325    }
326}
327
328impl Evaluator for EmbeddingSimilarityEvaluator {
329    fn name(&self) -> &'static str {
330        self.name
331    }
332
333    fn evaluate(&self, _case: &EvalCase, invocation: &Invocation) -> Option<EvalMetricResult> {
334        // FR-020: criterion is a non-empty final response.
335        let actual = invocation.final_response.as_deref()?;
336        if actual.trim().is_empty() {
337            return None;
338        }
339
340        let name = self.name.to_string();
341        let a = match self.embedder.embed(actual) {
342            Ok(v) => v,
343            Err(err) => {
344                return Some(EvalMetricResult {
345                    evaluator_name: name,
346                    score: Score::fail(),
347                    details: Some(format!("embed_response: {err}")),
348                });
349            }
350        };
351        let b = match self.embedder.embed(&self.reference) {
352            Ok(v) => v,
353            Err(err) => {
354                return Some(EvalMetricResult {
355                    evaluator_name: name,
356                    score: Score::fail(),
357                    details: Some(format!("embed_reference: {err}")),
358                });
359            }
360        };
361        if a.len() != b.len() {
362            let err = EmbedderError::DimensionMismatch {
363                response_dim: a.len(),
364                reference_dim: b.len(),
365            };
366            return Some(EvalMetricResult {
367                evaluator_name: name,
368                score: Score::fail(),
369                details: Some(err.to_string()),
370            });
371        }
372
373        let raw = cosine_similarity(&a, &b);
374        // Remap cosine similarity from [-1, 1] into [0, 1] so the score
375        // honours the Score::new clamp without collapsing negative values.
376        let remapped = f64::midpoint(raw, 1.0).clamp(0.0, 1.0);
377        let score = Score::new(remapped, self.threshold);
378        Some(EvalMetricResult {
379            evaluator_name: name,
380            score,
381            details: Some(format!(
382                "cosine_similarity={raw:.4} remapped={remapped:.4} threshold={:.4}",
383                self.threshold
384            )),
385        })
386    }
387}
388
#[cfg(test)]
mod tests {
    use super::*;

    /// Absolute tolerance used when comparing similarities.
    const TOLERANCE: f64 = 1e-9;

    #[test]
    fn cosine_similarity_of_identical_vectors_is_one() {
        let v = [1.0_f32, 0.0, 0.0];
        let sim = cosine_similarity(&v, &v);
        assert!((sim - 1.0).abs() < TOLERANCE);
    }

    #[test]
    fn cosine_similarity_of_opposite_vectors_is_minus_one() {
        let sim = cosine_similarity(&[1.0_f32, 0.0], &[-1.0_f32, 0.0]);
        assert!((sim + 1.0).abs() < TOLERANCE);
    }

    #[test]
    fn cosine_similarity_orthogonal_vectors_is_zero() {
        let sim = cosine_similarity(&[1.0_f32, 0.0], &[0.0_f32, 1.0]);
        assert!(sim.abs() < TOLERANCE);
    }

    #[test]
    fn cosine_similarity_mismatched_dims_is_zero() {
        let sim = cosine_similarity(&[1.0_f32, 0.0], &[1.0_f32]);
        assert!(sim.abs() < TOLERANCE);
    }

    #[test]
    fn cosine_similarity_empty_vectors_is_zero() {
        let empty: [f32; 0] = [];
        let sim = cosine_similarity(&empty, &empty);
        assert!(sim.abs() < TOLERANCE);
    }
}
swink_agent_eval/evaluators/rag.rs

swink_agent_eval/evaluators/
rag.rs