trueno_rag/eval/
types.rs

1//! Core types for the evaluation framework
2
3use serde::{Deserialize, Serialize};
4use std::collections::HashMap;
5
6/// Configuration for eval operations
7#[derive(Debug, Clone, Serialize, Deserialize)]
8pub struct EvalConfig {
9    /// Claude model for generation/judging
10    pub model: String,
11    /// Number of query-chunk pairs to generate
12    pub sample_size: usize,
13    /// Random seed for reproducibility
14    pub seed: u64,
15    /// Top-k results to retrieve
16    pub top_k: usize,
17}
18
19impl Default for EvalConfig {
20    fn default() -> Self {
21        Self {
22            model: "claude-sonnet-4-20250514".to_string(),
23            sample_size: 250,
24            seed: 42,
25            top_k: 10,
26        }
27    }
28}
29
30/// A single ground truth entry (query paired with its source chunk)
31#[derive(Debug, Clone, Serialize, Deserialize)]
32pub struct GroundTruthEntry {
33    /// The evaluation query
34    pub query: String,
35    /// Full text of the source chunk this query was generated from
36    pub chunk_content: String,
37    /// File path of the source chunk
38    pub chunk_source: String,
39    /// Start time in seconds (for media chunks)
40    #[serde(skip_serializing_if = "Option::is_none")]
41    pub chunk_start_secs: Option<f64>,
42    /// End time in seconds (for media chunks)
43    #[serde(skip_serializing_if = "Option::is_none")]
44    pub chunk_end_secs: Option<f64>,
45    /// Domain classification
46    pub domain: String,
47    /// Course directory name
48    pub course: String,
49}
50
51/// Raw retrieval results for a single query
52#[derive(Debug, Clone, Serialize, Deserialize)]
53pub struct RetrievalResultEntry {
54    /// The query that was run
55    pub query: String,
56    /// Domain classification
57    pub domain: String,
58    /// Course directory name
59    pub course: String,
60    /// Retrieved chunks with scores
61    pub results: Vec<RetrievedChunk>,
62    /// Query latency in seconds
63    pub latency_s: f64,
64}
65
66/// A single retrieved chunk from a query
67#[derive(Debug, Clone, Serialize, Deserialize)]
68pub struct RetrievedChunk {
69    /// Chunk text content
70    pub content: String,
71    /// Source file path
72    #[serde(skip_serializing_if = "Option::is_none")]
73    pub source: Option<String>,
74    /// Retrieval score
75    pub score: f32,
76    /// Title
77    #[serde(skip_serializing_if = "Option::is_none")]
78    pub title: Option<String>,
79    /// Start time
80    #[serde(skip_serializing_if = "Option::is_none")]
81    pub start_secs: Option<f64>,
82    /// End time
83    #[serde(skip_serializing_if = "Option::is_none")]
84    pub end_secs: Option<f64>,
85}
86
87/// LLM judge verdict for a (query, chunk) pair
88#[derive(Debug, Clone, Serialize, Deserialize)]
89pub struct JudgeVerdict {
90    /// Is the chunk relevant to the query?
91    pub relevant: bool,
92    /// Brief reasoning from the judge
93    pub reasoning: String,
94}
95
96/// Single cache entry
97#[derive(Debug, Clone, Serialize, Deserialize)]
98pub struct JudgeCacheEntry {
99    /// The verdict
100    pub verdict: JudgeVerdict,
101    /// Model used for judging
102    pub model: String,
103}
104
105/// Persistent cache for LLM judge verdicts
106#[derive(Debug, Clone, Default, Serialize, Deserialize)]
107pub struct JudgeCache {
108    /// Map from cache key (sha256 hex prefix) to verdict
109    pub entries: HashMap<String, JudgeCacheEntry>,
110}
111
112impl JudgeCache {
113    /// Load cache from a JSON file, or return empty cache
114    pub fn load(path: &std::path::Path) -> Self {
115        std::fs::read_to_string(path)
116            .ok()
117            .and_then(|s| serde_json::from_str(&s).ok())
118            .unwrap_or_default()
119    }
120
121    /// Save cache to a JSON file
122    pub fn save(&self, path: &std::path::Path) -> std::io::Result<()> {
123        let json = serde_json::to_string_pretty(self)?;
124        std::fs::write(path, json)
125    }
126
127    /// Compute cache key from query + content
128    pub fn cache_key(query: &str, content: &str) -> String {
129        use sha2::{Digest, Sha256};
130        let mut hasher = Sha256::new();
131        hasher.update(query.as_bytes());
132        hasher.update(b"|||");
133        hasher.update(content.as_bytes());
134        let result = hasher.finalize();
135        hex::encode(&result[..8]) // 16 hex chars
136    }
137
138    /// Look up a cached verdict
139    pub fn get(&self, query: &str, content: &str) -> Option<&JudgeVerdict> {
140        let key = Self::cache_key(query, content);
141        self.entries.get(&key).map(|e| &e.verdict)
142    }
143
144    /// Insert a verdict into the cache
145    pub fn insert(&mut self, query: &str, content: &str, verdict: JudgeVerdict, model: &str) {
146        let key = Self::cache_key(query, content);
147        self.entries.insert(key, JudgeCacheEntry { verdict, model: model.to_string() });
148    }
149}
150
/// Minimal in-crate hex encoder, kept local so the `hex` crate is not
/// needed as a dependency.
mod hex {
    /// Lowercase hex digits, indexed by nibble value.
    const DIGITS: &[u8; 16] = b"0123456789abcdef";

    /// Encodes `bytes` as lowercase hex, two characters per byte.
    pub(crate) fn encode(bytes: &[u8]) -> String {
        let mut out = String::with_capacity(bytes.len() * 2);
        for &byte in bytes {
            // High nibble first, then low nibble.
            out.push(char::from(DIGITS[usize::from(byte >> 4)]));
            out.push(char::from(DIGITS[usize::from(byte & 0x0f)]));
        }
        out
    }
}
161
162/// A single judgment entry (written by Claude Code or external judge)
163#[derive(Debug, Clone, Serialize, Deserialize)]
164pub struct JudgmentEntry {
165    /// The query
166    pub query: String,
167    /// Rank of the chunk being judged (1-indexed)
168    pub rank: usize,
169    /// Whether the chunk is relevant
170    pub relevant: bool,
171    /// Brief reasoning
172    pub reasoning: String,
173    /// Source path (for correlation)
174    #[serde(skip_serializing_if = "Option::is_none")]
175    pub source: Option<String>,
176    /// Retrieval score (for correlation)
177    #[serde(skip_serializing_if = "Option::is_none")]
178    pub score: Option<f32>,
179}
180
181/// Eval output with full results
182#[derive(Debug, Clone, Serialize, Deserialize)]
183pub struct EvalOutput {
184    /// Timestamp of the eval run
185    pub timestamp: String,
186    /// Config used
187    pub config: EvalRunConfig,
188    /// Aggregate metrics
189    pub aggregate: AggregateMetrics,
190    /// Per-domain metrics
191    pub by_domain: HashMap<String, AggregateMetrics>,
192    /// Per-query details
193    pub per_query: Vec<QueryResult>,
194}
195
196/// Config recorded in eval output
197#[derive(Debug, Clone, Serialize, Deserialize)]
198pub struct EvalRunConfig {
199    /// Number of queries evaluated
200    pub num_queries: usize,
201    /// Top-k used for retrieval
202    pub top_k: usize,
203    /// Model used for judging
204    pub judge_model: String,
205    /// Cache hits (saved API calls)
206    pub cache_hits: usize,
207    /// New API calls made
208    pub api_calls: usize,
209}
210
211/// Aggregate metrics across queries
212#[derive(Debug, Clone, Default, Serialize, Deserialize)]
213pub struct AggregateMetrics {
214    /// Number of queries
215    pub num_queries: usize,
216    /// Mean Reciprocal Rank
217    pub mrr: f64,
218    /// NDCG at k=5
219    #[serde(rename = "ndcg@5")]
220    pub ndcg_5: f64,
221    /// NDCG at k=10
222    #[serde(rename = "ndcg@10")]
223    pub ndcg_10: f64,
224    /// Recall at k=5
225    #[serde(rename = "recall@5")]
226    pub recall_5: f64,
227    /// Precision at k=5
228    #[serde(rename = "precision@5")]
229    pub precision_5: f64,
230    /// Hit rate at k=5
231    #[serde(rename = "hit_rate@5")]
232    pub hit_rate_5: f64,
233    /// Hit rate at k=10
234    #[serde(rename = "hit_rate@10")]
235    pub hit_rate_10: f64,
236    /// Mean Average Precision
237    pub map: f64,
238    /// Mean query latency
239    pub mean_latency_s: f64,
240}
241
242/// Per-query result with judge details
243#[derive(Debug, Clone, Serialize, Deserialize)]
244pub struct QueryResult {
245    /// The query
246    pub query: String,
247    /// Domain
248    pub domain: String,
249    /// MRR for this query
250    pub mrr: f64,
251    /// Hit at k=5
252    pub hit_5: bool,
253    /// Number of relevant results in top-10
254    pub relevant_count: usize,
255    /// Total results
256    pub total_results: usize,
257    /// Latency
258    pub latency_s: f64,
259    /// Per-result judgments
260    pub judgments: Vec<ChunkJudgment>,
261}
262
263/// Judgment for a single retrieved chunk
264#[derive(Debug, Clone, Serialize, Deserialize)]
265pub struct ChunkJudgment {
266    /// Rank position (1-indexed)
267    pub rank: usize,
268    /// Retrieval score
269    pub score: f32,
270    /// Source path
271    pub source: Option<String>,
272    /// Whether the judge deemed it relevant
273    pub relevant: bool,
274    /// Judge reasoning
275    pub reasoning: String,
276}
trueno_rag/eval/types.rs

trueno_rag/eval/
types.rs