coding_agent_search/
bakeoff.rs

1use serde::{Deserialize, Serialize};
2use std::cmp::Ordering;
3use std::time::{Duration, Instant};
4
/// Hard eligibility cutoff: models must be released on or after this date.
///
/// Format: `YYYY-MM-DD`. `ModelMetadata::is_eligible` compares release dates
/// against this value as strings, so the zero-padded form is what makes the
/// comparison follow calendar order.
pub const ELIGIBILITY_CUTOFF: &str = "2025-11-01";
8
/// Success criteria from the epic.
///
/// The cold-start, latency and memory limits are inclusive upper bounds:
/// `ValidationReport::check_criteria` accepts a measurement that is exactly
/// equal to the limit.
pub mod criteria {
    /// Maximum allowed cold-start time in milliseconds (2 seconds).
    pub const COLD_START_MAX_MS: u64 = 2000;
    /// Maximum allowed warm p99 latency in milliseconds.
    pub const WARM_P99_MAX_MS: u64 = 250;
    /// Maximum allowed memory usage per model, in megabytes.
    pub const MEMORY_MAX_MB: u64 = 300;
    /// Minimum ratio of candidate quality to baseline (MiniLM) quality: 80%.
    pub const QUALITY_MIN_RATIO: f64 = 0.80;
}
20
/// Model metadata for eligibility checking.
///
/// Describes a single model taking part in a bake-off, either as a candidate
/// or as the comparison baseline.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ModelMetadata {
    /// Model identifier (e.g., "bge-small-en-v1.5").
    pub id: String,
    /// Human-readable name.
    pub name: String,
    /// HuggingFace model ID or source.
    pub source: String,
    /// Release/update date (YYYY-MM-DD format). Checked against
    /// `ELIGIBILITY_CUTOFF` by `is_eligible`.
    pub release_date: String,
    /// Embedding dimension (for embedders).
    pub dimension: Option<usize>,
    /// Model size in bytes. `EvaluationHarness::evaluate` uses this as its
    /// memory estimate and treats `None` as zero.
    pub size_bytes: Option<u64>,
    /// Whether this is a baseline model (not eligible to win, but used for comparison).
    pub is_baseline: bool,
}
39
40impl ModelMetadata {
41    /// Check if the model is eligible based on release date.
42    pub fn is_eligible(&self) -> bool {
43        if self.is_baseline {
44            return false;
45        }
46        self.release_date.as_str() >= ELIGIBILITY_CUTOFF
47    }
48}
49
/// Minimal validation report for bake-off runs.
///
/// One report describes one model evaluated against one corpus. The field
/// notes below describe how `EvaluationHarness::evaluate` fills them in.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ValidationReport {
    /// Identifier of the evaluated model (copied from `ModelMetadata::id`).
    pub model_id: String,
    /// Hash of the corpus the run used (see `EvaluationCorpus::compute_hash`).
    pub corpus_hash: String,
    /// Quality score: the mean of the per-query NDCG values.
    pub ndcg_at_10: f64,
    /// Median (p50) query latency in milliseconds.
    pub latency_ms_p50: u64,
    /// 95th-percentile query latency in milliseconds.
    pub latency_ms_p95: u64,
    /// 99th-percentile query latency in milliseconds.
    pub latency_ms_p99: u64,
    /// Duration of the first embedding call, in milliseconds.
    pub cold_start_ms: u64,
    /// Estimated memory use in megabytes.
    pub memory_mb: u64,
    /// Whether the model passed `ModelMetadata::is_eligible`.
    pub eligible: bool,
    /// Whether the latency, cold-start and memory limits were met
    /// (the result of `check_criteria`; quality is checked separately).
    pub meets_criteria: bool,
    /// Human-readable notes about limits that were exceeded.
    pub warnings: Vec<String>,
}
65
66impl ValidationReport {
67    /// Check if this report meets all success criteria.
68    pub fn check_criteria(&self) -> bool {
69        self.cold_start_ms <= criteria::COLD_START_MAX_MS
70            && self.latency_ms_p99 <= criteria::WARM_P99_MAX_MS
71            && self.memory_mb <= criteria::MEMORY_MAX_MB
72    }
73
74    /// Check quality against a baseline report.
75    pub fn meets_quality_threshold(&self, baseline: &ValidationReport) -> bool {
76        if baseline.ndcg_at_10 == 0.0 {
77            return true;
78        }
79        self.ndcg_at_10 / baseline.ndcg_at_10 >= criteria::QUALITY_MIN_RATIO
80    }
81}
82
/// Latency statistics from a benchmark run.
///
/// All `*_ms` values are in milliseconds. When built through
/// `from_durations`, every sample is truncated to whole milliseconds before
/// any statistic is computed.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct LatencyStats {
    /// Number of samples the statistics were computed from.
    pub samples: usize,
    /// Smallest sample.
    pub min_ms: u64,
    /// Largest sample.
    pub max_ms: u64,
    /// Arithmetic mean of the samples.
    pub mean_ms: f64,
    /// 50th percentile (median).
    pub p50_ms: u64,
    /// 95th percentile.
    pub p95_ms: u64,
    /// 99th percentile.
    pub p99_ms: u64,
}
94
95impl LatencyStats {
96    /// Compute latency statistics from a list of durations.
97    pub fn from_durations(durations: &[Duration]) -> Self {
98        if durations.is_empty() {
99            return Self {
100                samples: 0,
101                min_ms: 0,
102                max_ms: 0,
103                mean_ms: 0.0,
104                p50_ms: 0,
105                p95_ms: 0,
106                p99_ms: 0,
107            };
108        }
109
110        let mut millis: Vec<u64> = durations.iter().map(|d| d.as_millis() as u64).collect();
111        millis.sort_unstable();
112
113        let n = millis.len();
114        let sum: u64 = millis.iter().sum();
115
116        Self {
117            samples: n,
118            min_ms: millis[0],
119            max_ms: millis[n - 1],
120            mean_ms: sum as f64 / n as f64,
121            p50_ms: percentile(&millis, 50),
122            p95_ms: percentile(&millis, 95),
123            p99_ms: percentile(&millis, 99),
124        }
125    }
126}
127
/// Pick the `p`-th percentile from values that are already sorted ascending.
///
/// The selected index is `p * len / 100` (integer division), clamped to the
/// last element. An empty slice yields 0.
fn percentile(sorted: &[u64], p: usize) -> u64 {
    let rank = p * sorted.len() / 100;
    sorted
        .get(rank)
        // A rank past the end (e.g. p = 100) falls back to the largest value.
        .or_else(|| sorted.last())
        .copied()
        .unwrap_or(0)
}
136
/// Timer for measuring operation latency.
///
/// Accumulates one [`Duration`] per timed operation so that summary
/// statistics can be computed later. Derives `Debug` and `Clone` like the
/// other public types in this module, so it can be logged and snapshotted.
#[derive(Debug, Clone)]
pub struct LatencyTimer {
    /// Recorded samples, in the order the operations were timed.
    samples: Vec<Duration>,
}
141
142impl LatencyTimer {
143    pub fn new() -> Self {
144        Self {
145            samples: Vec::new(),
146        }
147    }
148
149    /// Time a single operation and record the duration.
150    pub fn time<F, T>(&mut self, f: F) -> T
151    where
152        F: FnOnce() -> T,
153    {
154        let start = Instant::now();
155        let result = f();
156        self.samples.push(start.elapsed());
157        result
158    }
159
160    /// Get statistics from recorded samples.
161    pub fn stats(&self) -> LatencyStats {
162        LatencyStats::from_durations(&self.samples)
163    }
164
165    /// Clear recorded samples.
166    pub fn clear(&mut self) {
167        self.samples.clear();
168    }
169}
170
171impl Default for LatencyTimer {
172    fn default() -> Self {
173        Self::new()
174    }
175}
176
/// Bake-off comparison result.
///
/// Bundles the baseline report with every candidate report produced on the
/// same corpus, plus the recommendation derived from them.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct BakeoffComparison {
    /// Corpus hash for reproducibility.
    pub corpus_hash: String,
    /// Baseline model report.
    pub baseline: ValidationReport,
    /// All candidate reports.
    pub candidates: Vec<ValidationReport>,
    /// Recommended model ID (best eligible candidate meeting criteria),
    /// or `None` when no candidate qualifies.
    pub recommendation: Option<String>,
    /// Reason for recommendation (or for the lack of one).
    pub recommendation_reason: String,
}
191
192impl BakeoffComparison {
193    /// Find the best eligible candidate that meets all criteria.
194    pub fn find_winner(&self) -> Option<&ValidationReport> {
195        self.candidates
196            .iter()
197            .filter(|r| r.eligible && r.meets_criteria && r.meets_quality_threshold(&self.baseline))
198            .max_by(|a, b| {
199                // Prefer higher quality, then lower latency
200                a.ndcg_at_10
201                    .partial_cmp(&b.ndcg_at_10)
202                    .unwrap_or(Ordering::Equal)
203                    .then_with(|| b.latency_ms_p99.cmp(&a.latency_ms_p99))
204            })
205    }
206}
207
208/// Compute NDCG@k for a list of relevances in rank order.
209/// Non-finite or <= 0 relevances are treated as non-relevant.
210///
211/// `all_ground_truth` should contain ALL ground-truth relevances for the query
212/// (not just those that appeared in the results). The IDCG is computed from
213/// the ideal ranking of these values. Passing only the returned-doc relevances
214/// would inflate NDCG scores for poor retrievers.
215pub fn ndcg_at_k(relevances: &[f64], k: usize, all_ground_truth: &[f64]) -> f64 {
216    if k == 0 || relevances.is_empty() {
217        return 0.0;
218    }
219    let dcg = dcg_at_k(relevances, k);
220    if dcg == 0.0 {
221        return 0.0;
222    }
223    let mut ideal: Vec<f64> = all_ground_truth
224        .iter()
225        .map(|rel| if rel.is_finite() { rel.max(0.0) } else { 0.0 })
226        .collect();
227    ideal.sort_by(|a, b| b.partial_cmp(a).unwrap_or(Ordering::Equal));
228    let idcg = dcg_at_k(&ideal, k);
229    if idcg == 0.0 { 0.0 } else { dcg / idcg }
230}
231
/// Discounted cumulative gain over the first `k` relevances.
///
/// Each position contributes `(2^rel - 1) / log2(rank + 2)` with a zero-based
/// `rank`. Non-finite and negative relevances are treated as 0.
fn dcg_at_k(relevances: &[f64], k: usize) -> f64 {
    let discounted_gain = |(rank, raw): (usize, &f64)| -> f64 {
        let gain = if raw.is_finite() { raw.max(0.0) } else { 0.0 };
        let discount = (rank as f64 + 2.0).log2();
        (2.0_f64.powf(gain) - 1.0) / discount
    };

    relevances
        .iter()
        .take(k)
        .enumerate()
        .map(discounted_gain)
        .sum()
}
245
246// ==================== Evaluation Harness ====================
247
/// A document in the evaluation corpus.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct Document {
    /// Unique document identifier; relevance judgments refer to documents by this ID.
    pub id: String,
    /// Document content (text to embed).
    pub content: String,
}
256
/// Ground truth relevance judgment for a query-document pair.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct RelevanceJudgment {
    /// Document ID.
    pub doc_id: String,
    /// Relevance score (0=not relevant, 1=somewhat, 2=highly, 3=perfect).
    /// The NDCG computation treats non-finite or non-positive values as
    /// non-relevant.
    pub relevance: f64,
}
265
/// A query with ground truth relevance judgments.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct QueryWithJudgments {
    /// Query text.
    pub query: String,
    /// Ground truth relevance judgments for this query. During evaluation,
    /// documents without a judgment are scored as relevance 0.
    pub judgments: Vec<RelevanceJudgment>,
}
274
/// Evaluation corpus containing documents and queries with ground truth.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct EvaluationCorpus {
    /// Corpus name/identifier (included in the corpus hash).
    pub name: String,
    /// Documents in the corpus.
    pub documents: Vec<Document>,
    /// Queries with ground truth judgments.
    pub queries: Vec<QueryWithJudgments>,
}
285
286impl EvaluationCorpus {
287    /// Create a new empty corpus.
288    pub fn new(name: &str) -> Self {
289        Self {
290            name: name.to_string(),
291            documents: Vec::new(),
292            queries: Vec::new(),
293        }
294    }
295
296    /// Add a document to the corpus.
297    pub fn add_document(&mut self, id: &str, content: &str) {
298        self.documents.push(Document {
299            id: id.to_string(),
300            content: content.to_string(),
301        });
302    }
303
304    /// Add a query with judgments.
305    pub fn add_query(&mut self, query: &str, judgments: Vec<(&str, f64)>) {
306        self.queries.push(QueryWithJudgments {
307            query: query.to_string(),
308            judgments: judgments
309                .into_iter()
310                .map(|(doc_id, relevance)| RelevanceJudgment {
311                    doc_id: doc_id.to_string(),
312                    relevance,
313                })
314                .collect(),
315        });
316    }
317
318    /// Compute a hash of the corpus for reproducibility.
319    pub fn compute_hash(&self) -> String {
320        use std::collections::hash_map::DefaultHasher;
321        use std::hash::{Hash, Hasher};
322
323        let mut hasher = DefaultHasher::new();
324        self.name.hash(&mut hasher);
325        for doc in &self.documents {
326            doc.id.hash(&mut hasher);
327            doc.content.hash(&mut hasher);
328        }
329        for query in &self.queries {
330            query.query.hash(&mut hasher);
331            for j in &query.judgments {
332                j.doc_id.hash(&mut hasher);
333                // Hash relevance as bits to avoid float issues
334                j.relevance.to_bits().hash(&mut hasher);
335            }
336        }
337        format!("{:016x}", hasher.finish())
338    }
339
340    /// Create a sample corpus for testing embedders on code search scenarios.
341    pub fn code_search_sample() -> Self {
342        let mut corpus = Self::new("code-search-sample");
343
344        // Add sample documents representing code snippets and discussions
345        corpus.add_document("d1", "implementing authentication with jwt tokens in rust using jsonwebtoken crate for secure api access");
346        corpus.add_document("d2", "database connection pool configuration using sqlx with postgres for high performance queries");
347        corpus.add_document(
348            "d3",
349            "error handling patterns in rust using thiserror and anyhow for better error messages",
350        );
351        corpus.add_document(
352            "d4",
353            "async runtime setup with asupersync for concurrent task processing and io operations",
354        );
355        corpus.add_document(
356            "d5",
357            "parsing json data with serde for serialization and deserialization of structs",
358        );
359        corpus.add_document(
360            "d6",
361            "logging configuration using tracing crate for structured observability and debugging",
362        );
363        corpus.add_document(
364            "d7",
365            "cli argument parsing with clap for building command line applications",
366        );
367        corpus.add_document(
368            "d8",
369            "http client requests using asupersync http primitives for external service calls",
370        );
371        corpus.add_document(
372            "d9",
373            "unit testing patterns with cargo test and mock objects for reliable tests",
374        );
375        corpus.add_document(
376            "d10",
377            "file system operations reading and writing files with std fs module",
378        );
379
380        // Add queries with ground truth relevance judgments
381        // Relevance: 0=not relevant, 1=somewhat, 2=highly, 3=perfect match
382        corpus.add_query(
383            "how to authenticate users with jwt",
384            vec![
385                ("d1", 3.0), // Perfect match
386                ("d2", 0.0), // Not relevant
387                ("d8", 1.0), // Somewhat (might involve API auth)
388            ],
389        );
390
391        corpus.add_query(
392            "database connection setup",
393            vec![
394                ("d2", 3.0),  // Perfect match
395                ("d4", 1.0),  // Async might be related
396                ("d10", 0.0), // Not relevant
397            ],
398        );
399
400        corpus.add_query(
401            "error handling best practices",
402            vec![
403                ("d3", 3.0), // Perfect match
404                ("d6", 1.0), // Logging errors
405                ("d9", 1.0), // Testing error cases
406            ],
407        );
408
409        corpus.add_query(
410            "async programming asupersync",
411            vec![
412                ("d4", 3.0), // Perfect match
413                ("d2", 1.0), // Async DB queries
414                ("d8", 2.0), // Async HTTP
415            ],
416        );
417
418        corpus.add_query(
419            "json serialization",
420            vec![
421                ("d5", 3.0), // Perfect match
422                ("d8", 1.0), // API often uses JSON
423                ("d1", 1.0), // JWT is JSON-based
424            ],
425        );
426
427        corpus
428    }
429}
430
/// Result of evaluating a single query.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct QueryEvalResult {
    /// The query text.
    pub query: String,
    /// NDCG for this query. The harness computes it at the configured
    /// `EvaluationConfig::ndcg_k` cutoff, which defaults to 10.
    pub ndcg_at_10: f64,
    /// Ranked document IDs returned by the model, best match first.
    pub ranked_docs: Vec<String>,
    /// Latency for this query in milliseconds (averaged over the timing iterations).
    pub latency_ms: u64,
}
443
/// Configuration for the evaluation harness.
#[derive(Debug, Clone)]
pub struct EvaluationConfig {
    /// Number of warmup queries before timing. At most one warmup is run per
    /// corpus query, so values larger than the query count have no extra effect.
    pub warmup_queries: usize,
    /// Number of timing iterations per query. The harness always runs at
    /// least one iteration, even if this is zero.
    pub timing_iterations: usize,
    /// Top-k for NDCG calculation; also the number of ranked documents kept per query.
    pub ndcg_k: usize,
}
454
455impl Default for EvaluationConfig {
456    fn default() -> Self {
457        Self {
458            warmup_queries: 3,
459            timing_iterations: 5,
460            ndcg_k: 10,
461        }
462    }
463}
464
/// Compute cosine similarity between two vectors.
///
/// Returns 0.0 when the vectors differ in length or when either of them has
/// zero magnitude (which includes the empty vector).
pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    /// Euclidean length of a vector.
    fn magnitude(components: &[f32]) -> f32 {
        components.iter().map(|c| c * c).sum::<f32>().sqrt()
    }

    if a.len() != b.len() {
        return 0.0;
    }

    let (magnitude_a, magnitude_b) = (magnitude(a), magnitude(b));
    if magnitude_a == 0.0 || magnitude_b == 0.0 {
        return 0.0;
    }

    let inner_product: f32 = a.iter().zip(b).map(|(x, y)| x * y).sum();
    inner_product / (magnitude_a * magnitude_b)
}
478
479/// Evaluation harness for running bake-off evaluations.
480pub struct EvaluationHarness {
481    config: EvaluationConfig,
482}
483
484impl EvaluationHarness {
485    /// Create a new evaluation harness with default config.
486    pub fn new() -> Self {
487        Self {
488            config: EvaluationConfig::default(),
489        }
490    }
491
492    /// Create with custom config.
493    pub fn with_config(config: EvaluationConfig) -> Self {
494        Self { config }
495    }
496
497    /// Evaluate an embedder against a corpus.
498    ///
499    /// Returns a ValidationReport with NDCG, latency, and memory metrics.
500    pub fn evaluate<E: crate::search::embedder::Embedder>(
501        &self,
502        embedder: &E,
503        corpus: &EvaluationCorpus,
504        metadata: &ModelMetadata,
505    ) -> Result<ValidationReport, String> {
506        let corpus_hash = corpus.compute_hash();
507        let first_doc = corpus.documents.first().ok_or("Empty corpus")?;
508        if corpus.queries.is_empty() {
509            return Err("Empty query set".to_string());
510        }
511
512        // Measure cold start (first embedding)
513        let cold_start = Instant::now();
514        embedder
515            .embed_sync(&first_doc.content)
516            .map_err(|e| e.to_string())?;
517        let cold_start_ms = cold_start.elapsed().as_millis() as u64;
518
519        // Embed all documents
520        let doc_embeddings: Vec<Vec<f32>> = corpus
521            .documents
522            .iter()
523            .map(|d| embedder.embed_sync(&d.content))
524            .collect::<Result<Vec<_>, _>>()
525            .map_err(|e| e.to_string())?;
526
527        // Warmup queries
528        for i in 0..self.config.warmup_queries.min(corpus.queries.len()) {
529            let _ = embedder.embed_sync(&corpus.queries[i].query);
530        }
531
532        // Evaluate each query
533        let mut query_results = Vec::new();
534        let mut latencies = Vec::new();
535
536        for query_with_judgments in &corpus.queries {
537            // Build relevance map
538            let relevance_map: std::collections::HashMap<&str, f64> = query_with_judgments
539                .judgments
540                .iter()
541                .map(|j| (j.doc_id.as_str(), j.relevance))
542                .collect();
543
544            // Time the query embedding (average over iterations, minimum 1)
545            let iterations = self.config.timing_iterations.max(1);
546            let mut query_latencies = Vec::with_capacity(iterations);
547            let mut query_embedding = Vec::new();
548            for _ in 0..iterations {
549                let start = Instant::now();
550                query_embedding = embedder
551                    .embed_sync(&query_with_judgments.query)
552                    .map_err(|e| e.to_string())?;
553                query_latencies.push(start.elapsed());
554            }
555            let avg_latency = query_latencies
556                .iter()
557                .map(|d| d.as_millis() as u64)
558                .sum::<u64>()
559                / query_latencies.len() as u64;
560            latencies.push(Duration::from_millis(avg_latency));
561
562            // Rank documents by similarity
563            let mut scored_docs: Vec<(usize, f32)> = doc_embeddings
564                .iter()
565                .enumerate()
566                .map(|(idx, emb)| (idx, cosine_similarity(&query_embedding, emb)))
567                .collect();
568            scored_docs.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
569
570            // Get ranked doc IDs
571            let ranked_docs: Vec<String> = scored_docs
572                .iter()
573                .take(self.config.ndcg_k)
574                .map(|(idx, _)| corpus.documents[*idx].id.clone())
575                .collect();
576
577            // Compute relevances in ranked order
578            let relevances: Vec<f64> = ranked_docs
579                .iter()
580                .map(|id| *relevance_map.get(id.as_str()).unwrap_or(&0.0))
581                .collect();
582
583            // All ground-truth relevances (for ideal DCG computation)
584            let all_gt: Vec<f64> = relevance_map.values().copied().collect();
585            let ndcg = ndcg_at_k(&relevances, self.config.ndcg_k, &all_gt);
586
587            query_results.push(QueryEvalResult {
588                query: query_with_judgments.query.clone(),
589                ndcg_at_10: ndcg,
590                ranked_docs,
591                latency_ms: avg_latency,
592            });
593        }
594
595        // Compute aggregate metrics
596        let avg_ndcg = if query_results.is_empty() {
597            0.0
598        } else {
599            query_results.iter().map(|r| r.ndcg_at_10).sum::<f64>() / query_results.len() as f64
600        };
601
602        let latency_stats = LatencyStats::from_durations(&latencies);
603
604        // Estimate memory (model size as proxy - real measurement would need system APIs)
605        let memory_mb = metadata.size_bytes.unwrap_or(0) / (1024 * 1024);
606
607        let eligible = metadata.is_eligible();
608        let mut report = ValidationReport {
609            model_id: metadata.id.clone(),
610            corpus_hash,
611            ndcg_at_10: avg_ndcg,
612            latency_ms_p50: latency_stats.p50_ms,
613            latency_ms_p95: latency_stats.p95_ms,
614            latency_ms_p99: latency_stats.p99_ms,
615            cold_start_ms,
616            memory_mb,
617            eligible,
618            meets_criteria: false,
619            warnings: Vec::new(),
620        };
621
622        report.meets_criteria = report.check_criteria();
623
624        // Add warnings
625        if cold_start_ms > criteria::COLD_START_MAX_MS {
626            report.warnings.push(format!(
627                "Cold start {}ms exceeds {}ms limit",
628                cold_start_ms,
629                criteria::COLD_START_MAX_MS
630            ));
631        }
632        if latency_stats.p99_ms > criteria::WARM_P99_MAX_MS {
633            report.warnings.push(format!(
634                "P99 latency {}ms exceeds {}ms limit",
635                latency_stats.p99_ms,
636                criteria::WARM_P99_MAX_MS
637            ));
638        }
639        if memory_mb > criteria::MEMORY_MAX_MB {
640            report.warnings.push(format!(
641                "Memory {}MB exceeds {}MB limit",
642                memory_mb,
643                criteria::MEMORY_MAX_MB
644            ));
645        }
646
647        Ok(report)
648    }
649
650    /// Run a full bake-off comparison with baseline and candidates.
651    pub fn run_comparison<E: crate::search::embedder::Embedder>(
652        &self,
653        baseline: (&E, &ModelMetadata),
654        candidates: Vec<(&E, &ModelMetadata)>,
655        corpus: &EvaluationCorpus,
656    ) -> Result<BakeoffComparison, String> {
657        let corpus_hash = corpus.compute_hash();
658
659        // Evaluate baseline
660        let baseline_report = self.evaluate(baseline.0, corpus, baseline.1)?;
661
662        // Evaluate all candidates
663        let mut candidate_reports = Vec::new();
664        for (embedder, metadata) in candidates {
665            let report = self.evaluate(embedder, corpus, metadata)?;
666            candidate_reports.push(report);
667        }
668
669        // Build initial comparison
670        let mut comparison = BakeoffComparison {
671            corpus_hash,
672            baseline: baseline_report.clone(),
673            candidates: candidate_reports,
674            recommendation: None,
675            recommendation_reason: String::new(),
676        };
677
678        // Find the winner and extract data before mutating
679        let winner_data = comparison.find_winner().map(|w| {
680            (
681                w.model_id.clone(),
682                w.ndcg_at_10,
683                w.latency_ms_p99,
684                w.memory_mb,
685            )
686        });
687
688        if let Some((model_id, ndcg, p99, memory)) = winner_data {
689            comparison.recommendation = Some(model_id.clone());
690            let pct_of_baseline = if baseline_report.ndcg_at_10 > 0.0 {
691                format!("{}%", (ndcg / baseline_report.ndcg_at_10 * 100.0) as u32)
692            } else {
693                "N/A".to_string()
694            };
695            comparison.recommendation_reason = format!(
696                "Best eligible candidate with NDCG@10={:.3} ({} of baseline), p99={}ms, memory={}MB",
697                ndcg, pct_of_baseline, p99, memory
698            );
699        } else {
700            comparison.recommendation_reason =
701                "No eligible candidate meets all criteria".to_string();
702        }
703
704        Ok(comparison)
705    }
706}
707
708impl Default for EvaluationHarness {
709    fn default() -> Self {
710        Self::new()
711    }
712}
713
714/// Format a comparison as a markdown table for reporting.
715pub fn format_comparison_table(comparison: &BakeoffComparison) -> String {
716    let mut output = String::new();
717
718    output.push_str("# Bake-off Results\n\n");
719    output.push_str(&format!("Corpus hash: `{}`\n\n", comparison.corpus_hash));
720
721    output.push_str("| Model | NDCG@10 | P50 (ms) | P95 (ms) | P99 (ms) | Cold (ms) | Memory (MB) | Eligible | Meets Criteria |\n");
722    output.push_str("|-------|---------|----------|----------|----------|-----------|-------------|----------|----------------|\n");
723
724    // Baseline first
725    let b = &comparison.baseline;
726    output.push_str(&format!(
727        "| {} (baseline) | {:.3} | {} | {} | {} | {} | {} | {} | {} |\n",
728        b.model_id,
729        b.ndcg_at_10,
730        b.latency_ms_p50,
731        b.latency_ms_p95,
732        b.latency_ms_p99,
733        b.cold_start_ms,
734        b.memory_mb,
735        if b.eligible { "✓" } else { "✗" },
736        if b.meets_criteria { "✓" } else { "✗" }
737    ));
738
739    // Candidates
740    for c in &comparison.candidates {
741        let marker = if Some(&c.model_id) == comparison.recommendation.as_ref() {
742            " ⭐"
743        } else {
744            ""
745        };
746        output.push_str(&format!(
747            "| {}{} | {:.3} | {} | {} | {} | {} | {} | {} | {} |\n",
748            c.model_id,
749            marker,
750            c.ndcg_at_10,
751            c.latency_ms_p50,
752            c.latency_ms_p95,
753            c.latency_ms_p99,
754            c.cold_start_ms,
755            c.memory_mb,
756            if c.eligible { "✓" } else { "✗" },
757            if c.meets_criteria { "✓" } else { "✗" }
758        ));
759    }
760
761    output.push_str("\n## Recommendation\n\n");
762    if let Some(ref winner) = comparison.recommendation {
763        output.push_str(&format!("**Winner:** {}\n\n", winner));
764    }
765    output.push_str(&format!("{}\n", comparison.recommendation_reason));
766
767    output
768}
769
770#[cfg(test)]
771mod tests {
772    use super::*;
773
774    #[test]
775    fn ndcg_perfect_is_one() {
776        let relevances = vec![3.0, 2.0, 1.0];
777        let ndcg = ndcg_at_k(&relevances, 3, &relevances);
778        assert!((ndcg - 1.0).abs() < 1e-9);
779    }
780
781    #[test]
782    fn ndcg_zero_when_no_relevance() {
783        let relevances = vec![0.0, 0.0, 0.0];
784        let ndcg = ndcg_at_k(&relevances, 3, &relevances);
785        assert_eq!(ndcg, 0.0);
786    }
787
788    #[test]
789    fn ndcg_handles_partial_relevance() {
790        let all_gt = vec![2.0, 1.0, 0.0];
791        let returned = vec![1.0, 0.0, 2.0]; // out of ideal order
792        let ndcg = ndcg_at_k(&returned, 3, &all_gt);
793        assert!(ndcg > 0.0 && ndcg < 1.0);
794    }
795
796    #[test]
797    fn report_roundtrip() {
798        let report = ValidationReport {
799            model_id: "hash".to_string(),
800            corpus_hash: "deadbeef".to_string(),
801            ndcg_at_10: 0.42,
802            latency_ms_p50: 12,
803            latency_ms_p95: 30,
804            latency_ms_p99: 45,
805            cold_start_ms: 500,
806            memory_mb: 150,
807            eligible: true,
808            meets_criteria: true,
809            warnings: vec!["example warning".to_string()],
810        };
811        let encoded = serde_json::to_string(&report).expect("serialize");
812        let decoded: ValidationReport = serde_json::from_str(&encoded).expect("deserialize");
813        assert_eq!(report, decoded);
814    }
815
816    #[test]
817    fn model_eligibility_by_date() {
818        let eligible_model = ModelMetadata {
819            id: "new-model".to_string(),
820            name: "New Model".to_string(),
821            source: "huggingface".to_string(),
822            release_date: "2025-12-01".to_string(),
823            dimension: Some(384),
824            size_bytes: Some(100_000_000),
825            is_baseline: false,
826        };
827        assert!(eligible_model.is_eligible());
828
829        let old_model = ModelMetadata {
830            id: "old-model".to_string(),
831            name: "Old Model".to_string(),
832            source: "huggingface".to_string(),
833            release_date: "2025-06-01".to_string(),
834            dimension: Some(384),
835            size_bytes: Some(100_000_000),
836            is_baseline: false,
837        };
838        assert!(!old_model.is_eligible());
839
840        let baseline_model = ModelMetadata {
841            id: "baseline".to_string(),
842            name: "Baseline".to_string(),
843            source: "huggingface".to_string(),
844            release_date: "2025-12-01".to_string(),
845            dimension: Some(384),
846            size_bytes: Some(100_000_000),
847            is_baseline: true,
848        };
849        assert!(!baseline_model.is_eligible());
850    }
851
852    #[test]
853    fn latency_stats_from_durations() {
854        let durations = vec![
855            Duration::from_millis(10),
856            Duration::from_millis(20),
857            Duration::from_millis(30),
858            Duration::from_millis(40),
859            Duration::from_millis(100),
860        ];
861        let stats = LatencyStats::from_durations(&durations);
862
863        assert_eq!(stats.samples, 5);
864        assert_eq!(stats.min_ms, 10);
865        assert_eq!(stats.max_ms, 100);
866        assert!((stats.mean_ms - 40.0).abs() < 0.1);
867        assert_eq!(stats.p50_ms, 30);
868    }
869
870    #[test]
871    fn latency_stats_empty() {
872        let stats = LatencyStats::from_durations(&[]);
873        assert_eq!(stats.samples, 0);
874        assert_eq!(stats.p50_ms, 0);
875    }
876
877    #[test]
878    fn latency_timer_records_samples() {
879        let mut timer = LatencyTimer::new();
880
881        // Time a simple operation
882        let result = timer.time(|| 42);
883        assert_eq!(result, 42);
884
885        let stats = timer.stats();
886        assert_eq!(stats.samples, 1);
887    }
888
889    #[test]
890    fn report_meets_criteria() {
891        let good_report = ValidationReport {
892            model_id: "good".to_string(),
893            corpus_hash: "test".to_string(),
894            ndcg_at_10: 0.85,
895            latency_ms_p50: 50,
896            latency_ms_p95: 100,
897            latency_ms_p99: 200, // Under 250ms
898            cold_start_ms: 1500, // Under 2s
899            memory_mb: 200,      // Under 300MB
900            eligible: true,
901            meets_criteria: true,
902            warnings: vec![],
903        };
904        assert!(good_report.check_criteria());
905
906        let bad_latency = ValidationReport {
907            latency_ms_p99: 300, // Over 250ms
908            ..good_report.clone()
909        };
910        assert!(!bad_latency.check_criteria());
911
912        let bad_cold_start = ValidationReport {
913            cold_start_ms: 3000, // Over 2s
914            ..good_report.clone()
915        };
916        assert!(!bad_cold_start.check_criteria());
917
918        let bad_memory = ValidationReport {
919            memory_mb: 400, // Over 300MB
920            ..good_report
921        };
922        assert!(!bad_memory.check_criteria());
923    }
924
925    #[test]
926    fn report_quality_threshold() {
927        let baseline = ValidationReport {
928            model_id: "baseline".to_string(),
929            corpus_hash: "test".to_string(),
930            ndcg_at_10: 0.80,
931            latency_ms_p50: 50,
932            latency_ms_p95: 100,
933            latency_ms_p99: 150,
934            cold_start_ms: 1000,
935            memory_mb: 200,
936            eligible: false,
937            meets_criteria: true,
938            warnings: vec![],
939        };
940
941        let good_candidate = ValidationReport {
942            model_id: "good".to_string(),
943            ndcg_at_10: 0.70, // 87.5% of baseline, above 80%
944            ..baseline.clone()
945        };
946        assert!(good_candidate.meets_quality_threshold(&baseline));
947
948        let bad_candidate = ValidationReport {
949            model_id: "bad".to_string(),
950            ndcg_at_10: 0.60, // 75% of baseline, below 80%
951            ..baseline.clone()
952        };
953        assert!(!bad_candidate.meets_quality_threshold(&baseline));
954    }
955
956    #[test]
957    fn bakeoff_comparison_finds_winner() {
958        let baseline = ValidationReport {
959            model_id: "baseline".to_string(),
960            corpus_hash: "test".to_string(),
961            ndcg_at_10: 0.80,
962            latency_ms_p50: 50,
963            latency_ms_p95: 100,
964            latency_ms_p99: 150,
965            cold_start_ms: 1000,
966            memory_mb: 200,
967            eligible: false,
968            meets_criteria: true,
969            warnings: vec![],
970        };
971
972        let candidate1 = ValidationReport {
973            model_id: "candidate1".to_string(),
974            ndcg_at_10: 0.75, // Good quality
975            eligible: true,
976            meets_criteria: true,
977            ..baseline.clone()
978        };
979
980        let candidate2 = ValidationReport {
981            model_id: "candidate2".to_string(),
982            ndcg_at_10: 0.85, // Better quality
983            eligible: true,
984            meets_criteria: true,
985            ..baseline.clone()
986        };
987
988        let ineligible = ValidationReport {
989            model_id: "ineligible".to_string(),
990            ndcg_at_10: 0.90, // Best quality but not eligible
991            eligible: false,
992            meets_criteria: true,
993            ..baseline.clone()
994        };
995
996        let comparison = BakeoffComparison {
997            corpus_hash: "test".to_string(),
998            baseline: baseline.clone(),
999            candidates: vec![candidate1, candidate2.clone(), ineligible],
1000            recommendation: None,
1001            recommendation_reason: String::new(),
1002        };
1003
1004        let winner = comparison.find_winner();
1005        assert!(winner.is_some());
1006        assert_eq!(winner.unwrap().model_id, "candidate2");
1007    }
1008
1009    // ==================== Harness Tests ====================
1010
1011    #[test]
1012    fn corpus_creation_and_hash() {
1013        let mut corpus = EvaluationCorpus::new("test-corpus");
1014        corpus.add_document("d1", "hello world");
1015        corpus.add_document("d2", "goodbye world");
1016        corpus.add_query("hello", vec![("d1", 3.0), ("d2", 0.0)]);
1017
1018        assert_eq!(corpus.name, "test-corpus");
1019        assert_eq!(corpus.documents.len(), 2);
1020        assert_eq!(corpus.queries.len(), 1);
1021
1022        let hash1 = corpus.compute_hash();
1023        assert_eq!(hash1.len(), 16); // 16 hex chars
1024
1025        // Same corpus should produce same hash
1026        let hash2 = corpus.compute_hash();
1027        assert_eq!(hash1, hash2);
1028
1029        // Different corpus should produce different hash
1030        corpus.add_document("d3", "new document");
1031        let hash3 = corpus.compute_hash();
1032        assert_ne!(hash1, hash3);
1033    }
1034
1035    #[test]
1036    fn evaluation_rejects_empty_query_set() {
1037        let harness = EvaluationHarness::new();
1038        let mut corpus = EvaluationCorpus::new("no-queries");
1039        corpus.add_document("d1", "hello world");
1040        let embedder = crate::search::hash_embedder::HashEmbedder::new(16);
1041        let metadata = ModelMetadata {
1042            id: "hash".to_string(),
1043            name: "Hash".to_string(),
1044            source: "test".to_string(),
1045            release_date: "2025-12-01".to_string(),
1046            dimension: Some(16),
1047            size_bytes: Some(0),
1048            is_baseline: false,
1049        };
1050
1051        let err = harness
1052            .evaluate(&embedder, &corpus, &metadata)
1053            .expect_err("empty query set must not produce a successful bakeoff report");
1054        assert!(err.contains("Empty query set"));
1055    }
1056
1057    #[test]
1058    fn sample_corpus_is_valid() {
1059        let corpus = EvaluationCorpus::code_search_sample();
1060        assert!(!corpus.documents.is_empty());
1061        assert!(!corpus.queries.is_empty());
1062
1063        // Each query should have at least one judgment
1064        for query in &corpus.queries {
1065            assert!(!query.judgments.is_empty());
1066        }
1067
1068        // Hash should be stable
1069        let hash = corpus.compute_hash();
1070        assert!(!hash.is_empty());
1071    }
1072
1073    #[test]
1074    fn cosine_similarity_identical_vectors() {
1075        let v = vec![1.0, 2.0, 3.0];
1076        let sim = cosine_similarity(&v, &v);
1077        assert!((sim - 1.0).abs() < 1e-6);
1078    }
1079
1080    #[test]
1081    fn cosine_similarity_orthogonal_vectors() {
1082        let a = vec![1.0, 0.0, 0.0];
1083        let b = vec![0.0, 1.0, 0.0];
1084        let sim = cosine_similarity(&a, &b);
1085        assert!(sim.abs() < 1e-6);
1086    }
1087
1088    #[test]
1089    fn cosine_similarity_opposite_vectors() {
1090        let a = vec![1.0, 2.0, 3.0];
1091        let b = vec![-1.0, -2.0, -3.0];
1092        let sim = cosine_similarity(&a, &b);
1093        assert!((sim + 1.0).abs() < 1e-6);
1094    }
1095
1096    #[test]
1097    fn cosine_similarity_different_lengths() {
1098        let a = vec![1.0, 2.0];
1099        let b = vec![1.0, 2.0, 3.0];
1100        let sim = cosine_similarity(&a, &b);
1101        assert_eq!(sim, 0.0);
1102    }
1103
1104    #[test]
1105    fn evaluation_config_defaults() {
1106        let config = EvaluationConfig::default();
1107        assert_eq!(config.warmup_queries, 3);
1108        assert_eq!(config.timing_iterations, 5);
1109        assert_eq!(config.ndcg_k, 10);
1110    }
1111
1112    #[test]
1113    fn harness_creation() {
1114        let harness = EvaluationHarness::new();
1115        assert_eq!(harness.config.ndcg_k, 10);
1116
1117        let custom_config = EvaluationConfig {
1118            warmup_queries: 5,
1119            timing_iterations: 10,
1120            ndcg_k: 5,
1121        };
1122        let harness = EvaluationHarness::with_config(custom_config);
1123        assert_eq!(harness.config.ndcg_k, 5);
1124    }
1125
1126    #[test]
1127    fn corpus_roundtrip() {
1128        let corpus = EvaluationCorpus::code_search_sample();
1129        let json = serde_json::to_string(&corpus).expect("serialize");
1130        let decoded: EvaluationCorpus = serde_json::from_str(&json).expect("deserialize");
1131        assert_eq!(corpus, decoded);
1132    }
1133
1134    #[test]
1135    fn query_eval_result_roundtrip() {
1136        let result = QueryEvalResult {
1137            query: "test query".to_string(),
1138            ndcg_at_10: 0.85,
1139            ranked_docs: vec!["d1".to_string(), "d2".to_string()],
1140            latency_ms: 15,
1141        };
1142        let json = serde_json::to_string(&result).expect("serialize");
1143        let decoded: QueryEvalResult = serde_json::from_str(&json).expect("deserialize");
1144        assert_eq!(result.query, decoded.query);
1145        assert_eq!(result.ndcg_at_10, decoded.ndcg_at_10);
1146    }
1147
1148    #[test]
1149    fn format_comparison_table_output() {
1150        let baseline = ValidationReport {
1151            model_id: "baseline".to_string(),
1152            corpus_hash: "test123".to_string(),
1153            ndcg_at_10: 0.80,
1154            latency_ms_p50: 50,
1155            latency_ms_p95: 100,
1156            latency_ms_p99: 150,
1157            cold_start_ms: 1000,
1158            memory_mb: 200,
1159            eligible: false,
1160            meets_criteria: true,
1161            warnings: vec![],
1162        };
1163
1164        let candidate = ValidationReport {
1165            model_id: "winner".to_string(),
1166            ndcg_at_10: 0.85,
1167            eligible: true,
1168            meets_criteria: true,
1169            ..baseline.clone()
1170        };
1171
1172        let comparison = BakeoffComparison {
1173            corpus_hash: "test123".to_string(),
1174            baseline,
1175            candidates: vec![candidate],
1176            recommendation: Some("winner".to_string()),
1177            recommendation_reason: "Best candidate".to_string(),
1178        };
1179
1180        let table = format_comparison_table(&comparison);
1181        assert!(table.contains("Bake-off Results"));
1182        assert!(table.contains("baseline"));
1183        assert!(table.contains("winner"));
1184        assert!(table.contains("⭐")); // Winner marker
1185        assert!(table.contains("Recommendation"));
1186    }
1187}
coding_agent_search/bakeoff.rs

coding_agent_search/
bakeoff.rs