1use serde::{Deserialize, Serialize};
2use std::cmp::Ordering;
3use std::time::{Duration, Instant};
4
/// Earliest release date, written as `YYYY-MM-DD`, that a model may carry and
/// still be considered by [`ModelMetadata::is_eligible`]. The bound is inclusive.
pub const ELIGIBILITY_CUTOFF: &str = "2025-11-01";

/// Hard limits a validation report is checked against.
pub mod criteria {
    /// Upper bound (inclusive) for the cold-start time, in milliseconds.
    pub const COLD_START_MAX_MS: u64 = 2_000;
    /// Upper bound (inclusive) for the warm p99 latency, in milliseconds.
    pub const WARM_P99_MAX_MS: u64 = 250;
    /// Upper bound (inclusive) for the reported memory figure, in megabytes.
    pub const MEMORY_MAX_MB: u64 = 300;
    /// Minimum candidate-to-baseline NDCG ratio (inclusive).
    pub const QUALITY_MIN_RATIO: f64 = 0.8;
}
20
21#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
23pub struct ModelMetadata {
24 pub id: String,
26 pub name: String,
28 pub source: String,
30 pub release_date: String,
32 pub dimension: Option<usize>,
34 pub size_bytes: Option<u64>,
36 pub is_baseline: bool,
38}
39
40impl ModelMetadata {
41 pub fn is_eligible(&self) -> bool {
43 if self.is_baseline {
44 return false;
45 }
46 self.release_date.as_str() >= ELIGIBILITY_CUTOFF
47 }
48}
49
50#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
52pub struct ValidationReport {
53 pub model_id: String,
54 pub corpus_hash: String,
55 pub ndcg_at_10: f64,
56 pub latency_ms_p50: u64,
57 pub latency_ms_p95: u64,
58 pub latency_ms_p99: u64,
59 pub cold_start_ms: u64,
60 pub memory_mb: u64,
61 pub eligible: bool,
62 pub meets_criteria: bool,
63 pub warnings: Vec<String>,
64}
65
66impl ValidationReport {
67 pub fn check_criteria(&self) -> bool {
69 self.cold_start_ms <= criteria::COLD_START_MAX_MS
70 && self.latency_ms_p99 <= criteria::WARM_P99_MAX_MS
71 && self.memory_mb <= criteria::MEMORY_MAX_MB
72 }
73
74 pub fn meets_quality_threshold(&self, baseline: &ValidationReport) -> bool {
76 if baseline.ndcg_at_10 == 0.0 {
77 return true;
78 }
79 self.ndcg_at_10 / baseline.ndcg_at_10 >= criteria::QUALITY_MIN_RATIO
80 }
81}
82
83#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
85pub struct LatencyStats {
86 pub samples: usize,
87 pub min_ms: u64,
88 pub max_ms: u64,
89 pub mean_ms: f64,
90 pub p50_ms: u64,
91 pub p95_ms: u64,
92 pub p99_ms: u64,
93}
94
95impl LatencyStats {
96 pub fn from_durations(durations: &[Duration]) -> Self {
98 if durations.is_empty() {
99 return Self {
100 samples: 0,
101 min_ms: 0,
102 max_ms: 0,
103 mean_ms: 0.0,
104 p50_ms: 0,
105 p95_ms: 0,
106 p99_ms: 0,
107 };
108 }
109
110 let mut millis: Vec<u64> = durations.iter().map(|d| d.as_millis() as u64).collect();
111 millis.sort_unstable();
112
113 let n = millis.len();
114 let sum: u64 = millis.iter().sum();
115
116 Self {
117 samples: n,
118 min_ms: millis[0],
119 max_ms: millis[n - 1],
120 mean_ms: sum as f64 / n as f64,
121 p50_ms: percentile(&millis, 50),
122 p95_ms: percentile(&millis, 95),
123 p99_ms: percentile(&millis, 99),
124 }
125 }
126}
127
/// Picks the `p`-th percentile from an ascending slice.
///
/// The element at index `p * len / 100` (integer division) is returned, capped
/// at the last element. An empty slice yields `0`.
fn percentile(sorted: &[u64], p: usize) -> u64 {
    match sorted.len() {
        0 => 0,
        len => {
            let rank = p * len / 100;
            sorted[rank.min(len - 1)]
        }
    }
}
136
137pub struct LatencyTimer {
139 samples: Vec<Duration>,
140}
141
142impl LatencyTimer {
143 pub fn new() -> Self {
144 Self {
145 samples: Vec::new(),
146 }
147 }
148
149 pub fn time<F, T>(&mut self, f: F) -> T
151 where
152 F: FnOnce() -> T,
153 {
154 let start = Instant::now();
155 let result = f();
156 self.samples.push(start.elapsed());
157 result
158 }
159
160 pub fn stats(&self) -> LatencyStats {
162 LatencyStats::from_durations(&self.samples)
163 }
164
165 pub fn clear(&mut self) {
167 self.samples.clear();
168 }
169}
170
171impl Default for LatencyTimer {
172 fn default() -> Self {
173 Self::new()
174 }
175}
176
177#[derive(Debug, Clone, Serialize, Deserialize)]
179pub struct BakeoffComparison {
180 pub corpus_hash: String,
182 pub baseline: ValidationReport,
184 pub candidates: Vec<ValidationReport>,
186 pub recommendation: Option<String>,
188 pub recommendation_reason: String,
190}
191
192impl BakeoffComparison {
193 pub fn find_winner(&self) -> Option<&ValidationReport> {
195 self.candidates
196 .iter()
197 .filter(|r| r.eligible && r.meets_criteria && r.meets_quality_threshold(&self.baseline))
198 .max_by(|a, b| {
199 a.ndcg_at_10
201 .partial_cmp(&b.ndcg_at_10)
202 .unwrap_or(Ordering::Equal)
203 .then_with(|| b.latency_ms_p99.cmp(&a.latency_ms_p99))
204 })
205 }
206}
207
/// Computes NDCG@k for a ranked list of graded relevances.
///
/// * `relevances` — relevance of each returned item, in ranked order.
/// * `k` — cut-off; only the first `k` positions contribute.
/// * `all_ground_truth` — every known relevance grade for the query; the ideal
///   ranking is derived from it by sorting in descending order.
///
/// Non-finite and negative grades count as zero gain. The result is `0.0` when
/// `k` is zero, when `relevances` is empty, or when either the achieved or the
/// ideal DCG is zero. The score is capped at `1.0`, so a returned list that
/// carries more gain than the ground truth (for example because of duplicated
/// document ids) cannot inflate averages. If the ratio is not a number
/// (both DCGs overflowed to infinity) the result is `0.0`.
pub fn ndcg_at_k(relevances: &[f64], k: usize, all_ground_truth: &[f64]) -> f64 {
    if k == 0 || relevances.is_empty() {
        return 0.0;
    }
    let dcg = dcg_at_k(relevances, k);
    if dcg == 0.0 {
        return 0.0;
    }

    let mut ideal: Vec<f64> = all_ground_truth
        .iter()
        .map(|rel| if rel.is_finite() { rel.max(0.0) } else { 0.0 })
        .collect();
    // Every value is finite here, so `partial_cmp` never actually fails.
    ideal.sort_by(|a, b| b.partial_cmp(a).unwrap_or(Ordering::Equal));

    let idcg = dcg_at_k(&ideal, k);
    if idcg == 0.0 {
        return 0.0;
    }

    let ratio = dcg / idcg;
    // `f64::min` would silently turn NaN into 1.0, so rule NaN out first.
    if ratio.is_nan() {
        0.0
    } else {
        ratio.min(1.0)
    }
}

/// Discounted cumulative gain over the first `k` entries of `relevances`.
///
/// Each entry contributes `(2^rel - 1) / log2(position + 1)` with positions
/// starting at 1; non-finite or negative grades are treated as `0.0`.
fn dcg_at_k(relevances: &[f64], k: usize) -> f64 {
    relevances
        .iter()
        .take(k)
        .enumerate()
        .map(|(idx, rel)| {
            let rel = if rel.is_finite() { *rel } else { 0.0 };
            let rel = rel.max(0.0);
            let denom = (idx as f64 + 2.0).log2();
            (2.0_f64.powf(rel) - 1.0) / denom
        })
        .sum()
}
245
246#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
250pub struct Document {
251 pub id: String,
253 pub content: String,
255}
256
257#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
259pub struct RelevanceJudgment {
260 pub doc_id: String,
262 pub relevance: f64,
264}
265
266#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
268pub struct QueryWithJudgments {
269 pub query: String,
271 pub judgments: Vec<RelevanceJudgment>,
273}
274
275#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
277pub struct EvaluationCorpus {
278 pub name: String,
280 pub documents: Vec<Document>,
282 pub queries: Vec<QueryWithJudgments>,
284}
285
286impl EvaluationCorpus {
287 pub fn new(name: &str) -> Self {
289 Self {
290 name: name.to_string(),
291 documents: Vec::new(),
292 queries: Vec::new(),
293 }
294 }
295
296 pub fn add_document(&mut self, id: &str, content: &str) {
298 self.documents.push(Document {
299 id: id.to_string(),
300 content: content.to_string(),
301 });
302 }
303
304 pub fn add_query(&mut self, query: &str, judgments: Vec<(&str, f64)>) {
306 self.queries.push(QueryWithJudgments {
307 query: query.to_string(),
308 judgments: judgments
309 .into_iter()
310 .map(|(doc_id, relevance)| RelevanceJudgment {
311 doc_id: doc_id.to_string(),
312 relevance,
313 })
314 .collect(),
315 });
316 }
317
318 pub fn compute_hash(&self) -> String {
320 use std::collections::hash_map::DefaultHasher;
321 use std::hash::{Hash, Hasher};
322
323 let mut hasher = DefaultHasher::new();
324 self.name.hash(&mut hasher);
325 for doc in &self.documents {
326 doc.id.hash(&mut hasher);
327 doc.content.hash(&mut hasher);
328 }
329 for query in &self.queries {
330 query.query.hash(&mut hasher);
331 for j in &query.judgments {
332 j.doc_id.hash(&mut hasher);
333 j.relevance.to_bits().hash(&mut hasher);
335 }
336 }
337 format!("{:016x}", hasher.finish())
338 }
339
340 pub fn code_search_sample() -> Self {
342 let mut corpus = Self::new("code-search-sample");
343
344 corpus.add_document("d1", "implementing authentication with jwt tokens in rust using jsonwebtoken crate for secure api access");
346 corpus.add_document("d2", "database connection pool configuration using sqlx with postgres for high performance queries");
347 corpus.add_document(
348 "d3",
349 "error handling patterns in rust using thiserror and anyhow for better error messages",
350 );
351 corpus.add_document(
352 "d4",
353 "async runtime setup with asupersync for concurrent task processing and io operations",
354 );
355 corpus.add_document(
356 "d5",
357 "parsing json data with serde for serialization and deserialization of structs",
358 );
359 corpus.add_document(
360 "d6",
361 "logging configuration using tracing crate for structured observability and debugging",
362 );
363 corpus.add_document(
364 "d7",
365 "cli argument parsing with clap for building command line applications",
366 );
367 corpus.add_document(
368 "d8",
369 "http client requests using asupersync http primitives for external service calls",
370 );
371 corpus.add_document(
372 "d9",
373 "unit testing patterns with cargo test and mock objects for reliable tests",
374 );
375 corpus.add_document(
376 "d10",
377 "file system operations reading and writing files with std fs module",
378 );
379
380 corpus.add_query(
383 "how to authenticate users with jwt",
384 vec![
385 ("d1", 3.0), ("d2", 0.0), ("d8", 1.0), ],
389 );
390
391 corpus.add_query(
392 "database connection setup",
393 vec![
394 ("d2", 3.0), ("d4", 1.0), ("d10", 0.0), ],
398 );
399
400 corpus.add_query(
401 "error handling best practices",
402 vec![
403 ("d3", 3.0), ("d6", 1.0), ("d9", 1.0), ],
407 );
408
409 corpus.add_query(
410 "async programming asupersync",
411 vec![
412 ("d4", 3.0), ("d2", 1.0), ("d8", 2.0), ],
416 );
417
418 corpus.add_query(
419 "json serialization",
420 vec![
421 ("d5", 3.0), ("d8", 1.0), ("d1", 1.0), ],
425 );
426
427 corpus
428 }
429}
430
431#[derive(Debug, Clone, Serialize, Deserialize)]
433pub struct QueryEvalResult {
434 pub query: String,
436 pub ndcg_at_10: f64,
438 pub ranked_docs: Vec<String>,
440 pub latency_ms: u64,
442}
443
/// Tunables for an evaluation run.
#[derive(Clone, Debug)]
pub struct EvaluationConfig {
    /// Maximum number of queries embedded once, untimed, before measuring.
    pub warmup_queries: usize,
    /// Number of timed embedding calls per query (at least one is always made).
    pub timing_iterations: usize,
    /// Ranking cut-off used for NDCG.
    pub ndcg_k: usize,
}

impl Default for EvaluationConfig {
    /// Three warm-up queries, five timing iterations, NDCG cut-off of ten.
    fn default() -> Self {
        EvaluationConfig {
            ndcg_k: 10,
            timing_iterations: 5,
            warmup_queries: 3,
        }
    }
}
464
/// Cosine similarity of two equally long vectors, in `[-1.0, 1.0]`.
///
/// Returns `0.0` when the lengths differ, when either vector has zero norm
/// (including two empty vectors), or when the result is not finite (for
/// example because an input contains NaN or infinity).
///
/// Sums are accumulated in `f64`: squaring large `f32` components would
/// otherwise overflow to infinity (yielding NaN) and squaring tiny ones would
/// underflow to zero (yielding a spurious `0.0`).
pub fn cosine_similarity(a: &[f32], b: &[f32]) -> f32 {
    if a.len() != b.len() {
        return 0.0;
    }

    let (mut dot, mut sq_norm_a, mut sq_norm_b) = (0.0_f64, 0.0_f64, 0.0_f64);
    for (&x, &y) in a.iter().zip(b.iter()) {
        let (x, y) = (f64::from(x), f64::from(y));
        dot += x * y;
        sq_norm_a += x * x;
        sq_norm_b += y * y;
    }

    if sq_norm_a == 0.0 || sq_norm_b == 0.0 {
        return 0.0;
    }

    let similarity = dot / (sq_norm_a.sqrt() * sq_norm_b.sqrt());
    if similarity.is_finite() {
        // Rounding can push the ratio a hair outside the mathematical range.
        similarity.max(-1.0).min(1.0) as f32
    } else {
        0.0
    }
}
478
479pub struct EvaluationHarness {
481 config: EvaluationConfig,
482}
483
484impl EvaluationHarness {
485 pub fn new() -> Self {
487 Self {
488 config: EvaluationConfig::default(),
489 }
490 }
491
492 pub fn with_config(config: EvaluationConfig) -> Self {
494 Self { config }
495 }
496
497 pub fn evaluate<E: crate::search::embedder::Embedder>(
501 &self,
502 embedder: &E,
503 corpus: &EvaluationCorpus,
504 metadata: &ModelMetadata,
505 ) -> Result<ValidationReport, String> {
506 let corpus_hash = corpus.compute_hash();
507 let first_doc = corpus.documents.first().ok_or("Empty corpus")?;
508 if corpus.queries.is_empty() {
509 return Err("Empty query set".to_string());
510 }
511
512 let cold_start = Instant::now();
514 embedder
515 .embed_sync(&first_doc.content)
516 .map_err(|e| e.to_string())?;
517 let cold_start_ms = cold_start.elapsed().as_millis() as u64;
518
519 let doc_embeddings: Vec<Vec<f32>> = corpus
521 .documents
522 .iter()
523 .map(|d| embedder.embed_sync(&d.content))
524 .collect::<Result<Vec<_>, _>>()
525 .map_err(|e| e.to_string())?;
526
527 for i in 0..self.config.warmup_queries.min(corpus.queries.len()) {
529 let _ = embedder.embed_sync(&corpus.queries[i].query);
530 }
531
532 let mut query_results = Vec::new();
534 let mut latencies = Vec::new();
535
536 for query_with_judgments in &corpus.queries {
537 let relevance_map: std::collections::HashMap<&str, f64> = query_with_judgments
539 .judgments
540 .iter()
541 .map(|j| (j.doc_id.as_str(), j.relevance))
542 .collect();
543
544 let iterations = self.config.timing_iterations.max(1);
546 let mut query_latencies = Vec::with_capacity(iterations);
547 let mut query_embedding = Vec::new();
548 for _ in 0..iterations {
549 let start = Instant::now();
550 query_embedding = embedder
551 .embed_sync(&query_with_judgments.query)
552 .map_err(|e| e.to_string())?;
553 query_latencies.push(start.elapsed());
554 }
555 let avg_latency = query_latencies
556 .iter()
557 .map(|d| d.as_millis() as u64)
558 .sum::<u64>()
559 / query_latencies.len() as u64;
560 latencies.push(Duration::from_millis(avg_latency));
561
562 let mut scored_docs: Vec<(usize, f32)> = doc_embeddings
564 .iter()
565 .enumerate()
566 .map(|(idx, emb)| (idx, cosine_similarity(&query_embedding, emb)))
567 .collect();
568 scored_docs.sort_by(|a, b| b.1.partial_cmp(&a.1).unwrap_or(std::cmp::Ordering::Equal));
569
570 let ranked_docs: Vec<String> = scored_docs
572 .iter()
573 .take(self.config.ndcg_k)
574 .map(|(idx, _)| corpus.documents[*idx].id.clone())
575 .collect();
576
577 let relevances: Vec<f64> = ranked_docs
579 .iter()
580 .map(|id| *relevance_map.get(id.as_str()).unwrap_or(&0.0))
581 .collect();
582
583 let all_gt: Vec<f64> = relevance_map.values().copied().collect();
585 let ndcg = ndcg_at_k(&relevances, self.config.ndcg_k, &all_gt);
586
587 query_results.push(QueryEvalResult {
588 query: query_with_judgments.query.clone(),
589 ndcg_at_10: ndcg,
590 ranked_docs,
591 latency_ms: avg_latency,
592 });
593 }
594
595 let avg_ndcg = if query_results.is_empty() {
597 0.0
598 } else {
599 query_results.iter().map(|r| r.ndcg_at_10).sum::<f64>() / query_results.len() as f64
600 };
601
602 let latency_stats = LatencyStats::from_durations(&latencies);
603
604 let memory_mb = metadata.size_bytes.unwrap_or(0) / (1024 * 1024);
606
607 let eligible = metadata.is_eligible();
608 let mut report = ValidationReport {
609 model_id: metadata.id.clone(),
610 corpus_hash,
611 ndcg_at_10: avg_ndcg,
612 latency_ms_p50: latency_stats.p50_ms,
613 latency_ms_p95: latency_stats.p95_ms,
614 latency_ms_p99: latency_stats.p99_ms,
615 cold_start_ms,
616 memory_mb,
617 eligible,
618 meets_criteria: false,
619 warnings: Vec::new(),
620 };
621
622 report.meets_criteria = report.check_criteria();
623
624 if cold_start_ms > criteria::COLD_START_MAX_MS {
626 report.warnings.push(format!(
627 "Cold start {}ms exceeds {}ms limit",
628 cold_start_ms,
629 criteria::COLD_START_MAX_MS
630 ));
631 }
632 if latency_stats.p99_ms > criteria::WARM_P99_MAX_MS {
633 report.warnings.push(format!(
634 "P99 latency {}ms exceeds {}ms limit",
635 latency_stats.p99_ms,
636 criteria::WARM_P99_MAX_MS
637 ));
638 }
639 if memory_mb > criteria::MEMORY_MAX_MB {
640 report.warnings.push(format!(
641 "Memory {}MB exceeds {}MB limit",
642 memory_mb,
643 criteria::MEMORY_MAX_MB
644 ));
645 }
646
647 Ok(report)
648 }
649
650 pub fn run_comparison<E: crate::search::embedder::Embedder>(
652 &self,
653 baseline: (&E, &ModelMetadata),
654 candidates: Vec<(&E, &ModelMetadata)>,
655 corpus: &EvaluationCorpus,
656 ) -> Result<BakeoffComparison, String> {
657 let corpus_hash = corpus.compute_hash();
658
659 let baseline_report = self.evaluate(baseline.0, corpus, baseline.1)?;
661
662 let mut candidate_reports = Vec::new();
664 for (embedder, metadata) in candidates {
665 let report = self.evaluate(embedder, corpus, metadata)?;
666 candidate_reports.push(report);
667 }
668
669 let mut comparison = BakeoffComparison {
671 corpus_hash,
672 baseline: baseline_report.clone(),
673 candidates: candidate_reports,
674 recommendation: None,
675 recommendation_reason: String::new(),
676 };
677
678 let winner_data = comparison.find_winner().map(|w| {
680 (
681 w.model_id.clone(),
682 w.ndcg_at_10,
683 w.latency_ms_p99,
684 w.memory_mb,
685 )
686 });
687
688 if let Some((model_id, ndcg, p99, memory)) = winner_data {
689 comparison.recommendation = Some(model_id.clone());
690 let pct_of_baseline = if baseline_report.ndcg_at_10 > 0.0 {
691 format!("{}%", (ndcg / baseline_report.ndcg_at_10 * 100.0) as u32)
692 } else {
693 "N/A".to_string()
694 };
695 comparison.recommendation_reason = format!(
696 "Best eligible candidate with NDCG@10={:.3} ({} of baseline), p99={}ms, memory={}MB",
697 ndcg, pct_of_baseline, p99, memory
698 );
699 } else {
700 comparison.recommendation_reason =
701 "No eligible candidate meets all criteria".to_string();
702 }
703
704 Ok(comparison)
705 }
706}
707
708impl Default for EvaluationHarness {
709 fn default() -> Self {
710 Self::new()
711 }
712}
713
714pub fn format_comparison_table(comparison: &BakeoffComparison) -> String {
716 let mut output = String::new();
717
718 output.push_str("# Bake-off Results\n\n");
719 output.push_str(&format!("Corpus hash: `{}`\n\n", comparison.corpus_hash));
720
721 output.push_str("| Model | NDCG@10 | P50 (ms) | P95 (ms) | P99 (ms) | Cold (ms) | Memory (MB) | Eligible | Meets Criteria |\n");
722 output.push_str("|-------|---------|----------|----------|----------|-----------|-------------|----------|----------------|\n");
723
724 let b = &comparison.baseline;
726 output.push_str(&format!(
727 "| {} (baseline) | {:.3} | {} | {} | {} | {} | {} | {} | {} |\n",
728 b.model_id,
729 b.ndcg_at_10,
730 b.latency_ms_p50,
731 b.latency_ms_p95,
732 b.latency_ms_p99,
733 b.cold_start_ms,
734 b.memory_mb,
735 if b.eligible { "✓" } else { "✗" },
736 if b.meets_criteria { "✓" } else { "✗" }
737 ));
738
739 for c in &comparison.candidates {
741 let marker = if Some(&c.model_id) == comparison.recommendation.as_ref() {
742 " ⭐"
743 } else {
744 ""
745 };
746 output.push_str(&format!(
747 "| {}{} | {:.3} | {} | {} | {} | {} | {} | {} | {} |\n",
748 c.model_id,
749 marker,
750 c.ndcg_at_10,
751 c.latency_ms_p50,
752 c.latency_ms_p95,
753 c.latency_ms_p99,
754 c.cold_start_ms,
755 c.memory_mb,
756 if c.eligible { "✓" } else { "✗" },
757 if c.meets_criteria { "✓" } else { "✗" }
758 ));
759 }
760
761 output.push_str("\n## Recommendation\n\n");
762 if let Some(ref winner) = comparison.recommendation {
763 output.push_str(&format!("**Winner:** {}\n\n", winner));
764 }
765 output.push_str(&format!("{}\n", comparison.recommendation_reason));
766
767 output
768}
769
#[cfg(test)]
mod tests {
    use super::*;

    /// Baseline report fixture shared by the comparison-oriented tests.
    fn baseline_report(corpus_hash: &str) -> ValidationReport {
        ValidationReport {
            model_id: "baseline".to_string(),
            corpus_hash: corpus_hash.to_string(),
            ndcg_at_10: 0.80,
            latency_ms_p50: 50,
            latency_ms_p95: 100,
            latency_ms_p99: 150,
            cold_start_ms: 1000,
            memory_mb: 200,
            eligible: false,
            meets_criteria: true,
            warnings: vec![],
        }
    }

    /// Copy of `base` with a different id, NDCG and eligibility flag.
    fn variant_of(
        base: &ValidationReport,
        model_id: &str,
        ndcg_at_10: f64,
        eligible: bool,
    ) -> ValidationReport {
        ValidationReport {
            model_id: model_id.to_string(),
            ndcg_at_10,
            eligible,
            meets_criteria: true,
            ..base.clone()
        }
    }

    /// Hugging Face model fixture with a 384-dimensional, 100 MB profile.
    fn hf_model(id: &str, name: &str, release_date: &str, is_baseline: bool) -> ModelMetadata {
        ModelMetadata {
            id: id.to_string(),
            name: name.to_string(),
            source: "huggingface".to_string(),
            release_date: release_date.to_string(),
            dimension: Some(384),
            size_bytes: Some(100_000_000),
            is_baseline,
        }
    }

    #[test]
    fn ndcg_perfect_is_one() {
        let graded = vec![3.0, 2.0, 1.0];
        let score = ndcg_at_k(&graded, 3, &graded);
        assert!((score - 1.0).abs() < 1e-9);
    }

    #[test]
    fn ndcg_zero_when_no_relevance() {
        let graded = vec![0.0, 0.0, 0.0];
        assert_eq!(ndcg_at_k(&graded, 3, &graded), 0.0);
    }

    #[test]
    fn ndcg_handles_partial_relevance() {
        let ground_truth = vec![2.0, 1.0, 0.0];
        let retrieved = vec![1.0, 0.0, 2.0];
        let score = ndcg_at_k(&retrieved, 3, &ground_truth);
        assert!(score > 0.0 && score < 1.0);
    }

    #[test]
    fn report_roundtrip() {
        let original = ValidationReport {
            model_id: "hash".to_string(),
            corpus_hash: "deadbeef".to_string(),
            ndcg_at_10: 0.42,
            latency_ms_p50: 12,
            latency_ms_p95: 30,
            latency_ms_p99: 45,
            cold_start_ms: 500,
            memory_mb: 150,
            eligible: true,
            meets_criteria: true,
            warnings: vec!["example warning".to_string()],
        };
        let json = serde_json::to_string(&original).expect("serialize");
        let restored: ValidationReport = serde_json::from_str(&json).expect("deserialize");
        assert_eq!(original, restored);
    }

    #[test]
    fn model_eligibility_by_date() {
        let recent = hf_model("new-model", "New Model", "2025-12-01", false);
        assert!(recent.is_eligible());

        let outdated = hf_model("old-model", "Old Model", "2025-06-01", false);
        assert!(!outdated.is_eligible());

        // A baseline is never eligible, however recent it is.
        let reference = hf_model("baseline", "Baseline", "2025-12-01", true);
        assert!(!reference.is_eligible());
    }

    #[test]
    fn latency_stats_from_durations() {
        let samples: Vec<Duration> = [10u64, 20, 30, 40, 100]
            .iter()
            .map(|&ms| Duration::from_millis(ms))
            .collect();
        let summary = LatencyStats::from_durations(&samples);

        assert_eq!(summary.samples, 5);
        assert_eq!(summary.min_ms, 10);
        assert_eq!(summary.max_ms, 100);
        assert!((summary.mean_ms - 40.0).abs() < 0.1);
        assert_eq!(summary.p50_ms, 30);
    }

    #[test]
    fn latency_stats_empty() {
        let summary = LatencyStats::from_durations(&[]);
        assert_eq!(summary.samples, 0);
        assert_eq!(summary.p50_ms, 0);
    }

    #[test]
    fn latency_timer_records_samples() {
        let mut stopwatch = LatencyTimer::new();

        let answer = stopwatch.time(|| 42);
        assert_eq!(answer, 42);

        assert_eq!(stopwatch.stats().samples, 1);
    }

    #[test]
    fn report_meets_criteria() {
        let within_limits = ValidationReport {
            model_id: "good".to_string(),
            corpus_hash: "test".to_string(),
            ndcg_at_10: 0.85,
            latency_ms_p50: 50,
            latency_ms_p95: 100,
            latency_ms_p99: 200,
            cold_start_ms: 1500,
            memory_mb: 200,
            eligible: true,
            meets_criteria: true,
            warnings: vec![],
        };
        assert!(within_limits.check_criteria());

        let slow_p99 = ValidationReport {
            latency_ms_p99: 300,
            ..within_limits.clone()
        };
        assert!(!slow_p99.check_criteria());

        let slow_cold_start = ValidationReport {
            cold_start_ms: 3000,
            ..within_limits.clone()
        };
        assert!(!slow_cold_start.check_criteria());

        let too_large = ValidationReport {
            memory_mb: 400,
            ..within_limits
        };
        assert!(!too_large.check_criteria());
    }

    #[test]
    fn report_quality_threshold() {
        let baseline = baseline_report("test");

        // 0.70 / 0.80 is above the 0.80 ratio, 0.60 / 0.80 is below it.
        let good_candidate = variant_of(&baseline, "good", 0.70, false);
        assert!(good_candidate.meets_quality_threshold(&baseline));

        let bad_candidate = variant_of(&baseline, "bad", 0.60, false);
        assert!(!bad_candidate.meets_quality_threshold(&baseline));
    }

    #[test]
    fn bakeoff_comparison_finds_winner() {
        let baseline = baseline_report("test");

        let comparison = BakeoffComparison {
            corpus_hash: "test".to_string(),
            baseline: baseline.clone(),
            candidates: vec![
                variant_of(&baseline, "candidate1", 0.75, true),
                variant_of(&baseline, "candidate2", 0.85, true),
                // Best score of all, but not eligible.
                variant_of(&baseline, "ineligible", 0.90, false),
            ],
            recommendation: None,
            recommendation_reason: String::new(),
        };

        let winner = comparison.find_winner();
        assert!(winner.is_some());
        assert_eq!(winner.unwrap().model_id, "candidate2");
    }

    #[test]
    fn corpus_creation_and_hash() {
        let mut corpus = EvaluationCorpus::new("test-corpus");
        corpus.add_document("d1", "hello world");
        corpus.add_document("d2", "goodbye world");
        corpus.add_query("hello", vec![("d1", 3.0), ("d2", 0.0)]);

        assert_eq!(corpus.name, "test-corpus");
        assert_eq!(corpus.documents.len(), 2);
        assert_eq!(corpus.queries.len(), 1);

        let before = corpus.compute_hash();
        assert_eq!(before.len(), 16);

        // Hashing the same content again yields the same value.
        let again = corpus.compute_hash();
        assert_eq!(before, again);

        // Any content change must change the hash.
        corpus.add_document("d3", "new document");
        let after = corpus.compute_hash();
        assert_ne!(before, after);
    }

    #[test]
    fn evaluation_rejects_empty_query_set() {
        let harness = EvaluationHarness::new();
        let mut corpus = EvaluationCorpus::new("no-queries");
        corpus.add_document("d1", "hello world");
        let embedder = crate::search::hash_embedder::HashEmbedder::new(16);
        let metadata = ModelMetadata {
            id: "hash".to_string(),
            name: "Hash".to_string(),
            source: "test".to_string(),
            release_date: "2025-12-01".to_string(),
            dimension: Some(16),
            size_bytes: Some(0),
            is_baseline: false,
        };

        let err = harness
            .evaluate(&embedder, &corpus, &metadata)
            .expect_err("empty query set must not produce a successful bakeoff report");
        assert!(err.contains("Empty query set"));
    }

    #[test]
    fn sample_corpus_is_valid() {
        let corpus = EvaluationCorpus::code_search_sample();
        assert!(!corpus.documents.is_empty());
        assert!(!corpus.queries.is_empty());

        for entry in &corpus.queries {
            assert!(!entry.judgments.is_empty());
        }

        assert!(!corpus.compute_hash().is_empty());
    }

    #[test]
    fn cosine_similarity_identical_vectors() {
        let v = vec![1.0, 2.0, 3.0];
        assert!((cosine_similarity(&v, &v) - 1.0).abs() < 1e-6);
    }

    #[test]
    fn cosine_similarity_orthogonal_vectors() {
        let x_axis = vec![1.0, 0.0, 0.0];
        let y_axis = vec![0.0, 1.0, 0.0];
        assert!(cosine_similarity(&x_axis, &y_axis).abs() < 1e-6);
    }

    #[test]
    fn cosine_similarity_opposite_vectors() {
        let forward = vec![1.0, 2.0, 3.0];
        let backward = vec![-1.0, -2.0, -3.0];
        assert!((cosine_similarity(&forward, &backward) + 1.0).abs() < 1e-6);
    }

    #[test]
    fn cosine_similarity_different_lengths() {
        let short = vec![1.0, 2.0];
        let long = vec![1.0, 2.0, 3.0];
        assert_eq!(cosine_similarity(&short, &long), 0.0);
    }

    #[test]
    fn evaluation_config_defaults() {
        let defaults = EvaluationConfig::default();
        assert_eq!(defaults.warmup_queries, 3);
        assert_eq!(defaults.timing_iterations, 5);
        assert_eq!(defaults.ndcg_k, 10);
    }

    #[test]
    fn harness_creation() {
        let default_harness = EvaluationHarness::new();
        assert_eq!(default_harness.config.ndcg_k, 10);

        let custom_harness = EvaluationHarness::with_config(EvaluationConfig {
            warmup_queries: 5,
            timing_iterations: 10,
            ndcg_k: 5,
        });
        assert_eq!(custom_harness.config.ndcg_k, 5);
    }

    #[test]
    fn corpus_roundtrip() {
        let original = EvaluationCorpus::code_search_sample();
        let json = serde_json::to_string(&original).expect("serialize");
        let restored: EvaluationCorpus = serde_json::from_str(&json).expect("deserialize");
        assert_eq!(original, restored);
    }

    #[test]
    fn query_eval_result_roundtrip() {
        let original = QueryEvalResult {
            query: "test query".to_string(),
            ndcg_at_10: 0.85,
            ranked_docs: vec!["d1".to_string(), "d2".to_string()],
            latency_ms: 15,
        };
        let json = serde_json::to_string(&original).expect("serialize");
        let restored: QueryEvalResult = serde_json::from_str(&json).expect("deserialize");
        assert_eq!(original.query, restored.query);
        assert_eq!(original.ndcg_at_10, restored.ndcg_at_10);
    }

    #[test]
    fn format_comparison_table_output() {
        let baseline = baseline_report("test123");
        let candidate = variant_of(&baseline, "winner", 0.85, true);

        let comparison = BakeoffComparison {
            corpus_hash: "test123".to_string(),
            baseline,
            candidates: vec![candidate],
            recommendation: Some("winner".to_string()),
            recommendation_reason: "Best candidate".to_string(),
        };

        let table = format_comparison_table(&comparison);
        for expected in ["Bake-off Results", "baseline", "winner", "⭐", "Recommendation"].iter() {
            assert!(table.contains(expected));
        }
    }
}