Skip to main content

entrenar/eval/evaluator/
leaderboard.rs

1//! Leaderboard for comparing multiple models
2
3use super::metric::Metric;
4use super::result::EvalResult;
5use std::fmt;
6
/// Leaderboard for comparing multiple models
///
/// Holds the collected [`EvalResult`]s together with the [`Metric`] used for
/// the default ranking. Both fields are public: [`Leaderboard::add`] re-sorts
/// after every insertion, whereas pushing into `results` directly leaves the
/// order untouched until [`Leaderboard::sort`] is called.
#[derive(Clone, Debug)]
pub struct Leaderboard {
    /// Evaluation results for each model
    pub results: Vec<EvalResult>,
    /// Primary metric for ranking
    pub primary_metric: Metric,
}
15
16impl Leaderboard {
17    /// Create a new leaderboard
18    pub fn new(primary_metric: Metric) -> Self {
19        Self { results: Vec::new(), primary_metric }
20    }
21
22    /// Add evaluation result
23    pub fn add(&mut self, result: EvalResult) {
24        self.results.push(result);
25        self.sort();
26    }
27
28    /// Sort by primary metric
29    ///
30    /// N-06 (Meyer DbC): Models with missing scores sort last, not to an
31    /// arbitrary position. `NEG_INFINITY` for "higher is better" and
32    /// `INFINITY` for "lower is better" ensure worst-possible semantics.
33    pub fn sort(&mut self) {
34        let higher_is_better = self.primary_metric.higher_is_better();
35        let missing = if higher_is_better { f64::NEG_INFINITY } else { f64::INFINITY };
36        self.results.sort_by(|a, b| {
37            let score_a = a.get_score(self.primary_metric).unwrap_or(missing);
38            let score_b = b.get_score(self.primary_metric).unwrap_or(missing);
39            if higher_is_better {
40                score_b.partial_cmp(&score_a).unwrap_or(std::cmp::Ordering::Equal)
41            } else {
42                score_a.partial_cmp(&score_b).unwrap_or(std::cmp::Ordering::Equal)
43            }
44        });
45    }
46
47    /// Sort by a specific metric
48    pub fn sort_by(&mut self, metric: Metric) {
49        let higher_is_better = metric.higher_is_better();
50        let missing = if higher_is_better { f64::NEG_INFINITY } else { f64::INFINITY };
51        self.results.sort_by(|a, b| {
52            let score_a = a.get_score(metric).unwrap_or(missing);
53            let score_b = b.get_score(metric).unwrap_or(missing);
54            if higher_is_better {
55                score_b.partial_cmp(&score_a).unwrap_or(std::cmp::Ordering::Equal)
56            } else {
57                score_a.partial_cmp(&score_b).unwrap_or(std::cmp::Ordering::Equal)
58            }
59        });
60    }
61
62    /// Get best model by primary metric
63    pub fn best(&self) -> Option<&EvalResult> {
64        self.results.first()
65    }
66
67    /// Print formatted leaderboard to stdout (Mieruka - visual control)
68    pub fn print(&self) {
69        println!("{self}");
70    }
71
72    /// Export as markdown table
73    pub fn to_markdown(&self) -> String {
74        let mut md = String::new();
75
76        // Collect all metrics
77        let metrics: Vec<Metric> = if let Some(first) = self.results.first() {
78            first.scores.keys().copied().collect()
79        } else {
80            return md;
81        };
82
83        // Header
84        md.push_str("| Model |");
85        for metric in &metrics {
86            md.push_str(&format!(" {metric} |"));
87        }
88        md.push_str(" Inference (ms) |\n");
89
90        // Separator
91        md.push_str("|-------|");
92        for _ in &metrics {
93            md.push_str("----------|");
94        }
95        md.push_str("---------------|\n");
96
97        // Rows
98        for result in &self.results {
99            md.push_str(&format!("| {} |", result.model_name));
100            for metric in &metrics {
101                match result.get_score(*metric) {
102                    Some(score) => md.push_str(&format!(" {score:.4} |")),
103                    // N-06: Missing scores display as "—" not "0.0000"
104                    None => md.push_str(" — |"),
105                }
106            }
107            md.push_str(&format!(" {:.2} |\n", result.inference_time_ms));
108        }
109
110        md
111    }
112}
113
114impl fmt::Display for Leaderboard {
115    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
116        if self.results.is_empty() {
117            return writeln!(f, "Leaderboard: (empty)");
118        }
119
120        // Collect all metrics
121        let metrics: Vec<Metric> = if let Some(first) = self.results.first() {
122            first.scores.keys().copied().collect()
123        } else {
124            return Ok(());
125        };
126
127        // Calculate column widths
128        let model_width = self.results.iter().map(|r| r.model_name.len()).max().unwrap_or(5).max(5);
129
130        // Header
131        write!(f, "┌{:─<width$}┬", "", width = model_width + 2)?;
132        for _ in &metrics {
133            write!(f, "{:─<12}┬", "")?;
134        }
135        writeln!(f, "{:─<15}┐", "")?;
136
137        write!(f, "│ {:width$} │", "Model", width = model_width)?;
138        for metric in &metrics {
139            write!(f, " {:>10} │", metric.name())?;
140        }
141        writeln!(f, " Inference (ms)│")?;
142
143        // Separator
144        write!(f, "├{:─<width$}┼", "", width = model_width + 2)?;
145        for _ in &metrics {
146            write!(f, "{:─<12}┼", "")?;
147        }
148        writeln!(f, "{:─<15}┤", "")?;
149
150        // Rows
151        for result in &self.results {
152            write!(f, "│ {:width$} │", result.model_name, width = model_width)?;
153            for metric in &metrics {
154                match result.get_score(*metric) {
155                    Some(score) => write!(f, " {score:>10.4} │")?,
156                    // N-06: Missing scores display as "—" not "0.0000"
157                    None => write!(f, " {:>10} │", "—")?,
158                }
159            }
160            writeln!(f, " {:>13.2} │", result.inference_time_ms)?;
161        }
162
163        // Footer
164        write!(f, "└{:─<width$}┴", "", width = model_width + 2)?;
165        for _ in &metrics {
166            write!(f, "{:─<12}┴", "")?;
167        }
168        writeln!(f, "{:─<15}┘", "")?;
169
170        Ok(())
171    }
172}
173
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a result named `name` that carries `score` for `metric`, or no
    /// score at all when `score` is `None`.
    fn make_result(name: &str, metric: Metric, score: Option<f64>) -> EvalResult {
        let mut r = EvalResult::new(name);
        if let Some(s) = score {
            r.add_score(metric, s);
        }
        r
    }

    // =========================================================================
    // FALSIFY tests — contract violation sweep (N-06)
    // =========================================================================

    #[test]
    fn test_falsify_n06_missing_score_sorts_last_higher_is_better() {
        // N-06: A model with a missing score for a "higher is better" metric
        // must sort LAST, not to an arbitrary middle position.
        let metric = Metric::Accuracy;
        assert!(metric.higher_is_better());

        let mut lb = Leaderboard::new(metric);
        lb.results.push(make_result("good", metric, Some(0.9)));
        lb.results.push(make_result("missing", metric, None));
        lb.results.push(make_result("bad", metric, Some(0.1)));
        lb.sort();

        assert_eq!(lb.results[0].model_name, "good");
        assert_eq!(lb.results[1].model_name, "bad");
        assert_eq!(
            lb.results[2].model_name, "missing",
            "Model with missing score must sort last for higher-is-better metric"
        );
    }

    #[test]
    fn test_falsify_n06_missing_score_sorts_last_lower_is_better() {
        // N-06: A model with a missing score for a "lower is better" metric
        // must also sort LAST.
        let metric = Metric::MSE;
        assert!(!metric.higher_is_better());

        let mut lb = Leaderboard::new(metric);
        lb.results.push(make_result("good", metric, Some(0.01)));
        lb.results.push(make_result("missing", metric, None));
        lb.results.push(make_result("bad", metric, Some(10.0)));
        lb.sort();

        assert_eq!(lb.results[0].model_name, "good");
        assert_eq!(lb.results[1].model_name, "bad");
        assert_eq!(
            lb.results[2].model_name, "missing",
            "Model with missing score must sort last for lower-is-better metric"
        );
    }

    #[test]
    fn test_falsify_n06_sort_by_missing_score_sorts_last() {
        // N-06: sort_by() also uses correct missing-score semantics.
        let primary = Metric::Accuracy;
        let secondary = Metric::Perplexity; // lower is better
        assert!(!secondary.higher_is_better());

        let mut lb = Leaderboard::new(primary);

        let mut r1 = make_result("model_a", primary, Some(0.8));
        r1.add_score(secondary, 5.0);
        lb.results.push(r1);

        let r2 = make_result("model_b", primary, Some(0.9));
        // model_b has NO perplexity score
        lb.results.push(r2);

        let mut r3 = make_result("model_c", primary, Some(0.7));
        r3.add_score(secondary, 100.0);
        lb.results.push(r3);

        lb.sort_by(secondary);

        assert_eq!(lb.results[0].model_name, "model_a", "lowest perplexity first");
        assert_eq!(lb.results[1].model_name, "model_c");
        assert_eq!(lb.results[2].model_name, "model_b", "Missing perplexity score must sort last");
    }

    #[test]
    fn test_falsify_n06_display_missing_score_shows_dash() {
        // N-06: Missing scores must display as "—", never "0.0000".
        // A zero score is a valid measurement; a missing score is not.
        let metric = Metric::Accuracy;
        let mut lb = Leaderboard::new(metric);
        lb.results.push(make_result("has_score", metric, Some(0.95)));
        lb.results.push(make_result("no_score", metric, None));

        let md = lb.to_markdown();
        // Model with score should show numeric value
        assert!(md.contains("0.95"), "scored model must show numeric value in markdown");

        // Inspect the row of the unscored model specifically: a whole-output
        // check cannot tell its cell apart from the scored model's "0.9500".
        let md_row = md
            .lines()
            .find(|line| line.contains("no_score"))
            .expect("markdown must contain a row for 'no_score'");
        assert!(md_row.contains('—'), "missing score must show '—' in markdown, got:\n{md}");
        assert!(
            !md_row.contains("0.0000"),
            "markdown must not render a missing score as '0.0000', got:\n{md}"
        );

        // Also test Display trait
        let display = format!("{lb}");
        let display_row = display
            .lines()
            .find(|line| line.contains("no_score"))
            .expect("display output must contain a row for 'no_score'");
        assert!(display_row.contains('—'), "missing score must show '—' in display output");
        assert!(
            !display_row.contains("0.0000"),
            "display output must not render a missing score as '0.0000', got:\n{display}"
        );
    }

    #[test]
    fn test_leaderboard_add_and_best() {
        let metric = Metric::Accuracy;
        let mut lb = Leaderboard::new(metric);

        lb.add(make_result("bad", metric, Some(0.5)));
        lb.add(make_result("best", metric, Some(0.99)));
        lb.add(make_result("mid", metric, Some(0.75)));

        let best = lb.best().expect("should have a best");
        assert_eq!(best.model_name, "best");
    }

    #[test]
    fn test_leaderboard_empty() {
        // An empty leaderboard has no best model, no markdown, and a
        // placeholder line as its Display output.
        let lb = Leaderboard::new(Metric::Accuracy);

        assert!(lb.best().is_none());
        assert!(lb.to_markdown().is_empty());
        assert_eq!(format!("{lb}"), "Leaderboard: (empty)\n");
    }
}