1use super::metric::Metric;
4use super::result::EvalResult;
5use std::fmt;
6
7#[derive(Clone, Debug)]
9pub struct Leaderboard {
10 pub results: Vec<EvalResult>,
12 pub primary_metric: Metric,
14}
15
16impl Leaderboard {
17 pub fn new(primary_metric: Metric) -> Self {
19 Self { results: Vec::new(), primary_metric }
20 }
21
22 pub fn add(&mut self, result: EvalResult) {
24 self.results.push(result);
25 self.sort();
26 }
27
28 pub fn sort(&mut self) {
34 let higher_is_better = self.primary_metric.higher_is_better();
35 let missing = if higher_is_better { f64::NEG_INFINITY } else { f64::INFINITY };
36 self.results.sort_by(|a, b| {
37 let score_a = a.get_score(self.primary_metric).unwrap_or(missing);
38 let score_b = b.get_score(self.primary_metric).unwrap_or(missing);
39 if higher_is_better {
40 score_b.partial_cmp(&score_a).unwrap_or(std::cmp::Ordering::Equal)
41 } else {
42 score_a.partial_cmp(&score_b).unwrap_or(std::cmp::Ordering::Equal)
43 }
44 });
45 }
46
47 pub fn sort_by(&mut self, metric: Metric) {
49 let higher_is_better = metric.higher_is_better();
50 let missing = if higher_is_better { f64::NEG_INFINITY } else { f64::INFINITY };
51 self.results.sort_by(|a, b| {
52 let score_a = a.get_score(metric).unwrap_or(missing);
53 let score_b = b.get_score(metric).unwrap_or(missing);
54 if higher_is_better {
55 score_b.partial_cmp(&score_a).unwrap_or(std::cmp::Ordering::Equal)
56 } else {
57 score_a.partial_cmp(&score_b).unwrap_or(std::cmp::Ordering::Equal)
58 }
59 });
60 }
61
62 pub fn best(&self) -> Option<&EvalResult> {
64 self.results.first()
65 }
66
67 pub fn print(&self) {
69 println!("{self}");
70 }
71
72 pub fn to_markdown(&self) -> String {
74 let mut md = String::new();
75
76 let metrics: Vec<Metric> = if let Some(first) = self.results.first() {
78 first.scores.keys().copied().collect()
79 } else {
80 return md;
81 };
82
83 md.push_str("| Model |");
85 for metric in &metrics {
86 md.push_str(&format!(" {metric} |"));
87 }
88 md.push_str(" Inference (ms) |\n");
89
90 md.push_str("|-------|");
92 for _ in &metrics {
93 md.push_str("----------|");
94 }
95 md.push_str("---------------|\n");
96
97 for result in &self.results {
99 md.push_str(&format!("| {} |", result.model_name));
100 for metric in &metrics {
101 match result.get_score(*metric) {
102 Some(score) => md.push_str(&format!(" {score:.4} |")),
103 None => md.push_str(" — |"),
105 }
106 }
107 md.push_str(&format!(" {:.2} |\n", result.inference_time_ms));
108 }
109
110 md
111 }
112}
113
114impl fmt::Display for Leaderboard {
115 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
116 if self.results.is_empty() {
117 return writeln!(f, "Leaderboard: (empty)");
118 }
119
120 let metrics: Vec<Metric> = if let Some(first) = self.results.first() {
122 first.scores.keys().copied().collect()
123 } else {
124 return Ok(());
125 };
126
127 let model_width = self.results.iter().map(|r| r.model_name.len()).max().unwrap_or(5).max(5);
129
130 write!(f, "┌{:─<width$}┬", "", width = model_width + 2)?;
132 for _ in &metrics {
133 write!(f, "{:─<12}┬", "")?;
134 }
135 writeln!(f, "{:─<15}┐", "")?;
136
137 write!(f, "│ {:width$} │", "Model", width = model_width)?;
138 for metric in &metrics {
139 write!(f, " {:>10} │", metric.name())?;
140 }
141 writeln!(f, " Inference (ms)│")?;
142
143 write!(f, "├{:─<width$}┼", "", width = model_width + 2)?;
145 for _ in &metrics {
146 write!(f, "{:─<12}┼", "")?;
147 }
148 writeln!(f, "{:─<15}┤", "")?;
149
150 for result in &self.results {
152 write!(f, "│ {:width$} │", result.model_name, width = model_width)?;
153 for metric in &metrics {
154 match result.get_score(*metric) {
155 Some(score) => write!(f, " {score:>10.4} │")?,
156 None => write!(f, " {:>10} │", "—")?,
158 }
159 }
160 writeln!(f, " {:>13.2} │", result.inference_time_ms)?;
161 }
162
163 write!(f, "└{:─<width$}┴", "", width = model_width + 2)?;
165 for _ in &metrics {
166 write!(f, "{:─<12}┴", "")?;
167 }
168 writeln!(f, "{:─<15}┘", "")?;
169
170 Ok(())
171 }
172}
173
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a result named `name` that carries `score` for `metric`, or no
    /// score at all when `score` is `None`.
    fn make_result(name: &str, metric: Metric, score: Option<f64>) -> EvalResult {
        let mut result = EvalResult::new(name);
        if let Some(value) = score {
            result.add_score(metric, value);
        }
        result
    }

    /// A result without the primary score must rank last when higher is better.
    #[test]
    fn test_falsify_n06_missing_score_sorts_last_higher_is_better() {
        let metric = Metric::Accuracy;
        assert!(metric.higher_is_better());

        let mut lb = Leaderboard::new(metric);
        lb.results.push(make_result("good", metric, Some(0.9)));
        lb.results.push(make_result("missing", metric, None));
        lb.results.push(make_result("bad", metric, Some(0.1)));
        lb.sort();

        assert_eq!(lb.results[0].model_name, "good");
        assert_eq!(lb.results[1].model_name, "bad");
        assert_eq!(
            lb.results[2].model_name, "missing",
            "Model with missing score must sort last for higher-is-better metric"
        );
    }

    /// A result without the primary score must rank last when lower is better.
    #[test]
    fn test_falsify_n06_missing_score_sorts_last_lower_is_better() {
        let metric = Metric::MSE;
        assert!(!metric.higher_is_better());

        let mut lb = Leaderboard::new(metric);
        lb.results.push(make_result("good", metric, Some(0.01)));
        lb.results.push(make_result("missing", metric, None));
        lb.results.push(make_result("bad", metric, Some(10.0)));
        lb.sort();

        assert_eq!(lb.results[0].model_name, "good");
        assert_eq!(lb.results[1].model_name, "bad");
        assert_eq!(
            lb.results[2].model_name, "missing",
            "Model with missing score must sort last for lower-is-better metric"
        );
    }

    /// `sort_by` on a non-primary metric must also rank missing scores last.
    #[test]
    fn test_falsify_n06_sort_by_missing_score_sorts_last() {
        let primary = Metric::Accuracy;
        let secondary = Metric::Perplexity;
        assert!(!secondary.higher_is_better());

        let mut lb = Leaderboard::new(primary);

        let mut model_a = make_result("model_a", primary, Some(0.8));
        model_a.add_score(secondary, 5.0);
        lb.results.push(model_a);

        // model_b deliberately has no score for the secondary metric.
        lb.results.push(make_result("model_b", primary, Some(0.9)));

        let mut model_c = make_result("model_c", primary, Some(0.7));
        model_c.add_score(secondary, 100.0);
        lb.results.push(model_c);

        lb.sort_by(secondary);

        assert_eq!(lb.results[0].model_name, "model_a", "lowest perplexity first");
        assert_eq!(lb.results[1].model_name, "model_c");
        assert_eq!(lb.results[2].model_name, "model_b", "Missing perplexity score must sort last");
    }

    /// A missing score must be rendered as a dash, never as a zero.
    #[test]
    fn test_falsify_n06_display_missing_score_shows_dash() {
        let metric = Metric::Accuracy;
        let mut lb = Leaderboard::new(metric);
        lb.results.push(make_result("has_score", metric, Some(0.95)));
        lb.results.push(make_result("no_score", metric, None));

        let md = lb.to_markdown();
        assert!(md.contains("0.95"), "scored model must show numeric value in markdown");
        assert!(md.contains('—'), "missing score must show '—' in markdown, got:\n{md}");

        // Inspect the row of the model without a score. Checking the whole
        // document together with the scored model's "0.9500" could never fail.
        let missing_row = md
            .lines()
            .find(|line| line.starts_with("| no_score |"))
            .expect("markdown must contain a row for the model without a score");
        assert!(
            missing_row.contains('—'),
            "missing score must show '—' in its own markdown row, got:\n{md}"
        );
        assert!(
            !missing_row.contains("0.0000"),
            "markdown must not contain '0.0000' for missing scores"
        );

        let display = format!("{lb}");
        assert!(display.contains('—'), "missing score must show '—' in display output");
    }

    /// `add` keeps the board sorted, so `best` returns the top-scoring model.
    #[test]
    fn test_leaderboard_add_and_best() {
        let metric = Metric::Accuracy;
        let mut lb = Leaderboard::new(metric);

        lb.add(make_result("bad", metric, Some(0.5)));
        lb.add(make_result("best", metric, Some(0.99)));
        lb.add(make_result("mid", metric, Some(0.75)));

        let best = lb.best().expect("should have a best");
        assert_eq!(best.model_name, "best");
    }
}