entrenar/hf_pipeline/leaderboard/
parser.rs

1//! Leaderboard-to-EvalResult parser
2//!
3//! Converts HuggingFace leaderboard data into the native evaluation types,
4//! enabling comparison of your models against published leaderboard entries.
5
6use super::types::{HfLeaderboard, LeaderboardKind};
7use crate::eval::evaluator::{EvalResult, Leaderboard, Metric};
8use crate::eval::RougeVariant;
9
10/// Convert an `HfLeaderboard` into a native `Leaderboard` for comparison.
11///
12/// Maps leaderboard columns to `Metric` variants using kind-specific mappings.
13pub fn to_leaderboard(hf: &HfLeaderboard) -> Leaderboard {
14    let primary = hf.kind.primary_metric();
15    let mut leaderboard = Leaderboard::new(primary);
16
17    for entry in &hf.entries {
18        let mut result = EvalResult::new(&entry.model_id);
19
20        for (column, &value) in &entry.scores {
21            if let Some(metric) = column_to_metric(&hf.kind, column) {
22                result.add_score(metric, value);
23            }
24        }
25
26        leaderboard.add(result);
27    }
28
29    leaderboard
30}
31
32/// Map a leaderboard column name to a `Metric` variant.
33///
34/// Each leaderboard kind has its own column naming conventions.
35#[must_use]
36pub fn column_to_metric(kind: &LeaderboardKind, column: &str) -> Option<Metric> {
37    let col_lower = column.to_lowercase();
38
39    match kind {
40        LeaderboardKind::OpenASR => match col_lower.as_str() {
41            "wer" | "average_wer" | "word_error_rate" => Some(Metric::WER),
42            "rtfx" | "rtf" | "real_time_factor" => Some(Metric::RTFx),
43            _ => None,
44        },
45        LeaderboardKind::OpenLLMv2 => match col_lower.as_str() {
46            "mmlu" | "mmlu_pro" | "mmlu_accuracy" => Some(Metric::MMLUAccuracy),
47            "accuracy" | "average" | "avg" => Some(Metric::Accuracy),
48            _ => None,
49        },
50        LeaderboardKind::MTEB => match col_lower.as_str() {
51            "ndcg@10" | "ndcg_at_10" => Some(Metric::NDCGAtK(10)),
52            "accuracy" => Some(Metric::Accuracy),
53            _ => None,
54        },
55        LeaderboardKind::BigCodeBench => match col_lower.as_str() {
56            "pass@1" | "pass_at_1" => Some(Metric::PassAtK(1)),
57            "pass@10" | "pass_at_10" => Some(Metric::PassAtK(10)),
58            _ => None,
59        },
60        LeaderboardKind::Custom(_) => generic_column_to_metric(&col_lower),
61    }
62}
63
64/// Best-effort column name → Metric mapping for custom leaderboards
65fn generic_column_to_metric(column: &str) -> Option<Metric> {
66    match column {
67        "accuracy" | "acc" => Some(Metric::Accuracy),
68        "wer" | "word_error_rate" => Some(Metric::WER),
69        "bleu" => Some(Metric::BLEU),
70        "rouge1" | "rouge_1" => Some(Metric::ROUGE(RougeVariant::Rouge1)),
71        "rouge2" | "rouge_2" => Some(Metric::ROUGE(RougeVariant::Rouge2)),
72        "rougel" | "rouge_l" => Some(Metric::ROUGE(RougeVariant::RougeL)),
73        "perplexity" | "ppl" => Some(Metric::Perplexity),
74        "mmlu" => Some(Metric::MMLUAccuracy),
75        "pass@1" | "pass_at_1" => Some(Metric::PassAtK(1)),
76        "ndcg@10" | "ndcg_at_10" => Some(Metric::NDCGAtK(10)),
77        _ => None,
78    }
79}
80
81/// Compare your model's `EvalResult` against a HuggingFace leaderboard.
82///
83/// Inserts your result into the leaderboard for ranking, returning
84/// a sorted `Leaderboard` with your model included.
85pub fn compare_with_leaderboard(my_result: &EvalResult, hf: &HfLeaderboard) -> Leaderboard {
86    let mut leaderboard = to_leaderboard(hf);
87    leaderboard.add(my_result.clone());
88    leaderboard
89}
entrenar/hf_pipeline/leaderboard/parser.rs

entrenar/hf_pipeline/leaderboard/
parser.rs