//! # oxibonsai-eval
//!
//! Model evaluation harness for OxiBonsai.
//!
//! Provides utilities for:
//!
//! - **Perplexity** — measures how well a model predicts held-out text.
//! - **MMLU-style multiple choice** — accuracy on four-option questions
//!   (both the string-parsing [`McEvaluator`] and the logit-based
//!   [`accuracy::McLogitEvaluator`]).
//! - **Exact match** — token-level accuracy for text-generation tasks.
//! - **BLEU** — corpus- and sentence-level BLEU with n-gram orders 1..N
//!   and smoothing.
//! - **chrF / chrF++** — character n-gram F-score (Popović 2015).
//! - **METEOR (lexical)** — exact-match-only METEOR.
//! - **ROUGE** — ROUGE-N, ROUGE-L and ROUGE-S scoring.
//! - **SQuAD F1 + EM** — standard SQuAD 1.1 normalisation.
//! - **Calibration** — ECE, Brier score, NLL (numerically stable).
//! - **Bootstrap CIs** — seed-deterministic percentile intervals.
//! - **Streaming / online** — running perplexity and accuracy counters.
//! - **Throughput benchmarking** — tokens-per-second and latency statistics.
//! - **Benchmark tasks** — ARC, BoolQ, GSM8K, HellaSwag, TruthfulQA and
//!   WinoGrande evaluators.
//! - **Dataset loading** — JSONL-based [`EvalDataset`] and [`McDataset`].
//! - **Report generation** — JSON and Markdown evaluation reports.
//!
//! ## Quick start
//!
//! ```rust
//! use oxibonsai_eval::perplexity::PerplexityEvaluator;
//!
//! let eval = PerplexityEvaluator::new();
//! // Per-token log-probabilities of 0.0 mean every token was predicted
//! // with probability 1, so the perplexity is ≈ 1.0.
//! let ppl = eval.compute(&[0.0f32; 10]);
//! assert!((ppl - 1.0).abs() < 1e-5);
//! ```
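//!
//! ## More examples
//!
//! The snippets below sketch typical call shapes for a few of the modules
//! listed above. They are illustrative only: argument order, config fields,
//! and method names such as `update`/`value` are assumed rather than
//! guaranteed, so the blocks are marked `ignore` and not run as doctests.
//!
//! Scoring a single candidate against one reference with BLEU:
//!
//! ```rust,ignore
//! use oxibonsai_eval::bleu::{sentence_bleu, BleuConfig};
//!
//! // Assumed signature: sentence_bleu(candidate, references, config) -> BleuScore.
//! let refs = ["the cat sat on the mat"];
//! let score = sentence_bleu("the cat is on the mat", &refs, &BleuConfig::default());
//! // The `score` field name on `BleuScore` is likewise an assumption.
//! println!("BLEU = {}", score.score);
//! ```
//!
//! Tracking perplexity online, without buffering the whole corpus:
//!
//! ```rust,ignore
//! use oxibonsai_eval::streaming::OnlinePerplexity;
//!
//! let mut ppl = OnlinePerplexity::new();
//! // Feed per-token log-probabilities one at a time; the running value is
//! // exp(-mean(logprob)) over everything seen so far.
//! for lp in [-0.10f32, -0.25, -0.05] {
//!     ppl.update(lp);
//! }
//! println!("running PPL = {}", ppl.value());
//! ```
//!
//! A seed-deterministic percentile interval around a per-example metric:
//!
//! ```rust,ignore
//! use oxibonsai_eval::bootstrap::bootstrap_ci;
//!
//! // Assumed signature: bootstrap_ci(samples, n_resamples, confidence, seed),
//! // and assumed `lower`/`upper` fields on `ConfidenceInterval`.
//! let per_example = [0.0f64, 1.0, 1.0, 0.0, 1.0, 1.0, 1.0, 0.0];
//! let ci = bootstrap_ci(&per_example, 1_000, 0.95, 42);
//! println!("accuracy CI: [{}, {}]", ci.lower, ci.upper);
//! ```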

pub mod accuracy;
pub mod arc;
pub mod bleu;
pub mod boolq;
pub mod bootstrap;
pub mod calibration;
pub mod chrf;
pub mod dataset;
pub mod error;
pub mod gsm8k;
pub mod hellaswag;
pub mod meteor;
pub mod mmlu;
pub mod perplexity;
pub mod qa;
pub mod report;
pub mod rouge;
pub mod streaming;
pub mod throughput;
pub mod truthfulqa;
pub mod winogrande;

#[cfg(test)]
mod tests;

// ──────────────────────────────────────────────────────────────────────────────
// Public re-exports
// ──────────────────────────────────────────────────────────────────────────────

pub use accuracy::{
    AccuracyResult, ExactMatchEvaluator, LogitMcResult, McEvaluator, McLogitEvaluator,
};
pub use arc::{ArcEvaluator, ArcResult, ArcSplit};
pub use bleu::{corpus_bleu, sentence_bleu, BleuConfig, BleuScore, SmoothingMethod};
pub use boolq::{BoolQDataset, BoolQEvaluator, BoolQItem, BoolQResult};
pub use bootstrap::{bootstrap_ci, ConfidenceInterval};
pub use calibration::{
    brier_score, calibration_all, expected_calibration_error, nll_from_logits, BinStat,
    CalibrationResult,
};
pub use chrf::{chrf, chrf_plus_plus, chrf_with, ChrfScore};
pub use dataset::{EvalDataset, EvalExample, McDataset, MultipleChoiceQuestion};
pub use error::EvalError;
pub use gsm8k::{Gsm8kEvaluator, Gsm8kResult};
pub use hellaswag::{HellaSwagDataset, HellaSwagEvaluator, HellaSwagItem, HellaSwagResult};
pub use meteor::{align_tokens, meteor, meteor_multi, MeteorConfig, MeteorScore};
pub use mmlu::{MmluEvaluator, MmluResult};
pub use perplexity::{PerplexityEvaluator, PerplexityResult};
pub use qa::{
    corpus_em_f1, exact_match as qa_exact_match, f1_score as qa_f1_score, normalize_answer,
    normalize_tokens, score_multi as qa_score_multi, QaScore,
};
pub use report::{EvalReport, EvalResultEntry};
pub use rouge::{
    ngram_counts, tokenize, CorpusRouge, RougeLScore, RougeNScore, RougeSScore, TokenSeq,
};
pub use streaming::{OnlineAccuracy, OnlinePerplexity};
pub use throughput::{percentile, time_fn, ThroughputBenchmark, ThroughputResult};
pub use truthfulqa::{
    TruthfulQaDataset, TruthfulQaEvaluator, TruthfulQaItem, TruthfulQaMode, TruthfulQaResult,
};
pub use winogrande::{WinoGrandeDataset, WinoGrandeEvaluator, WinoGrandeItem, WinoGrandeResult};