use std::collections::HashMap;
use thiserror::Error;
#[derive(Debug, Error, Clone, PartialEq)]
pub enum MetricError {
#[error("Empty input: {0}")]
EmptyInput(String),
#[error("Mismatched lengths: predictions {0}, references {1}")]
LengthMismatch(usize, usize),
#[error("Invalid n-gram order: {0}")]
InvalidNgramOrder(usize),
#[error("Math error: {0}")]
MathError(String),
}
#[derive(Debug, Clone, Default)]
pub struct RougeScore {
pub precision: f64,
pub recall: f64,
pub f1: f64,
}
#[derive(Debug, Clone, Default)]
pub struct F1Score {
pub precision: f64,
pub recall: f64,
pub f1: f64,
pub true_positives: usize,
pub false_positives: usize,
pub false_negatives: usize,
}
#[derive(Debug, Clone, Default)]
pub struct NlpBenchmarkResult {
pub bleu_1: Option<f64>,
pub bleu_2: Option<f64>,
pub bleu_4: Option<f64>,
pub rouge_1: Option<RougeScore>,
pub rouge_2: Option<RougeScore>,
pub rouge_l: Option<RougeScore>,
pub exact_match: Option<f64>,
pub f1: Option<F1Score>,
pub perplexity: Option<f64>,
}
impl NlpBenchmarkResult {
pub fn new() -> Self {
Self::default()
}
pub fn display_summary(&self) -> String {
let mut lines = Vec::new();
if let Some(b1) = self.bleu_1 {
lines.push(format!("BLEU-1: {:.4}", b1));
}
if let Some(b2) = self.bleu_2 {
lines.push(format!("BLEU-2: {:.4}", b2));
}
if let Some(b4) = self.bleu_4 {
lines.push(format!("BLEU-4: {:.4}", b4));
}
if let Some(ref r1) = self.rouge_1 {
lines.push(format!(
"ROUGE-1: P={:.4} R={:.4} F1={:.4}",
r1.precision, r1.recall, r1.f1
));
}
if let Some(ref r2) = self.rouge_2 {
lines.push(format!(
"ROUGE-2: P={:.4} R={:.4} F1={:.4}",
r2.precision, r2.recall, r2.f1
));
}
if let Some(ref rl) = self.rouge_l {
lines.push(format!(
"ROUGE-L: P={:.4} R={:.4} F1={:.4}",
rl.precision, rl.recall, rl.f1
));
}
if let Some(em) = self.exact_match {
lines.push(format!("Exact Match: {:.4}", em));
}
if let Some(ref f1) = self.f1 {
lines.push(format!(
"Token F1: P={:.4} R={:.4} F1={:.4}",
f1.precision, f1.recall, f1.f1
));
}
if let Some(ppl) = self.perplexity {
lines.push(format!("Perplexity: {:.4}", ppl));
}
if lines.is_empty() {
"NlpBenchmarkResult: (no metrics set)".to_string()
} else {
lines.join("\n")
}
}
}
fn count_ngrams<'a>(tokens: &[&'a str], n: usize) -> HashMap<Vec<&'a str>, usize> {
let mut counts: HashMap<Vec<&'a str>, usize> = HashMap::new();
if tokens.len() < n {
return counts;
}
for window in tokens.windows(n) {
*counts.entry(window.to_vec()).or_insert(0) += 1;
}
counts
}
fn clipped_ngram_count(
hyp_counts: &HashMap<Vec<&str>, usize>,
references: &[&[&str]],
n: usize,
) -> usize {
let mut clipped_total: usize = 0;
for (ngram, hyp_count) in hyp_counts {
let max_ref_count = references
.iter()
.map(|r| {
let ref_counts = count_ngrams(r, n);
*ref_counts.get(ngram.as_slice()).unwrap_or(&0)
})
.max()
.unwrap_or(0);
clipped_total += (*hyp_count).min(max_ref_count);
}
clipped_total
}
fn best_match_reference_length(hyp_len: usize, references: &[&[&str]]) -> usize {
references
.iter()
.map(|r| r.len())
.min_by_key(|&ref_len| {
let diff = ref_len.abs_diff(hyp_len);
(diff, ref_len) })
.unwrap_or(0)
}
fn brevity_penalty(hyp_len: usize, ref_len: usize) -> f64 {
if hyp_len == 0 {
return 0.0;
}
if hyp_len >= ref_len {
1.0
} else {
(1.0 - ref_len as f64 / hyp_len as f64).exp()
}
}
fn lcs_length(a: &[&str], b: &[&str]) -> usize {
let m = a.len();
let n = b.len();
let mut prev = vec![0usize; n + 1];
let mut curr = vec![0usize; n + 1];
for i in 1..=m {
for j in 1..=n {
curr[j] = if a[i - 1] == b[j - 1] { prev[j - 1] + 1 } else { curr[j - 1].max(prev[j]) };
}
std::mem::swap(&mut prev, &mut curr);
curr.fill(0);
}
prev[n]
}
#[inline]
fn safe_f1(p: f64, r: f64) -> f64 {
let denom = p + r;
if denom < f64::EPSILON {
0.0
} else {
2.0 * p * r / denom
}
}
pub fn bleu_score(
hypothesis: &[&str],
references: &[&[&str]],
max_n: usize,
smooth: bool,
) -> Result<f64, MetricError> {
if max_n == 0 {
return Err(MetricError::InvalidNgramOrder(0));
}
if references.is_empty() {
return Err(MetricError::EmptyInput(
"at least one reference is required for BLEU".to_string(),
));
}
if hypothesis.is_empty() {
return Ok(0.0);
}
let mut log_precision_sum = 0.0_f64;
let mut valid_orders = 0usize;
for n in 1..=max_n {
let prec = ngram_precision(hypothesis, references, n, smooth)?;
if prec > 0.0 {
log_precision_sum += prec.ln();
valid_orders += 1;
} else if !smooth {
return Ok(0.0);
}
}
if valid_orders == 0 {
return Ok(0.0);
}
let geo_mean = (log_precision_sum / max_n as f64).exp();
let bp = brevity_penalty(
hypothesis.len(),
best_match_reference_length(hypothesis.len(), references),
);
let score = bp * geo_mean;
if !score.is_finite() {
return Err(MetricError::MathError(format!(
"BLEU score is not finite: {score}"
)));
}
Ok(score.min(1.0))
}
pub fn ngram_precision(
hypothesis: &[&str],
references: &[&[&str]],
n: usize,
smooth: bool,
) -> Result<f64, MetricError> {
if n == 0 {
return Err(MetricError::InvalidNgramOrder(0));
}
if references.is_empty() {
return Err(MetricError::EmptyInput(
"references must not be empty".to_string(),
));
}
if hypothesis.len() < n {
return if smooth { Ok(1.0 / (1.0 + 1.0)) } else { Ok(0.0) };
}
let hyp_counts = count_ngrams(hypothesis, n);
let total_hyp: usize = hyp_counts.values().sum();
if total_hyp == 0 {
return Ok(0.0);
}
let clipped = clipped_ngram_count(&hyp_counts, references, n);
if smooth {
Ok((clipped as f64 + 1.0) / (total_hyp as f64 + 1.0))
} else {
Ok(clipped as f64 / total_hyp as f64)
}
}
pub fn rouge_n_score(
hypothesis: &[&str],
reference: &[&str],
n: usize,
) -> Result<RougeScore, MetricError> {
if n == 0 {
return Err(MetricError::InvalidNgramOrder(0));
}
let hyp_counts = count_ngrams(hypothesis, n);
let ref_counts = count_ngrams(reference, n);
let total_hyp_ngrams: usize = hyp_counts.values().sum();
let total_ref_ngrams: usize = ref_counts.values().sum();
let overlap: usize = hyp_counts
.iter()
.map(|(ng, &hc)| {
let rc = *ref_counts.get(ng).unwrap_or(&0);
hc.min(rc)
})
.sum();
let precision = if total_hyp_ngrams == 0 {
0.0
} else {
overlap as f64 / total_hyp_ngrams as f64
};
let recall = if total_ref_ngrams == 0 {
0.0
} else {
overlap as f64 / total_ref_ngrams as f64
};
let f1 = safe_f1(precision, recall);
Ok(RougeScore {
precision,
recall,
f1,
})
}
pub fn rouge_l_score(hypothesis: &[&str], reference: &[&str]) -> Result<RougeScore, MetricError> {
let lcs = lcs_length(hypothesis, reference);
let precision = if hypothesis.is_empty() { 0.0 } else { lcs as f64 / hypothesis.len() as f64 };
let recall = if reference.is_empty() { 0.0 } else { lcs as f64 / reference.len() as f64 };
let f1 = safe_f1(precision, recall);
Ok(RougeScore {
precision,
recall,
f1,
})
}
pub fn token_f1_score(predictions: &[&str], references: &[&str]) -> Result<F1Score, MetricError> {
let pred_counts = token_counts(predictions);
let ref_counts = token_counts(references);
let total_pred: usize = pred_counts.values().sum();
let total_ref: usize = ref_counts.values().sum();
let overlap: usize = pred_counts
.iter()
.map(|(tok, &pc)| {
let rc = *ref_counts.get(tok).unwrap_or(&0);
pc.min(rc)
})
.sum();
let true_positives = overlap;
let false_positives = total_pred.saturating_sub(overlap);
let false_negatives = total_ref.saturating_sub(overlap);
let precision = if total_pred == 0 { 0.0 } else { overlap as f64 / total_pred as f64 };
let recall = if total_ref == 0 { 0.0 } else { overlap as f64 / total_ref as f64 };
let f1 = safe_f1(precision, recall);
Ok(F1Score {
precision,
recall,
f1,
true_positives,
false_positives,
false_negatives,
})
}
fn token_counts<'a>(tokens: &[&'a str]) -> HashMap<&'a str, usize> {
let mut counts = HashMap::new();
for &tok in tokens {
*counts.entry(tok).or_insert(0) += 1;
}
counts
}
pub fn exact_match(prediction: &str, reference: &str) -> bool {
normalise(prediction) == normalise(reference)
}
pub fn exact_match_score(predictions: &[&str], references: &[&str]) -> Result<f64, MetricError> {
if predictions.is_empty() && references.is_empty() {
return Err(MetricError::EmptyInput(
"predictions and references are both empty".to_string(),
));
}
if predictions.len() != references.len() {
return Err(MetricError::LengthMismatch(
predictions.len(),
references.len(),
));
}
let correct = predictions
.iter()
.zip(references.iter())
.filter(|(p, r)| exact_match(p, r))
.count();
Ok(correct as f64 / predictions.len() as f64)
}
#[inline]
fn normalise(s: &str) -> String {
s.trim().to_lowercase()
}
pub fn perplexity(log_probs: &[f64]) -> Result<f64, MetricError> {
if log_probs.is_empty() {
return Err(MetricError::EmptyInput(
"log_probs must not be empty for perplexity".to_string(),
));
}
for (i, &lp) in log_probs.iter().enumerate() {
if !lp.is_finite() {
return Err(MetricError::MathError(format!(
"log_prob[{i}] = {lp} is not finite"
)));
}
if lp > f64::EPSILON {
return Err(MetricError::MathError(format!(
"log_prob[{i}] = {lp} is positive (log-probability must be <= 0)"
)));
}
}
let mean_neg_log_prob = -log_probs.iter().sum::<f64>() / log_probs.len() as f64;
let ppl = mean_neg_log_prob.exp();
if !ppl.is_finite() {
return Err(MetricError::MathError(format!(
"perplexity is not finite: {ppl}"
)));
}
Ok(ppl)
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn test_bleu_perfect_match() {
let hyp = &["the", "cat", "sat", "on", "the", "mat"];
let refs: &[&[&str]] = &[&["the", "cat", "sat", "on", "the", "mat"]];
let score = bleu_score(hyp, refs, 4, false).expect("bleu");
assert!((score - 1.0).abs() < 1e-6, "expected 1.0, got {score}");
}
#[test]
fn test_bleu_no_overlap() {
let hyp = &["the", "quick", "brown", "fox"];
let refs: &[&[&str]] = &[&["a", "lazy", "dog", "ran"]];
let hyp2 = &["alpha", "beta", "gamma", "delta"];
let refs2: &[&[&str]] = &[&["zeta", "eta", "theta", "iota"]];
let score = bleu_score(hyp2, refs2, 4, false).expect("bleu");
assert_eq!(score, 0.0, "expected 0.0 for no overlap, got {score}");
let _ = hyp;
let _ = refs;
}
#[test]
fn test_bleu_partial_overlap() {
let hyp = &["the", "cat", "sat"];
let refs: &[&[&str]] = &[&["the", "cat", "is", "on", "the", "mat"]];
let score = bleu_score(hyp, refs, 2, true).expect("bleu");
assert!(
score > 0.0 && score < 1.0,
"expected partial score, got {score}"
);
}
#[test]
fn test_bleu_empty_hypothesis_returns_zero() {
let hyp: &[&str] = &[];
let refs: &[&[&str]] = &[&["a", "b", "c"]];
let score = bleu_score(hyp, refs, 4, false).expect("bleu");
assert_eq!(score, 0.0);
}
#[test]
fn test_bleu_invalid_order_fails() {
let hyp = &["a", "b"];
let refs: &[&[&str]] = &[&["a", "b"]];
assert!(bleu_score(hyp, refs, 0, false).is_err());
}
#[test]
fn test_bleu_no_references_fails() {
let hyp = &["a", "b"];
let refs: &[&[&str]] = &[];
assert!(bleu_score(hyp, refs, 4, false).is_err());
}
#[test]
fn test_ngram_precision_unigram_perfect() {
let hyp = &["a", "b", "c"];
let refs: &[&[&str]] = &[&["a", "b", "c"]];
let p = ngram_precision(hyp, refs, 1, false).expect("ngram_precision");
assert!((p - 1.0).abs() < 1e-6);
}
#[test]
fn test_ngram_precision_zero_order_fails() {
let hyp = &["a"];
let refs: &[&[&str]] = &[&["a"]];
assert!(ngram_precision(hyp, refs, 0, false).is_err());
}
#[test]
fn test_rouge_n_perfect() {
let hyp = &["the", "cat", "sat", "on", "the", "mat"];
let ref_ = &["the", "cat", "sat", "on", "the", "mat"];
let score = rouge_n_score(hyp, ref_, 1).expect("rouge_n");
assert!(
(score.precision - 1.0).abs() < 1e-6
&& (score.recall - 1.0).abs() < 1e-6
&& (score.f1 - 1.0).abs() < 1e-6,
"expected all-1.0, got {score:?}"
);
}
#[test]
fn test_rouge_n_no_overlap() {
let hyp = &["alpha", "beta", "gamma"];
let ref_ = &["zeta", "eta", "theta"];
let score = rouge_n_score(hyp, ref_, 1).expect("rouge_n");
assert_eq!(score.precision, 0.0);
assert_eq!(score.recall, 0.0);
assert_eq!(score.f1, 0.0);
}
#[test]
fn test_rouge_n_partial_overlap() {
let hyp = &["the", "cat", "sat"];
let ref_ = &["the", "cat", "is", "on", "the", "mat"];
let score = rouge_n_score(hyp, ref_, 1).expect("rouge_n");
assert!(score.precision > 0.0 && score.precision < 1.0);
assert!(score.recall > 0.0 && score.recall < 1.0);
assert!(score.f1 > 0.0 && score.f1 < 1.0);
}
#[test]
fn test_rouge_n_invalid_order_fails() {
assert!(rouge_n_score(&["a"], &["a"], 0).is_err());
}
#[test]
fn test_rouge_n_bigram_partial() {
let hyp = &["a", "b", "c", "d"];
let ref_ = &["a", "b", "e", "f"];
let score = rouge_n_score(hyp, ref_, 2).expect("rouge_n");
assert!(score.f1 > 0.0 && score.f1 < 1.0);
}
#[test]
fn test_rouge_l_basic() {
let hyp = &["police", "killed", "the", "gunman"];
let ref_ = &["police", "kill", "the", "gunman"];
let score = rouge_l_score(hyp, ref_).expect("rouge_l");
assert!((score.recall - 3.0 / 4.0).abs() < 1e-6);
assert!((score.precision - 3.0 / 4.0).abs() < 1e-6);
}
#[test]
fn test_rouge_l_perfect_match() {
let tokens = &["a", "b", "c", "d", "e"];
let score = rouge_l_score(tokens, tokens).expect("rouge_l");
assert!((score.f1 - 1.0).abs() < 1e-6);
}
#[test]
fn test_rouge_l_empty_hypothesis() {
let hyp: &[&str] = &[];
let ref_ = &["a", "b"];
let score = rouge_l_score(hyp, ref_).expect("rouge_l");
assert_eq!(score.f1, 0.0);
}
#[test]
fn test_exact_match_true() {
assert!(exact_match("Paris", "Paris"));
}
#[test]
fn test_exact_match_case_insensitive() {
assert!(exact_match("Paris", "paris"));
assert!(exact_match("THE QUICK BROWN FOX", "the quick brown fox"));
}
#[test]
fn test_exact_match_false() {
assert!(!exact_match("Paris", "France"));
}
#[test]
fn test_exact_match_whitespace_normalised() {
assert!(exact_match(" hello ", "hello"));
}
#[test]
fn test_exact_match_score_batch() {
let preds = &["Paris", "Tokyo", "berlin"];
let refs = &["paris", "london", "Berlin"];
let score = exact_match_score(preds, refs).expect("em_score");
assert!((score - 2.0 / 3.0).abs() < 1e-6, "got {score}");
}
#[test]
fn test_exact_match_score_empty_fails() {
assert!(exact_match_score(&[], &[]).is_err());
}
#[test]
fn test_exact_match_score_length_mismatch_fails() {
assert!(exact_match_score(&["a", "b"], &["a"]).is_err());
}
#[test]
fn test_f1_perfect() {
let tokens = &["the", "cat", "sat"];
let score = token_f1_score(tokens, tokens).expect("f1");
assert!((score.f1 - 1.0).abs() < 1e-6);
assert!((score.precision - 1.0).abs() < 1e-6);
assert!((score.recall - 1.0).abs() < 1e-6);
}
#[test]
fn test_f1_partial() {
let pred = &["the", "cat", "sat"];
let ref_ = &["the", "cat", "is", "on", "the", "mat"];
let score = token_f1_score(pred, ref_).expect("f1");
assert!(score.f1 > 0.0 && score.f1 < 1.0);
assert!(score.true_positives > 0);
}
#[test]
fn test_f1_no_overlap() {
let pred = &["alpha", "beta"];
let ref_ = &["gamma", "delta"];
let score = token_f1_score(pred, ref_).expect("f1");
assert_eq!(score.f1, 0.0);
assert_eq!(score.true_positives, 0);
}
#[test]
fn test_perplexity_perfect() {
let log_probs = vec![0.0_f64; 10];
let ppl = perplexity(&log_probs).expect("perplexity");
assert!((ppl - 1.0).abs() < 1e-6, "expected 1.0, got {ppl}");
}
#[test]
fn test_perplexity_uniform() {
let n = 10.0_f64;
let log_probs = vec![-n.ln(); 50];
let ppl = perplexity(&log_probs).expect("perplexity");
assert!((ppl - n).abs() < 1e-6, "expected {n}, got {ppl}");
}
#[test]
fn test_perplexity_empty_fails() {
assert!(perplexity(&[]).is_err());
}
#[test]
fn test_perplexity_positive_log_prob_fails() {
let log_probs = vec![0.5_f64];
assert!(perplexity(&log_probs).is_err());
}
#[test]
fn test_perplexity_non_finite_fails() {
let log_probs = vec![f64::NEG_INFINITY];
assert!(perplexity(&log_probs).is_err());
}
#[test]
fn test_benchmark_result_display() {
let mut result = NlpBenchmarkResult::new();
result.bleu_4 = Some(0.423);
result.perplexity = Some(15.3);
result.exact_match = Some(0.75);
let summary = result.display_summary();
assert!(summary.contains("BLEU-4"), "summary: {summary}");
assert!(summary.contains("Perplexity"), "summary: {summary}");
assert!(summary.contains("Exact Match"), "summary: {summary}");
}
#[test]
fn test_benchmark_result_empty_display() {
let result = NlpBenchmarkResult::new();
let summary = result.display_summary();
assert!(summary.contains("no metrics"));
}
#[test]
fn test_lcs_identical() {
let a = &["a", "b", "c", "d"];
let b = &["a", "b", "c", "d"];
assert_eq!(lcs_length(a, b), 4);
}
#[test]
fn test_lcs_disjoint() {
let a = &["x", "y", "z"];
let b = &["a", "b", "c"];
assert_eq!(lcs_length(a, b), 0);
}
#[test]
fn test_lcs_partial() {
let a = &["a", "b", "c", "d"];
let b = &["b", "c", "e"];
assert_eq!(lcs_length(a, b), 2);
}
}