use std::collections::HashMap;
/// An ordered sequence of tokens, as produced by [`tokenize`].
pub type TokenSeq = Vec<String>;

/// Splits `text` on whitespace and normalizes every word into a token.
///
/// For each whitespace-separated word, all non-alphanumeric characters are
/// removed and what remains is lowercased. Words that end up empty (for
/// example pure punctuation) are dropped, so the result never contains an
/// empty string.
pub fn tokenize(text: &str) -> TokenSeq {
    let mut tokens = TokenSeq::new();
    for raw_word in text.split_whitespace() {
        let mut kept = String::new();
        for ch in raw_word.chars() {
            if ch.is_alphanumeric() {
                kept.push(ch);
            }
        }
        // Lowercase the word as a whole (not char by char) so that
        // context-sensitive case mappings behave as `String::to_lowercase` does.
        let normalized = kept.to_lowercase();
        if !normalized.is_empty() {
            tokens.push(normalized);
        }
    }
    tokens
}
/// Counts every contiguous `n`-gram in `tokens`.
///
/// Returns a map from each distinct n-gram to the number of times it occurs.
/// The map is empty when `n` is zero or when `tokens` holds fewer than `n`
/// items.
///
/// The parameter is a slice so that callers can pass sub-ranges as well as
/// whole vectors; a `&Vec<String>` argument coerces automatically.
pub fn ngram_counts(tokens: &[String], n: usize) -> HashMap<Vec<String>, usize> {
    let mut counts: HashMap<Vec<String>, usize> = HashMap::new();
    // `windows` panics on a zero width, so that case must be rejected up
    // front. A width larger than the slice simply yields no windows.
    if n == 0 {
        return counts;
    }
    for window in tokens.windows(n) {
        *counts.entry(window.to_vec()).or_insert(0) += 1;
    }
    counts
}
/// Sums the clipped matches between two n-gram count tables.
///
/// For every n-gram present in `cand_counts`, the contribution is the smaller
/// of its count in the candidate and its count in the reference; n-grams that
/// the reference lacks contribute nothing.
fn clipped_ngram_overlap(
    cand_counts: &HashMap<Vec<String>, usize>,
    ref_counts: &HashMap<Vec<String>, usize>,
) -> usize {
    cand_counts
        .iter()
        .map(|(gram, &in_candidate)| {
            ref_counts
                .get(gram)
                .map_or(0, |&in_reference| in_candidate.min(in_reference))
        })
        .sum()
}
/// ROUGE-N precision, recall and F1 for one n-gram order.
///
/// When produced for a single candidate/reference pair the ratios are based
/// on the clipped n-gram overlap; `CorpusRouge::compute` also uses this type
/// to carry per-sample averages.
#[derive(Debug, Clone)]
pub struct RougeNScore {
/// N-gram order the score refers to.
pub n: usize,
/// Overlap divided by the candidate's n-gram total (0.0 if the candidate has no n-grams).
pub precision: f32,
/// Overlap divided by the reference's n-gram total (0.0 if the reference has no n-grams).
pub recall: f32,
/// Harmonic mean of `precision` and `recall` (0.0 when both are zero).
pub f1: f32,
}
impl RougeNScore {
pub fn compute(candidate: &str, reference: &str, n: usize) -> Self {
let cand_tokens = tokenize(candidate);
let ref_tokens = tokenize(reference);
Self::from_tokens(&cand_tokens, &ref_tokens, n)
}
pub fn compute_multi_ref(candidate: &str, references: &[&str], n: usize) -> Self {
if references.is_empty() {
return Self {
n,
precision: 0.0,
recall: 0.0,
f1: 0.0,
};
}
let cand_tokens = tokenize(candidate);
references
.iter()
.map(|r| {
let ref_tokens = tokenize(r);
Self::from_tokens(&cand_tokens, &ref_tokens, n)
})
.max_by(|a, b| {
a.recall
.partial_cmp(&b.recall)
.unwrap_or(std::cmp::Ordering::Equal)
})
.unwrap_or(Self {
n,
precision: 0.0,
recall: 0.0,
f1: 0.0,
})
}
fn from_tokens(cand_tokens: &TokenSeq, ref_tokens: &TokenSeq, n: usize) -> Self {
let cand_counts = ngram_counts(cand_tokens, n);
let ref_counts = ngram_counts(ref_tokens, n);
let cand_total: usize = cand_counts.values().sum();
let ref_total: usize = ref_counts.values().sum();
let overlap = clipped_ngram_overlap(&cand_counts, &ref_counts);
let precision = if cand_total == 0 {
0.0
} else {
overlap as f32 / cand_total as f32
};
let recall = if ref_total == 0 {
0.0
} else {
overlap as f32 / ref_total as f32
};
let f1 = f1_score(precision, recall);
Self {
n,
precision,
recall,
f1,
}
}
}
/// ROUGE-L precision, recall and F1, based on the longest common subsequence
/// (LCS) of two token sequences.
#[derive(Debug, Clone)]
pub struct RougeLScore {
/// LCS length divided by the candidate's token count (0.0 for an empty candidate).
pub precision: f32,
/// LCS length divided by the reference's token count (0.0 for an empty reference).
pub recall: f32,
/// Harmonic mean of `precision` and `recall` (0.0 when both are zero).
pub f1: f32,
/// Length of the longest common subsequence, in tokens.
pub lcs_length: usize,
}
impl RougeLScore {
    /// Scores `candidate` against `reference` using their longest common
    /// subsequence of tokens.
    ///
    /// Both texts are tokenized with [`tokenize`]. Precision divides the LCS
    /// length by the candidate's token count and recall by the reference's;
    /// an empty side gets 0.0 instead of a division by zero.
    pub fn compute(candidate: &str, reference: &str) -> Self {
        let cand_tokens = tokenize(candidate);
        let ref_tokens = tokenize(reference);
        let lcs_len = Self::lcs_length(&cand_tokens, &ref_tokens);
        let cand_len = cand_tokens.len();
        let ref_len = ref_tokens.len();
        let precision = if cand_len == 0 {
            0.0
        } else {
            lcs_len as f32 / cand_len as f32
        };
        let recall = if ref_len == 0 {
            0.0
        } else {
            lcs_len as f32 / ref_len as f32
        };
        let f1 = f1_score(precision, recall);
        Self {
            precision,
            recall,
            f1,
            lcs_length: lcs_len,
        }
    }

    /// Returns the length of the longest common subsequence of `a` and `b`.
    ///
    /// Uses the classic dynamic-programming recurrence but keeps only the
    /// previous and the current row, so memory is proportional to the shorter
    /// input rather than to the product of both lengths. Returns 0 when either
    /// input is empty.
    pub fn lcs_length(a: &TokenSeq, b: &TokenSeq) -> usize {
        // LCS is symmetric, so put the shorter sequence on the column axis to
        // keep the two rows as small as possible.
        let (rows, cols) = if a.len() >= b.len() { (a, b) } else { (b, a) };
        if cols.is_empty() {
            return 0;
        }
        let width = cols.len();
        let mut prev = vec![0usize; width + 1];
        let mut curr = vec![0usize; width + 1];
        for row_tok in rows {
            for (j, col_tok) in cols.iter().enumerate() {
                curr[j + 1] = if row_tok == col_tok {
                    prev[j] + 1
                } else {
                    prev[j + 1].max(curr[j])
                };
            }
            // Index 0 is never written and stays 0 in both rows; every other
            // cell of the recycled row is overwritten on the next pass.
            std::mem::swap(&mut prev, &mut curr);
        }
        prev[width]
    }
}
/// ROUGE-S precision, recall and F1, based on skip-bigrams (ordered token
/// pairs with any gap between them).
#[derive(Debug, Clone)]
pub struct RougeSScore {
/// Clipped skip-bigram overlap divided by the candidate's skip-bigram count.
pub precision: f32,
/// Clipped skip-bigram overlap divided by the reference's skip-bigram count.
pub recall: f32,
/// Harmonic mean of `precision` and `recall` (0.0 when both are zero).
pub f1: f32,
}
impl RougeSScore {
    /// Scores `candidate` against `reference` using skip-bigram overlap.
    ///
    /// Both texts are tokenized with [`tokenize`], expanded into all ordered
    /// token pairs, and compared by clipped counts. A side that produces no
    /// pairs gets a ratio of 0.0.
    pub fn compute(candidate: &str, reference: &str) -> Self {
        let cand_pairs = Self::skip_bigrams(&tokenize(candidate));
        let ref_pairs = Self::skip_bigrams(&tokenize(reference));
        let cand_tally = bigram_counts(&cand_pairs);
        let ref_tally = bigram_counts(&ref_pairs);

        // Each pair counts at most as often as it appears on the other side.
        let mut matched = 0usize;
        for (pair, &in_candidate) in &cand_tally {
            if let Some(&in_reference) = ref_tally.get(pair) {
                matched += in_candidate.min(in_reference);
            }
        }

        let ratio_of = |total: usize| -> f32 {
            if total == 0 {
                0.0
            } else {
                matched as f32 / total as f32
            }
        };
        let precision = ratio_of(cand_pairs.len());
        let recall = ratio_of(ref_pairs.len());

        Self {
            precision,
            recall,
            f1: f1_score(precision, recall),
        }
    }

    /// Lists every ordered pair `(tokens[i], tokens[j])` with `i < j`,
    /// grouped by `i` and, within a group, in increasing `j`.
    fn skip_bigrams(tokens: &TokenSeq) -> Vec<(String, String)> {
        tokens
            .iter()
            .enumerate()
            .flat_map(|(idx, first)| {
                tokens[idx + 1..]
                    .iter()
                    .map(move |second| (first.clone(), second.clone()))
            })
            .collect()
    }
}
/// Tallies how many times each token pair occurs in `bigrams`.
fn bigram_counts(bigrams: &[(String, String)]) -> HashMap<(String, String), usize> {
    bigrams
        .iter()
        .cloned()
        .fold(HashMap::new(), |mut tally, pair| {
            *tally.entry(pair).or_insert(0) += 1;
            tally
        })
}
/// Corpus-level ROUGE results: per-sample scores averaged over all pairs.
///
/// The `Default` value (all scores `None`, zero samples) is what
/// `CorpusRouge::compute` returns for an empty corpus.
#[derive(Debug, Clone, Default)]
pub struct CorpusRouge {
/// Mean ROUGE-1 precision/recall/F1 across samples.
pub rouge_1: Option<RougeNScore>,
/// Mean ROUGE-2 precision/recall/F1 across samples.
pub rouge_2: Option<RougeNScore>,
/// Mean ROUGE-L precision/recall/F1 and mean LCS length across samples.
pub rouge_l: Option<RougeLScore>,
/// Number of candidate/reference pairs that were scored.
pub num_samples: usize,
}
impl CorpusRouge {
    /// Scores every `(candidate, reference)` pair and averages the results.
    ///
    /// ROUGE-1, ROUGE-2 and ROUGE-L are computed per pair; each reported
    /// precision, recall and F1 is the arithmetic mean of the per-pair values
    /// (summed in `f64`, stored as `f32`). The reported `lcs_length` is the
    /// summed LCS length divided by the pair count using integer division, so
    /// it is rounded down. An empty `pairs` slice yields `Self::default()`.
    pub fn compute(pairs: &[(&str, &str)]) -> Self {
        let sample_count = pairs.len();
        if sample_count == 0 {
            return Self::default();
        }

        let mut unigram_acc = RougeAccum::default();
        let mut bigram_acc = RougeAccum::default();
        let (mut l_precision, mut l_recall, mut l_f1) = (0.0f64, 0.0f64, 0.0f64);
        let mut lcs_total = 0usize;

        for (candidate, reference) in pairs.iter().copied() {
            unigram_acc.add(&RougeNScore::compute(candidate, reference, 1));
            bigram_acc.add(&RougeNScore::compute(candidate, reference, 2));

            let l_score = RougeLScore::compute(candidate, reference);
            l_precision += f64::from(l_score.precision);
            l_recall += f64::from(l_score.recall);
            l_f1 += f64::from(l_score.f1);
            lcs_total += l_score.lcs_length;
        }

        let divisor = sample_count as f64;
        let mean = |total: f64| (total / divisor) as f32;

        Self {
            rouge_1: Some(RougeNScore {
                n: 1,
                precision: mean(unigram_acc.precision),
                recall: mean(unigram_acc.recall),
                f1: mean(unigram_acc.f1),
            }),
            rouge_2: Some(RougeNScore {
                n: 2,
                precision: mean(bigram_acc.precision),
                recall: mean(bigram_acc.recall),
                f1: mean(bigram_acc.f1),
            }),
            rouge_l: Some(RougeLScore {
                precision: mean(l_precision),
                recall: mean(l_recall),
                f1: mean(l_f1),
                // Integer division: the mean LCS length is truncated.
                lcs_length: lcs_total / sample_count,
            }),
            num_samples: sample_count,
        }
    }

    /// Formats the sample count and the three F1 values on one line.
    ///
    /// A score that is `None` is shown as 0.0.
    pub fn summary(&self) -> String {
        let unigram_f1 = match &self.rouge_1 {
            Some(score) => score.f1,
            None => 0.0,
        };
        let bigram_f1 = match &self.rouge_2 {
            Some(score) => score.f1,
            None => 0.0,
        };
        let lcs_f1 = match &self.rouge_l {
            Some(score) => score.f1,
            None => 0.0,
        };
        format!(
            "CorpusROUGE(n={}) ROUGE-1 F1={:.4} | ROUGE-2 F1={:.4} | ROUGE-L F1={:.4}",
            self.num_samples, unigram_f1, bigram_f1, lcs_f1,
        )
    }
}
/// Running `f64` sums of ROUGE-N precision, recall and F1 values.
#[derive(Default)]
struct RougeAccum {
// Sum of the precision values added so far.
precision: f64,
// Sum of the recall values added so far.
recall: f64,
// Sum of the F1 values added so far.
f1: f64,
}
impl RougeAccum {
    /// Adds the precision, recall and F1 of `score` to the running totals,
    /// widening each value from `f32` to `f64` first.
    fn add(&mut self, score: &RougeNScore) {
        let (sample_precision, sample_recall, sample_f1) =
            (score.precision, score.recall, score.f1);
        self.precision += f64::from(sample_precision);
        self.recall += f64::from(sample_recall);
        self.f1 += f64::from(sample_f1);
    }
}
/// Returns the harmonic mean of `precision` and `recall`.
///
/// Yields 0.0 when the two values sum to zero, avoiding a division by zero.
#[inline]
fn f1_score(precision: f32, recall: f32) -> f32 {
    let total = precision + recall;
    if total != 0.0 {
        (2.0 * precision * recall) / total
    } else {
        0.0
    }
}