/// One evaluation case for the name-correction harness.
///
/// Every field is a `'static` reference, so the type is cheap to copy and is
/// `Debug`-printable for use in diagnostics.
#[derive(Debug, Clone, Copy)]
pub(crate) struct NameCase {
    /// Short identifier for the case; interpolated into assertion messages.
    pub label: &'static str,
    /// Candidate names handed to the corrector alongside `raw`.
    pub pool: &'static [&'static str],
    /// Input text given to the corrector; also the baseline used to decide
    /// which token positions are correction targets.
    pub raw: &'static str,
    /// Expected text after correction. Equal to `raw` for negative cases.
    pub truth: &'static str,
}
/// Aggregate tallies produced by scoring corrector output against corpus cases.
///
/// `Default` yields all-zero counters; `score_case` increments them in place,
/// so one report can accumulate results across many cases.
#[derive(Debug, Default, PartialEq, Eq)]
pub(crate) struct HarnessReport {
/// Target tokens (positions where `raw` differs from `truth`) for which the
/// candidate produced exactly the `truth` token.
pub recovered: usize,
/// Target tokens the candidate did not restore. When the candidate's token
/// count differs from `truth`'s, every target token of that case lands here.
pub missed: usize,
/// Non-target tokens (where `raw` already equals `truth`) that the candidate
/// changed. Not evaluated for cases counted as structural mismatches.
pub false_corrections: usize,
/// Cases whose candidate had a different number of whitespace-separated
/// tokens than `truth`.
pub structural_mismatches: usize,
}
/// Fixed corpus that `run_harness` feeds through a corrector and scores.
///
/// The first six cases are correction targets: `raw` and `truth` have the same
/// token count and differ in exactly one whitespace-separated token, with
/// `truth` using a spelling listed in `pool`. The remaining six (`neg-*`) are
/// negatives whose `truth` is identical to `raw`, i.e. the expected output is
/// the input unchanged.
pub(crate) const CORPUS: &[NameCase] = &[
// --- Correction targets: one token differs between `raw` and `truth` ---
NameCase {
label: "dutch-geert",
pool: &["Geert", "Sanne"],
raw: "thanks bert for the notes",
truth: "thanks Geert for the notes",
},
NameCase {
label: "french-jacques",
pool: &["Jacques", "Camille"],
raw: "merci jacque for joining",
truth: "merci Jacques for joining",
},
// `truth` restores the accented spelling from the pool.
NameCase {
label: "spanish-monica-accent",
pool: &["Mónica", "Diego"],
raw: "gracias monica for the update",
truth: "gracias Mónica for the update",
},
NameCase {
label: "indian-aishwarya",
pool: &["Aishwarya", "Rohan"],
raw: "over to ashwarya now",
truth: "over to Aishwarya now",
},
NameCase {
label: "chinese-xiulan",
pool: &["Xiulan", "Wei"],
raw: "shulan will present next",
truth: "Xiulan will present next",
},
NameCase {
label: "vietnamese-thanh",
pool: &["Thanh", "Linh"],
raw: "tan owns the rollout",
truth: "Thanh owns the rollout",
},
// --- Negatives: `truth` equals `raw` ---
// "mark" appears in the text while "Mark" is in the pool; `truth` keeps it lowercase.
NameCase {
label: "neg-common-word-mark",
pool: &["Mark", "Priya"],
raw: "that was a good mark on the exam",
truth: "that was a good mark on the exam",
},
// "Sarah" is already spelled exactly as in the pool.
NameCase {
label: "neg-already-correct",
pool: &["Sarah", "Tomas"],
raw: "hi Sarah how are you",
truth: "hi Sarah how are you",
},
NameCase {
label: "neg-no-name-present",
pool: &["Geert", "Mónica"],
raw: "we shipped the quarterly report",
truth: "we shipped the quarterly report",
},
NameCase {
label: "neg-distant-token",
pool: &["Aishwarya", "Jacques"],
raw: "the feature is ready to demo",
truth: "the feature is ready to demo",
},
// "we" leads the sentence while "Wei" is in the pool; `truth` keeps "we".
NameCase {
label: "neg-pronoun-in-name-position",
pool: &["Wei", "Aki"],
raw: "we will demo today",
truth: "we will demo today",
},
// Bracketed speaker/timestamp prefix with "Spencer" in the pool; `truth`
// leaves the prefix untouched.
NameCase {
label: "neg-speaker-prefix",
pool: &["Spencer"],
raw: "[SPEAKER_1 0:05] will present",
truth: "[SPEAKER_1 0:05] will present",
},
];
/// Scores one corrector output against a corpus case, adding to `report`.
///
/// `case.raw`, `case.truth` and `candidate` are split on whitespace and
/// compared position by position. A position is a *target* when the raw token
/// differs from the truth token (or the raw text has no token there).
///
/// * If `candidate` has a different token count than `truth`, the case is
///   counted once in `structural_mismatches`, each target is counted as
///   `missed`, and nothing else is evaluated.
/// * Otherwise each target is `recovered` when the candidate token equals the
///   truth token and `missed` when it does not; each non-target whose
///   candidate token differs from the raw token is a `false_corrections` hit.
///
/// Counters are only ever incremented, so the same `report` can be reused
/// across cases.
pub(crate) fn score_case(case: &NameCase, candidate: &str, report: &mut HarnessReport) {
    let raw_tokens: Vec<&str> = case.raw.split_whitespace().collect();
    let truth_tokens: Vec<&str> = case.truth.split_whitespace().collect();
    let cand_tokens: Vec<&str> = candidate.split_whitespace().collect();

    if cand_tokens.len() != truth_tokens.len() {
        // Positions no longer line up, so per-token comparison against the
        // candidate is meaningless: every target is treated as missed.
        report.structural_mismatches += 1;
        report.missed += truth_tokens
            .iter()
            .enumerate()
            .filter(|&(idx, expected)| raw_tokens.get(idx) != Some(expected))
            .count();
        return;
    }

    for (idx, expected) in truth_tokens.iter().enumerate() {
        let original = raw_tokens.get(idx);
        let produced = cand_tokens.get(idx);
        let is_target = original != Some(expected);
        let hits_truth = produced == Some(expected);

        match (is_target, hits_truth) {
            (true, true) => report.recovered += 1,
            (true, false) => report.missed += 1,
            // Non-target: the candidate must leave the raw token as it was.
            (false, _) if produced != original => report.false_corrections += 1,
            (false, _) => {}
        }
    }
}
/// Runs `correct` over every case in `CORPUS` and returns the combined score.
///
/// For each case the corrector is called with the case's raw text and name
/// pool; its output is scored with `score_case` into a single report that
/// starts from `HarnessReport::default()`.
pub(crate) fn run_harness(correct: impl Fn(&str, &[&str]) -> String) -> HarnessReport {
    CORPUS
        .iter()
        .fold(HarnessReport::default(), |mut tally, case| {
            let corrected = correct(case.raw, case.pool);
            score_case(case, &corrected, &mut tally);
            tally
        })
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Guards the corpus composition that the expected counts below rely on.
    #[test]
    fn corpus_shape_is_six_targets_six_negatives() {
        let (negatives, targets): (Vec<_>, Vec<_>) =
            CORPUS.iter().partition(|case| case.raw == case.truth);
        assert_eq!(targets.len(), 6, "expected 6 correction-target cases");
        assert_eq!(negatives.len(), 6, "expected 6 negative (no-change) cases");
    }

    /// A corrector that echoes its input recovers nothing and corrupts nothing.
    #[test]
    fn baseline_identity_misses_all_targets_with_no_false_corrections() {
        let HarnessReport {
            recovered,
            missed,
            false_corrections,
            structural_mismatches,
        } = run_harness(|text, _names| text.to_owned());

        assert_eq!(recovered, 0, "identity recovers no names");
        assert_eq!(missed, 6, "identity misses all 6 name targets");
        assert_eq!(
            false_corrections, 0,
            "identity must never corrupt a token"
        );
        assert_eq!(structural_mismatches, 0);
    }

    /// Scoring each negative case against its own raw text must be a no-op
    /// for the `missed` and `false_corrections` counters.
    #[test]
    fn negatives_unchanged_under_identity() {
        let negatives = CORPUS.iter().filter(|case| case.raw == case.truth);
        for case in negatives {
            let mut tally = HarnessReport::default();
            score_case(case, case.raw, &mut tally);
            let HarnessReport {
                missed,
                false_corrections,
                ..
            } = tally;

            assert_eq!(
                false_corrections, 0,
                "negative case {} should report no false corrections under identity",
                case.label
            );
            assert_eq!(
                missed, 0,
                "negative case {} has no name targets",
                case.label
            );
        }
    }

    /// Runs `crate::name_correction::correct_names` through the harness and
    /// requires full recovery with zero false corrections.
    #[test]
    fn post_pass_correction_recovers_without_false_corrections() {
        let report = run_harness(|raw, pool| {
            // The corrector is handed owned strings, so copy the pool over.
            let pool_vec: Vec<String> = pool.iter().copied().map(String::from).collect();
            let outcome = crate::name_correction::correct_names(raw, &pool_vec);
            outcome.0
        });
        eprintln!("NAME-CORRECTION HARNESS REPORT: {report:?}");

        assert_eq!(
            report.false_corrections, 0,
            "false corrections must be zero: {report:?}"
        );
        assert_eq!(report.structural_mismatches, 0, "{report:?}");
        assert_eq!(
            report.recovered, 6,
            "expected to recover all 6 corpus targets: {report:?}"
        );
    }

    /// Sanity check on the scorer itself: a corrector that overwrites the
    /// first lowercase-initial token with the first pool name must be flagged.
    #[test]
    fn scorer_flags_false_corrections() {
        let report = run_harness(|raw, pool| match pool.first() {
            None => raw.to_string(),
            Some(name) => {
                let mut tokens: Vec<String> =
                    raw.split_whitespace().map(String::from).collect();
                let first_lowercase = tokens
                    .iter()
                    .position(|tok| tok.starts_with(char::is_lowercase));
                if let Some(idx) = first_lowercase {
                    tokens[idx] = (*name).to_string();
                }
                tokens.join(" ")
            }
        });
        assert!(
            report.false_corrections > 0,
            "scorer must detect that the pathological corrector corrupts tokens"
        );
    }
}