use rphonetic::{DoubleMetaphone, Encoder};
use serde::{Deserialize, Serialize};
/// Minimum length (`str::len` of the normalized token) a token must have
/// before `match_token` will consider it under its conservative fuzzy tier.
const MIN_MISSPELL_LEN: usize = 4;
// Provenance record for a single rewrite performed by
// `correct_names_with_participants`: one entry is produced per replaced token.
// (Plain `//` comments are used here on purpose so the derived JSON schema is
// left exactly as it was.)
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize, schemars::JsonSchema)]
pub struct NameCorrection {
// The token exactly as it appeared in the input text.
pub raw: String,
// The pool name (surface form) that replaced it.
pub corrected: String,
}
/// Collects the single-word name tokens that corrections may target.
///
/// Candidate phrases are gathered in this order: the identity's own name, the
/// identity's aliases, the attendees, and finally the phrases returned by
/// `vocabulary.decode_phrases(8)`. Every phrase is split on whitespace and a
/// token is kept only when it
///
/// * consists solely of alphabetic characters,
/// * is at least two characters long, and
/// * is not a stopword once normalized.
///
/// The first occurrence of a token wins; later exact (case-sensitive)
/// duplicates are dropped, so the result preserves discovery order.
pub fn build_name_pool(
    attendees: &[String],
    identity: Option<&crate::config::IdentityConfig>,
    vocabulary: Option<&crate::vocabulary::VocabularyStore>,
) -> Vec<String> {
    let mut phrases: Vec<String> = Vec::new();
    if let Some(who) = identity {
        phrases.extend(who.name.iter().cloned());
        phrases.extend(who.aliases.iter().cloned());
    }
    phrases.extend(attendees.iter().cloned());
    if let Some(store) = vocabulary {
        phrases.extend(store.decode_phrases(8));
    }

    let mut names: Vec<String> = Vec::new();
    for phrase in &phrases {
        for word in phrase.split_whitespace() {
            let alphabetic = word.chars().all(char::is_alphabetic);
            if !alphabetic || word.chars().count() < 2 {
                continue;
            }
            if is_stopword(&normalize(word)) {
                continue;
            }
            // Keep only the first sighting of each exact spelling.
            if names.iter().all(|known| known != word) {
                names.push(word.to_string());
            }
        }
    }
    names
}
/// One single-word pool name together with the precomputed keys that
/// `match_token` compares tokens against. Built by `build_pool`.
struct PoolEntry {
/// The name as supplied (whitespace-trimmed); written into the text on a match.
surface: String,
/// `normalize(surface)`: lowercased with the accents known to `fold_char` removed.
norm: String,
/// Double Metaphone code from `dm_encode`; empty when `surface` is not ASCII.
dm: String,
/// Whether `norm` is in the participant set; required for the relaxed tier.
is_participant: bool,
}
/// Maps a lowercase accented Latin letter to its unaccented base letter.
///
/// Only the lowercase letters listed in the table are folded; every other
/// character (including uppercase accented letters) is returned unchanged, so
/// callers are expected to lowercase first.
fn fold_char(c: char) -> char {
    // Each row pairs the accented variants with the base letter they fold to.
    const FOLDS: [(&str, char); 8] = [
        ("áàâäãå", 'a'),
        ("éèêë", 'e'),
        ("íìîï", 'i'),
        ("óòôöõ", 'o'),
        ("úùûü", 'u'),
        ("ñ", 'n'),
        ("ç", 'c'),
        ("ýÿ", 'y'),
    ];
    FOLDS
        .iter()
        .find(|(accented, _)| accented.contains(c))
        .map_or(c, |&(_, base)| base)
}
fn normalize(s: &str) -> String {
s.chars()
.flat_map(char::to_lowercase)
.map(fold_char)
.collect()
}
/// Returns `true` when `token` and `surface` are different strings whose
/// lowercase forms are equal, i.e. the only difference is letter case.
/// Identical strings yield `false`.
fn differs_only_by_case(token: &str, surface: &str) -> bool {
    if token == surface {
        return false;
    }
    token.to_lowercase() == surface.to_lowercase()
}
/// Levenshtein edit distance between `a` and `b`, counted in `char`s
/// (insertions, deletions and substitutions each cost 1).
fn levenshtein(a: &str, b: &str) -> usize {
    let target: Vec<char> = b.chars().collect();
    // `row[j]` holds the distance between the processed prefix of `a` and the
    // first `j` characters of `b`; it is updated in place, one `a` char at a time.
    let mut row: Vec<usize> = (0..=target.len()).collect();
    for (i, ca) in a.chars().enumerate() {
        // Value of the previous row at column `j` (the "diagonal" neighbour).
        let mut diagonal = row[0];
        row[0] = i + 1;
        for (j, &cb) in target.iter().enumerate() {
            let above = row[j + 1];
            let substitution = diagonal + usize::from(ca != cb);
            row[j + 1] = (above + 1).min(row[j] + 1).min(substitution);
            diagonal = above;
        }
    }
    row[target.len()]
}
/// Maximum edit distance tolerated for a word of the given length:
/// one edit for words shorter than six, two edits from six upwards.
fn distance_budget(len: usize) -> usize {
    match len {
        0..=5 => 1,
        _ => 2,
    }
}
/// Double Metaphone code for `s`, or an empty string when `s` contains any
/// non-ASCII character. Callers treat an empty code as "no phonetic evidence".
fn dm_encode(dm: &DoubleMetaphone, s: &str) -> String {
    if !s.is_ascii() {
        return String::new();
    }
    dm.encode(s)
}
/// Turns the caller-supplied name pool into matchable [`PoolEntry`] values.
///
/// * `pool` — candidate names; each is whitespace-trimmed and kept only when
///   it is exactly one word.
/// * `participant_norms` — normalized words of confirmed participants; an
///   entry whose normalized form is in this set is flagged `is_participant`.
///
/// Entries keep the order in which they first appear in `pool`. Repeated
/// surfaces are collapsed into a single entry: `match_token` only corrects
/// when exactly one entry is a candidate, so a pool that happened to list the
/// same name twice would otherwise make every match on that name look
/// ambiguous and suppress the correction.
fn build_pool(
    pool: &[String],
    participant_norms: &std::collections::HashSet<String>,
) -> Vec<PoolEntry> {
    let dm = DoubleMetaphone::default();
    let mut seen: std::collections::HashSet<&str> = std::collections::HashSet::new();
    let mut entries = Vec::with_capacity(pool.len());
    for name in pool {
        let surface = name.trim();
        // Zero tokens (empty / blank) and multi-word phrases are both rejected.
        if surface.split_whitespace().count() != 1 {
            continue;
        }
        // Skip exact repeats so one name never competes with itself.
        if !seen.insert(surface) {
            continue;
        }
        let norm = normalize(surface);
        if norm.is_empty() {
            continue;
        }
        let is_participant = participant_norms.contains(&norm);
        entries.push(PoolEntry {
            surface: surface.to_string(),
            dm: dm_encode(&dm, surface),
            is_participant,
            norm,
        });
    }
    entries
}
/// Greeting / thanks words. When the word immediately before a token
/// normalizes to one of these, `in_name_position` reports a name position.
/// Entries must be in normalized form (lowercase, unaccented) because they are
/// compared against `normalize` output.
const ADDRESS_CUES: &[&str] = &[
"thanks", "thank", "hi", "hey", "hello", "dear", "ping", "merci", "gracias", "hola", "bonjour",
"ciao",
];
/// Words that are never treated as names. `build_name_pool` drops them from
/// the pool and `match_token` refuses to rewrite them. Entries are compared
/// against `normalize` output, so they must be lowercase and unaccented.
const STOPWORDS: &[&str] = &[
// Articles, conjunctions and prepositions.
"a",
"an",
"the",
"and",
"or",
"but",
"so",
"as",
"at",
"by",
"for",
"from",
"in",
"into",
"of",
"off",
"on",
"onto",
"to",
"too",
"up",
"down",
"out",
"over",
"under",
"with",
"via",
"per",
"vs",
// Pronouns, determiners and demonstratives.
"we",
"you",
"your",
"yours",
"i",
"me",
"my",
"mine",
"he",
"him",
"his",
"she",
"her",
"hers",
"it",
"its",
"they",
"them",
"their",
"theirs",
"this",
"that",
"these",
"those",
"here",
"there",
"then",
"than",
// Auxiliary, modal and other very common verbs.
"is",
"am",
"are",
"was",
"were",
"be",
"been",
"being",
"do",
"did",
"does",
"done",
"has",
"had",
"have",
"will",
"would",
"can",
"could",
"should",
"may",
"might",
"must",
"shall",
"go",
"got",
"get",
// Miscellaneous high-frequency words.
"well",
"yes",
"no",
"not",
"now",
"new",
"one",
"two",
"who",
"why",
"how",
"what",
"when",
"where",
"ok",
"okay",
"just",
"like",
"also",
"more",
"most",
"some",
"any",
"all",
"each",
"even",
"only",
"very",
"much",
"many",
"few",
"our",
"ours",
"us",
"if",
"else",
"about",
"after",
"before",
"again",
// Collective nouns used to address a group.
"team",
"group",
"staff",
"board",
"crew",
"panel",
"folks",
"everyone",
"everybody",
"guys",
// Weekdays.
"monday",
"tuesday",
"wednesday",
"thursday",
"friday",
"saturday",
"sunday",
// Months ("may" is already listed with the modal verbs above).
"january",
"february",
"march",
"april",
"june",
"july",
"august",
"september",
"october",
"november",
"december",
// Ordinary words that also occur as given names.
"mark",
"bill",
"art",
"grace",
"hope",
"min",
"rose",
"dawn",
"sunny",
"drew",
"sun",
];
/// Reports whether `norm` (an already-normalized word) is listed in `STOPWORDS`.
fn is_stopword(norm: &str) -> bool {
    STOPWORDS.iter().any(|&word| word == norm)
}
/// Verbs that typically follow a person's name ("X said", "X owns", ...).
/// When the word immediately after a token normalizes to one of these,
/// `in_name_position` reports a name position.
///
/// Entries are compared against `normalize` output, so they must be lowercase
/// and unaccented. Each cue is listed exactly once.
const NAME_VERB_CUES: &[&str] = &[
    "will",
    "owns",
    "said",
    "says",
    "asked",
    "mentioned",
    "presented",
    "joined",
    "leads",
    "wants",
    "needs",
    "added",
    "noted",
    "agreed",
    "owned",
    "presents",
];
fn in_name_position(prev_word: Option<&str>, next_word: Option<&str>) -> bool {
let prev_hit = prev_word
.map(normalize)
.is_some_and(|w| ADDRESS_CUES.contains(&w.as_str()));
let next_hit = next_word
.map(normalize)
.is_some_and(|w| NAME_VERB_CUES.contains(&w.as_str()));
prev_hit || next_hit
}
/// Decides whether `token` should be rewritten to a pool name, returning that
/// name's surface form when it should.
///
/// * `token` — the word as found in the text.
/// * `name_position` — whether the surrounding words suggest a name slot.
/// * `dm` — Double Metaphone encoder used for the phonetic comparison.
/// * `pool` — the candidate names.
///
/// `None` is returned when the token is empty or not purely alphabetic, when
/// it is a stopword, when it already equals a pool surface (exactly or up to
/// letter case), or when the number of candidate entries is not exactly one.
///
/// An entry is a candidate when its normalized form equals the token's, or —
/// for ASCII-only pairs — when one of two fuzzy tiers accepts it:
///
/// * relaxed: name position, entry is a participant, token length >= 3 and
///   edit distance <= 2;
/// * conservative: token length >= `MIN_MISSPELL_LEN`, edit distance within
///   `distance_budget` of the longer word, and either the same first letter
///   or an identical non-empty Double Metaphone code.
fn match_token(
    token: &str,
    name_position: bool,
    dm: &DoubleMetaphone,
    pool: &[PoolEntry],
) -> Option<String> {
    let alphabetic_word = !token.is_empty() && token.chars().all(char::is_alphabetic);
    if !alphabetic_word {
        return None;
    }
    let tok_norm = normalize(token);
    if is_stopword(&tok_norm) {
        return None;
    }
    let tok_dm = dm_encode(dm, token);
    let tok_len = tok_norm.len();

    let mut hit_count = 0usize;
    let mut last_hit: Option<&PoolEntry> = None;
    for entry in pool {
        // The token is already a pool name (possibly in another case): leave
        // it alone no matter what the other entries look like.
        if token == entry.surface || differs_only_by_case(token, &entry.surface) {
            return None;
        }

        let accepted = if tok_norm == entry.norm {
            // Differs only by folded accents.
            true
        } else if !(tok_norm.is_ascii() && entry.norm.is_ascii()) {
            // Fuzzy tiers are restricted to ASCII-only pairs.
            false
        } else {
            // The norms differ here, so the distance is at least 1.
            let dist = levenshtein(&tok_norm, &entry.norm);
            let relaxed = name_position && entry.is_participant && tok_len >= 3 && dist <= 2;
            let same_first = tok_norm.as_bytes().first() == entry.norm.as_bytes().first();
            let dm_match = !tok_dm.is_empty() && tok_dm == entry.dm;
            let within_budget = dist <= distance_budget(tok_len.max(entry.norm.len()));
            let conservative =
                tok_len >= MIN_MISSPELL_LEN && within_budget && (same_first || dm_match);
            relaxed || conservative
        };

        if accepted {
            hit_count += 1;
            last_hit = Some(entry);
        }
    }

    // Only an unambiguous match is acted upon.
    match (hit_count, last_hit) {
        (1, Some(entry)) => Some(entry.surface.clone()),
        _ => None,
    }
}
/// Corrects misspelled names in `text` against `pool`.
///
/// Convenience wrapper around `correct_names_with_participants` that passes
/// `pool` as the participant list as well, so every pool name is treated as a
/// participant. Returns the rewritten text and one `NameCorrection` per
/// replaced token.
pub fn correct_names(text: &str, pool: &[String]) -> (String, Vec<NameCorrection>) {
correct_names_with_participants(text, pool, pool)
}
pub fn correct_names_with_participants(
text: &str,
pool: &[String],
participants: &[String],
) -> (String, Vec<NameCorrection>) {
let participant_norms: std::collections::HashSet<String> = participants
.iter()
.flat_map(|p| p.split_whitespace())
.map(normalize)
.filter(|n| !n.is_empty())
.collect();
let entries = build_pool(pool, &participant_norms);
if entries.is_empty() {
return (text.to_string(), Vec::new());
}
let dm = DoubleMetaphone::default();
enum Seg {
Word(String),
Other(String),
}
let mut segs: Vec<Seg> = Vec::new();
let mut cur = String::new();
let mut cur_is_word = false;
for c in text.chars() {
let is_word = c.is_alphabetic();
if !cur.is_empty() && is_word != cur_is_word {
let taken = std::mem::take(&mut cur);
segs.push(if cur_is_word {
Seg::Word(taken)
} else {
Seg::Other(taken)
});
}
cur.push(c);
cur_is_word = is_word;
}
if !cur.is_empty() {
segs.push(if cur_is_word {
Seg::Word(cur)
} else {
Seg::Other(cur)
});
}
let word_positions: Vec<usize> = segs
.iter()
.enumerate()
.filter_map(|(i, s)| matches!(s, Seg::Word(_)).then_some(i))
.collect();
let word_at = |idx: Option<&usize>| -> Option<&str> {
idx.and_then(|&i| match &segs[i] {
Seg::Word(w) => Some(w.as_str()),
Seg::Other(_) => None,
})
};
let mut bracketed = vec![false; segs.len()];
let mut depth: i32 = 0;
for (i, s) in segs.iter().enumerate() {
match s {
Seg::Other(text) => {
for c in text.chars() {
match c {
'[' => depth += 1,
']' => depth = (depth - 1).max(0),
_ => {}
}
}
}
Seg::Word(_) => bracketed[i] = depth > 0,
}
}
let mut corrections = Vec::new();
let mut replacements: Vec<(usize, String)> = Vec::new();
for (k, &i) in word_positions.iter().enumerate() {
let Seg::Word(token) = &segs[i] else {
continue;
};
if bracketed[i] {
continue;
}
let prev = word_at(k.checked_sub(1).and_then(|kp| word_positions.get(kp)));
let next = word_at(word_positions.get(k + 1));
if let Some(surface) = match_token(token, in_name_position(prev, next), &dm, &entries) {
corrections.push(NameCorrection {
raw: token.clone(),
corrected: surface.clone(),
});
replacements.push((i, surface));
}
}
for (i, surface) in replacements {
segs[i] = Seg::Word(surface);
}
let out: String = segs
.iter()
.map(|s| match s {
Seg::Word(w) | Seg::Other(w) => w.as_str(),
})
.collect();
(out, corrections)
}
// Unit tests for pool construction and name correction. Each test pins one
// rule of the matcher with a small literal input.
#[cfg(test)]
mod tests {
use super::*;
// Builds an owned pool from string literals.
fn pool(names: &[&str]) -> Vec<String> {
names.iter().map(|s| s.to_string()).collect()
}
// Multi-word names are split, single letters and tokens with digits are
// dropped, and a repeated token ("Mat") appears only once.
#[test]
fn build_name_pool_collects_unique_single_name_tokens() {
let identity = crate::config::IdentityConfig {
name: Some("Mathieu Silverstein".into()),
aliases: vec!["Mat".into(), "M S".into(), "J9".into()],
..Default::default()
};
let attendees = vec![
"Sarah Chen".into(),
"Mat".into(),
"A".into(),
"D4n".into(),
"Mónica".into(),
];
let pool = build_name_pool(&attendees, Some(&identity), None);
assert_eq!(
pool,
vec!["Mathieu", "Silverstein", "Mat", "Sarah", "Chen", "Mónica"]
);
}
// An unaccented spelling is restored to the accented pool name and the
// rewrite is recorded.
#[test]
fn restores_accent_and_records_provenance() {
let (out, corr) = correct_names("gracias monica for the update", &pool(&["Mónica"]));
assert_eq!(out, "gracias Mónica for the update");
assert_eq!(corr.len(), 1);
assert_eq!(corr[0].raw, "monica");
assert_eq!(corr[0].corrected, "Mónica");
}
// One missing letter, same first letter.
#[test]
fn corrects_same_first_letter_misspelling() {
let (out, _) = correct_names("merci jacque for joining", &pool(&["Jacques"]));
assert_eq!(out, "merci Jacques for joining");
}
// "mark" is a stopword, so it is not rewritten even with "Mark" in the pool.
#[test]
fn leaves_pure_case_common_word_alone() {
let (out, corr) = correct_names("that was a good mark on the exam", &pool(&["Mark"]));
assert_eq!(out, "that was a good mark on the exam");
assert!(corr.is_empty());
}
// A token that already equals the pool name produces no correction.
#[test]
fn leaves_already_correct_name_alone() {
let (out, corr) = correct_names("hi Sarah how are you", &pool(&["Sarah"]));
assert_eq!(out, "hi Sarah how are you");
assert!(corr.is_empty());
}
// Without a cue word, a three-letter token is too short to be corrected.
#[test]
fn does_not_touch_short_tokens_outside_name_position() {
let (out, corr) = correct_names("we got a nice tan today", &pool(&["Thanh"]));
assert_eq!(out, "we got a nice tan today");
assert!(corr.is_empty());
}
// With a cue word ("thanks" before, "owns" after) looser matches are accepted.
#[test]
fn corrects_hard_cases_in_name_position() {
let (out, _) = correct_names("thanks bert for the notes", &pool(&["Geert", "Sanne"]));
assert_eq!(out, "thanks Geert for the notes");
let (out2, _) = correct_names("tan owns the rollout", &pool(&["Thanh", "Linh"]));
assert_eq!(out2, "Thanh owns the rollout");
}
// A cue word alone is not enough: "everyone" is not turned into "Geert".
#[test]
fn name_position_is_still_distance_gated() {
let (out, corr) = correct_names("thanks everyone for joining", &pool(&["Geert"]));
assert_eq!(out, "thanks everyone for joining");
assert!(corr.is_empty());
}
// Two equally plausible pool names: nothing is changed.
#[test]
fn ambiguous_match_is_left_alone() {
let (out, corr) = correct_names("ping karan", &pool(&["Karen", "Kiran"]));
assert_eq!(out, "ping karan");
assert!(corr.is_empty());
}
// Bracketed prefix and punctuation survive untouched around the correction.
#[test]
fn preserves_punctuation_and_structure() {
let (out, _) = correct_names("[SPEAKER_1 0:05] merci, jacque!", &pool(&["Jacques"]));
assert_eq!(out, "[SPEAKER_1 0:05] merci, Jacques!");
}
// No pool names: the text comes back unchanged.
#[test]
fn empty_pool_is_a_noop() {
let (out, corr) = correct_names("merci jacque", &pool(&[]));
assert_eq!(out, "merci jacque");
assert!(corr.is_empty());
}
// Stopwords ("we", "all") are never rewritten, even next to a cue word.
#[test]
fn stopword_in_name_position_is_never_corrected() {
let (out, corr) = correct_names("we will demo today", &pool(&["Wei", "Aki"]));
assert_eq!(out, "we will demo today");
assert!(corr.is_empty());
let (out2, _) = correct_names("thanks all for joining", &pool(&["Al"]));
assert_eq!(out2, "thanks all for joining");
}
// Words inside square brackets are skipped entirely.
#[test]
fn speaker_prefix_is_never_corrupted() {
let (out, corr) = correct_names("[SPEAKER_1 0:05] will present", &pool(&["Spencer"]));
assert_eq!(out, "[SPEAKER_1 0:05] will present");
assert!(corr.is_empty());
}
// Stopword tokens of attendee phrases are excluded from the pool.
#[test]
fn pool_keeps_real_names_drops_stopwords() {
let names = build_name_pool(
&["Sarah Chen".to_string(), "The Team".to_string()],
None,
None,
);
assert!(names.iter().any(|n| n == "Sarah"));
assert!(names.iter().any(|n| n == "Chen"));
assert!(!names.iter().any(|n| n.eq_ignore_ascii_case("the")));
assert!(!names.iter().any(|n| n.eq_ignore_ascii_case("team")));
let (out, _) = correct_names("we did this for them today", &names);
assert_eq!(out, "we did this for them today");
}
// Fuzzy matching is limited to ASCII pairs, so a CJK token is left alone.
#[test]
fn non_latin_token_is_not_fuzzy_matched_to_latin_name() {
let (out, corr) = correct_names("thanks 王 now", &pool(&["Al"]));
assert_eq!(out, "thanks 王 now");
assert!(corr.is_empty());
}
// An accent-only match still counts as ambiguous when a second name is close.
#[test]
fn accent_match_suppressed_when_another_name_is_also_close() {
let (out, corr) = correct_names("thanks Jose now", &pool(&["José", "Jase"]));
assert_eq!(out, "thanks Jose now");
assert!(corr.is_empty());
}
// "to" is not an address cue, so "bob" is not in a name position.
#[test]
fn dropped_preposition_cue_does_not_open_a_name_slot() {
let (out, corr) = correct_names("send this to bob", &pool(&["Rob"]));
assert_eq!(out, "send this to bob");
assert!(corr.is_empty());
}
// "The Team" contributes nothing to the pool, so "term" cannot be rewritten.
#[test]
fn collective_noun_attendee_does_not_pollute_pool() {
let names = build_name_pool(&["The Team".to_string()], None, None);
assert!(!names.iter().any(|n| n.eq_ignore_ascii_case("team")));
let (out, corr) = correct_names("the term will change", &names);
assert_eq!(out, "the term will change");
assert!(corr.is_empty());
}
// "Bill" and "June" are stopwords and stay as written before a verb cue.
#[test]
fn common_word_names_are_left_alone_in_name_position() {
let (out, _) = correct_names("Bill noted the issue", &pool(&["Will"]));
assert_eq!(out, "Bill noted the issue");
let (out2, _) = correct_names("June said yes", &pool(&["Jane"]));
assert_eq!(out2, "June said yes");
}
// Two-letter tokens are below the relaxed tier's minimum length.
#[test]
fn two_char_token_not_corrected_in_name_position() {
let (out, corr) = correct_names("thanks Bo now", &pool(&["Jo"]));
assert_eq!(out, "thanks Bo now");
assert!(corr.is_empty());
}
// The relaxed tier only fires for pool names that are also participants.
#[test]
fn relaxed_tier_requires_a_confirmed_participant() {
let names = pool(&["Geert"]);
let (gated, corr) =
correct_names_with_participants("thanks bert for the notes", &names, &[]);
assert_eq!(gated, "thanks bert for the notes");
assert!(corr.is_empty());
let (allowed, _) =
correct_names_with_participants("thanks bert for the notes", &names, &names);
assert_eq!(allowed, "thanks Geert for the notes");
}
// The conservative tier works with an empty participant list.
#[test]
fn conservative_tier_does_not_require_participant() {
let names = pool(&["Jacques"]);
let (out, _) = correct_names_with_participants("merci jacque for joining", &names, &[]);
assert_eq!(out, "merci Jacques for joining");
}
}