worker_matcher/nicknames.rs
1//! Nickname equivalence tables for given-name matching.
2//!
3//! Worker records routinely carry the same worker under several given-name
4//! variants — `Michael` vs `Mike`, `Elizabeth` vs `Liz`, `Robert` vs `Bob`.
5//! No string-similarity metric (Jaro-Winkler, Levenshtein, Soundex) closes
6//! that gap on its own because the variants don't share enough characters
7//! or phonemes. The fix is a small lookup table of known equivalence
8//! classes, applied as a **post-similarity boost** so the matcher can lift
9//! the given-name score when both records carry forms that the table
10//! knows about.
11//!
12//! ## API
13//!
14//! Tables are constructed via [`NicknameTable::empty`] or
15//! [`NicknameTable::english`] (a built-in default for common English
16//! nicknames). Additional classes are added with
17//! [`NicknameTable::with_class`].
18//!
19//! Two names are *equivalent* under a table iff, after
20//! [`crate::Normalizer::normalize_name`] is applied, both end up in the
21//! same equivalence class. Identical normalised strings are trivially
22//! equivalent (the table does not need to list them explicitly).
23//!
24//! ## Integration with the matcher
25//!
26//! [`crate::MatchConfig::nickname_table`] is empty by default — nicknames
27//! are an opt-in feature so existing behaviour is preserved. When a
28//! non-empty table is configured, the matcher's name scoring computes the
29//! configured similarity algorithm as usual and then **lifts the score to
30//! `0.9` if-and-only-if the table considers the pair equivalent**. The
31//! boost never lowers a score.
32//!
33//! ## Scope and limitations
34//!
35//! - English-language nicknames only. Localised tables are tracked in
36//! `spec.md` §21 medium-term work and can be slotted in by constructing
37//! a fresh [`NicknameTable`] at the call site.
//! - Shared nicknames are intentional: `Sandy` can be a nickname for
//!   `Alexandra` or `Sandra`, so it is listed in both classes and matches
//!   either canonical form. Lookups return `true` when the two normalised
//!   inputs share *any* class; classes are never merged, so `Alexandra`
//!   and `Sandra` are not equivalent to each other.
42//! - Family names are out of scope. The matcher applies the table to
43//! both given and family names because `score_name` is shared, but the
44//! default English table contains no family-name entries.
45//!
46//! # Examples
47//!
48//! ```
49//! use worker_matcher::NicknameTable;
50//!
51//! let table = NicknameTable::english();
52//! assert!(table.are_equivalent("Mike", "Michael"));
53//! assert!(table.are_equivalent("Liz", "Elizabeth"));
54//! assert!(table.are_equivalent("Bob", "Robert"));
55//! assert!(!table.are_equivalent("Mike", "Robert"));
56//!
57//! // Add a custom class on top:
58//! let table = NicknameTable::english().with_class(["Reginald", "Reggie"]);
59//! assert!(table.are_equivalent("Reggie", "Reginald"));
60//! ```
61
62use crate::normalizer::Normalizer;
63use serde::{Deserialize, Serialize};
64
65/// Equivalence-class lookup table for given-name nicknames.
66///
67/// Each class is a `Vec<String>` of normalised forms that the table
68/// considers interchangeable. Two inputs are equivalent under the table
69/// iff their normalised forms appear in the same class — or are
70/// byte-identical after normalisation.
71///
72/// The type is `Clone + Debug + PartialEq + Eq` so it composes into
73/// [`crate::MatchConfig`] without surprises. Construction is cheap and
74/// allocates once per class; lookup is `O(classes × entries)` and is
75/// dominated by the table's size, not the input string length.
76///
77/// # Example
78///
79/// ```
80/// use worker_matcher::NicknameTable;
81///
82/// let t = NicknameTable::empty()
83/// .with_class(["Michael", "Mike", "Mickey"])
84/// .with_class(["Elizabeth", "Liz", "Beth"]);
85///
86/// assert!(t.are_equivalent("Mike", "Michael"));
87/// assert!(t.are_equivalent("liz", "BETH"));
88/// assert!(!t.are_equivalent("Michael", "Liz"));
89/// ```
90#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)]
91pub struct NicknameTable {
92 classes: Vec<Vec<String>>,
93}
94
95impl NicknameTable {
96 /// Construct an empty table that considers every pair of distinct
97 /// strings non-equivalent (identical strings remain trivially equal).
98 ///
99 /// ```
100 /// use worker_matcher::NicknameTable;
101 /// let t = NicknameTable::empty();
102 /// assert!(!t.are_equivalent("Mike", "Michael"));
103 /// assert!(t.are_equivalent("Mike", "Mike"));
104 /// ```
105 pub fn empty() -> Self {
106 Self {
107 classes: Vec::new(),
108 }
109 }
110
111 /// Append an equivalence class to the table.
112 ///
113 /// Each input string is normalised via
114 /// [`crate::Normalizer::normalize_name`] before insertion so the
115 /// table is closed under the same normalisation pipeline the matcher
116 /// uses at lookup time. Duplicate or empty entries are silently
117 /// dropped, and a class with fewer than two distinct normalised
118 /// entries is dropped entirely (it would never make a pair
119 /// equivalent).
120 ///
121 /// ```
122 /// use worker_matcher::NicknameTable;
123 /// let t = NicknameTable::empty().with_class(["Robert", "Bob", "Rob"]);
124 /// assert!(t.are_equivalent("BOB", "robert"));
125 /// ```
126 pub fn with_class<I, S>(mut self, names: I) -> Self
127 where
128 I: IntoIterator<Item = S>,
129 S: AsRef<str>,
130 {
131 let mut entries: Vec<String> = Vec::new();
132 for name in names {
133 let normalised = Normalizer::normalize_name(name.as_ref());
134 if !normalised.is_empty() && !entries.contains(&normalised) {
135 entries.push(normalised);
136 }
137 }
138 if entries.len() >= 2 {
139 self.classes.push(entries);
140 }
141 self
142 }
143
144 /// Return `true` iff `a` and `b`, after name normalisation, are
145 /// considered the same worker by this table.
146 ///
147 /// Identical normalised strings are trivially equivalent. Otherwise
148 /// both inputs must appear in the same equivalence class.
149 ///
150 /// ```
151 /// use worker_matcher::NicknameTable;
152 /// let t = NicknameTable::english();
153 /// assert!(t.are_equivalent("Mike", "Michael"));
154 /// assert!(t.are_equivalent("mike", "MICHAEL")); // case-insensitive
155 /// assert!(!t.are_equivalent("Mike", "Robert"));
156 /// assert!(t.are_equivalent("", "")); // trivially equal
157 /// ```
158 pub fn are_equivalent(&self, a: &str, b: &str) -> bool {
159 let na = Normalizer::normalize_name(a);
160 let nb = Normalizer::normalize_name(b);
161 if na == nb {
162 return true;
163 }
164 self.classes
165 .iter()
166 .any(|cls| cls.iter().any(|n| n == &na) && cls.iter().any(|n| n == &nb))
167 }
168
169 /// `true` iff the table contains no equivalence classes — equivalent
170 /// to comparing with [`NicknameTable::empty`] but cheaper to test.
171 ///
172 /// ```
173 /// use worker_matcher::NicknameTable;
174 /// assert!(NicknameTable::empty().is_empty());
175 /// assert!(!NicknameTable::english().is_empty());
176 /// ```
177 pub fn is_empty(&self) -> bool {
178 self.classes.is_empty()
179 }
180
181 /// Number of equivalence classes registered with this table.
182 ///
183 /// ```
184 /// use worker_matcher::NicknameTable;
185 /// assert_eq!(NicknameTable::empty().len(), 0);
186 /// let t = NicknameTable::empty().with_class(["A", "B"]);
187 /// assert_eq!(t.len(), 1);
188 /// ```
189 pub fn len(&self) -> usize {
190 self.classes.len()
191 }
192
193 /// A built-in table covering the most common English-language
194 /// nicknames encountered in healthcare data: `Michael`/`Mike`,
195 /// `Robert`/`Bob`, `Elizabeth`/`Liz`, and similar.
196 ///
197 /// The exact contents are not part of the public contract — entries
198 /// may be added in minor releases. Callers that need a stable
199 /// dictionary SHOULD construct their own via
200 /// [`NicknameTable::with_class`].
201 ///
202 /// ```
203 /// use worker_matcher::NicknameTable;
204 /// let t = NicknameTable::english();
205 /// assert!(t.are_equivalent("Bill", "William"));
206 /// assert!(t.are_equivalent("Liz", "Elizabeth"));
207 /// assert!(t.are_equivalent("Steve", "Steven"));
208 /// assert!(t.are_equivalent("Steve", "Stephen"));
209 /// ```
210 pub fn english() -> Self {
211 let pairs: &[&[&str]] = &[
212 &["michael", "mike", "mick", "mickey"],
213 &["robert", "bob", "rob", "robbie", "bobby"],
214 &["william", "will", "bill", "billy", "willy"],
215 &["james", "jim", "jimmy", "jamie"],
216 &["richard", "rick", "dick", "rich", "richie"],
217 &["thomas", "tom", "tommy"],
218 &[
219 "elizabeth",
220 "liz",
221 "beth",
222 "betty",
223 "eliza",
224 "lizzy",
225 "betsy",
226 ],
227 &[
228 "katherine",
229 "kate",
230 "kathy",
231 "katy",
232 "kat",
233 "cathy",
234 "katie",
235 ],
236 &[
237 "catherine",
238 "kate",
239 "kathy",
240 "katy",
241 "kat",
242 "cathy",
243 "katie",
244 ],
245 &["margaret", "maggie", "meg", "peggy", "marge"],
246 &["jennifer", "jen", "jenny", "jenn"],
247 &["patricia", "pat", "patty", "tricia", "trish"],
248 &["susan", "sue", "suzie", "susie"],
249 &["barbara", "barb", "babs"],
250 &["anthony", "tony"],
251 &["christopher", "chris", "kris"],
252 &["charles", "charlie", "chuck", "chas"],
253 &["daniel", "dan", "danny"],
254 &["david", "dave", "davy"],
255 &["edward", "ed", "eddie", "ted", "ned"],
256 &["joseph", "joe", "joey"],
257 &["kenneth", "ken", "kenny"],
258 &["nicholas", "nick", "nico"],
259 &["peter", "pete"],
260 &["samuel", "sam", "sammy"],
261 &["stephen", "steve", "stevie"],
262 &["steven", "steve", "stevie"],
263 &["timothy", "tim", "timmy"],
264 &["alexander", "alex", "xander"],
265 &["alexandra", "alex", "alexa", "sandy"],
266 &["sandra", "sandy"],
267 &["benjamin", "ben", "benny"],
268 &["rebecca", "becca", "becky"],
269 &["sarah", "sara", "sally"],
270 &["victoria", "vicky", "vic", "tori"],
271 &["matthew", "matt", "matty"],
272 &["jonathan", "jon", "jonny", "jonathon"],
273 &["frederick", "fred", "freddy", "freddie"],
274 &["lawrence", "larry"],
275 &["henry", "hank", "harry"],
276 &["ronald", "ron", "ronnie"],
277 &["donald", "don", "donnie"],
278 &["andrew", "andy", "drew"],
279 &["abigail", "abby", "gail"],
280 &["amanda", "mandy"],
281 &["isabella", "izzy", "bella"],
282 &["isabel", "izzy", "bella"],
283 &["olivia", "liv", "livy"],
284 &["nicole", "nikki"],
285 &["samantha", "sam", "sammy"],
286 &["pamela", "pam"],
287 &["deborah", "deb", "debbie"],
288 &["kimberly", "kim"],
289 &["jessica", "jess", "jessie"],
290 &["stephanie", "steph"],
291 &["madeline", "maddy", "maddie"],
292 ];
293 let mut table = Self::empty();
294 for class in pairs {
295 table = table.with_class(*class);
296 }
297 table
298 }
299}
300
#[cfg(test)]
mod tests {
    //! Unit tests for [`NicknameTable`]: construction rules enforced by
    //! `with_class`, lookup semantics of `are_equivalent`, and spot checks
    //! of the built-in English table.

    use super::*;

    #[test]
    fn empty_table_treats_distinct_strings_as_inequivalent() {
        let t = NicknameTable::empty();
        assert!(!t.are_equivalent("Mike", "Michael"));
        assert!(!t.are_equivalent("Liz", "Elizabeth"));
    }

    #[test]
    fn identical_normalised_strings_are_trivially_equivalent_even_when_empty() {
        let t = NicknameTable::empty();
        assert!(t.are_equivalent("Mike", "mike"));
        assert!(t.are_equivalent("MICHAEL", "michael"));
        assert!(t.are_equivalent("", ""));
    }

    #[test]
    fn with_class_normalises_entries_at_insertion() {
        let t = NicknameTable::empty().with_class(["Robert", "Bob", "Rob"]);
        assert!(t.are_equivalent("BOB", "robert"));
        assert!(t.are_equivalent("Rob", "Robert"));
    }

    #[test]
    fn with_class_dedupes_after_normalisation() {
        let t = NicknameTable::empty().with_class(["mike", "MIKE", "Mike"]);
        // All three normalise to "mike"; class collapses to a single
        // entry and is therefore dropped (no pair makes the class useful).
        assert_eq!(t.len(), 0);
    }

    #[test]
    fn with_class_drops_classes_with_fewer_than_two_distinct_entries() {
        let t = NicknameTable::empty().with_class(["Mike"]);
        assert!(t.is_empty());
    }

    #[test]
    fn with_class_drops_empty_strings_silently() {
        let t = NicknameTable::empty().with_class(["", "Mike", ""]);
        // After empties are dropped only "mike" remains → no useful class.
        assert!(t.is_empty());
    }

    #[test]
    fn len_counts_only_classes_that_were_kept() {
        let t = NicknameTable::empty()
            .with_class(["Robert", "Bob"])
            .with_class(["Mike"]) // dropped: a single entry
            .with_class(["Elizabeth", "Liz"]);
        assert_eq!(t.len(), 2);
        assert!(!t.is_empty());
    }

    #[test]
    fn english_table_covers_acceptance_criterion() {
        let t = NicknameTable::english();
        for (a, b) in [
            ("Mike", "Michael"),
            ("Liz", "Elizabeth"),
            ("Bob", "Robert"),
            ("Bill", "William"),
            ("Dick", "Richard"),
        ] {
            assert!(t.are_equivalent(a, b), "{a:?} ↮ {b:?} in english()");
        }
    }

    #[test]
    fn english_table_treats_unrelated_names_as_inequivalent() {
        let t = NicknameTable::english();
        assert!(!t.are_equivalent("Mike", "Robert"));
        assert!(!t.are_equivalent("Liz", "Tom"));
    }

    #[test]
    fn english_table_handles_shared_nicknames_across_classes() {
        let t = NicknameTable::english();
        // "Sandy" is a recognised nickname for both Alexandra and Sandra;
        // matching against either canonical succeeds.
        assert!(t.are_equivalent("Sandy", "Alexandra"));
        assert!(t.are_equivalent("Sandy", "Sandra"));
        // Steve appears in both Stephen and Steven classes for similar
        // reasons.
        assert!(t.are_equivalent("Steve", "Stephen"));
        assert!(t.are_equivalent("Steve", "Steven"));
    }

    #[test]
    fn shared_nickname_does_not_merge_classes() {
        // Both classes contain "Sandy", but no single class contains
        // both canonical names, so they stay inequivalent.
        let t = NicknameTable::empty()
            .with_class(["Alexandra", "Sandy"])
            .with_class(["Sandra", "Sandy"]);
        assert!(t.are_equivalent("Sandy", "Alexandra"));
        assert!(t.are_equivalent("Sandy", "Sandra"));
        assert!(!t.are_equivalent("Alexandra", "Sandra"));
    }

    #[test]
    fn equivalence_is_symmetric() {
        let t = NicknameTable::english();
        for (a, b) in [("Mike", "Michael"), ("Mike", "Robert"), ("Liz", "Beth")] {
            assert_eq!(
                t.are_equivalent(a, b),
                t.are_equivalent(b, a),
                "asymmetric result for {a:?} / {b:?}"
            );
        }
    }

    #[test]
    fn with_class_composes_on_top_of_english() {
        let t = NicknameTable::english().with_class(["Reginald", "Reggie"]);
        assert!(t.are_equivalent("Reggie", "Reginald"));
        // Original entries still work.
        assert!(t.are_equivalent("Mike", "Michael"));
    }

    // Only case and surrounding whitespace are exercised here; the name
    // says so to avoid implying coverage of punctuation handling.
    #[test]
    fn lookup_is_case_and_whitespace_insensitive() {
        let t = NicknameTable::english();
        assert!(t.are_equivalent("MIKE", "michael"));
        assert!(t.are_equivalent("  Mike  ", "Michael"));
    }

    #[test]
    fn default_is_empty() {
        let t = NicknameTable::default();
        assert!(t.is_empty());
    }
}
402}