// worker_matcher/nicknames.rs

1//! Nickname equivalence tables for given-name matching.
2//!
3//! Worker records routinely carry the same worker under several given-name
4//! variants — `Michael` vs `Mike`, `Elizabeth` vs `Liz`, `Robert` vs `Bob`.
5//! No string-similarity metric (Jaro-Winkler, Levenshtein, Soundex) closes
6//! that gap on its own because the variants don't share enough characters
7//! or phonemes. The fix is a small lookup table of known equivalence
8//! classes, applied as a **post-similarity boost** so the matcher can lift
9//! the given-name score when both records carry forms that the table
10//! knows about.
11//!
12//! ## API
13//!
14//! Tables are constructed via [`NicknameTable::empty`] or
15//! [`NicknameTable::english`] (a built-in default for common English
16//! nicknames). Additional classes are added with
17//! [`NicknameTable::with_class`].
18//!
19//! Two names are *equivalent* under a table iff, after
20//! [`crate::Normalizer::normalize_name`] is applied, both end up in the
21//! same equivalence class. Identical normalised strings are trivially
22//! equivalent (the table does not need to list them explicitly).
23//!
24//! ## Integration with the matcher
25//!
26//! [`crate::MatchConfig::nickname_table`] is empty by default — nicknames
27//! are an opt-in feature so existing behaviour is preserved. When a
28//! non-empty table is configured, the matcher's name scoring computes the
29//! configured similarity algorithm as usual and then **lifts the score to
30//! `0.9` if-and-only-if the table considers the pair equivalent**. The
31//! boost never lowers a score.
32//!
33//! ## Scope and limitations
34//!
35//! - English-language nicknames only. Localised tables are tracked in
36//!   `spec.md` §21 medium-term work and can be slotted in by constructing
37//!   a fresh [`NicknameTable`] at the call site.
//! - Shared nicknames are intentional: `Sandy` can be short for either
//!   `Alexandra` or `Sandra`, so it is listed in both classes and matches
//!   either canonical form; lookups return `true` when the two normalised
//!   inputs share *any* class. The relation is therefore not transitive —
//!   `Alexandra` and `Sandra` are not equivalent to each other.
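//! - Family names are out of scope. The matcher applies the table to
//!   both given and family names because `score_name` is shared, and the
//!   default English table lists given names only. Some of its entries
//!   (e.g. `thomas`, `james`, `henry`, `kim`) also occur as family names,
//!   so the boost can still fire on a family-name comparison.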
45//!
46//! # Examples
47//!
48//! ```
49//! use worker_matcher::NicknameTable;
50//!
51//! let table = NicknameTable::english();
52//! assert!(table.are_equivalent("Mike", "Michael"));
53//! assert!(table.are_equivalent("Liz", "Elizabeth"));
54//! assert!(table.are_equivalent("Bob", "Robert"));
55//! assert!(!table.are_equivalent("Mike", "Robert"));
56//!
57//! // Add a custom class on top:
58//! let table = NicknameTable::english().with_class(["Reginald", "Reggie"]);
59//! assert!(table.are_equivalent("Reggie", "Reginald"));
60//! ```
61
62use crate::normalizer::Normalizer;
63use serde::{Deserialize, Serialize};
64
65/// Equivalence-class lookup table for given-name nicknames.
66///
67/// Each class is a `Vec<String>` of normalised forms that the table
68/// considers interchangeable. Two inputs are equivalent under the table
69/// iff their normalised forms appear in the same class — or are
70/// byte-identical after normalisation.
71///
72/// The type is `Clone + Debug + PartialEq + Eq` so it composes into
73/// [`crate::MatchConfig`] without surprises. Construction is cheap and
74/// allocates once per class; lookup is `O(classes × entries)` and is
75/// dominated by the table's size, not the input string length.
76///
77/// # Example
78///
79/// ```
80/// use worker_matcher::NicknameTable;
81///
82/// let t = NicknameTable::empty()
83///     .with_class(["Michael", "Mike", "Mickey"])
84///     .with_class(["Elizabeth", "Liz", "Beth"]);
85///
86/// assert!(t.are_equivalent("Mike", "Michael"));
87/// assert!(t.are_equivalent("liz", "BETH"));
88/// assert!(!t.are_equivalent("Michael", "Liz"));
89/// ```
90#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)]
91pub struct NicknameTable {
92    classes: Vec<Vec<String>>,
93}
94
95impl NicknameTable {
96    /// Construct an empty table that considers every pair of distinct
97    /// strings non-equivalent (identical strings remain trivially equal).
98    ///
99    /// ```
100    /// use worker_matcher::NicknameTable;
101    /// let t = NicknameTable::empty();
102    /// assert!(!t.are_equivalent("Mike", "Michael"));
103    /// assert!(t.are_equivalent("Mike", "Mike"));
104    /// ```
105    pub fn empty() -> Self {
106        Self {
107            classes: Vec::new(),
108        }
109    }
110
111    /// Append an equivalence class to the table.
112    ///
113    /// Each input string is normalised via
114    /// [`crate::Normalizer::normalize_name`] before insertion so the
115    /// table is closed under the same normalisation pipeline the matcher
116    /// uses at lookup time. Duplicate or empty entries are silently
117    /// dropped, and a class with fewer than two distinct normalised
118    /// entries is dropped entirely (it would never make a pair
119    /// equivalent).
120    ///
121    /// ```
122    /// use worker_matcher::NicknameTable;
123    /// let t = NicknameTable::empty().with_class(["Robert", "Bob", "Rob"]);
124    /// assert!(t.are_equivalent("BOB", "robert"));
125    /// ```
126    pub fn with_class<I, S>(mut self, names: I) -> Self
127    where
128        I: IntoIterator<Item = S>,
129        S: AsRef<str>,
130    {
131        let mut entries: Vec<String> = Vec::new();
132        for name in names {
133            let normalised = Normalizer::normalize_name(name.as_ref());
134            if !normalised.is_empty() && !entries.contains(&normalised) {
135                entries.push(normalised);
136            }
137        }
138        if entries.len() >= 2 {
139            self.classes.push(entries);
140        }
141        self
142    }
143
144    /// Return `true` iff `a` and `b`, after name normalisation, are
145    /// considered the same worker by this table.
146    ///
147    /// Identical normalised strings are trivially equivalent. Otherwise
148    /// both inputs must appear in the same equivalence class.
149    ///
150    /// ```
151    /// use worker_matcher::NicknameTable;
152    /// let t = NicknameTable::english();
153    /// assert!(t.are_equivalent("Mike",  "Michael"));
154    /// assert!(t.are_equivalent("mike",  "MICHAEL"));   // case-insensitive
155    /// assert!(!t.are_equivalent("Mike", "Robert"));
156    /// assert!(t.are_equivalent("",       ""));         // trivially equal
157    /// ```
158    pub fn are_equivalent(&self, a: &str, b: &str) -> bool {
159        let na = Normalizer::normalize_name(a);
160        let nb = Normalizer::normalize_name(b);
161        if na == nb {
162            return true;
163        }
164        self.classes
165            .iter()
166            .any(|cls| cls.iter().any(|n| n == &na) && cls.iter().any(|n| n == &nb))
167    }
168
169    /// `true` iff the table contains no equivalence classes — equivalent
170    /// to comparing with [`NicknameTable::empty`] but cheaper to test.
171    ///
172    /// ```
173    /// use worker_matcher::NicknameTable;
174    /// assert!(NicknameTable::empty().is_empty());
175    /// assert!(!NicknameTable::english().is_empty());
176    /// ```
177    pub fn is_empty(&self) -> bool {
178        self.classes.is_empty()
179    }
180
181    /// Number of equivalence classes registered with this table.
182    ///
183    /// ```
184    /// use worker_matcher::NicknameTable;
185    /// assert_eq!(NicknameTable::empty().len(), 0);
186    /// let t = NicknameTable::empty().with_class(["A", "B"]);
187    /// assert_eq!(t.len(), 1);
188    /// ```
189    pub fn len(&self) -> usize {
190        self.classes.len()
191    }
192
193    /// A built-in table covering the most common English-language
194    /// nicknames encountered in healthcare data: `Michael`/`Mike`,
195    /// `Robert`/`Bob`, `Elizabeth`/`Liz`, and similar.
196    ///
197    /// The exact contents are not part of the public contract — entries
198    /// may be added in minor releases. Callers that need a stable
199    /// dictionary SHOULD construct their own via
200    /// [`NicknameTable::with_class`].
201    ///
202    /// ```
203    /// use worker_matcher::NicknameTable;
204    /// let t = NicknameTable::english();
205    /// assert!(t.are_equivalent("Bill",      "William"));
206    /// assert!(t.are_equivalent("Liz",       "Elizabeth"));
207    /// assert!(t.are_equivalent("Steve",     "Steven"));
208    /// assert!(t.are_equivalent("Steve",     "Stephen"));
209    /// ```
210    pub fn english() -> Self {
211        let pairs: &[&[&str]] = &[
212            &["michael", "mike", "mick", "mickey"],
213            &["robert", "bob", "rob", "robbie", "bobby"],
214            &["william", "will", "bill", "billy", "willy"],
215            &["james", "jim", "jimmy", "jamie"],
216            &["richard", "rick", "dick", "rich", "richie"],
217            &["thomas", "tom", "tommy"],
218            &[
219                "elizabeth",
220                "liz",
221                "beth",
222                "betty",
223                "eliza",
224                "lizzy",
225                "betsy",
226            ],
227            &[
228                "katherine",
229                "kate",
230                "kathy",
231                "katy",
232                "kat",
233                "cathy",
234                "katie",
235            ],
236            &[
237                "catherine",
238                "kate",
239                "kathy",
240                "katy",
241                "kat",
242                "cathy",
243                "katie",
244            ],
245            &["margaret", "maggie", "meg", "peggy", "marge"],
246            &["jennifer", "jen", "jenny", "jenn"],
247            &["patricia", "pat", "patty", "tricia", "trish"],
248            &["susan", "sue", "suzie", "susie"],
249            &["barbara", "barb", "babs"],
250            &["anthony", "tony"],
251            &["christopher", "chris", "kris"],
252            &["charles", "charlie", "chuck", "chas"],
253            &["daniel", "dan", "danny"],
254            &["david", "dave", "davy"],
255            &["edward", "ed", "eddie", "ted", "ned"],
256            &["joseph", "joe", "joey"],
257            &["kenneth", "ken", "kenny"],
258            &["nicholas", "nick", "nico"],
259            &["peter", "pete"],
260            &["samuel", "sam", "sammy"],
261            &["stephen", "steve", "stevie"],
262            &["steven", "steve", "stevie"],
263            &["timothy", "tim", "timmy"],
264            &["alexander", "alex", "xander"],
265            &["alexandra", "alex", "alexa", "sandy"],
266            &["sandra", "sandy"],
267            &["benjamin", "ben", "benny"],
268            &["rebecca", "becca", "becky"],
269            &["sarah", "sara", "sally"],
270            &["victoria", "vicky", "vic", "tori"],
271            &["matthew", "matt", "matty"],
272            &["jonathan", "jon", "jonny", "jonathon"],
273            &["frederick", "fred", "freddy", "freddie"],
274            &["lawrence", "larry"],
275            &["henry", "hank", "harry"],
276            &["ronald", "ron", "ronnie"],
277            &["donald", "don", "donnie"],
278            &["andrew", "andy", "drew"],
279            &["abigail", "abby", "gail"],
280            &["amanda", "mandy"],
281            &["isabella", "izzy", "bella"],
282            &["isabel", "izzy", "bella"],
283            &["olivia", "liv", "livy"],
284            &["nicole", "nikki"],
285            &["samantha", "sam", "sammy"],
286            &["pamela", "pam"],
287            &["deborah", "deb", "debbie"],
288            &["kimberly", "kim"],
289            &["jessica", "jess", "jessie"],
290            &["stephanie", "steph"],
291            &["madeline", "maddy", "maddie"],
292        ];
293        let mut table = Self::empty();
294        for class in pairs {
295            table = table.with_class(*class);
296        }
297        table
298    }
299}
300
/// Unit tests for [`NicknameTable`]: construction rules, lookup
/// semantics, and spot checks of the built-in English table.
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn empty_table_treats_distinct_strings_as_inequivalent() {
        let t = NicknameTable::empty();
        assert!(!t.are_equivalent("Mike", "Michael"));
        assert!(!t.are_equivalent("Liz", "Elizabeth"));
    }

    #[test]
    fn identical_normalised_strings_are_trivially_equivalent_even_when_empty() {
        let t = NicknameTable::empty();
        assert!(t.are_equivalent("Mike", "mike"));
        assert!(t.are_equivalent("MICHAEL", "michael"));
        assert!(t.are_equivalent("", ""));
    }

    #[test]
    fn with_class_normalises_entries_at_insertion() {
        let t = NicknameTable::empty().with_class(["Robert", "Bob", "Rob"]);
        assert!(t.are_equivalent("BOB", "robert"));
        assert!(t.are_equivalent("Rob", "Robert"));
    }

    #[test]
    fn with_class_dedupes_after_normalisation() {
        let t = NicknameTable::empty().with_class(["mike", "MIKE", "Mike"]);
        // All three normalise to "mike"; class collapses to a single
        // entry and is therefore dropped (no pair makes the class useful).
        assert_eq!(t.len(), 0);
    }

    #[test]
    fn with_class_drops_classes_with_fewer_than_two_distinct_entries() {
        let t = NicknameTable::empty().with_class(["Mike"]);
        assert!(t.is_empty());
    }

    #[test]
    fn with_class_drops_empty_strings_silently() {
        let t = NicknameTable::empty().with_class(["", "Mike", ""]);
        // After empties are dropped only "mike" remains → no useful class.
        assert!(t.is_empty());
    }

    #[test]
    fn english_table_covers_acceptance_criterion() {
        let t = NicknameTable::english();
        for (a, b) in [
            ("Mike", "Michael"),
            ("Liz", "Elizabeth"),
            ("Bob", "Robert"),
            ("Bill", "William"),
            ("Dick", "Richard"),
        ] {
            assert!(t.are_equivalent(a, b), "{a:?} ↮ {b:?} in english()");
        }
    }

    #[test]
    fn english_table_treats_unrelated_names_as_inequivalent() {
        let t = NicknameTable::english();
        assert!(!t.are_equivalent("Mike", "Robert"));
        assert!(!t.are_equivalent("Liz", "Tom"));
    }

    #[test]
    fn english_table_handles_shared_nicknames_across_classes() {
        let t = NicknameTable::english();
        // "Sandy" is a recognised nickname for both Alexandra and Sandra;
        // matching against either canonical succeeds.
        assert!(t.are_equivalent("Sandy", "Alexandra"));
        assert!(t.are_equivalent("Sandy", "Sandra"));
        // Steve appears in both Stephen and Steven classes for similar
        // reasons.
        assert!(t.are_equivalent("Steve", "Stephen"));
        assert!(t.are_equivalent("Steve", "Steven"));
    }

    #[test]
    fn shared_nickname_does_not_link_canonical_forms() {
        let t = NicknameTable::english();
        // Equivalence is per class, not transitive across classes: the
        // canonical names only meet through the shared nickname.
        assert!(!t.are_equivalent("Sandra", "Alexandra"));
        assert!(!t.are_equivalent("Stephen", "Steven"));
    }

    #[test]
    fn equivalence_is_symmetric() {
        let t = NicknameTable::english();
        assert!(t.are_equivalent("Mike", "Michael"));
        assert!(t.are_equivalent("Michael", "Mike"));
        assert!(!t.are_equivalent("Mike", "Robert"));
        assert!(!t.are_equivalent("Robert", "Mike"));
    }

    #[test]
    fn with_class_composes_on_top_of_english() {
        let t = NicknameTable::english().with_class(["Reginald", "Reggie"]);
        assert!(t.are_equivalent("Reggie", "Reginald"));
        // Original entries still work.
        assert!(t.are_equivalent("Mike", "Michael"));
    }

    // Covers letter case and surrounding whitespace only; punctuation
    // handling belongs to the normaliser and is not exercised here.
    #[test]
    fn lookup_is_case_and_whitespace_insensitive() {
        let t = NicknameTable::english();
        assert!(t.are_equivalent("MIKE", "michael"));
        assert!(t.are_equivalent("  Mike  ", "Michael"));
    }

    #[test]
    fn default_is_empty() {
        let t = NicknameTable::default();
        assert!(t.is_empty());
    }
}