worker-matcher 0.6.1

Worker matcher for healthcare information exchange: deterministic and probabilistic matching with multinational national identifiers (UK NHS / FR NIR / ES TSI / IE IHI / UK NI H&C / US SSN), E.164 phone normalisation, address parsing, nickname dictionary, email scoring, and explainable per-field breakdowns.
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
//! Nickname equivalence tables for given-name matching.
//!
//! Worker records routinely carry the same worker under several given-name
//! variants — `Michael` vs `Mike`, `Elizabeth` vs `Liz`, `Robert` vs `Bob`.
//! No string-similarity metric (Jaro-Winkler, Levenshtein, Soundex) closes
//! that gap on its own because the variants don't share enough characters
//! or phonemes. The fix is a small lookup table of known equivalence
//! classes, applied as a **post-similarity boost** so the matcher can lift
//! the given-name score when both records carry forms that the table
//! knows about.
//!
//! ## API
//!
//! Tables are constructed via [`NicknameTable::empty`] or
//! [`NicknameTable::english`] (a built-in default for common English
//! nicknames). Additional classes are added with
//! [`NicknameTable::with_class`].
//!
//! Two names are *equivalent* under a table iff, after
//! [`crate::Normalizer::normalize_name`] is applied, both end up in the
//! same equivalence class. Identical normalised strings are trivially
//! equivalent (the table does not need to list them explicitly).
//!
//! ## Integration with the matcher
//!
//! [`crate::MatchConfig::nickname_table`] is empty by default — nicknames
//! are an opt-in feature so existing behaviour is preserved. When a
//! non-empty table is configured, the matcher's name scoring computes the
//! configured similarity algorithm as usual and then **lifts the score to
//! `0.9` if-and-only-if the table considers the pair equivalent**. The
//! boost never lowers a score.
//!
//! ## Scope and limitations
//!
//! - English-language nicknames only. Localised tables are tracked in
//!   `spec.md` §21 medium-term work and can be slotted in by constructing
//!   a fresh [`NicknameTable`] at the call site.
//! - Shared nicknames are intentional: `Sandy` can be a nickname for
//!   `Alexandra` or `Sandra`. Both are listed so that either canonical
//!   form matches `Sandy`; lookups return `true` when the two normalised
//!   inputs share *any* class. Equivalence is therefore not transitive:
//!   `Alexandra` and `Sandra` each match `Sandy` but not each other.
//! - Family names are out of scope. The matcher applies the table to
//!   both given and family names because `score_name` is shared, but the
//!   default English table contains no family-name entries.
//!
//! # Examples
//!
//! ```
//! use worker_matcher::NicknameTable;
//!
//! let table = NicknameTable::english();
//! assert!(table.are_equivalent("Mike", "Michael"));
//! assert!(table.are_equivalent("Liz", "Elizabeth"));
//! assert!(table.are_equivalent("Bob", "Robert"));
//! assert!(!table.are_equivalent("Mike", "Robert"));
//!
//! // Add a custom class on top:
//! let table = NicknameTable::english().with_class(["Reginald", "Reggie"]);
//! assert!(table.are_equivalent("Reggie", "Reginald"));
//! ```

use crate::normalizer::Normalizer;
use serde::{Deserialize, Serialize};

/// Equivalence-class lookup table for given-name nicknames.
///
/// Each class is a `Vec<String>` of normalised forms that the table
/// considers interchangeable. Two inputs are equivalent under the table
/// iff their normalised forms appear in the same class — or are
/// byte-identical after normalisation. A form may be listed in several
/// classes; equivalence is decided class by class, so it is not
/// transitive across classes.
///
/// The type is `Clone + Debug + PartialEq + Eq + Default` and derives
/// serde's `Serialize`/`Deserialize`, so it composes into
/// [`crate::MatchConfig`] without surprises. The derived equality
/// compares classes and their entries in insertion order, so two tables
/// holding the same classes in a different order compare unequal.
/// Lookup is a linear scan, `O(classes × entries)`, and is dominated by
/// the table's size, not the input string length.
///
/// NOTE(review): the derived `Deserialize` fills the class list
/// directly, so deserialised entries do not pass through the
/// normalisation and filtering performed by
/// [`NicknameTable::with_class`] — confirm that persisted tables are
/// only ever produced by this type's own `Serialize`.
///
/// # Example
///
/// ```
/// use worker_matcher::NicknameTable;
///
/// let t = NicknameTable::empty()
///     .with_class(["Michael", "Mike", "Mickey"])
///     .with_class(["Elizabeth", "Liz", "Beth"]);
///
/// assert!(t.are_equivalent("Mike", "Michael"));
/// assert!(t.are_equivalent("liz", "BETH"));
/// assert!(!t.are_equivalent("Michael", "Liz"));
/// ```
#[derive(Debug, Clone, PartialEq, Eq, Default, Serialize, Deserialize)]
pub struct NicknameTable {
    /// Registered equivalence classes in insertion order. Classes added
    /// through `with_class` hold at least two distinct, non-empty,
    /// normalised name forms.
    classes: Vec<Vec<String>>,
}

impl NicknameTable {
    /// Construct an empty table that considers every pair of distinct
    /// strings non-equivalent (identical strings remain trivially equal).
    ///
    /// ```
    /// use worker_matcher::NicknameTable;
    /// let t = NicknameTable::empty();
    /// assert!(!t.are_equivalent("Mike", "Michael"));
    /// assert!(t.are_equivalent("Mike", "Mike"));
    /// ```
    #[must_use]
    pub fn empty() -> Self {
        Self {
            classes: Vec::new(),
        }
    }

    /// Register one more equivalence class and return the extended table.
    ///
    /// Every name is passed through
    /// [`crate::Normalizer::normalize_name`] before it is stored, so the
    /// table holds exactly the forms that lookups compare against.
    /// Names that normalise to the empty string, and names whose
    /// normalised form was already seen earlier in the same class, are
    /// skipped without error; the first occurrence keeps its position.
    /// If fewer than two distinct forms remain the class is discarded,
    /// because it could never make a pair equivalent.
    ///
    /// ```
    /// use worker_matcher::NicknameTable;
    /// let t = NicknameTable::empty().with_class(["Robert", "Bob", "Rob"]);
    /// assert!(t.are_equivalent("BOB", "robert"));
    /// ```
    #[must_use]
    pub fn with_class<I, S>(mut self, names: I) -> Self
    where
        I: IntoIterator<Item = S>,
        S: AsRef<str>,
    {
        let members = names
            .into_iter()
            .map(|raw| Normalizer::normalize_name(raw.as_ref()))
            .filter(|form| !form.is_empty())
            // Order-preserving dedupe: keep only the first occurrence.
            .fold(Vec::<String>::new(), |mut kept, form| {
                if !kept.contains(&form) {
                    kept.push(form);
                }
                kept
            });

        // A class needs at least two members to relate anything.
        if members.len() > 1 {
            self.classes.push(members);
        }
        self
    }

    /// Decide whether `a` and `b` denote the same given name under this
    /// table.
    ///
    /// Both inputs are first normalised with
    /// [`crate::Normalizer::normalize_name`]. The result is `true` when
    /// the normalised forms are identical, or when at least one
    /// equivalence class contains both of them; otherwise `false`.
    ///
    /// ```
    /// use worker_matcher::NicknameTable;
    /// let t = NicknameTable::english();
    /// assert!(t.are_equivalent("Mike",  "Michael"));
    /// assert!(t.are_equivalent("mike",  "MICHAEL"));   // case-insensitive
    /// assert!(!t.are_equivalent("Mike", "Robert"));
    /// assert!(t.are_equivalent("",       ""));         // trivially equal
    /// ```
    #[must_use]
    pub fn are_equivalent(&self, a: &str, b: &str) -> bool {
        let left = Normalizer::normalize_name(a);
        let right = Normalizer::normalize_name(b);

        // Identical forms short-circuit before the table is consulted, so
        // this also holds for an empty table.
        left == right
            || self
                .classes
                .iter()
                .any(|class| class.contains(&left) && class.contains(&right))
    }

    /// `true` iff the table contains no equivalence classes — equivalent
    /// to comparing with [`NicknameTable::empty`] but cheaper to test.
    ///
    /// ```
    /// use worker_matcher::NicknameTable;
    /// assert!(NicknameTable::empty().is_empty());
    /// assert!(!NicknameTable::english().is_empty());
    /// ```
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.classes.is_empty()
    }

    /// Number of equivalence classes registered with this table.
    ///
    /// ```
    /// use worker_matcher::NicknameTable;
    /// assert_eq!(NicknameTable::empty().len(), 0);
    /// let t = NicknameTable::empty().with_class(["A", "B"]);
    /// assert_eq!(t.len(), 1);
    /// ```
    #[must_use]
    pub fn len(&self) -> usize {
        self.classes.len()
    }

    /// The bundled dictionary of common English-language nicknames, such
    /// as `Michael`/`Mike`, `Robert`/`Bob` and `Elizabeth`/`Liz`.
    ///
    /// Some nicknames are deliberately listed under more than one
    /// canonical name (for example `steve` under both `stephen` and
    /// `steven`), so either canonical form matches the nickname.
    ///
    /// The exact contents are not part of the public contract — entries
    /// may be added in minor releases. Callers that need a stable
    /// dictionary SHOULD construct their own via
    /// [`NicknameTable::with_class`].
    ///
    /// ```
    /// use worker_matcher::NicknameTable;
    /// let t = NicknameTable::english();
    /// assert!(t.are_equivalent("Bill",      "William"));
    /// assert!(t.are_equivalent("Liz",       "Elizabeth"));
    /// assert!(t.are_equivalent("Steve",     "Steven"));
    /// assert!(t.are_equivalent("Steve",     "Stephen"));
    /// ```
    #[must_use]
    pub fn english() -> Self {
        // One row per equivalence class; by convention the canonical
        // given name comes first, followed by its nicknames.
        const CLASSES: &[&[&str]] = &[
            &["michael", "mike", "mick", "mickey"],
            &["robert", "bob", "rob", "robbie", "bobby"],
            &["william", "will", "bill", "billy", "willy"],
            &["james", "jim", "jimmy", "jamie"],
            &["richard", "rick", "dick", "rich", "richie"],
            &["thomas", "tom", "tommy"],
            &[
                "elizabeth",
                "liz",
                "beth",
                "betty",
                "eliza",
                "lizzy",
                "betsy",
            ],
            &[
                "katherine",
                "kate",
                "kathy",
                "katy",
                "kat",
                "cathy",
                "katie",
            ],
            &[
                "catherine",
                "kate",
                "kathy",
                "katy",
                "kat",
                "cathy",
                "katie",
            ],
            &["margaret", "maggie", "meg", "peggy", "marge"],
            &["jennifer", "jen", "jenny", "jenn"],
            &["patricia", "pat", "patty", "tricia", "trish"],
            &["susan", "sue", "suzie", "susie"],
            &["barbara", "barb", "babs"],
            &["anthony", "tony"],
            &["christopher", "chris", "kris"],
            &["charles", "charlie", "chuck", "chas"],
            &["daniel", "dan", "danny"],
            &["david", "dave", "davy"],
            &["edward", "ed", "eddie", "ted", "ned"],
            &["joseph", "joe", "joey"],
            &["kenneth", "ken", "kenny"],
            &["nicholas", "nick", "nico"],
            &["peter", "pete"],
            &["samuel", "sam", "sammy"],
            &["stephen", "steve", "stevie"],
            &["steven", "steve", "stevie"],
            &["timothy", "tim", "timmy"],
            &["alexander", "alex", "xander"],
            &["alexandra", "alex", "alexa", "sandy"],
            &["sandra", "sandy"],
            &["benjamin", "ben", "benny"],
            &["rebecca", "becca", "becky"],
            &["sarah", "sara", "sally"],
            &["victoria", "vicky", "vic", "tori"],
            &["matthew", "matt", "matty"],
            &["jonathan", "jon", "jonny", "jonathon"],
            &["frederick", "fred", "freddy", "freddie"],
            &["lawrence", "larry"],
            &["henry", "hank", "harry"],
            &["ronald", "ron", "ronnie"],
            &["donald", "don", "donnie"],
            &["andrew", "andy", "drew"],
            &["abigail", "abby", "gail"],
            &["amanda", "mandy"],
            &["isabella", "izzy", "bella"],
            &["isabel", "izzy", "bella"],
            &["olivia", "liv", "livy"],
            &["nicole", "nikki"],
            &["samantha", "sam", "sammy"],
            &["pamela", "pam"],
            &["deborah", "deb", "debbie"],
            &["kimberly", "kim"],
            &["jessica", "jess", "jessie"],
            &["stephanie", "steph"],
            &["madeline", "maddy", "maddie"],
        ];

        // Each row goes through `with_class`, so the built-in data gets
        // the same normalisation as caller-supplied classes.
        CLASSES
            .iter()
            .fold(Self::empty(), |table, class| table.with_class(*class))
    }
}

#[cfg(test)]
mod tests {
    // Unit tests for `NicknameTable`: construction rules of `with_class`,
    // lookup semantics of `are_equivalent`, and spot checks of the
    // built-in English dictionary.
    use super::*;

    /// With no classes registered, distinct names never match.
    #[test]
    fn empty_table_treats_distinct_strings_as_inequivalent() {
        let t = NicknameTable::empty();
        assert!(!t.are_equivalent("Mike", "Michael"));
        assert!(!t.are_equivalent("Liz", "Elizabeth"));
    }

    /// Identity after normalisation needs no table entry.
    #[test]
    fn identical_normalised_strings_are_trivially_equivalent_even_when_empty() {
        let t = NicknameTable::empty();
        assert!(t.are_equivalent("Mike", "mike"));
        assert!(t.are_equivalent("MICHAEL", "michael"));
        assert!(t.are_equivalent("", ""));
    }

    /// Entries are stored in normalised form, so lookups in any casing hit.
    #[test]
    fn with_class_normalises_entries_at_insertion() {
        let t = NicknameTable::empty().with_class(["Robert", "Bob", "Rob"]);
        assert!(t.are_equivalent("BOB", "robert"));
        assert!(t.are_equivalent("Rob", "Robert"));
    }

    #[test]
    fn with_class_dedupes_after_normalisation() {
        let t = NicknameTable::empty().with_class(["mike", "MIKE", "Mike"]);
        // All three normalise to "mike"; class collapses to a single
        // entry and is therefore dropped (no pair makes the class useful).
        assert_eq!(t.len(), 0);
    }

    #[test]
    fn with_class_drops_classes_with_fewer_than_two_distinct_entries() {
        let t = NicknameTable::empty().with_class(["Mike"]);
        assert!(t.is_empty());
    }

    #[test]
    fn with_class_drops_empty_strings_silently() {
        let t = NicknameTable::empty().with_class(["", "Mike", ""]);
        // After empties are dropped only "mike" remains → no useful class.
        assert!(t.is_empty());
    }

    /// `len` counts only the classes that survive the filtering rules.
    #[test]
    fn len_counts_retained_classes_only() {
        let t = NicknameTable::empty()
            .with_class(["Mike"]) // single entry → dropped
            .with_class(["Robert", "Bob"]) // kept
            .with_class(["", "Liz", "liz"]); // collapses to one entry → dropped
        assert_eq!(t.len(), 1);
        assert!(!t.is_empty());
    }

    #[test]
    fn english_table_covers_acceptance_criterion() {
        let t = NicknameTable::english();
        for (a, b) in [
            ("Mike", "Michael"),
            ("Liz", "Elizabeth"),
            ("Bob", "Robert"),
            ("Bill", "William"),
            ("Dick", "Richard"),
        ] {
            assert!(t.are_equivalent(a, b), "{a:?} ↮ {b:?} in english()");
        }
    }

    #[test]
    fn english_table_treats_unrelated_names_as_inequivalent() {
        let t = NicknameTable::english();
        assert!(!t.are_equivalent("Mike", "Robert"));
        assert!(!t.are_equivalent("Liz", "Tom"));
    }

    #[test]
    fn english_table_handles_shared_nicknames_across_classes() {
        let t = NicknameTable::english();
        // "Sandy" is a recognised nickname for both Alexandra and Sandra;
        // matching against either canonical succeeds.
        assert!(t.are_equivalent("Sandy", "Alexandra"));
        assert!(t.are_equivalent("Sandy", "Sandra"));
        // Steve appears in both Stephen and Steven classes for similar
        // reasons.
        assert!(t.are_equivalent("Steve", "Stephen"));
        assert!(t.are_equivalent("Steve", "Steven"));
    }

    /// A nickname shared by two classes must not merge those classes:
    /// equivalence is decided per class and is not transitive.
    #[test]
    fn shared_nickname_does_not_link_canonical_forms() {
        let t = NicknameTable::english();
        assert!(!t.are_equivalent("Alexandra", "Sandra"));
        assert!(!t.are_equivalent("Sandra", "Alexandra"));
    }

    /// Argument order must not influence the verdict.
    #[test]
    fn equivalence_is_symmetric() {
        let t = NicknameTable::english();
        for (a, b) in [
            ("Mike", "Michael"),
            ("Sandy", "Sandra"),
            ("Mike", "Robert"),
        ] {
            assert_eq!(
                t.are_equivalent(a, b),
                t.are_equivalent(b, a),
                "asymmetric verdict for {a:?} / {b:?}"
            );
        }
    }

    #[test]
    fn with_class_composes_on_top_of_english() {
        let t = NicknameTable::english().with_class(["Reginald", "Reggie"]);
        assert!(t.are_equivalent("Reggie", "Reginald"));
        // Original entries still work.
        assert!(t.are_equivalent("Mike", "Michael"));
    }

    /// Only case and surrounding whitespace are exercised here; this
    /// test makes no claim about punctuation handling.
    #[test]
    fn lookup_is_case_and_whitespace_insensitive() {
        let t = NicknameTable::english();
        assert!(t.are_equivalent("MIKE", "michael"));
        assert!(t.are_equivalent("  Mike  ", "Michael"));
    }

    #[test]
    fn default_is_empty() {
        let t = NicknameTable::default();
        assert!(t.is_empty());
    }
}