Skip to main content

thing_matcher/
normalizer.rs

1//! Text normalisation for `Thing` matching.
2//!
3//! Most matching accuracy gains come from **standardising the input** before
4//! scoring, not from cleverer similarity algorithms. This module exposes the
5//! canonical transformations the matching engine applies to names, free-form
6//! text, URLs, and phonetic codes.
7//!
//! The `normalize_*` transformations are intended to be **idempotent**
//! (`f(f(x)) == f(x)`) and are **deterministic**. Each call returns a freshly
//! allocated `String`; intermediate buffers may be allocated along the way.
10//!
11//! ## Quick examples
12//!
13//! ```
14//! use thing_matcher::Normalizer;
15//!
16//! // Names: lowercase, drop diacritics, drop ASCII punctuation, collapse spaces.
17//! assert_eq!(Normalizer::normalize_name("  O'Brien  "), "obrien");
18//! assert_eq!(Normalizer::normalize_name("Siân"),         "sian");
19//!
20//! // Free-form text: lowercase, NFKD, collapse whitespace; keep punctuation
21//! // (so descriptions remain readable).
22//! assert_eq!(
23//!     Normalizer::normalize_text("  The   Eiffel Tower.  "),
24//!     "the eiffel tower.",
25//! );
26//!
27//! // URLs: lowercase scheme + host, drop trailing slash on the path root.
28//! assert_eq!(
29//!     Normalizer::normalize_url("HTTPS://Example.ORG/"),
30//!     "https://example.org",
31//! );
32//! ```
33//!
34//! ## What this module deliberately does *not* do
35//!
36//! - It does not handle non-ASCII punctuation such as the curly apostrophe
37//!   `’` (U+2019). Upstream code should convert those to ASCII first.
38//! - It does not perform DNS-aware URL normalisation, percent-encoding
39//!   canonicalisation, or punycode decoding.
40
41use unicode_normalization::UnicodeNormalization;
42use unicode_normalization::char::is_combining_mark;
43
/// Stateless namespace for text normalisation routines.
///
/// `Normalizer` is a unit type with no fields, and every method on it is an
/// associated function (none takes `self`). It exists as a struct, rather
/// than as a module of free functions, purely so the public API has a
/// single, discoverable entry point.
///
/// The common marker traits are derived so the type can be printed, copied,
/// compared and default-constructed by generic code; being a zero-sized type,
/// this costs nothing at runtime.
///
/// ```
/// use thing_matcher::Normalizer;
///
/// let canonical = Normalizer::normalize_name("José-María");
/// assert_eq!(canonical, "josemaria");
/// ```
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub struct Normalizer;
57
58impl Normalizer {
59    /// Normalise a name for comparison.
60    ///
61    /// Steps, in order:
62    ///
63    /// 1. Decompose to Unicode NFKD form (`é` → `e` + combining acute).
64    /// 2. Drop combining marks (diacritics).
65    /// 3. Drop ASCII punctuation (apostrophes, hyphens, full stops, …).
66    /// 4. Lowercase.
67    /// 5. Collapse consecutive whitespace to single ASCII spaces; trim ends.
68    ///
69    /// The result is suitable for direct equality comparison or for feeding
70    /// into a string-similarity scorer.
71    ///
72    /// # Examples
73    ///
74    /// Whitespace is collapsed and trimmed:
75    ///
76    /// ```
77    /// use thing_matcher::Normalizer;
78    /// assert_eq!(Normalizer::normalize_name("  John  Smith  "), "john smith");
79    /// ```
80    ///
81    /// Apostrophes and hyphens are stripped:
82    ///
83    /// ```
84    /// # use thing_matcher::Normalizer;
85    /// assert_eq!(Normalizer::normalize_name("O'Brien"),    "obrien");
86    /// assert_eq!(Normalizer::normalize_name("MARY-JANE"),  "maryjane");
87    /// ```
88    ///
89    /// Diacritics are removed:
90    ///
91    /// ```
92    /// # use thing_matcher::Normalizer;
93    /// assert_eq!(Normalizer::normalize_name("Siân"),    "sian");
94    /// assert_eq!(Normalizer::normalize_name("café"),    "cafe");
95    /// // Letters with an integral stroke do not decompose under NFKD, so
96    /// // they pass through (lowercased), while the combining acute on `ó`
97    /// // and `ź` is stripped:
98    /// assert_eq!(Normalizer::normalize_name("Łódź"),    "łodz");
99    /// ```
100    pub fn normalize_name(name: &str) -> String {
101        let mut out = String::with_capacity(name.len());
102        for ch in name.nfkd() {
103            // Skip combining marks (Unicode categories Mn / Mc / Me).
104            if is_combining_mark(ch) {
105                continue;
106            }
107            if ch.is_ascii_punctuation() {
108                continue;
109            }
110            for lc in ch.to_lowercase() {
111                out.push(lc);
112            }
113        }
114        collapse_whitespace(&out)
115    }
116
117    /// Normalise free-form text (descriptions, etc.) for similarity scoring.
118    ///
119    /// Like [`Normalizer::normalize_name`], but keeps ASCII punctuation —
120    /// punctuation carries information in longer text (sentence boundaries,
121    /// abbreviations) that should not be discarded.
122    ///
123    /// Steps, in order:
124    ///
125    /// 1. Decompose to Unicode NFKD form.
126    /// 2. Drop combining marks (diacritics).
127    /// 3. Lowercase.
128    /// 4. Collapse consecutive whitespace to single ASCII spaces; trim ends.
129    ///
130    /// # Examples
131    ///
132    /// ```
133    /// use thing_matcher::Normalizer;
134    /// assert_eq!(
135    ///     Normalizer::normalize_text("  The Eiffel Tower, in Paris.  "),
136    ///     "the eiffel tower, in paris.",
137    /// );
138    /// assert_eq!(
139    ///     Normalizer::normalize_text("café au lait"),
140    ///     "cafe au lait",
141    /// );
142    /// ```
143    pub fn normalize_text(text: &str) -> String {
144        let mut out = String::with_capacity(text.len());
145        for ch in text.nfkd() {
146            if is_combining_mark(ch) {
147                continue;
148            }
149            for lc in ch.to_lowercase() {
150                out.push(lc);
151            }
152        }
153        collapse_whitespace(&out)
154    }
155
156    /// Normalise a URL for equality comparison.
157    ///
158    /// The transformation is **lossless enough for matching** but **not a
159    /// full URL canonicalisation**:
160    ///
161    /// 1. Trim surrounding whitespace.
162    /// 2. Lowercase the scheme and host portions (`HTTPS://Example.ORG` →
163    ///    `https://example.org`). The path is left case-sensitive.
164    /// 3. Drop a trailing slash from a root path (`https://x.org/` →
165    ///    `https://x.org`). Non-root trailing slashes are kept, because
166    ///    `/foo` and `/foo/` are legitimately different on many servers.
167    /// 4. Drop a `#fragment` suffix — fragments do not travel over HTTP
168    ///    and never identify a different resource.
169    ///
170    /// No percent-encoding canonicalisation is attempted; callers that
171    /// need strict canonical URLs should pre-process the input.
172    ///
173    /// # Examples
174    ///
175    /// ```
176    /// use thing_matcher::Normalizer;
177    /// assert_eq!(
178    ///     Normalizer::normalize_url("HTTPS://Example.ORG/"),
179    ///     "https://example.org",
180    /// );
181    /// assert_eq!(
182    ///     Normalizer::normalize_url("  https://EXAMPLE.org/foo  "),
183    ///     "https://example.org/foo",
184    /// );
185    /// assert_eq!(
186    ///     Normalizer::normalize_url("https://example.org/foo/#bar"),
187    ///     "https://example.org/foo/",
188    /// );
189    /// ```
190    ///
191    /// Strings that are not URL-shaped are returned trimmed + lowercased
192    /// so they remain comparable as opaque identifiers:
193    ///
194    /// ```
195    /// # use thing_matcher::Normalizer;
196    /// assert_eq!(Normalizer::normalize_url("  URN:ISBN:0451450523  "), "urn:isbn:0451450523");
197    /// ```
198    pub fn normalize_url(url: &str) -> String {
199        let trimmed = url.trim();
200        // Drop fragment, if present.
201        let no_frag = match trimmed.find('#') {
202            Some(idx) => &trimmed[..idx],
203            None => trimmed,
204        };
205
206        // Locate scheme delimiter.
207        let (scheme, after_scheme) = match no_frag.find("://") {
208            Some(idx) => (&no_frag[..idx], Some(&no_frag[idx + 3..])),
209            None => (no_frag, None),
210        };
211
212        // No scheme — fall back to a trimmed lowercase opaque form. Useful
213        // for `urn:` / `mailto:` / `tel:` style identifiers.
214        let Some(rest) = after_scheme else {
215            return no_frag.to_ascii_lowercase();
216        };
217
218        // Split host from path.
219        let (host, path) = match rest.find('/') {
220            Some(idx) => (&rest[..idx], &rest[idx..]),
221            None => (rest, ""),
222        };
223
224        let mut out = String::with_capacity(no_frag.len());
225        out.push_str(&scheme.to_ascii_lowercase());
226        out.push_str("://");
227        out.push_str(&host.to_ascii_lowercase());
228
229        // Drop a trailing slash only when the path *is* the root.
230        if !(path.is_empty() || path == "/") {
231            out.push_str(path);
232        }
233        out
234    }
235
236    /// Soundex-like phonetic code for an ASCII-ish name, used as a coarse
237    /// blocking key and as the gate for the phonetic-bonus in the matcher.
238    ///
239    /// Implementation note: delegates to the `soundex` crate after first
240    /// applying [`Normalizer::normalize_name`]. Returns an empty string
241    /// when the input is empty or normalises to an empty string.
242    ///
243    /// # Examples
244    ///
245    /// ```
246    /// use thing_matcher::Normalizer;
247    /// let a = Normalizer::phonetic_code("Stephen");
248    /// let b = Normalizer::phonetic_code("Steven");
249    /// assert!(!a.is_empty());
250    /// assert_eq!(a, b);
251    /// ```
252    pub fn phonetic_code(name: &str) -> String {
253        let normalised = Self::normalize_name(name);
254        if normalised.is_empty() {
255            return String::new();
256        }
257        // The `soundex` crate's `american_soundex` is infallible for any
258        // ASCII input. Strip non-ASCII bytes before handing it over.
259        let ascii: String = normalised.chars().filter(|c| c.is_ascii()).collect();
260        if ascii.is_empty() {
261            return String::new();
262        }
263        soundex::american_soundex(&ascii)
264    }
265}
266
/// Replace every run of whitespace with one ASCII space and strip leading
/// and trailing whitespace.
///
/// "Whitespace" is whatever `char::is_whitespace` accepts, so tabs, newlines
/// and non-ASCII spaces are all folded into plain spaces.
fn collapse_whitespace(s: &str) -> String {
    let mut collapsed = String::with_capacity(s.len());
    // `split_whitespace` yields only the non-empty words, so joining them
    // with single spaces both collapses inner runs and trims the ends.
    for word in s.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
287
#[cfg(test)]
mod tests {
    use super::*;

    /// Applies `normalise` twice to each input and checks that the second
    /// pass leaves the first result unchanged.
    fn assert_idempotent(normalise: fn(&str) -> String, inputs: &[&str]) {
        for &c in inputs {
            let once = normalise(c);
            let twice = normalise(&once);
            assert_eq!(once, twice, "non-idempotent for {c:?}");
        }
    }

    // ---------- normalize_name ----------

    #[test]
    fn normalize_name_lowercases_and_trims() {
        let got = Normalizer::normalize_name("  HELLO  ");
        assert_eq!(got, "hello");
    }

    #[test]
    fn normalize_name_collapses_internal_whitespace() {
        let got = Normalizer::normalize_name("a  \t  b\nc");
        assert_eq!(got, "a b c");
    }

    #[test]
    fn normalize_name_drops_punctuation() {
        for (input, expected) in [("O'Brien", "obrien"), ("Mary-Jane!", "maryjane")] {
            assert_eq!(Normalizer::normalize_name(input), expected);
        }
    }

    #[test]
    fn normalize_name_drops_diacritics() {
        for (input, expected) in [("Siân", "sian"), ("café", "cafe"), ("Zoë", "zoe")] {
            assert_eq!(Normalizer::normalize_name(input), expected);
        }
    }

    #[test]
    fn normalize_name_is_idempotent() {
        assert_idempotent(
            Normalizer::normalize_name,
            &["hello", "O'Brien", " café au lait ", "JOSÉ-MARÍA"],
        );
    }

    #[test]
    fn normalize_name_empty_returns_empty() {
        for blank in ["", "    "] {
            assert_eq!(Normalizer::normalize_name(blank), "");
        }
    }

    // ---------- normalize_text ----------

    #[test]
    fn normalize_text_preserves_punctuation() {
        let got = Normalizer::normalize_text("Hello, World!");
        assert_eq!(got, "hello, world!");
    }

    #[test]
    fn normalize_text_drops_diacritics() {
        let got = Normalizer::normalize_text("Café au lait.");
        assert_eq!(got, "cafe au lait.");
    }

    #[test]
    fn normalize_text_is_idempotent() {
        assert_idempotent(
            Normalizer::normalize_text,
            &[
                "The Eiffel Tower, in Paris.",
                "  multi    space   ",
                "Plain.",
            ],
        );
    }

    // ---------- normalize_url ----------

    #[test]
    fn normalize_url_lowercases_scheme_and_host() {
        let got = Normalizer::normalize_url("HTTPS://Example.ORG/foo");
        assert_eq!(got, "https://example.org/foo");
    }

    #[test]
    fn normalize_url_drops_root_trailing_slash() {
        let got = Normalizer::normalize_url("https://example.org/");
        assert_eq!(got, "https://example.org");
    }

    #[test]
    fn normalize_url_keeps_subpath_trailing_slash() {
        let got = Normalizer::normalize_url("https://example.org/foo/");
        assert_eq!(got, "https://example.org/foo/");
    }

    #[test]
    fn normalize_url_drops_fragment() {
        let got = Normalizer::normalize_url("https://example.org/foo#bar");
        assert_eq!(got, "https://example.org/foo");
    }

    #[test]
    fn normalize_url_handles_opaque_uri() {
        let got = Normalizer::normalize_url("URN:ISBN:0451450523");
        assert_eq!(got, "urn:isbn:0451450523");
    }

    #[test]
    fn normalize_url_is_idempotent() {
        assert_idempotent(
            Normalizer::normalize_url,
            &[
                "https://example.org/",
                "HTTPS://EXAMPLE.org/foo#frag",
                "urn:isbn:123",
            ],
        );
    }

    // ---------- phonetic_code ----------

    #[test]
    fn phonetic_code_matches_homophones() {
        let stephen = Normalizer::phonetic_code("Stephen");
        let steven = Normalizer::phonetic_code("Steven");
        assert_eq!(stephen, steven);
    }

    #[test]
    fn phonetic_code_distinct_for_unrelated_names() {
        let alice = Normalizer::phonetic_code("Alice");
        let zachary = Normalizer::phonetic_code("Zachary");
        assert_ne!(alice, zachary);
    }

    #[test]
    fn phonetic_code_empty_for_empty_input() {
        for blank in ["", "   "] {
            assert_eq!(Normalizer::phonetic_code(blank), "");
        }
    }
}