thing_matcher/normalizer.rs
1//! Text normalisation for `Thing` matching.
2//!
3//! Most matching accuracy gains come from **standardising the input** before
4//! scoring, not from cleverer similarity algorithms. This module exposes the
5//! canonical transformations the matching engine applies to names, free-form
6//! text, URLs, and phonetic codes.
7//!
//! The `normalize_*` transformations are designed to be **idempotent**:
//! `f(f(x)) == f(x)`. They are also **deterministic**; each call returns a
//! freshly allocated `String` (intermediate buffers may be used on the way).
10//!
11//! ## Quick examples
12//!
13//! ```
14//! use thing_matcher::Normalizer;
15//!
16//! // Names: lowercase, drop diacritics, drop ASCII punctuation, collapse spaces.
17//! assert_eq!(Normalizer::normalize_name(" O'Brien "), "obrien");
18//! assert_eq!(Normalizer::normalize_name("Siân"), "sian");
19//!
20//! // Free-form text: lowercase, NFKD, collapse whitespace; keep punctuation
21//! // (so descriptions remain readable).
22//! assert_eq!(
23//! Normalizer::normalize_text(" The Eiffel Tower. "),
24//! "the eiffel tower.",
25//! );
26//!
27//! // URLs: lowercase scheme + host, drop trailing slash on the path root.
28//! assert_eq!(
29//! Normalizer::normalize_url("HTTPS://Example.ORG/"),
30//! "https://example.org",
31//! );
32//! ```
33//!
34//! ## What this module deliberately does *not* do
35//!
36//! - It does not handle non-ASCII punctuation such as the curly apostrophe
37//! `’` (U+2019). Upstream code should convert those to ASCII first.
38//! - It does not perform DNS-aware URL normalisation, percent-encoding
39//! canonicalisation, or punycode decoding.
40
41use unicode_normalization::UnicodeNormalization;
42use unicode_normalization::char::is_combining_mark;
43
/// Stateless namespace for text normalisation routines.
///
/// `Normalizer` is a unit type with no fields; every method is an associated
/// function. It exists as a struct (rather than a module of free functions)
/// purely so the public API has a single, discoverable entry point.
///
/// The common marker traits are derived so the type can be logged, copied
/// around, compared, and default-constructed like any other zero-sized
/// handle.
///
/// ```
/// use thing_matcher::Normalizer;
///
/// let canonical = Normalizer::normalize_name("José-María");
/// assert_eq!(canonical, "josemaria");
/// ```
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub struct Normalizer;
57
58impl Normalizer {
59 /// Normalise a name for comparison.
60 ///
61 /// Steps, in order:
62 ///
63 /// 1. Decompose to Unicode NFKD form (`é` → `e` + combining acute).
64 /// 2. Drop combining marks (diacritics).
65 /// 3. Drop ASCII punctuation (apostrophes, hyphens, full stops, …).
66 /// 4. Lowercase.
67 /// 5. Collapse consecutive whitespace to single ASCII spaces; trim ends.
68 ///
69 /// The result is suitable for direct equality comparison or for feeding
70 /// into a string-similarity scorer.
71 ///
72 /// # Examples
73 ///
74 /// Whitespace is collapsed and trimmed:
75 ///
76 /// ```
77 /// use thing_matcher::Normalizer;
78 /// assert_eq!(Normalizer::normalize_name(" John Smith "), "john smith");
79 /// ```
80 ///
81 /// Apostrophes and hyphens are stripped:
82 ///
83 /// ```
84 /// # use thing_matcher::Normalizer;
85 /// assert_eq!(Normalizer::normalize_name("O'Brien"), "obrien");
86 /// assert_eq!(Normalizer::normalize_name("MARY-JANE"), "maryjane");
87 /// ```
88 ///
89 /// Diacritics are removed:
90 ///
91 /// ```
92 /// # use thing_matcher::Normalizer;
93 /// assert_eq!(Normalizer::normalize_name("Siân"), "sian");
94 /// assert_eq!(Normalizer::normalize_name("café"), "cafe");
95 /// // Letters with an integral stroke do not decompose under NFKD, so
96 /// // they pass through (lowercased), while the combining acute on `ó`
97 /// // and `ź` is stripped:
98 /// assert_eq!(Normalizer::normalize_name("Łódź"), "łodz");
99 /// ```
100 pub fn normalize_name(name: &str) -> String {
101 let mut out = String::with_capacity(name.len());
102 for ch in name.nfkd() {
103 // Skip combining marks (Unicode categories Mn / Mc / Me).
104 if is_combining_mark(ch) {
105 continue;
106 }
107 if ch.is_ascii_punctuation() {
108 continue;
109 }
110 for lc in ch.to_lowercase() {
111 out.push(lc);
112 }
113 }
114 collapse_whitespace(&out)
115 }
116
117 /// Normalise free-form text (descriptions, etc.) for similarity scoring.
118 ///
119 /// Like [`Normalizer::normalize_name`], but keeps ASCII punctuation —
120 /// punctuation carries information in longer text (sentence boundaries,
121 /// abbreviations) that should not be discarded.
122 ///
123 /// Steps, in order:
124 ///
125 /// 1. Decompose to Unicode NFKD form.
126 /// 2. Drop combining marks (diacritics).
127 /// 3. Lowercase.
128 /// 4. Collapse consecutive whitespace to single ASCII spaces; trim ends.
129 ///
130 /// # Examples
131 ///
132 /// ```
133 /// use thing_matcher::Normalizer;
134 /// assert_eq!(
135 /// Normalizer::normalize_text(" The Eiffel Tower, in Paris. "),
136 /// "the eiffel tower, in paris.",
137 /// );
138 /// assert_eq!(
139 /// Normalizer::normalize_text("café au lait"),
140 /// "cafe au lait",
141 /// );
142 /// ```
143 pub fn normalize_text(text: &str) -> String {
144 let mut out = String::with_capacity(text.len());
145 for ch in text.nfkd() {
146 if is_combining_mark(ch) {
147 continue;
148 }
149 for lc in ch.to_lowercase() {
150 out.push(lc);
151 }
152 }
153 collapse_whitespace(&out)
154 }
155
/// Normalise a URL for equality comparison.
///
/// The transformation is **lossless enough for matching** but **not a
/// full URL canonicalisation**:
///
/// 1. Trim surrounding whitespace.
/// 2. Drop a `#fragment` suffix (and any whitespace left dangling in front
///    of it) — fragments do not travel over HTTP.
/// 3. If the remainder contains no `://`, treat it as an opaque identifier
///    (`urn:`, `mailto:`, `tel:`, …) and return it ASCII-lowercased.
/// 4. Otherwise lowercase the scheme and the host (`HTTPS://Example.ORG` →
///    `https://example.org`). The authority ends at the first `/` or `?`;
///    a `user:pass@` prefix, the path, and the query keep their case because
///    they are case-sensitive.
/// 5. Drop a trailing slash from a root path (`https://x.org/` →
///    `https://x.org`). Non-root trailing slashes are kept, because
///    `/foo` and `/foo/` are legitimately different on many servers.
///
/// No percent-encoding canonicalisation is attempted; callers that
/// need strict canonical URLs should pre-process the input.
///
/// # Examples
///
/// ```
/// use thing_matcher::Normalizer;
/// assert_eq!(
///     Normalizer::normalize_url("HTTPS://Example.ORG/"),
///     "https://example.org",
/// );
/// assert_eq!(
///     Normalizer::normalize_url(" https://EXAMPLE.org/foo "),
///     "https://example.org/foo",
/// );
/// assert_eq!(
///     Normalizer::normalize_url("https://example.org/foo/#bar"),
///     "https://example.org/foo/",
/// );
/// ```
///
/// Queries and credentials keep their case even when there is no path:
///
/// ```
/// # use thing_matcher::Normalizer;
/// assert_eq!(
///     Normalizer::normalize_url("https://Example.org?Q=Foo"),
///     "https://example.org?Q=Foo",
/// );
/// assert_eq!(
///     Normalizer::normalize_url("ftp://User:Secret@Example.org/"),
///     "ftp://User:Secret@example.org",
/// );
/// ```
///
/// Strings that are not URL-shaped are returned trimmed + lowercased
/// so they remain comparable as opaque identifiers:
///
/// ```
/// # use thing_matcher::Normalizer;
/// assert_eq!(Normalizer::normalize_url(" URN:ISBN:0451450523 "), "urn:isbn:0451450523");
/// ```
pub fn normalize_url(url: &str) -> String {
    let trimmed = url.trim();
    // Cut the fragment, then trim again: whitespace that sat between the URL
    // and the `#` would otherwise leak into the output and make a second
    // pass produce a different result.
    let no_frag = trimmed
        .split_once('#')
        .map_or(trimmed, |(before, _fragment)| before)
        .trim_end();

    // No scheme delimiter — fall back to a lowercase opaque form. Useful
    // for `urn:` / `mailto:` / `tel:` style identifiers.
    let Some((scheme, rest)) = no_frag.split_once("://") else {
        return no_frag.to_ascii_lowercase();
    };

    // The authority stops at the path (`/`) or, when there is no path, at
    // the query (`?`). Splitting on `/` alone would pull the query into the
    // "host" and lowercase it.
    let authority_end = rest.find(['/', '?']).unwrap_or(rest.len());
    let (authority, tail) = rest.split_at(authority_end);

    // Everything up to and including the last `@` is userinfo, which is
    // case-sensitive and must not be lowercased with the host.
    let host_start = authority.rfind('@').map_or(0, |at| at + 1);
    let (userinfo, host) = authority.split_at(host_start);

    // Build the result in one buffer, lowercasing in place.
    let mut out = String::with_capacity(no_frag.len());
    out.push_str(scheme);
    out.push_str("://");
    out.make_ascii_lowercase();
    out.push_str(userinfo);
    let host_at = out.len();
    out.push_str(host);
    out[host_at..].make_ascii_lowercase();

    if tail == "/" {
        // Root path: drop the slash. In malformed input this can expose
        // whitespace that preceded it; trim so re-normalising is a no-op.
        let kept = out.trim_end().len();
        out.truncate(kept);
    } else {
        out.push_str(tail);
    }
    out
}
235
236 /// Soundex-like phonetic code for an ASCII-ish name, used as a coarse
237 /// blocking key and as the gate for the phonetic-bonus in the matcher.
238 ///
239 /// Implementation note: delegates to the `soundex` crate after first
240 /// applying [`Normalizer::normalize_name`]. Returns an empty string
241 /// when the input is empty or normalises to an empty string.
242 ///
243 /// # Examples
244 ///
245 /// ```
246 /// use thing_matcher::Normalizer;
247 /// let a = Normalizer::phonetic_code("Stephen");
248 /// let b = Normalizer::phonetic_code("Steven");
249 /// assert!(!a.is_empty());
250 /// assert_eq!(a, b);
251 /// ```
252 pub fn phonetic_code(name: &str) -> String {
253 let normalised = Self::normalize_name(name);
254 if normalised.is_empty() {
255 return String::new();
256 }
257 // The `soundex` crate's `american_soundex` is infallible for any
258 // ASCII input. Strip non-ASCII bytes before handing it over.
259 let ascii: String = normalised.chars().filter(|c| c.is_ascii()).collect();
260 if ascii.is_empty() {
261 return String::new();
262 }
263 soundex::american_soundex(&ascii)
264 }
265}
266
/// Squeeze every run of whitespace down to one ASCII space and strip
/// leading and trailing whitespace.
///
/// "Whitespace" is the Unicode definition used by `char::is_whitespace`,
/// so tabs, newlines, and non-breaking spaces are all treated alike.
fn collapse_whitespace(s: &str) -> String {
    let mut collapsed = String::with_capacity(s.len());
    // `split_whitespace` yields only the non-empty runs between whitespace,
    // so joining them with single spaces both collapses and trims.
    for word in s.split_whitespace() {
        if !collapsed.is_empty() {
            collapsed.push(' ');
        }
        collapsed.push_str(word);
    }
    collapsed
}
287
#[cfg(test)]
mod tests {
    use super::*;

    /// Signature shared by the `normalize_*` routines under test.
    type Transform = fn(&str) -> String;

    /// Checks every `(input, expected)` pair against `f`.
    fn assert_maps(f: Transform, table: &[(&str, &str)]) {
        for &(input, expected) in table {
            assert_eq!(f(input), expected);
        }
    }

    /// Checks that applying `f` a second time changes nothing.
    fn assert_idempotent(f: Transform, cases: &[&str]) {
        for &c in cases {
            let once = f(c);
            let twice = f(&once);
            assert_eq!(once, twice, "non-idempotent for {c:?}");
        }
    }

    // ===== normalize_name =====

    #[test]
    fn normalize_name_lowercases_and_trims() {
        assert_maps(Normalizer::normalize_name, &[(" HELLO ", "hello")]);
    }

    #[test]
    fn normalize_name_collapses_internal_whitespace() {
        assert_maps(Normalizer::normalize_name, &[("a \t b\nc", "a b c")]);
    }

    #[test]
    fn normalize_name_drops_punctuation() {
        assert_maps(
            Normalizer::normalize_name,
            &[("O'Brien", "obrien"), ("Mary-Jane!", "maryjane")],
        );
    }

    #[test]
    fn normalize_name_drops_diacritics() {
        assert_maps(
            Normalizer::normalize_name,
            &[("Siân", "sian"), ("café", "cafe"), ("Zoë", "zoe")],
        );
    }

    #[test]
    fn normalize_name_is_idempotent() {
        assert_idempotent(
            Normalizer::normalize_name,
            &["hello", "O'Brien", " café au lait ", "JOSÉ-MARÍA"],
        );
    }

    #[test]
    fn normalize_name_empty_returns_empty() {
        for blank in ["", " "] {
            assert!(Normalizer::normalize_name(blank).is_empty());
        }
    }

    // ===== normalize_text =====

    #[test]
    fn normalize_text_preserves_punctuation() {
        assert_maps(
            Normalizer::normalize_text,
            &[("Hello, World!", "hello, world!")],
        );
    }

    #[test]
    fn normalize_text_drops_diacritics() {
        assert_maps(
            Normalizer::normalize_text,
            &[("Café au lait.", "cafe au lait.")],
        );
    }

    #[test]
    fn normalize_text_is_idempotent() {
        assert_idempotent(
            Normalizer::normalize_text,
            &["The Eiffel Tower, in Paris.", " multi space ", "Plain."],
        );
    }

    // ===== normalize_url =====

    #[test]
    fn normalize_url_lowercases_scheme_and_host() {
        assert_maps(
            Normalizer::normalize_url,
            &[("HTTPS://Example.ORG/foo", "https://example.org/foo")],
        );
    }

    #[test]
    fn normalize_url_drops_root_trailing_slash() {
        assert_maps(
            Normalizer::normalize_url,
            &[("https://example.org/", "https://example.org")],
        );
    }

    #[test]
    fn normalize_url_keeps_subpath_trailing_slash() {
        assert_maps(
            Normalizer::normalize_url,
            &[("https://example.org/foo/", "https://example.org/foo/")],
        );
    }

    #[test]
    fn normalize_url_drops_fragment() {
        assert_maps(
            Normalizer::normalize_url,
            &[("https://example.org/foo#bar", "https://example.org/foo")],
        );
    }

    #[test]
    fn normalize_url_handles_opaque_uri() {
        assert_maps(
            Normalizer::normalize_url,
            &[("URN:ISBN:0451450523", "urn:isbn:0451450523")],
        );
    }

    #[test]
    fn normalize_url_is_idempotent() {
        assert_idempotent(
            Normalizer::normalize_url,
            &[
                "https://example.org/",
                "HTTPS://EXAMPLE.org/foo#frag",
                "urn:isbn:123",
            ],
        );
    }

    // ===== phonetic_code =====

    #[test]
    fn phonetic_code_matches_homophones() {
        let stephen = Normalizer::phonetic_code("Stephen");
        let steven = Normalizer::phonetic_code("Steven");
        assert_eq!(stephen, steven);
    }

    #[test]
    fn phonetic_code_distinct_for_unrelated_names() {
        let alice = Normalizer::phonetic_code("Alice");
        let zachary = Normalizer::phonetic_code("Zachary");
        assert_ne!(alice, zachary);
    }

    #[test]
    fn phonetic_code_empty_for_empty_input() {
        for blank in ["", " "] {
            assert!(Normalizer::phonetic_code(blank).is_empty());
        }
    }
}
438}