// patient-matching 0.2.0

//! Text normalisation for patient demographic data.
//!
//! Research on patient identification (see `spec.md` §5) is unanimous: most
//! accuracy gains come from **standardising the input** before scoring, not
//! from cleverer similarity algorithms. This module exposes the canonical
//! transformations the matching engine applies to names, postcodes, phone
//! numbers, and phonetic codes.
//!
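//! All transformations are **deterministic**. Name and postcode normalisation
//! are also **idempotent**: `f(f(x)) == f(x)`. Phone normalisation does not
//! strip prefixes repeatedly, so it is idempotent only for conventionally
//! formatted numbers (for example `"007"` → `"07"` → `"7"` takes two passes).
//! Every routine returns a newly allocated `String`; intermediate buffers may
//! also be allocated along the way.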
//!
//! ## Quick examples
//!
//! ```
//! use patient_matching::Normalizer;
//!
//! // Names: lowercase, drop diacritics, drop ASCII punctuation, collapse spaces.
//! assert_eq!(Normalizer::normalize_name("  O'Brien  "), "obrien");
//! assert_eq!(Normalizer::normalize_name("Siân"),         "sian");
//!
//! // Postcodes: strip whitespace, uppercase.
//! assert_eq!(Normalizer::normalize_postcode("cf10 1aa"), "CF101AA");
//!
//! // Phone numbers: keep digits, strip international and trunk prefixes.
//! assert_eq!(Normalizer::normalize_phone("+44 7700 900123"), "7700900123");
//! ```
//!
//! ## What this module deliberately does *not* do
//!
//! - It does not validate NHS numbers — that is delegated to the
//!   `nhs-number` crate at the call-site (see [`crate::matcher`]).
//! - It does not normalise email addresses or middle names (see spec
//!   tasks T-11 and OQ-1 respectively).
//! - It does not handle non-ASCII punctuation such as the curly apostrophe
//!   `’` (U+2019). Upstream code should convert those to ASCII first.

use unicode_normalization::UnicodeNormalization;

/// Stateless entry point for the text normalisation routines.
///
/// `Normalizer` is a field-less unit type and every routine on it is an
/// associated function, so no instance is ever required to use the API. It is
/// a type rather than a module of free functions purely so that the public
/// API has a single, discoverable name.
///
/// The common value traits (`Debug`, `Clone`, `Copy`, `Default`, `PartialEq`,
/// `Eq`, `Hash`) are derived so that the type can be embedded in other
/// derived types, default-constructed, or compared without extra ceremony.
///
/// ```
/// use patient_matching::Normalizer;
///
/// let canonical = Normalizer::normalize_name("José-María");
/// assert_eq!(canonical, "josemaria");
/// ```
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq, Hash)]
pub struct Normalizer;

impl Normalizer {
    /// Canonicalise a personal name so that spelling variants compare equal.
    ///
    /// The pipeline runs in this order:
    ///
    /// 1. Unicode NFKD decomposition (`é` becomes `e` plus a combining acute).
    /// 2. Removal of combining marks, which discards the diacritics exposed
    ///    by step 1.
    /// 3. Removal of ASCII punctuation (apostrophes, hyphens, full stops, …).
    /// 4. Lowercasing.
    /// 5. Whitespace clean-up: every run collapses to one ASCII space and both
    ///    ends are trimmed.
    ///
    /// Only *ASCII* punctuation is removed; a curly apostrophe `’` (U+2019),
    /// for instance, is kept.
    ///
    /// The returned string can be compared for equality directly or handed to
    /// a string-similarity scorer.
    ///
    /// # Examples
    ///
    /// Surrounding and repeated whitespace disappears:
    ///
    /// ```
    /// use patient_matching::Normalizer;
    /// assert_eq!(Normalizer::normalize_name("  John  Smith  "), "john smith");
    /// ```
    ///
    /// Apostrophes and hyphens are dropped, fusing the parts they separated:
    ///
    /// ```
    /// # use patient_matching::Normalizer;
    /// assert_eq!(Normalizer::normalize_name("O'Brien"), "obrien");
    /// assert_eq!(Normalizer::normalize_name("MARY-JANE"), "maryjane");
    /// ```
    ///
    /// Diacritics are stripped; letters without a decomposition are kept:
    ///
    /// ```
    /// # use patient_matching::Normalizer;
    /// assert_eq!(Normalizer::normalize_name("José"), "jose");
    /// assert_eq!(Normalizer::normalize_name("Siân"), "sian");
    /// assert_eq!(Normalizer::normalize_name("Łukasz"), "łukasz"); // ł does not decompose
    /// ```
    ///
    /// Empty and whitespace-only input both yield the empty string:
    ///
    /// ```
    /// # use patient_matching::Normalizer;
    /// assert_eq!(Normalizer::normalize_name(""), "");
    /// assert_eq!(Normalizer::normalize_name("    "), "");
    /// ```
    ///
    /// Applying the function to its own output changes nothing:
    ///
    /// ```
    /// # use patient_matching::Normalizer;
    /// let once = Normalizer::normalize_name("  José-María  ");
    /// let twice = Normalizer::normalize_name(&once);
    /// assert_eq!(once, twice);
    /// ```
    pub fn normalize_name(name: &str) -> String {
        // Steps 1-3: decompose, then keep only characters that are neither
        // combining marks nor ASCII punctuation.
        let stripped: String = name
            .nfkd()
            .filter(|&ch| {
                !unicode_normalization::char::is_combining_mark(ch) && !ch.is_ascii_punctuation()
            })
            .collect();

        // Step 4.
        let lowered = stripped.to_lowercase();

        // Step 5: `split_whitespace` never yields an empty word, so joining the
        // words by hand both collapses runs and trims the ends.
        let mut canonical = String::with_capacity(lowered.len());
        for word in lowered.split_whitespace() {
            if !canonical.is_empty() {
                canonical.push(' ');
            }
            canonical.push_str(word);
        }
        canonical
    }

    /// Canonicalise a postcode so that formatting differences do not matter.
    ///
    /// Every whitespace character is removed (not just the conventional
    /// single space) and the remaining characters are uppercased. No
    /// locale-specific validation is attempted; that is deliberately out of
    /// scope here.
    ///
    /// # Examples
    ///
    /// Spacing and letter case are irrelevant:
    ///
    /// ```
    /// use patient_matching::Normalizer;
    /// assert_eq!(Normalizer::normalize_postcode("CF10 1AA"), "CF101AA");
    /// assert_eq!(Normalizer::normalize_postcode("cf101aa"), "CF101AA");
    /// assert_eq!(Normalizer::normalize_postcode("  cf10 1aa "), "CF101AA");
    /// ```
    ///
    /// Empty input stays empty:
    ///
    /// ```
    /// # use patient_matching::Normalizer;
    /// assert_eq!(Normalizer::normalize_postcode(""), "");
    /// ```
    ///
    /// A second application is a no-op:
    ///
    /// ```
    /// # use patient_matching::Normalizer;
    /// let once = Normalizer::normalize_postcode("sw1a 2aa");
    /// let twice = Normalizer::normalize_postcode(&once);
    /// assert_eq!(once, twice);
    /// ```
    pub fn normalize_postcode(postcode: &str) -> String {
        let mut canonical = String::with_capacity(postcode.len());
        for ch in postcode.chars().filter(|ch| !ch.is_whitespace()) {
            // Uppercasing one character can produce several (`ß` becomes `SS`),
            // hence `extend` rather than `push`.
            canonical.extend(ch.to_uppercase());
        }
        canonical
    }

    /// Normalise a phone number for comparison.
    ///
    /// Steps:
    ///
    /// 1. Keep only ASCII digits (spaces, brackets, hyphens, `+`, … are dropped).
    /// 2. Remove a UK international prefix, if present:
    ///    - a leading `0044` that is followed by at least one more digit, or
    ///    - otherwise a leading `44`, but only when the digit string is at
    ///      least 12 digits long (shorter strings are not treated as carrying
    ///      a country code).
    /// 3. Remove one leading trunk `0`, unless it is the only digit left.
    ///
    /// Step 3 is applied whether or not step 2 removed anything, so the
    /// widespread `+44 (0)…` notation canonicalises to the same subscriber
    /// number as the `0…` and `+44 …` spellings.
    ///
    /// Only the UK prefix is recognised: a number carrying any other country
    /// code keeps it, and is otherwise subject to steps 1 and 3 only.
    ///
    /// Each prefix is removed at most once per call. The function is therefore
    /// idempotent for conventionally written numbers, but not for arbitrary
    /// digit strings (`"007"` needs two passes to reach `"7"`).
    ///
    /// Returns a `String` consisting solely of ASCII digits (possibly empty).
    ///
    /// # Examples
    ///
    /// ```
    /// use patient_matching::Normalizer;
    ///
    /// // UK mobile, in four formats:
    /// assert_eq!(Normalizer::normalize_phone("07700 900123"), "7700900123");
    /// assert_eq!(Normalizer::normalize_phone("+44 7700 900123"), "7700900123");
    /// assert_eq!(Normalizer::normalize_phone("0044 7700 900123"), "7700900123");
    /// assert_eq!(Normalizer::normalize_phone("+44 (0)7700 900123"), "7700900123");
    ///
    /// // UK landline with brackets and spaces:
    /// assert_eq!(Normalizer::normalize_phone("(029) 2034 5678"), "2920345678");
    ///
    /// // Empty input is preserved (no digits to keep):
    /// assert_eq!(Normalizer::normalize_phone(""), "");
    /// ```
    ///
    /// Idempotent on canonical inputs:
    ///
    /// ```
    /// # use patient_matching::Normalizer;
    /// let once = Normalizer::normalize_phone("+44 (0)7700 900123");
    /// let twice = Normalizer::normalize_phone(&once);
    /// assert_eq!(once, twice);
    /// ```
    pub fn normalize_phone(phone: &str) -> String {
        let digits: String = phone.chars().filter(char::is_ascii_digit).collect();

        // Step 2: peel off the UK international prefix. A bare "0044" is not
        // treated as a prefix (nothing would remain), and "44" only counts when
        // enough digits follow for it to plausibly be a country code.
        let national = digits
            .strip_prefix("0044")
            .filter(|rest| !rest.is_empty())
            .or_else(|| digits.strip_prefix("44").filter(|_| digits.len() >= 12))
            .unwrap_or(digits.as_str());

        // Step 3: drop a single trunk zero. This also covers the "(0)" that is
        // often written after "+44". A lone "0" is left untouched.
        match national.strip_prefix('0') {
            Some(subscriber) if !subscriber.is_empty() => subscriber.to_string(),
            _ => national.to_string(),
        }
    }

    /// Derive a phonetic (Soundex) key from a name.
    ///
    /// The name is first passed through [`Normalizer::normalize_name`]; the
    /// normalised text is then encoded with the American Soundex algorithm.
    /// Spellings that sound alike, such as "Smith" / "Smyth" or "Stephen" /
    /// "Steven", end up with the same key, which lets the matcher catch
    /// spelling variants.
    ///
    /// The encoding is intended for English-language names. Non-English
    /// phonemes may be lost (the locale-aware roadmap is spec task T-9).
    ///
    /// If the name normalises to the empty string, the empty string is
    /// returned and no Soundex encoding is attempted.
    ///
    /// # Examples
    ///
    /// Spelling variants collapse to one key:
    ///
    /// ```
    /// use patient_matching::Normalizer;
    /// assert_eq!(Normalizer::phonetic_code("Smith"), Normalizer::phonetic_code("Smyth"));
    /// assert_eq!(Normalizer::phonetic_code("Stephen"), Normalizer::phonetic_code("Steven"));
    /// ```
    ///
    /// Unrelated names get distinct keys:
    ///
    /// ```
    /// # use patient_matching::Normalizer;
    /// assert_ne!(Normalizer::phonetic_code("Jones"), Normalizer::phonetic_code("Smith"));
    /// ```
    ///
    /// Empty or blank input yields an empty string rather than a default
    /// Soundex value:
    ///
    /// ```
    /// # use patient_matching::Normalizer;
    /// assert_eq!(Normalizer::phonetic_code(""), "");
    /// assert_eq!(Normalizer::phonetic_code("   "), "");
    /// ```
    pub fn phonetic_code(name: &str) -> String {
        match Self::normalize_name(name) {
            // Nothing left to encode: short-circuit before calling Soundex.
            canonical if canonical.is_empty() => String::new(),
            canonical => soundex::american_soundex(&canonical),
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    // ===== normalize_name =====

    #[test]
    fn normalize_name_collapses_whitespace_and_trims() {
        let canonical = Normalizer::normalize_name("  John  Smith  ");
        assert_eq!(canonical, "john smith");
    }

    #[test]
    fn normalize_name_strips_ascii_punctuation() {
        let cases = [
            ("O'Brien", "obrien"),
            ("Mary-Jane", "maryjane"),
            ("Dr. Who", "dr who"),
        ];
        for (raw, expected) in cases {
            assert_eq!(Normalizer::normalize_name(raw), expected);
        }
    }

    #[test]
    fn normalize_name_strips_diacritics() {
        let cases = [
            ("José", "jose"),
            ("Siân", "sian"),
            // Everyday accented words.
            ("naïve", "naive"),
            ("crème", "creme"),
            // ŷ and ŵ decompose under NFKD; ł does not, so it survives (lowercased).
            ("Łŷŵ", "ływ"),
        ];
        for (raw, expected) in cases {
            assert_eq!(Normalizer::normalize_name(raw), expected);
        }
    }

    #[test]
    fn normalize_name_handles_empty_and_whitespace() {
        for blank in ["", "   ", "\t\n"] {
            assert_eq!(Normalizer::normalize_name(blank), "");
        }
    }

    #[test]
    fn normalize_name_lowercases() {
        for (raw, expected) in [("MARY", "mary"), ("McDONALD", "mcdonald")] {
            assert_eq!(Normalizer::normalize_name(raw), expected);
        }
    }

    #[test]
    fn normalize_name_is_idempotent() {
        let inputs = ["  John  Smith  ", "O'Brien-Jones", "JOSÉ MARÍA", "", "  ", "Siân"];
        for input in inputs {
            let first_pass = Normalizer::normalize_name(input);
            let second_pass = Normalizer::normalize_name(&first_pass);
            assert_eq!(first_pass, second_pass, "not idempotent for {input:?}");
        }
    }

    #[test]
    fn normalize_name_does_not_normalise_unicode_punctuation() {
        // Only ASCII punctuation is stripped, so the curly apostrophe (U+2019)
        // must survive. AGENTS/normalization.md lists this as a known limitation.
        let canonical = Normalizer::normalize_name("O\u{2019}Brien");
        assert!(canonical.contains('\u{2019}'));
    }

    // ===== normalize_postcode =====

    #[test]
    fn normalize_postcode_uppercases() {
        let canonical = Normalizer::normalize_postcode("cf10 1aa");
        assert_eq!(canonical, "CF101AA");
    }

    #[test]
    fn normalize_postcode_strips_all_whitespace() {
        for raw in ["CF10 1AA", " CF10  1AA ", "CF10\t1AA"] {
            assert_eq!(Normalizer::normalize_postcode(raw), "CF101AA");
        }
    }

    #[test]
    fn normalize_postcode_handles_empty() {
        for blank in ["", "   "] {
            assert_eq!(Normalizer::normalize_postcode(blank), "");
        }
    }

    #[test]
    fn normalize_postcode_is_idempotent() {
        let inputs = ["cf10 1aa", "SW1A 2AA", "  EH8 9YL  ", ""];
        for input in inputs {
            let first_pass = Normalizer::normalize_postcode(input);
            let second_pass = Normalizer::normalize_postcode(&first_pass);
            assert_eq!(first_pass, second_pass);
        }
    }

    // ===== normalize_phone =====

    #[test]
    fn normalize_phone_strips_uk_trunk_prefix() {
        let canonical = Normalizer::normalize_phone("07700 900123");
        assert_eq!(canonical, "7700900123");
    }

    #[test]
    fn normalize_phone_strips_plus_44_international() {
        let canonical = Normalizer::normalize_phone("+44 7700 900123");
        assert_eq!(canonical, "7700900123");
    }

    #[test]
    fn normalize_phone_strips_0044_international() {
        let canonical = Normalizer::normalize_phone("0044 7700 900123");
        assert_eq!(canonical, "7700900123");
    }

    #[test]
    fn normalize_phone_handles_brackets_and_spaces() {
        let canonical = Normalizer::normalize_phone("(029) 2034 5678");
        assert_eq!(canonical, "2920345678");
    }

    #[test]
    fn normalize_phone_handles_empty() {
        for digitless in ["", "---"] {
            assert_eq!(Normalizer::normalize_phone(digitless), "");
        }
    }

    #[test]
    fn normalize_phone_does_not_strip_44_if_too_short() {
        // With fewer than 10 digits after the "44", it is not treated as an
        // international prefix and must be kept.
        let canonical = Normalizer::normalize_phone("4412345");
        assert_eq!(canonical, "4412345");
    }

    #[test]
    fn normalize_phone_is_idempotent() {
        let inputs = [
            "07700 900123",
            "+44 7700 900123",
            "0044 7700 900123",
            "(029) 2034 5678",
            "",
        ];
        for input in inputs {
            let first_pass = Normalizer::normalize_phone(input);
            let second_pass = Normalizer::normalize_phone(&first_pass);
            assert_eq!(first_pass, second_pass, "not idempotent for {input:?}");
        }
    }

    #[test]
    fn normalize_phone_keeps_lone_zero() {
        // The trunk-zero rule requires more than one digit, so "0" is kept.
        let canonical = Normalizer::normalize_phone("0");
        assert_eq!(canonical, "0");
    }

    // ===== phonetic_code =====

    #[test]
    fn phonetic_code_groups_smith_and_smyth() {
        let smith = Normalizer::phonetic_code("Smith");
        let smyth = Normalizer::phonetic_code("Smyth");
        assert_eq!(smith, smyth);
    }

    #[test]
    fn phonetic_code_groups_stephen_and_steven() {
        let stephen = Normalizer::phonetic_code("Stephen");
        let steven = Normalizer::phonetic_code("Steven");
        assert_eq!(stephen, steven);
    }

    #[test]
    fn phonetic_code_distinguishes_different_families() {
        for (left, right) in [("Jones", "Smith"), ("Anderson", "Zimmerman")] {
            assert_ne!(
                Normalizer::phonetic_code(left),
                Normalizer::phonetic_code(right)
            );
        }
    }

    #[test]
    fn phonetic_code_specific_values() {
        // Regression net: these codes are pinned to what the soundex crate emits.
        let pinned = [
            ("Smith", "S530"),
            ("Smyth", "S530"),
            ("Jones", "J520"),
            ("Johnson", "J525"),
        ];
        for (name, code) in pinned {
            assert_eq!(Normalizer::phonetic_code(name), code);
        }
    }

    #[test]
    fn phonetic_code_handles_empty() {
        for blank in ["", "   "] {
            assert_eq!(Normalizer::phonetic_code(blank), "");
        }
    }

    #[test]
    fn phonetic_code_is_case_insensitive() {
        let upper = Normalizer::phonetic_code("SMITH");
        let lower = Normalizer::phonetic_code("smith");
        assert_eq!(upper, lower);
    }
}