// amt-phonetic 1.0.0
//
// Articulatory Moment Transform — language-agnostic phonetic name matching
// Documentation
//! Universal sonority alphabet.
//!
//! Maps a Unicode `char` to a sonority class in `[1, 8]`, or `0` for
//! unrecognized characters. ASCII characters are resolved through a 128-byte
//! lookup table (a single array index; only uppercase letters have non-zero
//! entries). Arabic and other characters fall through to a `match`
//! expression, which `rustc` is free to lower to a jump table.
//!
//! This is the only piece of linguistic knowledge in the algorithm; all
//! other modules are pure math or plumbing.

/// Sonority class of a character.
///
/// `0` means "unrecognized". Non-zero values are the class, in `[1, 8]`.
/// This is a plain alias for `u8`, so the range is a convention of this
/// module and is not enforced by the type.
pub type Class = u8;

/// ASCII-uppercase lookup table. Indexed directly by byte value.
///
/// For input characters outside `[0x20, 0x7E]` ASCII, use [`class_of`] which
/// handles lowercase ASCII, Arabic, and all other cases.
#[rustfmt::skip]
pub const ASCII_CLASS: [Class; 128] = {
    let mut t = [0u8; 128];
    // Latin uppercase class assignments
    // class 1: K Q C G X
    t[b'K' as usize] = 1; t[b'Q' as usize] = 1; t[b'C' as usize] = 1;
    t[b'G' as usize] = 1; t[b'X' as usize] = 1;
    // class 2: P B T D
    t[b'P' as usize] = 2; t[b'B' as usize] = 2; t[b'T' as usize] = 2;
    t[b'D' as usize] = 2;
    // class 3: F V S Z H
    t[b'F' as usize] = 3; t[b'V' as usize] = 3; t[b'S' as usize] = 3;
    t[b'Z' as usize] = 3; t[b'H' as usize] = 3;
    // class 4: M N
    t[b'M' as usize] = 4; t[b'N' as usize] = 4;
    // class 5: L R
    t[b'L' as usize] = 5; t[b'R' as usize] = 5;
    // class 6: W Y J
    t[b'W' as usize] = 6; t[b'Y' as usize] = 6; t[b'J' as usize] = 6;
    // class 7: I U
    t[b'I' as usize] = 7; t[b'U' as usize] = 7;
    // class 8: A E O
    t[b'A' as usize] = 8; t[b'E' as usize] = 8; t[b'O' as usize] = 8;
    t
};

/// Looks up the sonority class of `c`, yielding `0` when the character is
/// not part of the alphabet.
///
/// ASCII input is resolved with a single index into [`ASCII_CLASS`]. Only
/// uppercase Latin letters have entries there, so callers are expected to
/// uppercase before calling; a lowercase ASCII letter yields `0`. Any other
/// character is matched against the Arabic letters listed below.
///
/// The matres lectionis `و` / `ي` are deliberately absent from the match and
/// therefore yield `0` here; see [`is_arabic_matres`].
#[inline]
#[must_use]
pub fn class_of(c: char) -> Class {
    if c.is_ascii() {
        // `is_ascii` guarantees a code point below 128, so the index is
        // always within the 128-entry table.
        ASCII_CLASS[c as usize]
    } else {
        // Arabic (and any other scripts added later).
        match c {
            // Class 1 — back/dorsal stops.
            'ك' | 'ق' | 'غ' | 'خ' => 1,
            // Class 2 — front stops.
            'ت' | 'د' | 'ط' | 'ض' | 'ث' | 'ذ' | 'ب' => 2,
            // Class 3 — fricatives, including ta-marbuta ة, which is
            // usually silent word-finally.
            'ف' | 'س' | 'ش' | 'ص' | 'ز' | 'ظ' | 'ح' | 'ه' | 'ة' => 3,
            // Class 4 — nasals.
            'م' | 'ن' => 4,
            // Class 5 — liquids.
            'ل' | 'ر' => 5,
            // Class 6 — affricates. (و / ي are assigned contextually, not here.)
            'ج' => 6,
            // Class 8 — low/mid vowels; ع and ء are treated as vowel-like
            // for matching purposes.
            'ا' | 'ى' | 'آ' | 'أ' | 'إ' | 'ع' | 'ء' => 8,
            // Anything else is unrecognized.
            _ => 0,
        }
    }
}

/// Reports whether `c` is one of the Arabic matres lectionis `و` or `ي`.
///
/// These two letters receive their class from context rather than from a
/// fixed table entry: class 6 (glide) word-initially, class 7 (high vowel)
/// medially. This function only identifies the letters; it does not look at
/// their position.
#[inline]
#[must_use]
pub fn is_arabic_matres(c: char) -> bool {
    c == 'و' || c == 'ي'
}

/// Reports whether `c` is a letter whose pronunciation differs between
/// dialects, which triggers multi-key generation.
///
/// `G` / `ج` can be /g/ (Egyptian Arabic, Germanic) or /dʒ/ (Standard
/// Arabic); both variants are emitted as separate spectral keys. Only the
/// uppercase Latin `G` is recognized — lowercase `g` returns `false`.
#[inline]
#[must_use]
pub fn is_g_ambiguous(c: char) -> bool {
    ['G', 'ج'].contains(&c)
}

/// Classifies a two-letter Latin digraph, to be consulted before the
/// single-character class assignment.
///
/// Returns `Some(class)` when `a` followed by `b` forms a recognized
/// digraph, and `None` otherwise:
///
/// * class 1 — `KH`, `GH`, `CK`, `QU`
/// * class 3 — `CH`, `SH`, `TH`, `PH`, `DH`, `ZH`
///
/// Only uppercase ASCII pairs are recognized. Every pattern below is an
/// ASCII letter, so any pair containing a non-ASCII or lowercase character
/// falls through to `None` without a separate range check.
#[inline]
#[must_use]
pub fn digraph_class(a: char, b: char) -> Option<Class> {
    let class = match (a, b) {
        // Digraphs whose second letter is `H`.
        (first, 'H') => match first {
            'K' | 'G' => 1,
            'C' | 'S' | 'T' | 'P' | 'D' | 'Z' => 3,
            _ => return None,
        },
        ('C', 'K') | ('Q', 'U') => 1,
        _ => return None,
    };
    Some(class)
}

/// Latin definite-article-style prefixes.
///
/// Each entry is exactly two uppercase ASCII letters, matching the
/// uppercase-only convention used by the class tables in this module.
// NOTE(review): presumably these are stripped from the start of a name
// before matching — confirm at the call site; the usage is not visible here.
pub const LATIN_PREFIXES: &[&str] = &["AL", "EL", "UL", "AS", "ES"];

/// Arabic definite article, as two chars: alef (`ا`) followed by lam (`ل`).
///
/// The array is in logical (reading) order, so index 0 holds the first
/// letter of the article.
pub const ARABIC_PREFIX: [char; 2] = ['ا', 'ل'];

/// Reports whether `c` is a letter that is silent at the end of a word and
/// is therefore always dropped there.
///
/// The recognized letters are Latin `H`, Arabic `ه` and ta-marbuta `ة`.
/// This function only tests membership; checking that the character is
/// actually word-final is the caller's job. Lowercase `h` is not recognized.
#[inline]
#[must_use]
pub fn is_silent_trailing(c: char) -> bool {
    const SILENT_FINALS: [char; 3] = ['H', 'ه', 'ة'];
    SILENT_FINALS.contains(&c)
}