amt-phonetic 1.0.0

//! Core encoding pipeline.
//!
//! Stage-by-stage:
//!   1. [`preprocess`] — NFKD normalize, uppercase, strip prefixes/affixes.
//!   2. [`class_sequence`] — walk chars, emit sonority classes with contextual rules.
//!   3. [`pack_spectral`] — Chebyshev accumulate, Gray-quantize, bit-pack.
//!   4. [`bloom_signature`] — skip-bigram Bloom filter.
//!   5. [`encode_token`] — orchestrate the above, emit multi-key variants.
//!
//! Performance notes:
//!   * All hot functions are `#[inline]`.
//!   * [`ClassVec`], [`SpectralVec`], [`BloomVec`] use `SmallVec` (default
//!     feature) so the common case (≤16 classes, ≤2 variants) avoids any
//!     heap allocation.
//!   * [`bloom_signature`] uses a fixed-size stack array to sidestep
//!     allocation entirely.
//!   * The Chebyshev inner loop compiles to a tight 4-way FMA sequence.

use unicode_normalization::char::canonical_combining_class;
use unicode_normalization::UnicodeNormalization;

use crate::chebyshev;
use crate::sonority::{self, Class};

// ---------------------------------------------------------------------------
// Type aliases — smallvec optional
// ---------------------------------------------------------------------------

/// Sequence of sonority classes; inline-stored when ≤ 16 entries.
#[cfg(feature = "smallvec")]
pub type ClassVec = smallvec::SmallVec<[u8; 16]>;
/// Spectral keys for a single token; inline-stored when ≤ 2 entries.
#[cfg(feature = "smallvec")]
pub type SpectralVec = smallvec::SmallVec<[u32; 2]>;
/// Bloom signatures for a single token; inline-stored when ≤ 2 entries.
#[cfg(feature = "smallvec")]
pub type BloomVec = smallvec::SmallVec<[u64; 2]>;

/// Sequence of sonority classes (heap-allocated when `smallvec` feature is off).
#[cfg(not(feature = "smallvec"))]
pub type ClassVec = Vec<u8>;
/// Spectral keys for a single token (heap-allocated when `smallvec` feature is off).
#[cfg(not(feature = "smallvec"))]
pub type SpectralVec = Vec<u32>;
/// Bloom signatures for a single token (heap-allocated when `smallvec` feature is off).
#[cfg(not(feature = "smallvec"))]
pub type BloomVec = Vec<u64>;

// ---------------------------------------------------------------------------
// Public type
// ---------------------------------------------------------------------------

/// A fully-encoded token.
///
/// Invariant: `spectrals.len() == blooms.len()` and equals either 1 or 2
/// (2 only when the token contains ambiguous phonemes like `G`/`ج`).
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Code {
    /// 32-bit packed spectral keys. First key is always the primary variant.
    pub spectrals: SpectralVec,
    /// 64-bit Bloom signatures, index-aligned with `spectrals`.
    pub blooms: BloomVec,
    /// The class sequence from the primary variant (for debugging / re-analysis).
    pub classes: ClassVec,
}

impl Code {
    /// True if any spectral key is shared with `other`.
    #[inline]
    #[must_use]
    pub fn matches(&self, other: &Code) -> bool {
        for &a in &self.spectrals {
            for &b in &other.spectrals {
                if a == b {
                    return true;
                }
            }
        }
        false
    }
}

// ---------------------------------------------------------------------------
// Stage 1: Preprocessing
// ---------------------------------------------------------------------------

/// NFKD-normalize, strip combining marks, uppercase, remove hyphens/apostrophes,
/// strip prefixes, strip silent trailing letters.
#[must_use]
pub fn preprocess(token: &str) -> String {
    // Unicode normalization: NFKD splits composed characters into base + combining.
    // Filtering by canonical_combining_class == 0 keeps only base characters.
    // Then uppercase (handles Latin case; Arabic is caseless).
    let mut s: String = token
        .nfkd()
        .filter(|c| canonical_combining_class(*c) == 0)
        .flat_map(char::to_uppercase)
        .collect();

    // Strip hyphens, apostrophes, right single quotation mark.
    s.retain(|c| c != '-' && c != '\'' && c != '\u{2019}');

    // Strip Arabic definite article "ال" — possibly multiple compounds.
    loop {
        let consumed_end = {
            let mut it = s.char_indices();
            let alif = it.next();
            let lam = it.next();
            let first_real = it.next();
            match (alif, lam, first_real) {
                (Some((_, 'ا')), Some((_, 'ل')), Some((idx, ch)))
                    if sonority::class_of(ch) > 0 && it.next().is_some() =>
                {
                    idx
                }
                _ => break,
            }
        };
        s.drain(..consumed_end);
    }

    // Strip Latin article-style prefixes (AL, EL, UL, AS, ES) — but only
    // when the next character is a real consonant (not a vowel/class ≥ 7).
    for &px in sonority::LATIN_PREFIXES {
        if s.len() > px.len() + 1 && s.starts_with(px) {
            if let Some(next_ch) = s[px.len()..].chars().next() {
                let cls = sonority::class_of(next_ch);
                if (1..7).contains(&cls) {
                    s.drain(..px.len());
                    break;
                }
            }
        }
    }

    // Strip silent word-final letters (H / ه / ة).
    // Walk from the end byte-by-byte, using char_indices to get valid boundaries.
    loop {
        let last = s.char_indices().last();
        match last {
            Some((idx, c)) if sonority::is_silent_trailing(c) => {
                // Require at least ~2 remaining chars
                let remaining = s[..idx].chars().count();
                if remaining < 2 {
                    break;
                }
                s.truncate(idx);
            }
            _ => break,
        }
    }

    s
}

// ---------------------------------------------------------------------------
// Stage 2: Class sequence construction
// ---------------------------------------------------------------------------

/// Walk the preprocessed string, emitting one sonority class per phoneme.
///
/// `g_class` controls the resolution of ambiguous `G` / `ج`:
///   - `1` → treat as /g/ (Egyptian Arabic, Germanic)
///   - `6` → treat as /dʒ/ (Standard Arabic, French)
#[must_use]
pub fn class_sequence(s: &str, g_class: Class) -> ClassVec {
    // Collect chars onto the stack (SmallVec spills to heap only for very
    // long names, which are rare).
    #[cfg(feature = "smallvec")]
    let chars: smallvec::SmallVec<[char; 24]> = s.chars().collect();
    #[cfg(not(feature = "smallvec"))]
    let chars: Vec<char> = s.chars().collect();

    let n = chars.len();
    let mut out: ClassVec = ClassVec::new();
    let mut i = 0usize;
    let mut prev = '\0';

    while i < n {
        let ch = chars[i];

        // True gemination: KK → K (same literal char twice in a row)
        if ch == prev && prev != '\0' {
            i += 1;
            continue;
        }

        // Arabic matres lectionis: glide word-initial, vowel medially
        if sonority::is_arabic_matres(ch) {
            let cls = if i == 0 { 6 } else { 7 };
            out.push(cls);
            prev = ch;
            i += 1;
            continue;
        }

        // Latin Y: glide only at word-initial before a vowel, else high vowel
        if ch == 'Y' {
            let at_start = i == 0;
            let next_is_vowel = (i + 1 < n) && sonority::class_of(chars[i + 1]) >= 7;
            let cls = if at_start && next_is_vowel { 6 } else { 7 };
            out.push(cls);
            prev = ch;
            i += 1;
            continue;
        }

        // Ambiguous G / ج — class decided by caller
        if sonority::is_g_ambiguous(ch) {
            out.push(g_class);
            prev = ch;
            i += 1;
            continue;
        }

        // Digraphs (KH, SH, TH, ...)
        if i + 1 < n {
            if let Some(cls) = sonority::digraph_class(ch, chars[i + 1]) {
                out.push(cls);
                prev = chars[i + 1];
                i += 2;
                continue;
            }
        }

        // Plain character
        let cls = sonority::class_of(ch);
        if cls > 0 {
            // Adjacent vowel collapse: AE → max(A, E)
            if let Some(&last) = out.last() {
                if cls >= 7 && last >= 7 {
                    if cls > last {
                        *out.last_mut().unwrap() = cls;
                    }
                } else {
                    out.push(cls);
                }
            } else {
                out.push(cls);
            }
        }
        prev = ch;
        i += 1;
    }

    out
}

// ---------------------------------------------------------------------------
// Stage 3: Spectral packing
// ---------------------------------------------------------------------------

const CHEB_RANGES: [(f32, f32); 4] = [
    (1.0, 6.0),  // a_0: mean class
    (-3.0, 3.0), // a_1: linear trend
    (-2.0, 2.0), // a_2: quadratic shape
    (-1.5, 1.5), // a_3: cubic wobble
];

const CHEB_BITS: [u32; 4] = [5, 7, 6, 6];
const LENGTH_SHIFT: u32 = 29;

/// Compute 4 Chebyshev coefficients and Gray-code-quantize into a 32-bit key.
///
/// The top 3 bits are the length bucket (log-bucketed), enabling a single-
/// comparison length prefilter during retrieval. The remaining 24 bits are
/// the quantized coefficients, with a trailing 5-bit reserved pad.
#[inline]
#[must_use]
pub fn pack_spectral(consonants: &[u8]) -> u32 {
    let n = consonants.len();
    if n == 0 {
        return 0;
    }

    let n_eff = n.min(chebyshev::MAX_LEN);
    let table = chebyshev::table();

    let mut a0 = 0f32;
    let mut a1 = 0f32;
    let mut a2 = 0f32;
    let mut a3 = 0f32;

    for (i, &c_i) in consonants.iter().take(n_eff).enumerate() {
        let c = c_i as f32;
        let row = table.row(n_eff, i);
        // row is a [f32; K] slice; indexing is bounds-checked but the compiler
        // can elide the checks since K is const and we access 0..K.
        a0 += c * row[0];
        a1 += c * row[1];
        a2 += c * row[2];
        a3 += c * row[3];
    }

    let inv_n = 2.0 / (n_eff as f32);
    a0 *= inv_n * 0.5; // standard a_0 convention
    a1 *= inv_n;
    a2 *= inv_n;
    a3 *= inv_n;

    let mut packed = (chebyshev::length_bucket(n) as u32) << LENGTH_SHIFT;
    let mut shift = LENGTH_SHIFT;
    let coeffs = [a0, a1, a2, a3];
    for k in 0..4 {
        let (lo, hi) = CHEB_RANGES[k];
        let b = CHEB_BITS[k];
        shift -= b;
        let span = hi - lo;
        let u = ((coeffs[k] - lo) / span).clamp(0.0, 1.0);
        let hi_val = (1u32 << b) - 1;
        let v = (u * (hi_val as f32) + 0.5) as u32;
        let v = v.min(hi_val);
        let gray = v ^ (v >> 1); // Gray code
        packed |= gray << shift;
    }

    packed
}

// ---------------------------------------------------------------------------
// Stage 4: Bloom signature
// ---------------------------------------------------------------------------

/// 64-bit skip-bigram Bloom over consonants.
///
/// Hashes each (c_i, c_{i+d}) pair for d ∈ {1, 2} to a bit position.
/// Provides edit-distance tolerance beyond what moment-based fingerprints
/// can capture.
#[inline]
#[must_use]
pub fn bloom_signature(classes: &[u8]) -> u64 {
    // Filter to consonants on the stack — no allocation.
    let mut cons = [0u8; 32];
    let mut n = 0usize;
    for &c in classes {
        if c < 7 && n < cons.len() {
            cons[n] = c;
            n += 1;
        }
    }
    if n < 2 {
        return 0;
    }

    let mut sig: u64 = 0;
    for i in 0..n {
        let c_i = cons[i] as u64;
        if i + 1 < n {
            let c_j = cons[i + 1] as u64;
            let key = (c_i
                .wrapping_mul(11)
                .wrapping_add(c_j.wrapping_mul(97))
                .wrapping_add(17))
                & 63;
            sig |= 1u64 << key;
        }
        if i + 2 < n {
            let c_j = cons[i + 2] as u64;
            let key = (c_i
                .wrapping_mul(11)
                .wrapping_add(c_j.wrapping_mul(97))
                .wrapping_add(34))
                & 63;
            sig |= 1u64 << key;
        }
    }

    sig
}

// ---------------------------------------------------------------------------
// Stage 5: Encode
// ---------------------------------------------------------------------------

#[inline]
fn filter_consonants(seq: &[u8]) -> ClassVec {
    let mut out: ClassVec = ClassVec::with_capacity(seq.len());
    for &c in seq {
        if c < 7 {
            out.push(c);
        }
    }
    out
}

/// Encode a single token into an AMT [`Code`].
///
/// Whitespace and punctuation in the input are treated as invisible —
/// callers wanting multi-token handling should use [`encode`].
#[must_use]
pub fn encode_token(token: &str) -> Code {
    let s = preprocess(token);
    let has_g = s.chars().any(sonority::is_g_ambiguous);

    let seq1 = class_sequence(&s, 1);
    let primary_classes = seq1.clone();

    let cons1 = filter_consonants(&seq1);
    let sp1 = pack_spectral(&cons1);
    let bl1 = bloom_signature(&seq1);

    let mut spectrals: SpectralVec = SpectralVec::new();
    let mut blooms: BloomVec = BloomVec::new();
    spectrals.push(sp1);
    blooms.push(bl1);

    if has_g {
        let seq2 = class_sequence(&s, 6);
        let cons2 = filter_consonants(&seq2);
        let sp2 = pack_spectral(&cons2);
        if sp2 != sp1 {
            let bl2 = bloom_signature(&seq2);
            spectrals.push(sp2);
            blooms.push(bl2);
        }
    }

    Code {
        spectrals,
        blooms,
        classes: primary_classes,
    }
}

/// Encode a multi-token name into a list of per-token codes.
///
/// Splits on ASCII whitespace.
#[must_use]
pub fn encode(name: &str) -> Vec<Code> {
    name.split_whitespace()
        .filter(|t| !t.is_empty())
        .map(encode_token)
        .collect()
}

/// Encode many names in a single call.
#[must_use]
pub fn encode_batch<I, S>(names: I) -> Vec<Vec<Code>>
where
    I: IntoIterator<Item = S>,
    S: AsRef<str>,
{
    names.into_iter().map(|s| encode(s.as_ref())).collect()
}