amt-phonetic 1.0.0

Articulatory Moment Transform — language-agnostic phonetic name matching
Documentation
//! Precomputed Chebyshev polynomial values at evenly spaced midpoint nodes.
//!
//! For a class sequence of length `n ∈ [1, MAX_LEN]`, position `i ∈ [0, n)`,
//! and polynomial order `k ∈ [0, K)`, we store `T_k(x_i)` where
//! `x_i = 2(i + 0.5)/n − 1`.
//!
//! The table is lazily built once per process via `OnceLock`. Building it
//! costs ~0.1 ms (300 positions × `K` values, a few thousand `f32` ops);
//! after that, every encoding is pure lookup.
//!
//! Memory footprint: `MAX_LEN(MAX_LEN+1)/2 × K` floats ≈ 4.8 KB.

use std::sync::OnceLock;

/// Maximum length of consonant sequence that uses precomputed values.
/// Longer sequences are truncated (names that long are extremely rare).
///
/// The table holds rows for every length `1..=MAX_LEN`. The truncation itself
/// is not performed by `ChebTable`: `ChebTable::row` only debug-asserts
/// `n <= MAX_LEN`, so callers must clamp before looking a row up.
pub(crate) const MAX_LEN: usize = 24;

/// Number of Chebyshev coefficients retained per encoding.
///
/// Also the width of one table row: each `(n, i)` position stores the `K`
/// values `T_0(x_i)` through `T_{K-1}(x_i)` next to each other.
pub(crate) const K: usize = 4;

/// Flat table indexed by `(n, i, k)`; rows are read via [`ChebTable::row`].
///
/// Layout: for length `n`, the `n` rows of `K` floats are stored contiguously
/// in `data`, starting at index `offsets[n]`. This gives cache-friendly access
/// during the inner accumulation loop.
pub(crate) struct ChebTable {
    /// Every row back to back, grouped by sequence length `n = 1..=MAX_LEN`.
    data: Vec<f32>,
    /// `offsets[n]` is the index in `data` of row `(n, 0)`. Slot 0 has no
    /// rows of its own and keeps its initial value of 0.
    offsets: [usize; MAX_LEN + 1],
}

impl ChebTable {
    /// Computes every row of the table for sequence lengths `1..=MAX_LEN`.
    ///
    /// Rows are appended length by length, so `offsets[n]` is simply the
    /// number of floats emitted before the rows of length `n` begin.
    fn build() -> Self {
        // K floats per position, and 1 + 2 + … + MAX_LEN positions overall.
        let capacity = K * MAX_LEN * (MAX_LEN + 1) / 2;
        let mut values: Vec<f32> = Vec::with_capacity(capacity);
        let mut starts = [0usize; MAX_LEN + 1];

        // Slot 0 is skipped (no rows exist for length 0) and keeps its initial 0.
        for (len, start) in starts.iter_mut().enumerate().skip(1) {
            *start = values.len();
            let len_f = len as f32;

            for pos in 0..len {
                // x_i = 2(i + 0.5)/n − 1: midpoint of the i-th of n equal cells on [-1, 1].
                let x = (2.0 * (pos as f32) + 1.0) / len_f - 1.0;

                // Seeds of the recurrence: T_0(x) = 1 and T_1(x) = x.
                let mut lower = 1.0f32;
                let mut upper = x;
                values.push(lower);
                if K > 1 {
                    values.push(upper);
                }

                // Remaining orders via T_{k+1}(x) = 2x · T_k(x) − T_{k−1}(x).
                values.extend((2..K).map(|_| {
                    let next = 2.0 * x * upper - lower;
                    lower = upper;
                    upper = next;
                    next
                }));
            }
        }

        Self {
            data: values,
            offsets: starts,
        }
    }

    /// Borrows the `K` stored values `[T_0(x_i), ..., T_{K-1}(x_i)]` for
    /// position `i` of a sequence of length `n`.
    ///
    /// Requires `1 <= n <= MAX_LEN` and `i < n`. Both conditions are checked
    /// with debug assertions only; in release builds an out-of-range argument
    /// either panics on indexing or yields the row of a different `(n, i)`.
    #[inline]
    pub(crate) fn row(&self, n: usize, i: usize) -> &[f32] {
        debug_assert!(n > 0 && n <= MAX_LEN);
        debug_assert!(i < n);
        let start = self.offsets[n] + i * K;
        let end = start + K;
        &self.data[start..end]
    }
}

/// Process-wide storage for the table; filled in by [`table`].
static CHEB_TABLE: OnceLock<ChebTable> = OnceLock::new();

/// Returns the shared [`ChebTable`], building it on first use.
///
/// `OnceLock::get_or_init` stores the result of `ChebTable::build` the first
/// time it is needed; every later call returns the same `'static` reference
/// without rebuilding.
#[inline]
pub(crate) fn table() -> &'static ChebTable {
    CHEB_TABLE.get_or_init(ChebTable::build)
}

/// Bucket id for each sequence length `0..=13`, indexed by the length itself.
///
/// Ids grow with length while the buckets get wider (roughly logarithmic),
/// which is meant to absorb ±1 consonant variation in the middle range where
/// most Arabic names cluster. All ids fit in 3 bits.
#[rustfmt::skip]
pub(crate) const LENGTH_BUCKETS: [u8; 14] = [
    0, 1, 2, 3, 4, 4, 5, 5, 5, 6, 6, 6, 6, 6,
];

/// Maps a sequence length `n` to its 3-bit bucket id.
///
/// Lengths covered by [`LENGTH_BUCKETS`] use the table entry; anything longer
/// falls into the catch-all bucket `7`.
#[inline]
pub(crate) fn length_bucket(n: usize) -> u8 {
    LENGTH_BUCKETS.get(n).copied().unwrap_or(7)
}