amt-phonetic 1.0.0

Articulatory Moment Transform — language-agnostic phonetic name matching
Documentation
//! Distance and similarity over AMT codes.

use crate::core::{encode, Code};

/// Weight on the spectral channel vs the Bloom channel in [`token_distance`].
///
/// [`token_distance`] blends its two per-variant terms as
/// `ALPHA * hamming + (1.0 - ALPHA) * jaccard`, so the Bloom channel
/// receives the remaining `0.35` of the weight.
///
/// The spectral key has 32 bits of fine-grained moment information; the
/// Bloom has ~20 effective bits of coarser co-occurrence information. The
/// 0.65 weight reflects this ratio empirically.
///
/// NOTE(review): the raw bit ratio `32 / (32 + 20)` is roughly 0.62, so 0.65
/// looks like a tuned value rather than a derived one — confirm before
/// changing either channel's width.
const ALPHA: f32 = 0.65;

/// Smallest blended distance between any variant of `a` and any variant of
/// `b`, in `[0, 1]`.
///
/// Each code contributes variants formed by pairing `spectrals` with
/// `blooms` position by position; entries without a partner are ignored.
/// Every variant of `a` is compared against every variant of `b`, because
/// for multi-key tokens any variant may align with any other.
///
/// For one pair of variants the distance is
/// `ALPHA * hamming + (1.0 - ALPHA) * jaccard`, where:
///
/// * `hamming` is the fraction of the 32 spectral-key bits that differ;
/// * `jaccard` is `1 - |A ∩ B| / |A ∪ B|` over the set bits of the two
///   Blooms, taken to be `0.0` when both Blooms are empty.
///
/// The top three bits of a spectral key serve as a length bucket. Pairs
/// whose buckets differ by more than one are skipped before any popcount
/// is computed.
///
/// Returns `1.0` when no pair survives the prefilter, including when either
/// code has no variants at all.
#[inline]
#[must_use]
pub fn token_distance(a: &Code, b: &Code) -> f32 {
    a.spectrals
        .iter()
        .zip(a.blooms.iter())
        .flat_map(|(&key_a, &bloom_a)| {
            let bucket_a = key_a >> 29;
            b.spectrals
                .iter()
                .zip(b.blooms.iter())
                .filter_map(move |(&key_b, &bloom_b)| {
                    // Length prefilter: only same or adjacent buckets are scored.
                    let bucket_b = key_b >> 29;
                    if bucket_a.abs_diff(bucket_b) > 1 {
                        return None;
                    }

                    // Spectral channel: normalised Hamming distance.
                    let hamming = (key_a ^ key_b).count_ones() as f32 / 32.0;

                    // Bloom channel: Jaccard distance over set bits.
                    let shared = (bloom_a & bloom_b).count_ones() as f32;
                    let combined = (bloom_a | bloom_b).count_ones() as f32;
                    let jaccard = if combined > 0.0 {
                        1.0 - shared / combined
                    } else {
                        0.0
                    };

                    Some(ALPHA * hamming + (1.0 - ALPHA) * jaccard)
                })
        })
        // Start from the worst possible distance and keep strict improvements.
        .fold(1.0f32, |lowest, d| if d < lowest { d } else { lowest })
}

/// Graded similarity between two full names, in `[0, 1]`.
///
/// Both names are passed through [`encode`]; if either yields no tokens the
/// result is `0.0`.
///
/// Matching is greedy. Query tokens are visited in order, and each one claims
/// the closest candidate token still unclaimed, as measured by
/// [`token_distance`]. A candidate is claimed only when its distance is
/// strictly below `1.0`; on ties the earliest one in the current pool wins.
/// A claimed candidate is removed so it cannot be matched twice.
///
/// Each query token contributes `1.0 - distance`, or `0.0` when nothing was
/// claimed. The sum is divided by the number of *query* tokens, which keeps
/// the score autocomplete-friendly: extra candidate tokens don't reduce it.
#[must_use]
pub fn similarity(query: &str, candidate: &str) -> f32 {
    let query_tokens = encode(query);
    let mut pool: Vec<Code> = encode(candidate);
    if query_tokens.is_empty() || pool.is_empty() {
        return 0.0;
    }

    let mut score_sum = 0.0f32;
    for token in &query_tokens {
        // Scan the unclaimed candidates, tracking the index and distance of
        // the first strict improvement over the worst-case distance of 1.0.
        let (slot, dist) = pool.iter().enumerate().fold(
            (None, 1.0f32),
            |(slot, lowest), (idx, cand)| {
                let d = token_distance(token, cand);
                if d < lowest {
                    (Some(idx), d)
                } else {
                    (slot, lowest)
                }
            },
        );

        score_sum += 1.0 - dist;

        // Order of the pool is irrelevant to correctness of the removal, so
        // the O(1) swap_remove is used.
        if let Some(idx) = slot {
            pool.swap_remove(idx);
        }
    }

    score_sum / query_tokens.len() as f32
}

/// Fast boolean match test — true if any spectral key is shared.
///
/// Both inputs are passed through [`encode`]. The result is `true` as soon
/// as some spectral key of some token of `a` equals some spectral key of some
/// token of `b`. If either input encodes to no tokens the result is `false`.
///
/// Only integer equality on spectral keys is involved — no Bloom comparison
/// and no distance arithmetic — so this is a cheaper check than
/// [`similarity`]. Use it as a prefilter or for exact-bucket queries.
#[must_use]
pub fn matches(a: &str, b: &str) -> bool {
    let left = encode(a);
    let right = encode(b);

    left.iter()
        .flat_map(|code| code.spectrals.iter())
        .any(|key| {
            right
                .iter()
                .flat_map(|code| code.spectrals.iter())
                .any(|other| other == key)
        })
}