amt-phonetic 1.0.0

Articulatory Moment Transform — language-agnostic phonetic name matching
Documentation
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
//! Core encoding pipeline.
//!
//! Stage-by-stage:
//!   1. [`preprocess`] — NFKD normalize, uppercase, strip prefixes/affixes.
//!   2. [`class_sequence`] — walk chars, emit sonority classes with contextual rules.
//!   3. [`pack_spectral`] — Chebyshev accumulate, Gray-quantize, bit-pack.
//!   4. [`bloom_signature`] — skip-bigram Bloom filter.
//!   5. [`encode_token`] — orchestrate the above, emit multi-key variants.
//!
//! Performance notes:
//!   * All hot functions are `#[inline]`.
//!   * [`ClassVec`], [`SpectralVec`], [`BloomVec`] use `SmallVec` (default
//!     feature) so the common case (≤16 classes, ≤2 variants) avoids any
//!     heap allocation.
//!   * [`bloom_signature`] uses a fixed-size stack array to sidestep
//!     allocation entirely.
//!   * The Chebyshev inner loop compiles to a tight 4-way FMA sequence.

use unicode_normalization::char::canonical_combining_class;
use unicode_normalization::UnicodeNormalization;

use crate::chebyshev;
use crate::sonority::{self, Class};

// ---------------------------------------------------------------------------
// Type aliases — smallvec optional
// ---------------------------------------------------------------------------

/// Sequence of sonority classes; inline-stored when ≤ 16 entries.
#[cfg(feature = "smallvec")]
pub type ClassVec = smallvec::SmallVec<[u8; 16]>;
/// Spectral keys for a single token; inline-stored when ≤ 2 entries.
#[cfg(feature = "smallvec")]
pub type SpectralVec = smallvec::SmallVec<[u32; 2]>;
/// Bloom signatures for a single token; inline-stored when ≤ 2 entries.
#[cfg(feature = "smallvec")]
pub type BloomVec = smallvec::SmallVec<[u64; 2]>;

/// Sequence of sonority classes (heap-allocated when `smallvec` feature is off).
#[cfg(not(feature = "smallvec"))]
pub type ClassVec = Vec<u8>;
/// Spectral keys for a single token (heap-allocated when `smallvec` feature is off).
#[cfg(not(feature = "smallvec"))]
pub type SpectralVec = Vec<u32>;
/// Bloom signatures for a single token (heap-allocated when `smallvec` feature is off).
#[cfg(not(feature = "smallvec"))]
pub type BloomVec = Vec<u64>;

// ---------------------------------------------------------------------------
// Public type
// ---------------------------------------------------------------------------

/// A fully-encoded token.
///
/// Invariant: `spectrals.len() == blooms.len()` and equals either 1 or 2
/// (2 only when the token contains ambiguous phonemes like `G`/`ج`).
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Code {
    /// 32-bit packed spectral keys. First key is always the primary variant.
    pub spectrals: SpectralVec,
    /// 64-bit Bloom signatures, index-aligned with `spectrals`.
    pub blooms: BloomVec,
    /// The class sequence from the primary variant (for debugging / re-analysis).
    pub classes: ClassVec,
}

impl Code {
    /// True if any spectral key is shared with `other`.
    #[inline]
    #[must_use]
    pub fn matches(&self, other: &Code) -> bool {
        for &a in &self.spectrals {
            for &b in &other.spectrals {
                if a == b {
                    return true;
                }
            }
        }
        false
    }
}

// ---------------------------------------------------------------------------
// Stage 1: Preprocessing
// ---------------------------------------------------------------------------

/// NFKD-normalize, strip combining marks, uppercase, remove hyphens/apostrophes,
/// strip prefixes, strip silent trailing letters.
#[must_use]
pub fn preprocess(token: &str) -> String {
    // Unicode normalization: NFKD splits composed characters into base + combining.
    // Filtering by canonical_combining_class == 0 keeps only base characters.
    // Then uppercase (handles Latin case; Arabic is caseless).
    let mut s: String = token
        .nfkd()
        .filter(|c| canonical_combining_class(*c) == 0)
        .flat_map(char::to_uppercase)
        .collect();

    // Strip hyphens, apostrophes, right single quotation mark.
    s.retain(|c| c != '-' && c != '\'' && c != '\u{2019}');

    // Strip Arabic definite article "ال" — possibly multiple compounds.
    loop {
        let consumed_end = {
            let mut it = s.char_indices();
            let alif = it.next();
            let lam = it.next();
            let first_real = it.next();
            match (alif, lam, first_real) {
                (Some((_, 'ا')), Some((_, 'ل')), Some((idx, ch)))
                    if sonority::class_of(ch) > 0 && it.next().is_some() =>
                {
                    idx
                }
                _ => break,
            }
        };
        s.drain(..consumed_end);
    }

    // Strip Latin article-style prefixes (AL, EL, UL, AS, ES) — but only
    // when the next character is a real consonant (not a vowel/class ≥ 7).
    for &px in sonority::LATIN_PREFIXES {
        if s.len() > px.len() + 1 && s.starts_with(px) {
            if let Some(next_ch) = s[px.len()..].chars().next() {
                let cls = sonority::class_of(next_ch);
                if (1..7).contains(&cls) {
                    s.drain(..px.len());
                    break;
                }
            }
        }
    }

    // Strip silent word-final letters (H / ه / ة).
    // Walk from the end byte-by-byte, using char_indices to get valid boundaries.
    loop {
        let last = s.char_indices().last();
        match last {
            Some((idx, c)) if sonority::is_silent_trailing(c) => {
                // Require at least ~2 remaining chars
                let remaining = s[..idx].chars().count();
                if remaining < 2 {
                    break;
                }
                s.truncate(idx);
            }
            _ => break,
        }
    }

    s
}

// ---------------------------------------------------------------------------
// Stage 2: Class sequence construction
// ---------------------------------------------------------------------------

/// Walk the preprocessed string, emitting one sonority class per phoneme.
///
/// `g_class` controls the resolution of ambiguous `G` / `ج`:
///   - `1` → treat as /g/ (Egyptian Arabic, Germanic)
///   - `6` → treat as /dʒ/ (Standard Arabic, French)
#[must_use]
pub fn class_sequence(s: &str, g_class: Class) -> ClassVec {
    // Collect chars onto the stack (SmallVec spills to heap only for very
    // long names, which are rare).
    #[cfg(feature = "smallvec")]
    let chars: smallvec::SmallVec<[char; 24]> = s.chars().collect();
    #[cfg(not(feature = "smallvec"))]
    let chars: Vec<char> = s.chars().collect();

    let n = chars.len();
    let mut out: ClassVec = ClassVec::new();
    let mut i = 0usize;
    let mut prev = '\0';

    while i < n {
        let ch = chars[i];

        // True gemination: KK → K (same literal char twice in a row)
        if ch == prev && prev != '\0' {
            i += 1;
            continue;
        }

        // Arabic matres lectionis: glide word-initial, vowel medially
        if sonority::is_arabic_matres(ch) {
            let cls = if i == 0 { 6 } else { 7 };
            out.push(cls);
            prev = ch;
            i += 1;
            continue;
        }

        // Latin Y: glide only at word-initial before a vowel, else high vowel
        if ch == 'Y' {
            let at_start = i == 0;
            let next_is_vowel = (i + 1 < n) && sonority::class_of(chars[i + 1]) >= 7;
            let cls = if at_start && next_is_vowel { 6 } else { 7 };
            out.push(cls);
            prev = ch;
            i += 1;
            continue;
        }

        // Ambiguous G / ج — class decided by caller
        if sonority::is_g_ambiguous(ch) {
            out.push(g_class);
            prev = ch;
            i += 1;
            continue;
        }

        // Digraphs (KH, SH, TH, ...)
        if i + 1 < n {
            if let Some(cls) = sonority::digraph_class(ch, chars[i + 1]) {
                out.push(cls);
                prev = chars[i + 1];
                i += 2;
                continue;
            }
        }

        // Plain character
        let cls = sonority::class_of(ch);
        if cls > 0 {
            // Adjacent vowel collapse: AE → max(A, E)
            if let Some(&last) = out.last() {
                if cls >= 7 && last >= 7 {
                    if cls > last {
                        *out.last_mut().unwrap() = cls;
                    }
                } else {
                    out.push(cls);
                }
            } else {
                out.push(cls);
            }
        }
        prev = ch;
        i += 1;
    }

    out
}

// ---------------------------------------------------------------------------
// Stage 3: Spectral packing
// ---------------------------------------------------------------------------

const CHEB_RANGES: [(f32, f32); 4] = [
    (1.0, 6.0),  // a_0: mean class
    (-3.0, 3.0), // a_1: linear trend
    (-2.0, 2.0), // a_2: quadratic shape
    (-1.5, 1.5), // a_3: cubic wobble
];

const CHEB_BITS: [u32; 4] = [5, 7, 6, 6];
const LENGTH_SHIFT: u32 = 29;

/// Compute 4 Chebyshev coefficients and Gray-code-quantize into a 32-bit key.
///
/// The top 3 bits are the length bucket (log-bucketed), enabling a single-
/// comparison length prefilter during retrieval. The remaining 24 bits are
/// the quantized coefficients, with a trailing 5-bit reserved pad.
#[inline]
#[must_use]
pub fn pack_spectral(consonants: &[u8]) -> u32 {
    let n = consonants.len();
    if n == 0 {
        return 0;
    }

    let n_eff = n.min(chebyshev::MAX_LEN);
    let table = chebyshev::table();

    let mut a0 = 0f32;
    let mut a1 = 0f32;
    let mut a2 = 0f32;
    let mut a3 = 0f32;

    for (i, &c_i) in consonants.iter().take(n_eff).enumerate() {
        let c = c_i as f32;
        let row = table.row(n_eff, i);
        // row is a [f32; K] slice; indexing is bounds-checked but the compiler
        // can elide the checks since K is const and we access 0..K.
        a0 += c * row[0];
        a1 += c * row[1];
        a2 += c * row[2];
        a3 += c * row[3];
    }

    let inv_n = 2.0 / (n_eff as f32);
    a0 *= inv_n * 0.5; // standard a_0 convention
    a1 *= inv_n;
    a2 *= inv_n;
    a3 *= inv_n;

    let mut packed = (chebyshev::length_bucket(n) as u32) << LENGTH_SHIFT;
    let mut shift = LENGTH_SHIFT;
    let coeffs = [a0, a1, a2, a3];
    for k in 0..4 {
        let (lo, hi) = CHEB_RANGES[k];
        let b = CHEB_BITS[k];
        shift -= b;
        let span = hi - lo;
        let u = ((coeffs[k] - lo) / span).clamp(0.0, 1.0);
        let hi_val = (1u32 << b) - 1;
        let v = (u * (hi_val as f32) + 0.5) as u32;
        let v = v.min(hi_val);
        let gray = v ^ (v >> 1); // Gray code
        packed |= gray << shift;
    }

    packed
}

// ---------------------------------------------------------------------------
// Stage 4: Bloom signature
// ---------------------------------------------------------------------------

/// 64-bit skip-bigram Bloom over consonants.
///
/// Hashes each (c_i, c_{i+d}) pair for d ∈ {1, 2} to a bit position.
/// Provides edit-distance tolerance beyond what moment-based fingerprints
/// can capture.
#[inline]
#[must_use]
pub fn bloom_signature(classes: &[u8]) -> u64 {
    // Filter to consonants on the stack — no allocation.
    let mut cons = [0u8; 32];
    let mut n = 0usize;
    for &c in classes {
        if c < 7 && n < cons.len() {
            cons[n] = c;
            n += 1;
        }
    }
    if n < 2 {
        return 0;
    }

    let mut sig: u64 = 0;
    for i in 0..n {
        let c_i = cons[i] as u64;
        if i + 1 < n {
            let c_j = cons[i + 1] as u64;
            let key = (c_i
                .wrapping_mul(11)
                .wrapping_add(c_j.wrapping_mul(97))
                .wrapping_add(17))
                & 63;
            sig |= 1u64 << key;
        }
        if i + 2 < n {
            let c_j = cons[i + 2] as u64;
            let key = (c_i
                .wrapping_mul(11)
                .wrapping_add(c_j.wrapping_mul(97))
                .wrapping_add(34))
                & 63;
            sig |= 1u64 << key;
        }
    }

    sig
}

// ---------------------------------------------------------------------------
// Stage 5: Encode
// ---------------------------------------------------------------------------

#[inline]
fn filter_consonants(seq: &[u8]) -> ClassVec {
    let mut out: ClassVec = ClassVec::with_capacity(seq.len());
    for &c in seq {
        if c < 7 {
            out.push(c);
        }
    }
    out
}

/// Encode a single token into an AMT [`Code`].
///
/// Whitespace and punctuation in the input are treated as invisible —
/// callers wanting multi-token handling should use [`encode`].
#[must_use]
pub fn encode_token(token: &str) -> Code {
    let s = preprocess(token);
    let has_g = s.chars().any(sonority::is_g_ambiguous);

    let seq1 = class_sequence(&s, 1);
    let primary_classes = seq1.clone();

    let cons1 = filter_consonants(&seq1);
    let sp1 = pack_spectral(&cons1);
    let bl1 = bloom_signature(&seq1);

    let mut spectrals: SpectralVec = SpectralVec::new();
    let mut blooms: BloomVec = BloomVec::new();
    spectrals.push(sp1);
    blooms.push(bl1);

    if has_g {
        let seq2 = class_sequence(&s, 6);
        let cons2 = filter_consonants(&seq2);
        let sp2 = pack_spectral(&cons2);
        if sp2 != sp1 {
            let bl2 = bloom_signature(&seq2);
            spectrals.push(sp2);
            blooms.push(bl2);
        }
    }

    Code {
        spectrals,
        blooms,
        classes: primary_classes,
    }
}

/// Encode a multi-token name into a list of per-token codes.
///
/// Splits on ASCII whitespace.
#[must_use]
pub fn encode(name: &str) -> Vec<Code> {
    name.split_whitespace()
        .filter(|t| !t.is_empty())
        .map(encode_token)
        .collect()
}

/// Encode many names in a single call.
#[must_use]
pub fn encode_batch<I, S>(names: I) -> Vec<Vec<Code>>
where
    I: IntoIterator<Item = S>,
    S: AsRef<str>,
{
    names.into_iter().map(|s| encode(s.as_ref())).collect()
}