Skip to main content

amt/
core.rs

1//! Core encoding pipeline.
2//!
3//! Stage-by-stage:
4//!   1. [`preprocess`] — NFKD normalize, uppercase, strip prefixes/affixes.
5//!   2. [`class_sequence`] — walk chars, emit sonority classes with contextual rules.
6//!   3. [`pack_spectral`] — Chebyshev accumulate, Gray-quantize, bit-pack.
7//!   4. [`bloom_signature`] — skip-bigram Bloom filter.
8//!   5. [`encode_token`] — orchestrate the above, emit multi-key variants.
9//!
10//! Performance notes:
11//!   * All hot functions are `#[inline]`.
12//!   * [`ClassVec`], [`SpectralVec`], [`BloomVec`] use `SmallVec` (default
13//!     feature) so the common case (≤16 classes, ≤2 variants) avoids any
14//!     heap allocation.
15//!   * [`bloom_signature`] uses a fixed-size stack array to sidestep
16//!     allocation entirely.
17//!   * The Chebyshev inner loop compiles to a tight 4-way FMA sequence.
18
19use unicode_normalization::char::canonical_combining_class;
20use unicode_normalization::UnicodeNormalization;
21
22use crate::chebyshev;
23use crate::sonority::{self, Class};
24
25// ---------------------------------------------------------------------------
26// Type aliases — smallvec optional
27// ---------------------------------------------------------------------------
28
29/// Sequence of sonority classes; inline-stored when ≤ 16 entries.
30#[cfg(feature = "smallvec")]
31pub type ClassVec = smallvec::SmallVec<[u8; 16]>;
32/// Spectral keys for a single token; inline-stored when ≤ 2 entries.
33#[cfg(feature = "smallvec")]
34pub type SpectralVec = smallvec::SmallVec<[u32; 2]>;
35/// Bloom signatures for a single token; inline-stored when ≤ 2 entries.
36#[cfg(feature = "smallvec")]
37pub type BloomVec = smallvec::SmallVec<[u64; 2]>;
38
39/// Sequence of sonority classes (heap-allocated when `smallvec` feature is off).
40#[cfg(not(feature = "smallvec"))]
41pub type ClassVec = Vec<u8>;
42/// Spectral keys for a single token (heap-allocated when `smallvec` feature is off).
43#[cfg(not(feature = "smallvec"))]
44pub type SpectralVec = Vec<u32>;
45/// Bloom signatures for a single token (heap-allocated when `smallvec` feature is off).
46#[cfg(not(feature = "smallvec"))]
47pub type BloomVec = Vec<u64>;
48
49// ---------------------------------------------------------------------------
50// Public type
51// ---------------------------------------------------------------------------
52
53/// A fully-encoded token.
54///
55/// Invariant: `spectrals.len() == blooms.len()` and equals either 1 or 2
56/// (2 only when the token contains ambiguous phonemes like `G`/`ج`).
57#[derive(Debug, Clone, PartialEq, Eq, Hash)]
58pub struct Code {
59    /// 32-bit packed spectral keys. First key is always the primary variant.
60    pub spectrals: SpectralVec,
61    /// 64-bit Bloom signatures, index-aligned with `spectrals`.
62    pub blooms: BloomVec,
63    /// The class sequence from the primary variant (for debugging / re-analysis).
64    pub classes: ClassVec,
65}
66
67impl Code {
68    /// True if any spectral key is shared with `other`.
69    #[inline]
70    #[must_use]
71    pub fn matches(&self, other: &Code) -> bool {
72        for &a in &self.spectrals {
73            for &b in &other.spectrals {
74                if a == b {
75                    return true;
76                }
77            }
78        }
79        false
80    }
81}
82
83// ---------------------------------------------------------------------------
84// Stage 1: Preprocessing
85// ---------------------------------------------------------------------------
86
87/// NFKD-normalize, strip combining marks, uppercase, remove hyphens/apostrophes,
88/// strip prefixes, strip silent trailing letters.
89#[must_use]
90pub fn preprocess(token: &str) -> String {
91    // Unicode normalization: NFKD splits composed characters into base + combining.
92    // Filtering by canonical_combining_class == 0 keeps only base characters.
93    // Then uppercase (handles Latin case; Arabic is caseless).
94    let mut s: String = token
95        .nfkd()
96        .filter(|c| canonical_combining_class(*c) == 0)
97        .flat_map(char::to_uppercase)
98        .collect();
99
100    // Strip hyphens, apostrophes, right single quotation mark.
101    s.retain(|c| c != '-' && c != '\'' && c != '\u{2019}');
102
103    // Strip Arabic definite article "ال" — possibly multiple compounds.
104    loop {
105        let consumed_end = {
106            let mut it = s.char_indices();
107            let alif = it.next();
108            let lam = it.next();
109            let first_real = it.next();
110            match (alif, lam, first_real) {
111                (Some((_, 'ا')), Some((_, 'ل')), Some((idx, ch)))
112                    if sonority::class_of(ch) > 0 && it.next().is_some() =>
113                {
114                    idx
115                }
116                _ => break,
117            }
118        };
119        s.drain(..consumed_end);
120    }
121
122    // Strip Latin article-style prefixes (AL, EL, UL, AS, ES) — but only
123    // when the next character is a real consonant (not a vowel/class ≥ 7).
124    for &px in sonority::LATIN_PREFIXES {
125        if s.len() > px.len() + 1 && s.starts_with(px) {
126            if let Some(next_ch) = s[px.len()..].chars().next() {
127                let cls = sonority::class_of(next_ch);
128                if (1..7).contains(&cls) {
129                    s.drain(..px.len());
130                    break;
131                }
132            }
133        }
134    }
135
136    // Strip silent word-final letters (H / ه / ة).
137    // Walk from the end byte-by-byte, using char_indices to get valid boundaries.
138    loop {
139        let last = s.char_indices().last();
140        match last {
141            Some((idx, c)) if sonority::is_silent_trailing(c) => {
142                // Require at least ~2 remaining chars
143                let remaining = s[..idx].chars().count();
144                if remaining < 2 {
145                    break;
146                }
147                s.truncate(idx);
148            }
149            _ => break,
150        }
151    }
152
153    s
154}
155
156// ---------------------------------------------------------------------------
157// Stage 2: Class sequence construction
158// ---------------------------------------------------------------------------
159
160/// Walk the preprocessed string, emitting one sonority class per phoneme.
161///
162/// `g_class` controls the resolution of ambiguous `G` / `ج`:
163///   - `1` → treat as /g/ (Egyptian Arabic, Germanic)
164///   - `6` → treat as /dʒ/ (Standard Arabic, French)
165#[must_use]
166pub fn class_sequence(s: &str, g_class: Class) -> ClassVec {
167    // Collect chars onto the stack (SmallVec spills to heap only for very
168    // long names, which are rare).
169    #[cfg(feature = "smallvec")]
170    let chars: smallvec::SmallVec<[char; 24]> = s.chars().collect();
171    #[cfg(not(feature = "smallvec"))]
172    let chars: Vec<char> = s.chars().collect();
173
174    let n = chars.len();
175    let mut out: ClassVec = ClassVec::new();
176    let mut i = 0usize;
177    let mut prev = '\0';
178
179    while i < n {
180        let ch = chars[i];
181
182        // True gemination: KK → K (same literal char twice in a row)
183        if ch == prev && prev != '\0' {
184            i += 1;
185            continue;
186        }
187
188        // Arabic matres lectionis: glide word-initial, vowel medially
189        if sonority::is_arabic_matres(ch) {
190            let cls = if i == 0 { 6 } else { 7 };
191            out.push(cls);
192            prev = ch;
193            i += 1;
194            continue;
195        }
196
197        // Latin Y: glide only at word-initial before a vowel, else high vowel
198        if ch == 'Y' {
199            let at_start = i == 0;
200            let next_is_vowel = (i + 1 < n) && sonority::class_of(chars[i + 1]) >= 7;
201            let cls = if at_start && next_is_vowel { 6 } else { 7 };
202            out.push(cls);
203            prev = ch;
204            i += 1;
205            continue;
206        }
207
208        // Ambiguous G / ج — class decided by caller
209        if sonority::is_g_ambiguous(ch) {
210            out.push(g_class);
211            prev = ch;
212            i += 1;
213            continue;
214        }
215
216        // Digraphs (KH, SH, TH, ...)
217        if i + 1 < n {
218            if let Some(cls) = sonority::digraph_class(ch, chars[i + 1]) {
219                out.push(cls);
220                prev = chars[i + 1];
221                i += 2;
222                continue;
223            }
224        }
225
226        // Plain character
227        let cls = sonority::class_of(ch);
228        if cls > 0 {
229            // Adjacent vowel collapse: AE → max(A, E)
230            if let Some(&last) = out.last() {
231                if cls >= 7 && last >= 7 {
232                    if cls > last {
233                        *out.last_mut().unwrap() = cls;
234                    }
235                } else {
236                    out.push(cls);
237                }
238            } else {
239                out.push(cls);
240            }
241        }
242        prev = ch;
243        i += 1;
244    }
245
246    out
247}
248
249// ---------------------------------------------------------------------------
250// Stage 3: Spectral packing
251// ---------------------------------------------------------------------------
252
253const CHEB_RANGES: [(f32, f32); 4] = [
254    (1.0, 6.0),  // a_0: mean class
255    (-3.0, 3.0), // a_1: linear trend
256    (-2.0, 2.0), // a_2: quadratic shape
257    (-1.5, 1.5), // a_3: cubic wobble
258];
259
260const CHEB_BITS: [u32; 4] = [5, 7, 6, 6];
261const LENGTH_SHIFT: u32 = 29;
262
263/// Compute 4 Chebyshev coefficients and Gray-code-quantize into a 32-bit key.
264///
265/// The top 3 bits are the length bucket (log-bucketed), enabling a single-
266/// comparison length prefilter during retrieval. The remaining 24 bits are
267/// the quantized coefficients, with a trailing 5-bit reserved pad.
268#[inline]
269#[must_use]
270pub fn pack_spectral(consonants: &[u8]) -> u32 {
271    let n = consonants.len();
272    if n == 0 {
273        return 0;
274    }
275
276    let n_eff = n.min(chebyshev::MAX_LEN);
277    let table = chebyshev::table();
278
279    let mut a0 = 0f32;
280    let mut a1 = 0f32;
281    let mut a2 = 0f32;
282    let mut a3 = 0f32;
283
284    for (i, &c_i) in consonants.iter().take(n_eff).enumerate() {
285        let c = c_i as f32;
286        let row = table.row(n_eff, i);
287        // row is a [f32; K] slice; indexing is bounds-checked but the compiler
288        // can elide the checks since K is const and we access 0..K.
289        a0 += c * row[0];
290        a1 += c * row[1];
291        a2 += c * row[2];
292        a3 += c * row[3];
293    }
294
295    let inv_n = 2.0 / (n_eff as f32);
296    a0 *= inv_n * 0.5; // standard a_0 convention
297    a1 *= inv_n;
298    a2 *= inv_n;
299    a3 *= inv_n;
300
301    let mut packed = (chebyshev::length_bucket(n) as u32) << LENGTH_SHIFT;
302    let mut shift = LENGTH_SHIFT;
303    let coeffs = [a0, a1, a2, a3];
304    for k in 0..4 {
305        let (lo, hi) = CHEB_RANGES[k];
306        let b = CHEB_BITS[k];
307        shift -= b;
308        let span = hi - lo;
309        let u = ((coeffs[k] - lo) / span).clamp(0.0, 1.0);
310        let hi_val = (1u32 << b) - 1;
311        let v = (u * (hi_val as f32) + 0.5) as u32;
312        let v = v.min(hi_val);
313        let gray = v ^ (v >> 1); // Gray code
314        packed |= gray << shift;
315    }
316
317    packed
318}
319
320// ---------------------------------------------------------------------------
321// Stage 4: Bloom signature
322// ---------------------------------------------------------------------------
323
324/// 64-bit skip-bigram Bloom over consonants.
325///
326/// Hashes each (c_i, c_{i+d}) pair for d ∈ {1, 2} to a bit position.
327/// Provides edit-distance tolerance beyond what moment-based fingerprints
328/// can capture.
329#[inline]
330#[must_use]
331pub fn bloom_signature(classes: &[u8]) -> u64 {
332    // Filter to consonants on the stack — no allocation.
333    let mut cons = [0u8; 32];
334    let mut n = 0usize;
335    for &c in classes {
336        if c < 7 && n < cons.len() {
337            cons[n] = c;
338            n += 1;
339        }
340    }
341    if n < 2 {
342        return 0;
343    }
344
345    let mut sig: u64 = 0;
346    for i in 0..n {
347        let c_i = cons[i] as u64;
348        if i + 1 < n {
349            let c_j = cons[i + 1] as u64;
350            let key = (c_i
351                .wrapping_mul(11)
352                .wrapping_add(c_j.wrapping_mul(97))
353                .wrapping_add(17))
354                & 63;
355            sig |= 1u64 << key;
356        }
357        if i + 2 < n {
358            let c_j = cons[i + 2] as u64;
359            let key = (c_i
360                .wrapping_mul(11)
361                .wrapping_add(c_j.wrapping_mul(97))
362                .wrapping_add(34))
363                & 63;
364            sig |= 1u64 << key;
365        }
366    }
367
368    sig
369}
370
371// ---------------------------------------------------------------------------
372// Stage 5: Encode
373// ---------------------------------------------------------------------------
374
375#[inline]
376fn filter_consonants(seq: &[u8]) -> ClassVec {
377    let mut out: ClassVec = ClassVec::with_capacity(seq.len());
378    for &c in seq {
379        if c < 7 {
380            out.push(c);
381        }
382    }
383    out
384}
385
386/// Encode a single token into an AMT [`Code`].
387///
388/// Whitespace and punctuation in the input are treated as invisible —
389/// callers wanting multi-token handling should use [`encode`].
390#[must_use]
391pub fn encode_token(token: &str) -> Code {
392    let s = preprocess(token);
393    let has_g = s.chars().any(sonority::is_g_ambiguous);
394
395    let seq1 = class_sequence(&s, 1);
396    let primary_classes = seq1.clone();
397
398    let cons1 = filter_consonants(&seq1);
399    let sp1 = pack_spectral(&cons1);
400    let bl1 = bloom_signature(&seq1);
401
402    let mut spectrals: SpectralVec = SpectralVec::new();
403    let mut blooms: BloomVec = BloomVec::new();
404    spectrals.push(sp1);
405    blooms.push(bl1);
406
407    if has_g {
408        let seq2 = class_sequence(&s, 6);
409        let cons2 = filter_consonants(&seq2);
410        let sp2 = pack_spectral(&cons2);
411        if sp2 != sp1 {
412            let bl2 = bloom_signature(&seq2);
413            spectrals.push(sp2);
414            blooms.push(bl2);
415        }
416    }
417
418    Code {
419        spectrals,
420        blooms,
421        classes: primary_classes,
422    }
423}
424
425/// Encode a multi-token name into a list of per-token codes.
426///
427/// Splits on ASCII whitespace.
428#[must_use]
429pub fn encode(name: &str) -> Vec<Code> {
430    name.split_whitespace()
431        .filter(|t| !t.is_empty())
432        .map(encode_token)
433        .collect()
434}
435
436/// Encode many names in a single call.
437#[must_use]
438pub fn encode_batch<I, S>(names: I) -> Vec<Vec<Code>>
439where
440    I: IntoIterator<Item = S>,
441    S: AsRef<str>,
442{
443    names.into_iter().map(|s| encode(s.as_ref())).collect()
444}