amt/core.rs
1//! Core encoding pipeline.
2//!
3//! Stage-by-stage:
4//! 1. [`preprocess`] — NFKD normalize, uppercase, strip prefixes/affixes.
5//! 2. [`class_sequence`] — walk chars, emit sonority classes with contextual rules.
6//! 3. [`pack_spectral`] — Chebyshev accumulate, Gray-quantize, bit-pack.
7//! 4. [`bloom_signature`] — skip-bigram Bloom filter.
8//! 5. [`encode_token`] — orchestrate the above, emit multi-key variants.
9//!
10//! Performance notes:
11//! * All hot functions are `#[inline]`.
12//! * [`ClassVec`], [`SpectralVec`], [`BloomVec`] use `SmallVec` (default
13//! feature) so the common case (≤16 classes, ≤2 variants) avoids any
14//! heap allocation.
15//! * [`bloom_signature`] uses a fixed-size stack array to sidestep
16//! allocation entirely.
17//! * The Chebyshev inner loop compiles to a tight 4-way FMA sequence.
18
19use unicode_normalization::char::canonical_combining_class;
20use unicode_normalization::UnicodeNormalization;
21
22use crate::chebyshev;
23use crate::sonority::{self, Class};
24
25// ---------------------------------------------------------------------------
26// Type aliases — smallvec optional
27// ---------------------------------------------------------------------------
28
29/// Sequence of sonority classes; inline-stored when ≤ 16 entries.
30#[cfg(feature = "smallvec")]
31pub type ClassVec = smallvec::SmallVec<[u8; 16]>;
32/// Spectral keys for a single token; inline-stored when ≤ 2 entries.
33#[cfg(feature = "smallvec")]
34pub type SpectralVec = smallvec::SmallVec<[u32; 2]>;
35/// Bloom signatures for a single token; inline-stored when ≤ 2 entries.
36#[cfg(feature = "smallvec")]
37pub type BloomVec = smallvec::SmallVec<[u64; 2]>;
38
39/// Sequence of sonority classes (heap-allocated when `smallvec` feature is off).
40#[cfg(not(feature = "smallvec"))]
41pub type ClassVec = Vec<u8>;
42/// Spectral keys for a single token (heap-allocated when `smallvec` feature is off).
43#[cfg(not(feature = "smallvec"))]
44pub type SpectralVec = Vec<u32>;
45/// Bloom signatures for a single token (heap-allocated when `smallvec` feature is off).
46#[cfg(not(feature = "smallvec"))]
47pub type BloomVec = Vec<u64>;
48
49// ---------------------------------------------------------------------------
50// Public type
51// ---------------------------------------------------------------------------
52
53/// A fully-encoded token.
54///
55/// Invariant: `spectrals.len() == blooms.len()` and equals either 1 or 2
56/// (2 only when the token contains ambiguous phonemes like `G`/`ج`).
57#[derive(Debug, Clone, PartialEq, Eq, Hash)]
58pub struct Code {
59 /// 32-bit packed spectral keys. First key is always the primary variant.
60 pub spectrals: SpectralVec,
61 /// 64-bit Bloom signatures, index-aligned with `spectrals`.
62 pub blooms: BloomVec,
63 /// The class sequence from the primary variant (for debugging / re-analysis).
64 pub classes: ClassVec,
65}
66
67impl Code {
68 /// True if any spectral key is shared with `other`.
69 #[inline]
70 #[must_use]
71 pub fn matches(&self, other: &Code) -> bool {
72 for &a in &self.spectrals {
73 for &b in &other.spectrals {
74 if a == b {
75 return true;
76 }
77 }
78 }
79 false
80 }
81}
82
83// ---------------------------------------------------------------------------
84// Stage 1: Preprocessing
85// ---------------------------------------------------------------------------
86
87/// NFKD-normalize, strip combining marks, uppercase, remove hyphens/apostrophes,
88/// strip prefixes, strip silent trailing letters.
89#[must_use]
90pub fn preprocess(token: &str) -> String {
91 // Unicode normalization: NFKD splits composed characters into base + combining.
92 // Filtering by canonical_combining_class == 0 keeps only base characters.
93 // Then uppercase (handles Latin case; Arabic is caseless).
94 let mut s: String = token
95 .nfkd()
96 .filter(|c| canonical_combining_class(*c) == 0)
97 .flat_map(char::to_uppercase)
98 .collect();
99
100 // Strip hyphens, apostrophes, right single quotation mark.
101 s.retain(|c| c != '-' && c != '\'' && c != '\u{2019}');
102
103 // Strip Arabic definite article "ال" — possibly multiple compounds.
104 loop {
105 let consumed_end = {
106 let mut it = s.char_indices();
107 let alif = it.next();
108 let lam = it.next();
109 let first_real = it.next();
110 match (alif, lam, first_real) {
111 (Some((_, 'ا')), Some((_, 'ل')), Some((idx, ch)))
112 if sonority::class_of(ch) > 0 && it.next().is_some() =>
113 {
114 idx
115 }
116 _ => break,
117 }
118 };
119 s.drain(..consumed_end);
120 }
121
122 // Strip Latin article-style prefixes (AL, EL, UL, AS, ES) — but only
123 // when the next character is a real consonant (not a vowel/class ≥ 7).
124 for &px in sonority::LATIN_PREFIXES {
125 if s.len() > px.len() + 1 && s.starts_with(px) {
126 if let Some(next_ch) = s[px.len()..].chars().next() {
127 let cls = sonority::class_of(next_ch);
128 if (1..7).contains(&cls) {
129 s.drain(..px.len());
130 break;
131 }
132 }
133 }
134 }
135
136 // Strip silent word-final letters (H / ه / ة).
137 // Walk from the end byte-by-byte, using char_indices to get valid boundaries.
138 loop {
139 let last = s.char_indices().last();
140 match last {
141 Some((idx, c)) if sonority::is_silent_trailing(c) => {
142 // Require at least ~2 remaining chars
143 let remaining = s[..idx].chars().count();
144 if remaining < 2 {
145 break;
146 }
147 s.truncate(idx);
148 }
149 _ => break,
150 }
151 }
152
153 s
154}
155
156// ---------------------------------------------------------------------------
157// Stage 2: Class sequence construction
158// ---------------------------------------------------------------------------
159
160/// Walk the preprocessed string, emitting one sonority class per phoneme.
161///
162/// `g_class` controls the resolution of ambiguous `G` / `ج`:
163/// - `1` → treat as /g/ (Egyptian Arabic, Germanic)
164/// - `6` → treat as /dʒ/ (Standard Arabic, French)
165#[must_use]
166pub fn class_sequence(s: &str, g_class: Class) -> ClassVec {
167 // Collect chars onto the stack (SmallVec spills to heap only for very
168 // long names, which are rare).
169 #[cfg(feature = "smallvec")]
170 let chars: smallvec::SmallVec<[char; 24]> = s.chars().collect();
171 #[cfg(not(feature = "smallvec"))]
172 let chars: Vec<char> = s.chars().collect();
173
174 let n = chars.len();
175 let mut out: ClassVec = ClassVec::new();
176 let mut i = 0usize;
177 let mut prev = '\0';
178
179 while i < n {
180 let ch = chars[i];
181
182 // True gemination: KK → K (same literal char twice in a row)
183 if ch == prev && prev != '\0' {
184 i += 1;
185 continue;
186 }
187
188 // Arabic matres lectionis: glide word-initial, vowel medially
189 if sonority::is_arabic_matres(ch) {
190 let cls = if i == 0 { 6 } else { 7 };
191 out.push(cls);
192 prev = ch;
193 i += 1;
194 continue;
195 }
196
197 // Latin Y: glide only at word-initial before a vowel, else high vowel
198 if ch == 'Y' {
199 let at_start = i == 0;
200 let next_is_vowel = (i + 1 < n) && sonority::class_of(chars[i + 1]) >= 7;
201 let cls = if at_start && next_is_vowel { 6 } else { 7 };
202 out.push(cls);
203 prev = ch;
204 i += 1;
205 continue;
206 }
207
208 // Ambiguous G / ج — class decided by caller
209 if sonority::is_g_ambiguous(ch) {
210 out.push(g_class);
211 prev = ch;
212 i += 1;
213 continue;
214 }
215
216 // Digraphs (KH, SH, TH, ...)
217 if i + 1 < n {
218 if let Some(cls) = sonority::digraph_class(ch, chars[i + 1]) {
219 out.push(cls);
220 prev = chars[i + 1];
221 i += 2;
222 continue;
223 }
224 }
225
226 // Plain character
227 let cls = sonority::class_of(ch);
228 if cls > 0 {
229 // Adjacent vowel collapse: AE → max(A, E)
230 if let Some(&last) = out.last() {
231 if cls >= 7 && last >= 7 {
232 if cls > last {
233 *out.last_mut().unwrap() = cls;
234 }
235 } else {
236 out.push(cls);
237 }
238 } else {
239 out.push(cls);
240 }
241 }
242 prev = ch;
243 i += 1;
244 }
245
246 out
247}
248
249// ---------------------------------------------------------------------------
250// Stage 3: Spectral packing
251// ---------------------------------------------------------------------------
252
253const CHEB_RANGES: [(f32, f32); 4] = [
254 (1.0, 6.0), // a_0: mean class
255 (-3.0, 3.0), // a_1: linear trend
256 (-2.0, 2.0), // a_2: quadratic shape
257 (-1.5, 1.5), // a_3: cubic wobble
258];
259
260const CHEB_BITS: [u32; 4] = [5, 7, 6, 6];
261const LENGTH_SHIFT: u32 = 29;
262
263/// Compute 4 Chebyshev coefficients and Gray-code-quantize into a 32-bit key.
264///
265/// The top 3 bits are the length bucket (log-bucketed), enabling a single-
266/// comparison length prefilter during retrieval. The remaining 24 bits are
267/// the quantized coefficients, with a trailing 5-bit reserved pad.
268#[inline]
269#[must_use]
270pub fn pack_spectral(consonants: &[u8]) -> u32 {
271 let n = consonants.len();
272 if n == 0 {
273 return 0;
274 }
275
276 let n_eff = n.min(chebyshev::MAX_LEN);
277 let table = chebyshev::table();
278
279 let mut a0 = 0f32;
280 let mut a1 = 0f32;
281 let mut a2 = 0f32;
282 let mut a3 = 0f32;
283
284 for (i, &c_i) in consonants.iter().take(n_eff).enumerate() {
285 let c = c_i as f32;
286 let row = table.row(n_eff, i);
287 // row is a [f32; K] slice; indexing is bounds-checked but the compiler
288 // can elide the checks since K is const and we access 0..K.
289 a0 += c * row[0];
290 a1 += c * row[1];
291 a2 += c * row[2];
292 a3 += c * row[3];
293 }
294
295 let inv_n = 2.0 / (n_eff as f32);
296 a0 *= inv_n * 0.5; // standard a_0 convention
297 a1 *= inv_n;
298 a2 *= inv_n;
299 a3 *= inv_n;
300
301 let mut packed = (chebyshev::length_bucket(n) as u32) << LENGTH_SHIFT;
302 let mut shift = LENGTH_SHIFT;
303 let coeffs = [a0, a1, a2, a3];
304 for k in 0..4 {
305 let (lo, hi) = CHEB_RANGES[k];
306 let b = CHEB_BITS[k];
307 shift -= b;
308 let span = hi - lo;
309 let u = ((coeffs[k] - lo) / span).clamp(0.0, 1.0);
310 let hi_val = (1u32 << b) - 1;
311 let v = (u * (hi_val as f32) + 0.5) as u32;
312 let v = v.min(hi_val);
313 let gray = v ^ (v >> 1); // Gray code
314 packed |= gray << shift;
315 }
316
317 packed
318}
319
320// ---------------------------------------------------------------------------
321// Stage 4: Bloom signature
322// ---------------------------------------------------------------------------
323
324/// 64-bit skip-bigram Bloom over consonants.
325///
326/// Hashes each (c_i, c_{i+d}) pair for d ∈ {1, 2} to a bit position.
327/// Provides edit-distance tolerance beyond what moment-based fingerprints
328/// can capture.
329#[inline]
330#[must_use]
331pub fn bloom_signature(classes: &[u8]) -> u64 {
332 // Filter to consonants on the stack — no allocation.
333 let mut cons = [0u8; 32];
334 let mut n = 0usize;
335 for &c in classes {
336 if c < 7 && n < cons.len() {
337 cons[n] = c;
338 n += 1;
339 }
340 }
341 if n < 2 {
342 return 0;
343 }
344
345 let mut sig: u64 = 0;
346 for i in 0..n {
347 let c_i = cons[i] as u64;
348 if i + 1 < n {
349 let c_j = cons[i + 1] as u64;
350 let key = (c_i
351 .wrapping_mul(11)
352 .wrapping_add(c_j.wrapping_mul(97))
353 .wrapping_add(17))
354 & 63;
355 sig |= 1u64 << key;
356 }
357 if i + 2 < n {
358 let c_j = cons[i + 2] as u64;
359 let key = (c_i
360 .wrapping_mul(11)
361 .wrapping_add(c_j.wrapping_mul(97))
362 .wrapping_add(34))
363 & 63;
364 sig |= 1u64 << key;
365 }
366 }
367
368 sig
369}
370
371// ---------------------------------------------------------------------------
372// Stage 5: Encode
373// ---------------------------------------------------------------------------
374
375#[inline]
376fn filter_consonants(seq: &[u8]) -> ClassVec {
377 let mut out: ClassVec = ClassVec::with_capacity(seq.len());
378 for &c in seq {
379 if c < 7 {
380 out.push(c);
381 }
382 }
383 out
384}
385
386/// Encode a single token into an AMT [`Code`].
387///
388/// Whitespace and punctuation in the input are treated as invisible —
389/// callers wanting multi-token handling should use [`encode`].
390#[must_use]
391pub fn encode_token(token: &str) -> Code {
392 let s = preprocess(token);
393 let has_g = s.chars().any(sonority::is_g_ambiguous);
394
395 let seq1 = class_sequence(&s, 1);
396 let primary_classes = seq1.clone();
397
398 let cons1 = filter_consonants(&seq1);
399 let sp1 = pack_spectral(&cons1);
400 let bl1 = bloom_signature(&seq1);
401
402 let mut spectrals: SpectralVec = SpectralVec::new();
403 let mut blooms: BloomVec = BloomVec::new();
404 spectrals.push(sp1);
405 blooms.push(bl1);
406
407 if has_g {
408 let seq2 = class_sequence(&s, 6);
409 let cons2 = filter_consonants(&seq2);
410 let sp2 = pack_spectral(&cons2);
411 if sp2 != sp1 {
412 let bl2 = bloom_signature(&seq2);
413 spectrals.push(sp2);
414 blooms.push(bl2);
415 }
416 }
417
418 Code {
419 spectrals,
420 blooms,
421 classes: primary_classes,
422 }
423}
424
425/// Encode a multi-token name into a list of per-token codes.
426///
427/// Splits on ASCII whitespace.
428#[must_use]
429pub fn encode(name: &str) -> Vec<Code> {
430 name.split_whitespace()
431 .filter(|t| !t.is_empty())
432 .map(encode_token)
433 .collect()
434}
435
436/// Encode many names in a single call.
437#[must_use]
438pub fn encode_batch<I, S>(names: I) -> Vec<Vec<Code>>
439where
440 I: IntoIterator<Item = S>,
441 S: AsRef<str>,
442{
443 names.into_iter().map(|s| encode(s.as_ref())).collect()
444}