use unicode_normalization::char::canonical_combining_class;
use unicode_normalization::UnicodeNormalization;
use crate::chebyshev;
use crate::sonority::{self, Class};
// Storage aliases used by the encoder. With the `smallvec` feature enabled the
// aliases resolve to `SmallVec` with a fixed inline capacity; without it they
// resolve to plain `Vec`s of the same element type.

/// Sequence of `u8` class codes (inline capacity 16 with `smallvec`).
#[cfg(feature = "smallvec")]
pub type ClassVec = smallvec::SmallVec<[u8; 16]>;
/// List of packed `u32` spectral keys (inline capacity 2 with `smallvec`).
#[cfg(feature = "smallvec")]
pub type SpectralVec = smallvec::SmallVec<[u32; 2]>;
/// List of `u64` bloom signatures (inline capacity 2 with `smallvec`).
#[cfg(feature = "smallvec")]
pub type BloomVec = smallvec::SmallVec<[u64; 2]>;
/// Sequence of `u8` class codes (heap-backed fallback).
#[cfg(not(feature = "smallvec"))]
pub type ClassVec = Vec<u8>;
/// List of packed `u32` spectral keys (heap-backed fallback).
#[cfg(not(feature = "smallvec"))]
pub type SpectralVec = Vec<u32>;
/// List of `u64` bloom signatures (heap-backed fallback).
#[cfg(not(feature = "smallvec"))]
pub type BloomVec = Vec<u64>;
/// Encoded form of a single token, as produced by `encode_token`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Code {
/// Packed spectral keys from `pack_spectral`: the primary key first,
/// optionally followed by one alternate key.
pub spectrals: SpectralVec,
/// Bloom signatures from `bloom_signature`; `encode_token` pushes one entry
/// for every entry it pushes into `spectrals`.
pub blooms: BloomVec,
/// Class sequence of the primary reading of the token.
pub classes: ClassVec,
}
impl Code {
    /// Returns `true` when the two codes share at least one spectral key.
    ///
    /// Only `spectrals` is consulted; `blooms` and `classes` play no part in
    /// the comparison. Two codes with an empty `spectrals` list never match.
    #[inline]
    #[must_use]
    pub fn matches(&self, other: &Code) -> bool {
        self.spectrals
            .iter()
            .any(|key| other.spectrals.contains(key))
    }
}
/// Normalizes a raw token into the string consumed by `class_sequence`.
///
/// Steps, in order:
/// 1. NFKD-decompose, drop every character whose canonical combining class is
///    non-zero, upper-case, and remove `-`, `'` and U+2019.
/// 2. Repeatedly strip a leading alif–lam pair (`ال`, presumably the Arabic
///    definite article) while the character after it has a non-zero sonority
///    class and at least one more character follows that one.
/// 3. Strip at most one prefix from `sonority::LATIN_PREFIXES`, provided more
///    than one byte remains after it and the next character has a class in
///    `1..7`.
/// 4. Trim trailing characters accepted by `sonority::is_silent_trailing`
///    for as long as at least two characters would remain.
#[must_use]
pub fn preprocess(token: &str) -> String {
    let mut s: String = token
        .nfkd()
        .filter(|&c| canonical_combining_class(c) == 0)
        .flat_map(char::to_uppercase)
        .filter(|&c| !matches!(c, '-' | '\'' | '\u{2019}'))
        .collect();

    // Byte offset just past a strippable leading alif–lam pair, if there is one.
    let article_end = |text: &str| -> Option<usize> {
        let mut rest = text.char_indices();
        let (_, first) = rest.next()?;
        let (_, second) = rest.next()?;
        let (offset, third) = rest.next()?;
        let strippable = first == 'ا'
            && second == 'ل'
            && sonority::class_of(third) > 0
            && rest.next().is_some();
        if strippable {
            Some(offset)
        } else {
            None
        }
    };
    while let Some(cut) = article_end(&s) {
        s.drain(..cut);
    }

    // Only the first prefix that satisfies every condition is removed.
    for &px in sonority::LATIN_PREFIXES {
        if s.len() <= px.len() + 1 || !s.starts_with(px) {
            continue;
        }
        let class_in_range = s[px.len()..]
            .chars()
            .next()
            .map_or(false, |c| (1..7).contains(&sonority::class_of(c)));
        if class_in_range {
            s.drain(..px.len());
            break;
        }
    }

    // Peel silent trailing characters, never going below two characters.
    while let Some((idx, tail)) = s.char_indices().last() {
        if !sonority::is_silent_trailing(tail) || s[..idx].chars().count() < 2 {
            break;
        }
        s.truncate(idx);
    }

    s
}
/// Converts a preprocessed string into its sequence of class codes.
///
/// Rules, applied per character in this priority order:
/// * a character equal to the previously consumed one is skipped;
/// * an Arabic mater (`sonority::is_arabic_matres`) yields `6` at position 0
///   and `7` elsewhere;
/// * `Y` yields `6` only at position 0 when the next character has a class
///   `>= 7`, otherwise `7`;
/// * a character accepted by `sonority::is_g_ambiguous` yields `g_class`;
/// * a pair recognized by `sonority::digraph_class` yields one class and
///   consumes both characters;
/// * otherwise `sonority::class_of` decides: class `0` emits nothing, and a
///   class `>= 7` directly after an emitted class `>= 7` is merged into it,
///   keeping the larger of the two.
///
/// `g_class` is the class assigned to every ambiguous-G character.
#[must_use]
pub fn class_sequence(s: &str, g_class: Class) -> ClassVec {
    #[cfg(feature = "smallvec")]
    let chars: smallvec::SmallVec<[char; 24]> = s.chars().collect();
    #[cfg(not(feature = "smallvec"))]
    let chars: Vec<char> = s.chars().collect();

    let total = chars.len();
    let mut out: ClassVec = ClassVec::new();
    let mut idx = 0usize;
    // '\0' doubles as the "nothing consumed yet" marker.
    let mut last_seen = '\0';

    while idx < total {
        let cur = chars[idx];
        // Number of characters consumed by this iteration.
        let mut step = 1usize;
        let doubled = cur == last_seen && last_seen != '\0';

        if !doubled {
            let following = chars.get(idx + 1).copied();

            if sonority::is_arabic_matres(cur) {
                out.push(if idx == 0 { 6 } else { 7 });
            } else if cur == 'Y' {
                let vowel_follows = following.map_or(false, |c| sonority::class_of(c) >= 7);
                out.push(if idx == 0 && vowel_follows { 6 } else { 7 });
            } else if sonority::is_g_ambiguous(cur) {
                out.push(g_class);
            } else if let Some(pair_cls) =
                following.and_then(|nx| sonority::digraph_class(cur, nx))
            {
                out.push(pair_cls);
                step = 2;
            } else {
                let cls = sonority::class_of(cur);
                if cls > 0 {
                    let merges = cls >= 7 && out.last().map_or(false, |&prev_cls| prev_cls >= 7);
                    if !merges {
                        out.push(cls);
                    } else if let Some(slot) = out.last_mut() {
                        // Collapse the run of classes >= 7, keeping the larger one.
                        *slot = (*slot).max(cls);
                    }
                }
            }

            // The last character consumed: the second half of a digraph, else `cur`.
            last_seen = chars[idx + step - 1];
        }

        idx += step;
    }

    out
}
/// Per-coefficient `(lo, hi)` range. `pack_spectral` maps each coefficient
/// from this range onto `[0, 1]` (clamping values outside it) before quantizing.
const CHEB_RANGES: [(f32, f32); 4] = [
(1.0, 6.0), (-3.0, 3.0), (-2.0, 2.0), (-1.5, 1.5), ];
/// Bit width of each quantized coefficient field, in packing order
/// (5 + 7 + 6 + 6 = 24 bits in total).
const CHEB_BITS: [u32; 4] = [5, 7, 6, 6];
/// Bit offset of the length bucket in the packed key. The coefficient fields
/// are packed downward from this offset, so they end at bit 5 and bits 0..5
/// stay zero.
const LENGTH_SHIFT: u32 = 29;
/// Packs a consonant class sequence into a single 32-bit spectral key.
///
/// Returns `0` for an empty slice. Otherwise:
/// * at most `chebyshev::MAX_LEN` leading entries are projected onto the four
///   columns of `chebyshev::table()` (presumably Chebyshev basis values), the
///   sums are scaled by `2 / len`, and the first is halved again;
/// * `chebyshev::length_bucket` of the full length is stored at
///   `LENGTH_SHIFT`;
/// * each coefficient is normalized by `CHEB_RANGES`, quantized to
///   `CHEB_BITS` bits with round-to-nearest, Gray-coded, and packed
///   immediately below the previous field.
#[inline]
#[must_use]
pub fn pack_spectral(consonants: &[u8]) -> u32 {
    if consonants.is_empty() {
        return 0;
    }

    let full_len = consonants.len();
    let used = full_len.min(chebyshev::MAX_LEN);
    let table = chebyshev::table();

    // Accumulate the four projections in one pass over the used prefix.
    let mut coeffs = [0f32; 4];
    for (pos, &class) in consonants.iter().take(used).enumerate() {
        let weight = class as f32;
        let row = table.row(used, pos);
        coeffs[0] += weight * row[0];
        coeffs[1] += weight * row[1];
        coeffs[2] += weight * row[2];
        coeffs[3] += weight * row[3];
    }

    let scale = 2.0 / (used as f32);
    // The zeroth coefficient carries an extra factor of one half.
    coeffs[0] *= scale * 0.5;
    for coeff in &mut coeffs[1..] {
        *coeff *= scale;
    }

    // The bucket uses the untruncated length, not the clamped one.
    let mut packed = (chebyshev::length_bucket(full_len) as u32) << LENGTH_SHIFT;
    let mut shift = LENGTH_SHIFT;
    let fields = coeffs
        .iter()
        .zip(CHEB_RANGES.iter())
        .zip(CHEB_BITS.iter());
    for ((&coeff, &(lo, hi)), &bits) in fields {
        shift -= bits;
        let max_level = (1u32 << bits) - 1;
        let unit = ((coeff - lo) / (hi - lo)).clamp(0.0, 1.0);
        let level = ((unit * (max_level as f32) + 0.5) as u32).min(max_level);
        // Binary-reflected Gray code of the quantized level.
        packed |= (level ^ (level >> 1)) << shift;
    }

    packed
}
/// Builds a 64-bit bloom-style signature from a class sequence.
///
/// Only classes `< 7` take part, and only the first 32 of them. Every
/// adjacent pair and every pair one position apart sets one bit, chosen by
/// `(11 * a + 97 * b + salt) & 63`, with salt `17` for adjacent pairs and
/// `34` for skip-one pairs. Returns `0` when fewer than two such classes
/// are present.
#[inline]
#[must_use]
pub fn bloom_signature(classes: &[u8]) -> u64 {
    // Gather up to 32 qualifying classes into a fixed buffer.
    let mut buf = [0u8; 32];
    let mut count = 0usize;
    for (slot, &class) in buf.iter_mut().zip(classes.iter().filter(|&&c| c < 7)) {
        *slot = class;
        count += 1;
    }
    let cons = &buf[..count];
    if cons.len() < 2 {
        return 0;
    }

    // Single-bit mask selected by an ordered pair of classes and a salt.
    let bit = |first: u8, second: u8, salt: u64| -> u64 {
        let key = u64::from(first)
            .wrapping_mul(11)
            .wrapping_add(u64::from(second).wrapping_mul(97))
            .wrapping_add(salt)
            & 63;
        1u64 << key
    };

    let adjacent = cons.windows(2).map(|w| bit(w[0], w[1], 17));
    let skipping = cons.windows(3).map(|w| bit(w[0], w[2], 34));
    adjacent.chain(skipping).fold(0, |sig, mask| sig | mask)
}
/// Returns the entries of `seq` that are `< 7`, in their original order.
///
/// The result is pre-sized to `seq.len()`, so filling it never reallocates.
#[inline]
fn filter_consonants(seq: &[u8]) -> ClassVec {
    let mut kept = ClassVec::with_capacity(seq.len());
    kept.extend(seq.iter().copied().filter(|&class| class < 7));
    kept
}
/// Encodes one token into a [`Code`].
///
/// The token is run through `preprocess`, then classified with the ambiguous
/// G class set to `1` (the primary reading). Its consonant-only spectral key
/// and its bloom signature become the first entries of `spectrals` and
/// `blooms`.
///
/// If the preprocessed token contains an ambiguous-G character, it is
/// classified again with class `6`. That alternate reading is appended only
/// when its spectral key differs from the primary one, so `spectrals` and
/// `blooms` always have the same length (1 or 2).
///
/// `classes` always holds the primary class sequence.
#[must_use]
pub fn encode_token(token: &str) -> Code {
    let s = preprocess(token);

    let primary = class_sequence(&s, 1);
    let sp1 = pack_spectral(&filter_consonants(&primary));

    let mut spectrals: SpectralVec = SpectralVec::new();
    let mut blooms: BloomVec = BloomVec::new();
    spectrals.push(sp1);
    blooms.push(bloom_signature(&primary));

    if s.chars().any(sonority::is_g_ambiguous) {
        let alternate = class_sequence(&s, 6);
        let sp2 = pack_spectral(&filter_consonants(&alternate));
        if sp2 != sp1 {
            spectrals.push(sp2);
            blooms.push(bloom_signature(&alternate));
        }
    }

    // `primary` is only borrowed above, so it is moved into the result
    // instead of being cloned up front.
    Code {
        spectrals,
        blooms,
        classes: primary,
    }
}
/// Encodes every whitespace-separated token of `name`, in order.
///
/// Returns an empty vector when `name` is empty or contains only whitespace.
#[must_use]
pub fn encode(name: &str) -> Vec<Code> {
    // `split_whitespace` never yields empty slices, so no emptiness filter
    // is needed.
    name.split_whitespace().map(encode_token).collect()
}
/// Encodes each name in `names` with `encode`, preserving input order.
///
/// Accepts any iterable of string-like items (`&str`, `String`, ...). The
/// result has one inner vector per input name.
#[must_use]
pub fn encode_batch<I, S>(names: I) -> Vec<Vec<Code>>
where
    I: IntoIterator<Item = S>,
    S: AsRef<str>,
{
    let pending = names.into_iter();
    let mut batch = Vec::with_capacity(pending.size_hint().0);
    for name in pending {
        batch.push(encode(name.as_ref()));
    }
    batch
}