use alloc::boxed::Box;
use alloc::vec::Vec;
use super::dictionary::{self, MAX_DICTIONARY_WORD_LENGTH, MIN_DICTIONARY_WORD_LENGTH};
use super::transforms::{PREFIX_SUFFIX, TRANSFORMS, Tr};
const HASH_BITS: u32 = 15;
const HASH_SIZE: usize = 1 << HASH_BITS;
const NIL: u16 = u16::MAX;
const NUM_WORDS: usize = compute_num_words();
const fn compute_num_words() -> usize {
let mut total = 0usize;
let mut len = MIN_DICTIONARY_WORD_LENGTH;
while len <= MAX_DICTIONARY_WORD_LENGTH {
let bits = dictionary::SIZE_BITS_BY_LENGTH[len];
if bits > 0 {
total += 1usize << bits;
}
len += 1;
}
total
}
#[derive(Clone, Copy, Debug)]
struct WordRef(u32);
impl WordRef {
const fn new(len: u8, idx: u32) -> Self {
Self((idx << 5) | (len as u32 & 0x1F))
}
fn len(self) -> u8 {
(self.0 & 0x1F) as u8
}
fn idx(self) -> u32 {
self.0 >> 5
}
}
pub(crate) struct DictIndex {
head: Box<[u16; HASH_SIZE]>,
entries: Box<[(WordRef, u16)]>,
}
impl core::fmt::Debug for DictIndex {
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
f.debug_struct("DictIndex")
.field("hash_size", &HASH_SIZE)
.field("entries", &self.entries.len())
.finish()
}
}
fn hash4(b0: u8, b1: u8, b2: u8, b3: u8) -> u32 {
let v = (b0 as u32) | ((b1 as u32) << 8) | ((b2 as u32) << 16) | ((b3 as u32) << 24);
v.wrapping_mul(0x9E37_79B1) >> (32 - HASH_BITS)
}
impl DictIndex {
pub(crate) fn build() -> Self {
let mut head: Box<[u16; HASH_SIZE]> = Box::new([NIL; HASH_SIZE]);
let mut entries: Vec<(WordRef, u16)> = Vec::with_capacity(NUM_WORDS);
for len in MIN_DICTIONARY_WORD_LENGTH..=MAX_DICTIONARY_WORD_LENGTH {
let bits = dictionary::SIZE_BITS_BY_LENGTH[len];
if bits == 0 {
continue;
}
let count = 1u32 << bits;
for idx in 0..count {
let word = match dictionary::word(len, idx) {
Some(w) => w,
None => continue,
};
if word.len() < 4 {
continue;
}
let h = hash4(word[0], word[1], word[2], word[3]);
let bucket = h as usize;
let entry_idx = entries.len();
if entry_idx >= u16::MAX as usize {
break;
}
let prev = head[bucket];
head[bucket] = entry_idx as u16;
entries.push((WordRef::new(len as u8, idx), prev));
}
}
Self {
head,
entries: entries.into_boxed_slice(),
}
}
fn for_each_candidate<F: FnMut(u8, u32, &'static [u8])>(&self, key: &[u8], mut f: F) {
if key.len() < 4 {
return;
}
let h = hash4(key[0], key[1], key[2], key[3]);
let mut cur = self.head[h as usize];
let mut steps = 0usize;
while cur != NIL && steps < 32 {
let (wref, next) = self.entries[cur as usize];
let len = wref.len();
let idx = wref.idx();
if let Some(w) = dictionary::word(len as usize, idx) {
f(len, idx, w);
}
cur = next;
steps += 1;
}
}
}
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub(crate) enum BodyKind {
Identity,
UppercaseFirstAscii,
#[allow(dead_code)]
OmitLast(u8),
}
#[derive(Clone, Copy, Debug)]
pub(crate) struct IdTransform {
pub(crate) id: u8,
pub(crate) prefix: &'static [u8],
pub(crate) suffix: &'static [u8],
pub(crate) body: BodyKind,
}
pub(crate) fn identity_transforms() -> Vec<IdTransform> {
let mut v = Vec::with_capacity(128);
for (id, &(pre, kind, suf)) in TRANSFORMS.iter().enumerate() {
let body = match kind {
Tr::Identity => BodyKind::Identity,
Tr::UppercaseFirst => BodyKind::UppercaseFirstAscii,
_ => continue,
};
v.push(IdTransform {
id: id as u8,
prefix: PREFIX_SUFFIX[pre as usize],
suffix: PREFIX_SUFFIX[suf as usize],
body,
});
}
v
}
#[derive(Clone, Copy, Debug)]
pub(crate) struct DictMatch {
pub(crate) word_len: u8,
pub(crate) word_idx: u32,
pub(crate) transform_id: u8,
pub(crate) emit_len: u32,
}
pub(crate) fn find_dict_match(
index: &DictIndex,
transforms: &[IdTransform],
input: &[u8],
pos: usize,
min_emit_len: u32,
) -> Option<DictMatch> {
if pos >= input.len() {
return None;
}
let tail = &input[pos..];
let mut best: Option<DictMatch> = None;
let mut best_len: u32 = min_emit_len.saturating_sub(1);
for tr in transforms {
let pre = tr.prefix;
let suf = tr.suffix;
if tail.len() < pre.len() + MIN_DICTIONARY_WORD_LENGTH + suf.len() {
continue;
}
if !tail.starts_with(pre) {
continue;
}
let after_pre = &tail[pre.len()..];
if after_pre.len() < 4 {
continue;
}
let (key_first, body_offset_first_byte) = match tr.body {
BodyKind::Identity => (after_pre[0], 0u8),
BodyKind::UppercaseFirstAscii => {
if !after_pre[0].is_ascii_uppercase() {
continue;
}
(after_pre[0] | 0x20, 32u8)
}
BodyKind::OmitLast(_) => (after_pre[0], 0u8),
};
let key = [key_first, after_pre[1], after_pre[2], after_pre[3]];
let body_kind = tr.body;
index.for_each_candidate(&key, |word_len, word_idx, word| {
let wl = word_len as usize;
let body_len = match body_kind {
BodyKind::Identity | BodyKind::UppercaseFirstAscii => wl,
BodyKind::OmitLast(n) => {
let n = n as usize;
if wl <= n + 3 {
return;
}
wl - n
}
};
if after_pre.len() < body_len + suf.len() {
return;
}
if after_pre[0] ^ body_offset_first_byte != word[0] {
return;
}
if body_len > 1 && !after_pre[1..body_len].eq(&word[1..body_len]) {
return;
}
if !suf.is_empty() && !after_pre[body_len..body_len + suf.len()].eq(suf) {
return;
}
let emit_len = (pre.len() + body_len + suf.len()) as u32;
if emit_len > best_len {
best_len = emit_len;
best = Some(DictMatch {
word_len,
word_idx,
transform_id: tr.id,
emit_len,
});
}
});
}
best
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn build_does_not_panic() {
let _ = DictIndex::build();
}
#[test]
fn find_time_at_zero() {
let index = DictIndex::build();
let trs = identity_transforms();
let input = b"time";
let m = find_dict_match(&index, &trs, input, 0, 4).expect("should find 'time'");
assert_eq!(m.word_len, 4);
assert_eq!(m.word_idx, 0);
assert_eq!(m.emit_len, 4);
}
#[test]
fn find_time_space_suffix() {
let index = DictIndex::build();
let trs = identity_transforms();
let input = b"time ";
let m = find_dict_match(&index, &trs, input, 0, 4).expect("should find 'time '");
assert!(m.emit_len >= 4);
}
}