pub const PAD_LEFT: u8 = 0x01;
pub const PAD_RIGHT: u8 = 0x03;
pub const TRIGRAM_LEN: usize = 3;
#[must_use]
pub fn hash_trigram(trigram: &[u8]) -> u64 {
let h = blake3::hash(trigram);
let bytes = h.as_bytes();
let mut head = [0_u8; 8];
head.copy_from_slice(&bytes[..8]);
u64::from_le_bytes(head)
}
#[must_use]
pub fn extract_trigrams(text: &[u8]) -> Vec<u64> {
let mut padded: Vec<u8> = Vec::with_capacity(text.len() + 4);
padded.push(PAD_LEFT);
padded.push(PAD_LEFT);
padded.extend_from_slice(text);
padded.push(PAD_RIGHT);
padded.push(PAD_RIGHT);
if padded.len() < TRIGRAM_LEN {
return Vec::new();
}
padded.windows(TRIGRAM_LEN).map(hash_trigram).collect()
}
#[must_use]
pub fn extract_trigram_set(text: &[u8]) -> Vec<u64> {
let mut v = extract_trigrams(text);
v.sort_unstable();
v.dedup();
v
}
#[must_use]
pub fn extract_query_trigrams(query: &[u8]) -> Vec<u64> {
if query.len() < TRIGRAM_LEN {
return Vec::new();
}
query.windows(TRIGRAM_LEN).map(hash_trigram).collect()
}
#[must_use]
pub fn extract_query_trigram_set(query: &[u8]) -> Vec<u64> {
let mut v = extract_query_trigrams(query);
v.sort_unstable();
v.dedup();
v
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn extract_trigrams_simple_string() {
let v = extract_trigrams(b"abc");
assert_eq!(v.len(), 5);
let mut sorted = v.clone();
sorted.sort_unstable();
sorted.dedup();
assert_eq!(sorted.len(), 5);
}
#[test]
fn extract_trigrams_short_input_one_byte() {
let v = extract_trigrams(b"x");
assert_eq!(v.len(), 3);
}
#[test]
fn extract_trigrams_short_input_two_bytes() {
let v = extract_trigrams(b"xy");
assert_eq!(v.len(), 4);
}
#[test]
fn extract_trigrams_empty_input() {
let v = extract_trigrams(b"");
assert_eq!(v.len(), 2);
}
#[test]
fn extract_trigrams_unicode_byte_level() {
let s: &[u8] = b"a\xc3\xa9";
let v = extract_trigrams(s);
assert_eq!(v.len(), 5);
}
#[test]
fn hash_trigram_deterministic() {
let a = hash_trigram(b"foo");
let b = hash_trigram(b"foo");
assert_eq!(a, b);
}
#[test]
fn hash_trigram_distinct_for_different_inputs() {
let a = hash_trigram(b"foo");
let b = hash_trigram(b"foa");
let c = hash_trigram(b"oof");
assert_ne!(a, b);
assert_ne!(a, c);
assert_ne!(b, c);
}
#[test]
fn extract_trigram_set_dedups() {
let raw = extract_trigrams(b"aaaa");
let set = extract_trigram_set(b"aaaa");
assert!(set.len() <= raw.len());
for w in set.windows(2) {
assert!(w[0] < w[1]);
}
}
#[test]
fn extract_query_trigrams_unpadded() {
let v = extract_query_trigrams(b"abcdef");
assert_eq!(v.len(), 4);
assert_eq!(v[0], hash_trigram(b"abc"));
assert_eq!(v[1], hash_trigram(b"bcd"));
assert_eq!(v[2], hash_trigram(b"cde"));
assert_eq!(v[3], hash_trigram(b"def"));
}
#[test]
fn extract_query_trigrams_short_input_is_empty() {
assert!(extract_query_trigrams(b"").is_empty());
assert!(extract_query_trigrams(b"a").is_empty());
assert!(extract_query_trigrams(b"ab").is_empty());
}
#[test]
fn extract_query_trigrams_three_byte_input_yields_one_trigram() {
let v = extract_query_trigrams(b"abc");
assert_eq!(v, vec![hash_trigram(b"abc")]);
}
#[test]
fn query_trigrams_are_subset_of_padded_trigrams() {
let doc_set = extract_trigram_set(b"hello world");
let q_set = extract_query_trigram_set(b"hello");
for q in &q_set {
assert!(doc_set.contains(q), "query trigram {q:#x} not in doc set",);
}
}
}