pub type Trigram = u32;
#[inline]
pub fn from_bytes(a: u8, b: u8, c: u8) -> Trigram {
((a as u32) << 16) | ((b as u32) << 8) | (c as u32)
}
pub struct Extractor {
trigram_offsets: Vec<(Trigram, u32)>,
}
impl Extractor {
pub fn new() -> Self {
Self {
trigram_offsets: Vec::with_capacity(4096),
}
}
pub fn extract_with_offsets(&mut self, data: &[u8]) -> &[(Trigram, u32)] {
self.trigram_offsets.clear();
if data.len() < 3 {
return &self.trigram_offsets;
}
for i in 0..=(data.len() - 3) {
if data[i] == 0 || data[i + 1] == 0 || data[i + 2] == 0 {
continue;
}
let tri = from_bytes(data[i], data[i + 1], data[i + 2]);
self.trigram_offsets.push((tri, i as u32));
}
self.trigram_offsets.sort_unstable();
&self.trigram_offsets
}
pub fn extract_set(query: &[u8]) -> Vec<Trigram> {
if query.len() < 3 {
return vec![];
}
let mut set = Vec::with_capacity(query.len() - 2);
for i in 0..=(query.len() - 3) {
let tri = from_bytes(query[i], query[i + 1], query[i + 2]);
set.push(tri);
}
set.sort_unstable();
set.dedup();
set
}
pub fn extract_groups_case_insensitive(query: &[u8]) -> Vec<Vec<Trigram>> {
if query.len() < 3 {
return vec![];
}
let mut groups = Vec::with_capacity(query.len() - 2);
for i in 0..=(query.len() - 3) {
let bytes = [query[i], query[i + 1], query[i + 2]];
let mut variants = case_variants(bytes);
variants.sort_unstable();
variants.dedup();
groups.push(variants);
}
groups
}
}
fn case_variants(bytes: [u8; 3]) -> Vec<Trigram> {
let alts: [Vec<u8>; 3] = [
byte_cases(bytes[0]),
byte_cases(bytes[1]),
byte_cases(bytes[2]),
];
let mut out = Vec::with_capacity(alts[0].len() * alts[1].len() * alts[2].len());
for &a in &alts[0] {
for &b in &alts[1] {
for &c in &alts[2] {
out.push(from_bytes(a, b, c));
}
}
}
out
}
#[inline]
fn byte_cases(b: u8) -> Vec<u8> {
if b.is_ascii_uppercase() {
vec![b, b.to_ascii_lowercase()]
} else if b.is_ascii_lowercase() {
vec![b, b.to_ascii_uppercase()]
} else {
vec![b]
}
}
impl Default for Extractor {
fn default() -> Self {
Self::new()
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn empty() {
let mut e = Extractor::new();
assert!(e.extract_with_offsets(b"").is_empty());
assert!(e.extract_with_offsets(b"ab").is_empty());
}
#[test]
fn single_trigram() {
let mut e = Extractor::new();
let result = e.extract_with_offsets(b"abc");
assert_eq!(result.len(), 1);
assert_eq!(result[0], (from_bytes(b'a', b'b', b'c'), 0));
}
#[test]
fn deduplication() {
let mut e = Extractor::new();
let result = e.extract_with_offsets(b"abcabc");
let abc = from_bytes(b'a', b'b', b'c');
let bca = from_bytes(b'b', b'c', b'a');
let cab = from_bytes(b'c', b'a', b'b');
assert_eq!(result.len(), 4);
assert_eq!(result[0], (abc, 0));
assert_eq!(result[1], (abc, 3));
assert_eq!(result[2], (bca, 1));
assert_eq!(result[3], (cab, 2));
}
#[test]
fn null_bytes_skipped() {
let mut e = Extractor::new();
let result = e.extract_with_offsets(b"a\x00b");
assert!(result.is_empty());
}
#[test]
fn high_bytes_work() {
let mut e = Extractor::new();
let result = e.extract_with_offsets(&[0xFF, 0xFE, 0xFD]);
let tri = from_bytes(0xFF, 0xFE, 0xFD);
assert_eq!(result[0], (tri, 0));
}
#[test]
fn utf8_cafe() {
let data = "caf\u{00e9}".as_bytes();
let set = Extractor::extract_set(data);
assert!(!set.is_empty());
assert!(set.contains(&from_bytes(b'c', b'a', b'f')));
}
#[test]
fn query_set_deduped() {
let set = Extractor::extract_set(b"errorerror");
let unique_count = set.len();
assert!(unique_count <= 8); }
}