Skip to main content

ix/
trigram.rs

1//! Trigram extraction — the core indexing primitive.
2//!
3//! Operates on raw bytes. UTF-8 self-synchronization makes this correct
4//! for Unicode for free. No charset detection needed.
5
6use std::collections::HashMap;
7
8/// A trigram is a u32 with high byte always 0.
9/// trigram("abc") = (0x61 << 16) | (0x62 << 8) | 0x63
10pub type Trigram = u32;
11
12/// Convert 3 bytes to a trigram key.
13#[inline]
14pub fn from_bytes(a: u8, b: u8, c: u8) -> Trigram {
15    ((a as u32) << 16) | ((b as u32) << 8) | (c as u32)
16}
17
18/// Reusable trigram extractor. Pre-allocated, reused across files.
19pub struct Extractor {
20    trigram_offsets: HashMap<Trigram, Vec<u32>>,
21}
22
23impl Extractor {
24    pub fn new() -> Self {
25        Self {
26            trigram_offsets: HashMap::with_capacity(4096),
27        }
28    }
29
30    /// Extract trigrams with byte offsets (for indexing).
31    pub fn extract_with_offsets(&mut self, data: &[u8]) -> &HashMap<Trigram, Vec<u32>> {
32        self.trigram_offsets.clear();
33
34        if data.len() < 3 {
35            return &self.trigram_offsets;
36        }
37
38        for i in 0..=(data.len() - 3) {
39            // Skip trigrams containing null bytes (binary content marker)
40            if data[i] == 0 || data[i + 1] == 0 || data[i + 2] == 0 {
41                continue;
42            }
43
44            let tri = from_bytes(data[i], data[i + 1], data[i + 2]);
45            self.trigram_offsets.entry(tri).or_default().push(i as u32);
46        }
47
48        &self.trigram_offsets
49    }
50
51    /// Extract unique trigram set from a query (no offsets needed).
52    pub fn extract_set(query: &[u8]) -> Vec<Trigram> {
53        if query.len() < 3 {
54            return vec![];
55        }
56        let mut set = Vec::with_capacity(query.len() - 2);
57        for i in 0..=(query.len() - 3) {
58            let tri = from_bytes(query[i], query[i + 1], query[i + 2]);
59            set.push(tri);
60        }
61        set.sort_unstable();
62        set.dedup();
63        set
64    }
65
66    /// Extract trigram groups with case permutations for case-insensitive search.
67    /// Returns one group per original trigram position. Each group contains all
68    /// case variants for that position (max 8 per group for 3 ASCII alpha bytes).
69    /// The executor should UNION within each group and INTERSECT across groups.
70    pub fn extract_groups_case_insensitive(query: &[u8]) -> Vec<Vec<Trigram>> {
71        if query.len() < 3 {
72            return vec![];
73        }
74        let mut groups = Vec::with_capacity(query.len() - 2);
75        for i in 0..=(query.len() - 3) {
76            let bytes = [query[i], query[i + 1], query[i + 2]];
77            let mut variants = case_variants(bytes);
78            variants.sort_unstable();
79            variants.dedup();
80            groups.push(variants);
81        }
82        groups
83    }
84}
85
86/// Generate all case permutations of a 3-byte trigram.
87/// Only ASCII alpha bytes get toggled. Max 8 variants per trigram.
88fn case_variants(bytes: [u8; 3]) -> Vec<Trigram> {
89    let alts: [Vec<u8>; 3] = [
90        byte_cases(bytes[0]),
91        byte_cases(bytes[1]),
92        byte_cases(bytes[2]),
93    ];
94    let mut out = Vec::with_capacity(alts[0].len() * alts[1].len() * alts[2].len());
95    for &a in &alts[0] {
96        for &b in &alts[1] {
97            for &c in &alts[2] {
98                out.push(from_bytes(a, b, c));
99            }
100        }
101    }
102    out
103}
104
105/// Return both cases for ASCII alpha, or just the byte itself.
106#[inline]
107fn byte_cases(b: u8) -> Vec<u8> {
108    if b.is_ascii_uppercase() {
109        vec![b, b.to_ascii_lowercase()]
110    } else if b.is_ascii_lowercase() {
111        vec![b, b.to_ascii_uppercase()]
112    } else {
113        vec![b]
114    }
115}
116
117impl Default for Extractor {
118    fn default() -> Self {
119        Self::new()
120    }
121}
122
123#[cfg(test)]
124mod tests {
125    use super::*;
126
127    #[test]
128    fn empty() {
129        let mut e = Extractor::new();
130        assert!(e.extract_with_offsets(b"").is_empty());
131        assert!(e.extract_with_offsets(b"ab").is_empty());
132    }
133
134    #[test]
135    fn single_trigram() {
136        let mut e = Extractor::new();
137        let result = e.extract_with_offsets(b"abc");
138        assert_eq!(result.len(), 1);
139        assert!(result.contains_key(&from_bytes(b'a', b'b', b'c')));
140    }
141
142    #[test]
143    fn deduplication() {
144        let mut e = Extractor::new();
145        let result = e.extract_with_offsets(b"abcabc");
146        // "abc" at offset 0 and 3, "bca" at 1, "cab" at 2
147        let abc = from_bytes(b'a', b'b', b'c');
148        assert_eq!(result[&abc].len(), 2);
149        assert_eq!(result[&abc], vec![0, 3]);
150    }
151
152    #[test]
153    fn null_bytes_skipped() {
154        let mut e = Extractor::new();
155        let result = e.extract_with_offsets(b"a\x00b");
156        assert!(result.is_empty());
157    }
158
159    #[test]
160    fn high_bytes_work() {
161        let mut e = Extractor::new();
162        let result = e.extract_with_offsets(&[0xFF, 0xFE, 0xFD]);
163        let tri = from_bytes(0xFF, 0xFE, 0xFD);
164        assert!(result.contains_key(&tri));
165    }
166
167    #[test]
168    fn utf8_cafe() {
169        // "caf\xC3\xA9" = UTF-8 for "cafe" with accent
170        let data = "caf\u{00e9}".as_bytes();
171        let set = Extractor::extract_set(data);
172        assert!(!set.is_empty());
173        // Should produce byte trigrams, not char trigrams
174        assert!(set.contains(&from_bytes(b'c', b'a', b'f')));
175    }
176
177    #[test]
178    fn query_set_deduped() {
179        let set = Extractor::extract_set(b"errorerror");
180        // All trigrams from "error" repeated — should be deduped
181        let unique_count = set.len();
182        assert!(unique_count <= 8); // "error" has 3 unique trigrams
183    }
184}