Skip to main content

ix/
trigram.rs

1//! Trigram extraction — the core indexing primitive.
2//!
3//! Operates on raw bytes. UTF-8 self-synchronization makes this correct
4//! for Unicode for free. No charset detection needed.
5
6/// A trigram is a u32 with high byte always 0.
7/// trigram("abc") = (0x61 << 16) | (0x62 << 8) | 0x63
8pub type Trigram = u32;
9
10/// Convert 3 bytes to a trigram key.
11#[inline]
12#[must_use]
13pub fn from_bytes(a: u8, b: u8, c: u8) -> Trigram {
14    (u32::from(a) << 16) | (u32::from(b) << 8) | u32::from(c)
15}
16
17/// Reusable trigram extractor. Pre-allocated, reused across files.
18pub struct Extractor {
19    trigram_offsets: Vec<(Trigram, u32)>,
20}
21
22impl Extractor {
23    /// Create a new extractor with pre-allocated capacity.
24    #[must_use]
25    pub fn new() -> Self {
26        Self {
27            trigram_offsets: Vec::with_capacity(4096),
28        }
29    }
30
31    /// Extract trigrams with byte offsets (for indexing).
32    ///
33    /// Returns a slice of `(trigram, offset)` pairs, sorted by trigram then offset.
34    #[must_use]
35    #[allow(clippy::indexing_slicing)] // loop bound: data.len()-3 guarantees i+2 in range
36    #[allow(clippy::as_conversions)] // i < data.len() < u32::MAX for index files
37    pub fn extract_with_offsets(&mut self, data: &[u8]) -> &[(Trigram, u32)] {
38        const MAX_OFFSETS_PER_TRIGRAM: usize = 64;
39        const SMALL_FILE_LIMIT: usize = 1_000_000;
40
41        self.trigram_offsets.clear();
42
43        if data.len() < 3 {
44            return &self.trigram_offsets;
45        }
46
47        if data.len() <= SMALL_FILE_LIMIT {
48            for i in 0..=(data.len() - 3) {
49                if data[i] == 0 || data[i + 1] == 0 || data[i + 2] == 0 {
50                    continue;
51                }
52                let tri = from_bytes(data[i], data[i + 1], data[i + 2]);
53                self.trigram_offsets.push((tri, i as u32));
54            }
55        } else {
56            let mut seen_counts: std::collections::HashMap<Trigram, usize> =
57                std::collections::HashMap::new();
58            for i in 0..=(data.len() - 3) {
59                if data[i] == 0 || data[i + 1] == 0 || data[i + 2] == 0 {
60                    continue;
61                }
62                let tri = from_bytes(data[i], data[i + 1], data[i + 2]);
63                let count = seen_counts.entry(tri).or_insert(0);
64                if *count < MAX_OFFSETS_PER_TRIGRAM {
65                    self.trigram_offsets.push((tri, i as u32));
66                    *count += 1;
67                }
68            }
69        }
70
71        self.trigram_offsets.sort_unstable();
72
73        &self.trigram_offsets
74    }
75
76    /// Extract unique trigram set from a query (no offsets needed).
77    #[must_use]
78    #[allow(clippy::indexing_slicing)] // loop bound: query.len()-3 guarantees i+2 in range
79    pub fn extract_set(query: &[u8]) -> Vec<Trigram> {
80        if query.len() < 3 {
81            return vec![];
82        }
83        let mut set = Vec::with_capacity(query.len() - 2);
84        for i in 0..=(query.len() - 3) {
85            let tri = from_bytes(query[i], query[i + 1], query[i + 2]);
86            set.push(tri);
87        }
88        set.sort_unstable();
89        set.dedup();
90        set
91    }
92
93    /// Extract trigram groups with case permutations for case-insensitive search.
94    ///
95    /// Returns one group per original trigram position. Each group contains all
96    /// case variants for that position (max 8 per group for 3 ASCII alpha bytes).
97    /// The executor should UNION within each group and INTERSECT across groups.
98    #[must_use]
99    #[allow(clippy::indexing_slicing)] // loop bound: query.len()-3 guarantees i+2 in range
100    pub fn extract_groups_case_insensitive(query: &[u8]) -> Vec<Vec<Trigram>> {
101        if query.len() < 3 {
102            return vec![];
103        }
104        let mut groups = Vec::with_capacity(query.len() - 2);
105        for i in 0..=(query.len() - 3) {
106            let bytes = [query[i], query[i + 1], query[i + 2]];
107            let mut variants = case_variants(bytes);
108            variants.sort_unstable();
109            variants.dedup();
110            groups.push(variants);
111        }
112        groups
113    }
114}
115
116/// Generate all case permutations of a 3-byte trigram.
117/// Only ASCII alpha bytes get toggled. Max 8 variants per trigram.
118fn case_variants(bytes: [u8; 3]) -> Vec<Trigram> {
119    let alts: [Vec<u8>; 3] = [
120        byte_cases(bytes[0]),
121        byte_cases(bytes[1]),
122        byte_cases(bytes[2]),
123    ];
124    let mut out = Vec::with_capacity(alts[0].len() * alts[1].len() * alts[2].len());
125    for &a in &alts[0] {
126        for &b in &alts[1] {
127            for &c in &alts[2] {
128                out.push(from_bytes(a, b, c));
129            }
130        }
131    }
132    out
133}
134
135/// Return both cases for ASCII alpha, or just the byte itself.
136#[inline]
137fn byte_cases(b: u8) -> Vec<u8> {
138    if b.is_ascii_uppercase() {
139        vec![b, b.to_ascii_lowercase()]
140    } else if b.is_ascii_lowercase() {
141        vec![b, b.to_ascii_uppercase()]
142    } else {
143        vec![b]
144    }
145}
146
147impl Default for Extractor {
148    fn default() -> Self {
149        Self::new()
150    }
151}
152
153#[cfg(test)]
154#[allow(clippy::as_conversions, clippy::unwrap_used, clippy::indexing_slicing)]
155mod tests {
156    use super::*;
157
158    #[test]
159    fn empty() {
160        let mut e = Extractor::new();
161        assert!(e.extract_with_offsets(b"").is_empty());
162        assert!(e.extract_with_offsets(b"ab").is_empty());
163    }
164
165    #[test]
166    fn single_trigram() {
167        let mut e = Extractor::new();
168        let result = e.extract_with_offsets(b"abc");
169        assert_eq!(result.len(), 1);
170        assert_eq!(result[0], (from_bytes(b'a', b'b', b'c'), 0));
171    }
172
173    #[test]
174    fn deduplication() {
175        let mut e = Extractor::new();
176        let result = e.extract_with_offsets(b"abcabc");
177        let abc = from_bytes(b'a', b'b', b'c');
178        let bca = from_bytes(b'b', b'c', b'a');
179        let cab = from_bytes(b'c', b'a', b'b');
180
181        assert_eq!(result.len(), 4);
182        assert_eq!(result[0], (abc, 0));
183        assert_eq!(result[1], (abc, 3));
184        assert_eq!(result[2], (bca, 1));
185        assert_eq!(result[3], (cab, 2));
186    }
187
188    #[test]
189    fn null_bytes_skipped() {
190        let mut e = Extractor::new();
191        let result = e.extract_with_offsets(b"a\x00b");
192        assert!(result.is_empty());
193    }
194
195    #[test]
196    fn high_bytes_work() {
197        let mut e = Extractor::new();
198        let result = e.extract_with_offsets(&[0xFF, 0xFE, 0xFD]);
199        let tri = from_bytes(0xFF, 0xFE, 0xFD);
200        assert_eq!(result[0], (tri, 0));
201    }
202
203    #[test]
204    fn utf8_cafe() {
205        let data = "caf\u{00e9}".as_bytes();
206        let set = Extractor::extract_set(data);
207        assert!(!set.is_empty());
208        assert!(set.contains(&from_bytes(b'c', b'a', b'f')));
209    }
210
211    #[test]
212    fn query_set_deduped() {
213        let set = Extractor::extract_set(b"errorerror");
214        let unique_count = set.len();
215        assert!(unique_count <= 8);
216    }
217}