Skip to main content

ix/
trigram.rs

1//! Trigram extraction — the core indexing primitive.
2//!
3//! Operates on raw bytes. UTF-8 self-synchronization makes this correct
4//! for Unicode for free. No charset detection needed.
5
6/// A trigram is a u32 with high byte always 0.
7/// trigram("abc") = (0x61 << 16) | (0x62 << 8) | 0x63
8pub type Trigram = u32;
9
10/// Convert 3 bytes to a trigram key.
11#[inline]
12#[must_use]
13pub fn from_bytes(a: u8, b: u8, c: u8) -> Trigram {
14    (u32::from(a) << 16) | (u32::from(b) << 8) | u32::from(c)
15}
16
17/// Reusable trigram extractor. Pre-allocated, reused across files.
18pub struct Extractor {
19    trigram_offsets: Vec<(Trigram, u32)>,
20}
21
22impl Extractor {
23    /// Create a new extractor with pre-allocated capacity.
24    #[must_use]
25    pub fn new() -> Self {
26        Self {
27            trigram_offsets: Vec::with_capacity(4096),
28        }
29    }
30
31    /// Extract trigrams with byte offsets (for indexing).
32    ///
33    /// Returns a slice of `(trigram, offset)` pairs, sorted by trigram then offset.
34    #[must_use]
35    #[allow(clippy::indexing_slicing)] // loop bound: data.len()-3 guarantees i+2 in range
36    #[allow(clippy::as_conversions)] // i < data.len() < u32::MAX for index files
37    pub fn extract_with_offsets(&mut self, data: &[u8]) -> &[(Trigram, u32)] {
38        self.trigram_offsets.clear();
39
40        if data.len() < 3 {
41            return &self.trigram_offsets;
42        }
43
44        for i in 0..=(data.len() - 3) {
45            if data[i] == 0 || data[i + 1] == 0 || data[i + 2] == 0 {
46                continue;
47            }
48
49            let tri = from_bytes(data[i], data[i + 1], data[i + 2]);
50            self.trigram_offsets.push((tri, i as u32));
51
52            if self.trigram_offsets.len() >= 1_000_000 {
53                break;
54            }
55        }
56
57        self.trigram_offsets.sort_unstable();
58
59        &self.trigram_offsets
60    }
61
62    /// Extract unique trigram set from a query (no offsets needed).
63    #[must_use]
64    #[allow(clippy::indexing_slicing)] // loop bound: query.len()-3 guarantees i+2 in range
65    pub fn extract_set(query: &[u8]) -> Vec<Trigram> {
66        if query.len() < 3 {
67            return vec![];
68        }
69        let mut set = Vec::with_capacity(query.len() - 2);
70        for i in 0..=(query.len() - 3) {
71            let tri = from_bytes(query[i], query[i + 1], query[i + 2]);
72            set.push(tri);
73        }
74        set.sort_unstable();
75        set.dedup();
76        set
77    }
78
79    /// Extract trigram groups with case permutations for case-insensitive search.
80    ///
81    /// Returns one group per original trigram position. Each group contains all
82    /// case variants for that position (max 8 per group for 3 ASCII alpha bytes).
83    /// The executor should UNION within each group and INTERSECT across groups.
84    #[must_use]
85    #[allow(clippy::indexing_slicing)] // loop bound: query.len()-3 guarantees i+2 in range
86    pub fn extract_groups_case_insensitive(query: &[u8]) -> Vec<Vec<Trigram>> {
87        if query.len() < 3 {
88            return vec![];
89        }
90        let mut groups = Vec::with_capacity(query.len() - 2);
91        for i in 0..=(query.len() - 3) {
92            let bytes = [query[i], query[i + 1], query[i + 2]];
93            let mut variants = case_variants(bytes);
94            variants.sort_unstable();
95            variants.dedup();
96            groups.push(variants);
97        }
98        groups
99    }
100}
101
102/// Generate all case permutations of a 3-byte trigram.
103/// Only ASCII alpha bytes get toggled. Max 8 variants per trigram.
104fn case_variants(bytes: [u8; 3]) -> Vec<Trigram> {
105    let alts: [Vec<u8>; 3] = [
106        byte_cases(bytes[0]),
107        byte_cases(bytes[1]),
108        byte_cases(bytes[2]),
109    ];
110    let mut out = Vec::with_capacity(alts[0].len() * alts[1].len() * alts[2].len());
111    for &a in &alts[0] {
112        for &b in &alts[1] {
113            for &c in &alts[2] {
114                out.push(from_bytes(a, b, c));
115            }
116        }
117    }
118    out
119}
120
121/// Return both cases for ASCII alpha, or just the byte itself.
122#[inline]
123fn byte_cases(b: u8) -> Vec<u8> {
124    if b.is_ascii_uppercase() {
125        vec![b, b.to_ascii_lowercase()]
126    } else if b.is_ascii_lowercase() {
127        vec![b, b.to_ascii_uppercase()]
128    } else {
129        vec![b]
130    }
131}
132
133impl Default for Extractor {
134    fn default() -> Self {
135        Self::new()
136    }
137}
138
139#[cfg(test)]
140#[allow(clippy::as_conversions, clippy::unwrap_used, clippy::indexing_slicing)]
141mod tests {
142    use super::*;
143
144    #[test]
145    fn empty() {
146        let mut e = Extractor::new();
147        assert!(e.extract_with_offsets(b"").is_empty());
148        assert!(e.extract_with_offsets(b"ab").is_empty());
149    }
150
151    #[test]
152    fn single_trigram() {
153        let mut e = Extractor::new();
154        let result = e.extract_with_offsets(b"abc");
155        assert_eq!(result.len(), 1);
156        assert_eq!(result[0], (from_bytes(b'a', b'b', b'c'), 0));
157    }
158
159    #[test]
160    fn deduplication() {
161        let mut e = Extractor::new();
162        let result = e.extract_with_offsets(b"abcabc");
163        let abc = from_bytes(b'a', b'b', b'c');
164        let bca = from_bytes(b'b', b'c', b'a');
165        let cab = from_bytes(b'c', b'a', b'b');
166
167        assert_eq!(result.len(), 4);
168        assert_eq!(result[0], (abc, 0));
169        assert_eq!(result[1], (abc, 3));
170        assert_eq!(result[2], (bca, 1));
171        assert_eq!(result[3], (cab, 2));
172    }
173
174    #[test]
175    fn null_bytes_skipped() {
176        let mut e = Extractor::new();
177        let result = e.extract_with_offsets(b"a\x00b");
178        assert!(result.is_empty());
179    }
180
181    #[test]
182    fn high_bytes_work() {
183        let mut e = Extractor::new();
184        let result = e.extract_with_offsets(&[0xFF, 0xFE, 0xFD]);
185        let tri = from_bytes(0xFF, 0xFE, 0xFD);
186        assert_eq!(result[0], (tri, 0));
187    }
188
189    #[test]
190    fn utf8_cafe() {
191        let data = "caf\u{00e9}".as_bytes();
192        let set = Extractor::extract_set(data);
193        assert!(!set.is_empty());
194        assert!(set.contains(&from_bytes(b'c', b'a', b'f')));
195    }
196
197    #[test]
198    fn query_set_deduped() {
199        let set = Extractor::extract_set(b"errorerror");
200        let unique_count = set.len();
201        assert!(unique_count <= 8);
202    }
203}