Skip to main content

ix/
trigram.rs

1//! Trigram extraction — the core indexing primitive.
2//!
3//! Operates on raw bytes. UTF-8 self-synchronization makes this correct
4//! for Unicode for free. No charset detection needed.
5
6/// A trigram is a u32 with high byte always 0.
7/// trigram("abc") = (0x61 << 16) | (0x62 << 8) | 0x63
8pub type Trigram = u32;
9
10/// Convert 3 bytes to a trigram key.
11#[inline]
12pub fn from_bytes(a: u8, b: u8, c: u8) -> Trigram {
13    ((a as u32) << 16) | ((b as u32) << 8) | (c as u32)
14}
15
16/// Reusable trigram extractor. Pre-allocated, reused across files.
17pub struct Extractor {
18    trigram_offsets: Vec<(Trigram, u32)>,
19}
20
21impl Extractor {
22    pub fn new() -> Self {
23        Self {
24            trigram_offsets: Vec::with_capacity(4096),
25        }
26    }
27
28    /// Extract trigrams with byte offsets (for indexing).
29    /// Returns a slice of (trigram, offset) pairs, sorted by trigram then offset.
30    pub fn extract_with_offsets(&mut self, data: &[u8]) -> &[(Trigram, u32)] {
31        self.trigram_offsets.clear();
32
33        if data.len() < 3 {
34            return &self.trigram_offsets;
35        }
36
37        // 1. Collect all pairs
38        for i in 0..=(data.len() - 3) {
39            // Skip trigrams containing null bytes (binary content marker)
40            if data[i] == 0 || data[i + 1] == 0 || data[i + 2] == 0 {
41                continue;
42            }
43
44            let tri = from_bytes(data[i], data[i + 1], data[i + 2]);
45            self.trigram_offsets.push((tri, i as u32));
46
47            // Hard cap to prevent memory explosion (R-02)
48            if self.trigram_offsets.len() >= 1_000_000 {
49                break;
50            }
51        }
52
53        // 2. Sort for efficient grouping later
54        self.trigram_offsets.sort_unstable();
55
56        &self.trigram_offsets
57    }
58
59    /// Extract unique trigram set from a query (no offsets needed).
60    pub fn extract_set(query: &[u8]) -> Vec<Trigram> {
61        if query.len() < 3 {
62            return vec![];
63        }
64        let mut set = Vec::with_capacity(query.len() - 2);
65        for i in 0..=(query.len() - 3) {
66            let tri = from_bytes(query[i], query[i + 1], query[i + 2]);
67            set.push(tri);
68        }
69        set.sort_unstable();
70        set.dedup();
71        set
72    }
73
74    /// Extract trigram groups with case permutations for case-insensitive search.
75    /// Returns one group per original trigram position. Each group contains all
76    /// case variants for that position (max 8 per group for 3 ASCII alpha bytes).
77    /// The executor should UNION within each group and INTERSECT across groups.
78    pub fn extract_groups_case_insensitive(query: &[u8]) -> Vec<Vec<Trigram>> {
79        if query.len() < 3 {
80            return vec![];
81        }
82        let mut groups = Vec::with_capacity(query.len() - 2);
83        for i in 0..=(query.len() - 3) {
84            let bytes = [query[i], query[i + 1], query[i + 2]];
85            let mut variants = case_variants(bytes);
86            variants.sort_unstable();
87            variants.dedup();
88            groups.push(variants);
89        }
90        groups
91    }
92}
93
94/// Generate all case permutations of a 3-byte trigram.
95/// Only ASCII alpha bytes get toggled. Max 8 variants per trigram.
96fn case_variants(bytes: [u8; 3]) -> Vec<Trigram> {
97    let alts: [Vec<u8>; 3] = [
98        byte_cases(bytes[0]),
99        byte_cases(bytes[1]),
100        byte_cases(bytes[2]),
101    ];
102    let mut out = Vec::with_capacity(alts[0].len() * alts[1].len() * alts[2].len());
103    for &a in &alts[0] {
104        for &b in &alts[1] {
105            for &c in &alts[2] {
106                out.push(from_bytes(a, b, c));
107            }
108        }
109    }
110    out
111}
112
113/// Return both cases for ASCII alpha, or just the byte itself.
114#[inline]
115fn byte_cases(b: u8) -> Vec<u8> {
116    if b.is_ascii_uppercase() {
117        vec![b, b.to_ascii_lowercase()]
118    } else if b.is_ascii_lowercase() {
119        vec![b, b.to_ascii_uppercase()]
120    } else {
121        vec![b]
122    }
123}
124
125impl Default for Extractor {
126    fn default() -> Self {
127        Self::new()
128    }
129}
130
131#[cfg(test)]
132mod tests {
133    use super::*;
134
135    #[test]
136    fn empty() {
137        let mut e = Extractor::new();
138        assert!(e.extract_with_offsets(b"").is_empty());
139        assert!(e.extract_with_offsets(b"ab").is_empty());
140    }
141
142    #[test]
143    fn single_trigram() {
144        let mut e = Extractor::new();
145        let result = e.extract_with_offsets(b"abc");
146        assert_eq!(result.len(), 1);
147        assert_eq!(result[0], (from_bytes(b'a', b'b', b'c'), 0));
148    }
149
150    #[test]
151    fn deduplication() {
152        let mut e = Extractor::new();
153        let result = e.extract_with_offsets(b"abcabc");
154        // "abc" at offset 0 and 3, "bca" at 1, "cab" at 2
155        // Sorted: abc:0, abc:3, bca:1, cab:2
156        let abc = from_bytes(b'a', b'b', b'c');
157        let bca = from_bytes(b'b', b'c', b'a');
158        let cab = from_bytes(b'c', b'a', b'b');
159
160        assert_eq!(result.len(), 4);
161        assert_eq!(result[0], (abc, 0));
162        assert_eq!(result[1], (abc, 3));
163        assert_eq!(result[2], (bca, 1));
164        assert_eq!(result[3], (cab, 2));
165    }
166
167    #[test]
168    fn null_bytes_skipped() {
169        let mut e = Extractor::new();
170        let result = e.extract_with_offsets(b"a\x00b");
171        assert!(result.is_empty());
172    }
173
174    #[test]
175    fn high_bytes_work() {
176        let mut e = Extractor::new();
177        let result = e.extract_with_offsets(&[0xFF, 0xFE, 0xFD]);
178        let tri = from_bytes(0xFF, 0xFE, 0xFD);
179        assert_eq!(result[0], (tri, 0));
180    }
181
182    #[test]
183    fn utf8_cafe() {
184        // "caf\xC3\xA9" = UTF-8 for "cafe" with accent
185        let data = "caf\u{00e9}".as_bytes();
186        let set = Extractor::extract_set(data);
187        assert!(!set.is_empty());
188        // Should produce byte trigrams, not char trigrams
189        assert!(set.contains(&from_bytes(b'c', b'a', b'f')));
190    }
191
192    #[test]
193    fn query_set_deduped() {
194        let set = Extractor::extract_set(b"errorerror");
195        // All trigrams from "error" repeated — should be deduped
196        let unique_count = set.len();
197        assert!(unique_count <= 8); // "error" has 3 unique trigrams
198    }
199}