Skip to main content

ix/
trigram.rs

1//! Trigram extraction — the core indexing primitive.
2//!
3//! Operates on raw bytes. UTF-8 self-synchronization makes this correct
4//! for Unicode for free. No charset detection needed.
5
6/// A trigram is a u32 with high byte always 0.
7/// trigram("abc") = (0x61 << 16) | (0x62 << 8) | 0x63
8pub type Trigram = u32;
9
10/// Convert 3 bytes to a trigram key.
11#[inline]
12pub fn from_bytes(a: u8, b: u8, c: u8) -> Trigram {
13    ((a as u32) << 16) | ((b as u32) << 8) | (c as u32)
14}
15
16/// Reusable trigram extractor. Pre-allocated, reused across files.
17pub struct Extractor {
18    trigram_offsets: Vec<(Trigram, u32)>,
19}
20
21impl Extractor {
22    pub fn new() -> Self {
23        Self {
24            trigram_offsets: Vec::with_capacity(4096),
25        }
26    }
27
28    /// Extract trigrams with byte offsets (for indexing).
29    /// Returns a slice of (trigram, offset) pairs, sorted by trigram then offset.
30    pub fn extract_with_offsets(&mut self, data: &[u8]) -> &[(Trigram, u32)] {
31        self.trigram_offsets.clear();
32
33        if data.len() < 3 {
34            return &self.trigram_offsets;
35        }
36
37        // 1. Collect all pairs
38        for i in 0..=(data.len() - 3) {
39            // Skip trigrams containing null bytes (binary content marker)
40            if data[i] == 0 || data[i + 1] == 0 || data[i + 2] == 0 {
41                continue;
42            }
43
44            let tri = from_bytes(data[i], data[i + 1], data[i + 2]);
45            self.trigram_offsets.push((tri, i as u32));
46        }
47
48        // 2. Sort for efficient grouping later
49        self.trigram_offsets.sort_unstable();
50
51        &self.trigram_offsets
52    }
53
54    /// Extract unique trigram set from a query (no offsets needed).
55    pub fn extract_set(query: &[u8]) -> Vec<Trigram> {
56        if query.len() < 3 {
57            return vec![];
58        }
59        let mut set = Vec::with_capacity(query.len() - 2);
60        for i in 0..=(query.len() - 3) {
61            let tri = from_bytes(query[i], query[i + 1], query[i + 2]);
62            set.push(tri);
63        }
64        set.sort_unstable();
65        set.dedup();
66        set
67    }
68
69    /// Extract trigram groups with case permutations for case-insensitive search.
70    /// Returns one group per original trigram position. Each group contains all
71    /// case variants for that position (max 8 per group for 3 ASCII alpha bytes).
72    /// The executor should UNION within each group and INTERSECT across groups.
73    pub fn extract_groups_case_insensitive(query: &[u8]) -> Vec<Vec<Trigram>> {
74        if query.len() < 3 {
75            return vec![];
76        }
77        let mut groups = Vec::with_capacity(query.len() - 2);
78        for i in 0..=(query.len() - 3) {
79            let bytes = [query[i], query[i + 1], query[i + 2]];
80            let mut variants = case_variants(bytes);
81            variants.sort_unstable();
82            variants.dedup();
83            groups.push(variants);
84        }
85        groups
86    }
87}
88
89/// Generate all case permutations of a 3-byte trigram.
90/// Only ASCII alpha bytes get toggled. Max 8 variants per trigram.
91fn case_variants(bytes: [u8; 3]) -> Vec<Trigram> {
92    let alts: [Vec<u8>; 3] = [
93        byte_cases(bytes[0]),
94        byte_cases(bytes[1]),
95        byte_cases(bytes[2]),
96    ];
97    let mut out = Vec::with_capacity(alts[0].len() * alts[1].len() * alts[2].len());
98    for &a in &alts[0] {
99        for &b in &alts[1] {
100            for &c in &alts[2] {
101                out.push(from_bytes(a, b, c));
102            }
103        }
104    }
105    out
106}
107
108/// Return both cases for ASCII alpha, or just the byte itself.
109#[inline]
110fn byte_cases(b: u8) -> Vec<u8> {
111    if b.is_ascii_uppercase() {
112        vec![b, b.to_ascii_lowercase()]
113    } else if b.is_ascii_lowercase() {
114        vec![b, b.to_ascii_uppercase()]
115    } else {
116        vec![b]
117    }
118}
119
120impl Default for Extractor {
121    fn default() -> Self {
122        Self::new()
123    }
124}
125
126#[cfg(test)]
127mod tests {
128    use super::*;
129
130    #[test]
131    fn empty() {
132        let mut e = Extractor::new();
133        assert!(e.extract_with_offsets(b"").is_empty());
134        assert!(e.extract_with_offsets(b"ab").is_empty());
135    }
136
137    #[test]
138    fn single_trigram() {
139        let mut e = Extractor::new();
140        let result = e.extract_with_offsets(b"abc");
141        assert_eq!(result.len(), 1);
142        assert_eq!(result[0], (from_bytes(b'a', b'b', b'c'), 0));
143    }
144
145    #[test]
146    fn deduplication() {
147        let mut e = Extractor::new();
148        let result = e.extract_with_offsets(b"abcabc");
149        // "abc" at offset 0 and 3, "bca" at 1, "cab" at 2
150        // Sorted: abc:0, abc:3, bca:1, cab:2
151        let abc = from_bytes(b'a', b'b', b'c');
152        let bca = from_bytes(b'b', b'c', b'a');
153        let cab = from_bytes(b'c', b'a', b'b');
154        
155        assert_eq!(result.len(), 4);
156        assert_eq!(result[0], (abc, 0));
157        assert_eq!(result[1], (abc, 3));
158        assert_eq!(result[2], (bca, 1));
159        assert_eq!(result[3], (cab, 2));
160    }
161
162    #[test]
163    fn null_bytes_skipped() {
164        let mut e = Extractor::new();
165        let result = e.extract_with_offsets(b"a\x00b");
166        assert!(result.is_empty());
167    }
168
169    #[test]
170    fn high_bytes_work() {
171        let mut e = Extractor::new();
172        let result = e.extract_with_offsets(&[0xFF, 0xFE, 0xFD]);
173        let tri = from_bytes(0xFF, 0xFE, 0xFD);
174        assert_eq!(result[0], (tri, 0));
175    }
176
177    #[test]
178    fn utf8_cafe() {
179        // "caf\xC3\xA9" = UTF-8 for "cafe" with accent
180        let data = "caf\u{00e9}".as_bytes();
181        let set = Extractor::extract_set(data);
182        assert!(!set.is_empty());
183        // Should produce byte trigrams, not char trigrams
184        assert!(set.contains(&from_bytes(b'c', b'a', b'f')));
185    }
186
187    #[test]
188    fn query_set_deduped() {
189        let set = Extractor::extract_set(b"errorerror");
190        // All trigrams from "error" repeated — should be deduped
191        let unique_count = set.len();
192        assert!(unique_count <= 8); // "error" has 3 unique trigrams
193    }
194}