Skip to main content

harn_hostlib/code_index/
trigram.rs

1//! Trigram index over file contents.
2//!
3//! Substring-accelerated full-text index: every 3-byte sliding window in a
4//! file becomes a posting in `index[trigram] -> Set<FileId>`. A query is
5//! decomposed into its trigrams; the candidate set is the intersection of
6//! those posting lists. The wire-stable algorithm ASCII case-folds each
7//! byte (`A-Z` -> `a-z`), leaves non-ASCII bytes unchanged, and packs each
8//! 3-byte sliding window as `(a << 16) | (b << 8) | c` into a `u32`.
9
10use std::collections::{HashMap, HashSet};
11
12use super::file_table::FileId;
13
14/// Packed 3-byte sliding-window key. Layout: `(a << 16) | (b << 8) | c`,
15/// case-folded on the ASCII range so lookups are case-insensitive.
16pub type Trigram = u32;
17
18/// Trigram posting list: `trigram -> set of file ids that contain it`,
19/// plus a per-file reverse map for cheap re-indexing.
20#[derive(Debug, Default, Clone)]
21pub struct TrigramIndex {
22    index: HashMap<Trigram, HashSet<FileId>>,
23    file_trigrams: HashMap<FileId, HashSet<Trigram>>,
24}
25
26impl TrigramIndex {
27    /// Construct an empty index.
28    pub fn new() -> Self {
29        Self::default()
30    }
31
32    /// Replace the postings for `id` with the trigrams of `content`.
33    pub fn index_file(&mut self, id: FileId, content: &str) {
34        self.remove_file(id);
35        let trigrams = extract_trigrams(content);
36        if trigrams.is_empty() {
37            return;
38        }
39        for tg in &trigrams {
40            self.index.entry(*tg).or_default().insert(id);
41        }
42        self.file_trigrams.insert(id, trigrams);
43    }
44
45    /// Drop every posting contributed by `id`.
46    pub fn remove_file(&mut self, id: FileId) {
47        let Some(trigrams) = self.file_trigrams.remove(&id) else {
48            return;
49        };
50        for tg in trigrams {
51            if let Some(set) = self.index.get_mut(&tg) {
52                set.remove(&id);
53                if set.is_empty() {
54                    self.index.remove(&tg);
55                }
56            }
57        }
58    }
59
60    /// Return file ids that contain ALL of `trigrams`. Empty input or any
61    /// missing trigram yields an empty set.
62    pub fn query(&self, trigrams: &[Trigram]) -> HashSet<FileId> {
63        if trigrams.is_empty() {
64            return HashSet::new();
65        }
66        let mut postings: Vec<&HashSet<FileId>> = Vec::with_capacity(trigrams.len());
67        for tg in trigrams {
68            match self.index.get(tg) {
69                Some(set) => postings.push(set),
70                None => return HashSet::new(),
71            }
72        }
73        postings.sort_by_key(|s| s.len());
74        let (head, tail) = postings.split_first().expect("non-empty by construction");
75        let mut acc: HashSet<FileId> = (*head).clone();
76        for set in tail {
77            acc.retain(|id| set.contains(id));
78            if acc.is_empty() {
79                return acc;
80            }
81        }
82        acc
83    }
84
85    /// Number of distinct trigrams in the posting table.
86    pub fn distinct_trigrams(&self) -> usize {
87        self.index.len()
88    }
89
90    /// Capture the posting table as a snapshot-friendly vector. Used by
91    /// the on-disk snapshot path; sorted by trigram for deterministic
92    /// serialisation.
93    pub fn snapshot_postings(&self) -> Vec<super::snapshot::TrigramPosting> {
94        let mut out: Vec<super::snapshot::TrigramPosting> = self
95            .index
96            .iter()
97            .map(|(tg, files)| {
98                let mut files: Vec<FileId> = files.iter().copied().collect();
99                files.sort_unstable();
100                super::snapshot::TrigramPosting {
101                    trigram: *tg,
102                    files,
103                }
104            })
105            .collect();
106        out.sort_unstable_by_key(|p| p.trigram);
107        out
108    }
109
110    /// Rebuild a [`TrigramIndex`] from a snapshot's postings vector.
111    pub fn from_postings(postings: Vec<super::snapshot::TrigramPosting>) -> Self {
112        let mut idx = Self::new();
113        for p in postings {
114            let entry = idx.index.entry(p.trigram).or_default();
115            for f in &p.files {
116                entry.insert(*f);
117                idx.file_trigrams.entry(*f).or_default().insert(p.trigram);
118            }
119        }
120        idx
121    }
122
123    /// Order-of-magnitude resident-bytes estimate. Reported by
124    /// `code_index.stats.memory_bytes`.
125    pub fn estimated_bytes(&self) -> usize {
126        // Cheap order-of-magnitude estimate: 4 bytes per posting key plus
127        // ~4 bytes per FileID per posting on both sides.
128        let postings: usize = self.index.values().map(|s| s.len()).sum();
129        let reverse: usize = self.file_trigrams.values().map(|s| s.len()).sum();
130        self.index.len() * 4 + postings * 4 + reverse * 4 + self.file_trigrams.len() * 4
131    }
132}
133
134/// Extract every distinct trigram from `text`. Bytes are lowercased on the
135/// ASCII range; non-ASCII bytes pass through unchanged. Sliding 3-byte
136/// window over the UTF-8 bytes.
137pub fn extract_trigrams(text: &str) -> HashSet<Trigram> {
138    let bytes = text.as_bytes();
139    if bytes.len() < 3 {
140        return HashSet::new();
141    }
142    let mut out: HashSet<Trigram> = HashSet::with_capacity(bytes.len().max(8) / 4);
143    for window in bytes.windows(3) {
144        out.insert(pack(
145            normalize(window[0]),
146            normalize(window[1]),
147            normalize(window[2]),
148        ));
149    }
150    out
151}
152
153/// Decompose a query into its constituent trigrams (deduped) so callers
154/// that pre-compute on the script side stay in lock-step with the host.
155pub fn query_trigrams(query: &str) -> Vec<Trigram> {
156    extract_trigrams(query).into_iter().collect()
157}
158
159/// Pack three normalised bytes into a `Trigram` using the canonical
160/// `(a << 16) | (b << 8) | c` layout.
161#[inline(always)]
162pub fn pack(a: u8, b: u8, c: u8) -> Trigram {
163    (a as u32) << 16 | (b as u32) << 8 | c as u32
164}
165
166#[inline(always)]
167fn normalize(byte: u8) -> u8 {
168    if byte.is_ascii_uppercase() {
169        byte + 0x20
170    } else {
171        byte
172    }
173}
174
175#[cfg(test)]
176mod tests {
177    use super::*;
178
179    #[test]
180    fn extract_handles_short_strings() {
181        assert!(extract_trigrams("").is_empty());
182        assert!(extract_trigrams("ab").is_empty());
183        assert_eq!(extract_trigrams("abc").len(), 1);
184    }
185
186    #[test]
187    fn extract_is_case_insensitive() {
188        let lower = extract_trigrams("foo");
189        let upper = extract_trigrams("FOO");
190        assert_eq!(lower, upper);
191    }
192
193    #[test]
194    fn query_intersects_postings() {
195        let mut idx = TrigramIndex::new();
196        idx.index_file(1, "the quick brown fox");
197        idx.index_file(2, "jumped over the lazy dog");
198        idx.index_file(3, "lorem ipsum dolor sit amet");
199
200        let needle = query_trigrams("the");
201        let hits = idx.query(&needle);
202        assert!(hits.contains(&1));
203        assert!(hits.contains(&2));
204        assert!(!hits.contains(&3));
205    }
206
207    #[test]
208    fn missing_trigram_short_circuits_to_empty() {
209        let mut idx = TrigramIndex::new();
210        idx.index_file(1, "hello world");
211        let needle = query_trigrams("zzzzzz");
212        assert!(idx.query(&needle).is_empty());
213    }
214
215    #[test]
216    fn remove_file_drops_postings() {
217        let mut idx = TrigramIndex::new();
218        idx.index_file(1, "alpha beta");
219        idx.index_file(2, "alpha gamma");
220        idx.remove_file(1);
221        let needle = query_trigrams("alp");
222        let hits = idx.query(&needle);
223        assert!(!hits.contains(&1));
224        assert!(hits.contains(&2));
225    }
226
227    #[test]
228    fn reindex_replaces_postings() {
229        let mut idx = TrigramIndex::new();
230        idx.index_file(1, "alpha beta");
231        idx.index_file(1, "gamma delta");
232        let alpha = query_trigrams("alp");
233        let gamma = query_trigrams("gam");
234        assert!(idx.query(&alpha).is_empty());
235        assert!(idx.query(&gamma).contains(&1));
236    }
237}