Skip to main content

harn_hostlib/code_index/
trigram.rs

1//! Trigram index over file contents.
2//!
3//! Substring-accelerated full-text index: every 3-byte sliding window in a
4//! file becomes a posting in `index[trigram] -> Set<FileId>`. A query is
5//! decomposed into its trigrams; the candidate set is the intersection of
6//! those posting lists. Matches the Swift `TrigramIndex` byte-for-byte:
7//! ASCII case-fold on each byte (`A-Z` -> `a-z`), non-ASCII passes through,
8//! 3-byte sliding window packed `(a << 16) | (b << 8) | c` into a `u32`.
9
10use std::collections::{HashMap, HashSet};
11
12use super::file_table::FileId;
13
14/// Packed 3-byte sliding-window key. Layout: `(a << 16) | (b << 8) | c`,
15/// case-folded on the ASCII range so lookups are case-insensitive.
16pub type Trigram = u32;
17
18/// Trigram posting list: `trigram -> set of file ids that contain it`,
19/// plus a per-file reverse map for cheap re-indexing.
20#[derive(Debug, Default, Clone)]
21pub struct TrigramIndex {
22    index: HashMap<Trigram, HashSet<FileId>>,
23    file_trigrams: HashMap<FileId, HashSet<Trigram>>,
24}
25
26impl TrigramIndex {
27    /// Construct an empty index.
28    pub fn new() -> Self {
29        Self::default()
30    }
31
32    /// Replace the postings for `id` with the trigrams of `content`.
33    pub fn index_file(&mut self, id: FileId, content: &str) {
34        self.remove_file(id);
35        let trigrams = extract_trigrams(content);
36        if trigrams.is_empty() {
37            return;
38        }
39        for tg in &trigrams {
40            self.index.entry(*tg).or_default().insert(id);
41        }
42        self.file_trigrams.insert(id, trigrams);
43    }
44
45    /// Drop every posting contributed by `id`.
46    pub fn remove_file(&mut self, id: FileId) {
47        let Some(trigrams) = self.file_trigrams.remove(&id) else {
48            return;
49        };
50        for tg in trigrams {
51            if let Some(set) = self.index.get_mut(&tg) {
52                set.remove(&id);
53                if set.is_empty() {
54                    self.index.remove(&tg);
55                }
56            }
57        }
58    }
59
60    /// Return file ids that contain ALL of `trigrams`. Empty input or any
61    /// missing trigram yields an empty set.
62    pub fn query(&self, trigrams: &[Trigram]) -> HashSet<FileId> {
63        if trigrams.is_empty() {
64            return HashSet::new();
65        }
66        let mut postings: Vec<&HashSet<FileId>> = Vec::with_capacity(trigrams.len());
67        for tg in trigrams {
68            match self.index.get(tg) {
69                Some(set) => postings.push(set),
70                None => return HashSet::new(),
71            }
72        }
73        postings.sort_by_key(|s| s.len());
74        let (head, tail) = postings.split_first().expect("non-empty by construction");
75        let mut acc: HashSet<FileId> = (*head).clone();
76        for set in tail {
77            acc.retain(|id| set.contains(id));
78            if acc.is_empty() {
79                return acc;
80            }
81        }
82        acc
83    }
84
85    /// Number of distinct trigrams in the posting table.
86    pub fn distinct_trigrams(&self) -> usize {
87        self.index.len()
88    }
89
90    /// Capture the posting table as a snapshot-friendly vector. Used by
91    /// the on-disk snapshot path; sorted by trigram for deterministic
92    /// serialisation.
93    pub fn snapshot_postings(&self) -> Vec<super::snapshot::TrigramPosting> {
94        let mut out: Vec<super::snapshot::TrigramPosting> = self
95            .index
96            .iter()
97            .map(|(tg, files)| {
98                let mut files: Vec<FileId> = files.iter().copied().collect();
99                files.sort_unstable();
100                super::snapshot::TrigramPosting {
101                    trigram: *tg,
102                    files,
103                }
104            })
105            .collect();
106        out.sort_unstable_by_key(|p| p.trigram);
107        out
108    }
109
110    /// Rebuild a [`TrigramIndex`] from a snapshot's postings vector.
111    pub fn from_postings(postings: Vec<super::snapshot::TrigramPosting>) -> Self {
112        let mut idx = Self::new();
113        for p in postings {
114            let entry = idx.index.entry(p.trigram).or_default();
115            for f in &p.files {
116                entry.insert(*f);
117                idx.file_trigrams.entry(*f).or_default().insert(p.trigram);
118            }
119        }
120        idx
121    }
122
123    /// Order-of-magnitude resident-bytes estimate. Reported by
124    /// `code_index.stats.memory_bytes`.
125    pub fn estimated_bytes(&self) -> usize {
126        // Cheap order-of-magnitude estimate: 4 bytes per posting key plus
127        // ~4 bytes per FileID per posting on both sides.
128        let postings: usize = self.index.values().map(|s| s.len()).sum();
129        let reverse: usize = self.file_trigrams.values().map(|s| s.len()).sum();
130        self.index.len() * 4 + postings * 4 + reverse * 4 + self.file_trigrams.len() * 4
131    }
132}
133
134/// Extract every distinct trigram from `text`. Bytes are lowercased on the
135/// ASCII range; non-ASCII bytes pass through unchanged. Sliding 3-byte
136/// window over the UTF-8 bytes — matches Swift's
137/// `TrigramIndex.extractTrigrams` byte-for-byte.
138pub fn extract_trigrams(text: &str) -> HashSet<Trigram> {
139    let bytes = text.as_bytes();
140    if bytes.len() < 3 {
141        return HashSet::new();
142    }
143    let mut out: HashSet<Trigram> = HashSet::with_capacity(bytes.len().max(8) / 4);
144    for window in bytes.windows(3) {
145        out.insert(pack(
146            normalize(window[0]),
147            normalize(window[1]),
148            normalize(window[2]),
149        ));
150    }
151    out
152}
153
154/// Decompose a query into its constituent trigrams (deduped). Mirrors the
155/// Swift `TrigramIndex.queryTrigrams` shape so callers that pre-compute on
156/// the script side stay in lock-step with the host.
157pub fn query_trigrams(query: &str) -> Vec<Trigram> {
158    extract_trigrams(query).into_iter().collect()
159}
160
161/// Pack three normalised bytes into a `Trigram` using the canonical
162/// `(a << 16) | (b << 8) | c` layout.
163#[inline(always)]
164pub fn pack(a: u8, b: u8, c: u8) -> Trigram {
165    (a as u32) << 16 | (b as u32) << 8 | c as u32
166}
167
168#[inline(always)]
169fn normalize(byte: u8) -> u8 {
170    if byte.is_ascii_uppercase() {
171        byte + 0x20
172    } else {
173        byte
174    }
175}
176
177#[cfg(test)]
178mod tests {
179    use super::*;
180
181    #[test]
182    fn extract_handles_short_strings() {
183        assert!(extract_trigrams("").is_empty());
184        assert!(extract_trigrams("ab").is_empty());
185        assert_eq!(extract_trigrams("abc").len(), 1);
186    }
187
188    #[test]
189    fn extract_is_case_insensitive() {
190        let lower = extract_trigrams("foo");
191        let upper = extract_trigrams("FOO");
192        assert_eq!(lower, upper);
193    }
194
195    #[test]
196    fn query_intersects_postings() {
197        let mut idx = TrigramIndex::new();
198        idx.index_file(1, "the quick brown fox");
199        idx.index_file(2, "jumped over the lazy dog");
200        idx.index_file(3, "lorem ipsum dolor sit amet");
201
202        let needle = query_trigrams("the");
203        let hits = idx.query(&needle);
204        assert!(hits.contains(&1));
205        assert!(hits.contains(&2));
206        assert!(!hits.contains(&3));
207    }
208
209    #[test]
210    fn missing_trigram_short_circuits_to_empty() {
211        let mut idx = TrigramIndex::new();
212        idx.index_file(1, "hello world");
213        let needle = query_trigrams("zzzzzz");
214        assert!(idx.query(&needle).is_empty());
215    }
216
217    #[test]
218    fn remove_file_drops_postings() {
219        let mut idx = TrigramIndex::new();
220        idx.index_file(1, "alpha beta");
221        idx.index_file(2, "alpha gamma");
222        idx.remove_file(1);
223        let needle = query_trigrams("alp");
224        let hits = idx.query(&needle);
225        assert!(!hits.contains(&1));
226        assert!(hits.contains(&2));
227    }
228
229    #[test]
230    fn reindex_replaces_postings() {
231        let mut idx = TrigramIndex::new();
232        idx.index_file(1, "alpha beta");
233        idx.index_file(1, "gamma delta");
234        let alpha = query_trigrams("alp");
235        let gamma = query_trigrams("gam");
236        assert!(idx.query(&alpha).is_empty());
237        assert!(idx.query(&gamma).contains(&1));
238    }
239}