Skip to main content

harn_hostlib/code_index/
trigram.rs

1//! Trigram index over file contents.
2//!
3//! Substring-accelerated full-text index: every 3-byte sliding window in a
4//! file becomes a posting in `index[trigram] -> Set<FileId>`. A query is
5//! decomposed into its trigrams; the candidate set is the intersection of
6//! those posting lists. Matches the Swift `TrigramIndex` byte-for-byte:
7//! ASCII case-fold on each byte (`A-Z` -> `a-z`), non-ASCII passes through,
8//! 3-byte sliding window packed `(a << 16) | (b << 8) | c` into a `u32`.
9
10use std::collections::{HashMap, HashSet};
11
12use super::file_table::FileId;
13
14/// Packed 3-byte sliding-window key. Layout: `(a << 16) | (b << 8) | c`,
15/// case-folded on the ASCII range so lookups are case-insensitive.
16pub type Trigram = u32;
17
18/// Trigram posting list: `trigram -> set of file ids that contain it`,
19/// plus a per-file reverse map for cheap re-indexing.
20#[derive(Debug, Default, Clone)]
21pub struct TrigramIndex {
22    index: HashMap<Trigram, HashSet<FileId>>,
23    file_trigrams: HashMap<FileId, HashSet<Trigram>>,
24}
25
26impl TrigramIndex {
27    /// Construct an empty index.
28    pub fn new() -> Self {
29        Self::default()
30    }
31
32    /// Replace the postings for `id` with the trigrams of `content`.
33    pub fn index_file(&mut self, id: FileId, content: &str) {
34        self.remove_file(id);
35        let trigrams = extract_trigrams(content);
36        if trigrams.is_empty() {
37            return;
38        }
39        for tg in &trigrams {
40            self.index.entry(*tg).or_default().insert(id);
41        }
42        self.file_trigrams.insert(id, trigrams);
43    }
44
45    /// Drop every posting contributed by `id`.
46    pub fn remove_file(&mut self, id: FileId) {
47        let Some(trigrams) = self.file_trigrams.remove(&id) else {
48            return;
49        };
50        for tg in trigrams {
51            if let Some(set) = self.index.get_mut(&tg) {
52                set.remove(&id);
53                if set.is_empty() {
54                    self.index.remove(&tg);
55                }
56            }
57        }
58    }
59
60    /// Return file ids that contain ALL of `trigrams`. Empty input or any
61    /// missing trigram yields an empty set.
62    pub fn query(&self, trigrams: &[Trigram]) -> HashSet<FileId> {
63        if trigrams.is_empty() {
64            return HashSet::new();
65        }
66        let mut postings: Vec<&HashSet<FileId>> = Vec::with_capacity(trigrams.len());
67        for tg in trigrams {
68            match self.index.get(tg) {
69                Some(set) => postings.push(set),
70                None => return HashSet::new(),
71            }
72        }
73        postings.sort_by_key(|s| s.len());
74        let (head, tail) = postings.split_first().expect("non-empty by construction");
75        let mut acc: HashSet<FileId> = (*head).clone();
76        for set in tail {
77            acc.retain(|id| set.contains(id));
78            if acc.is_empty() {
79                return acc;
80            }
81        }
82        acc
83    }
84
85    /// Number of distinct trigrams in the posting table.
86    pub fn distinct_trigrams(&self) -> usize {
87        self.index.len()
88    }
89
90    /// Order-of-magnitude resident-bytes estimate. Reported by
91    /// `code_index.stats.memory_bytes`.
92    pub fn estimated_bytes(&self) -> usize {
93        // Cheap order-of-magnitude estimate: 4 bytes per posting key plus
94        // ~4 bytes per FileID per posting on both sides.
95        let postings: usize = self.index.values().map(|s| s.len()).sum();
96        let reverse: usize = self.file_trigrams.values().map(|s| s.len()).sum();
97        self.index.len() * 4 + postings * 4 + reverse * 4 + self.file_trigrams.len() * 4
98    }
99}
100
101/// Extract every distinct trigram from `text`. Bytes are lowercased on the
102/// ASCII range; non-ASCII bytes pass through unchanged. Sliding 3-byte
103/// window over the UTF-8 bytes — matches Swift's
104/// `TrigramIndex.extractTrigrams` byte-for-byte.
105pub fn extract_trigrams(text: &str) -> HashSet<Trigram> {
106    let bytes = text.as_bytes();
107    if bytes.len() < 3 {
108        return HashSet::new();
109    }
110    let mut out: HashSet<Trigram> = HashSet::with_capacity(bytes.len().max(8) / 4);
111    for window in bytes.windows(3) {
112        out.insert(pack(
113            normalize(window[0]),
114            normalize(window[1]),
115            normalize(window[2]),
116        ));
117    }
118    out
119}
120
121/// Decompose a query into its constituent trigrams (deduped). Mirrors the
122/// Swift `TrigramIndex.queryTrigrams` shape so callers that pre-compute on
123/// the script side stay in lock-step with the host.
124pub fn query_trigrams(query: &str) -> Vec<Trigram> {
125    extract_trigrams(query).into_iter().collect()
126}
127
128/// Pack three normalised bytes into a `Trigram` using the canonical
129/// `(a << 16) | (b << 8) | c` layout.
130#[inline(always)]
131pub fn pack(a: u8, b: u8, c: u8) -> Trigram {
132    (a as u32) << 16 | (b as u32) << 8 | c as u32
133}
134
135#[inline(always)]
136fn normalize(byte: u8) -> u8 {
137    if byte.is_ascii_uppercase() {
138        byte + 0x20
139    } else {
140        byte
141    }
142}
143
144#[cfg(test)]
145mod tests {
146    use super::*;
147
148    #[test]
149    fn extract_handles_short_strings() {
150        assert!(extract_trigrams("").is_empty());
151        assert!(extract_trigrams("ab").is_empty());
152        assert_eq!(extract_trigrams("abc").len(), 1);
153    }
154
155    #[test]
156    fn extract_is_case_insensitive() {
157        let lower = extract_trigrams("foo");
158        let upper = extract_trigrams("FOO");
159        assert_eq!(lower, upper);
160    }
161
162    #[test]
163    fn query_intersects_postings() {
164        let mut idx = TrigramIndex::new();
165        idx.index_file(1, "the quick brown fox");
166        idx.index_file(2, "jumped over the lazy dog");
167        idx.index_file(3, "lorem ipsum dolor sit amet");
168
169        let needle = query_trigrams("the");
170        let hits = idx.query(&needle);
171        assert!(hits.contains(&1));
172        assert!(hits.contains(&2));
173        assert!(!hits.contains(&3));
174    }
175
176    #[test]
177    fn missing_trigram_short_circuits_to_empty() {
178        let mut idx = TrigramIndex::new();
179        idx.index_file(1, "hello world");
180        let needle = query_trigrams("zzzzzz");
181        assert!(idx.query(&needle).is_empty());
182    }
183
184    #[test]
185    fn remove_file_drops_postings() {
186        let mut idx = TrigramIndex::new();
187        idx.index_file(1, "alpha beta");
188        idx.index_file(2, "alpha gamma");
189        idx.remove_file(1);
190        let needle = query_trigrams("alp");
191        let hits = idx.query(&needle);
192        assert!(!hits.contains(&1));
193        assert!(hits.contains(&2));
194    }
195
196    #[test]
197    fn reindex_replaces_postings() {
198        let mut idx = TrigramIndex::new();
199        idx.index_file(1, "alpha beta");
200        idx.index_file(1, "gamma delta");
201        let alpha = query_trigrams("alp");
202        let gamma = query_trigrams("gam");
203        assert!(idx.query(&alpha).is_empty());
204        assert!(idx.query(&gamma).contains(&1));
205    }
206}