harn_hostlib/code_index/
trigram.rs1use std::collections::{HashMap, HashSet};
11
12use super::file_table::FileId;
13
14pub type Trigram = u32;
17
18#[derive(Debug, Default, Clone)]
21pub struct TrigramIndex {
22 index: HashMap<Trigram, HashSet<FileId>>,
23 file_trigrams: HashMap<FileId, HashSet<Trigram>>,
24}
25
26impl TrigramIndex {
27 pub fn new() -> Self {
29 Self::default()
30 }
31
32 pub fn index_file(&mut self, id: FileId, content: &str) {
34 self.remove_file(id);
35 let trigrams = extract_trigrams(content);
36 if trigrams.is_empty() {
37 return;
38 }
39 for tg in &trigrams {
40 self.index.entry(*tg).or_default().insert(id);
41 }
42 self.file_trigrams.insert(id, trigrams);
43 }
44
45 pub fn remove_file(&mut self, id: FileId) {
47 let Some(trigrams) = self.file_trigrams.remove(&id) else {
48 return;
49 };
50 for tg in trigrams {
51 if let Some(set) = self.index.get_mut(&tg) {
52 set.remove(&id);
53 if set.is_empty() {
54 self.index.remove(&tg);
55 }
56 }
57 }
58 }
59
60 pub fn query(&self, trigrams: &[Trigram]) -> HashSet<FileId> {
63 if trigrams.is_empty() {
64 return HashSet::new();
65 }
66 let mut postings: Vec<&HashSet<FileId>> = Vec::with_capacity(trigrams.len());
67 for tg in trigrams {
68 match self.index.get(tg) {
69 Some(set) => postings.push(set),
70 None => return HashSet::new(),
71 }
72 }
73 postings.sort_by_key(|s| s.len());
74 let (head, tail) = postings.split_first().expect("non-empty by construction");
75 let mut acc: HashSet<FileId> = (*head).clone();
76 for set in tail {
77 acc.retain(|id| set.contains(id));
78 if acc.is_empty() {
79 return acc;
80 }
81 }
82 acc
83 }
84
85 pub fn distinct_trigrams(&self) -> usize {
87 self.index.len()
88 }
89
90 pub fn snapshot_postings(&self) -> Vec<super::snapshot::TrigramPosting> {
94 let mut out: Vec<super::snapshot::TrigramPosting> = self
95 .index
96 .iter()
97 .map(|(tg, files)| {
98 let mut files: Vec<FileId> = files.iter().copied().collect();
99 files.sort_unstable();
100 super::snapshot::TrigramPosting {
101 trigram: *tg,
102 files,
103 }
104 })
105 .collect();
106 out.sort_unstable_by_key(|p| p.trigram);
107 out
108 }
109
110 pub fn from_postings(postings: Vec<super::snapshot::TrigramPosting>) -> Self {
112 let mut idx = Self::new();
113 for p in postings {
114 let entry = idx.index.entry(p.trigram).or_default();
115 for f in &p.files {
116 entry.insert(*f);
117 idx.file_trigrams.entry(*f).or_default().insert(p.trigram);
118 }
119 }
120 idx
121 }
122
123 pub fn estimated_bytes(&self) -> usize {
126 let postings: usize = self.index.values().map(|s| s.len()).sum();
129 let reverse: usize = self.file_trigrams.values().map(|s| s.len()).sum();
130 self.index.len() * 4 + postings * 4 + reverse * 4 + self.file_trigrams.len() * 4
131 }
132}
133
134pub fn extract_trigrams(text: &str) -> HashSet<Trigram> {
139 let bytes = text.as_bytes();
140 if bytes.len() < 3 {
141 return HashSet::new();
142 }
143 let mut out: HashSet<Trigram> = HashSet::with_capacity(bytes.len().max(8) / 4);
144 for window in bytes.windows(3) {
145 out.insert(pack(
146 normalize(window[0]),
147 normalize(window[1]),
148 normalize(window[2]),
149 ));
150 }
151 out
152}
153
154pub fn query_trigrams(query: &str) -> Vec<Trigram> {
158 extract_trigrams(query).into_iter().collect()
159}
160
161#[inline(always)]
164pub fn pack(a: u8, b: u8, c: u8) -> Trigram {
165 (a as u32) << 16 | (b as u32) << 8 | c as u32
166}
167
168#[inline(always)]
169fn normalize(byte: u8) -> u8 {
170 if byte.is_ascii_uppercase() {
171 byte + 0x20
172 } else {
173 byte
174 }
175}
176
177#[cfg(test)]
178mod tests {
179 use super::*;
180
181 #[test]
182 fn extract_handles_short_strings() {
183 assert!(extract_trigrams("").is_empty());
184 assert!(extract_trigrams("ab").is_empty());
185 assert_eq!(extract_trigrams("abc").len(), 1);
186 }
187
188 #[test]
189 fn extract_is_case_insensitive() {
190 let lower = extract_trigrams("foo");
191 let upper = extract_trigrams("FOO");
192 assert_eq!(lower, upper);
193 }
194
195 #[test]
196 fn query_intersects_postings() {
197 let mut idx = TrigramIndex::new();
198 idx.index_file(1, "the quick brown fox");
199 idx.index_file(2, "jumped over the lazy dog");
200 idx.index_file(3, "lorem ipsum dolor sit amet");
201
202 let needle = query_trigrams("the");
203 let hits = idx.query(&needle);
204 assert!(hits.contains(&1));
205 assert!(hits.contains(&2));
206 assert!(!hits.contains(&3));
207 }
208
209 #[test]
210 fn missing_trigram_short_circuits_to_empty() {
211 let mut idx = TrigramIndex::new();
212 idx.index_file(1, "hello world");
213 let needle = query_trigrams("zzzzzz");
214 assert!(idx.query(&needle).is_empty());
215 }
216
217 #[test]
218 fn remove_file_drops_postings() {
219 let mut idx = TrigramIndex::new();
220 idx.index_file(1, "alpha beta");
221 idx.index_file(2, "alpha gamma");
222 idx.remove_file(1);
223 let needle = query_trigrams("alp");
224 let hits = idx.query(&needle);
225 assert!(!hits.contains(&1));
226 assert!(hits.contains(&2));
227 }
228
229 #[test]
230 fn reindex_replaces_postings() {
231 let mut idx = TrigramIndex::new();
232 idx.index_file(1, "alpha beta");
233 idx.index_file(1, "gamma delta");
234 let alpha = query_trigrams("alp");
235 let gamma = query_trigrams("gam");
236 assert!(idx.query(&alpha).is_empty());
237 assert!(idx.query(&gamma).contains(&1));
238 }
239}