harn_hostlib/code_index/
trigram.rs1use std::collections::{HashMap, HashSet};
11
12use super::file_table::FileId;
13
14pub type Trigram = u32;
17
18#[derive(Debug, Default, Clone)]
21pub struct TrigramIndex {
22 index: HashMap<Trigram, HashSet<FileId>>,
23 file_trigrams: HashMap<FileId, HashSet<Trigram>>,
24}
25
26impl TrigramIndex {
27 pub fn new() -> Self {
29 Self::default()
30 }
31
32 pub fn index_file(&mut self, id: FileId, content: &str) {
34 self.remove_file(id);
35 let trigrams = extract_trigrams(content);
36 if trigrams.is_empty() {
37 return;
38 }
39 for tg in &trigrams {
40 self.index.entry(*tg).or_default().insert(id);
41 }
42 self.file_trigrams.insert(id, trigrams);
43 }
44
45 pub fn remove_file(&mut self, id: FileId) {
47 let Some(trigrams) = self.file_trigrams.remove(&id) else {
48 return;
49 };
50 for tg in trigrams {
51 if let Some(set) = self.index.get_mut(&tg) {
52 set.remove(&id);
53 if set.is_empty() {
54 self.index.remove(&tg);
55 }
56 }
57 }
58 }
59
60 pub fn query(&self, trigrams: &[Trigram]) -> HashSet<FileId> {
63 if trigrams.is_empty() {
64 return HashSet::new();
65 }
66 let mut postings: Vec<&HashSet<FileId>> = Vec::with_capacity(trigrams.len());
67 for tg in trigrams {
68 match self.index.get(tg) {
69 Some(set) => postings.push(set),
70 None => return HashSet::new(),
71 }
72 }
73 postings.sort_by_key(|s| s.len());
74 let (head, tail) = postings.split_first().expect("non-empty by construction");
75 let mut acc: HashSet<FileId> = (*head).clone();
76 for set in tail {
77 acc.retain(|id| set.contains(id));
78 if acc.is_empty() {
79 return acc;
80 }
81 }
82 acc
83 }
84
85 pub fn distinct_trigrams(&self) -> usize {
87 self.index.len()
88 }
89
90 pub fn snapshot_postings(&self) -> Vec<super::snapshot::TrigramPosting> {
94 let mut out: Vec<super::snapshot::TrigramPosting> = self
95 .index
96 .iter()
97 .map(|(tg, files)| {
98 let mut files: Vec<FileId> = files.iter().copied().collect();
99 files.sort_unstable();
100 super::snapshot::TrigramPosting {
101 trigram: *tg,
102 files,
103 }
104 })
105 .collect();
106 out.sort_unstable_by_key(|p| p.trigram);
107 out
108 }
109
110 pub fn from_postings(postings: Vec<super::snapshot::TrigramPosting>) -> Self {
112 let mut idx = Self::new();
113 for p in postings {
114 let entry = idx.index.entry(p.trigram).or_default();
115 for f in &p.files {
116 entry.insert(*f);
117 idx.file_trigrams.entry(*f).or_default().insert(p.trigram);
118 }
119 }
120 idx
121 }
122
123 pub fn estimated_bytes(&self) -> usize {
126 let postings: usize = self.index.values().map(|s| s.len()).sum();
129 let reverse: usize = self.file_trigrams.values().map(|s| s.len()).sum();
130 self.index.len() * 4 + postings * 4 + reverse * 4 + self.file_trigrams.len() * 4
131 }
132}
133
134pub fn extract_trigrams(text: &str) -> HashSet<Trigram> {
138 let bytes = text.as_bytes();
139 if bytes.len() < 3 {
140 return HashSet::new();
141 }
142 let mut out: HashSet<Trigram> = HashSet::with_capacity(bytes.len().max(8) / 4);
143 for window in bytes.windows(3) {
144 out.insert(pack(
145 normalize(window[0]),
146 normalize(window[1]),
147 normalize(window[2]),
148 ));
149 }
150 out
151}
152
153pub fn query_trigrams(query: &str) -> Vec<Trigram> {
156 extract_trigrams(query).into_iter().collect()
157}
158
159#[inline(always)]
162pub fn pack(a: u8, b: u8, c: u8) -> Trigram {
163 (a as u32) << 16 | (b as u32) << 8 | c as u32
164}
165
166#[inline(always)]
167fn normalize(byte: u8) -> u8 {
168 if byte.is_ascii_uppercase() {
169 byte + 0x20
170 } else {
171 byte
172 }
173}
174
175#[cfg(test)]
176mod tests {
177 use super::*;
178
179 #[test]
180 fn extract_handles_short_strings() {
181 assert!(extract_trigrams("").is_empty());
182 assert!(extract_trigrams("ab").is_empty());
183 assert_eq!(extract_trigrams("abc").len(), 1);
184 }
185
186 #[test]
187 fn extract_is_case_insensitive() {
188 let lower = extract_trigrams("foo");
189 let upper = extract_trigrams("FOO");
190 assert_eq!(lower, upper);
191 }
192
193 #[test]
194 fn query_intersects_postings() {
195 let mut idx = TrigramIndex::new();
196 idx.index_file(1, "the quick brown fox");
197 idx.index_file(2, "jumped over the lazy dog");
198 idx.index_file(3, "lorem ipsum dolor sit amet");
199
200 let needle = query_trigrams("the");
201 let hits = idx.query(&needle);
202 assert!(hits.contains(&1));
203 assert!(hits.contains(&2));
204 assert!(!hits.contains(&3));
205 }
206
207 #[test]
208 fn missing_trigram_short_circuits_to_empty() {
209 let mut idx = TrigramIndex::new();
210 idx.index_file(1, "hello world");
211 let needle = query_trigrams("zzzzzz");
212 assert!(idx.query(&needle).is_empty());
213 }
214
215 #[test]
216 fn remove_file_drops_postings() {
217 let mut idx = TrigramIndex::new();
218 idx.index_file(1, "alpha beta");
219 idx.index_file(2, "alpha gamma");
220 idx.remove_file(1);
221 let needle = query_trigrams("alp");
222 let hits = idx.query(&needle);
223 assert!(!hits.contains(&1));
224 assert!(hits.contains(&2));
225 }
226
227 #[test]
228 fn reindex_replaces_postings() {
229 let mut idx = TrigramIndex::new();
230 idx.index_file(1, "alpha beta");
231 idx.index_file(1, "gamma delta");
232 let alpha = query_trigrams("alp");
233 let gamma = query_trigrams("gam");
234 assert!(idx.query(&alpha).is_empty());
235 assert!(idx.query(&gamma).contains(&1));
236 }
237}