harn_hostlib/code_index/
trigram.rs1use std::collections::{HashMap, HashSet};
11
12use super::file_table::FileId;
13
14pub type Trigram = u32;
17
18#[derive(Debug, Default, Clone)]
21pub struct TrigramIndex {
22 index: HashMap<Trigram, HashSet<FileId>>,
23 file_trigrams: HashMap<FileId, HashSet<Trigram>>,
24}
25
26impl TrigramIndex {
27 pub fn new() -> Self {
29 Self::default()
30 }
31
32 pub fn index_file(&mut self, id: FileId, content: &str) {
34 self.remove_file(id);
35 let trigrams = extract_trigrams(content);
36 if trigrams.is_empty() {
37 return;
38 }
39 for tg in &trigrams {
40 self.index.entry(*tg).or_default().insert(id);
41 }
42 self.file_trigrams.insert(id, trigrams);
43 }
44
45 pub fn remove_file(&mut self, id: FileId) {
47 let Some(trigrams) = self.file_trigrams.remove(&id) else {
48 return;
49 };
50 for tg in trigrams {
51 if let Some(set) = self.index.get_mut(&tg) {
52 set.remove(&id);
53 if set.is_empty() {
54 self.index.remove(&tg);
55 }
56 }
57 }
58 }
59
60 pub fn query(&self, trigrams: &[Trigram]) -> HashSet<FileId> {
63 if trigrams.is_empty() {
64 return HashSet::new();
65 }
66 let mut postings: Vec<&HashSet<FileId>> = Vec::with_capacity(trigrams.len());
67 for tg in trigrams {
68 match self.index.get(tg) {
69 Some(set) => postings.push(set),
70 None => return HashSet::new(),
71 }
72 }
73 postings.sort_by_key(|s| s.len());
74 let (head, tail) = postings.split_first().expect("non-empty by construction");
75 let mut acc: HashSet<FileId> = (*head).clone();
76 for set in tail {
77 acc.retain(|id| set.contains(id));
78 if acc.is_empty() {
79 return acc;
80 }
81 }
82 acc
83 }
84
85 pub fn distinct_trigrams(&self) -> usize {
87 self.index.len()
88 }
89
90 pub fn estimated_bytes(&self) -> usize {
93 let postings: usize = self.index.values().map(|s| s.len()).sum();
96 let reverse: usize = self.file_trigrams.values().map(|s| s.len()).sum();
97 self.index.len() * 4 + postings * 4 + reverse * 4 + self.file_trigrams.len() * 4
98 }
99}
100
101pub fn extract_trigrams(text: &str) -> HashSet<Trigram> {
106 let bytes = text.as_bytes();
107 if bytes.len() < 3 {
108 return HashSet::new();
109 }
110 let mut out: HashSet<Trigram> = HashSet::with_capacity(bytes.len().max(8) / 4);
111 for window in bytes.windows(3) {
112 out.insert(pack(
113 normalize(window[0]),
114 normalize(window[1]),
115 normalize(window[2]),
116 ));
117 }
118 out
119}
120
121pub fn query_trigrams(query: &str) -> Vec<Trigram> {
125 extract_trigrams(query).into_iter().collect()
126}
127
128#[inline(always)]
131pub fn pack(a: u8, b: u8, c: u8) -> Trigram {
132 (a as u32) << 16 | (b as u32) << 8 | c as u32
133}
134
135#[inline(always)]
136fn normalize(byte: u8) -> u8 {
137 if byte.is_ascii_uppercase() {
138 byte + 0x20
139 } else {
140 byte
141 }
142}
143
144#[cfg(test)]
145mod tests {
146 use super::*;
147
148 #[test]
149 fn extract_handles_short_strings() {
150 assert!(extract_trigrams("").is_empty());
151 assert!(extract_trigrams("ab").is_empty());
152 assert_eq!(extract_trigrams("abc").len(), 1);
153 }
154
155 #[test]
156 fn extract_is_case_insensitive() {
157 let lower = extract_trigrams("foo");
158 let upper = extract_trigrams("FOO");
159 assert_eq!(lower, upper);
160 }
161
162 #[test]
163 fn query_intersects_postings() {
164 let mut idx = TrigramIndex::new();
165 idx.index_file(1, "the quick brown fox");
166 idx.index_file(2, "jumped over the lazy dog");
167 idx.index_file(3, "lorem ipsum dolor sit amet");
168
169 let needle = query_trigrams("the");
170 let hits = idx.query(&needle);
171 assert!(hits.contains(&1));
172 assert!(hits.contains(&2));
173 assert!(!hits.contains(&3));
174 }
175
176 #[test]
177 fn missing_trigram_short_circuits_to_empty() {
178 let mut idx = TrigramIndex::new();
179 idx.index_file(1, "hello world");
180 let needle = query_trigrams("zzzzzz");
181 assert!(idx.query(&needle).is_empty());
182 }
183
184 #[test]
185 fn remove_file_drops_postings() {
186 let mut idx = TrigramIndex::new();
187 idx.index_file(1, "alpha beta");
188 idx.index_file(2, "alpha gamma");
189 idx.remove_file(1);
190 let needle = query_trigrams("alp");
191 let hits = idx.query(&needle);
192 assert!(!hits.contains(&1));
193 assert!(hits.contains(&2));
194 }
195
196 #[test]
197 fn reindex_replaces_postings() {
198 let mut idx = TrigramIndex::new();
199 idx.index_file(1, "alpha beta");
200 idx.index_file(1, "gamma delta");
201 let alpha = query_trigrams("alp");
202 let gamma = query_trigrams("gam");
203 assert!(idx.query(&alpha).is_empty());
204 assert!(idx.query(&gamma).contains(&1));
205 }
206}