// harn_hostlib/code_index/words.rs
use std::collections::{HashMap, HashSet};
10
11use super::file_table::FileId;
12
/// A single occurrence of a word: the file it appeared in and the
/// 1-based line number of the occurrence.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct WordHit {
    // File that contains the occurrence.
    pub file: FileId,
    // 1-based line number within that file.
    pub line: u32,
}
22
/// Inverted index from words to the (file, line) positions where they occur.
///
/// `file_words` is the reverse map — for each file, the set of words it
/// contributed — so a file's postings can be removed without scanning the
/// whole index.
#[derive(Debug, Default, Clone)]
pub struct WordIndex {
    // word -> every hit of that word, in insertion order.
    index: HashMap<String, Vec<WordHit>>,
    // file -> words that file contributed (used by `remove_file`).
    file_words: HashMap<FileId, HashSet<String>>,
}
29
30impl WordIndex {
31 pub fn new() -> Self {
33 Self::default()
34 }
35
36 pub fn index_file(&mut self, id: FileId, content: &str) {
39 self.remove_file(id);
40 let mut contributed: HashSet<String> = HashSet::new();
41 for (line_idx, line) in content.split('\n').enumerate() {
42 let line_no = (line_idx as u32) + 1;
43 tokenize(line, |word| {
44 if word.len() < 2 {
45 return;
46 }
47 self.index
48 .entry(word.to_string())
49 .or_default()
50 .push(WordHit {
51 file: id,
52 line: line_no,
53 });
54 contributed.insert(word.to_string());
55 });
56 }
57 if !contributed.is_empty() {
58 self.file_words.insert(id, contributed);
59 }
60 }
61
62 pub fn remove_file(&mut self, id: FileId) {
64 let Some(words) = self.file_words.remove(&id) else {
65 return;
66 };
67 for word in words {
68 if let Some(hits) = self.index.get_mut(&word) {
69 hits.retain(|h| h.file != id);
70 if hits.is_empty() {
71 self.index.remove(&word);
72 }
73 }
74 }
75 }
76
77 pub fn get(&self, word: &str) -> &[WordHit] {
79 self.index.get(word).map(Vec::as_slice).unwrap_or(&[])
80 }
81
82 pub fn distinct_words(&self) -> usize {
84 self.index.len()
85 }
86
87 pub fn snapshot_postings(&self) -> Vec<super::snapshot::WordPosting> {
90 let mut out: Vec<super::snapshot::WordPosting> = self
91 .index
92 .iter()
93 .map(|(word, hits)| super::snapshot::WordPosting {
94 word: word.clone(),
95 hits: hits.iter().map(|h| (h.file, h.line)).collect(),
96 })
97 .collect();
98 out.sort_by(|a, b| a.word.cmp(&b.word));
99 out
100 }
101
102 pub fn from_postings(postings: Vec<super::snapshot::WordPosting>) -> Self {
104 let mut idx = Self::new();
105 for p in postings {
106 let mut contributing_files: HashSet<FileId> = HashSet::new();
107 let entry = idx.index.entry(p.word.clone()).or_default();
108 for (file, line) in &p.hits {
109 entry.push(WordHit {
110 file: *file,
111 line: *line,
112 });
113 contributing_files.insert(*file);
114 }
115 for file in contributing_files {
116 idx.file_words
117 .entry(file)
118 .or_default()
119 .insert(p.word.clone());
120 }
121 }
122 idx
123 }
124
125 pub fn estimated_bytes(&self) -> usize {
128 let words = self.index.len();
129 let key_bytes: usize = self.index.keys().map(|k| k.len()).sum();
130 let hits: usize = self.index.values().map(Vec::len).sum();
131 words * 16 + key_bytes + hits * 8 + self.file_words.len() * 16
134 }
135}
136
/// Scans `line` and invokes `yield_token` for each identifier-like token:
/// a maximal run of ASCII letters, digits, and `_` that does not begin
/// with a digit. All other bytes (punctuation, whitespace, non-ASCII) act
/// as separators.
pub fn tokenize(line: &str, mut yield_token: impl FnMut(&str)) {
    let bytes = line.as_bytes();
    let mut cursor = 0;
    // Jump to the next byte that can begin an identifier, then measure
    // how far the identifier run extends.
    while let Some(offset) = bytes[cursor..].iter().position(|&b| is_ident_start(b)) {
        let begin = cursor + offset;
        let tail_len = bytes[begin + 1..]
            .iter()
            .take_while(|&&b| is_ident_cont(b))
            .count();
        let end = begin + 1 + tail_len;
        // The run consists solely of ASCII bytes, so this cannot fail.
        let token = std::str::from_utf8(&bytes[begin..end]).expect("ASCII run");
        yield_token(token);
        cursor = end;
    }
}

/// True if `b` may begin an identifier (ASCII letter or underscore).
#[inline(always)]
fn is_ident_start(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'_')
}

/// True if `b` may continue an identifier (ASCII letter, digit, or underscore).
#[inline(always)]
fn is_ident_cont(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_')
}
169
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn tokenize_skips_punctuation_and_numbers() {
        let mut seen: Vec<String> = Vec::new();
        tokenize("let foo_bar = baz(1, 2.0); // 42_things", |tok| {
            seen.push(tok.to_owned())
        });
        // Digits cannot start a token, so "42_things" contributes only
        // its "_things" tail.
        assert_eq!(seen, ["let", "foo_bar", "baz", "_things"]);
    }

    #[test]
    fn index_records_line_numbers() {
        let mut index = WordIndex::new();
        index.index_file(7, "alpha\n beta gamma\nalpha");

        let gamma = index.get("gamma");
        assert_eq!(gamma, &[WordHit { file: 7, line: 2 }]);

        let alpha = index.get("alpha");
        assert_eq!(alpha.len(), 2);
        assert_eq!(alpha[0], WordHit { file: 7, line: 1 });
        assert_eq!(alpha[1], WordHit { file: 7, line: 3 });
    }

    #[test]
    fn remove_and_reindex_replace_entries() {
        let mut index = WordIndex::new();
        index.index_file(1, "foo bar baz");
        index.remove_file(1);
        assert!(index.get("foo").is_empty());

        // Re-indexing the same file must not resurrect its old words.
        index.index_file(1, "qux");
        assert!(index.get("foo").is_empty());
        assert_eq!(index.get("qux"), &[WordHit { file: 1, line: 1 }]);
    }

    #[test]
    fn single_character_tokens_are_skipped() {
        let mut index = WordIndex::new();
        index.index_file(1, "a foo b bar c");
        assert!(index.get("a").is_empty());
        assert_eq!(index.get("foo").len(), 1);
    }
}