// harn_hostlib/code_index/words.rs
use std::collections::{HashMap, HashSet};

use super::file_table::FileId;
/// A single occurrence of an indexed word: which file it appeared in and
/// on which line.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct WordHit {
    /// File the word occurred in.
    pub file: FileId,
    /// 1-based line number of the occurrence.
    pub line: u32,
}
22
/// Inverted index from identifier-like words to the file/line positions
/// where they occur.
#[derive(Debug, Default, Clone)]
pub struct WordIndex {
    // word -> every hit recorded for it, in indexing order.
    index: HashMap<String, Vec<WordHit>>,
    // file -> the set of words that file contributed; lets `remove_file`
    // touch only the affected keys instead of scanning the whole index.
    file_words: HashMap<FileId, HashSet<String>>,
}
29
30impl WordIndex {
31 pub fn new() -> Self {
33 Self::default()
34 }
35
36 pub fn index_file(&mut self, id: FileId, content: &str) {
39 self.remove_file(id);
40 let mut contributed: HashSet<String> = HashSet::new();
41 for (line_idx, line) in content.split('\n').enumerate() {
42 let line_no = (line_idx as u32) + 1;
43 tokenize(line, |word| {
44 if word.len() < 2 {
45 return;
46 }
47 self.index
48 .entry(word.to_string())
49 .or_default()
50 .push(WordHit {
51 file: id,
52 line: line_no,
53 });
54 contributed.insert(word.to_string());
55 });
56 }
57 if !contributed.is_empty() {
58 self.file_words.insert(id, contributed);
59 }
60 }
61
62 pub fn remove_file(&mut self, id: FileId) {
64 let Some(words) = self.file_words.remove(&id) else {
65 return;
66 };
67 for word in words {
68 if let Some(hits) = self.index.get_mut(&word) {
69 hits.retain(|h| h.file != id);
70 if hits.is_empty() {
71 self.index.remove(&word);
72 }
73 }
74 }
75 }
76
77 pub fn get(&self, word: &str) -> &[WordHit] {
79 self.index.get(word).map(Vec::as_slice).unwrap_or(&[])
80 }
81
82 pub fn distinct_words(&self) -> usize {
84 self.index.len()
85 }
86
87 pub fn estimated_bytes(&self) -> usize {
90 let words = self.index.len();
91 let key_bytes: usize = self.index.keys().map(|k| k.len()).sum();
92 let hits: usize = self.index.values().map(Vec::len).sum();
93 words * 16 + key_bytes + hits * 8 + self.file_words.len() * 16
96 }
97}
98
99pub fn tokenize(line: &str, mut yield_token: impl FnMut(&str)) {
103 let bytes = line.as_bytes();
104 let mut i = 0;
105 while i < bytes.len() {
106 if !is_ident_start(bytes[i]) {
107 i += 1;
108 continue;
109 }
110 let start = i;
111 i += 1;
112 while i < bytes.len() && is_ident_cont(bytes[i]) {
113 i += 1;
114 }
115 let token = std::str::from_utf8(&bytes[start..i]).expect("ASCII run");
118 yield_token(token);
119 }
120}
121
/// True if `b` may begin an identifier: an ASCII letter or underscore.
#[inline(always)]
fn is_ident_start(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'_')
}
126
/// True if `b` may continue an identifier: an ASCII letter, digit,
/// or underscore.
#[inline(always)]
fn is_ident_cont(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_')
}
131
#[cfg(test)]
mod tests {
    use super::*;

    // Runs `tokenize` over `input` and collects the tokens as owned strings.
    fn collect_tokens(input: &str) -> Vec<String> {
        let mut out = Vec::new();
        tokenize(input, |t| out.push(t.to_string()));
        out
    }

    #[test]
    fn tokenize_skips_punctuation_and_numbers() {
        assert_eq!(
            collect_tokens("let foo_bar = baz(1, 2.0); // 42_things"),
            vec!["let", "foo_bar", "baz", "_things"]
        );
    }

    #[test]
    fn index_records_line_numbers() {
        let mut index = WordIndex::new();
        index.index_file(7, "alpha\n beta gamma\nalpha");
        assert_eq!(
            index.get("alpha"),
            &[WordHit { file: 7, line: 1 }, WordHit { file: 7, line: 3 }]
        );
        assert_eq!(index.get("gamma"), &[WordHit { file: 7, line: 2 }]);
    }

    #[test]
    fn remove_and_reindex_replace_entries() {
        let mut index = WordIndex::new();
        index.index_file(1, "foo bar baz");
        index.remove_file(1);
        assert!(index.get("foo").is_empty());
        index.index_file(1, "qux");
        assert!(index.get("foo").is_empty());
        assert_eq!(index.get("qux"), &[WordHit { file: 1, line: 1 }]);
    }

    #[test]
    fn single_character_tokens_are_skipped() {
        let mut index = WordIndex::new();
        index.index_file(1, "a foo b bar c");
        assert!(index.get("a").is_empty());
        assert_eq!(index.get("foo").len(), 1);
    }
}