// harn_hostlib/code_index/words.rs
use std::collections::{HashMap, HashSet};
10
11use super::file_table::FileId;
12
/// A single occurrence of a word: the file it appeared in and the
/// 1-based line number of the occurrence.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct WordHit {
    // File that contains the occurrence.
    pub file: FileId,
    // 1-based line number within that file.
    pub line: u32,
}
22
/// Inverted index from words to the (file, line) positions where they occur.
///
/// `file_words` is the reverse map — for each file, the set of words it
/// contributed — so a file's postings can be removed without scanning the
/// whole index.
#[derive(Debug, Default, Clone)]
pub struct WordIndex {
    // word -> every hit of that word, in insertion order.
    index: HashMap<String, Vec<WordHit>>,
    // file -> words that file contributed (used by `remove_file`).
    file_words: HashMap<FileId, HashSet<String>>,
}
29
30impl WordIndex {
31 pub fn new() -> Self {
33 Self::default()
34 }
35
36 pub fn index_file(&mut self, id: FileId, content: &str) {
39 self.remove_file(id);
40 let mut contributed: HashSet<String> = HashSet::new();
41 for (line_idx, line) in content.split('\n').enumerate() {
42 let line_no = (line_idx as u32) + 1;
43 tokenize(line, |word| {
44 if word.len() < 2 {
45 return;
46 }
47 self.index
48 .entry(word.to_string())
49 .or_default()
50 .push(WordHit {
51 file: id,
52 line: line_no,
53 });
54 contributed.insert(word.to_string());
55 });
56 }
57 if !contributed.is_empty() {
58 self.file_words.insert(id, contributed);
59 }
60 }
61
62 pub fn remove_file(&mut self, id: FileId) {
64 let Some(words) = self.file_words.remove(&id) else {
65 return;
66 };
67 for word in words {
68 if let Some(hits) = self.index.get_mut(&word) {
69 hits.retain(|h| h.file != id);
70 if hits.is_empty() {
71 self.index.remove(&word);
72 }
73 }
74 }
75 }
76
77 pub fn get(&self, word: &str) -> &[WordHit] {
79 self.index.get(word).map(Vec::as_slice).unwrap_or(&[])
80 }
81
82 pub fn distinct_words(&self) -> usize {
84 self.index.len()
85 }
86
87 pub fn snapshot_postings(&self) -> Vec<super::snapshot::WordPosting> {
90 let mut out: Vec<super::snapshot::WordPosting> = self
91 .index
92 .iter()
93 .map(|(word, hits)| super::snapshot::WordPosting {
94 word: word.clone(),
95 hits: hits.iter().map(|h| (h.file, h.line)).collect(),
96 })
97 .collect();
98 out.sort_by(|a, b| a.word.cmp(&b.word));
99 out
100 }
101
102 pub fn from_postings(postings: Vec<super::snapshot::WordPosting>) -> Self {
104 let mut idx = Self::new();
105 for p in postings {
106 let mut contributing_files: HashSet<FileId> = HashSet::new();
107 let entry = idx.index.entry(p.word.clone()).or_default();
108 for (file, line) in &p.hits {
109 entry.push(WordHit {
110 file: *file,
111 line: *line,
112 });
113 contributing_files.insert(*file);
114 }
115 for file in contributing_files {
116 idx.file_words
117 .entry(file)
118 .or_default()
119 .insert(p.word.clone());
120 }
121 }
122 idx
123 }
124
125 pub fn estimated_bytes(&self) -> usize {
128 let words = self.index.len();
129 let key_bytes: usize = self.index.keys().map(|k| k.len()).sum();
130 let hits: usize = self.index.values().map(Vec::len).sum();
131 words * 16 + key_bytes + hits * 8 + self.file_words.len() * 16
134 }
135}
136
/// Scans `line` and invokes `yield_token` for each identifier-like token:
/// a maximal run of ASCII letters, digits, and `_` that does not begin
/// with a digit. All other bytes (punctuation, whitespace, non-ASCII) act
/// as separators.
pub fn tokenize(line: &str, mut yield_token: impl FnMut(&str)) {
    let bytes = line.as_bytes();
    let mut cursor = 0;
    // Jump to the next byte that can begin an identifier, then measure
    // how far the identifier run extends.
    while let Some(offset) = bytes[cursor..].iter().position(|&b| is_ident_start(b)) {
        let begin = cursor + offset;
        let tail_len = bytes[begin + 1..]
            .iter()
            .take_while(|&&b| is_ident_cont(b))
            .count();
        let end = begin + 1 + tail_len;
        // The run consists solely of ASCII bytes, so this cannot fail.
        let token = std::str::from_utf8(&bytes[begin..end]).expect("ASCII run");
        yield_token(token);
        cursor = end;
    }
}

/// True if `b` may begin an identifier (ASCII letter or underscore).
#[inline(always)]
fn is_ident_start(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'_')
}

/// True if `b` may continue an identifier (ASCII letter, digit, or underscore).
#[inline(always)]
fn is_ident_cont(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_')
}
169
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn tokenize_skips_punctuation_and_numbers() {
        let mut seen: Vec<String> = Vec::new();
        tokenize("let foo_bar = baz(1, 2.0); // 42_things", |tok| {
            seen.push(tok.to_owned())
        });
        // Digits cannot start a token, so "42_things" contributes only
        // its "_things" tail.
        assert_eq!(seen, ["let", "foo_bar", "baz", "_things"]);
    }

    #[test]
    fn index_records_line_numbers() {
        let mut index = WordIndex::new();
        index.index_file(7, "alpha\n beta gamma\nalpha");

        let gamma = index.get("gamma");
        assert_eq!(gamma, &[WordHit { file: 7, line: 2 }]);

        let alpha = index.get("alpha");
        assert_eq!(alpha.len(), 2);
        assert_eq!(alpha[0], WordHit { file: 7, line: 1 });
        assert_eq!(alpha[1], WordHit { file: 7, line: 3 });
    }

    #[test]
    fn remove_and_reindex_replace_entries() {
        let mut index = WordIndex::new();
        index.index_file(1, "foo bar baz");
        index.remove_file(1);
        assert!(index.get("foo").is_empty());

        // Re-indexing the same file must not resurrect its old words.
        index.index_file(1, "qux");
        assert!(index.get("foo").is_empty());
        assert_eq!(index.get("qux"), &[WordHit { file: 1, line: 1 }]);
    }

    #[test]
    fn single_character_tokens_are_skipped() {
        let mut index = WordIndex::new();
        index.index_file(1, "a foo b bar c");
        assert!(index.get("a").is_empty());
        assert_eq!(index.get("foo").len(), 1);
    }
}