// harn_hostlib/code_index/words.rs

//! Word index over identifier-like tokens.
//!
//! Inverted index `identifier -> Vec<(file_id, line)>`. Complements the
//! trigram index: trigrams answer "which files contain this substring?",
//! the word index answers "which lines mention this exact identifier?".
//! Mirrors the Swift `WordIndex`: tokens are runs of `[A-Za-z_][A-Za-z0-9_]*`
//! (single-character tokens are skipped), one entry per occurrence.

use std::collections::{HashMap, HashSet};

use super::file_table::FileId;

/// Single occurrence of an identifier-shaped token: which file it landed
/// in and on which 1-based line number.
///
/// 8 bytes total (`Copy`), so posting lists stay compact — see
/// `WordIndex::estimated_bytes`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct WordHit {
    /// File the token was tokenized from.
    pub file: FileId,
    /// 1-based line number of the occurrence.
    pub line: u32,
}

/// Inverted word index keyed on identifier-shaped tokens.
#[derive(Debug, Default, Clone)]
pub struct WordIndex {
    // Posting table: word -> every (file, line) occurrence, in the order
    // the occurrences were indexed.
    index: HashMap<String, Vec<WordHit>>,
    // Reverse map: file -> the words it contributed. Lets `remove_file`
    // touch only the posting lists the file actually appears in.
    file_words: HashMap<FileId, HashSet<String>>,
}

30impl WordIndex {
31    /// Construct an empty index.
32    pub fn new() -> Self {
33        Self::default()
34    }
35
36    /// Tokenize `content` and record every hit under `id`. If `id` already
37    /// has entries, they are dropped first.
38    pub fn index_file(&mut self, id: FileId, content: &str) {
39        self.remove_file(id);
40        let mut contributed: HashSet<String> = HashSet::new();
41        for (line_idx, line) in content.split('\n').enumerate() {
42            let line_no = (line_idx as u32) + 1;
43            tokenize(line, |word| {
44                if word.len() < 2 {
45                    return;
46                }
47                self.index
48                    .entry(word.to_string())
49                    .or_default()
50                    .push(WordHit {
51                        file: id,
52                        line: line_no,
53                    });
54                contributed.insert(word.to_string());
55            });
56        }
57        if !contributed.is_empty() {
58            self.file_words.insert(id, contributed);
59        }
60    }
61
62    /// Drop every hit contributed by `id`.
63    pub fn remove_file(&mut self, id: FileId) {
64        let Some(words) = self.file_words.remove(&id) else {
65            return;
66        };
67        for word in words {
68            if let Some(hits) = self.index.get_mut(&word) {
69                hits.retain(|h| h.file != id);
70                if hits.is_empty() {
71                    self.index.remove(&word);
72                }
73            }
74        }
75    }
76
77    /// O(1) lookup for every hit recorded under `word`.
78    pub fn get(&self, word: &str) -> &[WordHit] {
79        self.index.get(word).map(Vec::as_slice).unwrap_or(&[])
80    }
81
82    /// Number of distinct tokens in the posting table.
83    pub fn distinct_words(&self) -> usize {
84        self.index.len()
85    }
86
87    /// Order-of-magnitude resident-bytes estimate. Reported by
88    /// `code_index.stats.memory_bytes`.
89    pub fn estimated_bytes(&self) -> usize {
90        let words = self.index.len();
91        let key_bytes: usize = self.index.keys().map(|k| k.len()).sum();
92        let hits: usize = self.index.values().map(Vec::len).sum();
93        // Each WordHit packs into 8 bytes (FileId + u32). Add overhead for
94        // the per-word vec plus the reverse map.
95        words * 16 + key_bytes + hits * 8 + self.file_words.len() * 16
96    }
97}
98
99/// Split a single line into identifier tokens. A token matches
100/// `[A-Za-z_][A-Za-z0-9_]*`. Numeric-only runs are skipped. `yield_token`
101/// is invoked once per occurrence — dedupe is the caller's responsibility.
102pub fn tokenize(line: &str, mut yield_token: impl FnMut(&str)) {
103    let bytes = line.as_bytes();
104    let mut i = 0;
105    while i < bytes.len() {
106        if !is_ident_start(bytes[i]) {
107            i += 1;
108            continue;
109        }
110        let start = i;
111        i += 1;
112        while i < bytes.len() && is_ident_cont(bytes[i]) {
113            i += 1;
114        }
115        // SAFETY: ASCII slice boundaries — `is_ident_start`/`is_ident_cont`
116        // only ever match ASCII bytes, so `start..i` is on a UTF-8 boundary.
117        let token = std::str::from_utf8(&bytes[start..i]).expect("ASCII run");
118        yield_token(token);
119    }
120}
121
/// First byte of an identifier: an ASCII letter or `_`.
#[inline(always)]
fn is_ident_start(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'_')
}

/// Continuation byte of an identifier: an ASCII letter, digit, or `_`.
#[inline(always)]
fn is_ident_cont(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_')
}

#[cfg(test)]
mod tests {
    use super::*;

    // Punctuation separates tokens and a digit cannot start one:
    // "42_things" yields only "_things" (the '_' restarts a token).
    // Note: `tokenize` itself does NOT drop single-character tokens;
    // that filtering lives in `WordIndex::index_file`.
    #[test]
    fn tokenize_skips_punctuation_and_numbers() {
        let mut tokens: Vec<String> = Vec::new();
        tokenize("let foo_bar = baz(1, 2.0); // 42_things", |t| {
            tokens.push(t.to_string())
        });
        assert_eq!(tokens, vec!["let", "foo_bar", "baz", "_things"]);
    }

    // Hits carry 1-based line numbers and one entry per occurrence,
    // in indexing order.
    #[test]
    fn index_records_line_numbers() {
        let mut idx = WordIndex::new();
        idx.index_file(7, "alpha\n  beta gamma\nalpha");
        let alpha_hits = idx.get("alpha");
        assert_eq!(alpha_hits.len(), 2);
        assert_eq!(alpha_hits[0], WordHit { file: 7, line: 1 });
        assert_eq!(alpha_hits[1], WordHit { file: 7, line: 3 });
        let gamma_hits = idx.get("gamma");
        assert_eq!(gamma_hits, &[WordHit { file: 7, line: 2 }]);
    }

    // Re-indexing a file replaces its old entries entirely (index_file
    // calls remove_file first), so stale words disappear.
    #[test]
    fn remove_and_reindex_replace_entries() {
        let mut idx = WordIndex::new();
        idx.index_file(1, "foo bar baz");
        idx.remove_file(1);
        assert!(idx.get("foo").is_empty());
        idx.index_file(1, "qux");
        assert!(idx.get("foo").is_empty());
        assert_eq!(idx.get("qux"), &[WordHit { file: 1, line: 1 }]);
    }

    // The `word.len() < 2` guard in index_file drops one-character tokens.
    #[test]
    fn single_character_tokens_are_skipped() {
        let mut idx = WordIndex::new();
        idx.index_file(1, "a foo b bar c");
        assert!(idx.get("a").is_empty());
        assert_eq!(idx.get("foo").len(), 1);
    }
}
175}