use std::collections::{HashMap, HashSet};
use super::file_table::FileId;
/// A single occurrence of a word: the file it appeared in and the
/// 1-based line number within that file (see `WordIndex::index_file`).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct WordHit {
    /// File that contributed this occurrence.
    pub file: FileId,
    /// 1-based line number of the occurrence.
    pub line: u32,
}
/// Inverted index from words to the places they occur, with a reverse
/// map so a file's contributions can be removed without a full scan.
#[derive(Debug, Default, Clone)]
pub struct WordIndex {
    /// word -> every recorded hit for it, in insertion order.
    index: HashMap<String, Vec<WordHit>>,
    /// file -> set of words that file contributed; consulted by `remove_file`.
    file_words: HashMap<FileId, HashSet<String>>,
}
impl WordIndex {
    /// Creates an empty index.
    pub fn new() -> Self {
        Self::default()
    }

    /// (Re-)indexes `content` for `id`, replacing any entries a previous
    /// version of the file contributed.
    ///
    /// Lines are numbered from 1. Tokens shorter than two bytes are
    /// skipped. Every occurrence is recorded, so a word repeated on
    /// several lines yields several `WordHit`s.
    pub fn index_file(&mut self, id: FileId, content: &str) {
        // Drop stale postings from any earlier version of this file first.
        self.remove_file(id);
        let mut contributed: HashSet<String> = HashSet::new();
        for (line_idx, line) in content.split('\n').enumerate() {
            let line_no = (line_idx as u32) + 1;
            tokenize(line, |word| {
                if word.len() < 2 {
                    return;
                }
                self.index
                    .entry(word.to_string())
                    .or_default()
                    .push(WordHit {
                        file: id,
                        line: line_no,
                    });
                // Allocate the reverse-map key only once per distinct word
                // (previously a second String was allocated on every hit).
                if !contributed.contains(word) {
                    contributed.insert(word.to_string());
                }
            });
        }
        if !contributed.is_empty() {
            self.file_words.insert(id, contributed);
        }
    }

    /// Removes every posting contributed by `id`. No-op for unknown ids.
    pub fn remove_file(&mut self, id: FileId) {
        let Some(words) = self.file_words.remove(&id) else {
            return;
        };
        for word in words {
            if let Some(hits) = self.index.get_mut(&word) {
                hits.retain(|h| h.file != id);
                // Drop empty posting lists so `distinct_words` stays accurate.
                if hits.is_empty() {
                    self.index.remove(&word);
                }
            }
        }
    }

    /// Returns all hits recorded for `word`; empty slice if absent.
    pub fn get(&self, word: &str) -> &[WordHit] {
        self.index.get(word).map(Vec::as_slice).unwrap_or(&[])
    }

    /// Number of distinct words currently indexed.
    pub fn distinct_words(&self) -> usize {
        self.index.len()
    }

    /// Rough heap-usage estimate in bytes. Map/entry bookkeeping is
    /// approximated with flat per-entry constants, not measured.
    pub fn estimated_bytes(&self) -> usize {
        let words = self.index.len();
        let key_bytes: usize = self.index.keys().map(|k| k.len()).sum();
        let hits: usize = self.index.values().map(Vec::len).sum();
        // Use the real hit size instead of a hard-coded 8 so the estimate
        // tracks changes to `WordHit` / `FileId` layout.
        words * 16
            + key_bytes
            + hits * std::mem::size_of::<WordHit>()
            + self.file_words.len() * 16
    }
}
/// Splits `line` into ASCII identifier tokens and calls `yield_token`
/// for each one.
///
/// A token is a maximal run of identifier characters (`[A-Za-z0-9_]`)
/// that begins with an identifier-start character (`[A-Za-z_]`). Bytes
/// that cannot start a token (digits, punctuation, non-ASCII) are
/// skipped one at a time, so `"42_things"` yields `"_things"` and
/// non-ASCII text acts as a separator.
pub fn tokenize(line: &str, mut yield_token: impl FnMut(&str)) {
    let bytes = line.as_bytes();
    let mut pos = 0;
    while pos < bytes.len() {
        if !is_ident_start(bytes[pos]) {
            pos += 1;
            continue;
        }
        let begin = pos;
        while pos < bytes.len() && is_ident_cont(bytes[pos]) {
            pos += 1;
        }
        // Safe to slice the &str directly (no from_utf8 + expect needed):
        // every byte in begin..pos is ASCII, and the byte at `pos`, if any,
        // cannot be a UTF-8 continuation byte (those only follow a non-ASCII
        // lead byte), so both ends are char boundaries.
        yield_token(&line[begin..pos]);
    }
}

/// True for bytes that may begin an identifier: ASCII letters and `_`.
#[inline]
fn is_ident_start(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'_')
}

/// True for bytes that may continue an identifier: letters, digits, `_`.
#[inline]
fn is_ident_cont(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_')
}
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn tokenize_skips_punctuation_and_numbers() {
        let mut seen: Vec<String> = Vec::new();
        tokenize("let foo_bar = baz(1, 2.0); // 42_things", |token| {
            seen.push(token.to_owned());
        });
        assert_eq!(seen, ["let", "foo_bar", "baz", "_things"]);
    }

    #[test]
    fn index_records_line_numbers() {
        let mut index = WordIndex::new();
        index.index_file(7, "alpha\n beta gamma\nalpha");
        // "alpha" appears on lines 1 and 3; each occurrence is a hit.
        assert_eq!(
            index.get("alpha"),
            &[WordHit { file: 7, line: 1 }, WordHit { file: 7, line: 3 }]
        );
        assert_eq!(index.get("gamma"), &[WordHit { file: 7, line: 2 }]);
    }

    #[test]
    fn remove_and_reindex_replace_entries() {
        let mut index = WordIndex::new();
        index.index_file(1, "foo bar baz");
        index.remove_file(1);
        assert!(index.get("foo").is_empty());
        // Re-indexing must not resurrect the old content.
        index.index_file(1, "qux");
        assert!(index.get("foo").is_empty());
        assert_eq!(index.get("qux"), &[WordHit { file: 1, line: 1 }]);
    }

    #[test]
    fn single_character_tokens_are_skipped() {
        let mut index = WordIndex::new();
        index.index_file(1, "a foo b bar c");
        assert!(index.get("a").is_empty());
        assert_eq!(index.get("foo").len(), 1);
    }
}