use std::collections::{HashMap, HashSet};
use super::file_table::FileId;
pub type Trigram = u32;
#[derive(Debug, Default, Clone)]
pub struct TrigramIndex {
index: HashMap<Trigram, HashSet<FileId>>,
file_trigrams: HashMap<FileId, HashSet<Trigram>>,
}
impl TrigramIndex {
pub fn new() -> Self {
Self::default()
}
pub fn index_file(&mut self, id: FileId, content: &str) {
self.remove_file(id);
let trigrams = extract_trigrams(content);
if trigrams.is_empty() {
return;
}
for tg in &trigrams {
self.index.entry(*tg).or_default().insert(id);
}
self.file_trigrams.insert(id, trigrams);
}
pub fn remove_file(&mut self, id: FileId) {
let Some(trigrams) = self.file_trigrams.remove(&id) else {
return;
};
for tg in trigrams {
if let Some(set) = self.index.get_mut(&tg) {
set.remove(&id);
if set.is_empty() {
self.index.remove(&tg);
}
}
}
}
pub fn query(&self, trigrams: &[Trigram]) -> HashSet<FileId> {
if trigrams.is_empty() {
return HashSet::new();
}
let mut postings: Vec<&HashSet<FileId>> = Vec::with_capacity(trigrams.len());
for tg in trigrams {
match self.index.get(tg) {
Some(set) => postings.push(set),
None => return HashSet::new(),
}
}
postings.sort_by_key(|s| s.len());
let (head, tail) = postings.split_first().expect("non-empty by construction");
let mut acc: HashSet<FileId> = (*head).clone();
for set in tail {
acc.retain(|id| set.contains(id));
if acc.is_empty() {
return acc;
}
}
acc
}
pub fn distinct_trigrams(&self) -> usize {
self.index.len()
}
pub fn snapshot_postings(&self) -> Vec<super::snapshot::TrigramPosting> {
let mut out: Vec<super::snapshot::TrigramPosting> = self
.index
.iter()
.map(|(tg, files)| {
let mut files: Vec<FileId> = files.iter().copied().collect();
files.sort_unstable();
super::snapshot::TrigramPosting {
trigram: *tg,
files,
}
})
.collect();
out.sort_unstable_by_key(|p| p.trigram);
out
}
pub fn from_postings(postings: Vec<super::snapshot::TrigramPosting>) -> Self {
let mut idx = Self::new();
for p in postings {
let entry = idx.index.entry(p.trigram).or_default();
for f in &p.files {
entry.insert(*f);
idx.file_trigrams.entry(*f).or_default().insert(p.trigram);
}
}
idx
}
pub fn estimated_bytes(&self) -> usize {
let postings: usize = self.index.values().map(|s| s.len()).sum();
let reverse: usize = self.file_trigrams.values().map(|s| s.len()).sum();
self.index.len() * 4 + postings * 4 + reverse * 4 + self.file_trigrams.len() * 4
}
}
pub fn extract_trigrams(text: &str) -> HashSet<Trigram> {
let bytes = text.as_bytes();
if bytes.len() < 3 {
return HashSet::new();
}
let mut out: HashSet<Trigram> = HashSet::with_capacity(bytes.len().max(8) / 4);
for window in bytes.windows(3) {
out.insert(pack(
normalize(window[0]),
normalize(window[1]),
normalize(window[2]),
));
}
out
}
pub fn query_trigrams(query: &str) -> Vec<Trigram> {
extract_trigrams(query).into_iter().collect()
}
#[inline(always)]
pub fn pack(a: u8, b: u8, c: u8) -> Trigram {
(a as u32) << 16 | (b as u32) << 8 | c as u32
}
#[inline(always)]
fn normalize(byte: u8) -> u8 {
if byte.is_ascii_uppercase() {
byte + 0x20
} else {
byte
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn extract_handles_short_strings() {
assert!(extract_trigrams("").is_empty());
assert!(extract_trigrams("ab").is_empty());
assert_eq!(extract_trigrams("abc").len(), 1);
}
#[test]
fn extract_is_case_insensitive() {
let lower = extract_trigrams("foo");
let upper = extract_trigrams("FOO");
assert_eq!(lower, upper);
}
#[test]
fn query_intersects_postings() {
let mut idx = TrigramIndex::new();
idx.index_file(1, "the quick brown fox");
idx.index_file(2, "jumped over the lazy dog");
idx.index_file(3, "lorem ipsum dolor sit amet");
let needle = query_trigrams("the");
let hits = idx.query(&needle);
assert!(hits.contains(&1));
assert!(hits.contains(&2));
assert!(!hits.contains(&3));
}
#[test]
fn missing_trigram_short_circuits_to_empty() {
let mut idx = TrigramIndex::new();
idx.index_file(1, "hello world");
let needle = query_trigrams("zzzzzz");
assert!(idx.query(&needle).is_empty());
}
#[test]
fn remove_file_drops_postings() {
let mut idx = TrigramIndex::new();
idx.index_file(1, "alpha beta");
idx.index_file(2, "alpha gamma");
idx.remove_file(1);
let needle = query_trigrams("alp");
let hits = idx.query(&needle);
assert!(!hits.contains(&1));
assert!(hits.contains(&2));
}
#[test]
fn reindex_replaces_postings() {
let mut idx = TrigramIndex::new();
idx.index_file(1, "alpha beta");
idx.index_file(1, "gamma delta");
let alpha = query_trigrams("alp");
let gamma = query_trigrams("gam");
assert!(idx.query(&alpha).is_empty());
assert!(idx.query(&gamma).contains(&1));
}
}