use alloc::collections::BTreeMap;
use alloc::string::{String, ToString};
use alloc::vec::Vec;
use super::tokenizer::Tokenizer;
use super::types::{DocMeta, IndexError, IndexStats, InvertedIndex, Posting, PostingList};
/// Full-text search engine that keeps one inverted index per dataset and
/// tokenizes document content with a single shared tokenizer.
pub struct FtsEngine {
/// Inverted indices keyed by dataset name.
indices: BTreeMap<String, InvertedIndex>,
/// Tokenizer applied to document content when a document is indexed.
tokenizer: Tokenizer,
}
impl Default for FtsEngine {
/// Equivalent to [`FtsEngine::new`]: an engine with no dataset indices.
fn default() -> Self {
Self::new()
}
}
impl FtsEngine {
    /// Creates an engine with no dataset indices and a newly constructed tokenizer.
    pub fn new() -> Self {
        Self {
            indices: BTreeMap::new(),
            tokenizer: Tokenizer::new(),
        }
    }

    /// Returns the mutable index for `dataset`, inserting a default one if
    /// the dataset has no index yet.
    fn get_or_create_index(&mut self, dataset: &str) -> &mut InvertedIndex {
        self.indices.entry(dataset.to_string()).or_default()
    }

    /// Looks up the index for `dataset` without creating it.
    fn get_index(&self, dataset: &str) -> Option<&InvertedIndex> {
        self.indices.get(dataset)
    }

    /// Converts a collection length to `u32`, saturating at `u32::MAX`
    /// instead of silently wrapping the way a bare `as u32` cast would.
    fn saturating_len(len: usize) -> u32 {
        len.min(u32::MAX as usize) as u32
    }

    /// Indexes `content` as document `object_id` of `dataset`, replacing any
    /// previously indexed version of that document.
    ///
    /// The content is tokenized with the engine's tokenizer; one posting per
    /// distinct term is recorded, carrying the positions reported by the
    /// tokenizer. The dataset's index is created on demand.
    ///
    /// If tokenization yields no tokens, nothing is added: the earlier
    /// version (if any) stays removed and the call returns `Ok(())` without
    /// creating an index for `dataset`.
    ///
    /// Document length and per-term frequency are stored as `u32` and
    /// saturate at `u32::MAX`.
    ///
    /// # Errors
    ///
    /// Propagates any error returned by [`FtsEngine::remove_document`].
    pub fn index_document(
        &mut self,
        dataset: &str,
        object_id: u64,
        path: &str,
        content: &[u8],
    ) -> Result<(), IndexError> {
        // Drop the previous version first so counters and postings are not
        // double-counted when a document is re-indexed.
        self.remove_document(dataset, object_id)?;

        let tokens = self.tokenizer.tokenize(content);
        // Test emptiness on the untruncated count so that a huge token
        // stream can never be mistaken for an empty document.
        let token_count = tokens.len();
        if token_count == 0 {
            return Ok(());
        }
        let doc_length = Self::saturating_len(token_count);

        // Group token positions by term before touching the index.
        let mut term_positions: BTreeMap<String, Vec<u32>> = BTreeMap::new();
        for token in &tokens {
            term_positions
                .entry(token.term.clone())
                .or_default()
                .push(token.position);
        }

        let index = self.get_or_create_index(dataset);
        index.docs.insert(
            object_id,
            DocMeta {
                object_id,
                path: path.to_string(),
                length: doc_length,
                indexed_at: current_timestamp(),
            },
        );
        index.doc_count += 1;
        index.total_terms += u64::from(doc_length);
        index.recalculate_avg_doc_len();

        for (term, positions) in term_positions {
            // Read the frequency before `positions` is moved into the posting.
            let term_freq = Self::saturating_len(positions.len());
            let posting = Posting {
                object_id,
                term_freq,
                positions,
            };
            index.index.entry(term).or_default().add_posting(posting);
        }
        Ok(())
    }

    /// Removes document `object_id` from `dataset`'s index.
    ///
    /// Updates the document count, total term count and average document
    /// length, strips the document's posting from every term, and drops
    /// terms whose posting list becomes empty.
    ///
    /// Removing from an unknown dataset, or removing a document that is not
    /// indexed, is a no-op that returns `Ok(())`.
    ///
    /// # Errors
    ///
    /// Currently always returns `Ok(())`.
    pub fn remove_document(&mut self, dataset: &str, object_id: u64) -> Result<(), IndexError> {
        let index = match self.indices.get_mut(dataset) {
            Some(index) => index,
            None => return Ok(()),
        };
        let doc_meta = match index.docs.remove(&object_id) {
            Some(meta) => meta,
            None => return Ok(()),
        };

        index.doc_count = index.doc_count.saturating_sub(1);
        index.total_terms = index.total_terms.saturating_sub(u64::from(doc_meta.length));
        index.recalculate_avg_doc_len();

        // Single pass: strip the posting and prune the term if nothing is
        // left, without collecting and cloning the emptied terms first.
        index.index.retain(|_, posting_list| {
            posting_list.remove_posting(object_id);
            !posting_list.postings.is_empty()
        });
        Ok(())
    }

    /// Replaces `dataset`'s index with a new, empty one and returns the
    /// statistics of that empty index.
    ///
    /// Previously indexed documents are discarded and are not re-indexed by
    /// this call. The dataset entry is created if it did not exist.
    ///
    /// # Errors
    ///
    /// Returns whatever [`FtsEngine::get_stats`] returns; since the index
    /// has just been inserted, the lookup succeeds.
    pub fn rebuild(&mut self, dataset: &str) -> Result<IndexStats, IndexError> {
        // `insert` replaces any existing entry, so no prior `remove` is needed.
        self.indices
            .insert(dataset.to_string(), InvertedIndex::new());
        self.get_stats(dataset)
    }

    /// Returns summary statistics for `dataset`'s index.
    ///
    /// `index_size_bytes` is an estimate, and `last_rebuild` is always
    /// reported as `0` because the engine does not record rebuild times.
    ///
    /// # Errors
    ///
    /// Returns [`IndexError::DatasetNotFound`] if `dataset` has no index.
    pub fn get_stats(&self, dataset: &str) -> Result<IndexStats, IndexError> {
        let index = self
            .get_index(dataset)
            .ok_or_else(|| IndexError::DatasetNotFound(dataset.to_string()))?;
        Ok(IndexStats {
            document_count: index.doc_count,
            term_count: index.index.len() as u64,
            total_term_occurrences: index.total_terms,
            avg_doc_length: index.avg_doc_len,
            index_size_bytes: estimate_index_size(index),
            last_rebuild: 0,
        })
    }

    /// Returns the tokenizer used when indexing documents.
    pub fn tokenizer(&self) -> &Tokenizer {
        &self.tokenizer
    }

    /// Returns the inverted index for `dataset`, or `None` if the dataset
    /// has no index.
    pub fn index(&self, dataset: &str) -> Option<&InvertedIndex> {
        self.get_index(dataset)
    }
}
/// Returns a rough byte estimate for `index`.
///
/// The figure is computed from term and path string lengths, position
/// counts, and fixed per-entry costs; it does not measure real allocations.
fn estimate_index_size(index: &InvertedIndex) -> u64 {
    // Fixed costs per entry. NOTE(review): the literals match the widths of
    // the fields built in `index_document` (u64 ids/timestamps, u32
    // counts/lengths/positions); the per-term 4 is presumably a u32 counter
    // on the posting list — confirm against the type definitions.
    const PER_TERM_FIXED: u64 = 4;
    const PER_POSTING_FIXED: u64 = 8 + 4;
    const PER_POSITION: usize = 4;
    const PER_DOC_FIXED: u64 = 8 + 4 + 8;

    let term_bytes: u64 = index
        .index
        .iter()
        .map(|(term, posting_list)| {
            let posting_bytes: u64 = posting_list
                .postings
                .iter()
                .map(|posting| {
                    PER_POSTING_FIXED + (posting.positions.len() * PER_POSITION) as u64
                })
                .sum();
            term.len() as u64 + PER_TERM_FIXED + posting_bytes
        })
        .sum();

    let doc_bytes: u64 = index
        .docs
        .values()
        .map(|meta| PER_DOC_FIXED + meta.path.len() as u64)
        .sum();

    term_bytes + doc_bytes
}
/// Returns the current time as reported by `crate::time::now()`; used to
/// stamp `indexed_at` on newly indexed documents.
///
/// NOTE(review): the unit and epoch of the value are not visible from this
/// file — confirm against `crate::time`.
fn current_timestamp() -> u64 {
crate::time::now()
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Dataset name shared by every test in this module.
    const DATASET: &str = "test";

    /// Indexes one document into `DATASET`, panicking if indexing fails.
    fn add_doc(engine: &mut FtsEngine, object_id: u64, path: &str, content: &[u8]) {
        engine
            .index_document(DATASET, object_id, path, content)
            .unwrap();
    }

    /// Indexing a single document yields one document and at least one term.
    #[test]
    fn test_index_document() {
        let mut engine = FtsEngine::new();
        add_doc(&mut engine, 1, "/doc1.txt", b"hello world");

        let stats = engine.get_stats(DATASET).unwrap();
        assert_eq!(stats.document_count, 1);
        assert!(stats.term_count > 0);
    }

    /// Removing the only document brings the document count back to zero.
    #[test]
    fn test_remove_document() {
        let mut engine = FtsEngine::new();
        add_doc(&mut engine, 1, "/doc1.txt", b"hello world");

        engine.remove_document(DATASET, 1).unwrap();

        let stats = engine.get_stats(DATASET).unwrap();
        assert_eq!(stats.document_count, 0);
    }

    /// Two distinct documents in one dataset are both counted.
    #[test]
    fn test_multiple_documents() {
        let mut engine = FtsEngine::new();
        add_doc(&mut engine, 1, "/doc1.txt", b"hello world");
        add_doc(&mut engine, 2, "/doc2.txt", b"hello there");

        let stats = engine.get_stats(DATASET).unwrap();
        assert_eq!(stats.document_count, 2);
    }
}