lcpfs 2026.1.102

LCP File System - A ZFS-inspired copy-on-write filesystem for Rust
// Copyright 2025 LunaOS Contributors
// SPDX-License-Identifier: Apache-2.0

//! Inverted index implementation for full-text search

use alloc::collections::BTreeMap;
use alloc::string::{String, ToString};
use alloc::vec::Vec;

use super::tokenizer::Tokenizer;
use super::types::{DocMeta, IndexError, IndexStats, InvertedIndex, Posting, PostingList};

/// Full-text search engine
pub struct FtsEngine {
    /// Per-dataset indices
    indices: BTreeMap<String, InvertedIndex>,
    /// Tokenizer
    tokenizer: Tokenizer,
}

impl Default for FtsEngine {
    fn default() -> Self {
        Self::new()
    }
}

impl FtsEngine {
    /// Create a new FTS engine
    pub fn new() -> Self {
        Self {
            indices: BTreeMap::new(),
            tokenizer: Tokenizer::new(),
        }
    }

    /// Get or create index for a dataset
    fn get_or_create_index(&mut self, dataset: &str) -> &mut InvertedIndex {
        self.indices.entry(dataset.to_string()).or_default()
    }

    /// Get index for a dataset (read-only)
    fn get_index(&self, dataset: &str) -> Option<&InvertedIndex> {
        self.indices.get(dataset)
    }

    /// Index a document
    pub fn index_document(
        &mut self,
        dataset: &str,
        object_id: u64,
        path: &str,
        content: &[u8],
    ) -> Result<(), IndexError> {
        // First remove any existing entry for this document
        let _ = self.remove_document(dataset, object_id);

        // Tokenize content
        let tokens = self.tokenizer.tokenize(content);
        let doc_length = tokens.len() as u32;

        if doc_length == 0 {
            // Nothing to index
            return Ok(());
        }

        // Group tokens by term for posting creation
        let mut term_positions: BTreeMap<String, Vec<u32>> = BTreeMap::new();
        for token in &tokens {
            term_positions
                .entry(token.term.clone())
                .or_default()
                .push(token.position);
        }

        // Get or create the index
        let index = self.get_or_create_index(dataset);

        // Add document metadata
        index.docs.insert(
            object_id,
            DocMeta {
                object_id,
                path: path.to_string(),
                length: doc_length,
                indexed_at: current_timestamp(),
            },
        );

        // Update statistics
        index.doc_count += 1;
        index.total_terms += doc_length as u64;
        index.recalculate_avg_doc_len();

        // Add postings for each term
        for (term, positions) in term_positions {
            let posting = Posting {
                object_id,
                term_freq: positions.len() as u32,
                positions,
            };

            index.index.entry(term).or_default().add_posting(posting);
        }

        Ok(())
    }

    /// Remove a document from the index
    pub fn remove_document(&mut self, dataset: &str, object_id: u64) -> Result<(), IndexError> {
        let index = match self.indices.get_mut(dataset) {
            Some(idx) => idx,
            None => return Ok(()), // Nothing to remove
        };

        // Get document metadata
        let doc_meta = match index.docs.remove(&object_id) {
            Some(meta) => meta,
            None => return Ok(()), // Document not in index
        };

        // Update statistics
        index.doc_count = index.doc_count.saturating_sub(1);
        index.total_terms = index.total_terms.saturating_sub(doc_meta.length as u64);
        index.recalculate_avg_doc_len();

        // Remove postings for this document from all terms
        let mut empty_terms = Vec::new();
        for (term, posting_list) in &mut index.index {
            posting_list.remove_posting(object_id);
            if posting_list.postings.is_empty() {
                empty_terms.push(term.clone());
            }
        }

        // Remove empty posting lists
        for term in empty_terms {
            index.index.remove(&term);
        }

        Ok(())
    }

    /// Rebuild the entire index for a dataset
    pub fn rebuild(&mut self, dataset: &str) -> Result<IndexStats, IndexError> {
        // Clear existing index
        self.indices.remove(dataset);

        // In a real implementation, we would:
        // 1. List all files in the dataset
        // 2. Read each file's content
        // 3. Index each file
        //
        // For now, we just create an empty index
        self.indices
            .insert(dataset.to_string(), InvertedIndex::new());

        self.get_stats(dataset)
    }

    /// Get index statistics
    pub fn get_stats(&self, dataset: &str) -> Result<IndexStats, IndexError> {
        let index = self
            .get_index(dataset)
            .ok_or_else(|| IndexError::DatasetNotFound(dataset.to_string()))?;

        Ok(IndexStats {
            document_count: index.doc_count,
            term_count: index.index.len() as u64,
            total_term_occurrences: index.total_terms,
            avg_doc_length: index.avg_doc_len,
            index_size_bytes: estimate_index_size(index),
            last_rebuild: 0, // Would be stored in real implementation
        })
    }

    /// Get the tokenizer (for query parsing)
    pub fn tokenizer(&self) -> &Tokenizer {
        &self.tokenizer
    }

    /// Get the index for a dataset (for searching)
    pub fn index(&self, dataset: &str) -> Option<&InvertedIndex> {
        self.get_index(dataset)
    }
}

/// Estimate index size in bytes (rough approximation)
fn estimate_index_size(index: &InvertedIndex) -> u64 {
    let mut size: u64 = 0;

    // Term strings + posting lists
    for (term, posting_list) in &index.index {
        size += term.len() as u64;
        size += 4; // doc_freq
        for posting in &posting_list.postings {
            size += 8; // object_id
            size += 4; // term_freq
            size += (posting.positions.len() * 4) as u64; // positions
        }
    }

    // Document metadata
    for meta in index.docs.values() {
        size += 8; // object_id
        size += meta.path.len() as u64;
        size += 4 + 8; // length + indexed_at
    }

    size
}

/// Get current timestamp from unified time provider.
fn current_timestamp() -> u64 {
    crate::time::now()
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_index_document() {
        let mut engine = FtsEngine::new();

        engine
            .index_document("test", 1, "/doc1.txt", b"hello world")
            .unwrap();

        let stats = engine.get_stats("test").unwrap();
        assert_eq!(stats.document_count, 1);
        assert!(stats.term_count > 0);
    }

    #[test]
    fn test_remove_document() {
        let mut engine = FtsEngine::new();

        engine
            .index_document("test", 1, "/doc1.txt", b"hello world")
            .unwrap();
        engine.remove_document("test", 1).unwrap();

        let stats = engine.get_stats("test").unwrap();
        assert_eq!(stats.document_count, 0);
    }

    #[test]
    fn test_multiple_documents() {
        let mut engine = FtsEngine::new();

        engine
            .index_document("test", 1, "/doc1.txt", b"hello world")
            .unwrap();
        engine
            .index_document("test", 2, "/doc2.txt", b"hello there")
            .unwrap();

        let stats = engine.get_stats("test").unwrap();
        assert_eq!(stats.document_count, 2);
    }
}