lcpfs 2026.1.102

LCP File System - A ZFS-inspired copy-on-write filesystem for Rust
// Copyright 2025 LunaOS Contributors
// SPDX-License-Identifier: Apache-2.0

//! Type definitions for full-text search

use alloc::collections::BTreeMap;
use alloc::string::String;
use alloc::vec::Vec;
use thiserror_no_std::Error;

/// Error type for indexing and search operations.
///
/// The display text of each variant comes from its `#[error(...)]` attribute
/// (through the `Error` derive imported from `thiserror_no_std`). Payloads are
/// plain `String` / `u64` values, which is what lets the whole type derive
/// `Clone`, `PartialEq` and `Eq`.
#[derive(Debug, Clone, PartialEq, Eq, Error)]
pub enum IndexError {
    /// Dataset not found.
    ///
    /// The payload is interpolated into the message; presumably it is the
    /// dataset name — confirm against callers.
    #[error("Dataset not found: {0}")]
    DatasetNotFound(String),

    /// Document not found in index.
    ///
    /// The payload is reported as `object_id` in the message.
    #[error("Document not found: object_id={0}")]
    DocumentNotFound(u64),

    /// Content could not be decoded. Carries no further detail.
    #[error("Content decode error")]
    ContentDecodeError,

    /// Index is corrupted. Carries no further detail.
    #[error("Index corrupted")]
    IndexCorrupted,

    /// IO error during index operation.
    ///
    /// The underlying failure is carried as text rather than as a typed error.
    #[error("IO error: {0}")]
    IoError(String),

    /// Query parse error.
    ///
    /// The payload is a free-form description that is appended to the message.
    #[error("Query parse error: {0}")]
    QueryParseError(String),
}

/// Tunable parameters for a search request.
///
/// Use [`Default::default`] for the stock configuration and override
/// individual fields with struct-update syntax where needed.
#[derive(Debug, Clone)]
pub struct SearchOptions {
    /// Maximum number of results to return (default: 100)
    pub limit: usize,
    /// Number of leading results to skip, for pagination (default: 0)
    pub offset: usize,
    /// Whether fuzzy matching is enabled (default: off)
    pub fuzzy: bool,
    /// Maximum edit distance accepted by fuzzy matching (default: 2)
    pub fuzzy_distance: usize,
    /// Whether matches are highlighted in snippets (default: on)
    pub highlight: bool,
    /// Snippet length in characters (default: 200)
    pub snippet_length: usize,
    /// Minimum score threshold (0.0 - 1.0); the default of 0.0 filters nothing
    /// that has a non-negative score.
    // NOTE(review): `SearchHit::score` is documented as a BM25 score, which is
    // not inherently limited to 0.0 - 1.0 — confirm how the two are compared.
    pub min_score: f32,
}

impl Default for SearchOptions {
    /// Builds the stock options: first page of up to 100 hits, exact matching,
    /// highlighted 200-character snippets, and no score cut-off.
    fn default() -> Self {
        // Pagination window.
        let limit = 100;
        let offset = 0;

        // Matching behaviour: the distance is pre-set even though fuzzy
        // matching itself starts disabled.
        let fuzzy = false;
        let fuzzy_distance = 2;

        // Snippet rendering.
        let highlight = true;
        let snippet_length = 200;

        SearchOptions {
            limit,
            offset,
            fuzzy,
            fuzzy_distance,
            highlight,
            snippet_length,
            min_score: 0.0,
        }
    }
}

/// Search hit result.
///
/// One entry per matching file. All fields are owned, so a hit can be cloned
/// and kept independently of the index that produced it.
#[derive(Debug, Clone)]
pub struct SearchHit {
    /// Object ID of the matching file
    pub object_id: u64,
    /// File path
    pub path: String,
    /// BM25 score
    pub score: f32,
    /// Snippet with optional highlighting
    pub snippet: String,
    /// Matched terms
    pub matched_terms: Vec<String>,
    /// Term positions in document
    // NOTE(review): presumably word indices, like `Token::position` — confirm.
    pub positions: Vec<u32>,
}

/// Index statistics.
///
/// Plain snapshot of counters describing an index. The derived `Default`
/// yields all-zero statistics.
#[derive(Debug, Clone, Default)]
pub struct IndexStats {
    /// Number of documents indexed
    pub document_count: u64,
    /// Number of unique terms
    pub term_count: u64,
    /// Total number of term occurrences
    pub total_term_occurrences: u64,
    /// Average document length in terms
    pub avg_doc_length: f32,
    /// Index size in bytes
    pub index_size_bytes: u64,
    /// Last rebuild timestamp
    // NOTE(review): epoch and unit are not specified here — confirm at the
    // place where this field is written.
    pub last_rebuild: u64,
}

/// Posting in the inverted index: the occurrences of one term in one document.
#[derive(Debug, Clone)]
pub struct Posting {
    /// Object ID of the document
    pub object_id: u64,
    /// Term frequency in this document
    pub term_freq: u32,
    /// Positions where term appears
    pub positions: Vec<u32>,
}

/// Posting list for a term.
///
/// `doc_freq` is a counter maintained by [`PostingList::add_posting`] and
/// [`PostingList::remove_posting`]. Both fields are public, so code that
/// edits `postings` directly is responsible for keeping `doc_freq` in step.
#[derive(Debug, Clone)]
pub struct PostingList {
    /// Document frequency (number of docs containing this term)
    pub doc_freq: u32,
    /// List of postings, in insertion order
    pub postings: Vec<Posting>,
}

impl PostingList {
    /// Create a new empty posting list (`doc_freq == 0`, no postings).
    pub fn new() -> Self {
        Self {
            doc_freq: 0,
            postings: Vec::new(),
        }
    }

    /// Add a posting to the end of the list and bump `doc_freq`.
    ///
    /// No de-duplication is performed: adding a second posting for an
    /// `object_id` that is already present stores both and counts both.
    ///
    /// The counter saturates at `u32::MAX` instead of overflowing, mirroring
    /// the saturating decrement in [`PostingList::remove_posting`].
    pub fn add_posting(&mut self, posting: Posting) {
        self.doc_freq = self.doc_freq.saturating_add(1);
        self.postings.push(posting);
    }

    /// Remove a posting by `object_id`.
    ///
    /// Removes the first posting whose `object_id` matches, keeping the
    /// relative order of the remaining postings, and decrements `doc_freq`
    /// (saturating at zero).
    ///
    /// Returns `true` if a posting was removed, `false` if none matched.
    pub fn remove_posting(&mut self, object_id: u64) -> bool {
        if let Some(pos) = self.postings.iter().position(|p| p.object_id == object_id) {
            // `Vec::remove` (not `swap_remove`) so the list order is preserved.
            self.postings.remove(pos);
            self.doc_freq = self.doc_freq.saturating_sub(1);
            true
        } else {
            false
        }
    }
}

impl Default for PostingList {
    /// Equivalent to [`PostingList::new`].
    fn default() -> Self {
        Self::new()
    }
}

/// Document metadata in the index.
///
/// `InvertedIndex::docs` stores one of these per document, keyed by a `u64`
/// (presumably the same value as `object_id` — confirm at the insertion site).
#[derive(Debug, Clone)]
pub struct DocMeta {
    /// Object ID
    pub object_id: u64,
    /// File path
    pub path: String,
    /// Document length in terms
    pub length: u32,
    /// Last indexed timestamp
    // NOTE(review): epoch and unit are not specified here — confirm at the
    // place where this field is written.
    pub indexed_at: u64,
}

/// In-memory inverted index: term dictionary, per-document metadata and the
/// aggregate counters from which the average document length is derived.
///
/// The methods defined here never modify `doc_count` or `total_terms`; code
/// that adds or removes documents has to maintain them and then call
/// [`InvertedIndex::recalculate_avg_doc_len`].
#[derive(Debug, Clone)]
pub struct InvertedIndex {
    /// Term -> PostingList mapping
    pub index: BTreeMap<String, PostingList>,
    /// Per-document metadata, keyed by `u64` id
    pub docs: BTreeMap<u64, DocMeta>,
    /// Total number of documents
    pub doc_count: u64,
    /// Total number of terms summed over all documents
    pub total_terms: u64,
    /// Average document length (`total_terms / doc_count`, or 0.0 when empty)
    pub avg_doc_len: f32,
}

impl InvertedIndex {
    /// Construct an index with no terms, no documents and zeroed counters.
    pub fn new() -> Self {
        let index = BTreeMap::new();
        let docs = BTreeMap::new();

        InvertedIndex {
            index,
            docs,
            doc_count: 0,
            total_terms: 0,
            avg_doc_len: 0.0,
        }
    }

    /// Refresh `avg_doc_len` from `total_terms` and `doc_count`.
    ///
    /// An index without documents gets an average of `0.0`, so the division
    /// never sees a zero denominator.
    pub fn recalculate_avg_doc_len(&mut self) {
        self.avg_doc_len = match self.doc_count {
            0 => 0.0,
            documents => self.total_terms as f32 / documents as f32,
        };
    }
}

impl Default for InvertedIndex {
    /// Same as [`InvertedIndex::new`].
    fn default() -> Self {
        InvertedIndex::new()
    }
}

/// Token with position information.
///
/// Carries two independent locations for the same term: its ordinal among the
/// document's words and its byte offset in the original content.
#[derive(Debug, Clone)]
pub struct Token {
    /// The normalized term
    pub term: String,
    /// Position in the document (word index)
    pub position: u32,
    /// Byte offset in original content
    pub byte_offset: usize,
}