lcpfs 2026.1.102

LCP File System - A ZFS-inspired copy-on-write filesystem for Rust
// Copyright 2025 LunaOS Contributors
// SPDX-License-Identifier: Apache-2.0

//! Search functionality for full-text search

use alloc::collections::BTreeMap;
use alloc::string::{String, ToString};
use alloc::vec::Vec;

use super::index::FtsEngine;
use super::ranking::{Bm25Params, bm25_score};
use super::tokenizer::{fuzzy_match, levenshtein_distance};
use super::types::{IndexError, InvertedIndex, SearchHit, SearchOptions};

impl FtsEngine {
    /// Search for documents matching a query
    pub fn search(
        &self,
        dataset: &str,
        query: &str,
        options: &SearchOptions,
    ) -> Result<Vec<SearchHit>, IndexError> {
        let index = self
            .index(dataset)
            .ok_or_else(|| IndexError::DatasetNotFound(dataset.to_string()))?;

        // Tokenize query
        let query_terms = self.tokenizer().tokenize_query(query);

        if query_terms.is_empty() {
            return Ok(Vec::new());
        }

        // Expand query terms with fuzzy matches if enabled
        let search_terms = if options.fuzzy {
            expand_fuzzy_terms(&query_terms, index, options.fuzzy_distance)
        } else {
            query_terms.clone()
        };

        // Score documents
        let params = Bm25Params::default();
        let mut doc_scores: BTreeMap<u64, DocumentScore> = BTreeMap::new();

        for term in &search_terms {
            if let Some(posting_list) = index.index.get(term) {
                for posting in &posting_list.postings {
                    let doc_meta = match index.docs.get(&posting.object_id) {
                        Some(meta) => meta,
                        None => continue,
                    };

                    let score = bm25_score(
                        posting.term_freq,
                        doc_meta.length,
                        index.avg_doc_len,
                        index.doc_count,
                        posting_list.doc_freq,
                        &params,
                    );

                    let entry = doc_scores
                        .entry(posting.object_id)
                        .or_insert_with(|| DocumentScore::new(posting.object_id));

                    entry.total_score += score;
                    entry.matched_terms.push(term.clone());
                    entry.positions.extend(posting.positions.iter().copied());
                }
            }
        }

        // Filter by minimum score
        let doc_scores: Vec<_> = doc_scores
            .into_values()
            .filter(|ds| ds.total_score >= options.min_score)
            .collect();

        // Sort by score descending
        let mut sorted_scores = doc_scores;
        sorted_scores.sort_by(|a, b| {
            b.total_score
                .partial_cmp(&a.total_score)
                .unwrap_or(core::cmp::Ordering::Equal)
        });

        // Apply pagination
        let paginated: Vec<_> = sorted_scores
            .into_iter()
            .skip(options.offset)
            .take(options.limit)
            .collect();

        // Build results
        let results = paginated
            .into_iter()
            .filter_map(|ds| {
                let doc_meta = index.docs.get(&ds.object_id)?;

                let snippet = if options.highlight {
                    generate_highlighted_snippet(
                        &ds.matched_terms,
                        &ds.positions,
                        options.snippet_length,
                    )
                } else {
                    String::new()
                };

                Some(SearchHit {
                    object_id: ds.object_id,
                    path: doc_meta.path.clone(),
                    score: ds.total_score,
                    snippet,
                    matched_terms: ds.matched_terms,
                    positions: ds.positions,
                })
            })
            .collect();

        Ok(results)
    }
}

/// Intermediate document score
struct DocumentScore {
    /// Document object ID
    object_id: u64,
    /// Accumulated BM25 score
    total_score: f32,
    /// Terms that matched
    matched_terms: Vec<String>,
    /// Positions of matches
    positions: Vec<u32>,
}

impl DocumentScore {
    fn new(object_id: u64) -> Self {
        Self {
            object_id,
            total_score: 0.0,
            matched_terms: Vec::new(),
            positions: Vec::new(),
        }
    }
}

/// Expand query terms with fuzzy matches
fn expand_fuzzy_terms(
    query_terms: &[String],
    index: &InvertedIndex,
    max_distance: usize,
) -> Vec<String> {
    let mut expanded = Vec::new();

    for term in query_terms {
        // Always include the original term
        expanded.push(term.clone());

        // Find fuzzy matches
        let matches = fuzzy_match(term, index.index.keys().map(|s| s.as_str()), max_distance);

        for matched in matches {
            if &matched != term {
                expanded.push(matched);
            }
        }
    }

    expanded
}

/// Generate a snippet with highlighted matches
///
/// In a real implementation, this would fetch the actual document content
/// and extract a relevant portion around the matches.
fn generate_highlighted_snippet(
    matched_terms: &[String],
    positions: &[u32],
    _max_length: usize,
) -> String {
    if matched_terms.is_empty() {
        return String::new();
    }

    // In a real implementation:
    // 1. Read document content
    // 2. Find position of first match
    // 3. Extract surrounding context
    // 4. Wrap matches in highlight markers

    // For now, return a placeholder indicating what was matched
    let terms = matched_terms.join(", ");
    let pos_str = if positions.is_empty() {
        String::new()
    } else {
        let first_pos = positions.iter().min().unwrap_or(&0);
        alloc::format!(" at position {}", first_pos)
    };

    alloc::format!("Matched: {}{}", terms, pos_str)
}

/// Phrase search - find documents containing an exact phrase
pub fn phrase_search(index: &InvertedIndex, phrase_terms: &[String]) -> Vec<u64> {
    if phrase_terms.is_empty() {
        return Vec::new();
    }

    if phrase_terms.len() == 1 {
        // Single term - return all documents containing it
        return match index.index.get(&phrase_terms[0]) {
            Some(pl) => pl.postings.iter().map(|p| p.object_id).collect(),
            None => Vec::new(),
        };
    }

    // Multi-term phrase search
    // Find documents containing all terms first
    let first_term = &phrase_terms[0];
    let first_postings = match index.index.get(first_term) {
        Some(pl) => &pl.postings,
        None => return Vec::new(),
    };

    let mut candidates: Vec<u64> = first_postings.iter().map(|p| p.object_id).collect();

    for term in phrase_terms.iter().skip(1) {
        let postings = match index.index.get(term) {
            Some(pl) => &pl.postings,
            None => return Vec::new(),
        };

        let term_docs: Vec<u64> = postings.iter().map(|p| p.object_id).collect();
        candidates.retain(|doc_id| term_docs.contains(doc_id));

        if candidates.is_empty() {
            return Vec::new();
        }
    }

    // Now check for consecutive positions
    let mut phrase_matches = Vec::new();

    for doc_id in candidates {
        if check_phrase_positions(index, doc_id, phrase_terms) {
            phrase_matches.push(doc_id);
        }
    }

    phrase_matches
}

/// Check if terms appear consecutively in a document
fn check_phrase_positions(index: &InvertedIndex, doc_id: u64, phrase_terms: &[String]) -> bool {
    // Get positions for first term
    let first_positions = match index.index.get(&phrase_terms[0]) {
        Some(pl) => match pl.postings.iter().find(|p| p.object_id == doc_id) {
            Some(posting) => &posting.positions,
            None => return false,
        },
        None => return false,
    };

    // For each starting position of the first term
    'outer: for &start_pos in first_positions {
        let mut expected_pos = start_pos;

        // Check if subsequent terms appear at consecutive positions
        for (offset, term) in phrase_terms.iter().enumerate() {
            expected_pos = start_pos + offset as u32;

            let positions = match index.index.get(term) {
                Some(pl) => match pl.postings.iter().find(|p| p.object_id == doc_id) {
                    Some(posting) => &posting.positions,
                    None => continue 'outer,
                },
                None => continue 'outer,
            };

            if !positions.contains(&expected_pos) {
                continue 'outer;
            }
        }

        // All terms found at consecutive positions
        return true;
    }

    false
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::fts::index::FtsEngine;

    #[test]
    fn test_basic_search() {
        let mut engine = FtsEngine::new();

        engine
            .index_document("test", 1, "/doc1.txt", b"hello world foo bar")
            .unwrap();
        engine
            .index_document("test", 2, "/doc2.txt", b"hello there friend")
            .unwrap();
        engine
            .index_document("test", 3, "/doc3.txt", b"goodbye world")
            .unwrap();

        let options = SearchOptions::default();
        let results = engine.search("test", "hello", &options).unwrap();

        assert_eq!(results.len(), 2);
        // Both doc1 and doc2 contain "hello"
    }

    #[test]
    fn test_multi_term_search() {
        let mut engine = FtsEngine::new();

        engine
            .index_document("test", 1, "/doc1.txt", b"hello world")
            .unwrap();
        engine
            .index_document("test", 2, "/doc2.txt", b"hello there")
            .unwrap();

        let options = SearchOptions::default();
        let results = engine.search("test", "hello world", &options).unwrap();

        // doc1 should rank higher (matches both terms)
        assert!(!results.is_empty());
        assert_eq!(results[0].object_id, 1);
    }

    #[test]
    fn test_empty_query() {
        let mut engine = FtsEngine::new();

        engine
            .index_document("test", 1, "/doc1.txt", b"hello world")
            .unwrap();

        let options = SearchOptions::default();
        let results = engine.search("test", "", &options).unwrap();

        assert!(results.is_empty());
    }
}