openserve 2.0.3

A modern, high-performance, AI-enhanced file server built in Rust
Documentation
//! Search Service
//!
//! This service provides full-text search and exposes a semantic search entry point.
//! It uses the `tantivy` crate for full-text indexing and searching.
//! Semantic search is meant to integrate with the AI service; the current
//! implementation delegates to full-text search.

use anyhow::Result;

use std::future::Future;
use std::pin::Pin;
use std::sync::Arc;
use tantivy::{
    self,
    collector::TopDocs,
    query::QueryParser,
    schema::{Field, Schema, STORED, TEXT, STRING, Value},
    Index, IndexReader, IndexWriter, ReloadPolicy,
    doc,
};

use tracing::{debug, info};
use std::sync::Mutex;

use crate::{
    config::Config,
    models::file::File,
    services::FileService,
    models::search::SearchResult,
};

/// A service for handling search operations.
///
/// Bundles a tantivy [`Index`] with its reader, a mutex-guarded writer, the
/// schema field handles used to build documents, and a pre-built query parser.
// NOTE(review): the blanket `dead_code` allowance is presumably here because
// `index`, `schema` and `content_field` are stored but never read in the
// visible code — confirm, and narrow the allowance to those fields if so.
#[allow(dead_code)]
pub struct SearchService {
    /// The underlying tantivy index.
    index: Index,
    /// Reader used to obtain searchers for queries and statistics.
    reader: IndexReader,
    /// The index writer, behind a mutex so `&self` methods can mutate the index.
    writer: Arc<Mutex<IndexWriter>>,
    /// The schema the field handles below were created from.
    schema: Schema,
    /// Handle of the `title` field (filled with the file name when indexing).
    title_field: Field,
    /// Handle of the `body` field (filled with the file content when indexing).
    body_field: Field,
    /// Handle of the `path` field; also the key used to delete documents.
    path_field: Field,
    /// Set to the same handle as `body_field` by `new`.
    content_field: Field,
    /// The index directory, stored as a (lossily converted) string.
    index_path: String,
    /// Query parser searching the title, path and body fields.
    query_parser: QueryParser,
}

impl SearchService {
    /// Creates a new `SearchService`.
    pub async fn new(config: Arc<Config>) -> Result<Self> {
        let index_path = config.search.index_dir.clone();
        info!("Initializing search service with index path: {:?}", index_path);

        // Create schema
        let mut schema_builder = Schema::builder();
        let title_field = schema_builder.add_text_field("title", TEXT | STORED);
        let body_field = schema_builder.add_text_field("body", TEXT);
        let path_field = schema_builder.add_text_field("path", STRING | STORED);
        let schema = schema_builder.build();

        // Create or open index
        let index_path = &config.search.index_dir;
        let index = if index_path.exists() {
            Index::open_in_dir(index_path)?
        } else {
            std::fs::create_dir_all(index_path)?;
            Index::create_in_dir(index_path, schema.clone())?
        };

        // Create reader and writer
        let writer = index.writer(50_000_000)?;
        
        let reader = index
            .reader_builder()
            .reload_policy(ReloadPolicy::OnCommitWithDelay)
            .try_into()?;

        // Create query parser
        let query_parser = QueryParser::for_index(&index, vec![title_field, path_field, body_field]);

        Ok(Self {
            index,
            reader,
            writer: Arc::new(Mutex::new(writer)),
            schema,
            title_field,
            body_field,
            path_field,
            content_field: body_field,
            index_path: index_path.to_string_lossy().to_string(),
            query_parser,
        })
    }

    /// Reloads the search index reader to make recent changes searchable.
    pub fn reload_reader(&self) -> Result<()> {
        self.reader.reload().map_err(anyhow::Error::from)
    }

    /// Index a file
    pub async fn index_file(&self, file: &File, content: &str) -> Result<()> {
        debug!("Indexing file: {}", file.path);

        let doc = doc!(
            self.title_field => file.name.clone(),
            self.body_field => content,
            self.path_field => file.path.clone(),
        );

        let mut writer = self.writer.lock().unwrap();
        writer.add_document(doc)?;
        writer.commit()?;

        debug!("File indexed successfully: {}", file.path);
        Ok(())
    }

    /// Deletes every indexed document whose path field equals `path` and
    /// commits the deletion.
    ///
    /// # Errors
    ///
    /// Returns an error if the commit fails.
    ///
    /// # Panics
    ///
    /// Panics if the writer mutex is poisoned.
    pub async fn remove_file(&self, path: &str) -> Result<()> {
        debug!("Removing file from index: {}", path);

        let path_term = tantivy::Term::from_field_text(self.path_field, path);
        {
            // Keep the lock only for the delete + commit pair.
            let mut index_writer = self.writer.lock().unwrap();
            index_writer.delete_term(path_term);
            index_writer.commit()?;
        }

        debug!("File removed from index: {}", path);
        Ok(())
    }

    /// Performs a full-text search and returns at most `limit` results.
    ///
    /// The query is parsed against the title, path and body fields. Each
    /// result carries the stored path, the relevance score and a snippet of
    /// at most 200 bytes of body text followed by `...` when truncated.
    /// A `limit` of zero yields an empty result list.
    ///
    /// # Errors
    ///
    /// Returns an error if the query cannot be parsed, the search fails, or a
    /// matching document cannot be loaded.
    // NOTE(review): the `body` field is declared without `STORED` in `new`, so
    // `get_first(self.body_field)` is expected to find nothing and snippets
    // come back empty — confirm whether the body should be stored or the
    // snippet produced another way.
    pub async fn search(&self, query: &str, limit: usize) -> Result<Vec<SearchResult>> {
        // Maximum snippet length in bytes before truncation.
        const SNIPPET_MAX_BYTES: usize = 200;

        debug!("Searching with query: {} (limit: {})", query, limit);

        let searcher = self.reader.searcher();
        let query = self.query_parser.parse_query(query)?;

        // `TopDocs::with_limit` panics on a zero limit; asking for zero
        // results is simply an empty answer.
        if limit == 0 {
            debug!("Search completed, found {} results", 0);
            return Ok(Vec::new());
        }

        let top_docs = searcher.search(&query, &TopDocs::with_limit(limit))?;

        let mut results = Vec::with_capacity(top_docs.len());
        for (score, doc_address) in top_docs {
            let retrieved_doc = searcher.doc::<tantivy::TantivyDocument>(doc_address)?;
            let path = retrieved_doc
                .get_first(self.path_field)
                .and_then(|v| v.as_str())
                .unwrap_or("")
                .to_string();

            let snippet = retrieved_doc
                .get_first(self.body_field)
                .and_then(|v| v.as_str())
                .map(|text| {
                    if text.len() > SNIPPET_MAX_BYTES {
                        // Back off to the nearest char boundary: slicing in
                        // the middle of a multi-byte character would panic.
                        let mut end = SNIPPET_MAX_BYTES;
                        while !text.is_char_boundary(end) {
                            end -= 1;
                        }
                        format!("{}...", &text[..end])
                    } else {
                        text.to_string()
                    }
                })
                .unwrap_or_default();

            results.push(SearchResult {
                path,
                score,
                snippet,
            });
        }

        debug!("Search completed, found {} results", results.len());
        Ok(results)
    }
    
    /// Performs a semantic search.
    ///
    /// No semantic ranking is implemented here yet: the call is forwarded
    /// unchanged to [`SearchService::search`], so results and errors are
    /// exactly those of a full-text search.
    pub async fn semantic_search(&self, query: &str, limit: usize) -> Result<Vec<SearchResult>> {
        // Placeholder until semantic search logic is implemented.
        let results = self.search(query, limit).await?;
        Ok(results)
    }

    /// Rebuild entire search index
    pub async fn rebuild_index(&self, file_service: &FileService) -> Result<()> {
        info!("Rebuilding search index");

        let mut writer = self.writer.lock().unwrap();
        writer.delete_all_documents()?;

        // Walk through all files and reindex
        self.index_directory_recursive(file_service, "", self.writer.clone()).await?;
        
        writer.commit()?;
        info!("Search index rebuilt successfully");
        Ok(())
    }

    /// Returns the number of indexed documents together with the on-disk size
    /// of the index.
    ///
    /// # Errors
    ///
    /// Returns an error if the index directory cannot be inspected.
    pub async fn get_stats(&self) -> Result<SearchStats> {
        let stats = SearchStats {
            indexed_documents: self.reader.searcher().num_docs() as u64,
            index_size: self.get_index_size()?,
        };
        Ok(stats)
    }

    /// Returns the number of indexed documents together with the path of the
    /// index directory.
    pub async fn get_index_stats(&self) -> Result<IndexStats> {
        let stats = IndexStats {
            num_docs: self.reader.searcher().num_docs() as u64,
            index_path: self.index_path.clone(),
        };
        Ok(stats)
    }

    // Helper methods

    /// Adds every text file under `path` to the index, descending into
    /// subdirectories.
    ///
    /// Files that are not flagged as text, cannot be read, or are not valid
    /// UTF-8 are skipped silently. Documents are only added through `writer`;
    /// nothing is committed here. The future is boxed because the function
    /// calls itself.
    ///
    /// # Errors
    ///
    /// Returns an error if a directory cannot be listed or a document cannot
    /// be added.
    ///
    /// # Panics
    ///
    /// Panics if the writer mutex is poisoned.
    fn index_directory_recursive<'a>(
        &'a self,
        file_service: &'a FileService,
        path: &'a str,
        writer: Arc<Mutex<IndexWriter>>,
    ) -> Pin<Box<dyn Future<Output = Result<()>> + Send + 'a>> {
        Box::pin(async move {
            use crate::services::file::DirectoryEntry;

            let listing = file_service.list_directory(path).await?;

            for entry in &listing.entries {
                match entry {
                    DirectoryEntry::Directory(dir) => {
                        self.index_directory_recursive(file_service, &dir.path, writer.clone())
                            .await?;
                    }
                    DirectoryEntry::File(file) => {
                        if !file.is_text {
                            continue;
                        }
                        // Unreadable files are skipped, not treated as errors.
                        let bytes = match file_service.read_file(&file.path).await {
                            Ok(bytes) => bytes,
                            Err(_) => continue,
                        };
                        // Likewise for content that is not valid UTF-8.
                        let text = match String::from_utf8(bytes) {
                            Ok(text) => text,
                            Err(_) => continue,
                        };
                        let document = doc!(
                            self.title_field => file.name.clone(),
                            self.body_field => text,
                            self.path_field => file.path.clone(),
                        );
                        // The guard is a temporary: it is released at the end
                        // of this statement and never lives across an await.
                        writer.lock().unwrap().add_document(document)?;
                    }
                }
            }

            Ok(())
        })
    }

    /// Sums the sizes, in bytes, of the regular files located directly inside
    /// the index directory. Subdirectories are not descended into, and a
    /// missing index directory counts as size zero.
    ///
    /// # Errors
    ///
    /// Returns an error if the directory or one of its entries cannot be read.
    fn get_index_size(&self) -> Result<u64> {
        let dir = std::path::Path::new(&self.index_path);
        if !dir.exists() {
            return Ok(0);
        }

        std::fs::read_dir(dir)?.try_fold(0u64, |total, entry| -> Result<u64> {
            let entry = entry?;
            let size = if entry.file_type()?.is_file() {
                entry.metadata()?.len()
            } else {
                0
            };
            Ok(total + size)
        })
    }
}

/// Represents statistics about the search index.
///
/// Produced by `SearchService::get_stats`.
#[derive(Debug, Clone, serde::Serialize)]
pub struct SearchStats {
    /// The total number of documents in the index, as reported by a searcher
    /// obtained at the time of the call.
    pub indexed_documents: u64,
    /// The total size of the index on disk in bytes: the sum of the sizes of
    /// the regular files directly inside the index directory.
    pub index_size: u64,
}

/// Represents detailed statistics about the index.
///
/// Produced by `SearchService::get_index_stats`.
#[derive(Debug, Clone, serde::Serialize)]
pub struct IndexStats {
    /// The total number of documents in the index, as reported by a searcher
    /// obtained at the time of the call.
    pub num_docs: u64,
    /// The path to the index directory on disk, as a (lossily converted) string.
    pub index_path: String,
}

#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Builds a `SearchService` whose index lives inside a fresh temporary
    /// directory. The `TempDir` is returned so it outlives the service.
    async fn create_test_search_service() -> (SearchService, TempDir) {
        let temp_dir = TempDir::new().unwrap();
        let search = crate::config::SearchConfig {
            enabled: true,
            index_dir: temp_dir.path().join("index"),
            max_results: 100,
            fuzzy_search: true,
            refresh_interval: 300,
        };
        let config = crate::config::Config {
            search,
            ..Default::default()
        };

        let service = SearchService::new(Arc::new(config)).await.unwrap();
        (service, temp_dir)
    }

    /// The `test.txt` fixture shared by the tests below.
    fn sample_text_file() -> crate::models::file::File {
        crate::models::file::File {
            name: "test.txt".to_string(),
            path: "test.txt".to_string(),
            size: 100,
            modified: chrono::Utc::now(),
            mime_type: "text/plain".to_string(),
            is_text: true,
        }
    }

    /// An indexed document is returned by a matching query.
    #[tokio::test]
    async fn test_index_and_search() {
        let (service, _temp_dir) = create_test_search_service().await;
        let file = sample_text_file();

        service
            .index_file(&file, "This is a test document with some content")
            .await
            .unwrap();

        // Reload the reader to ensure the index is up-to-date for the search
        service.reload_reader().unwrap();

        let hits = service.search("test", 10).await.unwrap();
        assert!(!hits.is_empty(), "Search results should not be empty");
        assert_eq!(hits[0].path, "test.txt");
    }

    /// A removed document no longer shows up in search results.
    #[tokio::test]
    async fn test_remove_file() {
        let (service, _temp_dir) = create_test_search_service().await;
        let file = sample_text_file();

        service.index_file(&file, "Test content").await.unwrap();
        service.reload_reader().unwrap();
        service.remove_file("test.txt").await.unwrap();
        service.reload_reader().unwrap();

        let hits = service.search("Test content", 10).await.unwrap();
        assert!(hits.is_empty());
    }
}