use anyhow::Result;
use std::future::Future;
use std::pin::Pin;
use std::sync::Arc;
use tantivy::{
self,
collector::TopDocs,
query::QueryParser,
schema::{Field, Schema, STORED, TEXT, STRING, Value},
Index, IndexReader, IndexWriter, ReloadPolicy,
doc,
};
use tracing::{debug, info};
use std::sync::Mutex;
use crate::{
config::Config,
models::file::File,
services::FileService,
models::search::SearchResult,
};
/// Full-text search service backed by a tantivy index on disk.
///
/// Holds the index plus cached field handles so documents can be added,
/// removed, and queried without re-resolving the schema on every call.
#[allow(dead_code)]
pub struct SearchService {
// The tantivy index itself (opened or created under `index_path`).
index: Index,
// Reader used for all searches; must be `reload()`ed to see new commits.
reader: IndexReader,
// Single shared writer; tantivy allows only one writer per index, so it
// is serialized behind a Mutex and shared via Arc.
writer: Arc<Mutex<IndexWriter>>,
schema: Schema,
// Stored file name, searchable.
title_field: Field,
// File contents, searchable (used for snippets in `search`).
body_field: Field,
// Stored file path; also the key used by `remove_file`'s delete_term.
path_field: Field,
// Alias of `body_field` (set in `new`); kept for compatibility — unused.
content_field: Field,
// String form of the on-disk index directory, used by `get_index_size`.
index_path: String,
// Parses user queries against title, path, and body fields.
query_parser: QueryParser,
}
impl SearchService {
/// Builds the schema and opens (or creates) the on-disk tantivy index,
/// returning a ready-to-use service.
///
/// # Errors
/// Fails if the index directory cannot be created/opened or the writer
/// or reader cannot be constructed.
pub async fn new(config: Arc<Config>) -> Result<Self> {
    // Borrow the configured path directly; the previous clone was
    // immediately shadowed and served no purpose.
    let index_path = &config.search.index_dir;
    info!("Initializing search service with index path: {:?}", index_path);

    let mut schema_builder = Schema::builder();
    let title_field = schema_builder.add_text_field("title", TEXT | STORED);
    // Body must be STORED for `search` to build snippets: without STORED,
    // `get_first(body_field)` on retrieved documents always returns None.
    let body_field = schema_builder.add_text_field("body", TEXT | STORED);
    let path_field = schema_builder.add_text_field("path", STRING | STORED);
    let schema = schema_builder.build();

    let index = if index_path.exists() {
        // NOTE(review): an existing index keeps its on-disk schema; the
        // field handles above assume the same field order — documents
        // indexed before this change will still lack a stored body.
        Index::open_in_dir(index_path)?
    } else {
        std::fs::create_dir_all(index_path)?;
        Index::create_in_dir(index_path, schema.clone())?
    };

    // 50 MB heap budget for the writer's in-memory segment.
    let writer = index.writer(50_000_000)?;
    let reader = index
        .reader_builder()
        .reload_policy(ReloadPolicy::OnCommitWithDelay)
        .try_into()?;
    let query_parser = QueryParser::for_index(&index, vec![title_field, path_field, body_field]);

    Ok(Self {
        index,
        reader,
        writer: Arc::new(Mutex::new(writer)),
        schema,
        title_field,
        body_field,
        path_field,
        content_field: body_field,
        index_path: index_path.to_string_lossy().to_string(),
        query_parser,
    })
}
pub fn reload_reader(&self) -> Result<()> {
self.reader.reload().map_err(anyhow::Error::from)
}
/// Adds one file's metadata and text content to the index and commits
/// immediately, so it becomes searchable after the reader reloads.
pub async fn index_file(&self, file: &File, content: &str) -> Result<()> {
    debug!("Indexing file: {}", file.path);
    let document = doc!(
        self.title_field => file.name.clone(),
        self.body_field => content,
        self.path_field => file.path.clone(),
    );
    // Scope the lock so the guard is released as soon as the commit lands.
    {
        let mut guard = self.writer.lock().unwrap();
        guard.add_document(document)?;
        guard.commit()?;
    }
    debug!("File indexed successfully: {}", file.path);
    Ok(())
}
/// Deletes every document whose `path` field matches `path`, then commits
/// so the removal becomes visible after the reader reloads.
pub async fn remove_file(&self, path: &str) -> Result<()> {
    debug!("Removing file from index: {}", path);
    let path_term = tantivy::Term::from_field_text(self.path_field, path);
    {
        let mut guard = self.writer.lock().unwrap();
        guard.delete_term(path_term);
        guard.commit()?;
    }
    debug!("File removed from index: {}", path);
    Ok(())
}
/// Runs a keyword query against title/path/body and returns up to `limit`
/// scored results with a short snippet of the stored body (empty if the
/// body was not stored for that document).
///
/// # Errors
/// Fails if the query cannot be parsed or the search itself errors.
pub async fn search(&self, query: &str, limit: usize) -> Result<Vec<SearchResult>> {
    debug!("Searching with query: {} (limit: {})", query, limit);
    let searcher = self.reader.searcher();
    let query = self.query_parser.parse_query(query)?;
    let top_docs = searcher.search(&query, &TopDocs::with_limit(limit))?;
    let mut results = Vec::with_capacity(top_docs.len());
    for (score, doc_address) in top_docs {
        let retrieved_doc = searcher.doc::<tantivy::TantivyDocument>(doc_address)?;
        let path = retrieved_doc
            .get_first(self.path_field)
            .and_then(|v| v.as_str())
            .unwrap_or("")
            .to_string();
        let snippet = retrieved_doc
            .get_first(self.body_field)
            .and_then(|v| v.as_str())
            .map(|text| {
                if text.len() > 200 {
                    // `len()` is in bytes: floor the cut point to a UTF-8
                    // char boundary, otherwise slicing panics on
                    // multi-byte characters straddling byte 200.
                    let mut end = 200;
                    while !text.is_char_boundary(end) {
                        end -= 1;
                    }
                    format!("{}...", &text[..end])
                } else {
                    text.to_string()
                }
            })
            .unwrap_or_default();
        results.push(SearchResult {
            path,
            score,
            snippet,
        });
    }
    debug!("Search completed, found {} results", results.len());
    Ok(results)
}
/// Placeholder for semantic (embedding-based) search: currently just
/// delegates to keyword `search`, so results are identical to `search`.
pub async fn semantic_search(&self, query: &str, limit: usize) -> Result<Vec<SearchResult>> {
self.search(query, limit).await
}
/// Wipes the index and re-indexes every text file reachable from the
/// file service's root, committing once at the end.
///
/// # Errors
/// Fails if clearing the index, walking the directory tree, or the final
/// commit fails.
pub async fn rebuild_index(&self, file_service: &FileService) -> Result<()> {
    info!("Rebuilding search index");
    // Clear the index inside a scope so the guard is dropped before we
    // recurse: `index_directory_recursive` locks the same non-reentrant
    // std::sync::Mutex, so holding the guard across that call deadlocks.
    // Dropping it here also avoids holding a std mutex guard across an
    // `.await`.
    {
        let mut writer = self.writer.lock().unwrap();
        writer.delete_all_documents()?;
    }
    self.index_directory_recursive(file_service, "", self.writer.clone())
        .await?;
    // Re-acquire briefly to commit both the wipe and the fresh documents.
    self.writer.lock().unwrap().commit()?;
    info!("Search index rebuilt successfully");
    Ok(())
}
/// Reports how many documents are searchable and how many bytes the
/// on-disk index occupies.
pub async fn get_stats(&self) -> Result<SearchStats> {
    let searcher = self.reader.searcher();
    Ok(SearchStats {
        indexed_documents: searcher.num_docs() as u64,
        index_size: self.get_index_size()?,
    })
}
/// Reports the searchable document count together with the index
/// directory path.
pub async fn get_index_stats(&self) -> Result<IndexStats> {
    let doc_count = self.reader.searcher().num_docs() as u64;
    Ok(IndexStats {
        num_docs: doc_count,
        index_path: self.index_path.clone(),
    })
}
// Recursively walks `path` via the file service and stages every readable
// UTF-8 text file into the shared writer WITHOUT committing — callers
// (e.g. `rebuild_index`) commit once at the end.
//
// Returns a boxed future because async fns cannot recurse directly; the
// 'a lifetime ties the future to the borrowed &self and &file_service.
// The writer is passed as Arc<Mutex<_>> and locked only around each
// add_document, never across an .await.
fn index_directory_recursive<'a>(
&'a self,
file_service: &'a FileService,
path: &'a str,
writer: Arc<Mutex<IndexWriter>>,
) -> Pin<Box<dyn Future<Output = Result<()>> + Send + 'a>> {
Box::pin(async move {
let directory = file_service.list_directory(path).await?;
for entry in &directory.entries {
match entry {
crate::services::file::DirectoryEntry::File(file) => {
if file.is_text {
// Unreadable or non-UTF-8 files are silently skipped
// (best-effort indexing, not an error).
if let Ok(content_bytes) = file_service.read_file(&file.path).await {
if let Ok(content) = String::from_utf8(content_bytes) {
let writer_guard = writer.lock().unwrap();
let doc = doc!(
self.title_field => file.name.clone(),
self.body_field => content,
self.path_field => file.path.clone(),
);
writer_guard.add_document(doc)?;
}
}
}
}
crate::services::file::DirectoryEntry::Directory(dir) => {
// Recurse into subdirectories, sharing the same writer.
self.index_directory_recursive(file_service, &dir.path, writer.clone()).await?;
}
}
}
Ok(())
})
}
/// Sums the sizes of the regular files directly inside the index
/// directory (tantivy keeps its files flat), returning 0 if the
/// directory does not exist.
fn get_index_size(&self) -> Result<u64> {
    let dir = std::path::Path::new(&self.index_path);
    if !dir.exists() {
        return Ok(0);
    }
    let mut total_size = 0u64;
    for entry in std::fs::read_dir(dir)? {
        let entry = entry?;
        if entry.file_type()?.is_file() {
            total_size += entry.metadata()?.len();
        }
    }
    Ok(total_size)
}
}
/// Summary returned by `SearchService::get_stats`.
#[derive(Debug, Clone, serde::Serialize)]
pub struct SearchStats {
// Number of committed, searchable documents.
pub indexed_documents: u64,
// Total size in bytes of the files in the index directory.
pub index_size: u64,
}
/// Summary returned by `SearchService::get_index_stats`.
#[derive(Debug, Clone, serde::Serialize)]
pub struct IndexStats {
// Number of committed, searchable documents.
pub num_docs: u64,
// Filesystem path of the index directory.
pub index_path: String,
}
#[cfg(test)]
mod tests {
use super::*;
use tempfile::TempDir;
// Builds a SearchService over a throwaway temp directory. The TempDir is
// returned alongside the service so it stays alive (and the index dir
// valid) for the duration of the test.
async fn create_test_search_service() -> (SearchService, TempDir) {
let temp_dir = TempDir::new().unwrap();
let config = Arc::new(crate::config::Config {
search: crate::config::SearchConfig {
enabled: true,
index_dir: temp_dir.path().join("index"),
max_results: 100,
fuzzy_search: true,
refresh_interval: 300,
},
..Default::default()
});
let service = SearchService::new(config).await.unwrap();
(service, temp_dir)
}
// Indexing a file then searching for a word in it should surface that file.
#[tokio::test]
async fn test_index_and_search() {
let (service, _temp_dir) = create_test_search_service().await;
let file = crate::models::file::File {
name: "test.txt".to_string(),
path: "test.txt".to_string(),
size: 100,
modified: chrono::Utc::now(),
mime_type: "text/plain".to_string(),
is_text: true,
};
service.index_file(&file, "This is a test document with some content").await.unwrap();
// Reader reload is explicit because the reload policy is commit+delay.
service.reload_reader().unwrap();
let results = service.search("test", 10).await.unwrap();
assert!(!results.is_empty(), "Search results should not be empty");
assert_eq!(results[0].path, "test.txt");
}
// Removing an indexed file should make it disappear from search results.
#[tokio::test]
async fn test_remove_file() {
let (service, _temp_dir) = create_test_search_service().await;
let file = crate::models::file::File {
name: "test.txt".to_string(),
path: "test.txt".to_string(),
size: 100,
modified: chrono::Utc::now(),
mime_type: "text/plain".to_string(),
is_text: true,
};
service.index_file(&file, "Test content").await.unwrap();
service.reload_reader().unwrap();
service.remove_file("test.txt").await.unwrap();
// Reload again so the delete commit is visible to the searcher.
service.reload_reader().unwrap();
let results = service.search("Test content", 10).await.unwrap();
assert!(results.is_empty());
}
}