use serde::{Deserialize, Serialize};
use std::path::PathBuf;
use std::sync::Arc;
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::{Index, ReloadPolicy, TantivyDocument, doc};
use tracing::instrument;
use turbovault_core::prelude::*;
use turbovault_parser::to_plain_text;
use turbovault_vault::VaultManager;
/// A single search hit, enriched with metadata gathered from the vault.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResultInfo {
/// Path of the matched note (as stored in the index).
pub path: String,
/// Display title: frontmatter `title` if present, else the file stem.
pub title: String,
/// First line of the note's plain text, truncated to 200 chars.
pub preview: String,
/// Relevance score, sigmoid-normalized into the (0, 1) range.
pub score: f64,
/// Short excerpt of the note around the first query match.
pub snippet: String,
/// Tags from the indexed tags field (whitespace-split).
pub tags: Vec<String>,
/// Targets of the note's outgoing links.
pub outgoing_links: Vec<String>,
/// Number of notes linking to this one, per the link graph.
pub backlink_count: usize,
/// Whitespace-separated word count of the plain text.
pub word_count: usize,
/// Character (not byte) count of the plain text.
pub char_count: usize,
}
/// Post-retrieval filters applied to raw index hits.
#[derive(Debug, Clone, Default)]
pub struct SearchFilter {
/// Keep only notes carrying at least one of these tags.
pub tags: Option<Vec<String>>,
/// Keep only notes whose frontmatter value for each key contains the
/// paired string (substring match on the serialized value).
pub frontmatter_filters: Option<Vec<(String, String)>>,
/// NOTE(review): set by `with_backlinks_from` but never consulted in
/// `build_results` — currently has no effect; confirm intended.
pub backlinks_from: Option<Vec<String>>,
/// Drop notes whose path ends with any of these strings (suffix match).
pub exclude_paths: Option<Vec<String>>,
}
/// Builder for a search request: query text, filters, and a result cap.
pub struct SearchQuery {
// Raw query string handed to tantivy's QueryParser.
query: String,
// Filters applied after retrieval; see `SearchFilter`.
filter: SearchFilter,
// Maximum number of results to return (default 10).
limit: usize,
}
impl SearchQuery {
pub fn new(query: impl Into<String>) -> Self {
Self {
query: query.into(),
filter: SearchFilter::default(),
limit: 10,
}
}
pub fn with_tags(mut self, tags: Vec<String>) -> Self {
self.filter.tags = Some(tags);
self
}
pub fn with_frontmatter(mut self, key: String, value: String) -> Self {
self.filter
.frontmatter_filters
.get_or_insert_with(Vec::new)
.push((key, value));
self
}
pub fn with_backlinks_from(mut self, paths: Vec<String>) -> Self {
self.filter.backlinks_from = Some(paths);
self
}
pub fn exclude(mut self, paths: Vec<String>) -> Self {
self.filter.exclude_paths = Some(paths);
self
}
pub fn limit(mut self, limit: usize) -> Self {
self.limit = limit;
self
}
pub fn build(self) -> (String, SearchFilter, usize) {
(self.query, self.filter, self.limit)
}
}
/// Full-text search over a vault, backed by an in-memory Tantivy index.
pub struct SearchEngine {
/// Vault access used for scanning, parsing files, and the link graph.
pub manager: Arc<VaultManager>,
// In-RAM index built once in `SearchEngine::new`; never updated after.
index: Index,
// Schema field handles cached at construction time.
field_path: Field,
field_title: Field,
field_content: Field,
field_tags: Field,
}
impl SearchEngine {
/// Builds the engine by indexing every markdown file in the vault.
///
/// Scans the vault via `manager`, parses each `.md` file
/// (case-insensitive extension check), and indexes its path, title
/// (frontmatter `title` or the file stem), plain-text content, and
/// space-joined frontmatter tags. The index lives in RAM and is built
/// once here; it is not refreshed when the vault changes afterwards.
///
/// # Errors
/// Fails if the vault scan fails, or if the index writer cannot be
/// created or committed. Individual files that fail to parse are
/// skipped, not treated as errors.
pub async fn new(manager: Arc<VaultManager>) -> Result<Self> {
// Schema: path/title/tags are STORED so they can be read back from
// hits; content is indexed for matching only.
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("path", TEXT | STORED);
schema_builder.add_text_field("title", TEXT | STORED);
schema_builder.add_text_field("content", TEXT);
schema_builder.add_text_field("tags", TEXT | STORED);
let schema = schema_builder.build();
// These lookups cannot fail: the fields were added just above.
let field_path = schema
.get_field("path")
.expect("schema built with 'path' field");
let field_title = schema
.get_field("title")
.expect("schema built with 'title' field");
let field_content = schema
.get_field("content")
.expect("schema built with 'content' field");
let field_tags = schema
.get_field("tags")
.expect("schema built with 'tags' field");
let index = Index::create_in_ram(schema.clone());
// 50_000_000 is the writer's heap budget in bytes (~50 MB).
let mut index_writer = index
.writer(50_000_000)
.map_err(|e| Error::config_error(format!("Failed to create index writer: {}", e)))?;
let files = manager.scan_vault().await?;
for file_path in files {
let path_str = file_path.to_string_lossy();
let path_lower = path_str.to_lowercase();
// Only markdown files are indexed.
if !path_lower.ends_with(".md") {
continue;
}
match manager.parse_file(&file_path).await {
Ok(vault_file) => {
let path_str = file_path.to_string_lossy().to_string();
// Title: frontmatter `title` if present, else the file stem
// (empty string if the stem is not valid UTF-8).
let title = vault_file
.frontmatter
.as_ref()
.and_then(|fm| fm.data.get("title"))
.and_then(|v| v.as_str())
.unwrap_or_else(|| {
file_path
.file_stem()
.unwrap_or_default()
.to_str()
.unwrap_or("")
})
.to_string();
// Tags are flattened into one space-separated string.
let tags_str = vault_file
.frontmatter
.as_ref()
.map(|fm| fm.tags().join(" "))
.unwrap_or_default();
let plain_content = to_plain_text(&vault_file.content);
// NOTE(review): add_document errors are silently dropped here;
// consider logging the failure.
let _ = index_writer.add_document(doc!(
field_path => path_str.clone(),
field_title => title,
field_content => plain_content,
field_tags => tags_str,
));
}
Err(_e) => {
// Unparsable files are skipped silently; consider logging.
}
}
}
index_writer
.commit()
.map_err(|e| Error::config_error(format!("Failed to commit index: {}", e)))?;
Ok(Self {
manager,
index,
field_path,
field_title,
field_content,
field_tags,
})
}
/// Runs a plain text search with the default limit of 10 results.
#[instrument(skip(self), fields(query = query), name = "search_query")]
pub async fn search(&self, query: &str) -> Result<Vec<SearchResultInfo>> {
SearchQuery::new(query).limit(10).build_results(self).await
}
/// Runs a fully configured `SearchQuery` (filters, custom limit).
#[instrument(skip(self, query), name = "search_advanced")]
pub async fn advanced_search(&self, query: SearchQuery) -> Result<Vec<SearchResultInfo>> {
query.build_results(self).await
}
/// Finds notes carrying at least one of `tags`, using a wildcard query
/// and a generous limit of 100.
pub async fn search_by_tags(&self, tags: Vec<String>) -> Result<Vec<SearchResultInfo>> {
SearchQuery::new("*")
.with_tags(tags)
.limit(100)
.build_results(self)
.await
}
/// Finds notes whose frontmatter `key` contains `value` (substring
/// match), using a wildcard query and a limit of 100.
pub async fn search_by_frontmatter(
&self,
key: &str,
value: &str,
) -> Result<Vec<SearchResultInfo>> {
SearchQuery::new("*")
.with_frontmatter(key.to_string(), value.to_string())
.limit(100)
.build_results(self)
.await
}
/// Finds notes related to the note at `path` by extracting keywords from
/// its plain text and searching for them, excluding the note itself.
/// Results are sorted by normalized score, highest first.
///
/// # Errors
/// Fails if the source note cannot be parsed or the search itself fails.
#[instrument(skip(self), fields(path = path, limit = limit), name = "search_find_related")]
pub async fn find_related(&self, path: &str, limit: usize) -> Result<Vec<SearchResultInfo>> {
let vault_file = self.manager.parse_file(&PathBuf::from(path)).await?;
let plain_content = to_plain_text(&vault_file.content);
let keywords = extract_keywords(&plain_content);
let query = keywords.join(" ");
let mut results = SearchQuery::new(query)
.exclude(vec![path.to_string()])
.limit(limit)
.build_results(self)
.await?;
// total_cmp gives a total order even for NaN; descending by score.
results.sort_by(|a, b| b.score.total_cmp(&a.score));
Ok(results)
}
/// Convenience wrapper: top five related notes for `path`.
pub async fn recommend_related(&self, path: &str) -> Result<Vec<SearchResultInfo>> {
self.find_related(path, 5).await
}
}
impl SearchQuery {
/// Executes this query against `engine`'s index and assembles results.
///
/// Pipeline: parse the query (fuzzy matching over title/content/tags),
/// fetch up to `limit * 2` candidates from Tantivy, apply the tag,
/// exclude-path, and frontmatter filters, then re-parse each surviving
/// file to build previews, snippets, and link statistics. Stops once
/// `limit` results are collected.
///
/// NOTE(review): `filter.backlinks_from` is never consulted here, so
/// `with_backlinks_from` currently has no effect — confirm intended.
///
/// # Errors
/// Fails if the index reader cannot be created, the query does not
/// parse, or a matched document cannot be retrieved from the index.
async fn build_results(self, engine: &SearchEngine) -> Result<Vec<SearchResultInfo>> {
let (query_str, filter, limit) = self.build();
// Manual reload policy: the index never changes after construction.
let reader = engine
.index
.reader_builder()
.reload_policy(ReloadPolicy::Manual)
.try_into()
.map_err(|e| Error::config_error(format!("Failed to create reader: {}", e)))?;
let searcher = reader.searcher();
// Hold a read lock on the link graph for backlink counts below.
let graph = engine.manager.link_graph();
let graph_read = graph.read().await;
let mut query_parser = QueryParser::for_index(
&engine.index,
vec![engine.field_title, engine.field_content, engine.field_tags],
);
// Fuzzy matching (max edit distance 1) on every searched field. The
// boolean flags follow tantivy's `set_field_fuzzy` signature — verify
// their meaning against the tantivy version in use.
query_parser.set_field_fuzzy(
engine.field_title,
true, 1, false, );
query_parser.set_field_fuzzy(engine.field_content, true, 1, false);
query_parser.set_field_fuzzy(engine.field_tags, true, 1, false);
let query = query_parser
.parse_query(&query_str)
.map_err(|e| Error::config_error(format!("Failed to parse query: {}", e)))?;
// Over-fetch (2x limit) so post-filtering can still fill `limit`
// slots; aggressive filters may nevertheless yield fewer results.
let top_docs = searcher
.search(&query, &TopDocs::with_limit(limit * 2)) .map_err(|e| Error::config_error(format!("Search failed: {}", e)))?;
let mut results = Vec::new();
for (score, doc_address) in top_docs {
let tantivy_doc: TantivyDocument = searcher
.doc(doc_address)
.map_err(|e| Error::config_error(format!("Failed to retrieve doc: {}", e)))?;
// Read the STORED fields back out of the hit.
let path = tantivy_doc
.get_first(engine.field_path)
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
let title = tantivy_doc
.get_first(engine.field_title)
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
let tags_str = tantivy_doc
.get_first(engine.field_tags)
.and_then(|v| v.as_str())
.unwrap_or("")
.to_string();
let file_tags: Vec<String> =
tags_str.split_whitespace().map(|s| s.to_string()).collect();
// Tag filter: keep docs sharing at least one requested tag.
if let Some(tags) = &filter.tags
&& !file_tags.iter().any(|t| tags.contains(t))
{
continue;
}
// Exclusion is a path *suffix* match.
if let Some(exclude) = &filter.exclude_paths
&& exclude.iter().any(|p| path.ends_with(p))
{
continue;
}
// Frontmatter filter: re-parse the file and require every (key,
// value) pair; the value is a substring match on the serialized
// frontmatter value.
if let Some(fm_filters) = &filter.frontmatter_filters {
let file_path = PathBuf::from(&path);
if let Ok(vault_file) = engine.manager.parse_file(&file_path).await {
let mut matches_all = true;
if let Some(fm) = &vault_file.frontmatter {
for (key, value) in fm_filters {
if let Some(fm_value) = fm.data.get(key) {
let fm_str = fm_value.to_string();
if !fm_str.contains(value) {
matches_all = false;
break;
}
} else {
// Required key absent: reject.
matches_all = false;
break;
}
}
} else {
// No frontmatter at all: reject.
matches_all = false;
}
if !matches_all {
continue;
}
} else {
// Unparsable files are dropped when frontmatter filtering
// is requested.
continue;
}
}
// Re-parse the file to enrich the result; hits whose file no
// longer parses are silently dropped.
let file_path = PathBuf::from(&path);
if let Ok(vault_file) = engine.manager.parse_file(&file_path).await {
let plain_content = to_plain_text(&vault_file.content);
// Preview: first line, capped at 200 chars.
let preview = plain_content
.lines()
.next()
.unwrap_or("")
.chars()
.take(200)
.collect::<String>();
let snippet = extract_snippet(&plain_content, &query_str);
let backlink_count = graph_read.backlinks(&file_path).unwrap_or_default().len();
let word_count = plain_content.split_whitespace().count();
let char_count = plain_content.chars().count();
let outgoing_links: Vec<String> =
vault_file.links.iter().map(|l| l.target.clone()).collect();
let score_f64 = score as f64;
// Logistic (sigmoid) squash of the raw score into (0, 1); the
// clamp is a belt-and-braces guard.
let normalized_score = (1.0 / (1.0 + (-score_f64 / 2.0).exp())).clamp(0.0, 1.0);
results.push(SearchResultInfo {
path,
title,
preview,
score: normalized_score,
snippet,
tags: file_tags,
outgoing_links,
backlink_count,
word_count,
char_count,
});
}
// Stop as soon as the requested number of results is collected.
if results.len() >= limit {
break;
}
}
Ok(results)
}
}
/// Pulls up to ten candidate keywords from `content`.
///
/// A keyword is any whitespace-separated token longer than three bytes
/// that is not a stopword; tokens are lowercased and kept in order of
/// appearance (duplicates are not removed).
fn extract_keywords(content: &str) -> Vec<String> {
    let mut keywords = Vec::with_capacity(10);
    for token in content.split_whitespace() {
        if token.len() <= 3 || is_stopword(token) {
            continue;
        }
        keywords.push(token.to_lowercase());
        if keywords.len() == 10 {
            break;
        }
    }
    keywords
}
/// Reports whether `word` is a common English stopword (compared
/// case-insensitively) that should be excluded from keyword extraction.
pub(crate) fn is_stopword(word: &str) -> bool {
    const STOPWORDS: &[&str] = &[
        "the", "a", "an", "and", "or", "but", "in", "on", "at", "to", "for",
        "of", "with", "from", "by", "about", "is", "are", "was", "were", "be",
        "been", "being", "have", "has", "had", "do", "does", "did", "will",
        "would", "could", "should", "may", "might", "must", "can",
    ];
    let lowered = word.to_lowercase();
    STOPWORDS.contains(&lowered.as_str())
}
/// Extracts a short display snippet from `content`, centred on the first
/// case-insensitive occurrence of `query`.
///
/// Falls back to the first line of `content` when the query is empty, is
/// the `*` wildcard, or does not occur. Matches are padded with up to 50
/// bytes of context on each side (snapped to char boundaries) and wrapped
/// in ellipses.
fn extract_snippet(content: &str, query: &str) -> String {
    if query.is_empty() || query == "*" {
        return content.lines().next().unwrap_or("").to_string();
    }
    let query_lower = query.to_lowercase();
    // `to_lowercase()` can change a string's byte length (e.g. 'İ' lowers
    // to "i\u{307}"), so an offset found in the lowered text is not a valid
    // offset into the original. Build the lowered text together with a
    // byte-offset map back to `content` and translate positions through it.
    let mut lowered = String::with_capacity(content.len());
    let mut to_orig = Vec::with_capacity(content.len() + 1);
    for (orig_idx, ch) in content.char_indices() {
        for lc in ch.to_lowercase() {
            lowered.push(lc);
            // Every lowered byte maps back to the originating char's start.
            while to_orig.len() < lowered.len() {
                to_orig.push(orig_idx);
            }
        }
    }
    // Sentinel so a match ending at the very end maps to content.len().
    to_orig.push(content.len());
    if let Some(pos) = lowered.find(&query_lower) {
        let match_start = to_orig[pos];
        let match_end = to_orig[pos + query_lower.len()];
        // Pad ~50 bytes of context each side, snapping to char boundaries
        // so the slice below cannot split a multi-byte character.
        let mut start = match_start.saturating_sub(50);
        while start > 0 && !content.is_char_boundary(start) {
            start -= 1;
        }
        let mut end = (match_end + 50).min(content.len());
        while end < content.len() && !content.is_char_boundary(end) {
            end += 1;
        }
        format!("...{}...", content[start..end].trim())
    } else {
        content.lines().next().unwrap_or("").to_string()
    }
}
#[cfg(test)]
mod tests {
use super::*;
// Content words survive extraction; stopwords and short words do not.
#[test]
fn test_extract_keywords() {
let content = "The quick brown fox jumps over the lazy dog";
let keywords = extract_keywords(content);
assert!(!keywords.is_empty());
assert!(keywords.iter().any(|k| k == "quick" || k == "brown"));
}
// Basic stopword classification.
#[test]
fn test_is_stopword() {
assert!(is_stopword("the"));
assert!(is_stopword("and"));
assert!(!is_stopword("rust"));
}
// A snippet around a present term must contain that term.
#[test]
fn test_extract_snippet() {
let content = "The quick brown fox jumps over the lazy dog";
let snippet = extract_snippet(content, "fox");
assert!(snippet.contains("fox"));
}
// Missing term falls back to the first line (which lacks the term).
#[test]
fn test_extract_snippet_no_match() {
let content = "The quick brown fox";
let snippet = extract_snippet(content, "xyz");
assert!(!snippet.contains("xyz"));
}
// Wildcard query returns the first line only.
#[test]
fn test_extract_snippet_wildcard() {
let content = "First line\nSecond line";
let snippet = extract_snippet(content, "*");
assert!(snippet.contains("First"));
}
// Tokens of length <= 3 bytes are filtered out.
#[test]
fn test_extract_keywords_filters_short_words() {
let content = "a b c defgh ijklmn";
let keywords = extract_keywords(content);
assert!(!keywords.iter().any(|k| k.len() <= 3));
}
// Documents the case-insensitive ".md" extension check used when
// building the index in SearchEngine::new.
#[test]
fn test_file_path_extension_check() {
let paths = vec![
"/vault/index.md",
"/vault/test.MD",
"/vault/readme.txt",
"/vault/file.md.bak",
"relative/path/note.md",
];
for path_str in paths {
let ends_with_md = path_str.to_lowercase().ends_with(".md");
eprintln!("[TEST] Path: {}, ends_with .md: {}", path_str, ends_with_md);
}
assert!("/vault/index.md".ends_with(".md"));
assert!("/vault/test.md".ends_with(".md"));
assert!(!"/vault/readme.txt".ends_with(".md"));
assert!(!"/vault/file.md.bak".ends_with(".md"));
assert!("relative/path/note.md".ends_with(".md"));
}
// Spot-checks both sides of the stopword classifier.
#[test]
fn test_stopword_filtering_comprehensive() {
let stopwords = vec!["the", "and", "or", "is", "are"];
let content_words = vec!["testing", "capabilities", "search", "index"];
for word in stopwords {
assert!(is_stopword(word), "Should recognize '{}' as stopword", word);
}
for word in content_words {
assert!(
!is_stopword(word),
"Should NOT recognize '{}' as stopword",
word
);
}
}
// Empty content, content shorter than the context window, and repeated
// matches must all produce sensible snippets without panicking.
#[test]
fn test_snippet_extraction_edge_cases() {
let snippet = extract_snippet("", "search");
assert!(snippet.is_empty() || !snippet.contains("search"));
let short = "short";
let snippet = extract_snippet(short, "short");
assert!(snippet.contains("short"));
let multi = "test test test another test";
let snippet = extract_snippet(multi, "test");
assert!(snippet.contains("test"));
}
// Smoke test: configuring fuzzy matching on a QueryParser must not panic.
#[test]
fn test_fuzzy_search_query_building() {
use tantivy::schema::*;
let mut schema_builder = Schema::builder();
schema_builder.add_text_field("title", TEXT);
schema_builder.add_text_field("content", TEXT);
let schema = schema_builder.build();
let mut query_parser = tantivy::query::QueryParser::for_index(
&tantivy::Index::create_in_ram(schema.clone()),
vec![schema.get_field("title").unwrap()],
);
query_parser.set_field_fuzzy(
schema.get_field("title").unwrap(),
true, 1, false, );
eprintln!("[TEST] QueryParser configured successfully with fuzzy search");
}
// The sigmoid used in build_results must map any raw score into [0, 1].
#[test]
fn test_score_normalization_bounds() {
let scores: Vec<f64> = vec![-10.0, -1.0, 0.0, 1.0, 5.0, 10.0, 100.0];
for raw_score in scores {
let normalized: f64 = (1.0 / (1.0 + (-raw_score / 2.0).exp())).clamp(0.0, 1.0);
assert!(
(0.0..=1.0).contains(&normalized),
"Score {} normalized to {}, should be 0.0-1.0",
raw_score,
normalized
);
eprintln!("[SCORE] Raw: {}, Normalized: {}", raw_score, normalized);
}
}
// Table-driven check of the indexing filter: only paths ending in ".md"
// (any case) are accepted.
#[test]
fn test_file_filtering_logic() {
let test_paths = vec![
("index.md", true),
("test.MD", true), ("README.txt", false),
(".md", true),
("file.md.backup", false),
];
eprintln!("\n[INTEGRATION TEST] File filtering logic (case-insensitive):");
for (path, should_index) in test_paths {
let path_str = path.to_string();
let passes_filter = path_str.to_lowercase().ends_with(".md");
eprintln!(
"[CHECK] Path: {}, ends_with .md (case-insensitive): {}, expected: {}",
path, passes_filter, should_index
);
if should_index {
assert!(
passes_filter,
"Path {} should pass filter (case-insensitive)",
path
);
} else {
assert!(
!passes_filter,
"Path {} should NOT pass filter (case-insensitive)",
path
);
}
}
}
}