use serde::{Deserialize, Serialize};
use std::path::PathBuf;
use std::sync::Arc;
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::{Index, ReloadPolicy, TantivyDocument, doc};
use tracing::instrument;
use turbovault_core::prelude::*;
use turbovault_parser::to_plain_text;
use turbovault_vault::VaultManager;

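/// A single search hit: a normalized ranking score plus note metadata
/// (tags, links, and size statistics) for display or downstream filtering.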
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResultInfo {
    pub path: String,
    pub title: String,
    pub preview: String,
    pub score: f64,
    pub snippet: String,
    pub tags: Vec<String>,
    pub outgoing_links: Vec<String>,
    pub backlink_count: usize,
    pub word_count: usize,
    pub char_count: usize,
}

#[derive(Debug, Clone, Default)]
pub struct SearchFilter {
    pub tags: Option<Vec<String>>,
    pub frontmatter_filters: Option<Vec<(String, String)>>,
    pub backlinks_from: Option<Vec<String>>,
    pub exclude_paths: Option<Vec<String>>,
}

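/// Builder for composing a full-text query with filters and a result limit
/// (the limit defaults to 10). A minimal usage sketch (the query string,
/// tag, and path are placeholders):
///
/// ```ignore
/// let query = SearchQuery::new("graph theory")
///     .with_tags(vec!["math".to_string()])
///     .exclude(vec!["archive/old-notes.md".to_string()])
///     .limit(20);
/// let results = engine.advanced_search(query).await?;
/// ```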
pub struct SearchQuery {
    query: String,
    filter: SearchFilter,
    limit: usize,
}

impl SearchQuery {
    pub fn new(query: impl Into<String>) -> Self {
        Self {
            query: query.into(),
            filter: SearchFilter::default(),
            limit: 10,
        }
    }

    pub fn with_tags(mut self, tags: Vec<String>) -> Self {
        self.filter.tags = Some(tags);
        self
    }

    pub fn with_frontmatter(mut self, key: String, value: String) -> Self {
        self.filter
            .frontmatter_filters
            .get_or_insert_with(Vec::new)
            .push((key, value));
        self
    }

    pub fn with_backlinks_from(mut self, paths: Vec<String>) -> Self {
        self.filter.backlinks_from = Some(paths);
        self
    }

    pub fn exclude(mut self, paths: Vec<String>) -> Self {
        self.filter.exclude_paths = Some(paths);
        self
    }

    pub fn limit(mut self, limit: usize) -> Self {
        self.limit = limit;
        self
    }

    pub fn build(self) -> (String, SearchFilter, usize) {
        (self.query, self.filter, self.limit)
    }
}

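/// In-memory Tantivy search index over a vault's Markdown files.
///
/// The index is built once at construction time by scanning the vault; it is
/// not updated incrementally. A construction sketch (assumes an
/// already-configured `VaultManager`):
///
/// ```ignore
/// let manager = Arc::new(vault_manager);
/// let engine = SearchEngine::new(manager).await?;
/// let hits = engine.search("project roadmap").await?;
/// ```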
pub struct SearchEngine {
    pub manager: Arc<VaultManager>,
    index: Index,
    schema: Schema,
}

impl SearchEngine {
    pub async fn new(manager: Arc<VaultManager>) -> Result<Self> {
        let mut schema_builder = Schema::builder();
        schema_builder.add_text_field("path", TEXT | STORED);
        schema_builder.add_text_field("title", TEXT | STORED);
        schema_builder.add_text_field("content", TEXT);
        schema_builder.add_text_field("tags", TEXT | STORED);
        let schema = schema_builder.build();

        let index = Index::create_in_ram(schema.clone());

        let mut index_writer = index
            .writer(50_000_000)
            .map_err(|e| Error::config_error(format!("Failed to create index writer: {}", e)))?;

        let files = manager.scan_vault().await?;

        for file_path in files {
            // Index only Markdown files (case-insensitive extension check).
            let path_str = file_path.to_string_lossy();
            let path_lower = path_str.to_lowercase();
            if !path_lower.ends_with(".md") {
                continue;
            }

            match manager.parse_file(&file_path).await {
                Ok(vault_file) => {
                    let path_str = file_path.to_string_lossy().to_string();

                    // Prefer an explicit frontmatter `title`; fall back to the file stem.
                    let title = vault_file
                        .frontmatter
                        .as_ref()
                        .and_then(|fm| fm.data.get("title"))
                        .and_then(|v| v.as_str())
                        .unwrap_or_else(|| {
                            file_path
                                .file_stem()
                                .unwrap_or_default()
                                .to_str()
                                .unwrap_or("")
                        })
                        .to_string();

                    let tags_str = vault_file
                        .frontmatter
                        .as_ref()
                        .map(|fm| fm.tags().join(" "))
                        .unwrap_or_default();

                    let plain_content = to_plain_text(&vault_file.content);

                    // A failed add only drops this one document from the index.
                    let _ = index_writer.add_document(doc!(
                        schema.get_field("path").unwrap() => path_str.clone(),
                        schema.get_field("title").unwrap() => title,
                        schema.get_field("content").unwrap() => plain_content,
                        schema.get_field("tags").unwrap() => tags_str,
                    ));
                }
                Err(_e) => {
                    // Unparseable files are skipped rather than failing the whole build.
                }
            }
        }

        index_writer
            .commit()
            .map_err(|e| Error::config_error(format!("Failed to commit index: {}", e)))?;

        Ok(Self {
            manager,
            index,
            schema,
        })
    }

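    /// Free-text search over the vault with a fixed limit of 10 results.
    /// A minimal sketch (the query string is a placeholder):
    ///
    /// ```ignore
    /// let hits = engine.search("rust async").await?;
    /// ```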
    #[instrument(skip(self), fields(query = query), name = "search_query")]
    pub async fn search(&self, query: &str) -> Result<Vec<SearchResultInfo>> {
        SearchQuery::new(query).limit(10).build_results(self).await
    }

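    /// Run a pre-built [`SearchQuery`] with its filters and limit applied.
    /// A sketch ("work" is a placeholder tag):
    ///
    /// ```ignore
    /// let query = SearchQuery::new("meeting").with_tags(vec!["work".to_string()]);
    /// let hits = engine.advanced_search(query).await?;
    /// ```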
    #[instrument(skip(self, query), name = "search_advanced")]
    pub async fn advanced_search(&self, query: SearchQuery) -> Result<Vec<SearchResultInfo>> {
        query.build_results(self).await
    }

    pub async fn search_by_tags(&self, tags: Vec<String>) -> Result<Vec<SearchResultInfo>> {
        SearchQuery::new("*")
            .with_tags(tags)
            .limit(100)
            .build_results(self)
            .await
    }

    pub async fn search_by_frontmatter(
        &self,
        key: &str,
        value: &str,
    ) -> Result<Vec<SearchResultInfo>> {
        SearchQuery::new("*")
            .with_frontmatter(key.to_string(), value.to_string())
            .limit(100)
            .build_results(self)
            .await
    }

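    /// Find notes similar to the one at `path` by searching for its own top
    /// keywords, excluding the note itself. A sketch (the path is a
    /// placeholder):
    ///
    /// ```ignore
    /// let related = engine.find_related("notes/rust.md", 5).await?;
    /// ```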
    #[instrument(skip(self), fields(path = path, limit = limit), name = "search_find_related")]
    pub async fn find_related(&self, path: &str, limit: usize) -> Result<Vec<SearchResultInfo>> {
        let vault_file = self.manager.parse_file(&PathBuf::from(path)).await?;

        let plain_content = to_plain_text(&vault_file.content);
        let keywords = extract_keywords(&plain_content);

        let query = keywords.join(" ");
        let mut results = SearchQuery::new(query)
            .exclude(vec![path.to_string()])
            .limit(limit)
            .build_results(self)
            .await?;

        // `total_cmp` gives a total order over f64, so a NaN score cannot panic the sort.
        results.sort_by(|a, b| b.score.total_cmp(&a.score));

        Ok(results)
    }

    pub async fn recommend_related(&self, path: &str) -> Result<Vec<SearchResultInfo>> {
        self.find_related(path, 5).await
    }
}

impl SearchQuery {
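    /// Execute the query against the engine's index, then apply the
    /// post-scoring filters. Tantivy is asked for `limit * 2` candidates
    /// because tag, path, and frontmatter filters may discard some hits;
    /// the loop stops once `limit` results survive.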
    async fn build_results(self, engine: &SearchEngine) -> Result<Vec<SearchResultInfo>> {
        let (query_str, filter, limit) = self.build();

        let reader = engine
            .index
            .reader_builder()
            .reload_policy(ReloadPolicy::Manual)
            .try_into()
            .map_err(|e| Error::config_error(format!("Failed to create reader: {}", e)))?;

        let searcher = reader.searcher();
        let graph = engine.manager.link_graph();
        let graph_read = graph.read().await;

        let mut query_parser = QueryParser::for_index(
            &engine.index,
            vec![
                engine.schema.get_field("title").unwrap(),
                engine.schema.get_field("content").unwrap(),
                engine.schema.get_field("tags").unwrap(),
            ],
        );

        // Allow one edit of fuzziness per term on every searchable field.
        query_parser.set_field_fuzzy(engine.schema.get_field("title").unwrap(), true, 1, false);
        query_parser.set_field_fuzzy(engine.schema.get_field("content").unwrap(), true, 1, false);
        query_parser.set_field_fuzzy(engine.schema.get_field("tags").unwrap(), true, 1, false);

        let query = query_parser
            .parse_query(&query_str)
            .map_err(|e| Error::config_error(format!("Failed to parse query: {}", e)))?;

        // Over-fetch so post-filtering can still yield up to `limit` results.
        let top_docs = searcher
            .search(&query, &TopDocs::with_limit(limit * 2))
            .map_err(|e| Error::config_error(format!("Search failed: {}", e)))?;

        let mut results = Vec::new();

        for (score, doc_address) in top_docs {
            let tantivy_doc: TantivyDocument = searcher
                .doc(doc_address)
                .map_err(|e| Error::config_error(format!("Failed to retrieve doc: {}", e)))?;

            // Stored fields come back as JSON arrays keyed by field name.
            let doc_json_str = tantivy_doc.to_json(&engine.schema);
            let doc_json: serde_json::Value =
                serde_json::from_str(&doc_json_str).unwrap_or(serde_json::json!({}));

            let path = doc_json
                .get("path")
                .and_then(|v| v.as_array())
                .and_then(|arr| arr.first())
                .and_then(|v| v.as_str())
                .map(|s| s.to_string())
                .unwrap_or_default();

            let title = doc_json
                .get("title")
                .and_then(|v| v.as_array())
                .and_then(|arr| arr.first())
                .and_then(|v| v.as_str())
                .map(|s| s.to_string())
                .unwrap_or_default();

            let tags_str = doc_json
                .get("tags")
                .and_then(|v| v.as_array())
                .and_then(|arr| arr.first())
                .and_then(|v| v.as_str())
                .map(|s| s.to_string())
                .unwrap_or_default();

            let file_tags: Vec<String> =
                tags_str.split_whitespace().map(|s| s.to_string()).collect();

            if let Some(tags) = &filter.tags
                && !file_tags.iter().any(|t| tags.contains(t))
            {
                continue;
            }

            if let Some(exclude) = &filter.exclude_paths
                && exclude.iter().any(|p| path.ends_with(p))
            {
                continue;
            }

            if let Some(fm_filters) = &filter.frontmatter_filters {
                // Frontmatter is not stored in the index, so re-parse the file
                // and require every (key, value) pair to match.
                let file_path = PathBuf::from(&path);
                if let Ok(vault_file) = engine.manager.parse_file(&file_path).await {
                    let mut matches_all = true;
                    if let Some(fm) = &vault_file.frontmatter {
                        for (key, value) in fm_filters {
                            if let Some(fm_value) = fm.data.get(key) {
                                let fm_str = fm_value.to_string();
                                if !fm_str.contains(value) {
                                    matches_all = false;
                                    break;
                                }
                            } else {
                                matches_all = false;
                                break;
                            }
                        }
                    } else {
                        matches_all = false;
                    }
                    if !matches_all {
                        continue;
                    }
                } else {
                    continue;
                }
            }

            let file_path = PathBuf::from(&path);
            if let Ok(vault_file) = engine.manager.parse_file(&file_path).await {
                let plain_content = to_plain_text(&vault_file.content);

                let preview = plain_content
                    .lines()
                    .next()
                    .unwrap_or("")
                    .chars()
                    .take(200)
                    .collect::<String>();

                let snippet = extract_snippet(&plain_content, &query_str);
                let backlink_count = graph_read.backlinks(&file_path).unwrap_or_default().len();

                let word_count = plain_content.split_whitespace().count();
                let char_count = plain_content.chars().count();

                let outgoing_links: Vec<String> =
                    vault_file.links.iter().map(|l| l.target.clone()).collect();

                // Squash the unbounded relevance score into (0, 1) with a logistic curve.
                let score_f64 = score as f64;
                let normalized_score = (1.0 / (1.0 + (-score_f64 / 2.0).exp())).clamp(0.0, 1.0);

                results.push(SearchResultInfo {
                    path,
                    title,
                    preview,
                    score: normalized_score,
                    snippet,
                    tags: file_tags,
                    outgoing_links,
                    backlink_count,
                    word_count,
                    char_count,
                });
            }

            if results.len() >= limit {
                break;
            }
        }

        Ok(results)
    }
}

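/// Pull up to ten lowercase keywords (length > 3 and not a stopword) from
/// `content`, in order of appearance. `find_related` uses this to turn a
/// note into a query. A sketch of the expected behavior:
///
/// ```ignore
/// let kw = extract_keywords("The quick brown fox");
/// assert_eq!(kw, vec!["quick", "brown"]); // "The" is a stopword; "fox" is too short
/// ```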
fn extract_keywords(content: &str) -> Vec<String> {
    content
        .split_whitespace()
        .filter(|word| word.len() > 3)
        .filter(|word| !is_stopword(word))
        .map(|w| w.to_lowercase())
        .take(10)
        .collect()
}

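/// Case-insensitive membership test against a fixed list of common English
/// stopwords.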
fn is_stopword(word: &str) -> bool {
    matches!(
        word.to_lowercase().as_str(),
        "the"
            | "a"
            | "an"
            | "and"
            | "or"
            | "but"
            | "in"
            | "on"
            | "at"
            | "to"
            | "for"
            | "of"
            | "with"
            | "from"
            | "by"
            | "about"
            | "is"
            | "are"
            | "was"
            | "were"
            | "be"
            | "been"
            | "being"
            | "have"
            | "has"
            | "had"
            | "do"
            | "does"
            | "did"
            | "will"
            | "would"
            | "could"
            | "should"
            | "may"
            | "might"
            | "must"
            | "can"
    )
}

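/// Return a context window of roughly 50 characters on either side of the
/// first case-insensitive match of `query`, widened to UTF-8 character
/// boundaries; falls back to the first line when there is no match or the
/// query is `*`. A sketch of the expected behavior:
///
/// ```ignore
/// let s = extract_snippet("The quick brown fox jumps", "fox");
/// assert!(s.contains("fox"));
/// ```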
fn extract_snippet(content: &str, query: &str) -> String {
    if query.is_empty() || query == "*" {
        return content.lines().take(1).collect();
    }

    let query_lower = query.to_lowercase();
    let content_lower = content.to_lowercase();

    if let Some(pos) = content_lower.find(&query_lower) {
        // `pos` indexes into the lowercased copy, whose byte length can differ
        // from `content` for some Unicode characters, so clamp to the original
        // length and then widen to the nearest UTF-8 character boundaries.
        let mut start = pos.saturating_sub(50).min(content.len());
        while start > 0 && !content.is_char_boundary(start) {
            start -= 1;
        }

        let mut end = (pos + query_lower.len() + 50).min(content.len());
        while end < content.len() && !content.is_char_boundary(end) {
            end += 1;
        }

        let snippet = &content[start..end];
        format!("...{}...", snippet.trim())
    } else {
        content.lines().next().unwrap_or("").to_string()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_extract_keywords() {
        let content = "The quick brown fox jumps over the lazy dog";
        let keywords = extract_keywords(content);
        assert!(!keywords.is_empty());
        assert!(keywords.iter().any(|k| k == "quick" || k == "brown"));
    }

    #[test]
    fn test_is_stopword() {
        assert!(is_stopword("the"));
        assert!(is_stopword("and"));
        assert!(!is_stopword("rust"));
    }

    #[test]
    fn test_extract_snippet() {
        let content = "The quick brown fox jumps over the lazy dog";
        let snippet = extract_snippet(content, "fox");
        assert!(snippet.contains("fox"));
    }

    #[test]
    fn test_extract_snippet_no_match() {
        let content = "The quick brown fox";
        let snippet = extract_snippet(content, "xyz");
        assert!(!snippet.contains("xyz"));
    }

    #[test]
    fn test_extract_snippet_wildcard() {
        let content = "First line\nSecond line";
        let snippet = extract_snippet(content, "*");
        assert!(snippet.contains("First"));
    }

    #[test]
    fn test_extract_keywords_filters_short_words() {
        let content = "a b c defgh ijklmn";
        let keywords = extract_keywords(content);
        assert!(!keywords.iter().any(|k| k.len() <= 3));
    }

    #[test]
    fn test_file_path_extension_check() {
        let paths = vec![
            "/vault/index.md",
            "/vault/test.MD",
            "/vault/readme.txt",
            "/vault/file.md.bak",
            "relative/path/note.md",
        ];

        for path_str in paths {
            let ends_with_md = path_str.to_lowercase().ends_with(".md");
            eprintln!("[TEST] Path: {}, ends_with .md: {}", path_str, ends_with_md);
        }

        assert!("/vault/index.md".ends_with(".md"));
        assert!("/vault/test.md".ends_with(".md"));
        assert!(!"/vault/readme.txt".ends_with(".md"));
        assert!(!"/vault/file.md.bak".ends_with(".md"));
        assert!("relative/path/note.md".ends_with(".md"));
    }

    #[test]
    fn test_stopword_filtering_comprehensive() {
        let stopwords = vec!["the", "and", "or", "is", "are"];
        let content_words = vec!["testing", "capabilities", "search", "index"];

        for word in stopwords {
            assert!(is_stopword(word), "Should recognize '{}' as stopword", word);
        }

        for word in content_words {
            assert!(
                !is_stopword(word),
                "Should NOT recognize '{}' as stopword",
                word
            );
        }
    }

    #[test]
    fn test_snippet_extraction_edge_cases() {
        let snippet = extract_snippet("", "search");
        assert!(snippet.is_empty() || !snippet.contains("search"));

        let short = "short";
        let snippet = extract_snippet(short, "short");
        assert!(snippet.contains("short"));

        let multi = "test test test another test";
        let snippet = extract_snippet(multi, "test");
        assert!(snippet.contains("test"));
    }

    #[test]
    fn test_fuzzy_search_query_building() {
        use tantivy::schema::*;

        let mut schema_builder = Schema::builder();
        schema_builder.add_text_field("title", TEXT);
        schema_builder.add_text_field("content", TEXT);
        let schema = schema_builder.build();

        let mut query_parser = tantivy::query::QueryParser::for_index(
            &tantivy::Index::create_in_ram(schema.clone()),
            vec![schema.get_field("title").unwrap()],
        );

        query_parser.set_field_fuzzy(schema.get_field("title").unwrap(), true, 1, false);

        eprintln!("[TEST] QueryParser configured successfully with fuzzy search");
    }

    #[test]
    fn test_score_normalization_bounds() {
        let scores: Vec<f64> = vec![-10.0, -1.0, 0.0, 1.0, 5.0, 10.0, 100.0];

        for raw_score in scores {
            let normalized: f64 = (1.0 / (1.0 + (-raw_score / 2.0).exp())).clamp(0.0, 1.0);
            assert!(
                (0.0..=1.0).contains(&normalized),
                "Score {} normalized to {}, should be 0.0-1.0",
                raw_score,
                normalized
            );
            eprintln!("[SCORE] Raw: {}, Normalized: {}", raw_score, normalized);
        }
    }

    #[test]
    fn test_file_filtering_logic() {
        let test_paths = vec![
            ("index.md", true),
            ("test.MD", true),
            ("README.txt", false),
            (".md", true),
            ("file.md.backup", false),
        ];

        eprintln!("\n[INTEGRATION TEST] File filtering logic (case-insensitive):");
        for (path, should_index) in test_paths {
            let path_str = path.to_string();
            let passes_filter = path_str.to_lowercase().ends_with(".md");
            eprintln!(
                "[CHECK] Path: {}, ends_with .md (case-insensitive): {}, expected: {}",
                path, passes_filter, should_index
            );

            if should_index {
                assert!(
                    passes_filter,
                    "Path {} should pass filter (case-insensitive)",
                    path
                );
            } else {
                assert!(
                    !passes_filter,
                    "Path {} should NOT pass filter (case-insensitive)",
                    path
                );
            }
        }
    }
}