turbovault_tools/search_engine.rs

//! Full-text search engine powered by tantivy
//!
//! Provides production-grade full-text search with:
//! - Apache Lucene-inspired indexing and searching
//! - BM25 relevance scoring
//! - Field-specific search (content, title, tags)
//! - Fuzzy/approximate queries with typo tolerance (Levenshtein distance 1)
//! - Fast searching even on large vaults
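//!
//! A minimal usage sketch (hypothetical query text; assumes an async context and
//! an already-configured `VaultManager`):
//!
//! ```ignore
//! let engine = SearchEngine::new(Arc::new(vault_manager)).await?;
//! let hits = engine.search("meeting notes").await?;
//! for hit in hits {
//!     println!("{} ({:.2})", hit.path, hit.score);
//! }
//! ```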
use serde::{Deserialize, Serialize};
use std::path::PathBuf;
use std::sync::Arc;
use tantivy::collector::TopDocs;
use tantivy::query::QueryParser;
use tantivy::schema::*;
use tantivy::{Index, ReloadPolicy, TantivyDocument, doc};
use tracing::instrument;
use turbovault_core::prelude::*;
use turbovault_parser::to_plain_text;
use turbovault_vault::VaultManager;

/// Search result metadata for LLM consumption
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct SearchResultInfo {
    /// File path relative to vault root
    pub path: String,
    /// File title (from frontmatter or first heading)
    pub title: String,
    /// Content preview (first 200 chars of plain text)
    pub preview: String,
    /// Relevance score (0.0 to 1.0, normalized from tantivy's BM25 score)
    pub score: f64,
    /// Matching snippet with context (plain text)
    pub snippet: String,
    /// Front matter tags
    pub tags: Vec<String>,
    /// Files this note links to
    pub outgoing_links: Vec<String>,
    /// Number of backlinks to this note
    pub backlink_count: usize,
    /// Word count of readable content (excludes markdown syntax)
    pub word_count: usize,
    /// Character count of readable content (excludes markdown syntax)
    pub char_count: usize,
}

/// Search filter options
#[derive(Debug, Clone, Default)]
pub struct SearchFilter {
    /// Only match specific tags
    pub tags: Option<Vec<String>>,
    /// Only match specific frontmatter key/value pairs
    pub frontmatter_filters: Option<Vec<(String, String)>>,
    /// Only match notes linked by these paths
    pub backlinks_from: Option<Vec<String>>,
    /// Exclude specific paths
    pub exclude_paths: Option<Vec<String>>,
}

/// Advanced search builder for LLMs
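///
/// A minimal builder sketch (the tag and path values here are hypothetical, and
/// `engine` stands for an existing `SearchEngine`):
///
/// ```ignore
/// let query = SearchQuery::new("rust async")
///     .with_tags(vec!["programming".to_string()])
///     .exclude(vec!["archive/old-note.md".to_string()])
///     .limit(5);
/// let results = engine.advanced_search(query).await?;
/// ```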
pub struct SearchQuery {
    query: String,
    filter: SearchFilter,
    limit: usize,
}

impl SearchQuery {
    /// Create new search query
    pub fn new(query: impl Into<String>) -> Self {
        Self {
            query: query.into(),
            filter: SearchFilter::default(),
            limit: 10,
        }
    }

    /// Add tag filter
    pub fn with_tags(mut self, tags: Vec<String>) -> Self {
        self.filter.tags = Some(tags);
        self
    }

    /// Add frontmatter filter (e.g., "type", "project")
    pub fn with_frontmatter(mut self, key: String, value: String) -> Self {
        self.filter
            .frontmatter_filters
            .get_or_insert_with(Vec::new)
            .push((key, value));
        self
    }

    /// Filter by backlinks from specific notes
    pub fn with_backlinks_from(mut self, paths: Vec<String>) -> Self {
        self.filter.backlinks_from = Some(paths);
        self
    }

    /// Exclude certain paths from results
    pub fn exclude(mut self, paths: Vec<String>) -> Self {
        self.filter.exclude_paths = Some(paths);
        self
    }

    /// Set result limit
    pub fn limit(mut self, limit: usize) -> Self {
        self.limit = limit;
        self
    }

    /// Get the query parameters
    pub fn build(self) -> (String, SearchFilter, usize) {
        (self.query, self.filter, self.limit)
    }
}

/// Search engine for vault discovery (powered by tantivy)
pub struct SearchEngine {
    pub manager: Arc<VaultManager>,
    index: Index,
    schema: Schema,
}

impl SearchEngine {
    /// Create new search engine and index all vault files
    pub async fn new(manager: Arc<VaultManager>) -> Result<Self> {
        // Define schema: fields to index
        let mut schema_builder = Schema::builder();
        schema_builder.add_text_field("path", TEXT | STORED);
        schema_builder.add_text_field("title", TEXT | STORED);
        schema_builder.add_text_field("content", TEXT);
        schema_builder.add_text_field("tags", TEXT | STORED);
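        // "content" is indexed but not STORED: previews and snippets are rebuilt
        // from the file itself in `build_results`, so only the smaller fields are
        // kept in the index.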
        let schema = schema_builder.build();

        // Create in-memory index
        let index = Index::create_in_ram(schema.clone());

        // Index all files
        let mut index_writer = index
            .writer(50_000_000)
            .map_err(|e| Error::config_error(format!("Failed to create index writer: {}", e)))?;

        let files = manager.scan_vault().await?;

        for file_path in files {
            // Convert PathBuf to string to check extension (case-insensitive)
            let path_str = file_path.to_string_lossy();
            let path_lower = path_str.to_lowercase();
            if !path_lower.ends_with(".md") {
                continue;
            }

            match manager.parse_file(&file_path).await {
                Ok(vault_file) => {
                    let path_str = file_path.to_string_lossy().to_string();

                    // Get title
                    let title = vault_file
                        .frontmatter
                        .as_ref()
                        .and_then(|fm| fm.data.get("title"))
                        .and_then(|v| v.as_str())
                        .unwrap_or_else(|| {
                            file_path
                                .file_stem()
                                .unwrap_or_default()
                                .to_str()
                                .unwrap_or("")
                        })
                        .to_string();

                    // Get tags
                    let tags_str = vault_file
                        .frontmatter
                        .as_ref()
                        .map(|fm| fm.tags().join(" "))
                        .unwrap_or_default();

                    // Extract plain text for indexing (excludes markdown syntax, URLs, etc.)
                    let plain_content = to_plain_text(&vault_file.content);

                    // Add document to index with plain text content
                    let _ = index_writer.add_document(doc!(
                        schema.get_field("path").unwrap() => path_str.clone(),
                        schema.get_field("title").unwrap() => title,
                        schema.get_field("content").unwrap() => plain_content,
                        schema.get_field("tags").unwrap() => tags_str,
                    ));
                }
                Err(_e) => {
                    // Silently skip files that fail to parse
                }
            }
        }

        index_writer
            .commit()
            .map_err(|e| Error::config_error(format!("Failed to commit index: {}", e)))?;

        Ok(Self {
            manager,
            index,
            schema,
        })
    }

    /// Simple keyword search
    #[instrument(skip(self), fields(query = query), name = "search_query")]
    pub async fn search(&self, query: &str) -> Result<Vec<SearchResultInfo>> {
        SearchQuery::new(query).limit(10).build_results(self).await
    }

    /// Advanced search with filters and options
    #[instrument(skip(self, query), name = "search_advanced")]
    pub async fn advanced_search(&self, query: SearchQuery) -> Result<Vec<SearchResultInfo>> {
        query.build_results(self).await
    }

    /// Search by tag
    pub async fn search_by_tags(&self, tags: Vec<String>) -> Result<Vec<SearchResultInfo>> {
        SearchQuery::new("*")
            .with_tags(tags)
            .limit(100)
            .build_results(self)
            .await
    }

    /// Search by frontmatter property
    pub async fn search_by_frontmatter(
        &self,
        key: &str,
        value: &str,
    ) -> Result<Vec<SearchResultInfo>> {
        SearchQuery::new("*")
            .with_frontmatter(key.to_string(), value.to_string())
            .limit(100)
            .build_results(self)
            .await
    }

    /// Find related notes by content similarity (shared keywords)
    #[instrument(skip(self), fields(path = path, limit = limit), name = "search_find_related")]
    pub async fn find_related(&self, path: &str, limit: usize) -> Result<Vec<SearchResultInfo>> {
        // Parse the note to extract keywords
        let vault_file = self.manager.parse_file(&PathBuf::from(path)).await?;

        // Extract key terms from plain text content (excludes URLs, markdown syntax)
        let plain_content = to_plain_text(&vault_file.content);
        let keywords = extract_keywords(&plain_content);

        // Search for similar notes using tantivy query
        let query = keywords.join(" ");
        let mut results = SearchQuery::new(query)
            .exclude(vec![path.to_string()])
            .limit(limit)
            .build_results(self)
            .await?;

        // Sort by relevance (tantivy already scores, but ensure descending)
        results.sort_by(|a, b| b.score.total_cmp(&a.score));

        Ok(results)
    }

    /// Semantic search recommendations for LLMs
    pub async fn recommend_related(&self, path: &str) -> Result<Vec<SearchResultInfo>> {
        self.find_related(path, 5).await
    }
}

impl SearchQuery {
    /// Build and execute search results using tantivy
    async fn build_results(self, engine: &SearchEngine) -> Result<Vec<SearchResultInfo>> {
        let (query_str, filter, limit) = self.build();

        let reader = engine
            .index
            .reader_builder()
            .reload_policy(ReloadPolicy::Manual)
            .try_into()
            .map_err(|e| Error::config_error(format!("Failed to create reader: {}", e)))?;

        let searcher = reader.searcher();
        let graph = engine.manager.link_graph();
        let graph_read = graph.read().await;

        // Parse query using tantivy's QueryParser with fuzzy search enabled
        let mut query_parser = QueryParser::for_index(
            &engine.index,
            vec![
                engine.schema.get_field("title").unwrap(),
                engine.schema.get_field("content").unwrap(),
                engine.schema.get_field("tags").unwrap(),
            ],
        );

        // Enable fuzzy search with Levenshtein distance of 1 for typo tolerance
        // This makes searches forgiving of single-character mistakes
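        // For example, a query for "serch" can still match notes containing "search".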
        query_parser.set_field_fuzzy(
            engine.schema.get_field("title").unwrap(),
            true,  // enable_fuzzy
            1,     // distance (single-character typos)
            false, // prefix_only
        );
        query_parser.set_field_fuzzy(engine.schema.get_field("content").unwrap(), true, 1, false);
        query_parser.set_field_fuzzy(engine.schema.get_field("tags").unwrap(), true, 1, false);

        let query = query_parser
            .parse_query(&query_str)
            .map_err(|e| Error::config_error(format!("Failed to parse query: {}", e)))?;

        // Execute search
        let top_docs = searcher
            .search(&query, &TopDocs::with_limit(limit * 2)) // Get extra docs for filtering
            .map_err(|e| Error::config_error(format!("Search failed: {}", e)))?;

        let mut results = Vec::new();

        for (score, doc_address) in top_docs {
            // Retrieve the stored document from the index
            let tantivy_doc: TantivyDocument = searcher
                .doc(doc_address)
                .map_err(|e| Error::config_error(format!("Failed to retrieve doc: {}", e)))?;

            // Convert to JSON string, then parse to Value
            let doc_json_str = tantivy_doc.to_json(&engine.schema);
            let doc_json: serde_json::Value =
                serde_json::from_str(&doc_json_str).unwrap_or(serde_json::json!({}));

            // Extract field values from the JSON document
            // Note: Tantivy returns fields as arrays, so we need to get the first element
            let path = doc_json
                .get("path")
                .and_then(|v| v.as_array())
                .and_then(|arr| arr.first())
                .and_then(|v| v.as_str())
                .map(|s| s.to_string())
                .unwrap_or_default();

            let title = doc_json
                .get("title")
                .and_then(|v| v.as_array())
                .and_then(|arr| arr.first())
                .and_then(|v| v.as_str())
                .map(|s| s.to_string())
                .unwrap_or_default();

            let tags_str = doc_json
                .get("tags")
                .and_then(|v| v.as_array())
                .and_then(|arr| arr.first())
                .and_then(|v| v.as_str())
                .map(|s| s.to_string())
                .unwrap_or_default();

            let file_tags: Vec<String> =
                tags_str.split_whitespace().map(|s| s.to_string()).collect();

            // Apply tag filter
            if let Some(tags) = &filter.tags
                && !file_tags.iter().any(|t| tags.contains(t))
            {
                continue;
            }

            // Apply exclusion filter
            if let Some(exclude) = &filter.exclude_paths
                && exclude.iter().any(|p| path.ends_with(p))
            {
                continue;
            }

            // Apply frontmatter filters
            if let Some(fm_filters) = &filter.frontmatter_filters {
                let file_path = PathBuf::from(&path);
                if let Ok(vault_file) = engine.manager.parse_file(&file_path).await {
                    let mut matches_all = true;
                    if let Some(fm) = &vault_file.frontmatter {
                        for (key, value) in fm_filters {
                            if let Some(fm_value) = fm.data.get(key) {
                                let fm_str = fm_value.to_string();
                                if !fm_str.contains(value) {
                                    matches_all = false;
                                    break;
                                }
                            } else {
                                matches_all = false;
                                break;
                            }
                        }
                    } else {
                        matches_all = false;
                    }
                    if !matches_all {
                        continue;
                    }
                } else {
                    continue;
                }
            }

            // Get full content for preview and snippet
            let file_path = PathBuf::from(&path);
            if let Ok(vault_file) = engine.manager.parse_file(&file_path).await {
                // Extract plain text for preview, snippet, and metrics
                let plain_content = to_plain_text(&vault_file.content);

                // Generate preview from plain text (first line, up to 200 chars)
                let preview = plain_content
                    .lines()
                    .next()
                    .unwrap_or("")
                    .chars()
                    .take(200)
                    .collect::<String>();

                // Extract snippet from plain text (no markdown syntax in results)
                let snippet = extract_snippet(&plain_content, &query_str);
                let backlink_count = graph_read.backlinks(&file_path).unwrap_or_default().len();

                // Calculate content metrics from plain text
                let word_count = plain_content.split_whitespace().count();
                let char_count = plain_content.chars().count();

                // Get outgoing links
                let outgoing_links: Vec<String> =
                    vault_file.links.iter().map(|l| l.target.clone()).collect();

                // Normalize Tantivy's BM25 score to 0.0-1.0 range
                // Typical BM25 scores range 0-10+, so we use sigmoid-like normalization
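                // e.g. raw 0.0 -> 0.50, raw 4.0 -> ~0.88, raw 10.0 -> ~0.99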
                let score_f64 = score as f64;
                let normalized_score = (1.0 / (1.0 + (-score_f64 / 2.0).exp())).clamp(0.0, 1.0);

                results.push(SearchResultInfo {
                    path,
                    title,
                    preview,
                    score: normalized_score,
                    snippet,
                    tags: file_tags,
                    outgoing_links,
                    backlink_count,
                    word_count,
                    char_count,
                });
            }

            if results.len() >= limit {
                break;
            }
        }

        Ok(results)
    }
}

/// Extract keywords from content for recommendations
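///
/// A small illustration (doctest ignored since this helper is private):
///
/// ```ignore
/// let keywords = extract_keywords("The quick brown fox jumps over the lazy dog");
/// assert!(keywords.contains(&"quick".to_string()));
/// ```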
fn extract_keywords(content: &str) -> Vec<String> {
    content
        .split_whitespace()
        .filter(|word| word.len() > 3)
        .filter(|word| !is_stopword(word))
        .map(|w| w.to_lowercase())
        .take(10)
        .collect()
}

/// Check if word is a common stopword
fn is_stopword(word: &str) -> bool {
    matches!(
        word.to_lowercase().as_str(),
        "the"
            | "a"
            | "an"
            | "and"
            | "or"
            | "but"
            | "in"
            | "on"
            | "at"
            | "to"
            | "for"
            | "of"
            | "with"
            | "from"
            | "by"
            | "about"
            | "is"
            | "are"
            | "was"
            | "were"
            | "be"
            | "been"
            | "being"
            | "have"
            | "has"
            | "had"
            | "do"
            | "does"
            | "did"
            | "will"
            | "would"
            | "could"
            | "should"
            | "may"
            | "might"
            | "must"
            | "can"
    )
}

/// Extract snippet from content around matching terms
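///
/// A small illustration (doctest ignored since this helper is private):
///
/// ```ignore
/// let snippet = extract_snippet("The quick brown fox jumps over the lazy dog", "fox");
/// assert!(snippet.contains("fox"));
/// ```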
fn extract_snippet(content: &str, query: &str) -> String {
    if query.is_empty() || query == "*" {
        return content.lines().take(1).collect();
    }

    let query_lower = query.to_lowercase();
    let content_lower = content.to_lowercase();

    if let Some(pos) = content_lower.find(&query_lower) {
        let mut start = pos.saturating_sub(50);
        while start > 0 && !content.is_char_boundary(start) {
            start -= 1;
        }

        let mut end = (pos + query_lower.len() + 50).min(content.len());
        while end < content.len() && !content.is_char_boundary(end) {
            end += 1;
        }

        let snippet = &content[start..end];
        format!("...{}...", snippet.trim())
    } else {
        content.lines().next().unwrap_or("").to_string()
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_extract_keywords() {
        let content = "The quick brown fox jumps over the lazy dog";
        let keywords = extract_keywords(content);
        assert!(!keywords.is_empty());
        assert!(keywords.iter().any(|k| k == "quick" || k == "brown"));
    }

    #[test]
    fn test_is_stopword() {
        assert!(is_stopword("the"));
        assert!(is_stopword("and"));
        assert!(!is_stopword("rust"));
    }

    #[test]
    fn test_extract_snippet() {
        let content = "The quick brown fox jumps over the lazy dog";
        let snippet = extract_snippet(content, "fox");
        assert!(snippet.contains("fox"));
    }

    #[test]
    fn test_extract_snippet_no_match() {
        let content = "The quick brown fox";
        let snippet = extract_snippet(content, "xyz");
        assert!(!snippet.contains("xyz"));
    }

    #[test]
    fn test_extract_snippet_wildcard() {
        let content = "First line\nSecond line";
        let snippet = extract_snippet(content, "*");
        assert!(snippet.contains("First"));
    }

    #[test]
    fn test_extract_keywords_filters_short_words() {
        let content = "a b c defgh ijklmn";
        let keywords = extract_keywords(content);
        assert!(!keywords.iter().any(|k| k.len() <= 3));
    }

    // ==================== SEARCH ENGINE LOGIC TESTS ====================
    // These tests exercise the engine's filtering, fuzzy-query, and scoring logic in isolation

    /// Test: File path extension checking works correctly
    #[test]
    fn test_file_path_extension_check() {
        let paths = vec![
            "/vault/index.md",
            "/vault/test.MD",
            "/vault/readme.txt",
            "/vault/file.md.bak",
            "relative/path/note.md",
        ];

        for path_str in paths {
            let ends_with_md = path_str.to_lowercase().ends_with(".md");
            eprintln!("[TEST] Path: {}, ends_with .md: {}", path_str, ends_with_md);
        }

        // Verify the logic
        assert!("/vault/index.md".ends_with(".md"));
        assert!("/vault/test.md".ends_with(".md"));
        assert!(!"/vault/readme.txt".ends_with(".md"));
        assert!(!"/vault/file.md.bak".ends_with(".md"));
        assert!("relative/path/note.md".ends_with(".md"));
    }

    /// Test: Stopword filtering works for keyword extraction
    #[test]
    fn test_stopword_filtering_comprehensive() {
        let stopwords = vec!["the", "and", "or", "is", "are"];
        let content_words = vec!["testing", "capabilities", "search", "index"];

        for word in stopwords {
            assert!(is_stopword(word), "Should recognize '{}' as stopword", word);
        }

        for word in content_words {
            assert!(
                !is_stopword(word),
                "Should NOT recognize '{}' as stopword",
                word
            );
        }
    }

    /// Test: Snippet extraction handles edge cases
    #[test]
    fn test_snippet_extraction_edge_cases() {
        // Empty content
        let snippet = extract_snippet("", "search");
        assert!(snippet.is_empty() || !snippet.contains("search"));

        // Content shorter than context window
        let short = "short";
        let snippet = extract_snippet(short, "short");
        assert!(snippet.contains("short"));

        // Multiple occurrences - should find first
        let multi = "test test test another test";
        let snippet = extract_snippet(multi, "test");
        assert!(snippet.contains("test"));
    }

    /// Test: Fuzzy search query building (basic)
    #[test]
    fn test_fuzzy_search_query_building() {
        // This test verifies the QueryParser can be created and configured
        use tantivy::schema::*;

        let mut schema_builder = Schema::builder();
        schema_builder.add_text_field("title", TEXT);
        schema_builder.add_text_field("content", TEXT);
        let schema = schema_builder.build();

        // Create query parser
        let mut query_parser = tantivy::query::QueryParser::for_index(
            &tantivy::Index::create_in_ram(schema.clone()),
            vec![schema.get_field("title").unwrap()],
        );

        // Enable fuzzy search
        query_parser.set_field_fuzzy(
            schema.get_field("title").unwrap(),
            true,  // enable
            1,     // distance
            false, // prefix_only
        );

        eprintln!("[TEST] QueryParser configured successfully with fuzzy search");
    }

    /// Test: Score normalization stays in 0.0-1.0 range
    #[test]
    fn test_score_normalization_bounds() {
        let scores: Vec<f64> = vec![-10.0, -1.0, 0.0, 1.0, 5.0, 10.0, 100.0];

        for raw_score in scores {
            let normalized: f64 = (1.0 / (1.0 + (-raw_score / 2.0).exp())).clamp(0.0, 1.0);
            assert!(
                (0.0..=1.0).contains(&normalized),
                "Score {} normalized to {}, should be 0.0-1.0",
                raw_score,
                normalized
            );
            eprintln!("[SCORE] Raw: {}, Normalized: {}", raw_score, normalized);
        }
    }

    /// Test: File extension filtering logic in isolation
    #[test]
    fn test_file_filtering_logic() {
        // Exercise the case-insensitive extension check, including uppercase extensions
        let test_paths = vec![
            ("index.md", true),
            ("test.MD", true), // should support uppercase too!
            ("README.txt", false),
            (".md", true),
            ("file.md.backup", false),
        ];

        eprintln!("\n[INTEGRATION TEST] File filtering logic (case-insensitive):");
        for (path, should_index) in test_paths {
            let path_str = path.to_string();
            // Use to_lowercase() for a case-insensitive comparison, matching the indexer's behavior
            let passes_filter = path_str.to_lowercase().ends_with(".md");
            eprintln!(
                "[CHECK] Path: {}, ends_with .md (case-insensitive): {}, expected: {}",
                path, passes_filter, should_index
            );

            if should_index {
                assert!(
                    passes_filter,
                    "Path {} should pass filter (case-insensitive)",
                    path
                );
            } else {
                assert!(
                    !passes_filter,
                    "Path {} should NOT pass filter (case-insensitive)",
                    path
                );
            }
        }
    }
}