Skip to main content

ygrep_core/search/
searcher.rs

1use regex::RegexBuilder;
2use std::collections::HashSet;
3use std::time::Instant;
4use tantivy::{collector::TopDocs, query::QueryParser, Index};
5
6use super::results::{MatchType, SearchHit, SearchResult};
7use crate::config::SearchConfig;
8use crate::error::Result;
9use crate::index::schema::SchemaFields;
10
/// Search engine for querying the index.
///
/// Wraps a Tantivy `Index` together with pre-resolved schema field handles
/// and the search-related configuration (result limits).
pub struct Searcher {
    // Limits and defaults applied to every query (default_limit / max_limit)
    config: SearchConfig,
    // The Tantivy index being queried
    index: Index,
    // Pre-resolved handles to the index schema's fields
    fields: SchemaFields,
}
17
18impl Searcher {
19    /// Create a new searcher for an index
20    pub fn new(config: SearchConfig, index: Index) -> Self {
21        let schema = index.schema();
22        let fields = SchemaFields::new(&schema);
23
24        Self {
25            config,
26            index,
27            fields,
28        }
29    }
30
31    /// Search the index with a query string (literal text matching like grep)
32    pub fn search(
33        &self,
34        query: &str,
35        limit: Option<usize>,
36        case_sensitive: bool,
37        context_before: Option<usize>,
38        context_after: Option<usize>,
39    ) -> Result<SearchResult> {
40        let start = Instant::now();
41        let limit = limit
42            .unwrap_or(self.config.default_limit)
43            .min(self.config.max_limit);
44
45        // Get a reader (with retry for META_LOCK contention, issue #7)
46        let reader = super::open_reader_with_retry(&self.index)?;
47        let searcher = reader.searcher();
48
49        // Build query parser for content and filepath fields
50        let mut query_fields = vec![self.fields.content];
51        if let Some(fp) = self.fields.filepath {
52            query_fields.push(fp);
53        }
54        let query_parser = QueryParser::for_index(&self.index, query_fields);
55
56        // Extract alphanumeric words for Tantivy query (it can't search special chars)
57        // Then we'll post-filter for exact literal match
58        let search_terms: Vec<&str> = query
59            .split(|c: char| !c.is_alphanumeric() && c != '_')
60            .filter(|s| !s.is_empty())
61            .collect();
62
63        // If no searchable terms, return empty
64        if search_terms.is_empty() {
65            return Ok(SearchResult {
66                total: 0,
67                hits: vec![],
68                query_time_ms: start.elapsed().as_millis() as u64,
69                text_hits: 0,
70                semantic_hits: 0,
71            });
72        }
73
74        // Search for the extracted terms
75        let tantivy_query_str = search_terms.join(" ");
76        let (tantivy_query, _errors) = query_parser.parse_query_lenient(&tantivy_query_str);
77
78        // Fetch more results since we'll filter them down
79        let fetch_limit = limit * 50;
80        let top_docs = searcher.search(&tantivy_query, &TopDocs::with_limit(fetch_limit))?;
81
82        // Build results
83        let mut hits = Vec::with_capacity(top_docs.len());
84        let max_score = top_docs.first().map(|(score, _)| *score).unwrap_or(1.0);
85        let mut seen: HashSet<(String, u64, u64)> = HashSet::new();
86
87        // Prepare query for matching
88        let query_normalized = if case_sensitive {
89            query.to_string()
90        } else {
91            query.to_lowercase()
92        };
93        let query_terms: Vec<&str> = query_normalized.split_whitespace().collect();
94        let is_multi_word = query_terms.len() > 1;
95
96        for (score, doc_address) in top_docs {
97            // Stop if we have enough results
98            if hits.len() >= limit {
99                break;
100            }
101
102            let doc = searcher.doc(doc_address)?;
103
104            // Extract fields
105            let path = extract_text(&doc, self.fields.path).unwrap_or_default();
106            let doc_id = extract_text(&doc, self.fields.doc_id).unwrap_or_default();
107            let content = extract_text(&doc, self.fields.content).unwrap_or_default();
108            let line_start = extract_u64(&doc, self.fields.line_start).unwrap_or(1);
109            let chunk_id = extract_text(&doc, self.fields.chunk_id).unwrap_or_default();
110
111            let content_normalized = if case_sensitive {
112                content.clone()
113            } else {
114                content.to_lowercase()
115            };
116
117            // Check if path matches the query (filename search)
118            let path_normalized = path.to_lowercase();
119            let path_match = query_terms
120                .iter()
121                .all(|term| path_normalized.contains(term));
122
123            // LITERAL GREP-LIKE FILTER: exact phrase match, or AND match for multi-word queries
124            let exact_match = content_normalized.contains(&query_normalized);
125            let and_match = is_multi_word
126                && query_terms
127                    .iter()
128                    .all(|term| content_normalized.contains(term));
129            if !exact_match && !and_match && !path_match {
130                continue;
131            }
132
133            // Normalize score to 0-1 range
134            let normalized_score = if max_score > 0.0 {
135                score / max_score
136            } else {
137                0.0
138            };
139
140            // For path-only matches (no content match), show beginning of file
141            let is_content_match = exact_match || and_match;
142
143            let (snippet, snippet_offset, snippet_line_count, match_line_offset) =
144                if is_content_match {
145                    create_relevant_snippet(&content, query, 10, context_before, context_after)
146                } else {
147                    // Path-only match: show first few lines
148                    let lines: Vec<&str> = content.lines().take(10).collect();
149                    let snippet = lines.join("\n");
150                    let line_count = lines.len();
151                    (snippet, 0, line_count, 0)
152                };
153
154            // Adjust line numbers to reflect where the snippet is in the file
155            let actual_line_start = line_start + snippet_offset as u64;
156            let actual_line_end = actual_line_start + snippet_line_count.saturating_sub(1) as u64;
157            let match_line_in_snippet = match_line_offset - snippet_offset;
158
159            // Deduplicate: skip if we already have a hit for the same file and line range
160            let key = (path.clone(), actual_line_start, actual_line_end);
161            if !seen.insert(key) {
162                continue;
163            }
164
165            hits.push(SearchHit {
166                path,
167                line_start: actual_line_start,
168                line_end: actual_line_end,
169                snippet,
170                score: normalized_score,
171                is_chunk: !chunk_id.is_empty(),
172                doc_id,
173                match_type: MatchType::Text,
174                match_line_in_snippet,
175            });
176        }
177
178        let query_time_ms = start.elapsed().as_millis() as u64;
179        let text_hits = hits.len();
180
181        Ok(SearchResult {
182            total: hits.len(),
183            hits,
184            query_time_ms,
185            text_hits,
186            semantic_hits: 0,
187        })
188    }
189
    /// Search with filters.
    ///
    /// Runs either a literal (`search`) or regex (`search_regex`) query,
    /// over-fetching 2x the cap so that hits removed by the extension/path
    /// filters below still leave enough results, then re-applies the limit
    /// and recomputes the per-type hit counts.
    ///
    /// NOTE(review): the over-fetch is based on `max_limit` rather than
    /// `default_limit` — presumably intentional to widen the candidate pool;
    /// confirm.
    pub fn search_filtered(
        &self,
        query: &str,
        limit: Option<usize>,
        filters: SearchFilters,
        use_regex: bool,
        case_sensitive: bool,
        context_before: Option<usize>,
        context_after: Option<usize>,
        verbose: bool,
    ) -> Result<SearchResult> {
        // Use regex search if requested
        let mut result = if use_regex {
            self.search_regex(
                query,
                Some(limit.unwrap_or(self.config.max_limit) * 2),
                case_sensitive,
                context_before,
                context_after,
            )?
        } else {
            self.search(
                query,
                Some(limit.unwrap_or(self.config.max_limit) * 2),
                case_sensitive,
                context_before,
                context_after,
            )?
        };

        let pre_filter_count = result.hits.len();
        if verbose {
            eprintln!(
                "[verbose] search mode: {}",
                if use_regex { "regex" } else { "text" }
            );
            eprintln!("[verbose] matches before filtering: {}", pre_filter_count);
        }

        // Apply filters
        // Extension filter: keep hits whose file extension equals any of the
        // requested extensions (ASCII case-insensitive); hits without an
        // extension are dropped.
        if let Some(ref extensions) = filters.extensions {
            result.hits.retain(|hit| {
                if let Some(ext) = std::path::Path::new(&hit.path).extension() {
                    extensions
                        .iter()
                        .any(|e| e.eq_ignore_ascii_case(&ext.to_string_lossy()))
                } else {
                    false
                }
            });
            if verbose {
                eprintln!(
                    "[verbose] after extension filter ({}): {}",
                    extensions.join(", "),
                    result.hits.len()
                );
            }
        }

        // Path filter: keep hits matching any pattern (glob or
        // prefix/substring — see `path_matches`).
        if let Some(ref paths) = filters.paths {
            result
                .hits
                .retain(|hit| paths.iter().any(|p| path_matches(p, &hit.path)));
            if verbose {
                eprintln!(
                    "[verbose] after path filter ({}): {}",
                    paths.join(", "),
                    result.hits.len()
                );
            }
        }

        // Re-limit
        let limit = limit
            .unwrap_or(self.config.default_limit)
            .min(self.config.max_limit);
        result.hits.truncate(limit);
        result.total = result.hits.len();

        // Fix text_hits/semantic_hits to reflect post-filter counts (issue #10)
        result.text_hits = result
            .hits
            .iter()
            .filter(|h| matches!(h.match_type, MatchType::Text | MatchType::Hybrid))
            .count();
        result.semantic_hits = result
            .hits
            .iter()
            .filter(|h| matches!(h.match_type, MatchType::Semantic | MatchType::Hybrid))
            .count();

        if verbose {
            eprintln!("[verbose] final results: {}", result.total);
        }

        Ok(result)
    }
288
    /// Search the index with a regex pattern.
    ///
    /// The regex is compiled case-insensitively unless `case_sensitive` is
    /// set. Literal alphanumeric fragments of the pattern are used as a
    /// Tantivy pre-filter when available; otherwise all documents are
    /// scanned (slow, but required for patterns like `^#` or `.*`).
    ///
    /// Returns a `Search` error if the pattern fails to compile.
    pub fn search_regex(
        &self,
        pattern: &str,
        limit: Option<usize>,
        case_sensitive: bool,
        context_before: Option<usize>,
        context_after: Option<usize>,
    ) -> Result<SearchResult> {
        let start = Instant::now();
        let limit = limit
            .unwrap_or(self.config.default_limit)
            .min(self.config.max_limit);

        // Compile regex (case-insensitive by default unless --case-sensitive)
        let regex = match RegexBuilder::new(pattern)
            .case_insensitive(!case_sensitive)
            .build()
        {
            Ok(r) => r,
            Err(e) => {
                return Err(crate::error::YgrepError::Search(format!(
                    "Invalid regex pattern: {}",
                    e
                )));
            }
        };

        // Get a reader (with retry for META_LOCK contention, issue #7)
        let reader = super::open_reader_with_retry(&self.index)?;
        let searcher = reader.searcher();

        // Build query parser for content and filepath fields
        let mut query_fields = vec![self.fields.content];
        if let Some(fp) = self.fields.filepath {
            query_fields.push(fp);
        }
        let query_parser = QueryParser::for_index(&self.index, query_fields);

        // Extract alphanumeric words from the regex pattern for Tantivy pre-filter
        // This is a rough heuristic - we extract literal parts from the regex
        let search_terms: Vec<&str> = pattern
            .split(|c: char| !c.is_alphanumeric() && c != '_')
            .filter(|s| !s.is_empty() && s.len() > 1) // Skip single chars (likely regex syntax)
            .collect();

        // If we have searchable terms, use Tantivy to narrow down candidates
        let candidates: Vec<_> = if !search_terms.is_empty() {
            let tantivy_query_str = search_terms.join(" ");
            let (tantivy_query, _errors) = query_parser.parse_query_lenient(&tantivy_query_str);

            // Fetch many candidates since regex might be selective
            let fetch_limit = limit * 100;
            searcher.search(&tantivy_query, &TopDocs::with_limit(fetch_limit))?
        } else {
            // No good search terms - scan all documents
            // This is slow but necessary for patterns like "^#" or ".*"
            let all_query = tantivy::query::AllQuery;
            let fetch_limit = limit * 100;
            searcher.search(&all_query, &TopDocs::with_limit(fetch_limit))?
        };

        // Build results by applying regex filter
        let mut hits = Vec::with_capacity(candidates.len());
        let max_score = candidates.first().map(|(score, _)| *score).unwrap_or(1.0);
        let mut seen: HashSet<(String, u64, u64)> = HashSet::new();

        for (score, doc_address) in candidates {
            // Stop if we have enough results
            if hits.len() >= limit {
                break;
            }

            let doc = searcher.doc(doc_address)?;

            // Extract fields
            let path = extract_text(&doc, self.fields.path).unwrap_or_default();
            let doc_id = extract_text(&doc, self.fields.doc_id).unwrap_or_default();
            let content = extract_text(&doc, self.fields.content).unwrap_or_default();
            let line_start = extract_u64(&doc, self.fields.line_start).unwrap_or(1);
            let chunk_id = extract_text(&doc, self.fields.chunk_id).unwrap_or_default();

            // REGEX FILTER: Only include if content matches the regex
            if !regex.is_match(&content) {
                continue;
            }

            // Normalize score to 0-1 range
            let normalized_score = if max_score > 0.0 {
                score / max_score
            } else {
                0.0
            };

            // Create snippet showing lines that match the regex
            let (snippet, snippet_offset, snippet_line_count, match_line_offset) =
                create_regex_snippet(&content, &regex, 10, context_before, context_after);

            // Adjust line numbers to reflect where the snippet is in the file
            let actual_line_start = line_start + snippet_offset as u64;
            let actual_line_end = actual_line_start + snippet_line_count.saturating_sub(1) as u64;
            // Safe: create_regex_snippet guarantees snippet_offset <= match_line_offset
            let match_line_in_snippet = match_line_offset - snippet_offset;

            // Deduplicate: skip if we already have a hit for the same file and line range
            let key = (path.clone(), actual_line_start, actual_line_end);
            if !seen.insert(key) {
                continue;
            }

            hits.push(SearchHit {
                path,
                line_start: actual_line_start,
                line_end: actual_line_end,
                snippet,
                score: normalized_score,
                is_chunk: !chunk_id.is_empty(),
                doc_id,
                match_type: MatchType::Text,
                match_line_in_snippet,
            });
        }

        let query_time_ms = start.elapsed().as_millis() as u64;
        let text_hits = hits.len();

        Ok(SearchResult {
            total: hits.len(),
            hits,
            query_time_ms,
            text_hits,
            semantic_hits: 0,
        })
    }
422}
423
/// Filters for search
#[derive(Debug, Clone, Default)]
pub struct SearchFilters {
    /// Filter by file extensions (e.g., ["rs", "ts"]); compared
    /// ASCII-case-insensitively against each hit's file extension
    pub extensions: Option<Vec<String>>,
    /// Filter by path patterns (plain prefix/substring, or glob when the
    /// pattern contains `*`/`?` — see `path_matches`)
    pub paths: Option<Vec<String>>,
}
432
433/// Extract text value from a document
434fn extract_text(doc: &tantivy::TantivyDocument, field: tantivy::schema::Field) -> Option<String> {
435    doc.get_first(field).and_then(|v| {
436        if let tantivy::schema::OwnedValue::Str(s) = v {
437            Some(s.to_string())
438        } else {
439            None
440        }
441    })
442}
443
444/// Extract u64 value from a document
445fn extract_u64(doc: &tantivy::TantivyDocument, field: tantivy::schema::Field) -> Option<u64> {
446    doc.get_first(field).and_then(|v| {
447        if let tantivy::schema::OwnedValue::U64(n) = v {
448            Some(*n)
449        } else {
450            None
451        }
452    })
453}
454
/// Create a snippet showing lines relevant to the query.
///
/// Returns `(snippet, snippet_offset, line_count, match_line_offset)` where
/// `snippet_offset` is the 0-based line index at which the snippet starts in
/// the chunk and `match_line_offset` is the 0-based line index of the best
/// matching line (term comparison is always case-insensitive).
fn create_relevant_snippet(
    content: &str,
    query: &str,
    max_lines: usize,
    ctx_before: Option<usize>,
    ctx_after: Option<usize>,
) -> (String, usize, usize, usize) {
    let lines: Vec<&str> = content.lines().collect();
    let query_lower = query.to_lowercase();
    let terms: Vec<&str> = query_lower.split_whitespace().collect();

    // Number of distinct query terms appearing on one (lowercased) line.
    let term_hits = |line: &str| -> usize {
        let lower = line.to_lowercase();
        terms.iter().filter(|t| lower.contains(*t)).count()
    };

    // Pick the earliest line with the greatest number of matching terms.
    // For a single-term query this is simply the first line containing it.
    let mut best: Option<(usize, usize)> = None; // (term_count, line_idx)
    for (idx, line) in lines.iter().enumerate() {
        let count = term_hits(line);
        if count > 0 && best.map_or(true, |(c, _)| count > c) {
            best = Some((count, idx));
        }
    }

    let Some((_, best_match)) = best else {
        // No line contains any term: fall back to the first `max_lines` lines.
        let snippet = lines
            .iter()
            .take(max_lines)
            .copied()
            .collect::<Vec<_>>()
            .join("\n");
        let line_count = snippet.lines().count();
        return (snippet, 0, line_count, 0);
    };

    // Context window around the best line.
    let before = ctx_before.unwrap_or(2);
    let after = ctx_after.unwrap_or_else(|| max_lines.saturating_sub(before + 1));
    let start = best_match.saturating_sub(before);
    let end = (best_match + after + 1).min(lines.len());

    (lines[start..end].join("\n"), start, end - start, best_match)
}
522
523/// Create a snippet showing lines relevant to a regex match
524/// Returns (snippet, snippet_offset, line_count, match_line_offset)
525fn create_regex_snippet(
526    content: &str,
527    regex: &regex::Regex,
528    max_lines: usize,
529    ctx_before: Option<usize>,
530    ctx_after: Option<usize>,
531) -> (String, usize, usize, usize) {
532    let lines: Vec<&str> = content.lines().collect();
533
534    // Find lines that match the regex
535    let mut matching_indices: Vec<usize> = Vec::new();
536    for (i, line) in lines.iter().enumerate() {
537        if regex.is_match(line) {
538            matching_indices.push(i);
539        }
540    }
541
542    if matching_indices.is_empty() {
543        // No direct line matches, but document matched - return first lines
544        let snippet = lines
545            .iter()
546            .take(max_lines)
547            .copied()
548            .collect::<Vec<_>>()
549            .join("\n");
550        let line_count = snippet.lines().count();
551        return (snippet, 0, line_count, 0);
552    }
553
554    // Get context around the first match
555    let first_match = matching_indices[0];
556    let context_before = ctx_before.unwrap_or(2);
557    let context_after = ctx_after.unwrap_or_else(|| max_lines.saturating_sub(context_before + 1));
558
559    let start = first_match.saturating_sub(context_before);
560    let end = (first_match + context_after + 1).min(lines.len());
561
562    let snippet = lines[start..end].join("\n");
563    let line_count = end - start;
564    (snippet, start, line_count, first_match)
565}
566
567/// Match a path against a pattern, supporting glob wildcards.
568///
569/// - If the pattern contains `*` or `?`, it is treated as a glob:
570///   - `*` matches any characters except `/`
571///   - `**` matches any characters including `/`
572///   - `?` matches any single character except `/`
573/// - Otherwise, falls back to prefix/contains matching.
574fn path_matches(pattern: &str, path: &str) -> bool {
575    if pattern.contains('*') || pattern.contains('?') {
576        glob_to_regex(pattern)
577            .map(|re| re.is_match(path))
578            .unwrap_or(false)
579    } else {
580        path.starts_with(pattern) || path.contains(pattern)
581    }
582}
583
584/// Convert a glob pattern to a compiled regex.
585fn glob_to_regex(pattern: &str) -> std::result::Result<regex::Regex, regex::Error> {
586    let mut re = String::with_capacity(pattern.len() * 2);
587    let chars: Vec<char> = pattern.chars().collect();
588    let mut i = 0;
589
590    while i < chars.len() {
591        if chars[i] == '*' && i + 1 < chars.len() && chars[i + 1] == '*' {
592            // ** matches anything including /
593            re.push_str(".*");
594            i += 2;
595            // Skip trailing / after **
596            if i < chars.len() && chars[i] == '/' {
597                re.push_str("/?");
598                i += 1;
599            }
600        } else if chars[i] == '*' {
601            // * matches anything except /
602            re.push_str("[^/]*");
603            i += 1;
604        } else if chars[i] == '?' {
605            re.push_str("[^/]");
606            i += 1;
607        } else {
608            // Escape regex metacharacters
609            let ch = chars[i];
610            if ".+(){}[]^$|\\".contains(ch) {
611                re.push('\\');
612            }
613            re.push(ch);
614            i += 1;
615        }
616    }
617
618    RegexBuilder::new(&re).case_insensitive(true).build()
619}
620
621#[cfg(test)]
622mod tests {
623    use super::*;
624    use crate::index::schema::build_document_schema;
625    use tantivy::doc;
626    use tempfile::tempdir;
627
628    /// Helper: create an index with the code tokenizer registered
629    fn create_test_index(path: &std::path::Path) -> (Index, SchemaFields) {
630        let schema = build_document_schema();
631        let index = Index::create_in_dir(path, schema.clone()).unwrap();
632        crate::index::register_tokenizers(index.tokenizers());
633        let fields = SchemaFields::new(&schema);
634        (index, fields)
635    }
636
637    /// Helper: add a document to an index
638    fn add_doc(
639        index: &Index,
640        fields: &SchemaFields,
641        doc_id: &str,
642        path: &str,
643        content: &str,
644        ext: &str,
645    ) {
646        let mut writer = index.writer(50_000_000).unwrap();
647        writer
648            .add_document(doc!(
649                fields.doc_id => doc_id,
650                fields.path => path,
651                fields.filepath.unwrap() => path,
652                fields.workspace => "/test",
653                fields.content => content,
654                fields.mtime => 0u64,
655                fields.size => content.len() as u64,
656                fields.extension => ext,
657                fields.line_start => 1u64,
658                fields.line_end => content.lines().count() as u64,
659                fields.chunk_id => "",
660                fields.parent_doc => ""
661            ))
662            .unwrap();
663        writer.commit().unwrap();
664    }
665
666    #[test]
667    fn test_basic_search() -> Result<()> {
668        let temp_dir = tempdir().unwrap();
669        let (index, fields) = create_test_index(temp_dir.path());
670        add_doc(
671            &index,
672            &fields,
673            "test1",
674            "src/main.rs",
675            "fn main() { println!(\"Hello, world!\"); }",
676            "rs",
677        );
678
679        let config = SearchConfig::default();
680        let searcher = Searcher::new(config, index);
681        let result = searcher.search("hello", None, false, None, None)?;
682
683        assert_eq!(result.hits.len(), 1);
684        assert_eq!(result.hits[0].path, "src/main.rs");
685
686        Ok(())
687    }
688
689    #[test]
690    fn test_case_insensitive_search() -> Result<()> {
691        let temp_dir = tempdir().unwrap();
692        let (index, fields) = create_test_index(temp_dir.path());
693        add_doc(
694            &index,
695            &fields,
696            "test1",
697            "src/lib.rs",
698            "fn greet() { println!(\"Hello World\"); }",
699            "rs",
700        );
701
702        let config = SearchConfig::default();
703        let searcher = Searcher::new(config, index);
704
705        // Uppercase query should find mixed-case content
706        let result = searcher.search("HELLO", None, false, None, None)?;
707        assert_eq!(result.hits.len(), 1);
708        assert_eq!(result.hits[0].path, "src/lib.rs");
709
710        Ok(())
711    }
712
713    #[test]
714    fn test_empty_query_returns_empty() -> Result<()> {
715        let temp_dir = tempdir().unwrap();
716        let (index, fields) = create_test_index(temp_dir.path());
717        add_doc(
718            &index,
719            &fields,
720            "test1",
721            "src/main.rs",
722            "fn main() {}",
723            "rs",
724        );
725
726        let config = SearchConfig::default();
727        let searcher = Searcher::new(config, index);
728
729        // Queries with no searchable terms should return empty
730        let result = searcher.search("...", None, false, None, None)?;
731        assert!(result.is_empty());
732
733        Ok(())
734    }
735
736    #[test]
737    fn test_regex_search_basic() -> Result<()> {
738        let temp_dir = tempdir().unwrap();
739        let (index, fields) = create_test_index(temp_dir.path());
740        add_doc(
741            &index,
742            &fields,
743            "test1",
744            "src/main.rs",
745            "fn hello_world() {\n    println!(\"Hello!\");\n}",
746            "rs",
747        );
748
749        let config = SearchConfig::default();
750        let searcher = Searcher::new(config, index);
751
752        let result = searcher.search_regex("hello.*world", None, false, None, None)?;
753        assert_eq!(result.hits.len(), 1);
754
755        Ok(())
756    }
757
758    #[test]
759    fn test_regex_search_invalid_returns_error() {
760        let temp_dir = tempdir().unwrap();
761        let (index, _fields) = create_test_index(temp_dir.path());
762
763        let config = SearchConfig::default();
764        let searcher = Searcher::new(config, index);
765
766        let result = searcher.search_regex("[invalid", None, false, None, None);
767        assert!(result.is_err());
768    }
769
770    #[test]
771    fn test_search_extension_filter() -> Result<()> {
772        let temp_dir = tempdir().unwrap();
773        let (index, fields) = create_test_index(temp_dir.path());
774        add_doc(
775            &index,
776            &fields,
777            "test1",
778            "src/main.rs",
779            "fn hello() {}",
780            "rs",
781        );
782        add_doc(
783            &index,
784            &fields,
785            "test2",
786            "src/main.py",
787            "def hello(): pass",
788            "py",
789        );
790
791        let config = SearchConfig::default();
792        let searcher = Searcher::new(config, index);
793
794        let filters = SearchFilters {
795            extensions: Some(vec!["rs".to_string()]),
796            paths: None,
797        };
798        let result =
799            searcher.search_filtered("hello", None, filters, false, false, None, None, false)?;
800
801        assert_eq!(result.hits.len(), 1);
802        assert_eq!(result.hits[0].path, "src/main.rs");
803
804        Ok(())
805    }
806
807    #[test]
808    fn test_search_path_filter() -> Result<()> {
809        let temp_dir = tempdir().unwrap();
810        let (index, fields) = create_test_index(temp_dir.path());
811        add_doc(
812            &index,
813            &fields,
814            "test1",
815            "src/main.rs",
816            "fn hello() {}",
817            "rs",
818        );
819        add_doc(
820            &index,
821            &fields,
822            "test2",
823            "lib/utils.rs",
824            "fn hello() {}",
825            "rs",
826        );
827
828        let config = SearchConfig::default();
829        let searcher = Searcher::new(config, index);
830
831        let filters = SearchFilters {
832            extensions: None,
833            paths: Some(vec!["lib/".to_string()]),
834        };
835        let result =
836            searcher.search_filtered("hello", None, filters, false, false, None, None, false)?;
837
838        assert_eq!(result.hits.len(), 1);
839        assert_eq!(result.hits[0].path, "lib/utils.rs");
840
841        Ok(())
842    }
843
844    #[test]
845    fn test_path_matches_glob() {
846        // Plain prefix/contains (no wildcards)
847        assert!(path_matches("src/", "src/main.rs"));
848        assert!(path_matches("src/", "project/src/main.rs"));
849        assert!(!path_matches("lib/", "src/main.rs"));
850
851        // Single * matches within one path segment
852        assert!(path_matches("src/*/tests/", "src/api/tests/foo.rs"));
853        assert!(path_matches("src/*/tests/", "src/core/tests/bar.rs"));
854        assert!(!path_matches("src/*/tests/", "src/a/b/tests/foo.rs"));
855
856        // ** matches across segments
857        assert!(path_matches("**/tests/", "src/api/tests/foo.rs"));
858        assert!(path_matches("**/tests/", "deep/nested/tests/bar.rs"));
859        assert!(path_matches("src/**/test.rs", "src/a/b/c/test.rs"));
860
861        // ? matches single character
862        assert!(path_matches("src/?.rs", "src/a.rs"));
863        assert!(!path_matches("src/?.rs", "src/ab.rs"));
864
865        // Glob patterns are case-insensitive
866        assert!(path_matches("SRC/*/tests/", "src/api/tests/foo.rs"));
867        // Plain prefix matching is case-sensitive (existing behavior)
868        assert!(!path_matches("SRC/", "src/main.rs"));
869    }
870
871    #[test]
872    fn test_search_path_filter_glob() -> Result<()> {
873        let temp_dir = tempdir().unwrap();
874        let (index, fields) = create_test_index(temp_dir.path());
875        add_doc(
876            &index,
877            &fields,
878            "test1",
879            "user/plugins/impersonate/tests/test.php",
880            "class FooTest extends Plugin {}",
881            "php",
882        );
883        add_doc(
884            &index,
885            &fields,
886            "test2",
887            "user/plugins/impersonate/src/plugin.php",
888            "class Plugin extends Base {}",
889            "php",
890        );
891        add_doc(
892            &index,
893            &fields,
894            "test3",
895            "user/plugins/auth/tests/test.php",
896            "class BarTest extends Plugin {}",
897            "php",
898        );
899
900        let config = SearchConfig::default();
901        let searcher = Searcher::new(config, index);
902
903        // Glob pattern should match only files in tests/ directories
904        let filters = SearchFilters {
905            extensions: None,
906            paths: Some(vec!["user/plugins/*/tests/".to_string()]),
907        };
908        let result = searcher.search_filtered(
909            "extends Plugin",
910            None,
911            filters,
912            false,
913            false,
914            None,
915            None,
916            false,
917        )?;
918
919        assert_eq!(result.hits.len(), 2);
920        assert!(result.hits.iter().all(|h| h.path.contains("/tests/")));
921
922        Ok(())
923    }
924
925    #[test]
926    fn test_multiple_results_ordered_by_score() -> Result<()> {
927        let temp_dir = tempdir().unwrap();
928        let (index, fields) = create_test_index(temp_dir.path());
929
930        // Document with more occurrences of "hello" should score higher
931        add_doc(
932            &index,
933            &fields,
934            "test1",
935            "src/many.rs",
936            "hello hello hello hello hello",
937            "rs",
938        );
939        add_doc(
940            &index,
941            &fields,
942            "test2",
943            "src/one.rs",
944            "hello world goodbye",
945            "rs",
946        );
947
948        let config = SearchConfig::default();
949        let searcher = Searcher::new(config, index);
950        let result = searcher.search("hello", None, false, None, None)?;
951
952        assert!(result.hits.len() >= 2);
953        // Results should be ordered by score descending
954        for pair in result.hits.windows(2) {
955            assert!(pair[0].score >= pair[1].score);
956        }
957
958        Ok(())
959    }
960
961    #[test]
962    fn test_dedup_full_doc_and_chunk() -> Result<()> {
963        let temp_dir = tempdir().unwrap();
964        let (index, fields) = create_test_index(temp_dir.path());
965
966        let content = "fn hello() {\n    println!(\"Hello, world!\");\n}";
967
968        // Add full document (empty chunk_id)
969        let mut writer = index.writer(50_000_000).unwrap();
970        writer
971            .add_document(doc!(
972                fields.doc_id => "full-doc",
973                fields.path => "src/main.rs",
974                fields.filepath.unwrap() => "src/main.rs",
975                fields.workspace => "/test",
976                fields.content => content,
977                fields.mtime => 0u64,
978                fields.size => content.len() as u64,
979                fields.extension => "rs",
980                fields.line_start => 1u64,
981                fields.line_end => 3u64,
982                fields.chunk_id => "",
983                fields.parent_doc => ""
984            ))
985            .unwrap();
986        // Add chunk with same content for the same file
987        writer
988            .add_document(doc!(
989                fields.doc_id => "chunk-1",
990                fields.path => "src/main.rs",
991                fields.filepath.unwrap() => "src/main.rs",
992                fields.workspace => "/test",
993                fields.content => content,
994                fields.mtime => 0u64,
995                fields.size => content.len() as u64,
996                fields.extension => "rs",
997                fields.line_start => 1u64,
998                fields.line_end => 3u64,
999                fields.chunk_id => "chunk-1",
1000                fields.parent_doc => "full-doc"
1001            ))
1002            .unwrap();
1003        writer.commit().unwrap();
1004
1005        let config = SearchConfig::default();
1006        let searcher = Searcher::new(config, index);
1007
1008        // Text search should return only 1 hit (deduplicated)
1009        let result = searcher.search("hello", None, false, None, None)?;
1010        assert_eq!(result.hits.len(), 1);
1011        assert_eq!(result.hits[0].path, "src/main.rs");
1012
1013        // Regex search should also return only 1 hit
1014        let result = searcher.search_regex("hello", None, false, None, None)?;
1015        assert_eq!(result.hits.len(), 1);
1016        assert_eq!(result.hits[0].path, "src/main.rs");
1017
1018        Ok(())
1019    }
1020
1021    #[test]
1022    fn test_filename_search() -> Result<()> {
1023        let temp_dir = tempdir().unwrap();
1024        let (index, fields) = create_test_index(temp_dir.path());
1025
1026        // Add a file where content does NOT contain the search term,
1027        // but the filename does
1028        add_doc(
1029            &index,
1030            &fields,
1031            "test1",
1032            "src/commands/dashboard.rs",
1033            "fn run() {\n    println!(\"starting...\");\n}",
1034            "rs",
1035        );
1036        add_doc(
1037            &index,
1038            &fields,
1039            "test2",
1040            "src/main.rs",
1041            "fn main() { hello(); }",
1042            "rs",
1043        );
1044
1045        let config = SearchConfig::default();
1046        let searcher = Searcher::new(config, index);
1047
1048        // Search for "dashboard" - should find via filename even though content doesn't contain it
1049        let result = searcher.search("dashboard", None, false, None, None)?;
1050        assert_eq!(result.hits.len(), 1);
1051        assert_eq!(result.hits[0].path, "src/commands/dashboard.rs");
1052
1053        Ok(())
1054    }
1055
1056    #[test]
1057    fn test_text_hits_consistent_after_filter() -> Result<()> {
1058        // Issue #10: text_hits should reflect post-filter count, not pre-filter
1059        let temp_dir = tempdir().unwrap();
1060        let (index, fields) = create_test_index(temp_dir.path());
1061        add_doc(
1062            &index,
1063            &fields,
1064            "test1",
1065            "src/main.rs",
1066            "fn hello() {}",
1067            "rs",
1068        );
1069        add_doc(
1070            &index,
1071            &fields,
1072            "test2",
1073            "src/main.py",
1074            "def hello(): pass",
1075            "py",
1076        );
1077        add_doc(
1078            &index,
1079            &fields,
1080            "test3",
1081            "lib/utils.js",
1082            "function hello() {}",
1083            "js",
1084        );
1085
1086        let config = SearchConfig::default();
1087        let searcher = Searcher::new(config, index);
1088
1089        // Filter to only .rs files - should get 1 hit, and text_hits must equal total
1090        let filters = SearchFilters {
1091            extensions: Some(vec!["rs".to_string()]),
1092            paths: None,
1093        };
1094        let result =
1095            searcher.search_filtered("hello", None, filters, false, false, None, None, false)?;
1096
1097        assert_eq!(result.total, 1);
1098        assert_eq!(result.text_hits, 1);
1099        assert_eq!(result.text_hits, result.total);
1100
1101        // Filter to a path that matches nothing - should get 0 hits with text_hits = 0
1102        let filters = SearchFilters {
1103            extensions: None,
1104            paths: Some(vec!["nonexistent/".to_string()]),
1105        };
1106        let result =
1107            searcher.search_filtered("hello", None, filters, false, false, None, None, false)?;
1108
1109        assert_eq!(result.total, 0);
1110        assert_eq!(result.text_hits, 0);
1111
1112        Ok(())
1113    }
1114}