Skip to main content

reflex/query/
mod.rs

1//! Query engine for searching indexed code
2//!
3//! The query engine loads the memory-mapped cache and executes
4//! deterministic searches based on lexical, structural, or symbol patterns.
5
6pub mod filter;
7pub mod result;
8
9pub use filter::QueryFilter;
10
11use anyhow::{Context, Result};
12use regex::Regex;
13
14use crate::cache::CacheManager;
15use crate::content_store::ContentReader;
16use crate::models::{
17    IndexStatus, IndexWarning, IndexWarningDetails, Language, QueryResponse, SearchResult, Span,
18    SymbolKind,
19};
20use crate::output;
21use crate::parsers::ParserFactory;
22use crate::regex_trigrams::extract_trigrams_from_regex;
23use crate::trigram::TrigramIndex;
24
/// Manages query execution against the index.
///
/// The engine owns a [`CacheManager`] and uses it for everything it needs from
/// the on-disk index: existence and integrity checks before a search, file-id
/// lookups, index statistics, and locating the content store next to the cache.
pub struct QueryEngine {
    // Handle to the index cache that every query in this engine reads from.
    cache: CacheManager,
}
29
30impl QueryEngine {
31    /// Create a new query engine with the given cache manager
32    pub fn new(cache: CacheManager) -> Self {
33        Self { cache }
34    }
35
36    /// Load dependencies for search results if requested (legacy - per result)
37    /// Deprecated: Use group_and_load_dependencies for file-level grouping
38    fn load_dependencies(&self, results: &mut [SearchResult], include_deps: bool) -> Result<()> {
39        if !include_deps || results.is_empty() {
40            return Ok(());
41        }
42
43        log::debug!("Loading dependencies for {} results", results.len());
44
45        // Create dependency index
46        // Note: We need to pass the workspace root, not the cache directory
47        // The cache path is .reflex/, so its parent is the workspace root (.)
48        let workspace_root = self.cache.path().parent()
49            .ok_or_else(|| anyhow::anyhow!("Cache path has no parent"))?;
50        let cache_for_deps = CacheManager::new(workspace_root);
51        let dep_index = crate::dependency::DependencyIndex::new(cache_for_deps);
52
53        // Load dependencies for each result
54        for result in results {
55            // Normalize path: strip leading "./" if present
56            let normalized_path = result.path.strip_prefix("./").unwrap_or(&result.path);
57
58            // Get file_id from database by path
59            match self.cache.get_file_id(normalized_path) {
60                Ok(Some(file_id)) => {
61                    log::debug!("Found file_id={} for path={}", file_id, result.path);
62                    // Get dependencies for this file
63                    match dep_index.get_dependencies_info(file_id) {
64                        Ok(dep_infos) => {
65                            log::debug!("Loaded {} dependencies for file_id={}", dep_infos.len(), file_id);
66                            if !dep_infos.is_empty() {
67                                result.dependencies = Some(dep_infos);
68                            }
69                        }
70                        Err(e) => {
71                            log::warn!("Failed to get dependencies for file_id={}: {}", file_id, e);
72                        }
73                    }
74                }
75                Ok(None) => {
76                    log::warn!("No file_id found for path: {}", result.path);
77                }
78                Err(e) => {
79                    log::warn!("Failed to get file_id for path {}: {}", result.path, e);
80                }
81            }
82        }
83
84        Ok(())
85    }
86
87    /// Group search results by file and load dependencies at file level
88    /// Returns file-grouped results with dependencies populated once per file
89    fn group_and_load_dependencies(
90        &self,
91        results: Vec<SearchResult>,
92        include_deps: bool,
93        context_lines: usize,
94    ) -> Result<Vec<crate::models::FileGroupedResult>> {
95        use std::collections::HashMap;
96        use crate::models::{FileGroupedResult, MatchResult};
97
98        if results.is_empty() {
99            return Ok(Vec::new());
100        }
101
102        // Group results by file path (preserving language from first match)
103        let mut grouped: HashMap<String, Vec<SearchResult>> = HashMap::new();
104        for result in results {
105            grouped
106                .entry(result.path.clone())
107                .or_default()
108                .push(result);
109        }
110
111        // Create dependency index if needed
112        let dep_index = if include_deps {
113            let workspace_root = self.cache.path().parent()
114                .ok_or_else(|| anyhow::anyhow!("Cache path has no parent"))?;
115            let cache_for_deps = CacheManager::new(workspace_root);
116            Some(crate::dependency::DependencyIndex::new(cache_for_deps))
117        } else {
118            None
119        };
120
121        // Load ContentReader for extracting context lines
122        let content_path = self.cache.path().join("content.bin");
123        let content_reader_opt = ContentReader::open(&content_path).ok();
124
125        // Convert to FileGroupedResult and load dependencies
126        let mut file_results: Vec<FileGroupedResult> = grouped
127            .into_iter()
128            .map(|(path, file_matches)| {
129                // Capture language from first match (all matches in a file share the same language)
130                let language = file_matches.first().map(|r| r.lang).unwrap_or_default();
131
132                // Load dependencies for this file (once per file, not per result)
133                let dependencies = if let Some(dep_idx) = &dep_index {
134                    let normalized_path = path.strip_prefix("./").unwrap_or(&path);
135                    match self.cache.get_file_id(normalized_path) {
136                        Ok(Some(file_id)) => {
137                            match dep_idx.get_dependencies_info(file_id) {
138                                Ok(dep_infos) if !dep_infos.is_empty() => {
139                                    log::debug!("Loaded {} dependencies for file: {}", dep_infos.len(), path);
140                                    Some(dep_infos)
141                                }
142                                Ok(_) => None,
143                                Err(e) => {
144                                    log::warn!("Failed to get dependencies for {}: {}", path, e);
145                                    None
146                                }
147                            }
148                        }
149                        Ok(None) => {
150                            log::warn!("No file_id found for path: {}", path);
151                            None
152                        }
153                        Err(e) => {
154                            log::warn!("Failed to get file_id for path {}: {}", path, e);
155                            None
156                        }
157                    }
158                } else {
159                    None
160                };
161
162                // Get file_id for context extraction
163                // Note: We use ContentReader's get_file_id_by_path() which returns array indices,
164                // not database file_ids (which are AUTO INCREMENT values)
165                let normalized_path = path.strip_prefix("./").unwrap_or(&path);
166                let file_id_for_context = if let Some(reader) = &content_reader_opt {
167                    reader.get_file_id_by_path(normalized_path)
168                } else {
169                    None
170                };
171                log::debug!("Context extraction: file={}, file_id={:?}, content_reader={}",
172                    path, file_id_for_context, content_reader_opt.is_some());
173
174                // Convert SearchResults to MatchResults (strip path and dependencies) and extract context
175                let matches: Vec<MatchResult> = file_matches
176                    .into_iter()
177                    .map(|r| {
178                        // Extract context lines if requested (0 = disabled)
179                        let (context_before, context_after) = if context_lines > 0 {
180                            if let (Some(reader), Some(fid)) = (&content_reader_opt, file_id_for_context) {
181                                let result = reader.get_context_by_line(fid as u32, r.span.start_line, context_lines)
182                                    .unwrap_or_else(|e| {
183                                        log::warn!("Failed to extract context for {}:{}: {}", path, r.span.start_line, e);
184                                        (vec![], vec![])
185                                    });
186                                log::debug!("Extracted context for {}:{} - before: {}, after: {}",
187                                    path, r.span.start_line, result.0.len(), result.1.len());
188                                result
189                            } else {
190                                if content_reader_opt.is_none() {
191                                    log::debug!("No ContentReader available for context extraction");
192                                }
193                                if file_id_for_context.is_none() {
194                                    log::debug!("No file_id found for {}", path);
195                                }
196                                (vec![], vec![])
197                            }
198                        } else {
199                            (vec![], vec![])
200                        };
201
202                        MatchResult {
203                            kind: r.kind,
204                            symbol: r.symbol,
205                            span: r.span,
206                            preview: r.preview,
207                            context_before,
208                            context_after,
209                        }
210                    })
211                    .collect();
212
213                FileGroupedResult {
214                    path,
215                    language,
216                    dependencies,
217                    matches,
218                }
219            })
220            .collect();
221
222        // Sort by path for deterministic output
223        file_results.sort_by(|a, b| a.path.cmp(&b.path));
224
225        Ok(file_results)
226    }
227
228    /// Execute a query and return matching results with index metadata
229    ///
230    /// This is the preferred method for programmatic/JSON output as it includes
231    /// index freshness information that AI agents can use to decide whether to re-index.
232    pub fn search_with_metadata(&self, pattern: &str, filter: QueryFilter) -> Result<QueryResponse> {
233        log::info!("Executing query with metadata: pattern='{}', filter={:?}", pattern, filter);
234
235        // Ensure cache exists
236        if !self.cache.exists() {
237            anyhow::bail!(
238                "Index not found. Run 'rfx index' to build the cache first."
239            );
240        }
241
242        // Validate cache integrity
243        if let Err(e) = self.cache.validate() {
244            anyhow::bail!(
245                "Cache appears to be corrupted: {}. Run 'rfx clear' followed by 'rfx index' to rebuild.",
246                e
247            );
248        }
249
250        // Get index status and warning (without printing warnings to stderr)
251        let (status, can_trust_results, warning) = self.get_index_status()?;
252
253        // Execute the search
254        let (results, total) = self.search_internal(pattern, filter.clone())?;
255
256        // Build pagination metadata
257        use crate::models::PaginationInfo;
258        let pagination = PaginationInfo {
259            total,
260            count: results.len(),
261            offset: filter.offset.unwrap_or(0),
262            limit: filter.limit,
263            has_more: total > filter.offset.unwrap_or(0) + results.len(),
264        };
265
266        // Always use grouped format (group results by file)
267        // Dependencies are loaded only when include_dependencies is true
268        let grouped_results = self.group_and_load_dependencies(results, filter.include_dependencies, filter.context_lines)?;
269
270        Ok(QueryResponse {
271            ai_instruction: None,  // AI instruction is generated by CLI/MCP layer, not here
272            status,
273            can_trust_results,
274            warning,
275            pagination,
276            results: grouped_results,
277        })
278    }
279
280    /// Execute a query and return matching results (legacy method)
281    ///
282    /// This method prints warnings to stderr and returns just the results.
283    /// For programmatic use, prefer `search_with_metadata()`.
284    pub fn search(&self, pattern: &str, filter: QueryFilter) -> Result<Vec<SearchResult>> {
285        log::info!("Executing query: pattern='{}', filter={:?}", pattern, filter);
286
287        // Ensure cache exists
288        if !self.cache.exists() {
289            anyhow::bail!(
290                "Index not found. Run 'rfx index' to build the cache first."
291            );
292        }
293
294        // Validate cache integrity
295        if let Err(e) = self.cache.validate() {
296            anyhow::bail!(
297                "Cache appears to be corrupted: {}. Run 'rfx clear' followed by 'rfx index' to rebuild.",
298                e
299            );
300        }
301
302        // Show non-blocking warnings about branch state and staleness
303        self.check_index_freshness(&filter)?;
304
305        // Execute the search (discard total count - legacy method doesn't use it)
306        let (mut results, _total_count) = self.search_internal(pattern, filter.clone())?;
307
308        // Load dependencies if requested
309        self.load_dependencies(&mut results, filter.include_dependencies)?;
310
311        Ok(results)
312    }
313
314    /// Internal search implementation (used by both search methods)
315    /// Returns (results, total_count) where total_count is the count before offset/limit
316    fn search_internal(&self, pattern: &str, filter: QueryFilter) -> Result<(Vec<SearchResult>, usize)> {
317        use std::time::{Duration, Instant};
318
319        // Start timeout timer if configured
320        let start_time = Instant::now();
321        let timeout = if filter.timeout_secs > 0 {
322            Some(Duration::from_secs(filter.timeout_secs))
323        } else {
324            None
325        };
326
327        // KEYWORD DETECTION (early): Check if this is a keyword query that should scan ALL files
328        // When a user searches for a language keyword (like "class", "function") with --symbols or --kind,
329        // we interpret it as "list all symbols of that type" and should scan ALL files,
330        // not just the first 100 candidates from trigram search.
331        //
332        // Requirements for keyword query mode:
333        // 1. Symbol mode active (--symbols or --kind)
334        // 2. Pattern matches a keyword in ANY supported language
335        //
336        // Note: --lang is optional. If specified, language filtering happens naturally in Phase 2/3.
337        // Empty pattern in symbol mode means "list all symbols of the requested kind" —
338        // treat it like a keyword query so we scan all files instead of failing the
339        // broad-query guard or returning zero trigram matches.
340        let is_keyword_query = if filter.symbols_mode || filter.kind.is_some() {
341            pattern.is_empty() || ParserFactory::get_all_keywords().contains(&pattern)
342        } else {
343            false
344        };
345
346        // KEYWORD-TO-KIND MAPPING: If user searches for a keyword without --kind, infer the kind
347        // Example: "class" → SymbolKind::Class, "function" → SymbolKind::Function
348        // This ensures keyword queries return only the relevant symbol type
349        let mut filter = filter.clone();  // Clone so we can modify it
350        if is_keyword_query && filter.kind.is_none() {
351            if let Some(inferred_kind) = Self::keyword_to_kind(pattern) {
352                log::info!("Keyword '{}' mapped to kind {:?} (auto-inferred)", pattern, inferred_kind);
353                filter.kind = Some(inferred_kind);
354            }
355        }
356
357        // EARLY BROAD QUERY DETECTION (Index Size Check)
358        // This check happens BEFORE the expensive trigram search to prevent hangs on large indexes
359        // For very large codebases (like Linux kernel with 62K files), even valid 3-char trigrams
360        // like "get" can take 10-30+ seconds to search. This early check prevents that hang.
361        //
362        // Criteria for early blocking:
363        // 1. Large index (> 20,000 files) AND
364        // 2. Short pattern (< 4 chars) AND
365        // 3. Not using regex (regex has its own trigram extraction) AND
366        // 4. Not a keyword query (keywords are intentionally broad) AND
367        // 5. Not forced by --force flag
368        if !filter.force && !filter.use_regex && !is_keyword_query {
369            let stats = self.cache.stats()?;
370            let total_files = stats.total_files;
371            let pattern_len = pattern.chars().count();
372
373            // Thresholds for early blocking:
374            // - Large index: 20,000+ files (approximately where performance degrades significantly)
375            // - Short pattern: < 4 chars (3-char trigrams are borderline, < 4 catches edge cases)
376            // Test overrides allow reducing thresholds for integration tests without creating 20K+ files
377            let large_index_threshold = filter.test_large_index_threshold.unwrap_or(20_000);
378            let short_pattern_threshold = filter.test_short_pattern_threshold.unwrap_or(4);
379
380            if total_files > large_index_threshold && pattern_len < short_pattern_threshold {
381                anyhow::bail!(
382                    "Query too broad - would be expensive to execute on this large index\n\
383                     \n\
384                     This index contains {} files, and pattern '{}' ({} characters) is too short for efficient searching.\n\
385                     On large codebases, short patterns can take 10-30+ seconds to complete.\n\
386                     \n\
387                     This query could:\n\
388                     • Hang for an extended period before returning results\n\
389                     • Return thousands of results\n\
390                     • Flood LLM context windows with excessive data\n\
391                     • Fail entirely\n\
392                     \n\
393                     Suggestions to narrow the query:\n\
394                     • Use a longer, more specific pattern (4+ characters recommended for large indexes)\n\
395                     • Add a language filter: --lang <language>\n\
396                     • Add a file filter: --glob <pattern> or --file <path>\n\
397                     • Use --force to bypass this check if you really need all results\n\
398                     \n\
399                     To force execution anyway:\n\
400                     rfx query \"{}\" --force",
401                    total_files,
402                    pattern,
403                    pattern_len,
404                    pattern
405                );
406            }
407        }
408
409        // PHASE 1: Get initial candidates (choose search strategy)
410        let mut results = if is_keyword_query {
411            // KEYWORD QUERY MODE: Scan all files (or files of target language if --lang specified)
412            // This ensures we find ALL classes/functions/etc, not just those in the first 100 trigram matches
413            if let Some(lang) = filter.language {
414                log::info!("Keyword query detected for '{}' - scanning all {:?} files (bypassing trigram search)",
415                          pattern, lang);
416            } else {
417                log::info!("Keyword query detected for '{}' - scanning all files (bypassing trigram search)", pattern);
418            }
419            self.get_all_language_files(&filter)?
420        } else if filter.use_regex {
421            // Regex pattern search with trigram optimization
422            self.get_regex_candidates(pattern, timeout.as_ref(), &start_time, filter.suppress_output)?
423        } else {
424            // Standard trigram-based full-text search
425            self.get_trigram_candidates(pattern, &filter)?
426        };
427
428        // EARLY LANGUAGE FILTER: Apply language filtering BEFORE broad query check
429        // This ensures we only parse files matching the language filter in Phase 2
430        // Critical for non-keyword queries to work correctly with accurate candidate counts
431        //
432        // Skip for keyword queries - those candidates are already pre-filtered by language
433        if !is_keyword_query {
434            if let Some(lang) = filter.language {
435                let before_count = results.len();
436                results.retain(|r| r.lang == lang);
437                log::debug!(
438                    "Language filter ({:?}): reduced {} candidates to {} candidates",
439                    lang,
440                    before_count,
441                    results.len()
442                );
443            }
444        }
445
446        // EARLY GLOB PATTERN FILTER: Apply glob/exclude filtering BEFORE broad query check
447        // This ensures candidate count reflects actual files that will be parsed
448        // Critical for queries like: rfx query "index" --symbols --glob "src/**/*.rs"
449        if !filter.glob_patterns.is_empty() || !filter.exclude_patterns.is_empty() {
450            use globset::{Glob, GlobSetBuilder};
451
452            // Build include matcher (if patterns specified)
453            let include_matcher = if !filter.glob_patterns.is_empty() {
454                let mut builder = GlobSetBuilder::new();
455                for pattern in &filter.glob_patterns {
456                    // Normalize pattern to ensure LLM-generated patterns work correctly
457                    let normalized = Self::normalize_glob_pattern(pattern);
458                    match Glob::new(&normalized) {
459                        Ok(glob) => {
460                            builder.add(glob);
461                        }
462                        Err(e) => {
463                            log::warn!("Invalid glob pattern '{}': {}", pattern, e);
464                        }
465                    }
466                }
467                match builder.build() {
468                    Ok(matcher) => Some(matcher),
469                    Err(e) => {
470                        log::warn!("Failed to build glob matcher: {}", e);
471                        None
472                    }
473                }
474            } else {
475                None
476            };
477
478            // Build exclude matcher (if patterns specified)
479            let exclude_matcher = if !filter.exclude_patterns.is_empty() {
480                let mut builder = GlobSetBuilder::new();
481                for pattern in &filter.exclude_patterns {
482                    // Normalize pattern to ensure LLM-generated patterns work correctly
483                    let normalized = Self::normalize_glob_pattern(pattern);
484                    match Glob::new(&normalized) {
485                        Ok(glob) => {
486                            builder.add(glob);
487                        }
488                        Err(e) => {
489                            log::warn!("Invalid exclude pattern '{}': {}", pattern, e);
490                        }
491                    }
492                }
493                match builder.build() {
494                    Ok(matcher) => Some(matcher),
495                    Err(e) => {
496                        log::warn!("Failed to build exclude matcher: {}", e);
497                        None
498                    }
499                }
500            } else {
501                None
502            };
503
504            // Apply filters
505            let before_count = results.len();
506            results.retain(|r| {
507                // If include patterns specified, path must match at least one
508                let included = if let Some(ref matcher) = include_matcher {
509                    matcher.is_match(&r.path)
510                } else {
511                    true // No include patterns = include all
512                };
513
514                // If exclude patterns specified, path must NOT match any
515                let excluded = if let Some(ref matcher) = exclude_matcher {
516                    matcher.is_match(&r.path)
517                } else {
518                    false // No exclude patterns = exclude none
519                };
520
521                included && !excluded
522            });
523            log::debug!(
524                "Glob filter: reduced {} candidates to {} candidates",
525                before_count,
526                results.len()
527            );
528        }
529
530        // Check timeout after Phase 1
531        if let Some(timeout_duration) = timeout {
532            if start_time.elapsed() > timeout_duration {
533                anyhow::bail!(
534                    "Query timeout exceeded ({} seconds).\n\
535                     \n\
536                     The query took too long to complete. Try one of these approaches:\n\
537                     • Use a more specific search pattern (longer patterns = faster search)\n\
538                     • Add a language filter with --lang to narrow the search space\n\
539                     • Add a file filter with --file to search specific directories\n\
540                     • Increase the timeout with --timeout <seconds>\n\
541                     \n\
542                     Example: rfx query \"{}\" --lang rust --timeout 60",
543                    filter.timeout_secs,
544                    pattern
545                );
546            }
547        }
548
549        // BROAD QUERY DETECTION: Check if query is too expensive BEFORE parsing
550        // This protects LLM users from accidentally running expensive queries that flood context windows
551        if !filter.force {
552            let candidate_count = results.len();
553            let pattern_len = pattern.chars().count();
554
555            // Condition 1: Pattern too short (< 3 chars can't use trigram optimization efficiently)
556            // Exception: Allow short keyword queries (e.g., "fn", "if") since they scan all language files
557            let is_short_pattern = pattern_len < 3 && !filter.use_regex && !is_keyword_query;
558
559            // Condition 2: AST query without glob restriction on large codebases
560            // Allow on small codebases (< 100 files) but require glob for larger ones
561            let is_broad_ast = filter.use_ast && filter.glob_patterns.is_empty() && candidate_count >= 100;
562
563            // Condition 3: Query-type-aware threshold for symbol/AST parsing
564            // Different thresholds based on actual performance characteristics:
565            // - AST without glob: 100 files (allow small codebases, block large ones)
566            // - AST with glob: 10,000 files (~5 seconds max)
567            // - Keyword queries: 20,000 files (~3 seconds max) - scan all files of language
568            // - Trigram-filtered symbols: 50,000 files (~5 seconds max) - very fast due to trigram filtering
569            let threshold = if filter.use_ast && filter.glob_patterns.is_empty() {
570                100  // AST without glob - allow small codebases
571            } else if filter.use_ast {
572                10_000  // AST with glob restriction
573            } else if is_keyword_query {
574                20_000  // Keyword queries (e.g., "class", "function")
575            } else {
576                50_000  // Trigram-filtered symbol queries
577            };
578
579            let has_many_candidates = candidate_count > threshold &&
580                                     (filter.symbols_mode || filter.kind.is_some() || filter.use_ast);
581
582            if is_short_pattern || has_many_candidates || is_broad_ast {
583                let reason = if is_short_pattern {
584                    format!("Pattern '{}' is too short ({} characters). Short patterns bypass trigram optimization and require scanning many files.", pattern, pattern_len)
585                } else if is_broad_ast {
586                    format!("AST query without --glob restriction will scan the entire codebase ({} files). AST queries are SLOW (500ms-10s+).", candidate_count)
587                } else if is_keyword_query {
588                    format!("Keyword query '{}' matched {} files. This query scans all files of the target language, which will take significant time and produce excessive results.", pattern, candidate_count)
589                } else {
590                    format!("Query matched {} files. Parsing this many files with --symbols or --kind will take significant time and produce excessive results.", candidate_count)
591                };
592
593                let suggestions = if is_short_pattern {
594                    vec![
595                        "• Use a longer, more specific pattern (3+ characters recommended)",
596                        "• Add a language filter: --lang <language>",
597                        "• Add a file path filter: --file <path> or --glob <pattern>",
598                        "• Use --force to bypass this check if you really need all results"
599                    ]
600                } else if is_broad_ast {
601                    vec![
602                        "• Add --glob to restrict AST query to specific files: --glob 'src/**/*.rs'",
603                        "• Use --symbols instead (10-100x faster in 95% of cases)",
604                        "• Use --force to bypass this check if you need a full codebase scan"
605                    ]
606                } else if is_keyword_query {
607                    vec![
608                        "• Add a language filter to reduce files scanned: --lang <language>",
609                        "• Add glob patterns to search specific directories: --glob 'src/**/*.rs'",
610                        "• Add --kind to filter to specific symbol types: --kind function",
611                        "• Use a more specific pattern instead of a keyword",
612                        "• Use --force to bypass this check if you need all results"
613                    ]
614                } else {
615                    vec![
616                        "• Add a language filter to reduce candidate set: --lang <language>",
617                        "• Add glob patterns to search specific directories: --glob 'src/**/*.rs'",
618                        "• Use a more specific search pattern",
619                        "• Use --force to bypass this check if you need all results"
620                    ]
621                };
622
623                // Build the command snippet showing current flags
624                let mut cmd_flags = String::new();
625                if filter.symbols_mode {
626                    cmd_flags.push_str("--symbols ");
627                }
628                if let Some(ref lang) = filter.language {
629                    cmd_flags.push_str(&format!("--lang {:?} ", lang));
630                }
631                if let Some(ref kind) = filter.kind {
632                    cmd_flags.push_str(&format!("--kind {:?} ", kind));
633                }
634                if filter.use_ast {
635                    cmd_flags.push_str("--ast ");
636                }
637
638                anyhow::bail!(
639                    "Query too broad - would be expensive to execute\n\
640                     \n\
641                     {}\n\
642                     \n\
643                     This query could:\n\
644                     • Hang for an extended period before returning results\n\
645                     • Return thousands of results\n\
646                     • Flood LLM context windows with excessive data\n\
647                     • Fail entirely\n\
648                     \n\
649                     Suggestions to narrow the query:\n\
650                     {}\n\
651                     \n\
652                     To force execution anyway:\n\
653                     rfx query \"{}\" --force {}",
654                    reason,
655                    suggestions.join("\n             "),
656                    pattern,
657                    cmd_flags
658                );
659            }
660        }
661
662        // DETERMINISTIC SORTING: Sort candidates early for deterministic results
663        // This ensures results are always returned in the same order
664        if filter.symbols_mode || filter.kind.is_some() || filter.use_ast {
665            results.sort_by(|a, b| {
666                a.path.cmp(&b.path)
667                    .then_with(|| a.span.start_line.cmp(&b.span.start_line))
668            });
669
670            // Warn if many candidates need parsing (helps users refine queries)
671            let candidate_count = results.len();
672            if candidate_count > 1000 && !filter.suppress_output {
673                output::warn(&format!(
674                    "Pattern '{}' matched {} files - parsing may take some time. Consider using --file, --glob, or a more specific pattern to narrow the search.",
675                    pattern,
676                    candidate_count
677                ));
678            } else if candidate_count > 100 {
679                log::info!("Parsing {} candidate files for symbol extraction", candidate_count);
680            }
681        }
682
683        // PHASE 2: Enrich with symbol information or AST pattern matching (if needed)
684        if filter.use_ast {
685            // AST pattern matching: Execute Tree-sitter query on candidate files
686            results = self.enrich_with_ast(results, pattern, filter.language)?;
687        } else if filter.symbols_mode || filter.kind.is_some() {
688            // Symbol enrichment: Parse candidate files and extract symbol definitions
689            results = self.enrich_with_symbols(results, pattern, &filter)?;
690        }
691
692        // PHASE 3: Apply post-enrichment filters
693        // Note: Language and glob filters are applied in Phase 1 (before broad query check)
694        // Only kind, file_pattern, and exact filters are applied here
695
696        // Deduplicate symbols: the same source location can be emitted as both
697        // Function and Method by some parsers.  Keep the first hit for each
698        // (path, start_line, symbol_name) triple so --kind function doesn't
699        // return the same definition twice.
700        if filter.symbols_mode || filter.kind.is_some() {
701            let mut seen = std::collections::HashSet::<(String, usize, Option<String>)>::new();
702            results.retain(|r| seen.insert((r.path.clone(), r.span.start_line, r.symbol.clone())));
703        }
704
705        // Apply kind filter (only relevant for symbol searches)
706        // Special case: --kind function also includes methods (methods are functions in classes)
707        if let Some(ref kind) = filter.kind {
708            results.retain(|r| {
709                if matches!(kind, SymbolKind::Function) {
710                    // When searching for functions, also include methods
711                    matches!(r.kind, SymbolKind::Function | SymbolKind::Method)
712                } else {
713                    r.kind == *kind
714                }
715            });
716        }
717
718        // Apply file path filter (substring match)
719        if let Some(ref file_pattern) = filter.file_pattern {
720            results.retain(|r| r.path.contains(file_pattern));
721        }
722
723        // Apply exact name filter (only for symbol searches)
724        if filter.exact && filter.symbols_mode {
725            results.retain(|r| r.symbol.as_deref() == Some(pattern));
726        }
727
728        // Expand symbol bodies if requested
729        // Works for both symbol-mode and regex searches (if regex matched a symbol definition)
730        if filter.expand {
731            // Load content store to fetch full symbol bodies
732            let content_path = self.cache.path().join("content.bin");
733            if let Ok(content_reader) = ContentReader::open(&content_path) {
734                for result in &mut results {
735                    // Only expand if the result has a meaningful span (not just a single line)
736                    if result.span.start_line < result.span.end_line {
737                        // Find the file_id for this result's path
738                        if let Some(file_id) = Self::find_file_id(&content_reader, &result.path) {
739                            // Fetch the full span content
740                            if let Ok(content) = content_reader.get_file_content(file_id) {
741                                let lines: Vec<&str> = content.lines().collect();
742                                let start_idx = (result.span.start_line as usize).saturating_sub(1);
743                                let end_idx = (result.span.end_line as usize).min(lines.len());
744
745                                if start_idx < end_idx {
746                                    let full_body = lines[start_idx..end_idx].join("\n");
747                                    result.preview = full_body;
748                                }
749                            }
750                        }
751                    }
752                }
753            }
754        }
755
756        // Step 4: Deduplicate by path if paths-only mode
757        if filter.paths_only {
758            use std::collections::HashSet;
759            let mut seen_paths = HashSet::new();
760            results.retain(|r| seen_paths.insert(r.path.clone()));
761        }
762
763        // Step 5: Sort results deterministically (by path, then line number)
764        results.sort_by(|a, b| {
765            a.path.cmp(&b.path)
766                .then_with(|| a.span.start_line.cmp(&b.span.start_line))
767        });
768
769        // Capture total count AFTER all filtering but BEFORE pagination (offset/limit)
770        // This is the total number of results the user can paginate through
771        let total_count = results.len();
772
773        // Step 5.5: Apply offset (pagination)
774        if let Some(offset) = filter.offset {
775            if offset < results.len() {
776                results = results.into_iter().skip(offset).collect();
777            } else {
778                // Offset beyond results - return empty
779                results.clear();
780            }
781        }
782
783        // Step 6: Apply limit
784        if let Some(limit) = filter.limit {
785            results.truncate(limit);
786        }
787
788        log::info!("Query returned {} results (total before pagination: {})", results.len(), total_count);
789
790        Ok((results, total_count))
791    }
792
793    /// Search for symbols by exact name match
794    pub fn find_symbol(&self, name: &str) -> Result<Vec<SearchResult>> {
795        let filter = QueryFilter {
796            symbols_mode: true,
797            ..Default::default()
798        };
799        self.search(name, filter)
800    }
801
802    /// Search using a Tree-sitter AST pattern
803    pub fn search_ast(&self, pattern: &str, lang: Option<Language>) -> Result<Vec<SearchResult>> {
804        let filter = QueryFilter {
805            language: lang,
806            use_ast: true,
807            ..Default::default()
808        };
809
810        self.search(pattern, filter)
811    }
812
813    /// Execute AST query on all indexed files (no trigram filtering)
814    ///
815    /// WARNING: This method scans the entire codebase (500ms-2s+).
816    /// In 95% of cases, use --symbols instead which is 10-100x faster.
817    ///
818    /// # Algorithm
819    /// 1. Get all indexed files for the specified language
820    /// 2. Apply glob/exclude filters to reduce file set
821    /// 3. Load file contents for all matching files
822    /// 4. Execute AST query pattern using Tree-sitter
823    /// 5. Apply remaining filters and return results
824    ///
825    /// # Performance
826    /// - Parses entire codebase (not just trigram candidates)
827    /// - Expected: 500ms-2s for medium codebases, 2-10s for large codebases
828    /// - Use --glob to limit scope for better performance
829    ///
830    /// # Requirements
831    /// - Language must be specified (AST queries are language-specific)
832    /// - AST pattern must be valid S-expression syntax
833    pub fn search_ast_all_files(&self, ast_pattern: &str, filter: QueryFilter) -> Result<Vec<SearchResult>> {
834        log::info!("Executing AST query on all files: pattern='{}', filter={:?}", ast_pattern, filter);
835
836        // Require language for AST queries
837        let lang = filter.language.ok_or_else(|| anyhow::anyhow!(
838            "Language must be specified for AST pattern matching. Use --lang to specify the language.\n\
839             \n\
840             Example: rfx query \"(function_definition) @fn\" --ast --lang python"
841        ))?;
842
843        // Ensure cache exists
844        if !self.cache.exists() {
845            anyhow::bail!(
846                "Index not found. Run 'rfx index' to build the cache first."
847            );
848        }
849
850        // Show non-blocking warnings about branch state and staleness
851        self.check_index_freshness(&filter)?;
852
853        // Load content store
854        let content_path = self.cache.path().join("content.bin");
855        let content_reader = ContentReader::open(&content_path)
856            .context("Failed to open content store")?;
857
858        // Build glob matchers ONCE before file iteration (performance optimization)
859        use globset::{Glob, GlobSetBuilder};
860
861        let include_matcher = if !filter.glob_patterns.is_empty() {
862            let mut builder = GlobSetBuilder::new();
863            for pattern in &filter.glob_patterns {
864                // Normalize pattern to ensure LLM-generated patterns work correctly
865                let normalized = Self::normalize_glob_pattern(pattern);
866                if let Ok(glob) = Glob::new(&normalized) {
867                    builder.add(glob);
868                }
869            }
870            builder.build().ok()
871        } else {
872            None
873        };
874
875        let exclude_matcher = if !filter.exclude_patterns.is_empty() {
876            let mut builder = GlobSetBuilder::new();
877            for pattern in &filter.exclude_patterns {
878                // Normalize pattern to ensure LLM-generated patterns work correctly
879                let normalized = Self::normalize_glob_pattern(pattern);
880                if let Ok(glob) = Glob::new(&normalized) {
881                    builder.add(glob);
882                }
883            }
884            builder.build().ok()
885        } else {
886            None
887        };
888
889        // Get all files matching the language and glob filters
890        let mut candidates: Vec<SearchResult> = Vec::new();
891
892        for file_id in 0..content_reader.file_count() {
893            let file_path = match content_reader.get_file_path(file_id as u32) {
894                Some(p) => p,
895                None => continue,
896            };
897
898            // Detect language from file extension
899            let ext = file_path.extension()
900                .and_then(|e| e.to_str())
901                .unwrap_or("");
902            let detected_lang = Language::from_extension(ext);
903
904            // Filter by language
905            if detected_lang != lang {
906                continue;
907            }
908
909            let file_path_str = file_path.to_string_lossy().to_string();
910
911            // Apply glob/exclude filters BEFORE loading content (performance optimization)
912            let included = include_matcher.as_ref().map_or(true, |m| m.is_match(&file_path_str));
913            let excluded = exclude_matcher.as_ref().map_or(false, |m| m.is_match(&file_path_str));
914
915            if !included || excluded {
916                continue;
917            }
918
919            // Create a dummy candidate for this file (AST query will replace it)
920            candidates.push(SearchResult {
921                path: file_path_str,
922                lang: detected_lang,
923                span: Span { start_line: 1, end_line: 1 },
924                symbol: None,
925                kind: SymbolKind::Unknown("ast_query".to_string()),
926                preview: String::new(),
927                dependencies: None,
928            });
929        }
930
931        log::info!("AST query scanning {} files for language {:?}", candidates.len(), lang);
932
933        // BROAD QUERY DETECTION: Block large AST queries without glob restriction
934        // Allow small codebases (<100 files) but require --glob for larger ones
935        if !filter.force && filter.glob_patterns.is_empty() && candidates.len() >= 100 {
936            anyhow::bail!(
937                "Query too broad - would be expensive to execute\n\
938                 \n\
939                 AST query without --glob restriction will scan the ENTIRE codebase ({} files). AST queries are SLOW (500ms-10s+).\n\
940                 \n\
941                 This query could:\n\
942                 • Hang for an extended period before returning results\n\
943                 • Return thousands of results\n\
944                 • Flood LLM context windows with excessive data\n\
945                 • Fail entirely\n\
946                 \n\
947                 Suggestions to narrow the query:\n\
948                 • Add --glob to restrict AST query to specific files: --glob 'src/**/*.rs'\n\
949                 • Use --symbols instead (10-100x faster in 95% of cases)\n\
950                 • Use --force to bypass this check if you need a full codebase scan\n\
951                 \n\
952                 To force execution anyway:\n\
953                 rfx query \"{}\" --force --ast --lang {:?}",
954                candidates.len(),
955                ast_pattern,
956                lang
957            );
958        }
959
960        if candidates.is_empty() {
961            if !filter.suppress_output {
962                output::warn(&format!("No files found for language {:?}. Check your language filter or glob patterns.", lang));
963            }
964            return Ok(Vec::new());
965        }
966
967        // Execute the AST query on all candidate files
968        // This will load file contents and parse them with tree-sitter
969        let mut results = self.enrich_with_ast(candidates, ast_pattern, filter.language)?;
970
971        log::debug!("AST query found {} matches before filtering", results.len());
972
973        // Apply remaining filters (same as search_internal Phase 3)
974
975        // Apply kind filter
976        if let Some(ref kind) = filter.kind {
977            results.retain(|r| {
978                if matches!(kind, SymbolKind::Function) {
979                    matches!(r.kind, SymbolKind::Function | SymbolKind::Method)
980                } else {
981                    r.kind == *kind
982                }
983            });
984        }
985
986        // Note: exact filter doesn't make sense for AST queries (pattern is S-expression, not symbol name)
987
988        // Expand symbol bodies if requested
989        if filter.expand {
990            let content_path = self.cache.path().join("content.bin");
991            if let Ok(content_reader) = ContentReader::open(&content_path) {
992                for result in &mut results {
993                    if result.span.start_line < result.span.end_line {
994                        if let Some(file_id) = Self::find_file_id(&content_reader, &result.path) {
995                            if let Ok(content) = content_reader.get_file_content(file_id) {
996                                let lines: Vec<&str> = content.lines().collect();
997                                let start_idx = (result.span.start_line as usize).saturating_sub(1);
998                                let end_idx = (result.span.end_line as usize).min(lines.len());
999
1000                                if start_idx < end_idx {
1001                                    let full_body = lines[start_idx..end_idx].join("\n");
1002                                    result.preview = full_body;
1003                                }
1004                            }
1005                        }
1006                    }
1007                }
1008            }
1009        }
1010
1011        // Deduplicate by path if paths-only mode
1012        if filter.paths_only {
1013            use std::collections::HashSet;
1014            let mut seen_paths = HashSet::new();
1015            results.retain(|r| seen_paths.insert(r.path.clone()));
1016        }
1017
1018        // Sort results deterministically
1019        results.sort_by(|a, b| {
1020            a.path.cmp(&b.path)
1021                .then_with(|| a.span.start_line.cmp(&b.span.start_line))
1022        });
1023
1024        // Apply offset (pagination)
1025        if let Some(offset) = filter.offset {
1026            if offset < results.len() {
1027                results = results.into_iter().skip(offset).collect();
1028            } else {
1029                results.clear();
1030            }
1031        }
1032
1033        // Apply limit
1034        if let Some(limit) = filter.limit {
1035            results.truncate(limit);
1036        }
1037
1038        log::info!("AST query returned {} results", results.len());
1039
1040        // Load dependencies if requested
1041        self.load_dependencies(&mut results, filter.include_dependencies)?;
1042
1043        Ok(results)
1044    }
1045
1046    /// Search using AST pattern with separate text pattern for trigram filtering
1047    ///
1048    /// This allows efficient AST queries by:
1049    /// 1. Using text_pattern for Phase 1 trigram filtering (narrows to candidate files)
1050    /// 2. Using ast_pattern for Phase 2 AST matching (structure-aware filtering)
1051    ///
1052    /// # Example
1053    /// ```ignore
1054    /// // Find async functions: trigram search for "fn ", AST match for function_item
1055    /// engine.search_ast_with_text_filter("fn ", "(function_item (async))", filter)?;
1056    /// ```
1057    pub fn search_ast_with_text_filter(
1058        &self,
1059        text_pattern: &str,
1060        ast_pattern: &str,
1061        filter: QueryFilter,
1062    ) -> Result<Vec<SearchResult>> {
1063        log::info!("Executing AST query with text filter: text='{}', ast='{}', filter={:?}",
1064                   text_pattern, ast_pattern, filter);
1065
1066        // Ensure cache exists
1067        if !self.cache.exists() {
1068            anyhow::bail!(
1069                "Index not found. Run 'rfx index' to build the cache first."
1070            );
1071        }
1072
1073        // Show non-blocking warnings about branch state and staleness
1074        self.check_index_freshness(&filter)?;
1075
1076        // Start timeout timer if configured
1077        use std::time::{Duration, Instant};
1078        let start_time = Instant::now();
1079        let timeout = if filter.timeout_secs > 0 {
1080            Some(Duration::from_secs(filter.timeout_secs))
1081        } else {
1082            None
1083        };
1084
1085        // PHASE 1: Get initial candidates using text pattern (trigram search)
1086        let candidates = if filter.use_regex {
1087            self.get_regex_candidates(text_pattern, timeout.as_ref(), &start_time, filter.suppress_output)?
1088        } else {
1089            self.get_trigram_candidates(text_pattern, &filter)?
1090        };
1091
1092        log::debug!("Phase 1 found {} candidate locations", candidates.len());
1093
1094        // PHASE 2: Execute AST query on candidates
1095        let mut results = self.enrich_with_ast(candidates, ast_pattern, filter.language)?;
1096
1097        log::debug!("Phase 2 AST matching found {} results", results.len());
1098
1099        // PHASE 3: Apply filters
1100        if let Some(lang) = filter.language {
1101            results.retain(|r| r.lang == lang);
1102        }
1103
1104        if let Some(ref kind) = filter.kind {
1105            results.retain(|r| {
1106                if matches!(kind, SymbolKind::Function) {
1107                    matches!(r.kind, SymbolKind::Function | SymbolKind::Method)
1108                } else {
1109                    r.kind == *kind
1110                }
1111            });
1112        }
1113
1114        if let Some(ref file_pattern) = filter.file_pattern {
1115            results.retain(|r| r.path.contains(file_pattern));
1116        }
1117
1118        // Apply glob pattern filters (same logic as in search_internal)
1119        if !filter.glob_patterns.is_empty() || !filter.exclude_patterns.is_empty() {
1120            use globset::{Glob, GlobSetBuilder};
1121
1122            let include_matcher = if !filter.glob_patterns.is_empty() {
1123                let mut builder = GlobSetBuilder::new();
1124                for pattern in &filter.glob_patterns {
1125                    // Normalize pattern to ensure LLM-generated patterns work correctly
1126                    let normalized = Self::normalize_glob_pattern(pattern);
1127                    if let Ok(glob) = Glob::new(&normalized) {
1128                        builder.add(glob);
1129                    }
1130                }
1131                builder.build().ok()
1132            } else {
1133                None
1134            };
1135
1136            let exclude_matcher = if !filter.exclude_patterns.is_empty() {
1137                let mut builder = GlobSetBuilder::new();
1138                for pattern in &filter.exclude_patterns {
1139                    // Normalize pattern to ensure LLM-generated patterns work correctly
1140                    let normalized = Self::normalize_glob_pattern(pattern);
1141                    if let Ok(glob) = Glob::new(&normalized) {
1142                        builder.add(glob);
1143                    }
1144                }
1145                builder.build().ok()
1146            } else {
1147                None
1148            };
1149
1150            results.retain(|r| {
1151                let included = include_matcher.as_ref().map_or(true, |m| m.is_match(&r.path));
1152                let excluded = exclude_matcher.as_ref().map_or(false, |m| m.is_match(&r.path));
1153                included && !excluded
1154            });
1155        }
1156
1157        if filter.exact && filter.symbols_mode {
1158            results.retain(|r| r.symbol.as_deref() == Some(text_pattern));
1159        }
1160
1161        // Expand symbol bodies if requested
1162        if filter.expand {
1163            let content_path = self.cache.path().join("content.bin");
1164            if let Ok(content_reader) = ContentReader::open(&content_path) {
1165                for result in &mut results {
1166                    if result.span.start_line < result.span.end_line {
1167                        if let Some(file_id) = Self::find_file_id(&content_reader, &result.path) {
1168                            if let Ok(content) = content_reader.get_file_content(file_id) {
1169                                let lines: Vec<&str> = content.lines().collect();
1170                                let start_idx = (result.span.start_line as usize).saturating_sub(1);
1171                                let end_idx = (result.span.end_line as usize).min(lines.len());
1172
1173                                if start_idx < end_idx {
1174                                    let full_body = lines[start_idx..end_idx].join("\n");
1175                                    result.preview = full_body;
1176                                }
1177                            }
1178                        }
1179                    }
1180                }
1181            }
1182        }
1183
1184        // Sort results deterministically
1185        results.sort_by(|a, b| {
1186            a.path.cmp(&b.path)
1187                .then_with(|| a.span.start_line.cmp(&b.span.start_line))
1188        });
1189
1190        // Apply offset (pagination)
1191        if let Some(offset) = filter.offset {
1192            if offset < results.len() {
1193                results = results.into_iter().skip(offset).collect();
1194            } else {
1195                results.clear();
1196            }
1197        }
1198
1199        // Apply limit
1200        if let Some(limit) = filter.limit {
1201            results.truncate(limit);
1202        }
1203
1204        log::info!("AST query returned {} results", results.len());
1205
1206        Ok(results)
1207    }
1208
1209    /// List all symbols of a specific kind
1210    pub fn list_by_kind(&self, kind: SymbolKind) -> Result<Vec<SearchResult>> {
1211        let filter = QueryFilter {
1212            kind: Some(kind),
1213            symbols_mode: true,
1214            ..Default::default()
1215        };
1216
1217        self.search("*", filter)
1218    }
1219
1220    /// Enrich text match candidates with symbol information by parsing files
1221    ///
1222    /// Takes a list of text match candidates and extracts symbol information at those locations.
1223    ///
1224    /// # Algorithm
1225    /// 1. Group candidates by file_id for efficient processing
1226    /// 2. Parse each file with tree-sitter to extract ALL symbols
1227    /// 3. Filter symbols based on matching strategy:
1228    ///    - If use_regex=true: Extract symbols whose line spans overlap with candidate locations
1229    ///    - If use_contains=true: Filter symbols by substring match on symbol name
1230    ///    - Default: Filter symbols by exact name match
1231    /// 4. Return filtered symbol results
1232    ///
1233    /// # Performance
1234    /// Only parses files that have text matches, so typically 10-100 files
1235    /// instead of the entire codebase (62K+ files).
1236    ///
1237    /// # Optimizations
1238    /// 1. Language filtering: Skips files with unsupported languages (no parsers)
1239    /// 2. Parallel processing: Uses Rayon to parse files concurrently across CPU cores
1240    fn enrich_with_symbols(&self, candidates: Vec<SearchResult>, pattern: &str, filter: &QueryFilter) -> Result<Vec<SearchResult>> {
1241        // Load content store for file reading
1242        let content_path = self.cache.path().join("content.bin");
1243        let content_reader = ContentReader::open(&content_path)
1244            .context("Failed to open content store")?;
1245
1246        // Load trigram index for file path lookups
1247        let trigrams_path = self.cache.path().join("trigrams.bin");
1248        let trigram_index = if trigrams_path.exists() {
1249            TrigramIndex::load(&trigrams_path)?
1250        } else {
1251            Self::rebuild_trigram_index(&content_reader)?
1252        };
1253
1254        // Open symbol cache for reading cached symbols
1255        let symbol_cache = crate::symbol_cache::SymbolCache::open(self.cache.path())
1256            .context("Failed to open symbol cache")?;
1257
1258        // Load file hashes for current branch for cache lookups
1259        let root = self.cache.workspace_root();
1260        let branch = crate::git::get_current_branch(&root)
1261            .unwrap_or_else(|_| "_default".to_string());
1262        let file_hashes = self.cache.load_hashes_for_branch(&branch)
1263            .context("Failed to load file hashes")?;
1264        log::debug!("Loaded {} file hashes for branch '{}' for symbol cache lookups", file_hashes.len(), branch);
1265
1266        // Group candidates by file, filtering out unsupported languages
1267        use std::collections::HashMap;
1268        let mut files_by_path: HashMap<String, Vec<SearchResult>> = HashMap::new();
1269        let mut skipped_unsupported = 0;
1270
1271        for candidate in candidates {
1272            // Skip files with unsupported languages (no parser available)
1273            if !candidate.lang.is_supported() {
1274                skipped_unsupported += 1;
1275                continue;
1276            }
1277
1278            files_by_path
1279                .entry(candidate.path.clone())
1280                .or_insert_with(Vec::new)
1281                .push(candidate);
1282        }
1283
1284        let total_files = files_by_path.len();
1285        log::debug!("Processing {} candidate files for symbol enrichment (skipped {} unsupported language files)",
1286                   total_files, skipped_unsupported);
1287
1288        // Warn if pattern is very broad (may take time to parse all files)
1289        if total_files > 1000 && !filter.suppress_output {
1290            output::warn(&format!(
1291                "Pattern '{}' matched {} files. This may take some time to parse. Consider using a more specific pattern or adding --lang/--file filters to narrow the search.",
1292                pattern,
1293                total_files
1294            ));
1295        }
1296
1297        // Convert to vec for parallel processing
1298        let mut files_to_process: Vec<String> = files_by_path.keys().cloned().collect();
1299
1300        // PHASE 2a: Line-based pre-filtering (skip files where ALL matches are in comments/strings)
1301        // This reduces tree-sitter parsing workload by 2-5x for most queries
1302        let mut files_to_skip: std::collections::HashSet<String> = std::collections::HashSet::new();
1303
1304        for file_path in &files_to_process {
1305            // Get the language for this file
1306            let ext = std::path::Path::new(file_path)
1307                .extension()
1308                .and_then(|e| e.to_str())
1309                .unwrap_or("");
1310            let lang = Language::from_extension(ext);
1311
1312            // Get line filter for this language (if available)
1313            if let Some(line_filter) = crate::line_filter::get_filter(lang) {
1314                // Find file_id for this path
1315                let file_id = match Self::find_file_id_by_path(&content_reader, &trigram_index, file_path) {
1316                    Some(id) => id,
1317                    None => continue,
1318                };
1319
1320                // Load file content
1321                let content = match content_reader.get_file_content(file_id) {
1322                    Ok(c) => c,
1323                    Err(_) => continue,
1324                };
1325
1326                // Check if ALL pattern occurrences are in comments/strings
1327                let mut all_in_non_code = true;
1328                for line in content.lines() {
1329                    // Find all occurrences of the pattern in this line
1330                    let mut search_start = 0;
1331                    while let Some(pos) = line[search_start..].find(pattern) {
1332                        let absolute_pos = search_start + pos;
1333
1334                        // Check if this occurrence is in code (not comment/string)
1335                        let in_comment = line_filter.is_in_comment(line, absolute_pos);
1336                        let in_string = line_filter.is_in_string(line, absolute_pos);
1337
1338                        if !in_comment && !in_string {
1339                            // Found at least one occurrence in actual code
1340                            all_in_non_code = false;
1341                            break;
1342                        }
1343
1344                        search_start = absolute_pos + pattern.len();
1345                    }
1346
1347                    if !all_in_non_code {
1348                        break;
1349                    }
1350                }
1351
1352                // If ALL occurrences are in comments/strings, skip this file
1353                if all_in_non_code {
1354                    // Double-check: make sure there was at least one occurrence
1355                    if content.contains(pattern) {
1356                        files_to_skip.insert(file_path.clone());
1357                        log::debug!("Pre-filter: Skipping {} (all matches in comments/strings)", file_path);
1358                    }
1359                }
1360            }
1361        }
1362
1363        // Filter out files we're skipping
1364        files_to_process.retain(|path| !files_to_skip.contains(path));
1365
1366        log::debug!("Pre-filter: Skipped {} files where all matches are in comments/strings (parsing {} files)",
1367                   files_to_skip.len(), files_to_process.len());
1368
1369        // Configure thread pool for parallel processing (use 80% of available cores, capped at 8)
1370        let num_threads = {
1371            let available_cores = std::thread::available_parallelism()
1372                .map(|n| n.get())
1373                .unwrap_or(4);
1374            // Use 80% of available cores (minimum 1, maximum 8) to avoid locking the system
1375            // Cap at 8 to prevent diminishing returns from cache contention on high-core systems
1376            ((available_cores as f64 * 0.8).ceil() as usize).max(1).min(8)
1377        };
1378
1379        log::debug!("Using {} threads for parallel symbol extraction (out of {} available cores)",
1380                   num_threads,
1381                   std::thread::available_parallelism().map(|n| n.get()).unwrap_or(4));
1382
1383        // Build a custom thread pool with limited threads
1384        let pool = rayon::ThreadPoolBuilder::new()
1385            .num_threads(num_threads)
1386            .build()
1387            .context("Failed to create thread pool for symbol extraction")?;
1388
1389        // OPTIMIZATION: Batch read all cached symbols in ONE database transaction
1390        // This is 10-30x faster than calling get() individually for each file
1391
1392        // Step 1: Collect file paths that have hashes
1393        let files_with_hashes: Vec<String> = files_to_process
1394            .iter()
1395            .filter(|path| file_hashes.contains_key(path.as_str()))
1396            .cloned()
1397            .collect();
1398
1399        // Step 2: Batch lookup file_ids for all paths
1400        let file_id_map = self.cache.batch_get_file_ids(&files_with_hashes)
1401            .context("Failed to batch lookup file IDs")?;
1402
1403        // Step 3: Build (file_id, hash, path) tuples for batch_get_with_kind
1404        let file_lookup_tuples: Vec<(i64, String, String)> = files_with_hashes
1405            .iter()
1406            .filter_map(|path| {
1407                let file_id = file_id_map.get(path)?;
1408                let hash = file_hashes.get(path.as_str())?;
1409                Some((*file_id, hash.clone(), path.clone()))
1410            })
1411            .collect();
1412
1413        // Step 4: Batch read symbols with kind filtering (uses junction table + integer joins)
1414        let batch_results = symbol_cache.batch_get_with_kind(&file_lookup_tuples, filter.kind.clone())
1415            .context("Failed to batch read symbol cache")?;
1416
1417        // Step 5: Separate files into cached vs need-to-parse
1418        let mut cached_symbols: HashMap<String, Vec<SearchResult>> = HashMap::new();
1419        let mut files_needing_parse: Vec<String> = Vec::new();
1420
1421        // Build path lookup from file_id
1422        let id_to_path: HashMap<i64, String> = file_id_map
1423            .iter()
1424            .map(|(path, id)| (*id, path.clone()))
1425            .collect();
1426
1427        // Process cached results
1428        for (file_id, symbols) in batch_results {
1429            if let Some(file_path) = id_to_path.get(&file_id) {
1430                cached_symbols.insert(file_path.clone(), symbols);
1431            }
1432        }
1433
1434        // Files with hashes but not in cache results need parsing
1435        for path in &files_with_hashes {
1436            if file_id_map.contains_key(path) && !cached_symbols.contains_key(path) {
1437                files_needing_parse.push(path.clone());
1438            }
1439        }
1440
1441        // Add files without hashes to parse list
1442        for file_path in &files_to_process {
1443            if !file_hashes.contains_key(file_path.as_str()) {
1444                files_needing_parse.push(file_path.clone());
1445            }
1446        }
1447
1448        log::debug!(
1449            "Symbol cache: {} hits, {} need parsing",
1450            cached_symbols.len(),
1451            files_needing_parse.len()
1452        );
1453
1454        // Parse files in parallel using custom thread pool (only cache misses)
1455        use rayon::prelude::*;
1456
1457        let parsed_symbols: Vec<SearchResult> = pool.install(|| {
1458            files_needing_parse
1459                .par_iter()
1460                .flat_map(|file_path| {
1461                // Find file_id for this path
1462                let file_id = match Self::find_file_id_by_path(&content_reader, &trigram_index, file_path) {
1463                    Some(id) => id,
1464                    None => {
1465                        log::warn!("Could not find file_id for path: {}", file_path);
1466                        return Vec::new();
1467                    }
1468                };
1469
1470                let content = match content_reader.get_file_content(file_id) {
1471                    Ok(c) => c,
1472                    Err(e) => {
1473                        log::warn!("Failed to read file {}: {}", file_path, e);
1474                        return Vec::new();
1475                    }
1476                };
1477
1478                // Detect language
1479                let ext = std::path::Path::new(file_path)
1480                    .extension()
1481                    .and_then(|e| e.to_str())
1482                    .unwrap_or("");
1483                let lang = Language::from_extension(ext);
1484
1485                // Parse file to extract symbols
1486                let symbols = match ParserFactory::parse(file_path, content, lang) {
1487                    Ok(symbols) => {
1488                        log::debug!("Parsed {} symbols from {}", symbols.len(), file_path);
1489                        symbols
1490                    }
1491                    Err(e) => {
1492                        log::debug!("Failed to parse {}: {}", file_path, e);
1493                        Vec::new()
1494                    }
1495                };
1496
1497                // Cache the parsed symbols (ignore errors - caching is best-effort)
1498                if let Some(file_hash) = file_hashes.get(file_path.as_str()) {
1499                    if let Err(e) = symbol_cache.set(file_path, file_hash, &symbols) {
1500                        log::debug!("Failed to cache symbols for {}: {}", file_path, e);
1501                    }
1502                }
1503
1504                symbols
1505            })
1506            .collect()
1507        });
1508
1509        // Combine cached and parsed symbols
1510        let mut all_symbols: Vec<SearchResult> = Vec::new();
1511
1512        // Add all cached symbols
1513        for symbols in cached_symbols.values() {
1514            all_symbols.extend_from_slice(symbols);
1515        }
1516
1517        // Add all parsed symbols
1518        all_symbols.extend(parsed_symbols);
1519
1520        // KEYWORD DETECTION: Check if pattern is a language keyword (e.g., "class", "function")
1521        // If it matches a keyword AND symbols_mode is true, interpret as "list all symbols of that type"
1522        // rather than looking for a symbol literally named "class" or "function"
1523        //
1524        // IMPORTANT: Only check keywords for languages that will pass Phase 3 filtering.
1525        // If a language filter is specified, only check that language's keywords.
1526        // Otherwise, check all languages present in the symbol results.
1527        let is_keyword_query = {
1528            // Determine which language to check keywords for
1529            let lang_to_check = if let Some(lang) = filter.language {
1530                // Language filter specified - check that language only
1531                // This ensures keyword detection aligns with Phase 3 language filtering
1532                vec![lang]
1533            } else {
1534                // No language filter - check all languages that appear in the actual symbols
1535                // (not candidates, but the parsed symbols that made it through)
1536                // This handles mixed-language codebases correctly
1537                let mut langs: Vec<Language> = all_symbols.iter()
1538                    .map(|s| s.lang)
1539                    .collect::<Vec<_>>();
1540                langs.sort_by(|a, b| format!("{:?}", a).cmp(&format!("{:?}", b))); // Deterministic ordering
1541                langs.dedup(); // Remove duplicates after sorting
1542                langs
1543            };
1544
1545            // Check if pattern matches a keyword in any of the relevant languages
1546            lang_to_check.iter().any(|lang| {
1547                ParserFactory::get_keywords(*lang).contains(&pattern)
1548            })
1549        };
1550
1551        // If pattern is a keyword (like "class" or "function"), skip name-based filtering
1552        // and return all symbols (kind filtering happens in Phase 3)
1553        let filtered: Vec<SearchResult> = if is_keyword_query {
1554            log::info!("Pattern '{}' is a language keyword - listing all symbols (kind filtering will be applied in Phase 3)", pattern);
1555            all_symbols
1556        } else if filter.use_regex {
1557            // For regex queries, candidates already matched content via regex in Phase 1.
1558            // Extract symbols whose line spans overlap with the candidate locations.
1559            // This ensures symbols are found at the locations where the regex matched.
1560
1561            // Build a map of (file_path, line_no) from candidates
1562            use std::collections::{HashMap, HashSet};
1563            let mut candidate_lines: HashMap<String, HashSet<usize>> = HashMap::new();
1564            for candidate in &files_by_path {
1565                for cand in candidate.1 {
1566                    candidate_lines
1567                        .entry(candidate.0.clone())
1568                        .or_insert_with(HashSet::new)
1569                        .insert(cand.span.start_line);
1570                }
1571            }
1572
1573            // Filter symbols whose spans overlap with candidate lines
1574            all_symbols
1575                .into_iter()
1576                .filter(|sym| {
1577                    if let Some(lines) = candidate_lines.get(&sym.path) {
1578                        // Check if symbol's line span overlaps with any candidate line
1579                        for line in sym.span.start_line..=sym.span.end_line {
1580                            if lines.contains(&line) {
1581                                return true;
1582                            }
1583                        }
1584                    }
1585                    false
1586                })
1587                .collect()
1588        } else if filter.use_contains {
1589            // Substring match (opt-in with --contains)
1590            all_symbols
1591                .into_iter()
1592                .filter(|sym| sym.symbol.as_deref().map_or(false, |s| s.contains(pattern)))
1593                .collect()
1594        } else {
1595            // Exact match (default)
1596            all_symbols
1597                .into_iter()
1598                .filter(|sym| sym.symbol.as_deref().map_or(false, |s| s == pattern))
1599                .collect()
1600        };
1601
1602        log::info!("Symbol enrichment found {} matches for pattern '{}'", filtered.len(), pattern);
1603
1604        Ok(filtered)
1605    }
1606
1607    /// Enrich text match candidates with AST pattern matching
1608    ///
1609    /// Takes a list of text match candidates and executes a Tree-sitter AST query
1610    /// on the candidate files, returning only matches that satisfy the AST pattern.
1611    ///
1612    /// # Algorithm
1613    /// 1. Extract unique file paths from candidates
1614    /// 2. Load file contents for each candidate file
1615    /// 3. Execute AST query pattern using Tree-sitter
1616    /// 4. Return AST matches
1617    ///
1618    /// # Performance
1619    /// Only parses files that have text matches, so typically 10-100 files
1620    /// instead of the entire codebase (62K+ files).
1621    ///
1622    /// # Requirements
1623    /// - Language must be specified (AST queries are language-specific)
1624    /// - AST pattern must be valid S-expression syntax
1625    fn enrich_with_ast(&self, candidates: Vec<SearchResult>, ast_pattern: &str, language: Option<Language>) -> Result<Vec<SearchResult>> {
1626        // Require language for AST queries
1627        let lang = language.ok_or_else(|| anyhow::anyhow!(
1628            "Language must be specified for AST pattern matching. Use --lang to specify the language."
1629        ))?;
1630
1631        // Load content store for file reading
1632        let content_path = self.cache.path().join("content.bin");
1633        let content_reader = ContentReader::open(&content_path)
1634            .context("Failed to open content store")?;
1635
1636        // Load trigram index for file path lookups
1637        let trigrams_path = self.cache.path().join("trigrams.bin");
1638        let trigram_index = if trigrams_path.exists() {
1639            TrigramIndex::load(&trigrams_path)?
1640        } else {
1641            Self::rebuild_trigram_index(&content_reader)?
1642        };
1643
1644        // Collect unique file paths from candidates and load their contents
1645        use std::collections::HashMap;
1646        let mut file_contents: HashMap<String, String> = HashMap::new();
1647
1648        for candidate in &candidates {
1649            if file_contents.contains_key(&candidate.path) {
1650                continue;
1651            }
1652
1653            // Find file_id for this path
1654            let file_id = match Self::find_file_id_by_path(&content_reader, &trigram_index, &candidate.path) {
1655                Some(id) => id,
1656                None => {
1657                    log::warn!("Could not find file_id for path: {}", candidate.path);
1658                    continue;
1659                }
1660            };
1661
1662            // Load file content
1663            let content = match content_reader.get_file_content(file_id) {
1664                Ok(c) => c,
1665                Err(e) => {
1666                    log::warn!("Failed to read file {}: {}", candidate.path, e);
1667                    continue;
1668                }
1669            };
1670
1671            file_contents.insert(candidate.path.clone(), content.to_string());
1672        }
1673
1674        log::debug!("Executing AST query on {} candidate files with language {:?}", file_contents.len(), lang);
1675
1676        // Execute AST query using the ast_query module
1677        let results = crate::ast_query::execute_ast_query(candidates, ast_pattern, lang, &file_contents)?;
1678
1679        log::info!("AST query found {} matches for pattern '{}'", results.len(), ast_pattern);
1680
1681        Ok(results)
1682    }
1683
1684    /// Helper to find file_id by path string
1685    fn find_file_id_by_path(
1686        content_reader: &ContentReader,
1687        trigram_index: &TrigramIndex,
1688        target_path: &str,
1689    ) -> Option<u32> {
1690        // Try trigram index first (faster)
1691        for file_id in 0..trigram_index.file_count() {
1692            if let Some(path) = trigram_index.get_file(file_id as u32) {
1693                if path.to_string_lossy() == target_path {
1694                    return Some(file_id as u32);
1695                }
1696            }
1697        }
1698
1699        // Fallback to content reader
1700        for file_id in 0..content_reader.file_count() {
1701            if let Some(path) = content_reader.get_file_path(file_id as u32) {
1702                if path.to_string_lossy() == target_path {
1703                    return Some(file_id as u32);
1704                }
1705            }
1706        }
1707
1708        None
1709    }
1710
1711    /// Map keyword patterns to SymbolKind for auto-inference
1712    ///
1713    /// When users search for keywords like "class" or "function" with --symbols,
1714    /// automatically infer the kind filter to return only symbols of that type.
1715    ///
1716    /// This makes keyword queries more intuitive: searching for "class" returns
1717    /// only classes, not all symbols.
1718    fn keyword_to_kind(keyword: &str) -> Option<SymbolKind> {
1719        filter::keyword_to_kind(keyword)
1720    }
1721
1722    /// Get all files matching the language filter (for keyword queries)
1723    ///
1724    /// This method bypasses trigram search and returns ALL files of the specified language.
1725    /// Used for keyword queries like "list all classes" where we need complete coverage,
1726    /// not just the first 100 candidates from a trigram search.
1727    ///
1728    /// Similar to `search_ast_all_files()` but works for symbol queries instead of AST queries.
1729    fn get_all_language_files(&self, filter: &QueryFilter) -> Result<Vec<SearchResult>> {
1730        // Language filter is optional - if not specified, scan all files
1731        // If specified, only scan files of that language
1732
1733        // Load content store
1734        let content_path = self.cache.path().join("content.bin");
1735        let content_reader = ContentReader::open(&content_path)
1736            .context("Failed to open content store")?;
1737
1738        // Build glob matchers if specified (for filtering)
1739        use globset::{Glob, GlobSetBuilder};
1740
1741        let include_matcher = if !filter.glob_patterns.is_empty() {
1742            let mut builder = GlobSetBuilder::new();
1743            for pattern in &filter.glob_patterns {
1744                let normalized = Self::normalize_glob_pattern(pattern);
1745                if let Ok(glob) = Glob::new(&normalized) {
1746                    builder.add(glob);
1747                }
1748            }
1749            builder.build().ok()
1750        } else {
1751            None
1752        };
1753
1754        let exclude_matcher = if !filter.exclude_patterns.is_empty() {
1755            let mut builder = GlobSetBuilder::new();
1756            for pattern in &filter.exclude_patterns {
1757                let normalized = Self::normalize_glob_pattern(pattern);
1758                if let Ok(glob) = Glob::new(&normalized) {
1759                    builder.add(glob);
1760                }
1761            }
1762            builder.build().ok()
1763        } else {
1764            None
1765        };
1766
1767        // Scan all files and filter by language + glob patterns
1768        let mut candidates: Vec<SearchResult> = Vec::new();
1769
1770        for file_id in 0..content_reader.file_count() {
1771            let file_path = match content_reader.get_file_path(file_id as u32) {
1772                Some(p) => p,
1773                None => continue,
1774            };
1775
1776            // Detect language from file extension
1777            let ext = file_path.extension()
1778                .and_then(|e| e.to_str())
1779                .unwrap_or("");
1780            let detected_lang = Language::from_extension(ext);
1781
1782            // Filter by language (if specified)
1783            if let Some(lang) = filter.language {
1784                if detected_lang != lang {
1785                    continue;
1786                }
1787            }
1788
1789            let file_path_str = file_path.to_string_lossy().to_string();
1790
1791            // Apply glob/exclude filters
1792            let included = include_matcher.as_ref().map_or(true, |m| m.is_match(&file_path_str));
1793            let excluded = exclude_matcher.as_ref().map_or(false, |m| m.is_match(&file_path_str));
1794
1795            if !included || excluded {
1796                continue;
1797            }
1798
1799            // Apply file path filter if specified
1800            if let Some(ref file_pattern) = filter.file_pattern {
1801                if !file_path_str.contains(file_pattern) {
1802                    continue;
1803                }
1804            }
1805
1806            // Create a dummy candidate for this file
1807            // Phase 2 (symbol enrichment) will parse it and extract actual symbols
1808            candidates.push(SearchResult {
1809                path: file_path_str,
1810                lang: detected_lang,
1811                span: Span { start_line: 1, end_line: 1 },
1812                symbol: None,
1813                kind: SymbolKind::Unknown("keyword_query".to_string()),
1814                preview: String::new(),
1815                dependencies: None,
1816            });
1817        }
1818
1819        if let Some(lang) = filter.language {
1820            log::info!("Keyword query will scan {} {:?} files for symbol extraction", candidates.len(), lang);
1821        } else {
1822            log::info!("Keyword query will scan {} files (all languages) for symbol extraction", candidates.len());
1823        }
1824
1825        Ok(candidates)
1826    }
1827
1828    /// Get candidate results using trigram-based full-text search
1829    fn get_trigram_candidates(&self, pattern: &str, filter: &QueryFilter) -> Result<Vec<SearchResult>> {
1830        // Load content store
1831        let content_path = self.cache.path().join("content.bin");
1832        let content_reader = ContentReader::open(&content_path)
1833            .context("Failed to open content store")?;
1834
1835        // Patterns shorter than 3 chars have no trigrams, so the trigram index always
1836        // returns empty.  Fall back to a linear scan of the content store so that
1837        // --force (which bypasses the broad-query guard) still produces real results.
1838        if pattern.chars().count() < 3 {
1839            log::info!(
1840                "Pattern '{}' is shorter than 3 chars — trigram index cannot be used, \
1841                 falling back to linear scan",
1842                pattern
1843            );
1844            return self.linear_scan_candidates(pattern, filter, &content_reader);
1845        }
1846
1847        // Load trigram index from disk (or rebuild if missing)
1848        let trigrams_path = self.cache.path().join("trigrams.bin");
1849        let trigram_index = if trigrams_path.exists() {
1850            match TrigramIndex::load(&trigrams_path) {
1851                Ok(index) => {
1852                    log::debug!("Loaded trigram index from disk: {} trigrams, {} files",
1853                               index.trigram_count(), index.file_count());
1854                    index
1855                }
1856                Err(e) => {
1857                    log::warn!("Failed to load trigram index from disk: {}", e);
1858                    log::warn!("Rebuilding trigram index from content store...");
1859                    Self::rebuild_trigram_index(&content_reader)?
1860                }
1861            }
1862        } else {
1863            log::debug!("trigrams.bin not found, rebuilding from content store");
1864            Self::rebuild_trigram_index(&content_reader)?
1865        };
1866
1867        // Search using trigrams
1868        let candidates = trigram_index.search(pattern);
1869        log::debug!("Found {} candidate locations from trigram search", candidates.len());
1870
1871        // Clone pattern to owned String for thread safety
1872        let pattern_owned = pattern.to_string();
1873
1874        // Compile regex once if in regex mode (before parallel processing for efficiency)
1875        let compiled_regex = if filter.use_regex {
1876            match Regex::new(&pattern_owned) {
1877                Ok(re) => Some(re),
1878                Err(e) => {
1879                    log::error!("Invalid regex pattern '{}': {}", pattern_owned, e);
1880                    anyhow::bail!("Invalid regex pattern '{}': {}", pattern_owned, e);
1881                }
1882            }
1883        } else {
1884            None
1885        };
1886
1887        // Group candidates by file for efficient processing
1888        use std::collections::HashMap;
1889        let mut candidates_by_file: HashMap<u32, Vec<crate::trigram::FileLocation>> = HashMap::new();
1890        for loc in candidates {
1891            candidates_by_file
1892                .entry(loc.file_id)
1893                .or_insert_with(Vec::new)
1894                .push(loc);
1895        }
1896
1897        log::debug!("Scanning {} files with trigram matches", candidates_by_file.len());
1898
1899        // Process files in parallel using rayon
1900        use rayon::prelude::*;
1901
1902        let results: Vec<SearchResult> = candidates_by_file
1903            .par_iter()
1904            .flat_map(|(file_id, locations)| {
1905                // Get file metadata
1906                let file_path = match trigram_index.get_file(*file_id) {
1907                    Some(p) => p,
1908                    None => return Vec::new(),
1909                };
1910
1911                let content = match content_reader.get_file_content(*file_id) {
1912                    Ok(c) => c,
1913                    Err(_) => return Vec::new(),
1914                };
1915
1916                let file_path_str = file_path.to_string_lossy().to_string();
1917
1918                // Detect language once per file
1919                let ext = file_path.extension()
1920                    .and_then(|e| e.to_str())
1921                    .unwrap_or("");
1922                let lang = Language::from_extension(ext);
1923
1924                // Split content into lines once
1925                let lines: Vec<&str> = content.lines().collect();
1926
1927                // Use a HashSet to deduplicate results by line number
1928                let mut seen_lines: std::collections::HashSet<usize> = std::collections::HashSet::new();
1929                let mut file_results = Vec::new();
1930
1931                // Only check the specific lines indicated by trigram posting lists
1932                for loc in locations {
1933                    let line_no = loc.line_no as usize;
1934
1935                    // Skip if we've already processed this line
1936                    if seen_lines.contains(&line_no) {
1937                        continue;
1938                    }
1939
1940                    // Bounds check
1941                    if line_no == 0 || line_no > lines.len() {
1942                        log::debug!("Line {} out of bounds (file has {} lines)", line_no, lines.len());
1943                        continue;
1944                    }
1945
1946                    let line = lines[line_no - 1];
1947
1948                    // Apply matching strategy based on filter mode:
1949                    // - Default: Word-boundary matching (restrictive - finds whole identifiers)
1950                    // - --contains: Substring matching (expansive - finds pattern anywhere)
1951                    // - --regex: Actual regex matching (controlled by pattern itself)
1952                    let line_matches = if filter.use_regex {
1953                        // Regex matching - use pre-compiled regex for efficiency
1954                        // The regex was compiled once outside the parallel loop
1955                        compiled_regex.as_ref()
1956                            .map(|re| re.is_match(line))
1957                            .unwrap_or(false)
1958                    } else if filter.use_contains {
1959                        // Substring matching (expansive)
1960                        line.contains(&pattern_owned)
1961                    } else {
1962                        // Word-boundary matching (restrictive, default)
1963                        Self::has_word_boundary_match(line, &pattern_owned)
1964                    };
1965
1966                    if !line_matches {
1967                        continue;
1968                    }
1969
1970                    seen_lines.insert(line_no);
1971
1972                    // Create a text match result (no symbol lookup for performance)
1973                    file_results.push(SearchResult {
1974                        path: file_path_str.clone(),
1975                        lang: lang.clone(),
1976                        kind: SymbolKind::Unknown("text_match".to_string()),
1977                        symbol: None,  // No symbol name for text matches (avoid duplication)
1978                        span: Span {
1979                            start_line: line_no,
1980                            end_line: line_no,
1981                        },
1982                        preview: line.to_string(),
1983                        dependencies: None,
1984                    });
1985                }
1986
1987                file_results
1988            })
1989            .collect();
1990
1991        Ok(results)
1992    }
1993
1994    /// Linear scan fallback for patterns shorter than 3 characters.
1995    ///
1996    /// The trigram index requires 3-char n-grams; patterns like "fn" or "i" yield
1997    /// zero trigrams and therefore zero results.  This method scans every file in
1998    /// the content store directly using the same matching logic (word-boundary,
1999    /// contains, or regex) so short-pattern queries always return real results.
2000    fn linear_scan_candidates(
2001        &self,
2002        pattern: &str,
2003        filter: &QueryFilter,
2004        content_reader: &ContentReader,
2005    ) -> Result<Vec<SearchResult>> {
2006        use rayon::prelude::*;
2007
2008        let pattern_owned = pattern.to_string();
2009        let file_count = content_reader.file_count();
2010
2011        let compiled_regex = if filter.use_regex {
2012            match Regex::new(&pattern_owned) {
2013                Ok(re) => Some(re),
2014                Err(e) => anyhow::bail!("Invalid regex pattern '{}': {}", pattern_owned, e),
2015            }
2016        } else {
2017            None
2018        };
2019
2020        let results: Vec<SearchResult> = (0..file_count as u32)
2021            .collect::<Vec<_>>()
2022            .par_iter()
2023            .flat_map(|&file_id| {
2024                let file_path = match content_reader.get_file_path(file_id) {
2025                    Some(p) => p.to_path_buf(),
2026                    None => return Vec::new(),
2027                };
2028                let content = match content_reader.get_file_content(file_id) {
2029                    Ok(c) => c,
2030                    Err(_) => return Vec::new(),
2031                };
2032
2033                let file_path_str = file_path.to_string_lossy().to_string();
2034                let ext = file_path.extension().and_then(|e| e.to_str()).unwrap_or("");
2035                let lang = Language::from_extension(ext);
2036
2037                let mut seen_lines = std::collections::HashSet::new();
2038                let mut file_results = Vec::new();
2039
2040                for (line_idx, line) in content.lines().enumerate() {
2041                    let line_no = line_idx + 1;
2042                    if seen_lines.contains(&line_no) {
2043                        continue;
2044                    }
2045
2046                    let line_matches = if filter.use_regex {
2047                        compiled_regex.as_ref().map(|re| re.is_match(line)).unwrap_or(false)
2048                    } else if filter.use_contains {
2049                        line.contains(&pattern_owned)
2050                    } else {
2051                        Self::has_word_boundary_match(line, &pattern_owned)
2052                    };
2053
2054                    if !line_matches {
2055                        continue;
2056                    }
2057
2058                    seen_lines.insert(line_no);
2059                    file_results.push(SearchResult {
2060                        path: file_path_str.clone(),
2061                        lang: lang.clone(),
2062                        kind: SymbolKind::Unknown("text_match".to_string()),
2063                        symbol: None,
2064                        span: Span { start_line: line_no, end_line: line_no },
2065                        preview: line.to_string(),
2066                        dependencies: None,
2067                    });
2068                }
2069
2070                file_results
2071            })
2072            .collect();
2073
2074        log::info!(
2075            "Linear scan (short pattern '{}') found {} results across {} files",
2076            pattern, results.len(), file_count
2077        );
2078        Ok(results)
2079    }
2080
2081    /// Get candidate results using regex patterns with trigram optimization
2082    ///
2083    /// # Algorithm
2084    ///
2085    /// 1. Extract literal sequences from the regex pattern (≥3 chars)
2086    /// 2. If literals found: search for files containing ANY of the literals (UNION)
2087    /// 3. If no literals: fall back to full content scan
2088    /// 4. Compile regex and verify matches in candidate files
2089    /// 5. Return matching results with context
2090    ///
2091    /// # File Selection Strategy
2092    ///
2093    /// Uses UNION of files containing any literal (conservative approach):
2094    /// - For alternation patterns `(a|b)`: Correctly searches files with a OR b
2095    /// - For sequential patterns `a.*b`: Searches files with a OR b (may include extra files)
2096    /// - Trade-off: Ensures correctness at the cost of scanning 2-3x more files for sequential patterns
2097    /// - Performance impact is minimal due to memory-mapped I/O (<5ms overhead typically)
2098    ///
2099    /// # Performance
2100    ///
2101    /// - Best case (pattern with literals): <20ms (trigram optimization)
2102    /// - Typical case (alternation/sequential): 5-15ms on small codebases (<100 files)
2103    /// - Worst case (no literals like `.*`): ~100ms (full scan)
2104    fn get_regex_candidates(&self, pattern: &str, timeout: Option<&std::time::Duration>, start_time: &std::time::Instant, suppress_output: bool) -> Result<Vec<SearchResult>> {
2105        // Step 1: Compile the regex
2106        let regex = Regex::new(pattern)
2107            .with_context(|| format!("Invalid regex pattern: {}", pattern))?;
2108
2109        // Check timeout before expensive operations
2110        if let Some(timeout_duration) = timeout {
2111            if start_time.elapsed() > *timeout_duration {
2112                anyhow::bail!(
2113                    "Query timeout exceeded ({} seconds) during regex compilation",
2114                    timeout_duration.as_secs()
2115                );
2116            }
2117        }
2118
2119        // Step 2: Extract trigrams from regex
2120        let trigrams = extract_trigrams_from_regex(pattern);
2121
2122        // Load content store
2123        let content_path = self.cache.path().join("content.bin");
2124        let content_reader = ContentReader::open(&content_path)
2125            .context("Failed to open content store")?;
2126
2127        let mut results = Vec::new();
2128
2129        if trigrams.is_empty() {
2130            // No trigrams - fall back to full scan
2131            if !suppress_output {
2132                output::warn(&format!(
2133                    "Regex pattern '{}' has no literals (≥3 chars), falling back to full content scan. This may be slow on large codebases. Consider using patterns with literal text.",
2134                    pattern
2135                ));
2136            }
2137
2138            // Scan all files
2139            for file_id in 0..content_reader.file_count() {
2140                let file_path = content_reader.get_file_path(file_id as u32)
2141                    .context("Invalid file_id")?;
2142                let content = content_reader.get_file_content(file_id as u32)?;
2143
2144                self.find_regex_matches_in_file(
2145                    &regex,
2146                    file_path,
2147                    content,
2148                    &mut results,
2149                )?;
2150            }
2151        } else {
2152            // Use trigrams to narrow down candidates
2153            log::debug!("Using {} trigrams to narrow regex search candidates", trigrams.len());
2154
2155            // Load trigram index
2156            let trigrams_path = self.cache.path().join("trigrams.bin");
2157            let trigram_index = if trigrams_path.exists() {
2158                TrigramIndex::load(&trigrams_path)?
2159            } else {
2160                Self::rebuild_trigram_index(&content_reader)?
2161            };
2162
2163            // Extract the literal sequences from the regex pattern
2164            use crate::regex_trigrams::extract_literal_sequences;
2165            let literals = extract_literal_sequences(pattern);
2166
2167            if literals.is_empty() {
2168                log::warn!("Regex extraction found trigrams but no literal sequences - this shouldn't happen");
2169                // Fall back to full scan
2170                for file_id in 0..content_reader.file_count() {
2171                    let file_path = content_reader.get_file_path(file_id as u32)
2172                        .context("Invalid file_id")?;
2173                    let content = content_reader.get_file_content(file_id as u32)?;
2174                    self.find_regex_matches_in_file(&regex, file_path, content, &mut results)?;
2175                }
2176            } else {
2177                // Search for each literal sequence and union the results
2178                // This ensures we find matches for ANY literal (important for alternation patterns like (a|b))
2179                // Trade-off: May scan more files than necessary for sequential patterns (a.*b),
2180                // but ensures correctness for all regex patterns
2181                use std::collections::HashSet;
2182                let mut candidate_files: HashSet<u32> = HashSet::new();
2183
2184                for literal in &literals {
2185                    // Search for this literal in the trigram index
2186                    let candidates = trigram_index.search(literal);
2187                    let file_ids: HashSet<u32> = candidates.iter().map(|loc| loc.file_id).collect();
2188
2189                    log::debug!("Literal '{}' found in {} files", literal, file_ids.len());
2190
2191                    // Union with existing candidate files (not intersection)
2192                    // This ensures we search files containing ANY of the literals
2193                    candidate_files.extend(file_ids);
2194                }
2195
2196                let final_candidates = candidate_files;
2197                log::debug!("After union: searching {} files that contain any literal", final_candidates.len());
2198
2199                // Verify regex matches in candidate files only
2200                for &file_id in &final_candidates {
2201                    let file_path = trigram_index.get_file(file_id)
2202                        .context("Invalid file_id from trigram search")?;
2203                    let content = content_reader.get_file_content(file_id)?;
2204
2205                    self.find_regex_matches_in_file(
2206                        &regex,
2207                        file_path,
2208                        content,
2209                        &mut results,
2210                    )?;
2211                }
2212            }
2213        }
2214
2215        log::info!("Regex search found {} matches for pattern '{}'", results.len(), pattern);
2216        Ok(results)
2217    }
2218
2219    /// Find all regex matches in a single file
2220    fn find_regex_matches_in_file(
2221        &self,
2222        regex: &Regex,
2223        file_path: &std::path::Path,
2224        content: &str,
2225        results: &mut Vec<SearchResult>,
2226    ) -> Result<()> {
2227        let file_path_str = file_path.to_string_lossy().to_string();
2228
2229        // Detect language from file extension
2230        let ext = file_path.extension()
2231            .and_then(|e| e.to_str())
2232            .unwrap_or("");
2233        let lang = Language::from_extension(ext);
2234
2235        // Find all regex matches line by line
2236        for (line_idx, line) in content.lines().enumerate() {
2237            if regex.is_match(line) {
2238                let line_no = line_idx + 1;
2239
2240                // Create text match result
2241                // Note: We don't extract symbol names from regex matches because:
2242                // 1. Regex might match partial identifiers (e.g., "UserController" in "ListUserController")
2243                // 2. Regex might match across language-specific delimiters (namespaces, scopes, etc.)
2244                // 3. Accurate symbol extraction requires tree-sitter parsing (expensive)
2245                // The user can see the full context in the 'preview' field
2246                results.push(SearchResult {
2247                    path: file_path_str.clone(),
2248                    lang: lang.clone(),
2249                    kind: SymbolKind::Unknown("regex_match".to_string()),
2250                    symbol: None,  // No symbol name for regex matches
2251                    span: Span {
2252                        start_line: line_no,
2253                        end_line: line_no,
2254                    },
2255                    preview: line.to_string(),
2256                    dependencies: None,
2257                });
2258            }
2259        }
2260
2261        Ok(())
2262    }
2263
    /// Thin forwarding wrapper around `result::find_file_id`.
    ///
    /// NOTE(review): presumably resolves `target_path` to its id in the content
    /// store and yields `None` when the path is not indexed — confirm against
    /// `result::find_file_id`.
    fn find_file_id(content_reader: &ContentReader, target_path: &str) -> Option<u32> {
        result::find_file_id(content_reader, target_path)
    }
2267
    /// Thin forwarding wrapper around `result::rebuild_trigram_index`.
    ///
    /// Used by the regex search path when `trigrams.bin` is missing from the cache
    /// directory, so an index is produced from `content_reader` instead of being
    /// loaded from disk.
    fn rebuild_trigram_index(content_reader: &ContentReader) -> Result<TrigramIndex> {
        result::rebuild_trigram_index(content_reader)
    }
2271
    /// Thin forwarding wrapper around `result::normalize_glob_pattern`.
    ///
    /// NOTE(review): the exact normalization rules live in the `result` module —
    /// see that function for what the returned pattern looks like.
    fn normalize_glob_pattern(pattern: &str) -> String {
        result::normalize_glob_pattern(pattern)
    }
2275
    /// Thin forwarding wrapper around `filter::has_word_boundary_match`.
    ///
    /// This is the default line matcher of the linear scan, used when neither
    /// regex nor substring (`use_contains`) matching is requested.
    fn has_word_boundary_match(line: &str, pattern: &str) -> bool {
        filter::has_word_boundary_match(line, pattern)
    }
2279
2280    /// Get index status for programmatic use (doesn't print warnings)
2281    ///
2282    /// Returns (status, can_trust_results, warning) tuple for JSON output.
2283    /// This is optimized for AI agents to detect staleness and auto-reindex.
2284    pub fn get_index_status(&self) -> Result<(IndexStatus, bool, Option<IndexWarning>)> {
2285        let root = self.cache.workspace_root();
2286
2287        // Check git state if in a git repo
2288        if crate::git::is_git_repo(&root) {
2289            if let Ok(current_branch) = crate::git::get_current_branch(&root) {
2290                // Check if we're on a different branch than what was indexed
2291                if !self.cache.branch_exists(&current_branch).unwrap_or(false) {
2292                    let warning = IndexWarning {
2293                        reason: format!("Branch '{}' has not been indexed", current_branch),
2294                        action_required: "rfx index".to_string(),
2295                        files_modified: None,
2296                        details: Some(IndexWarningDetails {
2297                            current_branch: Some(current_branch),
2298                            indexed_branch: None,
2299                            current_commit: None,
2300                            indexed_commit: None,
2301                        }),
2302                    };
2303                    return Ok((IndexStatus::Stale, false, Some(warning)));
2304                }
2305
2306                // Branch exists - check if commit changed
2307                if let (Ok(current_commit), Ok(branch_info)) =
2308                    (crate::git::get_current_commit(&root), self.cache.get_branch_info(&current_branch)) {
2309
2310                    if branch_info.commit_sha != current_commit {
2311                        let warning = IndexWarning {
2312                            reason: format!(
2313                                "Commit changed from {} to {}",
2314                                &branch_info.commit_sha[..7],
2315                                &current_commit[..7]
2316                            ),
2317                            action_required: "rfx index".to_string(),
2318                            files_modified: None,
2319                            details: Some(IndexWarningDetails {
2320                                current_branch: Some(current_branch.clone()),
2321                                indexed_branch: Some(current_branch.clone()),
2322                                current_commit: Some(current_commit.clone()),
2323                                indexed_commit: Some(branch_info.commit_sha.clone()),
2324                            }),
2325                        };
2326                        return Ok((IndexStatus::Stale, false, Some(warning)));
2327                    }
2328
2329                    // If commits match, do a quick file freshness check
2330                    if let Ok(branch_files) = self.cache.get_branch_files(&current_branch) {
2331                        let mut checked = 0;
2332                        let mut changed = 0;
2333                        const SAMPLE_SIZE: usize = 10;
2334
2335                        for (path, _indexed_hash) in branch_files.iter().take(SAMPLE_SIZE) {
2336                            checked += 1;
2337                            let file_path = std::path::Path::new(path);
2338
2339                            if let Ok(metadata) = std::fs::metadata(file_path) {
2340                                if let Ok(modified) = metadata.modified() {
2341                                    let indexed_time = branch_info.last_indexed;
2342                                    let file_time = modified.duration_since(std::time::UNIX_EPOCH)
2343                                        .unwrap_or_default()
2344                                        .as_secs() as i64;
2345
2346                                    if file_time > indexed_time {
2347                                        // File modified after indexing - likely stale
2348                                        // Note: We skip hash verification for performance (mtime check is sufficient)
2349                                        changed += 1;
2350                                    }
2351                                }
2352                            }
2353                        }
2354
2355                        if changed > 0 {
2356                            let warning = IndexWarning {
2357                                reason: format!("{} of {} sampled files modified", changed, checked),
2358                                action_required: "rfx index".to_string(),
2359                                files_modified: Some(changed as u32),
2360                                details: Some(IndexWarningDetails {
2361                                    current_branch: Some(current_branch.clone()),
2362                                    indexed_branch: Some(branch_info.branch.clone()),
2363                                    current_commit: Some(current_commit.clone()),
2364                                    indexed_commit: Some(branch_info.commit_sha.clone()),
2365                                }),
2366                            };
2367                            return Ok((IndexStatus::Stale, false, Some(warning)));
2368                        }
2369                    }
2370
2371                    // All checks passed - index is fresh
2372                    return Ok((IndexStatus::Fresh, true, None));
2373                }
2374            }
2375        }
2376
2377        // Not in a git repo or couldn't get git info - assume fresh
2378        Ok((IndexStatus::Fresh, true, None))
2379    }
2380
2381    /// Check index freshness and show non-blocking warnings
2382    ///
2383    /// This performs lightweight checks to warn users if their index might be stale:
2384    /// 1. Branch mismatch: indexed different branch
2385    /// 2. Commit changed: HEAD moved since indexing
2386    /// 3. File changes: quick mtime check on sample of files (if available)
2387    fn check_index_freshness(&self, filter: &QueryFilter) -> Result<()> {
2388        let root = self.cache.workspace_root();
2389
2390        // Check git state if in a git repo
2391        if crate::git::is_git_repo(&root) {
2392            if !crate::git::is_git_available() {
2393                static WARNED: std::sync::OnceLock<()> = std::sync::OnceLock::new();
2394                if !filter.suppress_output {
2395                    WARNED.get_or_init(|| {
2396                        output::warn("⚠️  git binary not found in PATH; index freshness checks disabled for this session.");
2397                    });
2398                }
2399                return Ok(());
2400            }
2401            if let Ok(current_branch) = crate::git::get_current_branch(&root) {
2402                // Check if we're on a different branch than what was indexed
2403                if !self.cache.branch_exists(&current_branch).unwrap_or(false) {
2404                    if !filter.suppress_output {
2405                        output::warn(&format!("⚠️  WARNING: Index not found for branch '{}'. Run 'rfx index' to index this branch.", current_branch));
2406                    }
2407                    return Ok(());
2408                }
2409
2410                // Branch exists - check if commit changed
2411                if let (Ok(current_commit), Ok(branch_info)) =
2412                    (crate::git::get_current_commit(&root), self.cache.get_branch_info(&current_branch)) {
2413
2414                    if branch_info.commit_sha != current_commit {
2415                        if !filter.suppress_output {
2416                            output::warn(&format!("⚠️  WARNING: Index may be stale (commit changed: {} → {}). Consider running 'rfx index'.",
2417                                     &branch_info.commit_sha[..7], &current_commit[..7]));
2418                        }
2419                        return Ok(());
2420                    }
2421
2422                    // If commits match, do a quick file freshness check
2423                    // Sample up to 10 files to check for modifications (cheap mtime check)
2424                    if let Ok(branch_files) = self.cache.get_branch_files(&current_branch) {
2425                        let mut checked = 0;
2426                        let mut changed = 0;
2427                        const SAMPLE_SIZE: usize = 10;
2428
2429                        for (path, _indexed_hash) in branch_files.iter().take(SAMPLE_SIZE) {
2430                            checked += 1;
2431                            let file_path = std::path::Path::new(path);
2432
2433                            // Check if file exists and has been modified (mtime/size heuristic)
2434                            if let Ok(metadata) = std::fs::metadata(file_path) {
2435                                if let Ok(modified) = metadata.modified() {
2436                                    let indexed_time = branch_info.last_indexed;
2437                                    let file_time = modified.duration_since(std::time::UNIX_EPOCH)
2438                                        .unwrap_or_default()
2439                                        .as_secs() as i64;
2440
2441                                    // If file modified after indexing, it might be stale
2442                                    if file_time > indexed_time {
2443                                        // File modified after indexing - likely stale
2444                                        // Note: We skip hash verification for performance (mtime check is sufficient)
2445                                        // This may cause false positives if files were touched without changes,
2446                                        // but the warning is non-blocking and vastly better than slow queries
2447                                        changed += 1;
2448                                    }
2449                                }
2450                            }
2451                        }
2452
2453                        if changed > 0 && !filter.suppress_output {
2454                            output::warn(&format!("⚠️  WARNING: {} of {} sampled files changed since indexing. Consider running 'rfx index'.", changed, checked));
2455                        }
2456                    }
2457                }
2458            }
2459        }
2460
2461        Ok(())
2462    }
2463}
2464
/// Build the guidance string that accompanies a query response for AI agents
///
/// The rules are evaluated top to bottom and the first one that applies wins,
/// so earlier rules take priority over later ones. `result_count` is the number
/// of results in the current response, `total_count` the number of matches
/// overall. Returns `None` when no rule applies (an ordinary, moderately sized
/// result set needs no special handling).
pub fn generate_ai_instruction(
    result_count: usize,
    total_count: usize,
    has_more: bool,
    symbols_mode: bool,
    paths_only: bool,
    use_ast: bool,
    use_regex: bool,
    language_filter: bool,
    glob_filter: bool,
    exact_mode: bool,
) -> Option<String> {
    let instruction = if result_count == 0 {
        // 1. Nothing matched
        "No results found. Consider these alternatives: 1) Check pattern spelling, 2) Remove --kind or --lang filters to broaden search, 3) Try partial match or related term, 4) Use search_regex tool for pattern matching with special characters or complex patterns."
            .to_string()
    } else if total_count >= 500 {
        // 2. Far too many matches to be useful
        format!("Query too broad: {} results found. STOP. Do not list results. Refine search automatically by adding filters: kind parameter (Function/Struct/Class), lang parameter (rust/python/etc), or glob parameter (['src/**/*.rs']). Call search_code again with appropriate filters.", total_count)
    } else if has_more {
        // 3. Only one page of a larger result set is shown
        format!("Showing {} of {} results. PAGINATED - there are more results available. Do not automatically fetch all results. Show current page, ask user if these results answer their question before fetching more with --offset parameter.", result_count, total_count)
    } else if symbols_mode && result_count == 1 {
        // 4. Exactly one definition
        "Found 1 precise result. Respond concisely: '[symbol] at [path]:[line]'.".to_string()
    } else if symbols_mode && (2..=10).contains(&result_count) {
        // 5. A handful of definitions
        format!("Found {} precise results (definitions only, not usages). List locations concisely: '[symbol] at [path]:[line]' for each result.", result_count)
    } else if (101..500).contains(&total_count) {
        // 6. Broad, but not broad enough to refuse
        format!("Found {} results - this is broad. Suggest refining search with: kind parameter (Function/Struct/Class/etc), lang parameter (rust/python/etc), or glob parameter to narrow file scope.", total_count)
    } else if !symbols_mode && result_count >= 100 {
        // 7. Large full-text result set: point at symbols mode
        format!("Found {} results in full-text search mode (includes definitions AND all usages). Consider using symbols=true parameter to filter to definitions only. This typically reduces results by 80-90%.", result_count)
    } else if paths_only {
        // 8. Paths-only output
        format!("Found {} unique files (paths-only mode - no code content included). Next step: Use Read tool on specific files that look relevant based on their paths.", result_count)
    } else if use_ast {
        // 9. AST query
        format!("Found {} results using AST pattern matching. These are structure-based matches using Tree-sitter patterns, not text search.", result_count)
    } else if use_regex && result_count >= 100 {
        // 10. Large regex result set
        format!("Found {} results using regex pattern matching. Regex matches are expansive. Consider using exact text search or symbols mode for more precise results.", result_count)
    } else if language_filter && result_count <= 5 {
        // 11. Language filter may be hiding matches
        format!("Found {} results with language filter active. Results are limited to this language only. Remove lang parameter if you want to search all languages.", result_count)
    } else if glob_filter && result_count <= 10 {
        // 12. Glob filter may be hiding matches
        format!("Found {} results with glob filter active. Results are limited to matching paths. Remove glob parameter to search entire codebase.", result_count)
    } else if exact_mode && result_count <= 5 {
        // 13. Exact matching may be hiding matches
        format!("Found {} results in exact match mode. Only exact symbol name matches are included. Remove exact parameter to allow substring matching.", result_count)
    } else {
        // Normal case - no instruction
        return None;
    };

    Some(instruction)
}
2576
2577#[cfg(test)]
2578mod tests {
2579    use super::*;
2580    use crate::indexer::Indexer;
2581    use crate::models::IndexConfig;
2582    use std::fs;
2583    use tempfile::TempDir;
2584
2585    // ==================== Basic Tests ====================
2586
2587    #[test]
2588    fn test_query_engine_creation() {
2589        let temp = TempDir::new().unwrap();
2590        let cache = CacheManager::new(temp.path());
2591        let engine = QueryEngine::new(cache);
2592
2593        assert!(engine.cache.path().ends_with(".reflex"));
2594    }
2595
2596    #[test]
2597    fn test_filter_modes() {
2598        // Test that symbols_mode works as expected
2599        let filter_fulltext = QueryFilter::default();
2600        assert!(!filter_fulltext.symbols_mode);
2601
2602        let filter_symbols = QueryFilter {
2603            symbols_mode: true,
2604            ..Default::default()
2605        };
2606        assert!(filter_symbols.symbols_mode);
2607
2608        // Test that kind implies symbols_mode (handled in CLI layer)
2609        let filter_with_kind = QueryFilter {
2610            kind: Some(SymbolKind::Function),
2611            symbols_mode: true,
2612            ..Default::default()
2613        };
2614        assert!(filter_with_kind.symbols_mode);
2615    }
2616
2617    // ==================== Search Mode Tests ====================
2618
2619    #[test]
2620    fn test_fulltext_search() {
2621        let temp = TempDir::new().unwrap();
2622        let project = temp.path().join("project");
2623        fs::create_dir(&project).unwrap();
2624
2625        // Create test files
2626        fs::write(project.join("main.rs"), "fn main() {\n    println!(\"hello\");\n}").unwrap();
2627        fs::write(project.join("lib.rs"), "pub fn hello() {}").unwrap();
2628
2629        // Index the project
2630        let cache = CacheManager::new(&project);
2631        let indexer = Indexer::new(cache, IndexConfig::default());
2632        indexer.index(&project, false).unwrap();
2633
2634        // Search for "hello"
2635        let cache = CacheManager::new(&project);
2636        let engine = QueryEngine::new(cache);
2637        let filter = QueryFilter::default(); // full-text mode
2638        let results = engine.search("hello", filter).unwrap();
2639
2640        // Should find both occurrences (println and function name)
2641        assert!(results.len() >= 2);
2642        assert!(results.iter().any(|r| r.path.contains("main.rs")));
2643        assert!(results.iter().any(|r| r.path.contains("lib.rs")));
2644    }
2645
2646    #[test]
2647    fn test_symbol_search() {
2648        let temp = TempDir::new().unwrap();
2649        let project = temp.path().join("project");
2650        fs::create_dir(&project).unwrap();
2651
2652        // Create test file with function definition and call
2653        fs::write(
2654            project.join("main.rs"),
2655            "fn greet() {}\nfn main() {\n    greet();\n}"
2656        ).unwrap();
2657
2658        // Index
2659        let cache = CacheManager::new(&project);
2660        let indexer = Indexer::new(cache, IndexConfig::default());
2661        indexer.index(&project, false).unwrap();
2662
2663        let cache = CacheManager::new(&project);
2664
2665        // Symbol search (definitions only)
2666        let engine = QueryEngine::new(cache);
2667        let filter = QueryFilter {
2668            symbols_mode: true,
2669            ..Default::default()
2670        };
2671        let results = engine.search("greet", filter).unwrap();
2672
2673        // Should find only the definition, not the call
2674        assert!(results.len() >= 1);
2675        assert!(results.iter().any(|r| r.kind == SymbolKind::Function));
2676    }
2677
2678    #[test]
2679    fn test_regex_search() {
2680        let temp = TempDir::new().unwrap();
2681        let project = temp.path().join("project");
2682        fs::create_dir(&project).unwrap();
2683
2684        fs::write(
2685            project.join("main.rs"),
2686            "fn test1() {}\nfn test2() {}\nfn other() {}"
2687        ).unwrap();
2688
2689        let cache = CacheManager::new(&project);
2690        let indexer = Indexer::new(cache, IndexConfig::default());
2691        indexer.index(&project, false).unwrap();
2692
2693        let cache = CacheManager::new(&project);
2694
2695        let engine = QueryEngine::new(cache);
2696        let filter = QueryFilter {
2697            use_regex: true,
2698            ..Default::default()
2699        };
2700        let results = engine.search(r"fn test\d", filter).unwrap();
2701
2702        // Should match test1 and test2 but not other
2703        assert_eq!(results.len(), 2);
2704        assert!(results.iter().all(|r| r.preview.contains("test")));
2705    }
2706
2707    // ==================== Filter Tests ====================
2708
/// The `language` filter must restrict results to files of that language.
///
/// The project contains a Rust file and a JavaScript file that both mention
/// `main`; with the filter set to Rust only the `.rs` file may be reported.
#[test]
fn test_language_filter() {
    let temp = TempDir::new().unwrap();
    let project = temp.path().join("project");
    fs::create_dir(&project).unwrap();

    fs::write(project.join("main.rs"), "fn main() {}").unwrap();
    fs::write(project.join("main.js"), "function main() {}").unwrap();

    let cache = CacheManager::new(&project);
    let indexer = Indexer::new(cache, IndexConfig::default());
    indexer.index(&project, false).unwrap();

    let cache = CacheManager::new(&project);
    let engine = QueryEngine::new(cache);

    // Filter to Rust only
    let filter = QueryFilter {
        language: Some(Language::Rust),
        ..Default::default()
    };
    let results = engine.search("main", filter).unwrap();

    // `Iterator::all` is trivially true for an empty iterator, so first make
    // sure the Rust match was actually returned; otherwise a search that
    // finds nothing would let this test pass.
    assert!(
        !results.is_empty(),
        "Should find 'main' in the Rust file"
    );
    assert!(results.iter().all(|r| r.lang == Language::Rust));
    assert!(results.iter().all(|r| r.path.ends_with(".rs")));
}
2736
/// A symbol search restricted to `SymbolKind::Function` must still find `main`.
#[test]
fn test_kind_filter() {
    let workspace = TempDir::new().unwrap();
    let root = workspace.path().join("project");
    fs::create_dir(&root).unwrap();

    // One struct, one free function and one method inside an impl block.
    let source = "struct Point {}\nfn main() {}\nimpl Point { fn new() {} }";
    fs::write(root.join("main.rs"), source).unwrap();

    let indexer = Indexer::new(CacheManager::new(&root), IndexConfig::default());
    indexer.index(&root, false).unwrap();

    let engine = QueryEngine::new(CacheManager::new(&root));

    // Functions only; substring matching is enabled because the query "mai"
    // is only a prefix of the symbol name "main".
    let functions_only = QueryFilter {
        symbols_mode: true,
        kind: Some(SymbolKind::Function),
        use_contains: true,
        ..Default::default()
    };
    // "mai" is exactly three characters long, i.e. a single trigram.
    let hits = engine.search("mai", functions_only).unwrap();

    assert!(!hits.is_empty(), "Should find at least one result");
    let found_main = hits.iter().any(|hit| hit.symbol.as_deref() == Some("main"));
    assert!(found_main, "Should find 'main' function");
}
2770
/// The `file_pattern` filter must restrict results to matching paths.
///
/// The same function is written to `src/lib.rs` and `tests/test.rs`; with the
/// pattern `src/` only the file under `src/` may be reported.
#[test]
fn test_file_pattern_filter() {
    let temp = TempDir::new().unwrap();
    let project = temp.path().join("project");
    fs::create_dir_all(project.join("src")).unwrap();
    fs::create_dir_all(project.join("tests")).unwrap();

    fs::write(project.join("src/lib.rs"), "fn foo() {}").unwrap();
    fs::write(project.join("tests/test.rs"), "fn foo() {}").unwrap();

    let cache = CacheManager::new(&project);
    let indexer = Indexer::new(cache, IndexConfig::default());
    indexer.index(&project, false).unwrap();

    let cache = CacheManager::new(&project);
    let engine = QueryEngine::new(cache);

    // Filter to src/ only
    let filter = QueryFilter {
        file_pattern: Some("src/".to_string()),
        ..Default::default()
    };
    let results = engine.search("foo", filter).unwrap();

    // Both checks below hold trivially for an empty result set, so require
    // that the match in src/ was actually returned before inspecting paths.
    assert!(
        !results.is_empty(),
        "Should find 'foo' in src/lib.rs"
    );
    assert!(results.iter().all(|r| r.path.contains("src/")));
    assert!(!results.iter().any(|r| r.path.contains("tests/")));
}
2799
/// The `limit` filter must cap the number of returned results.
#[test]
fn test_limit_filter() {
    let workspace = TempDir::new().unwrap();
    let root = workspace.path().join("project");
    fs::create_dir(&root).unwrap();

    // Twenty one-line functions, test0 through test19, give far more
    // candidate matches than the limit allows.
    let lines: Vec<String> = (0..20).map(|i| format!("fn test{}() {{}}", i)).collect();
    let source = lines.join("\n");
    fs::write(root.join("main.rs"), source).unwrap();

    let indexer = Indexer::new(CacheManager::new(&root), IndexConfig::default());
    indexer.index(&root, false).unwrap();

    let engine = QueryEngine::new(CacheManager::new(&root));

    // Substring matching is needed because "test" is only part of each name.
    let capped = QueryFilter {
        limit: Some(5),
        use_contains: true,
        ..Default::default()
    };
    let hits = engine.search("test", capped).unwrap();

    assert_eq!(hits.len(), 5);
}
2828
/// With `exact` set, a symbol search must match the whole symbol name only.
#[test]
fn test_exact_match_filter() {
    let workspace = TempDir::new().unwrap();
    let root = workspace.path().join("project");
    fs::create_dir(&root).unwrap();

    // "test" appears as a full name, as a prefix and as a suffix.
    let source = "fn test() {}\nfn test_helper() {}\nfn other_test() {}";
    fs::write(root.join("main.rs"), source).unwrap();

    let indexer = Indexer::new(CacheManager::new(&root), IndexConfig::default());
    indexer.index(&root, false).unwrap();

    let engine = QueryEngine::new(CacheManager::new(&root));

    let exact_symbols = QueryFilter {
        symbols_mode: true,
        exact: true,
        ..Default::default()
    };
    let hits = engine.search("test", exact_symbols).unwrap();

    // Neither "test_helper" nor "other_test" may be reported.
    assert_eq!(hits.len(), 1);
    assert_eq!(hits[0].symbol.as_deref(), Some("test"));
}
2860
2861    // ==================== Expand Mode Tests ====================
2862
/// With `expand` set, the preview of a symbol hit must include its body.
#[test]
fn test_expand_mode() {
    let workspace = TempDir::new().unwrap();
    let root = workspace.path().join("project");
    fs::create_dir(&root).unwrap();

    // A multi-line function whose body, not its signature, contains "println".
    let source = "fn greet() {\n    println!(\"Hello\");\n    println!(\"World\");\n}";
    fs::write(root.join("main.rs"), source).unwrap();

    let indexer = Indexer::new(CacheManager::new(&root), IndexConfig::default());
    indexer.index(&root, false).unwrap();

    let engine = QueryEngine::new(CacheManager::new(&root));

    let expanded = QueryFilter {
        symbols_mode: true,
        expand: true,
        ..Default::default()
    };
    let hits = engine.search("greet", expanded).unwrap();

    // The first hit's preview must reach into the function body.
    assert!(!hits.is_empty());
    assert!(hits[0].preview.contains("println"));
}
2895
2896    // ==================== Edge Cases ====================
2897
/// Searching an index built from an empty project must succeed with no hits.
#[test]
fn test_search_empty_index() {
    let workspace = TempDir::new().unwrap();
    let root = workspace.path().join("project");
    fs::create_dir(&root).unwrap();

    // Index the project directory without writing any source files into it.
    let indexer = Indexer::new(CacheManager::new(&root), IndexConfig::default());
    indexer.index(&root, false).unwrap();

    let engine = QueryEngine::new(CacheManager::new(&root));
    let hits = engine.search("nonexistent", QueryFilter::default()).unwrap();

    assert_eq!(hits.len(), 0);
}
2916
/// Searching a project that was never indexed must return an error.
#[test]
fn test_search_no_index() {
    let workspace = TempDir::new().unwrap();
    let root = workspace.path().join("project");
    fs::create_dir(&root).unwrap();

    // No indexer run here: the engine is pointed at a project with no index.
    let engine = QueryEngine::new(CacheManager::new(&root));
    let default_filter = QueryFilter::default();

    let outcome = engine.search("test", default_filter);
    assert!(outcome.is_err());
}
2930
/// A query containing an operator and spaces must be matched as written.
#[test]
fn test_search_special_characters() {
    let workspace = TempDir::new().unwrap();
    let root = workspace.path().join("project");
    fs::create_dir(&root).unwrap();

    let source = "let x = 42;\nlet y = x + 1;";
    fs::write(root.join("main.rs"), source).unwrap();

    let indexer = Indexer::new(CacheManager::new(&root), IndexConfig::default());
    indexer.index(&root, false).unwrap();

    let engine = QueryEngine::new(CacheManager::new(&root));

    // The pattern includes '+' and a trailing space.
    let hits = engine.search("x + ", QueryFilter::default()).unwrap();
    assert!(!hits.is_empty());
}
2952
/// Non-ASCII text in a source file must be searchable.
#[test]
fn test_search_unicode() {
    let workspace = TempDir::new().unwrap();
    let root = workspace.path().join("project");
    fs::create_dir(&root).unwrap();

    fs::write(root.join("main.rs"), "// 你好世界\nfn main() {}").unwrap();

    let indexer = Indexer::new(CacheManager::new(&root), IndexConfig::default());
    indexer.index(&root, false).unwrap();

    let engine = QueryEngine::new(CacheManager::new(&root));
    let unicode_filter = QueryFilter {
        // Word-boundary matching may not behave as expected for Unicode text.
        use_contains: true,
        // The pattern is only two characters, so bypass broad-query detection.
        force: true,
        ..Default::default()
    };

    let hits = engine.search("你好", unicode_filter).unwrap();
    assert!(!hits.is_empty());
}
2978
/// A capitalised query must find the capitalised definition.
#[test]
fn test_case_sensitive_search() {
    let workspace = TempDir::new().unwrap();
    let root = workspace.path().join("project");
    fs::create_dir(&root).unwrap();

    // Two functions whose names differ only in the case of the first letter.
    fs::write(root.join("main.rs"), "fn Test() {}\nfn test() {}").unwrap();

    let indexer = Indexer::new(CacheManager::new(&root), IndexConfig::default());
    indexer.index(&root, false).unwrap();

    let engine = QueryEngine::new(CacheManager::new(&root));

    // Search is case-sensitive
    let hits = engine.search("Test", QueryFilter::default()).unwrap();
    let found_capitalised = hits.iter().any(|hit| hit.preview.contains("Test()"));
    assert!(found_capitalised);
}
3000
3001    // ==================== Determinism Tests ====================
3002
/// Repeating a query must yield the same results in the same order, sorted by
/// path and then by start line.
#[test]
fn test_results_sorted_deterministically() {
    let workspace = TempDir::new().unwrap();
    let root = workspace.path().join("project");
    fs::create_dir(&root).unwrap();

    // File names are deliberately written out of alphabetical order.
    fs::write(root.join("a.rs"), "fn test() {}").unwrap();
    fs::write(root.join("z.rs"), "fn test() {}").unwrap();
    fs::write(root.join("m.rs"), "fn test() {}\nfn test2() {}").unwrap();

    let indexer = Indexer::new(CacheManager::new(&root), IndexConfig::default());
    indexer.index(&root, false).unwrap();

    let engine = QueryEngine::new(CacheManager::new(&root));
    let base_filter = QueryFilter::default();

    // Three identical runs of the same query.
    let first_run = engine.search("test", base_filter.clone()).unwrap();
    let second_run = engine.search("test", base_filter.clone()).unwrap();
    let third_run = engine.search("test", base_filter).unwrap();

    assert_eq!(first_run.len(), second_run.len());
    assert_eq!(first_run.len(), third_run.len());

    // The lengths are equal (checked above), so zipping visits every position.
    let runs = first_run.iter().zip(second_run.iter()).zip(third_run.iter());
    for ((a, b), c) in runs {
        assert_eq!(a.path, b.path);
        assert_eq!(a.path, c.path);
        assert_eq!(a.span.start_line, b.span.start_line);
        assert_eq!(a.span.start_line, c.span.start_line);
    }

    // Every adjacent pair must be ordered by path, then by start line.
    for (curr, next) in first_run.iter().zip(first_run.iter().skip(1)) {
        let path_ascending = curr.path < next.path;
        let same_path_line_ascending =
            curr.path == next.path && curr.span.start_line <= next.span.start_line;
        assert!(path_ascending || same_path_line_ascending);
    }
}
3048
3049    // ==================== Combined Filter Tests ====================
3050
/// Language, kind and file-pattern filters must all apply at the same time.
#[test]
fn test_multiple_filters_combined() {
    let workspace = TempDir::new().unwrap();
    let root = workspace.path().join("project");
    fs::create_dir_all(root.join("src")).unwrap();

    // Each extra file or symbol is meant to be excluded by one of the filters.
    fs::write(root.join("src/main.rs"), "fn test() {}\nstruct Test {}").unwrap();
    fs::write(root.join("src/lib.rs"), "fn test() {}").unwrap();
    fs::write(root.join("test.js"), "function test() {}").unwrap();

    let indexer = Indexer::new(CacheManager::new(&root), IndexConfig::default());
    indexer.index(&root, false).unwrap();

    let engine = QueryEngine::new(CacheManager::new(&root));

    let combined = QueryFilter {
        language: Some(Language::Rust),
        kind: Some(SymbolKind::Function),
        file_pattern: Some("src/main".to_string()),
        symbols_mode: true,
        ..Default::default()
    };
    let hits = engine.search("test", combined).unwrap();

    // Only the function defined in src/main.rs should remain.
    assert_eq!(hits.len(), 1);
    let only_hit = &hits[0];
    assert!(only_hit.path.contains("src/main.rs"));
    assert_eq!(only_hit.kind, SymbolKind::Function);
}
3084
3085    // ==================== Helper Method Tests ====================
3086
/// `find_symbol` must locate a function definition by its name.
#[test]
fn test_find_symbol_helper() {
    let workspace = TempDir::new().unwrap();
    let root = workspace.path().join("project");
    fs::create_dir(&root).unwrap();

    fs::write(root.join("main.rs"), "fn greet() {}").unwrap();

    let indexer = Indexer::new(CacheManager::new(&root), IndexConfig::default());
    indexer.index(&root, false).unwrap();

    let engine = QueryEngine::new(CacheManager::new(&root));
    let hits = engine.find_symbol("greet").unwrap();

    // The first hit is expected to be the function definition.
    assert!(!hits.is_empty());
    assert_eq!(hits[0].kind, SymbolKind::Function);
}
3107
/// A substring symbol search restricted to structs must return structs only.
#[test]
fn test_list_by_kind_helper() {
    let workspace = TempDir::new().unwrap();
    let root = workspace.path().join("project");
    fs::create_dir(&root).unwrap();

    // Two structs and one function; only "Point" contains the text "oin".
    let source = "struct Point {}\nfn test() {}\nstruct Line {}";
    fs::write(root.join("main.rs"), source).unwrap();

    let indexer = Indexer::new(CacheManager::new(&root), IndexConfig::default());
    indexer.index(&root, false).unwrap();

    let engine = QueryEngine::new(CacheManager::new(&root));

    // Substring matching is required because "oin" sits inside "Point".
    let structs_only = QueryFilter {
        kind: Some(SymbolKind::Struct),
        symbols_mode: true,
        use_contains: true,
        ..Default::default()
    };
    let hits = engine.search("oin", structs_only).unwrap();

    assert!(!hits.is_empty(), "Should find at least Point struct");
    assert!(hits.iter().all(|hit| hit.kind == SymbolKind::Struct));
    let found_point = hits.iter().any(|hit| hit.symbol.as_deref() == Some("Point"));
    assert!(found_point);
}
3141
3142    // ==================== Metadata Tests ====================
3143
/// `search_with_metadata` must return a response that carries the results.
#[test]
fn test_search_with_metadata() {
    let workspace = TempDir::new().unwrap();
    let root = workspace.path().join("project");
    fs::create_dir(&root).unwrap();

    fs::write(root.join("main.rs"), "fn test() {}").unwrap();

    let indexer = Indexer::new(CacheManager::new(&root), IndexConfig::default());
    indexer.index(&root, false).unwrap();

    let engine = QueryEngine::new(CacheManager::new(&root));
    let response = engine
        .search_with_metadata("test", QueryFilter::default())
        .unwrap();

    // Only the results are checked. The status is not asserted because it
    // might be stale when the test runs inside a git repo, and
    // can_trust_results may be false without a branch index.
    assert!(!response.results.is_empty());
}
3166
3167    // ==================== Multi-language Tests ====================
3168
/// An unfiltered search must report matches from every indexed language.
#[test]
fn test_search_across_languages() {
    let workspace = TempDir::new().unwrap();
    let root = workspace.path().join("project");
    fs::create_dir(&root).unwrap();

    // The same function name defined in Rust, TypeScript and Python.
    fs::write(root.join("main.rs"), "fn greet() {}").unwrap();
    fs::write(root.join("main.ts"), "function greet() {}").unwrap();
    fs::write(root.join("main.py"), "def greet(): pass").unwrap();

    let indexer = Indexer::new(CacheManager::new(&root), IndexConfig::default());
    indexer.index(&root, false).unwrap();

    let engine = QueryEngine::new(CacheManager::new(&root));
    let hits = engine.search("greet", QueryFilter::default()).unwrap();

    // At least one hit per file, and each language must be represented.
    assert!(hits.len() >= 3);
    let has_lang = |lang: Language| hits.iter().any(|hit| hit.lang == lang);
    assert!(has_lang(Language::Rust));
    assert!(has_lang(Language::TypeScript));
    assert!(has_lang(Language::Python));
}
3195}