reflex/
query.rs

1//! Query engine for searching indexed code
2//!
3//! The query engine loads the memory-mapped cache and executes
4//! deterministic searches based on lexical, structural, or symbol patterns.
5
6use anyhow::{Context, Result};
7use regex::Regex;
8
9use crate::cache::CacheManager;
10use crate::content_store::ContentReader;
11use crate::models::{
12    IndexStatus, IndexWarning, IndexWarningDetails, Language, QueryResponse, SearchResult, Span,
13    SymbolKind,
14};
15use crate::output;
16use crate::parsers::ParserFactory;
17use crate::regex_trigrams::extract_trigrams_from_regex;
18use crate::trigram::TrigramIndex;
19
20/// Query filter options
21#[derive(Debug, Clone)]
22pub struct QueryFilter {
23    /// Language filter (None = all languages)
24    pub language: Option<Language>,
25    /// Symbol kind filter (None = all kinds)
26    pub kind: Option<SymbolKind>,
27    /// Use AST pattern matching (vs lexical search)
28    pub use_ast: bool,
29    /// Use regex pattern matching
30    pub use_regex: bool,
31    /// Maximum number of results
32    pub limit: Option<usize>,
33    /// Search symbol definitions only (vs full-text)
34    pub symbols_mode: bool,
35    /// Show full symbol body (from span.start_line to span.end_line)
36    pub expand: bool,
37    /// File path filter (substring match)
38    pub file_pattern: Option<String>,
39    /// Exact symbol name match (no substring matching)
40    pub exact: bool,
41    /// Use substring matching instead of word-boundary matching (opt-in, expansive)
42    pub use_contains: bool,
43    /// Query timeout in seconds (0 = no timeout)
44    pub timeout_secs: u64,
45    /// Glob patterns to include (empty = all files)
46    pub glob_patterns: Vec<String>,
47    /// Glob patterns to exclude (applied after includes)
48    pub exclude_patterns: Vec<String>,
49    /// Return only unique file paths (deduplicated)
50    pub paths_only: bool,
51    /// Pagination offset (skip first N results after sorting)
52    pub offset: Option<usize>,
53    /// Force execution of potentially expensive queries (bypass broad query detection)
54    pub force: bool,
55    /// Suppress warning/info output (for --json mode to ensure pure JSON output)
56    pub suppress_output: bool,
57    /// Include dependency information in results
58    pub include_dependencies: bool,
59    /// Test-only: Override large index threshold (None = use default of 20,000)
60    #[doc(hidden)]
61    pub test_large_index_threshold: Option<usize>,
62    /// Test-only: Override short pattern threshold (None = use default of 4)
63    #[doc(hidden)]
64    pub test_short_pattern_threshold: Option<usize>,
65}
66
67impl Default for QueryFilter {
68    fn default() -> Self {
69        Self {
70            language: None,
71            kind: None,
72            use_ast: false,
73            use_regex: false,
74            limit: Some(100),  // Default: limit to 100 results for token efficiency
75            symbols_mode: false,
76            expand: false,
77            file_pattern: None,
78            exact: false,
79            use_contains: false,  // Default: word-boundary matching
80            timeout_secs: 30, // 30 seconds default timeout
81            glob_patterns: Vec::new(),
82            exclude_patterns: Vec::new(),
83            paths_only: false,
84            offset: None,
85            force: false,  // Default: enable broad query detection
86            suppress_output: false,  // Default: show warnings/info
87            include_dependencies: false,  // Default: don't load dependencies for performance
88            test_large_index_threshold: None,  // Default: use production threshold (20,000)
89            test_short_pattern_threshold: None,  // Default: use production threshold (4)
90        }
91    }
92}
93
94/// Manages query execution against the index
95pub struct QueryEngine {
96    cache: CacheManager,
97}
98
99impl QueryEngine {
100    /// Create a new query engine with the given cache manager
101    pub fn new(cache: CacheManager) -> Self {
102        Self { cache }
103    }
104
105    /// Load dependencies for search results if requested (legacy - per result)
106    /// Deprecated: Use group_and_load_dependencies for file-level grouping
107    fn load_dependencies(&self, results: &mut [SearchResult], include_deps: bool) -> Result<()> {
108        if !include_deps || results.is_empty() {
109            return Ok(());
110        }
111
112        log::debug!("Loading dependencies for {} results", results.len());
113
114        // Create dependency index
115        // Note: We need to pass the workspace root, not the cache directory
116        // The cache path is .reflex/, so its parent is the workspace root (.)
117        let workspace_root = self.cache.path().parent()
118            .ok_or_else(|| anyhow::anyhow!("Cache path has no parent"))?;
119        let cache_for_deps = CacheManager::new(workspace_root);
120        let dep_index = crate::dependency::DependencyIndex::new(cache_for_deps);
121
122        // Load dependencies for each result
123        for result in results {
124            // Normalize path: strip leading "./" if present
125            let normalized_path = result.path.strip_prefix("./").unwrap_or(&result.path);
126
127            // Get file_id from database by path
128            match self.cache.get_file_id(normalized_path) {
129                Ok(Some(file_id)) => {
130                    log::debug!("Found file_id={} for path={}", file_id, result.path);
131                    // Get dependencies for this file
132                    match dep_index.get_dependencies_info(file_id) {
133                        Ok(dep_infos) => {
134                            log::debug!("Loaded {} dependencies for file_id={}", dep_infos.len(), file_id);
135                            if !dep_infos.is_empty() {
136                                result.dependencies = Some(dep_infos);
137                            }
138                        }
139                        Err(e) => {
140                            log::warn!("Failed to get dependencies for file_id={}: {}", file_id, e);
141                        }
142                    }
143                }
144                Ok(None) => {
145                    log::warn!("No file_id found for path: {}", result.path);
146                }
147                Err(e) => {
148                    log::warn!("Failed to get file_id for path {}: {}", result.path, e);
149                }
150            }
151        }
152
153        Ok(())
154    }
155
156    /// Group search results by file and load dependencies at file level
157    /// Returns file-grouped results with dependencies populated once per file
158    fn group_and_load_dependencies(
159        &self,
160        results: Vec<SearchResult>,
161        include_deps: bool,
162    ) -> Result<Vec<crate::models::FileGroupedResult>> {
163        use std::collections::HashMap;
164        use crate::models::{FileGroupedResult, MatchResult};
165
166        if results.is_empty() {
167            return Ok(Vec::new());
168        }
169
170        // Group results by file path
171        let mut grouped: HashMap<String, Vec<SearchResult>> = HashMap::new();
172        for result in results {
173            grouped
174                .entry(result.path.clone())
175                .or_default()
176                .push(result);
177        }
178
179        // Create dependency index if needed
180        let dep_index = if include_deps {
181            let workspace_root = self.cache.path().parent()
182                .ok_or_else(|| anyhow::anyhow!("Cache path has no parent"))?;
183            let cache_for_deps = CacheManager::new(workspace_root);
184            Some(crate::dependency::DependencyIndex::new(cache_for_deps))
185        } else {
186            None
187        };
188
189        // Load ContentReader for extracting context lines
190        let content_path = self.cache.path().join("content.bin");
191        let content_reader_opt = ContentReader::open(&content_path).ok();
192
193        // Convert to FileGroupedResult and load dependencies
194        let mut file_results: Vec<FileGroupedResult> = grouped
195            .into_iter()
196            .map(|(path, file_matches)| {
197                // Load dependencies for this file (once per file, not per result)
198                let dependencies = if let Some(dep_idx) = &dep_index {
199                    let normalized_path = path.strip_prefix("./").unwrap_or(&path);
200                    match self.cache.get_file_id(normalized_path) {
201                        Ok(Some(file_id)) => {
202                            match dep_idx.get_dependencies_info(file_id) {
203                                Ok(dep_infos) if !dep_infos.is_empty() => {
204                                    log::debug!("Loaded {} dependencies for file: {}", dep_infos.len(), path);
205                                    Some(dep_infos)
206                                }
207                                Ok(_) => None,
208                                Err(e) => {
209                                    log::warn!("Failed to get dependencies for {}: {}", path, e);
210                                    None
211                                }
212                            }
213                        }
214                        Ok(None) => {
215                            log::warn!("No file_id found for path: {}", path);
216                            None
217                        }
218                        Err(e) => {
219                            log::warn!("Failed to get file_id for path {}: {}", path, e);
220                            None
221                        }
222                    }
223                } else {
224                    None
225                };
226
227                // Get file_id for context extraction
228                // Note: We use ContentReader's get_file_id_by_path() which returns array indices,
229                // not database file_ids (which are AUTO INCREMENT values)
230                let normalized_path = path.strip_prefix("./").unwrap_or(&path);
231                let file_id_for_context = if let Some(reader) = &content_reader_opt {
232                    reader.get_file_id_by_path(normalized_path)
233                } else {
234                    None
235                };
236                log::debug!("Context extraction: file={}, file_id={:?}, content_reader={}",
237                    path, file_id_for_context, content_reader_opt.is_some());
238
239                // Convert SearchResults to MatchResults (strip path and dependencies) and extract context
240                let matches: Vec<MatchResult> = file_matches
241                    .into_iter()
242                    .map(|r| {
243                        // Extract context lines (default: 3 lines before and after)
244                        let (context_before, context_after) = if let (Some(reader), Some(fid)) = (&content_reader_opt, file_id_for_context) {
245                            let result = reader.get_context_by_line(fid as u32, r.span.start_line, 3)
246                                .unwrap_or_else(|e| {
247                                    log::warn!("Failed to extract context for {}:{}: {}", path, r.span.start_line, e);
248                                    (vec![], vec![])
249                                });
250                            log::debug!("Extracted context for {}:{} - before: {}, after: {}",
251                                path, r.span.start_line, result.0.len(), result.1.len());
252                            result
253                        } else {
254                            if content_reader_opt.is_none() {
255                                log::debug!("No ContentReader available for context extraction");
256                            }
257                            if file_id_for_context.is_none() {
258                                log::debug!("No file_id found for {}", path);
259                            }
260                            (vec![], vec![])
261                        };
262
263                        MatchResult {
264                            kind: r.kind,
265                            symbol: r.symbol,
266                            span: r.span,
267                            preview: r.preview,
268                            context_before,
269                            context_after,
270                        }
271                    })
272                    .collect();
273
274                FileGroupedResult {
275                    path,
276                    dependencies,
277                    matches,
278                }
279            })
280            .collect();
281
282        // Sort by path for deterministic output
283        file_results.sort_by(|a, b| a.path.cmp(&b.path));
284
285        Ok(file_results)
286    }
287
288    /// Execute a query and return matching results with index metadata
289    ///
290    /// This is the preferred method for programmatic/JSON output as it includes
291    /// index freshness information that AI agents can use to decide whether to re-index.
292    pub fn search_with_metadata(&self, pattern: &str, filter: QueryFilter) -> Result<QueryResponse> {
293        log::info!("Executing query with metadata: pattern='{}', filter={:?}", pattern, filter);
294
295        // Ensure cache exists
296        if !self.cache.exists() {
297            anyhow::bail!(
298                "Index not found. Run 'rfx index' to build the cache first."
299            );
300        }
301
302        // Validate cache integrity
303        if let Err(e) = self.cache.validate() {
304            anyhow::bail!(
305                "Cache appears to be corrupted: {}. Run 'rfx clear' followed by 'rfx index' to rebuild.",
306                e
307            );
308        }
309
310        // Get index status and warning (without printing warnings to stderr)
311        let (status, can_trust_results, warning) = self.get_index_status()?;
312
313        // Execute the search
314        let (results, total) = self.search_internal(pattern, filter.clone())?;
315
316        // Build pagination metadata
317        use crate::models::PaginationInfo;
318        let pagination = PaginationInfo {
319            total,
320            count: results.len(),
321            offset: filter.offset.unwrap_or(0),
322            limit: filter.limit,
323            has_more: total > filter.offset.unwrap_or(0) + results.len(),
324        };
325
326        // Always use grouped format (group results by file)
327        // Dependencies are loaded only when include_dependencies is true
328        let grouped_results = self.group_and_load_dependencies(results, filter.include_dependencies)?;
329
330        Ok(QueryResponse {
331            ai_instruction: None,  // AI instruction is generated by CLI/MCP layer, not here
332            status,
333            can_trust_results,
334            warning,
335            pagination,
336            results: grouped_results,
337        })
338    }
339
340    /// Execute a query and return matching results (legacy method)
341    ///
342    /// This method prints warnings to stderr and returns just the results.
343    /// For programmatic use, prefer `search_with_metadata()`.
344    pub fn search(&self, pattern: &str, filter: QueryFilter) -> Result<Vec<SearchResult>> {
345        log::info!("Executing query: pattern='{}', filter={:?}", pattern, filter);
346
347        // Ensure cache exists
348        if !self.cache.exists() {
349            anyhow::bail!(
350                "Index not found. Run 'rfx index' to build the cache first."
351            );
352        }
353
354        // Validate cache integrity
355        if let Err(e) = self.cache.validate() {
356            anyhow::bail!(
357                "Cache appears to be corrupted: {}. Run 'rfx clear' followed by 'rfx index' to rebuild.",
358                e
359            );
360        }
361
362        // Show non-blocking warnings about branch state and staleness
363        self.check_index_freshness(&filter)?;
364
365        // Execute the search (discard total count - legacy method doesn't use it)
366        let (mut results, _total_count) = self.search_internal(pattern, filter.clone())?;
367
368        // Load dependencies if requested
369        self.load_dependencies(&mut results, filter.include_dependencies)?;
370
371        Ok(results)
372    }
373
374    /// Internal search implementation (used by both search methods)
375    /// Returns (results, total_count) where total_count is the count before offset/limit
376    fn search_internal(&self, pattern: &str, filter: QueryFilter) -> Result<(Vec<SearchResult>, usize)> {
377        use std::time::{Duration, Instant};
378
379        // Start timeout timer if configured
380        let start_time = Instant::now();
381        let timeout = if filter.timeout_secs > 0 {
382            Some(Duration::from_secs(filter.timeout_secs))
383        } else {
384            None
385        };
386
387        // KEYWORD DETECTION (early): Check if this is a keyword query that should scan ALL files
388        // When a user searches for a language keyword (like "class", "function") with --symbols or --kind,
389        // we interpret it as "list all symbols of that type" and should scan ALL files,
390        // not just the first 100 candidates from trigram search.
391        //
392        // Requirements for keyword query mode:
393        // 1. Symbol mode active (--symbols or --kind)
394        // 2. Pattern matches a keyword in ANY supported language
395        //
396        // Note: --lang is optional. If specified, language filtering happens naturally in Phase 2/3.
397        let is_keyword_query = if filter.symbols_mode || filter.kind.is_some() {
398            ParserFactory::get_all_keywords().contains(&pattern)
399        } else {
400            false
401        };
402
403        // KEYWORD-TO-KIND MAPPING: If user searches for a keyword without --kind, infer the kind
404        // Example: "class" → SymbolKind::Class, "function" → SymbolKind::Function
405        // This ensures keyword queries return only the relevant symbol type
406        let mut filter = filter.clone();  // Clone so we can modify it
407        if is_keyword_query && filter.kind.is_none() {
408            if let Some(inferred_kind) = Self::keyword_to_kind(pattern) {
409                log::info!("Keyword '{}' mapped to kind {:?} (auto-inferred)", pattern, inferred_kind);
410                filter.kind = Some(inferred_kind);
411            }
412        }
413
414        // EARLY BROAD QUERY DETECTION (Index Size Check)
415        // This check happens BEFORE the expensive trigram search to prevent hangs on large indexes
416        // For very large codebases (like Linux kernel with 62K files), even valid 3-char trigrams
417        // like "get" can take 10-30+ seconds to search. This early check prevents that hang.
418        //
419        // Criteria for early blocking:
420        // 1. Large index (> 20,000 files) AND
421        // 2. Short pattern (< 4 chars) AND
422        // 3. Not using regex (regex has its own trigram extraction) AND
423        // 4. Not a keyword query (keywords are intentionally broad) AND
424        // 5. Not forced by --force flag
425        if !filter.force && !filter.use_regex && !is_keyword_query {
426            let stats = self.cache.stats()?;
427            let total_files = stats.total_files;
428            let pattern_len = pattern.chars().count();
429
430            // Thresholds for early blocking:
431            // - Large index: 20,000+ files (approximately where performance degrades significantly)
432            // - Short pattern: < 4 chars (3-char trigrams are borderline, < 4 catches edge cases)
433            // Test overrides allow reducing thresholds for integration tests without creating 20K+ files
434            let large_index_threshold = filter.test_large_index_threshold.unwrap_or(20_000);
435            let short_pattern_threshold = filter.test_short_pattern_threshold.unwrap_or(4);
436
437            if total_files > large_index_threshold && pattern_len < short_pattern_threshold {
438                anyhow::bail!(
439                    "Query too broad - would be expensive to execute on this large index\n\
440                     \n\
441                     This index contains {} files, and pattern '{}' ({} characters) is too short for efficient searching.\n\
442                     On large codebases, short patterns can take 10-30+ seconds to complete.\n\
443                     \n\
444                     This query could:\n\
445                     • Hang for an extended period before returning results\n\
446                     • Return thousands of results\n\
447                     • Flood LLM context windows with excessive data\n\
448                     • Fail entirely\n\
449                     \n\
450                     Suggestions to narrow the query:\n\
451                     • Use a longer, more specific pattern (4+ characters recommended for large indexes)\n\
452                     • Add a language filter: --lang <language>\n\
453                     • Add a file filter: --glob <pattern> or --file <path>\n\
454                     • Use --force to bypass this check if you really need all results\n\
455                     \n\
456                     To force execution anyway:\n\
457                     rfx query \"{}\" --force",
458                    total_files,
459                    pattern,
460                    pattern_len,
461                    pattern
462                );
463            }
464        }
465
466        // PHASE 1: Get initial candidates (choose search strategy)
467        let mut results = if is_keyword_query {
468            // KEYWORD QUERY MODE: Scan all files (or files of target language if --lang specified)
469            // This ensures we find ALL classes/functions/etc, not just those in the first 100 trigram matches
470            if let Some(lang) = filter.language {
471                log::info!("Keyword query detected for '{}' - scanning all {:?} files (bypassing trigram search)",
472                          pattern, lang);
473            } else {
474                log::info!("Keyword query detected for '{}' - scanning all files (bypassing trigram search)", pattern);
475            }
476            self.get_all_language_files(&filter)?
477        } else if filter.use_regex {
478            // Regex pattern search with trigram optimization
479            self.get_regex_candidates(pattern, timeout.as_ref(), &start_time, filter.suppress_output)?
480        } else {
481            // Standard trigram-based full-text search
482            self.get_trigram_candidates(pattern, &filter)?
483        };
484
485        // EARLY LANGUAGE FILTER: Apply language filtering BEFORE broad query check
486        // This ensures we only parse files matching the language filter in Phase 2
487        // Critical for non-keyword queries to work correctly with accurate candidate counts
488        //
489        // Skip for keyword queries - those candidates are already pre-filtered by language
490        if !is_keyword_query {
491            if let Some(lang) = filter.language {
492                let before_count = results.len();
493                results.retain(|r| r.lang == lang);
494                log::debug!(
495                    "Language filter ({:?}): reduced {} candidates to {} candidates",
496                    lang,
497                    before_count,
498                    results.len()
499                );
500            }
501        }
502
503        // EARLY GLOB PATTERN FILTER: Apply glob/exclude filtering BEFORE broad query check
504        // This ensures candidate count reflects actual files that will be parsed
505        // Critical for queries like: rfx query "index" --symbols --glob "src/**/*.rs"
506        if !filter.glob_patterns.is_empty() || !filter.exclude_patterns.is_empty() {
507            use globset::{Glob, GlobSetBuilder};
508
509            // Build include matcher (if patterns specified)
510            let include_matcher = if !filter.glob_patterns.is_empty() {
511                let mut builder = GlobSetBuilder::new();
512                for pattern in &filter.glob_patterns {
513                    // Normalize pattern to ensure LLM-generated patterns work correctly
514                    let normalized = Self::normalize_glob_pattern(pattern);
515                    match Glob::new(&normalized) {
516                        Ok(glob) => {
517                            builder.add(glob);
518                        }
519                        Err(e) => {
520                            log::warn!("Invalid glob pattern '{}': {}", pattern, e);
521                        }
522                    }
523                }
524                match builder.build() {
525                    Ok(matcher) => Some(matcher),
526                    Err(e) => {
527                        log::warn!("Failed to build glob matcher: {}", e);
528                        None
529                    }
530                }
531            } else {
532                None
533            };
534
535            // Build exclude matcher (if patterns specified)
536            let exclude_matcher = if !filter.exclude_patterns.is_empty() {
537                let mut builder = GlobSetBuilder::new();
538                for pattern in &filter.exclude_patterns {
539                    // Normalize pattern to ensure LLM-generated patterns work correctly
540                    let normalized = Self::normalize_glob_pattern(pattern);
541                    match Glob::new(&normalized) {
542                        Ok(glob) => {
543                            builder.add(glob);
544                        }
545                        Err(e) => {
546                            log::warn!("Invalid exclude pattern '{}': {}", pattern, e);
547                        }
548                    }
549                }
550                match builder.build() {
551                    Ok(matcher) => Some(matcher),
552                    Err(e) => {
553                        log::warn!("Failed to build exclude matcher: {}", e);
554                        None
555                    }
556                }
557            } else {
558                None
559            };
560
561            // Apply filters
562            let before_count = results.len();
563            results.retain(|r| {
564                // If include patterns specified, path must match at least one
565                let included = if let Some(ref matcher) = include_matcher {
566                    matcher.is_match(&r.path)
567                } else {
568                    true // No include patterns = include all
569                };
570
571                // If exclude patterns specified, path must NOT match any
572                let excluded = if let Some(ref matcher) = exclude_matcher {
573                    matcher.is_match(&r.path)
574                } else {
575                    false // No exclude patterns = exclude none
576                };
577
578                included && !excluded
579            });
580            log::debug!(
581                "Glob filter: reduced {} candidates to {} candidates",
582                before_count,
583                results.len()
584            );
585        }
586
587        // Check timeout after Phase 1
588        if let Some(timeout_duration) = timeout {
589            if start_time.elapsed() > timeout_duration {
590                anyhow::bail!(
591                    "Query timeout exceeded ({} seconds).\n\
592                     \n\
593                     The query took too long to complete. Try one of these approaches:\n\
594                     • Use a more specific search pattern (longer patterns = faster search)\n\
595                     • Add a language filter with --lang to narrow the search space\n\
596                     • Add a file filter with --file to search specific directories\n\
597                     • Increase the timeout with --timeout <seconds>\n\
598                     \n\
599                     Example: rfx query \"{}\" --lang rust --timeout 60",
600                    filter.timeout_secs,
601                    pattern
602                );
603            }
604        }
605
606        // BROAD QUERY DETECTION: Check if query is too expensive BEFORE parsing
607        // This protects LLM users from accidentally running expensive queries that flood context windows
608        if !filter.force {
609            let candidate_count = results.len();
610            let pattern_len = pattern.chars().count();
611
612            // Condition 1: Pattern too short (< 3 chars can't use trigram optimization efficiently)
613            // Exception: Allow short keyword queries (e.g., "fn", "if") since they scan all language files
614            let is_short_pattern = pattern_len < 3 && !filter.use_regex && !is_keyword_query;
615
616            // Condition 2: AST query without glob restriction on large codebases
617            // Allow on small codebases (< 100 files) but require glob for larger ones
618            let is_broad_ast = filter.use_ast && filter.glob_patterns.is_empty() && candidate_count >= 100;
619
620            // Condition 3: Query-type-aware threshold for symbol/AST parsing
621            // Different thresholds based on actual performance characteristics:
622            // - AST without glob: 100 files (allow small codebases, block large ones)
623            // - AST with glob: 10,000 files (~5 seconds max)
624            // - Keyword queries: 20,000 files (~3 seconds max) - scan all files of language
625            // - Trigram-filtered symbols: 50,000 files (~5 seconds max) - very fast due to trigram filtering
626            let threshold = if filter.use_ast && filter.glob_patterns.is_empty() {
627                100  // AST without glob - allow small codebases
628            } else if filter.use_ast {
629                10_000  // AST with glob restriction
630            } else if is_keyword_query {
631                20_000  // Keyword queries (e.g., "class", "function")
632            } else {
633                50_000  // Trigram-filtered symbol queries
634            };
635
636            let has_many_candidates = candidate_count > threshold &&
637                                     (filter.symbols_mode || filter.kind.is_some() || filter.use_ast);
638
639            if is_short_pattern || has_many_candidates || is_broad_ast {
640                let reason = if is_short_pattern {
641                    format!("Pattern '{}' is too short ({} characters). Short patterns bypass trigram optimization and require scanning many files.", pattern, pattern_len)
642                } else if is_broad_ast {
643                    format!("AST query without --glob restriction will scan the entire codebase ({} files). AST queries are SLOW (500ms-10s+).", candidate_count)
644                } else if is_keyword_query {
645                    format!("Keyword query '{}' matched {} files. This query scans all files of the target language, which will take significant time and produce excessive results.", pattern, candidate_count)
646                } else {
647                    format!("Query matched {} files. Parsing this many files with --symbols or --kind will take significant time and produce excessive results.", candidate_count)
648                };
649
650                let suggestions = if is_short_pattern {
651                    vec![
652                        "• Use a longer, more specific pattern (3+ characters recommended)",
653                        "• Add a language filter: --lang <language>",
654                        "• Add a file path filter: --file <path> or --glob <pattern>",
655                        "• Use --force to bypass this check if you really need all results"
656                    ]
657                } else if is_broad_ast {
658                    vec![
659                        "• Add --glob to restrict AST query to specific files: --glob 'src/**/*.rs'",
660                        "• Use --symbols instead (10-100x faster in 95% of cases)",
661                        "• Use --force to bypass this check if you need a full codebase scan"
662                    ]
663                } else if is_keyword_query {
664                    vec![
665                        "• Add a language filter to reduce files scanned: --lang <language>",
666                        "• Add glob patterns to search specific directories: --glob 'src/**/*.rs'",
667                        "• Add --kind to filter to specific symbol types: --kind function",
668                        "• Use a more specific pattern instead of a keyword",
669                        "• Use --force to bypass this check if you need all results"
670                    ]
671                } else {
672                    vec![
673                        "• Add a language filter to reduce candidate set: --lang <language>",
674                        "• Add glob patterns to search specific directories: --glob 'src/**/*.rs'",
675                        "• Use a more specific search pattern",
676                        "• Use --force to bypass this check if you need all results"
677                    ]
678                };
679
680                // Build the command snippet showing current flags
681                let mut cmd_flags = String::new();
682                if filter.symbols_mode {
683                    cmd_flags.push_str("--symbols ");
684                }
685                if let Some(ref lang) = filter.language {
686                    cmd_flags.push_str(&format!("--lang {:?} ", lang));
687                }
688                if let Some(ref kind) = filter.kind {
689                    cmd_flags.push_str(&format!("--kind {:?} ", kind));
690                }
691                if filter.use_ast {
692                    cmd_flags.push_str("--ast ");
693                }
694
695                anyhow::bail!(
696                    "Query too broad - would be expensive to execute\n\
697                     \n\
698                     {}\n\
699                     \n\
700                     This query could:\n\
701                     • Hang for an extended period before returning results\n\
702                     • Return thousands of results\n\
703                     • Flood LLM context windows with excessive data\n\
704                     • Fail entirely\n\
705                     \n\
706                     Suggestions to narrow the query:\n\
707                     {}\n\
708                     \n\
709                     To force execution anyway:\n\
710                     rfx query \"{}\" --force {}",
711                    reason,
712                    suggestions.join("\n             "),
713                    pattern,
714                    cmd_flags
715                );
716            }
717        }
718
719        // DETERMINISTIC SORTING: Sort candidates early for deterministic results
720        // This ensures results are always returned in the same order
721        if filter.symbols_mode || filter.kind.is_some() || filter.use_ast {
722            results.sort_by(|a, b| {
723                a.path.cmp(&b.path)
724                    .then_with(|| a.span.start_line.cmp(&b.span.start_line))
725            });
726
727            // Warn if many candidates need parsing (helps users refine queries)
728            let candidate_count = results.len();
729            if candidate_count > 1000 && !filter.suppress_output {
730                output::warn(&format!(
731                    "Pattern '{}' matched {} files - parsing may take some time. Consider using --file, --glob, or a more specific pattern to narrow the search.",
732                    pattern,
733                    candidate_count
734                ));
735            } else if candidate_count > 100 {
736                log::info!("Parsing {} candidate files for symbol extraction", candidate_count);
737            }
738        }
739
740        // PHASE 2: Enrich with symbol information or AST pattern matching (if needed)
741        if filter.use_ast {
742            // AST pattern matching: Execute Tree-sitter query on candidate files
743            results = self.enrich_with_ast(results, pattern, filter.language)?;
744        } else if filter.symbols_mode || filter.kind.is_some() {
745            // Symbol enrichment: Parse candidate files and extract symbol definitions
746            results = self.enrich_with_symbols(results, pattern, &filter)?;
747        }
748
749        // PHASE 3: Apply post-enrichment filters
750        // Note: Language and glob filters are applied in Phase 1 (before broad query check)
751        // Only kind, file_pattern, and exact filters are applied here
752
753        // Apply kind filter (only relevant for symbol searches)
754        // Special case: --kind function also includes methods (methods are functions in classes)
755        if let Some(ref kind) = filter.kind {
756            results.retain(|r| {
757                if matches!(kind, SymbolKind::Function) {
758                    // When searching for functions, also include methods
759                    matches!(r.kind, SymbolKind::Function | SymbolKind::Method)
760                } else {
761                    r.kind == *kind
762                }
763            });
764        }
765
766        // Apply file path filter (substring match)
767        if let Some(ref file_pattern) = filter.file_pattern {
768            results.retain(|r| r.path.contains(file_pattern));
769        }
770
771        // Apply exact name filter (only for symbol searches)
772        if filter.exact && filter.symbols_mode {
773            results.retain(|r| r.symbol.as_deref() == Some(pattern));
774        }
775
776        // Expand symbol bodies if requested
777        // Works for both symbol-mode and regex searches (if regex matched a symbol definition)
778        if filter.expand {
779            // Load content store to fetch full symbol bodies
780            let content_path = self.cache.path().join("content.bin");
781            if let Ok(content_reader) = ContentReader::open(&content_path) {
782                for result in &mut results {
783                    // Only expand if the result has a meaningful span (not just a single line)
784                    if result.span.start_line < result.span.end_line {
785                        // Find the file_id for this result's path
786                        if let Some(file_id) = Self::find_file_id(&content_reader, &result.path) {
787                            // Fetch the full span content
788                            if let Ok(content) = content_reader.get_file_content(file_id) {
789                                let lines: Vec<&str> = content.lines().collect();
790                                let start_idx = (result.span.start_line as usize).saturating_sub(1);
791                                let end_idx = (result.span.end_line as usize).min(lines.len());
792
793                                if start_idx < end_idx {
794                                    let full_body = lines[start_idx..end_idx].join("\n");
795                                    result.preview = full_body;
796                                }
797                            }
798                        }
799                    }
800                }
801            }
802        }
803
804        // Step 4: Deduplicate by path if paths-only mode
805        if filter.paths_only {
806            use std::collections::HashSet;
807            let mut seen_paths = HashSet::new();
808            results.retain(|r| seen_paths.insert(r.path.clone()));
809        }
810
811        // Step 5: Sort results deterministically (by path, then line number)
812        results.sort_by(|a, b| {
813            a.path.cmp(&b.path)
814                .then_with(|| a.span.start_line.cmp(&b.span.start_line))
815        });
816
817        // Capture total count AFTER all filtering but BEFORE pagination (offset/limit)
818        // This is the total number of results the user can paginate through
819        let total_count = results.len();
820
821        // Step 5.5: Apply offset (pagination)
822        if let Some(offset) = filter.offset {
823            if offset < results.len() {
824                results = results.into_iter().skip(offset).collect();
825            } else {
826                // Offset beyond results - return empty
827                results.clear();
828            }
829        }
830
831        // Step 6: Apply limit
832        if let Some(limit) = filter.limit {
833            results.truncate(limit);
834        }
835
836        log::info!("Query returned {} results (total before pagination: {})", results.len(), total_count);
837
838        Ok((results, total_count))
839    }
840
841    /// Search for symbols by exact name match
842    pub fn find_symbol(&self, name: &str) -> Result<Vec<SearchResult>> {
843        let filter = QueryFilter {
844            symbols_mode: true,
845            ..Default::default()
846        };
847        self.search(name, filter)
848    }
849
850    /// Search using a Tree-sitter AST pattern
851    pub fn search_ast(&self, pattern: &str, lang: Option<Language>) -> Result<Vec<SearchResult>> {
852        let filter = QueryFilter {
853            language: lang,
854            use_ast: true,
855            ..Default::default()
856        };
857
858        self.search(pattern, filter)
859    }
860
861    /// Execute AST query on all indexed files (no trigram filtering)
862    ///
863    /// WARNING: This method scans the entire codebase (500ms-2s+).
864    /// In 95% of cases, use --symbols instead which is 10-100x faster.
865    ///
866    /// # Algorithm
867    /// 1. Get all indexed files for the specified language
868    /// 2. Apply glob/exclude filters to reduce file set
869    /// 3. Load file contents for all matching files
870    /// 4. Execute AST query pattern using Tree-sitter
871    /// 5. Apply remaining filters and return results
872    ///
873    /// # Performance
874    /// - Parses entire codebase (not just trigram candidates)
875    /// - Expected: 500ms-2s for medium codebases, 2-10s for large codebases
876    /// - Use --glob to limit scope for better performance
877    ///
878    /// # Requirements
879    /// - Language must be specified (AST queries are language-specific)
880    /// - AST pattern must be valid S-expression syntax
881    pub fn search_ast_all_files(&self, ast_pattern: &str, filter: QueryFilter) -> Result<Vec<SearchResult>> {
882        log::info!("Executing AST query on all files: pattern='{}', filter={:?}", ast_pattern, filter);
883
884        // Require language for AST queries
885        let lang = filter.language.ok_or_else(|| anyhow::anyhow!(
886            "Language must be specified for AST pattern matching. Use --lang to specify the language.\n\
887             \n\
888             Example: rfx query \"(function_definition) @fn\" --ast --lang python"
889        ))?;
890
891        // Ensure cache exists
892        if !self.cache.exists() {
893            anyhow::bail!(
894                "Index not found. Run 'rfx index' to build the cache first."
895            );
896        }
897
898        // Show non-blocking warnings about branch state and staleness
899        self.check_index_freshness(&filter)?;
900
901        // Load content store
902        let content_path = self.cache.path().join("content.bin");
903        let content_reader = ContentReader::open(&content_path)
904            .context("Failed to open content store")?;
905
906        // Build glob matchers ONCE before file iteration (performance optimization)
907        use globset::{Glob, GlobSetBuilder};
908
909        let include_matcher = if !filter.glob_patterns.is_empty() {
910            let mut builder = GlobSetBuilder::new();
911            for pattern in &filter.glob_patterns {
912                // Normalize pattern to ensure LLM-generated patterns work correctly
913                let normalized = Self::normalize_glob_pattern(pattern);
914                if let Ok(glob) = Glob::new(&normalized) {
915                    builder.add(glob);
916                }
917            }
918            builder.build().ok()
919        } else {
920            None
921        };
922
923        let exclude_matcher = if !filter.exclude_patterns.is_empty() {
924            let mut builder = GlobSetBuilder::new();
925            for pattern in &filter.exclude_patterns {
926                // Normalize pattern to ensure LLM-generated patterns work correctly
927                let normalized = Self::normalize_glob_pattern(pattern);
928                if let Ok(glob) = Glob::new(&normalized) {
929                    builder.add(glob);
930                }
931            }
932            builder.build().ok()
933        } else {
934            None
935        };
936
937        // Get all files matching the language and glob filters
938        let mut candidates: Vec<SearchResult> = Vec::new();
939
940        for file_id in 0..content_reader.file_count() {
941            let file_path = match content_reader.get_file_path(file_id as u32) {
942                Some(p) => p,
943                None => continue,
944            };
945
946            // Detect language from file extension
947            let ext = file_path.extension()
948                .and_then(|e| e.to_str())
949                .unwrap_or("");
950            let detected_lang = Language::from_extension(ext);
951
952            // Filter by language
953            if detected_lang != lang {
954                continue;
955            }
956
957            let file_path_str = file_path.to_string_lossy().to_string();
958
959            // Apply glob/exclude filters BEFORE loading content (performance optimization)
960            let included = include_matcher.as_ref().map_or(true, |m| m.is_match(&file_path_str));
961            let excluded = exclude_matcher.as_ref().map_or(false, |m| m.is_match(&file_path_str));
962
963            if !included || excluded {
964                continue;
965            }
966
967            // Create a dummy candidate for this file (AST query will replace it)
968            candidates.push(SearchResult {
969                path: file_path_str,
970                lang: detected_lang,
971                span: Span { start_line: 1, end_line: 1 },
972                symbol: None,
973                kind: SymbolKind::Unknown("ast_query".to_string()),
974                preview: String::new(),
975                dependencies: None,
976            });
977        }
978
979        log::info!("AST query scanning {} files for language {:?}", candidates.len(), lang);
980
981        // BROAD QUERY DETECTION: Block large AST queries without glob restriction
982        // Allow small codebases (<100 files) but require --glob for larger ones
983        if !filter.force && filter.glob_patterns.is_empty() && candidates.len() >= 100 {
984            anyhow::bail!(
985                "Query too broad - would be expensive to execute\n\
986                 \n\
987                 AST query without --glob restriction will scan the ENTIRE codebase ({} files). AST queries are SLOW (500ms-10s+).\n\
988                 \n\
989                 This query could:\n\
990                 • Hang for an extended period before returning results\n\
991                 • Return thousands of results\n\
992                 • Flood LLM context windows with excessive data\n\
993                 • Fail entirely\n\
994                 \n\
995                 Suggestions to narrow the query:\n\
996                 • Add --glob to restrict AST query to specific files: --glob 'src/**/*.rs'\n\
997                 • Use --symbols instead (10-100x faster in 95% of cases)\n\
998                 • Use --force to bypass this check if you need a full codebase scan\n\
999                 \n\
1000                 To force execution anyway:\n\
1001                 rfx query \"{}\" --force --ast --lang {:?}",
1002                candidates.len(),
1003                ast_pattern,
1004                lang
1005            );
1006        }
1007
1008        if candidates.is_empty() {
1009            if !filter.suppress_output {
1010                output::warn(&format!("No files found for language {:?}. Check your language filter or glob patterns.", lang));
1011            }
1012            return Ok(Vec::new());
1013        }
1014
1015        // Execute the AST query on all candidate files
1016        // This will load file contents and parse them with tree-sitter
1017        let mut results = self.enrich_with_ast(candidates, ast_pattern, filter.language)?;
1018
1019        log::debug!("AST query found {} matches before filtering", results.len());
1020
1021        // Apply remaining filters (same as search_internal Phase 3)
1022
1023        // Apply kind filter
1024        if let Some(ref kind) = filter.kind {
1025            results.retain(|r| {
1026                if matches!(kind, SymbolKind::Function) {
1027                    matches!(r.kind, SymbolKind::Function | SymbolKind::Method)
1028                } else {
1029                    r.kind == *kind
1030                }
1031            });
1032        }
1033
1034        // Note: exact filter doesn't make sense for AST queries (pattern is S-expression, not symbol name)
1035
1036        // Expand symbol bodies if requested
1037        if filter.expand {
1038            let content_path = self.cache.path().join("content.bin");
1039            if let Ok(content_reader) = ContentReader::open(&content_path) {
1040                for result in &mut results {
1041                    if result.span.start_line < result.span.end_line {
1042                        if let Some(file_id) = Self::find_file_id(&content_reader, &result.path) {
1043                            if let Ok(content) = content_reader.get_file_content(file_id) {
1044                                let lines: Vec<&str> = content.lines().collect();
1045                                let start_idx = (result.span.start_line as usize).saturating_sub(1);
1046                                let end_idx = (result.span.end_line as usize).min(lines.len());
1047
1048                                if start_idx < end_idx {
1049                                    let full_body = lines[start_idx..end_idx].join("\n");
1050                                    result.preview = full_body;
1051                                }
1052                            }
1053                        }
1054                    }
1055                }
1056            }
1057        }
1058
1059        // Deduplicate by path if paths-only mode
1060        if filter.paths_only {
1061            use std::collections::HashSet;
1062            let mut seen_paths = HashSet::new();
1063            results.retain(|r| seen_paths.insert(r.path.clone()));
1064        }
1065
1066        // Sort results deterministically
1067        results.sort_by(|a, b| {
1068            a.path.cmp(&b.path)
1069                .then_with(|| a.span.start_line.cmp(&b.span.start_line))
1070        });
1071
1072        // Apply offset (pagination)
1073        if let Some(offset) = filter.offset {
1074            if offset < results.len() {
1075                results = results.into_iter().skip(offset).collect();
1076            } else {
1077                results.clear();
1078            }
1079        }
1080
1081        // Apply limit
1082        if let Some(limit) = filter.limit {
1083            results.truncate(limit);
1084        }
1085
1086        log::info!("AST query returned {} results", results.len());
1087
1088        // Load dependencies if requested
1089        self.load_dependencies(&mut results, filter.include_dependencies)?;
1090
1091        Ok(results)
1092    }
1093
1094    /// Search using AST pattern with separate text pattern for trigram filtering
1095    ///
1096    /// This allows efficient AST queries by:
1097    /// 1. Using text_pattern for Phase 1 trigram filtering (narrows to candidate files)
1098    /// 2. Using ast_pattern for Phase 2 AST matching (structure-aware filtering)
1099    ///
1100    /// # Example
1101    /// ```ignore
1102    /// // Find async functions: trigram search for "fn ", AST match for function_item
1103    /// engine.search_ast_with_text_filter("fn ", "(function_item (async))", filter)?;
1104    /// ```
1105    pub fn search_ast_with_text_filter(
1106        &self,
1107        text_pattern: &str,
1108        ast_pattern: &str,
1109        filter: QueryFilter,
1110    ) -> Result<Vec<SearchResult>> {
1111        log::info!("Executing AST query with text filter: text='{}', ast='{}', filter={:?}",
1112                   text_pattern, ast_pattern, filter);
1113
1114        // Ensure cache exists
1115        if !self.cache.exists() {
1116            anyhow::bail!(
1117                "Index not found. Run 'rfx index' to build the cache first."
1118            );
1119        }
1120
1121        // Show non-blocking warnings about branch state and staleness
1122        self.check_index_freshness(&filter)?;
1123
1124        // Start timeout timer if configured
1125        use std::time::{Duration, Instant};
1126        let start_time = Instant::now();
1127        let timeout = if filter.timeout_secs > 0 {
1128            Some(Duration::from_secs(filter.timeout_secs))
1129        } else {
1130            None
1131        };
1132
1133        // PHASE 1: Get initial candidates using text pattern (trigram search)
1134        let candidates = if filter.use_regex {
1135            self.get_regex_candidates(text_pattern, timeout.as_ref(), &start_time, filter.suppress_output)?
1136        } else {
1137            self.get_trigram_candidates(text_pattern, &filter)?
1138        };
1139
1140        log::debug!("Phase 1 found {} candidate locations", candidates.len());
1141
1142        // PHASE 2: Execute AST query on candidates
1143        let mut results = self.enrich_with_ast(candidates, ast_pattern, filter.language)?;
1144
1145        log::debug!("Phase 2 AST matching found {} results", results.len());
1146
1147        // PHASE 3: Apply filters
1148        if let Some(lang) = filter.language {
1149            results.retain(|r| r.lang == lang);
1150        }
1151
1152        if let Some(ref kind) = filter.kind {
1153            results.retain(|r| {
1154                if matches!(kind, SymbolKind::Function) {
1155                    matches!(r.kind, SymbolKind::Function | SymbolKind::Method)
1156                } else {
1157                    r.kind == *kind
1158                }
1159            });
1160        }
1161
1162        if let Some(ref file_pattern) = filter.file_pattern {
1163            results.retain(|r| r.path.contains(file_pattern));
1164        }
1165
1166        // Apply glob pattern filters (same logic as in search_internal)
1167        if !filter.glob_patterns.is_empty() || !filter.exclude_patterns.is_empty() {
1168            use globset::{Glob, GlobSetBuilder};
1169
1170            let include_matcher = if !filter.glob_patterns.is_empty() {
1171                let mut builder = GlobSetBuilder::new();
1172                for pattern in &filter.glob_patterns {
1173                    // Normalize pattern to ensure LLM-generated patterns work correctly
1174                    let normalized = Self::normalize_glob_pattern(pattern);
1175                    if let Ok(glob) = Glob::new(&normalized) {
1176                        builder.add(glob);
1177                    }
1178                }
1179                builder.build().ok()
1180            } else {
1181                None
1182            };
1183
1184            let exclude_matcher = if !filter.exclude_patterns.is_empty() {
1185                let mut builder = GlobSetBuilder::new();
1186                for pattern in &filter.exclude_patterns {
1187                    // Normalize pattern to ensure LLM-generated patterns work correctly
1188                    let normalized = Self::normalize_glob_pattern(pattern);
1189                    if let Ok(glob) = Glob::new(&normalized) {
1190                        builder.add(glob);
1191                    }
1192                }
1193                builder.build().ok()
1194            } else {
1195                None
1196            };
1197
1198            results.retain(|r| {
1199                let included = include_matcher.as_ref().map_or(true, |m| m.is_match(&r.path));
1200                let excluded = exclude_matcher.as_ref().map_or(false, |m| m.is_match(&r.path));
1201                included && !excluded
1202            });
1203        }
1204
1205        if filter.exact && filter.symbols_mode {
1206            results.retain(|r| r.symbol.as_deref() == Some(text_pattern));
1207        }
1208
1209        // Expand symbol bodies if requested
1210        if filter.expand {
1211            let content_path = self.cache.path().join("content.bin");
1212            if let Ok(content_reader) = ContentReader::open(&content_path) {
1213                for result in &mut results {
1214                    if result.span.start_line < result.span.end_line {
1215                        if let Some(file_id) = Self::find_file_id(&content_reader, &result.path) {
1216                            if let Ok(content) = content_reader.get_file_content(file_id) {
1217                                let lines: Vec<&str> = content.lines().collect();
1218                                let start_idx = (result.span.start_line as usize).saturating_sub(1);
1219                                let end_idx = (result.span.end_line as usize).min(lines.len());
1220
1221                                if start_idx < end_idx {
1222                                    let full_body = lines[start_idx..end_idx].join("\n");
1223                                    result.preview = full_body;
1224                                }
1225                            }
1226                        }
1227                    }
1228                }
1229            }
1230        }
1231
1232        // Sort results deterministically
1233        results.sort_by(|a, b| {
1234            a.path.cmp(&b.path)
1235                .then_with(|| a.span.start_line.cmp(&b.span.start_line))
1236        });
1237
1238        // Apply offset (pagination)
1239        if let Some(offset) = filter.offset {
1240            if offset < results.len() {
1241                results = results.into_iter().skip(offset).collect();
1242            } else {
1243                results.clear();
1244            }
1245        }
1246
1247        // Apply limit
1248        if let Some(limit) = filter.limit {
1249            results.truncate(limit);
1250        }
1251
1252        log::info!("AST query returned {} results", results.len());
1253
1254        Ok(results)
1255    }
1256
1257    /// List all symbols of a specific kind
1258    pub fn list_by_kind(&self, kind: SymbolKind) -> Result<Vec<SearchResult>> {
1259        let filter = QueryFilter {
1260            kind: Some(kind),
1261            symbols_mode: true,
1262            ..Default::default()
1263        };
1264
1265        self.search("*", filter)
1266    }
1267
1268    /// Enrich text match candidates with symbol information by parsing files
1269    ///
1270    /// Takes a list of text match candidates and extracts symbol information at those locations.
1271    ///
1272    /// # Algorithm
1273    /// 1. Group candidates by file_id for efficient processing
1274    /// 2. Parse each file with tree-sitter to extract ALL symbols
1275    /// 3. Filter symbols based on matching strategy:
1276    ///    - If use_regex=true: Extract symbols whose line spans overlap with candidate locations
1277    ///    - If use_contains=true: Filter symbols by substring match on symbol name
1278    ///    - Default: Filter symbols by exact name match
1279    /// 4. Return filtered symbol results
1280    ///
1281    /// # Performance
1282    /// Only parses files that have text matches, so typically 10-100 files
1283    /// instead of the entire codebase (62K+ files).
1284    ///
1285    /// # Optimizations
1286    /// 1. Language filtering: Skips files with unsupported languages (no parsers)
1287    /// 2. Parallel processing: Uses Rayon to parse files concurrently across CPU cores
1288    fn enrich_with_symbols(&self, candidates: Vec<SearchResult>, pattern: &str, filter: &QueryFilter) -> Result<Vec<SearchResult>> {
1289        // Load content store for file reading
1290        let content_path = self.cache.path().join("content.bin");
1291        let content_reader = ContentReader::open(&content_path)
1292            .context("Failed to open content store")?;
1293
1294        // Load trigram index for file path lookups
1295        let trigrams_path = self.cache.path().join("trigrams.bin");
1296        let trigram_index = if trigrams_path.exists() {
1297            TrigramIndex::load(&trigrams_path)?
1298        } else {
1299            Self::rebuild_trigram_index(&content_reader)?
1300        };
1301
1302        // Open symbol cache for reading cached symbols
1303        let symbol_cache = crate::symbol_cache::SymbolCache::open(self.cache.path())
1304            .context("Failed to open symbol cache")?;
1305
1306        // Load file hashes for current branch for cache lookups
1307        let root = self.cache.workspace_root();
1308        let branch = crate::git::get_current_branch(&root)
1309            .unwrap_or_else(|_| "_default".to_string());
1310        let file_hashes = self.cache.load_hashes_for_branch(&branch)
1311            .context("Failed to load file hashes")?;
1312        log::debug!("Loaded {} file hashes for branch '{}' for symbol cache lookups", file_hashes.len(), branch);
1313
1314        // Group candidates by file, filtering out unsupported languages
1315        use std::collections::HashMap;
1316        let mut files_by_path: HashMap<String, Vec<SearchResult>> = HashMap::new();
1317        let mut skipped_unsupported = 0;
1318
1319        for candidate in candidates {
1320            // Skip files with unsupported languages (no parser available)
1321            if !candidate.lang.is_supported() {
1322                skipped_unsupported += 1;
1323                continue;
1324            }
1325
1326            files_by_path
1327                .entry(candidate.path.clone())
1328                .or_insert_with(Vec::new)
1329                .push(candidate);
1330        }
1331
1332        let total_files = files_by_path.len();
1333        log::debug!("Processing {} candidate files for symbol enrichment (skipped {} unsupported language files)",
1334                   total_files, skipped_unsupported);
1335
1336        // Warn if pattern is very broad (may take time to parse all files)
1337        if total_files > 1000 && !filter.suppress_output {
1338            output::warn(&format!(
1339                "Pattern '{}' matched {} files. This may take some time to parse. Consider using a more specific pattern or adding --lang/--file filters to narrow the search.",
1340                pattern,
1341                total_files
1342            ));
1343        }
1344
1345        // Convert to vec for parallel processing
1346        let mut files_to_process: Vec<String> = files_by_path.keys().cloned().collect();
1347
1348        // PHASE 2a: Line-based pre-filtering (skip files where ALL matches are in comments/strings)
1349        // This reduces tree-sitter parsing workload by 2-5x for most queries
1350        let mut files_to_skip: std::collections::HashSet<String> = std::collections::HashSet::new();
1351
1352        for file_path in &files_to_process {
1353            // Get the language for this file
1354            let ext = std::path::Path::new(file_path)
1355                .extension()
1356                .and_then(|e| e.to_str())
1357                .unwrap_or("");
1358            let lang = Language::from_extension(ext);
1359
1360            // Get line filter for this language (if available)
1361            if let Some(line_filter) = crate::line_filter::get_filter(lang) {
1362                // Find file_id for this path
1363                let file_id = match Self::find_file_id_by_path(&content_reader, &trigram_index, file_path) {
1364                    Some(id) => id,
1365                    None => continue,
1366                };
1367
1368                // Load file content
1369                let content = match content_reader.get_file_content(file_id) {
1370                    Ok(c) => c,
1371                    Err(_) => continue,
1372                };
1373
1374                // Check if ALL pattern occurrences are in comments/strings
1375                let mut all_in_non_code = true;
1376                for line in content.lines() {
1377                    // Find all occurrences of the pattern in this line
1378                    let mut search_start = 0;
1379                    while let Some(pos) = line[search_start..].find(pattern) {
1380                        let absolute_pos = search_start + pos;
1381
1382                        // Check if this occurrence is in code (not comment/string)
1383                        let in_comment = line_filter.is_in_comment(line, absolute_pos);
1384                        let in_string = line_filter.is_in_string(line, absolute_pos);
1385
1386                        if !in_comment && !in_string {
1387                            // Found at least one occurrence in actual code
1388                            all_in_non_code = false;
1389                            break;
1390                        }
1391
1392                        search_start = absolute_pos + pattern.len();
1393                    }
1394
1395                    if !all_in_non_code {
1396                        break;
1397                    }
1398                }
1399
1400                // If ALL occurrences are in comments/strings, skip this file
1401                if all_in_non_code {
1402                    // Double-check: make sure there was at least one occurrence
1403                    if content.contains(pattern) {
1404                        files_to_skip.insert(file_path.clone());
1405                        log::debug!("Pre-filter: Skipping {} (all matches in comments/strings)", file_path);
1406                    }
1407                }
1408            }
1409        }
1410
1411        // Filter out files we're skipping
1412        files_to_process.retain(|path| !files_to_skip.contains(path));
1413
1414        log::debug!("Pre-filter: Skipped {} files where all matches are in comments/strings (parsing {} files)",
1415                   files_to_skip.len(), files_to_process.len());
1416
1417        // Configure thread pool for parallel processing (use 80% of available cores, capped at 8)
1418        let num_threads = {
1419            let available_cores = std::thread::available_parallelism()
1420                .map(|n| n.get())
1421                .unwrap_or(4);
1422            // Use 80% of available cores (minimum 1, maximum 8) to avoid locking the system
1423            // Cap at 8 to prevent diminishing returns from cache contention on high-core systems
1424            ((available_cores as f64 * 0.8).ceil() as usize).max(1).min(8)
1425        };
1426
1427        log::debug!("Using {} threads for parallel symbol extraction (out of {} available cores)",
1428                   num_threads,
1429                   std::thread::available_parallelism().map(|n| n.get()).unwrap_or(4));
1430
1431        // Build a custom thread pool with limited threads
1432        let pool = rayon::ThreadPoolBuilder::new()
1433            .num_threads(num_threads)
1434            .build()
1435            .context("Failed to create thread pool for symbol extraction")?;
1436
1437        // OPTIMIZATION: Batch read all cached symbols in ONE database transaction
1438        // This is 10-30x faster than calling get() individually for each file
1439
1440        // Step 1: Collect file paths that have hashes
1441        let files_with_hashes: Vec<String> = files_to_process
1442            .iter()
1443            .filter(|path| file_hashes.contains_key(path.as_str()))
1444            .cloned()
1445            .collect();
1446
1447        // Step 2: Batch lookup file_ids for all paths
1448        let file_id_map = self.cache.batch_get_file_ids(&files_with_hashes)
1449            .context("Failed to batch lookup file IDs")?;
1450
1451        // Step 3: Build (file_id, hash, path) tuples for batch_get_with_kind
1452        let file_lookup_tuples: Vec<(i64, String, String)> = files_with_hashes
1453            .iter()
1454            .filter_map(|path| {
1455                let file_id = file_id_map.get(path)?;
1456                let hash = file_hashes.get(path.as_str())?;
1457                Some((*file_id, hash.clone(), path.clone()))
1458            })
1459            .collect();
1460
1461        // Step 4: Batch read symbols with kind filtering (uses junction table + integer joins)
1462        let batch_results = symbol_cache.batch_get_with_kind(&file_lookup_tuples, filter.kind.clone())
1463            .context("Failed to batch read symbol cache")?;
1464
1465        // Step 5: Separate files into cached vs need-to-parse
1466        let mut cached_symbols: HashMap<String, Vec<SearchResult>> = HashMap::new();
1467        let mut files_needing_parse: Vec<String> = Vec::new();
1468
1469        // Build path lookup from file_id
1470        let id_to_path: HashMap<i64, String> = file_id_map
1471            .iter()
1472            .map(|(path, id)| (*id, path.clone()))
1473            .collect();
1474
1475        // Process cached results
1476        for (file_id, symbols) in batch_results {
1477            if let Some(file_path) = id_to_path.get(&file_id) {
1478                cached_symbols.insert(file_path.clone(), symbols);
1479            }
1480        }
1481
1482        // Files with hashes but not in cache results need parsing
1483        for path in &files_with_hashes {
1484            if file_id_map.contains_key(path) && !cached_symbols.contains_key(path) {
1485                files_needing_parse.push(path.clone());
1486            }
1487        }
1488
1489        // Add files without hashes to parse list
1490        for file_path in &files_to_process {
1491            if !file_hashes.contains_key(file_path.as_str()) {
1492                files_needing_parse.push(file_path.clone());
1493            }
1494        }
1495
1496        log::debug!(
1497            "Symbol cache: {} hits, {} need parsing",
1498            cached_symbols.len(),
1499            files_needing_parse.len()
1500        );
1501
1502        // Parse files in parallel using custom thread pool (only cache misses)
1503        use rayon::prelude::*;
1504
1505        let parsed_symbols: Vec<SearchResult> = pool.install(|| {
1506            files_needing_parse
1507                .par_iter()
1508                .flat_map(|file_path| {
1509                // Find file_id for this path
1510                let file_id = match Self::find_file_id_by_path(&content_reader, &trigram_index, file_path) {
1511                    Some(id) => id,
1512                    None => {
1513                        log::warn!("Could not find file_id for path: {}", file_path);
1514                        return Vec::new();
1515                    }
1516                };
1517
1518                let content = match content_reader.get_file_content(file_id) {
1519                    Ok(c) => c,
1520                    Err(e) => {
1521                        log::warn!("Failed to read file {}: {}", file_path, e);
1522                        return Vec::new();
1523                    }
1524                };
1525
1526                // Detect language
1527                let ext = std::path::Path::new(file_path)
1528                    .extension()
1529                    .and_then(|e| e.to_str())
1530                    .unwrap_or("");
1531                let lang = Language::from_extension(ext);
1532
1533                // Parse file to extract symbols
1534                let symbols = match ParserFactory::parse(file_path, content, lang) {
1535                    Ok(symbols) => {
1536                        log::debug!("Parsed {} symbols from {}", symbols.len(), file_path);
1537                        symbols
1538                    }
1539                    Err(e) => {
1540                        log::debug!("Failed to parse {}: {}", file_path, e);
1541                        Vec::new()
1542                    }
1543                };
1544
1545                // Cache the parsed symbols (ignore errors - caching is best-effort)
1546                if let Some(file_hash) = file_hashes.get(file_path.as_str()) {
1547                    if let Err(e) = symbol_cache.set(file_path, file_hash, &symbols) {
1548                        log::debug!("Failed to cache symbols for {}: {}", file_path, e);
1549                    }
1550                }
1551
1552                symbols
1553            })
1554            .collect()
1555        });
1556
1557        // Combine cached and parsed symbols
1558        let mut all_symbols: Vec<SearchResult> = Vec::new();
1559
1560        // Add all cached symbols
1561        for symbols in cached_symbols.values() {
1562            all_symbols.extend_from_slice(symbols);
1563        }
1564
1565        // Add all parsed symbols
1566        all_symbols.extend(parsed_symbols);
1567
1568        // KEYWORD DETECTION: Check if pattern is a language keyword (e.g., "class", "function")
1569        // If it matches a keyword AND symbols_mode is true, interpret as "list all symbols of that type"
1570        // rather than looking for a symbol literally named "class" or "function"
1571        //
1572        // IMPORTANT: Only check keywords for languages that will pass Phase 3 filtering.
1573        // If a language filter is specified, only check that language's keywords.
1574        // Otherwise, check all languages present in the symbol results.
1575        let is_keyword_query = {
1576            // Determine which language to check keywords for
1577            let lang_to_check = if let Some(lang) = filter.language {
1578                // Language filter specified - check that language only
1579                // This ensures keyword detection aligns with Phase 3 language filtering
1580                vec![lang]
1581            } else {
1582                // No language filter - check all languages that appear in the actual symbols
1583                // (not candidates, but the parsed symbols that made it through)
1584                // This handles mixed-language codebases correctly
1585                let mut langs: Vec<Language> = all_symbols.iter()
1586                    .map(|s| s.lang)
1587                    .collect::<Vec<_>>();
1588                langs.sort_by(|a, b| format!("{:?}", a).cmp(&format!("{:?}", b))); // Deterministic ordering
1589                langs.dedup(); // Remove duplicates after sorting
1590                langs
1591            };
1592
1593            // Check if pattern matches a keyword in any of the relevant languages
1594            lang_to_check.iter().any(|lang| {
1595                ParserFactory::get_keywords(*lang).contains(&pattern)
1596            })
1597        };
1598
1599        // If pattern is a keyword (like "class" or "function"), skip name-based filtering
1600        // and return all symbols (kind filtering happens in Phase 3)
1601        let filtered: Vec<SearchResult> = if is_keyword_query {
1602            log::info!("Pattern '{}' is a language keyword - listing all symbols (kind filtering will be applied in Phase 3)", pattern);
1603            all_symbols
1604        } else if filter.use_regex {
1605            // For regex queries, candidates already matched content via regex in Phase 1.
1606            // Extract symbols whose line spans overlap with the candidate locations.
1607            // This ensures symbols are found at the locations where the regex matched.
1608
1609            // Build a map of (file_path, line_no) from candidates
1610            use std::collections::{HashMap, HashSet};
1611            let mut candidate_lines: HashMap<String, HashSet<usize>> = HashMap::new();
1612            for candidate in &files_by_path {
1613                for cand in candidate.1 {
1614                    candidate_lines
1615                        .entry(candidate.0.clone())
1616                        .or_insert_with(HashSet::new)
1617                        .insert(cand.span.start_line);
1618                }
1619            }
1620
1621            // Filter symbols whose spans overlap with candidate lines
1622            all_symbols
1623                .into_iter()
1624                .filter(|sym| {
1625                    if let Some(lines) = candidate_lines.get(&sym.path) {
1626                        // Check if symbol's line span overlaps with any candidate line
1627                        for line in sym.span.start_line..=sym.span.end_line {
1628                            if lines.contains(&line) {
1629                                return true;
1630                            }
1631                        }
1632                    }
1633                    false
1634                })
1635                .collect()
1636        } else if filter.use_contains {
1637            // Substring match (opt-in with --contains)
1638            all_symbols
1639                .into_iter()
1640                .filter(|sym| sym.symbol.as_deref().map_or(false, |s| s.contains(pattern)))
1641                .collect()
1642        } else {
1643            // Exact match (default)
1644            all_symbols
1645                .into_iter()
1646                .filter(|sym| sym.symbol.as_deref().map_or(false, |s| s == pattern))
1647                .collect()
1648        };
1649
1650        log::info!("Symbol enrichment found {} matches for pattern '{}'", filtered.len(), pattern);
1651
1652        Ok(filtered)
1653    }
1654
1655    /// Enrich text match candidates with AST pattern matching
1656    ///
1657    /// Takes a list of text match candidates and executes a Tree-sitter AST query
1658    /// on the candidate files, returning only matches that satisfy the AST pattern.
1659    ///
1660    /// # Algorithm
1661    /// 1. Extract unique file paths from candidates
1662    /// 2. Load file contents for each candidate file
1663    /// 3. Execute AST query pattern using Tree-sitter
1664    /// 4. Return AST matches
1665    ///
1666    /// # Performance
1667    /// Only parses files that have text matches, so typically 10-100 files
1668    /// instead of the entire codebase (62K+ files).
1669    ///
1670    /// # Requirements
1671    /// - Language must be specified (AST queries are language-specific)
1672    /// - AST pattern must be valid S-expression syntax
1673    fn enrich_with_ast(&self, candidates: Vec<SearchResult>, ast_pattern: &str, language: Option<Language>) -> Result<Vec<SearchResult>> {
1674        // Require language for AST queries
1675        let lang = language.ok_or_else(|| anyhow::anyhow!(
1676            "Language must be specified for AST pattern matching. Use --lang to specify the language."
1677        ))?;
1678
1679        // Load content store for file reading
1680        let content_path = self.cache.path().join("content.bin");
1681        let content_reader = ContentReader::open(&content_path)
1682            .context("Failed to open content store")?;
1683
1684        // Load trigram index for file path lookups
1685        let trigrams_path = self.cache.path().join("trigrams.bin");
1686        let trigram_index = if trigrams_path.exists() {
1687            TrigramIndex::load(&trigrams_path)?
1688        } else {
1689            Self::rebuild_trigram_index(&content_reader)?
1690        };
1691
1692        // Collect unique file paths from candidates and load their contents
1693        use std::collections::HashMap;
1694        let mut file_contents: HashMap<String, String> = HashMap::new();
1695
1696        for candidate in &candidates {
1697            if file_contents.contains_key(&candidate.path) {
1698                continue;
1699            }
1700
1701            // Find file_id for this path
1702            let file_id = match Self::find_file_id_by_path(&content_reader, &trigram_index, &candidate.path) {
1703                Some(id) => id,
1704                None => {
1705                    log::warn!("Could not find file_id for path: {}", candidate.path);
1706                    continue;
1707                }
1708            };
1709
1710            // Load file content
1711            let content = match content_reader.get_file_content(file_id) {
1712                Ok(c) => c,
1713                Err(e) => {
1714                    log::warn!("Failed to read file {}: {}", candidate.path, e);
1715                    continue;
1716                }
1717            };
1718
1719            file_contents.insert(candidate.path.clone(), content.to_string());
1720        }
1721
1722        log::debug!("Executing AST query on {} candidate files with language {:?}", file_contents.len(), lang);
1723
1724        // Execute AST query using the ast_query module
1725        let results = crate::ast_query::execute_ast_query(candidates, ast_pattern, lang, &file_contents)?;
1726
1727        log::info!("AST query found {} matches for pattern '{}'", results.len(), ast_pattern);
1728
1729        Ok(results)
1730    }
1731
1732    /// Helper to find file_id by path string
1733    fn find_file_id_by_path(
1734        content_reader: &ContentReader,
1735        trigram_index: &TrigramIndex,
1736        target_path: &str,
1737    ) -> Option<u32> {
1738        // Try trigram index first (faster)
1739        for file_id in 0..trigram_index.file_count() {
1740            if let Some(path) = trigram_index.get_file(file_id as u32) {
1741                if path.to_string_lossy() == target_path {
1742                    return Some(file_id as u32);
1743                }
1744            }
1745        }
1746
1747        // Fallback to content reader
1748        for file_id in 0..content_reader.file_count() {
1749            if let Some(path) = content_reader.get_file_path(file_id as u32) {
1750                if path.to_string_lossy() == target_path {
1751                    return Some(file_id as u32);
1752                }
1753            }
1754        }
1755
1756        None
1757    }
1758
1759    /// Map keyword patterns to SymbolKind for auto-inference
1760    ///
1761    /// When users search for keywords like "class" or "function" with --symbols,
1762    /// automatically infer the kind filter to return only symbols of that type.
1763    ///
1764    /// This makes keyword queries more intuitive: searching for "class" returns
1765    /// only classes, not all symbols.
1766    fn keyword_to_kind(keyword: &str) -> Option<SymbolKind> {
1767        match keyword {
1768            // Classes and types
1769            "class" => Some(SymbolKind::Class),
1770            "struct" => Some(SymbolKind::Struct),
1771            "enum" => Some(SymbolKind::Enum),
1772            "interface" => Some(SymbolKind::Interface),
1773            "trait" => Some(SymbolKind::Trait),
1774            "type" => Some(SymbolKind::Type),
1775            "record" => Some(SymbolKind::Struct),  // C# record types
1776
1777            // Functions and methods
1778            "function" | "fn" | "def" | "func" => Some(SymbolKind::Function),
1779
1780            // Variables and constants
1781            "const" | "static" => Some(SymbolKind::Constant),
1782            "var" | "let" => Some(SymbolKind::Variable),
1783
1784            // Modules and namespaces
1785            "mod" | "module" | "namespace" => Some(SymbolKind::Module),
1786
1787            // Other constructs
1788            "impl" => None,  // impl blocks don't have a direct SymbolKind
1789            "async" => None, // async is a modifier, not a symbol type
1790
1791            // Default: no mapping (return all symbols)
1792            _ => None,
1793        }
1794    }
1795
1796    /// Get all files matching the language filter (for keyword queries)
1797    ///
1798    /// This method bypasses trigram search and returns ALL files of the specified language.
1799    /// Used for keyword queries like "list all classes" where we need complete coverage,
1800    /// not just the first 100 candidates from a trigram search.
1801    ///
1802    /// Similar to `search_ast_all_files()` but works for symbol queries instead of AST queries.
1803    fn get_all_language_files(&self, filter: &QueryFilter) -> Result<Vec<SearchResult>> {
1804        // Language filter is optional - if not specified, scan all files
1805        // If specified, only scan files of that language
1806
1807        // Load content store
1808        let content_path = self.cache.path().join("content.bin");
1809        let content_reader = ContentReader::open(&content_path)
1810            .context("Failed to open content store")?;
1811
1812        // Build glob matchers if specified (for filtering)
1813        use globset::{Glob, GlobSetBuilder};
1814
1815        let include_matcher = if !filter.glob_patterns.is_empty() {
1816            let mut builder = GlobSetBuilder::new();
1817            for pattern in &filter.glob_patterns {
1818                let normalized = Self::normalize_glob_pattern(pattern);
1819                if let Ok(glob) = Glob::new(&normalized) {
1820                    builder.add(glob);
1821                }
1822            }
1823            builder.build().ok()
1824        } else {
1825            None
1826        };
1827
1828        let exclude_matcher = if !filter.exclude_patterns.is_empty() {
1829            let mut builder = GlobSetBuilder::new();
1830            for pattern in &filter.exclude_patterns {
1831                let normalized = Self::normalize_glob_pattern(pattern);
1832                if let Ok(glob) = Glob::new(&normalized) {
1833                    builder.add(glob);
1834                }
1835            }
1836            builder.build().ok()
1837        } else {
1838            None
1839        };
1840
1841        // Scan all files and filter by language + glob patterns
1842        let mut candidates: Vec<SearchResult> = Vec::new();
1843
1844        for file_id in 0..content_reader.file_count() {
1845            let file_path = match content_reader.get_file_path(file_id as u32) {
1846                Some(p) => p,
1847                None => continue,
1848            };
1849
1850            // Detect language from file extension
1851            let ext = file_path.extension()
1852                .and_then(|e| e.to_str())
1853                .unwrap_or("");
1854            let detected_lang = Language::from_extension(ext);
1855
1856            // Filter by language (if specified)
1857            if let Some(lang) = filter.language {
1858                if detected_lang != lang {
1859                    continue;
1860                }
1861            }
1862
1863            let file_path_str = file_path.to_string_lossy().to_string();
1864
1865            // Apply glob/exclude filters
1866            let included = include_matcher.as_ref().map_or(true, |m| m.is_match(&file_path_str));
1867            let excluded = exclude_matcher.as_ref().map_or(false, |m| m.is_match(&file_path_str));
1868
1869            if !included || excluded {
1870                continue;
1871            }
1872
1873            // Apply file path filter if specified
1874            if let Some(ref file_pattern) = filter.file_pattern {
1875                if !file_path_str.contains(file_pattern) {
1876                    continue;
1877                }
1878            }
1879
1880            // Create a dummy candidate for this file
1881            // Phase 2 (symbol enrichment) will parse it and extract actual symbols
1882            candidates.push(SearchResult {
1883                path: file_path_str,
1884                lang: detected_lang,
1885                span: Span { start_line: 1, end_line: 1 },
1886                symbol: None,
1887                kind: SymbolKind::Unknown("keyword_query".to_string()),
1888                preview: String::new(),
1889                dependencies: None,
1890            });
1891        }
1892
1893        if let Some(lang) = filter.language {
1894            log::info!("Keyword query will scan {} {:?} files for symbol extraction", candidates.len(), lang);
1895        } else {
1896            log::info!("Keyword query will scan {} files (all languages) for symbol extraction", candidates.len());
1897        }
1898
1899        Ok(candidates)
1900    }
1901
1902    /// Get candidate results using trigram-based full-text search
1903    fn get_trigram_candidates(&self, pattern: &str, filter: &QueryFilter) -> Result<Vec<SearchResult>> {
1904        // Load content store
1905        let content_path = self.cache.path().join("content.bin");
1906        let content_reader = ContentReader::open(&content_path)
1907            .context("Failed to open content store")?;
1908
1909        // Load trigram index from disk (or rebuild if missing)
1910        let trigrams_path = self.cache.path().join("trigrams.bin");
1911        let trigram_index = if trigrams_path.exists() {
1912            match TrigramIndex::load(&trigrams_path) {
1913                Ok(index) => {
1914                    log::debug!("Loaded trigram index from disk: {} trigrams, {} files",
1915                               index.trigram_count(), index.file_count());
1916                    index
1917                }
1918                Err(e) => {
1919                    log::warn!("Failed to load trigram index from disk: {}", e);
1920                    log::warn!("Rebuilding trigram index from content store...");
1921                    Self::rebuild_trigram_index(&content_reader)?
1922                }
1923            }
1924        } else {
1925            log::debug!("trigrams.bin not found, rebuilding from content store");
1926            Self::rebuild_trigram_index(&content_reader)?
1927        };
1928
1929        // Search using trigrams
1930        let candidates = trigram_index.search(pattern);
1931        log::debug!("Found {} candidate locations from trigram search", candidates.len());
1932
1933        // Clone pattern to owned String for thread safety
1934        let pattern_owned = pattern.to_string();
1935
1936        // Compile regex once if in regex mode (before parallel processing for efficiency)
1937        let compiled_regex = if filter.use_regex {
1938            match Regex::new(&pattern_owned) {
1939                Ok(re) => Some(re),
1940                Err(e) => {
1941                    log::error!("Invalid regex pattern '{}': {}", pattern_owned, e);
1942                    anyhow::bail!("Invalid regex pattern '{}': {}", pattern_owned, e);
1943                }
1944            }
1945        } else {
1946            None
1947        };
1948
1949        // Group candidates by file for efficient processing
1950        use std::collections::HashMap;
1951        let mut candidates_by_file: HashMap<u32, Vec<crate::trigram::FileLocation>> = HashMap::new();
1952        for loc in candidates {
1953            candidates_by_file
1954                .entry(loc.file_id)
1955                .or_insert_with(Vec::new)
1956                .push(loc);
1957        }
1958
1959        log::debug!("Scanning {} files with trigram matches", candidates_by_file.len());
1960
1961        // Process files in parallel using rayon
1962        use rayon::prelude::*;
1963
1964        let results: Vec<SearchResult> = candidates_by_file
1965            .par_iter()
1966            .flat_map(|(file_id, locations)| {
1967                // Get file metadata
1968                let file_path = match trigram_index.get_file(*file_id) {
1969                    Some(p) => p,
1970                    None => return Vec::new(),
1971                };
1972
1973                let content = match content_reader.get_file_content(*file_id) {
1974                    Ok(c) => c,
1975                    Err(_) => return Vec::new(),
1976                };
1977
1978                let file_path_str = file_path.to_string_lossy().to_string();
1979
1980                // Detect language once per file
1981                let ext = file_path.extension()
1982                    .and_then(|e| e.to_str())
1983                    .unwrap_or("");
1984                let lang = Language::from_extension(ext);
1985
1986                // Split content into lines once
1987                let lines: Vec<&str> = content.lines().collect();
1988
1989                // Use a HashSet to deduplicate results by line number
1990                let mut seen_lines: std::collections::HashSet<usize> = std::collections::HashSet::new();
1991                let mut file_results = Vec::new();
1992
1993                // Only check the specific lines indicated by trigram posting lists
1994                for loc in locations {
1995                    let line_no = loc.line_no as usize;
1996
1997                    // Skip if we've already processed this line
1998                    if seen_lines.contains(&line_no) {
1999                        continue;
2000                    }
2001
2002                    // Bounds check
2003                    if line_no == 0 || line_no > lines.len() {
2004                        log::debug!("Line {} out of bounds (file has {} lines)", line_no, lines.len());
2005                        continue;
2006                    }
2007
2008                    let line = lines[line_no - 1];
2009
2010                    // Apply matching strategy based on filter mode:
2011                    // - Default: Word-boundary matching (restrictive - finds whole identifiers)
2012                    // - --contains: Substring matching (expansive - finds pattern anywhere)
2013                    // - --regex: Actual regex matching (controlled by pattern itself)
2014                    let line_matches = if filter.use_regex {
2015                        // Regex matching - use pre-compiled regex for efficiency
2016                        // The regex was compiled once outside the parallel loop
2017                        compiled_regex.as_ref()
2018                            .map(|re| re.is_match(line))
2019                            .unwrap_or(false)
2020                    } else if filter.use_contains {
2021                        // Substring matching (expansive)
2022                        line.contains(&pattern_owned)
2023                    } else {
2024                        // Word-boundary matching (restrictive, default)
2025                        Self::has_word_boundary_match(line, &pattern_owned)
2026                    };
2027
2028                    if !line_matches {
2029                        continue;
2030                    }
2031
2032                    seen_lines.insert(line_no);
2033
2034                    // Create a text match result (no symbol lookup for performance)
2035                    file_results.push(SearchResult {
2036                        path: file_path_str.clone(),
2037                        lang: lang.clone(),
2038                        kind: SymbolKind::Unknown("text_match".to_string()),
2039                        symbol: None,  // No symbol name for text matches (avoid duplication)
2040                        span: Span {
2041                            start_line: line_no,
2042                            end_line: line_no,
2043                        },
2044                        preview: line.to_string(),
2045                        dependencies: None,
2046                    });
2047                }
2048
2049                file_results
2050            })
2051            .collect();
2052
2053        Ok(results)
2054    }
2055
2056    /// Get candidate results using regex patterns with trigram optimization
2057    ///
2058    /// # Algorithm
2059    ///
2060    /// 1. Extract literal sequences from the regex pattern (≥3 chars)
2061    /// 2. If literals found: search for files containing ANY of the literals (UNION)
2062    /// 3. If no literals: fall back to full content scan
2063    /// 4. Compile regex and verify matches in candidate files
2064    /// 5. Return matching results with context
2065    ///
2066    /// # File Selection Strategy
2067    ///
2068    /// Uses UNION of files containing any literal (conservative approach):
2069    /// - For alternation patterns `(a|b)`: Correctly searches files with a OR b
2070    /// - For sequential patterns `a.*b`: Searches files with a OR b (may include extra files)
2071    /// - Trade-off: Ensures correctness at the cost of scanning 2-3x more files for sequential patterns
2072    /// - Performance impact is minimal due to memory-mapped I/O (<5ms overhead typically)
2073    ///
2074    /// # Performance
2075    ///
2076    /// - Best case (pattern with literals): <20ms (trigram optimization)
2077    /// - Typical case (alternation/sequential): 5-15ms on small codebases (<100 files)
2078    /// - Worst case (no literals like `.*`): ~100ms (full scan)
2079    fn get_regex_candidates(&self, pattern: &str, timeout: Option<&std::time::Duration>, start_time: &std::time::Instant, suppress_output: bool) -> Result<Vec<SearchResult>> {
2080        // Step 1: Compile the regex
2081        let regex = Regex::new(pattern)
2082            .with_context(|| format!("Invalid regex pattern: {}", pattern))?;
2083
2084        // Check timeout before expensive operations
2085        if let Some(timeout_duration) = timeout {
2086            if start_time.elapsed() > *timeout_duration {
2087                anyhow::bail!(
2088                    "Query timeout exceeded ({} seconds) during regex compilation",
2089                    timeout_duration.as_secs()
2090                );
2091            }
2092        }
2093
2094        // Step 2: Extract trigrams from regex
2095        let trigrams = extract_trigrams_from_regex(pattern);
2096
2097        // Load content store
2098        let content_path = self.cache.path().join("content.bin");
2099        let content_reader = ContentReader::open(&content_path)
2100            .context("Failed to open content store")?;
2101
2102        let mut results = Vec::new();
2103
2104        if trigrams.is_empty() {
2105            // No trigrams - fall back to full scan
2106            if !suppress_output {
2107                output::warn(&format!(
2108                    "Regex pattern '{}' has no literals (≥3 chars), falling back to full content scan. This may be slow on large codebases. Consider using patterns with literal text.",
2109                    pattern
2110                ));
2111            }
2112
2113            // Scan all files
2114            for file_id in 0..content_reader.file_count() {
2115                let file_path = content_reader.get_file_path(file_id as u32)
2116                    .context("Invalid file_id")?;
2117                let content = content_reader.get_file_content(file_id as u32)?;
2118
2119                self.find_regex_matches_in_file(
2120                    &regex,
2121                    file_path,
2122                    content,
2123                    &mut results,
2124                )?;
2125            }
2126        } else {
2127            // Use trigrams to narrow down candidates
2128            log::debug!("Using {} trigrams to narrow regex search candidates", trigrams.len());
2129
2130            // Load trigram index
2131            let trigrams_path = self.cache.path().join("trigrams.bin");
2132            let trigram_index = if trigrams_path.exists() {
2133                TrigramIndex::load(&trigrams_path)?
2134            } else {
2135                Self::rebuild_trigram_index(&content_reader)?
2136            };
2137
2138            // Extract the literal sequences from the regex pattern
2139            use crate::regex_trigrams::extract_literal_sequences;
2140            let literals = extract_literal_sequences(pattern);
2141
2142            if literals.is_empty() {
2143                log::warn!("Regex extraction found trigrams but no literal sequences - this shouldn't happen");
2144                // Fall back to full scan
2145                for file_id in 0..content_reader.file_count() {
2146                    let file_path = content_reader.get_file_path(file_id as u32)
2147                        .context("Invalid file_id")?;
2148                    let content = content_reader.get_file_content(file_id as u32)?;
2149                    self.find_regex_matches_in_file(&regex, file_path, content, &mut results)?;
2150                }
2151            } else {
2152                // Search for each literal sequence and union the results
2153                // This ensures we find matches for ANY literal (important for alternation patterns like (a|b))
2154                // Trade-off: May scan more files than necessary for sequential patterns (a.*b),
2155                // but ensures correctness for all regex patterns
2156                use std::collections::HashSet;
2157                let mut candidate_files: HashSet<u32> = HashSet::new();
2158
2159                for literal in &literals {
2160                    // Search for this literal in the trigram index
2161                    let candidates = trigram_index.search(literal);
2162                    let file_ids: HashSet<u32> = candidates.iter().map(|loc| loc.file_id).collect();
2163
2164                    log::debug!("Literal '{}' found in {} files", literal, file_ids.len());
2165
2166                    // Union with existing candidate files (not intersection)
2167                    // This ensures we search files containing ANY of the literals
2168                    candidate_files.extend(file_ids);
2169                }
2170
2171                let final_candidates = candidate_files;
2172                log::debug!("After union: searching {} files that contain any literal", final_candidates.len());
2173
2174                // Verify regex matches in candidate files only
2175                for &file_id in &final_candidates {
2176                    let file_path = trigram_index.get_file(file_id)
2177                        .context("Invalid file_id from trigram search")?;
2178                    let content = content_reader.get_file_content(file_id)?;
2179
2180                    self.find_regex_matches_in_file(
2181                        &regex,
2182                        file_path,
2183                        content,
2184                        &mut results,
2185                    )?;
2186                }
2187            }
2188        }
2189
2190        log::info!("Regex search found {} matches for pattern '{}'", results.len(), pattern);
2191        Ok(results)
2192    }
2193
2194    /// Find all regex matches in a single file
2195    fn find_regex_matches_in_file(
2196        &self,
2197        regex: &Regex,
2198        file_path: &std::path::Path,
2199        content: &str,
2200        results: &mut Vec<SearchResult>,
2201    ) -> Result<()> {
2202        let file_path_str = file_path.to_string_lossy().to_string();
2203
2204        // Detect language from file extension
2205        let ext = file_path.extension()
2206            .and_then(|e| e.to_str())
2207            .unwrap_or("");
2208        let lang = Language::from_extension(ext);
2209
2210        // Find all regex matches line by line
2211        for (line_idx, line) in content.lines().enumerate() {
2212            if regex.is_match(line) {
2213                let line_no = line_idx + 1;
2214
2215                // Create text match result
2216                // Note: We don't extract symbol names from regex matches because:
2217                // 1. Regex might match partial identifiers (e.g., "UserController" in "ListUserController")
2218                // 2. Regex might match across language-specific delimiters (namespaces, scopes, etc.)
2219                // 3. Accurate symbol extraction requires tree-sitter parsing (expensive)
2220                // The user can see the full context in the 'preview' field
2221                results.push(SearchResult {
2222                    path: file_path_str.clone(),
2223                    lang: lang.clone(),
2224                    kind: SymbolKind::Unknown("regex_match".to_string()),
2225                    symbol: None,  // No symbol name for regex matches
2226                    span: Span {
2227                        start_line: line_no,
2228                        end_line: line_no,
2229                    },
2230                    preview: line.to_string(),
2231                    dependencies: None,
2232                });
2233            }
2234        }
2235
2236        Ok(())
2237    }
2238
2239    /// Helper function to find file_id in ContentReader by matching path
2240    fn find_file_id(content_reader: &ContentReader, target_path: &str) -> Option<u32> {
2241        for file_id in 0..content_reader.file_count() {
2242            if let Some(path) = content_reader.get_file_path(file_id as u32) {
2243                if path.to_string_lossy() == target_path {
2244                    return Some(file_id as u32);
2245                }
2246            }
2247        }
2248        None
2249    }
2250
2251    /// Rebuild trigram index from content store (fallback when trigrams.bin is missing)
2252    fn rebuild_trigram_index(content_reader: &ContentReader) -> Result<TrigramIndex> {
2253        log::debug!("Rebuilding trigram index from {} files", content_reader.file_count());
2254        let mut trigram_index = TrigramIndex::new();
2255
2256        for file_id in 0..content_reader.file_count() {
2257            let file_path = content_reader.get_file_path(file_id as u32)
2258                .context("Invalid file_id")?
2259                .to_path_buf();
2260            let content = content_reader.get_file_content(file_id as u32)?;
2261
2262            let idx = trigram_index.add_file(file_path);
2263            trigram_index.index_file(idx, content);
2264        }
2265
2266        trigram_index.finalize();
2267        log::debug!("Trigram index rebuilt with {} trigrams", trigram_index.trigram_count());
2268
2269        Ok(trigram_index)
2270    }
2271
2272    /// Normalize glob patterns for consistent matching
2273    ///
2274    /// Ensures glob patterns work correctly by auto-prepending "./" to relative paths
2275    /// that don't already start with ".", "/", or "*". This fixes LLM-generated patterns
2276    /// that omit the explicit relative path prefix.
2277    ///
2278    /// # Examples
2279    /// - "services/**/*.php" → "./services/**/*.php"
2280    /// - "./services/**/*.php" → "./services/**/*.php" (unchanged)
2281    /// - "**/services/**/*.php" → "**/services/**/*.php" (unchanged)
2282    /// - "/absolute/path/**" → "/absolute/path/**" (unchanged)
2283    fn normalize_glob_pattern(pattern: &str) -> String {
2284        if pattern.starts_with('.') || pattern.starts_with('/') || pattern.starts_with('*') {
2285            // Already has a prefix that works - don't modify
2286            pattern.to_string()
2287        } else {
2288            // Relative path without explicit prefix - add "./"
2289            format!("./{}", pattern)
2290        }
2291    }
2292
2293    /// Check if pattern appears at word boundaries in a line
2294    ///
2295    /// Word boundary is defined as:
2296    /// - Start/end of string
2297    /// - Transition between word characters (\w) and non-word characters (\W)
2298    ///
2299    /// This is used for default (restrictive) matching to find complete identifiers
2300    /// rather than substrings. For example:
2301    /// - "Error" matches "Error" but not "NetworkError"
2302    /// - "parse" matches "parse()" but not "parseUser()"
2303    fn has_word_boundary_match(line: &str, pattern: &str) -> bool {
2304        // Build regex: \bpattern\b
2305        let escaped_pattern = regex::escape(pattern);
2306        let pattern_with_boundaries = format!(r"\b{}\b", escaped_pattern);
2307
2308        if let Ok(re) = Regex::new(&pattern_with_boundaries) {
2309            re.is_match(line)
2310        } else {
2311            // If regex fails (shouldn't happen with escaped pattern), fall back to substring
2312            log::debug!("Word boundary regex failed for pattern '{}', falling back to substring", pattern);
2313            line.contains(pattern)
2314        }
2315    }
2316
2317    /// Get index status for programmatic use (doesn't print warnings)
2318    ///
2319    /// Returns (status, can_trust_results, warning) tuple for JSON output.
2320    /// This is optimized for AI agents to detect staleness and auto-reindex.
2321    fn get_index_status(&self) -> Result<(IndexStatus, bool, Option<IndexWarning>)> {
2322        let root = std::env::current_dir()?;
2323
2324        // Check git state if in a git repo
2325        if crate::git::is_git_repo(&root) {
2326            if let Ok(current_branch) = crate::git::get_current_branch(&root) {
2327                // Check if we're on a different branch than what was indexed
2328                if !self.cache.branch_exists(&current_branch).unwrap_or(false) {
2329                    let warning = IndexWarning {
2330                        reason: format!("Branch '{}' has not been indexed", current_branch),
2331                        action_required: "rfx index".to_string(),
2332                        details: Some(IndexWarningDetails {
2333                            current_branch: Some(current_branch),
2334                            indexed_branch: None,
2335                            current_commit: None,
2336                            indexed_commit: None,
2337                        }),
2338                    };
2339                    return Ok((IndexStatus::Stale, false, Some(warning)));
2340                }
2341
2342                // Branch exists - check if commit changed
2343                if let (Ok(current_commit), Ok(branch_info)) =
2344                    (crate::git::get_current_commit(&root), self.cache.get_branch_info(&current_branch)) {
2345
2346                    if branch_info.commit_sha != current_commit {
2347                        let warning = IndexWarning {
2348                            reason: format!(
2349                                "Commit changed from {} to {}",
2350                                &branch_info.commit_sha[..7],
2351                                &current_commit[..7]
2352                            ),
2353                            action_required: "rfx index".to_string(),
2354                            details: Some(IndexWarningDetails {
2355                                current_branch: Some(current_branch.clone()),
2356                                indexed_branch: Some(current_branch.clone()),
2357                                current_commit: Some(current_commit.clone()),
2358                                indexed_commit: Some(branch_info.commit_sha.clone()),
2359                            }),
2360                        };
2361                        return Ok((IndexStatus::Stale, false, Some(warning)));
2362                    }
2363
2364                    // If commits match, do a quick file freshness check
2365                    if let Ok(branch_files) = self.cache.get_branch_files(&current_branch) {
2366                        let mut checked = 0;
2367                        let mut changed = 0;
2368                        const SAMPLE_SIZE: usize = 10;
2369
2370                        for (path, _indexed_hash) in branch_files.iter().take(SAMPLE_SIZE) {
2371                            checked += 1;
2372                            let file_path = std::path::Path::new(path);
2373
2374                            if let Ok(metadata) = std::fs::metadata(file_path) {
2375                                if let Ok(modified) = metadata.modified() {
2376                                    let indexed_time = branch_info.last_indexed;
2377                                    let file_time = modified.duration_since(std::time::UNIX_EPOCH)
2378                                        .unwrap_or_default()
2379                                        .as_secs() as i64;
2380
2381                                    if file_time > indexed_time {
2382                                        // File modified after indexing - likely stale
2383                                        // Note: We skip hash verification for performance (mtime check is sufficient)
2384                                        changed += 1;
2385                                    }
2386                                }
2387                            }
2388                        }
2389
2390                        if changed > 0 {
2391                            let warning = IndexWarning {
2392                                reason: format!("{} of {} sampled files modified", changed, checked),
2393                                action_required: "rfx index".to_string(),
2394                                details: Some(IndexWarningDetails {
2395                                    current_branch: Some(current_branch.clone()),
2396                                    indexed_branch: Some(branch_info.branch.clone()),
2397                                    current_commit: Some(current_commit.clone()),
2398                                    indexed_commit: Some(branch_info.commit_sha.clone()),
2399                                }),
2400                            };
2401                            return Ok((IndexStatus::Stale, false, Some(warning)));
2402                        }
2403                    }
2404
2405                    // All checks passed - index is fresh
2406                    return Ok((IndexStatus::Fresh, true, None));
2407                }
2408            }
2409        }
2410
2411        // Not in a git repo or couldn't get git info - assume fresh
2412        Ok((IndexStatus::Fresh, true, None))
2413    }
2414
2415    /// Check index freshness and show non-blocking warnings
2416    ///
2417    /// This performs lightweight checks to warn users if their index might be stale:
2418    /// 1. Branch mismatch: indexed different branch
2419    /// 2. Commit changed: HEAD moved since indexing
2420    /// 3. File changes: quick mtime check on sample of files (if available)
2421    fn check_index_freshness(&self, filter: &QueryFilter) -> Result<()> {
2422        let root = std::env::current_dir()?;
2423
2424        // Check git state if in a git repo
2425        if crate::git::is_git_repo(&root) {
2426            if let Ok(current_branch) = crate::git::get_current_branch(&root) {
2427                // Check if we're on a different branch than what was indexed
2428                if !self.cache.branch_exists(&current_branch).unwrap_or(false) {
2429                    if !filter.suppress_output {
2430                        output::warn(&format!("⚠️  WARNING: Index not found for branch '{}'. Run 'rfx index' to index this branch.", current_branch));
2431                    }
2432                    return Ok(());
2433                }
2434
2435                // Branch exists - check if commit changed
2436                if let (Ok(current_commit), Ok(branch_info)) =
2437                    (crate::git::get_current_commit(&root), self.cache.get_branch_info(&current_branch)) {
2438
2439                    if branch_info.commit_sha != current_commit {
2440                        if !filter.suppress_output {
2441                            output::warn(&format!("⚠️  WARNING: Index may be stale (commit changed: {} → {}). Consider running 'rfx index'.",
2442                                     &branch_info.commit_sha[..7], &current_commit[..7]));
2443                        }
2444                        return Ok(());
2445                    }
2446
2447                    // If commits match, do a quick file freshness check
2448                    // Sample up to 10 files to check for modifications (cheap mtime check)
2449                    if let Ok(branch_files) = self.cache.get_branch_files(&current_branch) {
2450                        let mut checked = 0;
2451                        let mut changed = 0;
2452                        const SAMPLE_SIZE: usize = 10;
2453
2454                        for (path, _indexed_hash) in branch_files.iter().take(SAMPLE_SIZE) {
2455                            checked += 1;
2456                            let file_path = std::path::Path::new(path);
2457
2458                            // Check if file exists and has been modified (mtime/size heuristic)
2459                            if let Ok(metadata) = std::fs::metadata(file_path) {
2460                                if let Ok(modified) = metadata.modified() {
2461                                    let indexed_time = branch_info.last_indexed;
2462                                    let file_time = modified.duration_since(std::time::UNIX_EPOCH)
2463                                        .unwrap_or_default()
2464                                        .as_secs() as i64;
2465
2466                                    // If file modified after indexing, it might be stale
2467                                    if file_time > indexed_time {
2468                                        // File modified after indexing - likely stale
2469                                        // Note: We skip hash verification for performance (mtime check is sufficient)
2470                                        // This may cause false positives if files were touched without changes,
2471                                        // but the warning is non-blocking and vastly better than slow queries
2472                                        changed += 1;
2473                                    }
2474                                }
2475                            }
2476                        }
2477
2478                        if changed > 0 && !filter.suppress_output {
2479                            output::warn(&format!("⚠️  WARNING: {} of {} sampled files changed since indexing. Consider running 'rfx index'.", changed, checked));
2480                        }
2481                    }
2482                }
2483            }
2484        }
2485
2486        Ok(())
2487    }
2488}
2489
/// Generate AI instruction based on query results
///
/// Produces guidance text for an AI agent consuming the search results. The
/// conditions below are evaluated top to bottom and the first one that holds
/// decides the instruction:
///
/// 1. no results
/// 2. 500 or more total results
/// 3. more pages available (`has_more`)
/// 4. exactly one result in symbols mode
/// 5. 2-10 results in symbols mode
/// 6. 101-499 total results
/// 7. 100 or more results in full-text mode
/// 8. paths-only mode
/// 9. AST query
/// 10. regex query with 100 or more results
/// 11. language filter with at most 5 results
/// 12. glob filter with at most 10 results
/// 13. exact mode with at most 5 results
///
/// Returns `None` when none of the conditions applies.
pub fn generate_ai_instruction(
    result_count: usize,
    total_count: usize,
    has_more: bool,
    symbols_mode: bool,
    paths_only: bool,
    use_ast: bool,
    use_regex: bool,
    language_filter: bool,
    glob_filter: bool,
    exact_mode: bool,
) -> Option<String> {
    let few_precise_results = symbols_mode && (2..=10).contains(&result_count);

    let instruction = if result_count == 0 {
        // 1: nothing matched
        String::from(
            "No results found. Consider these alternatives: 1) Check pattern spelling, 2) Remove --kind or --lang filters to broaden search, 3) Try partial match or related term, 4) Use search_regex tool for pattern matching with special characters or complex patterns.",
        )
    } else if total_count >= 500 {
        // 2: query too broad
        format!("Query too broad: {} results found. STOP. Do not list results. Refine search automatically by adding filters: kind parameter (Function/Struct/Class), lang parameter (rust/python/etc), or glob parameter (['src/**/*.rs']). Call search_code again with appropriate filters.", total_count)
    } else if has_more {
        // 3: paginated
        format!("Showing {} of {} results. PAGINATED - there are more results available. Do not automatically fetch all results. Show current page, ask user if these results answer their question before fetching more with --offset parameter.", result_count, total_count)
    } else if symbols_mode && result_count == 1 {
        // 4: single precise result
        String::from("Found 1 precise result. Respond concisely: '[symbol] at [path]:[line]'.")
    } else if few_precise_results {
        // 5: a handful of precise results
        format!("Found {} precise results (definitions only, not usages). List locations concisely: '[symbol] at [path]:[line]' for each result.", result_count)
    } else if total_count >= 101 {
        // 6: many results (the 500+ case was already handled above)
        format!("Found {} results - this is broad. Suggest refining search with: kind parameter (Function/Struct/Class/etc), lang parameter (rust/python/etc), or glob parameter to narrow file scope.", total_count)
    } else if !symbols_mode && result_count >= 100 {
        // 7: full-text mode with many results
        format!("Found {} results in full-text search mode (includes definitions AND all usages). Consider using symbols=true parameter to filter to definitions only. This typically reduces results by 80-90%.", result_count)
    } else if paths_only {
        // 8: paths-only mode
        format!("Found {} unique files (paths-only mode - no code content included). Next step: Use Read tool on specific files that look relevant based on their paths.", result_count)
    } else if use_ast {
        // 9: AST query
        format!("Found {} results using AST pattern matching. These are structure-based matches using Tree-sitter patterns, not text search.", result_count)
    } else if use_regex && result_count >= 100 {
        // 10: regex with many results
        format!("Found {} results using regex pattern matching. Regex matches are expansive. Consider using exact text search or symbols mode for more precise results.", result_count)
    } else if language_filter && result_count <= 5 {
        // 11: language filter with few results
        format!("Found {} results with language filter active. Results are limited to this language only. Remove lang parameter if you want to search all languages.", result_count)
    } else if glob_filter && result_count <= 10 {
        // 12: glob filter with few results
        format!("Found {} results with glob filter active. Results are limited to matching paths. Remove glob parameter to search entire codebase.", result_count)
    } else if exact_mode && result_count <= 5 {
        // 13: exact mode with few results
        format!("Found {} results in exact match mode. Only exact symbol name matches are included. Remove exact parameter to allow substring matching.", result_count)
    } else {
        // Normal case (no special conditions) - no instruction
        return None;
    };

    Some(instruction)
}
2601
2602#[cfg(test)]
2603mod tests {
2604    use super::*;
2605    use crate::indexer::Indexer;
2606    use crate::models::IndexConfig;
2607    use std::fs;
2608    use tempfile::TempDir;
2609
2610    // ==================== Basic Tests ====================
2611
2612    #[test]
2613    fn test_query_engine_creation() {
2614        let temp = TempDir::new().unwrap();
2615        let cache = CacheManager::new(temp.path());
2616        let engine = QueryEngine::new(cache);
2617
2618        assert!(engine.cache.path().ends_with(".reflex"));
2619    }
2620
2621    #[test]
2622    fn test_filter_modes() {
2623        // Test that symbols_mode works as expected
2624        let filter_fulltext = QueryFilter::default();
2625        assert!(!filter_fulltext.symbols_mode);
2626
2627        let filter_symbols = QueryFilter {
2628            symbols_mode: true,
2629            ..Default::default()
2630        };
2631        assert!(filter_symbols.symbols_mode);
2632
2633        // Test that kind implies symbols_mode (handled in CLI layer)
2634        let filter_with_kind = QueryFilter {
2635            kind: Some(SymbolKind::Function),
2636            symbols_mode: true,
2637            ..Default::default()
2638        };
2639        assert!(filter_with_kind.symbols_mode);
2640    }
2641
2642    // ==================== Search Mode Tests ====================
2643
2644    #[test]
2645    fn test_fulltext_search() {
2646        let temp = TempDir::new().unwrap();
2647        let project = temp.path().join("project");
2648        fs::create_dir(&project).unwrap();
2649
2650        // Create test files
2651        fs::write(project.join("main.rs"), "fn main() {\n    println!(\"hello\");\n}").unwrap();
2652        fs::write(project.join("lib.rs"), "pub fn hello() {}").unwrap();
2653
2654        // Index the project
2655        let cache = CacheManager::new(&project);
2656        let indexer = Indexer::new(cache, IndexConfig::default());
2657        indexer.index(&project, false).unwrap();
2658
2659        // Search for "hello"
2660        let cache = CacheManager::new(&project);
2661        let engine = QueryEngine::new(cache);
2662        let filter = QueryFilter::default(); // full-text mode
2663        let results = engine.search("hello", filter).unwrap();
2664
2665        // Should find both occurrences (println and function name)
2666        assert!(results.len() >= 2);
2667        assert!(results.iter().any(|r| r.path.contains("main.rs")));
2668        assert!(results.iter().any(|r| r.path.contains("lib.rs")));
2669    }
2670
2671    #[test]
2672    fn test_symbol_search() {
2673        let temp = TempDir::new().unwrap();
2674        let project = temp.path().join("project");
2675        fs::create_dir(&project).unwrap();
2676
2677        // Create test file with function definition and call
2678        fs::write(
2679            project.join("main.rs"),
2680            "fn greet() {}\nfn main() {\n    greet();\n}"
2681        ).unwrap();
2682
2683        // Index
2684        let cache = CacheManager::new(&project);
2685        let indexer = Indexer::new(cache, IndexConfig::default());
2686        indexer.index(&project, false).unwrap();
2687
2688        let cache = CacheManager::new(&project);
2689
2690        // Symbol search (definitions only)
2691        let engine = QueryEngine::new(cache);
2692        let filter = QueryFilter {
2693            symbols_mode: true,
2694            ..Default::default()
2695        };
2696        let results = engine.search("greet", filter).unwrap();
2697
2698        // Should find only the definition, not the call
2699        assert!(results.len() >= 1);
2700        assert!(results.iter().any(|r| r.kind == SymbolKind::Function));
2701    }
2702
2703    #[test]
2704    fn test_regex_search() {
2705        let temp = TempDir::new().unwrap();
2706        let project = temp.path().join("project");
2707        fs::create_dir(&project).unwrap();
2708
2709        fs::write(
2710            project.join("main.rs"),
2711            "fn test1() {}\nfn test2() {}\nfn other() {}"
2712        ).unwrap();
2713
2714        let cache = CacheManager::new(&project);
2715        let indexer = Indexer::new(cache, IndexConfig::default());
2716        indexer.index(&project, false).unwrap();
2717
2718        let cache = CacheManager::new(&project);
2719
2720        let engine = QueryEngine::new(cache);
2721        let filter = QueryFilter {
2722            use_regex: true,
2723            ..Default::default()
2724        };
2725        let results = engine.search(r"fn test\d", filter).unwrap();
2726
2727        // Should match test1 and test2 but not other
2728        assert_eq!(results.len(), 2);
2729        assert!(results.iter().all(|r| r.preview.contains("test")));
2730    }
2731
2732    // ==================== Filter Tests ====================
2733
2734    #[test]
2735    fn test_language_filter() {
2736        let temp = TempDir::new().unwrap();
2737        let project = temp.path().join("project");
2738        fs::create_dir(&project).unwrap();
2739
2740        fs::write(project.join("main.rs"), "fn main() {}").unwrap();
2741        fs::write(project.join("main.js"), "function main() {}").unwrap();
2742
2743        let cache = CacheManager::new(&project);
2744        let indexer = Indexer::new(cache, IndexConfig::default());
2745        indexer.index(&project, false).unwrap();
2746
2747        let cache = CacheManager::new(&project);
2748
2749        let engine = QueryEngine::new(cache);
2750
2751        // Filter to Rust only
2752        let filter = QueryFilter {
2753            language: Some(Language::Rust),
2754            ..Default::default()
2755        };
2756        let results = engine.search("main", filter).unwrap();
2757
2758        assert!(results.iter().all(|r| r.lang == Language::Rust));
2759        assert!(results.iter().all(|r| r.path.ends_with(".rs")));
2760    }
2761
2762    #[test]
2763    fn test_kind_filter() {
2764        let temp = TempDir::new().unwrap();
2765        let project = temp.path().join("project");
2766        fs::create_dir(&project).unwrap();
2767
2768        fs::write(
2769            project.join("main.rs"),
2770            "struct Point {}\nfn main() {}\nimpl Point { fn new() {} }"
2771        ).unwrap();
2772
2773        let cache = CacheManager::new(&project);
2774        let indexer = Indexer::new(cache, IndexConfig::default());
2775        indexer.index(&project, false).unwrap();
2776
2777        let cache = CacheManager::new(&project);
2778
2779        let engine = QueryEngine::new(cache);
2780
2781        // Filter to functions only (includes methods)
2782        let filter = QueryFilter {
2783            symbols_mode: true,
2784            kind: Some(SymbolKind::Function),
2785            use_contains: true,  // "mai" is substring of "main"
2786            ..Default::default()
2787        };
2788        // Search for "mai" which should match "main" (tri gram pattern will def be in index)
2789        let results = engine.search("mai", filter).unwrap();
2790
2791        // Should find main function
2792        assert!(results.len() > 0, "Should find at least one result");
2793        assert!(results.iter().any(|r| r.symbol.as_deref() == Some("main")), "Should find 'main' function");
2794    }
2795
2796    #[test]
2797    fn test_file_pattern_filter() {
2798        let temp = TempDir::new().unwrap();
2799        let project = temp.path().join("project");
2800        fs::create_dir_all(project.join("src")).unwrap();
2801        fs::create_dir_all(project.join("tests")).unwrap();
2802
2803        fs::write(project.join("src/lib.rs"), "fn foo() {}").unwrap();
2804        fs::write(project.join("tests/test.rs"), "fn foo() {}").unwrap();
2805
2806        let cache = CacheManager::new(&project);
2807        let indexer = Indexer::new(cache, IndexConfig::default());
2808        indexer.index(&project, false).unwrap();
2809
2810        let cache = CacheManager::new(&project);
2811
2812        let engine = QueryEngine::new(cache);
2813
2814        // Filter to src/ only
2815        let filter = QueryFilter {
2816            file_pattern: Some("src/".to_string()),
2817            ..Default::default()
2818        };
2819        let results = engine.search("foo", filter).unwrap();
2820
2821        assert!(results.iter().all(|r| r.path.contains("src/")));
2822        assert!(!results.iter().any(|r| r.path.contains("tests/")));
2823    }
2824
2825    #[test]
2826    fn test_limit_filter() {
2827        let temp = TempDir::new().unwrap();
2828        let project = temp.path().join("project");
2829        fs::create_dir(&project).unwrap();
2830
2831        // Create file with many matches
2832        let content = (0..20).map(|i| format!("fn test{}() {{}}", i)).collect::<Vec<_>>().join("\n");
2833        fs::write(project.join("main.rs"), content).unwrap();
2834
2835        let cache = CacheManager::new(&project);
2836        let indexer = Indexer::new(cache, IndexConfig::default());
2837        indexer.index(&project, false).unwrap();
2838
2839        let cache = CacheManager::new(&project);
2840
2841        let engine = QueryEngine::new(cache);
2842
2843        // Limit to 5 results
2844        let filter = QueryFilter {
2845            limit: Some(5),
2846            use_contains: true,  // "test" is substring of "test0", "test1", etc.
2847            ..Default::default()
2848        };
2849        let results = engine.search("test", filter).unwrap();
2850
2851        assert_eq!(results.len(), 5);
2852    }
2853
2854    #[test]
2855    fn test_exact_match_filter() {
2856        let temp = TempDir::new().unwrap();
2857        let project = temp.path().join("project");
2858        fs::create_dir(&project).unwrap();
2859
2860        fs::write(
2861            project.join("main.rs"),
2862            "fn test() {}\nfn test_helper() {}\nfn other_test() {}"
2863        ).unwrap();
2864
2865        let cache = CacheManager::new(&project);
2866        let indexer = Indexer::new(cache, IndexConfig::default());
2867        indexer.index(&project, false).unwrap();
2868
2869        let cache = CacheManager::new(&project);
2870
2871        let engine = QueryEngine::new(cache);
2872
2873        // Exact match for "test"
2874        let filter = QueryFilter {
2875            symbols_mode: true,
2876            exact: true,
2877            ..Default::default()
2878        };
2879        let results = engine.search("test", filter).unwrap();
2880
2881        // Should only match exactly "test", not "test_helper" or "other_test"
2882        assert_eq!(results.len(), 1);
2883        assert_eq!(results[0].symbol.as_deref(), Some("test"));
2884    }
2885
2886    // ==================== Expand Mode Tests ====================
2887
2888    #[test]
2889    fn test_expand_mode() {
2890        let temp = TempDir::new().unwrap();
2891        let project = temp.path().join("project");
2892        fs::create_dir(&project).unwrap();
2893
2894        fs::write(
2895            project.join("main.rs"),
2896            "fn greet() {\n    println!(\"Hello\");\n    println!(\"World\");\n}"
2897        ).unwrap();
2898
2899        let cache = CacheManager::new(&project);
2900        let indexer = Indexer::new(cache, IndexConfig::default());
2901        indexer.index(&project, false).unwrap();
2902
2903        let cache = CacheManager::new(&project);
2904
2905        let engine = QueryEngine::new(cache);
2906
2907        // Search with expand mode
2908        let filter = QueryFilter {
2909            symbols_mode: true,
2910            expand: true,
2911            ..Default::default()
2912        };
2913        let results = engine.search("greet", filter).unwrap();
2914
2915        // Should have full function body in preview
2916        assert!(results.len() >= 1);
2917        let result = &results[0];
2918        assert!(result.preview.contains("println"));
2919    }
2920
2921    // ==================== Edge Cases ====================
2922
2923    #[test]
2924    fn test_search_empty_index() {
2925        let temp = TempDir::new().unwrap();
2926        let project = temp.path().join("project");
2927        fs::create_dir(&project).unwrap();
2928
2929        let cache = CacheManager::new(&project);
2930        let indexer = Indexer::new(cache, IndexConfig::default());
2931        indexer.index(&project, false).unwrap();
2932
2933        let cache = CacheManager::new(&project);
2934
2935        let engine = QueryEngine::new(cache);
2936        let filter = QueryFilter::default();
2937        let results = engine.search("nonexistent", filter).unwrap();
2938
2939        assert_eq!(results.len(), 0);
2940    }
2941
2942    #[test]
2943    fn test_search_no_index() {
2944        let temp = TempDir::new().unwrap();
2945        let project = temp.path().join("project");
2946        fs::create_dir(&project).unwrap();
2947
2948        let cache = CacheManager::new(&project);
2949        let engine = QueryEngine::new(cache);
2950        let filter = QueryFilter::default();
2951
2952        // Should fail when index doesn't exist
2953        assert!(engine.search("test", filter).is_err());
2954    }
2955
2956    #[test]
2957    fn test_search_special_characters() {
2958        let temp = TempDir::new().unwrap();
2959        let project = temp.path().join("project");
2960        fs::create_dir(&project).unwrap();
2961
2962        fs::write(project.join("main.rs"), "let x = 42;\nlet y = x + 1;").unwrap();
2963
2964        let cache = CacheManager::new(&project);
2965        let indexer = Indexer::new(cache, IndexConfig::default());
2966        indexer.index(&project, false).unwrap();
2967
2968        let cache = CacheManager::new(&project);
2969
2970        let engine = QueryEngine::new(cache);
2971        let filter = QueryFilter::default();
2972
2973        // Search for special characters
2974        let results = engine.search("x + ", filter).unwrap();
2975        assert!(results.len() >= 1);
2976    }
2977
2978    #[test]
2979    fn test_search_unicode() {
2980        let temp = TempDir::new().unwrap();
2981        let project = temp.path().join("project");
2982        fs::create_dir(&project).unwrap();
2983
2984        fs::write(project.join("main.rs"), "// 你好世界\nfn main() {}").unwrap();
2985
2986        let cache = CacheManager::new(&project);
2987        let indexer = Indexer::new(cache, IndexConfig::default());
2988        indexer.index(&project, false).unwrap();
2989
2990        let cache = CacheManager::new(&project);
2991
2992        let engine = QueryEngine::new(cache);
2993        let filter = QueryFilter {
2994            use_contains: true,  // Unicode word boundaries may not work as expected
2995            force: true,  // Bypass broad query detection for 2-char Unicode pattern
2996            ..Default::default()
2997        };
2998
2999        // Search for unicode characters
3000        let results = engine.search("你好", filter).unwrap();
3001        assert!(results.len() >= 1);
3002    }
3003
3004    #[test]
3005    fn test_case_sensitive_search() {
3006        let temp = TempDir::new().unwrap();
3007        let project = temp.path().join("project");
3008        fs::create_dir(&project).unwrap();
3009
3010        fs::write(project.join("main.rs"), "fn Test() {}\nfn test() {}").unwrap();
3011
3012        let cache = CacheManager::new(&project);
3013        let indexer = Indexer::new(cache, IndexConfig::default());
3014        indexer.index(&project, false).unwrap();
3015
3016        let cache = CacheManager::new(&project);
3017
3018        let engine = QueryEngine::new(cache);
3019        let filter = QueryFilter::default();
3020
3021        // Search is case-sensitive
3022        let results = engine.search("Test", filter).unwrap();
3023        assert!(results.iter().any(|r| r.preview.contains("Test()")));
3024    }
3025
3026    // ==================== Determinism Tests ====================
3027
3028    #[test]
3029    fn test_results_sorted_deterministically() {
3030        let temp = TempDir::new().unwrap();
3031        let project = temp.path().join("project");
3032        fs::create_dir(&project).unwrap();
3033
3034        fs::write(project.join("a.rs"), "fn test() {}").unwrap();
3035        fs::write(project.join("z.rs"), "fn test() {}").unwrap();
3036        fs::write(project.join("m.rs"), "fn test() {}\nfn test2() {}").unwrap();
3037
3038        let cache = CacheManager::new(&project);
3039        let indexer = Indexer::new(cache, IndexConfig::default());
3040        indexer.index(&project, false).unwrap();
3041
3042        let cache = CacheManager::new(&project);
3043
3044        let engine = QueryEngine::new(cache);
3045        let filter = QueryFilter::default();
3046
3047        // Run search multiple times
3048        let results1 = engine.search("test", filter.clone()).unwrap();
3049        let results2 = engine.search("test", filter.clone()).unwrap();
3050        let results3 = engine.search("test", filter).unwrap();
3051
3052        // Results should be identical and sorted by path then line
3053        assert_eq!(results1.len(), results2.len());
3054        assert_eq!(results1.len(), results3.len());
3055
3056        for i in 0..results1.len() {
3057            assert_eq!(results1[i].path, results2[i].path);
3058            assert_eq!(results1[i].path, results3[i].path);
3059            assert_eq!(results1[i].span.start_line, results2[i].span.start_line);
3060            assert_eq!(results1[i].span.start_line, results3[i].span.start_line);
3061        }
3062
3063        // Verify sorting (path ascending, then line ascending)
3064        for i in 0..results1.len().saturating_sub(1) {
3065            let curr = &results1[i];
3066            let next = &results1[i + 1];
3067            assert!(
3068                curr.path < next.path ||
3069                (curr.path == next.path && curr.span.start_line <= next.span.start_line)
3070            );
3071        }
3072    }
3073
3074    // ==================== Combined Filter Tests ====================
3075
3076    #[test]
3077    fn test_multiple_filters_combined() {
3078        let temp = TempDir::new().unwrap();
3079        let project = temp.path().join("project");
3080        fs::create_dir_all(project.join("src")).unwrap();
3081
3082        fs::write(project.join("src/main.rs"), "fn test() {}\nstruct Test {}").unwrap();
3083        fs::write(project.join("src/lib.rs"), "fn test() {}").unwrap();
3084        fs::write(project.join("test.js"), "function test() {}").unwrap();
3085
3086        let cache = CacheManager::new(&project);
3087        let indexer = Indexer::new(cache, IndexConfig::default());
3088        indexer.index(&project, false).unwrap();
3089
3090        let cache = CacheManager::new(&project);
3091
3092        let engine = QueryEngine::new(cache);
3093
3094        // Combine language, kind, and file pattern filters
3095        let filter = QueryFilter {
3096            language: Some(Language::Rust),
3097            kind: Some(SymbolKind::Function),
3098            file_pattern: Some("src/main".to_string()),
3099            symbols_mode: true,
3100            ..Default::default()
3101        };
3102        let results = engine.search("test", filter).unwrap();
3103
3104        // Should only find the function in src/main.rs
3105        assert_eq!(results.len(), 1);
3106        assert!(results[0].path.contains("src/main.rs"));
3107        assert_eq!(results[0].kind, SymbolKind::Function);
3108    }
3109
3110    // ==================== Helper Method Tests ====================
3111
3112    #[test]
3113    fn test_find_symbol_helper() {
3114        let temp = TempDir::new().unwrap();
3115        let project = temp.path().join("project");
3116        fs::create_dir(&project).unwrap();
3117
3118        fs::write(project.join("main.rs"), "fn greet() {}").unwrap();
3119
3120        let cache = CacheManager::new(&project);
3121        let indexer = Indexer::new(cache, IndexConfig::default());
3122        indexer.index(&project, false).unwrap();
3123
3124        let cache = CacheManager::new(&project);
3125
3126        let engine = QueryEngine::new(cache);
3127        let results = engine.find_symbol("greet").unwrap();
3128
3129        assert!(results.len() >= 1);
3130        assert_eq!(results[0].kind, SymbolKind::Function);
3131    }
3132
3133    #[test]
3134    fn test_list_by_kind_helper() {
3135        let temp = TempDir::new().unwrap();
3136        let project = temp.path().join("project");
3137        fs::create_dir(&project).unwrap();
3138
3139        fs::write(
3140            project.join("main.rs"),
3141            "struct Point {}\nfn test() {}\nstruct Line {}"
3142        ).unwrap();
3143
3144        let cache = CacheManager::new(&project);
3145        let indexer = Indexer::new(cache, IndexConfig::default());
3146        indexer.index(&project, false).unwrap();
3147
3148        let cache = CacheManager::new(&project);
3149
3150        let engine = QueryEngine::new(cache);
3151
3152        // Search for structs that contain "oin" (Point contains it, Line doesn't)
3153        let filter = QueryFilter {
3154            kind: Some(SymbolKind::Struct),
3155            symbols_mode: true,
3156            use_contains: true,  // "oin" is substring of "Point"
3157            ..Default::default()
3158        };
3159        let results = engine.search("oin", filter).unwrap();
3160
3161        // Should find Point struct
3162        assert!(results.len() >= 1, "Should find at least Point struct");
3163        assert!(results.iter().all(|r| r.kind == SymbolKind::Struct));
3164        assert!(results.iter().any(|r| r.symbol.as_deref() == Some("Point")));
3165    }
3166
3167    // ==================== Metadata Tests ====================
3168
3169    #[test]
3170    fn test_search_with_metadata() {
3171        let temp = TempDir::new().unwrap();
3172        let project = temp.path().join("project");
3173        fs::create_dir(&project).unwrap();
3174
3175        fs::write(project.join("main.rs"), "fn test() {}").unwrap();
3176
3177        let cache = CacheManager::new(&project);
3178        let indexer = Indexer::new(cache, IndexConfig::default());
3179        indexer.index(&project, false).unwrap();
3180
3181        let cache = CacheManager::new(&project);
3182
3183        let engine = QueryEngine::new(cache);
3184        let filter = QueryFilter::default();
3185        let response = engine.search_with_metadata("test", filter).unwrap();
3186
3187        // Check metadata is present (status might be stale if run inside git repo)
3188        assert!(response.results.len() >= 1);
3189        // Note: can_trust_results may be false if running in a git repo without branch index
3190    }
3191
3192    // ==================== Multi-language Tests ====================
3193
3194    #[test]
3195    fn test_search_across_languages() {
3196        let temp = TempDir::new().unwrap();
3197        let project = temp.path().join("project");
3198        fs::create_dir(&project).unwrap();
3199
3200        fs::write(project.join("main.rs"), "fn greet() {}").unwrap();
3201        fs::write(project.join("main.ts"), "function greet() {}").unwrap();
3202        fs::write(project.join("main.py"), "def greet(): pass").unwrap();
3203
3204        let cache = CacheManager::new(&project);
3205        let indexer = Indexer::new(cache, IndexConfig::default());
3206        indexer.index(&project, false).unwrap();
3207
3208        let cache = CacheManager::new(&project);
3209
3210        let engine = QueryEngine::new(cache);
3211        let filter = QueryFilter::default();
3212        let results = engine.search("greet", filter).unwrap();
3213
3214        // Should find greet in all three languages
3215        assert!(results.len() >= 3);
3216        assert!(results.iter().any(|r| r.lang == Language::Rust));
3217        assert!(results.iter().any(|r| r.lang == Language::TypeScript));
3218        assert!(results.iter().any(|r| r.lang == Language::Python));
3219    }
3220}
reflex/query.rs

reflex/
query.rs