Skip to main content

reflex/query/
mod.rs

1//! Query engine for searching indexed code
2//!
3//! The query engine loads the memory-mapped cache and executes
4//! deterministic searches based on lexical, structural, or symbol patterns.
5
6pub mod filter;
7pub mod result;
8
9pub use filter::QueryFilter;
10
11use anyhow::{Context, Result};
12use regex::Regex;
13
14use crate::cache::CacheManager;
15use crate::content_store::ContentReader;
16use crate::models::{
17    IndexStatus, IndexWarning, IndexWarningDetails, Language, QueryResponse, SearchResult, Span,
18    SymbolKind,
19};
20use crate::output;
21use crate::parsers::ParserFactory;
22use crate::regex_trigrams::extract_trigrams_from_regex;
23use crate::trigram::TrigramIndex;
24
25/// Manages query execution against the index
26pub struct QueryEngine {
27    cache: CacheManager,
28}
29
30impl QueryEngine {
31    /// Create a new query engine with the given cache manager
32    pub fn new(cache: CacheManager) -> Self {
33        Self { cache }
34    }
35
36    /// Load dependencies for search results if requested (legacy - per result)
37    /// Deprecated: Use group_and_load_dependencies for file-level grouping
38    fn load_dependencies(&self, results: &mut [SearchResult], include_deps: bool) -> Result<()> {
39        if !include_deps || results.is_empty() {
40            return Ok(());
41        }
42
43        log::debug!("Loading dependencies for {} results", results.len());
44
45        // Create dependency index
46        // Note: We need to pass the workspace root, not the cache directory
47        // The cache path is .reflex/, so its parent is the workspace root (.)
48        let workspace_root = self.cache.path().parent()
49            .ok_or_else(|| anyhow::anyhow!("Cache path has no parent"))?;
50        let cache_for_deps = CacheManager::new(workspace_root);
51        let dep_index = crate::dependency::DependencyIndex::new(cache_for_deps);
52
53        // Load dependencies for each result
54        for result in results {
55            // Normalize path: strip leading "./" if present
56            let normalized_path = result.path.strip_prefix("./").unwrap_or(&result.path);
57
58            // Get file_id from database by path
59            match self.cache.get_file_id(normalized_path) {
60                Ok(Some(file_id)) => {
61                    log::debug!("Found file_id={} for path={}", file_id, result.path);
62                    // Get dependencies for this file
63                    match dep_index.get_dependencies_info(file_id) {
64                        Ok(dep_infos) => {
65                            log::debug!("Loaded {} dependencies for file_id={}", dep_infos.len(), file_id);
66                            if !dep_infos.is_empty() {
67                                result.dependencies = Some(dep_infos);
68                            }
69                        }
70                        Err(e) => {
71                            log::warn!("Failed to get dependencies for file_id={}: {}", file_id, e);
72                        }
73                    }
74                }
75                Ok(None) => {
76                    log::warn!("No file_id found for path: {}", result.path);
77                }
78                Err(e) => {
79                    log::warn!("Failed to get file_id for path {}: {}", result.path, e);
80                }
81            }
82        }
83
84        Ok(())
85    }
86
87    /// Group search results by file and load dependencies at file level
88    /// Returns file-grouped results with dependencies populated once per file
89    fn group_and_load_dependencies(
90        &self,
91        results: Vec<SearchResult>,
92        include_deps: bool,
93    ) -> Result<Vec<crate::models::FileGroupedResult>> {
94        use std::collections::HashMap;
95        use crate::models::{FileGroupedResult, MatchResult};
96
97        if results.is_empty() {
98            return Ok(Vec::new());
99        }
100
101        // Group results by file path
102        let mut grouped: HashMap<String, Vec<SearchResult>> = HashMap::new();
103        for result in results {
104            grouped
105                .entry(result.path.clone())
106                .or_default()
107                .push(result);
108        }
109
110        // Create dependency index if needed
111        let dep_index = if include_deps {
112            let workspace_root = self.cache.path().parent()
113                .ok_or_else(|| anyhow::anyhow!("Cache path has no parent"))?;
114            let cache_for_deps = CacheManager::new(workspace_root);
115            Some(crate::dependency::DependencyIndex::new(cache_for_deps))
116        } else {
117            None
118        };
119
120        // Load ContentReader for extracting context lines
121        let content_path = self.cache.path().join("content.bin");
122        let content_reader_opt = ContentReader::open(&content_path).ok();
123
124        // Convert to FileGroupedResult and load dependencies
125        let mut file_results: Vec<FileGroupedResult> = grouped
126            .into_iter()
127            .map(|(path, file_matches)| {
128                // Load dependencies for this file (once per file, not per result)
129                let dependencies = if let Some(dep_idx) = &dep_index {
130                    let normalized_path = path.strip_prefix("./").unwrap_or(&path);
131                    match self.cache.get_file_id(normalized_path) {
132                        Ok(Some(file_id)) => {
133                            match dep_idx.get_dependencies_info(file_id) {
134                                Ok(dep_infos) if !dep_infos.is_empty() => {
135                                    log::debug!("Loaded {} dependencies for file: {}", dep_infos.len(), path);
136                                    Some(dep_infos)
137                                }
138                                Ok(_) => None,
139                                Err(e) => {
140                                    log::warn!("Failed to get dependencies for {}: {}", path, e);
141                                    None
142                                }
143                            }
144                        }
145                        Ok(None) => {
146                            log::warn!("No file_id found for path: {}", path);
147                            None
148                        }
149                        Err(e) => {
150                            log::warn!("Failed to get file_id for path {}: {}", path, e);
151                            None
152                        }
153                    }
154                } else {
155                    None
156                };
157
158                // Get file_id for context extraction
159                // Note: We use ContentReader's get_file_id_by_path() which returns array indices,
160                // not database file_ids (which are AUTO INCREMENT values)
161                let normalized_path = path.strip_prefix("./").unwrap_or(&path);
162                let file_id_for_context = if let Some(reader) = &content_reader_opt {
163                    reader.get_file_id_by_path(normalized_path)
164                } else {
165                    None
166                };
167                log::debug!("Context extraction: file={}, file_id={:?}, content_reader={}",
168                    path, file_id_for_context, content_reader_opt.is_some());
169
170                // Convert SearchResults to MatchResults (strip path and dependencies) and extract context
171                let matches: Vec<MatchResult> = file_matches
172                    .into_iter()
173                    .map(|r| {
174                        // Extract context lines (default: 3 lines before and after)
175                        let (context_before, context_after) = if let (Some(reader), Some(fid)) = (&content_reader_opt, file_id_for_context) {
176                            let result = reader.get_context_by_line(fid as u32, r.span.start_line, 3)
177                                .unwrap_or_else(|e| {
178                                    log::warn!("Failed to extract context for {}:{}: {}", path, r.span.start_line, e);
179                                    (vec![], vec![])
180                                });
181                            log::debug!("Extracted context for {}:{} - before: {}, after: {}",
182                                path, r.span.start_line, result.0.len(), result.1.len());
183                            result
184                        } else {
185                            if content_reader_opt.is_none() {
186                                log::debug!("No ContentReader available for context extraction");
187                            }
188                            if file_id_for_context.is_none() {
189                                log::debug!("No file_id found for {}", path);
190                            }
191                            (vec![], vec![])
192                        };
193
194                        MatchResult {
195                            kind: r.kind,
196                            symbol: r.symbol,
197                            span: r.span,
198                            preview: r.preview,
199                            context_before,
200                            context_after,
201                        }
202                    })
203                    .collect();
204
205                FileGroupedResult {
206                    path,
207                    dependencies,
208                    matches,
209                }
210            })
211            .collect();
212
213        // Sort by path for deterministic output
214        file_results.sort_by(|a, b| a.path.cmp(&b.path));
215
216        Ok(file_results)
217    }
218
219    /// Execute a query and return matching results with index metadata
220    ///
221    /// This is the preferred method for programmatic/JSON output as it includes
222    /// index freshness information that AI agents can use to decide whether to re-index.
223    pub fn search_with_metadata(&self, pattern: &str, filter: QueryFilter) -> Result<QueryResponse> {
224        log::info!("Executing query with metadata: pattern='{}', filter={:?}", pattern, filter);
225
226        // Ensure cache exists
227        if !self.cache.exists() {
228            anyhow::bail!(
229                "Index not found. Run 'rfx index' to build the cache first."
230            );
231        }
232
233        // Validate cache integrity
234        if let Err(e) = self.cache.validate() {
235            anyhow::bail!(
236                "Cache appears to be corrupted: {}. Run 'rfx clear' followed by 'rfx index' to rebuild.",
237                e
238            );
239        }
240
241        // Get index status and warning (without printing warnings to stderr)
242        let (status, can_trust_results, warning) = self.get_index_status()?;
243
244        // Execute the search
245        let (results, total) = self.search_internal(pattern, filter.clone())?;
246
247        // Build pagination metadata
248        use crate::models::PaginationInfo;
249        let pagination = PaginationInfo {
250            total,
251            count: results.len(),
252            offset: filter.offset.unwrap_or(0),
253            limit: filter.limit,
254            has_more: total > filter.offset.unwrap_or(0) + results.len(),
255        };
256
257        // Always use grouped format (group results by file)
258        // Dependencies are loaded only when include_dependencies is true
259        let grouped_results = self.group_and_load_dependencies(results, filter.include_dependencies)?;
260
261        Ok(QueryResponse {
262            ai_instruction: None,  // AI instruction is generated by CLI/MCP layer, not here
263            status,
264            can_trust_results,
265            warning,
266            pagination,
267            results: grouped_results,
268        })
269    }
270
271    /// Execute a query and return matching results (legacy method)
272    ///
273    /// This method prints warnings to stderr and returns just the results.
274    /// For programmatic use, prefer `search_with_metadata()`.
275    pub fn search(&self, pattern: &str, filter: QueryFilter) -> Result<Vec<SearchResult>> {
276        log::info!("Executing query: pattern='{}', filter={:?}", pattern, filter);
277
278        // Ensure cache exists
279        if !self.cache.exists() {
280            anyhow::bail!(
281                "Index not found. Run 'rfx index' to build the cache first."
282            );
283        }
284
285        // Validate cache integrity
286        if let Err(e) = self.cache.validate() {
287            anyhow::bail!(
288                "Cache appears to be corrupted: {}. Run 'rfx clear' followed by 'rfx index' to rebuild.",
289                e
290            );
291        }
292
293        // Show non-blocking warnings about branch state and staleness
294        self.check_index_freshness(&filter)?;
295
296        // Execute the search (discard total count - legacy method doesn't use it)
297        let (mut results, _total_count) = self.search_internal(pattern, filter.clone())?;
298
299        // Load dependencies if requested
300        self.load_dependencies(&mut results, filter.include_dependencies)?;
301
302        Ok(results)
303    }
304
305    /// Internal search implementation (used by both search methods)
306    /// Returns (results, total_count) where total_count is the count before offset/limit
307    fn search_internal(&self, pattern: &str, filter: QueryFilter) -> Result<(Vec<SearchResult>, usize)> {
308        use std::time::{Duration, Instant};
309
310        // Start timeout timer if configured
311        let start_time = Instant::now();
312        let timeout = if filter.timeout_secs > 0 {
313            Some(Duration::from_secs(filter.timeout_secs))
314        } else {
315            None
316        };
317
318        // KEYWORD DETECTION (early): Check if this is a keyword query that should scan ALL files
319        // When a user searches for a language keyword (like "class", "function") with --symbols or --kind,
320        // we interpret it as "list all symbols of that type" and should scan ALL files,
321        // not just the first 100 candidates from trigram search.
322        //
323        // Requirements for keyword query mode:
324        // 1. Symbol mode active (--symbols or --kind)
325        // 2. Pattern matches a keyword in ANY supported language
326        //
327        // Note: --lang is optional. If specified, language filtering happens naturally in Phase 2/3.
328        let is_keyword_query = if filter.symbols_mode || filter.kind.is_some() {
329            ParserFactory::get_all_keywords().contains(&pattern)
330        } else {
331            false
332        };
333
334        // KEYWORD-TO-KIND MAPPING: If user searches for a keyword without --kind, infer the kind
335        // Example: "class" → SymbolKind::Class, "function" → SymbolKind::Function
336        // This ensures keyword queries return only the relevant symbol type
337        let mut filter = filter.clone();  // Clone so we can modify it
338        if is_keyword_query && filter.kind.is_none() {
339            if let Some(inferred_kind) = Self::keyword_to_kind(pattern) {
340                log::info!("Keyword '{}' mapped to kind {:?} (auto-inferred)", pattern, inferred_kind);
341                filter.kind = Some(inferred_kind);
342            }
343        }
344
345        // EARLY BROAD QUERY DETECTION (Index Size Check)
346        // This check happens BEFORE the expensive trigram search to prevent hangs on large indexes
347        // For very large codebases (like Linux kernel with 62K files), even valid 3-char trigrams
348        // like "get" can take 10-30+ seconds to search. This early check prevents that hang.
349        //
350        // Criteria for early blocking:
351        // 1. Large index (> 20,000 files) AND
352        // 2. Short pattern (< 4 chars) AND
353        // 3. Not using regex (regex has its own trigram extraction) AND
354        // 4. Not a keyword query (keywords are intentionally broad) AND
355        // 5. Not forced by --force flag
356        if !filter.force && !filter.use_regex && !is_keyword_query {
357            let stats = self.cache.stats()?;
358            let total_files = stats.total_files;
359            let pattern_len = pattern.chars().count();
360
361            // Thresholds for early blocking:
362            // - Large index: 20,000+ files (approximately where performance degrades significantly)
363            // - Short pattern: < 4 chars (3-char trigrams are borderline, < 4 catches edge cases)
364            // Test overrides allow reducing thresholds for integration tests without creating 20K+ files
365            let large_index_threshold = filter.test_large_index_threshold.unwrap_or(20_000);
366            let short_pattern_threshold = filter.test_short_pattern_threshold.unwrap_or(4);
367
368            if total_files > large_index_threshold && pattern_len < short_pattern_threshold {
369                anyhow::bail!(
370                    "Query too broad - would be expensive to execute on this large index\n\
371                     \n\
372                     This index contains {} files, and pattern '{}' ({} characters) is too short for efficient searching.\n\
373                     On large codebases, short patterns can take 10-30+ seconds to complete.\n\
374                     \n\
375                     This query could:\n\
376                     • Hang for an extended period before returning results\n\
377                     • Return thousands of results\n\
378                     • Flood LLM context windows with excessive data\n\
379                     • Fail entirely\n\
380                     \n\
381                     Suggestions to narrow the query:\n\
382                     • Use a longer, more specific pattern (4+ characters recommended for large indexes)\n\
383                     • Add a language filter: --lang <language>\n\
384                     • Add a file filter: --glob <pattern> or --file <path>\n\
385                     • Use --force to bypass this check if you really need all results\n\
386                     \n\
387                     To force execution anyway:\n\
388                     rfx query \"{}\" --force",
389                    total_files,
390                    pattern,
391                    pattern_len,
392                    pattern
393                );
394            }
395        }
396
397        // PHASE 1: Get initial candidates (choose search strategy)
398        let mut results = if is_keyword_query {
399            // KEYWORD QUERY MODE: Scan all files (or files of target language if --lang specified)
400            // This ensures we find ALL classes/functions/etc, not just those in the first 100 trigram matches
401            if let Some(lang) = filter.language {
402                log::info!("Keyword query detected for '{}' - scanning all {:?} files (bypassing trigram search)",
403                          pattern, lang);
404            } else {
405                log::info!("Keyword query detected for '{}' - scanning all files (bypassing trigram search)", pattern);
406            }
407            self.get_all_language_files(&filter)?
408        } else if filter.use_regex {
409            // Regex pattern search with trigram optimization
410            self.get_regex_candidates(pattern, timeout.as_ref(), &start_time, filter.suppress_output)?
411        } else {
412            // Standard trigram-based full-text search
413            self.get_trigram_candidates(pattern, &filter)?
414        };
415
416        // EARLY LANGUAGE FILTER: Apply language filtering BEFORE broad query check
417        // This ensures we only parse files matching the language filter in Phase 2
418        // Critical for non-keyword queries to work correctly with accurate candidate counts
419        //
420        // Skip for keyword queries - those candidates are already pre-filtered by language
421        if !is_keyword_query {
422            if let Some(lang) = filter.language {
423                let before_count = results.len();
424                results.retain(|r| r.lang == lang);
425                log::debug!(
426                    "Language filter ({:?}): reduced {} candidates to {} candidates",
427                    lang,
428                    before_count,
429                    results.len()
430                );
431            }
432        }
433
434        // EARLY GLOB PATTERN FILTER: Apply glob/exclude filtering BEFORE broad query check
435        // This ensures candidate count reflects actual files that will be parsed
436        // Critical for queries like: rfx query "index" --symbols --glob "src/**/*.rs"
437        if !filter.glob_patterns.is_empty() || !filter.exclude_patterns.is_empty() {
438            use globset::{Glob, GlobSetBuilder};
439
440            // Build include matcher (if patterns specified)
441            let include_matcher = if !filter.glob_patterns.is_empty() {
442                let mut builder = GlobSetBuilder::new();
443                for pattern in &filter.glob_patterns {
444                    // Normalize pattern to ensure LLM-generated patterns work correctly
445                    let normalized = Self::normalize_glob_pattern(pattern);
446                    match Glob::new(&normalized) {
447                        Ok(glob) => {
448                            builder.add(glob);
449                        }
450                        Err(e) => {
451                            log::warn!("Invalid glob pattern '{}': {}", pattern, e);
452                        }
453                    }
454                }
455                match builder.build() {
456                    Ok(matcher) => Some(matcher),
457                    Err(e) => {
458                        log::warn!("Failed to build glob matcher: {}", e);
459                        None
460                    }
461                }
462            } else {
463                None
464            };
465
466            // Build exclude matcher (if patterns specified)
467            let exclude_matcher = if !filter.exclude_patterns.is_empty() {
468                let mut builder = GlobSetBuilder::new();
469                for pattern in &filter.exclude_patterns {
470                    // Normalize pattern to ensure LLM-generated patterns work correctly
471                    let normalized = Self::normalize_glob_pattern(pattern);
472                    match Glob::new(&normalized) {
473                        Ok(glob) => {
474                            builder.add(glob);
475                        }
476                        Err(e) => {
477                            log::warn!("Invalid exclude pattern '{}': {}", pattern, e);
478                        }
479                    }
480                }
481                match builder.build() {
482                    Ok(matcher) => Some(matcher),
483                    Err(e) => {
484                        log::warn!("Failed to build exclude matcher: {}", e);
485                        None
486                    }
487                }
488            } else {
489                None
490            };
491
492            // Apply filters
493            let before_count = results.len();
494            results.retain(|r| {
495                // If include patterns specified, path must match at least one
496                let included = if let Some(ref matcher) = include_matcher {
497                    matcher.is_match(&r.path)
498                } else {
499                    true // No include patterns = include all
500                };
501
502                // If exclude patterns specified, path must NOT match any
503                let excluded = if let Some(ref matcher) = exclude_matcher {
504                    matcher.is_match(&r.path)
505                } else {
506                    false // No exclude patterns = exclude none
507                };
508
509                included && !excluded
510            });
511            log::debug!(
512                "Glob filter: reduced {} candidates to {} candidates",
513                before_count,
514                results.len()
515            );
516        }
517
518        // Check timeout after Phase 1
519        if let Some(timeout_duration) = timeout {
520            if start_time.elapsed() > timeout_duration {
521                anyhow::bail!(
522                    "Query timeout exceeded ({} seconds).\n\
523                     \n\
524                     The query took too long to complete. Try one of these approaches:\n\
525                     • Use a more specific search pattern (longer patterns = faster search)\n\
526                     • Add a language filter with --lang to narrow the search space\n\
527                     • Add a file filter with --file to search specific directories\n\
528                     • Increase the timeout with --timeout <seconds>\n\
529                     \n\
530                     Example: rfx query \"{}\" --lang rust --timeout 60",
531                    filter.timeout_secs,
532                    pattern
533                );
534            }
535        }
536
537        // BROAD QUERY DETECTION: Check if query is too expensive BEFORE parsing
538        // This protects LLM users from accidentally running expensive queries that flood context windows
539        if !filter.force {
540            let candidate_count = results.len();
541            let pattern_len = pattern.chars().count();
542
543            // Condition 1: Pattern too short (< 3 chars can't use trigram optimization efficiently)
544            // Exception: Allow short keyword queries (e.g., "fn", "if") since they scan all language files
545            let is_short_pattern = pattern_len < 3 && !filter.use_regex && !is_keyword_query;
546
547            // Condition 2: AST query without glob restriction on large codebases
548            // Allow on small codebases (< 100 files) but require glob for larger ones
549            let is_broad_ast = filter.use_ast && filter.glob_patterns.is_empty() && candidate_count >= 100;
550
551            // Condition 3: Query-type-aware threshold for symbol/AST parsing
552            // Different thresholds based on actual performance characteristics:
553            // - AST without glob: 100 files (allow small codebases, block large ones)
554            // - AST with glob: 10,000 files (~5 seconds max)
555            // - Keyword queries: 20,000 files (~3 seconds max) - scan all files of language
556            // - Trigram-filtered symbols: 50,000 files (~5 seconds max) - very fast due to trigram filtering
557            let threshold = if filter.use_ast && filter.glob_patterns.is_empty() {
558                100  // AST without glob - allow small codebases
559            } else if filter.use_ast {
560                10_000  // AST with glob restriction
561            } else if is_keyword_query {
562                20_000  // Keyword queries (e.g., "class", "function")
563            } else {
564                50_000  // Trigram-filtered symbol queries
565            };
566
567            let has_many_candidates = candidate_count > threshold &&
568                                     (filter.symbols_mode || filter.kind.is_some() || filter.use_ast);
569
570            if is_short_pattern || has_many_candidates || is_broad_ast {
571                let reason = if is_short_pattern {
572                    format!("Pattern '{}' is too short ({} characters). Short patterns bypass trigram optimization and require scanning many files.", pattern, pattern_len)
573                } else if is_broad_ast {
574                    format!("AST query without --glob restriction will scan the entire codebase ({} files). AST queries are SLOW (500ms-10s+).", candidate_count)
575                } else if is_keyword_query {
576                    format!("Keyword query '{}' matched {} files. This query scans all files of the target language, which will take significant time and produce excessive results.", pattern, candidate_count)
577                } else {
578                    format!("Query matched {} files. Parsing this many files with --symbols or --kind will take significant time and produce excessive results.", candidate_count)
579                };
580
581                let suggestions = if is_short_pattern {
582                    vec![
583                        "• Use a longer, more specific pattern (3+ characters recommended)",
584                        "• Add a language filter: --lang <language>",
585                        "• Add a file path filter: --file <path> or --glob <pattern>",
586                        "• Use --force to bypass this check if you really need all results"
587                    ]
588                } else if is_broad_ast {
589                    vec![
590                        "• Add --glob to restrict AST query to specific files: --glob 'src/**/*.rs'",
591                        "• Use --symbols instead (10-100x faster in 95% of cases)",
592                        "• Use --force to bypass this check if you need a full codebase scan"
593                    ]
594                } else if is_keyword_query {
595                    vec![
596                        "• Add a language filter to reduce files scanned: --lang <language>",
597                        "• Add glob patterns to search specific directories: --glob 'src/**/*.rs'",
598                        "• Add --kind to filter to specific symbol types: --kind function",
599                        "• Use a more specific pattern instead of a keyword",
600                        "• Use --force to bypass this check if you need all results"
601                    ]
602                } else {
603                    vec![
604                        "• Add a language filter to reduce candidate set: --lang <language>",
605                        "• Add glob patterns to search specific directories: --glob 'src/**/*.rs'",
606                        "• Use a more specific search pattern",
607                        "• Use --force to bypass this check if you need all results"
608                    ]
609                };
610
611                // Build the command snippet showing current flags
612                let mut cmd_flags = String::new();
613                if filter.symbols_mode {
614                    cmd_flags.push_str("--symbols ");
615                }
616                if let Some(ref lang) = filter.language {
617                    cmd_flags.push_str(&format!("--lang {:?} ", lang));
618                }
619                if let Some(ref kind) = filter.kind {
620                    cmd_flags.push_str(&format!("--kind {:?} ", kind));
621                }
622                if filter.use_ast {
623                    cmd_flags.push_str("--ast ");
624                }
625
626                anyhow::bail!(
627                    "Query too broad - would be expensive to execute\n\
628                     \n\
629                     {}\n\
630                     \n\
631                     This query could:\n\
632                     • Hang for an extended period before returning results\n\
633                     • Return thousands of results\n\
634                     • Flood LLM context windows with excessive data\n\
635                     • Fail entirely\n\
636                     \n\
637                     Suggestions to narrow the query:\n\
638                     {}\n\
639                     \n\
640                     To force execution anyway:\n\
641                     rfx query \"{}\" --force {}",
642                    reason,
643                    suggestions.join("\n             "),
644                    pattern,
645                    cmd_flags
646                );
647            }
648        }
649
650        // DETERMINISTIC SORTING: Sort candidates early for deterministic results
651        // This ensures results are always returned in the same order
652        if filter.symbols_mode || filter.kind.is_some() || filter.use_ast {
653            results.sort_by(|a, b| {
654                a.path.cmp(&b.path)
655                    .then_with(|| a.span.start_line.cmp(&b.span.start_line))
656            });
657
658            // Warn if many candidates need parsing (helps users refine queries)
659            let candidate_count = results.len();
660            if candidate_count > 1000 && !filter.suppress_output {
661                output::warn(&format!(
662                    "Pattern '{}' matched {} files - parsing may take some time. Consider using --file, --glob, or a more specific pattern to narrow the search.",
663                    pattern,
664                    candidate_count
665                ));
666            } else if candidate_count > 100 {
667                log::info!("Parsing {} candidate files for symbol extraction", candidate_count);
668            }
669        }
670
671        // PHASE 2: Enrich with symbol information or AST pattern matching (if needed)
672        if filter.use_ast {
673            // AST pattern matching: Execute Tree-sitter query on candidate files
674            results = self.enrich_with_ast(results, pattern, filter.language)?;
675        } else if filter.symbols_mode || filter.kind.is_some() {
676            // Symbol enrichment: Parse candidate files and extract symbol definitions
677            results = self.enrich_with_symbols(results, pattern, &filter)?;
678        }
679
680        // PHASE 3: Apply post-enrichment filters
681        // Note: Language and glob filters are applied in Phase 1 (before broad query check)
682        // Only kind, file_pattern, and exact filters are applied here
683
684        // Apply kind filter (only relevant for symbol searches)
685        // Special case: --kind function also includes methods (methods are functions in classes)
686        if let Some(ref kind) = filter.kind {
687            results.retain(|r| {
688                if matches!(kind, SymbolKind::Function) {
689                    // When searching for functions, also include methods
690                    matches!(r.kind, SymbolKind::Function | SymbolKind::Method)
691                } else {
692                    r.kind == *kind
693                }
694            });
695        }
696
697        // Apply file path filter (substring match)
698        if let Some(ref file_pattern) = filter.file_pattern {
699            results.retain(|r| r.path.contains(file_pattern));
700        }
701
702        // Apply exact name filter (only for symbol searches)
703        if filter.exact && filter.symbols_mode {
704            results.retain(|r| r.symbol.as_deref() == Some(pattern));
705        }
706
707        // Expand symbol bodies if requested
708        // Works for both symbol-mode and regex searches (if regex matched a symbol definition)
709        if filter.expand {
710            // Load content store to fetch full symbol bodies
711            let content_path = self.cache.path().join("content.bin");
712            if let Ok(content_reader) = ContentReader::open(&content_path) {
713                for result in &mut results {
714                    // Only expand if the result has a meaningful span (not just a single line)
715                    if result.span.start_line < result.span.end_line {
716                        // Find the file_id for this result's path
717                        if let Some(file_id) = Self::find_file_id(&content_reader, &result.path) {
718                            // Fetch the full span content
719                            if let Ok(content) = content_reader.get_file_content(file_id) {
720                                let lines: Vec<&str> = content.lines().collect();
721                                let start_idx = (result.span.start_line as usize).saturating_sub(1);
722                                let end_idx = (result.span.end_line as usize).min(lines.len());
723
724                                if start_idx < end_idx {
725                                    let full_body = lines[start_idx..end_idx].join("\n");
726                                    result.preview = full_body;
727                                }
728                            }
729                        }
730                    }
731                }
732            }
733        }
734
735        // Step 4: Deduplicate by path if paths-only mode
736        if filter.paths_only {
737            use std::collections::HashSet;
738            let mut seen_paths = HashSet::new();
739            results.retain(|r| seen_paths.insert(r.path.clone()));
740        }
741
742        // Step 5: Sort results deterministically (by path, then line number)
743        results.sort_by(|a, b| {
744            a.path.cmp(&b.path)
745                .then_with(|| a.span.start_line.cmp(&b.span.start_line))
746        });
747
748        // Capture total count AFTER all filtering but BEFORE pagination (offset/limit)
749        // This is the total number of results the user can paginate through
750        let total_count = results.len();
751
752        // Step 5.5: Apply offset (pagination)
753        if let Some(offset) = filter.offset {
754            if offset < results.len() {
755                results = results.into_iter().skip(offset).collect();
756            } else {
757                // Offset beyond results - return empty
758                results.clear();
759            }
760        }
761
762        // Step 6: Apply limit
763        if let Some(limit) = filter.limit {
764            results.truncate(limit);
765        }
766
767        log::info!("Query returned {} results (total before pagination: {})", results.len(), total_count);
768
769        Ok((results, total_count))
770    }
771
772    /// Search for symbols by exact name match
773    pub fn find_symbol(&self, name: &str) -> Result<Vec<SearchResult>> {
774        let filter = QueryFilter {
775            symbols_mode: true,
776            ..Default::default()
777        };
778        self.search(name, filter)
779    }
780
781    /// Search using a Tree-sitter AST pattern
782    pub fn search_ast(&self, pattern: &str, lang: Option<Language>) -> Result<Vec<SearchResult>> {
783        let filter = QueryFilter {
784            language: lang,
785            use_ast: true,
786            ..Default::default()
787        };
788
789        self.search(pattern, filter)
790    }
791
792    /// Execute AST query on all indexed files (no trigram filtering)
793    ///
794    /// WARNING: This method scans the entire codebase (500ms-2s+).
795    /// In 95% of cases, use --symbols instead which is 10-100x faster.
796    ///
797    /// # Algorithm
798    /// 1. Get all indexed files for the specified language
799    /// 2. Apply glob/exclude filters to reduce file set
800    /// 3. Load file contents for all matching files
801    /// 4. Execute AST query pattern using Tree-sitter
802    /// 5. Apply remaining filters and return results
803    ///
804    /// # Performance
805    /// - Parses entire codebase (not just trigram candidates)
806    /// - Expected: 500ms-2s for medium codebases, 2-10s for large codebases
807    /// - Use --glob to limit scope for better performance
808    ///
809    /// # Requirements
810    /// - Language must be specified (AST queries are language-specific)
811    /// - AST pattern must be valid S-expression syntax
812    pub fn search_ast_all_files(&self, ast_pattern: &str, filter: QueryFilter) -> Result<Vec<SearchResult>> {
813        log::info!("Executing AST query on all files: pattern='{}', filter={:?}", ast_pattern, filter);
814
815        // Require language for AST queries
816        let lang = filter.language.ok_or_else(|| anyhow::anyhow!(
817            "Language must be specified for AST pattern matching. Use --lang to specify the language.\n\
818             \n\
819             Example: rfx query \"(function_definition) @fn\" --ast --lang python"
820        ))?;
821
822        // Ensure cache exists
823        if !self.cache.exists() {
824            anyhow::bail!(
825                "Index not found. Run 'rfx index' to build the cache first."
826            );
827        }
828
829        // Show non-blocking warnings about branch state and staleness
830        self.check_index_freshness(&filter)?;
831
832        // Load content store
833        let content_path = self.cache.path().join("content.bin");
834        let content_reader = ContentReader::open(&content_path)
835            .context("Failed to open content store")?;
836
837        // Build glob matchers ONCE before file iteration (performance optimization)
838        use globset::{Glob, GlobSetBuilder};
839
840        let include_matcher = if !filter.glob_patterns.is_empty() {
841            let mut builder = GlobSetBuilder::new();
842            for pattern in &filter.glob_patterns {
843                // Normalize pattern to ensure LLM-generated patterns work correctly
844                let normalized = Self::normalize_glob_pattern(pattern);
845                if let Ok(glob) = Glob::new(&normalized) {
846                    builder.add(glob);
847                }
848            }
849            builder.build().ok()
850        } else {
851            None
852        };
853
854        let exclude_matcher = if !filter.exclude_patterns.is_empty() {
855            let mut builder = GlobSetBuilder::new();
856            for pattern in &filter.exclude_patterns {
857                // Normalize pattern to ensure LLM-generated patterns work correctly
858                let normalized = Self::normalize_glob_pattern(pattern);
859                if let Ok(glob) = Glob::new(&normalized) {
860                    builder.add(glob);
861                }
862            }
863            builder.build().ok()
864        } else {
865            None
866        };
867
868        // Get all files matching the language and glob filters
869        let mut candidates: Vec<SearchResult> = Vec::new();
870
871        for file_id in 0..content_reader.file_count() {
872            let file_path = match content_reader.get_file_path(file_id as u32) {
873                Some(p) => p,
874                None => continue,
875            };
876
877            // Detect language from file extension
878            let ext = file_path.extension()
879                .and_then(|e| e.to_str())
880                .unwrap_or("");
881            let detected_lang = Language::from_extension(ext);
882
883            // Filter by language
884            if detected_lang != lang {
885                continue;
886            }
887
888            let file_path_str = file_path.to_string_lossy().to_string();
889
890            // Apply glob/exclude filters BEFORE loading content (performance optimization)
891            let included = include_matcher.as_ref().map_or(true, |m| m.is_match(&file_path_str));
892            let excluded = exclude_matcher.as_ref().map_or(false, |m| m.is_match(&file_path_str));
893
894            if !included || excluded {
895                continue;
896            }
897
898            // Create a dummy candidate for this file (AST query will replace it)
899            candidates.push(SearchResult {
900                path: file_path_str,
901                lang: detected_lang,
902                span: Span { start_line: 1, end_line: 1 },
903                symbol: None,
904                kind: SymbolKind::Unknown("ast_query".to_string()),
905                preview: String::new(),
906                dependencies: None,
907            });
908        }
909
910        log::info!("AST query scanning {} files for language {:?}", candidates.len(), lang);
911
912        // BROAD QUERY DETECTION: Block large AST queries without glob restriction
913        // Allow small codebases (<100 files) but require --glob for larger ones
914        if !filter.force && filter.glob_patterns.is_empty() && candidates.len() >= 100 {
915            anyhow::bail!(
916                "Query too broad - would be expensive to execute\n\
917                 \n\
918                 AST query without --glob restriction will scan the ENTIRE codebase ({} files). AST queries are SLOW (500ms-10s+).\n\
919                 \n\
920                 This query could:\n\
921                 • Hang for an extended period before returning results\n\
922                 • Return thousands of results\n\
923                 • Flood LLM context windows with excessive data\n\
924                 • Fail entirely\n\
925                 \n\
926                 Suggestions to narrow the query:\n\
927                 • Add --glob to restrict AST query to specific files: --glob 'src/**/*.rs'\n\
928                 • Use --symbols instead (10-100x faster in 95% of cases)\n\
929                 • Use --force to bypass this check if you need a full codebase scan\n\
930                 \n\
931                 To force execution anyway:\n\
932                 rfx query \"{}\" --force --ast --lang {:?}",
933                candidates.len(),
934                ast_pattern,
935                lang
936            );
937        }
938
939        if candidates.is_empty() {
940            if !filter.suppress_output {
941                output::warn(&format!("No files found for language {:?}. Check your language filter or glob patterns.", lang));
942            }
943            return Ok(Vec::new());
944        }
945
946        // Execute the AST query on all candidate files
947        // This will load file contents and parse them with tree-sitter
948        let mut results = self.enrich_with_ast(candidates, ast_pattern, filter.language)?;
949
950        log::debug!("AST query found {} matches before filtering", results.len());
951
952        // Apply remaining filters (same as search_internal Phase 3)
953
954        // Apply kind filter
955        if let Some(ref kind) = filter.kind {
956            results.retain(|r| {
957                if matches!(kind, SymbolKind::Function) {
958                    matches!(r.kind, SymbolKind::Function | SymbolKind::Method)
959                } else {
960                    r.kind == *kind
961                }
962            });
963        }
964
965        // Note: exact filter doesn't make sense for AST queries (pattern is S-expression, not symbol name)
966
967        // Expand symbol bodies if requested
968        if filter.expand {
969            let content_path = self.cache.path().join("content.bin");
970            if let Ok(content_reader) = ContentReader::open(&content_path) {
971                for result in &mut results {
972                    if result.span.start_line < result.span.end_line {
973                        if let Some(file_id) = Self::find_file_id(&content_reader, &result.path) {
974                            if let Ok(content) = content_reader.get_file_content(file_id) {
975                                let lines: Vec<&str> = content.lines().collect();
976                                let start_idx = (result.span.start_line as usize).saturating_sub(1);
977                                let end_idx = (result.span.end_line as usize).min(lines.len());
978
979                                if start_idx < end_idx {
980                                    let full_body = lines[start_idx..end_idx].join("\n");
981                                    result.preview = full_body;
982                                }
983                            }
984                        }
985                    }
986                }
987            }
988        }
989
990        // Deduplicate by path if paths-only mode
991        if filter.paths_only {
992            use std::collections::HashSet;
993            let mut seen_paths = HashSet::new();
994            results.retain(|r| seen_paths.insert(r.path.clone()));
995        }
996
997        // Sort results deterministically
998        results.sort_by(|a, b| {
999            a.path.cmp(&b.path)
1000                .then_with(|| a.span.start_line.cmp(&b.span.start_line))
1001        });
1002
1003        // Apply offset (pagination)
1004        if let Some(offset) = filter.offset {
1005            if offset < results.len() {
1006                results = results.into_iter().skip(offset).collect();
1007            } else {
1008                results.clear();
1009            }
1010        }
1011
1012        // Apply limit
1013        if let Some(limit) = filter.limit {
1014            results.truncate(limit);
1015        }
1016
1017        log::info!("AST query returned {} results", results.len());
1018
1019        // Load dependencies if requested
1020        self.load_dependencies(&mut results, filter.include_dependencies)?;
1021
1022        Ok(results)
1023    }
1024
1025    /// Search using AST pattern with separate text pattern for trigram filtering
1026    ///
1027    /// This allows efficient AST queries by:
1028    /// 1. Using text_pattern for Phase 1 trigram filtering (narrows to candidate files)
1029    /// 2. Using ast_pattern for Phase 2 AST matching (structure-aware filtering)
1030    ///
1031    /// # Example
1032    /// ```ignore
1033    /// // Find async functions: trigram search for "fn ", AST match for function_item
1034    /// engine.search_ast_with_text_filter("fn ", "(function_item (async))", filter)?;
1035    /// ```
1036    pub fn search_ast_with_text_filter(
1037        &self,
1038        text_pattern: &str,
1039        ast_pattern: &str,
1040        filter: QueryFilter,
1041    ) -> Result<Vec<SearchResult>> {
1042        log::info!("Executing AST query with text filter: text='{}', ast='{}', filter={:?}",
1043                   text_pattern, ast_pattern, filter);
1044
1045        // Ensure cache exists
1046        if !self.cache.exists() {
1047            anyhow::bail!(
1048                "Index not found. Run 'rfx index' to build the cache first."
1049            );
1050        }
1051
1052        // Show non-blocking warnings about branch state and staleness
1053        self.check_index_freshness(&filter)?;
1054
1055        // Start timeout timer if configured
1056        use std::time::{Duration, Instant};
1057        let start_time = Instant::now();
1058        let timeout = if filter.timeout_secs > 0 {
1059            Some(Duration::from_secs(filter.timeout_secs))
1060        } else {
1061            None
1062        };
1063
1064        // PHASE 1: Get initial candidates using text pattern (trigram search)
1065        let candidates = if filter.use_regex {
1066            self.get_regex_candidates(text_pattern, timeout.as_ref(), &start_time, filter.suppress_output)?
1067        } else {
1068            self.get_trigram_candidates(text_pattern, &filter)?
1069        };
1070
1071        log::debug!("Phase 1 found {} candidate locations", candidates.len());
1072
1073        // PHASE 2: Execute AST query on candidates
1074        let mut results = self.enrich_with_ast(candidates, ast_pattern, filter.language)?;
1075
1076        log::debug!("Phase 2 AST matching found {} results", results.len());
1077
1078        // PHASE 3: Apply filters
1079        if let Some(lang) = filter.language {
1080            results.retain(|r| r.lang == lang);
1081        }
1082
1083        if let Some(ref kind) = filter.kind {
1084            results.retain(|r| {
1085                if matches!(kind, SymbolKind::Function) {
1086                    matches!(r.kind, SymbolKind::Function | SymbolKind::Method)
1087                } else {
1088                    r.kind == *kind
1089                }
1090            });
1091        }
1092
1093        if let Some(ref file_pattern) = filter.file_pattern {
1094            results.retain(|r| r.path.contains(file_pattern));
1095        }
1096
1097        // Apply glob pattern filters (same logic as in search_internal)
1098        if !filter.glob_patterns.is_empty() || !filter.exclude_patterns.is_empty() {
1099            use globset::{Glob, GlobSetBuilder};
1100
1101            let include_matcher = if !filter.glob_patterns.is_empty() {
1102                let mut builder = GlobSetBuilder::new();
1103                for pattern in &filter.glob_patterns {
1104                    // Normalize pattern to ensure LLM-generated patterns work correctly
1105                    let normalized = Self::normalize_glob_pattern(pattern);
1106                    if let Ok(glob) = Glob::new(&normalized) {
1107                        builder.add(glob);
1108                    }
1109                }
1110                builder.build().ok()
1111            } else {
1112                None
1113            };
1114
1115            let exclude_matcher = if !filter.exclude_patterns.is_empty() {
1116                let mut builder = GlobSetBuilder::new();
1117                for pattern in &filter.exclude_patterns {
1118                    // Normalize pattern to ensure LLM-generated patterns work correctly
1119                    let normalized = Self::normalize_glob_pattern(pattern);
1120                    if let Ok(glob) = Glob::new(&normalized) {
1121                        builder.add(glob);
1122                    }
1123                }
1124                builder.build().ok()
1125            } else {
1126                None
1127            };
1128
1129            results.retain(|r| {
1130                let included = include_matcher.as_ref().map_or(true, |m| m.is_match(&r.path));
1131                let excluded = exclude_matcher.as_ref().map_or(false, |m| m.is_match(&r.path));
1132                included && !excluded
1133            });
1134        }
1135
1136        if filter.exact && filter.symbols_mode {
1137            results.retain(|r| r.symbol.as_deref() == Some(text_pattern));
1138        }
1139
1140        // Expand symbol bodies if requested
1141        if filter.expand {
1142            let content_path = self.cache.path().join("content.bin");
1143            if let Ok(content_reader) = ContentReader::open(&content_path) {
1144                for result in &mut results {
1145                    if result.span.start_line < result.span.end_line {
1146                        if let Some(file_id) = Self::find_file_id(&content_reader, &result.path) {
1147                            if let Ok(content) = content_reader.get_file_content(file_id) {
1148                                let lines: Vec<&str> = content.lines().collect();
1149                                let start_idx = (result.span.start_line as usize).saturating_sub(1);
1150                                let end_idx = (result.span.end_line as usize).min(lines.len());
1151
1152                                if start_idx < end_idx {
1153                                    let full_body = lines[start_idx..end_idx].join("\n");
1154                                    result.preview = full_body;
1155                                }
1156                            }
1157                        }
1158                    }
1159                }
1160            }
1161        }
1162
1163        // Sort results deterministically
1164        results.sort_by(|a, b| {
1165            a.path.cmp(&b.path)
1166                .then_with(|| a.span.start_line.cmp(&b.span.start_line))
1167        });
1168
1169        // Apply offset (pagination)
1170        if let Some(offset) = filter.offset {
1171            if offset < results.len() {
1172                results = results.into_iter().skip(offset).collect();
1173            } else {
1174                results.clear();
1175            }
1176        }
1177
1178        // Apply limit
1179        if let Some(limit) = filter.limit {
1180            results.truncate(limit);
1181        }
1182
1183        log::info!("AST query returned {} results", results.len());
1184
1185        Ok(results)
1186    }
1187
1188    /// List all symbols of a specific kind
1189    pub fn list_by_kind(&self, kind: SymbolKind) -> Result<Vec<SearchResult>> {
1190        let filter = QueryFilter {
1191            kind: Some(kind),
1192            symbols_mode: true,
1193            ..Default::default()
1194        };
1195
1196        self.search("*", filter)
1197    }
1198
1199    /// Enrich text match candidates with symbol information by parsing files
1200    ///
1201    /// Takes a list of text match candidates and extracts symbol information at those locations.
1202    ///
1203    /// # Algorithm
1204    /// 1. Group candidates by file_id for efficient processing
1205    /// 2. Parse each file with tree-sitter to extract ALL symbols
1206    /// 3. Filter symbols based on matching strategy:
1207    ///    - If use_regex=true: Extract symbols whose line spans overlap with candidate locations
1208    ///    - If use_contains=true: Filter symbols by substring match on symbol name
1209    ///    - Default: Filter symbols by exact name match
1210    /// 4. Return filtered symbol results
1211    ///
1212    /// # Performance
1213    /// Only parses files that have text matches, so typically 10-100 files
1214    /// instead of the entire codebase (62K+ files).
1215    ///
1216    /// # Optimizations
1217    /// 1. Language filtering: Skips files with unsupported languages (no parsers)
1218    /// 2. Parallel processing: Uses Rayon to parse files concurrently across CPU cores
1219    fn enrich_with_symbols(&self, candidates: Vec<SearchResult>, pattern: &str, filter: &QueryFilter) -> Result<Vec<SearchResult>> {
1220        // Load content store for file reading
1221        let content_path = self.cache.path().join("content.bin");
1222        let content_reader = ContentReader::open(&content_path)
1223            .context("Failed to open content store")?;
1224
1225        // Load trigram index for file path lookups
1226        let trigrams_path = self.cache.path().join("trigrams.bin");
1227        let trigram_index = if trigrams_path.exists() {
1228            TrigramIndex::load(&trigrams_path)?
1229        } else {
1230            Self::rebuild_trigram_index(&content_reader)?
1231        };
1232
1233        // Open symbol cache for reading cached symbols
1234        let symbol_cache = crate::symbol_cache::SymbolCache::open(self.cache.path())
1235            .context("Failed to open symbol cache")?;
1236
1237        // Load file hashes for current branch for cache lookups
1238        let root = self.cache.workspace_root();
1239        let branch = crate::git::get_current_branch(&root)
1240            .unwrap_or_else(|_| "_default".to_string());
1241        let file_hashes = self.cache.load_hashes_for_branch(&branch)
1242            .context("Failed to load file hashes")?;
1243        log::debug!("Loaded {} file hashes for branch '{}' for symbol cache lookups", file_hashes.len(), branch);
1244
1245        // Group candidates by file, filtering out unsupported languages
1246        use std::collections::HashMap;
1247        let mut files_by_path: HashMap<String, Vec<SearchResult>> = HashMap::new();
1248        let mut skipped_unsupported = 0;
1249
1250        for candidate in candidates {
1251            // Skip files with unsupported languages (no parser available)
1252            if !candidate.lang.is_supported() {
1253                skipped_unsupported += 1;
1254                continue;
1255            }
1256
1257            files_by_path
1258                .entry(candidate.path.clone())
1259                .or_insert_with(Vec::new)
1260                .push(candidate);
1261        }
1262
1263        let total_files = files_by_path.len();
1264        log::debug!("Processing {} candidate files for symbol enrichment (skipped {} unsupported language files)",
1265                   total_files, skipped_unsupported);
1266
1267        // Warn if pattern is very broad (may take time to parse all files)
1268        if total_files > 1000 && !filter.suppress_output {
1269            output::warn(&format!(
1270                "Pattern '{}' matched {} files. This may take some time to parse. Consider using a more specific pattern or adding --lang/--file filters to narrow the search.",
1271                pattern,
1272                total_files
1273            ));
1274        }
1275
1276        // Convert to vec for parallel processing
1277        let mut files_to_process: Vec<String> = files_by_path.keys().cloned().collect();
1278
1279        // PHASE 2a: Line-based pre-filtering (skip files where ALL matches are in comments/strings)
1280        // This reduces tree-sitter parsing workload by 2-5x for most queries
1281        let mut files_to_skip: std::collections::HashSet<String> = std::collections::HashSet::new();
1282
1283        for file_path in &files_to_process {
1284            // Get the language for this file
1285            let ext = std::path::Path::new(file_path)
1286                .extension()
1287                .and_then(|e| e.to_str())
1288                .unwrap_or("");
1289            let lang = Language::from_extension(ext);
1290
1291            // Get line filter for this language (if available)
1292            if let Some(line_filter) = crate::line_filter::get_filter(lang) {
1293                // Find file_id for this path
1294                let file_id = match Self::find_file_id_by_path(&content_reader, &trigram_index, file_path) {
1295                    Some(id) => id,
1296                    None => continue,
1297                };
1298
1299                // Load file content
1300                let content = match content_reader.get_file_content(file_id) {
1301                    Ok(c) => c,
1302                    Err(_) => continue,
1303                };
1304
1305                // Check if ALL pattern occurrences are in comments/strings
1306                let mut all_in_non_code = true;
1307                for line in content.lines() {
1308                    // Find all occurrences of the pattern in this line
1309                    let mut search_start = 0;
1310                    while let Some(pos) = line[search_start..].find(pattern) {
1311                        let absolute_pos = search_start + pos;
1312
1313                        // Check if this occurrence is in code (not comment/string)
1314                        let in_comment = line_filter.is_in_comment(line, absolute_pos);
1315                        let in_string = line_filter.is_in_string(line, absolute_pos);
1316
1317                        if !in_comment && !in_string {
1318                            // Found at least one occurrence in actual code
1319                            all_in_non_code = false;
1320                            break;
1321                        }
1322
1323                        search_start = absolute_pos + pattern.len();
1324                    }
1325
1326                    if !all_in_non_code {
1327                        break;
1328                    }
1329                }
1330
1331                // If ALL occurrences are in comments/strings, skip this file
1332                if all_in_non_code {
1333                    // Double-check: make sure there was at least one occurrence
1334                    if content.contains(pattern) {
1335                        files_to_skip.insert(file_path.clone());
1336                        log::debug!("Pre-filter: Skipping {} (all matches in comments/strings)", file_path);
1337                    }
1338                }
1339            }
1340        }
1341
1342        // Filter out files we're skipping
1343        files_to_process.retain(|path| !files_to_skip.contains(path));
1344
1345        log::debug!("Pre-filter: Skipped {} files where all matches are in comments/strings (parsing {} files)",
1346                   files_to_skip.len(), files_to_process.len());
1347
1348        // Configure thread pool for parallel processing (use 80% of available cores, capped at 8)
1349        let num_threads = {
1350            let available_cores = std::thread::available_parallelism()
1351                .map(|n| n.get())
1352                .unwrap_or(4);
1353            // Use 80% of available cores (minimum 1, maximum 8) to avoid locking the system
1354            // Cap at 8 to prevent diminishing returns from cache contention on high-core systems
1355            ((available_cores as f64 * 0.8).ceil() as usize).max(1).min(8)
1356        };
1357
1358        log::debug!("Using {} threads for parallel symbol extraction (out of {} available cores)",
1359                   num_threads,
1360                   std::thread::available_parallelism().map(|n| n.get()).unwrap_or(4));
1361
1362        // Build a custom thread pool with limited threads
1363        let pool = rayon::ThreadPoolBuilder::new()
1364            .num_threads(num_threads)
1365            .build()
1366            .context("Failed to create thread pool for symbol extraction")?;
1367
1368        // OPTIMIZATION: Batch read all cached symbols in ONE database transaction
1369        // This is 10-30x faster than calling get() individually for each file
1370
1371        // Step 1: Collect file paths that have hashes
1372        let files_with_hashes: Vec<String> = files_to_process
1373            .iter()
1374            .filter(|path| file_hashes.contains_key(path.as_str()))
1375            .cloned()
1376            .collect();
1377
1378        // Step 2: Batch lookup file_ids for all paths
1379        let file_id_map = self.cache.batch_get_file_ids(&files_with_hashes)
1380            .context("Failed to batch lookup file IDs")?;
1381
1382        // Step 3: Build (file_id, hash, path) tuples for batch_get_with_kind
1383        let file_lookup_tuples: Vec<(i64, String, String)> = files_with_hashes
1384            .iter()
1385            .filter_map(|path| {
1386                let file_id = file_id_map.get(path)?;
1387                let hash = file_hashes.get(path.as_str())?;
1388                Some((*file_id, hash.clone(), path.clone()))
1389            })
1390            .collect();
1391
1392        // Step 4: Batch read symbols with kind filtering (uses junction table + integer joins)
1393        let batch_results = symbol_cache.batch_get_with_kind(&file_lookup_tuples, filter.kind.clone())
1394            .context("Failed to batch read symbol cache")?;
1395
1396        // Step 5: Separate files into cached vs need-to-parse
1397        let mut cached_symbols: HashMap<String, Vec<SearchResult>> = HashMap::new();
1398        let mut files_needing_parse: Vec<String> = Vec::new();
1399
1400        // Build path lookup from file_id
1401        let id_to_path: HashMap<i64, String> = file_id_map
1402            .iter()
1403            .map(|(path, id)| (*id, path.clone()))
1404            .collect();
1405
1406        // Process cached results
1407        for (file_id, symbols) in batch_results {
1408            if let Some(file_path) = id_to_path.get(&file_id) {
1409                cached_symbols.insert(file_path.clone(), symbols);
1410            }
1411        }
1412
1413        // Files with hashes but not in cache results need parsing
1414        for path in &files_with_hashes {
1415            if file_id_map.contains_key(path) && !cached_symbols.contains_key(path) {
1416                files_needing_parse.push(path.clone());
1417            }
1418        }
1419
1420        // Add files without hashes to parse list
1421        for file_path in &files_to_process {
1422            if !file_hashes.contains_key(file_path.as_str()) {
1423                files_needing_parse.push(file_path.clone());
1424            }
1425        }
1426
1427        log::debug!(
1428            "Symbol cache: {} hits, {} need parsing",
1429            cached_symbols.len(),
1430            files_needing_parse.len()
1431        );
1432
1433        // Parse files in parallel using custom thread pool (only cache misses)
1434        use rayon::prelude::*;
1435
1436        let parsed_symbols: Vec<SearchResult> = pool.install(|| {
1437            files_needing_parse
1438                .par_iter()
1439                .flat_map(|file_path| {
1440                // Find file_id for this path
1441                let file_id = match Self::find_file_id_by_path(&content_reader, &trigram_index, file_path) {
1442                    Some(id) => id,
1443                    None => {
1444                        log::warn!("Could not find file_id for path: {}", file_path);
1445                        return Vec::new();
1446                    }
1447                };
1448
1449                let content = match content_reader.get_file_content(file_id) {
1450                    Ok(c) => c,
1451                    Err(e) => {
1452                        log::warn!("Failed to read file {}: {}", file_path, e);
1453                        return Vec::new();
1454                    }
1455                };
1456
1457                // Detect language
1458                let ext = std::path::Path::new(file_path)
1459                    .extension()
1460                    .and_then(|e| e.to_str())
1461                    .unwrap_or("");
1462                let lang = Language::from_extension(ext);
1463
1464                // Parse file to extract symbols
1465                let symbols = match ParserFactory::parse(file_path, content, lang) {
1466                    Ok(symbols) => {
1467                        log::debug!("Parsed {} symbols from {}", symbols.len(), file_path);
1468                        symbols
1469                    }
1470                    Err(e) => {
1471                        log::debug!("Failed to parse {}: {}", file_path, e);
1472                        Vec::new()
1473                    }
1474                };
1475
1476                // Cache the parsed symbols (ignore errors - caching is best-effort)
1477                if let Some(file_hash) = file_hashes.get(file_path.as_str()) {
1478                    if let Err(e) = symbol_cache.set(file_path, file_hash, &symbols) {
1479                        log::debug!("Failed to cache symbols for {}: {}", file_path, e);
1480                    }
1481                }
1482
1483                symbols
1484            })
1485            .collect()
1486        });
1487
1488        // Combine cached and parsed symbols
1489        let mut all_symbols: Vec<SearchResult> = Vec::new();
1490
1491        // Add all cached symbols
1492        for symbols in cached_symbols.values() {
1493            all_symbols.extend_from_slice(symbols);
1494        }
1495
1496        // Add all parsed symbols
1497        all_symbols.extend(parsed_symbols);
1498
1499        // KEYWORD DETECTION: Check if pattern is a language keyword (e.g., "class", "function")
1500        // If it matches a keyword AND symbols_mode is true, interpret as "list all symbols of that type"
1501        // rather than looking for a symbol literally named "class" or "function"
1502        //
1503        // IMPORTANT: Only check keywords for languages that will pass Phase 3 filtering.
1504        // If a language filter is specified, only check that language's keywords.
1505        // Otherwise, check all languages present in the symbol results.
1506        let is_keyword_query = {
1507            // Determine which language to check keywords for
1508            let lang_to_check = if let Some(lang) = filter.language {
1509                // Language filter specified - check that language only
1510                // This ensures keyword detection aligns with Phase 3 language filtering
1511                vec![lang]
1512            } else {
1513                // No language filter - check all languages that appear in the actual symbols
1514                // (not candidates, but the parsed symbols that made it through)
1515                // This handles mixed-language codebases correctly
1516                let mut langs: Vec<Language> = all_symbols.iter()
1517                    .map(|s| s.lang)
1518                    .collect::<Vec<_>>();
1519                langs.sort_by(|a, b| format!("{:?}", a).cmp(&format!("{:?}", b))); // Deterministic ordering
1520                langs.dedup(); // Remove duplicates after sorting
1521                langs
1522            };
1523
1524            // Check if pattern matches a keyword in any of the relevant languages
1525            lang_to_check.iter().any(|lang| {
1526                ParserFactory::get_keywords(*lang).contains(&pattern)
1527            })
1528        };
1529
1530        // If pattern is a keyword (like "class" or "function"), skip name-based filtering
1531        // and return all symbols (kind filtering happens in Phase 3)
1532        let filtered: Vec<SearchResult> = if is_keyword_query {
1533            log::info!("Pattern '{}' is a language keyword - listing all symbols (kind filtering will be applied in Phase 3)", pattern);
1534            all_symbols
1535        } else if filter.use_regex {
1536            // For regex queries, candidates already matched content via regex in Phase 1.
1537            // Extract symbols whose line spans overlap with the candidate locations.
1538            // This ensures symbols are found at the locations where the regex matched.
1539
1540            // Build a map of (file_path, line_no) from candidates
1541            use std::collections::{HashMap, HashSet};
1542            let mut candidate_lines: HashMap<String, HashSet<usize>> = HashMap::new();
1543            for candidate in &files_by_path {
1544                for cand in candidate.1 {
1545                    candidate_lines
1546                        .entry(candidate.0.clone())
1547                        .or_insert_with(HashSet::new)
1548                        .insert(cand.span.start_line);
1549                }
1550            }
1551
1552            // Filter symbols whose spans overlap with candidate lines
1553            all_symbols
1554                .into_iter()
1555                .filter(|sym| {
1556                    if let Some(lines) = candidate_lines.get(&sym.path) {
1557                        // Check if symbol's line span overlaps with any candidate line
1558                        for line in sym.span.start_line..=sym.span.end_line {
1559                            if lines.contains(&line) {
1560                                return true;
1561                            }
1562                        }
1563                    }
1564                    false
1565                })
1566                .collect()
1567        } else if filter.use_contains {
1568            // Substring match (opt-in with --contains)
1569            all_symbols
1570                .into_iter()
1571                .filter(|sym| sym.symbol.as_deref().map_or(false, |s| s.contains(pattern)))
1572                .collect()
1573        } else {
1574            // Exact match (default)
1575            all_symbols
1576                .into_iter()
1577                .filter(|sym| sym.symbol.as_deref().map_or(false, |s| s == pattern))
1578                .collect()
1579        };
1580
1581        log::info!("Symbol enrichment found {} matches for pattern '{}'", filtered.len(), pattern);
1582
1583        Ok(filtered)
1584    }
1585
1586    /// Enrich text match candidates with AST pattern matching
1587    ///
1588    /// Takes a list of text match candidates and executes a Tree-sitter AST query
1589    /// on the candidate files, returning only matches that satisfy the AST pattern.
1590    ///
1591    /// # Algorithm
1592    /// 1. Extract unique file paths from candidates
1593    /// 2. Load file contents for each candidate file
1594    /// 3. Execute AST query pattern using Tree-sitter
1595    /// 4. Return AST matches
1596    ///
1597    /// # Performance
1598    /// Only parses files that have text matches, so typically 10-100 files
1599    /// instead of the entire codebase (62K+ files).
1600    ///
1601    /// # Requirements
1602    /// - Language must be specified (AST queries are language-specific)
1603    /// - AST pattern must be valid S-expression syntax
1604    fn enrich_with_ast(&self, candidates: Vec<SearchResult>, ast_pattern: &str, language: Option<Language>) -> Result<Vec<SearchResult>> {
1605        // Require language for AST queries
1606        let lang = language.ok_or_else(|| anyhow::anyhow!(
1607            "Language must be specified for AST pattern matching. Use --lang to specify the language."
1608        ))?;
1609
1610        // Load content store for file reading
1611        let content_path = self.cache.path().join("content.bin");
1612        let content_reader = ContentReader::open(&content_path)
1613            .context("Failed to open content store")?;
1614
1615        // Load trigram index for file path lookups
1616        let trigrams_path = self.cache.path().join("trigrams.bin");
1617        let trigram_index = if trigrams_path.exists() {
1618            TrigramIndex::load(&trigrams_path)?
1619        } else {
1620            Self::rebuild_trigram_index(&content_reader)?
1621        };
1622
1623        // Collect unique file paths from candidates and load their contents
1624        use std::collections::HashMap;
1625        let mut file_contents: HashMap<String, String> = HashMap::new();
1626
1627        for candidate in &candidates {
1628            if file_contents.contains_key(&candidate.path) {
1629                continue;
1630            }
1631
1632            // Find file_id for this path
1633            let file_id = match Self::find_file_id_by_path(&content_reader, &trigram_index, &candidate.path) {
1634                Some(id) => id,
1635                None => {
1636                    log::warn!("Could not find file_id for path: {}", candidate.path);
1637                    continue;
1638                }
1639            };
1640
1641            // Load file content
1642            let content = match content_reader.get_file_content(file_id) {
1643                Ok(c) => c,
1644                Err(e) => {
1645                    log::warn!("Failed to read file {}: {}", candidate.path, e);
1646                    continue;
1647                }
1648            };
1649
1650            file_contents.insert(candidate.path.clone(), content.to_string());
1651        }
1652
1653        log::debug!("Executing AST query on {} candidate files with language {:?}", file_contents.len(), lang);
1654
1655        // Execute AST query using the ast_query module
1656        let results = crate::ast_query::execute_ast_query(candidates, ast_pattern, lang, &file_contents)?;
1657
1658        log::info!("AST query found {} matches for pattern '{}'", results.len(), ast_pattern);
1659
1660        Ok(results)
1661    }
1662
1663    /// Helper to find file_id by path string
1664    fn find_file_id_by_path(
1665        content_reader: &ContentReader,
1666        trigram_index: &TrigramIndex,
1667        target_path: &str,
1668    ) -> Option<u32> {
1669        // Try trigram index first (faster)
1670        for file_id in 0..trigram_index.file_count() {
1671            if let Some(path) = trigram_index.get_file(file_id as u32) {
1672                if path.to_string_lossy() == target_path {
1673                    return Some(file_id as u32);
1674                }
1675            }
1676        }
1677
1678        // Fallback to content reader
1679        for file_id in 0..content_reader.file_count() {
1680            if let Some(path) = content_reader.get_file_path(file_id as u32) {
1681                if path.to_string_lossy() == target_path {
1682                    return Some(file_id as u32);
1683                }
1684            }
1685        }
1686
1687        None
1688    }
1689
    /// Map keyword patterns to SymbolKind for auto-inference
    ///
    /// When users search for keywords like "class" or "function" with --symbols,
    /// automatically infer the kind filter to return only symbols of that type.
    ///
    /// This makes keyword queries more intuitive: searching for "class" returns
    /// only classes, not all symbols.
    ///
    /// This is a thin wrapper: `keyword` is forwarded unchanged to
    /// `filter::keyword_to_kind` and that function's result is returned as-is,
    /// so the actual keyword table lives in the `filter` module.
    fn keyword_to_kind(keyword: &str) -> Option<SymbolKind> {
        filter::keyword_to_kind(keyword)
    }
1700
1701    /// Get all files matching the language filter (for keyword queries)
1702    ///
1703    /// This method bypasses trigram search and returns ALL files of the specified language.
1704    /// Used for keyword queries like "list all classes" where we need complete coverage,
1705    /// not just the first 100 candidates from a trigram search.
1706    ///
1707    /// Similar to `search_ast_all_files()` but works for symbol queries instead of AST queries.
1708    fn get_all_language_files(&self, filter: &QueryFilter) -> Result<Vec<SearchResult>> {
1709        // Language filter is optional - if not specified, scan all files
1710        // If specified, only scan files of that language
1711
1712        // Load content store
1713        let content_path = self.cache.path().join("content.bin");
1714        let content_reader = ContentReader::open(&content_path)
1715            .context("Failed to open content store")?;
1716
1717        // Build glob matchers if specified (for filtering)
1718        use globset::{Glob, GlobSetBuilder};
1719
1720        let include_matcher = if !filter.glob_patterns.is_empty() {
1721            let mut builder = GlobSetBuilder::new();
1722            for pattern in &filter.glob_patterns {
1723                let normalized = Self::normalize_glob_pattern(pattern);
1724                if let Ok(glob) = Glob::new(&normalized) {
1725                    builder.add(glob);
1726                }
1727            }
1728            builder.build().ok()
1729        } else {
1730            None
1731        };
1732
1733        let exclude_matcher = if !filter.exclude_patterns.is_empty() {
1734            let mut builder = GlobSetBuilder::new();
1735            for pattern in &filter.exclude_patterns {
1736                let normalized = Self::normalize_glob_pattern(pattern);
1737                if let Ok(glob) = Glob::new(&normalized) {
1738                    builder.add(glob);
1739                }
1740            }
1741            builder.build().ok()
1742        } else {
1743            None
1744        };
1745
1746        // Scan all files and filter by language + glob patterns
1747        let mut candidates: Vec<SearchResult> = Vec::new();
1748
1749        for file_id in 0..content_reader.file_count() {
1750            let file_path = match content_reader.get_file_path(file_id as u32) {
1751                Some(p) => p,
1752                None => continue,
1753            };
1754
1755            // Detect language from file extension
1756            let ext = file_path.extension()
1757                .and_then(|e| e.to_str())
1758                .unwrap_or("");
1759            let detected_lang = Language::from_extension(ext);
1760
1761            // Filter by language (if specified)
1762            if let Some(lang) = filter.language {
1763                if detected_lang != lang {
1764                    continue;
1765                }
1766            }
1767
1768            let file_path_str = file_path.to_string_lossy().to_string();
1769
1770            // Apply glob/exclude filters
1771            let included = include_matcher.as_ref().map_or(true, |m| m.is_match(&file_path_str));
1772            let excluded = exclude_matcher.as_ref().map_or(false, |m| m.is_match(&file_path_str));
1773
1774            if !included || excluded {
1775                continue;
1776            }
1777
1778            // Apply file path filter if specified
1779            if let Some(ref file_pattern) = filter.file_pattern {
1780                if !file_path_str.contains(file_pattern) {
1781                    continue;
1782                }
1783            }
1784
1785            // Create a dummy candidate for this file
1786            // Phase 2 (symbol enrichment) will parse it and extract actual symbols
1787            candidates.push(SearchResult {
1788                path: file_path_str,
1789                lang: detected_lang,
1790                span: Span { start_line: 1, end_line: 1 },
1791                symbol: None,
1792                kind: SymbolKind::Unknown("keyword_query".to_string()),
1793                preview: String::new(),
1794                dependencies: None,
1795            });
1796        }
1797
1798        if let Some(lang) = filter.language {
1799            log::info!("Keyword query will scan {} {:?} files for symbol extraction", candidates.len(), lang);
1800        } else {
1801            log::info!("Keyword query will scan {} files (all languages) for symbol extraction", candidates.len());
1802        }
1803
1804        Ok(candidates)
1805    }
1806
1807    /// Get candidate results using trigram-based full-text search
1808    fn get_trigram_candidates(&self, pattern: &str, filter: &QueryFilter) -> Result<Vec<SearchResult>> {
1809        // Load content store
1810        let content_path = self.cache.path().join("content.bin");
1811        let content_reader = ContentReader::open(&content_path)
1812            .context("Failed to open content store")?;
1813
1814        // Load trigram index from disk (or rebuild if missing)
1815        let trigrams_path = self.cache.path().join("trigrams.bin");
1816        let trigram_index = if trigrams_path.exists() {
1817            match TrigramIndex::load(&trigrams_path) {
1818                Ok(index) => {
1819                    log::debug!("Loaded trigram index from disk: {} trigrams, {} files",
1820                               index.trigram_count(), index.file_count());
1821                    index
1822                }
1823                Err(e) => {
1824                    log::warn!("Failed to load trigram index from disk: {}", e);
1825                    log::warn!("Rebuilding trigram index from content store...");
1826                    Self::rebuild_trigram_index(&content_reader)?
1827                }
1828            }
1829        } else {
1830            log::debug!("trigrams.bin not found, rebuilding from content store");
1831            Self::rebuild_trigram_index(&content_reader)?
1832        };
1833
1834        // Search using trigrams
1835        let candidates = trigram_index.search(pattern);
1836        log::debug!("Found {} candidate locations from trigram search", candidates.len());
1837
1838        // Clone pattern to owned String for thread safety
1839        let pattern_owned = pattern.to_string();
1840
1841        // Compile regex once if in regex mode (before parallel processing for efficiency)
1842        let compiled_regex = if filter.use_regex {
1843            match Regex::new(&pattern_owned) {
1844                Ok(re) => Some(re),
1845                Err(e) => {
1846                    log::error!("Invalid regex pattern '{}': {}", pattern_owned, e);
1847                    anyhow::bail!("Invalid regex pattern '{}': {}", pattern_owned, e);
1848                }
1849            }
1850        } else {
1851            None
1852        };
1853
1854        // Group candidates by file for efficient processing
1855        use std::collections::HashMap;
1856        let mut candidates_by_file: HashMap<u32, Vec<crate::trigram::FileLocation>> = HashMap::new();
1857        for loc in candidates {
1858            candidates_by_file
1859                .entry(loc.file_id)
1860                .or_insert_with(Vec::new)
1861                .push(loc);
1862        }
1863
1864        log::debug!("Scanning {} files with trigram matches", candidates_by_file.len());
1865
1866        // Process files in parallel using rayon
1867        use rayon::prelude::*;
1868
1869        let results: Vec<SearchResult> = candidates_by_file
1870            .par_iter()
1871            .flat_map(|(file_id, locations)| {
1872                // Get file metadata
1873                let file_path = match trigram_index.get_file(*file_id) {
1874                    Some(p) => p,
1875                    None => return Vec::new(),
1876                };
1877
1878                let content = match content_reader.get_file_content(*file_id) {
1879                    Ok(c) => c,
1880                    Err(_) => return Vec::new(),
1881                };
1882
1883                let file_path_str = file_path.to_string_lossy().to_string();
1884
1885                // Detect language once per file
1886                let ext = file_path.extension()
1887                    .and_then(|e| e.to_str())
1888                    .unwrap_or("");
1889                let lang = Language::from_extension(ext);
1890
1891                // Split content into lines once
1892                let lines: Vec<&str> = content.lines().collect();
1893
1894                // Use a HashSet to deduplicate results by line number
1895                let mut seen_lines: std::collections::HashSet<usize> = std::collections::HashSet::new();
1896                let mut file_results = Vec::new();
1897
1898                // Only check the specific lines indicated by trigram posting lists
1899                for loc in locations {
1900                    let line_no = loc.line_no as usize;
1901
1902                    // Skip if we've already processed this line
1903                    if seen_lines.contains(&line_no) {
1904                        continue;
1905                    }
1906
1907                    // Bounds check
1908                    if line_no == 0 || line_no > lines.len() {
1909                        log::debug!("Line {} out of bounds (file has {} lines)", line_no, lines.len());
1910                        continue;
1911                    }
1912
1913                    let line = lines[line_no - 1];
1914
1915                    // Apply matching strategy based on filter mode:
1916                    // - Default: Word-boundary matching (restrictive - finds whole identifiers)
1917                    // - --contains: Substring matching (expansive - finds pattern anywhere)
1918                    // - --regex: Actual regex matching (controlled by pattern itself)
1919                    let line_matches = if filter.use_regex {
1920                        // Regex matching - use pre-compiled regex for efficiency
1921                        // The regex was compiled once outside the parallel loop
1922                        compiled_regex.as_ref()
1923                            .map(|re| re.is_match(line))
1924                            .unwrap_or(false)
1925                    } else if filter.use_contains {
1926                        // Substring matching (expansive)
1927                        line.contains(&pattern_owned)
1928                    } else {
1929                        // Word-boundary matching (restrictive, default)
1930                        Self::has_word_boundary_match(line, &pattern_owned)
1931                    };
1932
1933                    if !line_matches {
1934                        continue;
1935                    }
1936
1937                    seen_lines.insert(line_no);
1938
1939                    // Create a text match result (no symbol lookup for performance)
1940                    file_results.push(SearchResult {
1941                        path: file_path_str.clone(),
1942                        lang: lang.clone(),
1943                        kind: SymbolKind::Unknown("text_match".to_string()),
1944                        symbol: None,  // No symbol name for text matches (avoid duplication)
1945                        span: Span {
1946                            start_line: line_no,
1947                            end_line: line_no,
1948                        },
1949                        preview: line.to_string(),
1950                        dependencies: None,
1951                    });
1952                }
1953
1954                file_results
1955            })
1956            .collect();
1957
1958        Ok(results)
1959    }
1960
1961    /// Get candidate results using regex patterns with trigram optimization
1962    ///
1963    /// # Algorithm
1964    ///
1965    /// 1. Extract literal sequences from the regex pattern (≥3 chars)
1966    /// 2. If literals found: search for files containing ANY of the literals (UNION)
1967    /// 3. If no literals: fall back to full content scan
1968    /// 4. Compile regex and verify matches in candidate files
1969    /// 5. Return matching results with context
1970    ///
1971    /// # File Selection Strategy
1972    ///
1973    /// Uses UNION of files containing any literal (conservative approach):
1974    /// - For alternation patterns `(a|b)`: Correctly searches files with a OR b
1975    /// - For sequential patterns `a.*b`: Searches files with a OR b (may include extra files)
1976    /// - Trade-off: Ensures correctness at the cost of scanning 2-3x more files for sequential patterns
1977    /// - Performance impact is minimal due to memory-mapped I/O (<5ms overhead typically)
1978    ///
1979    /// # Performance
1980    ///
1981    /// - Best case (pattern with literals): <20ms (trigram optimization)
1982    /// - Typical case (alternation/sequential): 5-15ms on small codebases (<100 files)
1983    /// - Worst case (no literals like `.*`): ~100ms (full scan)
1984    fn get_regex_candidates(&self, pattern: &str, timeout: Option<&std::time::Duration>, start_time: &std::time::Instant, suppress_output: bool) -> Result<Vec<SearchResult>> {
1985        // Step 1: Compile the regex
1986        let regex = Regex::new(pattern)
1987            .with_context(|| format!("Invalid regex pattern: {}", pattern))?;
1988
1989        // Check timeout before expensive operations
1990        if let Some(timeout_duration) = timeout {
1991            if start_time.elapsed() > *timeout_duration {
1992                anyhow::bail!(
1993                    "Query timeout exceeded ({} seconds) during regex compilation",
1994                    timeout_duration.as_secs()
1995                );
1996            }
1997        }
1998
1999        // Step 2: Extract trigrams from regex
2000        let trigrams = extract_trigrams_from_regex(pattern);
2001
2002        // Load content store
2003        let content_path = self.cache.path().join("content.bin");
2004        let content_reader = ContentReader::open(&content_path)
2005            .context("Failed to open content store")?;
2006
2007        let mut results = Vec::new();
2008
2009        if trigrams.is_empty() {
2010            // No trigrams - fall back to full scan
2011            if !suppress_output {
2012                output::warn(&format!(
2013                    "Regex pattern '{}' has no literals (≥3 chars), falling back to full content scan. This may be slow on large codebases. Consider using patterns with literal text.",
2014                    pattern
2015                ));
2016            }
2017
2018            // Scan all files
2019            for file_id in 0..content_reader.file_count() {
2020                let file_path = content_reader.get_file_path(file_id as u32)
2021                    .context("Invalid file_id")?;
2022                let content = content_reader.get_file_content(file_id as u32)?;
2023
2024                self.find_regex_matches_in_file(
2025                    &regex,
2026                    file_path,
2027                    content,
2028                    &mut results,
2029                )?;
2030            }
2031        } else {
2032            // Use trigrams to narrow down candidates
2033            log::debug!("Using {} trigrams to narrow regex search candidates", trigrams.len());
2034
2035            // Load trigram index
2036            let trigrams_path = self.cache.path().join("trigrams.bin");
2037            let trigram_index = if trigrams_path.exists() {
2038                TrigramIndex::load(&trigrams_path)?
2039            } else {
2040                Self::rebuild_trigram_index(&content_reader)?
2041            };
2042
2043            // Extract the literal sequences from the regex pattern
2044            use crate::regex_trigrams::extract_literal_sequences;
2045            let literals = extract_literal_sequences(pattern);
2046
2047            if literals.is_empty() {
2048                log::warn!("Regex extraction found trigrams but no literal sequences - this shouldn't happen");
2049                // Fall back to full scan
2050                for file_id in 0..content_reader.file_count() {
2051                    let file_path = content_reader.get_file_path(file_id as u32)
2052                        .context("Invalid file_id")?;
2053                    let content = content_reader.get_file_content(file_id as u32)?;
2054                    self.find_regex_matches_in_file(&regex, file_path, content, &mut results)?;
2055                }
2056            } else {
2057                // Search for each literal sequence and union the results
2058                // This ensures we find matches for ANY literal (important for alternation patterns like (a|b))
2059                // Trade-off: May scan more files than necessary for sequential patterns (a.*b),
2060                // but ensures correctness for all regex patterns
2061                use std::collections::HashSet;
2062                let mut candidate_files: HashSet<u32> = HashSet::new();
2063
2064                for literal in &literals {
2065                    // Search for this literal in the trigram index
2066                    let candidates = trigram_index.search(literal);
2067                    let file_ids: HashSet<u32> = candidates.iter().map(|loc| loc.file_id).collect();
2068
2069                    log::debug!("Literal '{}' found in {} files", literal, file_ids.len());
2070
2071                    // Union with existing candidate files (not intersection)
2072                    // This ensures we search files containing ANY of the literals
2073                    candidate_files.extend(file_ids);
2074                }
2075
2076                let final_candidates = candidate_files;
2077                log::debug!("After union: searching {} files that contain any literal", final_candidates.len());
2078
2079                // Verify regex matches in candidate files only
2080                for &file_id in &final_candidates {
2081                    let file_path = trigram_index.get_file(file_id)
2082                        .context("Invalid file_id from trigram search")?;
2083                    let content = content_reader.get_file_content(file_id)?;
2084
2085                    self.find_regex_matches_in_file(
2086                        &regex,
2087                        file_path,
2088                        content,
2089                        &mut results,
2090                    )?;
2091                }
2092            }
2093        }
2094
2095        log::info!("Regex search found {} matches for pattern '{}'", results.len(), pattern);
2096        Ok(results)
2097    }
2098
2099    /// Find all regex matches in a single file
2100    fn find_regex_matches_in_file(
2101        &self,
2102        regex: &Regex,
2103        file_path: &std::path::Path,
2104        content: &str,
2105        results: &mut Vec<SearchResult>,
2106    ) -> Result<()> {
2107        let file_path_str = file_path.to_string_lossy().to_string();
2108
2109        // Detect language from file extension
2110        let ext = file_path.extension()
2111            .and_then(|e| e.to_str())
2112            .unwrap_or("");
2113        let lang = Language::from_extension(ext);
2114
2115        // Find all regex matches line by line
2116        for (line_idx, line) in content.lines().enumerate() {
2117            if regex.is_match(line) {
2118                let line_no = line_idx + 1;
2119
2120                // Create text match result
2121                // Note: We don't extract symbol names from regex matches because:
2122                // 1. Regex might match partial identifiers (e.g., "UserController" in "ListUserController")
2123                // 2. Regex might match across language-specific delimiters (namespaces, scopes, etc.)
2124                // 3. Accurate symbol extraction requires tree-sitter parsing (expensive)
2125                // The user can see the full context in the 'preview' field
2126                results.push(SearchResult {
2127                    path: file_path_str.clone(),
2128                    lang: lang.clone(),
2129                    kind: SymbolKind::Unknown("regex_match".to_string()),
2130                    symbol: None,  // No symbol name for regex matches
2131                    span: Span {
2132                        start_line: line_no,
2133                        end_line: line_no,
2134                    },
2135                    preview: line.to_string(),
2136                    dependencies: None,
2137                });
2138            }
2139        }
2140
2141        Ok(())
2142    }
2143
2144    fn find_file_id(content_reader: &ContentReader, target_path: &str) -> Option<u32> {
2145        result::find_file_id(content_reader, target_path)
2146    }
2147
2148    fn rebuild_trigram_index(content_reader: &ContentReader) -> Result<TrigramIndex> {
2149        result::rebuild_trigram_index(content_reader)
2150    }
2151
2152    fn normalize_glob_pattern(pattern: &str) -> String {
2153        result::normalize_glob_pattern(pattern)
2154    }
2155
2156    fn has_word_boundary_match(line: &str, pattern: &str) -> bool {
2157        filter::has_word_boundary_match(line, pattern)
2158    }
2159
2160    /// Get index status for programmatic use (doesn't print warnings)
2161    ///
2162    /// Returns (status, can_trust_results, warning) tuple for JSON output.
2163    /// This is optimized for AI agents to detect staleness and auto-reindex.
2164    fn get_index_status(&self) -> Result<(IndexStatus, bool, Option<IndexWarning>)> {
2165        let root = self.cache.workspace_root();
2166
2167        // Check git state if in a git repo
2168        if crate::git::is_git_repo(&root) {
2169            if let Ok(current_branch) = crate::git::get_current_branch(&root) {
2170                // Check if we're on a different branch than what was indexed
2171                if !self.cache.branch_exists(&current_branch).unwrap_or(false) {
2172                    let warning = IndexWarning {
2173                        reason: format!("Branch '{}' has not been indexed", current_branch),
2174                        action_required: "rfx index".to_string(),
2175                        details: Some(IndexWarningDetails {
2176                            current_branch: Some(current_branch),
2177                            indexed_branch: None,
2178                            current_commit: None,
2179                            indexed_commit: None,
2180                        }),
2181                    };
2182                    return Ok((IndexStatus::Stale, false, Some(warning)));
2183                }
2184
2185                // Branch exists - check if commit changed
2186                if let (Ok(current_commit), Ok(branch_info)) =
2187                    (crate::git::get_current_commit(&root), self.cache.get_branch_info(&current_branch)) {
2188
2189                    if branch_info.commit_sha != current_commit {
2190                        let warning = IndexWarning {
2191                            reason: format!(
2192                                "Commit changed from {} to {}",
2193                                &branch_info.commit_sha[..7],
2194                                &current_commit[..7]
2195                            ),
2196                            action_required: "rfx index".to_string(),
2197                            details: Some(IndexWarningDetails {
2198                                current_branch: Some(current_branch.clone()),
2199                                indexed_branch: Some(current_branch.clone()),
2200                                current_commit: Some(current_commit.clone()),
2201                                indexed_commit: Some(branch_info.commit_sha.clone()),
2202                            }),
2203                        };
2204                        return Ok((IndexStatus::Stale, false, Some(warning)));
2205                    }
2206
2207                    // If commits match, do a quick file freshness check
2208                    if let Ok(branch_files) = self.cache.get_branch_files(&current_branch) {
2209                        let mut checked = 0;
2210                        let mut changed = 0;
2211                        const SAMPLE_SIZE: usize = 10;
2212
2213                        for (path, _indexed_hash) in branch_files.iter().take(SAMPLE_SIZE) {
2214                            checked += 1;
2215                            let file_path = std::path::Path::new(path);
2216
2217                            if let Ok(metadata) = std::fs::metadata(file_path) {
2218                                if let Ok(modified) = metadata.modified() {
2219                                    let indexed_time = branch_info.last_indexed;
2220                                    let file_time = modified.duration_since(std::time::UNIX_EPOCH)
2221                                        .unwrap_or_default()
2222                                        .as_secs() as i64;
2223
2224                                    if file_time > indexed_time {
2225                                        // File modified after indexing - likely stale
2226                                        // Note: We skip hash verification for performance (mtime check is sufficient)
2227                                        changed += 1;
2228                                    }
2229                                }
2230                            }
2231                        }
2232
2233                        if changed > 0 {
2234                            let warning = IndexWarning {
2235                                reason: format!("{} of {} sampled files modified", changed, checked),
2236                                action_required: "rfx index".to_string(),
2237                                details: Some(IndexWarningDetails {
2238                                    current_branch: Some(current_branch.clone()),
2239                                    indexed_branch: Some(branch_info.branch.clone()),
2240                                    current_commit: Some(current_commit.clone()),
2241                                    indexed_commit: Some(branch_info.commit_sha.clone()),
2242                                }),
2243                            };
2244                            return Ok((IndexStatus::Stale, false, Some(warning)));
2245                        }
2246                    }
2247
2248                    // All checks passed - index is fresh
2249                    return Ok((IndexStatus::Fresh, true, None));
2250                }
2251            }
2252        }
2253
2254        // Not in a git repo or couldn't get git info - assume fresh
2255        Ok((IndexStatus::Fresh, true, None))
2256    }
2257
2258    /// Check index freshness and show non-blocking warnings
2259    ///
2260    /// This performs lightweight checks to warn users if their index might be stale:
2261    /// 1. Branch mismatch: indexed different branch
2262    /// 2. Commit changed: HEAD moved since indexing
2263    /// 3. File changes: quick mtime check on sample of files (if available)
2264    fn check_index_freshness(&self, filter: &QueryFilter) -> Result<()> {
2265        let root = self.cache.workspace_root();
2266
2267        // Check git state if in a git repo
2268        if crate::git::is_git_repo(&root) {
2269            if !crate::git::is_git_available() {
2270                static WARNED: std::sync::OnceLock<()> = std::sync::OnceLock::new();
2271                if !filter.suppress_output {
2272                    WARNED.get_or_init(|| {
2273                        output::warn("⚠️  git binary not found in PATH; index freshness checks disabled for this session.");
2274                    });
2275                }
2276                return Ok(());
2277            }
2278            if let Ok(current_branch) = crate::git::get_current_branch(&root) {
2279                // Check if we're on a different branch than what was indexed
2280                if !self.cache.branch_exists(&current_branch).unwrap_or(false) {
2281                    if !filter.suppress_output {
2282                        output::warn(&format!("⚠️  WARNING: Index not found for branch '{}'. Run 'rfx index' to index this branch.", current_branch));
2283                    }
2284                    return Ok(());
2285                }
2286
2287                // Branch exists - check if commit changed
2288                if let (Ok(current_commit), Ok(branch_info)) =
2289                    (crate::git::get_current_commit(&root), self.cache.get_branch_info(&current_branch)) {
2290
2291                    if branch_info.commit_sha != current_commit {
2292                        if !filter.suppress_output {
2293                            output::warn(&format!("⚠️  WARNING: Index may be stale (commit changed: {} → {}). Consider running 'rfx index'.",
2294                                     &branch_info.commit_sha[..7], &current_commit[..7]));
2295                        }
2296                        return Ok(());
2297                    }
2298
2299                    // If commits match, do a quick file freshness check
2300                    // Sample up to 10 files to check for modifications (cheap mtime check)
2301                    if let Ok(branch_files) = self.cache.get_branch_files(&current_branch) {
2302                        let mut checked = 0;
2303                        let mut changed = 0;
2304                        const SAMPLE_SIZE: usize = 10;
2305
2306                        for (path, _indexed_hash) in branch_files.iter().take(SAMPLE_SIZE) {
2307                            checked += 1;
2308                            let file_path = std::path::Path::new(path);
2309
2310                            // Check if file exists and has been modified (mtime/size heuristic)
2311                            if let Ok(metadata) = std::fs::metadata(file_path) {
2312                                if let Ok(modified) = metadata.modified() {
2313                                    let indexed_time = branch_info.last_indexed;
2314                                    let file_time = modified.duration_since(std::time::UNIX_EPOCH)
2315                                        .unwrap_or_default()
2316                                        .as_secs() as i64;
2317
2318                                    // If file modified after indexing, it might be stale
2319                                    if file_time > indexed_time {
2320                                        // File modified after indexing - likely stale
2321                                        // Note: We skip hash verification for performance (mtime check is sufficient)
2322                                        // This may cause false positives if files were touched without changes,
2323                                        // but the warning is non-blocking and vastly better than slow queries
2324                                        changed += 1;
2325                                    }
2326                                }
2327                            }
2328                        }
2329
2330                        if changed > 0 && !filter.suppress_output {
2331                            output::warn(&format!("⚠️  WARNING: {} of {} sampled files changed since indexing. Consider running 'rfx index'.", changed, checked));
2332                        }
2333                    }
2334                }
2335            }
2336        }
2337
2338        Ok(())
2339    }
2340}
2341
/// Generate AI instruction based on query results
///
/// Picks at most one guidance message for an AI agent consuming the results.
/// The rules are evaluated top to bottom and the first one that applies wins;
/// when none applies, `None` is returned and no instruction is attached.
///
/// * `result_count` - number of results in the current page
/// * `total_count` - number of results across all pages
/// * `has_more` - whether further pages exist
/// * the remaining flags describe which search mode / filters were active
pub fn generate_ai_instruction(
    result_count: usize,
    total_count: usize,
    has_more: bool,
    symbols_mode: bool,
    paths_only: bool,
    use_ast: bool,
    use_regex: bool,
    language_filter: bool,
    glob_filter: bool,
    exact_mode: bool,
) -> Option<String> {
    let instruction = if result_count == 0 {
        // 1. Nothing matched.
        "No results found. Consider these alternatives: 1) Check pattern spelling, 2) Remove --kind or --lang filters to broaden search, 3) Try partial match or related term, 4) Use search_regex tool for pattern matching with special characters or complex patterns."
            .to_string()
    } else if total_count >= 500 {
        // 2. Far too many matches to be useful.
        format!("Query too broad: {} results found. STOP. Do not list results. Refine search automatically by adding filters: kind parameter (Function/Struct/Class), lang parameter (rust/python/etc), or glob parameter (['src/**/*.rs']). Call search_code again with appropriate filters.", total_count)
    } else if has_more {
        // 3. Only one page of a larger result set is shown.
        format!("Showing {} of {} results. PAGINATED - there are more results available. Do not automatically fetch all results. Show current page, ask user if these results answer their question before fetching more with --offset parameter.", result_count, total_count)
    } else if symbols_mode && result_count == 1 {
        // 4. Exactly one symbol definition.
        "Found 1 precise result. Respond concisely: '[symbol] at [path]:[line]'.".to_string()
    } else if symbols_mode && (2..=10).contains(&result_count) {
        // 5. A handful of symbol definitions.
        format!("Found {} precise results (definitions only, not usages). List locations concisely: '[symbol] at [path]:[line]' for each result.", result_count)
    } else if (101..500).contains(&total_count) {
        // 6. Broad, but not broad enough to refuse.
        format!("Found {} results - this is broad. Suggest refining search with: kind parameter (Function/Struct/Class/etc), lang parameter (rust/python/etc), or glob parameter to narrow file scope.", total_count)
    } else if !symbols_mode && result_count >= 100 {
        // 7. Large full-text result set: point at symbols mode.
        format!("Found {} results in full-text search mode (includes definitions AND all usages). Consider using symbols=true parameter to filter to definitions only. This typically reduces results by 80-90%.", result_count)
    } else if paths_only {
        // 8. Only file paths were returned.
        format!("Found {} unique files (paths-only mode - no code content included). Next step: Use Read tool on specific files that look relevant based on their paths.", result_count)
    } else if use_ast {
        // 9. AST query.
        format!("Found {} results using AST pattern matching. These are structure-based matches using Tree-sitter patterns, not text search.", result_count)
    } else if use_regex && result_count >= 100 {
        // 10. Regex query with many hits.
        format!("Found {} results using regex pattern matching. Regex matches are expansive. Consider using exact text search or symbols mode for more precise results.", result_count)
    } else if language_filter && result_count <= 5 {
        // 11. Language filter may be hiding matches.
        format!("Found {} results with language filter active. Results are limited to this language only. Remove lang parameter if you want to search all languages.", result_count)
    } else if glob_filter && result_count <= 10 {
        // 12. Glob filter may be hiding matches.
        format!("Found {} results with glob filter active. Results are limited to matching paths. Remove glob parameter to search entire codebase.", result_count)
    } else if exact_mode && result_count <= 5 {
        // 13. Exact mode may be hiding matches.
        format!("Found {} results in exact match mode. Only exact symbol name matches are included. Remove exact parameter to allow substring matching.", result_count)
    } else {
        // No rule applied - no instruction.
        return None;
    };

    Some(instruction)
}
2453
2454#[cfg(test)]
2455mod tests {
2456    use super::*;
2457    use crate::indexer::Indexer;
2458    use crate::models::IndexConfig;
2459    use std::fs;
2460    use tempfile::TempDir;
2461
2462    // ==================== Basic Tests ====================
2463
2464    #[test]
2465    fn test_query_engine_creation() {
2466        let temp = TempDir::new().unwrap();
2467        let cache = CacheManager::new(temp.path());
2468        let engine = QueryEngine::new(cache);
2469
2470        assert!(engine.cache.path().ends_with(".reflex"));
2471    }
2472
2473    #[test]
2474    fn test_filter_modes() {
2475        // Test that symbols_mode works as expected
2476        let filter_fulltext = QueryFilter::default();
2477        assert!(!filter_fulltext.symbols_mode);
2478
2479        let filter_symbols = QueryFilter {
2480            symbols_mode: true,
2481            ..Default::default()
2482        };
2483        assert!(filter_symbols.symbols_mode);
2484
2485        // Test that kind implies symbols_mode (handled in CLI layer)
2486        let filter_with_kind = QueryFilter {
2487            kind: Some(SymbolKind::Function),
2488            symbols_mode: true,
2489            ..Default::default()
2490        };
2491        assert!(filter_with_kind.symbols_mode);
2492    }
2493
2494    // ==================== Search Mode Tests ====================
2495
2496    #[test]
2497    fn test_fulltext_search() {
2498        let temp = TempDir::new().unwrap();
2499        let project = temp.path().join("project");
2500        fs::create_dir(&project).unwrap();
2501
2502        // Create test files
2503        fs::write(project.join("main.rs"), "fn main() {\n    println!(\"hello\");\n}").unwrap();
2504        fs::write(project.join("lib.rs"), "pub fn hello() {}").unwrap();
2505
2506        // Index the project
2507        let cache = CacheManager::new(&project);
2508        let indexer = Indexer::new(cache, IndexConfig::default());
2509        indexer.index(&project, false).unwrap();
2510
2511        // Search for "hello"
2512        let cache = CacheManager::new(&project);
2513        let engine = QueryEngine::new(cache);
2514        let filter = QueryFilter::default(); // full-text mode
2515        let results = engine.search("hello", filter).unwrap();
2516
2517        // Should find both occurrences (println and function name)
2518        assert!(results.len() >= 2);
2519        assert!(results.iter().any(|r| r.path.contains("main.rs")));
2520        assert!(results.iter().any(|r| r.path.contains("lib.rs")));
2521    }
2522
2523    #[test]
2524    fn test_symbol_search() {
2525        let temp = TempDir::new().unwrap();
2526        let project = temp.path().join("project");
2527        fs::create_dir(&project).unwrap();
2528
2529        // Create test file with function definition and call
2530        fs::write(
2531            project.join("main.rs"),
2532            "fn greet() {}\nfn main() {\n    greet();\n}"
2533        ).unwrap();
2534
2535        // Index
2536        let cache = CacheManager::new(&project);
2537        let indexer = Indexer::new(cache, IndexConfig::default());
2538        indexer.index(&project, false).unwrap();
2539
2540        let cache = CacheManager::new(&project);
2541
2542        // Symbol search (definitions only)
2543        let engine = QueryEngine::new(cache);
2544        let filter = QueryFilter {
2545            symbols_mode: true,
2546            ..Default::default()
2547        };
2548        let results = engine.search("greet", filter).unwrap();
2549
2550        // Should find only the definition, not the call
2551        assert!(results.len() >= 1);
2552        assert!(results.iter().any(|r| r.kind == SymbolKind::Function));
2553    }
2554
2555    #[test]
2556    fn test_regex_search() {
2557        let temp = TempDir::new().unwrap();
2558        let project = temp.path().join("project");
2559        fs::create_dir(&project).unwrap();
2560
2561        fs::write(
2562            project.join("main.rs"),
2563            "fn test1() {}\nfn test2() {}\nfn other() {}"
2564        ).unwrap();
2565
2566        let cache = CacheManager::new(&project);
2567        let indexer = Indexer::new(cache, IndexConfig::default());
2568        indexer.index(&project, false).unwrap();
2569
2570        let cache = CacheManager::new(&project);
2571
2572        let engine = QueryEngine::new(cache);
2573        let filter = QueryFilter {
2574            use_regex: true,
2575            ..Default::default()
2576        };
2577        let results = engine.search(r"fn test\d", filter).unwrap();
2578
2579        // Should match test1 and test2 but not other
2580        assert_eq!(results.len(), 2);
2581        assert!(results.iter().all(|r| r.preview.contains("test")));
2582    }
2583
2584    // ==================== Filter Tests ====================
2585
2586    #[test]
2587    fn test_language_filter() {
2588        let temp = TempDir::new().unwrap();
2589        let project = temp.path().join("project");
2590        fs::create_dir(&project).unwrap();
2591
2592        fs::write(project.join("main.rs"), "fn main() {}").unwrap();
2593        fs::write(project.join("main.js"), "function main() {}").unwrap();
2594
2595        let cache = CacheManager::new(&project);
2596        let indexer = Indexer::new(cache, IndexConfig::default());
2597        indexer.index(&project, false).unwrap();
2598
2599        let cache = CacheManager::new(&project);
2600
2601        let engine = QueryEngine::new(cache);
2602
2603        // Filter to Rust only
2604        let filter = QueryFilter {
2605            language: Some(Language::Rust),
2606            ..Default::default()
2607        };
2608        let results = engine.search("main", filter).unwrap();
2609
2610        assert!(results.iter().all(|r| r.lang == Language::Rust));
2611        assert!(results.iter().all(|r| r.path.ends_with(".rs")));
2612    }
2613
2614    #[test]
2615    fn test_kind_filter() {
2616        let temp = TempDir::new().unwrap();
2617        let project = temp.path().join("project");
2618        fs::create_dir(&project).unwrap();
2619
2620        fs::write(
2621            project.join("main.rs"),
2622            "struct Point {}\nfn main() {}\nimpl Point { fn new() {} }"
2623        ).unwrap();
2624
2625        let cache = CacheManager::new(&project);
2626        let indexer = Indexer::new(cache, IndexConfig::default());
2627        indexer.index(&project, false).unwrap();
2628
2629        let cache = CacheManager::new(&project);
2630
2631        let engine = QueryEngine::new(cache);
2632
2633        // Filter to functions only (includes methods)
2634        let filter = QueryFilter {
2635            symbols_mode: true,
2636            kind: Some(SymbolKind::Function),
2637            use_contains: true,  // "mai" is substring of "main"
2638            ..Default::default()
2639        };
2640        // Search for "mai" which should match "main" (tri gram pattern will def be in index)
2641        let results = engine.search("mai", filter).unwrap();
2642
2643        // Should find main function
2644        assert!(results.len() > 0, "Should find at least one result");
2645        assert!(results.iter().any(|r| r.symbol.as_deref() == Some("main")), "Should find 'main' function");
2646    }
2647
2648    #[test]
2649    fn test_file_pattern_filter() {
2650        let temp = TempDir::new().unwrap();
2651        let project = temp.path().join("project");
2652        fs::create_dir_all(project.join("src")).unwrap();
2653        fs::create_dir_all(project.join("tests")).unwrap();
2654
2655        fs::write(project.join("src/lib.rs"), "fn foo() {}").unwrap();
2656        fs::write(project.join("tests/test.rs"), "fn foo() {}").unwrap();
2657
2658        let cache = CacheManager::new(&project);
2659        let indexer = Indexer::new(cache, IndexConfig::default());
2660        indexer.index(&project, false).unwrap();
2661
2662        let cache = CacheManager::new(&project);
2663
2664        let engine = QueryEngine::new(cache);
2665
2666        // Filter to src/ only
2667        let filter = QueryFilter {
2668            file_pattern: Some("src/".to_string()),
2669            ..Default::default()
2670        };
2671        let results = engine.search("foo", filter).unwrap();
2672
2673        assert!(results.iter().all(|r| r.path.contains("src/")));
2674        assert!(!results.iter().any(|r| r.path.contains("tests/")));
2675    }
2676
2677    #[test]
2678    fn test_limit_filter() {
2679        let temp = TempDir::new().unwrap();
2680        let project = temp.path().join("project");
2681        fs::create_dir(&project).unwrap();
2682
2683        // Create file with many matches
2684        let content = (0..20).map(|i| format!("fn test{}() {{}}", i)).collect::<Vec<_>>().join("\n");
2685        fs::write(project.join("main.rs"), content).unwrap();
2686
2687        let cache = CacheManager::new(&project);
2688        let indexer = Indexer::new(cache, IndexConfig::default());
2689        indexer.index(&project, false).unwrap();
2690
2691        let cache = CacheManager::new(&project);
2692
2693        let engine = QueryEngine::new(cache);
2694
2695        // Limit to 5 results
2696        let filter = QueryFilter {
2697            limit: Some(5),
2698            use_contains: true,  // "test" is substring of "test0", "test1", etc.
2699            ..Default::default()
2700        };
2701        let results = engine.search("test", filter).unwrap();
2702
2703        assert_eq!(results.len(), 5);
2704    }
2705
2706    #[test]
2707    fn test_exact_match_filter() {
2708        let temp = TempDir::new().unwrap();
2709        let project = temp.path().join("project");
2710        fs::create_dir(&project).unwrap();
2711
2712        fs::write(
2713            project.join("main.rs"),
2714            "fn test() {}\nfn test_helper() {}\nfn other_test() {}"
2715        ).unwrap();
2716
2717        let cache = CacheManager::new(&project);
2718        let indexer = Indexer::new(cache, IndexConfig::default());
2719        indexer.index(&project, false).unwrap();
2720
2721        let cache = CacheManager::new(&project);
2722
2723        let engine = QueryEngine::new(cache);
2724
2725        // Exact match for "test"
2726        let filter = QueryFilter {
2727            symbols_mode: true,
2728            exact: true,
2729            ..Default::default()
2730        };
2731        let results = engine.search("test", filter).unwrap();
2732
2733        // Should only match exactly "test", not "test_helper" or "other_test"
2734        assert_eq!(results.len(), 1);
2735        assert_eq!(results[0].symbol.as_deref(), Some("test"));
2736    }
2737
2738    // ==================== Expand Mode Tests ====================
2739
/// With `expand: true`, a symbol result's preview must contain text from the
/// function body, not just the line holding the signature.
#[test]
fn test_expand_mode() {
    let temp = TempDir::new().unwrap();
    let project = temp.path().join("project");
    fs::create_dir(&project).unwrap();

    // A multi-line function: `println` occurs only on the body lines,
    // never on the `fn greet()` signature line.
    fs::write(
        project.join("main.rs"),
        "fn greet() {\n    println!(\"Hello\");\n    println!(\"World\");\n}",
    )
    .unwrap();

    let cache = CacheManager::new(&project);
    let indexer = Indexer::new(cache, IndexConfig::default());
    indexer.index(&project, false).unwrap();

    // The indexer consumed the first handle; open a second one for querying.
    let engine = QueryEngine::new(CacheManager::new(&project));

    // Search with expand mode
    let filter = QueryFilter {
        symbols_mode: true,
        expand: true,
        ..Default::default()
    };
    let results = engine.search("greet", filter).unwrap();

    // Should have full function body in preview
    let first = results
        .first()
        .expect("symbol search for `greet` returned no results");
    assert!(
        first.preview.contains("println"),
        "expanded preview should include the function body, got: {:?}",
        first.preview
    );
}
2772
2773    // ==================== Edge Cases ====================
2774
#[test]
fn test_search_empty_index() {
    // The project directory is created but never populated with files.
    let temp = TempDir::new().unwrap();
    let project = temp.path().join("project");
    fs::create_dir(&project).unwrap();

    // Indexing the empty directory must itself succeed.
    let indexer = Indexer::new(CacheManager::new(&project), IndexConfig::default());
    indexer.index(&project, false).unwrap();

    let engine = QueryEngine::new(CacheManager::new(&project));
    let hits = engine.search("nonexistent", QueryFilter::default()).unwrap();

    // The search succeeds and simply reports no matches.
    assert!(hits.is_empty());
}
2793
#[test]
fn test_search_no_index() {
    let temp = TempDir::new().unwrap();
    let project = temp.path().join("project");
    fs::create_dir(&project).unwrap();

    // No indexer is run in this test, so no index was ever built for `project`.
    let engine = QueryEngine::new(CacheManager::new(&project));
    let outcome = engine.search("test", QueryFilter::default());

    // Querying without an index must surface an error rather than empty results.
    assert!(outcome.is_err());
}
2807
/// A query containing an operator and spaces (`"x + "`) must still match
/// the source line that contains it.
#[test]
fn test_search_special_characters() {
    let temp = TempDir::new().unwrap();
    let project = temp.path().join("project");
    fs::create_dir(&project).unwrap();

    // Only the second line contains the text `x + `.
    fs::write(project.join("main.rs"), "let x = 42;\nlet y = x + 1;").unwrap();

    let cache = CacheManager::new(&project);
    let indexer = Indexer::new(cache, IndexConfig::default());
    indexer.index(&project, false).unwrap();

    // The indexer consumed the first handle; open a second one for querying.
    let engine = QueryEngine::new(CacheManager::new(&project));
    let filter = QueryFilter::default();

    // Search for special characters (note the trailing space in the pattern)
    let results = engine.search("x + ", filter).unwrap();
    assert!(
        !results.is_empty(),
        "expected at least one match for the pattern \"x + \""
    );
}
2829
/// A non-ASCII (CJK) pattern must be searchable when substring matching is
/// enabled and the broad-query guard is bypassed.
#[test]
fn test_search_unicode() {
    let temp = TempDir::new().unwrap();
    let project = temp.path().join("project");
    fs::create_dir(&project).unwrap();

    // The Unicode text lives in a comment on the first line.
    fs::write(project.join("main.rs"), "// 你好世界\nfn main() {}").unwrap();

    let cache = CacheManager::new(&project);
    let indexer = Indexer::new(cache, IndexConfig::default());
    indexer.index(&project, false).unwrap();

    // The indexer consumed the first handle; open a second one for querying.
    let engine = QueryEngine::new(CacheManager::new(&project));
    let filter = QueryFilter {
        use_contains: true, // Unicode word boundaries may not work as expected
        force: true,        // Bypass broad query detection for 2-char Unicode pattern
        ..Default::default()
    };

    // Search for unicode characters
    let results = engine.search("你好", filter).unwrap();
    assert!(
        !results.is_empty(),
        "expected at least one match for the Unicode pattern"
    );
}
2855
#[test]
fn test_case_sensitive_search() {
    let temp = TempDir::new().unwrap();
    let project = temp.path().join("project");
    fs::create_dir(&project).unwrap();

    // Two functions whose names differ only in the case of the first letter.
    let source = "fn Test() {}\nfn test() {}";
    fs::write(project.join("main.rs"), source).unwrap();

    let indexer = Indexer::new(CacheManager::new(&project), IndexConfig::default());
    indexer.index(&project, false).unwrap();

    let engine = QueryEngine::new(CacheManager::new(&project));

    // Query with the capitalised spelling using the default filter.
    let results = engine.search("Test", QueryFilter::default()).unwrap();

    // NOTE(review): only the positive side is asserted here -- the capitalised
    // definition must be among the hits. Nothing checks that the lowercase
    // `test()` line is excluded.
    let found_capitalised = results
        .iter()
        .any(|hit| hit.preview.contains("Test()"));
    assert!(found_capitalised);
}
2877
2878    // ==================== Determinism Tests ====================
2879
/// Repeating the same query must return the same results in the same order,
/// and that order must be ascending by path, then by start line.
#[test]
fn test_results_sorted_deterministically() {
    let temp = TempDir::new().unwrap();
    let project = temp.path().join("project");
    fs::create_dir(&project).unwrap();

    // Files are written in a, z, m order; m.rs carries two lines.
    fs::write(project.join("a.rs"), "fn test() {}").unwrap();
    fs::write(project.join("z.rs"), "fn test() {}").unwrap();
    fs::write(project.join("m.rs"), "fn test() {}\nfn test2() {}").unwrap();

    let cache = CacheManager::new(&project);
    let indexer = Indexer::new(cache, IndexConfig::default());
    indexer.index(&project, false).unwrap();

    // The indexer consumed the first handle; open a second one for querying.
    let engine = QueryEngine::new(CacheManager::new(&project));
    let filter = QueryFilter::default();

    // Run the identical search three times before comparing anything.
    let runs: Vec<_> = (0..3)
        .map(|_| engine.search("test", filter.clone()).unwrap())
        .collect();
    let (baseline, repeats) = runs
        .split_first()
        .expect("three searches were just collected");

    // Every repeat must agree with the first run on length, path and start line.
    for (offset, repeat) in repeats.iter().enumerate() {
        let run = offset + 2; // human-friendly run number (baseline is run 1)
        assert_eq!(
            baseline.len(),
            repeat.len(),
            "run {} returned a different number of results than run 1",
            run
        );
        for (expected, actual) in baseline.iter().zip(repeat.iter()) {
            assert_eq!(expected.path, actual.path, "run {} diverged on path", run);
            assert_eq!(
                expected.span.start_line, actual.span.start_line,
                "run {} diverged on start line",
                run
            );
        }
    }

    // Verify sorting (path ascending, then line ascending). Lexicographic
    // tuple comparison expresses exactly "path <, or path == and line <=".
    for pair in baseline.windows(2) {
        let (curr, next) = (&pair[0], &pair[1]);
        assert!(
            (&curr.path, &curr.span.start_line) <= (&next.path, &next.span.start_line),
            "results out of order: {:?}:{:?} precedes {:?}:{:?}",
            curr.path,
            curr.span.start_line,
            next.path,
            next.span.start_line
        );
    }
}
2925
2926    // ==================== Combined Filter Tests ====================
2927
#[test]
fn test_multiple_filters_combined() {
    let temp = TempDir::new().unwrap();
    let project = temp.path().join("project");
    fs::create_dir_all(project.join("src")).unwrap();

    // Fixture: two Rust files under src/ plus one JavaScript file at the root,
    // each defining something called "test" (main.rs also has a `Test` struct).
    for (relative_path, contents) in [
        ("src/main.rs", "fn test() {}\nstruct Test {}"),
        ("src/lib.rs", "fn test() {}"),
        ("test.js", "function test() {}"),
    ] {
        fs::write(project.join(relative_path), contents).unwrap();
    }

    let indexer = Indexer::new(CacheManager::new(&project), IndexConfig::default());
    indexer.index(&project, false).unwrap();

    let engine = QueryEngine::new(CacheManager::new(&project));

    // Language, symbol kind and file pattern are all applied to one symbol query.
    let results = engine
        .search(
            "test",
            QueryFilter {
                symbols_mode: true,
                file_pattern: Some("src/main".to_string()),
                kind: Some(SymbolKind::Function),
                language: Some(Language::Rust),
                ..QueryFilter::default()
            },
        )
        .unwrap();

    // Only the function defined in src/main.rs should remain.
    assert_eq!(results.len(), 1);
    let only = &results[0];
    assert!(only.path.contains("src/main.rs"));
    assert_eq!(only.kind, SymbolKind::Function);
}
2961
2962    // ==================== Helper Method Tests ====================
2963
/// `QueryEngine::find_symbol` must locate an indexed function by name and
/// report it with `SymbolKind::Function`.
#[test]
fn test_find_symbol_helper() {
    let temp = TempDir::new().unwrap();
    let project = temp.path().join("project");
    fs::create_dir(&project).unwrap();

    fs::write(project.join("main.rs"), "fn greet() {}").unwrap();

    let cache = CacheManager::new(&project);
    let indexer = Indexer::new(cache, IndexConfig::default());
    indexer.index(&project, false).unwrap();

    // The indexer consumed the first handle; open a second one for querying.
    let engine = QueryEngine::new(CacheManager::new(&project));
    let results = engine.find_symbol("greet").unwrap();

    // At least one hit is required; the first one must be the function.
    let first = results
        .first()
        .expect("find_symbol(\"greet\") returned no results");
    assert_eq!(first.kind, SymbolKind::Function);
}
2984
/// A symbol search restricted to `SymbolKind::Struct` with substring matching
/// must return only structs and must include `Point` for the pattern "oin".
///
/// NOTE(review): despite the test name, this exercises `search` with a kind
/// filter rather than calling a dedicated list-by-kind helper.
#[test]
fn test_list_by_kind_helper() {
    let temp = TempDir::new().unwrap();
    let project = temp.path().join("project");
    fs::create_dir(&project).unwrap();

    // Two structs with a function between them.
    fs::write(
        project.join("main.rs"),
        "struct Point {}\nfn test() {}\nstruct Line {}",
    )
    .unwrap();

    let cache = CacheManager::new(&project);
    let indexer = Indexer::new(cache, IndexConfig::default());
    indexer.index(&project, false).unwrap();

    // The indexer consumed the first handle; open a second one for querying.
    let engine = QueryEngine::new(CacheManager::new(&project));

    // Search for structs that contain "oin" (Point contains it, Line doesn't)
    let filter = QueryFilter {
        kind: Some(SymbolKind::Struct),
        symbols_mode: true,
        use_contains: true, // "oin" is substring of "Point"
        ..Default::default()
    };
    let results = engine.search("oin", filter).unwrap();

    // Should find Point struct
    assert!(!results.is_empty(), "Should find at least Point struct");
    assert!(
        results.iter().all(|r| r.kind == SymbolKind::Struct),
        "kind filter should exclude every non-struct symbol"
    );
    assert!(
        results.iter().any(|r| r.symbol.as_deref() == Some("Point")),
        "the Point struct should be among the results"
    );
}
3018
3019    // ==================== Metadata Tests ====================
3020
/// `search_with_metadata` must succeed and carry the matching results in
/// its `results` field.
#[test]
fn test_search_with_metadata() {
    let temp = TempDir::new().unwrap();
    let project = temp.path().join("project");
    fs::create_dir(&project).unwrap();

    fs::write(project.join("main.rs"), "fn test() {}").unwrap();

    let cache = CacheManager::new(&project);
    let indexer = Indexer::new(cache, IndexConfig::default());
    indexer.index(&project, false).unwrap();

    // The indexer consumed the first handle; open a second one for querying.
    let engine = QueryEngine::new(CacheManager::new(&project));
    let filter = QueryFilter::default();
    let response = engine.search_with_metadata("test", filter).unwrap();

    // Only the result list is asserted. The metadata fields themselves are
    // intentionally not checked: status might be stale if run inside a git
    // repo, and can_trust_results may be false without a branch index.
    assert!(
        !response.results.is_empty(),
        "search_with_metadata should return at least one result for \"test\""
    );
}
3043
3044    // ==================== Multi-language Tests ====================
3045
#[test]
fn test_search_across_languages() {
    let temp = TempDir::new().unwrap();
    let project = temp.path().join("project");
    fs::create_dir(&project).unwrap();

    // The same `greet` function written in Rust, TypeScript and Python.
    for (file_name, contents) in [
        ("main.rs", "fn greet() {}"),
        ("main.ts", "function greet() {}"),
        ("main.py", "def greet(): pass"),
    ] {
        fs::write(project.join(file_name), contents).unwrap();
    }

    let indexer = Indexer::new(CacheManager::new(&project), IndexConfig::default());
    indexer.index(&project, false).unwrap();

    let engine = QueryEngine::new(CacheManager::new(&project));
    let results = engine.search("greet", QueryFilter::default()).unwrap();

    // One hit per file at minimum, and every language must be represented.
    assert!(results.len() >= 3);
    for expected in [Language::Rust, Language::TypeScript, Language::Python] {
        assert!(results.iter().any(|hit| hit.lang == expected));
    }
}
3072}