probe_code/search/search_runner.rs

use anyhow::Result;
use probe_code::search::file_list_cache;
use std::collections::{HashMap, HashSet};
use std::path::{Path, PathBuf};
use std::time::{Duration, Instant};

use probe_code::models::{LimitedSearchResults, SearchResult};
use probe_code::path_resolver::resolve_path;
use probe_code::search::{
    cache,
    file_processing::{process_file_with_results, FileProcessingParams},
    query::{create_query_plan, create_structured_patterns, QueryPlan},
    result_ranking::rank_search_results,
    search_limiter::apply_limits,
    search_options::SearchOptions,
    timeout,
};

/// Struct to hold timing information for different stages of the search process
pub struct SearchTimings {
    pub query_preprocessing: Option<Duration>,
    pub pattern_generation: Option<Duration>,
    pub file_searching: Option<Duration>,
    pub filename_matching: Option<Duration>,
    pub early_filtering: Option<Duration>,
    pub early_caching: Option<Duration>,
    pub result_processing: Option<Duration>,
    // Granular result processing timings
    pub result_processing_file_io: Option<Duration>,
    pub result_processing_line_collection: Option<Duration>,
    pub result_processing_ast_parsing: Option<Duration>,
    pub result_processing_block_extraction: Option<Duration>,
    pub result_processing_result_building: Option<Duration>,

    // Granular AST parsing sub-step timings
    pub result_processing_ast_parsing_language_init: Option<Duration>,
    pub result_processing_ast_parsing_parser_init: Option<Duration>,
    pub result_processing_ast_parsing_tree_parsing: Option<Duration>,
    pub result_processing_ast_parsing_line_map_building: Option<Duration>,

    // Granular block extraction sub-step timings
    pub result_processing_block_extraction_code_structure: Option<Duration>,
    pub result_processing_block_extraction_filtering: Option<Duration>,
    pub result_processing_block_extraction_result_building: Option<Duration>,

    // Detailed result building timings
    pub result_processing_term_matching: Option<Duration>,
    pub result_processing_compound_processing: Option<Duration>,
    pub result_processing_line_matching: Option<Duration>,
    pub result_processing_result_creation: Option<Duration>,
    pub result_processing_synchronization: Option<Duration>,
    pub result_processing_uncovered_lines: Option<Duration>,

    pub result_ranking: Option<Duration>,
    pub limit_application: Option<Duration>,
    pub block_merging: Option<Duration>,
    pub final_caching: Option<Duration>,
    pub total_search_time: Option<Duration>,
}

/// Helper function to format a duration in a human-readable way
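///
/// Sub-second durations are shown as whole milliseconds; anything longer is
/// shown as seconds with two decimals. A minimal sketch of the expected
/// output (marked `ignore` because the function's public path is not assumed
/// here):
///
/// ```ignore
/// use std::time::Duration;
///
/// assert_eq!(format_duration(Duration::from_millis(250)), "250ms");
/// assert_eq!(format_duration(Duration::from_millis(1500)), "1.50s");
/// ```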
pub fn format_duration(duration: Duration) -> String {
    if duration.as_millis() < 1000 {
        let millis = duration.as_millis();
        format!("{millis}ms")
    } else {
        let secs = duration.as_secs_f64();
        format!("{secs:.2}s")
    }
}

/// Helper function to print timing information when debug mode is enabled
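///
/// Output is printed only when the `DEBUG` environment variable is set to
/// `"1"`; otherwise the function returns immediately. A sketch of a typical
/// run (all values illustrative):
///
/// ```text
/// === SEARCH TIMING INFORMATION ===
/// Query preprocessing:   2ms
/// Pattern generation:    1ms
/// File searching:        120ms
/// Result processing:     340ms
/// Total search time:     1.35s
/// ===================================
/// ```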
pub fn print_timings(timings: &SearchTimings) {
    let debug_mode = std::env::var("DEBUG").unwrap_or_default() == "1";
    if !debug_mode {
        return;
    }

    println!("\n=== SEARCH TIMING INFORMATION ===");

    if let Some(duration) = timings.query_preprocessing {
        println!("Query preprocessing:   {}", format_duration(duration));
    }

    if let Some(duration) = timings.pattern_generation {
        println!("Pattern generation:    {}", format_duration(duration));
    }

    if let Some(duration) = timings.file_searching {
        println!("File searching:        {}", format_duration(duration));
    }

    if let Some(duration) = timings.filename_matching {
        println!("Filename matching:     {}", format_duration(duration));
    }

    if let Some(duration) = timings.early_filtering {
        println!("Early AST filtering:   {}", format_duration(duration));
    }

    if let Some(duration) = timings.early_caching {
        println!("Early caching:         {}", format_duration(duration));
    }

    if let Some(duration) = timings.result_processing {
        println!("Result processing:     {}", format_duration(duration));

        // Print granular result processing timings if available
        if let Some(duration) = timings.result_processing_file_io {
            println!("  - File I/O:           {}", format_duration(duration));
        }

        if let Some(duration) = timings.result_processing_line_collection {
            println!("  - Line collection:    {}", format_duration(duration));
        }

        if let Some(duration) = timings.result_processing_ast_parsing {
            println!("  - AST parsing:        {}", format_duration(duration));

            // Print granular AST parsing sub-step timings
            if let Some(d) = timings.result_processing_ast_parsing_language_init {
                println!("    - Language init:     {}", format_duration(d));
            }
            if let Some(d) = timings.result_processing_ast_parsing_parser_init {
                println!("    - Parser init:       {}", format_duration(d));
            }
            if let Some(d) = timings.result_processing_ast_parsing_tree_parsing {
                println!("    - Tree parsing:      {}", format_duration(d));
            }
            if let Some(d) = timings.result_processing_ast_parsing_line_map_building {
                println!("    - Line map building: {}", format_duration(d));
            }
        }

        if let Some(duration) = timings.result_processing_block_extraction {
            println!("  - Block extraction:   {}", format_duration(duration));

            // Print granular block extraction sub-step timings
            if let Some(d) = timings.result_processing_block_extraction_code_structure {
                println!("    - Code structure:    {}", format_duration(d));
            }
            if let Some(d) = timings.result_processing_block_extraction_filtering {
                println!("    - Filtering:         {}", format_duration(d));
            }
            if let Some(d) = timings.result_processing_block_extraction_result_building {
                println!("    - Result building:   {}", format_duration(d));
            }
        }

        if let Some(duration) = timings.result_processing_result_building {
            println!("  - Result building:    {}", format_duration(duration));

            // Print detailed result building timings if available
            if let Some(d) = timings.result_processing_term_matching {
                println!("    - Term matching:      {}", format_duration(d));
            }
            if let Some(d) = timings.result_processing_compound_processing {
                println!("    - Compound processing: {}", format_duration(d));
            }
            if let Some(d) = timings.result_processing_line_matching {
                println!("    - Line matching:      {}", format_duration(d));
            }
            if let Some(d) = timings.result_processing_result_creation {
                println!("    - Result creation:    {}", format_duration(d));
            }
            if let Some(d) = timings.result_processing_synchronization {
                println!("    - Synchronization:    {}", format_duration(d));
            }
            if let Some(d) = timings.result_processing_uncovered_lines {
                println!("    - Uncovered lines:    {}", format_duration(d));
            }
        }
    }

    if let Some(duration) = timings.result_ranking {
        println!("Result ranking:        {}", format_duration(duration));
    }

    if let Some(duration) = timings.limit_application {
        println!("Limit application:     {}", format_duration(duration));
    }

    if let Some(duration) = timings.block_merging {
        println!("Block merging:         {}", format_duration(duration));
    }

    if let Some(duration) = timings.final_caching {
        println!("Final caching:         {}", format_duration(duration));
    }

    if let Some(duration) = timings.total_search_time {
        println!("Total search time:     {}", format_duration(duration));
    }

    println!("===================================\n");
}

/// Main entry point for a probe search.
///
/// Parses the query (or queries, joined with AND) into an AST-backed
/// `QueryPlan`, generates structured regex patterns, searches files and
/// optionally filenames, applies early AST filtering and session caching,
/// processes matching files into ranked result blocks, applies limits, and
/// optionally merges adjacent blocks.
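///
/// A hedged sketch of a call site; the field list shown and any `Default`
/// implementation are assumptions, so check `SearchOptions` for the real
/// definition:
///
/// ```ignore
/// let options = SearchOptions {
///     queries: vec!["foo AND bar".to_string()],
///     exact: false,
///     // ...remaining fields as appropriate for your search...
///     ..Default::default()
/// };
/// let results = perform_probe(&options)?;
/// println!("{} results", results.results.len());
/// ```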
pub fn perform_probe(options: &SearchOptions) -> Result<LimitedSearchResults> {
    // Start timing the entire search process
    let total_start = Instant::now();

    let SearchOptions {
        path,
        queries,
        files_only,
        custom_ignores,
        exclude_filenames,
        reranker,
        frequency_search: _,
        exact,
        language,
        max_results,
        max_bytes,
        max_tokens,
        allow_tests,
        no_merge,
        merge_threshold,
        dry_run: _, // Not needed here, but required for an exhaustive destructure
        session,
        timeout,
    } = options;
    // Start the timeout thread
    let timeout_handle = timeout::start_timeout_thread(*timeout);

    let include_filenames = !exclude_filenames;
    let debug_mode = std::env::var("DEBUG").unwrap_or_default() == "1";

    // Determine the effective session ID. An explicit non-empty session is
    // used as-is; an empty or "new" session falls back to the PROBE_SESSION_ID
    // environment variable and then to a freshly generated ID; when no session
    // is given, only the environment variable is consulted.
    let session_from_env = || -> Option<&'static str> {
        match std::env::var("PROBE_SESSION_ID") {
            Ok(env_session_id) if !env_session_id.is_empty() => {
                if debug_mode {
                    println!("DEBUG: Using session ID from environment: {env_session_id}");
                }
                // Convert to a static string (this leaks memory, but it's a small
                // amount and only happens once per session)
                Some(Box::leak(env_session_id.into_boxed_str()))
            }
            _ => None,
        }
    };
    let generate_session = || match cache::generate_session_id() {
        Ok((new_id, _is_new)) => {
            if debug_mode {
                println!("DEBUG: Generated new session ID: {new_id}");
            }
            (Some(new_id), true)
        }
        Err(e) => {
            eprintln!("Error generating session ID: {e}");
            (None, false)
        }
    };
    let (effective_session, session_was_generated) = match session {
        Some(s) if !s.is_empty() && *s != "new" => (Some(*s), false),
        Some(_) => session_from_env()
            .map(|id| (Some(id), false))
            .unwrap_or_else(generate_session),
        None => (session_from_env(), false),
    };

    let mut timings = SearchTimings {
        query_preprocessing: None,
        pattern_generation: None,
        file_searching: None,
        filename_matching: None,
        early_filtering: None,
        early_caching: None,
        result_processing: None,
        result_processing_file_io: None,
        result_processing_line_collection: None,
        result_processing_ast_parsing: None,
        result_processing_block_extraction: None,
        result_processing_result_building: None,

        // Initialize granular AST parsing sub-step timings
        result_processing_ast_parsing_language_init: None,
        result_processing_ast_parsing_parser_init: None,
        result_processing_ast_parsing_tree_parsing: None,
        result_processing_ast_parsing_line_map_building: None,

        // Initialize granular block extraction sub-step timings
        result_processing_block_extraction_code_structure: None,
        result_processing_block_extraction_filtering: None,
        result_processing_block_extraction_result_building: None,

        // Initialize detailed result building timings
        result_processing_term_matching: None,
        result_processing_compound_processing: None,
        result_processing_line_matching: None,
        result_processing_result_creation: None,
        result_processing_synchronization: None,
        result_processing_uncovered_lines: None,

        result_ranking: None,
        limit_application: None,
        block_merging: None,
        final_caching: None,
        total_search_time: None,
    };

    // Combine multiple queries with AND, or parse the single query as-is
    let qp_start = Instant::now();
    if debug_mode {
        println!("DEBUG: Starting query preprocessing...");
    }

    let parse_res = if queries.len() > 1 {
        // Join multiple queries with AND
        let combined_query = queries.join(" AND ");
        create_query_plan(&combined_query, *exact)
    } else {
        create_query_plan(&queries[0], *exact)
    };
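    // For example, the two queries ["foo bar", "baz"] are parsed as the single
    // combined query "foo bar AND baz".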

    let qp_duration = qp_start.elapsed();
    timings.query_preprocessing = Some(qp_duration);

    if debug_mode {
        println!(
            "DEBUG: Query preprocessing completed in {}",
            format_duration(qp_duration)
        );
    }

    // If the query fails to parse, return empty results; otherwise every query
    // goes through the AST path.
    let plan = match parse_res {
        Ok(plan) => plan,
        Err(_) => {
            println!("Failed to parse query as AST expression");
            return Ok(LimitedSearchResults {
                results: Vec::new(),
                skipped_files: Vec::new(),
                limits_applied: None,
                cached_blocks_skipped: None,
            });
        }
    };

    // Pattern generation timing
    let pg_start = Instant::now();
    if debug_mode {
        println!("DEBUG: Starting pattern generation...");
        println!("DEBUG: Using combined pattern approach for more efficient searching");
    }

    // Use the combined pattern approach for more efficient searching
    let structured_patterns = create_structured_patterns(&plan);
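    // Each generated pattern is a (regex string, term indices) pair: the set
    // records which query terms the pattern stands for, matching the `patterns`
    // parameter of `search_with_structured_patterns` below.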

    let pg_duration = pg_start.elapsed();
    timings.pattern_generation = Some(pg_duration);

    if debug_mode {
        println!(
            "DEBUG: Pattern generation completed in {}",
            format_duration(pg_duration)
        );
        println!(
            "DEBUG: Generated {patterns_len} patterns",
            patterns_len = structured_patterns.len()
        );
        if structured_patterns.len() == 1 {
            println!("DEBUG: Successfully created a single combined pattern for all terms");
        }
    }

    // File searching timing
    let fs_start = Instant::now();
    if debug_mode {
        println!("DEBUG: Starting file searching...");
    }

    /*
      Note on determinism:
      An earlier implementation built a single "combined" regex with multiple
      capturing groups. When more than one subpattern could match the same
      text, the capture groups could be filled differently from run to run
      under multithreading, producing inconsistent matched lines (and thus
      inconsistent "required terms"), so files were accepted or removed in
      "early filtering" unpredictably. `search_with_structured_patterns` below
      uses a RegexSet instead, which keeps the matching deterministic.
    */

    // Normalize the language parameter to handle aliases
    let lang_param = language.as_ref().map(|lang| normalize_language_alias(lang));

    let mut file_term_map = search_with_structured_patterns(
        path,
        &plan,
        &structured_patterns,
        custom_ignores,
        *allow_tests,
        lang_param,
    )?;

    let fs_duration = fs_start.elapsed();
    timings.file_searching = Some(fs_duration);

    // Print debug information about search results
    if debug_mode {
        // Calculate total matches across all files
        let total_matches: usize = file_term_map
            .values()
            .map(|term_map| term_map.values().map(|lines| lines.len()).sum::<usize>())
            .sum();

        // Get number of unique files
        let unique_files = file_term_map.keys().len();

        println!(
            "DEBUG: File searching completed in {} - Found {} matches in {} unique files",
            format_duration(fs_duration),
            total_matches,
            unique_files
        );
    }

    // Build final results
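    // `file_term_map` maps each file path to {term index -> 1-based line
    // numbers where that term matched}; `all_files` tracks every file with at
    // least one match.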
    let mut all_files = file_term_map.keys().cloned().collect::<HashSet<_>>();

    // Add filename matches if enabled
    let fm_start = Instant::now();
    if include_filenames && !exact {
        if debug_mode {
            println!("DEBUG: Starting filename matching...");
        }
        // Find all files that match our patterns by filename, along with the terms that matched.
        // Resolve the path if it's a special format (e.g., "go:github.com/user/repo")
        let resolved_path = if let Some(path_str) = path.to_str() {
            match resolve_path(path_str) {
                Ok(resolved_path) => {
                    if debug_mode {
                        println!(
                            "DEBUG: Resolved path '{}' to '{}'",
                            path_str,
                            resolved_path.display()
                        );
                    }
                    resolved_path
                }
                Err(err) => {
                    if debug_mode {
                        println!("DEBUG: Failed to resolve path '{path_str}': {err}");
                    }
                    // Fall back to the original path
                    path.to_path_buf()
                }
            }
        } else {
            // If we can't convert the path to a string, use it as is
            path.to_path_buf()
        };

        let filename_matches: HashMap<PathBuf, HashSet<usize>> =
            file_list_cache::find_matching_filenames(
                &resolved_path,
                queries,
                &all_files,
                custom_ignores,
                *allow_tests,
                &plan.term_indices,
                lang_param,
            )?;

        if debug_mode {
            println!(
                "DEBUG: Found {} files matching by filename",
                filename_matches.len()
            );
        }

        // Process files that matched by filename
        for (pathbuf, matched_terms) in &filename_matches {
            // Maximum file size we are willing to read (1MB)
            const MAX_FILE_SIZE: u64 = 1024 * 1024;

            // Check file metadata and resolve symlinks before reading
            let resolved_path = match std::fs::canonicalize(pathbuf.as_path()) {
                Ok(path) => path,
                Err(e) => {
                    if debug_mode {
                        println!("DEBUG: Error resolving path for {pathbuf:?}: {e:?}");
                    }
                    continue;
                }
            };

            // Get file metadata to check size and file type
            let metadata = match std::fs::metadata(&resolved_path) {
                Ok(meta) => meta,
                Err(e) => {
                    if debug_mode {
                        println!("DEBUG: Error getting metadata for {resolved_path:?}: {e:?}");
                    }
                    continue;
                }
            };

            // Skip files that are too large
            if metadata.len() > MAX_FILE_SIZE {
                if debug_mode {
                    println!(
                        "DEBUG: Skipping file {:?} - file too large ({} bytes > {} bytes limit)",
                        resolved_path,
                        metadata.len(),
                        MAX_FILE_SIZE
                    );
                }
                continue;
            }

            // Read the file content to get the total number of lines
            let file_content = match std::fs::read_to_string(&resolved_path) {
                Ok(content) => content,
                Err(e) => {
                    if debug_mode {
                        println!(
                            "DEBUG: Error reading file {:?}: {:?} (size: {} bytes)",
                            resolved_path,
                            e,
                            metadata.len()
                        );
                    }
                    continue;
                }
            };

            // Count the number of lines in the file
            let line_count = file_content.lines().count();
            if line_count == 0 {
                if debug_mode {
                    println!("DEBUG: File {pathbuf:?} is empty, skipping");
                }
                continue;
            }

            // Create a set of all line numbers in the file (1-based indexing)
            let all_line_numbers: HashSet<usize> = (1..=line_count).collect();

            // Check if this file already has term matches from content search
            let mut term_map = if let Some(existing_map) = file_term_map.get(pathbuf) {
                if debug_mode {
                    println!(
                        "DEBUG: File {pathbuf:?} already has term matches from content search, extending"
                    );
                }
                existing_map.clone()
            } else {
                if debug_mode {
                    println!("DEBUG: Creating new term map for file {pathbuf:?}");
                }
                HashMap::new()
            };

            // Add the matched terms to the term map with all lines
            for &term_idx in matched_terms {
                term_map
                    .entry(term_idx)
                    .or_insert_with(HashSet::new)
                    .extend(&all_line_numbers);

                if debug_mode {
                    println!(
                        "DEBUG: Added term index {term_idx} to file {pathbuf:?} with all lines"
                    );
                }
            }

            // Update the file_term_map with the new or extended term map
            file_term_map.insert(pathbuf.clone(), term_map);
            all_files.insert(pathbuf.clone());

            if debug_mode {
                println!("DEBUG: Added file {pathbuf:?} with matching terms to file_term_map");
            }
        }
    }

    if debug_mode {
        println!("DEBUG: all_files after filename matches: {all_files:?}");
    }

    // Early filtering step: filter both all_files and file_term_map by
    // evaluating each file's matched terms against the query AST. We pass
    // `true` for `ignore_negatives`, so excluded terms are NOT taken into
    // account at this stage.
    let early_filter_start = Instant::now();
    if debug_mode {
        println!("DEBUG: Starting early AST filtering...");
        println!("DEBUG: Before filtering: {} files", all_files.len());
    }

    // Create a new filtered file_term_map
    let mut filtered_file_term_map = HashMap::new();
    let mut filtered_all_files = HashSet::new();

    for pathbuf in &all_files {
        if let Some(term_map) = file_term_map.get(pathbuf) {
            // Extract unique terms found in the file
            let matched_terms: HashSet<usize> = term_map.keys().copied().collect();

            // Evaluate the file against the AST with ignore_negatives = true
            if plan.ast.evaluate(&matched_terms, &plan.term_indices, true) {
                filtered_file_term_map.insert(pathbuf.clone(), term_map.clone());
                filtered_all_files.insert(pathbuf.clone());
            } else if debug_mode {
                println!("DEBUG: Early filtering removed file: {pathbuf:?}");
            }
        } else if debug_mode {
            println!("DEBUG: File {pathbuf:?} not found in file_term_map during early filtering");
        }
    }
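    // Example: for a plan requiring "foo" and excluding "bar", a file whose
    // matched terms include "foo" passes this stage even if "bar" matched too,
    // because exclusions are deliberately ignored during early filtering.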

    // Replace the original maps with the filtered ones
    file_term_map = filtered_file_term_map;
    all_files = filtered_all_files;

    if debug_mode {
        println!(
            "DEBUG: After early filtering: {} files remain",
            all_files.len()
        );
        println!("DEBUG: all_files after early filtering: {all_files:?}");
    }

    let early_filter_duration = early_filter_start.elapsed();
    timings.early_filtering = Some(early_filter_duration);

    if debug_mode {
        println!(
            "DEBUG: Early AST filtering completed in {}",
            format_duration(early_filter_duration)
        );
    }

    let fm_duration = fm_start.elapsed();
    timings.filename_matching = Some(fm_duration);

    if debug_mode && include_filenames {
        println!(
            "DEBUG: Filename matching completed in {}",
            format_duration(fm_duration)
        );
    }

    // Handle files-only mode
    if *files_only {
        let mut res = Vec::new();
        for f in all_files {
            res.push(SearchResult {
                file: f.to_string_lossy().to_string(),
                lines: (1, 1),
                node_type: "file".to_string(),
                code: String::new(),
                matched_by_filename: None,
                rank: None,
                score: None,
                tfidf_score: None,
                bm25_score: None,
                tfidf_rank: None,
                bm25_rank: None,
                new_score: None,
                hybrid2_rank: None,
                combined_score_rank: None,
                file_unique_terms: None,
                file_total_matches: None,
                file_match_rank: None,
                block_unique_terms: None,
                block_total_matches: None,
                parent_file_id: None,
                block_id: None,
                matched_keywords: None,
                tokenized_content: None,
            });
        }
        let mut limited = apply_limits(res, *max_results, *max_bytes, *max_tokens);

        // No caching for files-only mode
        limited.cached_blocks_skipped = None;

        // Set total search time
        timings.total_search_time = Some(total_start.elapsed());

        // Print timing information
        print_timings(&timings);

        return Ok(limited);
    }

    // Apply early caching if a session is provided - AFTER getting search results but BEFORE processing
    let ec_start = Instant::now();
    let mut early_skipped_count = 0;
    if let Some(session_id) = effective_session {
        // Get the raw query string for caching
        let raw_query = if queries.len() > 1 {
            queries.join(" AND ")
        } else {
            queries[0].clone()
        };

        if debug_mode {
            println!(
                "DEBUG: Starting early caching for session: {session_id} with query: {raw_query}"
            );
            // Print cache contents before filtering
            if let Err(e) = cache::debug_print_cache(session_id, &raw_query) {
                eprintln!("Error printing cache: {e}");
            }
        }

        // Filter matched lines using the cache
        match cache::filter_matched_lines_with_cache(&mut file_term_map, session_id, &raw_query) {
            Ok(skipped) => {
                if debug_mode {
                    println!("DEBUG: Early caching skipped {skipped} matched lines");
                }
                early_skipped_count = skipped;
            }
            Err(e) => {
                // Log the error but continue without early caching
                eprintln!("Error applying early cache: {e}");
            }
        }

        // Update all_files based on the filtered file_term_map, intersecting
        // with the existing all_files to preserve earlier filtering
        let cached_files = file_term_map.keys().cloned().collect::<HashSet<_>>();
        all_files = all_files.intersection(&cached_files).cloned().collect();

        if debug_mode {
            println!("DEBUG: all_files after caching: {all_files:?}");
        }
    }

    let ec_duration = ec_start.elapsed();
    timings.early_caching = Some(ec_duration);

    if debug_mode && effective_session.is_some() {
        println!(
            "DEBUG: Early caching completed in {}",
            format_duration(ec_duration)
        );
    }

    // Process the files for detailed results
    let rp_start = Instant::now();
    if debug_mode {
        println!(
            "DEBUG: Starting result processing for {} files after early caching...",
            all_files.len()
        );
    }

    let mut final_results = Vec::new();

    // Track granular timing for result processing stages
    let mut total_file_io_time = Duration::new(0, 0);
    let mut total_line_collection_time = Duration::new(0, 0);
    let mut total_ast_parsing_time = Duration::new(0, 0);
    let mut total_block_extraction_time = Duration::new(0, 0);

    // Track granular timing for AST parsing sub-steps
    let mut total_ast_parsing_language_init_time = Duration::new(0, 0);
    let mut total_ast_parsing_parser_init_time = Duration::new(0, 0);
    let mut total_ast_parsing_tree_parsing_time = Duration::new(0, 0);
    let mut total_ast_parsing_line_map_building_time = Duration::new(0, 0);

    // Track granular timing for block extraction sub-steps
    let mut total_block_extraction_code_structure_time = Duration::new(0, 0);
    let mut total_block_extraction_filtering_time = Duration::new(0, 0);
    let mut total_block_extraction_result_building_time = Duration::new(0, 0);

    // Track detailed result building timings
    let mut total_term_matching_time = Duration::new(0, 0);
    let mut total_compound_processing_time = Duration::new(0, 0);
    let mut total_line_matching_time = Duration::new(0, 0);
    let mut total_result_creation_time = Duration::new(0, 0);
    let mut total_synchronization_time = Duration::new(0, 0);
    let mut total_uncovered_lines_time = Duration::new(0, 0);

    for pathbuf in &all_files {
        if debug_mode {
            println!("DEBUG: Processing file: {pathbuf:?}");
        }

        // Get the term map for this file
        if let Some(term_map) = file_term_map.get(pathbuf) {
            if debug_mode {
                println!("DEBUG: Term map for file: {term_map:?}");
            }

            // Gather matched lines - measure line collection time
            let line_collection_start = Instant::now();
            let mut all_lines = HashSet::new();
            for lineset in term_map.values() {
                all_lines.extend(lineset.iter());
            }
            let line_collection_duration = line_collection_start.elapsed();
            total_line_collection_time += line_collection_duration;

            if debug_mode {
                println!(
                    "DEBUG: Found {} matched lines in file in {}",
                    all_lines.len(),
                    format_duration(line_collection_duration)
                );
            }

            // Process file with matched lines
            let filename_matched_queries = HashSet::new();

            // Create a list of term pairs for backward compatibility
            let term_pairs: Vec<(String, String)> = plan
                .term_indices
                .keys()
                .map(|term| (term.clone(), term.clone()))
                .collect();

            let pparams = FileProcessingParams {
                path: pathbuf,
                line_numbers: &all_lines,
                allow_tests: *allow_tests,
                term_matches: term_map,
                num_queries: plan.term_indices.len(),
                filename_matched_queries,
                queries_terms: &[term_pairs],
                preprocessed_queries: None,
                no_merge: *no_merge,
                query_plan: &plan,
            };

            if debug_mode {
                println!(
                    "DEBUG: Processing file with params: {}",
                    pparams.path.display()
                );
            }

            // Process the file and track granular timings
            match process_file_with_results(&pparams) {
                Ok((mut file_res, file_timings)) => {
                    // Accumulate granular timings from file processing
                    if let Some(duration) = file_timings.file_io {
                        total_file_io_time += duration;
                    }
                    if let Some(duration) = file_timings.ast_parsing {
                        total_ast_parsing_time += duration;
                    }
                    if let Some(duration) = file_timings.block_extraction {
                        total_block_extraction_time += duration;
                    }

                    // Accumulate the granular timings for AST parsing sub-steps
                    if let Some(duration) = file_timings.ast_parsing_language_init {
                        total_ast_parsing_language_init_time += duration;
                        if debug_mode {
                            println!("DEBUG:     - Language init: {}", format_duration(duration));
                        }
                    }
                    if let Some(duration) = file_timings.ast_parsing_parser_init {
                        total_ast_parsing_parser_init_time += duration;
                        if debug_mode {
                            println!("DEBUG:     - Parser init: {}", format_duration(duration));
                        }
                    }
                    if let Some(duration) = file_timings.ast_parsing_tree_parsing {
                        total_ast_parsing_tree_parsing_time += duration;
                        if debug_mode {
                            println!("DEBUG:     - Tree parsing: {}", format_duration(duration));
                        }
                    }
                    if let Some(duration) = file_timings.ast_parsing_line_map_building {
                        total_ast_parsing_line_map_building_time += duration;
                        if debug_mode {
                            println!(
                                "DEBUG:     - Line map building: {}",
                                format_duration(duration)
                            );
                        }
                    }

                    // Accumulate the granular timings for block extraction sub-steps
                    if let Some(duration) = file_timings.block_extraction_code_structure {
                        total_block_extraction_code_structure_time += duration;
                        if debug_mode {
                            println!(
                                "DEBUG:     - Code structure finding: {}",
                                format_duration(duration)
                            );
                        }
                    }
                    if let Some(duration) = file_timings.block_extraction_filtering {
                        total_block_extraction_filtering_time += duration;
                        if debug_mode {
                            println!("DEBUG:     - Filtering: {}", format_duration(duration));
                        }
                    }
                    if let Some(duration) = file_timings.block_extraction_result_building {
                        total_block_extraction_result_building_time += duration;
                        if debug_mode {
                            println!(
                                "DEBUG:     - Result building: {}",
                                format_duration(duration)
                            );
                        }
                    }

                    // Accumulate the detailed result building timings
                    if let Some(duration) = file_timings.result_building_term_matching {
                        total_term_matching_time += duration;
                        if debug_mode {
                            println!("DEBUG:     - Term matching: {}", format_duration(duration));
                        }
                    }
                    if let Some(duration) = file_timings.result_building_compound_processing {
                        total_compound_processing_time += duration;
                        if debug_mode {
                            println!(
                                "DEBUG:     - Compound processing: {}",
                                format_duration(duration)
                            );
                        }
                    }
                    if let Some(duration) = file_timings.result_building_line_matching {
                        total_line_matching_time += duration;
                        if debug_mode {
                            println!("DEBUG:     - Line matching: {}", format_duration(duration));
                        }
                    }
                    if let Some(duration) = file_timings.result_building_result_creation {
                        total_result_creation_time += duration;
                        if debug_mode {
                            println!(
                                "DEBUG:     - Result creation: {}",
                                format_duration(duration)
                            );
                        }
                    }
                    if let Some(duration) = file_timings.result_building_synchronization {
                        total_synchronization_time += duration;
                        if debug_mode {
                            println!(
                                "DEBUG:     - Synchronization: {}",
                                format_duration(duration)
                            );
                        }
                    }
                    if let Some(duration) = file_timings.result_building_uncovered_lines {
                        total_uncovered_lines_time += duration;
                        if debug_mode {
                            println!(
                                "DEBUG:     - Uncovered lines: {}",
                                format_duration(duration)
                            );
                        }
                    }

                    if debug_mode {
                        println!("DEBUG: Got {} results from file processing", file_res.len());
                        if let Some(duration) = file_timings.file_io {
                            println!("DEBUG:   File I/O time: {}", format_duration(duration));
                        }
                        if let Some(duration) = file_timings.ast_parsing {
                            println!("DEBUG:   AST parsing time: {}", format_duration(duration));
                        }
                        if let Some(duration) = file_timings.block_extraction {
                            println!(
                                "DEBUG:   Block extraction time: {}",
                                format_duration(duration)
                            );
                        }
                        if let Some(duration) = file_timings.block_extraction_result_building {
                            println!(
                                "DEBUG:   Result building time: {}",
                                format_duration(duration)
                            );
                        }
                    }
                    final_results.append(&mut file_res);
                }
                Err(e) => {
                    if debug_mode {
                        println!("DEBUG: Error processing file: {e:?}");
                    }
                }
            }
        } else {
            // This should never happen, but keep the check for safety
            if debug_mode {
                println!("DEBUG: ERROR - File {pathbuf:?} not found in file_term_map but was in all_files");
            }
        }
    }

    let rp_duration = rp_start.elapsed();
    // Calculate the total time spent on detailed result building operations
    let detailed_result_building_time = total_term_matching_time
        + total_compound_processing_time
        + total_line_matching_time
        + total_result_creation_time
        + total_synchronization_time
        + total_uncovered_lines_time;

    // Attribute to result building whatever time remains after accounting for
    // the other measured operations
    let accounted_time = total_file_io_time
        + total_line_collection_time
        + total_ast_parsing_time
        + total_block_extraction_time;
    let remaining_time = if rp_duration > accounted_time {
        rp_duration - accounted_time
    } else {
        // Fall back to the sum of detailed timings if available, otherwise to
        // the block extraction result building time
        if detailed_result_building_time > Duration::new(0, 0) {
            detailed_result_building_time
        } else {
            total_block_extraction_result_building_time
        }
    };
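    // e.g. if result processing took 100ms overall and file I/O, line
    // collection, AST parsing and block extraction account for 70ms, result
    // building is attributed the remaining 30ms.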

    timings.result_processing = Some(rp_duration);
    timings.result_processing_file_io = Some(total_file_io_time);
    timings.result_processing_line_collection = Some(total_line_collection_time);
    timings.result_processing_ast_parsing = Some(total_ast_parsing_time);
    timings.result_processing_block_extraction = Some(total_block_extraction_time);
    timings.result_processing_result_building = Some(remaining_time);

    // Set the detailed result building timings
    timings.result_processing_term_matching = Some(total_term_matching_time);
    timings.result_processing_compound_processing = Some(total_compound_processing_time);
    timings.result_processing_line_matching = Some(total_line_matching_time);
    timings.result_processing_result_creation = Some(total_result_creation_time);
    timings.result_processing_synchronization = Some(total_synchronization_time);
    timings.result_processing_uncovered_lines = Some(total_uncovered_lines_time);

    // Set the granular AST parsing sub-step timings
    timings.result_processing_ast_parsing_language_init =
        Some(total_ast_parsing_language_init_time);
    timings.result_processing_ast_parsing_parser_init = Some(total_ast_parsing_parser_init_time);
    timings.result_processing_ast_parsing_tree_parsing = Some(total_ast_parsing_tree_parsing_time);
    timings.result_processing_ast_parsing_line_map_building =
        Some(total_ast_parsing_line_map_building_time);

    // Set the granular block extraction sub-step timings
    timings.result_processing_block_extraction_code_structure =
        Some(total_block_extraction_code_structure_time);
    timings.result_processing_block_extraction_filtering =
        Some(total_block_extraction_filtering_time);
    timings.result_processing_block_extraction_result_building =
        Some(total_block_extraction_result_building_time);

    if debug_mode {
        println!(
            "DEBUG: Result processing completed in {} - Generated {} results",
            format_duration(rp_duration),
            final_results.len()
        );
        println!("DEBUG: Granular result processing timings:");
        println!("DEBUG:   File I/O: {}", format_duration(total_file_io_time));
        println!(
            "DEBUG:   Line collection: {}",
            format_duration(total_line_collection_time)
        );
        println!(
            "DEBUG:   AST parsing: {}",
            format_duration(total_ast_parsing_time)
        );
        println!(
            "DEBUG:     - Language init: {}",
            format_duration(total_ast_parsing_language_init_time)
        );
        println!(
            "DEBUG:     - Parser init: {}",
            format_duration(total_ast_parsing_parser_init_time)
        );
        println!(
            "DEBUG:     - Tree parsing: {}",
            format_duration(total_ast_parsing_tree_parsing_time)
        );
        println!(
            "DEBUG:     - Line map building: {}",
            format_duration(total_ast_parsing_line_map_building_time)
        );
        println!(
            "DEBUG:   Block extraction: {}",
            format_duration(total_block_extraction_time)
        );
        println!(
            "DEBUG:     - Code structure finding: {}",
            format_duration(total_block_extraction_code_structure_time)
        );
        println!(
            "DEBUG:     - Filtering: {}",
            format_duration(total_block_extraction_filtering_time)
        );
        println!(
            "DEBUG:     - Result building: {}",
            format_duration(total_block_extraction_result_building_time)
        );
        println!(
            "DEBUG:   Result building: {}",
            format_duration(remaining_time)
        );
    }

    // Rank results (skip if the exact flag is set)
    let rr_start = Instant::now();
    if debug_mode {
        if *exact {
            println!("DEBUG: Skipping result ranking due to exact flag being set");
        } else {
            println!("DEBUG: Starting result ranking...");
        }
    }

    if !*exact {
        // Only perform ranking if the exact flag is not set
        rank_search_results(&mut final_results, queries, reranker);
    }

    let rr_duration = rr_start.elapsed();
    timings.result_ranking = Some(rr_duration);

    if debug_mode {
        if *exact {
            println!(
                "DEBUG: Result ranking skipped in {}",
                format_duration(rr_duration)
            );
        } else {
            println!(
                "DEBUG: Result ranking completed in {}",
                format_duration(rr_duration)
            );
        }
    }

    // The caching step happens AFTER limiting results
    let mut skipped_count = early_skipped_count;
    let filtered_results = final_results;

    // Apply limits
    let la_start = Instant::now();
    if debug_mode {
        println!("DEBUG: Starting limit application...");
    }

    // First apply limits to the results
    let mut limited = apply_limits(filtered_results, *max_results, *max_bytes, *max_tokens);

    // Then apply caching AFTER limiting results
    let fc_start = Instant::now();

    if let Some(session_id) = effective_session {
        // Get the raw query string for caching
        let raw_query = if queries.len() > 1 {
            queries.join(" AND ")
        } else {
            queries[0].clone()
        };

        if debug_mode {
            println!(
                "DEBUG: Starting final caching for session: {session_id} with query: {raw_query}"
            );
            println!("DEBUG: Already skipped {early_skipped_count} lines in early caching");
            // Print cache contents before filtering
            if let Err(e) = cache::debug_print_cache(session_id, &raw_query) {
                eprintln!("Error printing cache: {e}");
            }
        }

        // Consult the cache only to count skipped blocks, not to filter results
        match cache::filter_results_with_cache(&limited.results, session_id, &raw_query) {
            Ok((_, cached_skipped)) => {
                if debug_mode {
                    println!("DEBUG: Final caching found {cached_skipped} cached blocks");
                    println!(
                        "DEBUG: Total skipped (early + final): {}",
                        early_skipped_count + cached_skipped
                    );
                }

                skipped_count += cached_skipped;
            }
            Err(e) => {
                // Log the error but continue without caching
                eprintln!("Error checking cache: {e}");
            }
        }

        // Update the cache with the limited results
        if let Err(e) = cache::add_results_to_cache(&limited.results, session_id, &raw_query) {
            eprintln!("Error adding results to cache: {e}");
        }

        if debug_mode {
            println!("DEBUG: Added limited results to cache before merging");
            // Print cache contents after adding new results
            if let Err(e) = cache::debug_print_cache(session_id, &raw_query) {
                eprintln!("Error printing updated cache: {e}");
            }
        }
    }

    // Set the cached blocks skipped count
    limited.cached_blocks_skipped = if skipped_count > 0 {
        Some(skipped_count)
    } else {
        None
    };

    let fc_duration = fc_start.elapsed();
    timings.final_caching = Some(fc_duration);

    if debug_mode && effective_session.is_some() {
        println!(
            "DEBUG: Final caching completed in {}",
            format_duration(fc_duration)
        );
    }

    let la_duration = la_start.elapsed();
    timings.limit_application = Some(la_duration);

    if debug_mode {
        println!(
            "DEBUG: Limit application completed in {} - Final result count: {}",
            format_duration(la_duration),
            limited.results.len()
        );
    }

    // Optional block merging - AFTER initial caching
    let bm_start = Instant::now();
    if debug_mode && !limited.results.is_empty() && !*no_merge {
        println!("DEBUG: Starting block merging...");
    }

    let final_results = if !limited.results.is_empty() && !*no_merge {
        use probe_code::search::block_merging::merge_ranked_blocks;
        let merged = merge_ranked_blocks(limited.results.clone(), *merge_threshold);

        let bm_duration = bm_start.elapsed();
        timings.block_merging = Some(bm_duration);

        if debug_mode {
            println!(
                "DEBUG: Block merging completed in {} - Merged result count: {}",
                format_duration(bm_duration),
                merged.len()
            );
        }

        // Create the merged results
        let merged_results = LimitedSearchResults {
            results: merged.clone(),
            skipped_files: limited.skipped_files,
            limits_applied: limited.limits_applied,
            cached_blocks_skipped: limited.cached_blocks_skipped,
        };

        // Update the cache with the merged results (after merging)
        if let Some(session_id) = effective_session {
            // Get the raw query string for caching
            let raw_query = if queries.len() > 1 {
                queries.join(" AND ")
            } else {
                queries[0].clone()
            };

            if let Err(e) = cache::add_results_to_cache(&merged, session_id, &raw_query) {
                eprintln!("Error adding merged results to cache: {e}");
            }

            if debug_mode {
                println!("DEBUG: Added merged results to cache after merging");
                // Print cache contents after adding merged results
                if let Err(e) = cache::debug_print_cache(session_id, &raw_query) {
                    eprintln!("Error printing updated cache: {e}");
                }
            }
        }

        merged_results
    } else {
        let bm_duration = bm_start.elapsed();
        timings.block_merging = Some(bm_duration);

        if debug_mode && !*no_merge {
            println!(
                "DEBUG: Block merging skipped (no results or disabled) - {}",
                format_duration(bm_duration)
            );
        }

        limited
    };

    // Print the session ID to the console if it was generated or provided
    if let Some(session_id) = effective_session {
        if session_was_generated {
            println!("Session ID: {session_id} (generated - ALWAYS USE IT in future sessions for caching)");
        } else {
            println!("Session ID: {session_id}");
        }
    }

    // Set total search time
    timings.total_search_time = Some(total_start.elapsed());

    // Print timing information
    print_timings(&timings);

    // Stop the timeout thread
    timeout_handle.store(true, std::sync::atomic::Ordering::SeqCst);

    Ok(final_results)
}

/// Helper function to search files using structured patterns from a QueryPlan.
/// This function uses a RegexSet approach for deterministic pattern matching
/// and collects matches by term indices. It uses the file_list_cache to get a
/// filtered list of files respecting ignore patterns.
///
/// # Arguments
/// * `root_path_str` - The base path to search in (may be a special format such as "go:github.com/user/repo")
/// * `_plan` - The parsed query plan (currently unused in this function)
/// * `patterns` - The generated regex patterns with their term indices
/// * `custom_ignores` - Custom ignore patterns
/// * `allow_tests` - Whether to include test files
/// * `language` - Optional language filter restricting which files are searched
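///
/// # Example
///
/// A minimal sketch of a call site, assuming a `QueryPlan` named `plan` built
/// elsewhere (e.g., via `create_query_plan`); the path, pattern, and term
/// index here are illustrative only:
///
/// ```ignore
/// use std::collections::HashSet;
/// use std::path::Path;
///
/// // One regex pattern attributed to term index 0.
/// let patterns = vec![("search_runner".to_string(), HashSet::from([0]))];
/// let matches = search_with_structured_patterns(
///     Path::new("."),
///     &plan,
///     &patterns,
///     &[],          // no custom ignore patterns
///     false,        // exclude test files
///     Some("rust"), // only search Rust files
/// )?;
/// // matches: file path -> (term index -> set of 1-based line numbers)
/// ```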
pub fn search_with_structured_patterns(
    root_path_str: &Path,
    _plan: &QueryPlan,
    patterns: &[(String, HashSet<usize>)],
    custom_ignores: &[String],
    allow_tests: bool,
    language: Option<&str>,
) -> Result<HashMap<PathBuf, HashMap<usize, HashSet<usize>>>> {
    use rayon::prelude::*;
    use regex::RegexSet;
    use std::sync::{Arc, Mutex};

    let debug_mode = std::env::var("DEBUG").unwrap_or_default() == "1";

    // Resolve the path if it's a special format (e.g., "go:github.com/user/repo")
    let root_path = if let Some(path_str) = root_path_str.to_str() {
        match resolve_path(path_str) {
            Ok(resolved_path) => {
                if debug_mode {
                    println!(
                        "DEBUG: Resolved path '{}' to '{}'",
                        path_str,
                        resolved_path.display()
                    );
                }
                resolved_path
            }
            Err(err) => {
                if debug_mode {
                    println!("DEBUG: Failed to resolve path '{path_str}': {err}");
                }
                // Fall back to the original path
                root_path_str.to_path_buf()
            }
        }
    } else {
        // If we can't convert the path to a string, use it as is
        root_path_str.to_path_buf()
    };

    let search_start = Instant::now();

    // Step 1: Create RegexSet for deterministic pattern matching
    if debug_mode {
        println!("DEBUG: Starting parallel structured pattern search with RegexSet...");
        println!("DEBUG: Creating RegexSet from {} patterns", patterns.len());
    }

    // Extract the patterns for the RegexSet, prefixing (?i) to force
    // case-insensitive matching
    let pattern_strings: Vec<String> = patterns.iter().map(|(p, _)| format!("(?i){p}")).collect();

    // Create a RegexSet for deterministic matching
    let regex_set = RegexSet::new(&pattern_strings)?;
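    // Note: a RegexSet only reports *which* patterns matched a given input,
    // not *where* they matched; the individual regexes compiled further down
    // are what let each matching line be attributed to its patterns.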

    // Create a mapping from pattern index to term indices
    let pattern_to_terms: Vec<HashSet<usize>> =
        patterns.iter().map(|(_, terms)| terms.clone()).collect();

    if debug_mode {
        println!("DEBUG: RegexSet created successfully");
    }

    // Step 2: Get filtered file list from cache
    if debug_mode {
        println!("DEBUG: Getting filtered file list from cache");
        println!("DEBUG: Custom ignore patterns: {custom_ignores:?}");
    }

    // Use file_list_cache to get a filtered list of files, with language filtering if specified
    let file_list = file_list_cache::get_file_list_by_language(
        &root_path,
        allow_tests,
        custom_ignores,
        language,
    )?;

    if debug_mode {
        println!("DEBUG: Got {} files from cache", file_list.files.len());
        println!("DEBUG: Starting parallel file processing with RegexSet");
    }

    // Step 3: Process files in parallel
    // Create thread-safe shared resources
    let regex_set = Arc::new(regex_set);
    let pattern_to_terms = Arc::new(pattern_to_terms);
    let file_term_maps = Arc::new(Mutex::new(HashMap::new()));

    // Also create individual regexes so each matching line can be attributed
    // to a specific pattern. Compilation cannot fail here: every pattern was
    // already validated when the RegexSet was constructed above.
    let individual_regexes: Vec<regex::Regex> = pattern_strings
        .iter()
        .map(|p| regex::Regex::new(p).expect("pattern already validated by RegexSet::new"))
        .collect();
    let individual_regexes = Arc::new(individual_regexes);

    file_list.files.par_iter().for_each(|file_path| {
        let regex_set = Arc::clone(&regex_set);
        let pattern_to_terms = Arc::clone(&pattern_to_terms);
        let individual_regexes = Arc::clone(&individual_regexes);

        // Search file with RegexSet for deterministic matching
        match search_file_with_regex_set(
            file_path,
            &regex_set,
            &individual_regexes,
            &pattern_to_terms,
        ) {
            Ok(term_map) => {
                if !term_map.is_empty() {
                    if debug_mode {
                        println!(
                            "DEBUG: File {:?} matched patterns with {} term indices",
                            file_path,
                            term_map.len()
                        );
                    }

                    // Add to results with proper locking
                    let mut maps = file_term_maps.lock().unwrap();
                    maps.insert(file_path.clone(), term_map);
                }
            }
            Err(e) => {
                if debug_mode {
                    println!("DEBUG: Error searching file {file_path:?}: {e:?}");
                }
            }
        }
    });

    let total_duration = search_start.elapsed();

    // Extract the results from the Arc<Mutex<>>. This is safe: the parallel
    // iteration above has completed, so this is the only remaining reference
    // to the map.
    let result = Arc::try_unwrap(file_term_maps)
        .expect("no other references to file_term_maps remain")
        .into_inner()
        .expect("file_term_maps mutex poisoned");

    if debug_mode {
        println!(
            "DEBUG: Parallel search completed in {} - Found matches in {} files",
            format_duration(total_duration),
            result.len()
        );
    }

    Ok(result)
}

/// Helper function to search a file with a RegexSet for deterministic pattern matching.
/// This function searches a file for matches against a RegexSet and individual regexes
/// to map the matches to their corresponding term indices, returning a map from
/// term index to the set of 1-based line numbers on which that term matched.
///
/// Using RegexSet ensures deterministic pattern matching across multiple runs,
/// avoiding the non-deterministic behavior of capturing groups in a combined regex.
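///
/// A minimal sketch of the underlying two-phase idea (the patterns here are
/// illustrative): the set reports which patterns matched, and the per-pattern
/// regexes attribute the match.
///
/// ```ignore
/// let set = regex::RegexSet::new(["foo", "bar"])?;
/// let hits = set.matches("foo and bar");
/// // hits.iter() yields every matched pattern index (here 0 and 1),
/// // independent of any ordering inside a combined regex.
/// ```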
fn search_file_with_regex_set(
    file_path: &Path,
    regex_set: &regex::RegexSet,
    individual_regexes: &[regex::Regex],
    pattern_to_terms: &[HashSet<usize>],
) -> Result<HashMap<usize, HashSet<usize>>> {
    let mut term_map = HashMap::new();
    let debug_mode = std::env::var("DEBUG").unwrap_or_default() == "1";

    // Maximum file size we are willing to scan (1 MiB)
    const MAX_FILE_SIZE: u64 = 1024 * 1024;

    // Check file metadata and resolve symlinks before reading
    let resolved_path = match std::fs::canonicalize(file_path) {
        Ok(path) => path,
        Err(e) => {
            if debug_mode {
                println!("DEBUG: Error resolving path for {file_path:?}: {e:?}");
            }
            return Err(anyhow::anyhow!("Failed to resolve file path: {}", e));
        }
    };

    // Get file metadata to check size and file type
    let metadata = match std::fs::metadata(&resolved_path) {
        Ok(meta) => meta,
        Err(e) => {
            if debug_mode {
                println!("DEBUG: Error getting metadata for {resolved_path:?}: {e:?}");
            }
            return Err(anyhow::anyhow!("Failed to get file metadata: {}", e));
        }
    };

    // Check if the file is too large
    if metadata.len() > MAX_FILE_SIZE {
        if debug_mode {
            println!(
                "DEBUG: Skipping file {:?} - file too large ({} bytes > {} bytes limit)",
                resolved_path,
                metadata.len(),
                MAX_FILE_SIZE
            );
        }
        return Err(anyhow::anyhow!(
            "File too large: {} bytes (limit: {} bytes)",
            metadata.len(),
            MAX_FILE_SIZE
        ));
    }

    // Read the file content with proper error handling
    let content = match std::fs::read_to_string(&resolved_path) {
        Ok(content) => content,
        Err(e) => {
            if debug_mode {
                println!(
                    "DEBUG: Error reading file {:?}: {:?} (size: {} bytes)",
                    resolved_path,
                    e,
                    metadata.len()
                );
            }
            return Err(anyhow::anyhow!("Failed to read file: {}", e));
        }
    };

    // Process each line, mapping matched terms to 1-based line numbers
    for (line_number, line) in content.lines().enumerate() {
        // Skip lines that are too long (typically minified or generated
        // content that would dominate regex matching time)
        if line.len() > 2000 {
            if debug_mode {
                println!(
                    "DEBUG: Skipping line {} in file {:?} - line too long ({} characters)",
                    line_number + 1,
                    file_path,
                    line.len()
                );
            }
            continue;
        }

        // First check whether any pattern matches using the RegexSet
        let matches = regex_set.matches(line);
        if matches.matched_any() {
            // For each matched pattern, confirm the match with the individual
            // regex before attributing the line to the pattern's terms
            for pattern_idx in matches.iter() {
                if individual_regexes[pattern_idx].is_match(line) {
                    // Record this line for all terms associated with this pattern
                    for &term_idx in &pattern_to_terms[pattern_idx] {
                        term_map
                            .entry(term_idx)
                            .or_insert_with(HashSet::new)
                            .insert(line_number + 1); // Convert to 1-based line numbers
                    }
                }
            }
        }
    }

    Ok(term_map)
}

/// Normalize language aliases to their canonical names.
/// This function maps language aliases like "ts" to their canonical names like "typescript".
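///
/// For instance (illustrative, matching the alias table below):
///
/// ```ignore
/// assert_eq!(normalize_language_alias("TSX"), "typescript");
/// assert_eq!(normalize_language_alias("rust"), "rust"); // already canonical
/// ```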
fn normalize_language_alias(lang: &str) -> &str {
    match lang.to_lowercase().as_str() {
        "rs" => "rust",
        "js" | "jsx" => "javascript",
        "ts" | "tsx" => "typescript",
        "py" => "python",
        "h" => "c",
        "cc" | "cxx" | "hpp" | "hxx" => "cpp",
        "rb" => "ruby",
        "cs" => "csharp",
        _ => lang, // Return the original language if no alias is found
    }
}
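
#[cfg(test)]
mod search_runner_smoke_tests {
    use super::*;

    // Illustrative smoke tests: the module name, temp file name, and patterns
    // below are made up for this sketch, not part of the existing test suite.
    // They exercise the two private helpers above under minimal assumptions.

    #[test]
    fn normalizes_common_aliases() {
        assert_eq!(normalize_language_alias("ts"), "typescript");
        assert_eq!(normalize_language_alias("CC"), "cpp");
        assert_eq!(normalize_language_alias("go"), "go"); // unknown aliases pass through
    }

    #[test]
    fn maps_matches_to_term_indices() -> Result<()> {
        // One pattern attributed to term index 0.
        let set = regex::RegexSet::new(["alpha"])?;
        let regexes = vec![regex::Regex::new("alpha")?];
        let terms = vec![HashSet::from([0usize])];

        let path = std::env::temp_dir().join("probe_search_runner_smoke.txt");
        std::fs::write(&path, "alpha\nbeta\nalpha beta\n")?;

        let term_map = search_file_with_regex_set(&path, &set, &regexes, &terms)?;
        assert_eq!(term_map[&0], HashSet::from([1, 3])); // 1-based line numbers

        std::fs::remove_file(&path).ok();
        Ok(())
    }
}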