probe_code/search/
file_processing.rs

1use anyhow::{Context, Result};
2use rayon::prelude::*;
3use std::collections::{HashMap, HashSet};
4use std::fs;
5use std::path::Path;
6use std::sync::{Arc, Mutex};
7use std::time::{Duration, Instant};
8use tree_sitter;
9
10use probe_code::language::{is_test_file, parse_file_for_code_blocks};
11use probe_code::models::SearchResult;
12use probe_code::ranking;
13use probe_code::search::tokenization;
14
/// Structure to hold timing information for file processing stages
///
/// Every field is `Option<Duration>` so stages that were not measured stay
/// `None`. Deriving `Default` starts all stages at `None`, so callers no
/// longer need to hand-initialize every field; `Debug` aids diagnostics.
#[derive(Debug, Default)]
pub struct FileProcessingTimings {
    /// Time spent reading the file contents from disk.
    pub file_io: Option<Duration>,

    // AST parsing timings
    /// Total time of the AST parsing phase (covers the sub-steps below).
    pub ast_parsing: Option<Duration>,
    /// Time to look up the language implementation for the file extension.
    pub ast_parsing_language_init: Option<Duration>,
    /// Time to create and configure the tree-sitter parser.
    pub ast_parsing_parser_init: Option<Duration>,
    /// Time to parse the file content into a syntax tree.
    pub ast_parsing_tree_parsing: Option<Duration>,
    /// Time to extract code blocks for the matched lines (approximate).
    pub ast_parsing_line_map_building: Option<Duration>,

    // Block extraction timings
    /// Total time of the block extraction phase (covers the sub-steps below).
    pub block_extraction: Option<Duration>,
    /// Time spent computing each block's boundaries and code text.
    pub block_extraction_code_structure: Option<Duration>,
    /// Time spent filtering blocks against the query plan.
    pub block_extraction_filtering: Option<Duration>,
    /// Time spent building results for blocks that pass filtering.
    pub block_extraction_result_building: Option<Duration>,

    // Detailed result building timings
    /// Time spent matching query terms against block tokens.
    pub result_building_term_matching: Option<Duration>,
    /// Time spent on compound-word matching of query terms.
    pub result_building_compound_processing: Option<Duration>,
    /// Time spent mapping matched line numbers to term indices.
    pub result_building_line_matching: Option<Duration>,
    /// Time spent constructing `SearchResult` values.
    pub result_building_result_creation: Option<Duration>,
    /// Time spent pushing results into shared (mutex-guarded) state.
    pub result_building_synchronization: Option<Duration>,
    /// Time for the uncovered-lines stage (set later in this file — the
    /// stage itself is not visible in this excerpt).
    pub result_building_uncovered_lines: Option<Duration>,
}
40
/// Parameters for file processing
///
/// Borrowed inputs describing one file plus the query context needed to
/// extract and filter matching code blocks from it.
pub struct FileProcessingParams<'a> {
    /// Path of the file to process.
    pub path: &'a Path,
    /// Line numbers of this file's search hits; passed to AST block parsing
    /// to select which blocks to extract.
    pub line_numbers: &'a HashSet<usize>,
    /// Whether test files/test code may be included (forwarded to parsing).
    pub allow_tests: bool,
    /// Map from query-term index to the set of line numbers where that term
    /// matched in this file.
    pub term_matches: &'a HashMap<usize, HashSet<usize>>,
    // Total number of queries; currently unused in this module.
    #[allow(dead_code)]
    pub num_queries: usize,
    // Indices of queries that matched by filename; currently unused here.
    #[allow(dead_code)]
    pub filename_matched_queries: HashSet<usize>,
    /// Per-query term pairs; only the second element of each pair is used
    /// for matching here (presumably the processed form — confirm).
    pub queries_terms: &'a [Vec<(String, String)>],
    /// Optional pre-tokenized query terms; when present they take precedence
    /// over `queries_terms` when collecting the file's query-term set.
    pub preprocessed_queries: Option<&'a [Vec<String>]>,
    /// Parsed query plan (AST, term indices, excluded terms, exact flag)
    /// used to decide which blocks to include.
    pub query_plan: &'a crate::search::query::QueryPlan,

    // When set, result merging is skipped; currently unused in this module.
    #[allow(dead_code)]
    pub no_merge: bool,
}
58
59/// Evaluate whether a block of lines satisfies a complex AST query
60/// using the 'evaluate' method in `elastic_query::Expr`. We assume
61/// the 'term_matches' map uses the same indexing as the AST's QueryPlan term_indices.
62#[allow(dead_code)]
63pub fn filter_code_block_with_ast(
64    block_lines: (usize, usize),
65    term_matches: &HashMap<usize, HashSet<usize>>,
66    plan: &crate::search::query::QueryPlan,
67    debug_mode: bool,
68) -> bool {
69    // Gather matched term indices for this block
70    let mut matched_terms = HashSet::new();
71    for (&term_idx, lines) in term_matches {
72        if lines
73            .iter()
74            .any(|&l| l >= block_lines.0 && l <= block_lines.1)
75        {
76            matched_terms.insert(term_idx);
77        }
78    }
79
80    if debug_mode {
81        println!(
82            "DEBUG: Checking for terms in block {}-{}",
83            block_lines.0, block_lines.1
84        );
85        println!("DEBUG: Matched terms: {matched_terms:?}");
86        println!("DEBUG: Term indices: {:?}", plan.term_indices);
87        println!("DEBUG: Excluded terms: {:?}", plan.excluded_terms);
88        println!("DEBUG: AST: {:?}", plan.ast);
89
90        // Add detailed information about which exact keywords matched
91        println!("DEBUG: ===== MATCHED KEYWORDS DETAILS =====");
92        let mut matched_keywords = Vec::new();
93        for (term, &idx) in &plan.term_indices {
94            if matched_terms.contains(&idx) {
95                matched_keywords.push(term);
96                println!(
97                    "DEBUG: Keyword '{}' matched in block {}-{}",
98                    term, block_lines.0, block_lines.1
99                );
100            }
101        }
102        if matched_keywords.is_empty() {
103            println!("DEBUG: No keywords matched in this block");
104        } else {
105            println!("DEBUG: All matched keywords: {matched_keywords:?}");
106        }
107        println!("DEBUG: ===================================");
108    }
109
110    // Check if we have any matches at all
111    if matched_terms.is_empty() {
112        if debug_mode {
113            println!(
114                "DEBUG: No matched terms in block {}-{}, returning false",
115                block_lines.0, block_lines.1
116            );
117        }
118        return false;
119    }
120
121    // Use the AST evaluation directly
122    if debug_mode {
123        println!("DEBUG: ===== AST EVALUATION =====");
124        println!("DEBUG: Matched terms: {matched_terms:?}");
125        println!("DEBUG: Term indices: {:?}", plan.term_indices);
126    }
127
128    // Use the evaluate function from the elastic query module
129    let result = plan.ast.evaluate(&matched_terms, &plan.term_indices, false);
130
131    if debug_mode {
132        println!("DEBUG: ===== EVALUATION RESULT =====");
133        println!("DEBUG: AST evaluation result: {result}");
134        println!(
135            "DEBUG: Block {}-{} will be {}",
136            block_lines.0,
137            block_lines.1,
138            if result { "INCLUDED" } else { "EXCLUDED" }
139        );
140        println!("DEBUG: ============================");
141    }
142
143    let decision = result;
144
145    if debug_mode {
146        println!(
147            "DEBUG: Block {}-{} matched terms: {:?}",
148            block_lines.0, block_lines.1, matched_terms
149        );
150        println!("DEBUG: AST evaluation result: {decision}");
151    }
152
153    if debug_mode {
154        println!(
155            "DEBUG: filter_code_block_with_ast => lines {block_lines:?} => matched {matched_terms:?}, decision={decision}"
156        );
157    }
158    decision
159}
160
161/// Evaluate whether a tokenized block satisfies a complex AST query
162/// using the 'evaluate' method in `elastic_query::Expr`.
163pub fn filter_tokenized_block(
164    tokenized_content: &[String],
165    term_indices: &HashMap<String, usize>,
166    plan: &crate::search::query::QueryPlan,
167    debug_mode: bool,
168) -> bool {
169    // Create a set of matched term indices based on tokenized content
170    let mut matched_terms = HashSet::new();
171
172    // For each token in the tokenized content, check if it's in the term_indices
173    for token in tokenized_content {
174        if let Some(&idx) = term_indices.get(token) {
175            matched_terms.insert(idx);
176        }
177    }
178
179    // Special handling for compound words like "whitelist"
180    // Check if any term in the plan is a compound of tokens in the content
181    for (term, &idx) in &plan.term_indices {
182        // Skip if we already matched this term
183        if matched_terms.contains(&idx) {
184            continue;
185        }
186
187        // Check if this term is a special case that should be treated as a single token
188        if crate::search::tokenization::is_special_case(term) {
189            // If the tokenized content contains this special case term, add it to matched terms
190            if tokenized_content.contains(&term.to_lowercase()) {
191                matched_terms.insert(idx);
192                if debug_mode {
193                    println!("DEBUG: Special case term '{term}' matched in tokenized content");
194                }
195            }
196        }
197    }
198
199    if debug_mode {
200        println!("DEBUG: Checking for terms in tokenized block");
201        println!("DEBUG: Tokenized content: {tokenized_content:?}");
202        println!("DEBUG: Matched terms: {matched_terms:?}");
203        println!("DEBUG: Term indices: {:?}", plan.term_indices);
204        println!("DEBUG: Excluded terms: {:?}", plan.excluded_terms);
205        println!("DEBUG: AST: {:?}", plan.ast);
206
207        // Add detailed information about which exact keywords matched
208        println!("DEBUG: ===== MATCHED KEYWORDS DETAILS =====");
209        let mut matched_keywords = Vec::new();
210        for (term, &idx) in &plan.term_indices {
211            if matched_terms.contains(&idx) {
212                matched_keywords.push(term);
213                println!("DEBUG: Keyword '{term}' matched in tokenized block");
214            }
215        }
216        if matched_keywords.is_empty() {
217            println!("DEBUG: No keywords matched in this block");
218        } else {
219            println!("DEBUG: All matched keywords: {matched_keywords:?}");
220        }
221        println!("DEBUG: ===================================");
222    }
223
224    // Check if we have any matches at all
225    if matched_terms.is_empty() {
226        if debug_mode {
227            println!("DEBUG: No matched terms in tokenized block, returning false");
228        }
229        return false;
230    }
231
232    // Use the AST evaluation directly
233    if debug_mode {
234        println!("DEBUG: ===== AST EVALUATION =====");
235        println!("DEBUG: Matched terms: {matched_terms:?}");
236        println!("DEBUG: Term indices: {:?}", plan.term_indices);
237    }
238
239    // Use the evaluate function from the elastic query module
240    let result = plan.ast.evaluate(&matched_terms, &plan.term_indices, false);
241
242    if debug_mode {
243        println!("DEBUG: ===== EVALUATION RESULT =====");
244        println!("DEBUG: AST evaluation result: {result}");
245        println!(
246            "DEBUG: Block will be {}",
247            if result { "INCLUDED" } else { "EXCLUDED" }
248        );
249        println!("DEBUG: ============================");
250    }
251
252    let decision = result;
253
254    if debug_mode {
255        println!("DEBUG: Tokenized block matched terms: {matched_terms:?}");
256        println!("DEBUG: AST evaluation result: {decision}");
257        println!("DEBUG: filter_tokenized_block => matched {matched_terms:?}, decision={decision}");
258    }
259
260    decision
261}
262
/// Heuristically classifies a single source line into a coarse node type
/// when AST-based classification is unavailable.
///
/// Checks run in priority order — comment, function, class-like, import,
/// variable declaration, control flow — and the first category that matches
/// wins; anything else falls back to `"code"`.
fn determine_fallback_node_type(line: &str, extension: Option<&str>) -> String {
    let t = line.trim();

    // Comment markers, including '#' for Python/Ruby only.
    let is_comment = t.starts_with("//")
        || t.starts_with("/*")
        || t.starts_with("*")
        || (t.starts_with("#") && extension.is_some_and(|e| e == "py" || e == "rb"))
        || t.starts_with("'''")
        || t.starts_with("\"\"\"");
    if is_comment {
        return String::from("comment");
    }

    let lower = t.to_lowercase();

    // Function definitions, keyed per language extension.
    let is_function = match extension {
        Some("rs") => t.contains("fn ") && (t.contains('(') || t.contains(')')),
        Some("go") => t.contains("func "),
        Some("js") | Some("ts") => t.contains("function "),
        Some("py") => lower.contains("def "),
        Some("java") | Some("kt") => t.contains("public") && t.contains("void"),
        _ => false,
    };
    if is_function {
        return String::from("function");
    }

    // Class-like definitions: classes, interfaces, enums anywhere; structs
    // only in languages that actually have them.
    let is_class_like = t.contains("class ")
        || t.contains("interface ")
        || t.contains("enum ")
        || (t.contains("struct ")
            && matches!(extension, Some("rs") | Some("go") | Some("c") | Some("cpp")))
        || (extension == Some("go") && t.contains("type ") && t.contains("struct"));
    if is_class_like {
        return String::from("class");
    }

    // Import / include statements across languages.
    if ["import ", "from ", "require ", "use ", "#include "]
        .iter()
        .any(|p| t.starts_with(p))
    {
        return String::from("import");
    }

    // Variable declarations: explicit keywords, or a bare assignment
    // (excluding '==' comparisons and '=>' arrows).
    let is_var = ["let ", "var ", "const "].iter().any(|p| t.starts_with(p))
        || (t.contains("=") && !t.contains("==") && !t.contains("=>"));
    if is_var {
        return String::from("variable_declaration");
    }

    // Control-flow statements.
    if ["if ", "for ", "while ", "switch ", "match "]
        .iter()
        .any(|p| t.starts_with(p))
    {
        return String::from("control_flow");
    }

    // Nothing recognized: generic code line.
    String::from("code")
}
329/// Main function for processing a file with matched lines
330pub fn process_file_with_results(
331    params: &FileProcessingParams,
332) -> Result<(Vec<SearchResult>, FileProcessingTimings)> {
333    let mut timings = FileProcessingTimings {
334        file_io: None,
335
336        // AST parsing timings
337        ast_parsing: None,
338        ast_parsing_language_init: None,
339        ast_parsing_parser_init: None,
340        ast_parsing_tree_parsing: None,
341        ast_parsing_line_map_building: None,
342
343        // Block extraction timings
344        block_extraction: None,
345        block_extraction_code_structure: None,
346        block_extraction_filtering: None,
347        block_extraction_result_building: None,
348
349        // Detailed result building timings
350        result_building_term_matching: None,
351        result_building_compound_processing: None,
352        result_building_line_matching: None,
353        result_building_result_creation: None,
354        result_building_synchronization: None,
355        result_building_uncovered_lines: None,
356    };
357
358    // Measure file I/O time
359    let file_io_start = Instant::now();
360    let content = fs::read_to_string(params.path)
361        .context(format!("Failed to read file: {:?}", params.path))?;
362    let file_io_duration = file_io_start.elapsed();
363    timings.file_io = Some(file_io_duration);
364
365    let extension = params
366        .path
367        .extension()
368        .and_then(|ext| ext.to_str())
369        .unwrap_or("");
370
371    // Get debug mode setting
372    let debug_mode = std::env::var("DEBUG").unwrap_or_default() == "1";
373
374    // Filter out lines longer than 500 characters
375    let lines: Vec<&str> = content
376        .lines()
377        .enumerate()
378        .map(|(i, line)| {
379            if line.len() > 500 {
380                if debug_mode {
381                    println!(
382                        "DEBUG: Skipping line {} with length {} (exceeds 500 character limit)",
383                        i + 1,
384                        line.len()
385                    );
386                }
387                ""
388            } else {
389                line
390            }
391        })
392        .collect();
393    let mut results = Vec::new();
394    let mut covered_lines = HashSet::new();
395    // We now use params.path.to_string_lossy() directly for tokenization
396
397    // Prepare query terms once for the entire file
398    let query_terms: Vec<String> = if let Some(prep) = params.preprocessed_queries {
399        prep.iter().flat_map(|v| v.iter().cloned()).collect()
400    } else {
401        params
402            .queries_terms
403            .iter()
404            .flat_map(|pairs| pairs.iter().map(|(_, s)| s.clone()))
405            .collect()
406    };
407    let unique_query_terms: HashSet<String> = query_terms.into_iter().collect();
408
409    if debug_mode {
410        println!("DEBUG: Processing file: {:?}", params.path);
411        println!("DEBUG:   matched lines: {:?}", params.line_numbers);
412        println!("DEBUG:   file I/O time: {file_io_duration:?}");
413    }
414
415    // Measure AST parsing time with sub-steps
416    let ast_parsing_start = Instant::now();
417
418    // Measure language initialization time
419    let language_init_start = Instant::now();
420    let language_impl = crate::language::factory::get_language_impl(extension);
421    let language_init_duration = language_init_start.elapsed();
422    timings.ast_parsing_language_init = Some(language_init_duration);
423
424    // Measure parser initialization time
425    let parser_init_start = Instant::now();
426    let mut parser = tree_sitter::Parser::new();
427    if let Some(lang_impl) = &language_impl {
428        let _ = parser.set_language(&lang_impl.get_tree_sitter_language());
429    }
430    let parser_init_duration = parser_init_start.elapsed();
431    timings.ast_parsing_parser_init = Some(parser_init_duration);
432
433    // Measure tree parsing time
434    let tree_parsing_start = Instant::now();
435    let file_path = params.path.to_string_lossy();
436    let mut cache_key = String::with_capacity(file_path.len() + extension.len() + 1);
437    cache_key.push_str(&file_path);
438    cache_key.push('_');
439    cache_key.push_str(extension);
440
441    let _ = if language_impl.is_some() {
442        crate::language::tree_cache::get_or_parse_tree(&cache_key, &content, &mut parser).ok()
443    } else {
444        None
445    };
446    let tree_parsing_duration = tree_parsing_start.elapsed();
447    timings.ast_parsing_tree_parsing = Some(tree_parsing_duration);
448
449    // Measure line map building time (this is an approximation since we can't directly measure it)
450    let line_map_building_start = Instant::now();
451
452    // Call the original parse_file_for_code_blocks function
453    let code_blocks_result = parse_file_for_code_blocks(
454        &content,
455        extension,
456        params.line_numbers,
457        params.allow_tests,
458        Some(params.term_matches),
459    );
460
461    let line_map_building_duration = line_map_building_start.elapsed();
462    timings.ast_parsing_line_map_building = Some(line_map_building_duration);
463
464    // Calculate total AST parsing time
465    let ast_parsing_duration = ast_parsing_start.elapsed();
466    timings.ast_parsing = Some(ast_parsing_duration);
467
468    if debug_mode {
469        println!("DEBUG:   AST parsing time: {ast_parsing_duration:?}");
470        println!("DEBUG:     - Language init: {language_init_duration:?}");
471        println!("DEBUG:     - Parser init: {parser_init_duration:?}");
472        println!("DEBUG:     - Tree parsing: {tree_parsing_duration:?}");
473        println!("DEBUG:     - Line map building: {line_map_building_duration:?}");
474    }
475
476    if let Ok(code_blocks) = code_blocks_result {
477        if debug_mode {
478            println!("DEBUG: AST parsing successful");
479            println!("DEBUG:   Found {} code blocks", code_blocks.len());
480
481            for (i, block) in code_blocks.iter().enumerate() {
482                println!(
483                    "DEBUG:   Block {}: type={}, lines={}-{}",
484                    i + 1,
485                    block.node_type,
486                    block.start_row + 1,
487                    block.end_row + 1
488                );
489            }
490        }
491
492        let file_id = params.path.to_string_lossy().to_string();
493
494        // Measure block extraction time with sub-steps
495        let block_extraction_start = Instant::now();
496
497        // Measure code structure finding time
498        let _code_structure_start = Instant::now();
499        let code_structure_duration = Arc::new(Mutex::new(Duration::new(0, 0)));
500        let filtering_duration = Arc::new(Mutex::new(Duration::new(0, 0)));
501        let result_building_duration = Arc::new(Mutex::new(Duration::new(0, 0)));
502
503        // Track detailed result building timings
504        let term_matching_duration = Arc::new(Mutex::new(Duration::new(0, 0)));
505        let compound_processing_duration = Arc::new(Mutex::new(Duration::new(0, 0)));
506        let line_matching_duration = Arc::new(Mutex::new(Duration::new(0, 0)));
507        let result_creation_duration = Arc::new(Mutex::new(Duration::new(0, 0)));
508        let synchronization_duration = Arc::new(Mutex::new(Duration::new(0, 0)));
509
510        // Prepare shared resources for parallel processing
511        let shared_results = Arc::new(Mutex::new(Vec::new()));
512        let shared_covered_lines = Arc::new(Mutex::new(HashSet::new()));
513
514        // Process blocks in parallel
515        code_blocks
516            .par_iter()
517            .enumerate()
518            .for_each(|(block_idx, block)| {
519                // Start measuring code structure finding time for this block
520                let block_start = Instant::now();
521
522                let start_line = block.start_row + 1;
523                let end_line = block.end_row + 1;
524
525                let (final_start_line, final_end_line, is_nested_struct) = if extension == "go"
526                    && block.node_type == "struct_type"
527                    && block
528                        .parent_node_type
529                        .as_ref()
530                        .is_some_and(|p| p == "function_declaration" || p == "method_declaration")
531                {
532                    if let Some(ps) = block.parent_start_row {
533                        if let Some(pe) = block.parent_end_row {
534                            (ps + 1, pe + 1, true)
535                        } else {
536                            (start_line, end_line, false)
537                        }
538                    } else {
539                        (start_line, end_line, false)
540                    }
541                } else {
542                    (start_line, end_line, false)
543                };
544
545                let full_code = if final_start_line > 0 && final_end_line <= lines.len() {
546                    // Skip empty lines (which were originally too long)
547                    lines[final_start_line - 1..final_end_line]
548                        .to_vec()
549                        .join("\n")
550                } else {
551                    "".to_string()
552                };
553
554                // End code structure finding time for this block
555                let block_duration = block_start.elapsed();
556                {
557                    let mut duration = code_structure_duration.lock().unwrap();
558                    *duration += block_duration;
559                }
560
561                // Start measuring term matching time
562                let term_matching_start = Instant::now();
563
564                // Early tokenization with full path prepended
565                let block_terms = ranking::preprocess_text_with_filename(
566                    &full_code,
567                    &params.path.to_string_lossy(),
568                );
569
570                // End term matching time measurement
571                let term_matching_block_duration = term_matching_start.elapsed();
572                {
573                    let mut duration = term_matching_duration.lock().unwrap();
574                    *duration += term_matching_block_duration;
575                }
576
577                // Start measuring filtering time
578                let filtering_start = Instant::now();
579                // Early filtering using tokenized content
580                let should_include = {
581                    if debug_mode {
582                        println!(
583                            "DEBUG: Using filter_tokenized_block for block {final_start_line}-{final_end_line}"
584                        );
585                    }
586
587                    // Skip tokenization and evaluation when exact flag is enabled
588                    if params.query_plan.exact {
589                        // In exact mode, we already matched the lines in the file
590                        // so we should include this block without re-evaluating
591                        if debug_mode {
592                            println!(
593                                "DEBUG: Exact mode enabled, skipping tokenization and evaluation for block {final_start_line}-{final_end_line}"
594                            );
595                        }
596                        true
597                    } else {
598                        // Use the AST evaluation directly to ensure correct handling of complex queries
599                        let result = filter_tokenized_block(
600                            &block_terms,
601                            &params.query_plan.term_indices,
602                            params.query_plan,
603                            debug_mode,
604                        );
605
606                        if debug_mode {
607                            println!(
608                                "DEBUG: Block {final_start_line}-{final_end_line} filter result: {result}"
609                            );
610                        }
611
612                        result
613                    }
614                };
615
616                // End filtering time measurement
617                let filtering_block_duration = filtering_start.elapsed();
618                {
619                    let mut duration = filtering_duration.lock().unwrap();
620                    *duration += filtering_block_duration;
621                }
622
623                if debug_mode {
624                    println!(
625                        "DEBUG: Block lines {final_start_line}-{final_end_line} => should_include={should_include}"
626                    );
627                }
628
629                // Mark lines as covered
630                {
631                    let mut covered = shared_covered_lines.lock().unwrap();
632                    for line_num in final_start_line..=final_end_line {
633                        covered.insert(line_num);
634                    }
635                }
636
637                if should_include {
638                    // Start measuring result building time
639                    let result_building_start = Instant::now();
640
641                    // Start measuring term matching time
642                    let direct_matches_start = Instant::now();
643
644                    // Calculate metrics using the already tokenized content
645                    let direct_matches: HashSet<&String> = block_terms
646                        .iter()
647                        .filter(|t| unique_query_terms.contains(*t))
648                        .collect();
649
650                    let direct_matches_duration = direct_matches_start.elapsed();
651                    {
652                        let mut duration = term_matching_duration.lock().unwrap();
653                        *duration += direct_matches_duration;
654                    }
655
656                    // Start measuring compound word processing time
657                    let compound_start = Instant::now();
658
659                    let mut compound_matches = HashSet::new();
660                    // Load vocabulary once before the loop
661                    let vocabulary = tokenization::load_vocabulary();
662                    for qterm in &unique_query_terms {
663                        if block_terms.iter().any(|bt| bt == qterm) {
664                            continue;
665                        }
666                        let parts = tokenization::split_compound_word(qterm, vocabulary);
667                        if parts.len() > 1 && parts.iter().all(|part| block_terms.contains(part)) {
668                            compound_matches.insert(qterm);
669                        }
670                    }
671
672                    let compound_duration = compound_start.elapsed();
673                    {
674                        let mut duration = compound_processing_duration.lock().unwrap();
675                        *duration += compound_duration;
676                    }
677
678                    let block_unique_terms = direct_matches.len() + compound_matches.len();
679                    let block_total_matches = direct_matches.len() + compound_matches.len();
680
681                    // Collect matched keywords
682                    let mut matched_keywords = Vec::new();
683
684                    // Add direct matches
685                    matched_keywords.extend(direct_matches.iter().map(|s| (*s).clone()));
686
687                    // Add compound matches
688                    matched_keywords.extend(compound_matches.iter().map(|s| (*s).clone()));
689
690                    // Start measuring line matching time
691                    let line_matching_start = Instant::now();
692
693                    // Get the matched term indices for this block
694                    let mut matched_term_indices = HashSet::new();
695                    for (&term_idx, lines) in params.term_matches {
696                        if lines
697                            .iter()
698                            .any(|&l| l >= final_start_line && l <= final_end_line)
699                        {
700                            matched_term_indices.insert(term_idx);
701                        }
702                    }
703
704                    let line_matching_duration_value = line_matching_start.elapsed();
705                    {
706                        let mut duration = line_matching_duration.lock().unwrap();
707                        *duration += line_matching_duration_value;
708                    }
709
710                    // Add the corresponding terms from the query plan
711                    for (term, &idx) in &params.query_plan.term_indices {
712                        if matched_term_indices.contains(&idx)
713                            && !params.query_plan.excluded_terms.contains(term)
714                        {
715                            matched_keywords.push(term.clone());
716                        }
717                    }
718
719                    // Remove duplicates
720                    matched_keywords.sort();
721                    matched_keywords.dedup();
722
723                    // Start measuring result creation time
724                    let result_creation_start = Instant::now();
725
726                    let result = SearchResult {
727                        file: params.path.to_string_lossy().to_string(),
728                        lines: (final_start_line, final_end_line),
729                        node_type: if is_nested_struct {
730                            block
731                                .parent_node_type
732                                .clone()
733                                .unwrap_or_else(|| block.node_type.clone())
734                        } else {
735                            block.node_type.clone()
736                        },
737                        code: full_code,
738                        matched_by_filename: None,
739                        rank: None,
740                        score: None,
741                        tfidf_score: None,
742                        bm25_score: None,
743                        tfidf_rank: None,
744                        bm25_rank: None,
745                        new_score: None,
746                        hybrid2_rank: None,
747                        combined_score_rank: None,
748                        file_unique_terms: Some(block_unique_terms),
749                        file_total_matches: Some(block_total_matches),
750                        file_match_rank: None,
751                        block_unique_terms: Some(block_unique_terms),
752                        block_total_matches: Some(block_total_matches),
753                        parent_file_id: Some(file_id.clone()),
754                        block_id: Some(block_idx),
755                        matched_keywords: if matched_keywords.is_empty() {
756                            None
757                        } else {
758                            Some(matched_keywords)
759                        },
760                        tokenized_content: Some(block_terms),
761                    };
762
763                    let result_creation_duration_value = result_creation_start.elapsed();
764                    {
765                        let mut duration = result_creation_duration.lock().unwrap();
766                        *duration += result_creation_duration_value;
767                    }
768
769                    // Start measuring synchronization time
770                    let sync_start = Instant::now();
771
772                    // Add result to shared results
773                    {
774                        let mut results = shared_results.lock().unwrap();
775                        results.push(result);
776                    }
777
778                    let sync_duration = sync_start.elapsed();
779                    {
780                        let mut duration = synchronization_duration.lock().unwrap();
781                        *duration += sync_duration;
782                    }
783
784                    // End result building time measurement
785                    let result_building_block_duration = result_building_start.elapsed();
786                    {
787                        let mut duration = result_building_duration.lock().unwrap();
788                        *duration += result_building_block_duration;
789                    }
790                }
791            });
792
        // Extract results from shared resources. By this point the parallel
        // for_each above has completed, so the worker clones of these Arcs
        // should have been dropped and try_unwrap is expected to succeed;
        // the panic guards a broken invariant, not a recoverable error.
        results = Arc::try_unwrap(shared_results)
            .unwrap_or_else(|_| panic!("Failed to unwrap Arc"))
            .into_inner()
            .unwrap();

        covered_lines = Arc::try_unwrap(shared_covered_lines)
            .unwrap_or_else(|_| panic!("Failed to unwrap Arc"))
            .into_inner()
            .unwrap();

        // Extract durations from Arc<Mutex<>> — same invariant as above for
        // each of the nine per-stage timing accumulators.
        let code_structure_duration_value = Arc::try_unwrap(code_structure_duration)
            .unwrap_or_else(|_| panic!("Failed to unwrap Arc"))
            .into_inner()
            .unwrap();

        let filtering_duration_value = Arc::try_unwrap(filtering_duration)
            .unwrap_or_else(|_| panic!("Failed to unwrap Arc"))
            .into_inner()
            .unwrap();

        let result_building_duration_value = Arc::try_unwrap(result_building_duration)
            .unwrap_or_else(|_| panic!("Failed to unwrap Arc"))
            .into_inner()
            .unwrap();

        // Extract detailed result building timings
        let term_matching_duration_value = Arc::try_unwrap(term_matching_duration)
            .unwrap_or_else(|_| panic!("Failed to unwrap Arc"))
            .into_inner()
            .unwrap();

        let compound_processing_duration_value = Arc::try_unwrap(compound_processing_duration)
            .unwrap_or_else(|_| panic!("Failed to unwrap Arc"))
            .into_inner()
            .unwrap();

        let line_matching_duration_value = Arc::try_unwrap(line_matching_duration)
            .unwrap_or_else(|_| panic!("Failed to unwrap Arc"))
            .into_inner()
            .unwrap();

        let result_creation_duration_value = Arc::try_unwrap(result_creation_duration)
            .unwrap_or_else(|_| panic!("Failed to unwrap Arc"))
            .into_inner()
            .unwrap();

        let synchronization_duration_value = Arc::try_unwrap(synchronization_duration)
            .unwrap_or_else(|_| panic!("Failed to unwrap Arc"))
            .into_inner()
            .unwrap();

        // Store the sub-step timings on the FileProcessingTimings struct that
        // is returned to the caller at the end of this function.
        let block_extraction_duration = block_extraction_start.elapsed();
        timings.block_extraction = Some(block_extraction_duration);
        timings.block_extraction_code_structure = Some(code_structure_duration_value);
        timings.block_extraction_filtering = Some(filtering_duration_value);
        timings.block_extraction_result_building = Some(result_building_duration_value);

        // Store detailed result building timings
        timings.result_building_term_matching = Some(term_matching_duration_value);
        timings.result_building_compound_processing = Some(compound_processing_duration_value);
        timings.result_building_line_matching = Some(line_matching_duration_value);
        timings.result_building_result_creation = Some(result_creation_duration_value);
        timings.result_building_synchronization = Some(synchronization_duration_value);

        if debug_mode {
            println!("DEBUG:   Block extraction time: {block_extraction_duration:?}");
            println!("DEBUG:     - Code structure finding: {code_structure_duration_value:?}");
            println!("DEBUG:     - Filtering: {filtering_duration_value:?}");
            println!("DEBUG:     - Result building: {result_building_duration_value:?}");
        }
    }
867
    // Collect all uncovered lines first without processing them, so that
    // fallback-context handling below runs only after every AST block has
    // had a chance to cover its lines.
    let mut uncovered_lines = Vec::new();
    for &line_num in params.line_numbers {
        if !covered_lines.contains(&line_num) {
            if debug_mode {
                println!("DEBUG: Line {line_num} not covered, will use fallback context");
                // Guarded index: only print content for in-range line numbers
                // (line_num is 1-based).
                if line_num <= lines.len() {
                    println!("DEBUG:   Line content: '{}'", lines[line_num - 1].trim());
                }
            }
            uncovered_lines.push(line_num);
        }
    }
881
    // Start measuring uncovered lines processing time
    let uncovered_lines_start = Instant::now();

    // Process uncovered lines only after all AST blocks have been processed.
    // For each uncovered line, build a small fixed-size context window around
    // it and emit a fallback SearchResult if it passes the query filter.
    for line_num in uncovered_lines {
        // Skip fallback context for test files if allow_tests is false
        if !params.allow_tests && is_test_file(params.path) {
            if debug_mode {
                println!(
                    "DEBUG: Skipping fallback context for test file: {:?}",
                    params.path
                );
            }
            continue;
        }

        // Check if the line is in a test function/module using language-specific detection
        if !params.allow_tests && line_num <= lines.len() {
            // Get the language implementation for this file extension
            if let Some(language_impl) = crate::language::factory::get_language_impl(extension) {
                let line_content = lines[line_num - 1];

                // Create a simple parser to check this line. A fresh parser is
                // built per line; this is on the fallback path only.
                let mut parser = tree_sitter::Parser::new();
                if parser
                    .set_language(&language_impl.get_tree_sitter_language())
                    .is_ok()
                {
                    // Try to parse just this line to get a node
                    if let Some(tree) = parser.parse(line_content, None) {
                        let node = tree.root_node();

                        // Use the language-specific test detection
                        if language_impl.is_test_node(&node, line_content.as_bytes()) {
                            if debug_mode {
                                println!(
                                    "DEBUG: Skipping fallback context for test code: '{}'",
                                    line_content.trim()
                                );
                            }
                            continue;
                        }
                    }
                }
            }
        }

        // Use a smaller, adaptive context size (5 lines by default)
        // This reduces the chance of overshadowing more specific blocks
        let default_context_size = 5;

        // Calculate 0-based array indices for context
        // line_num is 1-based, so we subtract 1 to get 0-based index
        let line_idx = line_num - 1;
        let context_start_idx = line_idx.saturating_sub(default_context_size);
        // NOTE(review): `lines.len() - 1` underflows if the file has zero
        // lines — confirm empty files are filtered out before reaching here.
        let context_end_idx = std::cmp::min(line_idx + default_context_size, lines.len() - 1);

        // Skip if we don't have enough context
        if context_start_idx > context_end_idx {
            continue;
        }

        // Convert back to 1-based line numbers for display and tracking
        let context_start = context_start_idx + 1;
        let context_end = context_end_idx + 1;

        // Extract the context lines using 0-based indices
        let context_code = lines[context_start_idx..=context_end_idx]
            .to_vec()
            .join("\n");

        // Determine a better node type for the fallback context by analyzing the content.
        // NOTE(review): unlike the test-detection branch above, this index is
        // not guarded by `line_num <= lines.len()` — confirm params.line_numbers
        // are always in range, otherwise this can panic.
        let node_type = determine_fallback_node_type(lines[line_num - 1], Some(extension));

        if debug_mode {
            println!("DEBUG: Inferred node type for fallback context: {node_type}");
            println!(
                "DEBUG: Using adaptive context size: lines {}-{} (size: {})",
                context_start,
                context_end,
                context_end - context_start + 1
            );
        }

        // Start measuring term matching time for uncovered lines
        let term_matching_start = Instant::now();

        // Early tokenization for fallback context
        let context_terms =
            ranking::preprocess_text_with_filename(&context_code, &params.path.to_string_lossy());

        // Add to term matching time. Unlike the parallel block path above,
        // timings here are accumulated directly on `timings` (single-threaded).
        let term_matching_duration_value = term_matching_start.elapsed();
        if let Some(duration) = timings.result_building_term_matching {
            timings.result_building_term_matching = Some(duration + term_matching_duration_value);
        } else {
            timings.result_building_term_matching = Some(term_matching_duration_value);
        }

        // Start measuring filtering time for uncovered lines
        let filtering_start = Instant::now();

        // Early filtering for fallback context
        let should_include = {
            if debug_mode {
                println!(
                    "DEBUG: Using filter_tokenized_block for fallback context {context_start}-{context_end}"
                );
            }

            // Skip tokenization and evaluation when exact flag is enabled
            if params.query_plan.exact {
                // In exact mode, we already matched the lines in the file
                // so we should include this block without re-evaluating
                if debug_mode {
                    println!(
                        "DEBUG: Exact mode enabled, skipping tokenization and evaluation for fallback context {context_start}-{context_end}"
                    );
                }
                true
            } else {
                filter_tokenized_block(
                    &context_terms,
                    &params.query_plan.term_indices,
                    params.query_plan,
                    debug_mode,
                )
            }
        };

        // We don't add this to any timing since filtering is not part of result building
        let _filtering_duration = filtering_start.elapsed();

        if debug_mode {
            println!(
                "DEBUG: Block at {context_start}-{context_end} filtered: included={should_include}"
            );
        }

        // Only mark these lines as covered if we're including the result
        // This allows for potentially better blocks to be found for these lines later
        if should_include {
            for line in context_start..=context_end {
                covered_lines.insert(line);
            }
        }

        // Add to results only if it passes the filter
        if should_include {
            // Start measuring compound word processing time
            let compound_start = Instant::now();

            // Calculate metrics for fallback context using the already tokenized content
            let direct_matches: HashSet<&String> = context_terms
                .iter()
                .filter(|t| unique_query_terms.contains(*t))
                .collect();

            // A query term counts as a compound match when it is absent from
            // the context verbatim but all of its compound-split parts appear.
            let mut compound_matches = HashSet::new();
            // Load vocabulary once before the loop
            let vocabulary = tokenization::load_vocabulary();
            for qterm in &unique_query_terms {
                if context_terms.iter().any(|bt| bt == qterm) {
                    continue;
                }
                let parts = tokenization::split_compound_word(qterm, vocabulary);
                if parts.len() > 1 && parts.iter().all(|part| context_terms.contains(part)) {
                    compound_matches.insert(qterm);
                }
            }

            // Add to compound processing time
            let compound_duration = compound_start.elapsed();
            if let Some(duration) = timings.result_building_compound_processing {
                timings.result_building_compound_processing = Some(duration + compound_duration);
            } else {
                timings.result_building_compound_processing = Some(compound_duration);
            }

            // NOTE(review): unique and total counts are the identical
            // expression here — confirm total_matches should not also count
            // repeated occurrences.
            let context_unique_terms = direct_matches.len() + compound_matches.len();
            let context_total_matches = direct_matches.len() + compound_matches.len();

            // Collect matched keywords for fallback context
            let mut matched_keywords = Vec::new();

            // Add direct matches
            matched_keywords.extend(direct_matches.iter().map(|s| (*s).clone()));

            // Add compound matches
            matched_keywords.extend(compound_matches.iter().map(|s| (*s).clone()));

            // Start measuring line matching time
            let line_matching_start = Instant::now();

            // Get the matched term indices for this context block: a term
            // matches if any of its hit lines falls inside the context window.
            let mut matched_term_indices = HashSet::new();
            for (&term_idx, lines) in params.term_matches {
                if lines
                    .iter()
                    .any(|&l| l >= context_start && l <= context_end)
                {
                    matched_term_indices.insert(term_idx);
                }
            }

            // Add to line matching time
            let line_matching_duration = line_matching_start.elapsed();
            if let Some(duration) = timings.result_building_line_matching {
                timings.result_building_line_matching = Some(duration + line_matching_duration);
            } else {
                timings.result_building_line_matching = Some(line_matching_duration);
            }

            // Add the corresponding terms from the query plan, skipping
            // explicitly excluded terms.
            for (term, &idx) in &params.query_plan.term_indices {
                if matched_term_indices.contains(&idx)
                    && !params.query_plan.excluded_terms.contains(term)
                {
                    matched_keywords.push(term.clone());
                }
            }

            // Remove duplicates
            matched_keywords.sort();
            matched_keywords.dedup();

            // Start measuring result creation time
            let result_creation_start = Instant::now();

            // Fallback results have no parent_file_id/block_id since they do
            // not correspond to an AST block; rank/score fields are filled in
            // by a later ranking phase.
            let result = SearchResult {
                file: params.path.to_string_lossy().to_string(),
                lines: (context_start, context_end),
                node_type,
                code: context_code,
                matched_by_filename: None,
                rank: None,
                score: None,
                tfidf_score: None,
                bm25_score: None,
                tfidf_rank: None,
                bm25_rank: None,
                new_score: None,
                hybrid2_rank: None,
                combined_score_rank: None,
                file_unique_terms: Some(context_unique_terms),
                file_total_matches: Some(context_total_matches),
                file_match_rank: None,
                block_unique_terms: Some(context_unique_terms),
                block_total_matches: Some(context_total_matches),
                parent_file_id: None,
                block_id: None,
                matched_keywords: if matched_keywords.is_empty() {
                    None
                } else {
                    Some(matched_keywords)
                },
                tokenized_content: Some(context_terms),
            };

            // Add to result creation time
            let result_creation_duration = result_creation_start.elapsed();
            if let Some(duration) = timings.result_building_result_creation {
                timings.result_building_result_creation = Some(duration + result_creation_duration);
            } else {
                timings.result_building_result_creation = Some(result_creation_duration);
            }

            // Start measuring synchronization time (in this case, just adding to results)
            let sync_start = Instant::now();

            results.push(result);

            // Add to synchronization time
            let sync_duration = sync_start.elapsed();
            if let Some(duration) = timings.result_building_synchronization {
                timings.result_building_synchronization = Some(duration + sync_duration);
            } else {
                timings.result_building_synchronization = Some(sync_duration);
            }
        }
    }
1163
    // End uncovered lines processing time measurement
    let uncovered_lines_duration = uncovered_lines_start.elapsed();
    timings.result_building_uncovered_lines = Some(uncovered_lines_duration);

    // Dump the per-stage timings collected above; each field is Optional
    // because a stage may not have run for this file.
    if debug_mode {
        println!("DEBUG: File processing timings:");
        if let Some(duration) = timings.file_io {
            println!("DEBUG:   File I/O: {duration:?}");
        }
        if let Some(duration) = timings.ast_parsing {
            println!("DEBUG:   AST parsing: {duration:?}");
            if let Some(d) = timings.ast_parsing_language_init {
                println!("DEBUG:     - Language init: {d:?}");
            }
            if let Some(d) = timings.ast_parsing_parser_init {
                println!("DEBUG:     - Parser init: {d:?}");
            }
            if let Some(d) = timings.ast_parsing_tree_parsing {
                println!("DEBUG:     - Tree parsing: {d:?}");
            }
            if let Some(d) = timings.ast_parsing_line_map_building {
                println!("DEBUG:     - Line map building: {d:?}");
            }
        }
        if let Some(duration) = timings.block_extraction {
            println!("DEBUG:   Block extraction: {duration:?}");
            if let Some(d) = timings.block_extraction_code_structure {
                println!("DEBUG:     - Code structure finding: {d:?}");
            }
            if let Some(d) = timings.block_extraction_filtering {
                println!("DEBUG:     - Filtering: {d:?}");
            }
            if let Some(d) = timings.block_extraction_result_building {
                println!("DEBUG:     - Result building: {d:?}");
            }
        }
    }

    if debug_mode {
        println!("DEBUG: Detailed result building timings:");
        if let Some(duration) = timings.result_building_term_matching {
            println!("DEBUG:   Term matching: {duration:?}");
        }
        if let Some(duration) = timings.result_building_compound_processing {
            println!("DEBUG:   Compound word processing: {duration:?}");
        }
        if let Some(duration) = timings.result_building_line_matching {
            println!("DEBUG:   Line range matching: {duration:?}");
        }
        if let Some(duration) = timings.result_building_result_creation {
            println!("DEBUG:   Result creation: {duration:?}");
        }
        if let Some(duration) = timings.result_building_synchronization {
            println!("DEBUG:   Synchronization: {duration:?}");
        }
        if let Some(duration) = timings.result_building_uncovered_lines {
            println!("DEBUG:   Uncovered lines processing: {duration:?}");
        }
    }

    // Return the accumulated search results together with the timing report.
    Ok((results, timings))
}