use anyhow::{Context, Result};
use rayon::prelude::*;
use std::collections::{HashMap, HashSet};
use std::fs;
use std::path::Path;
use std::sync::{Arc, Mutex};
use std::time::{Duration, Instant};
use tree_sitter;

use probe_code::language::{is_test_file, parse_file_for_code_blocks};
use probe_code::models::SearchResult;
use probe_code::ranking;
use probe_code::search::tokenization;

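/// Fine-grained timings for a single file's processing run, grouped by phase:
/// file I/O, AST parsing, block extraction, and result building. Every field
/// starts as `None` and is filled in once the corresponding phase has run.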
pub struct FileProcessingTimings {
    // File I/O phase
    pub file_io: Option<Duration>,

    // AST parsing phase
    pub ast_parsing: Option<Duration>,
    pub ast_parsing_language_init: Option<Duration>,
    pub ast_parsing_parser_init: Option<Duration>,
    pub ast_parsing_tree_parsing: Option<Duration>,
    pub ast_parsing_line_map_building: Option<Duration>,

    // Block extraction phase
    pub block_extraction: Option<Duration>,
    pub block_extraction_code_structure: Option<Duration>,
    pub block_extraction_filtering: Option<Duration>,
    pub block_extraction_result_building: Option<Duration>,

    // Result building phase
    pub result_building_term_matching: Option<Duration>,
    pub result_building_compound_processing: Option<Duration>,
    pub result_building_line_matching: Option<Duration>,
    pub result_building_result_creation: Option<Duration>,
    pub result_building_synchronization: Option<Duration>,
    pub result_building_uncovered_lines: Option<Duration>,
}

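/// Everything needed to process one file: its path, the matched line numbers,
/// per-term line matches, the tokenized query terms, and the query plan used
/// to filter candidate blocks.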
pub struct FileProcessingParams<'a> {
    pub path: &'a Path,
    pub line_numbers: &'a HashSet<usize>,
    pub allow_tests: bool,
    pub term_matches: &'a HashMap<usize, HashSet<usize>>,
    #[allow(dead_code)]
    pub num_queries: usize,
    #[allow(dead_code)]
    pub filename_matched_queries: HashSet<usize>,
    pub queries_terms: &'a [Vec<(String, String)>],
    pub preprocessed_queries: Option<&'a [Vec<String>]>,
    pub query_plan: &'a crate::search::query::QueryPlan,

    #[allow(dead_code)]
    pub no_merge: bool,
}

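/// Returns `true` if the code block spanning `block_lines` (1-based, inclusive)
/// satisfies the query: it gathers the indices of terms that matched a line
/// inside the block and evaluates the query plan's AST against that set.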
#[allow(dead_code)]
pub fn filter_code_block_with_ast(
    block_lines: (usize, usize),
    term_matches: &HashMap<usize, HashSet<usize>>,
    plan: &crate::search::query::QueryPlan,
    debug_mode: bool,
) -> bool {
    // Collect the indices of all terms that matched at least one line inside
    // the block's (inclusive) line range.
    let mut matched_terms = HashSet::new();
    for (&term_idx, lines) in term_matches {
        if lines
            .iter()
            .any(|&l| l >= block_lines.0 && l <= block_lines.1)
        {
            matched_terms.insert(term_idx);
        }
    }

    if debug_mode {
        println!(
            "DEBUG: Checking for terms in block {}-{}",
            block_lines.0, block_lines.1
        );
        println!("DEBUG: Matched terms: {matched_terms:?}");
        println!("DEBUG: Term indices: {:?}", plan.term_indices);
        println!("DEBUG: Excluded terms: {:?}", plan.excluded_terms);
        println!("DEBUG: AST: {:?}", plan.ast);

        println!("DEBUG: ===== MATCHED KEYWORDS DETAILS =====");
        let mut matched_keywords = Vec::new();
        for (term, &idx) in &plan.term_indices {
            if matched_terms.contains(&idx) {
                matched_keywords.push(term);
                println!(
                    "DEBUG: Keyword '{}' matched in block {}-{}",
                    term, block_lines.0, block_lines.1
                );
            }
        }
        if matched_keywords.is_empty() {
            println!("DEBUG: No keywords matched in this block");
        } else {
            println!("DEBUG: All matched keywords: {matched_keywords:?}");
        }
        println!("DEBUG: ===================================");
    }

    // A block with no matched terms can never satisfy the query.
    if matched_terms.is_empty() {
        if debug_mode {
            println!(
                "DEBUG: No matched terms in block {}-{}, returning false",
                block_lines.0, block_lines.1
            );
        }
        return false;
    }

    if debug_mode {
        println!("DEBUG: ===== AST EVALUATION =====");
        println!("DEBUG: Matched terms: {matched_terms:?}");
        println!("DEBUG: Term indices: {:?}", plan.term_indices);
    }

    // Evaluate the query AST against the set of matched term indices.
    let decision = plan.ast.evaluate(&matched_terms, &plan.term_indices, false);

    if debug_mode {
        println!("DEBUG: ===== EVALUATION RESULT =====");
        println!(
            "DEBUG: Block {}-{} will be {}",
            block_lines.0,
            block_lines.1,
            if decision { "INCLUDED" } else { "EXCLUDED" }
        );
        println!(
            "DEBUG: filter_code_block_with_ast => lines {block_lines:?} => matched {matched_terms:?}, decision={decision}"
        );
        println!("DEBUG: ============================");
    }

    decision
}

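/// Tokenized-content variant of the block filter: maps each token of a block
/// to a query term index (including special-case terms that survive
/// tokenization intact), then evaluates the query plan's AST against the
/// matched set.
///
/// A minimal usage sketch (assuming a `QueryPlan` already built by the
/// crate's query parser; the token values are illustrative only):
///
/// ```ignore
/// let tokens = vec!["parse".to_string(), "file".to_string()];
/// let keep = filter_tokenized_block(&tokens, &plan.term_indices, &plan, false);
/// ```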
pub fn filter_tokenized_block(
    tokenized_content: &[String],
    term_indices: &HashMap<String, usize>,
    plan: &crate::search::query::QueryPlan,
    debug_mode: bool,
) -> bool {
    // Map each token in the block to its query term index, if any.
    let mut matched_terms = HashSet::new();
    for token in tokenized_content {
        if let Some(&idx) = term_indices.get(token) {
            matched_terms.insert(idx);
        }
    }

    // Special-case terms are matched against their raw lowercase form, since
    // the tokenizer does not split them.
    for (term, &idx) in &plan.term_indices {
        if matched_terms.contains(&idx) {
            continue;
        }

        if crate::search::tokenization::is_special_case(term)
            && tokenized_content.contains(&term.to_lowercase())
        {
            matched_terms.insert(idx);
            if debug_mode {
                println!("DEBUG: Special case term '{term}' matched in tokenized content");
            }
        }
    }

    if debug_mode {
        println!("DEBUG: Checking for terms in tokenized block");
        println!("DEBUG: Tokenized content: {tokenized_content:?}");
        println!("DEBUG: Matched terms: {matched_terms:?}");
        println!("DEBUG: Term indices: {:?}", plan.term_indices);
        println!("DEBUG: Excluded terms: {:?}", plan.excluded_terms);
        println!("DEBUG: AST: {:?}", plan.ast);

        println!("DEBUG: ===== MATCHED KEYWORDS DETAILS =====");
        let mut matched_keywords = Vec::new();
        for (term, &idx) in &plan.term_indices {
            if matched_terms.contains(&idx) {
                matched_keywords.push(term);
                println!("DEBUG: Keyword '{term}' matched in tokenized block");
            }
        }
        if matched_keywords.is_empty() {
            println!("DEBUG: No keywords matched in this block");
        } else {
            println!("DEBUG: All matched keywords: {matched_keywords:?}");
        }
        println!("DEBUG: ===================================");
    }

    // A block with no matched terms can never satisfy the query.
    if matched_terms.is_empty() {
        if debug_mode {
            println!("DEBUG: No matched terms in tokenized block, returning false");
        }
        return false;
    }

    if debug_mode {
        println!("DEBUG: ===== AST EVALUATION =====");
        println!("DEBUG: Matched terms: {matched_terms:?}");
        println!("DEBUG: Term indices: {:?}", plan.term_indices);
    }

    // Evaluate the query AST against the matched term indices.
    let decision = plan.ast.evaluate(&matched_terms, &plan.term_indices, false);

    if debug_mode {
        println!("DEBUG: ===== EVALUATION RESULT =====");
        println!(
            "DEBUG: Block will be {}",
            if decision { "INCLUDED" } else { "EXCLUDED" }
        );
        println!("DEBUG: filter_tokenized_block => matched {matched_terms:?}, decision={decision}");
        println!("DEBUG: ============================");
    }

    decision
}

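/// Best-effort classification of a single line for fallback context blocks,
/// used when no AST node covers a matched line. Relies on lightweight keyword
/// and prefix heuristics keyed to the file extension, returning one of
/// "comment", "function", "class", "import", "variable_declaration",
/// "control_flow", or the catch-all "code".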
fn determine_fallback_node_type(line: &str, extension: Option<&str>) -> String {
    let trimmed = line.trim();

    // Comment markers: C-style, Python/Ruby '#', and Python docstrings.
    if trimmed.starts_with("//")
        || trimmed.starts_with("/*")
        || trimmed.starts_with("*")
        || (trimmed.starts_with("#") && extension.is_some_and(|ext| ext == "py" || ext == "rb"))
        || trimmed.starts_with("'''")
        || trimmed.starts_with("\"\"\"")
    {
        return "comment".to_string();
    }

    let lowercase = trimmed.to_lowercase();

    // Function definitions, keyed to each language's keyword.
    if (trimmed.contains("fn ")
        && (trimmed.contains("(") || trimmed.contains(")"))
        && extension == Some("rs"))
        || (trimmed.contains("func ") && extension == Some("go"))
        || (trimmed.contains("function ")
            && extension.is_some_and(|ext| ext == "js" || ext == "ts"))
        || (lowercase.contains("def ") && extension == Some("py"))
        || (trimmed.contains("public")
            && trimmed.contains("void")
            && extension.is_some_and(|ext| ext == "java" || ext == "kt"))
    {
        return "function".to_string();
    }

    // Type definitions: classes, interfaces, structs, and enums.
    if (trimmed.contains("class ") || trimmed.contains("interface "))
        || (trimmed.contains("struct ")
            && extension
                .is_some_and(|ext| ext == "rs" || ext == "go" || ext == "c" || ext == "cpp"))
        || (trimmed.contains("type ") && trimmed.contains("struct") && extension == Some("go"))
        || (trimmed.contains("enum "))
    {
        return "class".to_string();
    }

    // Imports and includes.
    if trimmed.starts_with("import ")
        || trimmed.starts_with("from ")
        || trimmed.starts_with("require ")
        || trimmed.starts_with("use ")
        || trimmed.starts_with("#include ")
    {
        return "import".to_string();
    }

    // Variable declarations and plain assignments ('==' and '=>' excluded).
    if (trimmed.starts_with("let ") || trimmed.starts_with("var ") || trimmed.starts_with("const "))
        || (trimmed.contains("=") && !trimmed.contains("==") && !trimmed.contains("=>"))
    {
        return "variable_declaration".to_string();
    }

    // Control-flow statements.
    if trimmed.starts_with("if ")
        || trimmed.starts_with("for ")
        || trimmed.starts_with("while ")
        || trimmed.starts_with("switch ")
        || trimmed.starts_with("match ")
    {
        return "control_flow".to_string();
    }

    "code".to_string()
}
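
/// Processes one file end to end: reads it, extracts AST-backed code blocks,
/// filters each block against the query plan in parallel (via rayon), and
/// falls back to a fixed-size context window around any matched line that no
/// block covers. Returns the surviving results together with per-phase timings.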
pub fn process_file_with_results(
    params: &FileProcessingParams,
) -> Result<(Vec<SearchResult>, FileProcessingTimings)> {
    let mut timings = FileProcessingTimings {
        file_io: None,

        ast_parsing: None,
        ast_parsing_language_init: None,
        ast_parsing_parser_init: None,
        ast_parsing_tree_parsing: None,
        ast_parsing_line_map_building: None,

        block_extraction: None,
        block_extraction_code_structure: None,
        block_extraction_filtering: None,
        block_extraction_result_building: None,

        result_building_term_matching: None,
        result_building_compound_processing: None,
        result_building_line_matching: None,
        result_building_result_creation: None,
        result_building_synchronization: None,
        result_building_uncovered_lines: None,
    };

    // Read the file, timing the I/O.
    let file_io_start = Instant::now();
    let content = fs::read_to_string(params.path)
        .context(format!("Failed to read file: {:?}", params.path))?;
    let file_io_duration = file_io_start.elapsed();
    timings.file_io = Some(file_io_duration);

    let extension = params
        .path
        .extension()
        .and_then(|ext| ext.to_str())
        .unwrap_or("");

    let debug_mode = std::env::var("DEBUG").unwrap_or_default() == "1";

    // Collect the file's lines, blanking out any line longer than 500
    // characters so it is ignored downstream.
    let lines: Vec<&str> = content
        .lines()
        .enumerate()
        .map(|(i, line)| {
            if line.len() > 500 {
                if debug_mode {
                    println!(
                        "DEBUG: Skipping line {} with length {} (exceeds 500 character limit)",
                        i + 1,
                        line.len()
                    );
                }
                ""
            } else {
                line
            }
        })
        .collect();
    let mut results = Vec::new();
    let mut covered_lines = HashSet::new();

    // Flatten the query's terms, preferring the preprocessed form when available.
    let query_terms: Vec<String> = if let Some(prep) = params.preprocessed_queries {
        prep.iter().flat_map(|v| v.iter().cloned()).collect()
    } else {
        params
            .queries_terms
            .iter()
            .flat_map(|pairs| pairs.iter().map(|(_, s)| s.clone()))
            .collect()
    };
    let unique_query_terms: HashSet<String> = query_terms.into_iter().collect();

    if debug_mode {
        println!("DEBUG: Processing file: {:?}", params.path);
        println!("DEBUG: matched lines: {:?}", params.line_numbers);
        println!("DEBUG: file I/O time: {file_io_duration:?}");
    }

    // AST parsing phase.
    let ast_parsing_start = Instant::now();

    let language_init_start = Instant::now();
    let language_impl = crate::language::factory::get_language_impl(extension);
    let language_init_duration = language_init_start.elapsed();
    timings.ast_parsing_language_init = Some(language_init_duration);

    let parser_init_start = Instant::now();
    let mut parser = tree_sitter::Parser::new();
    if let Some(lang_impl) = &language_impl {
        let _ = parser.set_language(&lang_impl.get_tree_sitter_language());
    }
    let parser_init_duration = parser_init_start.elapsed();
    timings.ast_parsing_parser_init = Some(parser_init_duration);

    // Parse (or fetch) the file's tree through the shared cache; the tree
    // handle itself is not needed here, only the cached side effect.
    let tree_parsing_start = Instant::now();
    let file_path = params.path.to_string_lossy();
    let mut cache_key = String::with_capacity(file_path.len() + extension.len() + 1);
    cache_key.push_str(&file_path);
    cache_key.push('_');
    cache_key.push_str(extension);

    let _ = if language_impl.is_some() {
        crate::language::tree_cache::get_or_parse_tree(&cache_key, &content, &mut parser).ok()
    } else {
        None
    };
    let tree_parsing_duration = tree_parsing_start.elapsed();
    timings.ast_parsing_tree_parsing = Some(tree_parsing_duration);

    // Extract AST-backed code blocks covering the matched lines.
    let line_map_building_start = Instant::now();
    let code_blocks_result = parse_file_for_code_blocks(
        &content,
        extension,
        params.line_numbers,
        params.allow_tests,
        Some(params.term_matches),
    );
    let line_map_building_duration = line_map_building_start.elapsed();
    timings.ast_parsing_line_map_building = Some(line_map_building_duration);

    let ast_parsing_duration = ast_parsing_start.elapsed();
    timings.ast_parsing = Some(ast_parsing_duration);

    if debug_mode {
        println!("DEBUG: AST parsing time: {ast_parsing_duration:?}");
        println!("DEBUG: - Language init: {language_init_duration:?}");
        println!("DEBUG: - Parser init: {parser_init_duration:?}");
        println!("DEBUG: - Tree parsing: {tree_parsing_duration:?}");
        println!("DEBUG: - Line map building: {line_map_building_duration:?}");
    }

    if let Ok(code_blocks) = code_blocks_result {
        if debug_mode {
            println!("DEBUG: AST parsing successful");
            println!("DEBUG: Found {} code blocks", code_blocks.len());

            for (i, block) in code_blocks.iter().enumerate() {
                println!(
                    "DEBUG: Block {}: type={}, lines={}-{}",
                    i + 1,
                    block.node_type,
                    block.start_row + 1,
                    block.end_row + 1
                );
            }
        }

        let file_id = params.path.to_string_lossy().to_string();

        let block_extraction_start = Instant::now();

        // Per-phase timing accumulators, shared across rayon worker threads.
        let _code_structure_start = Instant::now();
        let code_structure_duration = Arc::new(Mutex::new(Duration::new(0, 0)));
        let filtering_duration = Arc::new(Mutex::new(Duration::new(0, 0)));
        let result_building_duration = Arc::new(Mutex::new(Duration::new(0, 0)));

        let term_matching_duration = Arc::new(Mutex::new(Duration::new(0, 0)));
        let compound_processing_duration = Arc::new(Mutex::new(Duration::new(0, 0)));
        let line_matching_duration = Arc::new(Mutex::new(Duration::new(0, 0)));
        let result_creation_duration = Arc::new(Mutex::new(Duration::new(0, 0)));
        let synchronization_duration = Arc::new(Mutex::new(Duration::new(0, 0)));

        // Output collections, shared across worker threads.
        let shared_results = Arc::new(Mutex::new(Vec::new()));
        let shared_covered_lines = Arc::new(Mutex::new(HashSet::new()));

        // Process every code block in parallel.
        code_blocks
            .par_iter()
            .enumerate()
            .for_each(|(block_idx, block)| {
                let block_start = Instant::now();

                let start_line = block.start_row + 1;
                let end_line = block.end_row + 1;

                // For Go structs declared inside a function or method, expand
                // the block to the enclosing declaration's full line range.
                let (final_start_line, final_end_line, is_nested_struct) = if extension == "go"
                    && block.node_type == "struct_type"
                    && block
                        .parent_node_type
                        .as_ref()
                        .is_some_and(|p| p == "function_declaration" || p == "method_declaration")
                {
                    if let (Some(ps), Some(pe)) = (block.parent_start_row, block.parent_end_row) {
                        (ps + 1, pe + 1, true)
                    } else {
                        (start_line, end_line, false)
                    }
                } else {
                    (start_line, end_line, false)
                };

                let full_code = if final_start_line > 0 && final_end_line <= lines.len() {
                    lines[final_start_line - 1..final_end_line].join("\n")
                } else {
                    "".to_string()
                };

                let block_duration = block_start.elapsed();
                {
                    let mut duration = code_structure_duration.lock().unwrap();
                    *duration += block_duration;
                }

                // Tokenize the block (with filename context) for term matching.
                let term_matching_start = Instant::now();
                let block_terms = ranking::preprocess_text_with_filename(
                    &full_code,
                    &params.path.to_string_lossy(),
                );
                let term_matching_block_duration = term_matching_start.elapsed();
                {
                    let mut duration = term_matching_duration.lock().unwrap();
                    *duration += term_matching_block_duration;
                }

                // Decide whether the block satisfies the query plan. Exact
                // mode trusts the line matches and skips tokenized evaluation.
                let filtering_start = Instant::now();
                let should_include = {
                    if debug_mode {
                        println!(
                            "DEBUG: Using filter_tokenized_block for block {final_start_line}-{final_end_line}"
                        );
                    }

                    if params.query_plan.exact {
                        if debug_mode {
                            println!(
                                "DEBUG: Exact mode enabled, skipping tokenization and evaluation for block {final_start_line}-{final_end_line}"
                            );
                        }
                        true
                    } else {
                        let result = filter_tokenized_block(
                            &block_terms,
                            &params.query_plan.term_indices,
                            params.query_plan,
                            debug_mode,
                        );

                        if debug_mode {
                            println!(
                                "DEBUG: Block {final_start_line}-{final_end_line} filter result: {result}"
                            );
                        }

                        result
                    }
                };

                let filtering_block_duration = filtering_start.elapsed();
                {
                    let mut duration = filtering_duration.lock().unwrap();
                    *duration += filtering_block_duration;
                }

                if debug_mode {
                    println!(
                        "DEBUG: Block lines {final_start_line}-{final_end_line} => should_include={should_include}"
                    );
                }

                // Mark the block's lines as covered whether or not it is kept,
                // so they are not re-emitted as fallback context.
                {
                    let mut covered = shared_covered_lines.lock().unwrap();
                    for line_num in final_start_line..=final_end_line {
                        covered.insert(line_num);
                    }
                }

                if should_include {
                    let result_building_start = Instant::now();

                    // Direct matches: block tokens that are query terms.
                    let direct_matches_start = Instant::now();
                    let direct_matches: HashSet<&String> = block_terms
                        .iter()
                        .filter(|t| unique_query_terms.contains(*t))
                        .collect();
                    let direct_matches_duration = direct_matches_start.elapsed();
                    {
                        let mut duration = term_matching_duration.lock().unwrap();
                        *duration += direct_matches_duration;
                    }

                    // Compound matches: query terms whose split parts all
                    // appear among the block's tokens.
                    let compound_start = Instant::now();
                    let mut compound_matches = HashSet::new();
                    let vocabulary = tokenization::load_vocabulary();
                    for qterm in &unique_query_terms {
                        if block_terms.iter().any(|bt| bt == qterm) {
                            continue;
                        }
                        let parts = tokenization::split_compound_word(qterm, vocabulary);
                        if parts.len() > 1 && parts.iter().all(|part| block_terms.contains(part)) {
                            compound_matches.insert(qterm);
                        }
                    }
                    let compound_duration = compound_start.elapsed();
                    {
                        let mut duration = compound_processing_duration.lock().unwrap();
                        *duration += compound_duration;
                    }

                    let block_unique_terms = direct_matches.len() + compound_matches.len();
                    let block_total_matches = direct_matches.len() + compound_matches.len();

                    let mut matched_keywords = Vec::new();
                    matched_keywords.extend(direct_matches.iter().map(|s| (*s).clone()));
                    matched_keywords.extend(compound_matches.iter().map(|s| (*s).clone()));

                    // Also record terms whose pre-computed line matches fall
                    // inside this block's range.
                    let line_matching_start = Instant::now();
                    let mut matched_term_indices = HashSet::new();
                    for (&term_idx, lines) in params.term_matches {
                        if lines
                            .iter()
                            .any(|&l| l >= final_start_line && l <= final_end_line)
                        {
                            matched_term_indices.insert(term_idx);
                        }
                    }
                    let line_matching_duration_value = line_matching_start.elapsed();
                    {
                        let mut duration = line_matching_duration.lock().unwrap();
                        *duration += line_matching_duration_value;
                    }

                    for (term, &idx) in &params.query_plan.term_indices {
                        if matched_term_indices.contains(&idx)
                            && !params.query_plan.excluded_terms.contains(term)
                        {
                            matched_keywords.push(term.clone());
                        }
                    }

                    matched_keywords.sort();
                    matched_keywords.dedup();

                    let result_creation_start = Instant::now();
                    let result = SearchResult {
                        file: params.path.to_string_lossy().to_string(),
                        lines: (final_start_line, final_end_line),
                        node_type: if is_nested_struct {
                            block
                                .parent_node_type
                                .clone()
                                .unwrap_or_else(|| block.node_type.clone())
                        } else {
                            block.node_type.clone()
                        },
                        code: full_code,
                        matched_by_filename: None,
                        rank: None,
                        score: None,
                        tfidf_score: None,
                        bm25_score: None,
                        tfidf_rank: None,
                        bm25_rank: None,
                        new_score: None,
                        hybrid2_rank: None,
                        combined_score_rank: None,
                        file_unique_terms: Some(block_unique_terms),
                        file_total_matches: Some(block_total_matches),
                        file_match_rank: None,
                        block_unique_terms: Some(block_unique_terms),
                        block_total_matches: Some(block_total_matches),
                        parent_file_id: Some(file_id.clone()),
                        block_id: Some(block_idx),
                        matched_keywords: if matched_keywords.is_empty() {
                            None
                        } else {
                            Some(matched_keywords)
                        },
                        tokenized_content: Some(block_terms),
                    };
                    let result_creation_duration_value = result_creation_start.elapsed();
                    {
                        let mut duration = result_creation_duration.lock().unwrap();
                        *duration += result_creation_duration_value;
                    }

                    let sync_start = Instant::now();
                    {
                        let mut results = shared_results.lock().unwrap();
                        results.push(result);
                    }
                    let sync_duration = sync_start.elapsed();
                    {
                        let mut duration = synchronization_duration.lock().unwrap();
                        *duration += sync_duration;
                    }

                    let result_building_block_duration = result_building_start.elapsed();
                    {
                        let mut duration = result_building_duration.lock().unwrap();
                        *duration += result_building_block_duration;
                    }
                }
            });

        // All workers are done; reclaim the shared state and accumulators.
        results = Arc::try_unwrap(shared_results)
            .unwrap_or_else(|_| panic!("Failed to unwrap Arc"))
            .into_inner()
            .unwrap();

        covered_lines = Arc::try_unwrap(shared_covered_lines)
            .unwrap_or_else(|_| panic!("Failed to unwrap Arc"))
            .into_inner()
            .unwrap();

        let code_structure_duration_value = Arc::try_unwrap(code_structure_duration)
            .unwrap_or_else(|_| panic!("Failed to unwrap Arc"))
            .into_inner()
            .unwrap();

        let filtering_duration_value = Arc::try_unwrap(filtering_duration)
            .unwrap_or_else(|_| panic!("Failed to unwrap Arc"))
            .into_inner()
            .unwrap();

        let result_building_duration_value = Arc::try_unwrap(result_building_duration)
            .unwrap_or_else(|_| panic!("Failed to unwrap Arc"))
            .into_inner()
            .unwrap();

        let term_matching_duration_value = Arc::try_unwrap(term_matching_duration)
            .unwrap_or_else(|_| panic!("Failed to unwrap Arc"))
            .into_inner()
            .unwrap();

        let compound_processing_duration_value = Arc::try_unwrap(compound_processing_duration)
            .unwrap_or_else(|_| panic!("Failed to unwrap Arc"))
            .into_inner()
            .unwrap();

        let line_matching_duration_value = Arc::try_unwrap(line_matching_duration)
            .unwrap_or_else(|_| panic!("Failed to unwrap Arc"))
            .into_inner()
            .unwrap();

        let result_creation_duration_value = Arc::try_unwrap(result_creation_duration)
            .unwrap_or_else(|_| panic!("Failed to unwrap Arc"))
            .into_inner()
            .unwrap();

        let synchronization_duration_value = Arc::try_unwrap(synchronization_duration)
            .unwrap_or_else(|_| panic!("Failed to unwrap Arc"))
            .into_inner()
            .unwrap();

        let block_extraction_duration = block_extraction_start.elapsed();
        timings.block_extraction = Some(block_extraction_duration);
        timings.block_extraction_code_structure = Some(code_structure_duration_value);
        timings.block_extraction_filtering = Some(filtering_duration_value);
        timings.block_extraction_result_building = Some(result_building_duration_value);

        timings.result_building_term_matching = Some(term_matching_duration_value);
        timings.result_building_compound_processing = Some(compound_processing_duration_value);
        timings.result_building_line_matching = Some(line_matching_duration_value);
        timings.result_building_result_creation = Some(result_creation_duration_value);
        timings.result_building_synchronization = Some(synchronization_duration_value);

        if debug_mode {
            println!("DEBUG: Block extraction time: {block_extraction_duration:?}");
            println!("DEBUG: - Code structure finding: {code_structure_duration_value:?}");
            println!("DEBUG: - Filtering: {filtering_duration_value:?}");
            println!("DEBUG: - Result building: {result_building_duration_value:?}");
        }
    }

    // Collect matched lines that no extracted block covered; these get
    // fallback context windows below.
    let mut uncovered_lines = Vec::new();
    for &line_num in params.line_numbers {
        if !covered_lines.contains(&line_num) {
            if debug_mode {
                println!("DEBUG: Line {line_num} not covered, will use fallback context");
                if line_num <= lines.len() {
                    println!("DEBUG: Line content: '{}'", lines[line_num - 1].trim());
                }
            }
            uncovered_lines.push(line_num);
        }
    }

    let uncovered_lines_start = Instant::now();

    for line_num in uncovered_lines {
        // Skip fallback context entirely for test files unless tests are allowed.
        if !params.allow_tests && is_test_file(params.path) {
            if debug_mode {
                println!(
                    "DEBUG: Skipping fallback context for test file: {:?}",
                    params.path
                );
            }
            continue;
        }

        // Likewise skip lines that parse as test code on their own.
        if !params.allow_tests && line_num <= lines.len() {
            if let Some(language_impl) = crate::language::factory::get_language_impl(extension) {
                let line_content = lines[line_num - 1];

                let mut parser = tree_sitter::Parser::new();
                if parser
                    .set_language(&language_impl.get_tree_sitter_language())
                    .is_ok()
                {
                    if let Some(tree) = parser.parse(line_content, None) {
                        let node = tree.root_node();

                        if language_impl.is_test_node(&node, line_content.as_bytes()) {
                            if debug_mode {
                                println!(
                                    "DEBUG: Skipping fallback context for test code: '{}'",
                                    line_content.trim()
                                );
                            }
                            continue;
                        }
                    }
                }
            }
        }

        // Build a symmetric context window of five lines on each side,
        // clamped to the file bounds.
        let default_context_size = 5;
        let line_idx = line_num - 1;
        let context_start_idx = line_idx.saturating_sub(default_context_size);
        let context_end_idx =
            std::cmp::min(line_idx + default_context_size, lines.len().saturating_sub(1));

        if context_start_idx > context_end_idx {
            continue;
        }

        // Convert back to 1-based line numbers.
        let context_start = context_start_idx + 1;
        let context_end = context_end_idx + 1;

        let context_code = lines[context_start_idx..=context_end_idx].join("\n");

        let node_type = determine_fallback_node_type(lines[line_num - 1], Some(extension));

        if debug_mode {
            println!("DEBUG: Inferred node type for fallback context: {node_type}");
            println!(
                "DEBUG: Using adaptive context size: lines {}-{} (size: {})",
                context_start,
                context_end,
                context_end - context_start + 1
            );
        }

        // Tokenize the context window for term matching.
        let term_matching_start = Instant::now();
        let context_terms =
            ranking::preprocess_text_with_filename(&context_code, &params.path.to_string_lossy());
        let term_matching_duration_value = term_matching_start.elapsed();
        if let Some(duration) = timings.result_building_term_matching {
            timings.result_building_term_matching = Some(duration + term_matching_duration_value);
        } else {
            timings.result_building_term_matching = Some(term_matching_duration_value);
        }

        // Filter the context window against the query plan, mirroring the
        // block filtering above.
        let filtering_start = Instant::now();
        let should_include = {
            if debug_mode {
                println!(
                    "DEBUG: Using filter_tokenized_block for fallback context {context_start}-{context_end}"
                );
            }

            if params.query_plan.exact {
                if debug_mode {
                    println!(
                        "DEBUG: Exact mode enabled, skipping tokenization and evaluation for fallback context {context_start}-{context_end}"
                    );
                }
                true
            } else {
                filter_tokenized_block(
                    &context_terms,
                    &params.query_plan.term_indices,
                    params.query_plan,
                    debug_mode,
                )
            }
        };
        let _filtering_duration = filtering_start.elapsed();

        if debug_mode {
            println!(
                "DEBUG: Block at {context_start}-{context_end} filtered: included={should_include}"
            );
        }

        if should_include {
            // Mark the window's lines as covered so later uncovered lines do
            // not produce overlapping context blocks.
            for line in context_start..=context_end {
                covered_lines.insert(line);
            }

            // Direct matches: context tokens that are query terms.
            let compound_start = Instant::now();
            let direct_matches: HashSet<&String> = context_terms
                .iter()
                .filter(|t| unique_query_terms.contains(*t))
                .collect();

            // Compound matches: query terms whose split parts all appear
            // among the context's tokens.
            let mut compound_matches = HashSet::new();
            let vocabulary = tokenization::load_vocabulary();
            for qterm in &unique_query_terms {
                if context_terms.iter().any(|bt| bt == qterm) {
                    continue;
                }
                let parts = tokenization::split_compound_word(qterm, vocabulary);
                if parts.len() > 1 && parts.iter().all(|part| context_terms.contains(part)) {
                    compound_matches.insert(qterm);
                }
            }
            let compound_duration = compound_start.elapsed();
            if let Some(duration) = timings.result_building_compound_processing {
                timings.result_building_compound_processing = Some(duration + compound_duration);
            } else {
                timings.result_building_compound_processing = Some(compound_duration);
            }

            let context_unique_terms = direct_matches.len() + compound_matches.len();
            let context_total_matches = direct_matches.len() + compound_matches.len();

            let mut matched_keywords = Vec::new();
            matched_keywords.extend(direct_matches.iter().map(|s| (*s).clone()));
            matched_keywords.extend(compound_matches.iter().map(|s| (*s).clone()));

            // Also record terms whose pre-computed line matches fall inside
            // the context window.
            let line_matching_start = Instant::now();
            let mut matched_term_indices = HashSet::new();
            for (&term_idx, lines) in params.term_matches {
                if lines
                    .iter()
                    .any(|&l| l >= context_start && l <= context_end)
                {
                    matched_term_indices.insert(term_idx);
                }
            }
            let line_matching_duration = line_matching_start.elapsed();
            if let Some(duration) = timings.result_building_line_matching {
                timings.result_building_line_matching = Some(duration + line_matching_duration);
            } else {
                timings.result_building_line_matching = Some(line_matching_duration);
            }

            for (term, &idx) in &params.query_plan.term_indices {
                if matched_term_indices.contains(&idx)
                    && !params.query_plan.excluded_terms.contains(term)
                {
                    matched_keywords.push(term.clone());
                }
            }

            matched_keywords.sort();
            matched_keywords.dedup();

            let result_creation_start = Instant::now();
            let result = SearchResult {
                file: params.path.to_string_lossy().to_string(),
                lines: (context_start, context_end),
                node_type,
                code: context_code,
                matched_by_filename: None,
                rank: None,
                score: None,
                tfidf_score: None,
                bm25_score: None,
                tfidf_rank: None,
                bm25_rank: None,
                new_score: None,
                hybrid2_rank: None,
                combined_score_rank: None,
                file_unique_terms: Some(context_unique_terms),
                file_total_matches: Some(context_total_matches),
                file_match_rank: None,
                block_unique_terms: Some(context_unique_terms),
                block_total_matches: Some(context_total_matches),
                parent_file_id: None,
                block_id: None,
                matched_keywords: if matched_keywords.is_empty() {
                    None
                } else {
                    Some(matched_keywords)
                },
                tokenized_content: Some(context_terms),
            };
            let result_creation_duration = result_creation_start.elapsed();
            if let Some(duration) = timings.result_building_result_creation {
                timings.result_building_result_creation = Some(duration + result_creation_duration);
            } else {
                timings.result_building_result_creation = Some(result_creation_duration);
            }

            let sync_start = Instant::now();
            results.push(result);
            let sync_duration = sync_start.elapsed();
            if let Some(duration) = timings.result_building_synchronization {
                timings.result_building_synchronization = Some(duration + sync_duration);
            } else {
                timings.result_building_synchronization = Some(sync_duration);
            }
        }
    }

    let uncovered_lines_duration = uncovered_lines_start.elapsed();
    timings.result_building_uncovered_lines = Some(uncovered_lines_duration);

    if debug_mode {
        println!("DEBUG: File processing timings:");
        if let Some(duration) = timings.file_io {
            println!("DEBUG: File I/O: {duration:?}");
        }
        if let Some(duration) = timings.ast_parsing {
            println!("DEBUG: AST parsing: {duration:?}");
            if let Some(d) = timings.ast_parsing_language_init {
                println!("DEBUG: - Language init: {d:?}");
            }
            if let Some(d) = timings.ast_parsing_parser_init {
                println!("DEBUG: - Parser init: {d:?}");
            }
            if let Some(d) = timings.ast_parsing_tree_parsing {
                println!("DEBUG: - Tree parsing: {d:?}");
            }
            if let Some(d) = timings.ast_parsing_line_map_building {
                println!("DEBUG: - Line map building: {d:?}");
            }
        }
        if let Some(duration) = timings.block_extraction {
            println!("DEBUG: Block extraction: {duration:?}");
            if let Some(d) = timings.block_extraction_code_structure {
                println!("DEBUG: - Code structure finding: {d:?}");
            }
            if let Some(d) = timings.block_extraction_filtering {
                println!("DEBUG: - Filtering: {d:?}");
            }
            if let Some(d) = timings.block_extraction_result_building {
                println!("DEBUG: - Result building: {d:?}");
            }
        }

        println!("DEBUG: Detailed result building timings:");
        if let Some(duration) = timings.result_building_term_matching {
            println!("DEBUG: Term matching: {duration:?}");
        }
        if let Some(duration) = timings.result_building_compound_processing {
            println!("DEBUG: Compound word processing: {duration:?}");
        }
        if let Some(duration) = timings.result_building_line_matching {
            println!("DEBUG: Line range matching: {duration:?}");
        }
        if let Some(duration) = timings.result_building_result_creation {
            println!("DEBUG: Result creation: {duration:?}");
        }
        if let Some(duration) = timings.result_building_synchronization {
            println!("DEBUG: Synchronization: {duration:?}");
        }
        if let Some(duration) = timings.result_building_uncovered_lines {
            println!("DEBUG: Uncovered lines processing: {duration:?}");
        }
    }

    Ok((results, timings))
}
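
// A minimal test sketch for the fallback classifier; the inputs below are
// illustrative examples chosen for this sketch, not cases taken from the
// project's real test suite.
#[cfg(test)]
mod fallback_node_type_tests {
    use super::determine_fallback_node_type;

    #[test]
    fn classifies_common_line_shapes() {
        // Comment markers win over every other heuristic.
        assert_eq!(determine_fallback_node_type("// helper", Some("rs")), "comment");
        // Function definitions are keyed to the file extension.
        assert_eq!(determine_fallback_node_type("fn main() {", Some("rs")), "function");
        assert_eq!(determine_fallback_node_type("def parse(x):", Some("py")), "function");
        // Type definitions map to "class".
        assert_eq!(determine_fallback_node_type("class Foo:", Some("py")), "class");
        // Imports and includes.
        assert_eq!(determine_fallback_node_type("use std::fs;", Some("rs")), "import");
        // Anything unrecognized falls back to "code".
        assert_eq!(determine_fallback_node_type("}", Some("rs")), "code");
    }
}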