use probe_code::models::SearchResult;
use std::collections::HashMap;
use std::fs::File;
use std::io::{BufRead, BufReader};
use std::path::{Path, PathBuf};

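/// Merges ranked search result blocks that come from the same file and are
/// close together (overlapping or within `threshold` lines, default 5).
/// Merged blocks combine their line ranges, code, scores, and term statistics.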
pub fn merge_ranked_blocks(
    results: Vec<SearchResult>,
    threshold: Option<usize>,
) -> Vec<SearchResult> {
    let debug_mode = std::env::var("DEBUG").unwrap_or_default() == "1";
    let threshold = threshold.unwrap_or(5);

    if results.is_empty() {
        return results;
    }

    if debug_mode {
        println!(
            "DEBUG: Starting post-rank merging of {} results with threshold {}",
            results.len(),
            threshold
        );
    }

    let original_count = results.len();

    // Group results by file so only blocks from the same file can be merged.
    let mut file_blocks: HashMap<String, Vec<SearchResult>> = HashMap::new();

    for result in results {
        file_blocks
            .entry(result.file.clone())
            .or_default()
            .push(result);
    }

    let mut merged_results = Vec::new();

    for (file_path, mut blocks) in file_blocks {
        if debug_mode {
            println!(
                "DEBUG: Processing {} blocks from file: {}",
                blocks.len(),
                file_path
            );
        }

        if blocks.len() == 1 {
            merged_results.push(blocks.remove(0));
            continue;
        }

        // Sort blocks by their starting line number.
        blocks.sort_by_key(|block| block.lines.0);

        let mut processed_indices = std::collections::HashSet::new();
        let mut merged_blocks = Vec::new();

        for i in 0..blocks.len() {
            if processed_indices.contains(&i) {
                continue;
            }

            let mut current_block = blocks[i].clone();
            processed_indices.insert(i);

            let mut merged_indices = vec![i];
            let mut changed = true;

            // Repeatedly scan for blocks that can be folded into the current one
            // until no further merges are possible (handles chains of nearby blocks).
            while changed {
                changed = false;

                for (j, next_block) in blocks.iter().enumerate() {
                    if processed_indices.contains(&j) {
                        continue;
                    }

                    if should_merge_blocks(&current_block, next_block, threshold) {
                        if debug_mode {
                            println!(
                                "DEBUG: Merging blocks - current: {}-{}, next: {}-{}",
                                current_block.lines.0,
                                current_block.lines.1,
                                next_block.lines.0,
                                next_block.lines.1
                            );
                        }

                        let merged_start = current_block.lines.0.min(next_block.lines.0);
                        let merged_end = current_block.lines.1.max(next_block.lines.1);
                        let merged_code = merge_block_content(&current_block, next_block);

                        // Keep the node type of the better-ranked block.
                        let merged_node_type = if current_block.rank.unwrap_or(usize::MAX)
                            <= next_block.rank.unwrap_or(usize::MAX)
                        {
                            current_block.node_type.clone()
                        } else {
                            next_block.node_type.clone()
                        };

                        let merged_score = merge_scores(&current_block, next_block);
                        let merged_term_stats = merge_term_statistics(&current_block, next_block);

                        current_block.lines = (merged_start, merged_end);
                        current_block.code = merged_code;
                        current_block.node_type = merged_node_type;
                        current_block.score = merged_score.0;
                        current_block.tfidf_score = merged_score.1;
                        current_block.bm25_score = merged_score.2;
                        current_block.new_score = merged_score.3;
                        current_block.block_unique_terms = merged_term_stats.0;
                        current_block.block_total_matches = merged_term_stats.1;

                        processed_indices.insert(j);
                        merged_indices.push(j);
                        changed = true;
                    }
                }
            }

            merged_blocks.push(current_block);
        }

        merged_results.extend(merged_blocks);
    }

    if debug_mode {
        println!(
            "DEBUG: Post-rank merging complete. Merged {} blocks into {} blocks",
            original_count,
            merged_results.len()
        );
    }

    merged_results
}

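/// Decides whether two blocks should be merged: they must belong to the same
/// file, and either overlap, lie within `threshold` lines of each other, or be
/// a comment/function pair within twice the threshold.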
pub fn should_merge_blocks(block1: &SearchResult, block2: &SearchResult, threshold: usize) -> bool {
    let debug_mode = std::env::var("DEBUG").unwrap_or_default() == "1";

    // Blocks from different files are never merged.
    if let (Some(file_id1), Some(file_id2)) = (&block1.parent_file_id, &block2.parent_file_id) {
        if file_id1 != file_id2 {
            if debug_mode {
                println!("DEBUG: Blocks not merged - different parent file IDs");
            }
            return false;
        }
    } else if block1.file != block2.file {
        if debug_mode {
            println!("DEBUG: Blocks not merged - different files");
        }
        return false;
    }

    let (start1, end1) = block1.lines;
    let (start2, end2) = block2.lines;

    let overlapping = start1 <= end2 && start2 <= end1;

    // Number of lines separating the two blocks (0 if they overlap or are adjacent).
    let distance = if overlapping {
        0
    } else if start2 > end1 {
        start2 - end1 - 1
    } else {
        start1 - end2 - 1
    };

    // A comment block next to a function-like block gets a more generous threshold.
    let comment_with_function = (block1.node_type.contains("comment")
        && is_function_like(&block2.node_type))
        || (block2.node_type.contains("comment") && is_function_like(&block1.node_type));

    let should_merge = overlapping
        || distance <= threshold
        || (comment_with_function && distance <= threshold * 2);

    if debug_mode {
        println!("DEBUG: Considering merging blocks - Block1: type='{}' lines {}-{}, Block2: type='{}' lines {}-{}, threshold: {}",
            block1.node_type, start1, end1, block2.node_type, start2, end2, threshold);
        println!(
            "DEBUG: Should merge: {should_merge} (distance: {distance}, threshold: {threshold})"
        );
    }

    should_merge
}

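/// Heuristic check for whether a node type string describes a function-like
/// construct ("function", "method", "fn", or "func").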
fn is_function_like(node_type: &str) -> bool {
    node_type.contains("function")
        || node_type.contains("method")
        || node_type.contains("fn")
        || node_type.contains("func")
}

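/// Builds the merged code for two overlapping or nearby blocks. Lines from both
/// blocks are mapped to their absolute positions; small gaps (under 10 lines) are
/// filled by re-reading the source file, larger gaps are replaced with a placeholder.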
fn merge_block_content(block1: &SearchResult, block2: &SearchResult) -> String {
    let (start1, end1) = block1.lines;
    let (start2, end2) = block2.lines;

    let merged_start = start1.min(start2);
    let merged_end = end1.max(end2);

    // If one block already spans the whole merged range, reuse its code as-is.
    if start1 == merged_start && end1 == merged_end {
        return block1.code.clone();
    }

    if start2 == merged_start && end2 == merged_end {
        return block2.code.clone();
    }

    let lines1: Vec<&str> = block1.code.lines().collect();
    let lines2: Vec<&str> = block2.code.lines().collect();

    // Map absolute line numbers to content, preferring block1 where the blocks overlap.
    let mut line_map: HashMap<usize, String> = HashMap::new();

    for (i, line) in lines1.iter().enumerate() {
        let abs_pos = start1 + i;
        line_map.insert(abs_pos, line.to_string());
    }

    for (i, line) in lines2.iter().enumerate() {
        let abs_pos = start2 + i;
        line_map.entry(abs_pos).or_insert_with(|| line.to_string());
    }

    let mut merged_lines = Vec::new();
    let mut current_line = merged_start;
    let debug_mode = std::env::var("DEBUG").unwrap_or_default() == "1";

    let file_path = Path::new(&block1.file);
    let file_result = File::open(file_path);
    let file_content_available = file_result.is_ok();
    let _reader = file_result.map(BufReader::new).ok();

    if debug_mode {
        println!(
            "DEBUG: Current working directory: {:?}",
            std::env::current_dir().unwrap_or_else(|_| PathBuf::from("unknown"))
        );
        println!(
            "DEBUG: Attempting to read file: {:?}",
            file_path
                .canonicalize()
                .unwrap_or_else(|_| PathBuf::from(file_path))
        );
        println!("DEBUG: File exists: {}", file_path.exists());
        println!("DEBUG: File can be opened: {file_content_available}");
    }

    while current_line <= merged_end {
        if let Some(line_content) = line_map.get(&current_line) {
            merged_lines.push(line_content.clone());
            current_line += 1;
        } else {
            // Found a gap between the two blocks: determine how far it extends.
            let gap_start = current_line;
            let mut gap_end = current_line;

            while gap_end < merged_end && !line_map.contains_key(&(gap_end + 1)) {
                gap_end += 1;
            }

            let gap_size = gap_end - gap_start + 1;

            if gap_size < 10 {
                if file_content_available {
                    if debug_mode {
                        println!(
                            "DEBUG: Attempting to fill small gap from line {} to {} from file {}",
                            gap_start, gap_end, block1.file
                        );
                    }

                    let file_result = File::open(Path::new(&block1.file));

                    if let Ok(file) = file_result {
                        let reader = BufReader::new(file);

                        if debug_mode {
                            println!("DEBUG: Created fresh file reader for gap");
                        }

                        let mut lines_read = Vec::new();
                        let mut current_line_in_file = 1;

                        for line_content in reader.lines().map_while(Result::ok) {
                            if current_line_in_file >= gap_start && current_line_in_file <= gap_end
                            {
                                lines_read.push(line_content);
                            }

                            current_line_in_file += 1;

                            if current_line_in_file > gap_end {
                                break;
                            }
                        }

                        if !lines_read.is_empty() {
                            if debug_mode {
                                println!(
                                    "DEBUG: Successfully read {} lines for gap",
                                    lines_read.len()
                                );
                            }
                            merged_lines.extend(lines_read);
                            current_line = gap_end + 1;
                            continue;
                        } else if debug_mode {
                            println!("DEBUG: No lines were read for the gap (empty lines)");
                        }
                    } else if debug_mode {
                        println!("DEBUG: Could not create fresh file reader");
                    }
                } else if debug_mode {
                    println!("DEBUG: File content not available for {}", block1.file);
                }

                merged_lines.push(format!(
                    "... lines {gap_start}-{gap_end} should be included ..."
                ));
            } else {
                merged_lines.push(format!("... lines {gap_start}-{gap_end} skipped..."));
            }

            current_line = gap_end + 1;
        }
    }

    merged_lines.join("\n")
}

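/// Combines the optional scores of two blocks by taking the maximum of each pair.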
fn merge_scores(
    block1: &SearchResult,
    block2: &SearchResult,
) -> (Option<f64>, Option<f64>, Option<f64>, Option<f64>) {
    let score = match (block1.score, block2.score) {
        (Some(s1), Some(s2)) => Some(s1.max(s2)),
        (Some(s), None) | (None, Some(s)) => Some(s),
        _ => None,
    };

    let tfidf_score = match (block1.tfidf_score, block2.tfidf_score) {
        (Some(s1), Some(s2)) => Some(s1.max(s2)),
        (Some(s), None) | (None, Some(s)) => Some(s),
        _ => None,
    };

    let bm25_score = match (block1.bm25_score, block2.bm25_score) {
        (Some(s1), Some(s2)) => Some(s1.max(s2)),
        (Some(s), None) | (None, Some(s)) => Some(s),
        _ => None,
    };

    let new_score = match (block1.new_score, block2.new_score) {
        (Some(s1), Some(s2)) => Some(s1.max(s2)),
        (Some(s), None) | (None, Some(s)) => Some(s),
        _ => None,
    };

    (score, tfidf_score, bm25_score, new_score)
}

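/// Combines per-block term statistics: unique term counts take the maximum,
/// total match counts are summed.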
fn merge_term_statistics(
    block1: &SearchResult,
    block2: &SearchResult,
) -> (Option<usize>, Option<usize>) {
    let unique_terms = match (block1.block_unique_terms, block2.block_unique_terms) {
        (Some(t1), Some(t2)) => Some(t1.max(t2)),
        (Some(t), None) | (None, Some(t)) => Some(t),
        _ => None,
    };

    let total_matches = match (block1.block_total_matches, block2.block_total_matches) {
        (Some(t1), Some(t2)) => Some(t1 + t2),
        (Some(t), None) | (None, Some(t)) => Some(t),
        _ => None,
    };

    (unique_terms, total_matches)
}