probe_code/search/
block_merging.rs

1use probe_code::models::SearchResult;
2use std::collections::HashMap;
3use std::fs::File;
4use std::io::{BufRead, BufReader};
5use std::path::{Path, PathBuf};
6
7/// Merges ranked search results that are adjacent or overlapping
8///
9/// This function should be called AFTER ranking and limiting to merge blocks
10/// that come from the same file and are adjacent or overlapping.
11///
12/// # Arguments
13/// * `results` - A vector of already ranked and limited SearchResult objects
14/// * `threshold` - Maximum number of lines between blocks to consider them adjacent (default: 5)
15///
16/// # Returns
17/// A new vector of SearchResult objects with adjacent blocks merged
18pub fn merge_ranked_blocks(
19    results: Vec<SearchResult>,
20    threshold: Option<usize>,
21) -> Vec<SearchResult> {
22    let debug_mode = std::env::var("DEBUG").unwrap_or_default() == "1";
23    let threshold = threshold.unwrap_or(5); // Default to 5 lines if not specified
24
25    if results.is_empty() {
26        return results;
27    }
28
29    if debug_mode {
30        println!(
31            "DEBUG: Starting post-rank merging of {} results with threshold {}",
32            results.len(),
33            threshold
34        );
35    }
36
37    // Store the original count before we move results
38    let original_count = results.len();
39
40    // Group results by file
41    let mut file_blocks: HashMap<String, Vec<SearchResult>> = HashMap::new();
42
43    for result in results {
44        file_blocks
45            .entry(result.file.clone())
46            .or_default()
47            .push(result);
48    }
49
50    let mut merged_results = Vec::new();
51
52    // Process each file's blocks
53    for (file_path, mut blocks) in file_blocks {
54        if debug_mode {
55            println!(
56                "DEBUG: Processing {} blocks from file: {}",
57                blocks.len(),
58                file_path
59            );
60        }
61
62        // If file only has one block, no need to merge
63        if blocks.len() == 1 {
64            merged_results.push(blocks.remove(0));
65            continue;
66        }
67
68        // Sort blocks by start line for merging
69        blocks.sort_by_key(|block| block.lines.0);
70
71        // Sort blocks by start line
72        blocks.sort_by_key(|block| block.lines.0);
73
74        // Keep track of blocks we've already processed
75        let mut processed_indices = std::collections::HashSet::new();
76        let mut merged_blocks = Vec::new();
77
78        // Process each block
79        for i in 0..blocks.len() {
80            if processed_indices.contains(&i) {
81                continue;
82            }
83
84            // Start with the current block
85            let mut current_block = blocks[i].clone();
86            processed_indices.insert(i);
87
88            // Keep track of which blocks we're merging in this group
89            let mut merged_indices = vec![i];
90            let mut changed = true;
91
92            // Keep trying to merge blocks until no more merges are possible
93            while changed {
94                changed = false;
95
96                // Try to merge with any remaining unprocessed block
97                for (j, next_block) in blocks.iter().enumerate() {
98                    if processed_indices.contains(&j) {
99                        continue;
100                    }
101
102                    if should_merge_blocks(&current_block, next_block, threshold) {
103                        if debug_mode {
104                            println!(
105                                "DEBUG: Merging blocks - current: {}-{}, next: {}-{}",
106                                current_block.lines.0,
107                                current_block.lines.1,
108                                next_block.lines.0,
109                                next_block.lines.1
110                            );
111                        }
112
113                        // Merge the blocks
114                        let merged_start = current_block.lines.0.min(next_block.lines.0);
115                        let merged_end = current_block.lines.1.max(next_block.lines.1);
116                        let merged_code = merge_block_content(&current_block, next_block);
117
118                        // Use node type from the highest-ranked block
119                        let merged_node_type = if current_block.rank.unwrap_or(usize::MAX)
120                            <= next_block.rank.unwrap_or(usize::MAX)
121                        {
122                            current_block.node_type.clone()
123                        } else {
124                            next_block.node_type.clone()
125                        };
126
127                        // Combine scores and term statistics
128                        let merged_score = merge_scores(&current_block, next_block);
129                        let merged_term_stats = merge_term_statistics(&current_block, next_block);
130
131                        // Update the current block
132                        current_block.lines = (merged_start, merged_end);
133                        current_block.code = merged_code;
134                        current_block.node_type = merged_node_type;
135                        current_block.score = merged_score.0;
136                        current_block.tfidf_score = merged_score.1;
137                        current_block.bm25_score = merged_score.2;
138                        current_block.new_score = merged_score.3;
139                        current_block.block_unique_terms = merged_term_stats.0;
140                        current_block.block_total_matches = merged_term_stats.1;
141
142                        // Mark this block as processed
143                        processed_indices.insert(j);
144                        merged_indices.push(j);
145                        changed = true;
146                    }
147                }
148            }
149
150            // Add the merged block to results
151            merged_blocks.push(current_block);
152        }
153
154        // Add the merged blocks to the final results
155        merged_results.extend(merged_blocks);
156    }
157
158    if debug_mode {
159        println!(
160            "DEBUG: Post-rank merging complete. Merged {} blocks into {} blocks",
161            original_count,
162            merged_results.len()
163        );
164    }
165
166    merged_results
167}
168
169/// Helper function to determine if two blocks should be merged
170///
171/// # Arguments
172/// * `block1` - First search result
173/// * `block2` - Second search result
174/// * `threshold` - Maximum number of lines between blocks to consider them adjacent
175///
176/// # Returns
177/// `true` if blocks should be merged, `false` otherwise
178pub fn should_merge_blocks(block1: &SearchResult, block2: &SearchResult, threshold: usize) -> bool {
179    let debug_mode = std::env::var("DEBUG").unwrap_or_default() == "1";
180
181    // Check if both blocks have parent_file_id, and if they match
182    if let (Some(file_id1), Some(file_id2)) = (&block1.parent_file_id, &block2.parent_file_id) {
183        if file_id1 != file_id2 {
184            if debug_mode {
185                println!("DEBUG: Blocks not merged - different parent file IDs");
186            }
187            return false;
188        }
189    } else {
190        // If blocks don't have parent_file_id, check if they're from the same file
191        if block1.file != block2.file {
192            if debug_mode {
193                println!("DEBUG: Blocks not merged - different files");
194            }
195            return false;
196        }
197    }
198
199    // Get line ranges
200    let (start1, end1) = block1.lines;
201    let (start2, end2) = block2.lines;
202
203    // Blocks should be merged if:
204    // 1. They overlap
205    // 2. Or they are within the threshold distance
206    // 3. Or one is a comment and is adjacent to a function
207
208    // Check for overlap first
209    let overlapping = start1 <= end2 && start2 <= end1;
210
211    // If not overlapping, check the gap size
212    let distance = if overlapping {
213        0
214    } else if start2 > end1 {
215        start2 - end1 - 1 // Subtract 1 because we want the gap size
216    } else {
217        start1 - end2 - 1
218    };
219
220    let comment_with_function = (block1.node_type.contains("comment")
221        && is_function_like(&block2.node_type))
222        || (block2.node_type.contains("comment") && is_function_like(&block1.node_type));
223
224    let should_merge = overlapping
225        || distance <= threshold
226        || (comment_with_function && distance <= threshold * 2);
227
228    if debug_mode {
229        println!("DEBUG: Considering merging blocks - Block1: type='{}' lines {}-{}, Block2: type='{}' lines {}-{}, threshold: {}",
230                 block1.node_type, start1, end1, block2.node_type, start2, end2, threshold);
231        println!(
232            "DEBUG: Should merge: {should_merge} (distance: {distance}, threshold: {threshold})"
233        );
234    }
235
236    should_merge
237}
238
239/// Helper function to check if a node type represents a function-like construct
240fn is_function_like(node_type: &str) -> bool {
241    node_type.contains("function")
242        || node_type.contains("method")
243        || node_type.contains("fn")
244        || node_type.contains("func")
245}
246
247/// Helper function to merge the content of two blocks
248///
249/// # Arguments
250/// * `block1` - First search result
251/// * `block2` - Second search result
252///
253/// # Returns
254/// The merged code content
255fn merge_block_content(block1: &SearchResult, block2: &SearchResult) -> String {
256    // Extract line ranges
257    let (start1, end1) = block1.lines;
258    let (start2, end2) = block2.lines;
259
260    // Calculate the merged range
261    let merged_start = start1.min(start2);
262    let merged_end = end1.max(end2);
263
264    // If the blocks are already complete, we can use simpler logic
265    if start1 == merged_start && end1 == merged_end {
266        return block1.code.clone();
267    }
268
269    if start2 == merged_start && end2 == merged_end {
270        return block2.code.clone();
271    }
272
273    // We need to extract the merged content from the file
274    // For simplicity, we'll use the content from the blocks we have
275    // This is not perfect, as we might be missing some lines in between,
276    // but it's a reasonable approximation without loading the file again
277
278    // Convert content to lines
279    let lines1: Vec<&str> = block1.code.lines().collect();
280    let lines2: Vec<&str> = block2.code.lines().collect();
281
282    // Map lines to their absolute positions in the file
283    let mut line_map: HashMap<usize, String> = HashMap::new();
284
285    for (i, line) in lines1.iter().enumerate() {
286        let abs_pos = start1 + i;
287        line_map.insert(abs_pos, line.to_string());
288    }
289
290    for (i, line) in lines2.iter().enumerate() {
291        let abs_pos = start2 + i;
292        line_map.entry(abs_pos).or_insert_with(|| line.to_string());
293    }
294
295    // Build the merged content from the line map
296    let mut merged_lines = Vec::new();
297    let mut current_line = merged_start;
298    let debug_mode = std::env::var("DEBUG").unwrap_or_default() == "1";
299
300    // Try to open the file to fill small gaps
301    let file_path = Path::new(&block1.file);
302    let file_result = File::open(file_path);
303    let file_content_available = file_result.is_ok();
304    let _reader = file_result.map(BufReader::new).ok(); // Used for debugging purposes only
305
306    if debug_mode {
307        println!(
308            "DEBUG: Current working directory: {:?}",
309            std::env::current_dir().unwrap_or_else(|_| PathBuf::from("unknown"))
310        );
311        println!(
312            "DEBUG: Attempting to read file: {:?}",
313            file_path
314                .canonicalize()
315                .unwrap_or_else(|_| PathBuf::from(file_path))
316        );
317        println!("DEBUG: File exists: {}", file_path.exists());
318        println!("DEBUG: File can be opened: {file_content_available}");
319    }
320
321    while current_line <= merged_end {
322        if let Some(line_content) = line_map.get(&current_line) {
323            merged_lines.push(line_content.clone());
324            current_line += 1;
325        } else {
326            // This is a gap in our knowledge - find the entire gap range
327            let gap_start = current_line;
328            let mut gap_end = current_line;
329
330            // Find the end of the gap
331            while gap_end < merged_end && !line_map.contains_key(&(gap_end + 1)) {
332                gap_end += 1;
333            }
334
335            let gap_size = gap_end - gap_start + 1;
336
337            // For small gaps (less than 10 lines), try to read the actual content
338            if gap_size < 10 {
339                if file_content_available {
340                    if debug_mode {
341                        println!(
342                            "DEBUG: Attempting to fill small gap from line {} to {} from file {}",
343                            gap_start, gap_end, block1.file
344                        );
345                    }
346
347                    // Read the file content directly instead of using a reader clone
348                    // which might have its position already moved forward
349                    let file_result = File::open(Path::new(&block1.file));
350
351                    if let Ok(file) = file_result {
352                        let reader = BufReader::new(file);
353
354                        if debug_mode {
355                            println!("DEBUG: Created fresh file reader for gap");
356                        }
357
358                        // Read the file line by line
359                        let mut lines_read = Vec::new();
360                        let mut current_line_in_file = 1;
361
362                        for line_content in reader.lines().map_while(Result::ok) {
363                            if current_line_in_file >= gap_start && current_line_in_file <= gap_end
364                            {
365                                lines_read.push(line_content);
366                            }
367
368                            current_line_in_file += 1;
369
370                            if current_line_in_file > gap_end {
371                                break;
372                            }
373                        }
374
375                        // Add the actual content for the gap
376                        if !lines_read.is_empty() {
377                            if debug_mode {
378                                println!(
379                                    "DEBUG: Successfully read {} lines for gap",
380                                    lines_read.len()
381                                );
382                            }
383                            merged_lines.extend(lines_read);
384                            current_line = gap_end + 1;
385                            continue;
386                        } else if debug_mode {
387                            println!("DEBUG: No lines were read for the gap (empty lines)");
388                        }
389                    } else if debug_mode {
390                        println!("DEBUG: Could not create fresh file reader");
391                    }
392                } else if debug_mode {
393                    println!("DEBUG: File content not available for {}", block1.file);
394                }
395
396                // For small gaps where we couldn't read the file or no lines were read,
397                // include a special placeholder indicating we want to include these lines
398                merged_lines.push(format!(
399                    "... lines {gap_start}-{gap_end} should be included ..."
400                ));
401            } else {
402                // Add a more informative placeholder showing how many lines were skipped for larger gaps
403                merged_lines.push(format!("... lines {gap_start}-{gap_end} skipped..."));
404            }
405
406            // Move past the gap
407            current_line = gap_end + 1;
408        }
409    }
410
411    merged_lines.join("\n")
412}
413
414/// Helper function to merge scores from two blocks
415///
416/// # Arguments
417/// * `block1` - First search result
418/// * `block2` - Second search result
419///
420/// # Returns
421/// Tuple of (score, tfidf_score, bm25_score, new_score) for the merged block
422fn merge_scores(
423    block1: &SearchResult,
424    block2: &SearchResult,
425) -> (Option<f64>, Option<f64>, Option<f64>, Option<f64>) {
426    // For each score, take the maximum of the two blocks' scores
427    let score = match (block1.score, block2.score) {
428        (Some(s1), Some(s2)) => Some(s1.max(s2)),
429        (Some(s), None) | (None, Some(s)) => Some(s),
430        _ => None,
431    };
432
433    let tfidf_score = match (block1.tfidf_score, block2.tfidf_score) {
434        (Some(s1), Some(s2)) => Some(s1.max(s2)),
435        (Some(s), None) | (None, Some(s)) => Some(s),
436        _ => None,
437    };
438
439    let bm25_score = match (block1.bm25_score, block2.bm25_score) {
440        (Some(s1), Some(s2)) => Some(s1.max(s2)),
441        (Some(s), None) | (None, Some(s)) => Some(s),
442        _ => None,
443    };
444
445    let new_score = match (block1.new_score, block2.new_score) {
446        (Some(s1), Some(s2)) => Some(s1.max(s2)),
447        (Some(s), None) | (None, Some(s)) => Some(s),
448        _ => None,
449    };
450
451    (score, tfidf_score, bm25_score, new_score)
452}
453
454/// Helper function to merge term statistics from two blocks
455///
456/// # Arguments
457/// * `block1` - First search result
458/// * `block2` - Second search result
459///
460/// # Returns
461/// Tuple of (block_unique_terms, block_total_matches) for the merged block
462fn merge_term_statistics(
463    block1: &SearchResult,
464    block2: &SearchResult,
465) -> (Option<usize>, Option<usize>) {
466    // For unique terms, we take the maximum (since merging blocks shouldn't reduce matched terms)
467    let unique_terms = match (block1.block_unique_terms, block2.block_unique_terms) {
468        (Some(t1), Some(t2)) => Some(t1.max(t2)),
469        (Some(t), None) | (None, Some(t)) => Some(t),
470        _ => None,
471    };
472
473    // For total matches, we sum them (this might overcount if same terms appear in both blocks)
474    // A more accurate approach would require re-processing the merged content
475    let total_matches = match (block1.block_total_matches, block2.block_total_matches) {
476        (Some(t1), Some(t2)) => Some(t1 + t2),
477        (Some(t), None) | (None, Some(t)) => Some(t),
478        _ => None,
479    };
480
481    (unique_terms, total_matches)
482}