probe_code/extract/
processor.rs

1//! Functions for processing files and extracting code blocks.
2//!
3//! This module provides functions for processing files and extracting code blocks
4//! based on file paths and optional line numbers.
5use anyhow::{Context, Result};
6use probe_code::extract::symbol_finder::find_symbol_in_file;
7use probe_code::language::parser::parse_file_for_code_blocks;
8use probe_code::models::SearchResult;
9use std::collections::HashSet;
10use std::fs;
11use std::path::Path;
12
13/// Process a single file and extract code blocks
14///
15/// If a line range is specified, we find all AST blocks overlapping that range,
16/// merge them into a bounding block, and return it. If no blocks are found, fallback
17/// to the literal lines. If only a single line is specified, do the same but for that line.
18/// If a symbol is specified, we delegate to `find_symbol_in_file`.
19/// If specific lines are provided, we find AST blocks for each line and merge them.
20/// If no lines or symbol are specified, return the entire file.
21///
22/// This function returns a single SearchResult that includes either the merged AST code
23/// or the literal lines as a fallback.
24pub fn process_file_for_extraction(
25    path: &Path,
26    start_line: Option<usize>,
27    end_line: Option<usize>,
28    symbol: Option<&str>,
29    allow_tests: bool,
30    context_lines: usize,
31    specific_lines: Option<&HashSet<usize>>,
32) -> Result<SearchResult> {
33    // Check if debug mode is enabled
34    let debug_mode = std::env::var("DEBUG").unwrap_or_default() == "1";
35
36    if debug_mode {
37        println!("\n[DEBUG] ===== Processing File for Extraction =====");
38        println!("[DEBUG] File path: {path:?}");
39        println!("[DEBUG] Start line: {start_line:?}");
40        println!("[DEBUG] End line: {end_line:?}");
41        println!("[DEBUG] Symbol: {symbol:?}");
42        println!("[DEBUG] Allow tests: {allow_tests}");
43        println!("[DEBUG] Context lines: {context_lines}");
44        println!("[DEBUG] Specific lines: {specific_lines:?}");
45    }
46
47    // Check if the file exists
48    if !path.exists() {
49        if debug_mode {
50            println!("[DEBUG] Error: File does not exist");
51        }
52        return Err(anyhow::anyhow!("File does not exist: {:?}", path));
53    }
54
55    // Read the file content
56    let content = fs::read_to_string(path).context(format!("Failed to read file: {path:?}"))?;
57    let lines: Vec<&str> = content.lines().collect();
58
59    if debug_mode {
60        println!("[DEBUG] File read successfully");
61        println!("[DEBUG] File size: {} bytes", content.len());
62        println!("[DEBUG] Line count: {}", lines.len());
63    }
64
65    // If we have a symbol, find it in the file
66    if let Some(symbol_name) = symbol {
67        if debug_mode {
68            println!("[DEBUG] Looking for symbol: {symbol_name}");
69        }
70        // Find the symbol in the file
71        return find_symbol_in_file(path, symbol_name, &content, allow_tests, context_lines);
72    }
73
74    // If we have a line range (start_line, end_line), gather AST blocks overlapping that range.
75    if let (Some(start), Some(end)) = (start_line, end_line) {
76        if debug_mode {
77            println!("[DEBUG] Extracting line range: {start}-{end} (with AST merging)");
78        }
79
80        // Clamp line numbers to valid ranges instead of failing
81        // Bound start to 1..lines.len()
82        let mut clamped_start = start.clamp(1, lines.len());
83
84        // Bound end to clamped_start..lines.len()
85        let mut clamped_end = end.clamp(clamped_start, lines.len());
86
87        // If the start is still larger than the total lines, we know there's literally nothing to extract
88        if clamped_start > lines.len() {
89            clamped_start = lines.len();
90        }
91
92        // If the end is zero or ends up less than the start, just clamp it to the start
93        if clamped_end < clamped_start {
94            clamped_end = clamped_start;
95        }
96
97        if debug_mode && (clamped_start != start || clamped_end != end) {
98            println!(
99                "[DEBUG] Requested lines {start}-{end} out of range; clamping to {clamped_start}-{clamped_end}"
100            );
101        }
102
103        // Use the clamped values for the rest of the function
104        let start = clamped_start;
105        let end = clamped_end;
106
107        // Parse AST for all lines in [start, end]
108        let mut needed_lines = HashSet::new();
109        for l in start..=end {
110            needed_lines.insert(l);
111        }
112
113        // If specific_lines is provided, add those lines too
114        if let Some(lines_set) = specific_lines {
115            for &line in lines_set {
116                needed_lines.insert(line);
117            }
118        }
119
120        let code_blocks_result = parse_file_for_code_blocks(
121            &content,
122            file_extension(path),
123            &needed_lines,
124            allow_tests,
125            None,
126        );
127
128        match code_blocks_result {
129            Ok(blocks) if !blocks.is_empty() => {
130                // Merge them into a bounding block
131                // i.e. from min(block.start_row) to max(block.end_row)
132                let min_start = blocks.iter().map(|b| b.start_row).min().unwrap_or(0);
133                let max_end = blocks.iter().map(|b| b.end_row).max().unwrap_or(0);
134
135                // Ensure max_end is within bounds of the file
136                let max_end = std::cmp::min(max_end, lines.len() - 1);
137
138                // Ensure min_start is not greater than max_end
139                let min_start = std::cmp::min(min_start, max_end);
140
141                // lines in the file are 0-indexed internally, so we add 1 for final display
142                let merged_start = min_start + 1;
143                let merged_end = max_end + 1;
144
145                if debug_mode {
146                    println!(
147                        "[DEBUG] Found {} overlapping AST blocks, merging into lines {}-{}",
148                        blocks.len(),
149                        merged_start,
150                        merged_end
151                    );
152                }
153
154                let merged_content = lines[min_start..=max_end].join("\n");
155
156                // Tokenize the content
157                let filename = path
158                    .file_name()
159                    .map(|f| f.to_string_lossy().to_string())
160                    .unwrap_or_default();
161                let tokenized_content =
162                    crate::ranking::preprocess_text_with_filename(&merged_content, &filename);
163
164                Ok(SearchResult {
165                    file: path.to_string_lossy().to_string(),
166                    lines: (merged_start, merged_end),
167                    node_type: "merged_ast_range".to_string(),
168                    code: merged_content,
169                    matched_by_filename: None,
170                    rank: None,
171                    score: None,
172                    tfidf_score: None,
173                    bm25_score: None,
174                    tfidf_rank: None,
175                    bm25_rank: None,
176                    new_score: None,
177                    hybrid2_rank: None,
178                    combined_score_rank: None,
179                    file_unique_terms: None,
180                    file_total_matches: None,
181                    file_match_rank: None,
182                    block_unique_terms: None,
183                    block_total_matches: None,
184                    parent_file_id: None,
185                    block_id: None,
186                    matched_keywords: None,
187                    tokenized_content: Some(tokenized_content),
188                })
189            }
190            _ => {
191                // Fallback to literal extraction of lines [start..end]
192                if debug_mode {
193                    println!(
194                        "[DEBUG] No AST blocks found for the range {start}-{end}, falling back to literal lines"
195                    );
196                }
197                let start_idx = start - 1;
198                let end_idx = end;
199                let range_content = lines[start_idx..end_idx].join("\n");
200                // Tokenize the content
201                let filename = path
202                    .file_name()
203                    .map(|f| f.to_string_lossy().to_string())
204                    .unwrap_or_default();
205                let tokenized_content =
206                    crate::ranking::preprocess_text_with_filename(&range_content, &filename);
207
208                Ok(SearchResult {
209                    file: path.to_string_lossy().to_string(),
210                    lines: (start, end),
211                    node_type: "range".to_string(),
212                    code: range_content,
213                    matched_by_filename: None,
214                    rank: None,
215                    score: None,
216                    tfidf_score: None,
217                    bm25_score: None,
218                    tfidf_rank: None,
219                    bm25_rank: None,
220                    new_score: None,
221                    hybrid2_rank: None,
222                    combined_score_rank: None,
223                    file_unique_terms: None,
224                    file_total_matches: None,
225                    file_match_rank: None,
226                    block_unique_terms: None,
227                    block_total_matches: None,
228                    parent_file_id: None,
229                    block_id: None,
230                    matched_keywords: None,
231                    tokenized_content: Some(tokenized_content),
232                })
233            }
234        }
235    }
236    // Single line extraction
237    else if let Some(line_num) = start_line {
238        if debug_mode {
239            println!("[DEBUG] Single line extraction requested: line {line_num}");
240        }
241        // Clamp line number to valid range instead of failing
242        let clamped_line_num = line_num.clamp(1, lines.len());
243
244        if debug_mode && clamped_line_num != line_num {
245            println!(
246                "[DEBUG] Requested line {line_num} out of bounds; clamping to {clamped_line_num}"
247            );
248        }
249
250        // Use the clamped value for the rest of the function
251        let line_num = clamped_line_num;
252
253        // We'll parse the AST for just this line
254        let mut needed_lines = HashSet::new();
255        needed_lines.insert(line_num);
256
257        // If specific_lines is provided, add those lines too
258        if let Some(lines_set) = specific_lines {
259            for &line in lines_set {
260                needed_lines.insert(line);
261            }
262        }
263
264        match parse_file_for_code_blocks(
265            &content,
266            file_extension(path),
267            &needed_lines,
268            allow_tests,
269            None,
270        ) {
271            Ok(blocks) if !blocks.is_empty() => {
272                // Merge them into a bounding block (in most cases it should only be one block,
273                // but let's be safe if multiple overlap)
274                let min_start = blocks.iter().map(|b| b.start_row).min().unwrap_or(0);
275                let max_end = blocks.iter().map(|b| b.end_row).max().unwrap_or(0);
276
277                // Ensure max_end is within bounds of the file
278                let max_end = std::cmp::min(max_end, lines.len() - 1);
279
280                // Ensure min_start is not greater than max_end
281                let min_start = std::cmp::min(min_start, max_end);
282
283                let merged_start = min_start + 1;
284                let merged_end = max_end + 1;
285
286                if debug_mode {
287                    println!(
288                        "[DEBUG] Found {} AST blocks for line {}, merging into lines {}-{}",
289                        blocks.len(),
290                        line_num,
291                        merged_start,
292                        merged_end
293                    );
294                }
295                let merged_content = lines[min_start..=max_end].join("\n");
296
297                // Tokenize the content
298                let filename = path
299                    .file_name()
300                    .map(|f| f.to_string_lossy().to_string())
301                    .unwrap_or_default();
302                let tokenized_content =
303                    crate::ranking::preprocess_text_with_filename(&merged_content, &filename);
304
305                return Ok(SearchResult {
306                    file: path.to_string_lossy().to_string(),
307                    lines: (merged_start, merged_end),
308                    node_type: "merged_ast_line".to_string(),
309                    code: merged_content,
310                    matched_by_filename: None,
311                    rank: None,
312                    score: None,
313                    tfidf_score: None,
314                    bm25_score: None,
315                    tfidf_rank: None,
316                    bm25_rank: None,
317                    new_score: None,
318                    hybrid2_rank: None,
319                    combined_score_rank: None,
320                    file_unique_terms: None,
321                    file_total_matches: None,
322                    file_match_rank: None,
323                    block_unique_terms: None,
324                    block_total_matches: None,
325                    parent_file_id: None,
326                    block_id: None,
327                    matched_keywords: None,
328                    tokenized_content: Some(tokenized_content),
329                });
330            }
331            _ => {
332                // If no AST block found, fallback to the line + context
333                if debug_mode {
334                    println!(
335                        "[DEBUG] No AST blocks found for line {line_num}, using context-based fallback"
336                    );
337                }
338
339                // Extract context
340                let file_line_count = lines.len();
341                let start_ctx = if line_num <= context_lines {
342                    1
343                } else {
344                    line_num - context_lines
345                };
346                let end_ctx = std::cmp::min(line_num + context_lines, file_line_count);
347
348                let start_idx = start_ctx - 1;
349                let end_idx = end_ctx;
350
351                let context_code = lines[start_idx..end_idx].join("\n");
352
353                // Tokenize the content
354                let filename = path
355                    .file_name()
356                    .map(|f| f.to_string_lossy().to_string())
357                    .unwrap_or_default();
358                let tokenized_content =
359                    crate::ranking::preprocess_text_with_filename(&context_code, &filename);
360
361                return Ok(SearchResult {
362                    file: path.to_string_lossy().to_string(),
363                    lines: (start_ctx, end_ctx),
364                    node_type: "context".to_string(),
365                    code: context_code,
366                    matched_by_filename: None,
367                    rank: None,
368                    score: None,
369                    tfidf_score: None,
370                    bm25_score: None,
371                    tfidf_rank: None,
372                    bm25_rank: None,
373                    new_score: None,
374                    hybrid2_rank: None,
375                    combined_score_rank: None,
376                    file_unique_terms: None,
377                    file_total_matches: None,
378                    file_match_rank: None,
379                    block_unique_terms: None,
380                    block_total_matches: None,
381                    parent_file_id: None,
382                    block_id: None,
383                    matched_keywords: None,
384                    tokenized_content: Some(tokenized_content),
385                });
386            }
387        }
388    } else if let Some(lines_set) = specific_lines {
389        // We have specific lines to extract
390        if debug_mode {
391            println!("[DEBUG] Extracting specific lines: {lines_set:?}");
392        }
393
394        if lines_set.is_empty() {
395            if debug_mode {
396                println!("[DEBUG] No specific lines provided, returning entire file content");
397            }
398
399            // Tokenize the content
400            let filename = path
401                .file_name()
402                .map(|f| f.to_string_lossy().to_string())
403                .unwrap_or_default();
404            let tokenized_content =
405                crate::ranking::preprocess_text_with_filename(&content, &filename);
406
407            return Ok(SearchResult {
408                file: path.to_string_lossy().to_string(),
409                lines: (1, lines.len()),
410                node_type: "file".to_string(),
411                code: content,
412                matched_by_filename: None,
413                rank: None,
414                score: None,
415                tfidf_score: None,
416                bm25_score: None,
417                tfidf_rank: None,
418                bm25_rank: None,
419                new_score: None,
420                hybrid2_rank: None,
421                combined_score_rank: None,
422                file_unique_terms: None,
423                file_total_matches: None,
424                file_match_rank: None,
425                block_unique_terms: None,
426                block_total_matches: None,
427                parent_file_id: None,
428                block_id: None,
429                matched_keywords: None,
430                tokenized_content: Some(tokenized_content),
431            });
432        }
433
434        // Clamp specific lines to valid range instead of failing
435        let mut clamped_lines = HashSet::new();
436        let mut any_clamped = false;
437
438        for &line in lines_set {
439            if line == 0 || line > lines.len() {
440                if line > 0 {
441                    // Only add lines that are > 0 (clamp to max)
442                    clamped_lines.insert(line.min(lines.len()));
443                }
444                any_clamped = true;
445            } else {
446                clamped_lines.insert(line);
447            }
448        }
449
450        if debug_mode && any_clamped {
451            println!(
452                "[DEBUG] Some requested lines were out of bounds; clamping to valid range 1-{}",
453                lines.len()
454            );
455        }
456
457        // Use the clamped set for the rest of the function
458        let lines_set = &clamped_lines;
459
460        // Parse AST for all specified lines
461        let code_blocks_result = parse_file_for_code_blocks(
462            &content,
463            file_extension(path),
464            lines_set,
465            allow_tests,
466            None,
467        );
468
469        match code_blocks_result {
470            Ok(blocks) if !blocks.is_empty() => {
471                // Merge them into a bounding block
472                let min_start = blocks.iter().map(|b| b.start_row).min().unwrap_or(0);
473                let max_end = blocks.iter().map(|b| b.end_row).max().unwrap_or(0);
474
475                // Ensure max_end is within bounds of the file
476                let max_end = std::cmp::min(max_end, lines.len() - 1);
477
478                // Ensure min_start is not greater than max_end
479                let min_start = std::cmp::min(min_start, max_end);
480
481                // lines in the file are 0-indexed internally, so we add 1 for final display
482                let merged_start = min_start + 1;
483                let merged_end = max_end + 1;
484
485                if debug_mode {
486                    println!(
487                        "[DEBUG] Found {} AST blocks for specific lines, merging into lines {}-{}",
488                        blocks.len(),
489                        merged_start,
490                        merged_end
491                    );
492                }
493
494                let merged_content = lines[min_start..=max_end].join("\n");
495
496                // Tokenize the content
497                let filename = path
498                    .file_name()
499                    .map(|f| f.to_string_lossy().to_string())
500                    .unwrap_or_default();
501                let tokenized_content =
502                    crate::ranking::preprocess_text_with_filename(&merged_content, &filename);
503
504                return Ok(SearchResult {
505                    file: path.to_string_lossy().to_string(),
506                    lines: (merged_start, merged_end),
507                    node_type: "merged_ast_specific_lines".to_string(),
508                    code: merged_content,
509                    matched_by_filename: None,
510                    rank: None,
511                    score: None,
512                    tfidf_score: None,
513                    bm25_score: None,
514                    tfidf_rank: None,
515                    bm25_rank: None,
516                    new_score: None,
517                    hybrid2_rank: None,
518                    combined_score_rank: None,
519                    file_unique_terms: None,
520                    file_total_matches: None,
521                    file_match_rank: None,
522                    block_unique_terms: None,
523                    block_total_matches: None,
524                    parent_file_id: None,
525                    block_id: None,
526                    matched_keywords: None,
527                    tokenized_content: Some(tokenized_content),
528                });
529            }
530            _ => {
531                // Fallback to literal extraction of the specific lines
532                if debug_mode {
533                    println!(
534                        "[DEBUG] No AST blocks found for specific lines, falling back to literal lines"
535                    );
536                }
537
538                // Get the min and max line numbers
539                let min_line = *lines_set.iter().min().unwrap_or(&1);
540                let max_line = *lines_set.iter().max().unwrap_or(&lines.len());
541
542                // Add some context around the lines
543                let start = if min_line <= context_lines {
544                    1
545                } else {
546                    min_line - context_lines
547                };
548                let end = std::cmp::min(max_line + context_lines, lines.len());
549
550                let start_idx = start - 1;
551                let end_idx = end;
552                let range_content = lines[start_idx..end_idx].join("\n");
553
554                // Tokenize the content
555                let filename = path
556                    .file_name()
557                    .map(|f| f.to_string_lossy().to_string())
558                    .unwrap_or_default();
559                let tokenized_content =
560                    crate::ranking::preprocess_text_with_filename(&range_content, &filename);
561
562                return Ok(SearchResult {
563                    file: path.to_string_lossy().to_string(),
564                    lines: (start, end),
565                    node_type: "specific_lines".to_string(),
566                    code: range_content,
567                    matched_by_filename: None,
568                    rank: None,
569                    score: None,
570                    tfidf_score: None,
571                    bm25_score: None,
572                    tfidf_rank: None,
573                    bm25_rank: None,
574                    new_score: None,
575                    hybrid2_rank: None,
576                    combined_score_rank: None,
577                    file_unique_terms: None,
578                    file_total_matches: None,
579                    file_match_rank: None,
580                    block_unique_terms: None,
581                    block_total_matches: None,
582                    parent_file_id: None,
583                    block_id: None,
584                    matched_keywords: None,
585                    tokenized_content: Some(tokenized_content),
586                });
587            }
588        }
589    } else {
590        // No line specified, return the entire file
591        if debug_mode {
592            println!("[DEBUG] No line or range specified, returning entire file content");
593        }
594
595        // Tokenize the content
596        let filename = path
597            .file_name()
598            .map(|f| f.to_string_lossy().to_string())
599            .unwrap_or_default();
600        let tokenized_content = crate::ranking::preprocess_text_with_filename(&content, &filename);
601
602        Ok(SearchResult {
603            file: path.to_string_lossy().to_string(),
604            lines: (1, lines.len()),
605            node_type: "file".to_string(),
606            code: content,
607            matched_by_filename: None,
608            rank: None,
609            score: None,
610            tfidf_score: None,
611            bm25_score: None,
612            tfidf_rank: None,
613            bm25_rank: None,
614            new_score: None,
615            hybrid2_rank: None,
616            combined_score_rank: None,
617            file_unique_terms: None,
618            file_total_matches: None,
619            file_match_rank: None,
620            block_unique_terms: None,
621            block_total_matches: None,
622            parent_file_id: None,
623            block_id: None,
624            matched_keywords: None,
625            tokenized_content: Some(tokenized_content),
626        })
627    }
628}
629
/// Returns the extension of `path` as a string slice.
///
/// Yields an empty string when the path has no extension or when the
/// extension is not valid UTF-8.
fn file_extension(path: &Path) -> &str {
    match path.extension() {
        Some(ext) => ext.to_str().unwrap_or(""),
        None => "",
    }
}