probe_code/extract/
mod.rs

1//! Extract command functionality for extracting code blocks from files.
2//!
3//! This module provides functions for extracting code blocks from files based on file paths
4//! and optional line numbers. When a line number is specified, it uses tree-sitter to find
5//! the closest suitable parent node (function, struct, class, etc.) for that line.
6
7mod file_paths;
8mod formatter;
9mod processor;
10mod prompts;
11pub mod symbol_finder;
12
13// Re-export public functions
14#[allow(unused_imports)]
15pub use file_paths::{
16    extract_file_paths_from_git_diff, extract_file_paths_from_text, is_git_diff_format,
17    parse_file_with_line,
18};
19#[allow(unused_imports)]
20pub use formatter::{
21    format_and_print_extraction_results, format_extraction_dry_run, format_extraction_results,
22};
23#[allow(unused_imports)]
24pub use processor::process_file_for_extraction;
25#[allow(unused_imports)]
26pub use prompts::PromptTemplate;
27
28use anyhow::Result;
29use probe_code::extract::file_paths::{set_custom_ignores, FilePathInfo};
30use probe_code::models::SearchResult;
31use std::collections::HashSet;
32use std::io::Read;
33#[allow(unused_imports)]
34use std::path::PathBuf;
35
36/// Options for the extract command
37pub struct ExtractOptions {
38    /// Files to extract from
39    pub files: Vec<String>,
40    /// Custom patterns to ignore
41    pub custom_ignores: Vec<String>,
42    /// Number of context lines to include
43    pub context_lines: usize,
44    /// Output format
45    pub format: String,
46    /// Whether to read from clipboard
47    pub from_clipboard: bool,
48    /// Path to input file to read from
49    pub input_file: Option<String>,
50    /// Whether to write to clipboard
51    pub to_clipboard: bool,
52    /// Whether to perform a dry run
53    pub dry_run: bool,
54    /// Whether to parse input as git diff format
55    pub diff: bool,
56    /// Whether to allow test files and test code blocks
57    pub allow_tests: bool,
58    /// Whether to keep and display the original input content
59    pub keep_input: bool,
60    /// Optional prompt template for LLM models
61    pub prompt: Option<prompts::PromptTemplate>,
62    /// Optional user instructions for LLM models
63    pub instructions: Option<String>,
64}
65
66/// Handle the extract command
67pub fn handle_extract(options: ExtractOptions) -> Result<()> {
68    use arboard::Clipboard;
69    use colored::*;
70
71    // Check if debug mode is enabled
72    let debug_mode = std::env::var("DEBUG").unwrap_or_default() == "1";
73
74    if debug_mode {
75        println!("\n[DEBUG] ===== Extract Command Started =====");
76        println!("[DEBUG] Files to process: {files:?}", files = options.files);
77        println!(
78            "[DEBUG] Custom ignores: {custom_ignores:?}",
79            custom_ignores = options.custom_ignores
80        );
81        println!(
82            "[DEBUG] Context lines: {context_lines}",
83            context_lines = options.context_lines
84        );
85        println!("[DEBUG] Output format: {format}", format = options.format);
86        println!(
87            "[DEBUG] Read from clipboard: {from_clipboard}",
88            from_clipboard = options.from_clipboard
89        );
90        println!(
91            "[DEBUG] Write to clipboard: {to_clipboard}",
92            to_clipboard = options.to_clipboard
93        );
94        println!("[DEBUG] Dry run: {dry_run}", dry_run = options.dry_run);
95        println!("[DEBUG] Parse as git diff: {diff}", diff = options.diff);
96        println!(
97            "[DEBUG] Allow tests: {allow_tests}",
98            allow_tests = options.allow_tests
99        );
100        println!(
101            "[DEBUG] Prompt template: {prompt:?}",
102            prompt = options.prompt
103        );
104        println!(
105            "[DEBUG] Instructions: {instructions:?}",
106            instructions = options.instructions
107        );
108    }
109
110    // Set custom ignore patterns
111    set_custom_ignores(&options.custom_ignores);
112
113    let mut file_paths: Vec<FilePathInfo> = Vec::new();
114
115    // Store the original input if the keep_input flag is set
116    let mut original_input: Option<String> = None;
117
118    if options.from_clipboard {
119        // Read from clipboard
120        println!("{}", "Reading from clipboard...".bold().blue());
121        let mut clipboard = Clipboard::new()?;
122        let buffer = clipboard.get_text()?;
123
124        // Store the original input if keep_input is true
125        if options.keep_input {
126            original_input = Some(buffer.clone());
127            if debug_mode {
128                println!(
129                    "[DEBUG] Stored original clipboard input: {} bytes",
130                    original_input.as_ref().map_or(0, |s| s.len())
131                );
132            }
133        }
134
135        if debug_mode {
136            println!(
137                "[DEBUG] Reading from clipboard, content length: {} bytes",
138                buffer.len()
139            );
140        }
141
142        // Auto-detect git diff format or use explicit flag
143        let is_diff_format = options.diff || is_git_diff_format(&buffer);
144
145        if is_diff_format {
146            // Parse as git diff format
147            if debug_mode {
148                println!("[DEBUG] Parsing clipboard content as git diff format");
149            }
150            file_paths = extract_file_paths_from_git_diff(&buffer, options.allow_tests);
151        } else {
152            // Parse as regular text
153            file_paths = file_paths::extract_file_paths_from_text(&buffer, options.allow_tests);
154        }
155
156        if debug_mode {
157            println!(
158                "[DEBUG] Extracted {} file paths from clipboard",
159                file_paths.len()
160            );
161            for (path, start, end, symbol, lines) in &file_paths {
162                println!(
163                    "[DEBUG]   - {:?} (lines: {:?}-{:?}, symbol: {:?}, specific lines: {:?})",
164                    path,
165                    start,
166                    end,
167                    symbol,
168                    lines.as_ref().map(|l| l.len())
169                );
170            }
171        }
172
173        if file_paths.is_empty() {
174            println!("{}", "No file paths found in clipboard.".yellow().bold());
175            return Ok(());
176        }
177    } else if let Some(input_file_path) = &options.input_file {
178        // Read from input file
179        println!(
180            "{}",
181            format!("Reading from file: {input_file_path}...")
182                .bold()
183                .blue()
184        );
185
186        // Check if the file exists
187        let input_path = std::path::Path::new(input_file_path);
188        if !input_path.exists() {
189            return Err(anyhow::anyhow!(
190                "Input file does not exist: {}",
191                input_file_path
192            ));
193        }
194
195        // Read the file content
196        let buffer = std::fs::read_to_string(input_path)?;
197
198        // Store the original input if keep_input is true
199        if options.keep_input {
200            original_input = Some(buffer.clone());
201            if debug_mode {
202                println!(
203                    "[DEBUG] Stored original file input: {} bytes",
204                    original_input.as_ref().map_or(0, |s| s.len())
205                );
206            }
207        }
208
209        if debug_mode {
210            println!(
211                "[DEBUG] Reading from file, content length: {} bytes",
212                buffer.len()
213            );
214        }
215
216        // Auto-detect git diff format or use explicit flag
217        let is_diff_format = options.diff || is_git_diff_format(&buffer);
218
219        if is_diff_format {
220            // Parse as git diff format
221            if debug_mode {
222                println!("[DEBUG] Parsing file content as git diff format");
223            }
224            file_paths = extract_file_paths_from_git_diff(&buffer, options.allow_tests);
225        } else {
226            // Parse as regular text
227            file_paths = file_paths::extract_file_paths_from_text(&buffer, options.allow_tests);
228        }
229
230        if debug_mode {
231            println!(
232                "[DEBUG] Extracted {} file paths from input file",
233                file_paths.len()
234            );
235            for (path, start, end, symbol, lines) in &file_paths {
236                println!(
237                    "[DEBUG]   - {:?} (lines: {:?}-{:?}, symbol: {:?}, specific lines: {:?})",
238                    path,
239                    start,
240                    end,
241                    symbol,
242                    lines.as_ref().map(|l| l.len())
243                );
244            }
245        }
246
247        if file_paths.is_empty() {
248            println!(
249                "{}",
250                format!("No file paths found in input file: {input_file_path}")
251                    .yellow()
252                    .bold()
253            );
254            return Ok(());
255        }
256    } else if options.files.is_empty() {
257        // Check if stdin is available (not a terminal)
258        let is_stdin_available = !atty::is(atty::Stream::Stdin);
259
260        if is_stdin_available {
261            // Read from stdin
262            println!("{}", "Reading from stdin...".bold().blue());
263            let mut buffer = String::new();
264            std::io::stdin().read_to_string(&mut buffer)?;
265
266            // Store the original input if keep_input is true
267            if options.keep_input {
268                original_input = Some(buffer.clone());
269                if debug_mode {
270                    println!(
271                        "[DEBUG] Stored original stdin input: {} bytes",
272                        original_input.as_ref().map_or(0, |s| s.len())
273                    );
274                }
275            }
276
277            if debug_mode {
278                println!(
279                    "[DEBUG] Reading from stdin, content length: {} bytes",
280                    buffer.len()
281                );
282            }
283
284            // Auto-detect git diff format or use explicit flag
285            let is_diff_format = options.diff || is_git_diff_format(&buffer);
286
287            if is_diff_format {
288                // Parse as git diff format
289                if debug_mode {
290                    println!("[DEBUG] Parsing stdin content as git diff format");
291                }
292                file_paths = extract_file_paths_from_git_diff(&buffer, options.allow_tests);
293            } else {
294                // Parse as regular text
295                file_paths = file_paths::extract_file_paths_from_text(&buffer, options.allow_tests);
296            }
297        } else {
298            // No arguments and no stdin, show help
299            println!(
300                "{}",
301                "No files specified and no stdin input detected."
302                    .yellow()
303                    .bold()
304            );
305            println!("{}", "Use --help for usage information.".blue());
306            return Ok(());
307        }
308
309        if debug_mode {
310            println!(
311                "[DEBUG] Extracted {} file paths from stdin",
312                file_paths.len()
313            );
314            for (path, start, end, symbol, lines) in &file_paths {
315                println!(
316                    "[DEBUG]   - {:?} (lines: {:?}-{:?}, symbol: {:?}, specific lines: {:?})",
317                    path,
318                    start,
319                    end,
320                    symbol,
321                    lines.as_ref().map(|l| l.len())
322                );
323            }
324        }
325
326        if file_paths.is_empty() {
327            println!("{}", "No file paths found in stdin.".yellow().bold());
328            return Ok(());
329        }
330    } else {
331        // Parse command-line arguments
332        if debug_mode {
333            println!("[DEBUG] Parsing command-line arguments");
334        }
335
336        // Store the original input if keep_input is true
337        if options.keep_input {
338            original_input = Some(options.files.join(" "));
339            if debug_mode {
340                println!(
341                    "[DEBUG] Stored original command-line input: {}",
342                    original_input.as_ref().unwrap_or(&String::new())
343                );
344            }
345        }
346
347        for file in &options.files {
348            if debug_mode {
349                println!("[DEBUG] Parsing file argument: {file}");
350            }
351
352            let paths = file_paths::parse_file_with_line(file, options.allow_tests);
353
354            if debug_mode {
355                println!(
356                    "[DEBUG] Parsed {} paths from argument '{}'",
357                    paths.len(),
358                    file
359                );
360                for (path, start, end, symbol, lines) in &paths {
361                    println!(
362                        "[DEBUG]   - {:?} (lines: {:?}-{:?}, symbol: {:?}, specific lines: {:?})",
363                        path,
364                        start,
365                        end,
366                        symbol,
367                        lines.as_ref().map(|l| l.len())
368                    );
369                }
370            }
371
372            file_paths.extend(paths);
373        }
374    }
375
376    // Only print file information for non-JSON/XML formats
377    if options.format != "json" && options.format != "xml" {
378        println!("{text}", text = "Files to extract:".bold().green());
379
380        for (path, start_line, end_line, symbol, lines) in &file_paths {
381            if let (Some(start), Some(end)) = (start_line, end_line) {
382                println!(
383                    "  {path} (lines {start}-{end})",
384                    path = path.display(),
385                    start = start,
386                    end = end
387                );
388            } else if let Some(line_num) = start_line {
389                println!(
390                    "  {path} (line {line_num})",
391                    path = path.display(),
392                    line_num = line_num
393                );
394            } else if let Some(sym) = symbol {
395                println!("  {path} (symbol: {sym})", path = path.display());
396            } else if let Some(lines_set) = lines {
397                println!(
398                    "  {path} (specific lines: {count} lines)",
399                    path = path.display(),
400                    count = lines_set.len()
401                );
402            } else {
403                println!("  {path}", path = path.display());
404            }
405        }
406
407        if options.context_lines > 0 {
408            println!(
409                "Context lines: {context_lines}",
410                context_lines = options.context_lines
411            );
412        }
413
414        if options.dry_run {
415            println!(
416                "{text}",
417                text = "Dry run (file names and lines only)".yellow()
418            );
419        }
420
421        println!("Format: {format}", format = options.format);
422        println!();
423    }
424
425    // Process prompt template and instructions if provided
426    let system_prompt = if let Some(prompt_template) = &options.prompt {
427        if debug_mode {
428            println!("[DEBUG] Processing prompt template: {prompt_template:?}");
429        }
430        match prompt_template.get_content() {
431            Ok(content) => {
432                if debug_mode {
433                    println!(
434                        "[DEBUG] Loaded prompt template content ({} bytes)",
435                        content.len()
436                    );
437                }
438                Some(content)
439            }
440            Err(e) => {
441                eprintln!(
442                    "{text}",
443                    text = format!("Error loading prompt template: {e}").red()
444                );
445                if debug_mode {
446                    println!("[DEBUG] Error loading prompt template: {e}");
447                }
448                None
449            }
450        }
451    } else {
452        None
453    };
454
455    // Process files in parallel using Rayon
456    use rayon::prelude::*;
457    use std::sync::{Arc, Mutex};
458
459    // Create thread-safe containers for results and errors
460    let results_mutex = Arc::new(Mutex::new(Vec::<SearchResult>::new()));
461    let errors_mutex = Arc::new(Mutex::new(Vec::<String>::new()));
462
463    // Create a struct to hold all parameters for parallel processing
464    struct FileProcessingParams {
465        path: std::path::PathBuf,
466        start_line: Option<usize>,
467        end_line: Option<usize>,
468        symbol: Option<String>,
469        specific_lines: Option<HashSet<usize>>,
470        allow_tests: bool,
471        context_lines: usize,
472        debug_mode: bool,
473        format: String,
474
475        #[allow(dead_code)]
476        original_input: Option<String>,
477        #[allow(dead_code)]
478        system_prompt: Option<String>,
479        #[allow(dead_code)]
480        user_instructions: Option<String>,
481    }
482
483    // Collect all file parameters
484    let file_params: Vec<FileProcessingParams> = file_paths
485        .into_iter()
486        .map(
487            |(path, start_line, end_line, symbol, specific_lines)| FileProcessingParams {
488                path,
489                start_line,
490                end_line,
491                symbol,
492                specific_lines,
493                allow_tests: options.allow_tests,
494                context_lines: options.context_lines,
495                debug_mode,
496                format: options.format.clone(),
497                original_input: original_input.clone(),
498                system_prompt: system_prompt.clone(),
499                user_instructions: options.instructions.clone(),
500            },
501        )
502        .collect();
503
504    // Process files in parallel
505    file_params.par_iter().for_each(|params| {
506        if params.debug_mode {
507            println!("\n[DEBUG] Processing file: {:?}", params.path);
508            println!("[DEBUG] Start line: {:?}", params.start_line);
509            println!("[DEBUG] End line: {:?}", params.end_line);
510            println!("[DEBUG] Symbol: {:?}", params.symbol);
511            println!(
512                "[DEBUG] Specific lines: {:?}",
513                params.specific_lines.as_ref().map(|l| l.len())
514            );
515
516            // Check if file exists
517            if params.path.exists() {
518                println!("[DEBUG] File exists: Yes");
519
520                // Get file extension and language
521                if let Some(ext) = params.path.extension().and_then(|e| e.to_str()) {
522                    let language = formatter::get_language_from_extension(ext);
523                    println!("[DEBUG] File extension: {ext}");
524                    println!(
525                        "[DEBUG] Detected language: {}",
526                        if language.is_empty() {
527                            "unknown"
528                        } else {
529                            language
530                        }
531                    );
532                } else {
533                    println!("[DEBUG] File has no extension");
534                }
535            } else {
536                println!("[DEBUG] File exists: No");
537            }
538        }
539
540        // The allow_tests check is now handled in the file path extraction functions
541        // We only need to check if this is a test file for debugging purposes
542        if params.debug_mode && crate::language::is_test_file(&params.path) && !params.allow_tests {
543            println!("[DEBUG] Test file detected: {:?}", params.path);
544        }
545
546        match processor::process_file_for_extraction(
547            &params.path,
548            params.start_line,
549            params.end_line,
550            params.symbol.as_deref(),
551            params.allow_tests,
552            params.context_lines,
553            params.specific_lines.as_ref(),
554        ) {
555            Ok(result) => {
556                if params.debug_mode {
557                    println!("[DEBUG] Successfully extracted code from {:?}", params.path);
558                    println!("[DEBUG] Extracted lines: {:?}", result.lines);
559                    println!("[DEBUG] Node type: {}", result.node_type);
560                    println!("[DEBUG] Code length: {} bytes", result.code.len());
561                    println!(
562                        "[DEBUG] Estimated tokens: {}",
563                        crate::search::search_tokens::count_tokens(&result.code)
564                    );
565                }
566
567                // Thread-safe addition to results
568                let mut results = results_mutex.lock().unwrap();
569                results.push(result);
570            }
571            Err(e) => {
572                let error_msg = format!(
573                    "Error processing file {path:?}: {e}",
574                    path = params.path,
575                    e = e
576                );
577                if params.debug_mode {
578                    println!("[DEBUG] Error: {error_msg}");
579                }
580                // Only print error messages for non-JSON/XML formats
581                if params.format != "json" && params.format != "xml" {
582                    eprintln!("{}", error_msg.red());
583                }
584                // Thread-safe addition to errors
585                let mut errors = errors_mutex.lock().unwrap();
586                errors.push(error_msg);
587            }
588        }
589    });
590    // Move results and errors from the mutex containers
591    let mut results = Arc::try_unwrap(results_mutex)
592        .expect("Failed to unwrap results mutex")
593        .into_inner()
594        .expect("Failed to get inner results");
595
596    let errors = Arc::try_unwrap(errors_mutex)
597        .expect("Failed to unwrap errors mutex")
598        .into_inner()
599        .expect("Failed to get inner errors");
600
601    // Deduplicate results based on file path and line range
602    if debug_mode {
603        println!(
604            "[DEBUG] Before deduplication: {len} results",
605            len = results.len()
606        );
607    }
608
609    // First, sort results by file path and then by line range size (largest first)
610    // This ensures that parent blocks (like classes) are processed before nested blocks (like methods)
611    results.sort_by(|a, b| {
612        let a_file = &a.file;
613        let b_file = &b.file;
614
615        // First compare by file path
616        if a_file != b_file {
617            return a_file.cmp(b_file);
618        }
619
620        // Then compare by range size (largest first)
621        let a_range_size = a.lines.1 - a.lines.0;
622        let b_range_size = b.lines.1 - b.lines.0;
623        b_range_size.cmp(&a_range_size)
624    });
625
626    if debug_mode {
627        println!("[DEBUG] Sorted results by file path and range size");
628        for (i, result) in results.iter().enumerate() {
629            println!(
630                "[DEBUG] Result {}: {} (lines {}-{}, size: {})",
631                i,
632                result.file,
633                result.lines.0,
634                result.lines.1,
635                result.lines.1 - result.lines.0
636            );
637        }
638    }
639
640    // Now deduplicate, keeping track of which results to retain
641    let mut to_retain = vec![true; results.len()];
642
643    // Use a HashSet to track exact duplicates
644    let mut seen_exact = HashSet::new();
645
646    for i in 0..results.len() {
647        if !to_retain[i] {
648            continue; // Skip already marked for removal
649        }
650
651        let result_i = &results[i];
652        let file_i = &result_i.file;
653        let start_i = result_i.lines.0;
654        let end_i = result_i.lines.1;
655
656        // Check for exact duplicates first
657        let key = format!("{file_i}:{start_i}:{end_i}");
658        if !seen_exact.insert(key) {
659            to_retain[i] = false;
660            if debug_mode {
661                println!("[DEBUG] Removing exact duplicate: {file_i} (lines {start_i}-{end_i})");
662            }
663            continue;
664        }
665
666        // Then check for nested duplicates
667        for j in i + 1..results.len() {
668            if !to_retain[j] {
669                continue; // Skip already marked for removal
670            }
671
672            let result_j = &results[j];
673            let file_j = &result_j.file;
674            let start_j = result_j.lines.0;
675            let end_j = result_j.lines.1;
676
677            // Only compare results from the same file
678            if file_i != file_j {
679                continue;
680            }
681
682            // Check if result_j is contained within result_i
683            if start_j >= start_i && end_j <= end_i {
684                to_retain[j] = false;
685                if debug_mode {
686                    println!("[DEBUG] Removing nested duplicate: {file_j} (lines {start_j}-{end_j}) contained within (lines {start_i}-{end_i})");
687                }
688            }
689        }
690    }
691
692    // Apply the retention filter
693    let original_len = results.len();
694    let mut new_results = Vec::with_capacity(original_len);
695
696    for i in 0..original_len {
697        if to_retain[i] {
698            new_results.push(results[i].clone());
699        }
700    }
701
702    results = new_results;
703
704    if debug_mode {
705        println!(
706            "[DEBUG] After deduplication: {len} results",
707            len = results.len()
708        );
709    }
710
711    if debug_mode {
712        println!("\n[DEBUG] ===== Extraction Summary =====");
713        println!("[DEBUG] Total results: {}", results.len());
714        println!("[DEBUG] Total errors: {}", errors.len());
715        println!("[DEBUG] Output format: {}", options.format);
716        println!("[DEBUG] Dry run: {}", options.dry_run);
717    }
718
719    // Format the results
720    let res = {
721        // Temporarily disable colors if writing to clipboard
722        let colors_enabled = if options.to_clipboard {
723            let was_enabled = colored::control::SHOULD_COLORIZE.should_colorize();
724            colored::control::set_override(false);
725            was_enabled
726        } else {
727            false
728        };
729
730        // Format the results
731        let result = if options.dry_run {
732            formatter::format_extraction_dry_run(
733                &results,
734                &options.format,
735                original_input.as_deref(),
736                system_prompt.as_deref(),
737                options.instructions.as_deref(),
738            )
739        } else {
740            formatter::format_extraction_results(
741                &results,
742                &options.format,
743                original_input.as_deref(),
744                system_prompt.as_deref(),
745                options.instructions.as_deref(),
746            )
747        };
748
749        // Restore color settings if they were changed
750        if options.to_clipboard && colors_enabled {
751            colored::control::set_override(true);
752        }
753
754        result
755    };
756    match res {
757        Ok(formatted_output) => {
758            if options.to_clipboard {
759                // Write to clipboard
760                let mut clipboard = Clipboard::new()?;
761                clipboard.set_text(&formatted_output)?;
762                println!("{}", "Results copied to clipboard.".green().bold());
763
764                if debug_mode {
765                    println!(
766                        "[DEBUG] Wrote {} bytes to clipboard",
767                        formatted_output.len()
768                    );
769                }
770            } else {
771                // Print to stdout
772                println!("{formatted_output}");
773            }
774        }
775        Err(e) => {
776            // Only print error messages for non-JSON/XML formats
777            if options.format != "json" && options.format != "xml" {
778                eprintln!("{}", format!("Error formatting results: {e}").red());
779            }
780            if debug_mode {
781                println!("[DEBUG] Error formatting results: {e}");
782            }
783        }
784    }
785
786    // Print summary of errors if any (only for non-JSON/XML formats)
787    if !errors.is_empty() && options.format != "json" && options.format != "xml" {
788        println!();
789        println!(
790            "{} {} {}",
791            "Encountered".red().bold(),
792            errors.len(),
793            if errors.len() == 1 { "error" } else { "errors" }
794        );
795    }
796
797    if debug_mode {
798        println!("[DEBUG] ===== Extract Command Completed =====");
799    }
800
801    Ok(())
802}