agcodex_core/tools/
glob.rs

1//! High-performance file discovery tool with glob pattern support
2//!
3//! This module provides efficient file globbing with:
4//! - ignore::WalkBuilder for respecting .gitignore files
5//! - Parallel directory traversal for performance
6//! - Complex glob patterns (*.rs, **/*.js, etc.)
7//! - Extension-based filtering
8//! - Rich metadata and context-aware output
9//! - <100ms performance target for 10k files
10
11use super::output::ComprehensiveToolOutput;
12use ignore::DirEntry;
13use ignore::Walk;
14use ignore::WalkBuilder;
15use ignore::WalkParallel;
16use ignore::WalkState;
17use regex::Regex;
18use serde::Deserialize;
19use serde::Serialize;
20use std::collections::HashMap;
21use std::collections::HashSet;
22use std::ffi::OsStr;
23use std::path::Path;
24use std::path::PathBuf;
25use std::sync::Arc;
26use std::sync::Mutex;
27use std::sync::atomic::AtomicBool;
28use std::sync::atomic::Ordering;
29use std::time::Duration;
30use std::time::Instant;
31use std::time::SystemTime;
32use thiserror::Error;
33use tracing::info;
34use wildmatch::WildMatch;
35
36/// Errors for glob operations
37#[derive(Error, Debug)]
38pub enum GlobError {
39    #[error("invalid glob pattern '{pattern}': {reason}")]
40    InvalidPattern { pattern: String, reason: String },
41
42    #[error("directory not found: {path}")]
43    DirectoryNotFound { path: PathBuf },
44
45    #[error("permission denied: {path}")]
46    PermissionDenied { path: PathBuf },
47
48    #[error("search timeout after {timeout:?}")]
49    SearchTimeout { timeout: Duration },
50
51    #[error("I/O error: {0}")]
52    Io(#[from] std::io::Error),
53
54    #[error("search error: {message}")]
55    SearchError { message: String },
56
57    #[error("search cancelled after {duration:?}")]
58    SearchCancelled { duration: Duration },
59
60    #[error("filter chain error: {message}")]
61    FilterChain { message: String },
62}
63
64/// Result type for glob operations
65pub type GlobResult<T> = std::result::Result<T, GlobError>;
66
67/// Output type for glob operations  
68pub type GlobOutput<T> = ComprehensiveToolOutput<T>;
69
/// How a directory tree should be walked.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SearchStrategy {
    /// Walk with multiple threads.
    Parallel,
    /// Walk on a single thread, favouring lower memory use.
    Sequential,
    /// Decide between the two based on the size of the directory.
    Auto,
}
80
81/// File match result with metadata
82#[derive(Debug, Clone, Serialize, Deserialize)]
83pub struct FileMatch {
84    /// Absolute path to the file
85    pub path: PathBuf,
86    /// File size in bytes (None for directories)
87    pub size: Option<u64>,
88    /// File extension (if any)
89    pub extension: Option<String>,
90    /// File type classification
91    pub file_type: FileType,
92    /// Relative path from search root
93    pub relative_path: PathBuf,
94    /// Last modified time
95    pub modified: Option<SystemTime>,
96    /// Whether the file is executable
97    pub executable: bool,
98    /// Content category for easier filtering
99    pub content_category: ContentCategory,
100    /// Estimated lines of code (for text files)
101    pub estimated_lines: Option<usize>,
102}
103
104/// File type classification
105#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
106pub enum FileType {
107    /// Regular file
108    File,
109    /// Directory
110    Directory,
111    /// Symbolic link
112    Symlink,
113    /// Other file type
114    Other,
115}
116
117/// Content category for semantic understanding
118#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
119pub enum ContentCategory {
120    /// Source code files
121    Source,
122    /// Configuration files
123    Config,
124    /// Documentation files
125    Documentation,
126    /// Test files
127    Test,
128    /// Binary or executable files
129    Binary,
130    /// Data files (JSON, CSV, etc.)
131    Data,
132    /// Unknown or unclassified
133    Unknown,
134}
135
136/// Search query configuration
137#[derive(Debug, Clone)]
138pub struct GlobQuery {
139    /// Base directory to search from
140    pub base_dir: PathBuf,
141    /// Glob patterns to match (e.g., "*.rs", "**/*.js")
142    pub patterns: Vec<String>,
143    /// File type filter
144    pub file_type: Option<FileType>,
145    /// Size constraints
146    pub size_filter: Option<SizeFilter>,
147    /// Maximum search depth
148    pub max_depth: Option<usize>,
149    /// Include hidden files/directories
150    pub include_hidden: bool,
151    /// Follow symbolic links
152    pub follow_links: bool,
153    /// Case sensitive matching
154    pub case_sensitive: bool,
155    /// Maximum number of results (0 = unlimited)
156    pub max_results: usize,
157    /// Search timeout
158    pub timeout: Option<Duration>,
159}
160
/// Optional lower and upper bounds on file size.
#[derive(Debug, Clone)]
pub struct SizeFilter {
    /// Smallest accepted size, if bounded below.
    pub min_size: Option<u64>,
    /// Largest accepted size, if bounded above.
    pub max_size: Option<u64>,
}
167
/// Counters accumulated during a search; all start at zero.
#[derive(Debug, Clone, Default)]
pub struct SearchStats {
    /// Number of directories visited.
    pub directories_traversed: usize,
    /// Number of files looked at.
    pub files_examined: usize,
    /// Number of files dropped by ignore rules.
    pub files_ignored: usize,
}
178
/// Optional bounds on a file's modification time.
#[derive(Debug, Clone)]
pub struct TimeFilter {
    /// Earliest accepted modification time, if bounded.
    pub modified_after: Option<SystemTime>,
    /// Latest accepted modification time, if bounded.
    pub modified_before: Option<SystemTime>,
}
185
186/// Filter chain for complex file matching
187#[derive(Debug, Clone, Default)]
188pub struct FilterChain {
189    /// Glob patterns to match
190    pub glob_patterns: Vec<GlobPattern>,
191    /// File size filters
192    pub size_filters: Vec<SizeFilter>,
193    /// Time-based filters
194    pub time_filters: Vec<TimeFilter>,
195    /// Content category filters
196    pub category_filters: Vec<ContentCategory>,
197    /// File type filters
198    pub type_filters: Vec<FileType>,
199    /// Custom exclude patterns
200    pub exclude_patterns: Vec<String>,
201}
202
/// A glob pattern plus the options it is compiled with.
#[derive(Debug, Clone)]
pub struct GlobPattern {
    /// The pattern text.
    pub pattern: String,
    /// Whether matching distinguishes upper and lower case.
    pub case_sensitive: bool,
    /// Whether this is a negated pattern.
    pub negate: bool,
}
210
211/// Compiled filters for efficient matching
212#[derive(Debug, Clone)]
213pub struct CompiledFilters {
214    /// Compiled glob patterns
215    glob_matchers: Vec<CompiledGlobPattern>,
216    /// Size filters (no compilation needed)
217    size_filters: Vec<SizeFilter>,
218    /// Time filters (no compilation needed)
219    time_filters: Vec<TimeFilter>,
220    /// Category filters (no compilation needed)
221    category_filters: Vec<ContentCategory>,
222    /// Type filters (no compilation needed)
223    type_filters: Vec<FileType>,
224    /// Compiled exclude patterns
225    exclude_matchers: Vec<WildMatch>,
226}
227
228/// Compiled glob pattern for efficient matching
229#[derive(Debug, Clone)]
230struct CompiledGlobPattern {
231    pattern: String,
232    matcher: WildMatch,
233    negate: bool,
234}
235
/// Lookup tables used to assign a content category to a path.
#[derive(Debug, Clone)]
pub struct ContentClassifier {
    /// Extensions treated as source code.
    source_extensions: HashSet<String>,
    /// Extensions treated as configuration.
    config_extensions: HashSet<String>,
    /// Extensions treated as documentation.
    doc_extensions: HashSet<String>,
    /// Substrings that mark a file name as a test.
    test_patterns: Vec<String>,
}
244
245impl FilterChain {
246    /// Add a glob pattern to the filter chain
247    pub fn add_glob(
248        &mut self,
249        pattern: &str,
250        case_sensitive: bool,
251        negate: bool,
252    ) -> GlobResult<()> {
253        // Validate pattern
254        if pattern.is_empty() {
255            return Err(GlobError::InvalidPattern {
256                pattern: pattern.to_string(),
257                reason: "Pattern cannot be empty".to_string(),
258            });
259        }
260
261        self.glob_patterns.push(GlobPattern {
262            pattern: pattern.to_string(),
263            case_sensitive,
264            negate,
265        });
266        Ok(())
267    }
268}
269
270impl CompiledFilters {
271    /// Compile a filter chain into optimized matchers
272    pub fn compile(chain: &FilterChain) -> GlobResult<Self> {
273        let mut glob_matchers = Vec::new();
274
275        for pattern in &chain.glob_patterns {
276            let matcher = WildMatch::new(&pattern.pattern);
277            glob_matchers.push(CompiledGlobPattern {
278                pattern: pattern.pattern.clone(),
279                matcher,
280                negate: pattern.negate,
281            });
282        }
283
284        let exclude_matchers = chain
285            .exclude_patterns
286            .iter()
287            .map(|p| WildMatch::new(p))
288            .collect();
289
290        Ok(Self {
291            glob_matchers,
292            size_filters: chain.size_filters.clone(),
293            time_filters: chain.time_filters.clone(),
294            category_filters: chain.category_filters.clone(),
295            type_filters: chain.type_filters.clone(),
296            exclude_matchers,
297        })
298    }
299
300    /// Check if a file matches all filters
301    pub fn matches(&self, file: &FileMatch) -> bool {
302        // Check glob patterns
303        if !self.glob_matchers.is_empty() {
304            // For simple patterns without path separators, match against filename only
305            // For patterns with path separators, match against the relative path
306            let mut matched = false;
307
308            for pattern in &self.glob_matchers {
309                let is_match = if pattern.pattern.contains('/') || pattern.pattern.contains("**") {
310                    // Match against relative path for complex patterns
311                    pattern
312                        .matcher
313                        .matches(&file.relative_path.to_string_lossy())
314                } else {
315                    // For simple patterns (e.g., "*.rs"), only match files in the root directory
316                    // Check that the relative path doesn't contain any directory separators
317                    let relative_path_str = file.relative_path.to_string_lossy();
318                    if relative_path_str.contains('/') || relative_path_str.contains('\\') {
319                        // File is in a subdirectory, don't match for simple patterns
320                        false
321                    } else if let Some(file_name) = file.path.file_name() {
322                        // File is in root directory, check if filename matches
323                        pattern.matcher.matches(&file_name.to_string_lossy())
324                    } else {
325                        false
326                    }
327                };
328
329                if pattern.negate {
330                    if is_match {
331                        return false;
332                    }
333                } else if is_match {
334                    matched = true;
335                }
336            }
337
338            if !matched && !self.glob_matchers.iter().all(|p| p.negate) {
339                return false;
340            }
341        }
342
343        // Check exclude patterns - match against relative path
344        let relative_path_str = file.relative_path.to_string_lossy();
345        for exclude in &self.exclude_matchers {
346            if exclude.matches(&relative_path_str) {
347                return false;
348            }
349        }
350
351        // Check size filters
352        if let Some(size) = file.size {
353            for filter in &self.size_filters {
354                if let Some(min) = filter.min_size
355                    && size < min
356                {
357                    return false;
358                }
359                if let Some(max) = filter.max_size
360                    && size > max
361                {
362                    return false;
363                }
364            }
365        }
366
367        // Check time filters
368        if let Some(modified) = file.modified {
369            for filter in &self.time_filters {
370                if let Some(after) = filter.modified_after
371                    && modified < after
372                {
373                    return false;
374                }
375                if let Some(before) = filter.modified_before
376                    && modified > before
377                {
378                    return false;
379                }
380            }
381        }
382
383        // Check category filters
384        if !self.category_filters.is_empty()
385            && !self.category_filters.contains(&file.content_category)
386        {
387            return false;
388        }
389
390        // Check type filters
391        if !self.type_filters.is_empty() && !self.type_filters.contains(&file.file_type) {
392            return false;
393        }
394
395        true
396    }
397}
398
399impl Default for ContentClassifier {
400    fn default() -> Self {
401        let mut source_extensions = HashSet::new();
402        source_extensions.extend(
403            [
404                "rs", "py", "js", "ts", "jsx", "tsx", "go", "java", "c", "cpp", "cc", "cxx", "h",
405                "hpp", "cs", "php", "rb", "swift", "kt", "scala", "hs", "clj", "ex", "exs",
406            ]
407            .iter()
408            .map(|s| (*s).to_string()),
409        );
410
411        let mut config_extensions = HashSet::new();
412        config_extensions.extend(
413            [
414                "toml", "yaml", "yml", "json", "xml", "ini", "cfg", "conf", "config", "env",
415            ]
416            .iter()
417            .map(|s| (*s).to_string()),
418        );
419
420        let mut doc_extensions = HashSet::new();
421        doc_extensions.extend(
422            ["md", "txt", "rst", "adoc", "tex", "pdf", "doc", "docx"]
423                .iter()
424                .map(|s| (*s).to_string()),
425        );
426
427        let test_patterns = vec![
428            "test".to_string(),
429            "spec".to_string(),
430            "_test".to_string(),
431            ".test.".to_string(),
432            ".spec.".to_string(),
433        ];
434
435        Self {
436            source_extensions,
437            config_extensions,
438            doc_extensions,
439            test_patterns,
440        }
441    }
442}
443
444impl ContentClassifier {
445    /// Classify a file based on its path and extension
446    pub fn classify_path(&self, path: &Path) -> ContentCategory {
447        // Check if it looks like a test file first
448        if let Some(file_name) = path.file_name().and_then(|n| n.to_str()) {
449            for pattern in &self.test_patterns {
450                if file_name.contains(pattern) {
451                    return ContentCategory::Test;
452                }
453            }
454        }
455
456        // Check by extension
457        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
458            let ext_lower = ext.to_lowercase();
459
460            if self.source_extensions.contains(&ext_lower) {
461                return ContentCategory::Source;
462            }
463
464            if self.config_extensions.contains(&ext_lower) {
465                return ContentCategory::Config;
466            }
467
468            if self.doc_extensions.contains(&ext_lower) {
469                return ContentCategory::Documentation;
470            }
471
472            // Check for binary extensions
473            match ext_lower.as_str() {
474                "exe" | "dll" | "so" | "dylib" | "a" | "lib" | "o" | "obj" | "bin" => {
475                    return ContentCategory::Binary;
476                }
477                _ => {}
478            }
479        }
480
481        ContentCategory::Unknown
482    }
483}
484
485/// High-performance file discovery tool
486#[derive(Clone)]
487pub struct GlobTool {
488    /// Base directory for searches
489    base_dir: PathBuf,
490    /// Default filter chain
491    default_filters: FilterChain,
492    /// Number of threads for parallel search
493    parallelism: usize,
494    /// Whether to respect .gitignore and similar files
495    respect_ignore: bool,
496    /// Maximum number of results to return
497    max_results: Option<usize>,
498    /// Whether to follow symbolic links
499    follow_links: bool,
500    /// Whether to include hidden files
501    include_hidden: bool,
502    /// Search timeout
503    timeout: Option<Duration>,
504    /// Custom ignore patterns
505    custom_ignores: Vec<String>,
506    /// File content classifier
507    classifier: Arc<ContentClassifier>,
508    /// Cancellation support
509    cancellation: Arc<AtomicBool>,
510}
511
512impl Default for GlobTool {
513    fn default() -> Self {
514        Self::new(std::env::current_dir().unwrap_or_else(|_| PathBuf::from(".")))
515    }
516}
517
518impl Default for GlobQuery {
519    fn default() -> Self {
520        Self {
521            base_dir: std::env::current_dir().unwrap_or_else(|_| PathBuf::from(".")),
522            patterns: Vec::new(),
523            file_type: None,
524            size_filter: None,
525            max_depth: Some(32), // Reasonable default to prevent runaway
526            include_hidden: false,
527            follow_links: false,
528            case_sensitive: true,
529            max_results: 0, // unlimited
530            timeout: Some(Duration::from_secs(30)),
531        }
532    }
533}
534
535impl GlobTool {
536    /// Create a new glob tool with specified base directory
537    pub fn new(base_dir: PathBuf) -> Self {
538        let thread_count = std::thread::available_parallelism()
539            .map(|n| n.get())
540            .unwrap_or(4);
541
542        Self {
543            base_dir,
544            default_filters: FilterChain::default(),
545            parallelism: thread_count,
546            respect_ignore: true,
547            max_results: Some(10_000), // Performance limit for large codebases
548            follow_links: false,
549            include_hidden: false,
550            timeout: Some(Duration::from_secs(30)),
551            custom_ignores: Vec::new(),
552            classifier: Arc::new(ContentClassifier::default()),
553            cancellation: Arc::new(AtomicBool::new(false)),
554        }
555    }
556
557    /// Builder: Set parallelism (number of threads)
558    pub fn with_parallelism(mut self, threads: usize) -> Self {
559        self.parallelism = threads.max(1);
560        self
561    }
562
563    /// Builder: Set whether to respect ignore files
564    pub const fn with_respect_ignore(mut self, respect: bool) -> Self {
565        self.respect_ignore = respect;
566        self
567    }
568
569    /// Builder: Set maximum results limit
570    pub const fn with_max_results(mut self, max_results: Option<usize>) -> Self {
571        self.max_results = max_results;
572        self
573    }
574
575    /// Builder: Set whether to follow symbolic links
576    pub const fn with_follow_links(mut self, follow_links: bool) -> Self {
577        self.follow_links = follow_links;
578        self
579    }
580
581    /// Builder: Set whether to include hidden files
582    pub const fn with_include_hidden(mut self, include_hidden: bool) -> Self {
583        self.include_hidden = include_hidden;
584        self
585    }
586
587    /// Builder: Set search timeout
588    pub const fn with_timeout(mut self, timeout: Option<Duration>) -> Self {
589        self.timeout = timeout;
590        self
591    }
592
593    /// Builder: Add custom ignore patterns
594    pub fn with_custom_ignores(mut self, ignores: Vec<String>) -> Self {
595        self.custom_ignores = ignores;
596        self
597    }
598
599    /// Builder: Set default filter chain
600    pub fn with_filter_chain(mut self, filters: FilterChain) -> Self {
601        self.default_filters = filters;
602        self
603    }
604
605    /// Core API: Find files by glob pattern with high performance
606    pub fn glob(&self, pattern: &str) -> GlobResult<GlobOutput<Vec<FileMatch>>> {
607        let mut filters = self.default_filters.clone();
608        filters.add_glob(pattern, true, false)?;
609        self.search_with_filters(filters)
610    }
611
612    /// Core API: Find files by extension (*.ext pattern)
613    pub fn find_type(&self, extension: &str) -> GlobResult<GlobOutput<Vec<FileMatch>>> {
614        // We need to match files at all levels, including root
615        // Using two patterns: *.ext for root files and **/*.ext for nested files
616        let ext_clean = extension.trim_start_matches('.').trim_start_matches('*');
617
618        // Create a filter chain with both patterns
619        let mut filters = self.default_filters.clone();
620
621        // Add pattern for root-level files
622        filters.add_glob(&format!("*.{}", ext_clean), true, false)?;
623
624        // Add pattern for files in subdirectories
625        filters.add_glob(&format!("**/*.{}", ext_clean), true, false)?;
626
627        self.search_with_filters(filters)
628    }
629
630    /// Advanced API: Search with custom FilterChain
631    pub fn search_with_filters(
632        &self,
633        filters: FilterChain,
634    ) -> GlobResult<GlobOutput<Vec<FileMatch>>> {
635        use super::output::*;
636        let timer = PerformanceTimer::new();
637
638        // Reset cancellation flag
639        self.cancellation.store(false, Ordering::Relaxed);
640
641        // Validate base directory
642        if !self.base_dir.exists() {
643            return Err(GlobError::DirectoryNotFound {
644                path: self.base_dir.clone(),
645            });
646        }
647
648        // Compile filters for performance
649        let compiled_filters = CompiledFilters::compile(&filters)?;
650
651        // Execute search with selected strategy
652        let strategy = self.select_strategy();
653        let (matches, stats) = match strategy {
654            SearchStrategy::Parallel => self.search_parallel(&compiled_filters)?,
655            SearchStrategy::Sequential => self.search_sequential(&compiled_filters)?,
656            SearchStrategy::Auto => {
657                if self.estimate_directory_size() > 1000 {
658                    self.search_parallel(&compiled_filters)?
659                } else {
660                    self.search_sequential(&compiled_filters)?
661                }
662            }
663        };
664
665        let duration = timer.elapsed();
666        let matches_count = matches.len();
667
668        // Generate comprehensive context
669        let summary = format!(
670            "Found {} files in {} ({}ms, {} examined, {} ignored)",
671            matches_count,
672            self.base_dir.display(),
673            duration.as_millis(),
674            stats.files_examined,
675            stats.files_ignored
676        );
677
678        // Build rich output using the comprehensive output builder
679        let location = SourceLocation::new(self.base_dir.to_string_lossy(), 0, 0, 0, 0, (0, 0));
680
681        let output = OutputBuilder::new(
682            matches,
683            "glob",
684            "file_discovery".to_string(),
685            location.clone(),
686        )
687        .context(OperationContext {
688            before: ContextSnapshot {
689                content: String::new(),
690                timestamp: SystemTime::now(),
691                content_hash: String::new(),
692                ast_summary: None,
693                symbols: Vec::new(),
694            },
695            after: None,
696            surrounding: vec![
697                ContextLine {
698                    line_number: 0,
699                    content: format!("Search root: {}", self.base_dir.display()),
700                    line_type: ContextLineType::Separator,
701                    indentation: 0,
702                    modified: false,
703                },
704                ContextLine {
705                    line_number: 0,
706                    content: format!("Strategy: {:?}", strategy),
707                    line_type: ContextLineType::Separator,
708                    indentation: 0,
709                    modified: false,
710                },
711            ],
712            location: location.clone(),
713            scope: OperationScope {
714                scope_type: ScopeType::File,
715                name: self
716                    .base_dir
717                    .file_name()
718                    .and_then(|n| n.to_str())
719                    .unwrap_or("unknown")
720                    .to_string(),
721                path: vec![self.base_dir.to_string_lossy().to_string()],
722                file_path: self.base_dir.clone(),
723                line_range: 0..0,
724            },
725            language_context: None,
726            project_context: None,
727        })
728        .performance(PerformanceMetrics {
729            execution_time: duration,
730            phase_times: HashMap::new(),
731            memory_usage: MemoryUsage {
732                peak_bytes: (matches_count * std::mem::size_of::<FileMatch>()) as u64,
733                average_bytes: 0,
734                allocations: 0,
735                deallocations: 0,
736                efficiency_score: 0.9,
737            },
738            cpu_usage: CpuUsage {
739                cpu_time: duration,
740                utilization_percent: 0.0,
741                context_switches: 0,
742            },
743            io_stats: IoStats {
744                bytes_read: 0,
745                bytes_written: 0,
746                read_ops: stats.files_examined as u64,
747                write_ops: 0,
748                io_wait_time: Duration::from_millis(0),
749            },
750            cache_stats: CacheStats {
751                hit_rate: 0.0,
752                hits: 0,
753                misses: stats.files_examined as u64,
754                cache_size: 0,
755                efficiency_score: 0.0,
756            },
757        })
758        .summary(summary)
759        .build();
760
761        info!(
762            "Glob search completed: {} matches in {:?}",
763            matches_count, duration
764        );
765        Ok(output)
766    }
767
768    /// Cancel ongoing search operation
769    pub fn cancel(&self) {
770        self.cancellation.store(true, Ordering::Relaxed);
771    }
772
773    /// Check if search was cancelled
774    fn is_cancelled(&self) -> bool {
775        self.cancellation.load(Ordering::Relaxed)
776    }
777
778    /// Select optimal search strategy based on directory estimation
779    fn select_strategy(&self) -> SearchStrategy {
780        if self.parallelism == 1 {
781            SearchStrategy::Sequential
782        } else if self.estimate_directory_size() > 500 {
783            SearchStrategy::Parallel
784        } else {
785            SearchStrategy::Sequential
786        }
787    }
788
789    /// Advanced API: Search in specific directory with pattern
790    pub fn find_in_directory(
791        &self,
792        dir: &Path,
793        pattern: &str,
794    ) -> GlobResult<GlobOutput<Vec<FileMatch>>> {
795        let scoped_tool = GlobTool::new(dir.to_path_buf())
796            .with_parallelism(self.parallelism)
797            .with_max_results(self.max_results)
798            .with_follow_links(self.follow_links)
799            .with_include_hidden(self.include_hidden)
800            .with_timeout(self.timeout)
801            .with_custom_ignores(self.custom_ignores.clone())
802            .with_filter_chain(self.default_filters.clone());
803
804        scoped_tool.glob(pattern)
805    }
806
807    /// Advanced API: Find files by content category
808    pub fn find_by_category(
809        &self,
810        category: ContentCategory,
811    ) -> GlobResult<GlobOutput<Vec<FileMatch>>> {
812        let mut filters = self.default_filters.clone();
813        filters.category_filters.push(category);
814        self.search_with_filters(filters)
815    }
816
817    /// Advanced API: Find files with size constraints
818    pub fn find_by_size(
819        &self,
820        min_size: Option<u64>,
821        max_size: Option<u64>,
822    ) -> GlobResult<GlobOutput<Vec<FileMatch>>> {
823        let mut filters = self.default_filters.clone();
824        filters.size_filters.push(SizeFilter { min_size, max_size });
825        self.search_with_filters(filters)
826    }
827
828    /// Advanced API: Find files modified after a specific time
829    pub fn find_modified_since(&self, since: SystemTime) -> GlobResult<GlobOutput<Vec<FileMatch>>> {
830        let mut filters = self.default_filters.clone();
831        filters.time_filters.push(TimeFilter {
832            modified_after: Some(since),
833            modified_before: None,
834        });
835        self.search_with_filters(filters)
836    }
837
838    /// High-performance parallel search implementation
839    fn search_parallel(
840        &self,
841        filters: &CompiledFilters,
842    ) -> GlobResult<(Vec<FileMatch>, SearchStats)> {
843        let matches = Arc::new(Mutex::new(Vec::new()));
844        let stats = Arc::new(Mutex::new(SearchStats::default()));
845        let start_time = Instant::now();
846
847        let walker = self.create_parallel_walker()?;
848
849        walker.run(|| {
850            let matches = matches.clone();
851            let stats = stats.clone();
852            let filters = filters.clone();
853            let classifier = self.classifier.clone();
854            let base_dir = self.base_dir.clone();
855            let max_results = self.max_results;
856            let timeout = self.timeout;
857            let cancellation = self.cancellation.clone();
858
859            Box::new(move |entry_result| {
860                // Check for cancellation or timeout
861                if cancellation.load(Ordering::Relaxed) {
862                    return WalkState::Quit;
863                }
864
865                if let Some(timeout) = timeout
866                    && start_time.elapsed() > timeout
867                {
868                    cancellation.store(true, Ordering::Relaxed);
869                    return WalkState::Quit;
870                }
871
872                match entry_result {
873                    Ok(entry) => {
874                        let _path = entry.path();
875
876                        // Update statistics
877                        {
878                            let mut stats = stats.lock().unwrap();
879                            if entry.file_type().is_some_and(|ft| ft.is_dir()) {
880                                stats.directories_traversed += 1;
881                                return WalkState::Continue;
882                            } else {
883                                stats.files_examined += 1;
884                            }
885                        }
886
887                        // Create and filter file match
888                        if let Some(file_match) =
889                            Self::create_file_match(&entry, &classifier, &base_dir)
890                        {
891                            if filters.matches(&file_match) {
892                                let mut matches = matches.lock().unwrap();
893
894                                // Check max results limit
895                                if let Some(max) = max_results
896                                    && matches.len() >= max
897                                {
898                                    return WalkState::Quit;
899                                }
900
901                                matches.push(file_match);
902                            } else {
903                                let mut stats = stats.lock().unwrap();
904                                stats.files_ignored += 1;
905                            }
906                        }
907                    }
908                    Err(_) => {
909                        let mut stats = stats.lock().unwrap();
910                        stats.files_ignored += 1;
911                    }
912                }
913                WalkState::Continue
914            })
915        });
916
917        let matches = Arc::try_unwrap(matches)
918            .map_err(|_| GlobError::FilterChain {
919                message: "Failed to unwrap matches".to_string(),
920            })?
921            .into_inner()
922            .map_err(|_| GlobError::FilterChain {
923                message: "Failed to acquire matches lock".to_string(),
924            })?;
925
926        let stats = Arc::try_unwrap(stats)
927            .map_err(|_| GlobError::FilterChain {
928                message: "Failed to unwrap stats".to_string(),
929            })?
930            .into_inner()
931            .map_err(|_| GlobError::FilterChain {
932                message: "Failed to acquire stats lock".to_string(),
933            })?;
934
935        Ok((matches, stats))
936    }
937
938    /// Memory-efficient sequential search implementation
939    fn search_sequential(
940        &self,
941        filters: &CompiledFilters,
942    ) -> GlobResult<(Vec<FileMatch>, SearchStats)> {
943        let mut matches = Vec::new();
944        let mut stats = SearchStats::default();
945        let start_time = Instant::now();
946
947        let walker = self.create_sequential_walker();
948
949        for entry_result in walker {
950            // Check for cancellation or timeout
951            if self.is_cancelled() {
952                return Err(GlobError::SearchCancelled {
953                    duration: start_time.elapsed(),
954                });
955            }
956
957            if let Some(timeout) = self.timeout
958                && start_time.elapsed() > timeout
959            {
960                return Err(GlobError::SearchTimeout { timeout });
961            }
962
963            match entry_result {
964                Ok(entry) => {
965                    let _path = entry.path();
966
967                    // Handle directories
968                    if entry.file_type().is_some_and(|ft| ft.is_dir()) {
969                        stats.directories_traversed += 1;
970                        continue;
971                    }
972
973                    stats.files_examined += 1;
974
975                    // Check max results limit
976                    if let Some(max) = self.max_results
977                        && matches.len() >= max
978                    {
979                        break;
980                    }
981
982                    // Create and filter file match
983                    if let Some(file_match) =
984                        Self::create_file_match(&entry, &self.classifier, &self.base_dir)
985                    {
986                        if filters.matches(&file_match) {
987                            matches.push(file_match);
988                        } else {
989                            stats.files_ignored += 1;
990                        }
991                    }
992                }
993                Err(_) => {
994                    stats.files_ignored += 1;
995                }
996            }
997        }
998
999        Ok((matches, stats))
1000    }
1001
1002    /// Create optimized parallel walker
1003    fn create_parallel_walker(&self) -> GlobResult<WalkParallel> {
1004        let mut builder = WalkBuilder::new(&self.base_dir);
1005        self.configure_builder(&mut builder)?;
1006        builder.threads(self.parallelism);
1007        Ok(builder.build_parallel())
1008    }
1009
1010    /// Create memory-efficient sequential walker  
1011    fn create_sequential_walker(&self) -> Walk {
1012        let mut builder = WalkBuilder::new(&self.base_dir);
1013        let _ = self.configure_builder(&mut builder); // Ignore configuration errors
1014        builder.build()
1015    }
1016
1017    /// Configure walker builder with comprehensive options
1018    fn configure_builder(&self, builder: &mut WalkBuilder) -> GlobResult<()> {
1019        builder
1020            .follow_links(self.follow_links)
1021            .hidden(!self.include_hidden)
1022            .ignore(self.respect_ignore)
1023            .git_ignore(self.respect_ignore)
1024            .git_global(self.respect_ignore)
1025            .git_exclude(self.respect_ignore)
1026            .parents(self.respect_ignore)
1027            .require_git(false); // Don't require git for ignore files to work
1028
1029        // Add custom ignore patterns
1030        for ignore_pattern in &self.custom_ignores {
1031            builder.add_custom_ignore_filename(ignore_pattern);
1032        }
1033
1034        // Validate base directory accessibility
1035        if !self.base_dir.exists() {
1036            return Err(GlobError::DirectoryNotFound {
1037                path: self.base_dir.clone(),
1038            });
1039        }
1040
1041        if !self.base_dir.is_dir() {
1042            return Err(GlobError::FilterChain {
1043                message: format!("Path is not a directory: {}", self.base_dir.display()),
1044            });
1045        }
1046
1047        Ok(())
1048    }
1049
1050    /// Create comprehensive FileMatch with rich metadata
1051    fn create_file_match(
1052        entry: &DirEntry,
1053        classifier: &ContentClassifier,
1054        base_dir: &Path,
1055    ) -> Option<FileMatch> {
1056        let path = entry.path();
1057
1058        // Get file metadata with error handling
1059        let metadata = match entry.metadata() {
1060            Ok(meta) => meta,
1061            Err(_) => return None, // Skip inaccessible files
1062        };
1063
1064        let file_type = if metadata.is_file() {
1065            FileType::File
1066        } else if metadata.is_dir() {
1067            FileType::Directory
1068        } else if metadata.file_type().is_symlink() {
1069            FileType::Symlink
1070        } else {
1071            FileType::Other
1072        };
1073
1074        let size = if file_type == FileType::File {
1075            Some(metadata.len())
1076        } else {
1077            None
1078        };
1079
1080        let extension = path
1081            .extension()
1082            .and_then(OsStr::to_str)
1083            .map(|s| s.to_lowercase());
1084
1085        let content_category = classifier.classify_path(path);
1086
1087        let relative_path = path.strip_prefix(base_dir).unwrap_or(path).to_path_buf();
1088
1089        let modified = metadata.modified().ok();
1090
1091        let executable = {
1092            #[cfg(unix)]
1093            {
1094                use std::os::unix::fs::PermissionsExt;
1095                metadata.permissions().mode() & 0o111 != 0
1096            }
1097            #[cfg(not(unix))]
1098            {
1099                extension
1100                    .as_ref()
1101                    .map(|ext| matches!(ext.as_str(), "exe" | "bat" | "cmd" | "com"))
1102                    .unwrap_or(false)
1103            }
1104        };
1105
1106        // Estimate lines for text files
1107        let estimated_lines = if matches!(
1108            content_category,
1109            ContentCategory::Source | ContentCategory::Config | ContentCategory::Documentation
1110        ) {
1111            size.and_then(|s| {
1112                if s < 1024 * 1024 {
1113                    // Only estimate for files < 1MB
1114                    Some((s / 40).max(1) as usize) // Rough estimate: 40 bytes per line
1115                } else {
1116                    None
1117                }
1118            })
1119        } else {
1120            None
1121        };
1122
1123        Some(FileMatch {
1124            path: path.to_path_buf(),
1125            size,
1126            extension,
1127            file_type,
1128            relative_path,
1129            modified,
1130            executable,
1131            content_category,
1132            estimated_lines,
1133        })
1134    }
1135
1136    /// Estimate directory size for strategy selection optimization
1137    fn estimate_directory_size(&self) -> usize {
1138        // Fast estimation: count immediate entries (files + directories)
1139        std::fs::read_dir(&self.base_dir)
1140            .map(|entries| entries.count())
1141            .unwrap_or(0)
1142    }
1143}
1144
/// Internal statistics tracking.
///
/// Three plain counters, all zero by default.
#[derive(Debug, Default, Clone, PartialEq, Eq)]
// NOTE(review): nothing in the visible part of this file constructs or reads
// this type (the search paths use `SearchStats`), so the lint is silenced to
// keep the build warning-free — confirm whether the type can be removed.
#[allow(dead_code)]
struct GlobStats {
    /// Number of files examined.
    files_examined: usize,
    /// Number of directories traversed.
    directories_traversed: usize,
    /// Number of files ignored.
    files_ignored: usize,
}
1153
/// File type categories for semantic classification.
///
/// Produced by `FileExtensionClassifier::classify_file`. The type is
/// hashable so it can be used as a `HashMap`/`HashSet` key.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub enum FileTypeCategory {
    /// Extension is in the classifier's source-extension set.
    Source,
    /// Extension is in the classifier's config-extension set.
    Config,
    /// Extension is in the classifier's documentation-extension set.
    Documentation,
    /// File name matches one of the classifier's test patterns.
    Test,
    /// Compiled or linked artifact extension (`exe`, `dll`, `so`, `o`, ...).
    Binary,
    /// Build by-product extension (`class`, `jar`, `pyc`, ...) or a file name
    /// matching the classifier's generated-file checks (e.g. `Cargo.lock`).
    Generated,
    /// None of the classifier's rules matched.
    Unknown,
}
1165
1166/// File extension classifier for determining file types
1167pub struct FileExtensionClassifier {
1168    source_extensions: HashSet<String>,
1169    config_extensions: HashSet<String>,
1170    doc_extensions: HashSet<String>,
1171    test_patterns: Vec<Regex>,
1172}
1173
1174impl Default for FileExtensionClassifier {
1175    fn default() -> Self {
1176        let mut source_extensions = HashSet::new();
1177        // Programming languages
1178        source_extensions.extend(
1179            [
1180                "rs", "py", "js", "ts", "jsx", "tsx", "go", "java", "c", "cpp", "cc", "cxx", "h",
1181                "hpp", "cs", "php", "rb", "swift", "kt", "scala", "hs", "clj", "ex", "exs", "sh",
1182                "bash", "zsh", "fish", "ps1", "bat", "cmd",
1183            ]
1184            .iter()
1185            .map(|s| (*s).to_string()),
1186        );
1187
1188        let mut config_extensions = HashSet::new();
1189        // Configuration files
1190        config_extensions.extend(
1191            [
1192                "toml",
1193                "yaml",
1194                "yml",
1195                "json",
1196                "xml",
1197                "ini",
1198                "cfg",
1199                "conf",
1200                "config",
1201                "env",
1202                "properties",
1203                "dockerfile",
1204            ]
1205            .iter()
1206            .map(|s| (*s).to_string()),
1207        );
1208
1209        let mut doc_extensions = HashSet::new();
1210        // Documentation files
1211        doc_extensions.extend(
1212            ["md", "txt", "rst", "adoc", "tex", "pdf", "doc", "docx"]
1213                .iter()
1214                .map(|s| (*s).to_string()),
1215        );
1216
1217        // Test file patterns (regex for flexibility)
1218        let test_patterns = vec![
1219            Regex::new(r"(?i)test").unwrap(),
1220            Regex::new(r"(?i)spec").unwrap(),
1221            Regex::new(r"_test\.[^.]+$").unwrap(),
1222            Regex::new(r"\.test\.[^.]+$").unwrap(),
1223            Regex::new(r"\.spec\.[^.]+$").unwrap(),
1224        ];
1225
1226        Self {
1227            source_extensions,
1228            config_extensions,
1229            doc_extensions,
1230            test_patterns,
1231        }
1232    }
1233}
1234
1235impl FileExtensionClassifier {
1236    /// Classify a file based on its path and extension
1237    pub fn classify_file(&self, path: &Path) -> FileTypeCategory {
1238        // Check if it looks like a test file first
1239        if let Some(file_name) = path.file_name().and_then(|n| n.to_str()) {
1240            for pattern in &self.test_patterns {
1241                if pattern.is_match(file_name) {
1242                    return FileTypeCategory::Test;
1243                }
1244            }
1245        }
1246
1247        // Check by extension
1248        if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
1249            let ext_lower = ext.to_lowercase();
1250
1251            if self.source_extensions.contains(&ext_lower) {
1252                return FileTypeCategory::Source;
1253            }
1254
1255            if self.config_extensions.contains(&ext_lower) {
1256                return FileTypeCategory::Config;
1257            }
1258
1259            if self.doc_extensions.contains(&ext_lower) {
1260                return FileTypeCategory::Documentation;
1261            }
1262
1263            // Check for binary extensions
1264            match ext_lower.as_str() {
1265                "exe" | "dll" | "so" | "dylib" | "a" | "lib" | "o" | "obj" | "bin" => {
1266                    return FileTypeCategory::Binary;
1267                }
1268                "class" | "jar" | "pyc" | "pyo" | "rlib" | "node" => {
1269                    return FileTypeCategory::Generated;
1270                }
1271                _ => {}
1272            }
1273        }
1274
1275        // Check for common generated file patterns
1276        if let Some(file_name) = path.file_name().and_then(|n| n.to_str()) {
1277            let file_name_lower = file_name.to_lowercase();
1278            if file_name_lower.starts_with("generated_")
1279                || file_name_lower.contains("autogenerated")
1280                || file_name_lower.starts_with("build")
1281                || file_name_lower == "cargo.lock"
1282                || file_name_lower == "package-lock.json"
1283                || file_name_lower == "yarn.lock"
1284            {
1285                return FileTypeCategory::Generated;
1286            }
1287        }
1288
1289        FileTypeCategory::Unknown
1290    }
1291}
1292
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Create a temporary directory populated with the fixture tree shared by
    /// the tests below:
    ///
    /// * `main.rs`, `lib.js`, `config.toml`, `README.md`, `test_main.rs`
    /// * `src/lib.rs`
    /// * `.gitignore` ignoring `target/` and `*.tmp`
    /// * `ignored.tmp` and `target/debug.txt`, both covered by the ignore rules
    fn create_test_dir() -> TempDir {
        let temp_dir = TempDir::new().unwrap();
        let path = temp_dir.path();

        // Initialize as a git repository so .gitignore works properly.
        // The attempt only counts as successful when git could be spawned AND
        // exited with a success status; otherwise the .ignore fallback below
        // is written.
        let git_init = std::process::Command::new("git")
            .arg("init")
            .current_dir(path)
            .output()
            .is_ok_and(|output| output.status.success());

        // Create test files
        fs::write(path.join("main.rs"), "fn main() {}").unwrap();
        fs::write(path.join("lib.js"), "console.log('hello')").unwrap();
        fs::write(path.join("config.toml"), "[package]").unwrap();
        fs::write(path.join("README.md"), "# Test").unwrap();
        fs::write(path.join("test_main.rs"), "mod tests {}").unwrap();

        // Create subdirectory
        fs::create_dir(path.join("src")).unwrap();
        fs::write(path.join("src").join("lib.rs"), "pub mod lib;").unwrap();

        // Create .gitignore with proper line endings
        fs::write(path.join(".gitignore"), "target/\n*.tmp\n").unwrap();

        // Also create .ignore file for non-git environments.
        // The ignore crate respects .ignore files even without git.
        if !git_init {
            fs::write(path.join(".ignore"), "target/\n*.tmp\n").unwrap();
        }

        // Create ignored file and directory
        fs::write(path.join("ignored.tmp"), "temporary").unwrap();
        fs::create_dir(path.join("target")).unwrap();
        fs::write(path.join("target").join("debug.txt"), "debug info").unwrap();

        temp_dir
    }

    /// `*.rs` matches only the Rust files at the top level of the tree.
    #[test]
    fn test_glob_rust_files() {
        let temp_dir = create_test_dir();
        let glob_tool = GlobTool::new(temp_dir.path().to_path_buf());

        let result = glob_tool.glob("*.rs").unwrap();

        // Should find main.rs and test_main.rs (but not src/lib.rs with this pattern)
        assert_eq!(result.result.len(), 2);
        assert!(
            result
                .result
                .iter()
                .any(|m| m.path.file_name().unwrap() == "main.rs")
        );
        assert!(
            result
                .result
                .iter()
                .any(|m| m.path.file_name().unwrap() == "test_main.rs")
        );
        assert!(result.summary.contains("Found 2 files"));
    }

    /// `find_type` matches by extension at any depth.
    #[test]
    fn test_find_type() {
        let temp_dir = create_test_dir();
        let glob_tool = GlobTool::new(temp_dir.path().to_path_buf());

        let result = glob_tool.find_type("rs").unwrap();

        // Should find all .rs files
        assert_eq!(result.result.len(), 3); // main.rs, test_main.rs, src/lib.rs
        assert!(
            result
                .result
                .iter()
                .all(|m| m.extension.as_ref().unwrap() == "rs")
        );
    }

    /// The default classifier maps representative names to their categories.
    #[test]
    fn test_file_classification() {
        let classifier = FileExtensionClassifier::default();

        assert_eq!(
            classifier.classify_file(Path::new("main.rs")),
            FileTypeCategory::Source
        );
        assert_eq!(
            classifier.classify_file(Path::new("config.toml")),
            FileTypeCategory::Config
        );
        assert_eq!(
            classifier.classify_file(Path::new("README.md")),
            FileTypeCategory::Documentation
        );
        assert_eq!(
            classifier.classify_file(Path::new("test_main.rs")),
            FileTypeCategory::Test
        );
        assert_eq!(
            classifier.classify_file(Path::new("main.spec.js")),
            FileTypeCategory::Test
        );
    }

    /// Files covered by the ignore rules are not reported.
    #[test]
    fn test_gitignore_respected() {
        let temp_dir = create_test_dir();

        // Ensure .gitignore is properly set up
        let gitignore_path = temp_dir.path().join(".gitignore");
        assert!(gitignore_path.exists(), ".gitignore should exist");

        // Test that ignored.tmp exists
        let ignored_file = temp_dir.path().join("ignored.tmp");
        assert!(ignored_file.exists(), "ignored.tmp should exist");

        // Explicitly enable .gitignore respect
        let glob_tool = GlobTool::new(temp_dir.path().to_path_buf()).with_respect_ignore(true);
        let result = glob_tool.glob("*.tmp").unwrap();

        // Should not find ignored.tmp due to .gitignore
        // The walker with git_ignore(true) should filter out the file
        assert_eq!(
            result.result.len(),
            0,
            "Should find 0 files (ignored.tmp should be filtered by .gitignore)"
        );
        assert!(result.result.is_empty());
    }

    /// Tools configured with parallelism 4 and 1 report the same number of
    /// matches.
    #[test]
    fn test_parallel_vs_sequential() {
        let temp_dir = create_test_dir();

        let parallel_tool = GlobTool::new(temp_dir.path().to_path_buf()).with_parallelism(4);
        let sequential_tool = GlobTool::new(temp_dir.path().to_path_buf()).with_parallelism(1);

        let parallel_result = parallel_tool.glob("*").unwrap();
        let sequential_result = sequential_tool.glob("*").unwrap();

        // Results should be the same (order may differ)
        assert_eq!(parallel_result.result.len(), sequential_result.result.len());
    }

    /// `with_max_results` caps the number of reported matches.
    #[test]
    fn test_max_results() {
        let temp_dir = create_test_dir();
        let glob_tool = GlobTool::new(temp_dir.path().to_path_buf()).with_max_results(Some(2));

        let result = glob_tool.glob("*").unwrap();

        // Should be limited to 2 results
        assert!(result.result.len() <= 2);
    }
}
agcodex_core/tools/glob.rs

agcodex_core/tools/
glob.rs