1use super::output::ComprehensiveToolOutput;
12use ignore::DirEntry;
13use ignore::Walk;
14use ignore::WalkBuilder;
15use ignore::WalkParallel;
16use ignore::WalkState;
17use regex::Regex;
18use serde::Deserialize;
19use serde::Serialize;
20use std::collections::HashMap;
21use std::collections::HashSet;
22use std::ffi::OsStr;
23use std::path::Path;
24use std::path::PathBuf;
25use std::sync::Arc;
26use std::sync::Mutex;
27use std::sync::atomic::AtomicBool;
28use std::sync::atomic::Ordering;
29use std::time::Duration;
30use std::time::Instant;
31use std::time::SystemTime;
32use thiserror::Error;
33use tracing::info;
34use wildmatch::WildMatch;
35
/// Errors the glob search can produce.
#[derive(Error, Debug)]
pub enum GlobError {
    /// A supplied glob pattern was rejected (currently: empty pattern).
    #[error("invalid glob pattern '{pattern}': {reason}")]
    InvalidPattern { pattern: String, reason: String },

    /// The configured base directory does not exist.
    #[error("directory not found: {path}")]
    DirectoryNotFound { path: PathBuf },

    /// A path could not be accessed.
    // NOTE(review): no code path in this file constructs this variant —
    // confirm it is built elsewhere before removing.
    #[error("permission denied: {path}")]
    PermissionDenied { path: PathBuf },

    /// The sequential search exceeded its configured timeout.
    #[error("search timeout after {timeout:?}")]
    SearchTimeout { timeout: Duration },

    /// Wrapper around an underlying I/O failure.
    #[error("I/O error: {0}")]
    Io(#[from] std::io::Error),

    /// A generic search failure.
    // NOTE(review): also not constructed in this file.
    #[error("search error: {message}")]
    SearchError { message: String },

    /// The search was cancelled via `GlobTool::cancel`.
    #[error("search cancelled after {duration:?}")]
    SearchCancelled { duration: Duration },

    /// A filter-chain failure; also reused for lock-reclaim failures in the
    /// parallel search.
    #[error("filter chain error: {message}")]
    FilterChain { message: String },
}

/// Convenience alias for results carrying a [`GlobError`].
pub type GlobResult<T> = std::result::Result<T, GlobError>;

/// Glob results wrapped in the shared comprehensive output envelope.
pub type GlobOutput<T> = ComprehensiveToolOutput<T>;
69
/// How the directory tree is traversed.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum SearchStrategy {
    /// Multi-threaded walk (`ignore::WalkParallel`).
    Parallel,
    /// Single-threaded walk.
    Sequential,
    /// Choose parallel vs. sequential from a directory-size estimate.
    Auto,
}

/// A single file discovered by a search, with cheap metadata attached.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct FileMatch {
    /// Path of the entry as produced by the walker.
    pub path: PathBuf,
    /// Size in bytes; `None` for non-regular files.
    pub size: Option<u64>,
    /// Lower-cased extension, if any.
    pub extension: Option<String>,
    /// Coarse filesystem type of the entry.
    pub file_type: FileType,
    /// Path relative to the search root (falls back to the full path).
    pub relative_path: PathBuf,
    /// Last-modified timestamp, when the platform provides one.
    pub modified: Option<SystemTime>,
    /// Executable flag (mode bits on Unix, extension heuristic elsewhere).
    pub executable: bool,
    /// Heuristic content classification (source/config/docs/...).
    pub content_category: ContentCategory,
    /// Rough line-count estimate for small text-like files (size / 40).
    pub estimated_lines: Option<usize>,
}

/// Coarse filesystem entry type.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub enum FileType {
    File,
    Directory,
    Symlink,
    Other,
}

/// Heuristic content classification based on file name and extension.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub enum ContentCategory {
    Source,
    Config,
    Documentation,
    Test,
    Binary,
    // NOTE(review): `Data` is never produced by ContentClassifier in this
    // file — confirm whether another classifier emits it.
    Data,
    Unknown,
}
135
/// Declarative description of a glob search.
// NOTE(review): nothing in this file consumes GlobQuery; it appears to be a
// request type for external callers — verify its defaults against GlobTool's
// actual behavior before relying on them.
#[derive(Debug, Clone)]
pub struct GlobQuery {
    /// Directory the search starts from.
    pub base_dir: PathBuf,
    /// Glob patterns to match.
    pub patterns: Vec<String>,
    /// Optional filter on the entry's filesystem type.
    pub file_type: Option<FileType>,
    /// Optional min/max size constraint.
    pub size_filter: Option<SizeFilter>,
    /// Maximum traversal depth, if bounded.
    pub max_depth: Option<usize>,
    /// Whether hidden (dot) files are included.
    pub include_hidden: bool,
    /// Whether symlinks are followed.
    pub follow_links: bool,
    /// Whether pattern matching is case sensitive.
    pub case_sensitive: bool,
    /// Upper bound on returned matches.
    pub max_results: usize,
    /// Overall search deadline, if any.
    pub timeout: Option<Duration>,
}

/// Inclusive size bounds in bytes; `None` leaves that side unbounded.
#[derive(Debug, Clone)]
pub struct SizeFilter {
    pub min_size: Option<u64>,
    pub max_size: Option<u64>,
}

/// Counters accumulated while walking the tree.
#[derive(Debug, Clone, Default)]
pub struct SearchStats {
    /// Directories entered during traversal.
    pub directories_traversed: usize,
    /// Non-directory entries inspected.
    pub files_examined: usize,
    /// Entries rejected by filters or that errored while being read.
    pub files_ignored: usize,
}

/// Modification-time window; `None` leaves that side unbounded.
#[derive(Debug, Clone)]
pub struct TimeFilter {
    pub modified_after: Option<SystemTime>,
    pub modified_before: Option<SystemTime>,
}
185
/// User-facing collection of filters; compiled into [`CompiledFilters`]
/// before a search runs.
#[derive(Debug, Clone, Default)]
pub struct FilterChain {
    /// Glob patterns (positive or negated) applied to matched paths.
    pub glob_patterns: Vec<GlobPattern>,
    /// Size windows a file must satisfy.
    pub size_filters: Vec<SizeFilter>,
    /// Modification-time windows a file must satisfy.
    pub time_filters: Vec<TimeFilter>,
    /// If non-empty, only these content categories are kept.
    pub category_filters: Vec<ContentCategory>,
    /// If non-empty, only these filesystem types are kept.
    pub type_filters: Vec<FileType>,
    /// Patterns that exclude a file when they match its relative path.
    pub exclude_patterns: Vec<String>,
}

/// One glob pattern plus its matching options.
#[derive(Debug, Clone)]
pub struct GlobPattern {
    pub pattern: String,
    // NOTE(review): stored but never consulted during matching (WildMatch is
    // always case sensitive) — confirm intent.
    pub case_sensitive: bool,
    /// When true, a match on this pattern rejects the file.
    pub negate: bool,
}

/// Pre-compiled form of a [`FilterChain`], cheap to apply per file.
#[derive(Debug, Clone)]
pub struct CompiledFilters {
    glob_matchers: Vec<CompiledGlobPattern>,
    size_filters: Vec<SizeFilter>,
    time_filters: Vec<TimeFilter>,
    category_filters: Vec<ContentCategory>,
    type_filters: Vec<FileType>,
    exclude_matchers: Vec<WildMatch>,
}

/// A glob pattern compiled to a `WildMatch` matcher.
#[derive(Debug, Clone)]
struct CompiledGlobPattern {
    /// Original pattern text, kept to decide basename- vs path-matching.
    pattern: String,
    matcher: WildMatch,
    negate: bool,
}

/// Extension/name tables used to classify file content.
#[derive(Debug, Clone)]
pub struct ContentClassifier {
    source_extensions: HashSet<String>,
    config_extensions: HashSet<String>,
    doc_extensions: HashSet<String>,
    /// Substrings of the file name that mark a file as a test.
    test_patterns: Vec<String>,
}
244
245impl FilterChain {
246 pub fn add_glob(
248 &mut self,
249 pattern: &str,
250 case_sensitive: bool,
251 negate: bool,
252 ) -> GlobResult<()> {
253 if pattern.is_empty() {
255 return Err(GlobError::InvalidPattern {
256 pattern: pattern.to_string(),
257 reason: "Pattern cannot be empty".to_string(),
258 });
259 }
260
261 self.glob_patterns.push(GlobPattern {
262 pattern: pattern.to_string(),
263 case_sensitive,
264 negate,
265 });
266 Ok(())
267 }
268}
269
impl CompiledFilters {
    /// Compiles a [`FilterChain`] into reusable matcher objects.
    ///
    /// # Errors
    /// Kept fallible for future validation; `WildMatch::new` itself cannot
    /// fail, so this currently always returns `Ok`.
    // NOTE(review): GlobPattern::case_sensitive is dropped here — WildMatch
    // matching is always case sensitive. Confirm whether the flag should be
    // honored.
    pub fn compile(chain: &FilterChain) -> GlobResult<Self> {
        let mut glob_matchers = Vec::new();

        for pattern in &chain.glob_patterns {
            let matcher = WildMatch::new(&pattern.pattern);
            glob_matchers.push(CompiledGlobPattern {
                pattern: pattern.pattern.clone(),
                matcher,
                negate: pattern.negate,
            });
        }

        let exclude_matchers = chain
            .exclude_patterns
            .iter()
            .map(|p| WildMatch::new(p))
            .collect();

        Ok(Self {
            glob_matchers,
            size_filters: chain.size_filters.clone(),
            time_filters: chain.time_filters.clone(),
            category_filters: chain.category_filters.clone(),
            type_filters: chain.type_filters.clone(),
            exclude_matchers,
        })
    }

    /// Returns `true` when `file` passes every filter group.
    ///
    /// Glob semantics: a pattern containing '/' or "**" is matched against
    /// the full relative path; a bare pattern (like "*.rs") matches by file
    /// name and only for files directly in the search root. A negated
    /// pattern rejects on match. If any positive pattern exists, at least
    /// one must match; a chain of only negations passes by default.
    pub fn matches(&self, file: &FileMatch) -> bool {
        if !self.glob_matchers.is_empty() {
            let mut matched = false;

            for pattern in &self.glob_matchers {
                let is_match = if pattern.pattern.contains('/') || pattern.pattern.contains("**") {
                    // Path-style pattern: compare against the relative path.
                    pattern
                        .matcher
                        .matches(&file.relative_path.to_string_lossy())
                } else {
                    // Basename-style pattern: only applies to root-level
                    // files (relative path without a separator).
                    let relative_path_str = file.relative_path.to_string_lossy();
                    if relative_path_str.contains('/') || relative_path_str.contains('\\') {
                        false
                    } else if let Some(file_name) = file.path.file_name() {
                        pattern.matcher.matches(&file_name.to_string_lossy())
                    } else {
                        false
                    }
                };

                if pattern.negate {
                    if is_match {
                        return false;
                    }
                } else if is_match {
                    matched = true;
                }
            }

            // Require a positive match unless every pattern is a negation.
            if !matched && !self.glob_matchers.iter().all(|p| p.negate) {
                return false;
            }
        }

        // Exclusions always test the relative path.
        let relative_path_str = file.relative_path.to_string_lossy();
        for exclude in &self.exclude_matchers {
            if exclude.matches(&relative_path_str) {
                return false;
            }
        }

        // Size windows constrain only entries that report a size.
        if let Some(size) = file.size {
            for filter in &self.size_filters {
                if let Some(min) = filter.min_size
                    && size < min
                {
                    return false;
                }
                if let Some(max) = filter.max_size
                    && size > max
                {
                    return false;
                }
            }
        }

        // Time windows constrain only entries with a modification time.
        if let Some(modified) = file.modified {
            for filter in &self.time_filters {
                if let Some(after) = filter.modified_after
                    && modified < after
                {
                    return false;
                }
                if let Some(before) = filter.modified_before
                    && modified > before
                {
                    return false;
                }
            }
        }

        // Non-empty category/type lists act as allow-lists.
        if !self.category_filters.is_empty()
            && !self.category_filters.contains(&file.content_category)
        {
            return false;
        }

        if !self.type_filters.is_empty() && !self.type_filters.contains(&file.file_type) {
            return false;
        }

        true
    }
}
398
399impl Default for ContentClassifier {
400 fn default() -> Self {
401 let mut source_extensions = HashSet::new();
402 source_extensions.extend(
403 [
404 "rs", "py", "js", "ts", "jsx", "tsx", "go", "java", "c", "cpp", "cc", "cxx", "h",
405 "hpp", "cs", "php", "rb", "swift", "kt", "scala", "hs", "clj", "ex", "exs",
406 ]
407 .iter()
408 .map(|s| (*s).to_string()),
409 );
410
411 let mut config_extensions = HashSet::new();
412 config_extensions.extend(
413 [
414 "toml", "yaml", "yml", "json", "xml", "ini", "cfg", "conf", "config", "env",
415 ]
416 .iter()
417 .map(|s| (*s).to_string()),
418 );
419
420 let mut doc_extensions = HashSet::new();
421 doc_extensions.extend(
422 ["md", "txt", "rst", "adoc", "tex", "pdf", "doc", "docx"]
423 .iter()
424 .map(|s| (*s).to_string()),
425 );
426
427 let test_patterns = vec![
428 "test".to_string(),
429 "spec".to_string(),
430 "_test".to_string(),
431 ".test.".to_string(),
432 ".spec.".to_string(),
433 ];
434
435 Self {
436 source_extensions,
437 config_extensions,
438 doc_extensions,
439 test_patterns,
440 }
441 }
442}
443
444impl ContentClassifier {
445 pub fn classify_path(&self, path: &Path) -> ContentCategory {
447 if let Some(file_name) = path.file_name().and_then(|n| n.to_str()) {
449 for pattern in &self.test_patterns {
450 if file_name.contains(pattern) {
451 return ContentCategory::Test;
452 }
453 }
454 }
455
456 if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
458 let ext_lower = ext.to_lowercase();
459
460 if self.source_extensions.contains(&ext_lower) {
461 return ContentCategory::Source;
462 }
463
464 if self.config_extensions.contains(&ext_lower) {
465 return ContentCategory::Config;
466 }
467
468 if self.doc_extensions.contains(&ext_lower) {
469 return ContentCategory::Documentation;
470 }
471
472 match ext_lower.as_str() {
474 "exe" | "dll" | "so" | "dylib" | "a" | "lib" | "o" | "obj" | "bin" => {
475 return ContentCategory::Binary;
476 }
477 _ => {}
478 }
479 }
480
481 ContentCategory::Unknown
482 }
483}
484
/// File discovery tool built on the `ignore` crate's gitignore-aware walkers.
///
/// Construct with [`GlobTool::new`], refine with the `with_*` builder
/// methods, then search via `glob`/`find_type`/`search_with_filters`.
#[derive(Clone)]
pub struct GlobTool {
    /// Root directory all searches start from.
    base_dir: PathBuf,
    /// Filters applied to every search unless replaced.
    default_filters: FilterChain,
    /// Worker thread count for parallel walks (always >= 1).
    parallelism: usize,
    /// Whether .gitignore/.ignore rules are honored.
    respect_ignore: bool,
    /// Cap on returned matches; `None` means unlimited.
    max_results: Option<usize>,
    /// Whether symlinks are followed during traversal.
    follow_links: bool,
    /// Whether hidden (dot) files are traversed.
    include_hidden: bool,
    /// Wall-clock budget for a single search.
    timeout: Option<Duration>,
    /// Extra ignore-file names registered with the walker.
    custom_ignores: Vec<String>,
    /// Shared content classifier (cheap to clone via Arc).
    classifier: Arc<ContentClassifier>,
    /// Cooperative cancellation flag shared with in-flight walks.
    cancellation: Arc<AtomicBool>,
}
511
impl Default for GlobTool {
    /// Roots the tool at the current working directory, falling back to "."
    /// when the CWD cannot be determined.
    fn default() -> Self {
        Self::new(std::env::current_dir().unwrap_or_else(|_| PathBuf::from(".")))
    }
}
517
impl Default for GlobQuery {
    fn default() -> Self {
        Self {
            // CWD, falling back to "." when it cannot be read.
            base_dir: std::env::current_dir().unwrap_or_else(|_| PathBuf::from(".")),
            patterns: Vec::new(),
            file_type: None,
            size_filter: None,
            max_depth: Some(32),
            include_hidden: false,
            follow_links: false,
            case_sensitive: true,
            // NOTE(review): 0 presumably means "unlimited", but nothing in
            // this file consumes GlobQuery, so the sentinel's meaning cannot
            // be confirmed here — verify against the consumer.
            max_results: 0,
            timeout: Some(Duration::from_secs(30)),
        }
    }
}
534
535impl GlobTool {
    /// Creates a glob tool rooted at `base_dir` with defaults: ignore files
    /// respected, hidden files skipped, 10 000-result cap, 30 s timeout, and
    /// one worker per available CPU (fallback: 4 when undetectable).
    pub fn new(base_dir: PathBuf) -> Self {
        let thread_count = std::thread::available_parallelism()
            .map(|n| n.get())
            .unwrap_or(4);

        Self {
            base_dir,
            default_filters: FilterChain::default(),
            parallelism: thread_count,
            respect_ignore: true,
            max_results: Some(10_000),
            follow_links: false,
            include_hidden: false,
            timeout: Some(Duration::from_secs(30)),
            custom_ignores: Vec::new(),
            classifier: Arc::new(ContentClassifier::default()),
            cancellation: Arc::new(AtomicBool::new(false)),
        }
    }
556
    /// Sets the worker thread count for parallel searches (clamped to >= 1).
    pub fn with_parallelism(mut self, threads: usize) -> Self {
        self.parallelism = threads.max(1);
        self
    }

    /// Controls whether .gitignore / .ignore rules are honored.
    pub const fn with_respect_ignore(mut self, respect: bool) -> Self {
        self.respect_ignore = respect;
        self
    }

    /// Caps the number of returned matches; `None` removes the cap.
    pub const fn with_max_results(mut self, max_results: Option<usize>) -> Self {
        self.max_results = max_results;
        self
    }

    /// Controls whether symlinks are followed during traversal.
    pub const fn with_follow_links(mut self, follow_links: bool) -> Self {
        self.follow_links = follow_links;
        self
    }

    /// Controls whether hidden (dot) files are visited.
    pub const fn with_include_hidden(mut self, include_hidden: bool) -> Self {
        self.include_hidden = include_hidden;
        self
    }

    /// Sets the wall-clock budget for a search; `None` disables the timeout.
    pub const fn with_timeout(mut self, timeout: Option<Duration>) -> Self {
        self.timeout = timeout;
        self
    }

    /// Registers additional ignore-file names for the walker to consult.
    // NOTE(review): these go to `add_custom_ignore_filename`, i.e. they name
    // ignore *files*, not glob patterns — see configure_builder.
    pub fn with_custom_ignores(mut self, ignores: Vec<String>) -> Self {
        self.custom_ignores = ignores;
        self
    }

    /// Replaces the default filter chain applied to every search.
    pub fn with_filter_chain(mut self, filters: FilterChain) -> Self {
        self.default_filters = filters;
        self
    }
604
    /// Searches for files matching a single glob `pattern` under the base
    /// directory, on top of the tool's default filters.
    ///
    /// # Errors
    /// Returns [`GlobError::InvalidPattern`] for an empty pattern, plus any
    /// traversal error from the underlying search.
    pub fn glob(&self, pattern: &str) -> GlobResult<GlobOutput<Vec<FileMatch>>> {
        let mut filters = self.default_filters.clone();
        filters.add_glob(pattern, true, false)?;
        self.search_with_filters(filters)
    }
611
612 pub fn find_type(&self, extension: &str) -> GlobResult<GlobOutput<Vec<FileMatch>>> {
614 let ext_clean = extension.trim_start_matches('.').trim_start_matches('*');
617
618 let mut filters = self.default_filters.clone();
620
621 filters.add_glob(&format!("*.{}", ext_clean), true, false)?;
623
624 filters.add_glob(&format!("**/*.{}", ext_clean), true, false)?;
626
627 self.search_with_filters(filters)
628 }
629
    /// Runs a search with an explicit [`FilterChain`], returning the matches
    /// wrapped in the comprehensive output envelope (context, performance
    /// metrics, summary).
    ///
    /// # Errors
    /// Returns [`GlobError::DirectoryNotFound`] if the base directory is
    /// missing, plus any error from filter compilation or traversal.
    pub fn search_with_filters(
        &self,
        filters: FilterChain,
    ) -> GlobResult<GlobOutput<Vec<FileMatch>>> {
        use super::output::*;
        let timer = PerformanceTimer::new();

        // Reset the flag so a previous cancel() cannot abort this search.
        self.cancellation.store(false, Ordering::Relaxed);

        if !self.base_dir.exists() {
            return Err(GlobError::DirectoryNotFound {
                path: self.base_dir.clone(),
            });
        }

        let compiled_filters = CompiledFilters::compile(&filters)?;

        // Auto resolves to parallel only when the root looks large (>1000
        // immediate entries).
        let strategy = self.select_strategy();
        let (matches, stats) = match strategy {
            SearchStrategy::Parallel => self.search_parallel(&compiled_filters)?,
            SearchStrategy::Sequential => self.search_sequential(&compiled_filters)?,
            SearchStrategy::Auto => {
                if self.estimate_directory_size() > 1000 {
                    self.search_parallel(&compiled_filters)?
                } else {
                    self.search_sequential(&compiled_filters)?
                }
            }
        };

        let duration = timer.elapsed();
        let matches_count = matches.len();

        let summary = format!(
            "Found {} files in {} ({}ms, {} examined, {} ignored)",
            matches_count,
            self.base_dir.display(),
            duration.as_millis(),
            stats.files_examined,
            stats.files_ignored
        );

        // Synthetic location: a directory search has no meaningful
        // line/column coordinates.
        let location = SourceLocation::new(self.base_dir.to_string_lossy(), 0, 0, 0, 0, (0, 0));

        let output = OutputBuilder::new(
            matches,
            "glob",
            "file_discovery".to_string(),
            location.clone(),
        )
        .context(OperationContext {
            before: ContextSnapshot {
                content: String::new(),
                timestamp: SystemTime::now(),
                content_hash: String::new(),
                ast_summary: None,
                symbols: Vec::new(),
            },
            after: None,
            // Human-readable breadcrumbs about the search parameters.
            surrounding: vec![
                ContextLine {
                    line_number: 0,
                    content: format!("Search root: {}", self.base_dir.display()),
                    line_type: ContextLineType::Separator,
                    indentation: 0,
                    modified: false,
                },
                ContextLine {
                    line_number: 0,
                    content: format!("Strategy: {:?}", strategy),
                    line_type: ContextLineType::Separator,
                    indentation: 0,
                    modified: false,
                },
            ],
            location: location.clone(),
            scope: OperationScope {
                scope_type: ScopeType::File,
                name: self
                    .base_dir
                    .file_name()
                    .and_then(|n| n.to_str())
                    .unwrap_or("unknown")
                    .to_string(),
                path: vec![self.base_dir.to_string_lossy().to_string()],
                file_path: self.base_dir.clone(),
                line_range: 0..0,
            },
            language_context: None,
            project_context: None,
        })
        .performance(PerformanceMetrics {
            execution_time: duration,
            phase_times: HashMap::new(),
            // Rough estimate: results vector payload only, not true peak RSS.
            memory_usage: MemoryUsage {
                peak_bytes: (matches_count * std::mem::size_of::<FileMatch>()) as u64,
                average_bytes: 0,
                allocations: 0,
                deallocations: 0,
                efficiency_score: 0.9,
            },
            cpu_usage: CpuUsage {
                cpu_time: duration,
                utilization_percent: 0.0,
                context_switches: 0,
            },
            io_stats: IoStats {
                bytes_read: 0,
                bytes_written: 0,
                read_ops: stats.files_examined as u64,
                write_ops: 0,
                io_wait_time: Duration::from_millis(0),
            },
            // No caching layer exists yet; every examined file is a "miss".
            cache_stats: CacheStats {
                hit_rate: 0.0,
                hits: 0,
                misses: stats.files_examined as u64,
                cache_size: 0,
                efficiency_score: 0.0,
            },
        })
        .summary(summary)
        .build();

        info!(
            "Glob search completed: {} matches in {:?}",
            matches_count, duration
        );
        Ok(output)
    }
767
    /// Requests cooperative cancellation of any in-flight search. Workers
    /// observe the flag at their next entry callback.
    pub fn cancel(&self) {
        self.cancellation.store(true, Ordering::Relaxed);
    }

    /// Whether cancellation has been requested.
    fn is_cancelled(&self) -> bool {
        self.cancellation.load(Ordering::Relaxed)
    }
777
778 fn select_strategy(&self) -> SearchStrategy {
780 if self.parallelism == 1 {
781 SearchStrategy::Sequential
782 } else if self.estimate_directory_size() > 500 {
783 SearchStrategy::Parallel
784 } else {
785 SearchStrategy::Sequential
786 }
787 }
788
789 pub fn find_in_directory(
791 &self,
792 dir: &Path,
793 pattern: &str,
794 ) -> GlobResult<GlobOutput<Vec<FileMatch>>> {
795 let scoped_tool = GlobTool::new(dir.to_path_buf())
796 .with_parallelism(self.parallelism)
797 .with_max_results(self.max_results)
798 .with_follow_links(self.follow_links)
799 .with_include_hidden(self.include_hidden)
800 .with_timeout(self.timeout)
801 .with_custom_ignores(self.custom_ignores.clone())
802 .with_filter_chain(self.default_filters.clone());
803
804 scoped_tool.glob(pattern)
805 }
806
    /// Finds files whose heuristic content category equals `category`.
    ///
    /// # Errors
    /// Propagates traversal errors from the underlying search.
    pub fn find_by_category(
        &self,
        category: ContentCategory,
    ) -> GlobResult<GlobOutput<Vec<FileMatch>>> {
        let mut filters = self.default_filters.clone();
        filters.category_filters.push(category);
        self.search_with_filters(filters)
    }

    /// Finds files whose byte size lies within the given inclusive bounds;
    /// `None` leaves that side unbounded.
    ///
    /// # Errors
    /// Propagates traversal errors from the underlying search.
    pub fn find_by_size(
        &self,
        min_size: Option<u64>,
        max_size: Option<u64>,
    ) -> GlobResult<GlobOutput<Vec<FileMatch>>> {
        let mut filters = self.default_filters.clone();
        filters.size_filters.push(SizeFilter { min_size, max_size });
        self.search_with_filters(filters)
    }

    /// Finds files whose modification time is at or after `since`.
    ///
    /// # Errors
    /// Propagates traversal errors from the underlying search.
    pub fn find_modified_since(&self, since: SystemTime) -> GlobResult<GlobOutput<Vec<FileMatch>>> {
        let mut filters = self.default_filters.clone();
        filters.time_filters.push(TimeFilter {
            modified_after: Some(since),
            modified_before: None,
        });
        self.search_with_filters(filters)
    }
837
    /// Multi-threaded traversal. Workers share the result vector and stats
    /// behind mutexes and quit early on cancellation, timeout, or when
    /// `max_results` is reached.
    ///
    /// # Errors
    /// Returns [`GlobError::FilterChain`] when the shared state cannot be
    /// reclaimed after the walk (e.g. a worker panicked holding a lock).
    fn search_parallel(
        &self,
        filters: &CompiledFilters,
    ) -> GlobResult<(Vec<FileMatch>, SearchStats)> {
        let matches = Arc::new(Mutex::new(Vec::new()));
        let stats = Arc::new(Mutex::new(SearchStats::default()));
        let start_time = Instant::now();

        let walker = self.create_parallel_walker()?;

        walker.run(|| {
            // Per-worker clones of the shared state.
            let matches = matches.clone();
            let stats = stats.clone();
            let filters = filters.clone();
            let classifier = self.classifier.clone();
            let base_dir = self.base_dir.clone();
            let max_results = self.max_results;
            let timeout = self.timeout;
            let cancellation = self.cancellation.clone();

            Box::new(move |entry_result| {
                if cancellation.load(Ordering::Relaxed) {
                    return WalkState::Quit;
                }

                // On timeout, flip the shared flag so sibling workers also
                // quit promptly instead of each rediscovering the deadline.
                if let Some(timeout) = timeout
                    && start_time.elapsed() > timeout
                {
                    cancellation.store(true, Ordering::Relaxed);
                    return WalkState::Quit;
                }

                match entry_result {
                    Ok(entry) => {
                        let _path = entry.path();

                        // Count the entry; directories are traversed but
                        // never reported as matches.
                        {
                            let mut stats = stats.lock().unwrap();
                            if entry.file_type().is_some_and(|ft| ft.is_dir()) {
                                stats.directories_traversed += 1;
                                return WalkState::Continue;
                            } else {
                                stats.files_examined += 1;
                            }
                        }

                        if let Some(file_match) =
                            Self::create_file_match(&entry, &classifier, &base_dir)
                        {
                            if filters.matches(&file_match) {
                                let mut matches = matches.lock().unwrap();

                                // Cap checked under the lock, so the limit is
                                // never exceeded even with many workers.
                                if let Some(max) = max_results
                                    && matches.len() >= max
                                {
                                    return WalkState::Quit;
                                }

                                matches.push(file_match);
                            } else {
                                let mut stats = stats.lock().unwrap();
                                stats.files_ignored += 1;
                            }
                        }
                    }
                    Err(_) => {
                        // Unreadable entries count as ignored rather than
                        // failing the whole search.
                        let mut stats = stats.lock().unwrap();
                        stats.files_ignored += 1;
                    }
                }
                WalkState::Continue
            })
        });

        // All workers have exited: reclaim sole ownership of the results.
        let matches = Arc::try_unwrap(matches)
            .map_err(|_| GlobError::FilterChain {
                message: "Failed to unwrap matches".to_string(),
            })?
            .into_inner()
            .map_err(|_| GlobError::FilterChain {
                message: "Failed to acquire matches lock".to_string(),
            })?;

        let stats = Arc::try_unwrap(stats)
            .map_err(|_| GlobError::FilterChain {
                message: "Failed to unwrap stats".to_string(),
            })?
            .into_inner()
            .map_err(|_| GlobError::FilterChain {
                message: "Failed to acquire stats lock".to_string(),
            })?;

        Ok((matches, stats))
    }
937
    /// Single-threaded traversal with the same filtering as the parallel
    /// path, but hard-error semantics for cancellation and timeout.
    ///
    /// # Errors
    /// Returns [`GlobError::SearchCancelled`] or [`GlobError::SearchTimeout`]
    /// when the search is interrupted (note: the parallel path instead
    /// returns the partial results collected so far).
    fn search_sequential(
        &self,
        filters: &CompiledFilters,
    ) -> GlobResult<(Vec<FileMatch>, SearchStats)> {
        let mut matches = Vec::new();
        let mut stats = SearchStats::default();
        let start_time = Instant::now();

        let walker = self.create_sequential_walker();

        for entry_result in walker {
            if self.is_cancelled() {
                return Err(GlobError::SearchCancelled {
                    duration: start_time.elapsed(),
                });
            }

            if let Some(timeout) = self.timeout
                && start_time.elapsed() > timeout
            {
                return Err(GlobError::SearchTimeout { timeout });
            }

            match entry_result {
                Ok(entry) => {
                    let _path = entry.path();

                    // Directories are traversed but never reported.
                    if entry.file_type().is_some_and(|ft| ft.is_dir()) {
                        stats.directories_traversed += 1;
                        continue;
                    }

                    stats.files_examined += 1;

                    // Stop once the result cap is reached.
                    if let Some(max) = self.max_results
                        && matches.len() >= max
                    {
                        break;
                    }

                    if let Some(file_match) =
                        Self::create_file_match(&entry, &self.classifier, &self.base_dir)
                    {
                        if filters.matches(&file_match) {
                            matches.push(file_match);
                        } else {
                            stats.files_ignored += 1;
                        }
                    }
                }
                Err(_) => {
                    // Unreadable entries count as ignored, not fatal.
                    stats.files_ignored += 1;
                }
            }
        }

        Ok((matches, stats))
    }
1001
    /// Builds a parallel walker configured with this tool's settings.
    ///
    /// # Errors
    /// Propagates base-directory validation errors from `configure_builder`.
    fn create_parallel_walker(&self) -> GlobResult<WalkParallel> {
        let mut builder = WalkBuilder::new(&self.base_dir);
        self.configure_builder(&mut builder)?;
        builder.threads(self.parallelism);
        Ok(builder.build_parallel())
    }

    /// Builds a sequential walker configured with this tool's settings.
    // NOTE(review): unlike the parallel variant, a configure_builder error is
    // discarded because this return type is infallible. Callers re-check
    // base_dir existence in search_with_filters, but the inconsistency is
    // worth revisiting.
    fn create_sequential_walker(&self) -> Walk {
        let mut builder = WalkBuilder::new(&self.base_dir);
        let _ = self.configure_builder(&mut builder);
        builder.build()
    }
1016
1017 fn configure_builder(&self, builder: &mut WalkBuilder) -> GlobResult<()> {
1019 builder
1020 .follow_links(self.follow_links)
1021 .hidden(!self.include_hidden)
1022 .ignore(self.respect_ignore)
1023 .git_ignore(self.respect_ignore)
1024 .git_global(self.respect_ignore)
1025 .git_exclude(self.respect_ignore)
1026 .parents(self.respect_ignore)
1027 .require_git(false); for ignore_pattern in &self.custom_ignores {
1031 builder.add_custom_ignore_filename(ignore_pattern);
1032 }
1033
1034 if !self.base_dir.exists() {
1036 return Err(GlobError::DirectoryNotFound {
1037 path: self.base_dir.clone(),
1038 });
1039 }
1040
1041 if !self.base_dir.is_dir() {
1042 return Err(GlobError::FilterChain {
1043 message: format!("Path is not a directory: {}", self.base_dir.display()),
1044 });
1045 }
1046
1047 Ok(())
1048 }
1049
    /// Converts a walker entry into a [`FileMatch`], gathering cheap
    /// metadata. Returns `None` when metadata cannot be read; the caller
    /// then skips the entry silently.
    fn create_file_match(
        entry: &DirEntry,
        classifier: &ContentClassifier,
        base_dir: &Path,
    ) -> Option<FileMatch> {
        let path = entry.path();

        let metadata = match entry.metadata() {
            Ok(meta) => meta,
            Err(_) => return None,
        };

        let file_type = if metadata.is_file() {
            FileType::File
        } else if metadata.is_dir() {
            FileType::Directory
        } else if metadata.file_type().is_symlink() {
            FileType::Symlink
        } else {
            FileType::Other
        };

        // Size is only meaningful for regular files.
        let size = if file_type == FileType::File {
            Some(metadata.len())
        } else {
            None
        };

        let extension = path
            .extension()
            .and_then(OsStr::to_str)
            .map(|s| s.to_lowercase());

        let content_category = classifier.classify_path(path);

        // Fall back to the full path when the entry is not under base_dir
        // (can happen e.g. via a followed symlink).
        let relative_path = path.strip_prefix(base_dir).unwrap_or(path).to_path_buf();

        let modified = metadata.modified().ok();

        let executable = {
            #[cfg(unix)]
            {
                // Any of the owner/group/other execute bits.
                use std::os::unix::fs::PermissionsExt;
                metadata.permissions().mode() & 0o111 != 0
            }
            #[cfg(not(unix))]
            {
                // Extension heuristic on platforms without mode bits.
                extension
                    .as_ref()
                    .map(|ext| matches!(ext.as_str(), "exe" | "bat" | "cmd" | "com"))
                    .unwrap_or(false)
            }
        };

        // Crude line estimate (assumes ~40 bytes per line) for text-like
        // files under 1 MiB; larger files are left unestimated.
        let estimated_lines = if matches!(
            content_category,
            ContentCategory::Source | ContentCategory::Config | ContentCategory::Documentation
        ) {
            size.and_then(|s| {
                if s < 1024 * 1024 {
                    Some((s / 40).max(1) as usize)
                } else {
                    None
                }
            })
        } else {
            None
        };

        Some(FileMatch {
            path: path.to_path_buf(),
            size,
            extension,
            file_type,
            relative_path,
            modified,
            executable,
            content_category,
            estimated_lines,
        })
    }
1135
1136 fn estimate_directory_size(&self) -> usize {
1138 std::fs::read_dir(&self.base_dir)
1140 .map(|entries| entries.count())
1141 .unwrap_or(0)
1142 }
1143}
1144
/// Private mirror of [`SearchStats`].
// NOTE(review): unused (hence allow(dead_code)) — likely superseded by
// SearchStats; candidate for removal.
#[derive(Default)]
#[allow(dead_code)]
struct GlobStats {
    files_examined: usize,
    directories_traversed: usize,
    files_ignored: usize,
}

/// File categories reported by [`FileExtensionClassifier`].
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FileTypeCategory {
    Source,
    Config,
    Documentation,
    Test,
    Binary,
    /// Build artifacts and lock files (e.g. Cargo.lock, *.pyc, *.jar).
    Generated,
    Unknown,
}
1165
/// Regex-backed file classifier with a broader extension set than
/// [`ContentClassifier`] (adds shell/script extensions and generated-file
/// detection).
pub struct FileExtensionClassifier {
    source_extensions: HashSet<String>,
    config_extensions: HashSet<String>,
    doc_extensions: HashSet<String>,
    /// Regexes run against the file name to detect test files.
    test_patterns: Vec<Regex>,
}
1173
impl Default for FileExtensionClassifier {
    /// Preloads the extension tables and test-name regexes.
    // NOTE(review): `(?i)test` / `(?i)spec` subsume the three anchored
    // patterns below (any name matching `_test\.x` already contains "test"),
    // so the anchored ones are effectively dead. The broad substring match
    // also tags names like "latest.rs" as tests — confirm intent.
    fn default() -> Self {
        let mut source_extensions = HashSet::new();
        source_extensions.extend(
            [
                "rs", "py", "js", "ts", "jsx", "tsx", "go", "java", "c", "cpp", "cc", "cxx", "h",
                "hpp", "cs", "php", "rb", "swift", "kt", "scala", "hs", "clj", "ex", "exs", "sh",
                "bash", "zsh", "fish", "ps1", "bat", "cmd",
            ]
            .iter()
            .map(|s| (*s).to_string()),
        );

        let mut config_extensions = HashSet::new();
        config_extensions.extend(
            [
                "toml",
                "yaml",
                "yml",
                "json",
                "xml",
                "ini",
                "cfg",
                "conf",
                "config",
                "env",
                "properties",
                "dockerfile",
            ]
            .iter()
            .map(|s| (*s).to_string()),
        );

        let mut doc_extensions = HashSet::new();
        doc_extensions.extend(
            ["md", "txt", "rst", "adoc", "tex", "pdf", "doc", "docx"]
                .iter()
                .map(|s| (*s).to_string()),
        );

        // Hard-coded patterns: unwrap is safe because they are compile-time
        // constants that are known to be valid regexes.
        let test_patterns = vec![
            Regex::new(r"(?i)test").unwrap(),
            Regex::new(r"(?i)spec").unwrap(),
            Regex::new(r"_test\.[^.]+$").unwrap(),
            Regex::new(r"\.test\.[^.]+$").unwrap(),
            Regex::new(r"\.spec\.[^.]+$").unwrap(),
        ];

        Self {
            source_extensions,
            config_extensions,
            doc_extensions,
            test_patterns,
        }
    }
}
1234
1235impl FileExtensionClassifier {
1236 pub fn classify_file(&self, path: &Path) -> FileTypeCategory {
1238 if let Some(file_name) = path.file_name().and_then(|n| n.to_str()) {
1240 for pattern in &self.test_patterns {
1241 if pattern.is_match(file_name) {
1242 return FileTypeCategory::Test;
1243 }
1244 }
1245 }
1246
1247 if let Some(ext) = path.extension().and_then(|e| e.to_str()) {
1249 let ext_lower = ext.to_lowercase();
1250
1251 if self.source_extensions.contains(&ext_lower) {
1252 return FileTypeCategory::Source;
1253 }
1254
1255 if self.config_extensions.contains(&ext_lower) {
1256 return FileTypeCategory::Config;
1257 }
1258
1259 if self.doc_extensions.contains(&ext_lower) {
1260 return FileTypeCategory::Documentation;
1261 }
1262
1263 match ext_lower.as_str() {
1265 "exe" | "dll" | "so" | "dylib" | "a" | "lib" | "o" | "obj" | "bin" => {
1266 return FileTypeCategory::Binary;
1267 }
1268 "class" | "jar" | "pyc" | "pyo" | "rlib" | "node" => {
1269 return FileTypeCategory::Generated;
1270 }
1271 _ => {}
1272 }
1273 }
1274
1275 if let Some(file_name) = path.file_name().and_then(|n| n.to_str()) {
1277 let file_name_lower = file_name.to_lowercase();
1278 if file_name_lower.starts_with("generated_")
1279 || file_name_lower.contains("autogenerated")
1280 || file_name_lower.starts_with("build")
1281 || file_name_lower == "cargo.lock"
1282 || file_name_lower == "package-lock.json"
1283 || file_name_lower == "yarn.lock"
1284 {
1285 return FileTypeCategory::Generated;
1286 }
1287 }
1288
1289 FileTypeCategory::Unknown
1290 }
1291}
1292
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;
    use tempfile::TempDir;

    /// Builds a throwaway tree with source/config/doc/test files, a `src/`
    /// subdirectory, a `.gitignore`, and entries the ignore rules should
    /// hide (`ignored.tmp`, `target/`).
    fn create_test_dir() -> TempDir {
        let temp_dir = TempDir::new().unwrap();
        let path = temp_dir.path();

        // Initialize a git repo so .gitignore is honored by the walker.
        // NOTE(review): `is_ok()` only says the process could be spawned,
        // not that `git init` actually succeeded.
        let git_init = std::process::Command::new("git")
            .arg("init")
            .current_dir(path)
            .output()
            .is_ok();

        fs::write(path.join("main.rs"), "fn main() {}").unwrap();
        fs::write(path.join("lib.js"), "console.log('hello')").unwrap();
        fs::write(path.join("config.toml"), "[package]").unwrap();
        fs::write(path.join("README.md"), "# Test").unwrap();
        fs::write(path.join("test_main.rs"), "mod tests {}").unwrap();

        fs::create_dir(path.join("src")).unwrap();
        fs::write(path.join("src").join("lib.rs"), "pub mod lib;").unwrap();

        fs::write(path.join(".gitignore"), "target/\n*.tmp\n").unwrap();

        // Without git, fall back to a plain .ignore file with the same rules.
        if !git_init {
            fs::write(path.join(".ignore"), "target/\n*.tmp\n").unwrap();
        }

        // These should be filtered out by the ignore rules above.
        fs::write(path.join("ignored.tmp"), "temporary").unwrap();
        fs::create_dir(path.join("target")).unwrap();
        fs::write(path.join("target").join("debug.txt"), "debug info").unwrap();

        temp_dir
    }

    /// `*.rs` is a basename pattern, so only root-level .rs files match
    /// (src/lib.rs is excluded).
    #[test]
    fn test_glob_rust_files() {
        let temp_dir = create_test_dir();
        let glob_tool = GlobTool::new(temp_dir.path().to_path_buf());

        let result = glob_tool.glob("*.rs").unwrap();

        assert_eq!(result.result.len(), 2);
        assert!(
            result
                .result
                .iter()
                .any(|m| m.path.file_name().unwrap() == "main.rs")
        );
        assert!(
            result
                .result
                .iter()
                .any(|m| m.path.file_name().unwrap() == "test_main.rs")
        );
        assert!(result.summary.contains("Found 2 files"));
    }

    /// find_type adds both `*.ext` and `**/*.ext`, so src/lib.rs is also
    /// found: 3 matches total.
    #[test]
    fn test_find_type() {
        let temp_dir = create_test_dir();
        let glob_tool = GlobTool::new(temp_dir.path().to_path_buf());

        let result = glob_tool.find_type("rs").unwrap();

        assert_eq!(result.result.len(), 3);
        assert!(
            result
                .result
                .iter()
                .all(|m| m.extension.as_ref().unwrap() == "rs")
        );
    }

    /// Spot-checks each FileTypeCategory branch of the classifier.
    #[test]
    fn test_file_classification() {
        let classifier = FileExtensionClassifier::default();

        assert_eq!(
            classifier.classify_file(Path::new("main.rs")),
            FileTypeCategory::Source
        );
        assert_eq!(
            classifier.classify_file(Path::new("config.toml")),
            FileTypeCategory::Config
        );
        assert_eq!(
            classifier.classify_file(Path::new("README.md")),
            FileTypeCategory::Documentation
        );
        assert_eq!(
            classifier.classify_file(Path::new("test_main.rs")),
            FileTypeCategory::Test
        );
        assert_eq!(
            classifier.classify_file(Path::new("main.spec.js")),
            FileTypeCategory::Test
        );
    }

    /// Files matching .gitignore rules must not be reported.
    #[test]
    fn test_gitignore_respected() {
        let temp_dir = create_test_dir();

        // Preconditions: the fixture wrote both the rules and the victim.
        let gitignore_path = temp_dir.path().join(".gitignore");
        assert!(gitignore_path.exists(), ".gitignore should exist");

        let ignored_file = temp_dir.path().join("ignored.tmp");
        assert!(ignored_file.exists(), "ignored.tmp should exist");

        let glob_tool = GlobTool::new(temp_dir.path().to_path_buf()).with_respect_ignore(true);
        let result = glob_tool.glob("*.tmp").unwrap();

        assert_eq!(
            result.result.len(),
            0,
            "Should find 0 files (ignored.tmp should be filtered by .gitignore)"
        );
        assert!(result.result.is_empty());
    }

    /// Both traversal strategies must agree on the match count.
    #[test]
    fn test_parallel_vs_sequential() {
        let temp_dir = create_test_dir();

        let parallel_tool = GlobTool::new(temp_dir.path().to_path_buf()).with_parallelism(4);
        let sequential_tool = GlobTool::new(temp_dir.path().to_path_buf()).with_parallelism(1);

        let parallel_result = parallel_tool.glob("*").unwrap();
        let sequential_result = sequential_tool.glob("*").unwrap();

        assert_eq!(parallel_result.result.len(), sequential_result.result.len());
    }

    /// The configured cap bounds the number of returned matches.
    #[test]
    fn test_max_results() {
        let temp_dir = create_test_dir();
        let glob_tool = GlobTool::new(temp_dir.path().to_path_buf()).with_max_results(Some(2));

        let result = glob_tool.glob("*").unwrap();

        assert!(result.result.len() <= 2);
    }
}