1use std::path::{Path, PathBuf};
9use std::time::{Duration, SystemTime};
10
11use anyhow::{Context, Result};
12use ignore::WalkBuilder;
13use rayon::prelude::*;
14use tracing::{debug, info, instrument, warn};
15
/// Predicate over a file size expressed in bytes.
///
/// Derives `PartialEq`/`Eq` (like `FileTypeFilter`) so parsed filters can be
/// compared directly with `==` / `assert_eq!`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SizeFilter {
    /// Matches sizes strictly greater than the given byte count.
    GreaterThan(u64),
    /// Matches sizes strictly less than the given byte count.
    LessThan(u64),
    /// Matches sizes exactly equal to the given byte count.
    Equal(u64),
    /// Matches sizes within the inclusive range `(min, max)`.
    Between(u64, u64),
}
28
29impl SizeFilter {
30 pub fn parse(s: &str) -> Result<Self> {
36 let s = s.trim();
37
38 if let Some(rest) = s.strip_prefix("gt:") {
40 let bytes = parse_size(rest.trim())?;
41 return Ok(SizeFilter::GreaterThan(bytes));
42 }
43 if let Some(rest) = s.strip_prefix("lt:") {
44 let bytes = parse_size(rest.trim())?;
45 return Ok(SizeFilter::LessThan(bytes));
46 }
47 if let Some(rest) = s.strip_prefix("eq:") {
48 let bytes = parse_size(rest.trim())?;
49 return Ok(SizeFilter::Equal(bytes));
50 }
51
52 if let Some((min, max)) = s.split_once('-') {
54 let min_bytes = parse_size(min.trim())?;
55 let max_bytes = parse_size(max.trim())?;
56 return Ok(SizeFilter::Between(min_bytes, max_bytes));
57 }
58
59 if let Some(rest) = s.strip_prefix('>') {
61 let bytes = parse_size(rest.trim())?;
62 Ok(SizeFilter::GreaterThan(bytes))
63 } else if let Some(rest) = s.strip_prefix('<') {
64 let bytes = parse_size(rest.trim())?;
65 Ok(SizeFilter::LessThan(bytes))
66 } else if let Some(rest) = s.strip_prefix('=') {
67 let bytes = parse_size(rest.trim())?;
68 Ok(SizeFilter::Equal(bytes))
69 } else {
70 let bytes = parse_size(s)?;
72 Ok(SizeFilter::Equal(bytes))
73 }
74 }
75
76 pub fn matches(&self, size: u64) -> bool {
78 match self {
79 SizeFilter::GreaterThan(threshold) => size > *threshold,
80 SizeFilter::LessThan(threshold) => size < *threshold,
81 SizeFilter::Equal(target) => size == *target,
82 SizeFilter::Between(min, max) => size >= *min && size <= *max,
83 }
84 }
85}
86
/// Predicate over a file's last-modification time.
///
/// Derives `PartialEq`/`Eq` (like `FileTypeFilter`) so parsed filters can be
/// compared directly with `==` / `assert_eq!`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum MtimeFilter {
    /// Matches files modified at or after the given instant.
    After(SystemTime),
    /// Matches files modified at or before the given instant.
    Before(SystemTime),
    /// Matches files modified within the last N days (relative to "now").
    WithinDays(u32),
    /// Matches files last modified more than N days ago (relative to "now").
    OlderThanDays(u32),
}
99
100impl MtimeFilter {
101 pub fn parse(s: &str) -> Result<Self> {
104 let s = s.trim();
105
106 if let Some(rest) = s.strip_prefix('-') {
108 if let Some(days_str) = rest.strip_suffix('d') {
109 let days: u32 = days_str.parse().context("Invalid days number")?;
110 return Ok(MtimeFilter::WithinDays(days));
111 }
112 }
113
114 if let Some(rest) = s.strip_prefix('+') {
115 if let Some(days_str) = rest.strip_suffix('d') {
116 let days: u32 = days_str.parse().context("Invalid days number")?;
117 return Ok(MtimeFilter::OlderThanDays(days));
118 }
119 }
120
121 if let Ok(date) = chrono::NaiveDate::parse_from_str(s, "%Y-%m-%d") {
123 let datetime = date.and_hms_opt(0, 0, 0).context("Invalid date")?;
124 let system_time =
125 SystemTime::UNIX_EPOCH + Duration::from_secs(datetime.and_utc().timestamp() as u64);
126 return Ok(MtimeFilter::After(system_time));
127 }
128
129 anyhow::bail!("Invalid mtime filter format: {}. Use '-7d', '+30d', or 'YYYY-MM-DD'", s)
130 }
131
132 pub fn matches(&self, mtime: SystemTime) -> bool {
134 let now = SystemTime::now();
135
136 match self {
137 MtimeFilter::After(threshold) => mtime >= *threshold,
138 MtimeFilter::Before(threshold) => mtime <= *threshold,
139 MtimeFilter::WithinDays(days) => {
140 let threshold = now - Duration::from_secs(*days as u64 * 24 * 60 * 60);
141 mtime >= threshold
142 }
143 MtimeFilter::OlderThanDays(days) => {
144 let threshold = now - Duration::from_secs(*days as u64 * 24 * 60 * 60);
145 mtime < threshold
146 }
147 }
148 }
149}
150
/// Selects files by extension, either through a named category or an
/// explicit list.
///
/// The extensions behind each named category are listed by `extensions`.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FileTypeFilter {
    /// Delimited tabular data.
    Csv,
    /// JSON documents and line-delimited JSON.
    Json,
    /// Log files.
    Log,
    /// Source code in a fixed set of languages.
    Code,
    /// Plain-text and lightweight-markup documents.
    Text,
    /// Parquet files.
    Parquet,
    /// An explicit list of extensions.
    Custom(Vec<String>),
}
162
163impl FileTypeFilter {
164 pub fn parse(s: &str) -> Result<Self> {
166 match s.to_lowercase().as_str() {
167 "csv" => Ok(FileTypeFilter::Csv),
168 "json" | "jsonl" => Ok(FileTypeFilter::Json),
169 "log" => Ok(FileTypeFilter::Log),
170 "code" => Ok(FileTypeFilter::Code),
171 "text" | "txt" => Ok(FileTypeFilter::Text),
172 "parquet" => Ok(FileTypeFilter::Parquet),
173 _ => {
174 let extensions: Vec<String> = s
176 .split(',')
177 .map(|ext| ext.trim().trim_start_matches('.').to_lowercase())
178 .collect();
179 Ok(FileTypeFilter::Custom(extensions))
180 }
181 }
182 }
183
184 pub fn extensions(&self) -> Vec<&str> {
186 match self {
187 FileTypeFilter::Csv => vec!["csv", "tsv"],
188 FileTypeFilter::Json => vec!["json", "jsonl", "ndjson"],
189 FileTypeFilter::Log => vec!["log", "logs"],
190 FileTypeFilter::Code => vec![
191 "rs", "py", "js", "ts", "go", "java", "c", "cpp", "h", "hpp", "rb", "php", "swift",
192 "kt", "scala", "sh", "bash", "zsh",
193 ],
194 FileTypeFilter::Text => vec!["txt", "md", "rst", "text"],
195 FileTypeFilter::Parquet => vec!["parquet", "pq"],
196 FileTypeFilter::Custom(exts) => exts.iter().map(|s| s.as_str()).collect(),
197 }
198 }
199
200 pub fn matches(&self, path: &Path) -> bool {
202 let ext = path.extension().and_then(|e| e.to_str()).map(|e| e.to_lowercase());
203
204 match ext {
205 Some(ext) => self.extensions().contains(&ext.as_str()),
206 None => false,
207 }
208 }
209}
210
/// A single filesystem entry discovered by a scan, with the metadata the
/// filters need.
#[derive(Debug, Clone)]
pub struct ScannedFile {
    /// Path of the entry as reported by the directory walker.
    pub path: PathBuf,
    /// Length reported by the entry's metadata, in bytes.
    pub size: u64,
    /// Last-modification time, or `None` when it could not be read.
    pub modified: Option<SystemTime>,
    /// Whether the entry's metadata describes a directory.
    pub is_dir: bool,
}
223
224impl ScannedFile {
225 fn from_entry(entry: &ignore::DirEntry) -> Option<Self> {
227 let metadata = entry.metadata().ok()?;
228 Some(ScannedFile {
229 path: entry.path().to_path_buf(),
230 size: metadata.len(),
231 modified: metadata.modified().ok(),
232 is_dir: metadata.is_dir(),
233 })
234 }
235}
236
237#[derive(Debug, Clone)]
239pub struct ScanConfig {
240 pub root: PathBuf,
242 pub file_type: Option<FileTypeFilter>,
244 pub size_filter: Option<SizeFilter>,
246 pub mtime_filter: Option<MtimeFilter>,
248 pub max_depth: Option<usize>,
250 pub follow_links: bool,
252 pub respect_gitignore: bool,
254 pub threads: usize,
256 pub include_hidden: bool,
258}
259
260impl Default for ScanConfig {
261 fn default() -> Self {
262 Self {
263 root: PathBuf::from("."),
264 file_type: None,
265 size_filter: None,
266 mtime_filter: None,
267 max_depth: None,
268 follow_links: false,
269 respect_gitignore: true,
270 threads: 0, include_hidden: false,
272 }
273 }
274}
275
276impl ScanConfig {
277 pub fn new<P: AsRef<Path>>(root: P) -> Self {
279 Self { root: root.as_ref().to_path_buf(), ..Default::default() }
280 }
281
282 pub fn with_file_type(mut self, filter: FileTypeFilter) -> Self {
284 self.file_type = Some(filter);
285 self
286 }
287
288 pub fn with_size_filter(mut self, filter: SizeFilter) -> Self {
290 self.size_filter = Some(filter);
291 self
292 }
293
294 pub fn with_mtime_filter(mut self, filter: MtimeFilter) -> Self {
296 self.mtime_filter = Some(filter);
297 self
298 }
299
300 pub fn with_max_depth(mut self, depth: usize) -> Self {
302 self.max_depth = Some(depth);
303 self
304 }
305
306 pub fn with_follow_links(mut self, follow: bool) -> Self {
308 self.follow_links = follow;
309 self
310 }
311
312 pub fn with_respect_gitignore(mut self, respect: bool) -> Self {
314 self.respect_gitignore = respect;
315 self
316 }
317
318 pub fn with_threads(mut self, threads: usize) -> Self {
320 self.threads = threads;
321 self
322 }
323
324 pub fn with_include_hidden(mut self, include: bool) -> Self {
326 self.include_hidden = include;
327 self
328 }
329}
330
/// Counters describing the outcome of one scan.
#[derive(Debug, Clone, Default)]
pub struct ScanStats {
    /// Number of walked entries that are not directories.
    pub total_files: usize,
    /// Number of entries that passed every configured filter.
    pub matched_files: usize,
    /// Number of walked entries that are directories.
    pub directories: usize,
    /// Number of non-directory entries that were read but rejected by a filter.
    pub skipped: usize,
    /// Number of errors encountered.
    /// NOTE(review): confirm the scanner actually populates this counter.
    pub errors: usize,
    /// Sum, in bytes, of the metadata length of every entry whose metadata was read.
    pub total_size: u64,
    /// Wall-clock duration of the scan in milliseconds.
    pub elapsed_ms: u64,
}
349
350pub struct FileScanner {
352 config: ScanConfig,
353}
354
355impl FileScanner {
356 pub fn new(config: ScanConfig) -> Self {
358 Self { config }
359 }
360
361 fn build_walker(&self) -> WalkBuilder {
363 let mut builder = WalkBuilder::new(&self.config.root);
364
365 let threads = if self.config.threads == 0 { num_cpus::get() } else { self.config.threads };
367 builder.threads(threads);
368
369 builder.git_ignore(self.config.respect_gitignore);
371 builder.git_global(self.config.respect_gitignore);
372 builder.git_exclude(self.config.respect_gitignore);
373
374 builder.hidden(!self.config.include_hidden);
376
377 builder.follow_links(self.config.follow_links);
379
380 if let Some(depth) = self.config.max_depth {
382 builder.max_depth(Some(depth));
383 }
384
385 builder.add_custom_ignore_filename(".xoreignore");
387
388 builder
389 }
390
391 fn matches_filters(&self, file: &ScannedFile) -> bool {
393 if file.is_dir {
395 return false;
396 }
397
398 if let Some(ref filter) = self.config.file_type {
400 if !filter.matches(&file.path) {
401 return false;
402 }
403 }
404
405 if let Some(ref filter) = self.config.size_filter {
407 if !filter.matches(file.size) {
408 return false;
409 }
410 }
411
412 if let Some(ref filter) = self.config.mtime_filter {
414 if let Some(mtime) = file.modified {
415 if !filter.matches(mtime) {
416 return false;
417 }
418 } else {
419 return false;
421 }
422 }
423
424 true
425 }
426
427 #[instrument(skip(self), fields(root = %self.config.root.display()))]
429 pub fn scan(&self) -> Result<(Vec<ScannedFile>, ScanStats)> {
430 let start = std::time::Instant::now();
431 let mut stats = ScanStats::default();
432
433 info!("Starting file scan at {:?}", self.config.root);
434 debug!("Scan config: {:?}", self.config);
435
436 let walker = self.build_walker();
437
438 let entries: Vec<_> = walker
440 .build()
441 .filter_map(|entry| match entry {
442 Ok(e) => Some(e),
443 Err(err) => {
444 warn!("Error accessing entry: {}", err);
445 None
446 }
447 })
448 .collect();
449
450 let results: Vec<(Option<ScannedFile>, bool, bool)> = entries
452 .par_iter()
453 .map(|entry| {
454 let is_dir = entry.file_type().map(|t| t.is_dir()).unwrap_or(false);
455
456 match ScannedFile::from_entry(entry) {
457 Some(file) => {
458 let matches = self.matches_filters(&file);
459 (Some(file), is_dir, matches)
460 }
461 None => (None, is_dir, false),
462 }
463 })
464 .collect();
465
466 let mut matched_files = Vec::new();
468
469 for (file, is_dir, matches) in results {
470 if is_dir {
471 stats.directories += 1;
472 } else {
473 stats.total_files += 1;
474 }
475
476 if let Some(f) = file {
477 stats.total_size += f.size;
478
479 if matches {
480 stats.matched_files += 1;
481 matched_files.push(f);
482 } else if !is_dir {
483 stats.skipped += 1;
484 }
485 }
486 }
487
488 stats.elapsed_ms = start.elapsed().as_millis() as u64;
489
490 info!(
491 "Scan completed: {} files matched out of {} total ({} ms)",
492 stats.matched_files, stats.total_files, stats.elapsed_ms
493 );
494
495 Ok((matched_files, stats))
496 }
497
498 pub fn scan_iter(&self) -> impl Iterator<Item = Result<ScannedFile>> + '_ {
500 let walker = self.build_walker();
501
502 walker.build().filter_map(move |entry| match entry {
503 Ok(e) => {
504 let file = ScannedFile::from_entry(&e)?;
505 if self.matches_filters(&file) {
506 Some(Ok(file))
507 } else {
508 None
509 }
510 }
511 Err(err) => Some(Err(anyhow::anyhow!("Error accessing entry: {}", err))),
512 })
513 }
514}
515
516fn parse_size(s: &str) -> Result<u64> {
519 let s = s.trim().to_uppercase();
520
521 if let Ok(bytes) = s.parse::<u64>() {
523 return Ok(bytes);
524 }
525
526 let (num_str, unit) = if s.ends_with("GB") {
528 (&s[..s.len() - 2], 1024 * 1024 * 1024)
529 } else if s.ends_with("MB") {
530 (&s[..s.len() - 2], 1024 * 1024)
531 } else if s.ends_with("KB") {
532 (&s[..s.len() - 2], 1024)
533 } else if s.ends_with('B') {
534 (&s[..s.len() - 1], 1)
535 } else {
536 return Err(anyhow::anyhow!(
537 "Invalid size format: {}. Use format like '1MB', '500KB', '2GB'",
538 s
539 ));
540 };
541
542 let num: f64 =
543 num_str.trim().parse().with_context(|| format!("Invalid number in size: {}", num_str))?;
544
545 Ok((num * unit as f64) as u64)
546}
547
#[cfg(test)]
mod tests {
    use super::*;

    // Binary unit multipliers used to spell out expected byte counts.
    const KB: u64 = 1024;
    const MB: u64 = 1024 * KB;
    const GB: u64 = 1024 * MB;
    // Seconds in one day, for building relative timestamps.
    const DAY_SECS: u64 = 24 * 60 * 60;

    /// Parsing and matching behaviour of `SizeFilter`.
    mod size_filter_tests {
        use super::*;

        #[test]
        fn test_parse_bytes() {
            assert!(matches!(SizeFilter::parse("1024").unwrap(), SizeFilter::Equal(1024)));
        }

        #[test]
        fn test_parse_kb() {
            let parsed = SizeFilter::parse(">1KB").unwrap();
            assert!(matches!(parsed, SizeFilter::GreaterThan(n) if n == KB));
        }

        #[test]
        fn test_parse_mb() {
            let parsed = SizeFilter::parse("<10MB").unwrap();
            assert!(matches!(parsed, SizeFilter::LessThan(n) if n == 10 * MB));
        }

        #[test]
        fn test_parse_range() {
            let parsed = SizeFilter::parse("1MB-10MB").unwrap();
            if let SizeFilter::Between(min, max) = parsed {
                assert_eq!(min, MB);
                assert_eq!(max, 10 * MB);
            } else {
                panic!("Expected Between filter");
            }
        }

        #[test]
        fn test_parse_gt_syntax() {
            let parsed = SizeFilter::parse("gt:1MB").unwrap();
            assert!(matches!(parsed, SizeFilter::GreaterThan(n) if n == MB));
        }

        #[test]
        fn test_parse_lt_syntax() {
            let parsed = SizeFilter::parse("lt:500KB").unwrap();
            assert!(matches!(parsed, SizeFilter::LessThan(n) if n == 500 * KB));
        }

        #[test]
        fn test_parse_eq_syntax() {
            let parsed = SizeFilter::parse("eq:1GB").unwrap();
            assert!(matches!(parsed, SizeFilter::Equal(n) if n == GB));
        }

        #[test]
        fn test_gt_syntax_with_spaces() {
            let parsed = SizeFilter::parse("gt: 2MB").unwrap();
            assert!(matches!(parsed, SizeFilter::GreaterThan(n) if n == 2 * MB));
        }

        #[test]
        fn test_matches() {
            let above_100 = SizeFilter::GreaterThan(100);
            assert!(above_100.matches(200));
            assert!(!above_100.matches(50));

            assert!(SizeFilter::LessThan(100).matches(50));

            let ten_to_hundred = SizeFilter::Between(10, 100);
            assert!(ten_to_hundred.matches(50));
            assert!(!ten_to_hundred.matches(5));
        }
    }

    /// Parsing and matching behaviour of `MtimeFilter`.
    mod mtime_filter_tests {
        use super::*;

        #[test]
        fn test_parse_within_days() {
            assert!(matches!(MtimeFilter::parse("-7d").unwrap(), MtimeFilter::WithinDays(7)));
        }

        #[test]
        fn test_parse_older_than_days() {
            assert!(matches!(MtimeFilter::parse("+30d").unwrap(), MtimeFilter::OlderThanDays(30)));
        }

        #[test]
        fn test_parse_date() {
            assert!(matches!(MtimeFilter::parse("2024-01-01").unwrap(), MtimeFilter::After(_)));
        }

        #[test]
        fn test_within_days_matches() {
            let last_week = MtimeFilter::WithinDays(7);
            let three_days_ago = SystemTime::now() - Duration::from_secs(3 * DAY_SECS);
            let ten_days_ago = SystemTime::now() - Duration::from_secs(10 * DAY_SECS);

            assert!(last_week.matches(three_days_ago));
            assert!(!last_week.matches(ten_days_ago));
        }
    }

    /// Parsing and matching behaviour of `FileTypeFilter`.
    mod file_type_filter_tests {
        use super::*;

        #[test]
        fn test_parse_csv() {
            assert_eq!(FileTypeFilter::parse("csv").unwrap(), FileTypeFilter::Csv);
        }

        #[test]
        fn test_parse_custom() {
            let parsed = FileTypeFilter::parse("xml,yaml,toml").unwrap();
            if let FileTypeFilter::Custom(exts) = parsed {
                assert_eq!(exts, vec!["xml", "yaml", "toml"]);
            } else {
                panic!("Expected Custom filter");
            }
        }

        #[test]
        fn test_matches_csv() {
            let csv = FileTypeFilter::Csv;
            for accepted in ["data.csv", "data.tsv"] {
                assert!(csv.matches(Path::new(accepted)));
            }
            assert!(!csv.matches(Path::new("data.json")));
        }

        #[test]
        fn test_matches_code() {
            let code = FileTypeFilter::Code;
            for accepted in ["main.rs", "app.py"] {
                assert!(code.matches(Path::new(accepted)));
            }
            assert!(!code.matches(Path::new("data.csv")));
        }
    }

    /// Unit handling in `parse_size`.
    mod parse_size_tests {
        use super::*;

        #[test]
        fn test_parse_pure_bytes() {
            assert_eq!(parse_size("1024").unwrap(), 1024);
        }

        #[test]
        fn test_parse_kb() {
            for input in ["1KB", "1kb"] {
                assert_eq!(parse_size(input).unwrap(), KB);
            }
        }

        #[test]
        fn test_parse_mb() {
            assert_eq!(parse_size("1MB").unwrap(), MB);
        }

        #[test]
        fn test_parse_gb() {
            assert_eq!(parse_size("1GB").unwrap(), GB);
        }

        #[test]
        fn test_parse_decimal() {
            // 1.5 * 1024 * 1024 bytes.
            assert_eq!(parse_size("1.5MB").unwrap(), 1_572_864);
        }
    }
}