Skip to main content

xore_search/
scanner.rs

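//! File scanner module.
//!
//! High-performance file scanning built on `walkdir` + `ignore`, with support for:
//! - honoring `.gitignore` rules
//! - parallel traversal with Rayon
//! - multiple filter criteria (file type, size, modification time)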
7
8use std::path::{Path, PathBuf};
9use std::time::{Duration, SystemTime};
10
11use anyhow::{Context, Result};
12use ignore::WalkBuilder;
13use rayon::prelude::*;
14use tracing::{debug, info, instrument, warn};
15
/// Filter on a file's size, expressed in bytes.
///
/// Derives `PartialEq`/`Eq` so parsed filters can be compared directly
/// (for example with `assert_eq!`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum SizeFilter {
    /// Size is greater than the given number of bytes.
    GreaterThan(u64),
    /// Size is less than the given number of bytes.
    LessThan(u64),
    /// Size is equal to the given number of bytes.
    Equal(u64),
    /// Size lies within the given range (both bounds inclusive).
    Between(u64, u64),
}
28
29impl SizeFilter {
30    /// 解析大小过滤字符串
31    /// 支持格式:
32    /// - 新语法(无需引号):`gt:1MB`, `lt:500KB`, `eq:1GB`
33    /// - 旧语法(需要引号):`">1MB"`, `"<500KB"`, `"=1GB"`
34    /// - 范围格式:`1MB-10MB`
35    pub fn parse(s: &str) -> Result<Self> {
36        let s = s.trim();
37
38        // 新语法:gt:1MB, lt:500KB, eq:1GB(无需 shell 引号)
39        if let Some(rest) = s.strip_prefix("gt:") {
40            let bytes = parse_size(rest.trim())?;
41            return Ok(SizeFilter::GreaterThan(bytes));
42        }
43        if let Some(rest) = s.strip_prefix("lt:") {
44            let bytes = parse_size(rest.trim())?;
45            return Ok(SizeFilter::LessThan(bytes));
46        }
47        if let Some(rest) = s.strip_prefix("eq:") {
48            let bytes = parse_size(rest.trim())?;
49            return Ok(SizeFilter::Equal(bytes));
50        }
51
52        // 处理范围格式 "1MB-10MB"
53        if let Some((min, max)) = s.split_once('-') {
54            let min_bytes = parse_size(min.trim())?;
55            let max_bytes = parse_size(max.trim())?;
56            return Ok(SizeFilter::Between(min_bytes, max_bytes));
57        }
58
59        // 旧语法:处理比较格式(需要 shell 引号)
60        if let Some(rest) = s.strip_prefix('>') {
61            let bytes = parse_size(rest.trim())?;
62            Ok(SizeFilter::GreaterThan(bytes))
63        } else if let Some(rest) = s.strip_prefix('<') {
64            let bytes = parse_size(rest.trim())?;
65            Ok(SizeFilter::LessThan(bytes))
66        } else if let Some(rest) = s.strip_prefix('=') {
67            let bytes = parse_size(rest.trim())?;
68            Ok(SizeFilter::Equal(bytes))
69        } else {
70            // 默认等于
71            let bytes = parse_size(s)?;
72            Ok(SizeFilter::Equal(bytes))
73        }
74    }
75
76    /// 检查文件大小是否匹配过滤条件
77    pub fn matches(&self, size: u64) -> bool {
78        match self {
79            SizeFilter::GreaterThan(threshold) => size > *threshold,
80            SizeFilter::LessThan(threshold) => size < *threshold,
81            SizeFilter::Equal(target) => size == *target,
82            SizeFilter::Between(min, max) => size >= *min && size <= *max,
83        }
84    }
85}
86
/// Filter on a file's last-modification time.
///
/// Derives `PartialEq`/`Eq` so parsed filters can be compared directly
/// (for example with `assert_eq!`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum MtimeFilter {
    /// Modified after the given point in time.
    After(SystemTime),
    /// Modified before the given point in time.
    Before(SystemTime),
    /// Modified within the last N days.
    WithinDays(u32),
    /// Not modified for more than N days.
    OlderThanDays(u32),
}
99
100impl MtimeFilter {
101    /// 解析修改时间过滤字符串
102    /// 支持格式:"-7d"(过去7天), "+30d"(超过30天), "2024-01-01"(指定日期之后)
103    pub fn parse(s: &str) -> Result<Self> {
104        let s = s.trim();
105
106        // 处理相对时间格式
107        if let Some(rest) = s.strip_prefix('-') {
108            if let Some(days_str) = rest.strip_suffix('d') {
109                let days: u32 = days_str.parse().context("Invalid days number")?;
110                return Ok(MtimeFilter::WithinDays(days));
111            }
112        }
113
114        if let Some(rest) = s.strip_prefix('+') {
115            if let Some(days_str) = rest.strip_suffix('d') {
116                let days: u32 = days_str.parse().context("Invalid days number")?;
117                return Ok(MtimeFilter::OlderThanDays(days));
118            }
119        }
120
121        // 处理绝对日期格式 YYYY-MM-DD
122        if let Ok(date) = chrono::NaiveDate::parse_from_str(s, "%Y-%m-%d") {
123            let datetime = date.and_hms_opt(0, 0, 0).context("Invalid date")?;
124            let system_time =
125                SystemTime::UNIX_EPOCH + Duration::from_secs(datetime.and_utc().timestamp() as u64);
126            return Ok(MtimeFilter::After(system_time));
127        }
128
129        anyhow::bail!("Invalid mtime filter format: {}. Use '-7d', '+30d', or 'YYYY-MM-DD'", s)
130    }
131
132    /// 检查修改时间是否匹配过滤条件
133    pub fn matches(&self, mtime: SystemTime) -> bool {
134        let now = SystemTime::now();
135
136        match self {
137            MtimeFilter::After(threshold) => mtime >= *threshold,
138            MtimeFilter::Before(threshold) => mtime <= *threshold,
139            MtimeFilter::WithinDays(days) => {
140                let threshold = now - Duration::from_secs(*days as u64 * 24 * 60 * 60);
141                mtime >= threshold
142            }
143            MtimeFilter::OlderThanDays(days) => {
144                let threshold = now - Duration::from_secs(*days as u64 * 24 * 60 * 60);
145                mtime < threshold
146            }
147        }
148    }
149}
150
/// Filter on a file's type, decided by its extension.
///
/// The named variants stand for fixed extension groups; `Custom` carries an
/// explicit list of extensions.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum FileTypeFilter {
    /// Extensions `csv`, `tsv`.
    Csv,
    /// Extensions `json`, `jsonl`, `ndjson`.
    Json,
    /// Extensions `log`, `logs`.
    Log,
    /// Common source-code and shell-script extensions (`rs`, `py`, `js`, ...).
    Code,
    /// Extensions `txt`, `md`, `rst`, `text`.
    Text,
    /// Extensions `parquet`, `pq`.
    Parquet,
    /// An explicit list of extensions (stored without a leading dot).
    Custom(Vec<String>),
}
162
163impl FileTypeFilter {
164    /// 解析文件类型过滤字符串
165    pub fn parse(s: &str) -> Result<Self> {
166        match s.to_lowercase().as_str() {
167            "csv" => Ok(FileTypeFilter::Csv),
168            "json" | "jsonl" => Ok(FileTypeFilter::Json),
169            "log" => Ok(FileTypeFilter::Log),
170            "code" => Ok(FileTypeFilter::Code),
171            "text" | "txt" => Ok(FileTypeFilter::Text),
172            "parquet" => Ok(FileTypeFilter::Parquet),
173            _ => {
174                // 支持逗号分隔的扩展名列表
175                let extensions: Vec<String> = s
176                    .split(',')
177                    .map(|ext| ext.trim().trim_start_matches('.').to_lowercase())
178                    .collect();
179                Ok(FileTypeFilter::Custom(extensions))
180            }
181        }
182    }
183
184    /// 获取该类型对应的文件扩展名列表
185    pub fn extensions(&self) -> Vec<&str> {
186        match self {
187            FileTypeFilter::Csv => vec!["csv", "tsv"],
188            FileTypeFilter::Json => vec!["json", "jsonl", "ndjson"],
189            FileTypeFilter::Log => vec!["log", "logs"],
190            FileTypeFilter::Code => vec![
191                "rs", "py", "js", "ts", "go", "java", "c", "cpp", "h", "hpp", "rb", "php", "swift",
192                "kt", "scala", "sh", "bash", "zsh",
193            ],
194            FileTypeFilter::Text => vec!["txt", "md", "rst", "text"],
195            FileTypeFilter::Parquet => vec!["parquet", "pq"],
196            FileTypeFilter::Custom(exts) => exts.iter().map(|s| s.as_str()).collect(),
197        }
198    }
199
200    /// 检查文件是否匹配类型过滤
201    pub fn matches(&self, path: &Path) -> bool {
202        let ext = path.extension().and_then(|e| e.to_str()).map(|e| e.to_lowercase());
203
204        match ext {
205            Some(ext) => self.extensions().contains(&ext.as_str()),
206            None => false,
207        }
208    }
209}
210
/// Information about a single entry found during a scan.
#[derive(Debug, Clone)]
pub struct ScannedFile {
    /// Path of the entry.
    pub path: PathBuf,
    /// Size in bytes, as reported by the entry's metadata.
    pub size: u64,
    /// Last-modification time; `None` when it could not be obtained.
    pub modified: Option<SystemTime>,
    /// Whether the entry is a directory.
    pub is_dir: bool,
}
223
224impl ScannedFile {
225    /// 从 DirEntry 创建 ScannedFile
226    fn from_entry(entry: &ignore::DirEntry) -> Option<Self> {
227        let metadata = entry.metadata().ok()?;
228        Some(ScannedFile {
229            path: entry.path().to_path_buf(),
230            size: metadata.len(),
231            modified: metadata.modified().ok(),
232            is_dir: metadata.is_dir(),
233        })
234    }
235}
236
/// Configuration for a file scan.
#[derive(Debug, Clone)]
pub struct ScanConfig {
    /// Root path where the scan starts.
    pub root: PathBuf,
    /// Optional file type filter.
    pub file_type: Option<FileTypeFilter>,
    /// Optional file size filter.
    pub size_filter: Option<SizeFilter>,
    /// Optional modification-time filter.
    pub mtime_filter: Option<MtimeFilter>,
    /// Maximum traversal depth (`None` means unlimited).
    pub max_depth: Option<usize>,
    /// Whether to follow symbolic links.
    pub follow_links: bool,
    /// Whether to honor `.gitignore` rules.
    pub respect_gitignore: bool,
    /// Number of parallel threads (0 means auto-detect).
    pub threads: usize,
    /// Whether to include hidden files.
    pub include_hidden: bool,
}
259
260impl Default for ScanConfig {
261    fn default() -> Self {
262        Self {
263            root: PathBuf::from("."),
264            file_type: None,
265            size_filter: None,
266            mtime_filter: None,
267            max_depth: None,
268            follow_links: false,
269            respect_gitignore: true,
270            threads: 0, // 自动检测
271            include_hidden: false,
272        }
273    }
274}
275
276impl ScanConfig {
277    /// 创建新的扫描配置
278    pub fn new<P: AsRef<Path>>(root: P) -> Self {
279        Self { root: root.as_ref().to_path_buf(), ..Default::default() }
280    }
281
282    /// 设置文件类型过滤
283    pub fn with_file_type(mut self, filter: FileTypeFilter) -> Self {
284        self.file_type = Some(filter);
285        self
286    }
287
288    /// 设置文件大小过滤
289    pub fn with_size_filter(mut self, filter: SizeFilter) -> Self {
290        self.size_filter = Some(filter);
291        self
292    }
293
294    /// 设置修改时间过滤
295    pub fn with_mtime_filter(mut self, filter: MtimeFilter) -> Self {
296        self.mtime_filter = Some(filter);
297        self
298    }
299
300    /// 设置最大深度
301    pub fn with_max_depth(mut self, depth: usize) -> Self {
302        self.max_depth = Some(depth);
303        self
304    }
305
306    /// 设置是否跟随符号链接
307    pub fn with_follow_links(mut self, follow: bool) -> Self {
308        self.follow_links = follow;
309        self
310    }
311
312    /// 设置是否遵守 .gitignore
313    pub fn with_respect_gitignore(mut self, respect: bool) -> Self {
314        self.respect_gitignore = respect;
315        self
316    }
317
318    /// 设置并行线程数
319    pub fn with_threads(mut self, threads: usize) -> Self {
320        self.threads = threads;
321        self
322    }
323
324    /// 设置是否包含隐藏文件
325    pub fn with_include_hidden(mut self, include: bool) -> Self {
326        self.include_hidden = include;
327        self
328    }
329}
330
/// Statistics collected during a scan.
///
/// Derives `PartialEq`/`Eq` so two sets of statistics can be compared directly
/// (for example with `assert_eq!`).
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct ScanStats {
    /// Total number of files scanned.
    pub total_files: usize,
    /// Number of files that matched the filters.
    pub matched_files: usize,
    /// Number of directories scanned.
    pub directories: usize,
    /// Number of files skipped because they did not match the filters.
    pub skipped: usize,
    /// Number of errors.
    pub errors: usize,
    /// Total file size in bytes.
    pub total_size: u64,
    /// Elapsed scan time in milliseconds.
    pub elapsed_ms: u64,
}
349
350/// 文件扫描器
351pub struct FileScanner {
352    config: ScanConfig,
353}
354
355impl FileScanner {
356    /// 创建新的文件扫描器
357    pub fn new(config: ScanConfig) -> Self {
358        Self { config }
359    }
360
361    /// 构建 WalkBuilder
362    fn build_walker(&self) -> WalkBuilder {
363        let mut builder = WalkBuilder::new(&self.config.root);
364
365        // 配置并行线程数
366        let threads = if self.config.threads == 0 { num_cpus::get() } else { self.config.threads };
367        builder.threads(threads);
368
369        // 配置 gitignore
370        builder.git_ignore(self.config.respect_gitignore);
371        builder.git_global(self.config.respect_gitignore);
372        builder.git_exclude(self.config.respect_gitignore);
373
374        // 配置隐藏文件
375        builder.hidden(!self.config.include_hidden);
376
377        // 配置符号链接
378        builder.follow_links(self.config.follow_links);
379
380        // 配置最大深度
381        if let Some(depth) = self.config.max_depth {
382            builder.max_depth(Some(depth));
383        }
384
385        // 添加标准忽略文件
386        builder.add_custom_ignore_filename(".xoreignore");
387
388        builder
389    }
390
391    /// 检查文件是否匹配所有过滤条件
392    fn matches_filters(&self, file: &ScannedFile) -> bool {
393        // 跳过目录
394        if file.is_dir {
395            return false;
396        }
397
398        // 检查文件类型
399        if let Some(ref filter) = self.config.file_type {
400            if !filter.matches(&file.path) {
401                return false;
402            }
403        }
404
405        // 检查文件大小
406        if let Some(ref filter) = self.config.size_filter {
407            if !filter.matches(file.size) {
408                return false;
409            }
410        }
411
412        // 检查修改时间
413        if let Some(ref filter) = self.config.mtime_filter {
414            if let Some(mtime) = file.modified {
415                if !filter.matches(mtime) {
416                    return false;
417                }
418            } else {
419                // 无法获取修改时间,跳过
420                return false;
421            }
422        }
423
424        true
425    }
426
427    /// 执行扫描(并行版本)
428    #[instrument(skip(self), fields(root = %self.config.root.display()))]
429    pub fn scan(&self) -> Result<(Vec<ScannedFile>, ScanStats)> {
430        let start = std::time::Instant::now();
431        let mut stats = ScanStats::default();
432
433        info!("Starting file scan at {:?}", self.config.root);
434        debug!("Scan config: {:?}", self.config);
435
436        let walker = self.build_walker();
437
438        // 收集所有条目
439        let entries: Vec<_> = walker
440            .build()
441            .filter_map(|entry| match entry {
442                Ok(e) => Some(e),
443                Err(err) => {
444                    warn!("Error accessing entry: {}", err);
445                    None
446                }
447            })
448            .collect();
449
450        // 使用 Rayon 并行处理
451        let results: Vec<(Option<ScannedFile>, bool, bool)> = entries
452            .par_iter()
453            .map(|entry| {
454                let is_dir = entry.file_type().map(|t| t.is_dir()).unwrap_or(false);
455
456                match ScannedFile::from_entry(entry) {
457                    Some(file) => {
458                        let matches = self.matches_filters(&file);
459                        (Some(file), is_dir, matches)
460                    }
461                    None => (None, is_dir, false),
462                }
463            })
464            .collect();
465
466        // 汇总结果
467        let mut matched_files = Vec::new();
468
469        for (file, is_dir, matches) in results {
470            if is_dir {
471                stats.directories += 1;
472            } else {
473                stats.total_files += 1;
474            }
475
476            if let Some(f) = file {
477                stats.total_size += f.size;
478
479                if matches {
480                    stats.matched_files += 1;
481                    matched_files.push(f);
482                } else if !is_dir {
483                    stats.skipped += 1;
484                }
485            }
486        }
487
488        stats.elapsed_ms = start.elapsed().as_millis() as u64;
489
490        info!(
491            "Scan completed: {} files matched out of {} total ({} ms)",
492            stats.matched_files, stats.total_files, stats.elapsed_ms
493        );
494
495        Ok((matched_files, stats))
496    }
497
498    /// 执行扫描并返回迭代器(内存友好版本)
499    pub fn scan_iter(&self) -> impl Iterator<Item = Result<ScannedFile>> + '_ {
500        let walker = self.build_walker();
501
502        walker.build().filter_map(move |entry| match entry {
503            Ok(e) => {
504                let file = ScannedFile::from_entry(&e)?;
505                if self.matches_filters(&file) {
506                    Some(Ok(file))
507                } else {
508                    None
509                }
510            }
511            Err(err) => Some(Err(anyhow::anyhow!("Error accessing entry: {}", err))),
512        })
513    }
514}
515
516/// 解析大小字符串为字节数
517/// 支持格式:1024, 1KB, 1.5MB, 2GB
518fn parse_size(s: &str) -> Result<u64> {
519    let s = s.trim().to_uppercase();
520
521    // 尝试直接解析为数字(纯字节)
522    if let Ok(bytes) = s.parse::<u64>() {
523        return Ok(bytes);
524    }
525
526    // 解析带单位的格式
527    let (num_str, unit) = if s.ends_with("GB") {
528        (&s[..s.len() - 2], 1024 * 1024 * 1024)
529    } else if s.ends_with("MB") {
530        (&s[..s.len() - 2], 1024 * 1024)
531    } else if s.ends_with("KB") {
532        (&s[..s.len() - 2], 1024)
533    } else if s.ends_with('B') {
534        (&s[..s.len() - 1], 1)
535    } else {
536        return Err(anyhow::anyhow!(
537            "Invalid size format: {}. Use format like '1MB', '500KB', '2GB'",
538            s
539        ));
540    };
541
542    let num: f64 =
543        num_str.trim().parse().with_context(|| format!("Invalid number in size: {}", num_str))?;
544
545    Ok((num * unit as f64) as u64)
546}
547
// Unit tests for the filter parsers, their `matches` methods and `parse_size`.
#[cfg(test)]
mod tests {
    use super::*;

    // Parsing and matching of `SizeFilter`.
    mod size_filter_tests {
        use super::*;

        // A bare number is parsed as an equality filter.
        #[test]
        fn test_parse_bytes() {
            let filter = SizeFilter::parse("1024").unwrap();
            assert!(matches!(filter, SizeFilter::Equal(1024)));
        }

        #[test]
        fn test_parse_kb() {
            let filter = SizeFilter::parse(">1KB").unwrap();
            assert!(matches!(filter, SizeFilter::GreaterThan(1024)));
        }

        #[test]
        fn test_parse_mb() {
            let filter = SizeFilter::parse("<10MB").unwrap();
            assert!(matches!(filter, SizeFilter::LessThan(10485760)));
        }

        #[test]
        fn test_parse_range() {
            let filter = SizeFilter::parse("1MB-10MB").unwrap();
            match filter {
                SizeFilter::Between(min, max) => {
                    assert_eq!(min, 1024 * 1024);
                    assert_eq!(max, 10 * 1024 * 1024);
                }
                _ => panic!("Expected Between filter"),
            }
        }

        // Tests for the prefix syntax: gt:, lt:, eq:
        #[test]
        fn test_parse_gt_syntax() {
            let filter = SizeFilter::parse("gt:1MB").unwrap();
            assert!(matches!(filter, SizeFilter::GreaterThan(1048576)));
        }

        #[test]
        fn test_parse_lt_syntax() {
            let filter = SizeFilter::parse("lt:500KB").unwrap();
            assert!(matches!(filter, SizeFilter::LessThan(512000)));
        }

        #[test]
        fn test_parse_eq_syntax() {
            let filter = SizeFilter::parse("eq:1GB").unwrap();
            assert!(matches!(filter, SizeFilter::Equal(1073741824)));
        }

        // Whitespace between the prefix and the size is tolerated.
        #[test]
        fn test_gt_syntax_with_spaces() {
            let filter = SizeFilter::parse("gt: 2MB").unwrap();
            assert!(matches!(filter, SizeFilter::GreaterThan(2097152)));
        }

        #[test]
        fn test_matches() {
            assert!(SizeFilter::GreaterThan(100).matches(200));
            assert!(!SizeFilter::GreaterThan(100).matches(50));
            assert!(SizeFilter::LessThan(100).matches(50));
            assert!(SizeFilter::Between(10, 100).matches(50));
            assert!(!SizeFilter::Between(10, 100).matches(5));
        }
    }

    // Parsing and matching of `MtimeFilter`.
    mod mtime_filter_tests {
        use super::*;

        #[test]
        fn test_parse_within_days() {
            let filter = MtimeFilter::parse("-7d").unwrap();
            assert!(matches!(filter, MtimeFilter::WithinDays(7)));
        }

        #[test]
        fn test_parse_older_than_days() {
            let filter = MtimeFilter::parse("+30d").unwrap();
            assert!(matches!(filter, MtimeFilter::OlderThanDays(30)));
        }

        // An absolute date is parsed as an `After` filter.
        #[test]
        fn test_parse_date() {
            let filter = MtimeFilter::parse("2024-01-01").unwrap();
            assert!(matches!(filter, MtimeFilter::After(_)));
        }

        // A 3-day-old timestamp is within 7 days; a 10-day-old one is not.
        #[test]
        fn test_within_days_matches() {
            let filter = MtimeFilter::WithinDays(7);
            let recent = SystemTime::now() - Duration::from_secs(3 * 24 * 60 * 60);
            let old = SystemTime::now() - Duration::from_secs(10 * 24 * 60 * 60);

            assert!(filter.matches(recent));
            assert!(!filter.matches(old));
        }
    }

    // Parsing and matching of `FileTypeFilter`.
    mod file_type_filter_tests {
        use super::*;

        #[test]
        fn test_parse_csv() {
            let filter = FileTypeFilter::parse("csv").unwrap();
            assert_eq!(filter, FileTypeFilter::Csv);
        }

        // Unknown names are treated as a comma-separated extension list.
        #[test]
        fn test_parse_custom() {
            let filter = FileTypeFilter::parse("xml,yaml,toml").unwrap();
            match filter {
                FileTypeFilter::Custom(exts) => {
                    assert_eq!(exts, vec!["xml", "yaml", "toml"]);
                }
                _ => panic!("Expected Custom filter"),
            }
        }

        #[test]
        fn test_matches_csv() {
            let filter = FileTypeFilter::Csv;
            assert!(filter.matches(Path::new("data.csv")));
            assert!(filter.matches(Path::new("data.tsv")));
            assert!(!filter.matches(Path::new("data.json")));
        }

        #[test]
        fn test_matches_code() {
            let filter = FileTypeFilter::Code;
            assert!(filter.matches(Path::new("main.rs")));
            assert!(filter.matches(Path::new("app.py")));
            assert!(!filter.matches(Path::new("data.csv")));
        }
    }

    // Direct tests of the `parse_size` helper.
    mod parse_size_tests {
        use super::*;

        #[test]
        fn test_parse_pure_bytes() {
            assert_eq!(parse_size("1024").unwrap(), 1024);
        }

        // Units are case-insensitive.
        #[test]
        fn test_parse_kb() {
            assert_eq!(parse_size("1KB").unwrap(), 1024);
            assert_eq!(parse_size("1kb").unwrap(), 1024);
        }

        #[test]
        fn test_parse_mb() {
            assert_eq!(parse_size("1MB").unwrap(), 1024 * 1024);
        }

        #[test]
        fn test_parse_gb() {
            assert_eq!(parse_size("1GB").unwrap(), 1024 * 1024 * 1024);
        }

        #[test]
        fn test_parse_decimal() {
            assert_eq!(parse_size("1.5MB").unwrap(), (1.5 * 1024.0 * 1024.0) as u64);
        }
    }
}