use anyhow::Result;
use ignore::{DirEntry, WalkBuilder};
use std::fs;
use std::path::{Path, PathBuf};
/// Tunable limits and preferences that drive a [`FileCollector`] walk.
#[derive(Debug, Clone)]
pub struct CollectorConfig {
    /// Largest file size, in bytes, that may be collected; files whose
    /// on-disk length exceeds this value are skipped.
    pub max_file_size: usize,
    /// Soft cap on the number of files. The collector stops walking once it
    /// has gathered twice this many candidates.
    pub max_files: usize,
    /// File extensions (without the leading dot) whose files are placed ahead
    /// of all others in the result. They are compared against the lowercased
    /// extension of each file, so entries should be written in lowercase.
    pub priority_extensions: Vec<&'static str>,
    /// Values registered with the walker through
    /// `WalkBuilder::add_custom_ignore_filename`.
    ///
    /// NOTE(review): despite the field name, that API treats each value as the
    /// *name of an ignore file* to look for in every directory, not as a glob
    /// pattern. Confirm that callers pass file names such as `.myignore`.
    pub ignore_patterns: Vec<&'static str>,
}
/// Walks a directory tree and gathers candidate files according to a
/// [`CollectorConfig`].
///
/// The collector holds only its configuration, so it is cheap to clone and
/// can be printed with `{:?}` for diagnostics.
#[derive(Debug, Clone)]
pub struct FileCollector {
    /// Limits and preferences applied to every walk.
    config: CollectorConfig,
}
impl FileCollector {
    /// Creates a collector that applies `config` to every walk.
    pub fn new(config: CollectorConfig) -> Self {
        Self { config }
    }

    /// Collects candidate files under `root_path`.
    ///
    /// The directory walk is synchronous filesystem work, so it is moved onto
    /// Tokio's blocking thread pool via `spawn_blocking` instead of running on
    /// the async executor.
    ///
    /// # Errors
    ///
    /// Returns an error if the blocking task cannot be joined, or if the walk
    /// itself reports an error.
    pub async fn collect_files(&self, root_path: &Path) -> Result<Vec<PathBuf>> {
        // The blocking closure must be `'static`, so it gets owned copies.
        let root_path = root_path.to_path_buf();
        let config = self.config.clone();
        tokio::task::spawn_blocking(move || Self::collect_files_sync(&config, &root_path)).await?
    }

    /// Synchronously walks `root_path` and returns the collected files, with
    /// files matching `config.priority_extensions` listed before all others.
    ///
    /// Ignore rules (`.ignore`, `.gitignore`, global gitignore, git exclude)
    /// are honoured, hidden files are included, and ignore files in parent
    /// directories of `root_path` are not consulted. Directories rejected by
    /// [`Self::should_include_entry`] are pruned, so nothing beneath them is
    /// visited. Files larger than `config.max_file_size` bytes are skipped.
    /// The walk stops once `config.max_files * 2` files have been gathered.
    ///
    /// # Errors
    ///
    /// Propagates the first error yielded by the directory walker.
    fn collect_files_sync(config: &CollectorConfig, root_path: &Path) -> Result<Vec<PathBuf>> {
        // Saturate rather than overflow for very large `max_files` values.
        let file_limit = config.max_files.saturating_mul(2);
        if file_limit == 0 {
            // A zero limit means nothing may be collected; skip the walk.
            return Ok(Vec::new());
        }
        // `usize` is never wider than `u64`, so this cast is lossless.
        let max_len = config.max_file_size as u64;

        let mut walker = WalkBuilder::new(root_path);
        walker
            .standard_filters(true)
            .hidden(false)
            .parents(false)
            .ignore(true)
            .git_ignore(true)
            .git_global(true)
            .git_exclude(true)
            // Registering the predicate on the walker (instead of testing each
            // yielded entry) stops the walker from descending into rejected
            // directories such as `.git` or `node_modules`.
            .filter_entry(Self::should_include_entry);
        // NOTE(review): `add_custom_ignore_filename` expects the *name of an
        // ignore file* (e.g. `.myignore`), not a glob pattern. Confirm that
        // `ignore_patterns` really holds file names.
        for pattern in &config.ignore_patterns {
            walker.add_custom_ignore_filename(pattern);
        }

        let mut priority_files = Vec::new();
        let mut other_files = Vec::new();

        for result in walker.build() {
            let entry = result?;
            let path = entry.path();

            // One stat call answers both "is it a regular file?" and "how big
            // is it?". Entries that cannot be stat'ed are skipped.
            let metadata = match fs::metadata(path) {
                Ok(metadata) if metadata.is_file() => metadata,
                _ => continue,
            };
            if metadata.len() > max_len {
                continue;
            }

            let is_priority = path.extension().map_or(false, |ext| {
                let ext = ext.to_string_lossy().to_lowercase();
                config
                    .priority_extensions
                    .iter()
                    .any(|&wanted| wanted == ext.as_str())
            });

            // Take ownership of the walker's path instead of cloning it.
            let path = entry.into_path();
            if is_priority {
                priority_files.push(path);
            } else {
                other_files.push(path);
            }

            if priority_files.len() + other_files.len() >= file_limit {
                break;
            }
        }

        // Priority files first, everything else after, each in walk order.
        priority_files.extend(other_files);
        Ok(priority_files)
    }

    /// Returns `false` for directories whose name is on the built-in skip
    /// list (VCS metadata, dependency caches, build output, IDE folders, ...)
    /// and `true` for every other entry, including all non-directories.
    fn should_include_entry(entry: &DirEntry) -> bool {
        const SKIP_DIRS: [&str; 24] = [
            "node_modules",
            "target",
            "dist",
            "build",
            ".git",
            ".svn",
            ".hg",
            "venv",
            ".venv",
            "env",
            ".env",
            "__pycache__",
            ".pytest_cache",
            ".mypy_cache",
            ".tox",
            "vendor",
            "bower_components",
            ".idea",
            ".vscode",
            "coverage",
            ".coverage",
            "htmlcov",
            ".gradle",
            ".cargo",
        ];

        let path = entry.path();
        if !path.is_dir() {
            return true;
        }
        let dir_name = path.file_name().unwrap_or_default().to_string_lossy();
        !SKIP_DIRS.iter().any(|&skip| dir_name == skip)
    }
}