use globset::{Glob, GlobSet, GlobSetBuilder};
use ignore::WalkBuilder;
use std::{
path::{Path, PathBuf},
sync::mpsc,
};
/// Options controlling which files [`walk`] reports.
#[derive(Debug, Clone)]
pub struct WalkConfig {
    /// Root paths to traverse; each one is walked independently.
    pub paths: Vec<PathBuf>,
    /// Allowed format names. When non-empty, a file is kept only if its
    /// detected format equals one of these entries.
    pub extensions: Vec<String>,
    /// Glob patterns for paths to exclude. Each pattern is matched both as
    /// written (minus leading `/`) and prefixed with `**/`.
    pub ignore_patterns: Vec<String>,
    /// Maximum file size in bytes; larger files are skipped. `None` disables
    /// the check.
    pub max_size: Option<u64>,
    /// Minimum number of `\n` bytes a file must contain. `None` disables the
    /// check.
    pub min_lines: Option<usize>,
    /// Maximum number of `\n` bytes a file may contain. `None` disables the
    /// check.
    pub max_lines: Option<usize>,
    /// Whether the walker follows symbolic links. When `false`, symlinked
    /// files are skipped.
    pub follow_symlinks: bool,
    /// When `true`, `.gitignore` rules are not applied during the walk.
    pub no_gitignore: bool,
}
impl Default for WalkConfig {
fn default() -> Self {
Self {
paths: vec![],
extensions: vec![],
ignore_patterns: vec![],
max_size: Some(512 * 1024),
min_lines: None,
max_lines: None,
follow_symlinks: false,
no_gitignore: false,
}
}
}
/// A file accepted by the walker, paired with its detected format.
#[derive(Debug, Clone)]
pub struct DiscoveredFile {
    /// Path of the file as reported by the directory walker.
    pub path: PathBuf,
    /// Format name resolved from the file's extension or shebang line.
    pub format: String,
}
/// Compiles user-supplied ignore patterns into a single [`GlobSet`].
///
/// Leading `/` characters are stripped from every pattern, and each pattern
/// is registered twice: once as given and once prefixed with `**/` so that it
/// can also match below any directory. Patterns that fail to parse are
/// silently dropped; if the set itself cannot be built, an empty set is
/// returned.
fn build_ignore_glob_set(patterns: &[String]) -> GlobSet {
    let mut set_builder = GlobSetBuilder::new();
    for raw in patterns {
        let anchored = raw.trim_start_matches('/');
        let any_depth = format!("**/{}", anchored);
        // Keep the registration order: the plain form first, then `**/` form.
        for candidate in [anchored, any_depth.as_str()] {
            if let Ok(glob) = Glob::new(candidate) {
                set_builder.add(glob);
            }
        }
    }
    match set_builder.build() {
        Ok(set) => set,
        Err(_) => GlobSet::empty(),
    }
}
/// Walks every root in `config.paths` and returns all accepted files.
///
/// Roots are processed one after another in the order given; the files found
/// under each root are appended to the same result vector.
pub fn walk(config: &WalkConfig) -> Vec<DiscoveredFile> {
    config.paths.iter().fold(Vec::new(), |mut found, root| {
        walk_one(root, config, &mut found);
        found
    })
}
/// Walks a single `root` in parallel and appends every accepted file to
/// `results`.
///
/// A walker entry is accepted when all of the following hold:
/// - its path does not match any of `config.ignore_patterns`,
/// - it is a regular file (and not a symlink unless `config.follow_symlinks`),
/// - its size does not exceed `config.max_size`, when set,
/// - a format can be detected for it and passes `config.extensions`,
/// - its newline count lies within `config.min_lines` / `config.max_lines`,
///   when set.
///
/// Entries the walker reports as errors, and files that cannot be read for
/// line counting, are skipped silently. Because worker threads send results
/// through a channel, the order of files appended to `results` is not
/// deterministic.
fn walk_one(root: &Path, config: &WalkConfig, results: &mut Vec<DiscoveredFile>) {
    let mut builder = WalkBuilder::new(root);
    builder.follow_links(config.follow_symlinks);
    builder.git_ignore(!config.no_gitignore);
    // Turn off the walker's hidden-file filter so dotfiles are visited too.
    builder.hidden(false);

    let ignore_set = build_ignore_glob_set(&config.ignore_patterns);
    let (tx, rx) = mpsc::channel::<DiscoveredFile>();

    // Copy the scalar options out of `config` so the worker closures do not
    // borrow it.
    let follow_symlinks = config.follow_symlinks;
    let max_size = config.max_size;
    let min_lines = config.min_lines;
    let max_lines = config.max_lines;
    let extensions = config.extensions.clone();

    builder.build_parallel().run(move || {
        // Called once per worker thread: give each its own sender and its own
        // copies of the filters.
        let tx = tx.clone();
        let extensions = extensions.clone();
        let ignore_set = ignore_set.clone();
        Box::new(move |entry_result| {
            use ignore::WalkState;

            let entry = match entry_result {
                Ok(e) => e,
                Err(_) => return WalkState::Continue,
            };
            let path = entry.path().to_path_buf();

            // The glob match is purely in-memory, so run it before any
            // filesystem access: ignored paths then cost no stat calls and
            // are never opened for shebang sniffing. Every filter below only
            // rejects, so the order does not change which files are accepted.
            if !ignore_set.is_empty() && ignore_set.is_match(&path) {
                return WalkState::Continue;
            }

            // One metadata lookup (following symlinks) serves both the
            // regular-file test and the size limit.
            let meta = match std::fs::metadata(&path) {
                Ok(m) if m.is_file() => m,
                _ => return WalkState::Continue,
            };

            if !follow_symlinks {
                if let Ok(link_meta) = std::fs::symlink_metadata(&path) {
                    if link_meta.file_type().is_symlink() {
                        return WalkState::Continue;
                    }
                }
            }

            if max_size.is_some_and(|max| meta.len() > max) {
                return WalkState::Continue;
            }

            let format = match detect_format(&path, &extensions) {
                Some(f) => f,
                None => return WalkState::Continue,
            };

            // Only read the file contents when a line-count bound is set.
            if min_lines.is_some() || max_lines.is_some() {
                match std::fs::read(&path) {
                    Ok(bytes) => {
                        let lc = count_lines(&bytes);
                        if min_lines.is_some_and(|m| lc < m) {
                            return WalkState::Continue;
                        }
                        if max_lines.is_some_and(|m| lc > m) {
                            return WalkState::Continue;
                        }
                    }
                    Err(_) => return WalkState::Continue,
                }
            }

            // A send can only fail if the receiver is gone; nothing useful
            // can be done about that here.
            let _ = tx.send(DiscoveredFile { path, format });
            WalkState::Continue
        })
    });

    // `tx` was moved into the factory closure handed to `run`, so every
    // sender has been dropped by now and draining `rx` terminates.
    results.extend(rx);
}
/// Returns the number of `\n` bytes in `bytes`.
///
/// A final line without a trailing newline is therefore not counted.
fn count_lines(bytes: &[u8]) -> usize {
    let mut newlines = 0usize;
    for &byte in bytes {
        if byte == b'\n' {
            newlines += 1;
        }
    }
    newlines
}
/// Determines the format name for `path`, subject to an optional allow-list.
///
/// The file extension is looked up first. If that yields nothing, the first
/// line of the file is read and, when it starts with `#!`, looked up as a
/// shebang. Only the first line is read from disk, so large files are not
/// loaded in full and invalid UTF-8 after the first line does not prevent
/// detection.
///
/// Returns `None` when no format is found, when the file cannot be opened or
/// its first line is not valid UTF-8, or when `filter` is non-empty and does
/// not contain the detected format name.
fn detect_format(path: &Path, filter: &[String]) -> Option<String> {
    let fmt = path
        .extension()
        .and_then(|e| e.to_str())
        .and_then(|ext| cpd_tokenizer::formats::get_format_by_extension(ext))
        .map(|s| s.to_string())
        .or_else(|| {
            use std::io::BufRead;

            let file = std::fs::File::open(path).ok()?;
            // `lines()` strips the trailing `\n` / `\r\n`, and yields `None`
            // for an empty file.
            let first_line = std::io::BufReader::new(file).lines().next()?.ok()?;
            if !first_line.starts_with("#!") {
                return None;
            }
            let detected = cpd_tokenizer::formats::get_format_by_shebang(first_line.as_str())
                .map(|s| s.to_string());
            detected
        })?;
    // `filter` holds format names, not raw file extensions.
    if !filter.is_empty() && !filter.iter().any(|e| e == &fmt) {
        return None;
    }
    Some(fmt)
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Directory holding the on-disk fixtures used by the walker tests.
    fn fixtures() -> PathBuf {
        let manifest_dir = Path::new(env!("CARGO_MANIFEST_DIR"));
        manifest_dir.join("tests/fixtures/walker")
    }

    /// Returns the fixture directory, or `None` when it is absent so that
    /// fixture-dependent tests can bail out early.
    fn existing_fixtures() -> Option<PathBuf> {
        let dir = fixtures();
        dir.exists().then_some(dir)
    }

    #[test]
    fn walks_js_and_ts_files() {
        let Some(root) = existing_fixtures() else {
            return;
        };
        let found = walk(&WalkConfig {
            paths: vec![root],
            ..Default::default()
        });
        let has_format = |name: &str| found.iter().any(|f| f.format == name);
        assert!(has_format("javascript"), "must find JS");
        assert!(has_format("typescript"), "must find TS");
    }

    #[test]
    fn nonexistent_path_returns_empty() {
        let missing = PathBuf::from("/tmp/cpd-nonexistent-xyz");
        let found = walk(&WalkConfig {
            paths: vec![missing],
            ..Default::default()
        });
        assert!(found.is_empty());
    }

    #[test]
    fn max_size_zero_excludes_all_files() {
        let Some(root) = existing_fixtures() else {
            return;
        };
        let found = walk(&WalkConfig {
            paths: vec![root],
            max_size: Some(0),
            ..Default::default()
        });
        assert!(found.is_empty(), "max_size=0 must exclude all");
    }

    #[test]
    fn extension_filter_limits_to_js_only() {
        let Some(root) = existing_fixtures() else {
            return;
        };
        let found = walk(&WalkConfig {
            paths: vec![root],
            extensions: vec!["javascript".to_string()],
            ..Default::default()
        });
        let only_js = found.iter().all(|f| f.format == "javascript");
        assert!(only_js, "extension filter must return only JS files");
    }

    #[test]
    fn ignore_glob_pattern_excludes_matching_paths() {
        let Some(root) = existing_fixtures() else {
            return;
        };
        let found = walk(&WalkConfig {
            paths: vec![root],
            ignore_patterns: vec!["*.js".to_string()],
            ..Default::default()
        });
        let no_js = found.iter().all(|f| f.format != "javascript");
        assert!(no_js, "*.js glob pattern must exclude all JS files");
    }

    #[test]
    fn count_lines_counts_newlines() {
        // Only `\n` bytes are counted, so an unterminated last line is not.
        let cases: [(&[u8], usize); 3] = [(b"a\nb\nc\n", 3), (b"a\nb\nc", 2), (b"", 0)];
        for (input, expected) in cases {
            assert_eq!(count_lines(input), expected);
        }
    }
}