use std::collections::BTreeSet;
use std::path::{Path, PathBuf};
use crate::ports::FileSystemProvider;
use super::classification::{
DATA_FILE_GLOB_PATTERNS, MAX_DATA_FILE_BYTES, MAX_DISCOVERED_DATA_FILES,
MAX_DISCOVERED_SCRIPTS, SCRIPT_DISCOVERY_SUBDIRS, SCRIPT_GLOB_PATTERNS, SKIP_DISCOVERY_DIRS,
};
use super::{FileDiscoveryService, MAX_DISCOVERY_DEPTH};
/// File names treated as package manifests / build or tool configuration.
///
/// Every entry MUST be lower-case: both `is_manifest_or_lockfile` and
/// `discover_files_by_name` lower-case the candidate file name (ASCII only)
/// before calling `contains`, so a mixed-case entry such as `Cargo.toml`
/// or `Dockerfile` would never match.
const MANIFEST_NAMES: &[&str] = &[
"package.json",
"mcp.json",
"mcp.yaml",
"mcp.yml",
"requirements.txt",
"pyproject.toml",
"cargo.toml",
"dockerfile",
"docker-compose.yml",
"docker-compose.yaml",
"makefile",
"gnumakefile",
".npmrc",
"pip.conf",
];
/// File names treated as dependency lockfiles.
///
/// Every entry MUST be lower-case: lookups against this table lower-case
/// the candidate file name (ASCII only) before comparing, so `Cargo.lock`
/// and `Pipfile.lock` on disk match the `cargo.lock` / `pipfile.lock`
/// entries here.
const LOCKFILE_NAMES: &[&str] = &[
"package-lock.json",
"cargo.lock",
"poetry.lock",
"uv.lock",
"pipfile.lock",
"yarn.lock",
"pnpm-lock.yaml",
"npm-shrinkwrap.json",
];
/// Reports whether the final component of `path` is a known manifest or
/// lockfile name.
///
/// The file name is converted lossily to text and ASCII-lower-cased before
/// being looked up in `MANIFEST_NAMES` and `LOCKFILE_NAMES`, so the match is
/// ASCII-case-insensitive. A path without a final component (for example one
/// ending in `..`) is never a match.
fn is_manifest_or_lockfile(path: &Path) -> bool {
    path.file_name()
        .map(|os_name| os_name.to_string_lossy().to_ascii_lowercase())
        .is_some_and(|candidate| {
            MANIFEST_NAMES
                .iter()
                .chain(LOCKFILE_NAMES)
                .any(|known| *known == candidate)
        })
}
impl<F: FileSystemProvider> FileDiscoveryService<F> {
    /// Discovers script files belonging to the package at `package_root`.
    ///
    /// Candidates are matched against `SCRIPT_GLOB_PATTERNS`; at most
    /// `MAX_DISCOVERED_SCRIPTS` paths are returned and no size limit applies.
    pub fn discover_package_scripts(&self, package_root: &Path) -> Vec<PathBuf> {
        self.discover_by_patterns(
            package_root,
            SCRIPT_GLOB_PATTERNS,
            MAX_DISCOVERED_SCRIPTS,
            None,
        )
    }

    /// Discovers data files belonging to the package at `package_root`.
    ///
    /// Candidates are matched against `DATA_FILE_GLOB_PATTERNS`, must not be
    /// larger than `MAX_DATA_FILE_BYTES`, and must not carry a manifest or
    /// lockfile name (those are reported by the dedicated manifest/lockfile
    /// discovery methods instead).
    ///
    /// At most `MAX_DISCOVERED_DATA_FILES` paths are returned. Manifest and
    /// lockfile names are rejected *before* they are counted against that
    /// cap, so they can no longer crowd genuine data files out of the result.
    pub fn discover_package_data_files(&self, package_root: &Path) -> Vec<PathBuf> {
        self.discover_filtered(
            package_root,
            DATA_FILE_GLOB_PATTERNS,
            MAX_DISCOVERED_DATA_FILES,
            Some(MAX_DATA_FILE_BYTES),
            |candidate| !is_manifest_or_lockfile(candidate),
        )
    }

    /// Pattern-based discovery that accepts every matching file.
    ///
    /// Thin wrapper around [`Self::discover_filtered`] with an always-true
    /// predicate; see that method for the traversal rules.
    fn discover_by_patterns(
        &self,
        package_root: &Path,
        patterns: &[&str],
        cap: usize,
        max_bytes: Option<u64>,
    ) -> Vec<PathBuf> {
        self.discover_filtered(package_root, patterns, cap, max_bytes, |_| true)
    }

    /// Core pattern-based discovery.
    ///
    /// * `package_root` is listed non-recursively, once per entry of
    ///   `patterns`, through the provider's `list_files`.
    /// * Every `SCRIPT_DISCOVERY_SUBDIRS` entry that exists below the root is
    ///   walked recursively through the provider's `walk_files`; walked files
    ///   are kept only if their extension matches (ASCII-case-insensitively)
    ///   one of the `*.ext` patterns. Patterns that are not of the `*.ext`
    ///   form are ignored for walked files, and if *no* pattern has that form
    ///   every walked file is a candidate.
    /// * `accept` is consulted for each candidate; rejected files are skipped
    ///   without consuming the cap and without a metadata lookup.
    /// * When `max_bytes` is `Some`, files larger than the limit, or whose
    ///   metadata cannot be read, are skipped.
    ///
    /// Returns at most `cap` distinct paths in discovery order. Provider
    /// errors are logged and the affected root or pattern is skipped.
    fn discover_filtered(
        &self,
        package_root: &Path,
        patterns: &[&str],
        cap: usize,
        max_bytes: Option<u64>,
        accept: impl Fn(&Path) -> bool,
    ) -> Vec<PathBuf> {
        let mut results = Vec::new();
        let mut seen: BTreeSet<PathBuf> = BTreeSet::new();

        // Depends only on `patterns`, so build it once rather than per root.
        let extensions: Vec<&str> = patterns
            .iter()
            .filter_map(|p| p.strip_prefix("*."))
            .collect();

        let mut roots: Vec<(PathBuf, bool)> =
            Vec::with_capacity(1 + SCRIPT_DISCOVERY_SUBDIRS.len());
        roots.push((package_root.to_path_buf(), false));
        for subdir in SCRIPT_DISCOVERY_SUBDIRS {
            let candidate = package_root.join(subdir);
            if self.fs_provider.exists(&candidate) {
                roots.push((candidate, true));
            }
        }

        for (root, recursive) in &roots {
            // Nothing more can be added; do not touch the remaining roots.
            if results.len() >= cap {
                break;
            }
            let Some(files) = self.enumerate_root(root, *recursive, patterns) else {
                continue;
            };
            for file in files {
                if results.len() >= cap {
                    return results;
                }
                // `walk_files` is pattern-agnostic, so walked files still
                // need to be narrowed down by extension here.
                if *recursive && !extensions.is_empty() {
                    let matches_ext = file.extension().is_some_and(|ext| {
                        let ext = ext.to_string_lossy();
                        extensions
                            .iter()
                            .any(|wanted| wanted.eq_ignore_ascii_case(&ext))
                    });
                    if !matches_ext {
                        continue;
                    }
                }
                if !accept(&file) {
                    continue;
                }
                // Overlapping patterns can list the same path twice; skip the
                // repeat before paying for another metadata lookup.
                if seen.contains(&file) {
                    continue;
                }
                if let Some(limit) = max_bytes {
                    match self.fs_provider.metadata(&file) {
                        Ok(meta) if meta.len > limit => continue,
                        Ok(_) => {}
                        Err(err) => {
                            // Fail closed: a file whose size cannot be
                            // verified is not reported.
                            tracing::debug!(
                                file = %file.display(),
                                error = %err,
                                "file_discovery: skipping file with unavailable metadata"
                            );
                            continue;
                        }
                    }
                }
                if seen.insert(file.clone()) {
                    results.push(file);
                }
            }
        }
        results
    }

    /// Enumerates the candidate files of a single discovery root.
    ///
    /// Recursive roots are walked with the depth limit and skip list; a
    /// failed walk is logged and yields `None` so the caller skips the root.
    /// Non-recursive roots are listed once per pattern; a failed listing is
    /// logged and only that pattern is skipped.
    fn enumerate_root(
        &self,
        root: &Path,
        recursive: bool,
        patterns: &[&str],
    ) -> Option<Vec<PathBuf>> {
        if recursive {
            return match self
                .fs_provider
                .walk_files(root, MAX_DISCOVERY_DEPTH, SKIP_DISCOVERY_DIRS)
            {
                Ok(files) => Some(files),
                Err(err) => {
                    tracing::warn!(
                        path = %root.display(),
                        error = %err,
                        "discover_by_patterns: walk_files failed"
                    );
                    None
                }
            };
        }

        let mut combined = Vec::new();
        for pattern in patterns {
            match self.fs_provider.list_files(root, pattern, false) {
                Ok(files) => combined.extend(files),
                Err(err) => {
                    tracing::warn!(
                        path = %root.display(),
                        pattern = %pattern,
                        error = %err,
                        "discover_by_patterns: list_files failed"
                    );
                }
            }
        }
        Some(combined)
    }

    /// Recursively finds every file below `path` whose name is listed in
    /// `MANIFEST_NAMES` (ASCII-case-insensitive).
    pub(crate) fn discover_package_manifests(&self, path: &Path) -> Vec<PathBuf> {
        self.discover_files_by_name(path, MANIFEST_NAMES)
    }

    /// Recursively finds every file below `path` whose name is listed in
    /// `LOCKFILE_NAMES` (ASCII-case-insensitive).
    pub(crate) fn discover_lockfiles(&self, path: &Path) -> Vec<PathBuf> {
        self.discover_files_by_name(path, LOCKFILE_NAMES)
    }

    /// Walks `root` through the filesystem port, bounded by
    /// `MAX_DISCOVERY_DEPTH` and skipping `SKIP_DISCOVERY_DIRS`, and keeps
    /// the files whose lower-cased name appears in `names`.
    ///
    /// `names` must contain lower-case entries. A failed walk is logged and
    /// produces an empty result.
    fn discover_files_by_name(&self, root: &Path, names: &[&str]) -> Vec<PathBuf> {
        let walked = self.fs_provider.walk_files(root, MAX_DISCOVERY_DEPTH, SKIP_DISCOVERY_DIRS);
        let files = match walked {
            Ok(files) => files,
            Err(err) => {
                tracing::warn!("Skipping discovery walk in {}: {err}", root.display(),);
                return Vec::new();
            }
        };
        files
            .into_iter()
            .filter(|path| {
                path.file_name().is_some_and(|name| {
                    let lowered = name.to_string_lossy().to_ascii_lowercase();
                    names.contains(&lowered.as_str())
                })
            })
            .collect()
    }
}
// Unit tests for package artifact discovery. They drive the service through
// hand-written `FileSystemProvider` doubles and also scan this module's own
// source text to make sure filesystem access goes through the port.
#[cfg(test)]
mod tests {
use super::*;
/// Provider double whose `metadata` call always fails.
///
/// `list_files` returns the same `listed` paths for every path and pattern,
/// and `exists` always answers `true`. `walk_files` is not overridden here,
/// so whatever default the trait provides (not visible in this file) is used.
struct MetadataFailingFs {
listed: Vec<PathBuf>,
}
impl FileSystemProvider for MetadataFailingFs {
fn read_file_bytes(
&self,
_path: &Path,
) -> Result<crate::ports::FileContent, crate::ports::FileSystemError> {
Err(crate::ports::FileSystemError::PathNotFound(PathBuf::new()))
}
fn list_files(
&self,
_path: &Path,
_pattern: &str,
_recursive: bool,
) -> Result<Vec<PathBuf>, crate::ports::FileSystemError> {
Ok(self.listed.clone())
}
fn exists(&self, _path: &Path) -> bool {
true
}
fn metadata(
&self,
_path: &Path,
) -> Result<crate::ports::FileMeta, crate::ports::FileSystemError> {
Err(crate::ports::FileSystemError::PathNotFound(PathBuf::new()))
}
}
/// Data-file discovery applies a size limit, so a file whose metadata cannot
/// be read must be dropped rather than reported with an unverified size.
#[test]
fn discover_skips_file_when_metadata_unavailable() {
let listed = PathBuf::from("/virtual/big.yaml");
let fs = MetadataFailingFs {
listed: vec![listed.clone()],
};
let service = FileDiscoveryService::with_fs_provider(false, fs);
let results = service.discover_package_data_files(Path::new("/virtual"));
assert!(
!results.contains(&listed),
"file with unavailable metadata MUST be skipped, not included; got {results:?}"
);
}
/// Source-level guard: the non-test part of `package_artifacts.rs`
/// (presumably this very file - confirm if the module is ever renamed) must
/// call `self.fs_provider.metadata(` and must not call `std::fs::metadata(`
/// on any line that is not a `//` comment.
#[test]
fn file_discovery_does_not_call_std_fs_metadata_directly() {
let body = include_str!("package_artifacts.rs");
// Everything before the first test-cfg attribute counts as production code.
let production = body.split("#[cfg(test)]").next().unwrap_or(body);
assert!(
production.contains("self.fs_provider.metadata("),
"file_discovery must route metadata through the fs_provider port"
);
for (idx, line) in production.lines().enumerate() {
let trimmed = line.trim_start();
// Comment lines may mention the forbidden call without failing the guard.
if trimmed.starts_with("//") {
continue;
}
assert!(
!trimmed.contains("std::fs::metadata("),
"production line {} still calls std::fs::metadata directly: {line}",
idx + 1
);
}
}
/// Source-level guard: the non-test part of the file must call `walk_files`
/// on `fs_provider` and must not contain `WalkDir::new(` or
/// `.file_type().is_file()` on any non-comment line.
#[test]
fn file_discovery_does_not_use_walkdir_or_filetype_directly() {
let body = include_str!("package_artifacts.rs");
let production = body.split("#[cfg(test)]").next().unwrap_or(body);
// Collapse all whitespace runs to single spaces so a method chain that is
// split across lines can still be matched as one string.
let collapsed: String = production.split_whitespace().collect::<Vec<_>>().join(" ");
assert!(
collapsed.contains("self.fs_provider .walk_files(")
|| collapsed.contains("self.fs_provider.walk_files(")
|| collapsed.contains("self . fs_provider . walk_files (")
|| collapsed.contains(".fs_provider .walk_files(")
|| collapsed.contains(".fs_provider.walk_files("),
"discover_files_by_name must route the recursive walk through the fs_provider port",
);
for (idx, line) in production.lines().enumerate() {
let trimmed = line.trim_start();
if trimmed.starts_with("//") {
continue;
}
assert!(
!trimmed.contains("WalkDir::new("),
"production line {} still calls WalkDir::new directly: {line}",
idx + 1
);
assert!(
!trimmed.contains(".file_type().is_file()"),
"production line {} still inspects file_type via the WalkDir entry: {line}",
idx + 1
);
}
}
/// Manifest and lockfile discovery must each call `walk_files`, passing
/// `MAX_DISCOVERY_DEPTH` and `SKIP_DISCOVERY_DIRS` through unchanged.
#[test]
fn discover_package_manifests_invokes_walk_files_with_hardening_knobs() {
use std::sync::atomic::{AtomicUsize, Ordering};
use std::sync::{Arc, Mutex};
/// Provider double that records the arguments of every `walk_files` call
/// and always returns an empty file list.
struct WalkRecordingFs {
walk_calls: Arc<AtomicUsize>,
recorded_max_depth: Arc<Mutex<Vec<usize>>>,
recorded_skip_dirs: Arc<Mutex<Vec<Vec<String>>>>,
}
impl FileSystemProvider for WalkRecordingFs {
fn read_file_bytes(
&self,
_path: &Path,
) -> Result<crate::ports::FileContent, crate::ports::FileSystemError> {
Err(crate::ports::FileSystemError::PathNotFound(PathBuf::new()))
}
fn list_files(
&self,
_path: &Path,
_pattern: &str,
_recursive: bool,
) -> Result<Vec<PathBuf>, crate::ports::FileSystemError> {
Ok(Vec::new())
}
fn exists(&self, _path: &Path) -> bool {
true
}
fn walk_files(
&self,
_path: &Path,
max_depth: usize,
skip_dirs: &[&str],
) -> Result<Vec<PathBuf>, crate::ports::FileSystemError> {
self.walk_calls.fetch_add(1, Ordering::SeqCst);
self.recorded_max_depth
.lock()
.expect("WalkRecordingFs mutex poisoned")
.push(max_depth);
self.recorded_skip_dirs
.lock()
.expect("WalkRecordingFs mutex poisoned")
.push(skip_dirs.iter().map(|s| (*s).to_string()).collect());
Ok(Vec::new())
}
}
let fs = WalkRecordingFs {
walk_calls: Arc::new(AtomicUsize::new(0)),
recorded_max_depth: Arc::new(Mutex::new(Vec::new())),
recorded_skip_dirs: Arc::new(Mutex::new(Vec::new())),
};
// Keep handles to the recorders: `fs` itself is moved into the service below.
let calls = Arc::clone(&fs.walk_calls);
let max_depths = Arc::clone(&fs.recorded_max_depth);
let skip_dirs_log = Arc::clone(&fs.recorded_skip_dirs);
let service = FileDiscoveryService::with_fs_provider(false, fs);
let _ = service.discover_package_manifests(Path::new("/virtual/pkg"));
let _ = service.discover_lockfiles(Path::new("/virtual/pkg"));
assert!(
calls.load(Ordering::SeqCst) >= 2,
"manifest + lockfile discovery must invoke walk_files at least twice",
);
let depths = max_depths.lock().expect("poisoned");
assert!(
depths.iter().all(|d| *d == MAX_DISCOVERY_DEPTH),
"every walk_files call must forward MAX_DISCOVERY_DEPTH ({MAX_DISCOVERY_DEPTH}); got {depths:?}",
);
let skips = skip_dirs_log.lock().expect("poisoned");
let expected_skip: Vec<String> = SKIP_DISCOVERY_DIRS
.iter()
.map(|s| (*s).to_string())
.collect();
assert!(
skips.iter().all(|recorded| recorded == &expected_skip),
"every walk_files call must forward SKIP_DISCOVERY_DIRS verbatim; got {skips:?}",
);
}
/// Provider double that answers `list_files` from a fixed pattern -> files
/// table (exact pattern string match, empty list otherwise) and reports
/// every file as 1024 bytes long. `exists` always answers `true`;
/// `walk_files` is not overridden, so the trait default (not visible in
/// this file) applies.
struct FixedListFs {
per_pattern: Vec<(String, Vec<PathBuf>)>,
}
impl FileSystemProvider for FixedListFs {
fn read_file_bytes(
&self,
_path: &Path,
) -> Result<crate::ports::FileContent, crate::ports::FileSystemError> {
Err(crate::ports::FileSystemError::PathNotFound(PathBuf::new()))
}
fn list_files(
&self,
_path: &Path,
pattern: &str,
_recursive: bool,
) -> Result<Vec<PathBuf>, crate::ports::FileSystemError> {
for (registered, files) in &self.per_pattern {
if registered == pattern {
return Ok(files.clone());
}
}
Ok(Vec::new())
}
fn exists(&self, _path: &Path) -> bool {
true
}
fn metadata(
&self,
_path: &Path,
) -> Result<crate::ports::FileMeta, crate::ports::FileSystemError> {
Ok(crate::ports::FileMeta { len: 1024 })
}
}
/// Files that match a data-file pattern but carry a manifest or lockfile
/// name must be excluded from data-file discovery, while ordinary YAML and
/// JSON files listed alongside them must still be returned.
#[test]
fn data_file_discovery_excludes_manifest_and_lockfile_names() {
let pkg_root = PathBuf::from("/virtual/pkg");
let yaml_files = vec![
pkg_root.join("docker-compose.yaml"),
pkg_root.join("pnpm-lock.yaml"),
pkg_root.join("mcp.yaml"),
pkg_root.join("benign-data.yaml"),
];
let json_files = vec![
pkg_root.join("package.json"),
pkg_root.join("package-lock.json"),
pkg_root.join("data.json"),
];
let fs = FixedListFs {
per_pattern: vec![
("*.yaml".to_string(), yaml_files.clone()),
("*.json".to_string(), json_files.clone()),
],
};
let service = FileDiscoveryService::with_fs_provider(false, fs);
let results = service.discover_package_data_files(&pkg_root);
for excluded in [
"docker-compose.yaml",
"pnpm-lock.yaml",
"mcp.yaml",
"package.json",
"package-lock.json",
] {
assert!(
!results.iter().any(|p| p.file_name().and_then(|n| n.to_str()) == Some(excluded)),
"data-file discovery must NOT return manifest/lockfile name {excluded}; got {results:?}"
);
}
assert!(
results
.iter()
.any(|p| p.file_name().and_then(|n| n.to_str()) == Some("benign-data.yaml")),
"data-file discovery must STILL return non-manifest YAML files; got {results:?}"
);
assert!(
results
.iter()
.any(|p| p.file_name().and_then(|n| n.to_str()) == Some("data.json")),
"data-file discovery must STILL return non-manifest JSON files; got {results:?}"
);
}
}