use std::path::{Component, Path, PathBuf};
use anyhow::{Context, Result};
use ignore::WalkBuilder;
use tokmd_io_port::MemFs;
use crate::path::{BoundedPath, PathViolation, ValidatedRoot, normalize_bounded_relative_path};
mod git;
/// Paths relevant to license detection, grouped by the role of each file.
///
/// [`license_candidates`] fills both lists and sorts each of them before
/// returning.
#[derive(Debug, Clone)]
pub struct LicenseCandidates {
/// Paths whose lowercased file name starts with `license`, `copying`, or
/// `notice` (as classified by [`license_candidates`]).
pub license_files: Vec<PathBuf>,
/// Paths whose lowercased file name is exactly `cargo.toml`, `package.json`,
/// or `pyproject.toml` (as classified by [`license_candidates`]).
pub metadata_files: Vec<PathBuf>,
}
/// Lists the files under `root` as paths relative to the validated root.
///
/// The `git` submodule is consulted first. When it yields a listing, every
/// entry is bounded against the root via `git::bound_git_relative_path`;
/// entries for which that helper returns `None` are skipped, and the order
/// is whatever the git helper produced.
///
/// When no git listing is available, the directory is walked with the
/// `ignore` crate: hidden files are included, gitignore / git-exclude /
/// global-gitignore rules are honoured, and symlinks are not followed. Only
/// entries whose file type is a regular file are returned.
///
/// `max_files` caps the number of returned paths; `Some(0)` returns an empty
/// list without touching the filesystem. In the walker fallback the walk is
/// ordered by file name, so the cap keeps the first paths of the sorted
/// listing rather than an arbitrary, filesystem-dependent subset.
///
/// # Errors
///
/// Fails when the root cannot be validated, when the git helpers fail, when
/// the walker reports an error for an entry, or when an entry cannot be
/// bounded to the root.
pub fn list_files(root: &Path, max_files: Option<usize>) -> Result<Vec<PathBuf>> {
    if max_files == Some(0) {
        return Ok(Vec::new());
    }

    let root = ValidatedRoot::new(root)?;

    if let Some(files) = git::git_ls_files(root.canonical())? {
        let mut bounded = Vec::new();
        for path in files {
            if let Some(path) = git::bound_git_relative_path(&root, &path)? {
                bounded.push(path);
            }
            if let Some(limit) = max_files
                && bounded.len() >= limit
            {
                break;
            }
        }
        return Ok(bounded);
    }

    let mut files: Vec<PathBuf> = Vec::new();
    let mut builder = WalkBuilder::new(root.canonical());
    builder
        .hidden(false)
        .git_ignore(true)
        .git_exclude(true)
        .git_global(true)
        .follow_links(false)
        // Visit directory entries in name order. Without this the early
        // `break` below would keep whichever files the OS happened to list
        // first, making limited results differ between machines and from
        // `list_files_from_memfs`, which sorts before truncating.
        .sort_by_file_name(|a, b| a.cmp(b));

    for entry in builder.build() {
        let entry = entry?;
        if !entry.file_type().map(|t| t.is_file()).unwrap_or(false) {
            continue;
        }
        let rel = BoundedPath::existing_child(&root, entry.path())?
            .relative()
            .to_path_buf();
        files.push(rel);
        if let Some(limit) = max_files
            && files.len() >= limit
        {
            break;
        }
    }

    // Kept as the final ordering guarantee on the relative paths; cheap on
    // input that the ordered walk has already arranged.
    files.sort();
    Ok(files)
}
/// Lists the files stored in `fs` that live under `root`, relative to it.
///
/// `root` is normalized with `normalize_memfs_root`; an empty root selects
/// every file in the filesystem. Paths outside the root are dropped. The
/// result is sorted and then cut down to at most `max_files` entries, with
/// `Some(0)` short-circuiting to an empty list.
///
/// # Errors
///
/// Fails when `root` is rejected by `normalize_memfs_root`.
pub fn list_files_from_memfs(
    fs: &MemFs,
    root: &Path,
    max_files: Option<usize>,
) -> Result<Vec<PathBuf>> {
    if let Some(0) = max_files {
        return Ok(Vec::new());
    }

    let base = normalize_memfs_root(root)?;

    let mut listing = Vec::<PathBuf>::new();
    listing.extend(
        fs.file_paths()
            .filter_map(|path| memfs_relative_path(path, &base)),
    );
    listing.sort();

    // `truncate` leaves the vector untouched when it is already short enough.
    if let Some(limit) = max_files {
        listing.truncate(limit);
    }

    Ok(listing)
}
/// Splits `files` into license texts and package-metadata manifests.
///
/// Classification looks only at the final path component, lowercased:
/// an exact match on a known manifest name makes the path a metadata file,
/// otherwise a known license-style prefix makes it a license file. Paths
/// without a UTF-8 file name match neither group. Both lists are returned
/// sorted.
pub fn license_candidates(files: &[PathBuf]) -> LicenseCandidates {
    const METADATA_NAMES: [&str; 3] = ["cargo.toml", "package.json", "pyproject.toml"];
    const LICENSE_PREFIXES: [&str; 3] = ["license", "copying", "notice"];

    let mut licenses: Vec<PathBuf> = Vec::new();
    let mut manifests: Vec<PathBuf> = Vec::new();

    for path in files {
        let lowered = path
            .file_name()
            .and_then(|n| n.to_str())
            .map(str::to_lowercase)
            .unwrap_or_default();

        if METADATA_NAMES.contains(&lowered.as_str()) {
            manifests.push(path.clone());
        } else if LICENSE_PREFIXES
            .iter()
            .any(|prefix| lowered.starts_with(*prefix))
        {
            licenses.push(path.clone());
        }
    }

    licenses.sort();
    manifests.sort();

    LicenseCandidates {
        license_files: licenses,
        metadata_files: manifests,
    }
}
pub fn file_size(root: &Path, relative: &Path) -> Result<u64> {
let root = ValidatedRoot::new(root)?;
let path = BoundedPath::existing_relative(&root, relative)?;
let meta = std::fs::metadata(path.canonical())
.with_context(|| format!("Failed to stat {}", path.canonical().display()))?;
Ok(meta.len())
}
/// Returns the size of `relative` under `root` as recorded in `fs`.
///
/// Both path arguments are normalized first; the lookup key is the
/// normalized relative path on its own when the root is empty, and the root
/// joined with it otherwise.
///
/// # Errors
///
/// Fails when either path is rejected during normalization or when the
/// in-memory filesystem cannot report a size for the resulting path
/// (reported as `Failed to stat <path>`).
pub fn file_size_from_memfs(fs: &MemFs, root: &Path, relative: &Path) -> Result<u64> {
    // Root first, then the relative part, so error precedence is unchanged.
    let mut target = normalize_memfs_root(root)?;
    let child = normalize_bounded_relative_path(relative)?;

    if target.as_os_str().is_empty() {
        target = child;
    } else {
        target.push(child);
    }

    fs.file_size(&target)
        .with_context(|| format!("Failed to stat {}", target.display()))
}
/// Normalizes a root path for lookups in the in-memory filesystem.
///
/// `.` components are dropped and normal components are kept in order. An
/// empty input produces an empty root.
///
/// # Errors
///
/// Returns `PathViolation::ParentTraversal` if the path contains `..`, and
/// `PathViolation::Absolute` if it contains a root directory or a prefix
/// component.
fn normalize_memfs_root(path: &Path) -> Result<PathBuf> {
    path.components()
        .try_fold(PathBuf::new(), |mut acc, piece| -> Result<PathBuf> {
            match piece {
                Component::Normal(name) => acc.push(name),
                Component::CurDir => {}
                Component::ParentDir => {
                    return Err(PathViolation::ParentTraversal(path.to_path_buf()).into());
                }
                Component::Prefix(_) | Component::RootDir => {
                    return Err(PathViolation::Absolute(path.to_path_buf()).into());
                }
            }
            Ok(acc)
        })
}
/// Expresses `path` relative to `root`, or `None` when it lies outside.
///
/// An empty `root` means "no prefix": the path is returned unchanged.
fn memfs_relative_path(path: &Path, root: &Path) -> Option<PathBuf> {
    let relative = if root.as_os_str().is_empty() {
        path
    } else {
        path.strip_prefix(root).ok()?
    };
    Some(relative.to_path_buf())
}
// Unit tests for the file-listing, license-candidate and file-size helpers.
// The filesystem tests build their fixtures in temporary directories.
#[cfg(test)]
mod tests {
use super::*;
use std::fs;
// License-style names (with and without suffixes) are all classified as
// license files; an unrelated source file is ignored.
#[test]
fn test_license_candidates_detects_license_files() {
let files = vec![
PathBuf::from("LICENSE"),
PathBuf::from("LICENSE.md"),
PathBuf::from("LICENSE-MIT"),
PathBuf::from("COPYING"),
PathBuf::from("NOTICE"),
PathBuf::from("src/main.rs"),
];
let result = license_candidates(&files);
assert_eq!(result.license_files.len(), 5);
assert!(result.metadata_files.is_empty());
}
// The three recognised manifest names are classified as metadata files.
#[test]
fn test_license_candidates_detects_metadata_files() {
let files = vec![
PathBuf::from("Cargo.toml"),
PathBuf::from("package.json"),
PathBuf::from("pyproject.toml"),
PathBuf::from("src/lib.rs"),
];
let result = license_candidates(&files);
assert!(result.license_files.is_empty());
assert_eq!(result.metadata_files.len(), 3);
}
// A license file and a manifest in the same input land in separate lists.
#[test]
fn test_license_candidates_mixed() {
let files = vec![
PathBuf::from("LICENSE"),
PathBuf::from("Cargo.toml"),
PathBuf::from("src/main.rs"),
];
let result = license_candidates(&files);
assert_eq!(result.license_files.len(), 1);
assert_eq!(result.metadata_files.len(), 1);
}
// Empty input yields two empty lists.
#[test]
fn test_license_candidates_empty_input() {
let result = license_candidates(&[]);
assert!(result.license_files.is_empty());
assert!(result.metadata_files.is_empty());
}
// Matching is done on the lowercased file name, so casing does not matter.
#[test]
fn test_license_candidates_case_insensitive() {
let files = vec![PathBuf::from("license"), PathBuf::from("License.txt")];
let result = license_candidates(&files);
assert_eq!(result.license_files.len(), 2);
}
// Both output lists are sorted regardless of input order.
#[test]
fn test_license_candidates_sorted_output() {
let files = vec![
PathBuf::from("z/Cargo.toml"),
PathBuf::from("a/Cargo.toml"),
PathBuf::from("z/LICENSE"),
PathBuf::from("a/LICENSE"),
];
let result = license_candidates(&files);
assert_eq!(result.license_files[0], PathBuf::from("a/LICENSE"));
assert_eq!(result.license_files[1], PathBuf::from("z/LICENSE"));
assert_eq!(result.metadata_files[0], PathBuf::from("a/Cargo.toml"));
assert_eq!(result.metadata_files[1], PathBuf::from("z/Cargo.toml"));
}
// `file_size` reports the byte length of the file's contents.
#[test]
fn test_file_size_returns_correct_bytes() {
let dir = tempfile::tempdir().unwrap();
let content = "hello world";
fs::write(dir.path().join("test.txt"), content).unwrap();
let size = file_size(dir.path(), Path::new("test.txt")).unwrap();
assert_eq!(size, content.len() as u64);
}
// A path that does not exist under the root is an error, not a zero size.
#[test]
fn test_file_size_missing_file_errors() {
let dir = tempfile::tempdir().unwrap();
let result = file_size(dir.path(), Path::new("nonexistent.txt"));
assert!(result.is_err());
}
// An existing empty file has size zero.
#[test]
fn test_file_size_empty_file() {
let dir = tempfile::tempdir().unwrap();
fs::write(dir.path().join("empty.txt"), "").unwrap();
let size = file_size(dir.path(), Path::new("empty.txt")).unwrap();
assert_eq!(size, 0);
}
// A limit of zero returns nothing even though the directory has a file.
#[test]
fn test_list_files_max_zero_returns_empty() {
let dir = tempfile::tempdir().unwrap();
fs::write(dir.path().join("a.rs"), "content").unwrap();
let files = list_files(dir.path(), Some(0)).unwrap();
assert!(files.is_empty());
}
// With ten files present and a limit of three, at most three come back.
#[test]
fn test_list_files_respects_max_limit() {
let dir = tempfile::tempdir().unwrap();
// NOTE(review): an empty `.git` directory is created here; presumably to
// mark the temp dir as a repository root — confirm which listing path
// (git helper or walker) this is meant to exercise.
fs::create_dir_all(dir.path().join(".git")).unwrap();
for i in 0..10 {
fs::write(dir.path().join(format!("file{i}.txt")), "x").unwrap();
}
let files = list_files(dir.path(), Some(3)).unwrap();
assert!(files.len() <= 3);
}
// Pins the expected ordering: entries inside `foo/` come before the
// sibling file `foo.rs`.
#[test]
fn test_list_files_deterministic_sort() {
let dir = tempfile::tempdir().unwrap();
// NOTE(review): same empty `.git` marker as in the limit test above —
// confirm intent.
fs::create_dir_all(dir.path().join(".git")).unwrap();
fs::create_dir_all(dir.path().join("foo")).unwrap();
fs::write(dir.path().join("foo/bar"), "content").unwrap();
fs::write(dir.path().join("foo/bar.rs"), "content").unwrap();
fs::write(dir.path().join("foo.rs"), "content").unwrap();
let files = list_files(dir.path(), None).unwrap();
let expected = vec![
PathBuf::from("foo/bar"),
PathBuf::from("foo/bar.rs"),
PathBuf::from("foo.rs"),
];
// Only compare the entries this test created; anything else in the
// listing is filtered out by the `foo` prefix.
let actual: Vec<PathBuf> = files
.into_iter()
.filter(|p| {
let s = p.to_string_lossy();
s.starts_with("foo")
})
.collect();
assert_eq!(actual, expected);
}
// A root spelled as `repo/nested/..` must list `repo`'s files with paths
// relative to `repo`.
#[test]
fn test_list_files_resolves_parent_segments_before_walking() {
let dir = tempfile::tempdir().unwrap();
let root = dir.path().join("repo");
fs::create_dir_all(root.join("src")).unwrap();
fs::create_dir_all(root.join("nested")).unwrap();
fs::write(root.join("src/lib.rs"), "pub fn lib() {}\n").unwrap();
let files = list_files(&root.join("nested").join(".."), None).unwrap();
assert_eq!(files, vec![PathBuf::from("src/lib.rs")]);
}
// A symlink inside the root that points at a file outside the root must not
// show up in the listing.
#[test]
fn test_list_files_does_not_return_symlink_escape_when_supported() {
let root_dir = tempfile::tempdir().unwrap();
let outside_dir = tempfile::tempdir().unwrap();
let outside_file = outside_dir.path().join("secret.rs");
let link = root_dir.path().join("leak.rs");
fs::write(&outside_file, "fn secret() {}\n").unwrap();
// Symlink creation can fail (the helper returns an `io::Result`); in that
// case the test returns early without asserting anything.
if create_file_symlink(&outside_file, &link).is_err() {
return;
}
let files = list_files(root_dir.path(), None).unwrap();
assert!(
!files.iter().any(|path| path == Path::new("leak.rs")),
"walk should not expose symlink escapes: {files:?}"
);
}
// Platform-specific helper: creates a file symlink at `dst` pointing to `src`.
#[cfg(unix)]
fn create_file_symlink(src: &Path, dst: &Path) -> std::io::Result<()> {
std::os::unix::fs::symlink(src, dst)
}
// Windows counterpart of the helper above.
#[cfg(windows)]
fn create_file_symlink(src: &Path, dst: &Path) -> std::io::Result<()> {
std::os::windows::fs::symlink_file(src, dst)
}
}