#[cfg(feature = "ignore")]
use std::collections::HashSet;
#[cfg(feature = "ignore")]
use std::fs;
#[cfg(feature = "ignore")]
use std::path::Path;
use std::path::PathBuf;
#[cfg(feature = "ignore")]
use ignore::WalkBuilder;
#[cfg(feature = "ignore")]
use crate::{Config, IndexError};
/// One file selected for indexing, as a `(read_path, relative_path, size)` tuple.
///
/// As populated by `push_file_record`:
/// * `.0` is the path whose metadata was read (for a symlink entry this is the
///   canonicalized target rather than the link itself),
/// * `.1` is the display path with the repository root prefix stripped,
/// * `.2` is the length reported by the metadata of `.0`.
pub type FileRecord = (PathBuf, PathBuf, u64);
/// Enumerates the files under `config.repo_root` that should be indexed.
///
/// The tree is walked with [`WalkBuilder`] with gitignore rules enabled, the
/// hidden-entry filter turned off and symlink following turned off. Regular
/// files are recorded directly. Symlinks are set aside during the walk and
/// resolved afterwards by `collect_symlink_entry`, so that a real file always
/// takes precedence over any symlink that points at it.
///
/// Files whose size exceeds `config.max_file_size` are skipped. The returned
/// records are sorted by their repository-relative path.
///
/// # Errors
///
/// Returns an error when `config.repo_root` cannot be canonicalized. Errors
/// reported for individual walk entries are skipped (best-effort enumeration).
#[cfg(feature = "ignore")]
pub fn enumerate_files(config: &Config) -> Result<Vec<FileRecord>, IndexError> {
    let mut files: Vec<FileRecord> = Vec::new();
    let canonical_root = fs::canonicalize(&config.repo_root)?;
    // Canonical paths already claimed by an entry; used to drop symlink aliases.
    let mut seen_canonical: HashSet<PathBuf> = HashSet::new();
    let walker = WalkBuilder::new(&config.repo_root)
        .hidden(false)
        .git_ignore(true)
        .follow_links(false)
        .build();
    let mut symlink_paths: Vec<PathBuf> = Vec::new();

    for result in walker {
        // Unreadable entries are skipped on purpose rather than aborting the walk.
        let Ok(entry) = result else {
            continue;
        };
        let path = entry.path();
        let Some(file_type) = entry.file_type() else {
            continue;
        };
        if file_type.is_symlink() {
            // Deferred until every real file has been seen.
            symlink_paths.push(path.to_path_buf());
            continue;
        }
        if !file_type.is_file() {
            continue;
        }
        // Recorded even if the file is later rejected for its size, so that a
        // symlink cannot re-introduce it.
        if let Ok(canonical) = fs::canonicalize(path) {
            seen_canonical.insert(canonical);
        }
        push_file_record(
            path.to_path_buf(),
            path,
            &config.repo_root,
            config.max_file_size,
            &mut files,
        );
    }

    // The walk yields entries in directory-iteration order, which is not stable
    // across filesystems. When several symlinks resolve to the same target only
    // the first one processed is kept, so sort to make that choice deterministic.
    symlink_paths.sort_unstable();
    for symlink_path in symlink_paths {
        collect_symlink_entry(
            &symlink_path,
            &config.repo_root,
            &canonical_root,
            config.max_file_size,
            &mut files,
            &mut seen_canonical,
        );
    }

    files.sort_unstable_by(|a, b| a.1.cmp(&b.1));
    Ok(files)
}
/// Appends a record for `read_path` to `files` if it passes the size filter.
///
/// The size is taken from the metadata of `read_path`; the stored relative path
/// is `display_path` with the `repo_root` prefix removed. Nothing is pushed when
/// the metadata cannot be read, when the size is greater than `max_file_size`,
/// or when `display_path` does not start with `repo_root`.
#[cfg(feature = "ignore")]
fn push_file_record(
    read_path: PathBuf,
    display_path: &Path,
    repo_root: &Path,
    max_file_size: u64,
    files: &mut Vec<FileRecord>,
) {
    let Ok(metadata) = read_path.metadata() else {
        return;
    };
    let byte_len = metadata.len();
    if byte_len > max_file_size {
        return;
    }
    if let Ok(relative) = display_path.strip_prefix(repo_root) {
        files.push((read_path, relative.to_path_buf(), byte_len));
    }
}
/// Resolves one symlink and records its target if it is an eligible file.
///
/// The link is read once (relative targets are resolved against the link's
/// parent directory, falling back to `repo_root`). The entry is dropped when:
/// * the link or its target cannot be read,
/// * the immediate target is itself a symlink,
/// * the canonicalized target does not start with `canonical_root`,
/// * the canonical path is reported as a symlink,
/// * the canonical path is already present in `seen_canonical`, or
/// * the canonical path is not a regular file.
///
/// Otherwise the canonical path is added to `seen_canonical` and a record is
/// pushed that reads from the canonical target but is displayed under the
/// symlink's own path.
#[cfg(feature = "ignore")]
fn collect_symlink_entry(
    symlink_path: &Path,
    repo_root: &Path,
    canonical_root: &Path,
    max_file_size: u64,
    files: &mut Vec<FileRecord>,
    seen_canonical: &mut HashSet<PathBuf>,
) {
    let Ok(link_target) = fs::read_link(symlink_path) else {
        return;
    };
    let resolved = if link_target.is_relative() {
        symlink_path
            .parent()
            .unwrap_or(repo_root)
            .join(link_target)
    } else {
        link_target
    };

    // Only a direct link is accepted: a target that is another symlink is rejected.
    match fs::symlink_metadata(&resolved) {
        Ok(meta) if !meta.file_type().is_symlink() => {}
        _ => return,
    }

    let Ok(canonical_target) = fs::canonicalize(&resolved) else {
        return;
    };
    // Containment check: never index anything that resolves outside the repository.
    if !canonical_target.starts_with(canonical_root) {
        return;
    }

    let Ok(canonical_meta) = fs::symlink_metadata(&canonical_target) else {
        return;
    };
    let rejected = canonical_meta.file_type().is_symlink()
        || seen_canonical.contains(&canonical_target)
        || !canonical_meta.is_file();
    if rejected {
        return;
    }

    seen_canonical.insert(canonical_target.clone());
    push_file_record(
        canonical_target,
        symlink_path,
        repo_root,
        max_file_size,
        files,
    );
}
/// Splits `files` into consecutive batches whose summed sizes stay within
/// `batch_limit`.
///
/// Records keep their input order. A new batch is started whenever adding the
/// next record would push the running total above `batch_limit`. A record that
/// is larger than `batch_limit` on its own is never dropped: it is placed in a
/// batch (alone, apart from any zero-sized records that follow it) and counted
/// as exactly `batch_limit`.
///
/// Returns an empty vector for empty input; no returned batch is empty.
pub fn split_batches(files: &[FileRecord], batch_limit: u64) -> Vec<Vec<FileRecord>> {
    let mut batches: Vec<Vec<FileRecord>> = Vec::new();
    let mut current: Vec<FileRecord> = Vec::new();
    // Invariant: `current_size <= batch_limit` (each addition is either clamped
    // into an empty batch or was verified to fit).
    let mut current_size: u64 = 0;

    for record in files {
        let size = record.2;
        // Compare against the remaining room instead of computing
        // `current_size + size`, which can overflow `u64` for very large sizes.
        let remaining = batch_limit.saturating_sub(current_size);
        if !current.is_empty() && size > remaining {
            batches.push(std::mem::take(&mut current));
            current_size = 0;
        }
        // Clamp so an oversized record fills the batch without breaking the invariant.
        current_size += size.min(batch_limit);
        current.push(record.clone());
    }

    if !current.is_empty() {
        batches.push(current);
    }
    batches
}
/// Reports whether `content` looks binary.
///
/// Only the first 8192 bytes are inspected; the content is treated as binary
/// when any of them is a NUL byte. Empty input is not binary.
pub fn is_binary(content: &[u8]) -> bool {
    content.iter().take(8192).any(|&byte| byte == 0)
}
// Every test here drives `enumerate_files`, which (together with the `fs` and
// `Config` imports pulled in through `super::*`) only exists when the `ignore`
// feature is enabled. Gate the module on the feature as well so that running
// the test suite without it skips these tests instead of failing to compile.
#[cfg(all(test, feature = "ignore"))]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// A symlink to a directory must not cause the directory's files to be
    /// listed a second time under the alias path.
    #[cfg(unix)]
    #[test]
    fn enumerate_files_skips_symlinked_directories() {
        use std::os::unix::fs::symlink;
        let repo = TempDir::new().unwrap();
        let real_dir = repo.path().join("real");
        fs::create_dir_all(&real_dir).unwrap();
        fs::write(real_dir.join("nested.rs"), b"fn linked() {}\n").unwrap();
        symlink(&real_dir, repo.path().join("alias")).unwrap();
        let config = Config {
            repo_root: repo.path().to_path_buf(),
            ..Config::default()
        };
        let files = enumerate_files(&config).unwrap();
        assert_eq!(
            files.iter().map(|(_, rel, _)| rel).collect::<Vec<_>>(),
            vec![&PathBuf::from("real/nested.rs")],
            "directory symlink contents must not be indexed through alias paths"
        );
    }

    /// A chain `link_a -> link_b -> <file outside the repo>` must yield no
    /// record for either link.
    #[cfg(unix)]
    #[test]
    fn collect_symlink_entry_rejects_canonical_symlink() {
        use std::os::unix::fs::symlink;
        let repo = TempDir::new().unwrap();
        let outside = TempDir::new().unwrap();
        fs::write(outside.path().join("secret.rs"), b"secret").unwrap();
        symlink(outside.path().join("secret.rs"), repo.path().join("link_b")).unwrap();
        symlink(repo.path().join("link_b"), repo.path().join("link_a")).unwrap();
        let config = Config {
            repo_root: repo.path().to_path_buf(),
            ..Config::default()
        };
        let files = enumerate_files(&config).unwrap();
        let found: Vec<_> = files
            .iter()
            .filter(|(_, rel, _)| rel.starts_with("link_a") || rel.starts_with("link_b"))
            .collect();
        assert!(
            found.is_empty(),
            "symlinks pointing outside repo must be rejected, found: {:?}",
            found
        );
    }

    /// A symlink whose target lives outside the repository must be skipped.
    #[cfg(unix)]
    #[test]
    fn enumerate_files_skips_symlink_outside_repo() {
        use std::os::unix::fs::symlink;
        let repo = TempDir::new().unwrap();
        let outside = TempDir::new().unwrap();
        fs::write(outside.path().join("secret.rs"), b"fn secret() {}\n").unwrap();
        symlink(
            outside.path().join("secret.rs"),
            repo.path().join("escape.rs"),
        )
        .unwrap();
        let config = Config {
            repo_root: repo.path().to_path_buf(),
            ..Config::default()
        };
        let files = enumerate_files(&config).unwrap();
        assert!(
            !files.iter().any(|(_, rel, _)| rel == "escape.rs"),
            "out-of-repo symlink targets must be skipped"
        );
    }

    /// Many symlinks to one real file must not add any alias entries; the real
    /// file is listed exactly once.
    #[cfg(unix)]
    #[test]
    fn enumerate_files_deduplicates_multiple_symlinks_to_same_file() {
        use std::os::unix::fs::symlink;
        let repo = TempDir::new().unwrap();
        let real = repo.path().join("real.rs");
        fs::write(&real, b"fn visible() {}\n").unwrap();
        for i in 0..10u8 {
            symlink(&real, repo.path().join(format!("alias{i}.rs"))).unwrap();
        }
        let config = Config {
            repo_root: repo.path().to_path_buf(),
            ..Config::default()
        };
        let files = enumerate_files(&config).unwrap();
        let symlinked_files: Vec<_> = files
            .iter()
            .filter(|(_, rel, _)| rel.to_str().unwrap_or("").starts_with("alias"))
            .collect();
        assert!(
            symlinked_files.is_empty(),
            "symlink aliases to an already-indexed real file must not appear in results, got: {:?}",
            symlinked_files
                .iter()
                .map(|(_, r, _)| r)
                .collect::<Vec<_>>()
        );
        let real_files: Vec<_> = files
            .iter()
            .filter(|(_, rel, _)| rel.to_str().unwrap_or("") == "real.rs")
            .collect();
        assert_eq!(
            real_files.len(),
            1,
            "the real file must appear exactly once"
        );
    }

    /// With one real file and one symlink to it, the single surviving entry
    /// must be the real file.
    #[cfg(unix)]
    #[test]
    fn enumerate_files_real_file_wins_over_symlink_alias() {
        use std::os::unix::fs::symlink;
        let repo = TempDir::new().unwrap();
        let real = repo.path().join("real.rs");
        fs::write(&real, b"fn original() {}\n").unwrap();
        symlink(&real, repo.path().join("alias.rs")).unwrap();
        let config = Config {
            repo_root: repo.path().to_path_buf(),
            ..Config::default()
        };
        let files = enumerate_files(&config).unwrap();
        assert_eq!(
            files.len(),
            1,
            "real file + symlink to it must produce exactly one index entry, got: {:?}",
            files.iter().map(|(_, r, _)| r).collect::<Vec<_>>()
        );
        assert_eq!(
            files[0].1,
            PathBuf::from("real.rs"),
            "the surviving entry must be the real file, not the symlink"
        );
    }
}