#[cfg(feature = "ignore")]
use std::collections::HashSet;
#[cfg(feature = "ignore")]
use std::fs;
#[cfg(feature = "ignore")]
use std::path::Path;
use std::path::PathBuf;
#[cfg(feature = "ignore")]
use ignore::WalkBuilder;
#[cfg(feature = "ignore")]
use crate::{Config, IndexError};
/// One enumerated file, stored as `(read_path, display_path, size)`.
///
/// As built by `push_file_record`: the first element is the path whose
/// metadata was read, the second is the path relative to the repository root
/// after being passed through `crate::path_util::normalize_to_forward_slashes`,
/// and the third is the length in bytes reported by that metadata.
pub type FileRecord = (PathBuf, PathBuf, u64);
/// Walks `config.repo_root` and returns a record for every file that should be indexed.
///
/// Regular files are recorded during the walk itself. Symlinks are only
/// remembered while walking and are resolved afterwards by
/// `collect_symlink_entry`, so that by then the canonical path of every
/// regular file is already registered and a link pointing at one of them is
/// not recorded a second time.
///
/// The result is sorted by the repository-relative path (the second tuple
/// element).
///
/// # Errors
///
/// Fails only when `config.repo_root` cannot be canonicalized. Errors produced
/// while walking, and entries without a known file type, are skipped.
#[cfg(feature = "ignore")]
pub fn enumerate_files(config: &Config) -> Result<Vec<FileRecord>, IndexError> {
    let canonical_root = fs::canonicalize(&config.repo_root)?;

    let mut files: Vec<FileRecord> = Vec::new();
    let mut seen_canonical: HashSet<PathBuf> = HashSet::new();
    let mut deferred_links: Vec<PathBuf> = Vec::new();

    // Hidden files are not filtered out, .gitignore rules are honoured, and
    // the walker itself never follows symlinks (they are handled below).
    let walker = WalkBuilder::new(&config.repo_root)
        .hidden(false)
        .git_ignore(true)
        .follow_links(false)
        .build();

    // `flatten` drops entries the walker failed to produce.
    for entry in walker.flatten() {
        let Some(kind) = entry.file_type() else {
            continue;
        };
        let path = entry.path();

        if kind.is_symlink() {
            deferred_links.push(path.to_path_buf());
        } else if kind.is_file() {
            // Remember where this file really lives so a symlink to it can be
            // recognised as a duplicate later on.
            if let Ok(canonical) = fs::canonicalize(path) {
                seen_canonical.insert(canonical);
            }
            push_file_record(
                path.to_path_buf(),
                path,
                &config.repo_root,
                config.max_file_size,
                &mut files,
            );
        }
    }

    for link in &deferred_links {
        collect_symlink_entry(
            link,
            &config.repo_root,
            &canonical_root,
            config.max_file_size,
            &mut files,
            &mut seen_canonical,
            config.verbose,
        );
    }

    files.sort_unstable_by(|(_, lhs, _), (_, rhs, _)| lhs.cmp(rhs));
    Ok(files)
}
/// Appends a record for `read_path` to `files`, unless the file must be skipped.
///
/// Nothing is appended when the metadata of `read_path` cannot be read, when
/// the reported size is larger than `max_file_size`, or when `display_path`
/// does not start with `repo_root`.
///
/// * `read_path` - path whose metadata is read; stored as the first tuple element.
/// * `display_path` - path made relative to `repo_root` for the second tuple element.
/// * `repo_root` - prefix stripped from `display_path`.
/// * `max_file_size` - inclusive upper bound on the recorded size in bytes.
/// * `files` - destination list.
#[cfg(feature = "ignore")]
fn push_file_record(
    read_path: PathBuf,
    display_path: &Path,
    repo_root: &Path,
    max_file_size: u64,
    files: &mut Vec<FileRecord>,
) {
    let Ok(metadata) = read_path.metadata() else {
        return;
    };

    let size = metadata.len();
    if size > max_file_size {
        return;
    }

    if let Ok(relative) = display_path.strip_prefix(repo_root) {
        let rel = crate::path_util::normalize_to_forward_slashes(relative.to_path_buf());
        files.push((read_path, rel, size));
    }
}
/// Reports on stderr why `symlink_path` is being skipped.
///
/// Prints nothing unless `verbose` is set.
#[cfg(feature = "ignore")]
fn log_symlink_skip(verbose: bool, symlink_path: &Path, reason: std::fmt::Arguments<'_>) {
    if !verbose {
        return;
    }
    eprintln!("st: skipping symlink {}: {reason}", symlink_path.display());
}
/// Resolves one symlink and, if it qualifies, records the file it points to.
///
/// A record is added only when all of the following hold:
/// * the link can be read and its target can be stat'ed and canonicalized;
/// * the immediate target is not itself a symlink;
/// * the canonical target lies under `canonical_root`;
/// * the canonical target is a regular file not already in `seen_canonical`.
///
/// The record is read from the canonical target but displayed under the
/// symlink's own path. I/O failures and targets outside the root are reported
/// through `log_symlink_skip`; the remaining rejections are silent.
#[cfg(feature = "ignore")]
fn collect_symlink_entry(
    symlink_path: &Path,
    repo_root: &Path,
    canonical_root: &Path,
    max_file_size: u64,
    files: &mut Vec<FileRecord>,
    seen_canonical: &mut HashSet<PathBuf>,
    verbose: bool,
) {
    // Shorthand for reporting why this particular link is dropped.
    let skip = |reason: std::fmt::Arguments<'_>| log_symlink_skip(verbose, symlink_path, reason);

    let link_target = match fs::read_link(symlink_path) {
        Ok(raw) => raw,
        Err(e) => return skip(format_args!("failed to read link: {e}")),
    };

    // A relative target is interpreted relative to the directory holding the link.
    let resolved = if link_target.is_absolute() {
        link_target
    } else {
        symlink_path
            .parent()
            .unwrap_or(repo_root)
            .join(link_target)
    };

    // `symlink_metadata` does not follow links, so this detects a link that
    // points at another link; such chains are skipped without a message.
    match fs::symlink_metadata(&resolved) {
        Ok(meta) if meta.file_type().is_symlink() => return,
        Ok(_) => {}
        Err(e) => return skip(format_args!("failed to stat target: {e}")),
    }

    let real_path = match fs::canonicalize(&resolved) {
        Ok(path) => path,
        Err(e) => return skip(format_args!("failed to canonicalize target: {e}")),
    };

    if !real_path.starts_with(canonical_root) {
        return skip(format_args!(
            "target {} is outside repo root",
            real_path.display()
        ));
    }

    let real_meta = match fs::symlink_metadata(&real_path) {
        Ok(meta) => meta,
        Err(e) => return skip(format_args!("failed to stat canonical target: {e}")),
    };

    // Only regular files are recorded.
    if real_meta.file_type().is_symlink() || !real_meta.is_file() {
        return;
    }

    // `insert` returns false when the path was already present, i.e. the file
    // has been recorded before (directly or through another link).
    if !seen_canonical.insert(real_path.clone()) {
        return;
    }

    push_file_record(real_path, symlink_path, repo_root, max_file_size, files);
}
/// Splits `files` into consecutive batches whose accumulated size stays within `batch_limit`.
///
/// Records keep their input order. A new batch is started whenever adding the
/// next record to a non-empty batch would push its accumulated size above
/// `batch_limit`. A record that is larger than `batch_limit` on its own is
/// still emitted; it counts as `batch_limit` bytes, so only zero-sized records
/// can follow it in the same batch.
///
/// Returns an empty vector when `files` is empty. No batch is ever empty.
pub fn split_batches(files: &[FileRecord], batch_limit: u64) -> Vec<Vec<FileRecord>> {
    let mut batches: Vec<Vec<FileRecord>> = Vec::new();
    let mut current: Vec<FileRecord> = Vec::new();
    let mut current_size: u64 = 0;

    for record in files {
        let size = record.2;

        // Checked addition: a sum that does not even fit in `u64` certainly
        // exceeds the limit. Plain `+` would panic in debug builds and wrap
        // around in release builds, wrongly keeping the record in this batch.
        let exceeds_limit = current_size
            .checked_add(size)
            .map_or(true, |total| total > batch_limit);

        if !current.is_empty() && exceeds_limit {
            batches.push(std::mem::take(&mut current));
            current_size = 0;
        }

        // Cannot overflow: either the batch was just reset (0 + at most
        // `batch_limit`), or the checked sum above fit within `batch_limit`.
        current_size += size.min(batch_limit);
        current.push(record.clone());
    }

    if !current.is_empty() {
        batches.push(current);
    }
    batches
}
/// Heuristically decides whether `content` is binary data.
///
/// Returns `true` when a NUL byte occurs within the first 8192 bytes; bytes
/// beyond that prefix are not inspected. Empty input is reported as not binary.
pub fn is_binary(content: &[u8]) -> bool {
    /// Number of leading bytes inspected for a NUL byte.
    const SNIFF_LEN: usize = 8192;

    content.iter().take(SNIFF_LEN).any(|&byte| byte == 0)
}
// Test module: compiled only for test builds, with its body loaded from
// `walk_tests.rs` instead of the default `tests.rs` / `tests/mod.rs`.
#[cfg(test)]
#[path = "walk_tests.rs"]
mod tests;