use anyhow::{anyhow, Result};
use sha2::{Digest, Sha256};
use std::path::{Path, PathBuf};
/// A text file picked up by a capture run.
#[derive(Debug, Clone)]
pub struct CapturedFile {
    /// Location of the file relative to the capture root, using `/` separators.
    pub path: String,
    /// Lowercase hexadecimal SHA-256 digest of the file's bytes.
    pub sha256: String,
    /// Size of the file in bytes.
    pub size: usize,
    /// The file's contents as UTF-8 text.
    pub body: String,
}
/// A symbolic link recorded by a capture run; the link itself is stored, not
/// whatever it points at.
#[derive(Debug, Clone)]
pub struct CapturedSymlink {
    /// Location of the link relative to the capture root, using `/` separators.
    pub path: String,
    /// The link's target as read from the filesystem (lossily converted to UTF-8).
    pub target: String,
}
/// A non-text file picked up by a capture run, carried as Base64.
#[derive(Debug, Clone)]
pub struct CapturedBinary {
    /// Location of the file relative to the capture root, using `/` separators.
    pub path: String,
    /// Lowercase hexadecimal SHA-256 digest of the raw (un-encoded) bytes.
    pub sha256: String,
    /// Size of the raw file in bytes.
    pub size: usize,
    /// The raw bytes encoded with the standard Base64 alphabet.
    pub base64: String,
}
/// Tunables that decide what a capture run picks up and what it leaves out.
#[derive(Debug, Clone)]
pub struct CaptureConfig {
    /// Files larger than this many bytes are skipped.
    pub max_file_bytes: usize,
    /// Upper bound on the combined size, in bytes, of everything captured.
    pub max_total_bytes: usize,
    /// Directory names that are skipped wherever they appear in the tree.
    pub skip_dirs: Vec<String>,
    /// Directory names that are skipped only when they sit directly under the root.
    pub skip_root_dirs: Vec<String>,
    /// File extensions (no leading dot, matched ASCII case-insensitively) that
    /// are treated as binary.
    pub skip_extensions: Vec<String>,
    /// When `true`, binary files are captured Base64-encoded instead of skipped.
    pub include_binaries: bool,
}

impl Default for CaptureConfig {
    /// Defaults: 8 MiB per file, 100 MiB in total, common dependency / cache /
    /// build-output directories and common binary extensions excluded, and
    /// binaries not included.
    fn default() -> Self {
        const MIB: usize = 1024 * 1024;

        /// Turns a list of literals into owned strings.
        fn owned(items: &[&str]) -> Vec<String> {
            items.iter().map(|item| (*item).to_owned()).collect()
        }

        let skip_dirs = owned(&[
            "node_modules",
            ".git",
            "vendor",
            ".venv",
            "venv",
            "__pycache__",
            ".pytest_cache",
            ".idea",
            ".vscode",
            "Pods",
            ".terraform",
            ".direnv",
            ".nuxt",
            ".next",
            ".svelte-kit",
            ".turbo",
            ".parcel-cache",
        ]);
        let skip_root_dirs = owned(&["target", "dist", "build", "out", "_build"]);
        let skip_extensions = owned(&[
            "png", "jpg", "jpeg", "gif", "ico", "svg", //
            "pdf", "zip", "tar", "gz", "bz2", "xz", //
            "exe", "dll", "so", "dylib", "a", "o", //
            "wasm", "class", "jar", "war", //
            "mp3", "mp4", "mov", "wav", "ogg", //
            "ttf", "woff", "woff2", "eot",
        ]);

        Self {
            max_file_bytes: 8 * MIB,
            max_total_bytes: 100 * MIB,
            skip_dirs,
            skip_root_dirs,
            skip_extensions,
            include_binaries: false,
        }
    }
}
/// Captures the directory tree rooted at `root` according to `cfg`.
///
/// Returns a [`CaptureReport`] holding the captured text files, symlinks and
/// (when enabled) binaries, together with counters for everything skipped.
///
/// # Errors
///
/// Returns an error when `root` itself cannot be listed (for example because it
/// does not exist or is not a directory). Unreadable entries *below* the root
/// are skipped on a best-effort basis and do not fail the capture.
pub fn capture(root: &Path, cfg: &CaptureConfig) -> Result<CaptureReport> {
    // `walk_dir` silently ignores directories it cannot read. That is the right
    // call for a nested directory, but for the root it would turn a mistyped or
    // missing path into an empty, successful-looking report — so probe it here.
    std::fs::read_dir(root).map_err(|e| {
        std::io::Error::new(
            e.kind(),
            format!("cannot read capture root {}: {e}", root.display()),
        )
    })?;

    let mut report = CaptureReport::default();
    walk_dir(root, root, cfg, &mut report)?;
    Ok(report)
}
/// Everything a capture run collected, plus counters for what it left out.
#[derive(Debug, Default, Clone)]
pub struct CaptureReport {
    /// Text files that were captured.
    pub files: Vec<CapturedFile>,
    /// Symbolic links that were recorded.
    pub symlinks: Vec<CapturedSymlink>,
    /// Binary files that were captured as Base64.
    pub binaries: Vec<CapturedBinary>,
    /// Combined size in bytes of all captured files and binaries.
    pub total_bytes: usize,
    /// Number of files skipped because they were treated as binary.
    pub skipped_binary: usize,
    /// Number of files skipped for exceeding the per-file size limit.
    pub skipped_too_large: usize,
    /// Number of files skipped because they would have exceeded the total size limit.
    pub skipped_total_cap: usize,
    /// Number of directories skipped because their name is on a skip list.
    pub skipped_dir: usize,
}
/// Recursively walks `dir` and appends everything capturable to `report`.
///
/// * `root` — the capture root; recorded paths are made relative to it and
///   `cfg.skip_root_dirs` only applies to directories directly beneath it.
/// * `dir` — the directory to list on this call.
/// * `cfg` — size limits, skip lists and the binary-inclusion switch.
/// * `report` — accumulator for captured entries and skip counters.
///
/// The walk is best-effort: directories that cannot be listed and entries whose
/// metadata or contents cannot be read are silently passed over. Symlinks are
/// recorded but never followed. As written, the function always returns
/// `Ok(())`.
fn walk_dir(
    root: &Path,
    dir: &Path,
    cfg: &CaptureConfig,
    report: &mut CaptureReport,
) -> Result<()> {
    let entries = match std::fs::read_dir(dir) {
        Ok(entries) => entries,
        // Best-effort: an unreadable directory is skipped, not fatal.
        Err(_) => return Ok(()),
    };

    // Paths are stored relative to `root` with `/` separators on every platform.
    let rel_path = |p: &Path| -> String {
        p.strip_prefix(root)
            .unwrap_or(p)
            .to_string_lossy()
            .replace('\\', "/")
    };

    for entry in entries.flatten() {
        let path = entry.path();

        // Record symlinks instead of following them.
        if entry.file_type().map(|t| t.is_symlink()).unwrap_or(false) {
            if let Ok(target) = std::fs::read_link(&path) {
                report.symlinks.push(CapturedSymlink {
                    path: rel_path(&path),
                    target: target.to_string_lossy().into_owned(),
                });
            }
            continue;
        }

        let meta = match entry.metadata() {
            Ok(meta) => meta,
            Err(_) => continue,
        };

        if meta.is_dir() {
            let file_name = entry.file_name();
            let lossy = file_name.to_string_lossy();
            let name: &str = &lossy;
            let listed = |names: &[String]| names.iter().any(|s| s.as_str() == name);
            // `skip_dirs` applies at any depth, `skip_root_dirs` only directly under `root`.
            if listed(&cfg.skip_dirs) || (dir == root && listed(&cfg.skip_root_dirs)) {
                report.skipped_dir += 1;
                continue;
            }
            walk_dir(root, &path, cfg, report)?;
            continue;
        }
        if !meta.is_file() {
            continue;
        }

        // Cheap pre-filter on the size the filesystem reports, so oversized
        // files are never read. Compared as `u64` so a file larger than
        // `usize::MAX` bytes cannot wrap to a small value on 32-bit targets.
        let reported = meta.len();
        if reported > cfg.max_file_bytes as u64 {
            report.skipped_too_large += 1;
            continue;
        }
        if (report.total_bytes as u64).saturating_add(reported) > cfg.max_total_bytes as u64 {
            report.skipped_total_cap += 1;
            continue;
        }

        let is_skip_ext = path
            .extension()
            .and_then(|e| e.to_str())
            .map(|ext| cfg.skip_extensions.iter().any(|s| s.eq_ignore_ascii_case(ext)))
            .unwrap_or(false);
        if is_skip_ext && !cfg.include_binaries {
            // Known-binary extension and binaries are not wanted: skip without reading.
            report.skipped_binary += 1;
            continue;
        }

        let bytes = match std::fs::read(&path) {
            Ok(bytes) => bytes,
            Err(_) => continue,
        };

        // The file may have changed between `metadata()` and `read()`; enforce
        // the limits on (and record) the length that was actually read.
        let size = bytes.len();
        if size > cfg.max_file_bytes {
            report.skipped_too_large += 1;
            continue;
        }
        if report.total_bytes.saturating_add(size) > cfg.max_total_bytes {
            report.skipped_total_cap += 1;
            continue;
        }

        // Text means: not a skip-listed extension, no NUL byte, valid UTF-8.
        // `from_utf8` takes ownership and hands the buffer back on failure, so
        // the contents are never copied.
        let decoded: std::result::Result<String, Vec<u8>> =
            if is_skip_ext || bytes.contains(&0u8) {
                Err(bytes)
            } else {
                String::from_utf8(bytes).map_err(|e| e.into_bytes())
            };

        match decoded {
            Ok(body) => {
                let sha256 = format!("{:x}", Sha256::digest(body.as_bytes()));
                report.total_bytes += size;
                report.files.push(CapturedFile {
                    path: rel_path(&path),
                    sha256,
                    size,
                    body,
                });
            }
            Err(raw) => {
                if !cfg.include_binaries {
                    report.skipped_binary += 1;
                    continue;
                }
                use base64::Engine;
                let encoded = base64::engine::general_purpose::STANDARD.encode(&raw);
                let sha256 = format!("{:x}", Sha256::digest(raw.as_slice()));
                report.total_bytes += size;
                report.binaries.push(CapturedBinary {
                    path: rel_path(&path),
                    sha256,
                    size,
                    base64: encoded,
                });
            }
        }
    }
    Ok(())
}
/// Returns the SHA-256 digest of `bytes` as a lowercase hexadecimal string.
pub fn hash_bytes(bytes: &[u8]) -> String {
    let digest = Sha256::digest(bytes);
    format!("{:x}", digest)
}
/// Writes each captured text file beneath `out`, creating parent directories
/// as needed, and returns the paths written, in input order.
///
/// # Errors
///
/// * An `InvalidInput` I/O error if an entry's `path` is absolute or contains
///   `..` (or any other non-plain component), since writing it would land
///   outside `out`.
/// * Any I/O error from creating directories or writing a file.
///
/// Processing stops at the first error; files written before it stay on disk.
pub fn restore(out: &Path, files: &[CapturedFile]) -> Result<Vec<PathBuf>> {
    let mut written = Vec::with_capacity(files.len());
    for f in files {
        // `Path::join` discards `out` entirely when given an absolute path and
        // keeps `..` segments verbatim, so only plain relative paths are safe.
        let escapes = Path::new(&f.path).components().any(|part| {
            !matches!(
                part,
                std::path::Component::Normal(_) | std::path::Component::CurDir
            )
        });
        if escapes {
            return Err(std::io::Error::new(
                std::io::ErrorKind::InvalidInput,
                format!(
                    "refusing to restore {:?}: path must stay inside the output directory",
                    f.path
                ),
            )
            .into());
        }

        let dest = out.join(&f.path);
        if let Some(parent) = dest.parent() {
            std::fs::create_dir_all(parent)?;
        }
        std::fs::write(&dest, f.body.as_bytes())?;
        written.push(dest);
    }
    Ok(written)
}
/// Decodes each captured binary and writes it beneath `out`, creating parent
/// directories as needed, and returns the paths written, in input order.
///
/// # Errors
///
/// * An `InvalidInput` I/O error if an entry's `path` is absolute or contains
///   `..` (or any other non-plain component), since writing it would land
///   outside `out`.
/// * An error if an entry's `base64` payload does not decode.
/// * Any I/O error from creating directories or writing a file.
///
/// Processing stops at the first error; files written before it stay on disk.
pub fn restore_binaries(out: &Path, binaries: &[CapturedBinary]) -> Result<Vec<PathBuf>> {
    use base64::Engine;
    let mut written = Vec::with_capacity(binaries.len());
    for b in binaries {
        // `Path::join` discards `out` entirely when given an absolute path and
        // keeps `..` segments verbatim, so only plain relative paths are safe.
        let escapes = Path::new(&b.path).components().any(|part| {
            !matches!(
                part,
                std::path::Component::Normal(_) | std::path::Component::CurDir
            )
        });
        if escapes {
            return Err(std::io::Error::new(
                std::io::ErrorKind::InvalidInput,
                format!(
                    "refusing to restore {:?}: path must stay inside the output directory",
                    b.path
                ),
            )
            .into());
        }

        // Decode before touching the filesystem so a corrupt payload does not
        // leave freshly created, empty parent directories behind.
        let bytes = base64::engine::general_purpose::STANDARD
            .decode(&b.base64)
            .map_err(|e| anyhow!("base64 decode {}: {e}", b.path))?;

        let dest = out.join(&b.path);
        if let Some(parent) = dest.parent() {
            std::fs::create_dir_all(parent)?;
        }
        std::fs::write(&dest, &bytes)?;
        written.push(dest);
    }
    Ok(written)
}
/// Recreates each captured symlink beneath `out` and returns the link paths
/// that were created, in input order.
///
/// A link whose destination already exists — as a file, a directory, or
/// another symlink, including a dangling one — is left untouched and is not
/// reported. On Windows, link creation is best-effort: a link that cannot be
/// created is skipped rather than treated as an error.
///
/// NOTE: only the link's own location is validated. Its `target` is written
/// verbatim and may point anywhere, including outside `out`.
///
/// # Errors
///
/// * An `InvalidInput` I/O error if an entry's `path` is absolute or contains
///   `..` (or any other non-plain component), since the link would land
///   outside `out`.
/// * Any I/O error from creating parent directories, or (on Unix) from
///   creating the link.
pub fn restore_symlinks(out: &Path, symlinks: &[CapturedSymlink]) -> Result<Vec<PathBuf>> {
    let mut written = Vec::new();
    for s in symlinks {
        // `Path::join` discards `out` entirely when given an absolute path and
        // keeps `..` segments verbatim, so only plain relative paths are safe.
        let escapes = Path::new(&s.path).components().any(|part| {
            !matches!(
                part,
                std::path::Component::Normal(_) | std::path::Component::CurDir
            )
        });
        if escapes {
            return Err(std::io::Error::new(
                std::io::ErrorKind::InvalidInput,
                format!(
                    "refusing to restore {:?}: path must stay inside the output directory",
                    s.path
                ),
            )
            .into());
        }

        let dest = out.join(&s.path);
        // `Path::exists` follows symlinks and so reports `false` for a dangling
        // link that is already present; `symlink_metadata` inspects the entry
        // itself, which is what decides whether creating the link would clash.
        if dest.symlink_metadata().is_ok() {
            continue;
        }
        if let Some(parent) = dest.parent() {
            std::fs::create_dir_all(parent)?;
        }

        #[cfg(unix)]
        std::os::unix::fs::symlink(&s.target, &dest)?;

        #[cfg(windows)]
        {
            // Windows has separate file and directory links. A relative target
            // is relative to the link's own directory, so resolve it from there
            // to see what it currently points at.
            let resolved = dest
                .parent()
                .map(|parent| parent.join(&s.target))
                .unwrap_or_else(|| PathBuf::from(&s.target));
            let created = if resolved.is_file() {
                std::os::windows::fs::symlink_file(&s.target, &dest)
            } else {
                // Unknown or directory target: try a directory link first,
                // then fall back to a file link.
                std::os::windows::fs::symlink_dir(&s.target, &dest)
                    .or_else(|_| std::os::windows::fs::symlink_file(&s.target, &dest))
            };
            if created.is_err() {
                // Best-effort on Windows: do not report a link that was not created.
                continue;
            }
        }

        written.push(dest);
    }
    Ok(written)
}
#[cfg(test)]
mod tests {
    use super::*;
    use std::fs;

    /// Builds a throwaway directory tree holding the given
    /// `(relative path, contents)` pairs.
    fn scratch_tree(entries: &[(&str, &str)]) -> tempdir::TempDir {
        let tmp = tempdir::TempDir::new("fc").unwrap();
        for &(rel, contents) in entries {
            let target = tmp.path().join(rel);
            if let Some(parent) = target.parent() {
                fs::create_dir_all(parent).unwrap();
            }
            fs::write(&target, contents).unwrap();
        }
        tmp
    }

    /// Relative paths of the captured text files, in the order they were captured.
    fn text_paths(report: &CaptureReport) -> Vec<&str> {
        report.files.iter().map(|f| f.path.as_str()).collect()
    }

    #[test]
    fn captures_text_files_with_correct_sha256() {
        let tree = scratch_tree(&[
            ("Cargo.toml", "[package]\nname=\"x\"\n"),
            ("src/lib.rs", "fn main() {}"),
        ]);
        let cfg = CaptureConfig::default();
        let report = capture(tree.path(), &cfg).unwrap();

        assert_eq!(report.files.len(), 2);
        let paths = text_paths(&report);
        assert!(paths.contains(&"Cargo.toml"));
        assert!(paths.contains(&"src/lib.rs"));

        let manifest = report
            .files
            .iter()
            .find(|f| f.path == "Cargo.toml")
            .unwrap();
        assert_eq!(manifest.sha256, hash_bytes(b"[package]\nname=\"x\"\n"));
    }

    #[test]
    fn skips_target_node_modules_git_dirs() {
        let tree = scratch_tree(&[
            ("src/lib.rs", "// real"),
            ("target/debug/something.rs", "// build artifact"),
            ("node_modules/foo/index.js", "// dep"),
            (".git/HEAD", "ref: refs/heads/main"),
        ]);
        let report = capture(tree.path(), &CaptureConfig::default()).unwrap();

        assert_eq!(text_paths(&report), vec!["src/lib.rs"]);
        assert!(report.skipped_dir >= 3);
    }

    #[test]
    fn captures_hidden_files_but_skips_hidden_dirs() {
        let tree = scratch_tree(&[
            ("src/lib.rs", "// real"),
            (".gitignore", "target/\n*.log\n"),
            (".editorconfig", "[*]\nindent_style = space\n"),
            (".git/HEAD", "ref: refs/heads/main"),
        ]);
        let report = capture(tree.path(), &CaptureConfig::default()).unwrap();

        // Directory listing order is not guaranteed, so compare sorted.
        let mut paths = text_paths(&report);
        paths.sort();
        assert_eq!(paths, vec![".editorconfig", ".gitignore", "src/lib.rs"]);
    }

    #[test]
    fn skips_binary_extensions_without_reading() {
        let tree = scratch_tree(&[
            ("logo.png", "fake png bytes"),
            ("README.md", "real text"),
        ]);
        let report = capture(tree.path(), &CaptureConfig::default()).unwrap();

        assert_eq!(text_paths(&report), vec!["README.md"]);
        assert!(report.skipped_binary >= 1);
    }

    #[test]
    fn skips_files_with_nul_bytes() {
        let tree = scratch_tree(&[
            ("data.dat", "before\0after"),
            ("README.md", "real"),
        ]);
        let report = capture(tree.path(), &CaptureConfig::default()).unwrap();

        assert_eq!(text_paths(&report), vec!["README.md"]);
    }

    #[test]
    fn restore_writes_byte_identical_files() {
        let tree = scratch_tree(&[
            ("a/b.txt", "hello world\n"),
            ("c.md", "# title\n\nbody"),
        ]);
        let report = capture(tree.path(), &CaptureConfig::default()).unwrap();

        let restore_dir = tempdir::TempDir::new("restore").unwrap();
        let written = restore(restore_dir.path(), &report.files).unwrap();
        assert_eq!(written.len(), 2);

        let nested = fs::read_to_string(restore_dir.path().join("a/b.txt")).unwrap();
        let top_level = fs::read_to_string(restore_dir.path().join("c.md")).unwrap();
        assert_eq!(nested, "hello world\n");
        assert_eq!(top_level, "# title\n\nbody");
    }

    #[test]
    fn per_file_size_cap_skips_large_files() {
        let oversized = "x".repeat(2_000_000);
        let tree = scratch_tree(&[
            ("small.txt", "tiny"),
            ("big.txt", oversized.as_str()),
        ]);
        let cfg = CaptureConfig {
            max_file_bytes: 1024,
            ..CaptureConfig::default()
        };
        let report = capture(tree.path(), &cfg).unwrap();

        assert_eq!(report.files.len(), 1);
        assert_eq!(report.skipped_too_large, 1);
    }
}