use crate::error::Result;
use crate::merkle::FileEntryHashBuilder;
use crate::types::{ChangeStats, FileEntry, FileManifest, ProgressInfo};
use crate::utils;
use chrono::Utc;
use ignore::{WalkBuilder, WalkState, overrides::OverrideBuilder};
use rayon::prelude::*;
use std::path::{Path, PathBuf};
use std::sync::Arc;
use parking_lot::Mutex;
use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering};
use std::time::Instant;
use crate::collections::{HashMap, HashSet, HashSetExt};
use tracing::{debug, trace, warn};
/// Scans a directory tree and detects file-level changes between scans.
///
/// Construct with [`FileTracker::new`] and configure via the builder-style
/// `with_*` methods; `scan_directory` produces the entry list and
/// `detect_changes` diffs the current tree against a previous manifest.
#[derive(Debug)]
pub struct FileTracker {
/// Root directory being tracked; all entry paths are stored relative to it.
root_path: PathBuf,
/// Extra ignore patterns (gitignore syntax) applied via walker overrides,
/// in addition to any `.gitignore` files found in the tree.
ignore_patterns: Vec<String>,
/// Maximum size in bytes for a regular file to be tracked; 0 disables the limit.
max_file_size: u64,
/// Whether the directory walker follows symbolic links.
follow_symlinks: bool,
/// Number of threads used by the parallel directory walker; always >= 1.
parallel_workers: usize,
}
impl FileTracker {
    /// Create a tracker rooted at `root_path` with default settings:
    /// no extra ignore patterns, no file-size limit, symlinks not followed,
    /// and one walker thread per logical CPU.
    pub fn new(root_path: PathBuf) -> Self {
        Self {
            root_path,
            ignore_patterns: Vec::new(),
            max_file_size: 0, // 0 means "no limit" (see process_file_entry)
            follow_symlinks: false,
            parallel_workers: num_cpus::get(),
        }
    }

    /// Builder: replace the set of custom ignore patterns (gitignore syntax).
    pub fn with_ignore_patterns(mut self, patterns: Vec<String>) -> Self {
        self.ignore_patterns = patterns;
        self
    }

    /// Builder: skip regular files larger than `size` bytes (0 disables the limit).
    pub fn with_max_file_size(mut self, size: u64) -> Self {
        self.max_file_size = size;
        self
    }

    /// Builder: whether the walker follows symbolic links.
    pub fn with_follow_symlinks(mut self, follow: bool) -> Self {
        self.follow_symlinks = follow;
        self
    }

    /// Builder: number of walker threads, clamped to at least 1.
    pub fn with_parallel_workers(mut self, workers: usize) -> Self {
        self.parallel_workers = workers.max(1);
        self
    }

    /// Walk the tracked tree and return a path-sorted list of [`FileEntry`]
    /// values for every file, symlink, and *empty* directory.
    ///
    /// `.gitignore` files are honored, the custom ignore patterns are applied
    /// as overrides, and the `.titor` bookkeeping directory is always
    /// excluded. Directories that contain files are represented implicitly by
    /// their children and are not emitted as entries themselves.
    ///
    /// `progress_callback`, when provided, is invoked once per accepted entry
    /// with running counts and byte totals.
    ///
    /// # Errors
    /// Returns an error if an accepted entry cannot be hashed or made
    /// relative to the root. Walk errors and unreadable entries are logged
    /// and skipped instead of aborting the scan.
    pub fn scan_directory<F>(&self, progress_callback: Option<F>) -> Result<Vec<FileEntry>>
    where
        F: Fn(ProgressInfo) + Send + Sync,
    {
        let start = Instant::now();
        let processed_count = Arc::new(AtomicUsize::new(0));
        let total_size = Arc::new(AtomicU64::new(0));
        // Directories observed to contain at least one file; these are later
        // filtered out so only empty directories appear in the result.
        let directories_with_files = Arc::new(Mutex::new(HashSet::<PathBuf>::new()));
        let root_path = self.root_path.clone();
        let dirs_with_files_clone = Arc::clone(&directories_with_files);

        let mut walker_builder = WalkBuilder::new(&self.root_path);
        walker_builder
            .follow_links(self.follow_symlinks)
            .hidden(false)
            .parents(true)
            .ignore(true)
            .git_ignore(true)
            .git_global(false)
            .git_exclude(false)
            .require_git(false)
            .threads(self.parallel_workers);

        // Overrides: always hide the .titor directory, then apply the
        // user-supplied patterns. OverrideBuilder inverts gitignore semantics
        // (a leading '!' means "ignore"), hence the prefix toggling below.
        let mut override_builder = OverrideBuilder::new(&self.root_path);
        override_builder.add("!.titor/**").ok();
        override_builder.add("!.titor/").ok();
        override_builder.add("!.titor").ok();
        for pattern in &self.ignore_patterns {
            let final_pattern = if pattern.starts_with('!') {
                pattern[1..].to_string()
            } else {
                format!("!{}", pattern)
            };
            if let Err(e) = override_builder.add(&final_pattern) {
                // Invalid patterns are reported and skipped, not fatal.
                warn!("Invalid ignore pattern '{}': {}", pattern, e);
            }
        }
        if let Ok(overrides) = override_builder.build() {
            walker_builder.overrides(overrides);
        }

        // Phase 1: the parallel walk only collects (path, is_dir) pairs and
        // marks non-empty directories; hashing happens in phase 2.
        let paths_to_process = Arc::new(Mutex::new(Vec::<(PathBuf, bool)>::new()));
        walker_builder.build_parallel().run(|| {
            let paths_to_process = Arc::clone(&paths_to_process);
            let root_path = root_path.clone();
            let dirs_with_files = Arc::clone(&dirs_with_files_clone);
            Box::new(move |entry_result| {
                match entry_result {
                    Ok(entry) => {
                        let path = entry.path();
                        // Ignore-control files are never tracked themselves.
                        if let Some(file_name) = path.file_name() {
                            let file_name_str = file_name.to_string_lossy();
                            if file_name_str == ".gitignore" || file_name_str == ".titor_ignore" {
                                return WalkState::Continue;
                            }
                        }
                        let is_dir = entry.file_type()
                            .map(|ft| ft.is_dir())
                            .unwrap_or(false);
                        if !is_dir {
                            // Mark every ancestor below the root as non-empty.
                            let mut parent = path.parent();
                            while let Some(p) = parent {
                                if p == root_path {
                                    break;
                                }
                                dirs_with_files.lock().insert(p.to_path_buf());
                                parent = p.parent();
                            }
                        }
                        paths_to_process.lock().push((path.to_path_buf(), is_dir));
                    }
                    Err(e) => {
                        // Unreadable entries are logged and skipped.
                        warn!("Walk error: {}", e);
                    }
                }
                WalkState::Continue
            })
        });

        // Take ownership of the collected paths instead of cloning the whole
        // Vec out of the mutex; the walk is finished, so the lock is
        // uncontended and nothing else reads the buffer again.
        let paths_vec = std::mem::take(&mut *paths_to_process.lock());
        let root_path_for_processing = self.root_path.clone();
        // Phase 2: hash entries in parallel. Per-entry failures are logged
        // and dropped rather than aborting the scan.
        let file_entries: Vec<Option<FileEntry>> = paths_vec
            .par_iter()
            .map(|(path, is_directory)| {
                process_file_entry(path, &root_path_for_processing, self.max_file_size, *is_directory)
                    .unwrap_or_else(|e| {
                        warn!("Error processing entry {:?}: {}", path, e);
                        None
                    })
            })
            .collect();

        let mut final_entries = Vec::new();
        let dirs_with_files = directories_with_files.lock();
        for entry_opt in file_entries.into_iter() {
            if let Some(entry) = entry_opt {
                // Keep files/symlinks always; keep a directory only when no
                // file was found underneath it (i.e. it is empty).
                if !entry.is_directory || !dirs_with_files.contains(&self.root_path.join(&entry.path)) {
                    processed_count.fetch_add(1, Ordering::Relaxed);
                    total_size.fetch_add(entry.size, Ordering::Relaxed);
                    if let Some(ref callback) = progress_callback {
                        let info = ProgressInfo {
                            operation: "Scanning files".to_string(),
                            current_item: Some(entry.path.to_string_lossy().to_string()),
                            processed: processed_count.load(Ordering::Relaxed),
                            total: None,
                            bytes_processed: total_size.load(Ordering::Relaxed),
                            total_bytes: None,
                        };
                        callback(info);
                    }
                    final_entries.push(entry);
                }
            }
        }

        // Deterministic ordering so manifests and merkle roots are stable
        // across runs regardless of walker scheduling.
        final_entries.sort_by(|a, b| a.path.cmp(&b.path));
        let scan_duration = start.elapsed();
        debug!(
            "Scanned {} files/directories ({} bytes) in {:?}",
            final_entries.len(),
            total_size.load(Ordering::Relaxed),
            scan_duration
        );
        Ok(final_entries)
    }

    /// Rescan the tree and compare it against `old_manifest`, reporting
    /// added, modified, and deleted entries in a [`ChangeStats`].
    ///
    /// An entry counts as modified only when its *content* hash differs;
    /// metadata-only changes (permissions, mtime) are not reported here.
    ///
    /// # Errors
    /// Propagates any error from the underlying directory scan.
    pub fn detect_changes(&self, old_manifest: &FileManifest) -> Result<ChangeStats> {
        let start = Instant::now();
        let current_entries = self.scan_directory::<fn(ProgressInfo)>(None)?;
        let old_map = create_file_map(&old_manifest.files);
        // NOTE: the original text here was mojibake ("¤t_…" for "&current_…");
        // restored to valid references.
        let current_map = create_file_map(&current_entries);
        let mut stats = ChangeStats::default();
        // Additions and modifications: everything present now, checked
        // against the old manifest.
        for (path, current_entry) in &current_map {
            match old_map.get(path) {
                Some(old_entry) => {
                    if current_entry.content_hash != old_entry.content_hash {
                        stats.files_modified += 1;
                        stats.bytes_modified += current_entry.size;
                        stats.changed_files.push((*path).to_path_buf());
                    }
                }
                None => {
                    stats.files_added += 1;
                    stats.bytes_added += current_entry.size;
                    stats.changed_files.push((*path).to_path_buf());
                }
            }
        }
        // Deletions: present in the old manifest but missing now.
        for (path, old_entry) in &old_map {
            if !current_map.contains_key(path) {
                stats.files_deleted += 1;
                stats.bytes_deleted += old_entry.size;
                stats.changed_files.push((*path).to_path_buf());
            }
        }
        let detect_duration = start.elapsed();
        debug!(
            "Detected {} changes in {:?}",
            stats.total_operations(),
            detect_duration
        );
        Ok(stats)
    }
}
/// Build a [`FileEntry`] for a single walked path, or `Ok(None)` when the
/// path should be skipped (unreadable metadata, over the size cap, or the
/// root itself).
///
/// Content hashing depends on the entry kind: directories hash a synthetic
/// `"dir:<relative path>"` marker, symlinks hash their target path, and
/// regular files hash their contents.
///
/// # Errors
/// Fails if the path cannot be made relative to `root_path`, a symlink
/// target cannot be read, or file content cannot be hashed.
fn process_file_entry(
    path: &Path,
    root_path: &Path,
    max_file_size: u64,
    is_directory: bool,
) -> Result<Option<FileEntry>> {
    // Entries whose metadata cannot be read are skipped, not treated as errors.
    let meta = match utils::get_file_metadata(path) {
        Ok(m) => m,
        Err(e) => {
            trace!("Skipping entry {:?}: {}", path, e);
            return Ok(None);
        }
    };
    // Enforce the size cap (0 = unlimited) for non-directory entries only.
    let over_limit = !is_directory && max_file_size > 0 && meta.size > max_file_size;
    if over_limit {
        trace!("Skipping large file {:?} ({} bytes)", path, meta.size);
        return Ok(None);
    }
    // The root itself never becomes an entry.
    if path == root_path {
        return Ok(None);
    }
    let rel_path = utils::make_relative(path, root_path)?;
    let content_hash;
    let symlink_target;
    let size;
    if is_directory {
        content_hash = utils::hash_data(format!("dir:{}", rel_path.display()).as_bytes());
        symlink_target = None;
        size = 0u64;
    } else if meta.is_symlink {
        let target = utils::read_symlink(path)?;
        content_hash = utils::hash_data(target.to_string_lossy().as_bytes());
        symlink_target = Some(target);
        size = meta.size;
    } else {
        content_hash = utils::hash_file_content(path)?;
        symlink_target = None;
        size = meta.size;
    }
    // Combine content and metadata hashes so either kind of change is visible.
    let mut hasher = FileEntryHashBuilder::new();
    let metadata_hash = hasher.hash_metadata(
        meta.permissions,
        &meta.modified.into(),
    );
    let combined_hash = hasher.combined_hash(&content_hash, &metadata_hash);
    Ok(Some(FileEntry {
        path: rel_path,
        content_hash,
        size,
        permissions: meta.permissions,
        modified: meta.modified.into(),
        is_compressed: false,
        metadata_hash,
        combined_hash,
        is_symlink: meta.is_symlink,
        symlink_target,
        is_directory,
    }))
}
/// Assemble a [`FileManifest`] for `checkpoint_id` from scanned `entries`
/// and a precomputed `merkle_root`, stamped with the current UTC time.
pub fn create_manifest(
    checkpoint_id: String,
    entries: Vec<FileEntry>,
    merkle_root: String,
) -> FileManifest {
    // Derive the aggregates before `entries` is moved into the manifest.
    let file_count = entries.len();
    let total_size: u64 = entries.iter().map(|entry| entry.size).sum();
    FileManifest {
        checkpoint_id,
        files: entries,
        total_size,
        file_count,
        merkle_root,
        created_at: Utc::now(),
    }
}
/// Index `entries` by relative path for O(1) lookup; values borrow from the
/// input slice, so the map lives no longer than `entries`.
pub fn create_file_map<'a>(entries: &'a [FileEntry]) -> HashMap<&'a Path, &'a FileEntry> {
    entries
        .iter()
        .map(|entry| {
            let key = entry.path.as_path();
            (key, entry)
        })
        .collect()
}
#[cfg(test)]
mod tests {
use super::*;
use tempfile::TempDir;
use std::fs;
// Basic scan: three files across two levels. The non-empty `subdir` is
// represented by its child and must not appear as its own entry.
#[test]
fn test_file_tracker_scan() {
let temp_dir = TempDir::new().unwrap();
let root = temp_dir.path();
fs::write(root.join("file1.txt"), "content1").unwrap();
fs::write(root.join("file2.txt"), "content2").unwrap();
fs::create_dir(root.join("subdir")).unwrap();
fs::write(root.join("subdir/file3.txt"), "content3").unwrap();
let tracker = FileTracker::new(root.to_path_buf());
let entries = tracker.scan_directory::<fn(ProgressInfo)>(None).unwrap();
// Only the three files; no entry for `subdir` itself.
assert_eq!(entries.len(), 3);
assert!(entries.iter().any(|e| e.path == Path::new("file1.txt")));
assert!(entries.iter().any(|e| e.path == Path::new("file2.txt")));
assert!(entries.iter().any(|e| e.path == Path::new("subdir/file3.txt")));
}
// Custom patterns and .gitignore combine: *.tmp via with_ignore_patterns,
// *.log via the on-disk .gitignore; the .gitignore file itself is never
// listed, leaving only file.txt.
#[test]
fn test_file_tracker_ignore_patterns() {
let temp_dir = TempDir::new().unwrap();
let root = temp_dir.path();
fs::write(root.join("file.txt"), "content").unwrap();
fs::write(root.join("file.tmp"), "temp").unwrap();
fs::write(root.join("file.log"), "log").unwrap();
fs::write(root.join(".gitignore"), "*.tmp\n*.log").unwrap();
let tracker = FileTracker::new(root.to_path_buf())
.with_ignore_patterns(vec!["*.tmp".to_string()]);
let entries = tracker.scan_directory::<fn(ProgressInfo)>(None).unwrap();
assert_eq!(entries.len(), 1);
assert_eq!(entries[0].path, Path::new("file.txt"));
}
// Nested .gitignore files apply only within their own directory subtree:
// root ignores *.log/build//temp.txt, src ignores *.tmp, src/tests ignores
// *.json (but root-level config.json survives), docs ignores internal.md.
#[test]
fn test_nested_gitignore() {
let temp_dir = TempDir::new().unwrap();
let root = temp_dir.path();
fs::create_dir_all(root.join("src")).unwrap();
fs::create_dir_all(root.join("src/tests")).unwrap();
fs::create_dir_all(root.join("build")).unwrap();
fs::create_dir_all(root.join("docs")).unwrap();
fs::write(root.join("README.md"), "readme").unwrap();
fs::write(root.join("config.json"), "config").unwrap();
fs::write(root.join("debug.log"), "debug").unwrap();
fs::write(root.join("temp.txt"), "temp").unwrap();
fs::write(root.join("src/main.rs"), "main").unwrap();
fs::write(root.join("src/lib.rs"), "lib").unwrap();
fs::write(root.join("src/test.tmp"), "test temp").unwrap();
fs::write(root.join("src/tests/test1.rs"), "test1").unwrap();
fs::write(root.join("src/tests/test2.rs"), "test2").unwrap();
fs::write(root.join("src/tests/fixture.json"), "fixture").unwrap();
fs::write(root.join("build/output.exe"), "output").unwrap();
fs::write(root.join("build/cache.dat"), "cache").unwrap();
fs::write(root.join("docs/api.md"), "api").unwrap();
fs::write(root.join("docs/internal.md"), "internal").unwrap();
fs::write(root.join(".gitignore"), "*.log\nbuild/\ntemp.txt").unwrap();
fs::write(root.join("src/.gitignore"), "*.tmp").unwrap();
fs::write(root.join("src/tests/.gitignore"), "*.json").unwrap();
fs::write(root.join("docs/.gitignore"), "internal.md").unwrap();
let tracker = FileTracker::new(root.to_path_buf());
let entries = tracker.scan_directory::<fn(ProgressInfo)>(None).unwrap();
let paths: Vec<_> = entries.iter()
.map(|e| e.path.to_string_lossy().to_string())
.collect();
// Survivors: not matched by any .gitignore in scope.
assert!(paths.contains(&"README.md".to_string()));
assert!(paths.contains(&"config.json".to_string()));
assert!(paths.contains(&"src/main.rs".to_string()));
assert!(paths.contains(&"src/lib.rs".to_string()));
assert!(paths.contains(&"src/tests/test1.rs".to_string()));
assert!(paths.contains(&"src/tests/test2.rs".to_string()));
assert!(paths.contains(&"docs/api.md".to_string()));
// Excluded: each matched by the .gitignore of its own (or an ancestor) directory.
assert!(!paths.contains(&"debug.log".to_string())); assert!(!paths.contains(&"temp.txt".to_string())); assert!(!paths.contains(&"build/output.exe".to_string())); assert!(!paths.contains(&"build/cache.dat".to_string())); assert!(!paths.contains(&"src/test.tmp".to_string())); assert!(!paths.contains(&"src/tests/fixture.json".to_string())); assert!(!paths.contains(&"docs/internal.md".to_string()));
assert_eq!(entries.len(), 7);
}
// Deeper .gitignore files take precedence over shallower ones:
// root ignores test.txt, nested/ re-includes it with "!test.txt",
// nested/deep/ re-ignores it again.
#[test]
fn test_gitignore_precedence() {
let temp_dir = TempDir::new().unwrap();
let root = temp_dir.path();
fs::create_dir_all(root.join("nested/deep")).unwrap();
fs::write(root.join("test.txt"), "root test").unwrap();
fs::write(root.join("nested/test.txt"), "nested test").unwrap();
fs::write(root.join("nested/deep/test.txt"), "deep test").unwrap();
fs::write(root.join("nested/keep.txt"), "keep").unwrap();
fs::write(root.join(".gitignore"), "test.txt").unwrap();
fs::write(root.join("nested/.gitignore"), "!test.txt").unwrap();
fs::write(root.join("nested/deep/.gitignore"), "test.txt").unwrap();
let tracker = FileTracker::new(root.to_path_buf());
let entries = tracker.scan_directory::<fn(ProgressInfo)>(None).unwrap();
let paths: Vec<_> = entries.iter()
.map(|e| e.path.to_string_lossy().to_string())
.collect();
assert!(!paths.contains(&"test.txt".to_string()));
assert!(paths.contains(&"nested/test.txt".to_string()));
assert!(!paths.contains(&"nested/deep/test.txt".to_string()));
assert!(paths.contains(&"nested/keep.txt".to_string()));
}
// Custom patterns (walker overrides) and .gitignore exclusions both apply:
// exclude.txt via .gitignore, force_exclude.txt via with_ignore_patterns.
#[test]
fn test_custom_patterns_override() {
let temp_dir = TempDir::new().unwrap();
let root = temp_dir.path();
fs::write(root.join("include.txt"), "include").unwrap();
fs::write(root.join("exclude.txt"), "exclude").unwrap();
fs::write(root.join("force_exclude.txt"), "force").unwrap();
fs::write(root.join(".gitignore"), "exclude.txt").unwrap();
let tracker = FileTracker::new(root.to_path_buf())
.with_ignore_patterns(vec!["force_exclude.txt".to_string()]);
let entries = tracker.scan_directory::<fn(ProgressInfo)>(None).unwrap();
let paths: Vec<_> = entries.iter()
.map(|e| e.path.to_string_lossy().to_string())
.collect();
assert_eq!(entries.len(), 1);
assert!(paths.contains(&"include.txt".to_string()));
assert!(!paths.contains(&"exclude.txt".to_string()));
assert!(!paths.contains(&"force_exclude.txt".to_string()));
}
// End-to-end diff: after one modification, one deletion, and one addition,
// detect_changes against the first manifest reports exactly one of each.
#[test]
fn test_detect_changes() {
let temp_dir = TempDir::new().unwrap();
let root = temp_dir.path();
fs::write(root.join("file1.txt"), "content1").unwrap();
fs::write(root.join("file2.txt"), "content2").unwrap();
let tracker = FileTracker::new(root.to_path_buf());
let entries1 = tracker.scan_directory::<fn(ProgressInfo)>(None).unwrap();
let manifest1 = create_manifest(
"checkpoint1".to_string(),
entries1,
"merkle1".to_string(),
);
// Mutate the tree: modify file1, delete file2, add file3.
fs::write(root.join("file1.txt"), "modified").unwrap(); fs::remove_file(root.join("file2.txt")).unwrap(); fs::write(root.join("file3.txt"), "new").unwrap();
let changes = tracker.detect_changes(&manifest1).unwrap();
assert_eq!(changes.files_added, 1);
assert_eq!(changes.files_modified, 1);
assert_eq!(changes.files_deleted, 1);
assert_eq!(changes.total_operations(), 3);
}
}