#![cfg(any(feature = "rag", feature = "rag-sqlite"))]
use std::collections::hash_map::DefaultHasher;
use std::collections::HashMap;
use std::hash::{Hash, Hasher};
use std::path::{Path, PathBuf};
use std::time::{SystemTime, UNIX_EPOCH};
use serde::{Deserialize, Serialize};
use super::{RagError, RagResult};
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct DocumentTracker {
entries: HashMap<String, TrackedDocument>,
#[serde(skip)]
state_path: Option<PathBuf>,
}
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct TrackedDocument {
pub source_uri: String,
pub content_hash: String,
pub chunk_ids: Vec<i64>,
pub last_indexed: String, pub file_modified: Option<String>,
pub file_size: Option<u64>,
}
#[derive(Debug, Clone, Default)]
pub struct SyncResult {
pub added: usize,
pub updated: usize,
pub deleted: usize,
pub unchanged: usize,
pub errors: Vec<(String, String)>,
}
#[derive(Debug, Clone)]
pub enum SourceChange {
New { uri: String, content: String },
Modified {
uri: String,
content: String,
old_chunk_ids: Vec<i64>,
},
Deleted { uri: String, old_chunk_ids: Vec<i64> },
Unchanged { uri: String },
}
impl DocumentTracker {
pub fn new() -> Self {
Self {
entries: HashMap::new(),
state_path: None,
}
}
pub fn with_state_file(path: impl AsRef<Path>) -> Self {
Self {
entries: HashMap::new(),
state_path: Some(path.as_ref().to_path_buf()),
}
}
pub fn load(&mut self) -> RagResult<()> {
let path = match &self.state_path {
Some(p) => p,
None => return Ok(()),
};
let content = std::fs::read_to_string(path)
.map_err(|e| RagError::ConfigError(format!("Failed to read tracker state: {}", e)))?;
let loaded: DocumentTracker = serde_json::from_str(&content)
.map_err(|e| RagError::SerializationError(format!("Failed to parse tracker state: {}", e)))?;
self.entries = loaded.entries;
Ok(())
}
pub fn save(&self) -> RagResult<()> {
let path = match &self.state_path {
Some(p) => p,
None => return Ok(()),
};
let content = serde_json::to_string_pretty(self)
.map_err(|e| RagError::SerializationError(format!("Failed to serialize tracker state: {}", e)))?;
std::fs::write(path, content)
.map_err(|e| RagError::ConfigError(format!("Failed to write tracker state: {}", e)))?;
Ok(())
}
pub fn content_hash(content: &str) -> String {
let mut hasher = DefaultHasher::new();
content.hash(&mut hasher);
format!("{:016x}", hasher.finish())
}
pub fn detect_changes(
&self,
dir: &Path,
pattern: Option<&str>,
recursive: bool,
) -> Vec<SourceChange> {
let files = collect_files(dir, pattern, recursive);
let mut changes = Vec::new();
let mut seen_uris: std::collections::HashSet<String> = std::collections::HashSet::new();
for path in &files {
let uri = path.to_string_lossy().to_string();
seen_uris.insert(uri.clone());
let change = match std::fs::read_to_string(path) {
Ok(content) => self.detect_file_change_with_content(path, &content),
Err(_) => continue, };
changes.push(change);
}
for (tracked_uri, doc) in &self.entries {
if !seen_uris.contains(tracked_uri) {
changes.push(SourceChange::Deleted {
uri: tracked_uri.clone(),
old_chunk_ids: doc.chunk_ids.clone(),
});
}
}
changes
}
pub fn detect_file_change(&self, path: &Path) -> SourceChange {
let content = std::fs::read_to_string(path).unwrap_or_default();
self.detect_file_change_with_content(path, &content)
}
fn detect_file_change_with_content(&self, path: &Path, content: &str) -> SourceChange {
let uri = path.to_string_lossy().to_string();
let current_hash = Self::content_hash(content);
match self.entries.get(&uri) {
Some(doc) => {
if doc.content_hash == current_hash {
SourceChange::Unchanged { uri }
} else {
SourceChange::Modified {
uri,
content: content.to_string(),
old_chunk_ids: doc.chunk_ids.clone(),
}
}
}
None => SourceChange::New {
uri,
content: content.to_string(),
},
}
}
pub fn record_indexed(&mut self, source_uri: &str, content_hash: &str, chunk_ids: Vec<i64>) {
let last_indexed = iso8601_now();
let entry = TrackedDocument {
source_uri: source_uri.to_string(),
content_hash: content_hash.to_string(),
chunk_ids,
last_indexed,
file_modified: None,
file_size: None,
};
self.entries.insert(source_uri.to_string(), entry);
}
pub fn record_deleted(&mut self, source_uri: &str) {
self.entries.remove(source_uri);
}
pub fn get(&self, source_uri: &str) -> Option<&TrackedDocument> {
self.entries.get(source_uri)
}
pub fn tracked_sources(&self) -> Vec<&str> {
self.entries.keys().map(|s| s.as_str()).collect()
}
pub fn chunk_ids(&self, source_uri: &str) -> Option<&[i64]> {
self.entries.get(source_uri).map(|d| d.chunk_ids.as_slice())
}
pub fn len(&self) -> usize {
self.entries.len()
}
pub fn is_empty(&self) -> bool {
self.entries.is_empty()
}
}
impl Default for DocumentTracker {
fn default() -> Self {
Self::new()
}
}
fn collect_files(dir: &Path, pattern: Option<&str>, recursive: bool) -> Vec<PathBuf> {
let mut files = Vec::new();
fn visit_dir(dir: &Path, files: &mut Vec<PathBuf>, pattern: Option<&str>, recursive: bool) -> std::io::Result<()> {
for entry in std::fs::read_dir(dir)? {
let entry = entry?;
let path = entry.path();
if path.is_dir() && recursive {
visit_dir(&path, files, pattern, recursive)?;
} else if path.is_file() {
let path_str = path.to_string_lossy();
let matches = match pattern {
Some(p) => glob::Pattern::new(p).map(|pat| pat.matches(&path_str)).unwrap_or(false),
None => true,
};
if matches {
files.push(path);
}
}
}
Ok(())
}
let _ = visit_dir(dir, &mut files, pattern, recursive);
files
}
fn iso8601_now() -> String {
let duration = SystemTime::now()
.duration_since(UNIX_EPOCH)
.unwrap_or_default();
let secs = duration.as_secs();
format!("{}", secs)
}
#[cfg(test)]
mod tests {
use super::*;
use tempfile::TempDir;
#[test]
fn test_content_hash_deterministic() {
let content = "hello world";
let h1 = DocumentTracker::content_hash(content);
let h2 = DocumentTracker::content_hash(content);
assert_eq!(h1, h2);
}
#[test]
fn test_content_hash_different() {
let h1 = DocumentTracker::content_hash("hello");
let h2 = DocumentTracker::content_hash("world");
assert_ne!(h1, h2);
}
#[test]
fn test_document_tracker_basic() {
let mut tracker = DocumentTracker::new();
assert!(tracker.is_empty());
tracker.record_indexed("file:///a.txt", "abc123", vec![1, 2, 3]);
assert_eq!(tracker.len(), 1);
let doc = tracker.get("file:///a.txt").unwrap();
assert_eq!(doc.content_hash, "abc123");
assert_eq!(doc.chunk_ids, vec![1, 2, 3]);
assert_eq!(tracker.chunk_ids("file:///a.txt"), Some([1i64, 2, 3].as_slice()));
tracker.record_deleted("file:///a.txt");
assert!(tracker.is_empty());
assert!(tracker.get("file:///a.txt").is_none());
}
#[test]
fn test_detect_changes_with_tempdir() {
let temp = TempDir::new().unwrap();
let dir = temp.path();
let f1 = dir.join("a.txt");
let f2 = dir.join("b.txt");
std::fs::write(&f1, "content a").unwrap();
std::fs::write(&f2, "content b").unwrap();
let mut tracker = DocumentTracker::new();
let changes = tracker.detect_changes(dir, None, false);
let new_count = changes.iter().filter(|c| matches!(c, SourceChange::New { .. })).count();
assert_eq!(new_count, 2, "expected 2 new files, got {:?}", changes);
tracker.record_indexed(&f1.to_string_lossy(), &DocumentTracker::content_hash("content a"), vec![1]);
tracker.record_indexed(&f2.to_string_lossy(), &DocumentTracker::content_hash("content b"), vec![2]);
std::fs::write(&f1, "content a modified").unwrap();
let changes2 = tracker.detect_changes(dir, None, false);
let modified = changes2.iter().find(|c| matches!(c, SourceChange::Modified { .. }));
assert!(modified.is_some(), "expected modified file, got {:?}", changes2);
let unchanged = changes2.iter().find(|c| matches!(c, SourceChange::Unchanged { .. }));
assert!(unchanged.is_some(), "expected unchanged file (b.txt), got {:?}", changes2);
}
#[test]
fn test_sync_result_defaults() {
let result = SyncResult::default();
assert_eq!(result.added, 0);
assert_eq!(result.updated, 0);
assert_eq!(result.deleted, 0);
assert_eq!(result.unchanged, 0);
assert!(result.errors.is_empty());
}
#[test]
fn test_tracker_save_load() {
let temp = TempDir::new().unwrap();
let state_path = temp.path().join("tracker.json");
let mut tracker = DocumentTracker::with_state_file(&state_path);
tracker.record_indexed("file:///x.txt", "hash1", vec![10, 20]);
tracker.record_indexed("file:///y.txt", "hash2", vec![30]);
tracker.save().unwrap();
let mut loaded = DocumentTracker::with_state_file(&state_path);
loaded.load().unwrap();
assert_eq!(loaded.len(), 2);
assert_eq!(loaded.get("file:///x.txt").unwrap().content_hash, "hash1");
assert_eq!(loaded.get("file:///x.txt").unwrap().chunk_ids, vec![10, 20]);
assert_eq!(loaded.get("file:///y.txt").unwrap().chunk_ids, vec![30]);
}
}