Skip to main content

ripvec_core/cache/
manifest.rs

//! Manifest tracking indexed files with a Merkle-style hash tree for fast change detection.
//!
//! The manifest stores per-file metadata (mtime, size, content hash, chunk count)
//! and per-directory metadata (mtime, hash). Each directory hash is derived from the
//! content hashes of every file beneath that directory, so an unchanged directory
//! hash allows a diff to prune the whole subtree top-down.
6
7use std::collections::BTreeMap;
8use std::path::Path;
9
10use serde::{Deserialize, Serialize};
11
12/// Persistent manifest tracking all indexed files and their state.
13#[derive(Debug, Serialize, Deserialize)]
14pub struct Manifest {
15    /// Cache format version (bump to invalidate all caches).
16    pub version: u32,
17    /// The embedding model used to generate this index.
18    pub model_repo: String,
19    /// Root Merkle hash (changes when any file changes).
20    pub root_hash: String,
21    /// Directory entries with mtime and computed Merkle hash.
22    pub directories: BTreeMap<String, DirEntry>,
23    /// File entries with metadata and content hash.
24    pub files: BTreeMap<String, FileEntry>,
25}
26
27/// Directory metadata for Merkle tree change detection.
28#[derive(Debug, Serialize, Deserialize)]
29pub struct DirEntry {
30    /// Merkle hash computed from children's content hashes.
31    pub hash: String,
32    /// Last modification time (seconds since epoch).
33    pub mtime_secs: u64,
34}
35
36/// File metadata for change detection and cache lookup.
37#[derive(Debug, Serialize, Deserialize)]
38pub struct FileEntry {
39    /// Last modification time (seconds since epoch).
40    pub mtime_secs: u64,
41    /// File size in bytes.
42    pub size: u64,
43    /// Blake3 hash of the file content (used as object store key).
44    pub content_hash: String,
45    /// Number of chunks extracted from this file.
46    pub chunk_count: usize,
47}
48
/// Cache format version written into new manifests.
///
/// Bump this value to invalidate all existing caches.
pub const MANIFEST_VERSION: u32 = 2;
51
52impl Manifest {
53    /// Create a new empty manifest for the given model.
54    #[must_use]
55    pub fn new(model_repo: &str) -> Self {
56        Self {
57            version: MANIFEST_VERSION,
58            model_repo: model_repo.to_string(),
59            root_hash: String::new(),
60            directories: BTreeMap::new(),
61            files: BTreeMap::new(),
62        }
63    }
64
65    /// Register a file in the manifest.
66    pub fn add_file(
67        &mut self,
68        relative_path: &str,
69        mtime_secs: u64,
70        size: u64,
71        content_hash: &str,
72        chunk_count: usize,
73    ) {
74        self.files.insert(
75            relative_path.to_string(),
76            FileEntry {
77                mtime_secs,
78                size,
79                content_hash: content_hash.to_string(),
80                chunk_count,
81            },
82        );
83    }
84
85    /// Remove a file from the manifest.
86    pub fn remove_file(&mut self, relative_path: &str) {
87        self.files.remove(relative_path);
88    }
89
90    /// Recompute all directory hashes and the root hash bottom-up.
91    ///
92    /// For each directory, the hash is blake3 of the sorted children's content
93    /// hashes concatenated. The root hash is blake3 of all top-level entries.
94    pub fn recompute_hashes(&mut self) {
95        // Collect all directory paths from file paths
96        let mut dir_children: BTreeMap<String, Vec<String>> = BTreeMap::new();
97
98        for (file_path, entry) in &self.files {
99            let path = Path::new(file_path);
100            // Register hash with each ancestor directory
101            let mut current = path.parent();
102            while let Some(dir) = current {
103                let dir_str = dir.to_string_lossy().to_string();
104                if dir_str.is_empty() {
105                    break;
106                }
107                dir_children
108                    .entry(dir_str)
109                    .or_default()
110                    .push(entry.content_hash.clone());
111                current = dir.parent();
112            }
113            // Top-level files go into root
114            dir_children
115                .entry(String::new())
116                .or_default()
117                .push(entry.content_hash.clone());
118        }
119
120        // Compute directory hashes
121        self.directories.clear();
122        for (dir_path, child_hashes) in &mut dir_children {
123            if dir_path.is_empty() {
124                continue;
125            }
126            child_hashes.sort();
127            let combined = child_hashes.join("");
128            let hash = blake3::hash(combined.as_bytes()).to_hex().to_string();
129            self.directories.insert(
130                dir_path.clone(),
131                DirEntry {
132                    hash,
133                    mtime_secs: 0, // populated during diff, not recompute
134                },
135            );
136        }
137
138        // Root hash from all file content hashes
139        let mut all_hashes: Vec<&str> = self
140            .files
141            .values()
142            .map(|e| e.content_hash.as_str())
143            .collect();
144        all_hashes.sort_unstable();
145        let combined = all_hashes.join("");
146        self.root_hash = blake3::hash(combined.as_bytes()).to_hex().to_string();
147    }
148
149    /// Serialize to JSON string.
150    ///
151    /// # Errors
152    ///
153    /// Returns an error if serialization fails.
154    pub fn to_json(&self) -> crate::Result<String> {
155        serde_json::to_string_pretty(self)
156            .map_err(|e| crate::Error::Other(anyhow::anyhow!("manifest serialization: {e}")))
157    }
158
159    /// Deserialize from JSON string.
160    ///
161    /// # Errors
162    ///
163    /// Returns an error if the JSON is invalid.
164    pub fn from_json(json: &str) -> crate::Result<Self> {
165        serde_json::from_str(json)
166            .map_err(|e| crate::Error::Other(anyhow::anyhow!("manifest deserialization: {e}")))
167    }
168
169    /// Save manifest to a file.
170    ///
171    /// # Errors
172    ///
173    /// Returns an error if the file cannot be written.
174    pub fn save(&self, path: &Path) -> crate::Result<()> {
175        let json = self.to_json()?;
176        if let Some(parent) = path.parent() {
177            std::fs::create_dir_all(parent).map_err(|e| crate::Error::Io {
178                path: parent.display().to_string(),
179                source: e,
180            })?;
181        }
182        std::fs::write(path, json).map_err(|e| crate::Error::Io {
183            path: path.display().to_string(),
184            source: e,
185        })
186    }
187
188    /// Load manifest from a file.
189    ///
190    /// # Errors
191    ///
192    /// Returns an error if the file cannot be read or parsed.
193    pub fn load(path: &Path) -> crate::Result<Self> {
194        let json = std::fs::read_to_string(path).map_err(|e| crate::Error::Io {
195            path: path.display().to_string(),
196            source: e,
197        })?;
198        Self::from_json(&json)
199    }
200
201    /// Check whether this manifest is compatible with the given model.
202    #[must_use]
203    pub fn is_compatible(&self, model_repo: &str) -> bool {
204        self.version == MANIFEST_VERSION && self.model_repo == model_repo
205    }
206
207    /// Collect all content hashes referenced by files in this manifest.
208    #[must_use]
209    pub fn referenced_hashes(&self) -> Vec<String> {
210        self.files
211            .values()
212            .map(|e| e.content_hash.clone())
213            .collect()
214    }
215}
216
#[cfg(test)]
mod tests {
    use super::*;

    /// One file to register: `(path, mtime_secs, size, content_hash, chunk_count)`.
    type FileSpec<'a> = (&'a str, u64, u64, &'a str, usize);

    /// Build a manifest for `model` containing `files`, without recomputing hashes.
    fn manifest_with(model: &str, files: &[FileSpec<'_>]) -> Manifest {
        let mut manifest = Manifest::new(model);
        for &(path, mtime, size, hash, chunks) in files {
            manifest.add_file(path, mtime, size, hash, chunks);
        }
        manifest
    }

    /// Same as [`manifest_with`], followed by a hash recomputation.
    fn hashed_manifest(model: &str, files: &[FileSpec<'_>]) -> Manifest {
        let mut manifest = manifest_with(model, files);
        manifest.recompute_hashes();
        manifest
    }

    #[test]
    fn round_trip_json() {
        let original = hashed_manifest(
            "BAAI/bge-small-en-v1.5",
            &[
                ("src/main.rs", 1000, 4523, "abc123", 8),
                ("src/lib.rs", 1001, 2000, "def456", 5),
            ],
        );

        let encoded = original.to_json().unwrap();
        let decoded = Manifest::from_json(&encoded).unwrap();

        assert_eq!(decoded.files.len(), 2);
        assert_eq!(decoded.model_repo, "BAAI/bge-small-en-v1.5");
        assert!(!decoded.root_hash.is_empty());
    }

    #[test]
    fn root_hash_changes_on_file_change() {
        let before = hashed_manifest("model", &[("a.rs", 1000, 100, "hash1", 5)]);
        let after = hashed_manifest("model", &[("a.rs", 1001, 100, "hash2", 5)]);

        assert_ne!(before.root_hash, after.root_hash);
    }

    #[test]
    fn root_hash_stable_for_same_content() {
        let a = ("a.rs", 1000, 100, "hash1", 5);
        let b = ("b.rs", 1001, 200, "hash2", 3);

        // Insertion order must not influence the root hash.
        let forward = hashed_manifest("model", &[a, b]);
        let reversed = hashed_manifest("model", &[b, a]);

        assert_eq!(forward.root_hash, reversed.root_hash);
    }

    #[test]
    fn directory_hashes_computed() {
        let manifest = hashed_manifest(
            "model",
            &[
                ("src/main.rs", 1000, 100, "hash1", 5),
                ("src/lib.rs", 1001, 200, "hash2", 3),
                ("tests/test.rs", 1002, 300, "hash3", 2),
            ],
        );

        assert!(manifest.directories.contains_key("src"));
        assert!(manifest.directories.contains_key("tests"));
    }

    #[test]
    fn save_and_load() {
        let scratch = tempfile::TempDir::new().unwrap();
        let manifest_path = scratch.path().join("manifest.json");

        let saved = hashed_manifest("test-model", &[("foo.rs", 100, 50, "aaa", 1)]);
        saved.save(&manifest_path).unwrap();

        let restored = Manifest::load(&manifest_path).unwrap();
        assert_eq!(restored.files.len(), 1);
        assert_eq!(restored.root_hash, saved.root_hash);
    }

    #[test]
    fn is_compatible() {
        let manifest = Manifest::new("BAAI/bge-small-en-v1.5");

        assert!(manifest.is_compatible("BAAI/bge-small-en-v1.5"));
        assert!(!manifest.is_compatible("nomic-ai/CodeRankEmbed"));
    }

    #[test]
    fn referenced_hashes() {
        let manifest = manifest_with(
            "model",
            &[("a.rs", 1, 1, "hash_a", 1), ("b.rs", 2, 2, "hash_b", 2)],
        );

        let hashes = manifest.referenced_hashes();

        assert_eq!(hashes.len(), 2);
        assert!(hashes.contains(&"hash_a".to_string()));
        assert!(hashes.contains(&"hash_b".to_string()));
    }
}