// canon_core/document.rs

//! Document node representing an ingested file
//!
//! Per CP-001: Documents use content-based IDs for determinism.
//! `path_id` is added for filesystem change detection.
5
6use crate::text::normalize;
7use serde::{Deserialize, Serialize};
8use std::path::PathBuf;
9use uuid::Uuid;
10
11/// A document node in the cognitive graph
12///
13/// Represents a single ingested file with its content hash
14/// for change detection and deduplication.
15#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
16pub struct Document {
17    /// Unique identifier for this document (BLAKE3-16 of `content_hash`)
18    pub id: Uuid,
19
20    /// Path-based identifier for change detection (BLAKE3-16 of canonicalized path)
21    pub path_id: Uuid,
22
23    /// Original file path (relative to watched root)
24    pub path: PathBuf,
25
26    /// BLAKE3 hash of file contents (canonicalized)
27    pub hash: [u8; 32],
28
29    /// Merkle root of chunks (hierarchical hash)
30    pub hierarchical_hash: [u8; 32],
31
32    /// Last modification time (Unix timestamp)
33    pub mtime: i64,
34
35    /// File size in bytes
36    pub size: u64,
37
38    /// MIME type (e.g., "application/pdf", "text/markdown")
39    pub mime_type: String,
40
41    /// Arweave transaction ID if this document was synced from Arweave
42    #[serde(default, skip_serializing_if = "Option::is_none")]
43    pub arweave_tx: Option<String>,
44}
45
46impl Document {
47    pub fn new(path: PathBuf, content: &[u8], mtime: i64) -> Self {
48        let mime_type = mime_from_path(&path);
49
50        // Per CP-003: Canonicalize content before hashing for determinism
51        let text = String::from_utf8_lossy(content);
52        let canonical_content = normalize(&text);
53        let canonical_bytes = canonical_content.as_bytes();
54        let content_hash = blake3::hash(canonical_bytes);
55
56        // Per CP-001: ID is generated from content hash (content-based identity)
57        let mut id_bytes = [0u8; 16];
58        id_bytes.copy_from_slice(&content_hash.as_bytes()[0..16]);
59        let id = Uuid::from_bytes(id_bytes);
60
61        // Per CP-001: path_id is generated from canonicalized path (for change detection)
62        let path_str = path.to_string_lossy();
63        let canonical_path = normalize(&path_str);
64        let path_id_bytes = blake3::hash(canonical_path.as_bytes());
65        let mut path_id = [0u8; 16];
66        path_id.copy_from_slice(&path_id_bytes.as_bytes()[0..16]);
67        let path_id = Uuid::from_bytes(path_id);
68
69        Self {
70            id,
71            path_id,
72            path,
73            hash: *content_hash.as_bytes(),
74            hierarchical_hash: [0; 32], // Placeholder, computed after chunking
75            mtime,
76            size: content.len() as u64, // Original size for display
77            mime_type,
78            arweave_tx: None,
79        }
80    }
81
82    /// Update the hierarchical hash (Merkle root of chunks)
83    pub fn set_hierarchical_hash(&mut self, hash: [u8; 32]) {
84        self.hierarchical_hash = hash;
85    }
86
87    /// Compute Merkle hash from chunks for provable correctness.
88    /// Sorts chunk hashes before hashing so order doesn't affect the result.
89    pub fn compute_hierarchical_hash(chunk_hashes: &[[u8; 32]]) -> [u8; 32] {
90        let mut sorted = chunk_hashes.to_vec();
91        sorted.sort_unstable();
92        let mut section_hasher = blake3::Hasher::new();
93        for hash in &sorted {
94            section_hasher.update(hash);
95        }
96        *section_hasher.finalize().as_bytes()
97    }
98
99    /// Check if the document content has changed
100    pub fn content_changed(&self, new_content: &[u8]) -> bool {
101        let text = String::from_utf8_lossy(new_content);
102        let canonical = normalize(&text);
103        let new_hash = blake3::hash(canonical.as_bytes());
104        self.hash != *new_hash.as_bytes()
105    }
106
107    /// Get the document hash as a hex string
108    pub fn hash_hex(&self) -> String {
109        hex_encode(&self.hash)
110    }
111}
112
/// Infer MIME type from file extension.
///
/// The extension is compared ASCII case-insensitively, so `README.MD` and
/// `readme.md` both map to `text/markdown`. Paths with no extension, a
/// non-UTF-8 extension, or an unrecognized extension map to
/// `application/octet-stream`.
fn mime_from_path(path: &std::path::Path) -> String {
    let extension = path
        .extension()
        .and_then(|ext| ext.to_str())
        .map(str::to_ascii_lowercase);

    let mime = match extension.as_deref() {
        Some("md" | "markdown") => "text/markdown",
        Some("txt") => "text/plain",
        Some("pdf") => "application/pdf",
        Some("json") => "application/json",
        Some("html" | "htm") => "text/html",
        Some("rs") => "text/x-rust",
        Some("py") => "text/x-python",
        Some("js") => "text/javascript",
        Some("ts") => "text/typescript",
        _ => "application/octet-stream",
    };
    mime.to_string()
}
128
/// Encode bytes as lowercase hex.
///
/// Each input byte becomes exactly two characters, so the result has length
/// `bytes.len() * 2`; an empty slice yields an empty string.
fn hex_encode(bytes: &[u8]) -> String {
    use std::fmt::Write;

    let mut encoded = String::with_capacity(bytes.len() * 2);
    for byte in bytes {
        // Formatting into a `String` cannot fail, so the `fmt::Result` is
        // deliberately discarded.
        let _ = write!(encoded, "{byte:02x}");
    }
    encoded
}
138
#[cfg(test)]
mod tests {
    use super::*;

    /// Build the UUID expected from the first 16 bytes of a BLAKE3 digest.
    fn uuid_from_prefix(digest: &blake3::Hash) -> Uuid {
        let mut prefix = [0u8; 16];
        prefix.copy_from_slice(&digest.as_bytes()[..16]);
        Uuid::from_bytes(prefix)
    }

    #[test]
    fn test_document_creation() {
        let content = b"Hello, CP!";
        let doc = Document::new(PathBuf::from("test.md"), content, 1234567890);

        assert_eq!(doc.path, PathBuf::from("test.md"));
        // Original content is 10 bytes
        assert_eq!(doc.size, 10);
        assert_eq!(doc.mime_type, "text/markdown");
        assert_eq!(doc.hierarchical_hash, [0; 32]);
        assert_eq!(doc.arweave_tx, None);

        // ID is the first 16 bytes of the BLAKE3 hash of the canonicalized content
        let canonical = normalize("Hello, CP!");
        let hash = blake3::hash(canonical.as_bytes());
        assert_eq!(doc.hash, *hash.as_bytes());
        assert_eq!(doc.id, uuid_from_prefix(&hash));
    }

    #[test]
    fn test_content_changed() {
        let content = b"Original content";
        let doc = Document::new(PathBuf::from("test.txt"), content, 0);

        assert!(!doc.content_changed(content));
        assert!(doc.content_changed(b"Modified content"));
    }

    #[test]
    fn test_path_id_deterministic() {
        let doc1 = Document::new(PathBuf::from("test.md"), b"content", 0);
        let doc2 = Document::new(PathBuf::from("test.md"), b"content", 0);

        // Same path = same path_id
        assert_eq!(doc1.path_id, doc2.path_id);

        // Different path = different path_id
        let doc3 = Document::new(PathBuf::from("other.md"), b"content", 0);
        assert_ne!(doc1.path_id, doc3.path_id);
    }

    #[test]
    fn test_content_id_deterministic() {
        let doc1 = Document::new(PathBuf::from("a.md"), b"hello", 0);
        let doc2 = Document::new(PathBuf::from("b.md"), b"hello", 0);

        // Same content = same ID regardless of path
        assert_eq!(doc1.id, doc2.id);

        // Different content = different ID even for the same path
        let doc3 = Document::new(PathBuf::from("a.md"), b"goodbye", 0);
        assert_ne!(doc1.id, doc3.id);
    }

    // Additional tests for comprehensive coverage

    #[test]
    fn test_document_id_derivation_from_content_hash() {
        // Document ID is derived from the content hash
        let content = b"Test content for ID derivation";
        let doc = Document::new(PathBuf::from("test.md"), content, 0);

        // ID should be the first 16 bytes of the BLAKE3 hash of canonicalized content
        let canonical = normalize("Test content for ID derivation");
        let expected_hash = blake3::hash(canonical.as_bytes());

        assert_eq!(doc.id, uuid_from_prefix(&expected_hash));
    }

    #[test]
    fn test_document_path_id_derivation() {
        // path_id is derived from the canonicalized path
        let path = PathBuf::from("test/document.md");
        let doc = Document::new(path.clone(), b"content", 0);

        // path_id should be the first 16 bytes of the BLAKE3 hash of the canonicalized path
        let canonical_path = normalize(&path.to_string_lossy());
        let expected_hash = blake3::hash(canonical_path.as_bytes());

        assert_eq!(doc.path_id, uuid_from_prefix(&expected_hash));
    }

    #[test]
    fn test_document_hierarchical_hash_computation() {
        let chunk_hashes: [[u8; 32]; 3] = [[1u8; 32], [2u8; 32], [3u8; 32]];

        let hierarchical_hash = Document::compute_hierarchical_hash(&chunk_hashes);

        // The result is a real digest, not the zeroed placeholder
        assert_ne!(hierarchical_hash, [0u8; 32]);

        // Determinism: same input produces the same hash
        let hierarchical_hash2 = Document::compute_hierarchical_hash(&chunk_hashes);
        assert_eq!(hierarchical_hash, hierarchical_hash2);

        // Order independence: chunk hashes are sorted before hashing
        let reordered: [[u8; 32]; 3] = [[3u8; 32], [1u8; 32], [2u8; 32]];
        assert_eq!(
            hierarchical_hash,
            Document::compute_hierarchical_hash(&reordered)
        );

        // Different input produces a different hash
        let different_hashes: [[u8; 32]; 2] = [[1u8; 32], [2u8; 32]];
        let different_result = Document::compute_hierarchical_hash(&different_hashes);
        assert_ne!(hierarchical_hash, different_result);
    }

    #[test]
    fn test_document_serialization() {
        // Round-trip through serde. JSON is used here because a CBOR encoder
        // is not available to these tests.
        let doc = Document::new(PathBuf::from("test.md"), b"Hello, World!", 1234567890);

        let serialized = serde_json::to_string(&doc).unwrap();
        // `arweave_tx` is skipped while it is `None`
        assert!(!serialized.contains("arweave_tx"));

        let deserialized: Document = serde_json::from_str(&serialized).unwrap();
        // Whole-struct equality covers every field, including `arweave_tx`
        assert_eq!(doc, deserialized);

        // A populated `arweave_tx` must survive the round trip as well
        let mut synced = doc.clone();
        synced.arweave_tx = Some("example-tx-id".to_string());
        let synced_json = serde_json::to_string(&synced).unwrap();
        assert!(synced_json.contains("arweave_tx"));
        let synced_back: Document = serde_json::from_str(&synced_json).unwrap();
        assert_eq!(synced, synced_back);
    }

    #[test]
    fn test_document_deserialization_invalid() {
        // Malformed JSON must be rejected rather than produce a document
        let invalid_data = "{invalid json";

        let result: Result<Document, _> = serde_json::from_str(invalid_data);
        assert!(result.is_err());
    }

    #[test]
    fn test_document_canonical_bytes() {
        // The identifying fields must be mutually consistent
        let doc = Document::new(PathBuf::from("test.md"), b"Content", 1000);

        // `id` is exactly the first 16 bytes of the content hash
        assert_eq!(&doc.id.as_bytes()[..], &doc.hash[..16]);
        // `path_id` comes from the path, not the content
        assert_ne!(doc.id, doc.path_id);
        // The hierarchical hash is a zeroed placeholder until chunking
        assert_eq!(doc.hierarchical_hash, [0u8; 32]);
    }

    #[test]
    fn test_document_mime_type_detection_markdown() {
        // .md and .markdown both map to text/markdown
        let doc1 = Document::new(PathBuf::from("readme.md"), b"content", 0);
        let doc2 = Document::new(PathBuf::from("document.markdown"), b"content", 0);

        assert_eq!(doc1.mime_type, "text/markdown");
        assert_eq!(doc2.mime_type, "text/markdown");
    }

    #[test]
    fn test_document_mime_type_detection_text() {
        // .txt maps to text/plain
        let doc = Document::new(PathBuf::from("notes.txt"), b"content", 0);
        assert_eq!(doc.mime_type, "text/plain");
    }

    #[test]
    fn test_document_mime_type_detection_known_extensions() {
        // Remaining entries of the extension table
        let cases = [
            ("paper.pdf", "application/pdf"),
            ("data.json", "application/json"),
            ("index.html", "text/html"),
            ("index.htm", "text/html"),
            ("main.rs", "text/x-rust"),
            ("script.py", "text/x-python"),
            ("app.js", "text/javascript"),
            ("app.ts", "text/typescript"),
        ];

        for (file_name, expected) in cases {
            let doc = Document::new(PathBuf::from(file_name), b"content", 0);
            assert_eq!(doc.mime_type, expected, "wrong MIME type for {file_name}");
        }
    }

    #[test]
    fn test_document_mime_type_detection_unknown() {
        // Unknown or missing extensions default to application/octet-stream
        let doc1 = Document::new(PathBuf::from("file.xyz"), b"content", 0);
        let doc2 = Document::new(PathBuf::from("noextension"), b"content", 0);

        assert_eq!(doc1.mime_type, "application/octet-stream");
        assert_eq!(doc2.mime_type, "application/octet-stream");
    }

    #[test]
    fn test_document_size_bytes_calculation() {
        // Size matches the raw content length
        let content = b"Test content size";
        let doc = Document::new(PathBuf::from("test.txt"), content, 0);

        assert_eq!(doc.size, content.len() as u64);

        // Empty content is a valid document of size zero
        let empty = Document::new(PathBuf::from("empty.txt"), b"", 0);
        assert_eq!(empty.size, 0);
    }

    #[test]
    fn test_document_mtime_from_filesystem() {
        // mtime is stored unchanged
        let mtime: i64 = 1609459200; // 2021-01-01 00:00:00 UTC
        let doc = Document::new(PathBuf::from("test.txt"), b"content", mtime);

        assert_eq!(doc.mtime, mtime);
    }

    #[test]
    fn test_document_set_hierarchical_hash() {
        let mut doc = Document::new(PathBuf::from("test.md"), b"content", 0);
        let new_hash = [42u8; 32];

        doc.set_hierarchical_hash(new_hash);

        assert_eq!(doc.hierarchical_hash, new_hash);
    }

    #[test]
    fn test_document_hash_hex() {
        let doc = Document::new(PathBuf::from("test.md"), b"content", 0);
        let hex_str = doc.hash_hex();

        // 32 bytes * 2 hex chars
        assert_eq!(hex_str.len(), 64);

        // Lowercase hex digits only
        assert!(hex_str.chars().all(|c| matches!(c, '0'..='9' | 'a'..='f')));

        // Matches BLAKE3's own hex rendering of the canonicalized content hash
        let expected = blake3::hash(normalize("content").as_bytes());
        assert_eq!(hex_str, expected.to_hex().as_str());
    }
}
361}