// canon_core/document.rs
1//! Document node representing an ingested file
2//!
3//! Per CP-001: Documents use content-based IDs for determinism.
4//! path_id is added for filesystem change detection.
5
use crate::text::normalize;
use serde::{Deserialize, Serialize};
use std::path::{Path, PathBuf};
use uuid::Uuid;
10
/// A document node in the cognitive graph
///
/// Represents a single ingested file with its content hash
/// for change detection and deduplication.
///
/// Identity model (per CP-001):
/// - `id` is content-based: documents with identical canonicalized content
///   share an `id` regardless of where they live on disk.
/// - `path_id` is path-based: it stays stable when the file's content
///   changes, enabling filesystem change detection.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Document {
    /// Unique identifier for this document (BLAKE3-16 of content_hash)
    pub id: Uuid,

    /// Path-based identifier for change detection (BLAKE3-16 of canonicalized path)
    pub path_id: Uuid,

    /// Original file path (relative to watched root)
    pub path: PathBuf,

    /// BLAKE3 hash of file contents (canonicalized before hashing)
    pub hash: [u8; 32],

    /// Merkle root of chunks (hierarchical hash); all zeros until
    /// `set_hierarchical_hash` is called after chunking
    pub hierarchical_hash: [u8; 32],

    /// Last modification time (Unix timestamp)
    pub mtime: i64,

    /// File size in bytes (original, pre-canonicalization size)
    pub size: u64,

    /// MIME type (e.g., "application/pdf", "text/markdown")
    pub mime_type: String,
}
41
42impl Document {
43    pub fn new(path: PathBuf, content: &[u8], mtime: i64) -> Self {
44        let mime_type = mime_from_path(&path);
45
46        // Per CP-003: Canonicalize content before hashing for determinism
47        let text = String::from_utf8_lossy(content);
48        let canonical_content = normalize(&text);
49        let canonical_bytes = canonical_content.as_bytes();
50        let content_hash = blake3::hash(canonical_bytes);
51
52        // Per CP-001: ID is generated from content hash (content-based identity)
53        let mut id_bytes = [0u8; 16];
54        id_bytes.copy_from_slice(&content_hash.as_bytes()[0..16]);
55        let id = Uuid::from_bytes(id_bytes);
56
57        // Per CP-001: path_id is generated from canonicalized path (for change detection)
58        let path_str = path.to_string_lossy();
59        let canonical_path = normalize(&path_str);
60        let path_id_bytes = blake3::hash(canonical_path.as_bytes());
61        let mut path_id = [0u8; 16];
62        path_id.copy_from_slice(&path_id_bytes.as_bytes()[0..16]);
63        let path_id = Uuid::from_bytes(path_id);
64
65        Self {
66            id,
67            path_id,
68            path,
69            hash: *content_hash.as_bytes(),
70            hierarchical_hash: [0; 32], // Placeholder, computed after chunking
71            mtime,
72            size: content.len() as u64, // Original size for display
73            mime_type,
74        }
75    }
76
77    /// Update the hierarchical hash (Merkle root of chunks)
78    pub fn set_hierarchical_hash(&mut self, hash: [u8; 32]) {
79        self.hierarchical_hash = hash;
80    }
81
82    /// Compute Merkle hash from chunks for provable correctness
83    pub fn compute_hierarchical_hash(chunk_hashes: &[[u8; 32]]) -> [u8; 32] {
84        let mut section_hasher = blake3::Hasher::new();
85        for hash in chunk_hashes {
86            section_hasher.update(hash);
87        }
88        *section_hasher.finalize().as_bytes()
89    }
90
91    /// Check if the document content has changed
92    pub fn content_changed(&self, new_content: &[u8]) -> bool {
93        let text = String::from_utf8_lossy(new_content);
94        let canonical = normalize(&text);
95        let new_hash = blake3::hash(canonical.as_bytes());
96        self.hash != *new_hash.as_bytes()
97    }
98
99    /// Get the document hash as a hex string
100    pub fn hash_hex(&self) -> String {
101        hex_encode(&self.hash)
102    }
103}
104
/// Infer a MIME type from the file extension.
///
/// Extensions are matched case-insensitively (so `README.MD` is recognized
/// as markdown). Unknown or missing extensions fall back to
/// "application/octet-stream". Takes `&Path` (not `&PathBuf`) so callers can
/// pass any path-like borrow; `&PathBuf` still coerces at existing call sites.
fn mime_from_path(path: &Path) -> String {
    let ext = path
        .extension()
        .and_then(|e| e.to_str())
        .map(|e| e.to_ascii_lowercase());
    match ext.as_deref() {
        Some("md" | "markdown") => "text/markdown",
        Some("txt") => "text/plain",
        Some("pdf") => "application/pdf",
        Some("json") => "application/json",
        Some("html" | "htm") => "text/html",
        Some("rs") => "text/x-rust",
        Some("py") => "text/x-python",
        Some("js") => "text/javascript",
        Some("ts") => "text/typescript",
        _ => "application/octet-stream",
    }
    .to_string()
}
120
/// Encode bytes as lowercase hex.
///
/// Preallocates the output (2 chars per byte) and pushes nibble lookups
/// directly instead of calling `format!` per byte, which allocated a
/// temporary `String` for every input byte.
fn hex_encode(bytes: &[u8]) -> String {
    const HEX: &[u8; 16] = b"0123456789abcdef";
    let mut out = String::with_capacity(bytes.len() * 2);
    for &b in bytes {
        out.push(HEX[(b >> 4) as usize] as char);
        out.push(HEX[(b & 0x0f) as usize] as char);
    }
    out
}
125
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_document_creation() {
        let body = b"Hello, CP!";
        let doc = Document::new(PathBuf::from("test.md"), body, 1234567890);

        assert_eq!(doc.path, PathBuf::from("test.md"));
        assert_eq!(doc.size, 10); // original byte length, pre-canonicalization
        assert_eq!(doc.mime_type, "text/markdown");
        assert_eq!(doc.hierarchical_hash, [0; 32]);

        // id must be the first 16 bytes of the BLAKE3 hash of the
        // canonicalized content.
        let digest = blake3::hash(normalize("Hello, CP!").as_bytes());
        let want = Uuid::from_bytes(digest.as_bytes()[0..16].try_into().unwrap());
        assert_eq!(doc.id, want);
    }

    #[test]
    fn test_content_changed() {
        let doc = Document::new(PathBuf::from("test.txt"), b"Original content", 0);

        assert!(!doc.content_changed(b"Original content"));
        assert!(doc.content_changed(b"Modified content"));
    }

    #[test]
    fn test_path_id_deterministic() {
        let a = Document::new(PathBuf::from("test.md"), b"content", 0);
        let b = Document::new(PathBuf::from("test.md"), b"content", 0);

        // Same path must yield the same path_id.
        assert_eq!(a.path_id, b.path_id);

        // A different path must yield a different path_id.
        let c = Document::new(PathBuf::from("other.md"), b"content", 0);
        assert_ne!(a.path_id, c.path_id);
    }

    #[test]
    fn test_content_id_deterministic() {
        let a = Document::new(PathBuf::from("a.md"), b"hello", 0);
        let b = Document::new(PathBuf::from("b.md"), b"hello", 0);

        // Identical content yields an identical id, path notwithstanding.
        assert_eq!(a.id, b.id);
    }

    // Additional tests for comprehensive coverage

    #[test]
    fn test_document_id_derivation_from_content_hash() {
        // The id is derived from the canonicalized content's BLAKE3 hash.
        let doc = Document::new(
            PathBuf::from("test.md"),
            b"Test content for ID derivation",
            0,
        );

        let digest = blake3::hash(normalize("Test content for ID derivation").as_bytes());
        let expected = Uuid::from_bytes(digest.as_bytes()[0..16].try_into().unwrap());
        assert_eq!(doc.id, expected);
    }

    #[test]
    fn test_document_path_id_derivation() {
        // path_id is BLAKE3-16 of the canonicalized path string.
        let path = PathBuf::from("test/document.md");
        let doc = Document::new(path.clone(), b"content", 0);

        let digest = blake3::hash(normalize(&path.to_string_lossy()).as_bytes());
        let expected = Uuid::from_bytes(digest.as_bytes()[0..16].try_into().unwrap());
        assert_eq!(doc.path_id, expected);
    }

    #[test]
    fn test_document_hierarchical_hash_computation() {
        let chunks: [[u8; 32]; 3] = [[1u8; 32], [2u8; 32], [3u8; 32]];

        let root = Document::compute_hierarchical_hash(&chunks);

        // The root is a full 32-byte digest.
        assert_eq!(root.len(), 32);

        // Deterministic: the same chunks always produce the same root.
        assert_eq!(root, Document::compute_hierarchical_hash(&chunks));

        // Input-sensitive: dropping a chunk changes the root.
        let fewer: [[u8; 32]; 2] = [[1u8; 32], [2u8; 32]];
        assert_ne!(root, Document::compute_hierarchical_hash(&fewer));
    }

    #[test]
    fn test_document_serialization() {
        // JSON round-trip via serde. (A CBOR round-trip would need the
        // ciborium crate, which this crate does not depend on.)
        let original = Document::new(PathBuf::from("test.md"), b"Hello, World!", 1234567890);

        let json = serde_json::to_string(&original).unwrap();
        let restored: Document = serde_json::from_str(&json).unwrap();

        // Document derives PartialEq, so this covers every field at once.
        assert_eq!(original, restored);
    }

    #[test]
    fn test_document_deserialization_invalid() {
        // Malformed input must surface as an error, never a panic.
        let result: Result<Document, _> = serde_json::from_str("{invalid json");
        assert!(result.is_err());
    }

    #[test]
    fn test_document_canonical_bytes() {
        // All identity/hash fields carry their expected fixed widths.
        let doc = Document::new(PathBuf::from("test.md"), b"Content", 1000);

        assert_eq!(doc.id.as_bytes().len(), 16);
        assert_eq!(doc.path_id.as_bytes().len(), 16);
        assert_eq!(doc.hash.len(), 32);
        assert_eq!(doc.hierarchical_hash.len(), 32);
    }

    #[test]
    fn test_document_mime_type_detection_markdown() {
        // Both markdown extensions map to text/markdown.
        for name in ["readme.md", "document.markdown"] {
            let doc = Document::new(PathBuf::from(name), b"content", 0);
            assert_eq!(doc.mime_type, "text/markdown");
        }
    }

    #[test]
    fn test_document_mime_type_detection_text() {
        let doc = Document::new(PathBuf::from("notes.txt"), b"content", 0);
        assert_eq!(doc.mime_type, "text/plain");
    }

    #[test]
    fn test_document_mime_type_detection_unknown() {
        // Unrecognized or absent extensions default to octet-stream.
        for name in ["file.xyz", "noextension"] {
            let doc = Document::new(PathBuf::from(name), b"content", 0);
            assert_eq!(doc.mime_type, "application/octet-stream");
        }
    }

    #[test]
    fn test_document_size_bytes_calculation() {
        let payload = b"Test content size";
        let doc = Document::new(PathBuf::from("test.txt"), payload, 0);

        assert_eq!(doc.size, payload.len() as u64);
    }

    #[test]
    fn test_document_mtime_from_filesystem() {
        let stamp: i64 = 1609459200; // 2021-01-01 00:00:00 UTC
        let doc = Document::new(PathBuf::from("test.txt"), b"content", stamp);

        assert_eq!(doc.mtime, stamp);
    }

    #[test]
    fn test_document_set_hierarchical_hash() {
        let mut doc = Document::new(PathBuf::from("test.md"), b"content", 0);
        let root = [42u8; 32];

        doc.set_hierarchical_hash(root);

        assert_eq!(doc.hierarchical_hash, root);
    }

    #[test]
    fn test_document_hash_hex() {
        let hex = Document::new(PathBuf::from("test.md"), b"content", 0).hash_hex();

        // 32 bytes render as 64 hex characters…
        assert_eq!(hex.len(), 64);
        // …all of which are valid hex digits.
        assert!(hex.chars().all(|c| c.is_ascii_hexdigit()));
    }
}