1use crate::text::normalize;
7use serde::{Deserialize, Serialize};
8use std::path::PathBuf;
9use uuid::Uuid;
10
/// Metadata record describing a single document.
///
/// Values are normally produced by [`Document::new`], which derives the
/// identifiers and hashes from the document's content and path.
#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
pub struct Document {
    /// Content-derived identifier: the first 16 bytes of `hash`, wrapped in a
    /// `Uuid`. Documents with the same normalized content share this ID.
    pub id: Uuid,

    /// Path-derived identifier: the first 16 bytes of the BLAKE3 hash of the
    /// normalized path string, wrapped in a `Uuid`.
    pub path_id: Uuid,

    /// Path of the document, stored exactly as given to the constructor.
    pub path: PathBuf,

    /// Full 32-byte BLAKE3 hash of the normalized content.
    pub hash: [u8; 32],

    /// Hash combining the document's chunk hashes. The constructor leaves it
    /// all-zero until `set_hierarchical_hash` is called.
    pub hierarchical_hash: [u8; 32],

    /// Modification time supplied by the caller.
    /// NOTE(review): presumably Unix seconds — confirm against callers.
    pub mtime: i64,

    /// Length in bytes of the raw (un-normalized) content.
    pub size: u64,

    /// MIME type guessed from the path's file extension.
    pub mime_type: String,

    /// Optional transaction reference. It is omitted from serialized output
    /// when `None` and defaults to `None` when absent from the input.
    /// NOTE(review): presumably an Arweave transaction ID — confirm.
    #[serde(default, skip_serializing_if = "Option::is_none")]
    pub arweave_tx: Option<String>,
}
45
46impl Document {
47 pub fn new(path: PathBuf, content: &[u8], mtime: i64) -> Self {
48 let mime_type = mime_from_path(&path);
49
50 let text = String::from_utf8_lossy(content);
52 let canonical_content = normalize(&text);
53 let canonical_bytes = canonical_content.as_bytes();
54 let content_hash = blake3::hash(canonical_bytes);
55
56 let mut id_bytes = [0u8; 16];
58 id_bytes.copy_from_slice(&content_hash.as_bytes()[0..16]);
59 let id = Uuid::from_bytes(id_bytes);
60
61 let path_str = path.to_string_lossy();
63 let canonical_path = normalize(&path_str);
64 let path_id_bytes = blake3::hash(canonical_path.as_bytes());
65 let mut path_id = [0u8; 16];
66 path_id.copy_from_slice(&path_id_bytes.as_bytes()[0..16]);
67 let path_id = Uuid::from_bytes(path_id);
68
69 Self {
70 id,
71 path_id,
72 path,
73 hash: *content_hash.as_bytes(),
74 hierarchical_hash: [0; 32], mtime,
76 size: content.len() as u64, mime_type,
78 arweave_tx: None,
79 }
80 }
81
82 pub fn set_hierarchical_hash(&mut self, hash: [u8; 32]) {
84 self.hierarchical_hash = hash;
85 }
86
87 pub fn compute_hierarchical_hash(chunk_hashes: &[[u8; 32]]) -> [u8; 32] {
90 let mut sorted = chunk_hashes.to_vec();
91 sorted.sort_unstable();
92 let mut section_hasher = blake3::Hasher::new();
93 for hash in &sorted {
94 section_hasher.update(hash);
95 }
96 *section_hasher.finalize().as_bytes()
97 }
98
99 pub fn content_changed(&self, new_content: &[u8]) -> bool {
101 let text = String::from_utf8_lossy(new_content);
102 let canonical = normalize(&text);
103 let new_hash = blake3::hash(canonical.as_bytes());
104 self.hash != *new_hash.as_bytes()
105 }
106
107 pub fn hash_hex(&self) -> String {
109 hex_encode(&self.hash)
110 }
111}
112
/// Guesses a MIME type from the file extension of `path`.
///
/// The extension is compared case-insensitively (ASCII), so `README.MD` and
/// `readme.md` both map to `text/markdown`. Paths with no extension, a
/// non-UTF-8 extension, or an unrecognised one yield
/// `application/octet-stream`.
fn mime_from_path(path: &std::path::Path) -> String {
    // Lowercase once so the match arms only need the canonical spelling.
    let extension = path
        .extension()
        .and_then(|ext| ext.to_str())
        .map(str::to_ascii_lowercase);

    let mime = match extension.as_deref() {
        Some("md" | "markdown") => "text/markdown",
        Some("txt") => "text/plain",
        Some("pdf") => "application/pdf",
        Some("json") => "application/json",
        Some("html" | "htm") => "text/html",
        Some("rs") => "text/x-rust",
        Some("py") => "text/x-python",
        Some("js") => "text/javascript",
        Some("ts") => "text/typescript",
        _ => "application/octet-stream",
    };
    mime.to_string()
}
128
/// Encodes `bytes` as a lowercase hexadecimal string, two characters per
/// byte, with no separators or prefix.
fn hex_encode(bytes: &[u8]) -> String {
    const DIGITS: &[u8; 16] = b"0123456789abcdef";

    let mut encoded = String::with_capacity(bytes.len() * 2);
    for &byte in bytes {
        // High nibble first, then low nibble.
        encoded.push(char::from(DIGITS[usize::from(byte >> 4)]));
        encoded.push(char::from(DIGITS[usize::from(byte & 0x0f)]));
    }
    encoded
}
138
#[cfg(test)]
mod tests {
    use super::*;

    /// Recomputes the UUID that `Document::new` is expected to derive from
    /// `text`: the first 16 bytes of the BLAKE3 hash of the normalized text.
    fn expected_uuid(text: &str) -> Uuid {
        let digest = blake3::hash(normalize(text).as_bytes());
        Uuid::from_bytes(digest.as_bytes()[0..16].try_into().unwrap())
    }

    #[test]
    fn test_document_creation() {
        let content = b"Hello, CP!";
        let doc = Document::new(PathBuf::from("test.md"), content, 1234567890);

        assert_eq!(doc.path, PathBuf::from("test.md"));
        assert_eq!(doc.size, 10);
        assert_eq!(doc.mime_type, "text/markdown");
        // The hierarchical hash is only filled in later.
        assert_eq!(doc.hierarchical_hash, [0; 32]);

        assert_eq!(doc.id, expected_uuid("Hello, CP!"));
    }

    #[test]
    fn test_content_changed() {
        let content = b"Original content";
        let doc = Document::new(PathBuf::from("test.txt"), content, 0);

        assert!(!doc.content_changed(content));
        assert!(doc.content_changed(b"Modified content"));
    }

    #[test]
    fn test_path_id_deterministic() {
        let doc1 = Document::new(PathBuf::from("test.md"), b"content", 0);
        let doc2 = Document::new(PathBuf::from("test.md"), b"content", 0);

        // Same path yields the same path ID.
        assert_eq!(doc1.path_id, doc2.path_id);

        // A different path yields a different path ID.
        let doc3 = Document::new(PathBuf::from("other.md"), b"content", 0);
        assert_ne!(doc1.path_id, doc3.path_id);
    }

    #[test]
    fn test_content_id_deterministic() {
        let doc1 = Document::new(PathBuf::from("a.md"), b"hello", 0);
        let doc2 = Document::new(PathBuf::from("b.md"), b"hello", 0);

        // The content ID depends on content only, not on the path.
        assert_eq!(doc1.id, doc2.id);
    }

    #[test]
    fn test_document_id_derivation_from_content_hash() {
        let content = b"Test content for ID derivation";
        let doc = Document::new(PathBuf::from("test.md"), content, 0);

        assert_eq!(doc.id, expected_uuid("Test content for ID derivation"));
    }

    #[test]
    fn test_document_path_id_derivation() {
        let path = PathBuf::from("test/document.md");
        let doc = Document::new(path.clone(), b"content", 0);

        assert_eq!(doc.path_id, expected_uuid(&path.to_string_lossy()));
    }

    #[test]
    fn test_document_hierarchical_hash_computation() {
        let chunk_hashes: [[u8; 32]; 3] = [[1u8; 32], [2u8; 32], [3u8; 32]];

        let hierarchical_hash = Document::compute_hierarchical_hash(&chunk_hashes);
        assert_eq!(hierarchical_hash.len(), 32);

        // Deterministic for identical input.
        let hierarchical_hash2 = Document::compute_hierarchical_hash(&chunk_hashes);
        assert_eq!(hierarchical_hash, hierarchical_hash2);

        // Chunk hashes are sorted before hashing, so input order is irrelevant.
        let shuffled: [[u8; 32]; 3] = [[3u8; 32], [1u8; 32], [2u8; 32]];
        assert_eq!(
            hierarchical_hash,
            Document::compute_hierarchical_hash(&shuffled)
        );

        // A different set of chunks yields a different hash.
        let different_hashes: [[u8; 32]; 2] = [[1u8; 32], [2u8; 32]];
        let different_result = Document::compute_hierarchical_hash(&different_hashes);
        assert_ne!(hierarchical_hash, different_result);
    }

    #[test]
    fn test_document_serialization() {
        let doc = Document::new(PathBuf::from("test.md"), b"Hello, World!", 1234567890);

        let serialized = serde_json::to_string(&doc).unwrap();
        // `arweave_tx` is `None` here and must be skipped in the output.
        assert!(!serialized.contains("arweave_tx"));

        let deserialized: Document = serde_json::from_str(&serialized).unwrap();
        // Whole-struct comparison covers every field, including `arweave_tx`.
        assert_eq!(doc, deserialized);

        // A populated `arweave_tx` must survive the round trip as well.
        let mut anchored = doc.clone();
        anchored.arweave_tx = Some("tx-id".to_string());
        let anchored_json = serde_json::to_string(&anchored).unwrap();
        assert!(anchored_json.contains("arweave_tx"));
        let restored: Document = serde_json::from_str(&anchored_json).unwrap();
        assert_eq!(anchored, restored);
    }

    #[test]
    fn test_document_deserialization_invalid() {
        let invalid_data = "{invalid json";

        let result: Result<Document, _> = serde_json::from_str(invalid_data);
        assert!(result.is_err());
    }

    #[test]
    fn test_document_canonical_bytes() {
        let doc = Document::new(PathBuf::from("test.md"), b"Content", 1000);

        let id_bytes = doc.id.as_bytes();
        let path_id_bytes = doc.path_id.as_bytes();

        assert_eq!(id_bytes.len(), 16);
        assert_eq!(path_id_bytes.len(), 16);
        assert_eq!(doc.hash.len(), 32);
        assert_eq!(doc.hierarchical_hash.len(), 32);
    }

    #[test]
    fn test_document_mime_type_detection_markdown() {
        let doc1 = Document::new(PathBuf::from("readme.md"), b"content", 0);
        let doc2 = Document::new(PathBuf::from("document.markdown"), b"content", 0);

        assert_eq!(doc1.mime_type, "text/markdown");
        assert_eq!(doc2.mime_type, "text/markdown");
    }

    #[test]
    fn test_document_mime_type_detection_text() {
        let doc = Document::new(PathBuf::from("notes.txt"), b"content", 0);
        assert_eq!(doc.mime_type, "text/plain");
    }

    #[test]
    fn test_document_mime_type_detection_unknown() {
        let doc1 = Document::new(PathBuf::from("file.xyz"), b"content", 0);
        let doc2 = Document::new(PathBuf::from("noextension"), b"content", 0);

        assert_eq!(doc1.mime_type, "application/octet-stream");
        assert_eq!(doc2.mime_type, "application/octet-stream");
    }

    #[test]
    fn test_document_size_bytes_calculation() {
        let content = b"Test content size";
        let doc = Document::new(PathBuf::from("test.txt"), content, 0);

        assert_eq!(doc.size, content.len() as u64);
    }

    #[test]
    fn test_document_mtime_from_filesystem() {
        let mtime: i64 = 1609459200;
        let doc = Document::new(PathBuf::from("test.txt"), b"content", mtime);

        assert_eq!(doc.mtime, mtime);
    }

    #[test]
    fn test_document_set_hierarchical_hash() {
        let mut doc = Document::new(PathBuf::from("test.md"), b"content", 0);
        let new_hash = [42u8; 32];

        doc.set_hierarchical_hash(new_hash);

        assert_eq!(doc.hierarchical_hash, new_hash);
    }

    #[test]
    fn test_document_hash_hex() {
        let doc = Document::new(PathBuf::from("test.md"), b"content", 0);
        let hex_str = doc.hash_hex();

        // 32 bytes encode to 64 hex characters.
        assert_eq!(hex_str.len(), 64);

        assert!(hex_str.chars().all(|c| c.is_ascii_hexdigit()));
    }
}