1use crate::text::normalize;
7use serde::{Deserialize, Serialize};
8use std::path::PathBuf;
9use uuid::Uuid;
10
11#[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)]
16pub struct Document {
17 pub id: Uuid,
19
20 pub path_id: Uuid,
22
23 pub path: PathBuf,
25
26 pub hash: [u8; 32],
28
29 pub hierarchical_hash: [u8; 32],
31
32 pub mtime: i64,
34
35 pub size: u64,
37
38 pub mime_type: String,
40}
41
42impl Document {
43 pub fn new(path: PathBuf, content: &[u8], mtime: i64) -> Self {
44 let mime_type = mime_from_path(&path);
45
46 let text = String::from_utf8_lossy(content);
48 let canonical_content = normalize(&text);
49 let canonical_bytes = canonical_content.as_bytes();
50 let content_hash = blake3::hash(canonical_bytes);
51
52 let mut id_bytes = [0u8; 16];
54 id_bytes.copy_from_slice(&content_hash.as_bytes()[0..16]);
55 let id = Uuid::from_bytes(id_bytes);
56
57 let path_str = path.to_string_lossy();
59 let canonical_path = normalize(&path_str);
60 let path_id_bytes = blake3::hash(canonical_path.as_bytes());
61 let mut path_id = [0u8; 16];
62 path_id.copy_from_slice(&path_id_bytes.as_bytes()[0..16]);
63 let path_id = Uuid::from_bytes(path_id);
64
65 Self {
66 id,
67 path_id,
68 path,
69 hash: *content_hash.as_bytes(),
70 hierarchical_hash: [0; 32], mtime,
72 size: content.len() as u64, mime_type,
74 }
75 }
76
77 pub fn set_hierarchical_hash(&mut self, hash: [u8; 32]) {
79 self.hierarchical_hash = hash;
80 }
81
82 pub fn compute_hierarchical_hash(chunk_hashes: &[[u8; 32]]) -> [u8; 32] {
84 let mut section_hasher = blake3::Hasher::new();
85 for hash in chunk_hashes {
86 section_hasher.update(hash);
87 }
88 *section_hasher.finalize().as_bytes()
89 }
90
91 pub fn content_changed(&self, new_content: &[u8]) -> bool {
93 let text = String::from_utf8_lossy(new_content);
94 let canonical = normalize(&text);
95 let new_hash = blake3::hash(canonical.as_bytes());
96 self.hash != *new_hash.as_bytes()
97 }
98
99 pub fn hash_hex(&self) -> String {
101 hex_encode(&self.hash)
102 }
103}
104
/// Guesses a MIME type from the file extension of `path`.
///
/// The extension is compared ASCII-case-insensitively, so `README.MD` and
/// `readme.md` are treated alike. Paths with no extension, a non-UTF-8
/// extension, or an unrecognized one yield `application/octet-stream`.
///
/// Accepts any `&Path`; a `&PathBuf` argument coerces automatically.
fn mime_from_path(path: &std::path::Path) -> String {
    let extension = path
        .extension()
        .and_then(|ext| ext.to_str())
        .map(str::to_ascii_lowercase);

    let mime = match extension.as_deref() {
        Some("md" | "markdown") => "text/markdown",
        Some("txt") => "text/plain",
        Some("pdf") => "application/pdf",
        Some("json") => "application/json",
        Some("html" | "htm") => "text/html",
        Some("rs") => "text/x-rust",
        Some("py") => "text/x-python",
        Some("js") => "text/javascript",
        Some("ts") => "text/typescript",
        _ => "application/octet-stream",
    };

    mime.to_string()
}
120
/// Encodes `bytes` as a lowercase hexadecimal string, two characters per byte.
///
/// The output buffer is sized up front and filled from a nibble lookup table,
/// avoiding a temporary `String` allocation for every input byte.
fn hex_encode(bytes: &[u8]) -> String {
    const HEX_DIGITS: &[u8; 16] = b"0123456789abcdef";

    let mut encoded = String::with_capacity(bytes.len() * 2);
    for &byte in bytes {
        // High nibble first, then low nibble, matching `{:02x}` formatting.
        encoded.push(char::from(HEX_DIGITS[usize::from(byte >> 4)]));
        encoded.push(char::from(HEX_DIGITS[usize::from(byte & 0x0f)]));
    }
    encoded
}
125
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a document whose `mtime` is zero, for tests that ignore it.
    fn make_doc(path: &str, content: &[u8]) -> Document {
        Document::new(PathBuf::from(path), content, 0)
    }

    /// Recomputes an identifier the way the constructor does: the leading
    /// 16 bytes of a BLAKE3 digest interpreted as a UUID.
    fn uuid_from_digest(digest: &blake3::Hash) -> Uuid {
        let mut prefix = [0u8; 16];
        prefix.copy_from_slice(&digest.as_bytes()[..16]);
        Uuid::from_bytes(prefix)
    }

    #[test]
    fn test_document_creation() {
        let raw = b"Hello, CP!";
        let document = Document::new(PathBuf::from("test.md"), raw, 1234567890);

        assert_eq!(document.path, PathBuf::from("test.md"));
        assert_eq!(document.size, 10);
        assert_eq!(document.mime_type, "text/markdown");
        assert_eq!(document.hierarchical_hash, [0; 32]);

        // The ID must come from the hash of the normalized content.
        let normalized = normalize("Hello, CP!");
        let digest = blake3::hash(normalized.as_bytes());
        assert_eq!(document.id, uuid_from_digest(&digest));
    }

    #[test]
    fn test_content_changed() {
        let original = b"Original content";
        let document = make_doc("test.txt", original);

        assert!(!document.content_changed(original));
        assert!(document.content_changed(b"Modified content"));
    }

    #[test]
    fn test_path_id_deterministic() {
        let first = make_doc("test.md", b"content");
        let second = make_doc("test.md", b"content");

        // Same path, same path ID.
        assert_eq!(first.path_id, second.path_id);

        // A different path must produce a different path ID.
        let elsewhere = make_doc("other.md", b"content");
        assert_ne!(first.path_id, elsewhere.path_id);
    }

    #[test]
    fn test_content_id_deterministic() {
        let at_a = make_doc("a.md", b"hello");
        let at_b = make_doc("b.md", b"hello");

        // Identical content yields the same ID regardless of path.
        assert_eq!(at_a.id, at_b.id);
    }

    #[test]
    fn test_document_id_derivation_from_content_hash() {
        let raw = b"Test content for ID derivation";
        let document = make_doc("test.md", raw);

        let normalized = normalize("Test content for ID derivation");
        let digest = blake3::hash(normalized.as_bytes());

        assert_eq!(document.id, uuid_from_digest(&digest));
    }

    #[test]
    fn test_document_path_id_derivation() {
        let location = PathBuf::from("test/document.md");
        let document = Document::new(location.clone(), b"content", 0);

        let normalized_path = normalize(&location.to_string_lossy());
        let digest = blake3::hash(normalized_path.as_bytes());

        assert_eq!(document.path_id, uuid_from_digest(&digest));
    }

    #[test]
    fn test_document_hierarchical_hash_computation() {
        let three_chunks: [[u8; 32]; 3] = [[1u8; 32], [2u8; 32], [3u8; 32]];

        let combined = Document::compute_hierarchical_hash(&three_chunks);
        assert_eq!(combined.len(), 32);

        // Recomputing over the same input gives the same result.
        let recomputed = Document::compute_hierarchical_hash(&three_chunks);
        assert_eq!(combined, recomputed);

        // Dropping a chunk changes the result.
        let two_chunks: [[u8; 32]; 2] = [[1u8; 32], [2u8; 32]];
        let shorter = Document::compute_hierarchical_hash(&two_chunks);
        assert_ne!(combined, shorter);
    }

    #[test]
    fn test_document_serialization() {
        let document = Document::new(PathBuf::from("test.md"), b"Hello, World!", 1234567890);

        let json = serde_json::to_string(&document).unwrap();
        let restored: Document = serde_json::from_str(&json).unwrap();

        // Every field must survive the round trip.
        assert_eq!(document.id, restored.id);
        assert_eq!(document.path_id, restored.path_id);
        assert_eq!(document.path, restored.path);
        assert_eq!(document.hash, restored.hash);
        assert_eq!(document.hierarchical_hash, restored.hierarchical_hash);
        assert_eq!(document.mtime, restored.mtime);
        assert_eq!(document.size, restored.size);
        assert_eq!(document.mime_type, restored.mime_type);
    }

    #[test]
    fn test_document_deserialization_invalid() {
        let malformed = "{invalid json";

        let outcome: Result<Document, _> = serde_json::from_str(malformed);
        assert!(outcome.is_err());
    }

    #[test]
    fn test_document_canonical_bytes() {
        let document = Document::new(PathBuf::from("test.md"), b"Content", 1000);

        let id_raw = document.id.as_bytes();
        let path_id_raw = document.path_id.as_bytes();

        assert_eq!(id_raw.len(), 16);
        assert_eq!(path_id_raw.len(), 16);
        assert_eq!(document.hash.len(), 32);
        assert_eq!(document.hierarchical_hash.len(), 32);
    }

    #[test]
    fn test_document_mime_type_detection_markdown() {
        let short_ext = make_doc("readme.md", b"content");
        let long_ext = make_doc("document.markdown", b"content");

        assert_eq!(short_ext.mime_type, "text/markdown");
        assert_eq!(long_ext.mime_type, "text/markdown");
    }

    #[test]
    fn test_document_mime_type_detection_text() {
        let document = make_doc("notes.txt", b"content");
        assert_eq!(document.mime_type, "text/plain");
    }

    #[test]
    fn test_document_mime_type_detection_unknown() {
        let odd_ext = make_doc("file.xyz", b"content");
        let no_ext = make_doc("noextension", b"content");

        assert_eq!(odd_ext.mime_type, "application/octet-stream");
        assert_eq!(no_ext.mime_type, "application/octet-stream");
    }

    #[test]
    fn test_document_size_bytes_calculation() {
        let raw = b"Test content size";
        let document = make_doc("test.txt", raw);

        assert_eq!(document.size, raw.len() as u64);
    }

    #[test]
    fn test_document_mtime_from_filesystem() {
        let mtime: i64 = 1609459200;
        let document = Document::new(PathBuf::from("test.txt"), b"content", mtime);

        assert_eq!(document.mtime, mtime);
    }

    #[test]
    fn test_document_set_hierarchical_hash() {
        let mut document = make_doc("test.md", b"content");
        let replacement = [42u8; 32];

        document.set_hierarchical_hash(replacement);

        assert_eq!(document.hierarchical_hash, replacement);
    }

    #[test]
    fn test_document_hash_hex() {
        let document = make_doc("test.md", b"content");
        let hex = document.hash_hex();

        // 32 bytes encode to 64 hex characters.
        assert_eq!(hex.len(), 64);
        assert!(hex.chars().all(|c| c.is_ascii_hexdigit()));
    }
}