Skip to main content

vectorless/storage/
persistence.rs

// Copyright (c) 2026 vectorless developers
// SPDX-License-Identifier: Apache-2.0

//! Persistence utilities for saving and loading document indices.
//!
//! # Features
//!
//! - **Atomic writes**: Write to a temp file, then rename it into place for crash safety
//! - **Checksum verification**: SHA-256 checksums for data integrity
//! - **Version header**: Format version for future migrations
11
12use sha2::{Digest, Sha256};
13use serde::{Deserialize, Serialize};
14use std::fs::File;
15use std::io::{BufReader, BufWriter, Read, Write};
16use std::path::{Path, PathBuf};
17
18use crate::document::DocumentTree;
19use crate::error::Result;
20use crate::Error;
21
/// Format version stamped into every persisted wrapper.
///
/// Loaders in this module reject data whose stored version differs from
/// this value.
const FORMAT_VERSION: u32 = 1;
24
25/// Metadata for a persisted document.
26#[derive(Debug, Clone, Serialize, Deserialize)]
27pub struct DocumentMeta {
28    /// Unique document identifier.
29    pub id: String,
30
31    /// Document name/title.
32    pub name: String,
33
34    /// Document format (md, pdf, etc.).
35    pub format: String,
36
37    /// Source file path.
38    pub source_path: Option<PathBuf>,
39
40    /// Document description.
41    pub description: Option<String>,
42
43    /// Page count (for PDFs).
44    pub page_count: Option<usize>,
45
46    /// Line count (for text files).
47    pub line_count: Option<usize>,
48
49    /// Creation timestamp.
50    pub created_at: chrono::DateTime<chrono::Utc>,
51
52    /// Last modified timestamp.
53    pub modified_at: chrono::DateTime<chrono::Utc>,
54}
55
56impl DocumentMeta {
57    /// Create new document metadata.
58    pub fn new(id: impl Into<String>, name: impl Into<String>, format: impl Into<String>) -> Self {
59        let now = chrono::Utc::now();
60        Self {
61            id: id.into(),
62            name: name.into(),
63            format: format.into(),
64            source_path: None,
65            description: None,
66            page_count: None,
67            line_count: None,
68            created_at: now,
69            modified_at: now,
70        }
71    }
72
73    /// Set the source path.
74    pub fn with_source_path(mut self, path: impl Into<PathBuf>) -> Self {
75        self.source_path = Some(path.into());
76        self
77    }
78
79    /// Set the description.
80    pub fn with_description(mut self, desc: impl Into<String>) -> Self {
81        self.description = Some(desc.into());
82        self
83    }
84}
85
86/// A persisted document index containing tree and metadata.
87#[derive(Debug, Clone, Serialize, Deserialize)]
88pub struct PersistedDocument {
89    /// Document metadata.
90    pub meta: DocumentMeta,
91
92    /// The document tree structure.
93    pub tree: DocumentTree,
94
95    /// Per-page content (for PDFs).
96    #[serde(default)]
97    pub pages: Vec<PageContent>,
98}
99
100impl PersistedDocument {
101    /// Create a new persisted document.
102    pub fn new(meta: DocumentMeta, tree: DocumentTree) -> Self {
103        Self {
104            meta,
105            tree,
106            pages: Vec::new(),
107        }
108    }
109
110    /// Add page content.
111    pub fn add_page(&mut self, page: usize, content: impl Into<String>) {
112        self.pages.push(PageContent {
113            page,
114            content: content.into(),
115        });
116    }
117}
118
119/// Content for a single page.
120#[derive(Debug, Clone, Serialize, Deserialize)]
121pub struct PageContent {
122    /// Page number (1-based).
123    pub page: usize,
124
125    /// Page text content.
126    pub content: String,
127}
128
129/// Wrapper for persisted data with checksum.
130#[derive(Debug, Serialize, Deserialize)]
131struct PersistedWrapper<T> {
132    /// Format version.
133    version: u32,
134    /// SHA-256 checksum of the payload.
135    checksum: String,
136    /// The actual data.
137    payload: T,
138}
139
/// Switches that control how save and load operations behave.
#[derive(Debug, Clone)]
pub struct PersistenceOptions {
    /// When `true`, saves go through a temp file followed by a rename.
    pub atomic_writes: bool,
    /// When `true`, loads verify the stored checksum.
    pub verify_checksum: bool,
}

impl Default for PersistenceOptions {
    /// Both atomic writes and checksum verification are enabled by default.
    fn default() -> Self {
        let (atomic_writes, verify_checksum) = (true, true);
        Self {
            atomic_writes,
            verify_checksum,
        }
    }
}

impl PersistenceOptions {
    /// Create options carrying the default settings.
    pub fn new() -> Self {
        <Self as Default>::default()
    }

    /// Return the options with atomic writes switched on or off.
    pub fn with_atomic_writes(self, enabled: bool) -> Self {
        Self {
            atomic_writes: enabled,
            ..self
        }
    }

    /// Return the options with checksum verification switched on or off.
    pub fn with_verify_checksum(self, enabled: bool) -> Self {
        Self {
            verify_checksum: enabled,
            ..self
        }
    }
}
176
177/// Calculate SHA-256 checksum of data.
178fn calculate_checksum(data: &[u8]) -> String {
179    let mut hasher = Sha256::new();
180    hasher.update(data);
181    format!("{:x}", hasher.finalize())
182}
183
184/// Save a document to a JSON file with atomic write and checksum.
185///
186/// # Atomic Write
187///
188/// When `atomic_writes` is enabled (default), this function:
189/// 1. Writes to a temporary file (`.tmp` suffix)
190/// 2. Renames temp file to target (atomic on most filesystems)
191///
192/// This prevents data corruption if the process crashes during write.
193///
194/// # Errors
195///
196/// Returns an error if:
197/// - Serialization fails
198/// - Cannot create temp file
199/// - Write fails
200/// - Rename fails
201pub fn save_document(path: &Path, doc: &PersistedDocument) -> Result<()> {
202    save_document_with_options(path, doc, &PersistenceOptions::default())
203}
204
205/// Save a document with custom options.
206pub fn save_document_with_options(
207    path: &Path,
208    doc: &PersistedDocument,
209    options: &PersistenceOptions,
210) -> Result<()> {
211    // Serialize the payload first
212    let payload_bytes = serde_json::to_vec(doc)
213        .map_err(|e| Error::Serialization(e.to_string()))?;
214
215    // Calculate checksum
216    let checksum = calculate_checksum(&payload_bytes);
217
218    // Create wrapper
219    let wrapper = PersistedWrapper {
220        version: FORMAT_VERSION,
221        checksum,
222        payload: doc.clone(),
223    };
224
225    // Serialize wrapper
226    let json = serde_json::to_string_pretty(&wrapper)
227        .map_err(|e| Error::Serialization(e.to_string()))?;
228
229    if options.atomic_writes {
230        // Atomic write: write to temp file, then rename
231        let temp_path = path.with_extension("tmp");
232
233        // Ensure parent directory exists
234        if let Some(parent) = path.parent() {
235            std::fs::create_dir_all(parent).map_err(Error::Io)?;
236        }
237
238        // Write to temp file
239        {
240            let file = File::create(&temp_path).map_err(Error::Io)?;
241            let mut writer = BufWriter::new(file);
242            writer.write_all(json.as_bytes()).map_err(Error::Io)?;
243            writer.flush().map_err(Error::Io)?;
244        }
245
246        // Atomic rename
247        std::fs::rename(&temp_path, path).map_err(Error::Io)?;
248    } else {
249        // Direct write (not atomic)
250        std::fs::write(path, json).map_err(Error::Io)?;
251    }
252
253    Ok(())
254}
255
256/// Load a document from a JSON file with checksum verification.
257///
258/// # Checksum Verification
259///
260/// When `verify_checksum` is enabled (default), this function:
261/// 1. Reads the file
262/// 2. Parses the wrapper
263/// 3. Re-serializes the payload
264/// 4. Verifies the checksum matches
265///
266/// # Errors
267///
268/// Returns an error if:
269/// - File doesn't exist
270/// - Parse fails
271/// - Checksum mismatch
272/// - Version mismatch (future: migration)
273pub fn load_document(path: &Path) -> Result<PersistedDocument> {
274    load_document_with_options(path, &PersistenceOptions::default())
275}
276
277/// Load a document with custom options.
278pub fn load_document_with_options(
279    path: &Path,
280    options: &PersistenceOptions,
281) -> Result<PersistedDocument> {
282    if !path.exists() {
283        return Err(Error::DocumentNotFound(
284            path.display().to_string()
285        ));
286    }
287
288    let file = File::open(path).map_err(Error::Io)?;
289    let reader = BufReader::new(file);
290
291    // Parse wrapper
292    let wrapper: PersistedWrapper<PersistedDocument> = serde_json::from_reader(reader)
293        .map_err(|e| Error::Parse(format!("Failed to parse document: {}", e)))?;
294
295    // Check version
296    if wrapper.version != FORMAT_VERSION {
297        return Err(Error::Parse(format!(
298            "Unsupported format version: {} (expected {})",
299            wrapper.version, FORMAT_VERSION
300        )));
301    }
302
303    // Verify checksum if enabled
304    if options.verify_checksum {
305        let payload_bytes = serde_json::to_vec(&wrapper.payload)
306            .map_err(|e| Error::Serialization(e.to_string()))?;
307
308        let expected_checksum = calculate_checksum(&payload_bytes);
309
310        if wrapper.checksum != expected_checksum {
311            return Err(Error::Parse(format!(
312                "Checksum mismatch: expected {}, got {}",
313                expected_checksum, wrapper.checksum
314            )));
315        }
316    }
317
318    Ok(wrapper.payload)
319}
320
321/// Save the workspace index (metadata for all documents).
322pub fn save_index(path: &Path, entries: &[DocumentMeta]) -> Result<()> {
323    save_index_with_options(path, entries, &PersistenceOptions::default())
324}
325
326/// Save the workspace index with custom options.
327pub fn save_index_with_options(
328    path: &Path,
329    entries: &[DocumentMeta],
330    options: &PersistenceOptions,
331) -> Result<()> {
332    // Serialize payload
333    let payload_bytes = serde_json::to_vec(entries)
334        .map_err(|e| Error::Serialization(e.to_string()))?;
335
336    let checksum = calculate_checksum(&payload_bytes);
337
338    let wrapper = PersistedWrapper {
339        version: FORMAT_VERSION,
340        checksum,
341        payload: entries.to_vec(),
342    };
343
344    let json = serde_json::to_string_pretty(&wrapper)
345        .map_err(|e| Error::Serialization(e.to_string()))?;
346
347    if options.atomic_writes {
348        let temp_path = path.with_extension("tmp");
349
350        // Ensure parent directory exists
351        if let Some(parent) = path.parent() {
352            std::fs::create_dir_all(parent).map_err(Error::Io)?;
353        }
354
355        // Write to temp file
356        {
357            let file = File::create(&temp_path).map_err(Error::Io)?;
358            let mut writer = BufWriter::new(file);
359            writer.write_all(json.as_bytes()).map_err(Error::Io)?;
360            writer.flush().map_err(Error::Io)?;
361        }
362
363        // Atomic rename
364        std::fs::rename(&temp_path, path).map_err(Error::Io)?;
365    } else {
366        std::fs::write(path, json).map_err(Error::Io)?;
367    }
368
369    Ok(())
370}
371
372/// Load the workspace index.
373pub fn load_index(path: &Path) -> Result<Vec<DocumentMeta>> {
374    load_index_with_options(path, &PersistenceOptions::default())
375}
376
377/// Load the workspace index with custom options.
378pub fn load_index_with_options(
379    path: &Path,
380    options: &PersistenceOptions,
381) -> Result<Vec<DocumentMeta>> {
382    if !path.exists() {
383        return Ok(Vec::new());
384    }
385
386    let file = File::open(path).map_err(Error::Io)?;
387    let reader = BufReader::new(file);
388
389    let wrapper: PersistedWrapper<Vec<DocumentMeta>> = serde_json::from_reader(reader)
390        .map_err(|e| Error::Parse(format!("Failed to parse index: {}", e)))?;
391
392    // Check version
393    if wrapper.version != FORMAT_VERSION {
394        return Err(Error::Parse(format!(
395            "Unsupported format version: {} (expected {})",
396            wrapper.version, FORMAT_VERSION
397        )));
398    }
399
400    // Verify checksum if enabled
401    if options.verify_checksum {
402        let payload_bytes = serde_json::to_vec(&wrapper.payload)
403            .map_err(|e| Error::Serialization(e.to_string()))?;
404
405        let expected_checksum = calculate_checksum(&payload_bytes);
406
407        if wrapper.checksum != expected_checksum {
408            return Err(Error::Parse(format!(
409                "Checksum mismatch: expected {}, got {}",
410                expected_checksum, wrapper.checksum
411            )));
412        }
413    }
414
415    Ok(wrapper.payload)
416}
417
418// ============================================================================
419// Bytes-based serialization (for StorageBackend integration)
420// ============================================================================
421
422/// Serialize a document to bytes (JSON with checksum wrapper).
423///
424/// This is useful for storage backends that work with byte arrays.
425pub fn save_document_to_bytes(doc: &PersistedDocument) -> Result<Vec<u8>> {
426    // Serialize the payload first
427    let payload_bytes = serde_json::to_vec(doc)
428        .map_err(|e| Error::Serialization(e.to_string()))?;
429
430    // Calculate checksum
431    let checksum = calculate_checksum(&payload_bytes);
432
433    // Create wrapper
434    let wrapper = PersistedWrapper {
435        version: FORMAT_VERSION,
436        checksum,
437        payload: doc.clone(),
438    };
439
440    // Serialize wrapper
441    serde_json::to_vec(&wrapper)
442        .map_err(|e| Error::Serialization(e.to_string()))
443}
444
445/// Deserialize a document from bytes.
446///
447/// Verifies checksum by default.
448pub fn load_document_from_bytes(data: &[u8]) -> Result<PersistedDocument> {
449    load_document_from_bytes_with_options(data, true)
450}
451
452/// Deserialize a document from bytes with optional checksum verification.
453pub fn load_document_from_bytes_with_options(
454    data: &[u8],
455    verify_checksum: bool,
456) -> Result<PersistedDocument> {
457    // Parse wrapper
458    let wrapper: PersistedWrapper<PersistedDocument> = serde_json::from_slice(data)
459        .map_err(|e| Error::Parse(format!("Failed to parse document: {}", e)))?;
460
461    // Check version
462    if wrapper.version != FORMAT_VERSION {
463        return Err(Error::VersionMismatch(format!(
464            "Expected version {}, got {}",
465            FORMAT_VERSION, wrapper.version
466        )));
467    }
468
469    // Verify checksum if enabled
470    if verify_checksum {
471        let payload_bytes = serde_json::to_vec(&wrapper.payload)
472            .map_err(|e| Error::Serialization(e.to_string()))?;
473
474        let expected_checksum = calculate_checksum(&payload_bytes);
475
476        if wrapper.checksum != expected_checksum {
477            return Err(Error::ChecksumMismatch(format!(
478                "Expected {}, got {}",
479                expected_checksum, wrapper.checksum
480            )));
481        }
482    }
483
484    Ok(wrapper.payload)
485}
486
487/// Serialize an index to bytes.
488pub fn save_index_to_bytes(entries: &[DocumentMeta]) -> Result<Vec<u8>> {
489    let payload_bytes = serde_json::to_vec(entries)
490        .map_err(|e| Error::Serialization(e.to_string()))?;
491
492    let checksum = calculate_checksum(&payload_bytes);
493
494    let wrapper = PersistedWrapper {
495        version: FORMAT_VERSION,
496        checksum,
497        payload: entries.to_vec(),
498    };
499
500    serde_json::to_vec(&wrapper)
501        .map_err(|e| Error::Serialization(e.to_string()))
502}
503
504/// Deserialize an index from bytes.
505pub fn load_index_from_bytes(data: &[u8]) -> Result<Vec<DocumentMeta>> {
506    load_index_from_bytes_with_options(data, true)
507}
508
509/// Deserialize an index from bytes with optional checksum verification.
510pub fn load_index_from_bytes_with_options(
511    data: &[u8],
512    verify_checksum: bool,
513) -> Result<Vec<DocumentMeta>> {
514    let wrapper: PersistedWrapper<Vec<DocumentMeta>> = serde_json::from_slice(data)
515        .map_err(|e| Error::Parse(format!("Failed to parse index: {}", e)))?;
516
517    // Check version
518    if wrapper.version != FORMAT_VERSION {
519        return Err(Error::VersionMismatch(format!(
520            "Expected version {}, got {}",
521            FORMAT_VERSION, wrapper.version
522        )));
523    }
524
525    // Verify checksum if enabled
526    if verify_checksum {
527        let payload_bytes = serde_json::to_vec(&wrapper.payload)
528            .map_err(|e| Error::Serialization(e.to_string()))?;
529
530        let expected_checksum = calculate_checksum(&payload_bytes);
531
532        if wrapper.checksum != expected_checksum {
533            return Err(Error::ChecksumMismatch(format!(
534                "Expected {}, got {}",
535                expected_checksum, wrapper.checksum
536            )));
537        }
538    }
539
540    Ok(wrapper.payload)
541}
542
#[cfg(test)]
mod tests {
    use super::*;
    use tempfile::TempDir;

    /// Fixture: a markdown document named "Test Doc" with the given id and a
    /// tree built from `DocumentTree::new("Root", "Content")`.
    fn create_test_doc(id: &str) -> PersistedDocument {
        PersistedDocument::new(
            DocumentMeta::new(id, "Test Doc", "md"),
            DocumentTree::new("Root", "Content"),
        )
    }

    /// A saved document loads back with the same metadata.
    #[test]
    fn test_save_and_load_document() {
        let dir = TempDir::new().unwrap();
        let file = dir.path().join("test.json");

        let original = create_test_doc("doc-1");
        save_document(&file, &original).unwrap();

        let roundtrip = load_document(&file).unwrap();
        assert_eq!(roundtrip.meta.id, "doc-1");
        assert_eq!(roundtrip.meta.name, "Test Doc");
    }

    /// An atomic save leaves no `.tmp` file behind and is loadable.
    #[test]
    fn test_atomic_write() {
        let dir = TempDir::new().unwrap();
        let file = dir.path().join("atomic.json");

        let original = create_test_doc("doc-atomic");
        let atomic = PersistenceOptions::new().with_atomic_writes(true);
        save_document_with_options(&file, &original, &atomic).unwrap();

        // Temp file should not exist after save
        let leftover = file.with_extension("tmp");
        assert!(!leftover.exists());

        let roundtrip = load_document(&file).unwrap();
        assert_eq!(roundtrip.meta.id, "doc-atomic");
    }

    /// Editing the payload on disk makes a verified load fail.
    #[test]
    fn test_checksum_verification() {
        let dir = TempDir::new().unwrap();
        let file = dir.path().join("checksum.json");

        let original = create_test_doc("doc-checksum");
        save_document(&file, &original).unwrap();

        // Corrupt the file by changing the id inside the payload
        let tampered = std::fs::read_to_string(&file)
            .unwrap()
            .replace("doc-checksum", "doc-corrupted");
        std::fs::write(&file, tampered).unwrap();

        // Load should fail with checksum error
        let outcome = load_document(&file);
        assert!(outcome.is_err());
        assert!(matches!(outcome.unwrap_err(), Error::Parse(_)));
    }

    /// With verification off a wrong checksum is tolerated; with it on the
    /// same file is rejected.
    #[test]
    fn test_checksum_disabled() {
        let dir = TempDir::new().unwrap();
        let file = dir.path().join("no-checksum.json");

        let original = create_test_doc("doc-no-check");
        save_document(&file, &original).unwrap();

        // Load with checksum disabled should succeed
        let unchecked = PersistenceOptions::new().with_verify_checksum(false);
        let first = load_document_with_options(&file, &unchecked);
        assert!(first.is_ok());
        assert_eq!(first.unwrap().meta.id, "doc-no-check");

        // Now corrupt the checksum field specifically:
        // change the checksum value but keep the payload intact
        let real_checksum = calculate_checksum(&serde_json::to_vec(&original).unwrap());
        let on_disk = std::fs::read_to_string(&file).unwrap();
        let tampered = on_disk.replace(
            &real_checksum,
            "0000000000000000000000000000000000000000000000000000000000000000",
        );
        std::fs::write(&file, tampered).unwrap();

        // Load with checksum disabled should still succeed
        assert!(load_document_with_options(&file, &unchecked).is_ok());

        // Load with checksum enabled should fail
        let checked = PersistenceOptions::new().with_verify_checksum(true);
        assert!(load_document_with_options(&file, &checked).is_err());
    }

    /// Loading a path that does not exist reports "not found".
    #[test]
    fn test_load_nonexistent() {
        let outcome = load_document(Path::new("/nonexistent/path.json"));
        assert!(outcome.is_err());

        let err = outcome.unwrap_err();
        assert!(err.is_not_found());
    }

    /// A saved index loads back with the same entries in the same order.
    #[test]
    fn test_save_and_load_index() {
        let dir = TempDir::new().unwrap();
        let file = dir.path().join("_meta.json");

        let entries = vec![
            DocumentMeta::new("doc-1", "Doc 1", "md"),
            DocumentMeta::new("doc-2", "Doc 2", "pdf"),
        ];

        save_index(&file, &entries).unwrap();

        let roundtrip = load_index(&file).unwrap();
        assert_eq!(roundtrip.len(), 2);
        assert_eq!(roundtrip[0].id, "doc-1");
        assert_eq!(roundtrip[1].format, "pdf");
    }

    /// A missing index file loads as an empty list.
    #[test]
    fn test_load_empty_index() {
        let dir = TempDir::new().unwrap();
        let missing = dir.path().join("nonexistent.json");

        let entries = load_index(&missing).unwrap();
        assert!(entries.is_empty());
    }

    /// Equal input gives equal checksums, different input differs, and the
    /// result is 64 hex characters long.
    #[test]
    fn test_checksum_calculation() {
        let same_a = calculate_checksum(b"test data");
        let same_b = calculate_checksum(b"test data");
        let other = calculate_checksum(b"different data");

        assert_eq!(same_a, same_b);
        assert_ne!(same_a, other);
        assert_eq!(same_a.len(), 64); // SHA-256 produces 64 hex chars
    }
}