Skip to main content

cdx_core/archive/
writer.rs

1//! Archive writer for Codex documents.
2
3use std::fs::File;
4use std::io::{BufWriter, Cursor, Seek, Write};
5use std::path::Path;
6
7use zip::write::FileOptions;
8use zip::ZipWriter;
9
10use crate::{Manifest, Result};
11
12use super::{validate_path, PHANTOMS_PATH, ZIP_COMMENT};
13
/// Selects how an individual entry is compressed when it is added to the archive.
///
/// The default is [`CompressionMethod::Deflate`].
#[derive(Clone, Copy, Debug, Default, Eq, PartialEq)]
pub enum CompressionMethod {
    /// No compression: bytes are stored verbatim (intended for content that is
    /// already compressed, such as images).
    Stored,
    /// Deflate compression (widely compatible, required support).
    #[default]
    Deflate,
    /// Zstandard compression (better ratio, optional support).
    ///
    /// Only present when the `zstd` feature is enabled.
    #[cfg(feature = "zstd")]
    Zstd,
}
26
27impl CompressionMethod {
28    fn to_zip_method(self) -> zip::CompressionMethod {
29        match self {
30            Self::Stored => zip::CompressionMethod::Stored,
31            Self::Deflate => zip::CompressionMethod::Deflated,
32            #[cfg(feature = "zstd")]
33            Self::Zstd => zip::CompressionMethod::Zstd,
34        }
35    }
36}
37
38/// Writer for creating Codex document archives.
39///
40/// `CdxWriter` creates properly formatted `.cdx` files, ensuring the manifest
41/// is written first and all required structure is maintained.
42///
43/// # Example
44///
45/// ```rust,ignore
46/// use cdx_core::archive::{CdxWriter, CompressionMethod};
47///
48/// let mut writer = CdxWriter::create("output.cdx")?;
49///
50/// writer.write_manifest(&manifest)?;
51/// writer.write_file("content/document.json", &content, CompressionMethod::Deflate)?;
52/// writer.write_file("metadata/dublin-core.json", &metadata, CompressionMethod::Deflate)?;
53///
54/// writer.finish()?;
55/// ```
56pub struct CdxWriter<W: Write + Seek> {
57    zip: ZipWriter<W>,
58    manifest_written: bool,
59    files_written: Vec<String>,
60}
61
62impl CdxWriter<BufWriter<File>> {
63    /// Create a new Codex document at the given file path.
64    ///
65    /// # Errors
66    ///
67    /// Returns an error if the file cannot be created.
68    pub fn create<P: AsRef<Path>>(path: P) -> Result<Self> {
69        let file = File::create(path.as_ref()).map_err(|e| {
70            if e.kind() == std::io::ErrorKind::NotFound {
71                crate::Error::FileNotFound {
72                    path: path.as_ref().to_path_buf(),
73                }
74            } else {
75                crate::Error::Io(e)
76            }
77        })?;
78        let writer = BufWriter::new(file);
79        Self::new(writer)
80    }
81}
82
83impl CdxWriter<Cursor<Vec<u8>>> {
84    /// Create a new Codex document in memory.
85    ///
86    /// # Panics
87    ///
88    /// This function will not panic in practice, as initializing
89    /// a `ZipWriter` on an in-memory buffer cannot fail.
90    #[must_use]
91    pub fn in_memory() -> Self {
92        let cursor = Cursor::new(Vec::new());
93        // This cannot fail for an in-memory buffer
94        Self::new(cursor).expect("in-memory writer should not fail")
95    }
96}
97
98impl<W: Write + Seek> CdxWriter<W> {
99    /// Create a new writer wrapping any `Write + Seek` destination.
100    ///
101    /// # Errors
102    ///
103    /// Returns an error if initialization fails.
104    pub fn new(writer: W) -> Result<Self> {
105        let mut zip = ZipWriter::new(writer);
106        zip.set_comment(ZIP_COMMENT);
107
108        Ok(Self {
109            zip,
110            manifest_written: false,
111            files_written: Vec::new(),
112        })
113    }
114
115    /// Write the manifest to the archive.
116    ///
117    /// This must be called before writing any other files, as the manifest
118    /// must be the first file in the archive per the Codex specification.
119    ///
120    /// # Errors
121    ///
122    /// Returns an error if:
123    /// - Writing fails
124    /// - The manifest has already been written
125    pub fn write_manifest(&mut self, manifest: &Manifest) -> Result<()> {
126        if self.manifest_written {
127            return Err(crate::Error::InvalidManifest {
128                reason: "manifest already written".to_string(),
129            });
130        }
131
132        if !self.files_written.is_empty() {
133            return Err(crate::Error::InvalidManifest {
134                reason: "manifest must be the first file in the archive".to_string(),
135            });
136        }
137
138        let json = serde_json::to_vec_pretty(manifest)?;
139        self.write_file_internal(super::MANIFEST_PATH, &json, CompressionMethod::Deflate)?;
140        self.manifest_written = true;
141
142        Ok(())
143    }
144
145    /// Write a file to the archive.
146    ///
147    /// # Errors
148    ///
149    /// Returns an error if:
150    /// - The manifest has not been written yet
151    /// - The path contains traversal patterns (security check)
152    /// - Writing fails
153    /// - A file with the same path already exists
154    pub fn write_file(
155        &mut self,
156        path: &str,
157        data: &[u8],
158        compression: CompressionMethod,
159    ) -> Result<()> {
160        if !self.manifest_written {
161            return Err(crate::Error::InvalidManifest {
162                reason: "manifest must be written before other files".to_string(),
163            });
164        }
165
166        validate_path(path)?;
167
168        if self.files_written.contains(&path.to_string()) {
169            return Err(crate::Error::InvalidManifest {
170                reason: format!("file already exists: {path}"),
171            });
172        }
173
174        self.write_file_internal(path, data, compression)
175    }
176
177    /// Internal file writing without manifest check (for manifest itself).
178    fn write_file_internal(
179        &mut self,
180        path: &str,
181        data: &[u8],
182        compression: CompressionMethod,
183    ) -> Result<()> {
184        let options = FileOptions::<()>::default()
185            .compression_method(compression.to_zip_method())
186            .unix_permissions(0o644);
187
188        self.zip.start_file(path, options)?;
189        self.zip.write_all(data)?;
190        self.files_written.push(path.to_string());
191
192        Ok(())
193    }
194
195    /// Write a file with automatic hash computation.
196    ///
197    /// Returns the computed hash for inclusion in the manifest.
198    ///
199    /// # Errors
200    ///
201    /// Returns an error if writing fails.
202    pub fn write_file_hashed(
203        &mut self,
204        path: &str,
205        data: &[u8],
206        compression: CompressionMethod,
207        algorithm: crate::HashAlgorithm,
208    ) -> Result<crate::DocumentId> {
209        let hash = crate::Hasher::hash(algorithm, data);
210        self.write_file(path, data, compression)?;
211        Ok(hash)
212    }
213
214    /// Write phantom clusters to the archive.
215    ///
216    /// Phantom clusters are stored at `phantoms/clusters.json` and are
217    /// not included in the content hash since they exist outside the
218    /// core content boundary.
219    ///
220    /// # Errors
221    ///
222    /// Returns an error if writing fails.
223    pub fn write_phantoms(&mut self, phantoms: &crate::extensions::PhantomClusters) -> Result<()> {
224        let json = serde_json::to_vec_pretty(phantoms)?;
225        self.write_file(PHANTOMS_PATH, &json, CompressionMethod::Deflate)
226    }
227
228    /// Start a directory in the archive.
229    ///
230    /// This is optional, as ZIP archives create directories implicitly,
231    /// but can be useful for clarity.
232    ///
233    /// # Errors
234    ///
235    /// Returns an error if adding the directory fails.
236    pub fn add_directory(&mut self, path: &str) -> Result<()> {
237        validate_path(path)?;
238
239        let dir_path = if path.ends_with('/') {
240            path.to_string()
241        } else {
242            format!("{path}/")
243        };
244
245        let options =
246            FileOptions::<()>::default().compression_method(zip::CompressionMethod::Stored);
247
248        self.zip.add_directory(&dir_path, options)?;
249
250        Ok(())
251    }
252
253    /// Check if the manifest has been written.
254    #[must_use]
255    pub fn manifest_written(&self) -> bool {
256        self.manifest_written
257    }
258
259    /// Get the list of files that have been written.
260    #[must_use]
261    pub fn files_written(&self) -> &[String] {
262        &self.files_written
263    }
264
265    /// Finish writing and close the archive.
266    ///
267    /// # Errors
268    ///
269    /// Returns an error if:
270    /// - The manifest was not written
271    /// - Finalizing the archive fails
272    pub fn finish(self) -> Result<W> {
273        if !self.manifest_written {
274            return Err(crate::Error::InvalidManifest {
275                reason: "manifest must be written before finishing".to_string(),
276            });
277        }
278
279        let writer = self.zip.finish()?;
280        Ok(writer)
281    }
282
283    /// Abort writing and return the underlying writer without finalizing.
284    ///
285    /// The resulting archive will be invalid.
286    ///
287    /// # Panics
288    ///
289    /// Panics if the ZIP finalization fails, which should not happen
290    /// for valid writer implementations.
291    #[must_use]
292    pub fn abort(self) -> W {
293        self.zip.finish().unwrap_or_else(|_| {
294            // If finish fails, we've already aborted, which is fine
295            panic!("abort should not fail")
296        })
297    }
298}
299
#[cfg(test)]
mod tests {
    use super::*;
    use crate::archive::{CONTENT_PATH, DUBLIN_CORE_PATH};
    use crate::{ContentRef, DocumentId, Metadata};

    /// Build a minimal manifest pointing at the standard content and
    /// Dublin Core paths, with a pending content hash.
    fn create_test_manifest() -> Manifest {
        Manifest::new(
            ContentRef {
                path: CONTENT_PATH.to_string(),
                hash: DocumentId::pending(),
                compression: None,
                merkle_root: None,
                block_count: None,
            },
            Metadata {
                dublin_core: DUBLIN_CORE_PATH.to_string(),
                custom: None,
            },
        )
    }

    /// In-memory writer that already has the test manifest written.
    fn writer_with_manifest() -> CdxWriter<Cursor<Vec<u8>>> {
        let mut writer = CdxWriter::in_memory();
        writer.write_manifest(&create_test_manifest()).unwrap();
        writer
    }

    /// A full manifest + content + metadata round produces non-empty bytes.
    #[test]
    fn test_writer_in_memory() {
        let mut writer = writer_with_manifest();

        let content_json = br#"{"version":"0.1","blocks":[]}"#;
        writer
            .write_file(CONTENT_PATH, content_json, CompressionMethod::Deflate)
            .unwrap();

        let dublin_core_json = br#"{"title":"Test"}"#;
        writer
            .write_file(DUBLIN_CORE_PATH, dublin_core_json, CompressionMethod::Deflate)
            .unwrap();

        let archive_bytes = writer.finish().unwrap().into_inner();
        assert!(!archive_bytes.is_empty());
    }

    /// Writing a regular file before the manifest is rejected.
    #[test]
    fn test_writer_manifest_first() {
        let mut writer = CdxWriter::in_memory();

        let outcome = writer.write_file(CONTENT_PATH, b"test", CompressionMethod::Deflate);
        assert!(outcome.is_err());
    }

    /// The manifest can only be written a single time.
    #[test]
    fn test_writer_manifest_once() {
        let manifest = create_test_manifest();
        let mut writer = CdxWriter::in_memory();

        writer.write_manifest(&manifest).unwrap();

        let second_attempt = writer.write_manifest(&manifest);
        assert!(second_attempt.is_err());
    }

    /// Paths containing traversal components are refused.
    #[test]
    fn test_writer_path_traversal_rejected() {
        let mut writer = writer_with_manifest();

        let outcome = writer.write_file("../secret", b"data", CompressionMethod::Deflate);
        assert!(outcome.is_err());
    }

    /// Writing the same path twice fails on the second attempt.
    #[test]
    fn test_writer_duplicate_file_rejected() {
        let mut writer = writer_with_manifest();

        writer
            .write_file(CONTENT_PATH, b"first", CompressionMethod::Deflate)
            .unwrap();

        let outcome = writer.write_file(CONTENT_PATH, b"second", CompressionMethod::Deflate);
        assert!(outcome.is_err());
    }

    /// `finish` refuses to close an archive that has no manifest.
    #[test]
    fn test_writer_finish_requires_manifest() {
        let writer = CdxWriter::in_memory();
        assert!(writer.finish().is_err());
    }

    /// Files written with `Stored` compression are tracked like any other.
    #[test]
    fn test_writer_compression_stored() {
        let mut writer = writer_with_manifest();

        writer
            .write_file(CONTENT_PATH, b"test data", CompressionMethod::Stored)
            .unwrap();

        let expected = CONTENT_PATH.to_string();
        assert!(writer.files_written().contains(&expected));
    }

    /// `write_file_hashed` returns a resolved hash using the requested algorithm.
    #[test]
    fn test_writer_hashed() {
        let mut writer = writer_with_manifest();

        let payload = b"test content";
        let digest = writer
            .write_file_hashed(
                CONTENT_PATH,
                payload,
                CompressionMethod::Deflate,
                crate::HashAlgorithm::Sha256,
            )
            .unwrap();

        assert!(!digest.is_pending());
        assert_eq!(digest.algorithm(), crate::HashAlgorithm::Sha256);
    }
}
430}