Skip to main content

cdx_core/archive/
mod.rs

1//! Archive reading and writing for Codex documents.
2//!
3//! Codex documents are packaged as ZIP archives with the `.cdx` extension.
4//! This module provides [`CdxReader`] and [`CdxWriter`] for working with these archives.
5//!
6//! # Reading Documents
7//!
8//! ```rust,ignore
9//! use cdx_core::archive::CdxReader;
10//!
11//! let mut reader = CdxReader::open("document.cdx")?;
12//! let manifest = reader.manifest();
13//! let content = reader.read_file("content/document.json")?;
14//! ```
15//!
16//! # Writing Documents
17//!
18//! ```rust,ignore
19//! use cdx_core::archive::CdxWriter;
20//!
21//! let mut writer = CdxWriter::create("output.cdx")?;
22//! writer.write_manifest(&manifest)?;
23//! writer.write_file("content/document.json", &content)?;
24//! writer.finish()?;
25//! ```
26
27mod reader;
28mod storage;
29mod writer;
30
31pub use reader::CdxReader;
32pub use storage::{ArchiveStorage, MemoryStorage};
33pub use writer::{CdxWriter, CompressionMethod};
34
// Archive-relative locations of well-known files. Every path is relative to
// the archive root and uses `/` as the separator.

// --- Files at the root and under `content/` ---

/// Location of the manifest inside the archive.
pub const MANIFEST_PATH: &str = "manifest.json";

/// Location of the content file inside the archive.
pub const CONTENT_PATH: &str = "content/document.json";

// --- Files under `metadata/` ---

/// Location of the Dublin Core metadata file inside the archive.
pub const DUBLIN_CORE_PATH: &str = "metadata/dublin-core.json";

/// Location of the JSON-LD metadata file inside the archive.
pub const JSONLD_PATH: &str = "metadata/jsonld.json";

// --- Files under `security/` ---

/// Location of the signatures file inside the archive.
pub const SIGNATURES_PATH: &str = "security/signatures.json";

/// Location of the encryption metadata file inside the archive.
pub const ENCRYPTION_PATH: &str = "security/encryption.json";

// --- Files under feature-specific directories ---

/// Location of the phantom clusters file inside the archive.
pub const PHANTOMS_PATH: &str = "phantoms/clusters.json";

/// Location of the academic numbering configuration file inside the archive.
pub const ACADEMIC_NUMBERING_PATH: &str = "academic/numbering.json";

/// Location of the collaboration comments file inside the archive.
pub const COMMENTS_PATH: &str = "collaboration/comments.json";

/// Location of the form data file inside the archive.
pub const FORMS_DATA_PATH: &str = "forms/data.json";

/// Location of the bibliography file inside the archive.
pub const BIBLIOGRAPHY_PATH: &str = "semantic/bibliography.json";

// --- Archive-level metadata ---

/// Text used as the ZIP comment of Codex documents.
pub const ZIP_COMMENT: &str = "Codex Document Format v0.1";
70
/// Report whether every byte of an asset path is URL-safe.
///
/// The Codex spec says asset paths SHOULD be limited to URL-safe characters:
/// ASCII letters, ASCII digits, `.`, `-`, `_`, and `/`. The result is `true`
/// only when no other byte occurs in `path`; an empty path therefore yields
/// `true`.
///
/// Because this is a SHOULD-level rule, a `false` result does not make the
/// path invalid — it only signals a possible interoperability problem.
#[must_use]
pub fn is_url_safe_path(path: &str) -> bool {
    // Punctuation permitted in addition to ASCII alphanumerics.
    let is_allowed = |byte: u8| byte.is_ascii_alphanumeric() || b"._-/".contains(&byte);

    // Any non-ASCII character encodes to bytes >= 0x80, none of which pass.
    path.bytes().all(is_allowed)
}
84
85/// Validate that a path is safe (no path traversal).
86///
87/// # Errors
88///
89/// Returns `PathTraversal` error if the path contains `..` segments or other unsafe patterns.
90pub(crate) fn validate_path(path: &str) -> crate::Result<()> {
91    // Check for path traversal attempts
92    if path.contains("..") {
93        return Err(crate::Error::PathTraversal {
94            path: path.to_string(),
95        });
96    }
97
98    // Check for absolute paths (should not start with /)
99    if path.starts_with('/') {
100        return Err(crate::Error::PathTraversal {
101            path: path.to_string(),
102        });
103    }
104
105    // Check for backslashes (Windows path separators)
106    if path.contains('\\') {
107        return Err(crate::Error::PathTraversal {
108            path: path.to_string(),
109        });
110    }
111
112    Ok(())
113}
114
#[cfg(test)]
mod tests {
    use super::*;

    /// Assert that `validate_path` accepts every path in `paths`.
    fn assert_all_accepted(paths: &[&str]) {
        for path in paths {
            assert!(
                validate_path(path).is_ok(),
                "expected {:?} to be accepted",
                path
            );
        }
    }

    /// Assert that `validate_path` rejects every path in `paths`.
    fn assert_all_rejected(paths: &[&str]) {
        for path in paths {
            assert!(
                validate_path(path).is_err(),
                "expected {:?} to be rejected",
                path
            );
        }
    }

    /// Assert that `is_url_safe_path` returns `expected` for every path in `paths`.
    fn assert_url_safety(paths: &[&str], expected: bool) {
        for path in paths {
            assert_eq!(
                is_url_safe_path(path),
                expected,
                "unexpected URL-safety verdict for {:?}",
                path
            );
        }
    }

    #[test]
    fn test_validate_path_safe() {
        assert_all_accepted(&[
            "manifest.json",
            "content/document.json",
            "assets/images/photo.png",
        ]);
    }

    #[test]
    fn test_validate_path_traversal() {
        assert_all_rejected(&["../secret", "foo/../bar", "foo/.."]);
    }

    #[test]
    fn test_validate_path_absolute() {
        assert_all_rejected(&["/etc/passwd", "/manifest.json"]);
    }

    #[test]
    fn test_validate_path_backslash() {
        assert_all_rejected(&["foo\\bar", "..\\secret"]);
    }

    #[test]
    fn test_url_safe_path_valid() {
        assert_url_safety(
            &[
                "assets/image-01.png",
                "assets/photo_2024.jpg",
                "content/document.json",
                "a-z_0-9/file.ext",
            ],
            true,
        );
    }

    #[test]
    fn test_url_safe_path_invalid() {
        assert_url_safety(
            &[
                // A space is not URL-safe.
                "assets/file name.png",
                // Neither is the `%` of a percent-encoded sequence.
                "assets/file%20name.png",
                // Non-ASCII (Unicode) characters.
                "assets/文档.txt",
                // Other special characters.
                "assets/file@2x.png",
                "assets/file#1.png",
                "assets/file(1).png",
            ],
            false,
        );
    }
}