Skip to main content

cdx_core/archive/
reader.rs

1//! Archive reader for Codex documents.
2
3use std::fs::File;
4use std::io::{BufReader, Cursor, Read, Seek};
5use std::path::Path;
6
7use zip::ZipArchive;
8
9use crate::{Error, HashAlgorithm, Hasher, Manifest, Result};
10
11use super::{validate_path, CONTENT_PATH, DUBLIN_CORE_PATH, MANIFEST_PATH, PHANTOMS_PATH};
12
/// Reader for Codex document archives.
///
/// `CdxReader` opens and validates `.cdx` files, providing access to their contents.
/// The reader validates the archive structure on creation and provides lazy access
/// to individual files: the manifest is parsed and validated up front, while every
/// other entry is only read when it is requested.
///
/// # Example
///
/// ```rust,ignore
/// use cdx_core::archive::CdxReader;
///
/// let mut reader = CdxReader::open("document.cdx")?;
///
/// // Access the manifest
/// let manifest = reader.manifest();
/// println!("Document state: {:?}", manifest.state);
///
/// // Read a file from the archive
/// let content = reader.read_file("content/document.json")?;
/// ```
pub struct CdxReader<R: Read + Seek> {
    /// Underlying ZIP archive; entries are read from it on demand.
    archive: ZipArchive<R>,
    /// Manifest parsed from the archive and validated when the reader is built.
    manifest: Manifest,
}
37
38impl CdxReader<BufReader<File>> {
39    /// Open a Codex document from a file path.
40    ///
41    /// # Errors
42    ///
43    /// Returns an error if:
44    /// - The file cannot be opened
45    /// - The file is not a valid ZIP archive
46    /// - Required files are missing
47    /// - The manifest is invalid
48    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
49        let file = File::open(path.as_ref()).map_err(|e| {
50            if e.kind() == std::io::ErrorKind::NotFound {
51                Error::FileNotFound {
52                    path: path.as_ref().to_path_buf(),
53                }
54            } else {
55                Error::Io(e)
56            }
57        })?;
58        let reader = BufReader::new(file);
59        Self::new(reader)
60    }
61}
62
63impl CdxReader<Cursor<Vec<u8>>> {
64    /// Open a Codex document from bytes in memory.
65    ///
66    /// # Errors
67    ///
68    /// Returns an error if:
69    /// - The data is not a valid ZIP archive
70    /// - Required files are missing
71    /// - The manifest is invalid
72    pub fn from_bytes(data: Vec<u8>) -> Result<Self> {
73        let cursor = Cursor::new(data);
74        Self::new(cursor)
75    }
76}
77
78impl<R: Read + Seek> CdxReader<R> {
79    /// Create a new reader from any `Read + Seek` source.
80    ///
81    /// This enables reading from files, memory buffers, network streams, etc.
82    ///
83    /// # Errors
84    ///
85    /// Returns an error if:
86    /// - The source is not a valid ZIP archive
87    /// - Required files are missing
88    /// - The manifest is invalid
89    pub fn new(reader: R) -> Result<Self> {
90        let mut archive = ZipArchive::new(reader)?;
91
92        // Validate structure
93        Self::validate_structure(&archive)?;
94
95        // Read and parse manifest
96        let manifest = Self::read_manifest(&mut archive)?;
97
98        // Validate manifest
99        manifest.validate()?;
100
101        Ok(Self { archive, manifest })
102    }
103
104    /// Validate the archive structure.
105    fn validate_structure(archive: &ZipArchive<R>) -> Result<()> {
106        // Check for required files
107        let required_files = [MANIFEST_PATH, CONTENT_PATH, DUBLIN_CORE_PATH];
108
109        for path in required_files {
110            if archive.index_for_name(path).is_none() {
111                return Err(Error::MissingFile {
112                    path: path.to_string(),
113                });
114            }
115        }
116
117        // Manifest must be the first file in the archive (per spec)
118        if let Some(first_file) = archive.file_names().next() {
119            if first_file != MANIFEST_PATH {
120                return Err(Error::InvalidArchiveStructure {
121                    reason: format!(
122                        "manifest.json must be the first file in the archive (found '{first_file}')"
123                    ),
124                });
125            }
126        }
127
128        Ok(())
129    }
130
131    /// Strip a UTF-8 BOM (byte order mark) prefix if present.
132    fn strip_utf8_bom(data: &[u8]) -> &[u8] {
133        data.strip_prefix(&[0xEF, 0xBB, 0xBF]).unwrap_or(data)
134    }
135
136    /// Read a file and parse it as JSON, stripping any UTF-8 BOM prefix.
137    fn read_json_file<T: serde::de::DeserializeOwned>(
138        archive: &mut ZipArchive<R>,
139        path: &str,
140    ) -> Result<T> {
141        let data = Self::read_file_internal(archive, path)?;
142        let json_data = Self::strip_utf8_bom(&data);
143        Ok(serde_json::from_slice(json_data)?)
144    }
145
146    /// Read and parse the manifest.
147    fn read_manifest(archive: &mut ZipArchive<R>) -> Result<Manifest> {
148        Self::read_json_file(archive, MANIFEST_PATH)
149    }
150
151    /// Maximum allowed file size for decompression (256 MiB).
152    ///
153    /// This limit protects against decompression bombs (zip bombs) where a small
154    /// compressed file expands to a very large size.
155    const MAX_FILE_SIZE: u64 = 256 * 1024 * 1024;
156
157    /// Internal file reading without path validation (for known-safe paths).
158    fn read_file_internal(archive: &mut ZipArchive<R>, path: &str) -> Result<Vec<u8>> {
159        let file = archive.by_name(path).map_err(|_| Error::MissingFile {
160            path: path.to_string(),
161        })?;
162
163        // Check declared size before allocating (catches honest oversized files)
164        if file.size() > Self::MAX_FILE_SIZE {
165            return Err(Error::FileTooLarge {
166                path: path.to_string(),
167                size: file.size(),
168                limit: Self::MAX_FILE_SIZE,
169            });
170        }
171
172        // Use try_from with fallback to 0 for platforms with smaller usize
173        let capacity = usize::try_from(file.size()).unwrap_or(0);
174        let mut data = Vec::with_capacity(capacity);
175        // Bounded read to catch spoofed/mismatched declared sizes
176        let bytes_read = file.take(Self::MAX_FILE_SIZE + 1).read_to_end(&mut data)?;
177        if bytes_read as u64 > Self::MAX_FILE_SIZE {
178            return Err(Error::FileTooLarge {
179                path: path.to_string(),
180                size: bytes_read as u64,
181                limit: Self::MAX_FILE_SIZE,
182            });
183        }
184        Ok(data)
185    }
186
    /// Get a reference to the document manifest.
    ///
    /// The manifest is parsed and validated once, when the reader is created,
    /// so this accessor performs no I/O.
    #[must_use]
    pub fn manifest(&self) -> &Manifest {
        &self.manifest
    }
192
193    /// Read a file from the archive.
194    ///
195    /// # Errors
196    ///
197    /// Returns an error if:
198    /// - The path contains traversal patterns (security check)
199    /// - The file does not exist in the archive
200    /// - Reading the file fails
201    pub fn read_file(&mut self, path: &str) -> Result<Vec<u8>> {
202        validate_path(path)?;
203        Self::read_file_internal(&mut self.archive, path)
204    }
205
206    /// Read a file and verify its hash against the expected hash.
207    ///
208    /// # Errors
209    ///
210    /// Returns an error if:
211    /// - The path contains traversal patterns
212    /// - The file does not exist
213    /// - The hash does not match the expected value
214    pub fn read_file_verified(
215        &mut self,
216        path: &str,
217        expected_hash: &crate::DocumentId,
218    ) -> Result<Vec<u8>> {
219        let data = self.read_file(path)?;
220
221        // Skip verification for pending hashes
222        if expected_hash.is_pending() {
223            return Ok(data);
224        }
225
226        let actual_hash = Hasher::hash(expected_hash.algorithm(), &data);
227
228        if actual_hash != *expected_hash {
229            return Err(Error::HashMismatch {
230                path: path.to_string(),
231                expected: expected_hash.to_string(),
232                actual: actual_hash.to_string(),
233            });
234        }
235
236        Ok(data)
237    }
238
239    /// Read the content file.
240    ///
241    /// This is a convenience method for reading `content/document.json`.
242    ///
243    /// # Errors
244    ///
245    /// Returns an error if reading the content file fails.
246    pub fn read_content(&mut self) -> Result<Vec<u8>> {
247        self.read_file_verified(CONTENT_PATH, &self.manifest.content.hash.clone())
248    }
249
250    /// Read the Dublin Core metadata file.
251    ///
252    /// This is a convenience method for reading `metadata/dublin-core.json`.
253    ///
254    /// # Errors
255    ///
256    /// Returns an error if reading the metadata file fails.
257    pub fn read_dublin_core(&mut self) -> Result<Vec<u8>> {
258        self.read_file(&self.manifest.metadata.dublin_core.clone())
259    }
260
261    /// Check if a file exists in the archive.
262    ///
263    /// # Errors
264    ///
265    /// Returns an error if the path contains traversal patterns.
266    pub fn file_exists(&self, path: &str) -> Result<bool> {
267        validate_path(path)?;
268        Ok(self.archive.index_for_name(path).is_some())
269    }
270
271    /// Get the list of all file paths in the archive.
272    #[must_use]
273    pub fn file_names(&self) -> Vec<String> {
274        self.archive.file_names().map(String::from).collect()
275    }
276
    /// Get the number of files in the archive.
    ///
    /// This is the entry count reported by the underlying ZIP archive, so it
    /// includes the manifest and every other entry.
    #[must_use]
    pub fn file_count(&self) -> usize {
        self.archive.len()
    }
282
    /// Get the hash algorithm used by this document.
    ///
    /// The value is taken from the manifest's `hash_algorithm` field.
    #[must_use]
    pub fn hash_algorithm(&self) -> HashAlgorithm {
        self.manifest.hash_algorithm
    }
288
289    /// Read phantom clusters from the archive.
290    ///
291    /// Returns `None` if the phantom clusters file doesn't exist.
292    ///
293    /// # Errors
294    ///
295    /// Returns an error if the file exists but cannot be parsed.
296    pub fn read_phantoms(&mut self) -> Result<Option<crate::extensions::PhantomClusters>> {
297        if self.archive.index_for_name(PHANTOMS_PATH).is_none() {
298            return Ok(None);
299        }
300
301        let phantoms: crate::extensions::PhantomClusters =
302            Self::read_json_file(&mut self.archive, PHANTOMS_PATH)?;
303        Ok(Some(phantoms))
304    }
305
306    /// Verify all file hashes in the manifest.
307    ///
308    /// This checks:
309    /// - Content file hash
310    /// - Presentation file hashes (if any)
311    ///
312    /// # Errors
313    ///
314    /// Returns an error if any hash verification fails.
315    pub fn verify_hashes(&mut self) -> Result<()> {
316        // Verify content hash
317        let content_data = self.read_file(CONTENT_PATH)?;
318        if !self.manifest.content.hash.is_pending() {
319            let actual = Hasher::hash(self.manifest.content.hash.algorithm(), &content_data);
320            if actual != self.manifest.content.hash {
321                return Err(Error::HashMismatch {
322                    path: CONTENT_PATH.to_string(),
323                    expected: self.manifest.content.hash.to_string(),
324                    actual: actual.to_string(),
325                });
326            }
327        }
328
329        // Verify presentation hashes
330        for pres in &self.manifest.presentation.clone() {
331            if !pres.hash.is_pending() {
332                let data = self.read_file(&pres.path)?;
333                let actual = Hasher::hash(pres.hash.algorithm(), &data);
334                if actual != pres.hash {
335                    return Err(Error::HashMismatch {
336                        path: pres.path.clone(),
337                        expected: pres.hash.to_string(),
338                        actual: actual.to_string(),
339                    });
340                }
341            }
342        }
343
344        Ok(())
345    }
346}
347
#[cfg(test)]
mod tests {
    use super::super::writer::CompressionMethod;
    use super::*;
    use crate::archive::CdxWriter;
    use crate::{ContentRef, DocumentId, Metadata};
    use std::io::{Cursor, Write};

    /// Minimal document body written to the content entry.
    const CONTENT_JSON: &[u8] = br#"{"version":"0.1","blocks":[]}"#;
    /// Minimal Dublin Core metadata body.
    const DUBLIN_CORE_JSON: &[u8] = br#"{"title":"Test"}"#;
    /// Placeholder body for entries whose content is irrelevant to the test.
    const EMPTY_JSON: &[u8] = b"{}";
    /// Hand-written manifest used by tests that bypass `CdxWriter`.
    const RAW_MANIFEST_JSON: &str = r#"{
            "codex": "0.1",
            "id": "pending",
            "state": "draft",
            "created": "2024-01-01T00:00:00Z",
            "modified": "2024-01-01T00:00:00Z",
            "hashAlgorithm": "sha256",
            "content": { "path": "content/document.json", "hash": "pending" },
            "metadata": { "dublinCore": "metadata/dublin-core.json" }
        }"#;

    /// Build a minimal manifest whose content entry carries `hash`.
    fn manifest_with_content_hash(hash: DocumentId) -> Manifest {
        let content = ContentRef {
            path: CONTENT_PATH.to_string(),
            hash,
            compression: None,
            merkle_root: None,
            block_count: None,
        };
        let metadata = Metadata {
            dublin_core: DUBLIN_CORE_PATH.to_string(),
            custom: None,
        };
        Manifest::new(content, metadata)
    }

    /// Write a `.cdx` archive with `CdxWriter`: manifest, the two required
    /// files, then any `extra_files` in the given order.
    fn build_cdx(manifest: &Manifest, extra_files: &[(&str, &[u8])]) -> Vec<u8> {
        let mut writer = CdxWriter::new(Cursor::new(Vec::new())).unwrap();

        writer.write_manifest(manifest).unwrap();
        writer
            .write_file(CONTENT_PATH, CONTENT_JSON, CompressionMethod::Deflate)
            .unwrap();
        writer
            .write_file(DUBLIN_CORE_PATH, DUBLIN_CORE_JSON, CompressionMethod::Deflate)
            .unwrap();
        for &(path, bytes) in extra_files {
            writer
                .write_file(path, bytes, CompressionMethod::Deflate)
                .unwrap();
        }

        writer.finish().unwrap().into_inner()
    }

    /// Write a plain ZIP with exactly `entries`, in order, bypassing `CdxWriter`
    /// so that malformed archives can be produced.
    fn build_raw_zip(entries: &[(&str, &[u8])]) -> Vec<u8> {
        let mut writer = zip::ZipWriter::new(Cursor::new(Vec::new()));
        for &(name, bytes) in entries {
            writer
                .start_file::<&str, ()>(name, Default::default())
                .unwrap();
            writer.write_all(bytes).unwrap();
        }
        writer.finish().unwrap().into_inner()
    }

    /// A valid archive with a pending content hash and only the required files.
    fn create_test_archive() -> Vec<u8> {
        build_cdx(&manifest_with_content_hash(DocumentId::pending()), &[])
    }

    /// Open the default test archive, panicking if it is rejected.
    fn open_test_archive() -> CdxReader<Cursor<Vec<u8>>> {
        CdxReader::from_bytes(create_test_archive()).unwrap()
    }

    #[test]
    fn test_reader_from_bytes() {
        let reader = open_test_archive();
        assert_eq!(reader.manifest().codex, "0.1");
    }

    #[test]
    fn test_reader_file_list() {
        let reader = open_test_archive();
        let files = reader.file_names();
        assert!(files.contains(&MANIFEST_PATH.to_string()));
        assert!(files.contains(&CONTENT_PATH.to_string()));
        assert!(files.contains(&DUBLIN_CORE_PATH.to_string()));
    }

    #[test]
    fn test_reader_read_file() {
        let mut reader = open_test_archive();
        let content = reader.read_file(CONTENT_PATH).unwrap();
        assert!(!content.is_empty());
    }

    #[test]
    fn test_reader_file_exists() {
        let reader = open_test_archive();
        assert!(reader.file_exists(MANIFEST_PATH).unwrap());
        assert!(reader.file_exists(CONTENT_PATH).unwrap());
        assert!(!reader.file_exists("nonexistent.json").unwrap());
    }

    #[test]
    fn test_reader_path_traversal_rejected() {
        let mut reader = open_test_archive();
        assert!(reader.read_file("../secret").is_err());
        assert!(reader.file_exists("../secret").is_err());
    }

    #[test]
    fn test_reader_missing_file_error() {
        let mut reader = open_test_archive();
        let result = reader.read_file("nonexistent.json");
        assert!(matches!(result, Err(Error::MissingFile { .. })));
    }

    #[test]
    fn test_open_corrupted_zip() {
        // Starts with the local-file-header magic but is not a valid ZIP
        let corrupted = vec![0x50, 0x4B, 0x03, 0x04, 0xFF, 0xFF, 0xFF, 0xFF];
        let result = CdxReader::from_bytes(corrupted);
        assert!(result.is_err());
    }

    #[test]
    fn test_open_not_a_zip() {
        // Plain text, not a ZIP file
        let not_zip = b"This is not a ZIP file at all".to_vec();
        let result = CdxReader::from_bytes(not_zip);
        assert!(result.is_err());
    }

    #[test]
    fn test_open_empty_zip() {
        // A well-formed ZIP with no entries at all
        let result = CdxReader::from_bytes(build_raw_zip(&[]));
        assert!(matches!(result, Err(Error::MissingFile { .. })));
    }

    #[test]
    fn test_open_missing_manifest() {
        let data = build_raw_zip(&[
            (CONTENT_PATH, EMPTY_JSON),
            (DUBLIN_CORE_PATH, EMPTY_JSON),
        ]);

        let result = CdxReader::from_bytes(data);
        assert!(matches!(result, Err(Error::MissingFile { path }) if path == MANIFEST_PATH));
    }

    #[test]
    fn test_open_missing_content() {
        // Manifest and Dublin Core are present, the content file is not
        let manifest: &[u8] = br#"{"codex":"0.1"}"#;
        let data = build_raw_zip(&[(MANIFEST_PATH, manifest), (DUBLIN_CORE_PATH, EMPTY_JSON)]);

        let result = CdxReader::from_bytes(data);
        assert!(matches!(result, Err(Error::MissingFile { path }) if path == CONTENT_PATH));
    }

    #[test]
    fn test_open_invalid_manifest_json() {
        let broken_manifest: &[u8] = b"{ invalid json }";
        let data = build_raw_zip(&[
            (MANIFEST_PATH, broken_manifest),
            (CONTENT_PATH, EMPTY_JSON),
            (DUBLIN_CORE_PATH, EMPTY_JSON),
        ]);

        let result = CdxReader::from_bytes(data);
        assert!(result.is_err());
    }

    #[test]
    fn test_read_file_hash_mismatch() {
        // The manifest claims a hash that the written content does not have
        let expected_hash: DocumentId =
            "sha256:0000000000000000000000000000000000000000000000000000000000000000"
                .parse()
                .unwrap();
        let data = build_cdx(&manifest_with_content_hash(expected_hash.clone()), &[]);
        let mut reader = CdxReader::from_bytes(data).unwrap();

        let result = reader.read_file_verified(CONTENT_PATH, &expected_hash);
        assert!(matches!(result, Err(Error::HashMismatch { .. })));
    }

    #[test]
    fn test_verify_hashes_with_mismatch() {
        let wrong_hash: DocumentId =
            "sha256:ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff"
                .parse()
                .unwrap();
        let data = build_cdx(&manifest_with_content_hash(wrong_hash), &[]);
        let mut reader = CdxReader::from_bytes(data).unwrap();

        let result = reader.verify_hashes();
        assert!(matches!(result, Err(Error::HashMismatch { .. })));
    }

    #[test]
    fn test_read_file_verified_with_pending_hash() {
        let mut reader = open_test_archive();

        // Pending hashes should skip verification
        let pending = DocumentId::pending();
        let result = reader.read_file_verified(CONTENT_PATH, &pending);
        assert!(result.is_ok());
    }

    #[test]
    fn test_unicode_filenames() {
        let unicode_files: [(&str, &[u8]); 2] = [
            ("assets/文档.txt", b"Unicode content".as_slice()),
            ("assets/émoji_🎉.txt", b"Emoji content".as_slice()),
        ];
        let data = build_cdx(
            &manifest_with_content_hash(DocumentId::pending()),
            &unicode_files,
        );
        let mut reader = CdxReader::from_bytes(data).unwrap();

        // Every Unicode path must be listed and readable with its exact bytes
        let files = reader.file_names();
        for (path, expected) in unicode_files {
            assert!(files.contains(&path.to_string()));
            assert_eq!(reader.read_file(path).unwrap(), expected);
        }
    }

    #[test]
    fn test_file_count() {
        let reader = open_test_archive();
        // manifest, content, dublin_core = 3 files
        assert_eq!(reader.file_count(), 3);
    }

    #[test]
    fn test_hash_algorithm() {
        let reader = open_test_archive();
        assert_eq!(reader.hash_algorithm(), HashAlgorithm::Sha256);
    }

    #[test]
    fn test_read_phantoms_none() {
        let mut reader = open_test_archive();
        // No phantoms file in the test archive
        let result = reader.read_phantoms().unwrap();
        assert!(result.is_none());
    }

    #[test]
    fn test_manifest_must_be_first_file() {
        // Content is written BEFORE the manifest, which violates the layout rule
        let data = build_raw_zip(&[
            (CONTENT_PATH, CONTENT_JSON),
            (MANIFEST_PATH, RAW_MANIFEST_JSON.as_bytes()),
            (DUBLIN_CORE_PATH, DUBLIN_CORE_JSON),
        ]);
        let result = CdxReader::from_bytes(data);

        let err = result.err().expect("should be an error");
        assert!(matches!(err, Error::InvalidArchiveStructure { .. }));
    }

    #[test]
    fn test_manifest_first_file_passes() {
        // Normal archive created by CdxWriter should have manifest first
        let result = CdxReader::from_bytes(create_test_archive());
        assert!(result.is_ok());
    }

    #[test]
    fn test_utf8_bom_stripped_from_manifest() {
        // Manifest bytes prefixed with the UTF-8 BOM
        let mut bom_manifest = vec![0xEF, 0xBB, 0xBF];
        bom_manifest.extend_from_slice(RAW_MANIFEST_JSON.as_bytes());

        let data = build_raw_zip(&[
            (MANIFEST_PATH, bom_manifest.as_slice()),
            (CONTENT_PATH, CONTENT_JSON),
            (DUBLIN_CORE_PATH, DUBLIN_CORE_JSON),
        ]);
        let reader = CdxReader::from_bytes(data);
        assert!(
            reader.is_ok(),
            "BOM-prefixed manifest should parse correctly"
        );
        assert_eq!(reader.unwrap().manifest().codex, "0.1");
    }

    #[test]
    fn test_utf8_bom_not_required() {
        // Regular archive without BOM should still work fine
        let reader = CdxReader::from_bytes(create_test_archive());
        assert!(reader.is_ok());
    }
}