Skip to main content

cdx_core/archive/
reader.rs

1//! Archive reader for Codex documents.
2
3use std::fs::File;
4use std::io::{BufReader, Cursor, Read, Seek};
5use std::path::Path;
6
7use zip::ZipArchive;
8
9use crate::{Error, HashAlgorithm, Hasher, Manifest, Result};
10
11use super::{validate_path, CONTENT_PATH, DUBLIN_CORE_PATH, MANIFEST_PATH, PHANTOMS_PATH};
12
13/// Reader for Codex document archives.
14///
15/// `CdxReader` opens and validates `.cdx` files, providing access to their contents.
16/// The reader validates the archive structure on creation and provides lazy access
17/// to individual files.
18///
19/// # Example
20///
21/// ```rust,ignore
22/// use cdx_core::archive::CdxReader;
23///
24/// let mut reader = CdxReader::open("document.cdx")?;
25///
26/// // Access the manifest
27/// let manifest = reader.manifest();
28/// println!("Document state: {:?}", manifest.state);
29///
30/// // Read a file from the archive
31/// let content = reader.read_file("content/document.json")?;
32/// ```
33pub struct CdxReader<R: Read + Seek> {
34    archive: ZipArchive<R>,
35    manifest: Manifest,
36}
37
38impl CdxReader<BufReader<File>> {
39    /// Open a Codex document from a file path.
40    ///
41    /// # Errors
42    ///
43    /// Returns an error if:
44    /// - The file cannot be opened
45    /// - The file is not a valid ZIP archive
46    /// - Required files are missing
47    /// - The manifest is invalid
48    pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
49        let file = File::open(path.as_ref()).map_err(|e| {
50            if e.kind() == std::io::ErrorKind::NotFound {
51                Error::FileNotFound {
52                    path: path.as_ref().to_path_buf(),
53                }
54            } else {
55                Error::Io(e)
56            }
57        })?;
58        let reader = BufReader::new(file);
59        Self::new(reader)
60    }
61}
62
63impl CdxReader<Cursor<Vec<u8>>> {
64    /// Open a Codex document from bytes in memory.
65    ///
66    /// # Errors
67    ///
68    /// Returns an error if:
69    /// - The data is not a valid ZIP archive
70    /// - Required files are missing
71    /// - The manifest is invalid
72    pub fn from_bytes(data: Vec<u8>) -> Result<Self> {
73        let cursor = Cursor::new(data);
74        Self::new(cursor)
75    }
76}
77
78impl<R: Read + Seek> CdxReader<R> {
79    /// Create a new reader from any `Read + Seek` source.
80    ///
81    /// This enables reading from files, memory buffers, network streams, etc.
82    ///
83    /// # Errors
84    ///
85    /// Returns an error if:
86    /// - The source is not a valid ZIP archive
87    /// - Required files are missing
88    /// - The manifest is invalid
89    pub fn new(reader: R) -> Result<Self> {
90        let mut archive = ZipArchive::new(reader)?;
91
92        // Validate structure
93        Self::validate_structure(&archive)?;
94
95        // Read and parse manifest
96        let manifest = Self::read_manifest(&mut archive)?;
97
98        // Validate manifest
99        manifest.validate()?;
100
101        Ok(Self { archive, manifest })
102    }
103
104    /// Validate the archive structure.
105    fn validate_structure(archive: &ZipArchive<R>) -> Result<()> {
106        // Check for required files
107        let required_files = [MANIFEST_PATH, CONTENT_PATH, DUBLIN_CORE_PATH];
108
109        for path in required_files {
110            if archive.index_for_name(path).is_none() {
111                return Err(Error::MissingFile {
112                    path: path.to_string(),
113                });
114            }
115        }
116
117        // Manifest must be the first file in the archive (per spec)
118        if let Some(first_file) = archive.file_names().next() {
119            if first_file != MANIFEST_PATH {
120                return Err(Error::InvalidArchiveStructure {
121                    reason: format!(
122                        "manifest.json must be the first file in the archive (found '{first_file}')"
123                    ),
124                });
125            }
126        }
127
128        Ok(())
129    }
130
131    /// Strip a UTF-8 BOM (byte order mark) prefix if present.
132    fn strip_utf8_bom(data: &[u8]) -> &[u8] {
133        data.strip_prefix(&[0xEF, 0xBB, 0xBF]).unwrap_or(data)
134    }
135
136    /// Read a file and parse it as JSON, stripping any UTF-8 BOM prefix.
137    fn read_json_file<T: serde::de::DeserializeOwned>(
138        archive: &mut ZipArchive<R>,
139        path: &str,
140    ) -> Result<T> {
141        let data = Self::read_file_internal(archive, path)?;
142        let json_data = Self::strip_utf8_bom(&data);
143        Ok(serde_json::from_slice(json_data)?)
144    }
145
146    /// Read and parse the manifest.
147    fn read_manifest(archive: &mut ZipArchive<R>) -> Result<Manifest> {
148        Self::read_json_file(archive, MANIFEST_PATH)
149    }
150
151    /// Maximum allowed file size for decompression (256 MiB).
152    ///
153    /// This limit protects against decompression bombs (zip bombs) where a small
154    /// compressed file expands to a very large size.
155    const MAX_FILE_SIZE: u64 = 256 * 1024 * 1024;
156
157    /// Internal file reading without path validation (for known-safe paths).
158    fn read_file_internal(archive: &mut ZipArchive<R>, path: &str) -> Result<Vec<u8>> {
159        let file = archive.by_name(path).map_err(|e| match e {
160            zip::result::ZipError::FileNotFound => Error::MissingFile {
161                path: path.to_string(),
162            },
163            other => Error::InvalidArchive(other),
164        })?;
165
166        // Check declared size before allocating (catches honest oversized files)
167        if file.size() > Self::MAX_FILE_SIZE {
168            return Err(Error::FileTooLarge {
169                path: path.to_string(),
170                size: file.size(),
171                limit: Self::MAX_FILE_SIZE,
172            });
173        }
174
175        // Use try_from with fallback to 0 for platforms with smaller usize
176        let capacity = usize::try_from(file.size()).unwrap_or(0);
177        let mut data = Vec::with_capacity(capacity);
178        // Bounded read to catch spoofed/mismatched declared sizes
179        let bytes_read = file.take(Self::MAX_FILE_SIZE + 1).read_to_end(&mut data)?;
180        if bytes_read as u64 > Self::MAX_FILE_SIZE {
181            return Err(Error::FileTooLarge {
182                path: path.to_string(),
183                size: bytes_read as u64,
184                limit: Self::MAX_FILE_SIZE,
185            });
186        }
187        Ok(data)
188    }
189
190    /// Get a reference to the document manifest.
191    #[must_use]
192    pub fn manifest(&self) -> &Manifest {
193        &self.manifest
194    }
195
196    /// Read a file from the archive.
197    ///
198    /// # Errors
199    ///
200    /// Returns an error if:
201    /// - The path contains traversal patterns (security check)
202    /// - The file does not exist in the archive
203    /// - Reading the file fails
204    pub fn read_file(&mut self, path: &str) -> Result<Vec<u8>> {
205        validate_path(path)?;
206        Self::read_file_internal(&mut self.archive, path)
207    }
208
209    /// Read a file and verify its hash against the expected hash.
210    ///
211    /// # Errors
212    ///
213    /// Returns an error if:
214    /// - The path contains traversal patterns
215    /// - The file does not exist
216    /// - The hash does not match the expected value
217    pub fn read_file_verified(
218        &mut self,
219        path: &str,
220        expected_hash: &crate::DocumentId,
221    ) -> Result<Vec<u8>> {
222        let data = self.read_file(path)?;
223
224        // Skip verification for pending hashes
225        if expected_hash.is_pending() {
226            return Ok(data);
227        }
228
229        let actual_hash = Hasher::hash(expected_hash.algorithm(), &data);
230
231        if actual_hash != *expected_hash {
232            return Err(Error::HashMismatch {
233                path: path.to_string(),
234                expected: expected_hash.to_string(),
235                actual: actual_hash.to_string(),
236            });
237        }
238
239        Ok(data)
240    }
241
242    /// Read the content file.
243    ///
244    /// This is a convenience method for reading `content/document.json`.
245    ///
246    /// # Errors
247    ///
248    /// Returns an error if reading the content file fails.
249    pub fn read_content(&mut self) -> Result<Vec<u8>> {
250        self.read_file_verified(CONTENT_PATH, &self.manifest.content.hash.clone())
251    }
252
253    /// Read the Dublin Core metadata file.
254    ///
255    /// This is a convenience method for reading `metadata/dublin-core.json`.
256    ///
257    /// # Errors
258    ///
259    /// Returns an error if reading the metadata file fails.
260    pub fn read_dublin_core(&mut self) -> Result<Vec<u8>> {
261        self.read_file(&self.manifest.metadata.dublin_core.clone())
262    }
263
264    /// Check if a file exists in the archive.
265    ///
266    /// # Errors
267    ///
268    /// Returns an error if the path contains traversal patterns.
269    pub fn file_exists(&self, path: &str) -> Result<bool> {
270        validate_path(path)?;
271        Ok(self.archive.index_for_name(path).is_some())
272    }
273
274    /// Get the list of all file paths in the archive.
275    #[must_use]
276    pub fn file_names(&self) -> Vec<String> {
277        self.archive.file_names().map(String::from).collect()
278    }
279
280    /// Get the number of files in the archive.
281    #[must_use]
282    pub fn file_count(&self) -> usize {
283        self.archive.len()
284    }
285
286    /// Get the hash algorithm used by this document.
287    #[must_use]
288    pub fn hash_algorithm(&self) -> HashAlgorithm {
289        self.manifest.hash_algorithm
290    }
291
292    /// Read phantom clusters from the archive.
293    ///
294    /// Returns `None` if the phantom clusters file doesn't exist.
295    ///
296    /// # Errors
297    ///
298    /// Returns an error if the file exists but cannot be parsed.
299    pub fn read_phantoms(&mut self) -> Result<Option<crate::extensions::PhantomClusters>> {
300        if self.archive.index_for_name(PHANTOMS_PATH).is_none() {
301            return Ok(None);
302        }
303
304        let phantoms: crate::extensions::PhantomClusters =
305            Self::read_json_file(&mut self.archive, PHANTOMS_PATH)?;
306        Ok(Some(phantoms))
307    }
308
309    /// Verify all file hashes in the manifest.
310    ///
311    /// This checks:
312    /// - Content file hash
313    /// - Presentation file hashes (if any)
314    ///
315    /// # Errors
316    ///
317    /// Returns an error if any hash verification fails.
318    pub fn verify_hashes(&mut self) -> Result<()> {
319        // Verify content hash
320        let content_data = self.read_file(CONTENT_PATH)?;
321        if !self.manifest.content.hash.is_pending() {
322            let actual = Hasher::hash(self.manifest.content.hash.algorithm(), &content_data);
323            if actual != self.manifest.content.hash {
324                return Err(Error::HashMismatch {
325                    path: CONTENT_PATH.to_string(),
326                    expected: self.manifest.content.hash.to_string(),
327                    actual: actual.to_string(),
328                });
329            }
330        }
331
332        // Verify presentation hashes
333        for pres in &self.manifest.presentation.clone() {
334            if !pres.hash.is_pending() {
335                let data = self.read_file(&pres.path)?;
336                let actual = Hasher::hash(pres.hash.algorithm(), &data);
337                if actual != pres.hash {
338                    return Err(Error::HashMismatch {
339                        path: pres.path.clone(),
340                        expected: pres.hash.to_string(),
341                        actual: actual.to_string(),
342                    });
343                }
344            }
345        }
346
347        Ok(())
348    }
349}
350
#[cfg(test)]
mod tests {
    //! Unit tests for `CdxReader`.
    //!
    //! Fixtures come from two helpers: `build_archive` writes a well-formed
    //! document through `CdxWriter`, and `build_raw_zip` writes arbitrary
    //! entries straight through `zip::ZipWriter` for malformed-archive cases.

    use super::super::writer::CompressionMethod;
    use super::*;
    use crate::archive::CdxWriter;
    use crate::{ContentRef, DocumentId, Metadata};
    use std::io::{Cursor, Write};

    /// Minimal content document used by the fixtures.
    const CONTENT_JSON: &[u8] = br#"{"version":"0.1","blocks":[]}"#;
    /// Minimal Dublin Core metadata used by the fixtures.
    const DUBLIN_CORE_JSON: &[u8] = br#"{"title":"Test"}"#;
    /// Placeholder payload for entries whose content is never parsed.
    const EMPTY_JSON: &[u8] = b"{}";

    /// Build a minimal manifest whose content entry carries `content_hash`.
    fn manifest_with_hash(content_hash: DocumentId) -> Manifest {
        let content = ContentRef {
            path: CONTENT_PATH.to_string(),
            hash: content_hash,
            compression: None,
            merkle_root: None,
            block_count: None,
        };
        let metadata = Metadata {
            dublin_core: DUBLIN_CORE_PATH.to_string(),
            custom: None,
        };
        Manifest::new(content, metadata)
    }

    /// Write an archive through `CdxWriter`: manifest, content, Dublin Core,
    /// then `extra_files` in the given order, all Deflate-compressed.
    fn build_archive(content_hash: DocumentId, extra_files: &[(&str, &[u8])]) -> Vec<u8> {
        let mut writer = CdxWriter::new(Cursor::new(Vec::new())).unwrap();

        writer
            .write_manifest(&manifest_with_hash(content_hash))
            .unwrap();
        writer
            .write_file(CONTENT_PATH, CONTENT_JSON, CompressionMethod::Deflate)
            .unwrap();
        writer
            .write_file(
                DUBLIN_CORE_PATH,
                DUBLIN_CORE_JSON,
                CompressionMethod::Deflate,
            )
            .unwrap();
        for &(path, bytes) in extra_files {
            writer
                .write_file(path, bytes, CompressionMethod::Deflate)
                .unwrap();
        }

        writer.finish().unwrap().into_inner()
    }

    /// Write `entries` in order with a plain `zip::ZipWriter`, bypassing
    /// `CdxWriter`.
    fn build_raw_zip(entries: &[(&str, &[u8])]) -> Vec<u8> {
        let mut writer = zip::ZipWriter::new(Cursor::new(Vec::new()));
        for &(name, bytes) in entries {
            writer
                .start_file::<&str, ()>(name, Default::default())
                .unwrap();
            writer.write_all(bytes).unwrap();
        }
        writer.finish().unwrap().into_inner()
    }

    /// Standard three-file archive with a pending content hash.
    fn create_test_archive() -> Vec<u8> {
        build_archive(DocumentId::pending(), &[])
    }

    #[test]
    fn test_reader_from_bytes() {
        let reader = CdxReader::from_bytes(create_test_archive()).unwrap();
        assert_eq!(reader.manifest().codex, "0.1");
    }

    #[test]
    fn test_reader_file_list() {
        let reader = CdxReader::from_bytes(create_test_archive()).unwrap();
        let files = reader.file_names();
        assert!(files.contains(&MANIFEST_PATH.to_string()));
        assert!(files.contains(&CONTENT_PATH.to_string()));
        assert!(files.contains(&DUBLIN_CORE_PATH.to_string()));
    }

    #[test]
    fn test_reader_read_file() {
        let mut reader = CdxReader::from_bytes(create_test_archive()).unwrap();
        let content = reader.read_file(CONTENT_PATH).unwrap();
        assert!(!content.is_empty());
    }

    #[test]
    fn test_reader_file_exists() {
        let reader = CdxReader::from_bytes(create_test_archive()).unwrap();
        assert!(reader.file_exists(MANIFEST_PATH).unwrap());
        assert!(reader.file_exists(CONTENT_PATH).unwrap());
        assert!(!reader.file_exists("nonexistent.json").unwrap());
    }

    #[test]
    fn test_reader_path_traversal_rejected() {
        let mut reader = CdxReader::from_bytes(create_test_archive()).unwrap();
        assert!(reader.read_file("../secret").is_err());
        assert!(reader.file_exists("../secret").is_err());
    }

    #[test]
    fn test_reader_missing_file_error() {
        let mut reader = CdxReader::from_bytes(create_test_archive()).unwrap();
        let result = reader.read_file("nonexistent.json");
        assert!(matches!(result, Err(Error::MissingFile { .. })));
    }

    #[test]
    fn test_open_corrupted_zip() {
        // Starts with the ZIP local-file-header signature but is otherwise
        // truncated garbage.
        let corrupted = vec![0x50, 0x4B, 0x03, 0x04, 0xFF, 0xFF, 0xFF, 0xFF];
        let result = CdxReader::from_bytes(corrupted);
        assert!(result.is_err());
    }

    #[test]
    fn test_open_not_a_zip() {
        // Plain text, not a ZIP file
        let not_zip = b"This is not a ZIP file at all".to_vec();
        let result = CdxReader::from_bytes(not_zip);
        assert!(result.is_err());
    }

    #[test]
    fn test_open_empty_zip() {
        // A structurally valid ZIP with no entries at all.
        let empty_zip = build_raw_zip(&[]);

        let result = CdxReader::from_bytes(empty_zip);
        assert!(matches!(result, Err(Error::MissingFile { .. })));
    }

    #[test]
    fn test_open_missing_manifest() {
        // Content and Dublin Core only, no manifest.json.
        let data = build_raw_zip(&[(CONTENT_PATH, EMPTY_JSON), (DUBLIN_CORE_PATH, EMPTY_JSON)]);

        let result = CdxReader::from_bytes(data);
        assert!(matches!(result, Err(Error::MissingFile { path }) if path == MANIFEST_PATH));
    }

    #[test]
    fn test_open_missing_content() {
        // Manifest and Dublin Core only, no content file.
        let data = build_raw_zip(&[
            (MANIFEST_PATH, r#"{"codex":"0.1"}"#.as_bytes()),
            (DUBLIN_CORE_PATH, EMPTY_JSON),
        ]);

        let result = CdxReader::from_bytes(data);
        assert!(matches!(result, Err(Error::MissingFile { path }) if path == CONTENT_PATH));
    }

    #[test]
    fn test_open_invalid_manifest_json() {
        // All required entries present, but the manifest is not valid JSON.
        let data = build_raw_zip(&[
            (MANIFEST_PATH, "{ invalid json }".as_bytes()),
            (CONTENT_PATH, EMPTY_JSON),
            (DUBLIN_CORE_PATH, EMPTY_JSON),
        ]);

        let result = CdxReader::from_bytes(data);
        assert!(result.is_err());
    }

    #[test]
    fn test_read_file_hash_mismatch() {
        // The manifest declares an all-zero hash that the content cannot match.
        let expected_hash: DocumentId =
            "sha256:0000000000000000000000000000000000000000000000000000000000000000"
                .parse()
                .unwrap();

        let data = build_archive(expected_hash.clone(), &[]);
        let mut reader = CdxReader::from_bytes(data).unwrap();

        let result = reader.read_file_verified(CONTENT_PATH, &expected_hash);
        assert!(matches!(result, Err(Error::HashMismatch { .. })));
    }

    #[test]
    fn test_verify_hashes_with_mismatch() {
        // The manifest declares a hash the content cannot match.
        let wrong_hash: DocumentId =
            "sha256:ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff"
                .parse()
                .unwrap();

        let data = build_archive(wrong_hash, &[]);
        let mut reader = CdxReader::from_bytes(data).unwrap();

        let result = reader.verify_hashes();
        assert!(matches!(result, Err(Error::HashMismatch { .. })));
    }

    #[test]
    fn test_read_file_verified_with_pending_hash() {
        let mut reader = CdxReader::from_bytes(create_test_archive()).unwrap();

        // Pending hashes should skip verification
        let pending = DocumentId::pending();
        let result = reader.read_file_verified(CONTENT_PATH, &pending);
        assert!(result.is_ok());
    }

    #[test]
    fn test_unicode_filenames() {
        // Standard archive plus two entries with non-ASCII names.
        let data = build_archive(
            DocumentId::pending(),
            &[
                ("assets/文档.txt", "Unicode content".as_bytes()),
                ("assets/émoji_🎉.txt", "Emoji content".as_bytes()),
            ],
        );
        let mut reader = CdxReader::from_bytes(data).unwrap();

        // Verify we can list and read the Unicode files
        let files = reader.file_names();
        assert!(files.contains(&"assets/文档.txt".to_string()));
        assert!(files.contains(&"assets/émoji_🎉.txt".to_string()));

        let content = reader.read_file("assets/文档.txt").unwrap();
        assert_eq!(content, b"Unicode content");

        let emoji_content = reader.read_file("assets/émoji_🎉.txt").unwrap();
        assert_eq!(emoji_content, b"Emoji content");
    }

    #[test]
    fn test_file_count() {
        let reader = CdxReader::from_bytes(create_test_archive()).unwrap();
        // manifest, content, dublin_core = 3 files
        assert_eq!(reader.file_count(), 3);
    }

    #[test]
    fn test_hash_algorithm() {
        let reader = CdxReader::from_bytes(create_test_archive()).unwrap();
        assert_eq!(reader.hash_algorithm(), HashAlgorithm::Sha256);
    }

    #[test]
    fn test_read_phantoms_none() {
        let mut reader = CdxReader::from_bytes(create_test_archive()).unwrap();
        // No phantoms file in the test archive
        let result = reader.read_phantoms().unwrap();
        assert!(result.is_none());
    }

    #[test]
    fn test_manifest_must_be_first_file() {
        let manifest_json = r#"{
            "codex": "0.1",
            "id": "pending",
            "state": "draft",
            "created": "2024-01-01T00:00:00Z",
            "modified": "2024-01-01T00:00:00Z",
            "content": { "path": "content/document.json", "hash": "pending" },
            "metadata": { "dublinCore": "metadata/dublin-core.json" }
        }"#;

        // Content is written BEFORE the manifest, so the manifest is not first.
        let data = build_raw_zip(&[
            (CONTENT_PATH, CONTENT_JSON),
            (MANIFEST_PATH, manifest_json.as_bytes()),
            (DUBLIN_CORE_PATH, DUBLIN_CORE_JSON),
        ]);
        let result = CdxReader::from_bytes(data);

        let err = result.err().expect("should be an error");
        assert!(matches!(err, Error::InvalidArchiveStructure { .. }));
    }

    #[test]
    fn test_manifest_first_file_passes() {
        // Normal archive created by CdxWriter should have manifest first
        let result = CdxReader::from_bytes(create_test_archive());
        assert!(result.is_ok());
    }

    #[test]
    fn test_utf8_bom_stripped_from_manifest() {
        let manifest_json = r#"{
            "codex": "0.1",
            "id": "pending",
            "state": "draft",
            "created": "2024-01-01T00:00:00Z",
            "modified": "2024-01-01T00:00:00Z",
            "hashAlgorithm": "sha256",
            "content": { "path": "content/document.json", "hash": "pending" },
            "metadata": { "dublinCore": "metadata/dublin-core.json" }
        }"#;

        // Prefix the manifest bytes with a UTF-8 BOM.
        let mut bom_manifest = vec![0xEF, 0xBB, 0xBF];
        bom_manifest.extend_from_slice(manifest_json.as_bytes());

        let data = build_raw_zip(&[
            (MANIFEST_PATH, bom_manifest.as_slice()),
            (CONTENT_PATH, CONTENT_JSON),
            (DUBLIN_CORE_PATH, DUBLIN_CORE_JSON),
        ]);

        let reader = CdxReader::from_bytes(data);
        assert!(
            reader.is_ok(),
            "BOM-prefixed manifest should parse correctly"
        );
        assert_eq!(reader.unwrap().manifest().codex, "0.1");
    }

    #[test]
    fn test_utf8_bom_not_required() {
        // Regular archive without BOM should still work fine
        let reader = CdxReader::from_bytes(create_test_archive());
        assert!(reader.is_ok());
    }
}