1use std::fs::File;
4use std::io::{BufReader, Cursor, Read, Seek};
5use std::path::Path;
6
7use zip::ZipArchive;
8
9use crate::{Error, HashAlgorithm, Hasher, Manifest, Result};
10
11use super::{validate_path, CONTENT_PATH, DUBLIN_CORE_PATH, MANIFEST_PATH, PHANTOMS_PATH};
12
13pub struct CdxReader<R: Read + Seek> {
34 archive: ZipArchive<R>,
35 manifest: Manifest,
36}
37
38impl CdxReader<BufReader<File>> {
39 pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
49 let file = File::open(path.as_ref()).map_err(|e| {
50 if e.kind() == std::io::ErrorKind::NotFound {
51 Error::FileNotFound {
52 path: path.as_ref().to_path_buf(),
53 }
54 } else {
55 Error::Io(e)
56 }
57 })?;
58 let reader = BufReader::new(file);
59 Self::new(reader)
60 }
61}
62
63impl CdxReader<Cursor<Vec<u8>>> {
64 pub fn from_bytes(data: Vec<u8>) -> Result<Self> {
73 let cursor = Cursor::new(data);
74 Self::new(cursor)
75 }
76}
77
78impl<R: Read + Seek> CdxReader<R> {
79 pub fn new(reader: R) -> Result<Self> {
90 let mut archive = ZipArchive::new(reader)?;
91
92 Self::validate_structure(&archive)?;
94
95 let manifest = Self::read_manifest(&mut archive)?;
97
98 manifest.validate()?;
100
101 Ok(Self { archive, manifest })
102 }
103
104 fn validate_structure(archive: &ZipArchive<R>) -> Result<()> {
106 let required_files = [MANIFEST_PATH, CONTENT_PATH, DUBLIN_CORE_PATH];
108
109 for path in required_files {
110 if archive.index_for_name(path).is_none() {
111 return Err(Error::MissingFile {
112 path: path.to_string(),
113 });
114 }
115 }
116
117 if let Some(first_file) = archive.file_names().next() {
119 if first_file != MANIFEST_PATH {
120 return Err(Error::InvalidArchiveStructure {
121 reason: format!(
122 "manifest.json must be the first file in the archive (found '{first_file}')"
123 ),
124 });
125 }
126 }
127
128 Ok(())
129 }
130
/// Returns `data` without a leading UTF-8 byte order mark.
///
/// When `data` does not start with the three BOM bytes `EF BB BF` it is
/// returned unchanged.
fn strip_utf8_bom(data: &[u8]) -> &[u8] {
    const UTF8_BOM: [u8; 3] = [0xEF, 0xBB, 0xBF];

    match data.strip_prefix(&UTF8_BOM) {
        Some(rest) => rest,
        None => data,
    }
}
135
136 fn read_json_file<T: serde::de::DeserializeOwned>(
138 archive: &mut ZipArchive<R>,
139 path: &str,
140 ) -> Result<T> {
141 let data = Self::read_file_internal(archive, path)?;
142 let json_data = Self::strip_utf8_bom(&data);
143 Ok(serde_json::from_slice(json_data)?)
144 }
145
146 fn read_manifest(archive: &mut ZipArchive<R>) -> Result<Manifest> {
148 Self::read_json_file(archive, MANIFEST_PATH)
149 }
150
151 const MAX_FILE_SIZE: u64 = 256 * 1024 * 1024;
156
157 fn read_file_internal(archive: &mut ZipArchive<R>, path: &str) -> Result<Vec<u8>> {
159 let file = archive.by_name(path).map_err(|e| match e {
160 zip::result::ZipError::FileNotFound => Error::MissingFile {
161 path: path.to_string(),
162 },
163 other => Error::InvalidArchive(other),
164 })?;
165
166 if file.size() > Self::MAX_FILE_SIZE {
168 return Err(Error::FileTooLarge {
169 path: path.to_string(),
170 size: file.size(),
171 limit: Self::MAX_FILE_SIZE,
172 });
173 }
174
175 let capacity = usize::try_from(file.size()).unwrap_or(0);
177 let mut data = Vec::with_capacity(capacity);
178 let bytes_read = file.take(Self::MAX_FILE_SIZE + 1).read_to_end(&mut data)?;
180 if bytes_read as u64 > Self::MAX_FILE_SIZE {
181 return Err(Error::FileTooLarge {
182 path: path.to_string(),
183 size: bytes_read as u64,
184 limit: Self::MAX_FILE_SIZE,
185 });
186 }
187 Ok(data)
188 }
189
190 #[must_use]
192 pub fn manifest(&self) -> &Manifest {
193 &self.manifest
194 }
195
196 pub fn read_file(&mut self, path: &str) -> Result<Vec<u8>> {
205 validate_path(path)?;
206 Self::read_file_internal(&mut self.archive, path)
207 }
208
209 pub fn read_file_verified(
218 &mut self,
219 path: &str,
220 expected_hash: &crate::DocumentId,
221 ) -> Result<Vec<u8>> {
222 let data = self.read_file(path)?;
223
224 if expected_hash.is_pending() {
226 return Ok(data);
227 }
228
229 let actual_hash = Hasher::hash(expected_hash.algorithm(), &data);
230
231 if actual_hash != *expected_hash {
232 return Err(Error::HashMismatch {
233 path: path.to_string(),
234 expected: expected_hash.to_string(),
235 actual: actual_hash.to_string(),
236 });
237 }
238
239 Ok(data)
240 }
241
242 pub fn read_content(&mut self) -> Result<Vec<u8>> {
250 self.read_file_verified(CONTENT_PATH, &self.manifest.content.hash.clone())
251 }
252
253 pub fn read_dublin_core(&mut self) -> Result<Vec<u8>> {
261 self.read_file(&self.manifest.metadata.dublin_core.clone())
262 }
263
264 pub fn file_exists(&self, path: &str) -> Result<bool> {
270 validate_path(path)?;
271 Ok(self.archive.index_for_name(path).is_some())
272 }
273
274 #[must_use]
276 pub fn file_names(&self) -> Vec<String> {
277 self.archive.file_names().map(String::from).collect()
278 }
279
280 #[must_use]
282 pub fn file_count(&self) -> usize {
283 self.archive.len()
284 }
285
286 #[must_use]
288 pub fn hash_algorithm(&self) -> HashAlgorithm {
289 self.manifest.hash_algorithm
290 }
291
292 pub fn read_phantoms(&mut self) -> Result<Option<crate::extensions::PhantomClusters>> {
300 if self.archive.index_for_name(PHANTOMS_PATH).is_none() {
301 return Ok(None);
302 }
303
304 let phantoms: crate::extensions::PhantomClusters =
305 Self::read_json_file(&mut self.archive, PHANTOMS_PATH)?;
306 Ok(Some(phantoms))
307 }
308
309 pub fn verify_hashes(&mut self) -> Result<()> {
319 let content_data = self.read_file(CONTENT_PATH)?;
321 if !self.manifest.content.hash.is_pending() {
322 let actual = Hasher::hash(self.manifest.content.hash.algorithm(), &content_data);
323 if actual != self.manifest.content.hash {
324 return Err(Error::HashMismatch {
325 path: CONTENT_PATH.to_string(),
326 expected: self.manifest.content.hash.to_string(),
327 actual: actual.to_string(),
328 });
329 }
330 }
331
332 for pres in &self.manifest.presentation.clone() {
334 if !pres.hash.is_pending() {
335 let data = self.read_file(&pres.path)?;
336 let actual = Hasher::hash(pres.hash.algorithm(), &data);
337 if actual != pres.hash {
338 return Err(Error::HashMismatch {
339 path: pres.path.clone(),
340 expected: pres.hash.to_string(),
341 actual: actual.to_string(),
342 });
343 }
344 }
345 }
346
347 Ok(())
348 }
349}
350
351#[cfg(test)]
352mod tests {
353 use super::*;
354 use crate::archive::CdxWriter;
355 use crate::{ContentRef, DocumentId, Metadata};
356 use std::io::{Cursor, Write};
357
358 fn create_test_archive() -> Vec<u8> {
359 let buffer = Cursor::new(Vec::new());
360 let mut writer = CdxWriter::new(buffer).unwrap();
361
362 let content = ContentRef {
364 path: CONTENT_PATH.to_string(),
365 hash: DocumentId::pending(),
366 compression: None,
367 merkle_root: None,
368 block_count: None,
369 };
370 let metadata = Metadata {
371 dublin_core: DUBLIN_CORE_PATH.to_string(),
372 custom: None,
373 };
374 let manifest = Manifest::new(content, metadata);
375
376 writer.write_manifest(&manifest).unwrap();
377 writer
378 .write_file(
379 CONTENT_PATH,
380 br#"{"version":"0.1","blocks":[]}"#,
381 super::super::writer::CompressionMethod::Deflate,
382 )
383 .unwrap();
384 writer
385 .write_file(
386 DUBLIN_CORE_PATH,
387 br#"{"title":"Test"}"#,
388 super::super::writer::CompressionMethod::Deflate,
389 )
390 .unwrap();
391
392 writer.finish().unwrap().into_inner()
393 }
394
/// A well-formed in-memory archive opens and exposes its manifest.
#[test]
fn test_reader_from_bytes() {
    let reader = CdxReader::from_bytes(create_test_archive()).unwrap();
    let manifest = reader.manifest();
    assert_eq!(manifest.codex, "0.1");
}
401
/// The listing of a valid archive contains all three required entries.
#[test]
fn test_reader_file_list() {
    let reader = CdxReader::from_bytes(create_test_archive()).unwrap();
    let names = reader.file_names();

    for required in [MANIFEST_PATH, CONTENT_PATH, DUBLIN_CORE_PATH] {
        assert!(names.contains(&required.to_string()));
    }
}
411
/// Reading the content entry yields a non-empty buffer.
#[test]
fn test_reader_read_file() {
    let mut reader = CdxReader::from_bytes(create_test_archive()).unwrap();
    let bytes = reader.read_file(CONTENT_PATH).unwrap();
    assert!(!bytes.is_empty());
}
419
/// `file_exists` is true for present entries and false for an unknown name.
#[test]
fn test_reader_file_exists() {
    let reader = CdxReader::from_bytes(create_test_archive()).unwrap();

    for present in [MANIFEST_PATH, CONTENT_PATH] {
        assert!(reader.file_exists(present).unwrap());
    }
    assert!(!reader.file_exists("nonexistent.json").unwrap());
}
428
/// Paths that climb out of the archive root are rejected by both the read
/// and the existence check.
#[test]
fn test_reader_path_traversal_rejected() {
    let mut reader = CdxReader::from_bytes(create_test_archive()).unwrap();

    let read_attempt = reader.read_file("../secret");
    assert!(read_attempt.is_err());

    let exists_attempt = reader.file_exists("../secret");
    assert!(exists_attempt.is_err());
}
436
/// Reading an entry that is not in the archive reports `MissingFile`.
#[test]
fn test_reader_missing_file_error() {
    let mut reader = CdxReader::from_bytes(create_test_archive()).unwrap();
    let outcome = reader.read_file("nonexistent.json");
    assert!(matches!(outcome, Err(Error::MissingFile { .. })));
}
444
/// Bytes that start with a ZIP local-header signature but are otherwise
/// garbage fail to open.
#[test]
fn test_open_corrupted_zip() {
    let corrupted: Vec<u8> = vec![0x50, 0x4B, 0x03, 0x04, 0xFF, 0xFF, 0xFF, 0xFF];
    assert!(CdxReader::from_bytes(corrupted).is_err());
}
452
/// Plain text is not accepted as an archive.
#[test]
fn test_open_not_a_zip() {
    let plain_text = b"This is not a ZIP file at all".to_vec();
    assert!(CdxReader::from_bytes(plain_text).is_err());
}
460
/// A structurally valid ZIP with no entries is reported as missing a file.
#[test]
fn test_open_empty_zip() {
    let zip_writer = zip::ZipWriter::new(Cursor::new(Vec::new()));
    let empty_zip = zip_writer.finish().unwrap().into_inner();

    let outcome = CdxReader::from_bytes(empty_zip);
    assert!(matches!(outcome, Err(Error::MissingFile { .. })));
}
471
/// An archive without a manifest entry is rejected with `MissingFile`
/// naming the manifest path.
#[test]
fn test_open_missing_manifest() {
    let mut zip_writer = zip::ZipWriter::new(Cursor::new(Vec::new()));

    zip_writer
        .start_file::<&str, ()>(CONTENT_PATH, Default::default())
        .unwrap();
    zip_writer.write_all(b"{}").unwrap();

    zip_writer
        .start_file::<&str, ()>(DUBLIN_CORE_PATH, Default::default())
        .unwrap();
    zip_writer.write_all(b"{}").unwrap();

    let bytes = zip_writer.finish().unwrap().into_inner();

    let outcome = CdxReader::from_bytes(bytes);
    assert!(matches!(outcome, Err(Error::MissingFile { path }) if path == MANIFEST_PATH));
}
490
/// An archive without a content entry is rejected with `MissingFile`
/// naming the content path.
#[test]
fn test_open_missing_content() {
    let mut zip_writer = zip::ZipWriter::new(Cursor::new(Vec::new()));

    zip_writer
        .start_file::<&str, ()>(MANIFEST_PATH, Default::default())
        .unwrap();
    zip_writer.write_all(br#"{"codex":"0.1"}"#).unwrap();

    zip_writer
        .start_file::<&str, ()>(DUBLIN_CORE_PATH, Default::default())
        .unwrap();
    zip_writer.write_all(b"{}").unwrap();

    let bytes = zip_writer.finish().unwrap().into_inner();

    let outcome = CdxReader::from_bytes(bytes);
    assert!(matches!(outcome, Err(Error::MissingFile { path }) if path == CONTENT_PATH));
}
514
/// A manifest entry that is not valid JSON makes opening fail, even though
/// every required entry is present and the manifest comes first.
#[test]
fn test_open_invalid_manifest_json() {
    let mut zip_writer = zip::ZipWriter::new(Cursor::new(Vec::new()));

    zip_writer
        .start_file::<&str, ()>(MANIFEST_PATH, Default::default())
        .unwrap();
    zip_writer.write_all(b"{ invalid json }").unwrap();

    zip_writer
        .start_file::<&str, ()>(CONTENT_PATH, Default::default())
        .unwrap();
    zip_writer.write_all(b"{}").unwrap();

    zip_writer
        .start_file::<&str, ()>(DUBLIN_CORE_PATH, Default::default())
        .unwrap();
    zip_writer.write_all(b"{}").unwrap();

    let bytes = zip_writer.finish().unwrap().into_inner();

    assert!(CdxReader::from_bytes(bytes).is_err());
}
541
/// A verified read fails with `HashMismatch` when the expected hash (all
/// zeroes here) does not match the stored content.
#[test]
fn test_read_file_hash_mismatch() {
    use super::super::writer::CompressionMethod;

    let mut writer = CdxWriter::new(Cursor::new(Vec::new())).unwrap();

    let expected_hash: DocumentId =
        "sha256:0000000000000000000000000000000000000000000000000000000000000000"
            .parse()
            .unwrap();
    let manifest = Manifest::new(
        ContentRef {
            path: CONTENT_PATH.to_string(),
            hash: expected_hash.clone(),
            compression: None,
            merkle_root: None,
            block_count: None,
        },
        Metadata {
            dublin_core: DUBLIN_CORE_PATH.to_string(),
            custom: None,
        },
    );

    writer.write_manifest(&manifest).unwrap();
    writer
        .write_file(
            CONTENT_PATH,
            br#"{"version":"0.1","blocks":[]}"#,
            CompressionMethod::Deflate,
        )
        .unwrap();
    writer
        .write_file(
            DUBLIN_CORE_PATH,
            br#"{"title":"Test"}"#,
            CompressionMethod::Deflate,
        )
        .unwrap();

    let bytes = writer.finish().unwrap().into_inner();
    let mut reader = CdxReader::from_bytes(bytes).unwrap();

    let outcome = reader.read_file_verified(CONTENT_PATH, &expected_hash);
    assert!(matches!(outcome, Err(Error::HashMismatch { .. })));
}
588
/// `verify_hashes` reports `HashMismatch` when the manifest records a
/// content hash (all `f`s here) that the stored content does not have.
#[test]
fn test_verify_hashes_with_mismatch() {
    use super::super::writer::CompressionMethod;

    let mut writer = CdxWriter::new(Cursor::new(Vec::new())).unwrap();

    let wrong_hash: DocumentId =
        "sha256:ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff"
            .parse()
            .unwrap();
    let manifest = Manifest::new(
        ContentRef {
            path: CONTENT_PATH.to_string(),
            hash: wrong_hash,
            compression: None,
            merkle_root: None,
            block_count: None,
        },
        Metadata {
            dublin_core: DUBLIN_CORE_PATH.to_string(),
            custom: None,
        },
    );

    writer.write_manifest(&manifest).unwrap();
    writer
        .write_file(
            CONTENT_PATH,
            br#"{"version":"0.1","blocks":[]}"#,
            CompressionMethod::Deflate,
        )
        .unwrap();
    writer
        .write_file(
            DUBLIN_CORE_PATH,
            br#"{"title":"Test"}"#,
            CompressionMethod::Deflate,
        )
        .unwrap();

    let bytes = writer.finish().unwrap().into_inner();
    let mut reader = CdxReader::from_bytes(bytes).unwrap();

    let outcome = reader.verify_hashes();
    assert!(matches!(outcome, Err(Error::HashMismatch { .. })));
}
634
/// A pending expected hash skips verification, so the read succeeds.
#[test]
fn test_read_file_verified_with_pending_hash() {
    let mut reader = CdxReader::from_bytes(create_test_archive()).unwrap();

    let pending = DocumentId::pending();
    let outcome = reader.read_file_verified(CONTENT_PATH, &pending);
    assert!(outcome.is_ok());
}
645
/// Entries with non-ASCII names (CJK, accented letters, emoji) are listed
/// and read back with their exact contents.
#[test]
fn test_unicode_filenames() {
    use super::super::writer::CompressionMethod;

    let mut writer = CdxWriter::new(Cursor::new(Vec::new())).unwrap();

    let manifest = Manifest::new(
        ContentRef {
            path: CONTENT_PATH.to_string(),
            hash: DocumentId::pending(),
            compression: None,
            merkle_root: None,
            block_count: None,
        },
        Metadata {
            dublin_core: DUBLIN_CORE_PATH.to_string(),
            custom: None,
        },
    );

    writer.write_manifest(&manifest).unwrap();
    writer
        .write_file(
            CONTENT_PATH,
            br#"{"version":"0.1","blocks":[]}"#,
            CompressionMethod::Deflate,
        )
        .unwrap();
    writer
        .write_file(
            DUBLIN_CORE_PATH,
            br#"{"title":"Test"}"#,
            CompressionMethod::Deflate,
        )
        .unwrap();

    // Two extra assets whose names are outside the ASCII range.
    writer
        .write_file(
            "assets/文档.txt",
            b"Unicode content",
            CompressionMethod::Deflate,
        )
        .unwrap();
    writer
        .write_file(
            "assets/émoji_🎉.txt",
            b"Emoji content",
            CompressionMethod::Deflate,
        )
        .unwrap();

    let bytes = writer.finish().unwrap().into_inner();
    let mut reader = CdxReader::from_bytes(bytes).unwrap();

    let names = reader.file_names();
    assert!(names.contains(&"assets/文档.txt".to_string()));
    assert!(names.contains(&"assets/émoji_🎉.txt".to_string()));

    let cjk_bytes = reader.read_file("assets/文档.txt").unwrap();
    assert_eq!(cjk_bytes, b"Unicode content");

    let emoji_bytes = reader.read_file("assets/émoji_🎉.txt").unwrap();
    assert_eq!(emoji_bytes, b"Emoji content");
}
710
/// The minimal archive holds exactly three entries: manifest, content and
/// Dublin Core metadata.
#[test]
fn test_file_count() {
    let reader = CdxReader::from_bytes(create_test_archive()).unwrap();
    let count = reader.file_count();
    assert_eq!(count, 3);
}
718
/// The minimal archive's manifest reports SHA-256 as its hash algorithm.
#[test]
fn test_hash_algorithm() {
    let reader = CdxReader::from_bytes(create_test_archive()).unwrap();
    let algorithm = reader.hash_algorithm();
    assert_eq!(algorithm, HashAlgorithm::Sha256);
}
725
/// An archive without a phantoms entry yields `Ok(None)`.
#[test]
fn test_read_phantoms_none() {
    let mut reader = CdxReader::from_bytes(create_test_archive()).unwrap();
    let phantoms = reader.read_phantoms().unwrap();
    assert!(phantoms.is_none());
}
734
/// An archive whose first entry is the content file rather than the manifest
/// is rejected with `InvalidArchiveStructure`.
#[test]
fn test_manifest_must_be_first_file() {
    let mut zip_writer = zip::ZipWriter::new(Cursor::new(Vec::new()));

    // The content entry is deliberately written ahead of the manifest.
    zip_writer
        .start_file::<&str, ()>(CONTENT_PATH, Default::default())
        .unwrap();
    zip_writer
        .write_all(br#"{"version":"0.1","blocks":[]}"#)
        .unwrap();

    let manifest_json = r#"{
            "codex": "0.1",
            "id": "pending",
            "state": "draft",
            "created": "2024-01-01T00:00:00Z",
            "modified": "2024-01-01T00:00:00Z",
            "content": { "path": "content/document.json", "hash": "pending" },
            "metadata": { "dublinCore": "metadata/dublin-core.json" }
        }"#;
    zip_writer
        .start_file::<&str, ()>(MANIFEST_PATH, Default::default())
        .unwrap();
    zip_writer.write_all(manifest_json.as_bytes()).unwrap();

    zip_writer
        .start_file::<&str, ()>(DUBLIN_CORE_PATH, Default::default())
        .unwrap();
    zip_writer.write_all(br#"{"title":"Test"}"#).unwrap();

    let bytes = zip_writer.finish().unwrap().into_inner();
    let outcome = CdxReader::from_bytes(bytes);

    let err = outcome.err().expect("should be an error");
    assert!(matches!(err, Error::InvalidArchiveStructure { .. }));
}
775
/// An archive written with the manifest first opens successfully.
#[test]
fn test_manifest_first_file_passes() {
    let outcome = CdxReader::from_bytes(create_test_archive());
    assert!(outcome.is_ok());
}
783
/// A manifest that begins with a UTF-8 byte order mark still parses.
#[test]
fn test_utf8_bom_stripped_from_manifest() {
    let mut zip_writer = zip::ZipWriter::new(Cursor::new(Vec::new()));

    let manifest_json = r#"{
            "codex": "0.1",
            "id": "pending",
            "state": "draft",
            "created": "2024-01-01T00:00:00Z",
            "modified": "2024-01-01T00:00:00Z",
            "hashAlgorithm": "sha256",
            "content": { "path": "content/document.json", "hash": "pending" },
            "metadata": { "dublinCore": "metadata/dublin-core.json" }
        }"#;
    // BOM bytes first, then the JSON text.
    let bom_manifest: Vec<u8> = [&[0xEF, 0xBB, 0xBF][..], manifest_json.as_bytes()].concat();

    zip_writer
        .start_file::<&str, ()>(MANIFEST_PATH, Default::default())
        .unwrap();
    zip_writer.write_all(&bom_manifest).unwrap();

    zip_writer
        .start_file::<&str, ()>(CONTENT_PATH, Default::default())
        .unwrap();
    zip_writer
        .write_all(br#"{"version":"0.1","blocks":[]}"#)
        .unwrap();

    zip_writer
        .start_file::<&str, ()>(DUBLIN_CORE_PATH, Default::default())
        .unwrap();
    zip_writer.write_all(br#"{"title":"Test"}"#).unwrap();

    let bytes = zip_writer.finish().unwrap().into_inner();
    let reader = CdxReader::from_bytes(bytes);
    assert!(
        reader.is_ok(),
        "BOM-prefixed manifest should parse correctly"
    );
    assert_eq!(reader.unwrap().manifest().codex, "0.1");
}
829
/// A manifest written without a byte order mark opens as usual.
#[test]
fn test_utf8_bom_not_required() {
    let outcome = CdxReader::from_bytes(create_test_archive());
    assert!(outcome.is_ok());
}
837}