1use std::fs::File;
4use std::io::{BufReader, Cursor, Read, Seek};
5use std::path::Path;
6
7use zip::ZipArchive;
8
9use crate::{Error, HashAlgorithm, Hasher, Manifest, Result};
10
11use super::{validate_path, CONTENT_PATH, DUBLIN_CORE_PATH, MANIFEST_PATH, PHANTOMS_PATH};
12
13pub struct CdxReader<R: Read + Seek> {
34 archive: ZipArchive<R>,
35 manifest: Manifest,
36}
37
38impl CdxReader<BufReader<File>> {
39 pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
49 let file = File::open(path.as_ref()).map_err(|e| {
50 if e.kind() == std::io::ErrorKind::NotFound {
51 Error::FileNotFound {
52 path: path.as_ref().to_path_buf(),
53 }
54 } else {
55 Error::Io(e)
56 }
57 })?;
58 let reader = BufReader::new(file);
59 Self::new(reader)
60 }
61}
62
63impl CdxReader<Cursor<Vec<u8>>> {
64 pub fn from_bytes(data: Vec<u8>) -> Result<Self> {
73 let cursor = Cursor::new(data);
74 Self::new(cursor)
75 }
76}
77
78impl<R: Read + Seek> CdxReader<R> {
79 pub fn new(reader: R) -> Result<Self> {
90 let mut archive = ZipArchive::new(reader)?;
91
92 Self::validate_structure(&archive)?;
94
95 let manifest = Self::read_manifest(&mut archive)?;
97
98 manifest.validate()?;
100
101 Ok(Self { archive, manifest })
102 }
103
104 fn validate_structure(archive: &ZipArchive<R>) -> Result<()> {
106 let required_files = [MANIFEST_PATH, CONTENT_PATH, DUBLIN_CORE_PATH];
108
109 for path in required_files {
110 if archive.index_for_name(path).is_none() {
111 return Err(Error::MissingFile {
112 path: path.to_string(),
113 });
114 }
115 }
116
117 if let Some(first_file) = archive.file_names().next() {
119 if first_file != MANIFEST_PATH {
120 return Err(Error::InvalidArchiveStructure {
121 reason: format!(
122 "manifest.json must be the first file in the archive (found '{first_file}')"
123 ),
124 });
125 }
126 }
127
128 Ok(())
129 }
130
131 fn strip_utf8_bom(data: &[u8]) -> &[u8] {
133 data.strip_prefix(&[0xEF, 0xBB, 0xBF]).unwrap_or(data)
134 }
135
136 fn read_json_file<T: serde::de::DeserializeOwned>(
138 archive: &mut ZipArchive<R>,
139 path: &str,
140 ) -> Result<T> {
141 let data = Self::read_file_internal(archive, path)?;
142 let json_data = Self::strip_utf8_bom(&data);
143 Ok(serde_json::from_slice(json_data)?)
144 }
145
146 fn read_manifest(archive: &mut ZipArchive<R>) -> Result<Manifest> {
148 Self::read_json_file(archive, MANIFEST_PATH)
149 }
150
151 const MAX_FILE_SIZE: u64 = 256 * 1024 * 1024;
156
157 fn read_file_internal(archive: &mut ZipArchive<R>, path: &str) -> Result<Vec<u8>> {
159 let file = archive.by_name(path).map_err(|_| Error::MissingFile {
160 path: path.to_string(),
161 })?;
162
163 if file.size() > Self::MAX_FILE_SIZE {
165 return Err(Error::FileTooLarge {
166 path: path.to_string(),
167 size: file.size(),
168 limit: Self::MAX_FILE_SIZE,
169 });
170 }
171
172 let capacity = usize::try_from(file.size()).unwrap_or(0);
174 let mut data = Vec::with_capacity(capacity);
175 let bytes_read = file.take(Self::MAX_FILE_SIZE + 1).read_to_end(&mut data)?;
177 if bytes_read as u64 > Self::MAX_FILE_SIZE {
178 return Err(Error::FileTooLarge {
179 path: path.to_string(),
180 size: bytes_read as u64,
181 limit: Self::MAX_FILE_SIZE,
182 });
183 }
184 Ok(data)
185 }
186
187 #[must_use]
189 pub fn manifest(&self) -> &Manifest {
190 &self.manifest
191 }
192
193 pub fn read_file(&mut self, path: &str) -> Result<Vec<u8>> {
202 validate_path(path)?;
203 Self::read_file_internal(&mut self.archive, path)
204 }
205
206 pub fn read_file_verified(
215 &mut self,
216 path: &str,
217 expected_hash: &crate::DocumentId,
218 ) -> Result<Vec<u8>> {
219 let data = self.read_file(path)?;
220
221 if expected_hash.is_pending() {
223 return Ok(data);
224 }
225
226 let actual_hash = Hasher::hash(expected_hash.algorithm(), &data);
227
228 if actual_hash != *expected_hash {
229 return Err(Error::HashMismatch {
230 path: path.to_string(),
231 expected: expected_hash.to_string(),
232 actual: actual_hash.to_string(),
233 });
234 }
235
236 Ok(data)
237 }
238
239 pub fn read_content(&mut self) -> Result<Vec<u8>> {
247 self.read_file_verified(CONTENT_PATH, &self.manifest.content.hash.clone())
248 }
249
250 pub fn read_dublin_core(&mut self) -> Result<Vec<u8>> {
258 self.read_file(&self.manifest.metadata.dublin_core.clone())
259 }
260
261 pub fn file_exists(&self, path: &str) -> Result<bool> {
267 validate_path(path)?;
268 Ok(self.archive.index_for_name(path).is_some())
269 }
270
271 #[must_use]
273 pub fn file_names(&self) -> Vec<String> {
274 self.archive.file_names().map(String::from).collect()
275 }
276
277 #[must_use]
279 pub fn file_count(&self) -> usize {
280 self.archive.len()
281 }
282
283 #[must_use]
285 pub fn hash_algorithm(&self) -> HashAlgorithm {
286 self.manifest.hash_algorithm
287 }
288
289 pub fn read_phantoms(&mut self) -> Result<Option<crate::extensions::PhantomClusters>> {
297 if self.archive.index_for_name(PHANTOMS_PATH).is_none() {
298 return Ok(None);
299 }
300
301 let phantoms: crate::extensions::PhantomClusters =
302 Self::read_json_file(&mut self.archive, PHANTOMS_PATH)?;
303 Ok(Some(phantoms))
304 }
305
306 pub fn verify_hashes(&mut self) -> Result<()> {
316 let content_data = self.read_file(CONTENT_PATH)?;
318 if !self.manifest.content.hash.is_pending() {
319 let actual = Hasher::hash(self.manifest.content.hash.algorithm(), &content_data);
320 if actual != self.manifest.content.hash {
321 return Err(Error::HashMismatch {
322 path: CONTENT_PATH.to_string(),
323 expected: self.manifest.content.hash.to_string(),
324 actual: actual.to_string(),
325 });
326 }
327 }
328
329 for pres in &self.manifest.presentation.clone() {
331 if !pres.hash.is_pending() {
332 let data = self.read_file(&pres.path)?;
333 let actual = Hasher::hash(pres.hash.algorithm(), &data);
334 if actual != pres.hash {
335 return Err(Error::HashMismatch {
336 path: pres.path.clone(),
337 expected: pres.hash.to_string(),
338 actual: actual.to_string(),
339 });
340 }
341 }
342 }
343
344 Ok(())
345 }
346}
347
#[cfg(test)]
mod tests {
    use super::super::writer::CompressionMethod;
    use super::*;
    use crate::archive::CdxWriter;
    use crate::{ContentRef, DocumentId, Metadata};
    use std::io::{Cursor, Write};

    /// Content payload shared by the fixtures.
    const CONTENT_JSON: &[u8] = br#"{"version":"0.1","blocks":[]}"#;
    /// Dublin Core payload shared by the fixtures.
    const DUBLIN_CORE_JSON: &[u8] = br#"{"title":"Test"}"#;
    /// Placeholder payload for entries whose contents are irrelevant to a test.
    const EMPTY_OBJECT: &[u8] = b"{}";

    /// Builds an archive through `CdxWriter`: manifest first, then the
    /// content and Dublin Core entries, then `extra_files` in order.
    ///
    /// `content_hash` is the hash recorded for the content entry in the manifest.
    fn build_cdx_archive(content_hash: DocumentId, extra_files: &[(&str, &[u8])]) -> Vec<u8> {
        let mut writer = CdxWriter::new(Cursor::new(Vec::new())).unwrap();

        let content = ContentRef {
            path: CONTENT_PATH.to_string(),
            hash: content_hash,
            compression: None,
            merkle_root: None,
            block_count: None,
        };
        let metadata = Metadata {
            dublin_core: DUBLIN_CORE_PATH.to_string(),
            custom: None,
        };
        let manifest = Manifest::new(content, metadata);
        writer.write_manifest(&manifest).unwrap();

        let standard_files: [(&str, &[u8]); 2] = [
            (CONTENT_PATH, CONTENT_JSON),
            (DUBLIN_CORE_PATH, DUBLIN_CORE_JSON),
        ];
        for &(path, data) in standard_files.iter().chain(extra_files) {
            writer
                .write_file(path, data, CompressionMethod::Deflate)
                .unwrap();
        }

        writer.finish().unwrap().into_inner()
    }

    /// Builds a plain ZIP with `entries` written in the given order,
    /// bypassing `CdxWriter` so malformed archives can be produced.
    fn raw_zip(entries: &[(&str, &[u8])]) -> Vec<u8> {
        let mut writer = zip::ZipWriter::new(Cursor::new(Vec::new()));
        for &(name, data) in entries {
            writer
                .start_file::<&str, ()>(name, Default::default())
                .unwrap();
            writer.write_all(data).unwrap();
        }
        writer.finish().unwrap().into_inner()
    }

    /// Minimal valid archive whose content hash is still pending.
    fn create_test_archive() -> Vec<u8> {
        build_cdx_archive(DocumentId::pending(), &[])
    }

    #[test]
    fn test_reader_from_bytes() {
        let data = create_test_archive();
        let reader = CdxReader::from_bytes(data).unwrap();
        assert_eq!(reader.manifest().codex, "0.1");
    }

    #[test]
    fn test_reader_file_list() {
        let data = create_test_archive();
        let reader = CdxReader::from_bytes(data).unwrap();
        let files = reader.file_names();
        assert!(files.contains(&MANIFEST_PATH.to_string()));
        assert!(files.contains(&CONTENT_PATH.to_string()));
        assert!(files.contains(&DUBLIN_CORE_PATH.to_string()));
    }

    #[test]
    fn test_reader_read_file() {
        let data = create_test_archive();
        let mut reader = CdxReader::from_bytes(data).unwrap();
        let content = reader.read_file(CONTENT_PATH).unwrap();
        assert!(!content.is_empty());
    }

    #[test]
    fn test_reader_file_exists() {
        let data = create_test_archive();
        let reader = CdxReader::from_bytes(data).unwrap();
        assert!(reader.file_exists(MANIFEST_PATH).unwrap());
        assert!(reader.file_exists(CONTENT_PATH).unwrap());
        assert!(!reader.file_exists("nonexistent.json").unwrap());
    }

    #[test]
    fn test_reader_path_traversal_rejected() {
        let data = create_test_archive();
        let mut reader = CdxReader::from_bytes(data).unwrap();
        assert!(reader.read_file("../secret").is_err());
        assert!(reader.file_exists("../secret").is_err());
    }

    #[test]
    fn test_reader_missing_file_error() {
        let data = create_test_archive();
        let mut reader = CdxReader::from_bytes(data).unwrap();
        let result = reader.read_file("nonexistent.json");
        assert!(matches!(result, Err(Error::MissingFile { .. })));
    }

    #[test]
    fn test_open_corrupted_zip() {
        // Local-file-header signature followed by garbage.
        let corrupted = vec![0x50, 0x4B, 0x03, 0x04, 0xFF, 0xFF, 0xFF, 0xFF];
        let result = CdxReader::from_bytes(corrupted);
        assert!(result.is_err());
    }

    #[test]
    fn test_open_not_a_zip() {
        let not_zip = b"This is not a ZIP file at all".to_vec();
        let result = CdxReader::from_bytes(not_zip);
        assert!(result.is_err());
    }

    #[test]
    fn test_open_empty_zip() {
        let empty_zip = raw_zip(&[]);

        let result = CdxReader::from_bytes(empty_zip);
        assert!(matches!(result, Err(Error::MissingFile { .. })));
    }

    #[test]
    fn test_open_missing_manifest() {
        let data = raw_zip(&[
            (CONTENT_PATH, EMPTY_OBJECT),
            (DUBLIN_CORE_PATH, EMPTY_OBJECT),
        ]);

        let result = CdxReader::from_bytes(data);
        assert!(matches!(result, Err(Error::MissingFile { path }) if path == MANIFEST_PATH));
    }

    #[test]
    fn test_open_missing_content() {
        let manifest_stub: &[u8] = br#"{"codex":"0.1"}"#;
        let data = raw_zip(&[
            (MANIFEST_PATH, manifest_stub),
            (DUBLIN_CORE_PATH, EMPTY_OBJECT),
        ]);

        let result = CdxReader::from_bytes(data);
        assert!(matches!(result, Err(Error::MissingFile { path }) if path == CONTENT_PATH));
    }

    #[test]
    fn test_open_invalid_manifest_json() {
        let broken_manifest: &[u8] = b"{ invalid json }";
        let data = raw_zip(&[
            (MANIFEST_PATH, broken_manifest),
            (CONTENT_PATH, EMPTY_OBJECT),
            (DUBLIN_CORE_PATH, EMPTY_OBJECT),
        ]);

        let result = CdxReader::from_bytes(data);
        assert!(result.is_err());
    }

    #[test]
    fn test_read_file_hash_mismatch() {
        // An all-zero digest cannot match the real content.
        let expected_hash: DocumentId =
            "sha256:0000000000000000000000000000000000000000000000000000000000000000"
                .parse()
                .unwrap();

        let data = build_cdx_archive(expected_hash.clone(), &[]);
        let mut reader = CdxReader::from_bytes(data).unwrap();

        let result = reader.read_file_verified(CONTENT_PATH, &expected_hash);
        assert!(matches!(result, Err(Error::HashMismatch { .. })));
    }

    #[test]
    fn test_verify_hashes_with_mismatch() {
        let wrong_hash: DocumentId =
            "sha256:ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff"
                .parse()
                .unwrap();

        let data = build_cdx_archive(wrong_hash, &[]);
        let mut reader = CdxReader::from_bytes(data).unwrap();

        let result = reader.verify_hashes();
        assert!(matches!(result, Err(Error::HashMismatch { .. })));
    }

    #[test]
    fn test_read_file_verified_with_pending_hash() {
        let data = create_test_archive();
        let mut reader = CdxReader::from_bytes(data).unwrap();

        let pending = DocumentId::pending();
        let result = reader.read_file_verified(CONTENT_PATH, &pending);
        assert!(result.is_ok());
    }

    #[test]
    fn test_unicode_filenames() {
        let unicode_payload: &[u8] = b"Unicode content";
        let emoji_payload: &[u8] = b"Emoji content";

        let data = build_cdx_archive(
            DocumentId::pending(),
            &[
                ("assets/文档.txt", unicode_payload),
                ("assets/émoji_🎉.txt", emoji_payload),
            ],
        );
        let mut reader = CdxReader::from_bytes(data).unwrap();

        let files = reader.file_names();
        assert!(files.contains(&"assets/文档.txt".to_string()));
        assert!(files.contains(&"assets/émoji_🎉.txt".to_string()));

        let content = reader.read_file("assets/文档.txt").unwrap();
        assert_eq!(content, b"Unicode content");

        let emoji_content = reader.read_file("assets/émoji_🎉.txt").unwrap();
        assert_eq!(emoji_content, b"Emoji content");
    }

    #[test]
    fn test_file_count() {
        let data = create_test_archive();
        let reader = CdxReader::from_bytes(data).unwrap();
        // Manifest, content, and Dublin Core entries.
        assert_eq!(reader.file_count(), 3);
    }

    #[test]
    fn test_hash_algorithm() {
        let data = create_test_archive();
        let reader = CdxReader::from_bytes(data).unwrap();
        assert_eq!(reader.hash_algorithm(), HashAlgorithm::Sha256);
    }

    #[test]
    fn test_read_phantoms_none() {
        let data = create_test_archive();
        let mut reader = CdxReader::from_bytes(data).unwrap();
        let result = reader.read_phantoms().unwrap();
        assert!(result.is_none());
    }

    #[test]
    fn test_manifest_must_be_first_file() {
        let manifest_json = r#"{
            "codex": "0.1",
            "id": "pending",
            "state": "draft",
            "created": "2024-01-01T00:00:00Z",
            "modified": "2024-01-01T00:00:00Z",
            "content": { "path": "content/document.json", "hash": "pending" },
            "metadata": { "dublinCore": "metadata/dublin-core.json" }
        }"#;

        // The content entry is deliberately written before the manifest.
        let data = raw_zip(&[
            (CONTENT_PATH, CONTENT_JSON),
            (MANIFEST_PATH, manifest_json.as_bytes()),
            (DUBLIN_CORE_PATH, DUBLIN_CORE_JSON),
        ]);
        let result = CdxReader::from_bytes(data);

        let err = result.err().expect("should be an error");
        assert!(matches!(err, Error::InvalidArchiveStructure { .. }));
    }

    #[test]
    fn test_manifest_first_file_passes() {
        let data = create_test_archive();
        let result = CdxReader::from_bytes(data);
        assert!(result.is_ok());
    }

    #[test]
    fn test_utf8_bom_stripped_from_manifest() {
        let manifest_json = r#"{
            "codex": "0.1",
            "id": "pending",
            "state": "draft",
            "created": "2024-01-01T00:00:00Z",
            "modified": "2024-01-01T00:00:00Z",
            "hashAlgorithm": "sha256",
            "content": { "path": "content/document.json", "hash": "pending" },
            "metadata": { "dublinCore": "metadata/dublin-core.json" }
        }"#;
        let mut bom_manifest = vec![0xEF, 0xBB, 0xBF];
        bom_manifest.extend_from_slice(manifest_json.as_bytes());

        let data = raw_zip(&[
            (MANIFEST_PATH, bom_manifest.as_slice()),
            (CONTENT_PATH, CONTENT_JSON),
            (DUBLIN_CORE_PATH, DUBLIN_CORE_JSON),
        ]);
        let reader = CdxReader::from_bytes(data);
        assert!(
            reader.is_ok(),
            "BOM-prefixed manifest should parse correctly"
        );
        assert_eq!(reader.unwrap().manifest().codex, "0.1");
    }

    #[test]
    fn test_utf8_bom_not_required() {
        let data = create_test_archive();
        let reader = CdxReader::from_bytes(data);
        assert!(reader.is_ok());
    }
}