use std::fs::File;
use std::io::{BufReader, Cursor, Read, Seek};
use std::path::Path;
use zip::ZipArchive;
use crate::{Error, HashAlgorithm, Hasher, Manifest, Result};
use super::{validate_path, CONTENT_PATH, DUBLIN_CORE_PATH, MANIFEST_PATH, PHANTOMS_PATH};
/// Reader for a `.cdx` archive: a ZIP container whose structure has been
/// validated (required entries present, manifest first) and whose manifest
/// has been parsed and validated at construction time.
pub struct CdxReader<R: Read + Seek> {
    // Underlying ZIP archive; `Seek` enables random access to entries.
    archive: ZipArchive<R>,
    // Parsed `manifest.json`, validated by `Manifest::validate` in `new`.
    manifest: Manifest,
}
impl CdxReader<BufReader<File>> {
pub fn open<P: AsRef<Path>>(path: P) -> Result<Self> {
let file = File::open(path.as_ref()).map_err(|e| {
if e.kind() == std::io::ErrorKind::NotFound {
Error::FileNotFound {
path: path.as_ref().to_path_buf(),
}
} else {
Error::Io(e)
}
})?;
let reader = BufReader::new(file);
Self::new(reader)
}
}
impl CdxReader<Cursor<Vec<u8>>> {
    /// Builds a reader over an archive that is already in memory.
    ///
    /// Takes ownership of the bytes and performs the same structural and
    /// manifest validation as `open`.
    pub fn from_bytes(data: Vec<u8>) -> Result<Self> {
        Self::new(Cursor::new(data))
    }
}
impl<R: Read + Seek> CdxReader<R> {
pub fn new(reader: R) -> Result<Self> {
let mut archive = ZipArchive::new(reader)?;
Self::validate_structure(&archive)?;
let manifest = Self::read_manifest(&mut archive)?;
manifest.validate()?;
Ok(Self { archive, manifest })
}
fn validate_structure(archive: &ZipArchive<R>) -> Result<()> {
let required_files = [MANIFEST_PATH, CONTENT_PATH, DUBLIN_CORE_PATH];
for path in required_files {
if archive.index_for_name(path).is_none() {
return Err(Error::MissingFile {
path: path.to_string(),
});
}
}
if let Some(first_file) = archive.file_names().next() {
if first_file != MANIFEST_PATH {
return Err(Error::InvalidArchiveStructure {
reason: format!(
"manifest.json must be the first file in the archive (found '{first_file}')"
),
});
}
}
Ok(())
}
fn strip_utf8_bom(data: &[u8]) -> &[u8] {
data.strip_prefix(&[0xEF, 0xBB, 0xBF]).unwrap_or(data)
}
fn read_json_file<T: serde::de::DeserializeOwned>(
archive: &mut ZipArchive<R>,
path: &str,
) -> Result<T> {
let data = Self::read_file_internal(archive, path)?;
let json_data = Self::strip_utf8_bom(&data);
Ok(serde_json::from_slice(json_data)?)
}
fn read_manifest(archive: &mut ZipArchive<R>) -> Result<Manifest> {
Self::read_json_file(archive, MANIFEST_PATH)
}
const MAX_FILE_SIZE: u64 = 256 * 1024 * 1024;
fn read_file_internal(archive: &mut ZipArchive<R>, path: &str) -> Result<Vec<u8>> {
let file = archive.by_name(path).map_err(|e| match e {
zip::result::ZipError::FileNotFound => Error::MissingFile {
path: path.to_string(),
},
other => Error::InvalidArchive(other),
})?;
if file.size() > Self::MAX_FILE_SIZE {
return Err(Error::FileTooLarge {
path: path.to_string(),
size: file.size(),
limit: Self::MAX_FILE_SIZE,
});
}
let capacity = usize::try_from(file.size()).unwrap_or(0);
let mut data = Vec::with_capacity(capacity);
let bytes_read = file.take(Self::MAX_FILE_SIZE + 1).read_to_end(&mut data)?;
if bytes_read as u64 > Self::MAX_FILE_SIZE {
return Err(Error::FileTooLarge {
path: path.to_string(),
size: bytes_read as u64,
limit: Self::MAX_FILE_SIZE,
});
}
Ok(data)
}
#[must_use]
pub fn manifest(&self) -> &Manifest {
&self.manifest
}
pub fn read_file(&mut self, path: &str) -> Result<Vec<u8>> {
validate_path(path)?;
Self::read_file_internal(&mut self.archive, path)
}
pub fn read_file_verified(
&mut self,
path: &str,
expected_hash: &crate::DocumentId,
) -> Result<Vec<u8>> {
let data = self.read_file(path)?;
if expected_hash.is_pending() {
return Ok(data);
}
let actual_hash = Hasher::hash(expected_hash.algorithm(), &data);
if actual_hash != *expected_hash {
return Err(Error::HashMismatch {
path: path.to_string(),
expected: expected_hash.to_string(),
actual: actual_hash.to_string(),
});
}
Ok(data)
}
pub fn read_content(&mut self) -> Result<Vec<u8>> {
self.read_file_verified(CONTENT_PATH, &self.manifest.content.hash.clone())
}
pub fn read_dublin_core(&mut self) -> Result<Vec<u8>> {
self.read_file(&self.manifest.metadata.dublin_core.clone())
}
pub fn file_exists(&self, path: &str) -> Result<bool> {
validate_path(path)?;
Ok(self.archive.index_for_name(path).is_some())
}
#[must_use]
pub fn file_names(&self) -> Vec<String> {
self.archive.file_names().map(String::from).collect()
}
#[must_use]
pub fn file_count(&self) -> usize {
self.archive.len()
}
#[must_use]
pub fn hash_algorithm(&self) -> HashAlgorithm {
self.manifest.hash_algorithm
}
pub fn read_phantoms(&mut self) -> Result<Option<crate::extensions::PhantomClusters>> {
if self.archive.index_for_name(PHANTOMS_PATH).is_none() {
return Ok(None);
}
let phantoms: crate::extensions::PhantomClusters =
Self::read_json_file(&mut self.archive, PHANTOMS_PATH)?;
Ok(Some(phantoms))
}
pub fn verify_hashes(&mut self) -> Result<()> {
let content_data = self.read_file(CONTENT_PATH)?;
if !self.manifest.content.hash.is_pending() {
let actual = Hasher::hash(self.manifest.content.hash.algorithm(), &content_data);
if actual != self.manifest.content.hash {
return Err(Error::HashMismatch {
path: CONTENT_PATH.to_string(),
expected: self.manifest.content.hash.to_string(),
actual: actual.to_string(),
});
}
}
for pres in &self.manifest.presentation.clone() {
if !pres.hash.is_pending() {
let data = self.read_file(&pres.path)?;
let actual = Hasher::hash(pres.hash.algorithm(), &data);
if actual != pres.hash {
return Err(Error::HashMismatch {
path: pres.path.clone(),
expected: pres.hash.to_string(),
actual: actual.to_string(),
});
}
}
}
Ok(())
}
}
#[cfg(test)]
mod tests {
    use super::*;
    use crate::archive::CdxWriter;
    use crate::{ContentRef, DocumentId, Metadata};
    use std::io::{Cursor, Write};

    // Builds a minimal valid archive (manifest first, then content and
    // Dublin Core) with a pending content hash, returned as raw ZIP bytes.
    fn create_test_archive() -> Vec<u8> {
        let buffer = Cursor::new(Vec::new());
        let mut writer = CdxWriter::new(buffer).unwrap();
        let content = ContentRef {
            path: CONTENT_PATH.to_string(),
            hash: DocumentId::pending(),
            compression: None,
            merkle_root: None,
            block_count: None,
        };
        let metadata = Metadata {
            dublin_core: DUBLIN_CORE_PATH.to_string(),
            custom: None,
        };
        let manifest = Manifest::new(content, metadata);
        writer.write_manifest(&manifest).unwrap();
        writer
            .write_file(
                CONTENT_PATH,
                br#"{"version":"0.1","blocks":[]}"#,
                super::super::writer::CompressionMethod::Deflate,
            )
            .unwrap();
        writer
            .write_file(
                DUBLIN_CORE_PATH,
                br#"{"title":"Test"}"#,
                super::super::writer::CompressionMethod::Deflate,
            )
            .unwrap();
        writer.finish().unwrap().into_inner()
    }

    // A valid archive round-trips through from_bytes and exposes the manifest.
    #[test]
    fn test_reader_from_bytes() {
        let data = create_test_archive();
        let reader = CdxReader::from_bytes(data).unwrap();
        assert_eq!(reader.manifest().codex, "0.1");
    }

    // file_names lists every required entry.
    #[test]
    fn test_reader_file_list() {
        let data = create_test_archive();
        let reader = CdxReader::from_bytes(data).unwrap();
        let files = reader.file_names();
        assert!(files.contains(&MANIFEST_PATH.to_string()));
        assert!(files.contains(&CONTENT_PATH.to_string()));
        assert!(files.contains(&DUBLIN_CORE_PATH.to_string()));
    }

    // read_file returns the (non-empty) decompressed entry bytes.
    #[test]
    fn test_reader_read_file() {
        let data = create_test_archive();
        let mut reader = CdxReader::from_bytes(data).unwrap();
        let content = reader.read_file(CONTENT_PATH).unwrap();
        assert!(!content.is_empty());
    }

    // file_exists distinguishes present from absent entries.
    #[test]
    fn test_reader_file_exists() {
        let data = create_test_archive();
        let reader = CdxReader::from_bytes(data).unwrap();
        assert!(reader.file_exists(MANIFEST_PATH).unwrap());
        assert!(reader.file_exists(CONTENT_PATH).unwrap());
        assert!(!reader.file_exists("nonexistent.json").unwrap());
    }

    // Path traversal is rejected by validate_path on both read paths.
    #[test]
    fn test_reader_path_traversal_rejected() {
        let data = create_test_archive();
        let mut reader = CdxReader::from_bytes(data).unwrap();
        assert!(reader.read_file("../secret").is_err());
        assert!(reader.file_exists("../secret").is_err());
    }

    // Reading an absent entry yields Error::MissingFile, not a ZIP error.
    #[test]
    fn test_reader_missing_file_error() {
        let data = create_test_archive();
        let mut reader = CdxReader::from_bytes(data).unwrap();
        let result = reader.read_file("nonexistent.json");
        assert!(matches!(result, Err(Error::MissingFile { .. })));
    }

    // Bytes starting with a ZIP magic but otherwise garbage must fail.
    #[test]
    fn test_open_corrupted_zip() {
        let corrupted = vec![0x50, 0x4B, 0x03, 0x04, 0xFF, 0xFF, 0xFF, 0xFF];
        let result = CdxReader::from_bytes(corrupted);
        assert!(result.is_err());
    }

    // Arbitrary non-ZIP bytes must fail.
    #[test]
    fn test_open_not_a_zip() {
        let not_zip = b"This is not a ZIP file at all".to_vec();
        let result = CdxReader::from_bytes(not_zip);
        assert!(result.is_err());
    }

    // A structurally valid but empty ZIP is missing every required entry.
    #[test]
    fn test_open_empty_zip() {
        let buffer = Cursor::new(Vec::new());
        let writer = zip::ZipWriter::new(buffer);
        let empty_zip = writer.finish().unwrap().into_inner();
        let result = CdxReader::from_bytes(empty_zip);
        assert!(matches!(result, Err(Error::MissingFile { .. })));
    }

    // Missing manifest.json is reported with the manifest path.
    #[test]
    fn test_open_missing_manifest() {
        let buffer = Cursor::new(Vec::new());
        let mut writer = zip::ZipWriter::new(buffer);
        writer
            .start_file::<&str, ()>(CONTENT_PATH, Default::default())
            .unwrap();
        writer.write_all(b"{}").unwrap();
        writer
            .start_file::<&str, ()>(DUBLIN_CORE_PATH, Default::default())
            .unwrap();
        writer.write_all(b"{}").unwrap();
        let data = writer.finish().unwrap().into_inner();
        let result = CdxReader::from_bytes(data);
        assert!(matches!(result, Err(Error::MissingFile { path }) if path == MANIFEST_PATH));
    }

    // Missing content entry is reported with the content path.
    #[test]
    fn test_open_missing_content() {
        let buffer = Cursor::new(Vec::new());
        let mut writer = zip::ZipWriter::new(buffer);
        writer
            .start_file::<&str, ()>(MANIFEST_PATH, Default::default())
            .unwrap();
        writer.write_all(br#"{"codex":"0.1"}"#).unwrap();
        writer
            .start_file::<&str, ()>(DUBLIN_CORE_PATH, Default::default())
            .unwrap();
        writer.write_all(b"{}").unwrap();
        let data = writer.finish().unwrap().into_inner();
        let result = CdxReader::from_bytes(data);
        assert!(matches!(result, Err(Error::MissingFile { path }) if path == CONTENT_PATH));
    }

    // Malformed manifest JSON fails during construction.
    #[test]
    fn test_open_invalid_manifest_json() {
        let buffer = Cursor::new(Vec::new());
        let mut writer = zip::ZipWriter::new(buffer);
        writer
            .start_file::<&str, ()>(MANIFEST_PATH, Default::default())
            .unwrap();
        writer.write_all(b"{ invalid json }").unwrap();
        writer
            .start_file::<&str, ()>(CONTENT_PATH, Default::default())
            .unwrap();
        writer.write_all(b"{}").unwrap();
        writer
            .start_file::<&str, ()>(DUBLIN_CORE_PATH, Default::default())
            .unwrap();
        writer.write_all(b"{}").unwrap();
        let data = writer.finish().unwrap().into_inner();
        let result = CdxReader::from_bytes(data);
        assert!(result.is_err());
    }

    // read_file_verified reports HashMismatch when the stored content does
    // not hash to the expected all-zero digest.
    #[test]
    fn test_read_file_hash_mismatch() {
        let buffer = Cursor::new(Vec::new());
        let mut writer = CdxWriter::new(buffer).unwrap();
        let expected_hash: DocumentId =
            "sha256:0000000000000000000000000000000000000000000000000000000000000000"
                .parse()
                .unwrap();
        let content = ContentRef {
            path: CONTENT_PATH.to_string(),
            hash: expected_hash.clone(),
            compression: None,
            merkle_root: None,
            block_count: None,
        };
        let metadata = Metadata {
            dublin_core: DUBLIN_CORE_PATH.to_string(),
            custom: None,
        };
        let manifest = Manifest::new(content, metadata);
        writer.write_manifest(&manifest).unwrap();
        writer
            .write_file(
                CONTENT_PATH,
                br#"{"version":"0.1","blocks":[]}"#,
                super::super::writer::CompressionMethod::Deflate,
            )
            .unwrap();
        writer
            .write_file(
                DUBLIN_CORE_PATH,
                br#"{"title":"Test"}"#,
                super::super::writer::CompressionMethod::Deflate,
            )
            .unwrap();
        let data = writer.finish().unwrap().into_inner();
        let mut reader = CdxReader::from_bytes(data).unwrap();
        let result = reader.read_file_verified(CONTENT_PATH, &expected_hash);
        assert!(matches!(result, Err(Error::HashMismatch { .. })));
    }

    // verify_hashes catches a manifest hash that disagrees with the content.
    #[test]
    fn test_verify_hashes_with_mismatch() {
        let buffer = Cursor::new(Vec::new());
        let mut writer = CdxWriter::new(buffer).unwrap();
        let wrong_hash: DocumentId =
            "sha256:ffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffffff"
                .parse()
                .unwrap();
        let content = ContentRef {
            path: CONTENT_PATH.to_string(),
            hash: wrong_hash,
            compression: None,
            merkle_root: None,
            block_count: None,
        };
        let metadata = Metadata {
            dublin_core: DUBLIN_CORE_PATH.to_string(),
            custom: None,
        };
        let manifest = Manifest::new(content, metadata);
        writer.write_manifest(&manifest).unwrap();
        writer
            .write_file(
                CONTENT_PATH,
                br#"{"version":"0.1","blocks":[]}"#,
                super::super::writer::CompressionMethod::Deflate,
            )
            .unwrap();
        writer
            .write_file(
                DUBLIN_CORE_PATH,
                br#"{"title":"Test"}"#,
                super::super::writer::CompressionMethod::Deflate,
            )
            .unwrap();
        let data = writer.finish().unwrap().into_inner();
        let mut reader = CdxReader::from_bytes(data).unwrap();
        let result = reader.verify_hashes();
        assert!(matches!(result, Err(Error::HashMismatch { .. })));
    }

    // A pending expected hash skips verification entirely.
    #[test]
    fn test_read_file_verified_with_pending_hash() {
        let data = create_test_archive();
        let mut reader = CdxReader::from_bytes(data).unwrap();
        let pending = DocumentId::pending();
        let result = reader.read_file_verified(CONTENT_PATH, &pending);
        assert!(result.is_ok());
    }

    // Non-ASCII entry names (CJK, accents, emoji) round-trip through write
    // and read.
    #[test]
    fn test_unicode_filenames() {
        let buffer = Cursor::new(Vec::new());
        let mut writer = CdxWriter::new(buffer).unwrap();
        let content = ContentRef {
            path: CONTENT_PATH.to_string(),
            hash: DocumentId::pending(),
            compression: None,
            merkle_root: None,
            block_count: None,
        };
        let metadata = Metadata {
            dublin_core: DUBLIN_CORE_PATH.to_string(),
            custom: None,
        };
        let manifest = Manifest::new(content, metadata);
        writer.write_manifest(&manifest).unwrap();
        writer
            .write_file(
                CONTENT_PATH,
                br#"{"version":"0.1","blocks":[]}"#,
                super::super::writer::CompressionMethod::Deflate,
            )
            .unwrap();
        writer
            .write_file(
                DUBLIN_CORE_PATH,
                br#"{"title":"Test"}"#,
                super::super::writer::CompressionMethod::Deflate,
            )
            .unwrap();
        writer
            .write_file(
                "assets/文档.txt",
                b"Unicode content",
                super::super::writer::CompressionMethod::Deflate,
            )
            .unwrap();
        writer
            .write_file(
                "assets/émoji_🎉.txt",
                b"Emoji content",
                super::super::writer::CompressionMethod::Deflate,
            )
            .unwrap();
        let data = writer.finish().unwrap().into_inner();
        let mut reader = CdxReader::from_bytes(data).unwrap();
        let files = reader.file_names();
        assert!(files.contains(&"assets/文档.txt".to_string()));
        assert!(files.contains(&"assets/émoji_🎉.txt".to_string()));
        let content = reader.read_file("assets/文档.txt").unwrap();
        assert_eq!(content, b"Unicode content");
        let emoji_content = reader.read_file("assets/émoji_🎉.txt").unwrap();
        assert_eq!(emoji_content, b"Emoji content");
    }

    // The minimal archive has exactly its three required entries.
    #[test]
    fn test_file_count() {
        let data = create_test_archive();
        let reader = CdxReader::from_bytes(data).unwrap();
        assert_eq!(reader.file_count(), 3);
    }

    // Default manifest hash algorithm is SHA-256.
    #[test]
    fn test_hash_algorithm() {
        let data = create_test_archive();
        let reader = CdxReader::from_bytes(data).unwrap();
        assert_eq!(reader.hash_algorithm(), HashAlgorithm::Sha256);
    }

    // read_phantoms returns None when the optional extension is absent.
    #[test]
    fn test_read_phantoms_none() {
        let data = create_test_archive();
        let mut reader = CdxReader::from_bytes(data).unwrap();
        let result = reader.read_phantoms().unwrap();
        assert!(result.is_none());
    }

    // Writing content before the manifest must trip the first-entry check.
    #[test]
    fn test_manifest_must_be_first_file() {
        let buffer = Cursor::new(Vec::new());
        let mut writer = zip::ZipWriter::new(buffer);
        writer
            .start_file::<&str, ()>(CONTENT_PATH, Default::default())
            .unwrap();
        writer
            .write_all(br#"{"version":"0.1","blocks":[]}"#)
            .unwrap();
        let manifest_json = r#"{
            "codex": "0.1",
            "id": "pending",
            "state": "draft",
            "created": "2024-01-01T00:00:00Z",
            "modified": "2024-01-01T00:00:00Z",
            "content": { "path": "content/document.json", "hash": "pending" },
            "metadata": { "dublinCore": "metadata/dublin-core.json" }
        }"#;
        writer
            .start_file::<&str, ()>(MANIFEST_PATH, Default::default())
            .unwrap();
        writer.write_all(manifest_json.as_bytes()).unwrap();
        writer
            .start_file::<&str, ()>(DUBLIN_CORE_PATH, Default::default())
            .unwrap();
        writer.write_all(br#"{"title":"Test"}"#).unwrap();
        let data = writer.finish().unwrap().into_inner();
        let result = CdxReader::from_bytes(data);
        let err = result.err().expect("should be an error");
        assert!(matches!(err, Error::InvalidArchiveStructure { .. }));
    }

    // The canonical test archive (manifest first) passes validation.
    #[test]
    fn test_manifest_first_file_passes() {
        let data = create_test_archive();
        let result = CdxReader::from_bytes(data);
        assert!(result.is_ok());
    }

    // A UTF-8 BOM before the manifest JSON is stripped before parsing.
    #[test]
    fn test_utf8_bom_stripped_from_manifest() {
        let buffer = Cursor::new(Vec::new());
        let mut writer = zip::ZipWriter::new(buffer);
        let manifest_json = r#"{
            "codex": "0.1",
            "id": "pending",
            "state": "draft",
            "created": "2024-01-01T00:00:00Z",
            "modified": "2024-01-01T00:00:00Z",
            "hashAlgorithm": "sha256",
            "content": { "path": "content/document.json", "hash": "pending" },
            "metadata": { "dublinCore": "metadata/dublin-core.json" }
        }"#;
        let mut bom_manifest = vec![0xEF, 0xBB, 0xBF];
        bom_manifest.extend_from_slice(manifest_json.as_bytes());
        writer
            .start_file::<&str, ()>(MANIFEST_PATH, Default::default())
            .unwrap();
        writer.write_all(&bom_manifest).unwrap();
        writer
            .start_file::<&str, ()>(CONTENT_PATH, Default::default())
            .unwrap();
        writer
            .write_all(br#"{"version":"0.1","blocks":[]}"#)
            .unwrap();
        writer
            .start_file::<&str, ()>(DUBLIN_CORE_PATH, Default::default())
            .unwrap();
        writer.write_all(br#"{"title":"Test"}"#).unwrap();
        let data = writer.finish().unwrap().into_inner();
        let reader = CdxReader::from_bytes(data);
        assert!(
            reader.is_ok(),
            "BOM-prefixed manifest should parse correctly"
        );
        assert_eq!(reader.unwrap().manifest().codex, "0.1");
    }

    // Archives without a BOM are accepted too (BOM is optional).
    #[test]
    fn test_utf8_bom_not_required() {
        let data = create_test_archive();
        let reader = CdxReader::from_bytes(data);
        assert!(reader.is_ok());
    }
}