adaptive_pipeline_domain/value_objects/binary_file_format.rs
// /////////////////////////////////////////////////////////////////////////////
// Adaptive Pipeline
// Copyright (c) 2025 Michael Gardner, A Bit of Help, Inc.
// SPDX-License-Identifier: BSD-3-Clause
// See LICENSE file in the project root.
// /////////////////////////////////////////////////////////////////////////////

//! # Binary File Format Value Object
//!
//! This module defines the binary file format specification for the Adaptive
//! Pipeline system. It provides a standardized format for storing processed
//! files with complete recovery metadata and integrity verification.
//!
//! ## Overview
//!
//! The binary file format provides:
//!
//! - **File Recovery**: Complete metadata for recovering original files
//! - **Integrity Verification**: Checksums and validation for processed files
//! - **Processing History**: Complete record of processing steps applied
//! - **Version Management**: Format versioning for backward compatibility
//! - **Compression Support**: Efficient storage of processed data
//!
//! ## Architecture
//!
//! The format stores the processed data first and appends a metadata footer:
//!
//! - **Data Section**: Processed file data, stored as a sequence of chunks
//! - **Metadata Footer**: JSON header with processing metadata, recovery
//!   information, and integrity checksums
//! - **Header Length**: Length of the JSON header in bytes
//! - **Format Version**: Format version for compatibility checks
//! - **Magic Bytes**: File format identification at the very end of the file
//!
//! ## Key Features
//!
//! ### File Recovery
//!
//! - **Original Filename**: Preserve original file names
//! - **File Size**: Track original and processed file sizes
//! - **Processing Steps**: Record all processing operations applied
//! - **Restoration Metadata**: Information needed for complete recovery
//!
//! ### Integrity Verification
//!
//! - **Checksums**: Multiple checksum algorithms for verification
//! - **Validation**: Comprehensive validation of file integrity
//! - **Error Detection**: Detect corruption and processing errors
//! - **Recovery Verification**: Verify recovered files match originals
//!
//! ### Format Versioning
//!
//! - **Version Management**: Support for multiple format versions
//! - **Backward Compatibility**: Maintain compatibility with older versions
//! - **Migration Support**: Automatic migration between format versions
//! - **Feature Evolution**: Support for new features in future versions
//!
//! ## Usage Examples
//!
//! ### Creating a Binary File
//!
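//! A minimal sketch of building a header with the fluent API (the import path
//! is assumed from this crate's layout; checksum and pipeline values are
//! illustrative, and error handling is elided):
//!
//! ```rust,ignore
//! use adaptive_pipeline_domain::value_objects::binary_file_format::FileHeader;
//!
//! // Describe the original file, then record the processing that was applied.
//! let header = FileHeader::new("document.txt".to_string(), 1024, "sha256-of-input".to_string())
//!     .add_compression_step("brotli", 6)
//!     .add_encryption_step("aes256gcm", "argon2", 32, 12)
//!     .with_chunk_info(1024 * 1024, 1)
//!     .with_pipeline_id("my-pipeline".to_string())
//!     .with_output_checksum("sha256-of-output".to_string());
//!
//! // Serialize the footer and append it after the processed chunk data.
//! let footer_bytes = header.to_footer_bytes()?;
//! ```
//!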
//! ### Reading and Validating a Binary File
//!
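//! A sketch of reading the footer back from a processed file and checking its
//! integrity (file path is illustrative; error handling elided):
//!
//! ```rust,ignore
//! use adaptive_pipeline_domain::value_objects::binary_file_format::FileHeader;
//!
//! let file_data: Vec<u8> = std::fs::read("document.txt.adapipe")?;
//!
//! // Parse the footer from the end of the file.
//! let (header, footer_size) = FileHeader::from_footer_bytes(&file_data)?;
//! println!("{}", header.get_processing_summary());
//!
//! // Verify the processed file has not been corrupted.
//! assert!(header.verify_output_integrity(&file_data)?);
//! ```
//!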
//! ### File Recovery Process
//!
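//! A sketch of the recovery flow: undo the recorded steps in reverse order,
//! then confirm the restored bytes match the original. The `undo_step` helper
//! is a placeholder for the corresponding decompression/decryption services,
//! and per-chunk parsing with `ChunkFormat` is omitted for brevity:
//!
//! ```rust,ignore
//! let (header, footer_size) = FileHeader::from_footer_bytes(&file_data)?;
//!
//! // Processed payload is everything before the footer.
//! let mut restored = file_data[..file_data.len() - footer_size].to_vec();
//!
//! // Undo each step, highest order first (e.g. decrypt, then decompress).
//! for step in header.get_restoration_steps() {
//!     restored = undo_step(step, restored)?; // placeholder helper
//! }
//!
//! // Validate size and SHA-256 checksum against the original file.
//! assert!(header.validate_restored_file(&restored)?);
//! ```
//!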
//! ## File Format Specification
//!
//! ### Binary Layout
//!
//! The .adapipe file format uses the following binary layout:
//!
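//! ```text
//! [CHUNK_DATA (variable)]
//! [JSON_HEADER (variable)]
//! [HEADER_LENGTH (4 bytes, little-endian)]
//! [FORMAT_VERSION (2 bytes, little-endian)]
//! [MAGIC_BYTES (8 bytes, "ADAPIPE\0")]
//! ```
//!
//! The footer fields are written by [`FileHeader::to_footer_bytes`] and read
//! back by [`FileHeader::from_footer_bytes`].
//!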
//! ### Header Components
//!
//! - **Magic Bytes**: 8 bytes - "ADAPIPE\0" (0x4144415049504500)
//! - **Format Version**: 2 bytes - Current format version number
//! - **Header Length**: 4 bytes - Length of the JSON header in bytes
//! - **JSON Header**: Variable length - Metadata and processing information
//! - **Processed Data**: Variable length - Actual processed file content
//!
//! ### JSON Header Structure
//!
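//! An abbreviated example of the serialized JSON header (values are
//! illustrative; the exact encoding follows serde's defaults for
//! [`FileHeader`]):
//!
//! ```json
//! {
//!   "app_version": "1.0.0",
//!   "format_version": 1,
//!   "original_filename": "document.txt",
//!   "original_size": 1024,
//!   "original_checksum": "<sha256 of original file>",
//!   "output_checksum": "<sha256 of processed file>",
//!   "processing_steps": [
//!     { "step_type": "Compression", "algorithm": "brotli", "parameters": { "level": "6" }, "order": 0 }
//!   ],
//!   "chunk_size": 1048576,
//!   "chunk_count": 1,
//!   "processed_at": "2025-01-01T00:00:00Z",
//!   "pipeline_id": "my-pipeline",
//!   "metadata": {}
//! }
//! ```
//!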
//! ## Processing Steps
//!
//! ### Supported Operations
//!
//! - **Compression**: Various compression algorithms (brotli, gzip, lz4)
//! - **Encryption**: Encryption algorithms (AES-256-GCM, ChaCha20-Poly1305)
//! - **Validation**: Checksum and integrity validation
//! - **Transformation**: Custom data transformations
//!
//! ### Step Parameters
//!
//! Each processing step can include parameters:
//!
//! - **Compression Level**: Compression quality/speed tradeoff
//! - **Encryption Keys**: Key derivation and management information
//! - **Algorithm Options**: Algorithm-specific configuration
//! - **Custom Parameters**: Application-specific parameters
//!
//! ## Integrity Verification
//!
//! ### Checksum Algorithms
//!
//! - **SHA-256**: Primary checksum algorithm
//! - **Blake3**: High-performance alternative
//! - **CRC32**: Fast integrity checking
//! - **Custom**: Support for custom checksum algorithms
//!
//! ### Verification Process
//!
//! 1. **Format Validation**: Verify magic bytes and version
//! 2. **Header Validation**: Validate JSON header structure
//! 3. **Data Integrity**: Verify processed data checksum
//! 4. **Recovery Verification**: Verify recovered data matches original
//!
//! ## Error Handling
//!
//! ### Format Errors
//!
//! - **Invalid Magic Bytes**: File is not in .adapipe format
//! - **Unsupported Version**: Format version not supported
//! - **Corrupt Header**: JSON header is malformed or corrupt
//! - **Invalid Data**: Processed data is corrupt or invalid
//!
//! ### Recovery Errors
//!
//! - **Missing Steps**: Required processing steps are missing
//! - **Invalid Parameters**: Processing parameters are invalid
//! - **Checksum Mismatch**: Data integrity verification failed
//! - **Recovery Failure**: Unable to recover original data
//!
//! ## Performance Considerations
//!
//! ### File Size Optimization
//!
//! - **Efficient Encoding**: Compact binary encoding for metadata
//! - **Compression**: Built-in compression for processed data
//! - **Minimal Overhead**: Minimal format overhead
//!
//! ### Processing Performance
//!
//! - **Streaming**: Support for streaming processing of large files
//! - **Parallel Processing**: Parallel processing of file chunks
//! - **Memory Efficiency**: Efficient memory usage during processing
//!
//! ## Security Considerations
//!
//! ### Data Protection
//!
//! - **Encryption**: Strong encryption for sensitive data
//! - **Key Management**: Secure key derivation and management
//! - **Integrity**: Comprehensive integrity verification
//!
//! ### Attack Prevention
//!
//! - **Format Validation**: Prevent malformed file attacks
//! - **Size Limits**: Prevent resource exhaustion attacks
//! - **Checksum Verification**: Prevent data tampering
//!
//! ## Version Management
//!
//! ### Format Versioning
//!
//! - **Semantic Versioning**: Use semantic versioning for format versions
//! - **Backward Compatibility**: Maintain compatibility with older versions
//! - **Migration**: Automatic migration between format versions
//!
//! ### Feature Evolution
//!
//! - **New Algorithms**: Support for new compression/encryption algorithms
//! - **Enhanced Metadata**: Extended metadata capabilities
//! - **Performance Improvements**: Optimizations in new versions
//!
//! ## Integration
//!
//! The binary file format integrates with:
//!
//! - **File Processor**: Used by the file processor for creating processed files
//! - **Storage Systems**: Store processed files in various storage systems
//! - **Recovery Systems**: Recover original files from processed files
//! - **Validation Systems**: Validate file integrity and format compliance
//!
//! ## Future Enhancements
//!
//! Planned enhancements include:
//!
//! - **Streaming Support**: Enhanced streaming capabilities
//! - **Compression Improvements**: Better compression algorithms
//! - **Metadata Extensions**: Extended metadata capabilities
//! - **Performance Optimizations**: Further performance improvements

use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use std::collections::HashMap;

use crate::PipelineError;

/// Magic bytes to identify our file format: "ADAPIPE\0"
///
/// These magic bytes are used to identify files in the Adaptive Pipeline
/// binary format. They appear at the end of the file for efficient
/// format detection without reading the entire file.
///
/// The magic bytes spell "ADAPIPE" followed by a null terminator:
/// - 0x41 = 'A'
/// - 0x44 = 'D'
/// - 0x41 = 'A'
/// - 0x50 = 'P'
/// - 0x49 = 'I'
/// - 0x50 = 'P'
/// - 0x45 = 'E'
/// - 0x00 = null terminator
pub const MAGIC_BYTES: [u8; 8] = [0x41, 0x44, 0x41, 0x50, 0x49, 0x50, 0x45, 0x00];

/// Current file format version
///
/// This constant defines the current version of the .adapipe file format.
/// It is used for:
/// - Format version validation when reading files
/// - Backward compatibility checking
/// - Migration between format versions
/// - Feature availability determination
///
/// Version history:
/// - Version 1: Initial format with basic compression and encryption support
pub const CURRENT_FORMAT_VERSION: u16 = 1;

/// File header for Adaptive Pipeline processed files (.adapipe format)
///
/// This header contains all information needed to:
/// 1. Recover the original document (filename, size, processing steps)
/// 2. Verify integrity of the processed output file we created
/// 3. Validate the restored input file matches the original exactly
///
/// # Adaptive Pipeline File Format (.adapipe)
/// ```text
/// [CHUNK_DATA][JSON_HEADER][HEADER_LENGTH][FORMAT_VERSION][MAGIC_BYTES]
/// ```
///
/// Note: This is NOT a general binary file format like .png or .exe.
/// This is specifically for files processed by the Adaptive Pipeline system
/// that have been compressed and/or encrypted with restoration metadata.
///
/// # Recovery Process
///
/// 1. Read and validate the footer with [`FileHeader::from_footer_bytes`]
/// 2. Verify the processed file with [`FileHeader::verify_output_integrity`]
/// 3. Undo the recorded steps in reverse order (see
///    [`FileHeader::get_restoration_steps`])
/// 4. Confirm the restored bytes with [`FileHeader::validate_restored_file`]
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct FileHeader {
    /// Application version that created this file
    pub app_version: String,

    /// File format version for backward compatibility
    pub format_version: u16,

    /// Original input filename (for restoration)
    pub original_filename: String,

    /// Original file size in bytes (for validation)
    pub original_size: u64,

    /// SHA256 checksum of original input file (for validation)
    pub original_checksum: String,

    /// SHA256 checksum of this output file (for integrity verification)
    pub output_checksum: String,

    /// Processing pipeline information (for restoration)
    pub processing_steps: Vec<ProcessingStep>,

    /// Chunk size used for processing
    pub chunk_size: u32,

    /// Number of chunks in the processed file
    pub chunk_count: u32,

    /// Processing timestamp (RFC3339)
    pub processed_at: chrono::DateTime<chrono::Utc>,

    /// Pipeline ID that processed this file
    pub pipeline_id: String,

    /// Additional metadata for debugging/auditing
    pub metadata: HashMap<String, String>,
}

/// A single processing step that was applied to the file.
///
/// Steps are stored in the order they were applied and must be undone in
/// reverse order during restoration.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ProcessingStep {
    /// Step type (compression, encryption, etc.)
    pub step_type: ProcessingStepType,

    /// Algorithm used
    pub algorithm: String,

    /// Algorithm-specific parameters needed for restoration
    pub parameters: HashMap<String, String>,

    /// Order in which this step was applied (0-based)
    pub order: u32,
}

/// Types of processing steps
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum ProcessingStepType {
    /// Compression step
    Compression,
    /// Encryption step
    Encryption,
    /// Checksum/integrity verification step
    Checksum,
    /// Pass-through step (no data modification)
    PassThrough,
    /// Legacy custom processing step (deprecated)
    Custom(String),
}

/// Format for individual chunks in the file
#[derive(Debug, Clone, PartialEq)]
pub struct ChunkFormat {
    /// Encryption nonce (12 bytes for AES-GCM)
    ///
    /// Contains the actual nonce when encrypted, zeros (`[0u8; 12]`) when not
    /// encrypted
    pub nonce: [u8; 12],

    /// Length of payload data
    pub data_length: u32,

    /// Chunk payload data (may be raw, compressed, encrypted, or any
    /// combination)
    ///
    /// Note: Previously named `encrypted_data` but renamed for clarity since
    /// this field contains data in various states of transformation
    pub payload: Vec<u8>,
}

impl FileHeader {
    /// Creates a new file header with default values
    ///
    /// # Purpose
    /// Creates a `FileHeader` for tracking processing metadata and enabling
    /// file recovery. The header stores all information needed to validate
    /// and restore processed files.
    ///
    /// # Why
    /// File headers provide:
    /// - Recovery information to restore original files
    /// - Integrity verification through checksums
    /// - Processing history for debugging and auditing
    /// - Version management for backward compatibility
    ///
    /// # Arguments
    /// * `original_filename` - Name of the original input file (for
    ///   restoration)
    /// * `original_size` - Size of the original file in bytes (for validation)
    /// * `original_checksum` - SHA256 checksum of original file (for
    ///   validation)
    ///
    /// # Returns
    /// `FileHeader` with default values:
    /// - `app_version`: Current package version from Cargo.toml
    /// - `format_version`: Current format version (1)
    /// - `chunk_size`: 1MB default
    /// - `processed_at`: Current timestamp
    /// - Empty processing steps, pipeline ID, and metadata
    ///
    /// # Examples
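    ///
    /// A minimal sketch (the checksum value is illustrative):
    ///
    /// ```rust,ignore
    /// let header = FileHeader::new("report.pdf".to_string(), 2048, "sha256-of-input".to_string());
    /// assert_eq!(header.format_version, CURRENT_FORMAT_VERSION);
    /// assert!(header.processing_steps.is_empty());
    /// ```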
    pub fn new(original_filename: String, original_size: u64, original_checksum: String) -> Self {
        Self {
            app_version: env!("CARGO_PKG_VERSION").to_string(),
            format_version: CURRENT_FORMAT_VERSION,
            original_filename,
            original_size,
            original_checksum,
            output_checksum: String::new(), // Will be set after processing
            processing_steps: Vec::new(),
            chunk_size: 1024 * 1024, // Default 1MB
            chunk_count: 0,
            processed_at: chrono::Utc::now(),
            pipeline_id: String::new(),
            metadata: HashMap::new(),
        }
    }

    /// Adds a compression step to the processing pipeline
    ///
    /// # Purpose
    /// Records a compression operation in the processing steps.
    /// This information is used during file recovery to decompress the data.
    ///
    /// # Arguments
    /// * `algorithm` - Name of compression algorithm (e.g., "brotli", "gzip",
    ///   "zstd", "lz4")
    /// * `level` - Compression level (algorithm-specific, typically 1-9)
    ///
    /// # Returns
    /// Updated `FileHeader` with compression step added (builder pattern)
    ///
    /// # Examples
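    ///
    /// A minimal sketch:
    ///
    /// ```rust,ignore
    /// let header = FileHeader::new("data.bin".to_string(), 4096, "sha256-of-input".to_string())
    ///     .add_compression_step("brotli", 6);
    /// assert_eq!(header.compression_algorithm(), Some("brotli"));
    /// ```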
    pub fn add_compression_step(mut self, algorithm: &str, level: u32) -> Self {
        let mut parameters = HashMap::new();
        parameters.insert("level".to_string(), level.to_string());

        self.processing_steps.push(ProcessingStep {
            step_type: ProcessingStepType::Compression,
            algorithm: algorithm.to_string(),
            parameters,
            order: self.processing_steps.len() as u32,
        });
        self
    }

    /// Adds an encryption step
    pub fn add_encryption_step(
        mut self,
        algorithm: &str,
        key_derivation: &str,
        key_size: u32,
        nonce_size: u32,
    ) -> Self {
        let mut parameters = HashMap::new();
        parameters.insert("key_derivation".to_string(), key_derivation.to_string());
        parameters.insert("key_size".to_string(), key_size.to_string());
        parameters.insert("nonce_size".to_string(), nonce_size.to_string());

        self.processing_steps.push(ProcessingStep {
            step_type: ProcessingStepType::Encryption,
            algorithm: algorithm.to_string(),
            parameters,
            order: self.processing_steps.len() as u32,
        });
        self
    }

    /// Adds a custom processing step
    pub fn add_custom_step(mut self, step_name: &str, algorithm: &str, parameters: HashMap<String, String>) -> Self {
        self.processing_steps.push(ProcessingStep {
            step_type: ProcessingStepType::Custom(step_name.to_string()),
            algorithm: algorithm.to_string(),
            parameters,
            order: self.processing_steps.len() as u32,
        });
        self
    }

    /// Adds a processing step using a domain-driven `ProcessingStepDescriptor`
    ///
    /// This is the preferred method because it respects the Dependency
    /// Inversion Principle (DIP) and uses value objects.
    pub fn add_processing_step(
        mut self,
        descriptor: super::processing_step_descriptor::ProcessingStepDescriptor,
    ) -> Self {
        self.processing_steps.push(ProcessingStep {
            step_type: descriptor.step_type().clone(),
            algorithm: descriptor.algorithm().as_str().to_string(),
            parameters: descriptor.parameters().as_map().clone(),
            order: descriptor.order().value(),
        });
        self
    }

    /// Adds a checksum processing step
    pub fn add_checksum_step(mut self, algorithm: &str) -> Self {
        self.processing_steps.push(ProcessingStep {
            step_type: ProcessingStepType::Checksum,
            algorithm: algorithm.to_string(),
            parameters: HashMap::new(),
            order: self.processing_steps.len() as u32,
        });
        self
    }

    /// Adds a pass-through processing step
    pub fn add_passthrough_step(mut self, algorithm: &str) -> Self {
        self.processing_steps.push(ProcessingStep {
            step_type: ProcessingStepType::PassThrough,
            algorithm: algorithm.to_string(),
            parameters: HashMap::new(),
            order: self.processing_steps.len() as u32,
        });
        self
    }

    /// Sets chunk processing information
    pub fn with_chunk_info(mut self, chunk_size: u32, chunk_count: u32) -> Self {
        self.chunk_size = chunk_size;
        self.chunk_count = chunk_count;
        self
    }

    /// Sets the pipeline ID
    pub fn with_pipeline_id(mut self, pipeline_id: String) -> Self {
        self.pipeline_id = pipeline_id;
        self
    }

    /// Sets the output file checksum (call after processing is complete)
    pub fn with_output_checksum(mut self, checksum: String) -> Self {
        self.output_checksum = checksum;
        self
    }

    /// Adds metadata
    pub fn with_metadata(mut self, key: String, value: String) -> Self {
        self.metadata.insert(key, value);
        self
    }

    /// Serializes the header to binary format for the file footer
    ///
    /// # Purpose
    /// Converts the header to the binary footer format that is appended to
    /// processed files. The footer allows reading metadata from the end of
    /// files without scanning the entire file.
    ///
    /// # Why
    /// Storing metadata at the end provides:
    /// - Efficient metadata access without reading the full file
    /// - Streaming-friendly format (header written after data)
    /// - Simple format detection via magic bytes at the end
    ///
    /// # Binary Format
    /// ```text
    /// [JSON_HEADER][HEADER_LENGTH (4 bytes)][FORMAT_VERSION (2 bytes)][MAGIC_BYTES (8 bytes)]
    /// ```
    ///
    /// # Returns
    /// * `Ok(Vec<u8>)` - Serialized footer bytes
    /// * `Err(PipelineError::SerializationError)` - JSON serialization failed
    ///
    /// # Errors
    /// Returns `PipelineError::SerializationError` if JSON serialization fails.
    ///
    /// # Examples
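    ///
    /// A minimal sketch (the checksum value is illustrative):
    ///
    /// ```rust,ignore
    /// let header = FileHeader::new("data.bin".to_string(), 4096, "sha256-of-input".to_string());
    /// let footer = header.to_footer_bytes()?;
    /// assert!(footer.ends_with(&MAGIC_BYTES));
    /// ```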
    pub fn to_footer_bytes(&self) -> Result<Vec<u8>, PipelineError> {
        // Serialize header to JSON
        let header_json = serde_json::to_string(self)
            .map_err(|e| PipelineError::SerializationError(format!("Failed to serialize header: {}", e)))?;

        let header_bytes = header_json.as_bytes();
        let header_length = header_bytes.len() as u32;

        // Build footer format
        let mut result = Vec::new();

        // JSON header data
        result.extend_from_slice(header_bytes);

        // Header length (little-endian)
        result.extend_from_slice(&header_length.to_le_bytes());

        // Format version (little-endian)
        result.extend_from_slice(&self.format_version.to_le_bytes());

        // Magic bytes
        result.extend_from_slice(&MAGIC_BYTES);

        Ok(result)
    }

    /// Deserializes the header from file footer bytes
    ///
    /// # Purpose
    /// Extracts and parses the file header from the footer at the end of a
    /// processed file. This is the primary method for reading metadata from
    /// .adapipe files.
    ///
    /// # Why
    /// Reading from the footer enables:
    /// - Quick metadata access without processing the entire file
    /// - Format validation before attempting recovery
    /// - Backward compatibility checking
    ///
    /// # Arguments
    /// * `file_data` - Complete file data including footer
    ///
    /// # Returns
    /// * `Ok((FileHeader, usize))` - Parsed header and total footer size in
    ///   bytes
    /// * `Err(PipelineError)` - Validation or parsing error
    ///
    /// # Errors
    /// Returns `PipelineError` when:
    /// - File too short (< 14 bytes minimum footer size)
    /// - Invalid magic bytes (not an .adapipe file)
    /// - Unsupported format version
    /// - Incomplete footer data
    /// - Invalid UTF-8 in JSON header
    /// - JSON deserialization fails
    ///
    /// # Examples
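    ///
    /// A minimal round-trip sketch:
    ///
    /// ```rust,ignore
    /// let original = FileHeader::new("data.bin".to_string(), 4096, "sha256-of-input".to_string());
    /// let footer = original.to_footer_bytes()?;
    ///
    /// let (parsed, footer_size) = FileHeader::from_footer_bytes(&footer)?;
    /// assert_eq!(parsed, original);
    /// assert_eq!(footer_size, footer.len());
    /// ```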
    pub fn from_footer_bytes(file_data: &[u8]) -> Result<(Self, usize), PipelineError> {
        let file_size = file_data.len();

        if file_size < 14 {
            // 8 + 2 + 4 = minimum footer size
            return Err(PipelineError::ValidationError("File too short for footer".to_string()));
        }

        // Read from end of file
        let magic_start = file_size - 8;
        let version_start = file_size - 10;
        let length_start = file_size - 14;

        // Check magic bytes
        let magic_bytes = &file_data[magic_start..];
        if magic_bytes != MAGIC_BYTES {
            return Err(PipelineError::ValidationError(
                "Invalid magic bytes - not an Adaptive Pipeline file".to_string(),
            ));
        }

        // Read format version
        let version_bytes = &file_data[version_start..version_start + 2];
        let format_version = u16::from_le_bytes([version_bytes[0], version_bytes[1]]);
        if format_version > CURRENT_FORMAT_VERSION {
            return Err(PipelineError::ValidationError(format!(
                "Unsupported format version: {} (current: {})",
                format_version, CURRENT_FORMAT_VERSION
            )));
        }

        // Read header length
        let length_bytes = &file_data[length_start..length_start + 4];
        let header_length =
            u32::from_le_bytes([length_bytes[0], length_bytes[1], length_bytes[2], length_bytes[3]]) as usize;

        // Calculate total footer size
        let footer_size = header_length + 14; // JSON + length + version + magic
        if file_size < footer_size {
            return Err(PipelineError::ValidationError(
                "File too short for complete footer".to_string(),
            ));
        }

        // Extract and parse header JSON
        let header_start = file_size - footer_size;
        let header_json = &file_data[header_start..header_start + header_length];
        let header_str = std::str::from_utf8(header_json)
            .map_err(|e| PipelineError::ValidationError(format!("Invalid UTF-8 in header: {}", e)))?;

        let header: FileHeader = serde_json::from_str(header_str)
            .map_err(|e| PipelineError::SerializationError(format!("Failed to deserialize header: {}", e)))?;

        Ok((header, footer_size))
    }

    /// Verifies the integrity of the processed output file
    ///
    /// # Purpose
    /// Validates that the processed file data has not been corrupted or
    /// tampered with by comparing its SHA256 checksum against the stored
    /// checksum.
    ///
    /// # Why
    /// Integrity verification provides:
    /// - Detection of file corruption during storage or transmission
    /// - Protection against data tampering
    /// - Confidence in file recovery operations
    ///
    /// # Arguments
    /// * `file_data` - Complete processed file data (including footer)
    ///
    /// # Returns
    /// * `Ok(true)` - File integrity verified, checksum matches
    /// * `Ok(false)` - File corrupted, checksum mismatch
    /// * `Err(PipelineError::ValidationError)` - No checksum available
    ///
    /// # Errors
    /// Returns `PipelineError::ValidationError` if `output_checksum` is empty.
    ///
    /// # Examples
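    ///
    /// A minimal sketch (checksum values and file path are illustrative):
    ///
    /// ```rust,ignore
    /// let header = FileHeader::new("data.bin".to_string(), 4096, "sha256-of-input".to_string())
    ///     .with_output_checksum("sha256-of-output".to_string());
    ///
    /// let processed_file: Vec<u8> = std::fs::read("data.bin.adapipe")?;
    /// if header.verify_output_integrity(&processed_file)? {
    ///     println!("output file is intact");
    /// }
    /// ```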
    pub fn verify_output_integrity(&self, file_data: &[u8]) -> Result<bool, PipelineError> {
        if self.output_checksum.is_empty() {
            return Err(PipelineError::ValidationError(
                "No output checksum available for verification".to_string(),
            ));
        }

        // Calculate checksum of entire file
        let mut hasher = Sha256::new();
        hasher.update(file_data);
        let digest = hasher.finalize();
        let calculated_checksum = hex::encode(digest);

        Ok(calculated_checksum == self.output_checksum)
    }

    /// Gets the processing steps in reverse order for file restoration
    ///
    /// # Purpose
    /// Returns processing steps in the order they must be reversed to restore
    /// the original file. For example, if compression then encryption was
    /// applied, restoration must decrypt then decompress.
    ///
    /// # Why
    /// Processing operations must be reversed in opposite order:
    /// - Apply: Compress → Encrypt
    /// - Restore: Decrypt → Decompress
    ///
    /// # Returns
    /// Vector of processing steps sorted by descending order (highest order
    /// first)
    ///
    /// # Examples
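    ///
    /// A minimal sketch:
    ///
    /// ```rust,ignore
    /// let header = FileHeader::new("data.bin".to_string(), 4096, "sha256-of-input".to_string())
    ///     .add_compression_step("brotli", 6) // order 0
    ///     .add_encryption_step("aes256gcm", "argon2", 32, 12); // order 1
    ///
    /// let steps = header.get_restoration_steps();
    /// assert_eq!(steps[0].order, 1); // undo encryption first
    /// assert_eq!(steps[1].order, 0); // then undo compression
    /// ```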
    pub fn get_restoration_steps(&self) -> Vec<&ProcessingStep> {
        let mut steps: Vec<&ProcessingStep> = self.processing_steps.iter().collect();
        steps.sort_by(|a, b| b.order.cmp(&a.order)); // Reverse order
        steps
    }

    /// Validates a restored file against original specifications
    ///
    /// # Purpose
    /// Verifies that a restored file matches the original file exactly by
    /// checking both size and SHA256 checksum. This ensures complete
    /// recovery fidelity.
    ///
    /// # Why
    /// Restoration validation provides:
    /// - Confidence that recovery was successful
    /// - Detection of processing errors or data loss
    /// - Verification of processing reversibility
    ///
    /// # Arguments
    /// * `restored_data` - The restored/recovered file data
    ///
    /// # Returns
    /// * `Ok(true)` - Restored file matches original (size and checksum)
    /// * `Ok(false)` - Restored file does not match original
    ///
    /// # Examples
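    ///
    /// A minimal sketch (assumes `header` was parsed from the file's footer
    /// and `restored` holds the output of the restoration pipeline):
    ///
    /// ```rust,ignore
    /// if header.validate_restored_file(&restored)? {
    ///     println!("restored file matches the original exactly");
    /// }
    /// ```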
    pub fn validate_restored_file(&self, restored_data: &[u8]) -> Result<bool, PipelineError> {
        // Check size
        if (restored_data.len() as u64) != self.original_size {
            return Ok(false);
        }

        // Check checksum
        let mut hasher = Sha256::new();
        hasher.update(restored_data);
        let digest = hasher.finalize();
        let calculated_checksum = hex::encode(digest);

        Ok(calculated_checksum == self.original_checksum)
    }

    /// Gets information about what processing was applied
    pub fn get_processing_summary(&self) -> String {
        if self.processing_steps.is_empty() {
            return "No processing applied (pass-through)".to_string();
        }

        let steps: Vec<String> = self
            .processing_steps
            .iter()
            .map(|step| match &step.step_type {
                ProcessingStepType::Compression => format!("Compression ({})", step.algorithm),
                ProcessingStepType::Encryption => format!("Encryption ({})", step.algorithm),
                ProcessingStepType::Checksum => format!("Checksum ({})", step.algorithm),
                ProcessingStepType::PassThrough => format!("PassThrough ({})", step.algorithm),
                ProcessingStepType::Custom(name) => format!("Custom ({}: {})", name, step.algorithm),
            })
            .collect();

        format!("Processing: {}", steps.join(" → "))
    }

    /// Checks if the file uses compression
    pub fn is_compressed(&self) -> bool {
        self.processing_steps
            .iter()
            .any(|step| matches!(step.step_type, ProcessingStepType::Compression))
    }

    /// Checks if the file uses encryption
    pub fn is_encrypted(&self) -> bool {
        self.processing_steps
            .iter()
            .any(|step| matches!(step.step_type, ProcessingStepType::Encryption))
    }

    /// Gets the compression algorithm if used
    pub fn compression_algorithm(&self) -> Option<&str> {
        self.processing_steps
            .iter()
            .find(|step| matches!(step.step_type, ProcessingStepType::Compression))
            .map(|step| step.algorithm.as_str())
    }

    /// Gets the encryption algorithm if used
    pub fn encryption_algorithm(&self) -> Option<&str> {
        self.processing_steps
            .iter()
            .find(|step| matches!(step.step_type, ProcessingStepType::Encryption))
            .map(|step| step.algorithm.as_str())
    }

    /// Validates the header for consistency
    pub fn validate(&self) -> Result<(), PipelineError> {
        if self.format_version == 0 {
            return Err(PipelineError::ValidationError("Format version cannot be 0".to_string()));
        }

        if self.app_version.is_empty() {
            return Err(PipelineError::ValidationError(
                "App version cannot be empty".to_string(),
            ));
        }

        if self.original_filename.is_empty() {
            return Err(PipelineError::ValidationError(
                "Original filename cannot be empty".to_string(),
            ));
        }

        if self.chunk_size == 0 {
            return Err(PipelineError::ValidationError("Chunk size cannot be 0".to_string()));
        }

        if self.chunk_size < 1024 {
            return Err(PipelineError::ValidationError(
                "Chunk size must be at least 1KB".to_string(),
            ));
        }

        if self.original_size > 0 && self.chunk_count == 0 {
            return Err(PipelineError::ValidationError(
                "Non-empty file must have chunks".to_string(),
            ));
        }

        if self.original_checksum.is_empty() && self.original_size > 0 {
            return Err(PipelineError::ValidationError(
                "Non-empty file must have original checksum".to_string(),
            ));
        }

        // Validate processing steps
        for step in &self.processing_steps {
            if step.algorithm.is_empty() {
                return Err(PipelineError::ValidationError(
                    "Processing step algorithm cannot be empty".to_string(),
                ));
            }
        }

        Ok(())
    }
}

impl ChunkFormat {
    /// Creates a new chunk format
    pub fn new(nonce: [u8; 12], payload: Vec<u8>) -> Self {
        Self {
            nonce,
            data_length: payload.len() as u32,
            payload,
        }
    }

    /// Serializes chunk to binary format
    ///
    /// Format: `[NONCE][DATA_LENGTH][PAYLOAD]`
    pub fn to_bytes(&self) -> Vec<u8> {
        let mut result = Vec::new();

        // Nonce (12 bytes)
        result.extend_from_slice(&self.nonce);

        // Data length (4 bytes, little-endian)
        result.extend_from_slice(&self.data_length.to_le_bytes());

        // Payload data
        result.extend_from_slice(&self.payload);

        result
    }

    /// Converts the chunk to bytes and returns both the bytes and their size
    ///
    /// This is a convenience method that combines the common pattern of
    /// serializing a chunk and then measuring the resulting byte length.
    ///
    /// # Returns
    /// * `(Vec<u8>, u64)` - The serialized bytes and their size as `u64`
    ///
    /// # Example
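    ///
    /// A minimal sketch:
    ///
    /// ```rust,ignore
    /// let chunk = ChunkFormat::new([0u8; 12], vec![1, 2, 3]);
    /// let (bytes, size) = chunk.to_bytes_with_size();
    /// assert_eq!(size, bytes.len() as u64);
    /// ```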
    pub fn to_bytes_with_size(&self) -> (Vec<u8>, u64) {
        let chunk_bytes = self.to_bytes();
        let chunk_size = chunk_bytes.len() as u64;
        (chunk_bytes, chunk_size)
    }

    /// Deserializes chunk from binary format
    ///
    /// Returns `(chunk, bytes_consumed)`
    pub fn from_bytes(data: &[u8]) -> Result<(Self, usize), PipelineError> {
        if data.len() < 16 {
            // 12 + 4 = minimum chunk header size
            return Err(PipelineError::ValidationError(
                "Data too short for chunk header".to_string(),
            ));
        }

        // Read nonce
        let mut nonce = [0u8; 12];
        nonce.copy_from_slice(&data[0..12]);

        // Read data length
        let data_length = u32::from_le_bytes([data[12], data[13], data[14], data[15]]) as usize;

        // Check if we have enough data
        let total_size = 16 + data_length;
        if data.len() < total_size {
            return Err(PipelineError::ValidationError("Incomplete chunk data".to_string()));
        }

        // Read payload data
        let payload = data[16..16 + data_length].to_vec();

        Ok((
            Self {
                nonce,
                data_length: data_length as u32,
                payload,
            },
            total_size,
        ))
    }

    /// Validates the chunk format
    pub fn validate(&self) -> Result<(), PipelineError> {
        if (self.data_length as usize) != self.payload.len() {
            return Err(PipelineError::ValidationError("Chunk data length mismatch".to_string()));
        }

        if self.payload.is_empty() {
            return Err(PipelineError::ValidationError("Chunk cannot be empty".to_string()));
        }

        Ok(())
    }
}

impl Default for FileHeader {
    fn default() -> Self {
        Self::new("unknown".to_string(), 0, String::new())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Tests file header creation and serialization with processing steps.
    ///
    /// This test validates that file headers can be created with multiple
    /// processing steps (compression and encryption) and that all metadata
    /// is properly stored and accessible.
    ///
    /// # Test Coverage
    ///
    /// - File header creation with fluent API
    /// - Compression step addition with algorithm and level
    /// - Encryption step addition with key derivation parameters
    /// - Chunk information and pipeline ID configuration
    /// - Output checksum configuration
    /// - Header validation for consistency
    /// - Processing step detection (compression/encryption flags)
    /// - Algorithm extraction from processing steps
    ///
    /// # Assertions
    ///
    /// - Header validation passes for complete configuration
    /// - Compression detection works correctly
    /// - Encryption detection works correctly
    /// - Algorithm names are extracted properly
    /// - All fluent API methods chain correctly
    #[test]
    fn test_header_creation_and_serialization() {
        let header = FileHeader::new("test.txt".to_string(), 1024, "abc123".to_string())
            .add_compression_step("brotli", 6)
            .add_encryption_step("aes256gcm", "argon2", 32, 12)
            .with_chunk_info(1024 * 1024, 1)
            .with_pipeline_id("test-pipeline".to_string())
            .with_output_checksum("def456".to_string());

        assert!(header.validate().is_ok());
        assert!(header.is_compressed());
        assert!(header.is_encrypted());
        assert_eq!(header.compression_algorithm(), Some("brotli"));
        assert_eq!(header.encryption_algorithm(), Some("aes256gcm"));
    }

    /// Tests header serialization and deserialization roundtrip.
    ///
    /// This test validates that file headers can be serialized to footer
    /// bytes and then deserialized back to identical header objects,
    /// ensuring data integrity during file I/O operations.
    ///
    /// # Test Coverage
    ///
    /// - Header creation with compression step
    /// - Footer byte serialization
    /// - Footer byte deserialization
    /// - Roundtrip data integrity
    /// - Footer size calculation
    /// - Header equality comparison
    ///
    /// # Test Scenario
    ///
    /// Creates a header with compression, serializes it to footer bytes,
    /// then deserializes it back and verifies the restored header matches
    /// the original exactly.
    ///
    /// # Assertions
    ///
    /// - Original and restored headers are identical
    /// - Footer size matches actual byte length
    /// - Serialization/deserialization preserves all data
    /// - No data loss during roundtrip conversion
    #[test]
    fn test_header_footer_roundtrip() {
        let original_header = FileHeader::new("test.txt".to_string(), 1024, "abc123".to_string())
            .add_compression_step("brotli", 6)
            .with_output_checksum("def456".to_string());

        let footer_data = original_header.to_footer_bytes().unwrap();

        // Simulate reading from end of file
        let (restored_header, footer_size) = FileHeader::from_footer_bytes(&footer_data).unwrap();

        assert_eq!(original_header, restored_header);
        assert_eq!(footer_size, footer_data.len());
    }

    /// Tests restoration steps ordering for proper file recovery.
    ///
    /// This test validates that restoration steps are returned in reverse
    /// order of processing, ensuring that files can be properly restored
    /// by undoing operations in the correct sequence.
    ///
    /// # Test Coverage
    ///
    /// - Processing step order assignment
    /// - Restoration step order reversal
    /// - Multi-step processing (compression + encryption)
    /// - Step order validation
    /// - Proper restoration sequence
    ///
    /// # Test Scenario
    ///
    /// Creates a header with compression (order 0) followed by encryption
    /// (order 1), then verifies that restoration steps are returned in
    /// reverse order: encryption first, then compression.
    ///
    /// # Assertions
    ///
    /// - Restoration steps are in reverse processing order
    /// - Encryption step comes first (order 1)
    /// - Compression step comes second (order 0)
    /// - Step count matches processing step count
    /// - Order values are preserved correctly
    #[test]
    fn test_restoration_steps_order() {
        let header = FileHeader::new("test.txt".to_string(), 1024, "abc123".to_string())
            .add_compression_step("brotli", 6) // Order 0
            .add_encryption_step("aes256gcm", "argon2", 32, 12); // Order 1

        let restoration_steps = header.get_restoration_steps();

        // Should be in reverse order: encryption first (order 1), then compression
        // (order 0)
        assert_eq!(restoration_steps.len(), 2);
        assert_eq!(restoration_steps[0].order, 1); // Encryption
        assert_eq!(restoration_steps[1].order, 0); // Compression
    }

    /// Tests chunk format serialization and deserialization roundtrip.
    ///
    /// This test validates that chunk data can be serialized to bytes
    /// and then deserialized back to identical chunk objects, ensuring
    /// data integrity for encrypted chunk storage.
    ///
    /// # Test Coverage
    ///
    /// - Chunk format creation with nonce and data
    /// - Chunk byte serialization
    /// - Chunk byte deserialization
    /// - Roundtrip data integrity
    /// - Bytes consumed calculation
    /// - Chunk validation after deserialization
    ///
    /// # Test Scenario
    ///
    /// Creates a chunk with test nonce and data, serializes it to bytes,
    /// then deserializes it back and verifies the restored chunk matches
    /// the original exactly.
    ///
    /// # Assertions
    ///
    /// - Original and restored chunks are identical
    /// - Bytes consumed matches serialized byte length
    /// - Deserialized chunk passes validation
    /// - Nonce and data are preserved exactly
    /// - No data corruption during roundtrip
    #[test]
    fn test_chunk_format_roundtrip() {
        let nonce = [1u8; 12];
        let data = vec![0xde, 0xad, 0xbe, 0xef];
        let original_chunk = ChunkFormat::new(nonce, data);

        let chunk_bytes = original_chunk.to_bytes();
        let (restored_chunk, bytes_consumed) = ChunkFormat::from_bytes(&chunk_bytes).unwrap();

        assert_eq!(original_chunk, restored_chunk);
        assert_eq!(bytes_consumed, chunk_bytes.len());
        assert!(restored_chunk.validate().is_ok());
    }

    /// Tests error handling for invalid magic bytes in file headers.
    ///
    /// This test validates that the system properly rejects files that
    /// don't have the correct magic bytes, preventing processing of
    /// non-adapipe files and providing clear error messages.
    ///
    /// # Test Coverage
    ///
    /// - Invalid magic byte detection
    /// - Error handling for malformed files
    /// - Error message content validation
    /// - File format validation
    /// - Rejection of non-adapipe files
    ///
    /// # Test Scenario
    ///
    /// Creates invalid data with wrong magic bytes and attempts to
    /// parse it as a file header, expecting a clear error message
    /// about invalid magic bytes.
    ///
    /// # Assertions
    ///
    /// - Parsing fails with error result
    /// - Error message mentions "Invalid magic bytes"
    /// - System rejects malformed data
    /// - No false positives for invalid files
    /// - Clear error reporting for debugging
    #[test]
    fn test_invalid_magic_bytes() {
        let bad_data = vec![0xFF; 20];
        let result = FileHeader::from_footer_bytes(&bad_data);
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("Invalid magic bytes"));
    }

    /// Tests processing summary generation for file headers.
    ///
    /// This test validates that processing summaries correctly describe
    /// the operations applied to files, providing human-readable
    /// descriptions of the processing pipeline.
    ///
    /// # Test Coverage
    ///
    /// - Processing summary generation
    /// - Multi-step processing description
    /// - Algorithm name inclusion
    /// - Processing flow visualization
    /// - Human-readable output format
    ///
    /// # Test Scenario
    ///
    /// Creates a header with compression and encryption steps, then
    /// generates a processing summary and verifies it contains the
    /// expected algorithm names and flow indicators.
    ///
    /// # Assertions
    ///
    /// - Summary contains compression algorithm name
    /// - Summary contains encryption algorithm name
    /// - Summary includes flow indicator (→)
    /// - Format is human-readable
    /// - All processing steps are represented
    #[test]
    fn test_processing_summary() {
        let header = FileHeader::new("test.txt".to_string(), 1024, "abc123".to_string())
            .add_compression_step("brotli", 6)
            .add_encryption_step("aes256gcm", "argon2", 32, 12);

        let summary = header.get_processing_summary();
        assert!(summary.contains("Compression (brotli)"));
        assert!(summary.contains("Encryption (aes256gcm)"));
        assert!(summary.contains("→")); // Should show processing flow
    }

    /// Tests pass-through file handling without processing steps.
    ///
    /// This test validates that files with no processing steps are
    /// properly handled as pass-through files, with appropriate
    /// flags and summary messages.
    ///
    /// # Test Coverage
    ///
    /// - Pass-through file detection
    /// - No compression flag validation
    /// - No encryption flag validation
    /// - Pass-through summary message
    /// - Minimal processing configuration
    ///
    /// # Test Scenario
    ///
    /// Creates a header with only basic file information and no
    /// processing steps, then verifies that it's correctly identified
    /// as a pass-through file.
    ///
    /// # Assertions
    ///
    /// - Compression flag is false
    /// - Encryption flag is false
    /// - Summary indicates "No processing applied (pass-through)"
    /// - Header is valid despite no processing steps
    /// - Pass-through files are handled correctly
    #[test]
    fn test_pass_through_file() {
        let header = FileHeader::new("test.txt".to_string(), 1024, "abc123".to_string())
            .with_output_checksum("def456".to_string());

        assert!(!header.is_compressed());
        assert!(!header.is_encrypted());
        assert_eq!(header.get_processing_summary(), "No processing applied (pass-through)");
    }
}