adaptive_pipeline_domain/value_objects/binary_file_format.rs
// /////////////////////////////////////////////////////////////////////////////
// Adaptive Pipeline
// Copyright (c) 2025 Michael Gardner, A Bit of Help, Inc.
// SPDX-License-Identifier: BSD-3-Clause
// See LICENSE file in the project root.
// /////////////////////////////////////////////////////////////////////////////

//! # Binary File Format Value Object
//!
//! This module defines the binary file format specification for the Adaptive
//! Pipeline system. It provides a standardized format for storing processed
//! files with complete recovery metadata and integrity verification.
//!
//! ## Overview
//!
//! The binary file format provides:
//!
//! - **File Recovery**: Complete metadata for recovering original files
//! - **Integrity Verification**: Checksums and validation for processed files
//! - **Processing History**: Complete record of processing steps applied
//! - **Version Management**: Format versioning for backward compatibility
//! - **Compression Support**: Efficient storage of processed data
//!
//! ## Architecture
//!
//! The format stores the processed data first and appends a metadata footer:
//!
//! - **Data Section**: Processed file data, stored as a sequence of chunks
//! - **Metadata Footer**: JSON header with processing metadata, recovery
//!   information, and integrity checksums
//! - **Header Length**: Length of the JSON header in bytes
//! - **Format Version**: Format version for compatibility checks
//! - **Magic Bytes**: File format identification at the very end of the file
//!
//! ## Key Features
//!
//! ### File Recovery
//!
//! - **Original Filename**: Preserve original file names
//! - **File Size**: Track original and processed file sizes
//! - **Processing Steps**: Record all processing operations applied
//! - **Restoration Metadata**: Information needed for complete recovery
//!
//! ### Integrity Verification
//!
//! - **Checksums**: Multiple checksum algorithms for verification
//! - **Validation**: Comprehensive validation of file integrity
//! - **Error Detection**: Detect corruption and processing errors
//! - **Recovery Verification**: Verify recovered files match originals
//!
//! ### Format Versioning
//!
//! - **Version Management**: Support for multiple format versions
//! - **Backward Compatibility**: Maintain compatibility with older versions
//! - **Migration Support**: Automatic migration between format versions
//! - **Feature Evolution**: Support for new features in future versions
//!
//! ## Usage Examples
//!
//! ### Creating a Binary File
//!
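//! A minimal sketch of building a header with the fluent API (the import path
//! is assumed from this crate's layout; checksum and pipeline values are
//! illustrative, and error handling is elided):
//!
//! ```rust,ignore
//! use adaptive_pipeline_domain::value_objects::binary_file_format::FileHeader;
//!
//! // Describe the original file, then record the processing that was applied.
//! let header = FileHeader::new("document.txt".to_string(), 1024, "sha256-of-input".to_string())
//!     .add_compression_step("brotli", 6)
//!     .add_encryption_step("aes256gcm", "argon2", 32, 12)
//!     .with_chunk_info(1024 * 1024, 1)
//!     .with_pipeline_id("my-pipeline".to_string())
//!     .with_output_checksum("sha256-of-output".to_string());
//!
//! // Serialize the footer and append it after the processed chunk data.
//! let footer_bytes = header.to_footer_bytes()?;
//! ```
//!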
//! ### Reading and Validating a Binary File
//!
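//! A sketch of reading the footer back from a processed file and checking its
//! integrity (file path is illustrative; error handling elided):
//!
//! ```rust,ignore
//! use adaptive_pipeline_domain::value_objects::binary_file_format::FileHeader;
//!
//! let file_data: Vec<u8> = std::fs::read("document.txt.adapipe")?;
//!
//! // Parse the footer from the end of the file.
//! let (header, footer_size) = FileHeader::from_footer_bytes(&file_data)?;
//! println!("{}", header.get_processing_summary());
//!
//! // Verify the processed file has not been corrupted.
//! assert!(header.verify_output_integrity(&file_data)?);
//! ```
//!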
//! ### File Recovery Process
//!
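//! A sketch of the recovery flow: undo the recorded steps in reverse order,
//! then confirm the restored bytes match the original. The `undo_step` helper
//! is a placeholder for the corresponding decompression/decryption services,
//! and per-chunk parsing with `ChunkFormat` is omitted for brevity:
//!
//! ```rust,ignore
//! let (header, footer_size) = FileHeader::from_footer_bytes(&file_data)?;
//!
//! // Processed payload is everything before the footer.
//! let mut restored = file_data[..file_data.len() - footer_size].to_vec();
//!
//! // Undo each step, highest order first (e.g. decrypt, then decompress).
//! for step in header.get_restoration_steps() {
//!     restored = undo_step(step, restored)?; // placeholder helper
//! }
//!
//! // Validate size and SHA-256 checksum against the original file.
//! assert!(header.validate_restored_file(&restored)?);
//! ```
//!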
//! ## File Format Specification
//!
//! ### Binary Layout
//!
//! The .adapipe file format uses the following binary layout:
//!
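//! ```text
//! [CHUNK_DATA (variable)]
//! [JSON_HEADER (variable)]
//! [HEADER_LENGTH (4 bytes, little-endian)]
//! [FORMAT_VERSION (2 bytes, little-endian)]
//! [MAGIC_BYTES (8 bytes, "ADAPIPE\0")]
//! ```
//!
//! The footer fields are written by [`FileHeader::to_footer_bytes`] and read
//! back by [`FileHeader::from_footer_bytes`].
//!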
//! ### Header Components
//!
//! - **Magic Bytes**: 8 bytes - "ADAPIPE\0" (0x4144415049504500)
//! - **Format Version**: 2 bytes - Current format version number
//! - **Header Length**: 4 bytes - Length of the JSON header in bytes
//! - **JSON Header**: Variable length - Metadata and processing information
//! - **Processed Data**: Variable length - Actual processed file content
//!
//! ### JSON Header Structure
//!
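//! An abbreviated example of the serialized JSON header (values are
//! illustrative; the exact encoding follows serde's defaults for
//! [`FileHeader`]):
//!
//! ```json
//! {
//!   "app_version": "1.0.0",
//!   "format_version": 1,
//!   "original_filename": "document.txt",
//!   "original_size": 1024,
//!   "original_checksum": "<sha256 of original file>",
//!   "output_checksum": "<sha256 of processed file>",
//!   "processing_steps": [
//!     { "step_type": "Compression", "algorithm": "brotli", "parameters": { "level": "6" }, "order": 0 }
//!   ],
//!   "chunk_size": 1048576,
//!   "chunk_count": 1,
//!   "processed_at": "2025-01-01T00:00:00Z",
//!   "pipeline_id": "my-pipeline",
//!   "metadata": {}
//! }
//! ```
//!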
//! ## Processing Steps
//!
//! ### Supported Operations
//!
//! - **Compression**: Various compression algorithms (brotli, gzip, lz4)
//! - **Encryption**: Encryption algorithms (AES-256-GCM, ChaCha20-Poly1305)
//! - **Validation**: Checksum and integrity validation
//! - **Transformation**: Custom data transformations
//!
//! ### Step Parameters
//!
//! Each processing step can include parameters:
//!
//! - **Compression Level**: Compression quality/speed tradeoff
//! - **Encryption Keys**: Key derivation and management information
//! - **Algorithm Options**: Algorithm-specific configuration
//! - **Custom Parameters**: Application-specific parameters
//!
//! ## Integrity Verification
//!
//! ### Checksum Algorithms
//!
//! - **SHA-256**: Primary checksum algorithm
//! - **Blake3**: High-performance alternative
//! - **CRC32**: Fast integrity checking
//! - **Custom**: Support for custom checksum algorithms
//!
//! ### Verification Process
//!
//! 1. **Format Validation**: Verify magic bytes and version
//! 2. **Header Validation**: Validate JSON header structure
//! 3. **Data Integrity**: Verify processed data checksum
//! 4. **Recovery Verification**: Verify recovered data matches original
//!
//! ## Error Handling
//!
//! ### Format Errors
//!
//! - **Invalid Magic Bytes**: File is not in .adapipe format
//! - **Unsupported Version**: Format version not supported
//! - **Corrupt Header**: JSON header is malformed or corrupt
//! - **Invalid Data**: Processed data is corrupt or invalid
//!
//! ### Recovery Errors
//!
//! - **Missing Steps**: Required processing steps are missing
//! - **Invalid Parameters**: Processing parameters are invalid
//! - **Checksum Mismatch**: Data integrity verification failed
//! - **Recovery Failure**: Unable to recover original data
//!
//! ## Performance Considerations
//!
//! ### File Size Optimization
//!
//! - **Efficient Encoding**: Compact binary encoding for metadata
//! - **Compression**: Built-in compression for processed data
//! - **Minimal Overhead**: Minimal format overhead
//!
//! ### Processing Performance
//!
//! - **Streaming**: Support for streaming processing of large files
//! - **Parallel Processing**: Parallel processing of file chunks
//! - **Memory Efficiency**: Efficient memory usage during processing
//!
//! ## Security Considerations
//!
//! ### Data Protection
//!
//! - **Encryption**: Strong encryption for sensitive data
//! - **Key Management**: Secure key derivation and management
//! - **Integrity**: Comprehensive integrity verification
//!
//! ### Attack Prevention
//!
//! - **Format Validation**: Prevent malformed file attacks
//! - **Size Limits**: Prevent resource exhaustion attacks
//! - **Checksum Verification**: Prevent data tampering
//!
//! ## Version Management
//!
//! ### Format Versioning
//!
//! - **Semantic Versioning**: Use semantic versioning for format versions
//! - **Backward Compatibility**: Maintain compatibility with older versions
//! - **Migration**: Automatic migration between format versions
//!
//! ### Feature Evolution
//!
//! - **New Algorithms**: Support for new compression/encryption algorithms
//! - **Enhanced Metadata**: Extended metadata capabilities
//! - **Performance Improvements**: Optimizations in new versions
//!
//! ## Integration
//!
//! The binary file format integrates with:
//!
//! - **File Processor**: Used by the file processor for creating processed files
//! - **Storage Systems**: Store processed files in various storage systems
//! - **Recovery Systems**: Recover original files from processed files
//! - **Validation Systems**: Validate file integrity and format compliance
//!
//! ## Future Enhancements
//!
//! Planned enhancements include:
//!
//! - **Streaming Support**: Enhanced streaming capabilities
//! - **Compression Improvements**: Better compression algorithms
//! - **Metadata Extensions**: Extended metadata capabilities
//! - **Performance Optimizations**: Further performance improvements

use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use std::collections::HashMap;

use crate::PipelineError;

/// Magic bytes to identify our file format: "ADAPIPE\0"
///
/// These magic bytes are used to identify files in the Adaptive Pipeline
/// binary format. They appear at the end of the file for efficient
/// format detection without reading the entire file.
///
/// The magic bytes spell "ADAPIPE" followed by a null terminator:
/// - 0x41 = 'A'
/// - 0x44 = 'D'
/// - 0x41 = 'A'
/// - 0x50 = 'P'
/// - 0x49 = 'I'
/// - 0x50 = 'P'
/// - 0x45 = 'E'
/// - 0x00 = null terminator
pub const MAGIC_BYTES: [u8; 8] = [0x41, 0x44, 0x41, 0x50, 0x49, 0x50, 0x45, 0x00];

/// Current file format version
///
/// This constant defines the current version of the .adapipe file format.
/// It is used for:
/// - Format version validation when reading files
/// - Backward compatibility checking
/// - Migration between format versions
/// - Feature availability determination
///
/// Version history:
/// - Version 1: Initial format with basic compression and encryption support
pub const CURRENT_FORMAT_VERSION: u16 = 1;

/// File header for Adaptive Pipeline processed files (.adapipe format)
///
/// This header contains all information needed to:
/// 1. Recover the original document (filename, size, processing steps)
/// 2. Verify integrity of the processed output file we created
/// 3. Validate the restored input file matches the original exactly
///
/// # Adaptive Pipeline File Format (.adapipe)
/// ```text
/// [CHUNK_DATA][JSON_HEADER][HEADER_LENGTH][FORMAT_VERSION][MAGIC_BYTES]
/// ```
///
/// Note: This is NOT a general binary file format like .png or .exe.
/// This is specifically for files processed by the Adaptive Pipeline system
/// that have been compressed and/or encrypted with restoration metadata.
///
/// # Recovery Process
///
/// 1. Read and validate the footer with [`FileHeader::from_footer_bytes`]
/// 2. Verify the processed file with [`FileHeader::verify_output_integrity`]
/// 3. Undo the recorded steps in reverse order (see
///    [`FileHeader::get_restoration_steps`])
/// 4. Confirm the restored bytes with [`FileHeader::validate_restored_file`]
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct FileHeader {
    /// Application version that created this file
    pub app_version: String,

    /// File format version for backward compatibility
    pub format_version: u16,

    /// Original input filename (for restoration)
    pub original_filename: String,

    /// Original file size in bytes (for validation)
    pub original_size: u64,

    /// SHA256 checksum of original input file (for validation)
    pub original_checksum: String,

    /// SHA256 checksum of this output file (for integrity verification)
    pub output_checksum: String,

    /// Processing pipeline information (for restoration)
    pub processing_steps: Vec<ProcessingStep>,

    /// Chunk size used for processing
    pub chunk_size: u32,

    /// Number of chunks in the processed file
    pub chunk_count: u32,

    /// Processing timestamp (RFC3339)
    pub processed_at: chrono::DateTime<chrono::Utc>,

    /// Pipeline ID that processed this file
    pub pipeline_id: String,

    /// Additional metadata for debugging/auditing
    pub metadata: HashMap<String, String>,
}

/// A single processing step that was applied to the file.
///
/// Steps are stored in the order they were applied and must be undone in
/// reverse order during restoration.
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub struct ProcessingStep {
    /// Step type (compression, encryption, etc.)
    pub step_type: ProcessingStepType,

    /// Algorithm used
    pub algorithm: String,

    /// Algorithm-specific parameters needed for restoration
    pub parameters: HashMap<String, String>,

    /// Order in which this step was applied (0-based)
    pub order: u32,
}

/// Types of processing steps
#[derive(Debug, Clone, PartialEq, Serialize, Deserialize)]
pub enum ProcessingStepType {
    /// Compression step
    Compression,
    /// Encryption step
    Encryption,
    /// Checksum/integrity verification step
    Checksum,
    /// Pass-through step (no data modification)
    PassThrough,
    /// Legacy custom processing step (deprecated)
    Custom(String),
}

/// Format for individual chunks in the file
#[derive(Debug, Clone, PartialEq)]
pub struct ChunkFormat {
    /// Encryption nonce (12 bytes for AES-GCM)
    ///
    /// Contains the actual nonce when encrypted, zeros (`[0u8; 12]`) when not
    /// encrypted
    pub nonce: [u8; 12],

    /// Length of payload data
    pub data_length: u32,

    /// Chunk payload data (may be raw, compressed, encrypted, or any
    /// combination)
    ///
    /// Note: Previously named `encrypted_data` but renamed for clarity since
    /// this field contains data in various states of transformation
    pub payload: Vec<u8>,
}

impl FileHeader {
    /// Creates a new file header with default values
    ///
    /// # Purpose
    /// Creates a `FileHeader` for tracking processing metadata and enabling
    /// file recovery. The header stores all information needed to validate
    /// and restore processed files.
    ///
    /// # Why
    /// File headers provide:
    /// - Recovery information to restore original files
    /// - Integrity verification through checksums
    /// - Processing history for debugging and auditing
    /// - Version management for backward compatibility
    ///
    /// # Arguments
    /// * `original_filename` - Name of the original input file (for
    ///   restoration)
    /// * `original_size` - Size of the original file in bytes (for validation)
    /// * `original_checksum` - SHA256 checksum of original file (for
    ///   validation)
    ///
    /// # Returns
    /// `FileHeader` with default values:
    /// - `app_version`: Current package version from Cargo.toml
    /// - `format_version`: Current format version (1)
    /// - `chunk_size`: 1MB default
    /// - `processed_at`: Current timestamp
    /// - Empty processing steps, pipeline ID, and metadata
    ///
    /// # Examples
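    ///
    /// A minimal sketch (the checksum value is illustrative):
    ///
    /// ```rust,ignore
    /// let header = FileHeader::new("report.pdf".to_string(), 2048, "sha256-of-input".to_string());
    /// assert_eq!(header.format_version, CURRENT_FORMAT_VERSION);
    /// assert!(header.processing_steps.is_empty());
    /// ```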
    pub fn new(original_filename: String, original_size: u64, original_checksum: String) -> Self {
        Self {
            app_version: env!("CARGO_PKG_VERSION").to_string(),
            format_version: CURRENT_FORMAT_VERSION,
            original_filename,
            original_size,
            original_checksum,
            output_checksum: String::new(), // Will be set after processing
            processing_steps: Vec::new(),
            chunk_size: 1024 * 1024, // Default 1MB
            chunk_count: 0,
            processed_at: chrono::Utc::now(),
            pipeline_id: String::new(),
            metadata: HashMap::new(),
        }
    }

    /// Adds a compression step to the processing pipeline
    ///
    /// # Purpose
    /// Records a compression operation in the processing steps.
    /// This information is used during file recovery to decompress the data.
    ///
    /// # Arguments
    /// * `algorithm` - Name of compression algorithm (e.g., "brotli", "gzip",
    ///   "zstd", "lz4")
    /// * `level` - Compression level (algorithm-specific, typically 1-9)
    ///
    /// # Returns
    /// Updated `FileHeader` with compression step added (builder pattern)
    ///
    /// # Examples
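    ///
    /// A minimal sketch:
    ///
    /// ```rust,ignore
    /// let header = FileHeader::new("data.bin".to_string(), 4096, "sha256-of-input".to_string())
    ///     .add_compression_step("brotli", 6);
    /// assert_eq!(header.compression_algorithm(), Some("brotli"));
    /// ```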
    pub fn add_compression_step(mut self, algorithm: &str, level: u32) -> Self {
        let mut parameters = HashMap::new();
        parameters.insert("level".to_string(), level.to_string());

        self.processing_steps.push(ProcessingStep {
            step_type: ProcessingStepType::Compression,
            algorithm: algorithm.to_string(),
            parameters,
            order: self.processing_steps.len() as u32,
        });
        self
    }

    /// Adds an encryption step
    pub fn add_encryption_step(
        mut self,
        algorithm: &str,
        key_derivation: &str,
        key_size: u32,
        nonce_size: u32,
    ) -> Self {
        let mut parameters = HashMap::new();
        parameters.insert("key_derivation".to_string(), key_derivation.to_string());
        parameters.insert("key_size".to_string(), key_size.to_string());
        parameters.insert("nonce_size".to_string(), nonce_size.to_string());

        self.processing_steps.push(ProcessingStep {
            step_type: ProcessingStepType::Encryption,
            algorithm: algorithm.to_string(),
            parameters,
            order: self.processing_steps.len() as u32,
        });
        self
    }

    /// Adds a custom processing step
    pub fn add_custom_step(mut self, step_name: &str, algorithm: &str, parameters: HashMap<String, String>) -> Self {
        self.processing_steps.push(ProcessingStep {
            step_type: ProcessingStepType::Custom(step_name.to_string()),
            algorithm: algorithm.to_string(),
            parameters,
            order: self.processing_steps.len() as u32,
        });
        self
    }

    /// Adds a processing step using a domain-driven `ProcessingStepDescriptor`
    ///
    /// This is the preferred method because it respects the Dependency
    /// Inversion Principle (DIP) and uses value objects.
    pub fn add_processing_step(
        mut self,
        descriptor: super::processing_step_descriptor::ProcessingStepDescriptor,
    ) -> Self {
        self.processing_steps.push(ProcessingStep {
            step_type: descriptor.step_type().clone(),
            algorithm: descriptor.algorithm().as_str().to_string(),
            parameters: descriptor.parameters().as_map().clone(),
            order: descriptor.order().value(),
        });
        self
    }

    /// Adds a checksum processing step
    pub fn add_checksum_step(mut self, algorithm: &str) -> Self {
        self.processing_steps.push(ProcessingStep {
            step_type: ProcessingStepType::Checksum,
            algorithm: algorithm.to_string(),
            parameters: HashMap::new(),
            order: self.processing_steps.len() as u32,
        });
        self
    }

    /// Adds a pass-through processing step
    pub fn add_passthrough_step(mut self, algorithm: &str) -> Self {
        self.processing_steps.push(ProcessingStep {
            step_type: ProcessingStepType::PassThrough,
            algorithm: algorithm.to_string(),
            parameters: HashMap::new(),
            order: self.processing_steps.len() as u32,
        });
        self
    }

    /// Sets chunk processing information
    pub fn with_chunk_info(mut self, chunk_size: u32, chunk_count: u32) -> Self {
        self.chunk_size = chunk_size;
        self.chunk_count = chunk_count;
        self
    }

    /// Sets the pipeline ID
    pub fn with_pipeline_id(mut self, pipeline_id: String) -> Self {
        self.pipeline_id = pipeline_id;
        self
    }

    /// Sets the output file checksum (call after processing is complete)
    pub fn with_output_checksum(mut self, checksum: String) -> Self {
        self.output_checksum = checksum;
        self
    }

    /// Adds metadata
    pub fn with_metadata(mut self, key: String, value: String) -> Self {
        self.metadata.insert(key, value);
        self
    }

    /// Serializes the header to binary format for the file footer
    ///
    /// # Purpose
    /// Converts the header to the binary footer format that is appended to
    /// processed files. The footer allows reading metadata from the end of
    /// files without scanning the entire file.
    ///
    /// # Why
    /// Storing metadata at the end provides:
    /// - Efficient metadata access without reading the full file
    /// - Streaming-friendly format (header written after data)
    /// - Simple format detection via magic bytes at the end
    ///
    /// # Binary Format
    /// ```text
    /// [JSON_HEADER][HEADER_LENGTH (4 bytes)][FORMAT_VERSION (2 bytes)][MAGIC_BYTES (8 bytes)]
    /// ```
    ///
    /// # Returns
    /// * `Ok(Vec<u8>)` - Serialized footer bytes
    /// * `Err(PipelineError::SerializationError)` - JSON serialization failed
    ///
    /// # Errors
    /// Returns `PipelineError::SerializationError` if JSON serialization fails.
    ///
    /// # Examples
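    ///
    /// A minimal sketch (the checksum value is illustrative):
    ///
    /// ```rust,ignore
    /// let header = FileHeader::new("data.bin".to_string(), 4096, "sha256-of-input".to_string());
    /// let footer = header.to_footer_bytes()?;
    /// assert!(footer.ends_with(&MAGIC_BYTES));
    /// ```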
    pub fn to_footer_bytes(&self) -> Result<Vec<u8>, PipelineError> {
        // Serialize header to JSON
        let header_json = serde_json::to_string(self)
            .map_err(|e| PipelineError::SerializationError(format!("Failed to serialize header: {}", e)))?;

        let header_bytes = header_json.as_bytes();
        let header_length = header_bytes.len() as u32;

        // Build footer format
        let mut result = Vec::new();

        // JSON header data
        result.extend_from_slice(header_bytes);

        // Header length (little-endian)
        result.extend_from_slice(&header_length.to_le_bytes());

        // Format version (little-endian)
        result.extend_from_slice(&self.format_version.to_le_bytes());

        // Magic bytes
        result.extend_from_slice(&MAGIC_BYTES);

        Ok(result)
    }

    /// Deserializes the header from file footer bytes
    ///
    /// # Purpose
    /// Extracts and parses the file header from the footer at the end of a
    /// processed file. This is the primary method for reading metadata from
    /// .adapipe files.
    ///
    /// # Why
    /// Reading from the footer enables:
    /// - Quick metadata access without processing the entire file
    /// - Format validation before attempting recovery
    /// - Backward compatibility checking
    ///
    /// # Arguments
    /// * `file_data` - Complete file data including footer
    ///
    /// # Returns
    /// * `Ok((FileHeader, usize))` - Parsed header and total footer size in
    ///   bytes
    /// * `Err(PipelineError)` - Validation or parsing error
    ///
    /// # Errors
    /// Returns `PipelineError` when:
    /// - File too short (< 14 bytes minimum footer size)
    /// - Invalid magic bytes (not an .adapipe file)
    /// - Unsupported format version
    /// - Incomplete footer data
    /// - Invalid UTF-8 in JSON header
    /// - JSON deserialization fails
    ///
    /// # Examples
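    ///
    /// A minimal round-trip sketch:
    ///
    /// ```rust,ignore
    /// let original = FileHeader::new("data.bin".to_string(), 4096, "sha256-of-input".to_string());
    /// let footer = original.to_footer_bytes()?;
    ///
    /// let (parsed, footer_size) = FileHeader::from_footer_bytes(&footer)?;
    /// assert_eq!(parsed, original);
    /// assert_eq!(footer_size, footer.len());
    /// ```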
    pub fn from_footer_bytes(file_data: &[u8]) -> Result<(Self, usize), PipelineError> {
        let file_size = file_data.len();

        if file_size < 14 {
            // 8 + 2 + 4 = minimum footer size
            return Err(PipelineError::ValidationError("File too short for footer".to_string()));
        }

        // Read from end of file
        let magic_start = file_size - 8;
        let version_start = file_size - 10;
        let length_start = file_size - 14;

        // Check magic bytes
        let magic_bytes = &file_data[magic_start..];
        if magic_bytes != MAGIC_BYTES {
            return Err(PipelineError::ValidationError(
                "Invalid magic bytes - not an Adaptive Pipeline file".to_string(),
            ));
        }

        // Read format version
        let version_bytes = &file_data[version_start..version_start + 2];
        let format_version = u16::from_le_bytes([version_bytes[0], version_bytes[1]]);
        if format_version > CURRENT_FORMAT_VERSION {
            return Err(PipelineError::ValidationError(format!(
                "Unsupported format version: {} (current: {})",
                format_version, CURRENT_FORMAT_VERSION
            )));
        }

        // Read header length
        let length_bytes = &file_data[length_start..length_start + 4];
        let header_length =
            u32::from_le_bytes([length_bytes[0], length_bytes[1], length_bytes[2], length_bytes[3]]) as usize;

        // Calculate total footer size
        let footer_size = header_length + 14; // JSON + length + version + magic
        if file_size < footer_size {
            return Err(PipelineError::ValidationError(
                "File too short for complete footer".to_string(),
            ));
        }

        // Extract and parse header JSON
        let header_start = file_size - footer_size;
        let header_json = &file_data[header_start..header_start + header_length];
        let header_str = std::str::from_utf8(header_json)
            .map_err(|e| PipelineError::ValidationError(format!("Invalid UTF-8 in header: {}", e)))?;

        let header: FileHeader = serde_json::from_str(header_str)
            .map_err(|e| PipelineError::SerializationError(format!("Failed to deserialize header: {}", e)))?;

        Ok((header, footer_size))
    }

    /// Verifies the integrity of the processed output file
    ///
    /// # Purpose
    /// Validates that the processed file data has not been corrupted or
    /// tampered with by comparing its SHA256 checksum against the stored
    /// checksum.
    ///
    /// # Why
    /// Integrity verification provides:
    /// - Detection of file corruption during storage or transmission
    /// - Protection against data tampering
    /// - Confidence in file recovery operations
    ///
    /// # Arguments
    /// * `file_data` - Complete processed file data (including footer)
    ///
    /// # Returns
    /// * `Ok(true)` - File integrity verified, checksum matches
    /// * `Ok(false)` - File corrupted, checksum mismatch
    /// * `Err(PipelineError::ValidationError)` - No checksum available
    ///
    /// # Errors
    /// Returns `PipelineError::ValidationError` if `output_checksum` is empty.
    ///
    /// # Examples
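    ///
    /// A minimal sketch (checksum values and file path are illustrative):
    ///
    /// ```rust,ignore
    /// let header = FileHeader::new("data.bin".to_string(), 4096, "sha256-of-input".to_string())
    ///     .with_output_checksum("sha256-of-output".to_string());
    ///
    /// let processed_file: Vec<u8> = std::fs::read("data.bin.adapipe")?;
    /// if header.verify_output_integrity(&processed_file)? {
    ///     println!("output file is intact");
    /// }
    /// ```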
    pub fn verify_output_integrity(&self, file_data: &[u8]) -> Result<bool, PipelineError> {
        if self.output_checksum.is_empty() {
            return Err(PipelineError::ValidationError(
                "No output checksum available for verification".to_string(),
            ));
        }

        // Calculate checksum of entire file
        let mut hasher = Sha256::new();
        hasher.update(file_data);
        let digest = hasher.finalize();
        let calculated_checksum = hex::encode(digest);

        Ok(calculated_checksum == self.output_checksum)
    }

    /// Gets the processing steps in reverse order for file restoration
    ///
    /// # Purpose
    /// Returns processing steps in the order they must be reversed to restore
    /// the original file. For example, if compression then encryption was
    /// applied, restoration must decrypt then decompress.
    ///
    /// # Why
    /// Processing operations must be reversed in opposite order:
    /// - Apply: Compress → Encrypt
    /// - Restore: Decrypt → Decompress
    ///
    /// # Returns
    /// Vector of processing steps sorted by descending order (highest order
    /// first)
    ///
    /// # Examples
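    ///
    /// A minimal sketch:
    ///
    /// ```rust,ignore
    /// let header = FileHeader::new("data.bin".to_string(), 4096, "sha256-of-input".to_string())
    ///     .add_compression_step("brotli", 6) // order 0
    ///     .add_encryption_step("aes256gcm", "argon2", 32, 12); // order 1
    ///
    /// let steps = header.get_restoration_steps();
    /// assert_eq!(steps[0].order, 1); // undo encryption first
    /// assert_eq!(steps[1].order, 0); // then undo compression
    /// ```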
    pub fn get_restoration_steps(&self) -> Vec<&ProcessingStep> {
        let mut steps: Vec<&ProcessingStep> = self.processing_steps.iter().collect();
        steps.sort_by(|a, b| b.order.cmp(&a.order)); // Reverse order
        steps
    }

    /// Validates a restored file against original specifications
    ///
    /// # Purpose
    /// Verifies that a restored file matches the original file exactly by
    /// checking both size and SHA256 checksum. This ensures complete
    /// recovery fidelity.
    ///
    /// # Why
    /// Restoration validation provides:
    /// - Confidence that recovery was successful
    /// - Detection of processing errors or data loss
    /// - Verification of processing reversibility
    ///
    /// # Arguments
    /// * `restored_data` - The restored/recovered file data
    ///
    /// # Returns
    /// * `Ok(true)` - Restored file matches original (size and checksum)
    /// * `Ok(false)` - Restored file does not match original
    ///
    /// # Examples
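    ///
    /// A minimal sketch (assumes `header` was parsed from the file's footer
    /// and `restored` holds the output of the restoration pipeline):
    ///
    /// ```rust,ignore
    /// if header.validate_restored_file(&restored)? {
    ///     println!("restored file matches the original exactly");
    /// }
    /// ```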
    pub fn validate_restored_file(&self, restored_data: &[u8]) -> Result<bool, PipelineError> {
        // Check size
        if (restored_data.len() as u64) != self.original_size {
            return Ok(false);
        }

        // Check checksum
        let mut hasher = Sha256::new();
        hasher.update(restored_data);
        let digest = hasher.finalize();
        let calculated_checksum = hex::encode(digest);

        Ok(calculated_checksum == self.original_checksum)
    }

    /// Gets information about what processing was applied
    pub fn get_processing_summary(&self) -> String {
        if self.processing_steps.is_empty() {
            return "No processing applied (pass-through)".to_string();
        }

        let steps: Vec<String> = self
            .processing_steps
            .iter()
            .map(|step| match &step.step_type {
                ProcessingStepType::Compression => format!("Compression ({})", step.algorithm),
                ProcessingStepType::Encryption => format!("Encryption ({})", step.algorithm),
                ProcessingStepType::Checksum => format!("Checksum ({})", step.algorithm),
                ProcessingStepType::PassThrough => format!("PassThrough ({})", step.algorithm),
                ProcessingStepType::Custom(name) => format!("Custom ({}: {})", name, step.algorithm),
            })
            .collect();

        format!("Processing: {}", steps.join(" → "))
    }

    /// Checks if the file uses compression
    pub fn is_compressed(&self) -> bool {
        self.processing_steps
            .iter()
            .any(|step| matches!(step.step_type, ProcessingStepType::Compression))
    }

    /// Checks if the file uses encryption
    pub fn is_encrypted(&self) -> bool {
        self.processing_steps
            .iter()
            .any(|step| matches!(step.step_type, ProcessingStepType::Encryption))
    }

    /// Gets the compression algorithm if used
    pub fn compression_algorithm(&self) -> Option<&str> {
        self.processing_steps
            .iter()
            .find(|step| matches!(step.step_type, ProcessingStepType::Compression))
            .map(|step| step.algorithm.as_str())
    }

    /// Gets the encryption algorithm if used
    pub fn encryption_algorithm(&self) -> Option<&str> {
        self.processing_steps
            .iter()
            .find(|step| matches!(step.step_type, ProcessingStepType::Encryption))
            .map(|step| step.algorithm.as_str())
    }

    /// Validates the header for consistency
    pub fn validate(&self) -> Result<(), PipelineError> {
        if self.format_version == 0 {
            return Err(PipelineError::ValidationError("Format version cannot be 0".to_string()));
        }

        if self.app_version.is_empty() {
            return Err(PipelineError::ValidationError(
                "App version cannot be empty".to_string(),
            ));
        }

        if self.original_filename.is_empty() {
            return Err(PipelineError::ValidationError(
                "Original filename cannot be empty".to_string(),
            ));
        }

        if self.chunk_size == 0 {
            return Err(PipelineError::ValidationError("Chunk size cannot be 0".to_string()));
        }

        if self.chunk_size < 1024 {
            return Err(PipelineError::ValidationError(
                "Chunk size must be at least 1KB".to_string(),
            ));
        }

        if self.original_size > 0 && self.chunk_count == 0 {
            return Err(PipelineError::ValidationError(
                "Non-empty file must have chunks".to_string(),
            ));
        }

        if self.original_checksum.is_empty() && self.original_size > 0 {
            return Err(PipelineError::ValidationError(
                "Non-empty file must have original checksum".to_string(),
            ));
        }

        // Validate processing steps
        for step in &self.processing_steps {
            if step.algorithm.is_empty() {
                return Err(PipelineError::ValidationError(
                    "Processing step algorithm cannot be empty".to_string(),
                ));
            }
        }

        Ok(())
    }
}

impl ChunkFormat {
    /// Creates a new chunk format
    pub fn new(nonce: [u8; 12], payload: Vec<u8>) -> Self {
        Self {
            nonce,
            data_length: payload.len() as u32,
            payload,
        }
    }

    /// Serializes chunk to binary format
    ///
    /// Format: `[NONCE][DATA_LENGTH][PAYLOAD]`
    pub fn to_bytes(&self) -> Vec<u8> {
        let mut result = Vec::new();

        // Nonce (12 bytes)
        result.extend_from_slice(&self.nonce);

        // Data length (4 bytes, little-endian)
        result.extend_from_slice(&self.data_length.to_le_bytes());

        // Payload data
        result.extend_from_slice(&self.payload);

        result
    }

    /// Converts the chunk to bytes and returns both the bytes and their size
    ///
    /// This is a convenience method that combines the common pattern of
    /// serializing a chunk and then measuring the resulting byte length.
    ///
    /// # Returns
    /// * `(Vec<u8>, u64)` - The serialized bytes and their size as `u64`
    ///
    /// # Example
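    ///
    /// A minimal sketch:
    ///
    /// ```rust,ignore
    /// let chunk = ChunkFormat::new([0u8; 12], vec![1, 2, 3]);
    /// let (bytes, size) = chunk.to_bytes_with_size();
    /// assert_eq!(size, bytes.len() as u64);
    /// ```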
    pub fn to_bytes_with_size(&self) -> (Vec<u8>, u64) {
        let chunk_bytes = self.to_bytes();
        let chunk_size = chunk_bytes.len() as u64;
        (chunk_bytes, chunk_size)
    }

    /// Deserializes chunk from binary format
    ///
    /// Returns `(chunk, bytes_consumed)`
    pub fn from_bytes(data: &[u8]) -> Result<(Self, usize), PipelineError> {
        if data.len() < 16 {
            // 12 + 4 = minimum chunk header size
            return Err(PipelineError::ValidationError(
                "Data too short for chunk header".to_string(),
            ));
        }

        // Read nonce
        let mut nonce = [0u8; 12];
        nonce.copy_from_slice(&data[0..12]);

        // Read data length
        let data_length = u32::from_le_bytes([data[12], data[13], data[14], data[15]]) as usize;

        // Check if we have enough data
        let total_size = 16 + data_length;
        if data.len() < total_size {
            return Err(PipelineError::ValidationError("Incomplete chunk data".to_string()));
        }

        // Read payload data
        let payload = data[16..16 + data_length].to_vec();

        Ok((
            Self {
                nonce,
                data_length: data_length as u32,
                payload,
            },
            total_size,
        ))
    }

    /// Validates the chunk format
    pub fn validate(&self) -> Result<(), PipelineError> {
        if (self.data_length as usize) != self.payload.len() {
            return Err(PipelineError::ValidationError("Chunk data length mismatch".to_string()));
        }

        if self.payload.is_empty() {
            return Err(PipelineError::ValidationError("Chunk cannot be empty".to_string()));
        }

        Ok(())
    }
}

impl Default for FileHeader {
    fn default() -> Self {
        Self::new("unknown".to_string(), 0, String::new())
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Tests file header creation and serialization with processing steps.
    ///
    /// This test validates that file headers can be created with multiple
    /// processing steps (compression and encryption) and that all metadata
    /// is properly stored and accessible.
    ///
    /// # Test Coverage
    ///
    /// - File header creation with fluent API
    /// - Compression step addition with algorithm and level
    /// - Encryption step addition with key derivation parameters
    /// - Chunk information and pipeline ID configuration
    /// - Output checksum configuration
    /// - Header validation for consistency
    /// - Processing step detection (compression/encryption flags)
    /// - Algorithm extraction from processing steps
    ///
    /// # Assertions
    ///
    /// - Header validation passes for complete configuration
    /// - Compression detection works correctly
    /// - Encryption detection works correctly
    /// - Algorithm names are extracted properly
    /// - All fluent API methods chain correctly
    #[test]
    fn test_header_creation_and_serialization() {
        let header = FileHeader::new("test.txt".to_string(), 1024, "abc123".to_string())
            .add_compression_step("brotli", 6)
            .add_encryption_step("aes256gcm", "argon2", 32, 12)
            .with_chunk_info(1024 * 1024, 1)
            .with_pipeline_id("test-pipeline".to_string())
            .with_output_checksum("def456".to_string());

        assert!(header.validate().is_ok());
        assert!(header.is_compressed());
        assert!(header.is_encrypted());
        assert_eq!(header.compression_algorithm(), Some("brotli"));
        assert_eq!(header.encryption_algorithm(), Some("aes256gcm"));
    }

    /// Tests header serialization and deserialization roundtrip.
    ///
    /// This test validates that file headers can be serialized to footer
    /// bytes and then deserialized back to identical header objects,
    /// ensuring data integrity during file I/O operations.
    ///
    /// # Test Coverage
    ///
    /// - Header creation with compression step
    /// - Footer byte serialization
    /// - Footer byte deserialization
    /// - Roundtrip data integrity
    /// - Footer size calculation
    /// - Header equality comparison
    ///
    /// # Test Scenario
    ///
    /// Creates a header with compression, serializes it to footer bytes,
    /// then deserializes it back and verifies the restored header matches
    /// the original exactly.
    ///
    /// # Assertions
    ///
    /// - Original and restored headers are identical
    /// - Footer size matches actual byte length
    /// - Serialization/deserialization preserves all data
    /// - No data loss during roundtrip conversion
    #[test]
    fn test_header_footer_roundtrip() {
        let original_header = FileHeader::new("test.txt".to_string(), 1024, "abc123".to_string())
            .add_compression_step("brotli", 6)
            .with_output_checksum("def456".to_string());

        let footer_data = original_header.to_footer_bytes().unwrap();

        // Simulate reading from end of file
        let (restored_header, footer_size) = FileHeader::from_footer_bytes(&footer_data).unwrap();

        assert_eq!(original_header, restored_header);
        assert_eq!(footer_size, footer_data.len());
    }

    /// Tests restoration steps ordering for proper file recovery.
    ///
    /// This test validates that restoration steps are returned in reverse
    /// order of processing, ensuring that files can be properly restored
    /// by undoing operations in the correct sequence.
    ///
    /// # Test Coverage
    ///
    /// - Processing step order assignment
    /// - Restoration step order reversal
    /// - Multi-step processing (compression + encryption)
    /// - Step order validation
    /// - Proper restoration sequence
    ///
    /// # Test Scenario
    ///
    /// Creates a header with compression (order 0) followed by encryption
    /// (order 1), then verifies that restoration steps are returned in
    /// reverse order: encryption first, then compression.
    ///
    /// # Assertions
    ///
    /// - Restoration steps are in reverse processing order
    /// - Encryption step comes first (order 1)
    /// - Compression step comes second (order 0)
    /// - Step count matches processing step count
    /// - Order values are preserved correctly
    #[test]
    fn test_restoration_steps_order() {
        let header = FileHeader::new("test.txt".to_string(), 1024, "abc123".to_string())
            .add_compression_step("brotli", 6) // Order 0
            .add_encryption_step("aes256gcm", "argon2", 32, 12); // Order 1

        let restoration_steps = header.get_restoration_steps();

        // Should be in reverse order: encryption first (order 1), then compression
        // (order 0)
        assert_eq!(restoration_steps.len(), 2);
        assert_eq!(restoration_steps[0].order, 1); // Encryption
        assert_eq!(restoration_steps[1].order, 0); // Compression
    }

    /// Tests chunk format serialization and deserialization roundtrip.
    ///
    /// This test validates that chunk data can be serialized to bytes
    /// and then deserialized back to identical chunk objects, ensuring
    /// data integrity for encrypted chunk storage.
    ///
    /// # Test Coverage
    ///
    /// - Chunk format creation with nonce and data
    /// - Chunk byte serialization
    /// - Chunk byte deserialization
    /// - Roundtrip data integrity
    /// - Bytes consumed calculation
    /// - Chunk validation after deserialization
    ///
    /// # Test Scenario
    ///
    /// Creates a chunk with test nonce and data, serializes it to bytes,
    /// then deserializes it back and verifies the restored chunk matches
    /// the original exactly.
    ///
    /// # Assertions
    ///
    /// - Original and restored chunks are identical
    /// - Bytes consumed matches serialized byte length
    /// - Deserialized chunk passes validation
    /// - Nonce and data are preserved exactly
    /// - No data corruption during roundtrip
    #[test]
    fn test_chunk_format_roundtrip() {
        let nonce = [1u8; 12];
        let data = vec![0xde, 0xad, 0xbe, 0xef];
        let original_chunk = ChunkFormat::new(nonce, data);

        let chunk_bytes = original_chunk.to_bytes();
        let (restored_chunk, bytes_consumed) = ChunkFormat::from_bytes(&chunk_bytes).unwrap();

        assert_eq!(original_chunk, restored_chunk);
        assert_eq!(bytes_consumed, chunk_bytes.len());
        assert!(restored_chunk.validate().is_ok());
    }

    /// Tests error handling for invalid magic bytes in file headers.
    ///
    /// This test validates that the system properly rejects files that
    /// don't have the correct magic bytes, preventing processing of
    /// non-adapipe files and providing clear error messages.
    ///
    /// # Test Coverage
    ///
    /// - Invalid magic byte detection
    /// - Error handling for malformed files
    /// - Error message content validation
    /// - File format validation
    /// - Rejection of non-adapipe files
    ///
    /// # Test Scenario
    ///
    /// Creates invalid data with wrong magic bytes and attempts to
    /// parse it as a file header, expecting a clear error message
    /// about invalid magic bytes.
    ///
    /// # Assertions
    ///
    /// - Parsing fails with error result
    /// - Error message mentions "Invalid magic bytes"
    /// - System rejects malformed data
    /// - No false positives for invalid files
    /// - Clear error reporting for debugging
    #[test]
    fn test_invalid_magic_bytes() {
        let bad_data = vec![0xFF; 20];
        let result = FileHeader::from_footer_bytes(&bad_data);
        assert!(result.is_err());
        assert!(result.unwrap_err().to_string().contains("Invalid magic bytes"));
    }

    /// Tests processing summary generation for file headers.
    ///
    /// This test validates that processing summaries correctly describe
    /// the operations applied to files, providing human-readable
    /// descriptions of the processing pipeline.
    ///
    /// # Test Coverage
    ///
    /// - Processing summary generation
    /// - Multi-step processing description
    /// - Algorithm name inclusion
    /// - Processing flow visualization
    /// - Human-readable output format
    ///
    /// # Test Scenario
    ///
    /// Creates a header with compression and encryption steps, then
    /// generates a processing summary and verifies it contains the
    /// expected algorithm names and flow indicators.
    ///
    /// # Assertions
    ///
    /// - Summary contains compression algorithm name
    /// - Summary contains encryption algorithm name
    /// - Summary includes flow indicator (→)
    /// - Format is human-readable
    /// - All processing steps are represented
    #[test]
    fn test_processing_summary() {
        let header = FileHeader::new("test.txt".to_string(), 1024, "abc123".to_string())
            .add_compression_step("brotli", 6)
            .add_encryption_step("aes256gcm", "argon2", 32, 12);

        let summary = header.get_processing_summary();
        assert!(summary.contains("Compression (brotli)"));
        assert!(summary.contains("Encryption (aes256gcm)"));
        assert!(summary.contains("→")); // Should show processing flow
    }

    /// Tests pass-through file handling without processing steps.
    ///
    /// This test validates that files with no processing steps are
    /// properly handled as pass-through files, with appropriate
    /// flags and summary messages.
    ///
    /// # Test Coverage
    ///
    /// - Pass-through file detection
    /// - No compression flag validation
    /// - No encryption flag validation
    /// - Pass-through summary message
    /// - Minimal processing configuration
    ///
    /// # Test Scenario
    ///
    /// Creates a header with only basic file information and no
    /// processing steps, then verifies that it's correctly identified
    /// as a pass-through file.
    ///
    /// # Assertions
    ///
    /// - Compression flag is false
    /// - Encryption flag is false
    /// - Summary indicates "No processing applied (pass-through)"
    /// - Header is valid despite no processing steps
    /// - Pass-through files are handled correctly
    #[test]
    fn test_pass_through_file() {
        let header = FileHeader::new("test.txt".to_string(), 1024, "abc123".to_string())
            .with_output_checksum("def456".to_string());

        assert!(!header.is_compressed());
        assert!(!header.is_encrypted());
        assert_eq!(header.get_processing_summary(), "No processing applied (pass-through)");
    }
}