adaptive_pipeline_domain/value_objects/
file_chunk.rs

// /////////////////////////////////////////////////////////////////////////////
// Adaptive Pipeline
// Copyright (c) 2025 Michael Gardner, A Bit of Help, Inc.
// SPDX-License-Identifier: BSD-3-Clause
// See LICENSE file in the project root.
// /////////////////////////////////////////////////////////////////////////////

//! # File Chunk Value Object
//!
//! This module provides the `FileChunk` value object, which represents an
//! immutable chunk of file data for processing within the adaptive pipeline
//! system. It follows Domain-Driven Design principles and ensures data
//! integrity throughout processing.
//!
//! ## Overview
//!
//! The file chunk value object provides:
//!
//! - **Immutable Data**: Once created, chunks cannot be modified
//! - **Unique Identity**: Each chunk has a unique UUID for tracking
//! - **Sequence Ordering**: Chunks maintain sequence numbers for reassembly
//! - **Integrity Verification**: Optional checksums for data integrity
//! - **Metadata Tracking**: Creation timestamps and processing metadata
//!
//! ## Design Principles
//!
//! The file chunk follows Domain-Driven Design value object principles:
//!
//! - **Immutability**: Once created, chunks cannot be modified
//! - **Value Semantics**: Chunks are compared by value, not identity
//! - **Self-Validation**: Chunks validate their own data integrity
//! - **Rich Behavior**: Chunks provide methods for common operations
//!
//! ## Chunk Structure
//!
//! ### Core Data
//! - **ID**: Unique UUID for chunk identification and tracking
//! - **Sequence Number**: Position in the original file for reassembly
//! - **Offset**: Byte offset in the original file
//! - **Size**: Validated chunk size within system limits
//! - **Data**: The actual chunk data bytes
//!
//! ### Metadata
//! - **Checksum**: Optional SHA-256 checksum for integrity verification
//! - **Is Final**: Flag indicating if this is the last chunk in a file
//! - **Created At**: UTC timestamp of chunk creation
//!
//! ## Usage Examples
//!
//! ### Basic Chunk Creation
//!
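//! A minimal sketch (not compiled as a doctest; import paths assume the crate
//! re-exports `FileChunk` and `ChunkSize` at its root):
//!
//! ```ignore
//! use adaptive_pipeline_domain::{ChunkSize, FileChunk};
//!
//! // Buffer sized to satisfy the minimum validated chunk size.
//! let data = vec![0u8; ChunkSize::MIN_SIZE];
//! let chunk = FileChunk::new(0, 0, data, false).unwrap();
//!
//! assert_eq!(chunk.sequence_number(), 0);
//! assert!(chunk.checksum().is_none()); // no checksum until one is added
//! ```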
//!
//! ### Chunk with Checksum
//!
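//! A sketch of attaching an integrity checksum (same assumptions as above):
//!
//! ```ignore
//! let data = vec![0u8; ChunkSize::MIN_SIZE];
//! let chunk = FileChunk::new(0, 0, data, false).unwrap();
//!
//! // `with_calculated_checksum()` returns a new chunk; the original is unchanged.
//! let chunk = chunk.with_calculated_checksum().unwrap();
//! assert!(chunk.checksum().is_some());
//! ```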
//!
//! ### Chunk Processing Chain
//!
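//! A sketch of a stage producing a new chunk from an existing one; `transform`
//! is a hypothetical stand-in for stage-specific work:
//!
//! ```ignore
//! use adaptive_pipeline_domain::{FileChunk, PipelineError};
//!
//! fn process(chunk: &FileChunk, transform: impl Fn(&[u8]) -> Vec<u8>) -> Result<FileChunk, PipelineError> {
//!     let transformed = transform(chunk.data());
//!     // `with_data()` clears any stale checksum, so recalculate it afterwards.
//!     chunk.with_data(transformed)?.with_calculated_checksum()
//! }
//! ```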
//!
//! ## Chunk Validation
//!
//! ### Data Integrity
//!
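//! A sketch of checksum-based integrity checking:
//!
//! ```ignore
//! let chunk = chunk.with_calculated_checksum().unwrap();
//!
//! // Recomputes the SHA-256 hash and compares it to the stored value.
//! assert!(chunk.verify_integrity().unwrap());
//! ```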
//!
//! ### Sequence Validation
//!
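//! A sketch of checking that a batch of chunks is ordered for reassembly;
//! `is_contiguous` is a hypothetical helper, not part of this module:
//!
//! ```ignore
//! fn is_contiguous(chunks: &[FileChunk]) -> bool {
//!     chunks
//!         .iter()
//!         .enumerate()
//!         .all(|(i, chunk)| chunk.sequence_number() == i as u64)
//! }
//! ```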
//!
//! ## Performance Considerations
//!
//! ### Memory Usage
//!
//! - **Data Storage**: Chunks store data in `Vec<u8>` for efficient access
//! - **Metadata Overhead**: Minimal metadata overhead per chunk
//! - **Cloning**: Chunks can be cloned efficiently for processing
//!
//! ### Processing Efficiency
//!
//! - **Immutable Design**: Prevents accidental mutations during processing
//! - **Builder Pattern**: Efficient creation of modified chunks
//! - **Lazy Checksum**: Checksums are calculated only when needed
//!
//! ### Memory Management
//!
//! - **Automatic Cleanup**: Chunks are automatically cleaned up when dropped
//! - **Reference Counting**: Use `Arc<FileChunk>` for shared ownership (see the sketch below)
//! - **Streaming**: Chunks can be processed in streaming fashion
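//!
//! A minimal sketch of sharing one chunk across threads via `Arc` (standard
//! library only; `chunk` is any previously created `FileChunk`):
//!
//! ```ignore
//! use std::sync::Arc;
//!
//! let shared = Arc::new(chunk);
//! let worker = {
//!     let shared = Arc::clone(&shared);
//!     std::thread::spawn(move || shared.data_len())
//! };
//! assert_eq!(worker.join().unwrap(), shared.data_len());
//! ```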
//!
//! ## Thread Safety
//!
//! The file chunk is fully thread-safe:
//!
//! - **Immutable**: Once created, chunks cannot be modified
//! - **Send + Sync**: Chunks can be safely sent between threads
//! - **No Shared State**: No mutable shared state to synchronize
//!
//! ## Serialization
//!
//! ### JSON Serialization
//!
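//! `FileChunk` derives `Serialize` and `Deserialize`, so any serde format can
//! be used. A sketch with `serde_json` (assumed to be available as a dependency):
//!
//! ```ignore
//! let json = serde_json::to_string(&chunk).unwrap();
//! let restored: FileChunk = serde_json::from_str(&json).unwrap();
//! assert_eq!(chunk.id(), restored.id());
//! ```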
//!
//! ### Binary Serialization
//!
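//! A sketch with `bincode` (an assumed dependency; any binary serde format
//! could be substituted):
//!
//! ```ignore
//! let bytes = bincode::serialize(&chunk).unwrap();
//! let restored: FileChunk = bincode::deserialize(&bytes).unwrap();
//! assert_eq!(chunk.id(), restored.id());
//! ```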
//!
//! ## Integration
//!
//! The file chunk integrates with:
//!
//! - **File Processing**: Core unit of file processing operations
//! - **Pipeline Stages**: Passed between processing stages
//! - **Storage Systems**: Serialized for persistent storage
//! - **Network Transport**: Transmitted between distributed components
//!
//! ## Error Handling
//!
//! ### Validation Errors
//!
//! - **Invalid Size**: Chunk size outside valid bounds
//! - **Invalid Data**: Corrupted or invalid chunk data
//! - **Checksum Mismatch**: Data integrity verification failures
//! - **Sequence Errors**: Invalid sequence numbers or ordering
//!
//! ### Recovery Strategies
//!
//! - **Retry Logic**: Automatic retry for transient failures
//! - **Fallback Processing**: Alternative processing for corrupted chunks
//! - **Error Reporting**: Detailed error context for debugging
//!
//! ## Future Enhancements
//!
//! Planned enhancements include:
//!
//! - **Compression**: Built-in compression for chunk data
//! - **Encryption**: Encrypted chunk data for security
//! - **Streaming**: Streaming chunk processing for large files
//! - **Caching**: Intelligent caching of frequently accessed chunks

use crate::services::datetime_serde;
use crate::{ChunkSize, PipelineError};
use hex;
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use uuid::Uuid;

/// Represents an immutable chunk of file data for processing
///
/// This is a Value Object in Domain-Driven Design terms - it represents data
/// without identity that cannot be modified once created. Any "changes" create
/// new instances, ensuring data integrity and preventing accidental mutations
/// during processing.
///
/// # Key Features
///
/// - **Immutability**: Once created, chunks cannot be modified
/// - **Unique Identity**: Each chunk has a UUID for tracking and identification
/// - **Sequence Ordering**: Maintains sequence numbers for proper file
///   reassembly
/// - **Integrity Verification**: Optional checksums for data integrity
///   validation
/// - **Metadata Tracking**: Creation timestamps and processing metadata
///
/// # Design Principles
///
/// - **Value Object**: Compared by value, not identity
/// - **Self-Validation**: Validates its own data integrity
/// - **Builder Pattern**: Use methods like `with_checksum()` for modifications
/// - **Thread Safety**: Fully thread-safe due to immutability
///
/// # Examples
///
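/// A minimal sketch of the builder-style API (not compiled as a doctest;
/// import paths assume the crate's public re-exports):
///
/// ```ignore
/// use adaptive_pipeline_domain::{ChunkSize, FileChunk};
///
/// let data = vec![0u8; ChunkSize::MIN_SIZE];
/// let original = FileChunk::new(0, 0, data, false).unwrap();
///
/// // Builder methods return new instances; the original stays unchanged.
/// let with_checksum = original.with_calculated_checksum().unwrap();
/// assert!(original.checksum().is_none());
/// assert!(with_checksum.checksum().is_some());
/// ```
///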
/// # Developer Notes
///
/// - Use builder methods like `with_checksum()` to create modified versions
/// - Processing stages should create new chunks rather than modifying existing
///   ones
/// - This design prevents data corruption and ensures thread safety
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct FileChunk {
    id: Uuid,
    sequence_number: u64,
    offset: u64,
    size: ChunkSize,
    data: Vec<u8>,
    checksum: Option<String>,
    is_final: bool,
    #[serde(with = "datetime_serde")]
    created_at: chrono::DateTime<chrono::Utc>,
}

impl FileChunk {
    /// Creates a new file chunk
    ///
    /// # Purpose
    /// Creates an immutable file chunk value object for pipeline processing.
    /// Chunks are the fundamental unit of file processing in the adaptive
    /// pipeline.
    ///
    /// # Why
    /// File chunking enables:
    /// - Parallel processing of large files
    /// - Memory-efficient streaming
    /// - Independent processing units
    /// - Granular error recovery
    ///
    /// # Arguments
    /// * `sequence_number` - The order of this chunk in the file (0-based)
    /// * `offset` - Byte offset in the original file where this chunk starts
    /// * `data` - The actual chunk data bytes (must not be empty)
    /// * `is_final` - Whether this is the last chunk in the file
    ///
    /// # Returns
    /// * `Ok(FileChunk)` - Successfully created chunk with unique UUID
    /// * `Err(PipelineError::InvalidChunk)` - Data is empty
    ///
    /// # Errors
    /// Returns `PipelineError::InvalidChunk` when data is empty.
    ///
    /// # Side Effects
    /// - Generates new UUID for chunk identification
    /// - Sets creation timestamp to current UTC time
    /// - Calculates chunk size from data length
    ///
    /// # Examples
    ///
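    /// A minimal sketch (not compiled as a doctest; assumes `ChunkSize::MIN_SIZE`
    /// is large enough to satisfy size validation):
    ///
    /// ```ignore
    /// let data = vec![0u8; ChunkSize::MIN_SIZE];
    /// let chunk = FileChunk::new(0, 0, data, false).unwrap();
    ///
    /// assert_eq!(chunk.offset(), 0);
    /// assert!(!chunk.is_final());
    /// assert!(chunk.checksum().is_none());
    /// ```
    ///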
    /// # Developer Notes
    /// - Each chunk gets a unique UUID for tracking across pipeline stages
    /// - Chunk size is automatically validated against system limits
    /// - Checksum is initially None - use `with_calculated_checksum()` to add
    /// - This is a Value Object - create new instances for "changes"
    pub fn new(sequence_number: u64, offset: u64, data: Vec<u8>, is_final: bool) -> Result<Self, PipelineError> {
        if data.is_empty() {
            return Err(PipelineError::InvalidChunk("Chunk data cannot be empty".to_string()));
        }

        let size = ChunkSize::new(data.len())?;

        Ok(FileChunk {
            id: Uuid::new_v4(),
            sequence_number,
            offset,
            size,
            data,
            checksum: None,
            is_final,
            created_at: chrono::Utc::now(),
        })
    }

    /// Creates a new file chunk with checksum
    ///
    /// # Developer Notes
    /// - This is a convenience constructor for chunks that already have
    ///   checksums
    /// - Prefer using `new()` followed by `with_checksum()` for clarity
    pub fn new_with_checksum(
        sequence_number: u64,
        offset: u64,
        data: Vec<u8>,
        checksum: String,
        is_final: bool,
    ) -> Result<Self, PipelineError> {
        let chunk = Self::new(sequence_number, offset, data, is_final)?;
        Ok(chunk.with_checksum(checksum))
    }

    // === Immutable Accessors ===

    /// Gets the chunk ID
    pub fn id(&self) -> Uuid {
        self.id
    }

    /// Gets the sequence number
    pub fn sequence_number(&self) -> u64 {
        self.sequence_number
    }

    /// Gets the offset in the original file
    pub fn offset(&self) -> u64 {
        self.offset
    }

    /// Gets the chunk size
    pub fn size(&self) -> &ChunkSize {
        &self.size
    }

    /// Gets the chunk data (immutable reference)
    pub fn data(&self) -> &[u8] {
        &self.data
    }

    /// Gets the checksum if available
    pub fn checksum(&self) -> Option<&str> {
        self.checksum.as_deref()
    }

    /// Checks if this is the final chunk
    pub fn is_final(&self) -> bool {
        self.is_final
    }

    /// Gets the creation timestamp
    pub fn created_at(&self) -> chrono::DateTime<chrono::Utc> {
        self.created_at
    }

    /// Gets the actual data length
    pub fn data_len(&self) -> usize {
        self.data.len()
    }

    /// Checks if the chunk is empty
    pub fn is_empty(&self) -> bool {
        self.data.is_empty()
    }

    // === Immutable Builder Pattern Methods ===

    /// Creates a new FileChunk with updated data
    ///
    /// # Developer Notes
    /// - This creates a completely new chunk instance
    /// - The old chunk remains unchanged (immutability)
    /// - Checksum is cleared since data changed
    /// - Use this pattern: `let new_chunk =
    ///   old_chunk.with_data(new_data).unwrap();`
    pub fn with_data(&self, data: Vec<u8>) -> Result<Self, PipelineError> {
        if data.is_empty() {
            return Err(PipelineError::InvalidChunk("Chunk data cannot be empty".to_string()));
        }

        let size = ChunkSize::new(data.len())?;

        Ok(FileChunk {
            id: Uuid::new_v4(), // New chunk gets new ID
            sequence_number: self.sequence_number,
            offset: self.offset,
            size,
            data,
            checksum: None, // Clear checksum when data changes
            is_final: self.is_final,
            created_at: chrono::Utc::now(), // New creation time
        })
    }

    /// Creates a new FileChunk with a checksum
    ///
    /// # Developer Notes
    /// - This preserves all other data and adds/updates the checksum
    /// - Use this after processing: `let verified_chunk =
    ///   chunk.with_checksum(hash);`
    pub fn with_checksum(&self, checksum: String) -> Self {
        FileChunk {
            id: self.id,
            sequence_number: self.sequence_number,
            offset: self.offset,
            size: self.size,
            data: self.data.clone(),
            checksum: Some(checksum),
            is_final: self.is_final,
            created_at: self.created_at,
        }
    }

    /// Creates a new FileChunk with calculated SHA-256 checksum
    ///
    /// # Developer Notes
    /// - Calculates SHA-256 hash of current data
    /// - Returns new chunk with checksum set
    /// - Original chunk remains unchanged
    pub fn with_calculated_checksum(&self) -> Result<Self, PipelineError> {
        let mut hasher = Sha256::new();
        hasher.update(&self.data);
        let digest = hasher.finalize();
        let checksum = hex::encode(digest);
        Ok(self.with_checksum(checksum))
    }

    /// Creates a new FileChunk without data (for security)
    ///
    /// # Developer Notes
    /// - Creates new chunk with empty data vector
    /// - Useful for secure cleanup while preserving metadata
    /// - Checksum is cleared since data is gone
    pub fn without_data(&self) -> Self {
        FileChunk {
            id: self.id,
            sequence_number: self.sequence_number,
            offset: self.offset,
            size: ChunkSize::new(0).unwrap_or_else(|_| ChunkSize::default()), /* Empty chunk - fall back to the
                                                                               * default size if zero is rejected */
            data: Vec::new(),
            checksum: None, // Clear checksum
            is_final: self.is_final,
            created_at: self.created_at,
        }
    }

    // === Verification Methods (Read-Only) ===

    /// Verifies the chunk integrity using the stored checksum
    ///
    /// # Purpose
    /// Validates that chunk data has not been corrupted by comparing the stored
    /// SHA-256 checksum against a freshly calculated hash of the current data.
    ///
    /// # Why
    /// Integrity verification provides:
    /// - Detection of data corruption during processing or storage
    /// - Confidence in pipeline operations
    /// - Early error detection before expensive operations
    /// - Compliance with data integrity requirements
    ///
    /// # Returns
    /// * `Ok(true)` - Checksum matches, data is intact
    /// * `Ok(false)` - Checksum mismatch, data corrupted
    /// * `Err(PipelineError::InvalidChunk)` - No checksum available
    ///
    /// # Errors
    /// Returns `PipelineError::InvalidChunk` when the chunk has no stored
    /// checksum.
    ///
    /// # Examples
    ///
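    /// A minimal sketch (not compiled as a doctest):
    ///
    /// ```ignore
    /// let data = vec![0u8; ChunkSize::MIN_SIZE];
    /// let chunk = FileChunk::new(0, 0, data.clone(), false).unwrap();
    ///
    /// // Passes once a checksum has been calculated and stored.
    /// let with_checksum = chunk.with_calculated_checksum().unwrap();
    /// assert!(with_checksum.verify_integrity().unwrap());
    ///
    /// // A chunk without a stored checksum returns an error instead.
    /// assert!(chunk.verify_integrity().is_err());
    /// ```
    ///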
    /// # Developer Notes
    /// - This method is read-only and doesn't modify the chunk
    /// - Use before critical processing to ensure data integrity
    /// - Consider verification before expensive operations like encryption
    pub fn verify_integrity(&self) -> Result<bool, PipelineError> {
        if let Some(stored_checksum) = &self.checksum {
            let mut hasher = Sha256::new();
            hasher.update(&self.data);
            let digest = hasher.finalize();
            let calculated_checksum = hex::encode(digest);
            Ok(calculated_checksum == *stored_checksum)
        } else {
            Err(PipelineError::InvalidChunk(
                "No checksum available for verification".to_string(),
            ))
        }
    }

    /// Calculates SHA-256 checksum without modifying the chunk
    ///
    /// # Developer Notes
    /// - This is a pure function - doesn't modify the chunk
    /// - Use when you need the checksum but don't want to create a new chunk
    /// - For creating a chunk with checksum, use `with_calculated_checksum()`
    pub fn calculate_checksum(&self) -> Result<String, PipelineError> {
        let mut hasher = Sha256::new();
        hasher.update(&self.data);
        let digest = hasher.finalize();
        Ok(hex::encode(digest))
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Tests file chunk creation with basic properties.
    ///
    /// This test validates that file chunks can be created with
    /// required properties and that all metadata is properly
    /// stored and accessible.
    ///
    /// # Test Coverage
    ///
    /// - File chunk creation with minimum size requirement
    /// - Sequence number assignment
    /// - Offset position tracking
    /// - Data storage and retrieval
    /// - Final chunk flag handling
    /// - Checksum initialization (none by default)
    ///
    /// # Test Scenario
    ///
    /// Creates a file chunk with test data meeting minimum size
    /// requirements and verifies all properties are set correctly.
    ///
    /// # Assertions
    ///
    /// - Sequence number matches input
    /// - Offset matches input
    /// - Data is stored correctly
    /// - Final flag is set correctly
    /// - Checksum is initially None
    #[test]
    fn test_file_chunk_creation() {
        // Create test data that meets minimum chunk size requirement (1MB)
        let data = vec![42u8; ChunkSize::MIN_SIZE];
        let chunk = FileChunk::new(0, 0, data.clone(), false).unwrap();

        assert_eq!(chunk.sequence_number(), 0);
        assert_eq!(chunk.offset(), 0);
        assert_eq!(chunk.data(), &data);
        assert!(!chunk.is_final());
        assert!(chunk.checksum().is_none());
    }

    /// Tests file chunk immutability and data modification behavior.
    ///
    /// This test validates that file chunks are immutable and that
    /// data modifications create new chunk instances while preserving
    /// the original chunk unchanged.
    ///
    /// # Test Coverage
    ///
    /// - File chunk immutability
    /// - Data modification with `with_data()`
    /// - Original chunk preservation
    /// - New chunk creation
    /// - Unique ID generation for modified chunks
    ///
    /// # Test Scenario
    ///
    /// Creates a file chunk, modifies its data using `with_data()`,
    /// then verifies the original chunk is unchanged and the new
    /// chunk has different data and ID.
    ///
    /// # Assertions
    ///
    /// - Original chunk data is unchanged
    /// - New chunk has modified data
    /// - Chunk IDs are different
    /// - Immutability is preserved
    #[test]
    fn test_file_chunk_immutability() {
        let data = vec![42u8; ChunkSize::MIN_SIZE];
        let chunk1 = FileChunk::new(0, 0, data.clone(), false).unwrap();

        // Creating new chunk with different data
        let new_data = vec![99u8; ChunkSize::MIN_SIZE];
        let chunk2 = chunk1.with_data(new_data.clone()).unwrap();

        // Original chunk unchanged
        assert_eq!(chunk1.data(), &data);
        assert_eq!(chunk2.data(), &new_data);
        assert_ne!(chunk1.id(), chunk2.id()); // Different IDs
    }

    /// Tests file chunk checksum addition and preservation.
    ///
    /// This test validates that checksums can be added to file chunks
    /// while preserving the original chunk and maintaining the same
    /// chunk ID for checksum-only modifications.
    ///
    /// # Test Coverage
    ///
    /// - Checksum addition with `with_checksum()`
    /// - Original chunk preservation
    /// - Checksum storage and retrieval
    /// - ID preservation for checksum addition
    /// - Checksum immutability
    ///
    /// # Test Scenario
    ///
    /// Creates a file chunk without checksum, adds a checksum using
    /// `with_checksum()`, then verifies the original chunk is unchanged
    /// and the new chunk has the checksum with the same ID.
    ///
    /// # Assertions
    ///
    /// - Original chunk has no checksum
    /// - New chunk has the specified checksum
    /// - Chunk IDs are the same (checksum addition preserves ID)
    /// - Checksum is stored correctly
    #[test]
    fn test_file_chunk_with_checksum() {
        let data = vec![42u8; ChunkSize::MIN_SIZE];
        let chunk1 = FileChunk::new(0, 0, data, false).unwrap();
        let chunk2 = chunk1.with_checksum("test_hash".to_string());

        // Original chunk unchanged
        assert!(chunk1.checksum().is_none());
        assert_eq!(chunk2.checksum(), Some("test_hash"));
        assert_eq!(chunk1.id(), chunk2.id()); // Same ID for checksum addition
    }

    /// Tests file chunk automatic checksum calculation.
    ///
    /// This test validates that file chunks can automatically
    /// calculate checksums from their data and that the calculated
    /// checksum matches manual calculation.
    ///
    /// # Test Coverage
    ///
    /// - Automatic checksum calculation with `with_calculated_checksum()`
    /// - Manual checksum calculation with `calculate_checksum()`
    /// - Checksum accuracy verification
    /// - Original chunk preservation
    /// - Checksum consistency
    ///
    /// # Test Scenario
    ///
    /// Creates a file chunk, calculates its checksum automatically,
    /// then verifies the checksum matches manual calculation.
    ///
    /// # Assertions
    ///
    /// - Original chunk has no checksum
    /// - New chunk has calculated checksum
    /// - Calculated checksum matches manual calculation
    /// - Checksum calculation is accurate
    #[test]
    fn test_file_chunk_calculated_checksum() {
        let data = vec![42u8; ChunkSize::MIN_SIZE];
        let chunk = FileChunk::new(0, 0, data, false).unwrap();
        let chunk_with_checksum = chunk.with_calculated_checksum().unwrap();

        assert!(chunk.checksum().is_none());
        assert!(chunk_with_checksum.checksum().is_some());

        // Verify the checksum is correct
        let calculated = chunk.calculate_checksum().unwrap();
        assert_eq!(chunk_with_checksum.checksum(), Some(calculated.as_str()));
    }

    /// Tests file chunk integrity verification.
    ///
    /// This test validates that file chunks can verify their data
    /// integrity using checksums and that verification fails
    /// appropriately when checksums are missing.
    ///
    /// # Test Coverage
    ///
    /// - Integrity verification with `verify_integrity()`
    /// - Successful verification with valid checksum
    /// - Failed verification without checksum
    /// - Checksum-based data validation
    /// - Error handling for missing checksums
    ///
    /// # Test Scenario
    ///
    /// Creates a file chunk with calculated checksum and verifies
    /// integrity passes, then tests that chunks without checksums
    /// fail verification.
    ///
    /// # Assertions
    ///
    /// - Chunk with checksum passes integrity verification
    /// - Chunk without checksum fails integrity verification
    /// - Verification logic works correctly
    /// - Error handling is appropriate
    #[test]
    fn test_file_chunk_verify_integrity() {
        let data = vec![42u8; ChunkSize::MIN_SIZE];
        let chunk = FileChunk::new(0, 0, data, false).unwrap();
        let chunk_with_checksum = chunk.with_calculated_checksum().unwrap();

        // Should verify successfully
        assert!(chunk_with_checksum.verify_integrity().unwrap());

        // Chunk without checksum should error
        assert!(chunk.verify_integrity().is_err());
    }

    /// Tests file chunk rejection of empty data.
    ///
    /// This test validates that file chunks reject empty data
    /// during creation, ensuring all chunks contain meaningful
    /// data for processing.
    ///
    /// # Test Coverage
    ///
    /// - Empty data rejection during creation
    /// - Validation error handling
    /// - Minimum data requirements
    /// - Input validation
    /// - Error message clarity
    ///
    /// # Test Scenario
    ///
    /// Attempts to create a file chunk with empty data and
    /// verifies that creation fails with appropriate error.
    ///
    /// # Assertions
    ///
    /// - Empty data creation fails
    /// - Error is returned appropriately
    /// - Validation prevents invalid chunks
    /// - Input requirements are enforced
    #[test]
    fn test_empty_data_rejection() {
        let result = FileChunk::new(0, 0, vec![], false);
        assert!(result.is_err());
    }
}