adaptive_pipeline_domain/value_objects/file_chunk.rs
// /////////////////////////////////////////////////////////////////////////////
// Adaptive Pipeline
// Copyright (c) 2025 Michael Gardner, A Bit of Help, Inc.
// SPDX-License-Identifier: BSD-3-Clause
// See LICENSE file in the project root.
// /////////////////////////////////////////////////////////////////////////////

//! # File Chunk Value Object
//!
//! This module provides the `FileChunk` value object, which represents an
//! immutable chunk of file data for processing within the adaptive pipeline
//! system. It follows Domain-Driven Design principles and ensures data
//! integrity throughout processing.
//!
//! ## Overview
//!
//! The file chunk value object provides:
//!
//! - **Immutable Data**: Once created, chunks cannot be modified
//! - **Unique Identity**: Each chunk has a unique UUID for tracking
//! - **Sequence Ordering**: Chunks maintain sequence numbers for reassembly
//! - **Integrity Verification**: Optional checksums for data integrity
//! - **Metadata Tracking**: Creation timestamps and processing metadata
//!
//! ## Design Principles
//!
//! The file chunk follows Domain-Driven Design value object principles:
//!
//! - **Immutability**: Once created, chunks cannot be modified
//! - **Value Semantics**: Chunks are compared by value, not identity
//! - **Self-Validation**: Chunks validate their own data integrity
//! - **Rich Behavior**: Chunks provide methods for common operations
//!
//! ## Chunk Structure
//!
//! ### Core Data
//! - **ID**: Unique UUID for chunk identification and tracking
//! - **Sequence Number**: Position in the original file for reassembly
//! - **Offset**: Byte offset in the original file
//! - **Size**: Validated chunk size within system limits
//! - **Data**: The actual chunk data bytes
//!
//! ### Metadata
//! - **Checksum**: Optional SHA-256 checksum for integrity verification
//! - **Is Final**: Flag indicating if this is the last chunk in a file
//! - **Created At**: UTC timestamp of chunk creation
//!
//! ## Usage Examples
//!
//! ### Basic Chunk Creation
//!
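//! An illustrative sketch (not compiled as a doctest); it assumes the crate is
//! named `adaptive_pipeline_domain`, re-exports `FileChunk` and `ChunkSize`
//! from its root, and that `ChunkSize::MIN_SIZE` passes size validation:
//!
//! ```rust,ignore
//! use adaptive_pipeline_domain::{ChunkSize, FileChunk};
//!
//! // First chunk of a file: sequence 0, offset 0, not final.
//! let data = vec![0u8; ChunkSize::MIN_SIZE];
//! let chunk = FileChunk::new(0, 0, data, false).unwrap();
//!
//! assert_eq!(chunk.sequence_number(), 0);
//! assert_eq!(chunk.offset(), 0);
//! assert!(chunk.checksum().is_none());
//! ```
//!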
//! ### Chunk with Checksum
//!
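//! A sketch of attaching an integrity checksum, under the same assumptions as
//! above:
//!
//! ```rust,ignore
//! use adaptive_pipeline_domain::{ChunkSize, FileChunk};
//!
//! let data = vec![1u8; ChunkSize::MIN_SIZE];
//! let chunk = FileChunk::new(0, 0, data, false).unwrap();
//!
//! // Builder-style: returns a new chunk with a SHA-256 checksum attached.
//! let chunk = chunk.with_calculated_checksum().unwrap();
//! assert!(chunk.checksum().is_some());
//! ```
//!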
//! ### Chunk Processing Chain
//!
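//! A sketch of how a processing stage might derive a new chunk from an
//! existing one; `transformed` stands in for whatever bytes a real stage would
//! produce:
//!
//! ```rust,ignore
//! use adaptive_pipeline_domain::{ChunkSize, FileChunk};
//!
//! let original = FileChunk::new(0, 0, vec![1u8; ChunkSize::MIN_SIZE], false).unwrap();
//!
//! // Hypothetical stage output; the original chunk is left untouched.
//! let transformed = vec![2u8; ChunkSize::MIN_SIZE];
//! let processed = original
//!     .with_data(transformed)
//!     .unwrap()
//!     .with_calculated_checksum()
//!     .unwrap();
//!
//! assert_ne!(original.id(), processed.id()); // new data => new chunk identity
//! ```
//!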
//! ## Chunk Validation
//!
//! ### Data Integrity
//!
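//! A sketch of verifying chunk data against its stored checksum:
//!
//! ```rust,ignore
//! use adaptive_pipeline_domain::{ChunkSize, FileChunk};
//!
//! let chunk = FileChunk::new(0, 0, vec![7u8; ChunkSize::MIN_SIZE], false)
//!     .unwrap()
//!     .with_calculated_checksum()
//!     .unwrap();
//!
//! // Recomputes the SHA-256 hash and compares it to the stored checksum.
//! assert!(chunk.verify_integrity().unwrap());
//! ```
//!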
//! ### Sequence Validation
//!
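//! A sketch of ordering chunks by sequence number before reassembly, e.g.
//! after parallel processing returns them out of order:
//!
//! ```rust,ignore
//! use adaptive_pipeline_domain::{ChunkSize, FileChunk};
//!
//! let first = FileChunk::new(0, 0, vec![1u8; ChunkSize::MIN_SIZE], false).unwrap();
//! let second =
//!     FileChunk::new(1, ChunkSize::MIN_SIZE as u64, vec![2u8; ChunkSize::MIN_SIZE], true).unwrap();
//!
//! let mut chunks = vec![second, first];
//! chunks.sort_by_key(|c| c.sequence_number());
//!
//! assert_eq!(chunks[0].sequence_number(), 0);
//! assert!(chunks[1].is_final());
//! ```
//!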
//! ## Performance Considerations
//!
//! ### Memory Usage
//!
//! - **Data Storage**: Chunks store data in `Vec<u8>` for efficient access
//! - **Metadata Overhead**: Minimal metadata overhead per chunk
//! - **Cloning**: Cloning copies the chunk data; wrap in `Arc` for cheap sharing
//!
//! ### Processing Efficiency
//!
//! - **Immutable Design**: Prevents accidental mutations during processing
//! - **Builder Pattern**: Efficient creation of modified chunks
//! - **Lazy Checksum**: Checksums are calculated only when needed
//!
//! ### Memory Management
//!
//! - **Automatic Cleanup**: Chunks are automatically cleaned up when dropped
//! - **Reference Counting**: Use `Arc<FileChunk>` for shared ownership
//! - **Streaming**: Chunks can be processed in streaming fashion
//!
//! ## Thread Safety
//!
//! The file chunk is fully thread-safe:
//!
//! - **Immutable**: Once created, chunks cannot be modified
//! - **Send + Sync**: Chunks can be safely sent between threads
//! - **No Shared State**: No mutable shared state to synchronize
//!
//! ## Serialization
//!
//! ### JSON Serialization
//!
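//! A sketch of a JSON round trip via the derived `Serialize`/`Deserialize`
//! implementations; the `serde_json` dependency is assumed here:
//!
//! ```rust,ignore
//! use adaptive_pipeline_domain::{ChunkSize, FileChunk};
//!
//! let chunk = FileChunk::new(0, 0, vec![5u8; ChunkSize::MIN_SIZE], false).unwrap();
//!
//! let json = serde_json::to_string(&chunk).unwrap();
//! let restored: FileChunk = serde_json::from_str(&json).unwrap();
//! assert_eq!(chunk, restored);
//! ```
//!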
//! ### Binary Serialization
//!
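//! A sketch of a compact binary round trip; `bincode` is shown purely as an
//! assumed example of a binary serde format:
//!
//! ```rust,ignore
//! use adaptive_pipeline_domain::{ChunkSize, FileChunk};
//!
//! let chunk = FileChunk::new(0, 0, vec![5u8; ChunkSize::MIN_SIZE], true).unwrap();
//!
//! let bytes = bincode::serialize(&chunk).unwrap();
//! let restored: FileChunk = bincode::deserialize(&bytes).unwrap();
//! assert_eq!(chunk, restored);
//! ```
//!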
//! ## Integration
//!
//! The file chunk integrates with:
//!
//! - **File Processing**: Core unit of file processing operations
//! - **Pipeline Stages**: Passed between processing stages
//! - **Storage Systems**: Serialized for persistent storage
//! - **Network Transport**: Transmitted between distributed components
//!
//! ## Error Handling
//!
//! ### Validation Errors
//!
//! - **Invalid Size**: Chunk size outside valid bounds
//! - **Invalid Data**: Corrupted or invalid chunk data
//! - **Checksum Mismatch**: Data integrity verification failures
//! - **Sequence Errors**: Invalid sequence numbers or ordering
//!
//! ### Recovery Strategies
//!
//! - **Retry Logic**: Automatic retry for transient failures
//! - **Fallback Processing**: Alternative processing for corrupted chunks
//! - **Error Reporting**: Detailed error context for debugging
//!
//! ## Future Enhancements
//!
//! Planned enhancements include:
//!
//! - **Compression**: Built-in compression for chunk data
//! - **Encryption**: Encrypted chunk data for security
//! - **Streaming**: Streaming chunk processing for large files
//! - **Caching**: Intelligent caching of frequently accessed chunks

use crate::services::datetime_serde;
use crate::{ChunkSize, PipelineError};
use hex;
use serde::{Deserialize, Serialize};
use sha2::{Digest, Sha256};
use uuid::Uuid;

/// Represents an immutable chunk of file data for processing
///
/// This is a Value Object in Domain-Driven Design terms - it represents data
/// without identity that cannot be modified once created. Any "changes" create
/// new instances, ensuring data integrity and preventing accidental mutations
/// during processing.
///
/// # Key Features
///
/// - **Immutability**: Once created, chunks cannot be modified
/// - **Unique Identity**: Each chunk has a UUID for tracking and identification
/// - **Sequence Ordering**: Maintains sequence numbers for proper file
///   reassembly
/// - **Integrity Verification**: Optional checksums for data integrity
///   validation
/// - **Metadata Tracking**: Creation timestamps and processing metadata
///
/// # Design Principles
///
/// - **Value Object**: Compared by value, not identity
/// - **Self-Validation**: Validates its own data integrity
/// - **Builder Pattern**: Use methods like `with_checksum()` for modifications
/// - **Thread Safety**: Fully thread-safe due to immutability
///
/// # Examples
///
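/// An illustrative sketch (not compiled as a doctest); it assumes `FileChunk`
/// and `ChunkSize` are re-exported from the crate root and that
/// `ChunkSize::MIN_SIZE` passes size validation:
///
/// ```rust,ignore
/// use adaptive_pipeline_domain::{ChunkSize, FileChunk};
///
/// let chunk = FileChunk::new(0, 0, vec![0u8; ChunkSize::MIN_SIZE], false).unwrap();
///
/// // "Modifying" a value object produces a new instance.
/// let checked = chunk.with_calculated_checksum().unwrap();
/// assert!(chunk.checksum().is_none());
/// assert!(checked.checksum().is_some());
/// ```
///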
/// # Developer Notes
///
/// - Use builder methods like `with_checksum()` to create modified versions
/// - Processing stages should create new chunks rather than modifying existing
///   ones
/// - This design prevents data corruption and ensures thread safety
#[derive(Debug, Clone, Serialize, Deserialize, PartialEq, Eq)]
pub struct FileChunk {
    id: Uuid,
    sequence_number: u64,
    offset: u64,
    size: ChunkSize,
    data: Vec<u8>,
    checksum: Option<String>,
    is_final: bool,
    #[serde(with = "datetime_serde")]
    created_at: chrono::DateTime<chrono::Utc>,
}

impl FileChunk {
    /// Creates a new file chunk
    ///
    /// # Purpose
    /// Creates an immutable file chunk value object for pipeline processing.
    /// Chunks are the fundamental unit of file processing in the adaptive
    /// pipeline.
    ///
    /// # Why
    /// File chunking enables:
    /// - Parallel processing of large files
    /// - Memory-efficient streaming
    /// - Independent processing units
    /// - Granular error recovery
    ///
    /// # Arguments
    /// * `sequence_number` - The order of this chunk in the file (0-based)
    /// * `offset` - Byte offset in the original file where this chunk starts
    /// * `data` - The actual chunk data bytes (must not be empty)
    /// * `is_final` - Whether this is the last chunk in the file
    ///
    /// # Returns
    /// * `Ok(FileChunk)` - Successfully created chunk with unique UUID
    /// * `Err(PipelineError::InvalidChunk)` - Data is empty
    ///
    /// # Errors
    /// Returns `PipelineError::InvalidChunk` when data is empty. Size
    /// validation errors from `ChunkSize::new` are propagated unchanged.
    ///
    /// # Side Effects
    /// - Generates new UUID for chunk identification
    /// - Sets creation timestamp to current UTC time
    /// - Calculates chunk size from data length
    ///
    /// # Examples
    ///
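    /// An illustrative sketch (not compiled as a doctest); assumes
    /// `ChunkSize::MIN_SIZE` is within the configured limits:
    ///
    /// ```rust,ignore
    /// use adaptive_pipeline_domain::{ChunkSize, FileChunk};
    ///
    /// let chunk = FileChunk::new(0, 0, vec![0u8; ChunkSize::MIN_SIZE], false).unwrap();
    /// assert_eq!(chunk.offset(), 0);
    /// assert!(!chunk.is_final());
    ///
    /// // Empty data is rejected.
    /// assert!(FileChunk::new(1, 0, Vec::new(), false).is_err());
    /// ```
    ///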
    /// # Developer Notes
    /// - Each chunk gets a unique UUID for tracking across pipeline stages
    /// - Chunk size is automatically validated against system limits
    /// - Checksum is initially None - use `with_calculated_checksum()` to add
    /// - This is a Value Object - create new instances for "changes"
    pub fn new(sequence_number: u64, offset: u64, data: Vec<u8>, is_final: bool) -> Result<Self, PipelineError> {
        if data.is_empty() {
            return Err(PipelineError::InvalidChunk("Chunk data cannot be empty".to_string()));
        }

        let size = ChunkSize::new(data.len())?;

        Ok(FileChunk {
            id: Uuid::new_v4(),
            sequence_number,
            offset,
            size,
            data,
            checksum: None,
            is_final,
            created_at: chrono::Utc::now(),
        })
    }

    /// Creates a new file chunk with checksum
    ///
    /// # Developer Notes
    /// - This is a convenience constructor for chunks that already have
    ///   checksums
    /// - Prefer using `new()` followed by `with_checksum()` for clarity
    pub fn new_with_checksum(
        sequence_number: u64,
        offset: u64,
        data: Vec<u8>,
        checksum: String,
        is_final: bool,
    ) -> Result<Self, PipelineError> {
        let chunk = Self::new(sequence_number, offset, data, is_final)?;
        Ok(chunk.with_checksum(checksum))
    }

    // === Immutable Accessors ===

    /// Gets the chunk ID
    pub fn id(&self) -> Uuid {
        self.id
    }

    /// Gets the sequence number
    pub fn sequence_number(&self) -> u64 {
        self.sequence_number
    }

    /// Gets the offset in the original file
    pub fn offset(&self) -> u64 {
        self.offset
    }

    /// Gets the chunk size
    pub fn size(&self) -> &ChunkSize {
        &self.size
    }

    /// Gets the chunk data (immutable reference)
    pub fn data(&self) -> &[u8] {
        &self.data
    }

    /// Gets the checksum if available
    pub fn checksum(&self) -> Option<&str> {
        self.checksum.as_deref()
    }

    /// Checks if this is the final chunk
    pub fn is_final(&self) -> bool {
        self.is_final
    }

    /// Gets the creation timestamp
    pub fn created_at(&self) -> chrono::DateTime<chrono::Utc> {
        self.created_at
    }

    /// Gets the actual data length
    pub fn data_len(&self) -> usize {
        self.data.len()
    }

    /// Checks if the chunk is empty
    pub fn is_empty(&self) -> bool {
        self.data.is_empty()
    }

    // === Immutable Builder Pattern Methods ===

    /// Creates a new FileChunk with updated data
    ///
    /// # Developer Notes
    /// - This creates a completely new chunk instance
    /// - The old chunk remains unchanged (immutability)
    /// - Checksum is cleared since data changed
    /// - Use this pattern: `let new_chunk = old_chunk.with_data(new_data).unwrap();`
    pub fn with_data(&self, data: Vec<u8>) -> Result<Self, PipelineError> {
        if data.is_empty() {
            return Err(PipelineError::InvalidChunk("Chunk data cannot be empty".to_string()));
        }

        let size = ChunkSize::new(data.len())?;

        Ok(FileChunk {
            id: Uuid::new_v4(), // New chunk gets new ID
            sequence_number: self.sequence_number,
            offset: self.offset,
            size,
            data,
            checksum: None, // Clear checksum when data changes
            is_final: self.is_final,
            created_at: chrono::Utc::now(), // New creation time
        })
    }

    /// Creates a new FileChunk with a checksum
    ///
    /// # Developer Notes
    /// - This preserves all other data and adds/updates the checksum
    /// - Use this after processing: `let verified_chunk = chunk.with_checksum(hash);`
    pub fn with_checksum(&self, checksum: String) -> Self {
        FileChunk {
            id: self.id,
            sequence_number: self.sequence_number,
            offset: self.offset,
            size: self.size,
            data: self.data.clone(),
            checksum: Some(checksum),
            is_final: self.is_final,
            created_at: self.created_at,
        }
    }

    /// Creates a new FileChunk with calculated SHA-256 checksum
    ///
    /// # Developer Notes
    /// - Calculates SHA-256 hash of current data
    /// - Returns new chunk with checksum set
    /// - Original chunk remains unchanged
    pub fn with_calculated_checksum(&self) -> Result<Self, PipelineError> {
        let mut hasher = Sha256::new();
        hasher.update(&self.data);
        let digest = hasher.finalize();
        let checksum = hex::encode(digest);
        Ok(self.with_checksum(checksum))
    }

    /// Creates a new FileChunk without data (for security)
    ///
    /// # Developer Notes
    /// - Creates new chunk with empty data vector
    /// - Useful for secure cleanup while preserving metadata
    /// - Checksum is cleared since data is gone
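    ///
    /// # Examples
    ///
    /// An illustrative sketch, assuming some existing `chunk` value; metadata
    /// survives while the data and checksum are dropped:
    ///
    /// ```rust,ignore
    /// let stripped = chunk.without_data();
    /// assert!(stripped.is_empty());
    /// assert_eq!(stripped.sequence_number(), chunk.sequence_number());
    /// assert!(stripped.checksum().is_none());
    /// ```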
    pub fn without_data(&self) -> Self {
        FileChunk {
            id: self.id,
            sequence_number: self.sequence_number,
            offset: self.offset,
            // Empty chunk - ChunkSize::new(0) should never fail, but handle it safely
            size: ChunkSize::new(0).unwrap_or_else(|_| ChunkSize::default()),
            data: Vec::new(),
            checksum: None, // Clear checksum
            is_final: self.is_final,
            created_at: self.created_at,
        }
    }

    // === Verification Methods (Read-Only) ===

    /// Verifies the chunk integrity using the stored checksum
    ///
    /// # Purpose
    /// Validates that chunk data has not been corrupted by comparing the stored
    /// SHA-256 checksum against a freshly calculated hash of the current data.
    ///
    /// # Why
    /// Integrity verification provides:
    /// - Detection of data corruption during processing or storage
    /// - Confidence in pipeline operations
    /// - Early error detection before expensive operations
    /// - Compliance with data integrity requirements
    ///
    /// # Returns
    /// * `Ok(true)` - Checksum matches, data is intact
    /// * `Ok(false)` - Checksum mismatch, data corrupted
    /// * `Err(PipelineError::InvalidChunk)` - No checksum available
    ///
    /// # Errors
    /// Returns `PipelineError::InvalidChunk` when the chunk has no stored
    /// checksum.
    ///
    /// # Examples
    ///
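    /// An illustrative sketch (not compiled as a doctest); assumes
    /// `ChunkSize::MIN_SIZE` passes size validation:
    ///
    /// ```rust,ignore
    /// use adaptive_pipeline_domain::{ChunkSize, FileChunk};
    ///
    /// let chunk = FileChunk::new(0, 0, vec![9u8; ChunkSize::MIN_SIZE], false)
    ///     .unwrap()
    ///     .with_calculated_checksum()
    ///     .unwrap();
    /// assert!(chunk.verify_integrity().unwrap());
    ///
    /// // A chunk without a stored checksum cannot be verified.
    /// let unchecked = FileChunk::new(0, 0, vec![9u8; ChunkSize::MIN_SIZE], false).unwrap();
    /// assert!(unchecked.verify_integrity().is_err());
    /// ```
    ///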
    /// # Developer Notes
    /// - This method is read-only and doesn't modify the chunk
    /// - Use before critical processing to ensure data integrity
    /// - Consider verification before expensive operations like encryption
    pub fn verify_integrity(&self) -> Result<bool, PipelineError> {
        if let Some(stored_checksum) = &self.checksum {
            let mut hasher = Sha256::new();
            hasher.update(&self.data);
            let digest = hasher.finalize();
            let calculated_checksum = hex::encode(digest);
            Ok(calculated_checksum == *stored_checksum)
        } else {
            Err(PipelineError::InvalidChunk(
                "No checksum available for verification".to_string(),
            ))
        }
    }

    /// Calculates SHA-256 checksum without modifying the chunk
    ///
    /// # Developer Notes
    /// - This is a pure function - doesn't modify the chunk
    /// - Use when you need the checksum but don't want to create a new chunk
    /// - For creating a chunk with checksum, use `with_calculated_checksum()`
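    ///
    /// # Examples
    ///
    /// An illustrative sketch (not compiled as a doctest); assumes
    /// `ChunkSize::MIN_SIZE` passes size validation:
    ///
    /// ```rust,ignore
    /// use adaptive_pipeline_domain::{ChunkSize, FileChunk};
    ///
    /// let chunk = FileChunk::new(0, 0, vec![3u8; ChunkSize::MIN_SIZE], false).unwrap();
    /// let digest = chunk.calculate_checksum().unwrap();
    ///
    /// assert_eq!(digest.len(), 64); // hex-encoded SHA-256
    /// assert!(chunk.checksum().is_none()); // the chunk itself is unchanged
    /// ```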
    pub fn calculate_checksum(&self) -> Result<String, PipelineError> {
        let mut hasher = Sha256::new();
        hasher.update(&self.data);
        let digest = hasher.finalize();
        Ok(hex::encode(digest))
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    /// Tests file chunk creation with basic properties.
    ///
    /// This test validates that file chunks can be created with
    /// required properties and that all metadata is properly
    /// stored and accessible.
    ///
    /// # Test Coverage
    ///
    /// - File chunk creation with minimum size requirement
    /// - Sequence number assignment
    /// - Offset position tracking
    /// - Data storage and retrieval
    /// - Final chunk flag handling
    /// - Checksum initialization (none by default)
    ///
    /// # Test Scenario
    ///
    /// Creates a file chunk with test data meeting minimum size
    /// requirements and verifies all properties are set correctly.
    ///
    /// # Assertions
    ///
    /// - Sequence number matches input
    /// - Offset matches input
    /// - Data is stored correctly
    /// - Final flag is set correctly
    /// - Checksum is initially None
    #[test]
    fn test_file_chunk_creation() {
        // Create test data that meets minimum chunk size requirement (1MB)
        let data = vec![42u8; ChunkSize::MIN_SIZE];
        let chunk = FileChunk::new(0, 0, data.clone(), false).unwrap();

        assert_eq!(chunk.sequence_number(), 0);
        assert_eq!(chunk.offset(), 0);
        assert_eq!(chunk.data(), &data);
        assert!(!chunk.is_final());
        assert!(chunk.checksum().is_none());
    }

    /// Tests file chunk immutability and data modification behavior.
    ///
    /// This test validates that file chunks are immutable and that
    /// data modifications create new chunk instances while preserving
    /// the original chunk unchanged.
    ///
    /// # Test Coverage
    ///
    /// - File chunk immutability
    /// - Data modification with `with_data()`
    /// - Original chunk preservation
    /// - New chunk creation
    /// - Unique ID generation for modified chunks
    ///
    /// # Test Scenario
    ///
    /// Creates a file chunk, modifies its data using `with_data()`,
    /// then verifies the original chunk is unchanged and the new
    /// chunk has different data and ID.
    ///
    /// # Assertions
    ///
    /// - Original chunk data is unchanged
    /// - New chunk has modified data
    /// - Chunk IDs are different
    /// - Immutability is preserved
    #[test]
    fn test_file_chunk_immutability() {
        let data = vec![42u8; ChunkSize::MIN_SIZE];
        let chunk1 = FileChunk::new(0, 0, data.clone(), false).unwrap();

        // Creating new chunk with different data
        let new_data = vec![99u8; ChunkSize::MIN_SIZE];
        let chunk2 = chunk1.with_data(new_data.clone()).unwrap();

        // Original chunk unchanged
        assert_eq!(chunk1.data(), &data);
        assert_eq!(chunk2.data(), &new_data);
        assert_ne!(chunk1.id(), chunk2.id()); // Different IDs
    }

    /// Tests file chunk checksum addition and preservation.
    ///
    /// This test validates that checksums can be added to file chunks
    /// while preserving the original chunk and maintaining the same
    /// chunk ID for checksum-only modifications.
    ///
    /// # Test Coverage
    ///
    /// - Checksum addition with `with_checksum()`
    /// - Original chunk preservation
    /// - Checksum storage and retrieval
    /// - ID preservation for checksum addition
    /// - Checksum immutability
    ///
    /// # Test Scenario
    ///
    /// Creates a file chunk without checksum, adds a checksum using
    /// `with_checksum()`, then verifies the original chunk is unchanged
    /// and the new chunk has the checksum with the same ID.
    ///
    /// # Assertions
    ///
    /// - Original chunk has no checksum
    /// - New chunk has the specified checksum
    /// - Chunk IDs are the same (checksum addition preserves ID)
    /// - Checksum is stored correctly
    #[test]
    fn test_file_chunk_with_checksum() {
        let data = vec![42u8; ChunkSize::MIN_SIZE];
        let chunk1 = FileChunk::new(0, 0, data, false).unwrap();
        let chunk2 = chunk1.with_checksum("test_hash".to_string());

        // Original chunk unchanged
        assert!(chunk1.checksum().is_none());
        assert_eq!(chunk2.checksum(), Some("test_hash"));
        assert_eq!(chunk1.id(), chunk2.id()); // Same ID for checksum addition
    }

    /// Tests file chunk automatic checksum calculation.
    ///
    /// This test validates that file chunks can automatically
    /// calculate checksums from their data and that the calculated
    /// checksum matches manual calculation.
    ///
    /// # Test Coverage
    ///
    /// - Automatic checksum calculation with `with_calculated_checksum()`
    /// - Manual checksum calculation with `calculate_checksum()`
    /// - Checksum accuracy verification
    /// - Original chunk preservation
    /// - Checksum consistency
    ///
    /// # Test Scenario
    ///
    /// Creates a file chunk, calculates its checksum automatically,
    /// then verifies the checksum matches manual calculation.
    ///
    /// # Assertions
    ///
    /// - Original chunk has no checksum
    /// - New chunk has calculated checksum
    /// - Calculated checksum matches manual calculation
    /// - Checksum calculation is accurate
    #[test]
    fn test_file_chunk_calculated_checksum() {
        let data = vec![42u8; ChunkSize::MIN_SIZE];
        let chunk = FileChunk::new(0, 0, data, false).unwrap();
        let chunk_with_checksum = chunk.with_calculated_checksum().unwrap();

        assert!(chunk.checksum().is_none());
        assert!(chunk_with_checksum.checksum().is_some());

        // Verify the checksum is correct
        let calculated = chunk.calculate_checksum().unwrap();
        assert_eq!(chunk_with_checksum.checksum(), Some(calculated.as_str()));
    }

    /// Tests file chunk integrity verification.
    ///
    /// This test validates that file chunks can verify their data
    /// integrity using checksums and that verification fails
    /// appropriately when checksums are missing.
    ///
    /// # Test Coverage
    ///
    /// - Integrity verification with `verify_integrity()`
    /// - Successful verification with valid checksum
    /// - Failed verification without checksum
    /// - Checksum-based data validation
    /// - Error handling for missing checksums
    ///
    /// # Test Scenario
    ///
    /// Creates a file chunk with calculated checksum and verifies
    /// integrity passes, then tests that chunks without checksums
    /// fail verification.
    ///
    /// # Assertions
    ///
    /// - Chunk with checksum passes integrity verification
    /// - Chunk without checksum fails integrity verification
    /// - Verification logic works correctly
    /// - Error handling is appropriate
    #[test]
    fn test_file_chunk_verify_integrity() {
        let data = vec![42u8; ChunkSize::MIN_SIZE];
        let chunk = FileChunk::new(0, 0, data, false).unwrap();
        let chunk_with_checksum = chunk.with_calculated_checksum().unwrap();

        // Should verify successfully
        assert!(chunk_with_checksum.verify_integrity().unwrap());

        // Chunk without checksum should error
        assert!(chunk.verify_integrity().is_err());
    }

    /// Tests file chunk rejection of empty data.
    ///
    /// This test validates that file chunks reject empty data
    /// during creation, ensuring all chunks contain meaningful
    /// data for processing.
    ///
    /// # Test Coverage
    ///
    /// - Empty data rejection during creation
    /// - Validation error handling
    /// - Minimum data requirements
    /// - Input validation
    /// - Error message clarity
    ///
    /// # Test Scenario
    ///
    /// Attempts to create a file chunk with empty data and
    /// verifies that creation fails with appropriate error.
    ///
    /// # Assertions
    ///
    /// - Empty data creation fails
    /// - Error is returned appropriately
    /// - Validation prevents invalid chunks
    /// - Input requirements are enforced
    #[test]
    fn test_empty_data_rejection() {
        let result = FileChunk::new(0, 0, vec![], false);
        assert!(result.is_err());
    }
}