Skip to main content

shadow_storage/
chunking.rs

1//! File chunking for distributed storage
2
3use shadow_core::error::{Result, ShadowError};
4use bytes::Bytes;
5use crypto::hash;
6
/// Metadata describing one chunk produced by splitting a larger payload.
///
/// Equality and hashing cover every field, so two values are equal only when
/// position, digest, and length all match.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct ChunkInfo {
    /// Zero-based position of the chunk within the original data
    pub index: usize,
    /// Chunk hash (BLAKE3)
    pub hash: [u8; 32],
    /// Chunk size in bytes
    pub size: usize,
}
17
/// File chunker that splits data around a target chunk size.
///
/// NOTE(review): the `chunk` method in this file cuts at fixed `chunk_size`
/// offsets and does not consult `min_size` / `max_size`; confirm whether
/// content-defined boundaries are still planned before relying on them.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Chunker {
    /// Target chunk size in bytes
    chunk_size: usize,
    /// Minimum chunk size in bytes
    min_size: usize,
    /// Maximum chunk size in bytes
    max_size: usize,
}
27
28impl Chunker {
29    /// Create new chunker with target size
30    pub fn new(chunk_size: usize) -> Self {
31        Self {
32            chunk_size,
33            min_size: chunk_size / 4,
34            max_size: chunk_size * 4,
35        }
36    }
37
38    /// Split data into chunks
39    pub fn chunk(&self, data: &[u8]) -> Result<Vec<(ChunkInfo, Bytes)>> {
40        let mut chunks = Vec::new();
41        let mut offset = 0;
42        let mut index = 0;
43
44        while offset < data.len() {
45            let remaining = data.len() - offset;
46            let chunk_size = remaining.min(self.chunk_size);
47            
48            let chunk_data = &data[offset..offset + chunk_size];
49            let chunk_hash = *hash::hash_data(chunk_data).as_bytes();
50            
51            let info = ChunkInfo {
52                index,
53                hash: chunk_hash,
54                size: chunk_size,
55            };
56            
57            chunks.push((info, Bytes::copy_from_slice(chunk_data)));
58            
59            offset += chunk_size;
60            index += 1;
61        }
62
63        Ok(chunks)
64    }
65
66    /// Reassemble chunks into original data
67    pub fn reassemble(&self, chunks: &[(ChunkInfo, Bytes)]) -> Result<Bytes> {
68        // Sort chunks by index
69        let mut sorted = chunks.to_vec();
70        sorted.sort_by_key(|(info, _)| info.index);
71
72        // Verify hashes and concatenate
73        let mut result = Vec::new();
74        
75        for (info, chunk_data) in sorted {
76            // Verify hash
77            let computed_hash = *hash::hash_data(&chunk_data).as_bytes();
78            if computed_hash != info.hash {
79                return Err(ShadowError::Storage(format!(
80                    "Chunk {} hash mismatch", info.index
81                )));
82            }
83            
84            result.extend_from_slice(&chunk_data);
85        }
86
87        Ok(Bytes::from(result))
88    }
89
90    /// Calculate content hash (root hash)
91    pub fn content_hash(chunks: &[ChunkInfo]) -> [u8; 32] {
92        let mut combined = Vec::new();
93        for chunk in chunks {
94            combined.extend_from_slice(&chunk.hash);
95        }
96        *hash::hash_data(&combined).as_bytes()
97    }
98}
99
100impl Default for Chunker {
101    fn default() -> Self {
102        Self::new(256 * 1024) // 256 KB default
103    }
104}
105
#[cfg(test)]
mod tests {
    use super::*;

    /// Input that is not a multiple of the target size ends in a short chunk.
    #[test]
    fn test_chunking() {
        let chunker = Chunker::new(100);
        let data = vec![1u8; 250];

        let chunks = chunker.chunk(&data).unwrap();

        // Should create 3 chunks (100 + 100 + 50)
        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].0.size, 100);
        assert_eq!(chunks[1].0.size, 100);
        assert_eq!(chunks[2].0.size, 50);

        // Indices follow the order of the data, and each payload matches its
        // recorded size.
        for (position, (info, payload)) in chunks.iter().enumerate() {
            assert_eq!(info.index, position);
            assert_eq!(payload.len(), info.size);
        }
    }

    /// Empty input produces no chunks at all.
    #[test]
    fn test_chunking_empty_input() {
        let chunker = Chunker::new(100);

        let chunks = chunker.chunk(&[]).unwrap();

        assert!(chunks.is_empty());
    }

    /// Chunking followed by reassembly returns the original bytes.
    #[test]
    fn test_reassembly() {
        let chunker = Chunker::new(100);
        let original = vec![1u8, 2, 3, 4, 5, 6, 7, 8, 9, 10];

        let chunks = chunker.chunk(&original).unwrap();
        let reassembled = chunker.reassemble(&chunks).unwrap();

        assert_eq!(reassembled.as_ref(), original.as_slice());
    }

    /// Reassembly orders chunks by index, so shuffled input still round-trips.
    #[test]
    fn test_reassembly_out_of_order() {
        let chunker = Chunker::new(3);
        let original = vec![1u8, 2, 3, 4, 5, 6, 7, 8, 9, 10];

        let mut chunks = chunker.chunk(&original).unwrap();
        assert_eq!(chunks.len(), 4);
        chunks.reverse();

        let reassembled = chunker.reassemble(&chunks).unwrap();

        assert_eq!(reassembled.as_ref(), original.as_slice());
    }

    /// A recorded hash that no longer matches the payload is rejected.
    #[test]
    fn test_reassembly_detects_hash_mismatch() {
        let chunker = Chunker::new(4);
        let original = vec![1u8, 2, 3, 4, 5, 6, 7, 8];

        let mut chunks = chunker.chunk(&original).unwrap();
        // Flip bits in the stored digest so it cannot match the payload.
        chunks[0].0.hash[0] ^= 0xFF;

        assert!(chunker.reassemble(&chunks).is_err());
    }

    /// The root hash is deterministic and sensitive to content and order.
    #[test]
    fn test_content_hash() {
        let chunker = Chunker::new(50);
        let data = vec![1u8; 100];

        let chunks = chunker.chunk(&data).unwrap();
        let chunk_infos: Vec<ChunkInfo> = chunks.iter().map(|(info, _)| info.clone()).collect();

        let hash1 = Chunker::content_hash(&chunk_infos);
        let hash2 = Chunker::content_hash(&chunk_infos);

        assert_eq!(hash1, hash2);

        // Different content must yield a different root hash.
        let other_data = vec![2u8; 100];
        let other_infos: Vec<ChunkInfo> = chunker
            .chunk(&other_data)
            .unwrap()
            .into_iter()
            .map(|(info, _)| info)
            .collect();
        assert_ne!(hash1, Chunker::content_hash(&other_infos));

        // Two distinct chunks hashed in swapped order give a different root.
        let mut mixed = vec![1u8; 50];
        mixed.extend_from_slice(&[2u8; 50]);
        let mut mixed_infos: Vec<ChunkInfo> = chunker
            .chunk(&mixed)
            .unwrap()
            .into_iter()
            .map(|(info, _)| info)
            .collect();
        let forward = Chunker::content_hash(&mixed_infos);
        mixed_infos.reverse();
        assert_ne!(forward, Chunker::content_hash(&mixed_infos));
    }
}