Skip to main content

tensor_blob/
chunker.rs

1// SPDX-License-Identifier: MIT OR Apache-2.0
2use sha2::{Digest, Sha256};
3
/// A content-addressed chunk of data.
///
/// Equality and hashing are derived field-wise, so two chunks compare equal
/// only when their `hash`, `data` and `size` all match. All fields are public;
/// nothing in the type itself forces `hash` or `size` to agree with `data`.
#[derive(Debug, Clone, PartialEq, Eq, Hash)]
pub struct Chunk {
    /// Content hash in format "sha256:{hex}".
    pub hash: String,
    /// Raw chunk data.
    pub data: Vec<u8>,
    /// Size of the chunk in bytes.
    pub size: usize,
}
14
15impl Chunk {
16    #[must_use]
17    pub fn new(data: Vec<u8>) -> Self {
18        let hash = compute_hash(&data);
19        let size = data.len();
20        Self { hash, data, size }
21    }
22
23    #[must_use]
24    pub fn key(&self) -> String {
25        format!("_blob:chunk:{}", self.hash)
26    }
27}
28
/// Chunker for splitting data into content-addressable chunks.
///
/// The only state is the chunk size, so the type is cheap to copy and compare.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct Chunker {
    /// Maximum number of bytes placed in a single chunk.
    chunk_size: usize,
}
33
34impl Chunker {
35    #[must_use]
36    pub const fn new(chunk_size: usize) -> Self {
37        Self { chunk_size }
38    }
39
40    #[must_use]
41    pub const fn chunk_size(&self) -> usize {
42        self.chunk_size
43    }
44
45    /// Split data into chunks.
46    pub fn chunk<'a>(&'a self, data: &'a [u8]) -> impl Iterator<Item = Chunk> + 'a {
47        data.chunks(self.chunk_size).map(|chunk_data| {
48            let hash = compute_hash(chunk_data);
49            Chunk {
50                hash,
51                data: chunk_data.to_vec(),
52                size: chunk_data.len(),
53            }
54        })
55    }
56
57    /// Count how many chunks data would produce without allocating.
58    #[must_use]
59    pub const fn chunk_count(&self, data_len: usize) -> usize {
60        if data_len == 0 {
61            0
62        } else {
63            data_len.div_ceil(self.chunk_size)
64        }
65    }
66}
67
68/// Compute SHA-256 hash of data.
69#[must_use]
70pub fn compute_hash(data: &[u8]) -> String {
71    let mut hasher = Sha256::new();
72    hasher.update(data);
73    let result = hasher.finalize();
74    format!("sha256:{result:x}")
75}
76
77/// Compute SHA-256 hash of multiple data segments.
78#[must_use]
79pub fn compute_hash_streaming<'a>(segments: impl Iterator<Item = &'a [u8]>) -> String {
80    let mut hasher = Sha256::new();
81    for segment in segments {
82        hasher.update(segment);
83    }
84    let result = hasher.finalize();
85    format!("sha256:{result:x}")
86}
87
88/// A streaming hasher for computing checksums incrementally.
89pub struct StreamingHasher {
90    hasher: Sha256,
91}
92
93impl Default for StreamingHasher {
94    fn default() -> Self {
95        Self::new()
96    }
97}
98
99impl StreamingHasher {
100    #[must_use]
101    pub fn new() -> Self {
102        Self {
103            hasher: Sha256::new(),
104        }
105    }
106
107    pub fn update(&mut self, data: &[u8]) {
108        self.hasher.update(data);
109    }
110
111    #[must_use]
112    pub fn finalize(self) -> String {
113        let result = self.hasher.finalize();
114        format!("sha256:{result:x}")
115    }
116}
117
#[cfg(test)]
mod tests {
    use super::*;

    /// Published SHA-256 digest of the ASCII bytes `hello world`.
    const HELLO_WORLD_SHA256: &str =
        "sha256:b94d27b9934d3e08a52e52d7da7dabfac484efe37a5380ee9088f7ace2efcde9";

    /// Published SHA-256 digest of the empty input.
    const EMPTY_SHA256: &str =
        "sha256:e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855";

    #[test]
    fn test_compute_hash() {
        let data = b"hello world";
        let hash = compute_hash(data);
        assert!(hash.starts_with("sha256:"));
        assert_eq!(hash.len(), 7 + 64); // "sha256:" + 64 hex chars
        // Pin the actual digest, not just its shape.
        assert_eq!(hash, HELLO_WORLD_SHA256);
    }

    #[test]
    fn test_compute_hash_deterministic() {
        let data = b"test data";
        let hash1 = compute_hash(data);
        let hash2 = compute_hash(data);
        assert_eq!(hash1, hash2);
    }

    #[test]
    fn test_compute_hash_different_data() {
        let hash1 = compute_hash(b"data1");
        let hash2 = compute_hash(b"data2");
        assert_ne!(hash1, hash2);
    }

    #[test]
    fn test_compute_hash_empty() {
        let hash = compute_hash(b"");
        assert!(hash.starts_with("sha256:"));
        assert_eq!(hash, EMPTY_SHA256);
    }

    #[test]
    fn test_chunker_single_chunk() {
        let chunker = Chunker::new(1024);
        let data = vec![0u8; 100];
        let chunks: Vec<_> = chunker.chunk(&data).collect();

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].size, 100);
        assert_eq!(chunks[0].data, data);
        assert_eq!(chunks[0].hash, compute_hash(&data));
    }

    #[test]
    fn test_chunker_multiple_chunks() {
        let chunker = Chunker::new(100);
        let data = vec![0u8; 250];
        let chunks: Vec<_> = chunker.chunk(&data).collect();

        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].size, 100);
        assert_eq!(chunks[1].size, 100);
        assert_eq!(chunks[2].size, 50);

        // Every chunk's metadata must describe its own bytes.
        for chunk in &chunks {
            assert_eq!(chunk.size, chunk.data.len());
            assert_eq!(chunk.hash, compute_hash(&chunk.data));
        }

        // Concatenating the chunks must reproduce the input.
        let rebuilt: Vec<u8> = chunks.iter().flat_map(|c| c.data.iter().copied()).collect();
        assert_eq!(rebuilt, data);
    }

    #[test]
    fn test_chunker_exact_multiple() {
        let chunker = Chunker::new(100);
        let data = vec![0u8; 300];
        let chunks: Vec<_> = chunker.chunk(&data).collect();

        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].size, 100);
        assert_eq!(chunks[1].size, 100);
        assert_eq!(chunks[2].size, 100);
    }

    #[test]
    fn test_chunker_empty_data() {
        let chunker = Chunker::new(100);
        let data: Vec<u8> = vec![];
        let chunks: Vec<_> = chunker.chunk(&data).collect();

        assert_eq!(chunks.len(), 0);
    }

    #[test]
    fn test_chunk_count() {
        let chunker = Chunker::new(100);
        assert_eq!(chunker.chunk_count(0), 0);
        assert_eq!(chunker.chunk_count(1), 1);
        assert_eq!(chunker.chunk_count(100), 1);
        assert_eq!(chunker.chunk_count(101), 2);
        assert_eq!(chunker.chunk_count(200), 2);
        assert_eq!(chunker.chunk_count(250), 3);
    }

    #[test]
    fn test_chunk_count_matches_chunk_iterator() {
        let chunker = Chunker::new(100);
        for len in [0usize, 1, 99, 100, 101, 250, 300] {
            let data = vec![7u8; len];
            assert_eq!(chunker.chunk(&data).count(), chunker.chunk_count(len));
        }
    }

    #[test]
    fn test_chunker_chunk_size() {
        let chunker = Chunker::new(4096);
        assert_eq!(chunker.chunk_size(), 4096);
    }

    #[test]
    fn test_chunk_new() {
        let chunk = Chunk::new(b"hello world".to_vec());
        assert_eq!(chunk.hash, HELLO_WORLD_SHA256);
        assert_eq!(chunk.size, 11);
        assert_eq!(chunk.data, b"hello world");
    }

    #[test]
    fn test_chunk_key() {
        let chunk = Chunk::new(vec![1, 2, 3]);
        assert!(chunk.key().starts_with("_blob:chunk:sha256:"));
        assert_eq!(chunk.key(), format!("_blob:chunk:{}", chunk.hash));
    }

    #[test]
    fn test_streaming_hasher() {
        let mut hasher = StreamingHasher::new();
        hasher.update(b"hello ");
        hasher.update(b"world");
        let hash = hasher.finalize();

        let direct_hash = compute_hash(b"hello world");
        assert_eq!(hash, direct_hash);
    }

    #[test]
    fn test_streaming_hasher_default_is_empty() {
        let hash = StreamingHasher::default().finalize();
        assert_eq!(hash, EMPTY_SHA256);
    }

    #[test]
    fn test_compute_hash_streaming() {
        let segments = vec![b"hello ".as_slice(), b"world".as_slice()];
        let hash = compute_hash_streaming(segments.into_iter());

        let direct_hash = compute_hash(b"hello world");
        assert_eq!(hash, direct_hash);
    }

    #[test]
    fn test_compute_hash_streaming_no_segments() {
        let hash = compute_hash_streaming(std::iter::empty());
        assert_eq!(hash, EMPTY_SHA256);
    }

    #[test]
    fn test_chunk_content_addressing() {
        let chunker = Chunker::new(100);

        // Same content should produce same hash
        let data1 = vec![42u8; 100];
        let data2 = vec![42u8; 100];

        let chunks1: Vec<_> = chunker.chunk(&data1).collect();
        let chunks2: Vec<_> = chunker.chunk(&data2).collect();

        assert_eq!(chunks1[0].hash, chunks2[0].hash);
    }

    #[test]
    fn test_chunk_different_content() {
        let chunker = Chunker::new(100);

        let data1 = vec![1u8; 100];
        let data2 = vec![2u8; 100];

        let chunks1: Vec<_> = chunker.chunk(&data1).collect();
        let chunks2: Vec<_> = chunker.chunk(&data2).collect();

        assert_ne!(chunks1[0].hash, chunks2[0].hash);
    }
}