// iscc_lib/src/streaming.rs

1//! Streaming hash types for incremental ISCC code generation.
2//!
3//! Provides `DataHasher` and `InstanceHasher` — streaming counterparts to
4//! `gen_data_code_v0` and `gen_instance_code_v0`. Both follow the
5//! `new() → update(&[u8]) → finalize()` pattern for incremental processing
6//! of large files without loading entire contents into memory.
7
8use crate::types::{DataCodeResult, InstanceCodeResult};
9use crate::{IsccResult, cdc, codec, minhash};
10
/// Streaming Instance-Code generator.
///
/// Incrementally hashes data with BLAKE3 to produce an ISCC Instance-Code
/// identical to `gen_instance_code_v0` for the same byte stream.
pub struct InstanceHasher {
    // Incremental BLAKE3 state over all bytes pushed so far.
    hasher: blake3::Hasher,
    // Total number of bytes pushed via `update`; reported as `filesize`
    // in the finalized result.
    filesize: u64,
}
19
20impl InstanceHasher {
21    /// Create a new `InstanceHasher`.
22    pub fn new() -> Self {
23        Self {
24            hasher: blake3::Hasher::new(),
25            filesize: 0,
26        }
27    }
28
29    /// Push data into the hasher.
30    pub fn update(&mut self, data: &[u8]) {
31        self.filesize += data.len() as u64;
32        self.hasher.update(data);
33    }
34
35    /// Consume the hasher and produce an Instance-Code result.
36    ///
37    /// Equivalent to calling `gen_instance_code_v0` with the concatenation
38    /// of all data passed to `update`.
39    pub fn finalize(self, bits: u32) -> IsccResult<InstanceCodeResult> {
40        let digest = self.hasher.finalize();
41        let datahash = format!("1e20{}", hex::encode(digest.as_bytes()));
42        let component = codec::encode_component(
43            codec::MainType::Instance,
44            codec::SubType::None,
45            codec::Version::V0,
46            bits,
47            digest.as_bytes(),
48        )?;
49        Ok(InstanceCodeResult {
50            iscc: format!("ISCC:{component}"),
51            datahash,
52            filesize: self.filesize,
53        })
54    }
55}
56
57impl Default for InstanceHasher {
58    /// Create a new `InstanceHasher` (delegates to `new()`).
59    fn default() -> Self {
60        Self::new()
61    }
62}
63
/// Streaming Data-Code generator.
///
/// Incrementally processes data with content-defined chunking (CDC) and
/// MinHash to produce an ISCC Data-Code identical to `gen_data_code_v0`
/// for the same byte stream. Uses a persistent internal buffer to avoid
/// per-call heap allocations.
pub struct DataHasher {
    // xxh32 hash of every completed CDC chunk, in stream order.
    chunk_features: Vec<u32>,
    // Unchunked tail bytes carried over between `update` calls; the
    // allocation is reused across calls.
    buf: Vec<u8>,
}
74
75impl DataHasher {
76    /// Create a new `DataHasher`.
77    pub fn new() -> Self {
78        Self {
79            chunk_features: Vec::new(),
80            buf: Vec::new(),
81        }
82    }
83
84    /// Push data into the hasher.
85    ///
86    /// Appends data to the internal buffer (which starts with the retained
87    /// tail from the previous call), runs CDC, hashes all complete chunks,
88    /// and shifts the last chunk (tail) to the front of the buffer for the
89    /// next call. The buffer is reused across calls to avoid allocations.
90    pub fn update(&mut self, data: &[u8]) {
91        self.buf.extend_from_slice(data);
92
93        let chunks = cdc::alg_cdc_chunks_unchecked(&self.buf, false, cdc::DATA_AVG_CHUNK_SIZE);
94
95        // Process all chunks except the last (which becomes the new tail).
96        // This mirrors the Python `push()` method's `prev_chunk` pattern.
97        let mut prev_chunk: Option<&[u8]> = None;
98        for chunk in &chunks {
99            if let Some(pc) = prev_chunk {
100                self.chunk_features.push(xxhash_rust::xxh32::xxh32(pc, 0));
101            }
102            prev_chunk = Some(chunk);
103        }
104
105        // Extract tail length before dropping borrows on self.buf
106        let tail_len = prev_chunk.map_or(0, |c| c.len());
107        drop(chunks);
108
109        // Shift tail to front of buffer, reusing existing capacity
110        let tail_start = self.buf.len() - tail_len;
111        self.buf.copy_within(tail_start.., 0);
112        self.buf.truncate(tail_len);
113    }
114
115    /// Consume the hasher and produce a Data-Code result.
116    ///
117    /// Equivalent to calling `gen_data_code_v0` with the concatenation
118    /// of all data passed to `update`.
119    pub fn finalize(mut self, bits: u32) -> IsccResult<DataCodeResult> {
120        if !self.buf.is_empty() {
121            self.chunk_features
122                .push(xxhash_rust::xxh32::xxh32(&self.buf, 0));
123        } else if self.chunk_features.is_empty() {
124            // Empty input: ensure at least one feature
125            self.chunk_features.push(xxhash_rust::xxh32::xxh32(b"", 0));
126        }
127
128        let digest = minhash::alg_minhash_256(&self.chunk_features);
129        let component = codec::encode_component(
130            codec::MainType::Data,
131            codec::SubType::None,
132            codec::Version::V0,
133            bits,
134            &digest,
135        )?;
136
137        Ok(DataCodeResult {
138            iscc: format!("ISCC:{component}"),
139        })
140    }
141}
142
143impl Default for DataHasher {
144    /// Create a new `DataHasher` (delegates to `new()`).
145    fn default() -> Self {
146        Self::new()
147    }
148}
149
#[cfg(test)]
mod tests {
    //! Every test checks the streaming hashers against their one-shot
    //! counterparts (`gen_instance_code_v0` / `gen_data_code_v0`): for any
    //! split of the input into update calls, results must be identical.
    use super::*;
    use crate::{gen_data_code_v0, gen_instance_code_v0};

    // ---- InstanceHasher tests ----

    #[test]
    fn test_instance_hasher_empty() {
        // No update calls at all: must equal the one-shot result for b"".
        let ih = InstanceHasher::new();
        let streaming = ih.finalize(64).unwrap();
        let oneshot = gen_instance_code_v0(b"", 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
        assert_eq!(streaming.datahash, oneshot.datahash);
        assert_eq!(streaming.filesize, oneshot.filesize);
        assert_eq!(streaming.filesize, 0);
    }

    #[test]
    fn test_instance_hasher_small_data() {
        let data = b"Hello, ISCC World!";
        let mut ih = InstanceHasher::new();
        ih.update(data);
        let streaming = ih.finalize(64).unwrap();
        let oneshot = gen_instance_code_v0(data, 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
        assert_eq!(streaming.datahash, oneshot.datahash);
        assert_eq!(streaming.filesize, oneshot.filesize);
    }

    #[test]
    fn test_instance_hasher_multi_chunk() {
        // Uneven three-way split: result must not depend on update boundaries.
        let data = b"The quick brown fox jumps over the lazy dog";
        let mut ih = InstanceHasher::new();
        ih.update(&data[..10]);
        ih.update(&data[10..25]);
        ih.update(&data[25..]);
        let streaming = ih.finalize(64).unwrap();
        let oneshot = gen_instance_code_v0(data, 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
        assert_eq!(streaming.datahash, oneshot.datahash);
        assert_eq!(streaming.filesize, oneshot.filesize);
    }

    #[test]
    fn test_instance_hasher_byte_at_a_time() {
        // Worst-case granularity: one-byte updates.
        let data = b"streaming byte by byte";
        let mut ih = InstanceHasher::new();
        for &b in data.iter() {
            ih.update(&[b]);
        }
        let streaming = ih.finalize(128).unwrap();
        let oneshot = gen_instance_code_v0(data, 128).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
        assert_eq!(streaming.datahash, oneshot.datahash);
        assert_eq!(streaming.filesize, oneshot.filesize);
    }

    #[test]
    fn test_instance_hasher_default() {
        // Default must behave exactly like new().
        let ih = InstanceHasher::default();
        let streaming = ih.finalize(64).unwrap();
        let oneshot = gen_instance_code_v0(b"", 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
    }

    #[test]
    fn test_instance_hasher_various_bits() {
        let data = b"test various bit widths";
        for bits in [64, 128, 256] {
            let mut ih = InstanceHasher::new();
            ih.update(data);
            let streaming = ih.finalize(bits).unwrap();
            let oneshot = gen_instance_code_v0(data, bits).unwrap();
            assert_eq!(streaming.iscc, oneshot.iscc, "bits={bits}");
            assert_eq!(streaming.datahash, oneshot.datahash, "bits={bits}");
        }
    }

    #[test]
    fn test_instance_hasher_conformance() {
        // Drive the hasher with the shared conformance vectors; inputs are
        // hex-encoded byte streams tagged with a "stream:" prefix.
        let json_str = include_str!("../tests/data.json");
        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
        let section = &data["gen_instance_code_v0"];
        let cases = section.as_object().unwrap();

        for (name, tc) in cases {
            let inputs = tc["inputs"].as_array().unwrap();
            let stream_str = inputs[0].as_str().unwrap();
            let bits = inputs[1].as_u64().unwrap() as u32;

            let hex_data = stream_str
                .strip_prefix("stream:")
                .unwrap_or_else(|| panic!("expected 'stream:' prefix in test case {name}"));
            let input_bytes = hex::decode(hex_data)
                .unwrap_or_else(|e| panic!("invalid hex in test case {name}: {e}"));

            // One-shot reference
            let oneshot = gen_instance_code_v0(&input_bytes, bits)
                .unwrap_or_else(|e| panic!("gen_instance_code_v0 failed for {name}: {e}"));

            // Streaming — single update
            let mut ih = InstanceHasher::new();
            ih.update(&input_bytes);
            let streaming = ih
                .finalize(bits)
                .unwrap_or_else(|e| panic!("InstanceHasher failed for {name}: {e}"));

            assert_eq!(
                streaming.iscc, oneshot.iscc,
                "ISCC mismatch in test case {name}"
            );
            assert_eq!(
                streaming.datahash, oneshot.datahash,
                "datahash mismatch in test case {name}"
            );
            assert_eq!(
                streaming.filesize, oneshot.filesize,
                "filesize mismatch in test case {name}"
            );

            // Streaming — multi-chunk (split into 256-byte chunks)
            let mut ih2 = InstanceHasher::new();
            for chunk in input_bytes.chunks(256) {
                ih2.update(chunk);
            }
            let streaming2 = ih2
                .finalize(bits)
                .unwrap_or_else(|e| panic!("InstanceHasher multi-chunk failed for {name}: {e}"));

            assert_eq!(
                streaming2.iscc, oneshot.iscc,
                "multi-chunk ISCC mismatch in test case {name}"
            );
            assert_eq!(
                streaming2.datahash, oneshot.datahash,
                "multi-chunk datahash mismatch in test case {name}"
            );
        }
    }

    // ---- DataHasher tests ----

    #[test]
    fn test_data_hasher_empty() {
        // Empty stream: the hasher must still emit a valid Data-Code.
        let dh = DataHasher::new();
        let streaming = dh.finalize(64).unwrap();
        let oneshot = gen_data_code_v0(b"", 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
    }

    #[test]
    fn test_data_hasher_small_data() {
        let data = b"Hello, ISCC World!";
        let mut dh = DataHasher::new();
        dh.update(data);
        let streaming = dh.finalize(64).unwrap();
        let oneshot = gen_data_code_v0(data, 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
    }

    #[test]
    fn test_data_hasher_multi_chunk_small() {
        // Uneven three-way split must match the one-shot result.
        let data = b"The quick brown fox jumps over the lazy dog";
        let mut dh = DataHasher::new();
        dh.update(&data[..10]);
        dh.update(&data[10..25]);
        dh.update(&data[25..]);
        let streaming = dh.finalize(64).unwrap();
        let oneshot = gen_data_code_v0(data, 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
    }

    #[test]
    fn test_data_hasher_byte_at_a_time() {
        // Small data that fits within a single CDC chunk
        let data = b"streaming byte by byte";
        let mut dh = DataHasher::new();
        for &b in data.iter() {
            dh.update(&[b]);
        }
        let streaming = dh.finalize(64).unwrap();
        let oneshot = gen_data_code_v0(data, 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
    }

    #[test]
    fn test_data_hasher_large_data_multi_chunk() {
        // Generate data large enough to produce multiple CDC chunks
        let data: Vec<u8> = (0..10_000).map(|i| (i % 256) as u8).collect();
        for chunk_size in [1, 256, 1024, 4096] {
            let mut dh = DataHasher::new();
            for chunk in data.chunks(chunk_size) {
                dh.update(chunk);
            }
            let streaming = dh.finalize(64).unwrap();
            let oneshot = gen_data_code_v0(&data, 64).unwrap();
            assert_eq!(
                streaming.iscc, oneshot.iscc,
                "chunk_size={chunk_size} mismatch"
            );
        }
    }

    #[test]
    fn test_data_hasher_default() {
        // Default must behave exactly like new().
        let dh = DataHasher::default();
        let streaming = dh.finalize(64).unwrap();
        let oneshot = gen_data_code_v0(b"", 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
    }

    #[test]
    fn test_data_hasher_various_bits() {
        let data = b"test various bit widths for data code";
        for bits in [64, 128, 256] {
            let mut dh = DataHasher::new();
            dh.update(data);
            let streaming = dh.finalize(bits).unwrap();
            let oneshot = gen_data_code_v0(data, bits).unwrap();
            assert_eq!(streaming.iscc, oneshot.iscc, "bits={bits}");
        }
    }

    #[test]
    fn test_data_hasher_conformance() {
        // Same conformance vectors as the instance test, exercised at three
        // update granularities: whole-buffer, 256-byte, and 1-byte.
        let json_str = include_str!("../tests/data.json");
        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
        let section = &data["gen_data_code_v0"];
        let cases = section.as_object().unwrap();

        for (name, tc) in cases {
            let inputs = tc["inputs"].as_array().unwrap();
            let stream_str = inputs[0].as_str().unwrap();
            let bits = inputs[1].as_u64().unwrap() as u32;

            let hex_data = stream_str
                .strip_prefix("stream:")
                .unwrap_or_else(|| panic!("expected 'stream:' prefix in test case {name}"));
            let input_bytes = hex::decode(hex_data)
                .unwrap_or_else(|e| panic!("invalid hex in test case {name}: {e}"));

            // One-shot reference
            let oneshot = gen_data_code_v0(&input_bytes, bits)
                .unwrap_or_else(|e| panic!("gen_data_code_v0 failed for {name}: {e}"));

            // Streaming — single update
            let mut dh = DataHasher::new();
            dh.update(&input_bytes);
            let streaming = dh
                .finalize(bits)
                .unwrap_or_else(|e| panic!("DataHasher failed for {name}: {e}"));

            assert_eq!(
                streaming.iscc, oneshot.iscc,
                "ISCC mismatch in test case {name}"
            );

            // Streaming — 256-byte chunks
            let mut dh2 = DataHasher::new();
            for chunk in input_bytes.chunks(256) {
                dh2.update(chunk);
            }
            let streaming2 = dh2
                .finalize(bits)
                .unwrap_or_else(|e| panic!("DataHasher multi-chunk failed for {name}: {e}"));

            assert_eq!(
                streaming2.iscc, oneshot.iscc,
                "multi-chunk ISCC mismatch in test case {name}"
            );

            // Streaming — 1-byte chunks (stress test)
            let mut dh3 = DataHasher::new();
            for &b in &input_bytes {
                dh3.update(&[b]);
            }
            let streaming3 = dh3
                .finalize(bits)
                .unwrap_or_else(|e| panic!("DataHasher byte-at-a-time failed for {name}: {e}"));

            assert_eq!(
                streaming3.iscc, oneshot.iscc,
                "byte-at-a-time ISCC mismatch in test case {name}"
            );
        }
    }
}