// iscc_lib/streaming.rs
1//! Streaming hash types for incremental ISCC code generation.
2//!
3//! Provides `DataHasher` and `InstanceHasher` — streaming counterparts to
4//! `gen_data_code_v0` and `gen_instance_code_v0`. Both follow the
5//! `new() → update(&[u8]) → finalize()` pattern for incremental processing
6//! of large files without loading entire contents into memory.
7
8use crate::types::{DataCodeResult, InstanceCodeResult};
9use crate::{IsccResult, cdc, codec, minhash};
10
/// Streaming Instance-Code generator.
///
/// Incrementally hashes data with BLAKE3 to produce an ISCC Instance-Code
/// identical to `gen_instance_code_v0` for the same byte stream.
pub struct InstanceHasher {
    // Incremental BLAKE3 state fed by `update`.
    hasher: blake3::Hasher,
    // Total number of bytes pushed so far; reported as `filesize` in the result.
    filesize: u64,
}
19
20impl InstanceHasher {
21    /// Create a new `InstanceHasher`.
22    pub fn new() -> Self {
23        Self {
24            hasher: blake3::Hasher::new(),
25            filesize: 0,
26        }
27    }
28
29    /// Push data into the hasher.
30    pub fn update(&mut self, data: &[u8]) {
31        self.filesize += data.len() as u64;
32        self.hasher.update(data);
33    }
34
35    /// Consume the hasher and produce an Instance-Code result.
36    ///
37    /// Equivalent to calling `gen_instance_code_v0` with the concatenation
38    /// of all data passed to `update`.
39    pub fn finalize(self, bits: u32) -> IsccResult<InstanceCodeResult> {
40        let digest = self.hasher.finalize();
41        let datahash = format!("1e20{}", hex::encode(digest.as_bytes()));
42        let component = codec::encode_component(
43            codec::MainType::Instance,
44            codec::SubType::None,
45            codec::Version::V0,
46            bits,
47            digest.as_bytes(),
48        )?;
49        Ok(InstanceCodeResult {
50            iscc: format!("ISCC:{component}"),
51            datahash,
52            filesize: self.filesize,
53        })
54    }
55}
56
57impl Default for InstanceHasher {
58    /// Create a new `InstanceHasher` (delegates to `new()`).
59    fn default() -> Self {
60        Self::new()
61    }
62}
63
/// Streaming Data-Code generator.
///
/// Incrementally processes data with content-defined chunking (CDC) and
/// MinHash to produce an ISCC Data-Code identical to `gen_data_code_v0`
/// for the same byte stream.
pub struct DataHasher {
    // xxh32 hash of every completed CDC chunk seen so far.
    chunk_features: Vec<u32>,
    // Bytes of the last (possibly incomplete) chunk, carried into the next
    // `update` call so chunk boundaries match one-shot processing.
    tail: Vec<u8>,
}
73
74impl DataHasher {
75    /// Create a new `DataHasher`.
76    pub fn new() -> Self {
77        Self {
78            chunk_features: Vec::new(),
79            tail: Vec::new(),
80        }
81    }
82
83    /// Push data into the hasher.
84    ///
85    /// Prepends any leftover tail from the previous call, runs CDC on the
86    /// combined buffer, hashes all complete chunks, and retains the last
87    /// chunk as the new tail for the next call.
88    pub fn update(&mut self, data: &[u8]) {
89        let combined = if self.tail.is_empty() {
90            data.to_vec()
91        } else {
92            [self.tail.as_slice(), data].concat()
93        };
94
95        let chunks = cdc::alg_cdc_chunks(&combined, false, cdc::DATA_AVG_CHUNK_SIZE);
96
97        // Process all chunks except the last (which becomes the new tail).
98        // This mirrors the Python `push()` method's `prev_chunk` pattern.
99        let mut prev_chunk: Option<&[u8]> = None;
100        for chunk in &chunks {
101            if let Some(pc) = prev_chunk {
102                self.chunk_features.push(xxhash_rust::xxh32::xxh32(pc, 0));
103            }
104            prev_chunk = Some(chunk);
105        }
106
107        // The last chunk becomes the new tail
108        self.tail = prev_chunk.unwrap_or(&b""[..]).to_vec();
109    }
110
111    /// Consume the hasher and produce a Data-Code result.
112    ///
113    /// Equivalent to calling `gen_data_code_v0` with the concatenation
114    /// of all data passed to `update`.
115    pub fn finalize(mut self, bits: u32) -> IsccResult<DataCodeResult> {
116        if !self.tail.is_empty() {
117            self.chunk_features
118                .push(xxhash_rust::xxh32::xxh32(&self.tail, 0));
119        } else if self.chunk_features.is_empty() {
120            // Empty input: ensure at least one feature
121            self.chunk_features.push(xxhash_rust::xxh32::xxh32(b"", 0));
122        }
123
124        let digest = minhash::alg_minhash_256(&self.chunk_features);
125        let component = codec::encode_component(
126            codec::MainType::Data,
127            codec::SubType::None,
128            codec::Version::V0,
129            bits,
130            &digest,
131        )?;
132
133        Ok(DataCodeResult {
134            iscc: format!("ISCC:{component}"),
135        })
136    }
137}
138
139impl Default for DataHasher {
140    /// Create a new `DataHasher` (delegates to `new()`).
141    fn default() -> Self {
142        Self::new()
143    }
144}
145
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{gen_data_code_v0, gen_instance_code_v0};

    // Every test checks the streaming hashers against the one-shot
    // generators: identical input bytes must yield identical results
    // regardless of how the input is split across `update` calls.

    // ---- InstanceHasher tests ----

    #[test]
    fn test_instance_hasher_empty() {
        let ih = InstanceHasher::new();
        let streaming = ih.finalize(64).unwrap();
        let oneshot = gen_instance_code_v0(b"", 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
        assert_eq!(streaming.datahash, oneshot.datahash);
        assert_eq!(streaming.filesize, oneshot.filesize);
        assert_eq!(streaming.filesize, 0);
    }

    #[test]
    fn test_instance_hasher_small_data() {
        let data = b"Hello, ISCC World!";
        let mut ih = InstanceHasher::new();
        ih.update(data);
        let streaming = ih.finalize(64).unwrap();
        let oneshot = gen_instance_code_v0(data, 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
        assert_eq!(streaming.datahash, oneshot.datahash);
        assert_eq!(streaming.filesize, oneshot.filesize);
    }

    #[test]
    fn test_instance_hasher_multi_chunk() {
        let data = b"The quick brown fox jumps over the lazy dog";
        let mut ih = InstanceHasher::new();
        ih.update(&data[..10]);
        ih.update(&data[10..25]);
        ih.update(&data[25..]);
        let streaming = ih.finalize(64).unwrap();
        let oneshot = gen_instance_code_v0(data, 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
        assert_eq!(streaming.datahash, oneshot.datahash);
        assert_eq!(streaming.filesize, oneshot.filesize);
    }

    #[test]
    fn test_instance_hasher_byte_at_a_time() {
        let data = b"streaming byte by byte";
        let mut ih = InstanceHasher::new();
        for &b in data.iter() {
            ih.update(&[b]);
        }
        let streaming = ih.finalize(128).unwrap();
        let oneshot = gen_instance_code_v0(data, 128).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
        assert_eq!(streaming.datahash, oneshot.datahash);
        assert_eq!(streaming.filesize, oneshot.filesize);
    }

    #[test]
    fn test_instance_hasher_default() {
        let ih = InstanceHasher::default();
        let streaming = ih.finalize(64).unwrap();
        let oneshot = gen_instance_code_v0(b"", 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
    }

    #[test]
    fn test_instance_hasher_various_bits() {
        let data = b"test various bit widths";
        for bits in [64, 128, 256] {
            let mut ih = InstanceHasher::new();
            ih.update(data);
            let streaming = ih.finalize(bits).unwrap();
            let oneshot = gen_instance_code_v0(data, bits).unwrap();
            assert_eq!(streaming.iscc, oneshot.iscc, "bits={bits}");
            assert_eq!(streaming.datahash, oneshot.datahash, "bits={bits}");
        }
    }

    #[test]
    fn test_instance_hasher_conformance() {
        // Runs the official conformance vectors from tests/data.json through
        // the streaming hasher (single-update and 256-byte-chunked).
        let json_str = include_str!("../tests/data.json");
        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
        let section = &data["gen_instance_code_v0"];
        let cases = section.as_object().unwrap();

        for (name, tc) in cases {
            let inputs = tc["inputs"].as_array().unwrap();
            let stream_str = inputs[0].as_str().unwrap();
            let bits = inputs[1].as_u64().unwrap() as u32;

            let hex_data = stream_str
                .strip_prefix("stream:")
                .unwrap_or_else(|| panic!("expected 'stream:' prefix in test case {name}"));
            let input_bytes = hex::decode(hex_data)
                .unwrap_or_else(|e| panic!("invalid hex in test case {name}: {e}"));

            // One-shot reference
            let oneshot = gen_instance_code_v0(&input_bytes, bits)
                .unwrap_or_else(|e| panic!("gen_instance_code_v0 failed for {name}: {e}"));

            // Streaming — single update
            let mut ih = InstanceHasher::new();
            ih.update(&input_bytes);
            let streaming = ih
                .finalize(bits)
                .unwrap_or_else(|e| panic!("InstanceHasher failed for {name}: {e}"));

            assert_eq!(
                streaming.iscc, oneshot.iscc,
                "ISCC mismatch in test case {name}"
            );
            assert_eq!(
                streaming.datahash, oneshot.datahash,
                "datahash mismatch in test case {name}"
            );
            assert_eq!(
                streaming.filesize, oneshot.filesize,
                "filesize mismatch in test case {name}"
            );

            // Streaming — multi-chunk (split into 256-byte chunks)
            let mut ih2 = InstanceHasher::new();
            for chunk in input_bytes.chunks(256) {
                ih2.update(chunk);
            }
            let streaming2 = ih2
                .finalize(bits)
                .unwrap_or_else(|e| panic!("InstanceHasher multi-chunk failed for {name}: {e}"));

            assert_eq!(
                streaming2.iscc, oneshot.iscc,
                "multi-chunk ISCC mismatch in test case {name}"
            );
            assert_eq!(
                streaming2.datahash, oneshot.datahash,
                "multi-chunk datahash mismatch in test case {name}"
            );
        }
    }

    // ---- DataHasher tests ----

    #[test]
    fn test_data_hasher_empty() {
        let dh = DataHasher::new();
        let streaming = dh.finalize(64).unwrap();
        let oneshot = gen_data_code_v0(b"", 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
    }

    #[test]
    fn test_data_hasher_small_data() {
        let data = b"Hello, ISCC World!";
        let mut dh = DataHasher::new();
        dh.update(data);
        let streaming = dh.finalize(64).unwrap();
        let oneshot = gen_data_code_v0(data, 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
    }

    #[test]
    fn test_data_hasher_multi_chunk_small() {
        let data = b"The quick brown fox jumps over the lazy dog";
        let mut dh = DataHasher::new();
        dh.update(&data[..10]);
        dh.update(&data[10..25]);
        dh.update(&data[25..]);
        let streaming = dh.finalize(64).unwrap();
        let oneshot = gen_data_code_v0(data, 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
    }

    #[test]
    fn test_data_hasher_byte_at_a_time() {
        // Small data that fits within a single CDC chunk
        let data = b"streaming byte by byte";
        let mut dh = DataHasher::new();
        for &b in data.iter() {
            dh.update(&[b]);
        }
        let streaming = dh.finalize(64).unwrap();
        let oneshot = gen_data_code_v0(data, 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
    }

    #[test]
    fn test_data_hasher_large_data_multi_chunk() {
        // Generate data large enough to produce multiple CDC chunks
        let data: Vec<u8> = (0..10_000).map(|i| (i % 256) as u8).collect();
        // The one-shot reference is independent of the split size, so
        // compute it once instead of once per loop iteration.
        let oneshot = gen_data_code_v0(&data, 64).unwrap();
        for chunk_size in [1, 256, 1024, 4096] {
            let mut dh = DataHasher::new();
            for chunk in data.chunks(chunk_size) {
                dh.update(chunk);
            }
            let streaming = dh.finalize(64).unwrap();
            assert_eq!(
                streaming.iscc, oneshot.iscc,
                "chunk_size={chunk_size} mismatch"
            );
        }
    }

    #[test]
    fn test_data_hasher_default() {
        let dh = DataHasher::default();
        let streaming = dh.finalize(64).unwrap();
        let oneshot = gen_data_code_v0(b"", 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
    }

    #[test]
    fn test_data_hasher_various_bits() {
        let data = b"test various bit widths for data code";
        for bits in [64, 128, 256] {
            let mut dh = DataHasher::new();
            dh.update(data);
            let streaming = dh.finalize(bits).unwrap();
            let oneshot = gen_data_code_v0(data, bits).unwrap();
            assert_eq!(streaming.iscc, oneshot.iscc, "bits={bits}");
        }
    }

    #[test]
    fn test_data_hasher_conformance() {
        // Runs the official conformance vectors from tests/data.json through
        // the streaming hasher at three split granularities.
        let json_str = include_str!("../tests/data.json");
        let data: serde_json::Value = serde_json::from_str(json_str).unwrap();
        let section = &data["gen_data_code_v0"];
        let cases = section.as_object().unwrap();

        for (name, tc) in cases {
            let inputs = tc["inputs"].as_array().unwrap();
            let stream_str = inputs[0].as_str().unwrap();
            let bits = inputs[1].as_u64().unwrap() as u32;

            let hex_data = stream_str
                .strip_prefix("stream:")
                .unwrap_or_else(|| panic!("expected 'stream:' prefix in test case {name}"));
            let input_bytes = hex::decode(hex_data)
                .unwrap_or_else(|e| panic!("invalid hex in test case {name}: {e}"));

            // One-shot reference
            let oneshot = gen_data_code_v0(&input_bytes, bits)
                .unwrap_or_else(|e| panic!("gen_data_code_v0 failed for {name}: {e}"));

            // Streaming — single update
            let mut dh = DataHasher::new();
            dh.update(&input_bytes);
            let streaming = dh
                .finalize(bits)
                .unwrap_or_else(|e| panic!("DataHasher failed for {name}: {e}"));

            assert_eq!(
                streaming.iscc, oneshot.iscc,
                "ISCC mismatch in test case {name}"
            );

            // Streaming — 256-byte chunks
            let mut dh2 = DataHasher::new();
            for chunk in input_bytes.chunks(256) {
                dh2.update(chunk);
            }
            let streaming2 = dh2
                .finalize(bits)
                .unwrap_or_else(|e| panic!("DataHasher multi-chunk failed for {name}: {e}"));

            assert_eq!(
                streaming2.iscc, oneshot.iscc,
                "multi-chunk ISCC mismatch in test case {name}"
            );

            // Streaming — 1-byte chunks (stress test)
            let mut dh3 = DataHasher::new();
            for &b in &input_bytes {
                dh3.update(&[b]);
            }
            let streaming3 = dh3
                .finalize(bits)
                .unwrap_or_else(|e| panic!("DataHasher byte-at-a-time failed for {name}: {e}"));

            assert_eq!(
                streaming3.iscc, oneshot.iscc,
                "byte-at-a-time ISCC mismatch in test case {name}"
            );
        }
    }
}
433}