1use crate::types::{DataCodeResult, InstanceCodeResult};
9use crate::{IsccResult, cdc, codec, minhash};
10
11pub struct InstanceHasher {
16 hasher: blake3::Hasher,
17 filesize: u64,
18}
19
20impl InstanceHasher {
21 pub fn new() -> Self {
23 Self {
24 hasher: blake3::Hasher::new(),
25 filesize: 0,
26 }
27 }
28
29 pub fn update(&mut self, data: &[u8]) {
31 self.filesize += data.len() as u64;
32 self.hasher.update(data);
33 }
34
35 pub fn finalize(self, bits: u32) -> IsccResult<InstanceCodeResult> {
40 let digest = self.hasher.finalize();
41 let datahash = format!("1e20{}", hex::encode(digest.as_bytes()));
42 let component = codec::encode_component(
43 codec::MainType::Instance,
44 codec::SubType::None,
45 codec::Version::V0,
46 bits,
47 digest.as_bytes(),
48 )?;
49 Ok(InstanceCodeResult {
50 iscc: format!("ISCC:{component}"),
51 datahash,
52 filesize: self.filesize,
53 })
54 }
55}
56
57impl Default for InstanceHasher {
58 fn default() -> Self {
60 Self::new()
61 }
62}
63
/// Incremental state for producing a Data-Code from data fed in pieces.
///
/// Keeps one 32-bit feature hash per chunk that has been fully processed,
/// plus the raw bytes of the trailing chunk that is carried over to the next
/// update.
#[derive(Debug, Clone)]
pub struct DataHasher {
    /// One 32-bit hash per completed chunk, in stream order.
    chunk_features: Vec<u32>,
    /// Bytes of the most recent trailing chunk, not yet hashed.
    tail: Vec<u8>,
}
73
74impl DataHasher {
75 pub fn new() -> Self {
77 Self {
78 chunk_features: Vec::new(),
79 tail: Vec::new(),
80 }
81 }
82
83 pub fn update(&mut self, data: &[u8]) {
89 let combined = if self.tail.is_empty() {
90 data.to_vec()
91 } else {
92 [self.tail.as_slice(), data].concat()
93 };
94
95 let chunks = cdc::alg_cdc_chunks(&combined, false, cdc::DATA_AVG_CHUNK_SIZE);
96
97 let mut prev_chunk: Option<&[u8]> = None;
100 for chunk in &chunks {
101 if let Some(pc) = prev_chunk {
102 self.chunk_features.push(xxhash_rust::xxh32::xxh32(pc, 0));
103 }
104 prev_chunk = Some(chunk);
105 }
106
107 self.tail = prev_chunk.unwrap_or(&b""[..]).to_vec();
109 }
110
111 pub fn finalize(mut self, bits: u32) -> IsccResult<DataCodeResult> {
116 if !self.tail.is_empty() {
117 self.chunk_features
118 .push(xxhash_rust::xxh32::xxh32(&self.tail, 0));
119 } else if self.chunk_features.is_empty() {
120 self.chunk_features.push(xxhash_rust::xxh32::xxh32(b"", 0));
122 }
123
124 let digest = minhash::alg_minhash_256(&self.chunk_features);
125 let component = codec::encode_component(
126 codec::MainType::Data,
127 codec::SubType::None,
128 codec::Version::V0,
129 bits,
130 &digest,
131 )?;
132
133 Ok(DataCodeResult {
134 iscc: format!("ISCC:{component}"),
135 })
136 }
137}
138
139impl Default for DataHasher {
140 fn default() -> Self {
142 Self::new()
143 }
144}
145
#[cfg(test)]
mod tests {
    //! Checks that the streaming hashers produce the same results as the
    //! one-shot `gen_instance_code_v0` / `gen_data_code_v0` functions,
    //! regardless of how the input is split across `update` calls.

    use super::*;
    use crate::{gen_data_code_v0, gen_instance_code_v0};

    /// Feeds `parts`, in order, into a fresh `InstanceHasher` and finalizes it.
    fn stream_instance<'a>(
        parts: impl IntoIterator<Item = &'a [u8]>,
        bits: u32,
    ) -> IsccResult<InstanceCodeResult> {
        let mut hasher = InstanceHasher::new();
        for part in parts {
            hasher.update(part);
        }
        hasher.finalize(bits)
    }

    /// Feeds `parts`, in order, into a fresh `DataHasher` and finalizes it.
    fn stream_data<'a>(
        parts: impl IntoIterator<Item = &'a [u8]>,
        bits: u32,
    ) -> IsccResult<DataCodeResult> {
        let mut hasher = DataHasher::new();
        for part in parts {
            hasher.update(part);
        }
        hasher.finalize(bits)
    }

    /// Decodes every case of `section` in the bundled conformance fixture
    /// into `(case name, input bytes, bit length)`.
    ///
    /// Each case is expected to carry `inputs[0]` as a `"stream:<hex>"`
    /// string and `inputs[1]` as the bit length.
    fn conformance_cases(section: &str) -> Vec<(String, Vec<u8>, u32)> {
        let fixture: serde_json::Value =
            serde_json::from_str(include_str!("../tests/data.json")).unwrap();

        fixture[section]
            .as_object()
            .unwrap()
            .iter()
            .map(|(name, case)| {
                let inputs = case["inputs"].as_array().unwrap();
                let hex_data = inputs[0]
                    .as_str()
                    .unwrap()
                    .strip_prefix("stream:")
                    .unwrap_or_else(|| panic!("expected 'stream:' prefix in test case {name}"));
                let input_bytes = hex::decode(hex_data)
                    .unwrap_or_else(|e| panic!("invalid hex in test case {name}: {e}"));
                // Checked conversion: a bogus fixture value fails loudly
                // instead of being silently truncated.
                let bits = u32::try_from(inputs[1].as_u64().unwrap())
                    .unwrap_or_else(|e| panic!("bit length out of range in test case {name}: {e}"));
                (name.clone(), input_bytes, bits)
            })
            .collect()
    }

    #[test]
    fn test_instance_hasher_empty() {
        let streaming = InstanceHasher::new().finalize(64).unwrap();
        let oneshot = gen_instance_code_v0(b"", 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
        assert_eq!(streaming.datahash, oneshot.datahash);
        assert_eq!(streaming.filesize, oneshot.filesize);
        assert_eq!(streaming.filesize, 0);
    }

    #[test]
    fn test_instance_hasher_small_data() {
        let data = b"Hello, ISCC World!";
        let streaming = stream_instance([&data[..]], 64).unwrap();
        let oneshot = gen_instance_code_v0(data, 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
        assert_eq!(streaming.datahash, oneshot.datahash);
        assert_eq!(streaming.filesize, oneshot.filesize);
    }

    #[test]
    fn test_instance_hasher_multi_chunk() {
        let data = b"The quick brown fox jumps over the lazy dog";
        let streaming =
            stream_instance([&data[..10], &data[10..25], &data[25..]], 64).unwrap();
        let oneshot = gen_instance_code_v0(data, 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
        assert_eq!(streaming.datahash, oneshot.datahash);
        assert_eq!(streaming.filesize, oneshot.filesize);
    }

    #[test]
    fn test_instance_hasher_byte_at_a_time() {
        let data = b"streaming byte by byte";
        // `chunks(1)` yields one single-byte slice per input byte.
        let streaming = stream_instance(data.chunks(1), 128).unwrap();
        let oneshot = gen_instance_code_v0(data, 128).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
        assert_eq!(streaming.datahash, oneshot.datahash);
        assert_eq!(streaming.filesize, oneshot.filesize);
    }

    #[test]
    fn test_instance_hasher_default() {
        let streaming = InstanceHasher::default().finalize(64).unwrap();
        let oneshot = gen_instance_code_v0(b"", 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
    }

    #[test]
    fn test_instance_hasher_various_bits() {
        let data = b"test various bit widths";
        for bits in [64, 128, 256] {
            let streaming = stream_instance([&data[..]], bits).unwrap();
            let oneshot = gen_instance_code_v0(data, bits).unwrap();
            assert_eq!(streaming.iscc, oneshot.iscc, "bits={bits}");
            assert_eq!(streaming.datahash, oneshot.datahash, "bits={bits}");
        }
    }

    #[test]
    fn test_instance_hasher_conformance() {
        for (name, input_bytes, bits) in conformance_cases("gen_instance_code_v0") {
            let oneshot = gen_instance_code_v0(&input_bytes, bits)
                .unwrap_or_else(|e| panic!("gen_instance_code_v0 failed for {name}: {e}"));

            // Whole input in a single update.
            let streaming = stream_instance([input_bytes.as_slice()], bits)
                .unwrap_or_else(|e| panic!("InstanceHasher failed for {name}: {e}"));
            assert_eq!(
                streaming.iscc, oneshot.iscc,
                "ISCC mismatch in test case {name}"
            );
            assert_eq!(
                streaming.datahash, oneshot.datahash,
                "datahash mismatch in test case {name}"
            );
            assert_eq!(
                streaming.filesize, oneshot.filesize,
                "filesize mismatch in test case {name}"
            );

            // Same input split into 256-byte updates.
            let streaming2 = stream_instance(input_bytes.chunks(256), bits)
                .unwrap_or_else(|e| panic!("InstanceHasher multi-chunk failed for {name}: {e}"));
            assert_eq!(
                streaming2.iscc, oneshot.iscc,
                "multi-chunk ISCC mismatch in test case {name}"
            );
            assert_eq!(
                streaming2.datahash, oneshot.datahash,
                "multi-chunk datahash mismatch in test case {name}"
            );
        }
    }

    #[test]
    fn test_data_hasher_empty() {
        let streaming = DataHasher::new().finalize(64).unwrap();
        let oneshot = gen_data_code_v0(b"", 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
    }

    #[test]
    fn test_data_hasher_small_data() {
        let data = b"Hello, ISCC World!";
        let streaming = stream_data([&data[..]], 64).unwrap();
        let oneshot = gen_data_code_v0(data, 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
    }

    #[test]
    fn test_data_hasher_multi_chunk_small() {
        let data = b"The quick brown fox jumps over the lazy dog";
        let streaming = stream_data([&data[..10], &data[10..25], &data[25..]], 64).unwrap();
        let oneshot = gen_data_code_v0(data, 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
    }

    #[test]
    fn test_data_hasher_byte_at_a_time() {
        let data = b"streaming byte by byte";
        let streaming = stream_data(data.chunks(1), 64).unwrap();
        let oneshot = gen_data_code_v0(data, 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
    }

    #[test]
    fn test_data_hasher_large_data_multi_chunk() {
        let data: Vec<u8> = (0..10_000).map(|i| (i % 256) as u8).collect();
        // The reference result does not depend on the update size.
        let oneshot = gen_data_code_v0(&data, 64).unwrap();
        for chunk_size in [1, 256, 1024, 4096] {
            let streaming = stream_data(data.chunks(chunk_size), 64).unwrap();
            assert_eq!(
                streaming.iscc, oneshot.iscc,
                "chunk_size={chunk_size} mismatch"
            );
        }
    }

    #[test]
    fn test_data_hasher_default() {
        let streaming = DataHasher::default().finalize(64).unwrap();
        let oneshot = gen_data_code_v0(b"", 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
    }

    #[test]
    fn test_data_hasher_various_bits() {
        let data = b"test various bit widths for data code";
        for bits in [64, 128, 256] {
            let streaming = stream_data([&data[..]], bits).unwrap();
            let oneshot = gen_data_code_v0(data, bits).unwrap();
            assert_eq!(streaming.iscc, oneshot.iscc, "bits={bits}");
        }
    }

    #[test]
    fn test_data_hasher_conformance() {
        for (name, input_bytes, bits) in conformance_cases("gen_data_code_v0") {
            let oneshot = gen_data_code_v0(&input_bytes, bits)
                .unwrap_or_else(|e| panic!("gen_data_code_v0 failed for {name}: {e}"));

            // Whole input in a single update.
            let streaming = stream_data([input_bytes.as_slice()], bits)
                .unwrap_or_else(|e| panic!("DataHasher failed for {name}: {e}"));
            assert_eq!(
                streaming.iscc, oneshot.iscc,
                "ISCC mismatch in test case {name}"
            );

            // Same input split into 256-byte updates.
            let streaming2 = stream_data(input_bytes.chunks(256), bits)
                .unwrap_or_else(|e| panic!("DataHasher multi-chunk failed for {name}: {e}"));
            assert_eq!(
                streaming2.iscc, oneshot.iscc,
                "multi-chunk ISCC mismatch in test case {name}"
            );

            // Same input fed one byte per update.
            let streaming3 = stream_data(input_bytes.chunks(1), bits)
                .unwrap_or_else(|e| panic!("DataHasher byte-at-a-time failed for {name}: {e}"));
            assert_eq!(
                streaming3.iscc, oneshot.iscc,
                "byte-at-a-time ISCC mismatch in test case {name}"
            );
        }
    }
}