1use crate::types::{DataCodeResult, InstanceCodeResult};
9use crate::{IsccResult, cdc, codec, minhash};
10
11pub struct InstanceHasher {
16 hasher: blake3::Hasher,
17 filesize: u64,
18}
19
20impl InstanceHasher {
21 pub fn new() -> Self {
23 Self {
24 hasher: blake3::Hasher::new(),
25 filesize: 0,
26 }
27 }
28
29 pub fn update(&mut self, data: &[u8]) {
31 self.filesize += data.len() as u64;
32 self.hasher.update(data);
33 }
34
35 pub fn finalize(self, bits: u32) -> IsccResult<InstanceCodeResult> {
40 let digest = self.hasher.finalize();
41 let datahash = format!("1e20{}", hex::encode(digest.as_bytes()));
42 let component = codec::encode_component(
43 codec::MainType::Instance,
44 codec::SubType::None,
45 codec::Version::V0,
46 bits,
47 digest.as_bytes(),
48 )?;
49 Ok(InstanceCodeResult {
50 iscc: format!("ISCC:{component}"),
51 datahash,
52 filesize: self.filesize,
53 })
54 }
55}
56
57impl Default for InstanceHasher {
58 fn default() -> Self {
60 Self::new()
61 }
62}
63
/// Incremental hasher that produces a [`DataCodeResult`].
///
/// Input supplied through `update` is split into chunks; each completed chunk
/// is reduced to a 32-bit feature, and `finalize` condenses the collected
/// features into the final code.
pub struct DataHasher {
    /// One xxh32 value (seed 0) per completed chunk, in input order.
    chunk_features: Vec<u32>,
    /// Bytes of the last, not yet finalized chunk carried over between
    /// `update` calls.
    buf: Vec<u8>,
}
74
75impl DataHasher {
76 pub fn new() -> Self {
78 Self {
79 chunk_features: Vec::new(),
80 buf: Vec::new(),
81 }
82 }
83
84 pub fn update(&mut self, data: &[u8]) {
91 self.buf.extend_from_slice(data);
92
93 let chunks = cdc::alg_cdc_chunks_unchecked(&self.buf, false, cdc::DATA_AVG_CHUNK_SIZE);
94
95 let mut prev_chunk: Option<&[u8]> = None;
98 for chunk in &chunks {
99 if let Some(pc) = prev_chunk {
100 self.chunk_features.push(xxhash_rust::xxh32::xxh32(pc, 0));
101 }
102 prev_chunk = Some(chunk);
103 }
104
105 let tail_len = prev_chunk.map_or(0, |c| c.len());
107 drop(chunks);
108
109 let tail_start = self.buf.len() - tail_len;
111 self.buf.copy_within(tail_start.., 0);
112 self.buf.truncate(tail_len);
113 }
114
115 pub fn finalize(mut self, bits: u32) -> IsccResult<DataCodeResult> {
120 if !self.buf.is_empty() {
121 self.chunk_features
122 .push(xxhash_rust::xxh32::xxh32(&self.buf, 0));
123 } else if self.chunk_features.is_empty() {
124 self.chunk_features.push(xxhash_rust::xxh32::xxh32(b"", 0));
126 }
127
128 let digest = minhash::alg_minhash_256(&self.chunk_features);
129 let component = codec::encode_component(
130 codec::MainType::Data,
131 codec::SubType::None,
132 codec::Version::V0,
133 bits,
134 &digest,
135 )?;
136
137 Ok(DataCodeResult {
138 iscc: format!("ISCC:{component}"),
139 })
140 }
141}
142
143impl Default for DataHasher {
144 fn default() -> Self {
146 Self::new()
147 }
148}
149
#[cfg(test)]
mod tests {
    use super::*;
    use crate::{gen_data_code_v0, gen_instance_code_v0};

    // ---- shared helpers -------------------------------------------------

    /// Feeds every slice of `pieces`, in order, to a fresh `InstanceHasher`
    /// and finalizes it with `bits`.
    fn stream_instance<'a, I>(pieces: I, bits: u32) -> IsccResult<InstanceCodeResult>
    where
        I: IntoIterator<Item = &'a [u8]>,
    {
        let mut hasher = InstanceHasher::new();
        for piece in pieces {
            hasher.update(piece);
        }
        hasher.finalize(bits)
    }

    /// Feeds every slice of `pieces`, in order, to a fresh `DataHasher` and
    /// finalizes it with `bits`.
    fn stream_data<'a, I>(pieces: I, bits: u32) -> IsccResult<DataCodeResult>
    where
        I: IntoIterator<Item = &'a [u8]>,
    {
        let mut hasher = DataHasher::new();
        for piece in pieces {
            hasher.update(piece);
        }
        hasher.finalize(bits)
    }

    /// Parses the bundled conformance file and returns the object stored
    /// under `key`.
    fn load_section(key: &str) -> serde_json::Map<String, serde_json::Value> {
        let raw = include_str!("../tests/data.json");
        let parsed: serde_json::Value = serde_json::from_str(raw).unwrap();
        parsed[key].as_object().unwrap().clone()
    }

    /// Extracts `(input bytes, bits)` from one conformance case whose first
    /// input is a `stream:`-prefixed hex string.
    fn decode_case(name: &str, case: &serde_json::Value) -> (Vec<u8>, u32) {
        let inputs = case["inputs"].as_array().unwrap();
        let stream_str = inputs[0].as_str().unwrap();
        let bits = inputs[1].as_u64().unwrap() as u32;

        let hex_data = stream_str
            .strip_prefix("stream:")
            .unwrap_or_else(|| panic!("expected 'stream:' prefix in test case {name}"));
        let bytes = hex::decode(hex_data)
            .unwrap_or_else(|e| panic!("invalid hex in test case {name}: {e}"));

        (bytes, bits)
    }

    // ---- InstanceHasher -------------------------------------------------

    #[test]
    fn test_instance_hasher_empty() {
        let streaming = InstanceHasher::new().finalize(64).unwrap();
        let oneshot = gen_instance_code_v0(b"", 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
        assert_eq!(streaming.datahash, oneshot.datahash);
        assert_eq!(streaming.filesize, oneshot.filesize);
        assert_eq!(streaming.filesize, 0);
    }

    #[test]
    fn test_instance_hasher_small_data() {
        let payload = b"Hello, ISCC World!";
        let streaming = stream_instance([&payload[..]], 64).unwrap();
        let oneshot = gen_instance_code_v0(payload, 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
        assert_eq!(streaming.datahash, oneshot.datahash);
        assert_eq!(streaming.filesize, oneshot.filesize);
    }

    #[test]
    fn test_instance_hasher_multi_chunk() {
        let payload = b"The quick brown fox jumps over the lazy dog";
        let pieces = [&payload[..10], &payload[10..25], &payload[25..]];
        let streaming = stream_instance(pieces, 64).unwrap();
        let oneshot = gen_instance_code_v0(payload, 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
        assert_eq!(streaming.datahash, oneshot.datahash);
        assert_eq!(streaming.filesize, oneshot.filesize);
    }

    #[test]
    fn test_instance_hasher_byte_at_a_time() {
        let payload = b"streaming byte by byte";
        // `chunks(1)` yields one single-byte slice per input byte.
        let streaming = stream_instance(payload.chunks(1), 128).unwrap();
        let oneshot = gen_instance_code_v0(payload, 128).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
        assert_eq!(streaming.datahash, oneshot.datahash);
        assert_eq!(streaming.filesize, oneshot.filesize);
    }

    #[test]
    fn test_instance_hasher_default() {
        let streaming = InstanceHasher::default().finalize(64).unwrap();
        let oneshot = gen_instance_code_v0(b"", 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
    }

    #[test]
    fn test_instance_hasher_various_bits() {
        let payload = b"test various bit widths";
        for bits in [64, 128, 256] {
            let streaming = stream_instance([&payload[..]], bits).unwrap();
            let oneshot = gen_instance_code_v0(payload, bits).unwrap();
            assert_eq!(streaming.iscc, oneshot.iscc, "bits={bits}");
            assert_eq!(streaming.datahash, oneshot.datahash, "bits={bits}");
        }
    }

    #[test]
    fn test_instance_hasher_conformance() {
        let cases = load_section("gen_instance_code_v0");

        for (name, case) in &cases {
            let (input_bytes, bits) = decode_case(name, case);

            let oneshot = gen_instance_code_v0(&input_bytes, bits)
                .unwrap_or_else(|e| panic!("gen_instance_code_v0 failed for {name}: {e}"));

            // Whole input in a single update.
            let streaming = stream_instance([input_bytes.as_slice()], bits)
                .unwrap_or_else(|e| panic!("InstanceHasher failed for {name}: {e}"));

            assert_eq!(
                streaming.iscc, oneshot.iscc,
                "ISCC mismatch in test case {name}"
            );
            assert_eq!(
                streaming.datahash, oneshot.datahash,
                "datahash mismatch in test case {name}"
            );
            assert_eq!(
                streaming.filesize, oneshot.filesize,
                "filesize mismatch in test case {name}"
            );

            // Same input delivered in 256-byte pieces.
            let streaming2 = stream_instance(input_bytes.chunks(256), bits)
                .unwrap_or_else(|e| panic!("InstanceHasher multi-chunk failed for {name}: {e}"));

            assert_eq!(
                streaming2.iscc, oneshot.iscc,
                "multi-chunk ISCC mismatch in test case {name}"
            );
            assert_eq!(
                streaming2.datahash, oneshot.datahash,
                "multi-chunk datahash mismatch in test case {name}"
            );
        }
    }

    // ---- DataHasher -----------------------------------------------------

    #[test]
    fn test_data_hasher_empty() {
        let streaming = DataHasher::new().finalize(64).unwrap();
        let oneshot = gen_data_code_v0(b"", 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
    }

    #[test]
    fn test_data_hasher_small_data() {
        let payload = b"Hello, ISCC World!";
        let streaming = stream_data([&payload[..]], 64).unwrap();
        let oneshot = gen_data_code_v0(payload, 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
    }

    #[test]
    fn test_data_hasher_multi_chunk_small() {
        let payload = b"The quick brown fox jumps over the lazy dog";
        let pieces = [&payload[..10], &payload[10..25], &payload[25..]];
        let streaming = stream_data(pieces, 64).unwrap();
        let oneshot = gen_data_code_v0(payload, 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
    }

    #[test]
    fn test_data_hasher_byte_at_a_time() {
        let payload = b"streaming byte by byte";
        // `chunks(1)` yields one single-byte slice per input byte.
        let streaming = stream_data(payload.chunks(1), 64).unwrap();
        let oneshot = gen_data_code_v0(payload, 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
    }

    #[test]
    fn test_data_hasher_large_data_multi_chunk() {
        let payload: Vec<u8> = (0..10_000).map(|i| (i % 256) as u8).collect();
        for chunk_size in [1, 256, 1024, 4096] {
            let streaming = stream_data(payload.chunks(chunk_size), 64).unwrap();
            let oneshot = gen_data_code_v0(&payload, 64).unwrap();
            assert_eq!(
                streaming.iscc, oneshot.iscc,
                "chunk_size={chunk_size} mismatch"
            );
        }
    }

    #[test]
    fn test_data_hasher_default() {
        let streaming = DataHasher::default().finalize(64).unwrap();
        let oneshot = gen_data_code_v0(b"", 64).unwrap();
        assert_eq!(streaming.iscc, oneshot.iscc);
    }

    #[test]
    fn test_data_hasher_various_bits() {
        let payload = b"test various bit widths for data code";
        for bits in [64, 128, 256] {
            let streaming = stream_data([&payload[..]], bits).unwrap();
            let oneshot = gen_data_code_v0(payload, bits).unwrap();
            assert_eq!(streaming.iscc, oneshot.iscc, "bits={bits}");
        }
    }

    #[test]
    fn test_data_hasher_conformance() {
        let cases = load_section("gen_data_code_v0");

        for (name, case) in &cases {
            let (input_bytes, bits) = decode_case(name, case);

            let oneshot = gen_data_code_v0(&input_bytes, bits)
                .unwrap_or_else(|e| panic!("gen_data_code_v0 failed for {name}: {e}"));

            // Whole input in a single update.
            let streaming = stream_data([input_bytes.as_slice()], bits)
                .unwrap_or_else(|e| panic!("DataHasher failed for {name}: {e}"));

            assert_eq!(
                streaming.iscc, oneshot.iscc,
                "ISCC mismatch in test case {name}"
            );

            // Same input delivered in 256-byte pieces.
            let streaming2 = stream_data(input_bytes.chunks(256), bits)
                .unwrap_or_else(|e| panic!("DataHasher multi-chunk failed for {name}: {e}"));

            assert_eq!(
                streaming2.iscc, oneshot.iscc,
                "multi-chunk ISCC mismatch in test case {name}"
            );

            // Same input delivered one byte per update.
            let streaming3 = stream_data(input_bytes.chunks(1), bits)
                .unwrap_or_else(|e| panic!("DataHasher byte-at-a-time failed for {name}: {e}"));

            assert_eq!(
                streaming3.iscc, oneshot.iscc,
                "byte-at-a-time ISCC mismatch in test case {name}"
            );
        }
    }
}