//! Utilities for generating, validating, and analyzing blocks in tests and benchmarks.

use bytes::Bytes;
use ipfrs_core::{Block, Cid, Result};
use sha2::{Digest, Sha256};
use std::collections::HashMap;

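/// Compute a CID for `data` from its SHA-256 digest.
///
/// The digest is wrapped in a multihash (`0x12`, the sha2-256 multicodec
/// code; `32`, the digest length; then the digest itself) before conversion.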
#[inline]
pub fn compute_cid(data: &[u8]) -> Cid {
    let hash = Sha256::digest(data);

    // Multihash layout: <code> <length> <digest>.
    let mut multihash = Vec::with_capacity(34);
    multihash.push(0x12);
    multihash.push(32);
    multihash.extend_from_slice(&hash);

    // The multihash built above is always well-formed, so conversion cannot
    // fail for any input.
    Cid::try_from(multihash).expect("sha2-256 multihash should form a valid CID")
}

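/// Build a [`Block`] from raw bytes, deriving its CID from the data.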
#[inline]
pub fn create_block(data: Vec<u8>) -> Result<Block> {
    let cid = compute_cid(&data);
    Ok(Block::from_parts(cid, Bytes::from(data)))
}

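/// Generate `size` bytes of deterministic pseudo-random data from `seed`.
///
/// The same `(size, seed)` pair always produces the same bytes, keeping
/// benchmark runs reproducible.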
pub fn generate_random_block(size: usize, seed: u64) -> Vec<u8> {
    let mut rng = fastrand::Rng::with_seed(seed);
    let mut data = vec![0u8; size];

    // Fill eight bytes at a time; the final chunk may be shorter.
    for chunk in data.chunks_mut(8) {
        let bytes = rng.u64(..).to_le_bytes();
        chunk.copy_from_slice(&bytes[..chunk.len()]);
    }

    data
}

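/// Generate `size` bytes of highly compressible data by repeating a short
/// ASCII sentence.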
pub fn generate_compressible_data(size: usize) -> Vec<u8> {
    let mut data = vec![0u8; size];
    let pattern = b"IPFS is a distributed file system. ";

    for (i, byte) in data.iter_mut().enumerate() {
        *byte = pattern[i % pattern.len()];
    }

    data
}

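/// Generate `size` bytes of effectively incompressible (pseudo-random) data.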
pub fn generate_incompressible_data(size: usize, seed: u64) -> Vec<u8> {
    generate_random_block(size, seed)
}

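/// Create a block for each payload and index the results by CID.
///
/// Identical payloads hash to the same CID, so duplicates collapse into a
/// single map entry.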
pub fn create_blocks_batch(data_vec: Vec<Vec<u8>>) -> Result<HashMap<Cid, Block>> {
    let mut blocks = HashMap::with_capacity(data_vec.len());

    for data in data_vec {
        let block = create_block(data)?;
        blocks.insert(*block.cid(), block);
    }

    Ok(blocks)
}

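/// Generate `count` random blocks of `size` bytes, seeding block `i` with
/// `start_seed + i` so the blocks are distinct but reproducible.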
pub fn generate_random_blocks(count: usize, size: usize, start_seed: u64) -> Result<Vec<Block>> {
    let mut blocks = Vec::with_capacity(count);

    for i in 0..count {
        let seed = start_seed.wrapping_add(i as u64);
        let data = generate_random_block(size, seed);
        let block = create_block(data)?;
        blocks.push(block);
    }

    Ok(blocks)
}

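/// Generate `count` compressible blocks of `size` bytes, cycling through a
/// small set of text patterns so not every block is identical.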
pub fn generate_compressible_blocks(count: usize, size: usize) -> Result<Vec<Block>> {
    let patterns = [
        b"IPFS is a distributed file system. ".to_vec(),
        b"Content addressing with CIDs. ".to_vec(),
        b"Merkle DAGs for data structures. ".to_vec(),
        b"Peer-to-peer networking protocol. ".to_vec(),
        b"Immutable data storage layer. ".to_vec(),
    ];

    let mut blocks = Vec::with_capacity(count);

    for i in 0..count {
        let pattern = &patterns[i % patterns.len()];
        let mut data = vec![0u8; size];

        for (j, byte) in data.iter_mut().enumerate() {
            *byte = pattern[j % pattern.len()];
        }

        let block = create_block(data)?;
        blocks.push(block);
    }

    Ok(blocks)
}

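/// Generate a mixed-size workload: `small` 1 KiB blocks, then `medium`
/// 64 KiB blocks, then `large` 1 MiB blocks.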
pub fn generate_mixed_size_blocks(small: usize, medium: usize, large: usize) -> Result<Vec<Block>> {
    let mut blocks = Vec::with_capacity(small + medium + large);
    let mut seed = 0u64;

    // 1 KiB blocks.
    for _ in 0..small {
        let data = generate_random_block(1024, seed);
        blocks.push(create_block(data)?);
        seed = seed.wrapping_add(1);
    }

    // 64 KiB blocks.
    for _ in 0..medium {
        let data = generate_random_block(64 * 1024, seed);
        blocks.push(create_block(data)?);
        seed = seed.wrapping_add(1);
    }

    // 1 MiB blocks.
    for _ in 0..large {
        let data = generate_random_block(1024 * 1024, seed);
        blocks.push(create_block(data)?);
        seed = seed.wrapping_add(1);
    }

    Ok(blocks)
}

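/// Generate a dataset for deduplication benchmarks: `unique` distinct 4 KiB
/// blocks, with the whole set repeated `duplicate_factor` times.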
pub fn generate_dedup_dataset(unique: usize, duplicate_factor: usize) -> Result<Vec<Block>> {
    let mut blocks = Vec::with_capacity(unique * duplicate_factor);

    let unique_blocks = generate_random_blocks(unique, 4096, 42)?;

    for _ in 0..duplicate_factor {
        blocks.extend(unique_blocks.iter().cloned());
    }

    Ok(blocks)
}

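/// Collect the CIDs of `blocks`, preserving order.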
pub fn extract_cids(blocks: &[Block]) -> Vec<Cid> {
    blocks.iter().map(|b| *b.cid()).collect()
}

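/// Sum the payload sizes of `blocks`, in bytes.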
pub fn compute_total_size(blocks: &[Block]) -> usize {
    blocks.iter().map(|b| b.data().len()).sum()
}

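/// Partition blocks into (small, medium, large) buckets: small is under
/// 16 KiB, medium is under 256 KiB, and large is everything bigger.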
pub fn group_blocks_by_size(blocks: &[Block]) -> (Vec<Block>, Vec<Block>, Vec<Block>) {
    let mut small = Vec::new();
    let mut medium = Vec::new();
    let mut large = Vec::new();

    for block in blocks {
        let size = block.data().len();
        if size < 16 * 1024 {
            small.push(block.clone());
        } else if size < 256 * 1024 {
            medium.push(block.clone());
        } else {
            large.push(block.clone());
        }
    }

    (small, medium, large)
}

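/// Check that a block's stored CID matches the CID recomputed from its data.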
pub fn validate_block_integrity(block: &Block) -> bool {
    let computed_cid = compute_cid(block.data());
    computed_cid == *block.cid()
}

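/// Validate every block, pairing each CID with its integrity-check result.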
222pub fn validate_blocks_batch(blocks: &[Block]) -> Vec<(Cid, bool)> {
226 blocks
227 .iter()
228 .map(|block| (*block.cid(), validate_block_integrity(block)))
229 .collect()
230}
231
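/// Summary statistics over the payload sizes of a set of blocks.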
#[derive(Debug, Clone)]
pub struct BlockStatistics {
    /// Number of blocks.
    pub count: usize,
    /// Total payload size in bytes.
    pub total_size: usize,
    /// Mean payload size in bytes.
    pub avg_size: f64,
    /// Smallest payload size in bytes.
    pub min_size: usize,
    /// Largest payload size in bytes.
    pub max_size: usize,
    /// Median payload size in bytes.
    pub median_size: usize,
}

impl BlockStatistics {
    /// Compute statistics over `blocks`; returns an all-zero summary for an
    /// empty slice.
    pub fn from_blocks(blocks: &[Block]) -> Self {
        if blocks.is_empty() {
            return Self {
                count: 0,
                total_size: 0,
                avg_size: 0.0,
                min_size: 0,
                max_size: 0,
                median_size: 0,
            };
        }

        let mut sizes: Vec<usize> = blocks.iter().map(|b| b.data().len()).collect();
        sizes.sort_unstable();

        let count = blocks.len();
        let total_size: usize = sizes.iter().sum();
        let avg_size = total_size as f64 / count as f64;
        let min_size = sizes[0];
        let max_size = sizes[count - 1];
        let median_size = sizes[count / 2];

        Self {
            count,
            total_size,
            avg_size,
            min_size,
            max_size,
            median_size,
        }
    }

    /// Rough bookkeeping overhead, assuming about 64 bytes per block for the
    /// CID and container metadata.
    pub fn estimated_memory_overhead(&self) -> usize {
        self.count * 64
    }

    /// Total payload size plus the estimated bookkeeping overhead.
    pub fn total_memory_footprint(&self) -> usize {
        self.total_size + self.estimated_memory_overhead()
    }
}

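/// Return clones of the blocks whose payload size lies in `min_size..=max_size`.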
pub fn filter_blocks_by_size(blocks: &[Block], min_size: usize, max_size: usize) -> Vec<Block> {
    blocks
        .iter()
        .filter(|block| {
            let size = block.data().len();
            size >= min_size && size <= max_size
        })
        .cloned()
        .collect()
}

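/// Sort blocks by payload size, smallest first.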
pub fn sort_blocks_by_size_asc(blocks: &mut [Block]) {
    blocks.sort_by_key(|b| b.data().len());
}

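/// Sort blocks by payload size, largest first.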
pub fn sort_blocks_by_size_desc(blocks: &mut [Block]) {
    blocks.sort_by_key(|b| std::cmp::Reverse(b.data().len()));
}

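/// Count occurrences of each CID, returning only CIDs that appear more than
/// once together with their counts.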
pub fn find_duplicates(blocks: &[Block]) -> HashMap<Cid, usize> {
    let mut counts = HashMap::new();
    for block in blocks {
        *counts.entry(*block.cid()).or_insert(0) += 1;
    }
    counts.retain(|_, count| *count > 1);
    counts
}

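/// Drop duplicate blocks by CID, keeping the first occurrence of each and
/// preserving the original order.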
pub fn deduplicate_blocks(blocks: &[Block]) -> Vec<Block> {
    let mut seen = std::collections::HashSet::new();
    let mut result = Vec::new();

    for block in blocks {
        if seen.insert(*block.cid()) {
            result.push(block.clone());
        }
    }

    result
}

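/// Estimate compressibility from the Shannon entropy of the byte histogram.
///
/// Returns entropy in bits per byte divided by 8, clamped to 1.0: values near
/// 0.0 indicate highly compressible data, values near 1.0 effectively
/// incompressible data. This order-0 estimate ignores repeated substrings, so
/// real compressors may do considerably better.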
pub fn estimate_compression_ratio(data: &[u8]) -> f64 {
    if data.is_empty() {
        return 1.0;
    }

    // Histogram of byte values.
    let mut counts = [0u64; 256];
    for &byte in data {
        counts[byte as usize] += 1;
    }

    // Shannon entropy in bits per byte: -sum(p * log2(p)).
    let len = data.len() as f64;
    let mut entropy = 0.0;

    for &count in &counts {
        if count > 0 {
            let p = count as f64 / len;
            entropy -= p * p.log2();
        }
    }

    // A byte holds at most 8 bits, so entropy / 8 approximates the best
    // achievable ratio under an order-0 model.
    (entropy / 8.0).min(1.0)
}

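/// Take an evenly spaced sample of up to `count` blocks; if `count` meets or
/// exceeds the input length, all blocks are returned.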
pub fn sample_blocks(blocks: &[Block], count: usize) -> Vec<Block> {
    // Guard against division by zero below when no samples are requested.
    if count == 0 {
        return Vec::new();
    }

    if blocks.len() <= count {
        return blocks.to_vec();
    }

    let step = blocks.len() / count;
    blocks.iter().step_by(step).take(count).cloned().collect()
}

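/// Generate blocks following a named fill pattern: "sequential" (byte value
/// increases with position), "random", "compressible", or "sparse" (mostly
/// zeros with roughly 10% random nonzero bytes). Unknown names fall back to
/// "random".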
pub fn generate_pattern_blocks(count: usize, size: usize, pattern: &str) -> Result<Vec<Block>> {
    match pattern {
        "sequential" => {
            let mut blocks = Vec::new();
            for i in 0..count {
                let mut data = vec![0u8; size];
                for (j, byte) in data.iter_mut().enumerate() {
                    *byte = ((i + j) % 256) as u8;
                }
                blocks.push(create_block(data)?);
            }
            Ok(blocks)
        }
        "random" => generate_random_blocks(count, size, 42),
        "compressible" => generate_compressible_blocks(count, size),
        "sparse" => {
            let mut blocks = Vec::new();
            for _ in 0..count {
                let mut data = vec![0u8; size];
                // Note: unlike the other patterns, this RNG is unseeded, so
                // sparse blocks differ from run to run.
                let mut rng = fastrand::Rng::new();
                // Set roughly 10% of the bytes to a random nonzero value.
                for _ in 0..size / 10 {
                    let idx = rng.usize(..size);
                    data[idx] = rng.u8(1..);
                }
                blocks.push(create_block(data)?);
            }
            Ok(blocks)
        }
        _ => generate_random_blocks(count, size, 42),
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_compute_cid() {
        let data = b"hello world";
        let cid1 = compute_cid(data);
        let cid2 = compute_cid(data);

        assert_eq!(cid1, cid2);
    }

    #[test]
    fn test_create_block() {
        let data = b"hello world".to_vec();
        let block = create_block(data.clone()).unwrap();
        assert_eq!(block.data(), &data);
    }

    #[test]
    fn test_generate_random_block() {
        let block1 = generate_random_block(100, 42);
        let block2 = generate_random_block(100, 42);
        let block3 = generate_random_block(100, 43);

        assert_eq!(block1, block2);
        assert_ne!(block1, block3);
    }

    #[test]
    fn test_generate_compressible_data() {
        let data = generate_compressible_data(1000);
        assert_eq!(data.len(), 1000);

        let pattern = b"IPFS is a distributed file system. ";
        for i in 0..10 {
            assert_eq!(data[i], pattern[i % pattern.len()]);
        }
    }

    #[test]
    fn test_create_blocks_batch() {
        let data_vec = vec![b"block1".to_vec(), b"block2".to_vec(), b"block3".to_vec()];

        let blocks = create_blocks_batch(data_vec).unwrap();
        assert_eq!(blocks.len(), 3);
    }

    #[test]
    fn test_generate_random_blocks() {
        let blocks = generate_random_blocks(10, 1024, 42).unwrap();
        assert_eq!(blocks.len(), 10);

        for block in &blocks {
            assert_eq!(block.data().len(), 1024);
        }

        let cid1 = blocks[0].cid();
        let cid2 = blocks[1].cid();
        assert_ne!(cid1, cid2);
    }

    #[test]
    fn test_generate_compressible_blocks() {
        let blocks = generate_compressible_blocks(5, 1024).unwrap();
        assert_eq!(blocks.len(), 5);

        for block in &blocks {
            assert_eq!(block.data().len(), 1024);
        }
    }

    #[test]
    fn test_generate_mixed_size_blocks() {
        let blocks = generate_mixed_size_blocks(2, 3, 1).unwrap();
        assert_eq!(blocks.len(), 6);
        assert_eq!(blocks[0].data().len(), 1024);
        assert_eq!(blocks[2].data().len(), 64 * 1024);
        assert_eq!(blocks[5].data().len(), 1024 * 1024);
    }

    #[test]
    fn test_generate_dedup_dataset() {
        let blocks = generate_dedup_dataset(10, 3).unwrap();
        assert_eq!(blocks.len(), 30);

        for i in 0..10 {
            assert_eq!(blocks[i].cid(), blocks[i + 10].cid());
        }
    }

    #[test]
    fn test_extract_cids() {
        let blocks = generate_random_blocks(5, 1024, 42).unwrap();
        let cids = extract_cids(&blocks);
        assert_eq!(cids.len(), 5);
        assert_eq!(cids[0], *blocks[0].cid());
    }

    #[test]
    fn test_compute_total_size() {
        let blocks = generate_mixed_size_blocks(2, 2, 1).unwrap();
        let total = compute_total_size(&blocks);

        let expected = 2 * 1024 + 2 * 64 * 1024 + 1024 * 1024;
        assert_eq!(total, expected);
    }

    #[test]
    fn test_group_blocks_by_size() {
        let blocks = generate_mixed_size_blocks(3, 2, 1).unwrap();
        let (small, medium, large) = group_blocks_by_size(&blocks);

        assert_eq!(small.len(), 3);
        assert_eq!(medium.len(), 2);
        assert_eq!(large.len(), 1);
    }

    #[test]
    fn test_validate_block_integrity() {
        let data = b"test data".to_vec();
        let block = create_block(data).unwrap();
        assert!(validate_block_integrity(&block));
    }

    #[test]
    fn test_validate_blocks_batch() {
        let blocks = generate_random_blocks(5, 1024, 42).unwrap();
        let results = validate_blocks_batch(&blocks);
        assert_eq!(results.len(), 5);
        for (_, is_valid) in results {
            assert!(is_valid);
        }
    }

    #[test]
    fn test_block_statistics() {
        let blocks = generate_mixed_size_blocks(2, 3, 1).unwrap();
        let stats = BlockStatistics::from_blocks(&blocks);

        assert_eq!(stats.count, 6);
        assert!(stats.avg_size > 0.0);
        assert!(stats.min_size <= stats.max_size);
        assert!(stats.total_memory_footprint() > stats.total_size);
    }

    #[test]
    fn test_filter_blocks_by_size() {
        let blocks = generate_mixed_size_blocks(5, 5, 5).unwrap();
        let filtered = filter_blocks_by_size(&blocks, 2000, 100_000);

        for block in &filtered {
            let size = block.data().len();
            assert!(size >= 2000 && size <= 100_000);
        }
    }

    #[test]
    fn test_sort_blocks_by_size() {
        let mut blocks = generate_mixed_size_blocks(2, 2, 2).unwrap();

        sort_blocks_by_size_asc(&mut blocks);
        for i in 1..blocks.len() {
            assert!(blocks[i - 1].data().len() <= blocks[i].data().len());
        }

        sort_blocks_by_size_desc(&mut blocks);
        for i in 1..blocks.len() {
            assert!(blocks[i - 1].data().len() >= blocks[i].data().len());
        }
    }

    #[test]
    fn test_find_duplicates() {
        let unique_blocks = generate_random_blocks(5, 1024, 42).unwrap();
        let mut all_blocks = unique_blocks.clone();
        all_blocks.extend(unique_blocks.clone());

        let duplicates = find_duplicates(&all_blocks);
        assert_eq!(duplicates.len(), 5);
        for (_, count) in duplicates {
            assert_eq!(count, 2);
        }
    }

    #[test]
    fn test_deduplicate_blocks() {
        let unique_blocks = generate_random_blocks(5, 1024, 42).unwrap();
        let mut all_blocks = unique_blocks.clone();
        all_blocks.extend(unique_blocks.clone());

        let deduped = deduplicate_blocks(&all_blocks);
        assert_eq!(deduped.len(), 5);
    }

    #[test]
    fn test_estimate_compression_ratio() {
        let compressible = generate_compressible_data(1000);
        let compressible_ratio = estimate_compression_ratio(&compressible);

        let random = generate_random_block(1000, 42);
        let random_ratio = estimate_compression_ratio(&random);

        assert!(compressible_ratio < random_ratio);
    }

    #[test]
    fn test_sample_blocks() {
        let blocks = generate_random_blocks(100, 1024, 42).unwrap();

        let sample = sample_blocks(&blocks, 10);
        assert_eq!(sample.len(), 10);

        let sample_all = sample_blocks(&blocks, 200);
        assert_eq!(sample_all.len(), 100);
    }

    #[test]
    fn test_generate_pattern_blocks() {
        let sequential = generate_pattern_blocks(5, 1024, "sequential").unwrap();
        assert_eq!(sequential.len(), 5);

        let random = generate_pattern_blocks(5, 1024, "random").unwrap();
        assert_eq!(random.len(), 5);

        let compressible = generate_pattern_blocks(5, 1024, "compressible").unwrap();
        assert_eq!(compressible.len(), 5);

        let sparse = generate_pattern_blocks(5, 1024, "sparse").unwrap();
        assert_eq!(sparse.len(), 5);

        // Sparse blocks are ~90% zeros; allow slack down to 80% for overlap
        // between the randomly chosen indices.
        let sparse_data = sparse[0].data();
        let zero_count = sparse_data.iter().filter(|&&b| b == 0).count();
        assert!(zero_count > sparse_data.len() * 8 / 10);
    }
}