ipfrs_storage/utils.rs

//! Utility functions for testing, benchmarking, and batch operations

use bytes::Bytes;
use ipfrs_core::{Block, Cid, Result};
use sha2::{Digest, Sha256};
use std::collections::HashMap;

/// Compute a CID from raw data for testing and benchmarking
///
/// This creates a CIDv1 with a SHA-256 multihash and the raw codec
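///
/// # Example
///
/// A minimal illustrative sketch: identical data always hashes to the
/// same CID.
///
/// ```ignore
/// let cid_a = compute_cid(b"hello world");
/// let cid_b = compute_cid(b"hello world");
/// assert_eq!(cid_a, cid_b);
/// ```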
#[inline]
pub fn compute_cid(data: &[u8]) -> Cid {
    let hash = Sha256::digest(data);

    // Build the binary CIDv1: version (0x01), raw codec (0x55), then the
    // SHA-256 multihash (code 0x12, digest length 32, digest bytes).
    let mut cid_bytes = Vec::with_capacity(36);
    cid_bytes.push(0x01); // CIDv1
    cid_bytes.push(0x55); // raw codec
    cid_bytes.push(0x12); // SHA-256
    cid_bytes.push(32); // hash length
    cid_bytes.extend_from_slice(&hash);

    // These bytes always form a well-formed CIDv1, so parsing cannot fail.
    Cid::try_from(cid_bytes).expect("valid CIDv1 bytes")
}

/// Create a block from raw data for testing
#[inline]
pub fn create_block(data: Vec<u8>) -> Result<Block> {
    let cid = compute_cid(&data);
    Ok(Block::from_parts(cid, Bytes::from(data)))
}

/// Generate reproducible pseudo-random block data for testing
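///
/// # Example
///
/// Illustrative sketch: the same seed always reproduces the same bytes.
///
/// ```ignore
/// let a = generate_random_block(1024, 7);
/// let b = generate_random_block(1024, 7);
/// assert_eq!(a, b);
/// ```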
pub fn generate_random_block(size: usize, seed: u64) -> Vec<u8> {
    let mut rng = fastrand::Rng::with_seed(seed);
    let mut data = vec![0u8; size];

    // Fill eight bytes at a time; the final chunk may be shorter.
    for chunk in data.chunks_mut(8) {
        let bytes = rng.u64(..).to_le_bytes();
        chunk.copy_from_slice(&bytes[..chunk.len()]);
    }

    data
}

/// Generate compressible data (a repeating text pattern) for compression tests
pub fn generate_compressible_data(size: usize) -> Vec<u8> {
    let mut data = vec![0u8; size];
    let pattern = b"IPFS is a distributed file system. ";

    for (i, byte) in data.iter_mut().enumerate() {
        *byte = pattern[i % pattern.len()];
    }

    data
}

/// Generate incompressible data for testing
pub fn generate_incompressible_data(size: usize, seed: u64) -> Vec<u8> {
    generate_random_block(size, seed)
}

/// Create multiple blocks from raw data buffers
///
/// This is optimized for batch operations, computing all CIDs in a single pass.
/// Returns a HashMap mapping CIDs to Blocks for easy lookup
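///
/// # Example
///
/// Illustrative sketch (in a context returning `Result<()>`):
///
/// ```ignore
/// let blocks = create_blocks_batch(vec![b"a".to_vec(), b"b".to_vec()])?;
/// assert!(blocks.contains_key(&compute_cid(b"a")));
/// ```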
pub fn create_blocks_batch(data_vec: Vec<Vec<u8>>) -> Result<HashMap<Cid, Block>> {
    let mut blocks = HashMap::with_capacity(data_vec.len());

    for data in data_vec {
        let block = create_block(data)?;
        blocks.insert(*block.cid(), block);
    }

    Ok(blocks)
}

/// Generate multiple random blocks with sequential seeds
///
/// Seeds start at `start_seed` and increment by one per block, so the same
/// arguments always reproduce the same blocks
pub fn generate_random_blocks(count: usize, size: usize, start_seed: u64) -> Result<Vec<Block>> {
    let mut blocks = Vec::with_capacity(count);

    for i in 0..count {
        let seed = start_seed.wrapping_add(i as u64);
        let data = generate_random_block(size, seed);
        let block = create_block(data)?;
        blocks.push(block);
    }

    Ok(blocks)
}

/// Generate multiple compressible blocks for compression testing
///
/// Blocks cycle through a small set of text patterns so compression
/// efficiency can be compared across inputs
pub fn generate_compressible_blocks(count: usize, size: usize) -> Result<Vec<Block>> {
    let patterns = [
        b"IPFS is a distributed file system. ".to_vec(),
        b"Content addressing with CIDs. ".to_vec(),
        b"Merkle DAGs for data structures. ".to_vec(),
        b"Peer-to-peer networking protocol. ".to_vec(),
        b"Immutable data storage layer. ".to_vec(),
    ];

    let mut blocks = Vec::with_capacity(count);

    for i in 0..count {
        let pattern = &patterns[i % patterns.len()];
        let mut data = vec![0u8; size];

        for (j, byte) in data.iter_mut().enumerate() {
            *byte = pattern[j % pattern.len()];
        }

        let block = create_block(data)?;
        blocks.push(block);
    }

    Ok(blocks)
}

/// Create a test dataset with mixed block sizes
///
/// The arguments give the number of small (1KB), medium (64KB), and
/// large (1MB) blocks to generate; useful for testing size-dependent
/// optimizations
pub fn generate_mixed_size_blocks(small: usize, medium: usize, large: usize) -> Result<Vec<Block>> {
    let mut blocks = Vec::with_capacity(small + medium + large);
    let mut seed = 0u64;

    // Small blocks (1KB)
    for _ in 0..small {
        let data = generate_random_block(1024, seed);
        blocks.push(create_block(data)?);
        seed = seed.wrapping_add(1);
    }

    // Medium blocks (64KB)
    for _ in 0..medium {
        let data = generate_random_block(64 * 1024, seed);
        blocks.push(create_block(data)?);
        seed = seed.wrapping_add(1);
    }

    // Large blocks (1MB)
    for _ in 0..large {
        let data = generate_random_block(1024 * 1024, seed);
        blocks.push(create_block(data)?);
        seed = seed.wrapping_add(1);
    }

    Ok(blocks)
}

/// Create blocks with controlled deduplication characteristics
///
/// - `unique`: number of unique blocks
/// - `duplicate_factor`: how many times each unique block appears in the output
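///
/// # Example
///
/// Illustrative sketch (in a context returning `Result<()>`):
///
/// ```ignore
/// let blocks = generate_dedup_dataset(10, 3)?;
/// assert_eq!(blocks.len(), 30); // the 10 unique blocks repeat 3 times
/// assert_eq!(blocks[0].cid(), blocks[10].cid());
/// ```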
pub fn generate_dedup_dataset(unique: usize, duplicate_factor: usize) -> Result<Vec<Block>> {
    let mut blocks = Vec::new();

    // Generate unique blocks
    let unique_blocks = generate_random_blocks(unique, 4096, 42)?;

    // Duplicate them
    for _ in 0..duplicate_factor {
        blocks.extend(unique_blocks.iter().cloned());
    }

    Ok(blocks)
}

/// Extract CIDs from a collection of blocks
pub fn extract_cids(blocks: &[Block]) -> Vec<Cid> {
    blocks.iter().map(|b| *b.cid()).collect()
}

/// Compute total size of a block collection
pub fn compute_total_size(blocks: &[Block]) -> usize {
    blocks.iter().map(|b| b.data().len()).sum()
}

/// Group blocks by size ranges for analysis
///
/// Returns (small, medium, large) where:
/// - small: < 16KB
/// - medium: 16KB to < 256KB
/// - large: >= 256KB
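///
/// # Example
///
/// Illustrative sketch (in a context returning `Result<()>`):
///
/// ```ignore
/// let blocks = generate_mixed_size_blocks(3, 2, 1)?;
/// let (small, medium, large) = group_blocks_by_size(&blocks);
/// assert_eq!((small.len(), medium.len(), large.len()), (3, 2, 1));
/// ```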
pub fn group_blocks_by_size(blocks: &[Block]) -> (Vec<Block>, Vec<Block>, Vec<Block>) {
    let mut small = Vec::new();
    let mut medium = Vec::new();
    let mut large = Vec::new();

    for block in blocks {
        let size = block.data().len();
        if size < 16 * 1024 {
            small.push(block.clone());
        } else if size < 256 * 1024 {
            medium.push(block.clone());
        } else {
            large.push(block.clone());
        }
    }

    (small, medium, large)
}

/// Validate block integrity by recomputing the CID
///
/// Returns true if the block's CID matches the CID computed from its data
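///
/// # Example
///
/// Illustrative sketch: a freshly created block always validates.
///
/// ```ignore
/// let block = create_block(b"test data".to_vec())?;
/// assert!(validate_block_integrity(&block));
/// ```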
pub fn validate_block_integrity(block: &Block) -> bool {
    let computed_cid = compute_cid(block.data());
    computed_cid == *block.cid()
}

/// Batch validate block integrity for multiple blocks
///
/// Returns a vector of (CID, is_valid) tuples
pub fn validate_blocks_batch(blocks: &[Block]) -> Vec<(Cid, bool)> {
    blocks
        .iter()
        .map(|block| (*block.cid(), validate_block_integrity(block)))
        .collect()
}

/// Statistics for a collection of blocks
#[derive(Debug, Clone)]
pub struct BlockStatistics {
    /// Total number of blocks
    pub count: usize,
    /// Total size in bytes
    pub total_size: usize,
    /// Average block size
    pub avg_size: f64,
    /// Minimum block size
    pub min_size: usize,
    /// Maximum block size
    pub max_size: usize,
    /// Median block size (upper median for even counts)
    pub median_size: usize,
}

impl BlockStatistics {
    /// Compute statistics from a block collection
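    ///
    /// # Example
    ///
    /// Illustrative sketch (in a context returning `Result<()>`):
    ///
    /// ```ignore
    /// let blocks = generate_mixed_size_blocks(2, 3, 1)?;
    /// let stats = BlockStatistics::from_blocks(&blocks);
    /// assert_eq!(stats.count, 6);
    /// assert!(stats.min_size <= stats.median_size);
    /// ```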
    pub fn from_blocks(blocks: &[Block]) -> Self {
        if blocks.is_empty() {
            return Self {
                count: 0,
                total_size: 0,
                avg_size: 0.0,
                min_size: 0,
                max_size: 0,
                median_size: 0,
            };
        }

        let mut sizes: Vec<usize> = blocks.iter().map(|b| b.data().len()).collect();
        sizes.sort_unstable();

        let count = blocks.len();
        let total_size: usize = sizes.iter().sum();
        let avg_size = total_size as f64 / count as f64;
        let min_size = sizes[0];
        let max_size = sizes[count - 1];
        let median_size = sizes[count / 2];

        Self {
            count,
            total_size,
            avg_size,
            min_size,
            max_size,
            median_size,
        }
    }

    /// Estimate memory overhead (CID + metadata)
    pub fn estimated_memory_overhead(&self) -> usize {
        // Rough estimate: 64 bytes per block for CID and metadata
        self.count * 64
    }

    /// Total memory footprint (data + overhead)
    pub fn total_memory_footprint(&self) -> usize {
        self.total_size + self.estimated_memory_overhead()
    }
}

/// Filter blocks whose size is within `[min_size, max_size]` (inclusive)
pub fn filter_blocks_by_size(blocks: &[Block], min_size: usize, max_size: usize) -> Vec<Block> {
    blocks
        .iter()
        .filter(|block| {
            let size = block.data().len();
            size >= min_size && size <= max_size
        })
        .cloned()
        .collect()
}

/// Sort blocks by size (ascending)
pub fn sort_blocks_by_size_asc(blocks: &mut [Block]) {
    blocks.sort_by_key(|b| b.data().len());
}

/// Sort blocks by size (descending)
pub fn sort_blocks_by_size_desc(blocks: &mut [Block]) {
    blocks.sort_by_key(|b| std::cmp::Reverse(b.data().len()));
}

/// Find duplicate blocks (same CID)
///
/// Returns a HashMap mapping each CID that occurs more than once to its
/// occurrence count; CIDs that appear only once are omitted
pub fn find_duplicates(blocks: &[Block]) -> HashMap<Cid, usize> {
    let mut counts = HashMap::new();
    for block in blocks {
        *counts.entry(*block.cid()).or_insert(0) += 1;
    }
    counts.retain(|_, count| *count > 1);
    counts
}

/// Deduplicate blocks by CID (keep first occurrence)
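///
/// # Example
///
/// Illustrative sketch (in a context returning `Result<()>`):
///
/// ```ignore
/// let unique = generate_random_blocks(5, 1024, 42)?;
/// let mut all = unique.clone();
/// all.extend(unique.clone());
/// assert_eq!(find_duplicates(&all).len(), 5);
/// assert_eq!(deduplicate_blocks(&all).len(), 5);
/// ```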
pub fn deduplicate_blocks(blocks: &[Block]) -> Vec<Block> {
    let mut seen = std::collections::HashSet::new();
    let mut result = Vec::new();

    for block in blocks {
        if seen.insert(*block.cid()) {
            result.push(block.clone());
        }
    }

    result
}

/// Estimate the compressibility of a block via its byte entropy
///
/// Returns a value between 0.0 and 1.0; lower values indicate more
/// redundancy and therefore better expected compression
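///
/// # Example
///
/// Illustrative sketch: the estimate is the normalized Shannon entropy
/// H / 8, where H = -sum(p_i * log2(p_i)) over the byte frequencies, so
/// repetitive data scores lower than random data.
///
/// ```ignore
/// let repetitive = generate_compressible_data(4096);
/// let random = generate_random_block(4096, 42);
/// assert!(estimate_compression_ratio(&repetitive) < estimate_compression_ratio(&random));
/// ```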
pub fn estimate_compression_ratio(data: &[u8]) -> f64 {
    if data.is_empty() {
        // Treat empty data as having no compression potential.
        return 1.0;
    }

    // Simple entropy-based estimation: count byte frequencies
    let mut counts = [0u64; 256];
    for &byte in data {
        counts[byte as usize] += 1;
    }

    let len = data.len() as f64;
    let mut entropy = 0.0;

    for &count in &counts {
        if count > 0 {
            let p = count as f64 / len;
            entropy -= p * p.log2();
        }
    }

    // Normalize entropy to the 0-1 range (8 bits is the maximum for bytes)
    (entropy / 8.0).min(1.0)
}

/// Sample a subset of blocks for testing
///
/// Returns up to `count` blocks, evenly spaced across the input
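///
/// # Example
///
/// Illustrative sketch (in a context returning `Result<()>`):
///
/// ```ignore
/// let blocks = generate_random_blocks(100, 256, 1)?;
/// assert_eq!(sample_blocks(&blocks, 10).len(), 10);
/// assert_eq!(sample_blocks(&blocks, 200).len(), 100); // capped at the input length
/// ```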
pub fn sample_blocks(blocks: &[Block], count: usize) -> Vec<Block> {
    // Guard against division by zero below.
    if count == 0 {
        return Vec::new();
    }
    if blocks.len() <= count {
        return blocks.to_vec();
    }

    let step = blocks.len() / count;
    blocks.iter().step_by(step).take(count).cloned().collect()
}

/// Generate blocks with specific data patterns for testing
///
/// Supported patterns: "sequential", "random", "compressible", and "sparse";
/// any other value falls back to "random"
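///
/// # Example
///
/// Illustrative sketch: sparse blocks are mostly zeros.
///
/// ```ignore
/// let sparse = generate_pattern_blocks(1, 1024, "sparse")?;
/// let zeros = sparse[0].data().iter().filter(|&&b| b == 0).count();
/// assert!(zeros >= 1024 * 9 / 10); // at most 10% of bytes are set
/// ```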
pub fn generate_pattern_blocks(count: usize, size: usize, pattern: &str) -> Result<Vec<Block>> {
    match pattern {
        "sequential" => {
            let mut blocks = Vec::new();
            for i in 0..count {
                let mut data = vec![0u8; size];
                for (j, byte) in data.iter_mut().enumerate() {
                    *byte = ((i + j) % 256) as u8;
                }
                blocks.push(create_block(data)?);
            }
            Ok(blocks)
        }
        "random" => generate_random_blocks(count, size, 42),
        "compressible" => generate_compressible_blocks(count, size),
        "sparse" => {
            let mut blocks = Vec::new();
            // Sparse blocks use an entropy-seeded RNG, so they are not reproducible.
            let mut rng = fastrand::Rng::new();
            for _ in 0..count {
                let mut data = vec![0u8; size];
                // Set at most 10% of the bytes to non-zero values
                // (indices may repeat, so the true fraction can be lower).
                for _ in 0..size / 10 {
                    let idx = rng.usize(..size);
                    data[idx] = rng.u8(1..);
                }
                blocks.push(create_block(data)?);
            }
            Ok(blocks)
        }
        _ => generate_random_blocks(count, size, 42), // Default to random
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_compute_cid() {
        let data = b"hello world";
        let cid1 = compute_cid(data);
        let cid2 = compute_cid(data);

        // Same data should produce same CID
        assert_eq!(cid1, cid2);
    }

    #[test]
    fn test_create_block() {
        let data = b"hello world".to_vec();
        let block = create_block(data.clone()).unwrap();
        assert_eq!(block.data(), &data);
    }

    #[test]
    fn test_generate_random_block() {
        let block1 = generate_random_block(100, 42);
        let block2 = generate_random_block(100, 42);
        let block3 = generate_random_block(100, 43);

        assert_eq!(block1, block2); // Same seed = same data
        assert_ne!(block1, block3); // Different seed = different data
    }

    #[test]
    fn test_generate_compressible_data() {
        let data = generate_compressible_data(1000);
        assert_eq!(data.len(), 1000);

        // Check it has repeating pattern
        let pattern = b"IPFS is a distributed file system. ";
        for i in 0..10 {
            assert_eq!(data[i], pattern[i % pattern.len()]);
        }
    }

    #[test]
    fn test_create_blocks_batch() {
        let data_vec = vec![b"block1".to_vec(), b"block2".to_vec(), b"block3".to_vec()];

        let blocks = create_blocks_batch(data_vec).unwrap();
        assert_eq!(blocks.len(), 3);
    }

    #[test]
    fn test_generate_random_blocks() {
        let blocks = generate_random_blocks(10, 1024, 42).unwrap();
        assert_eq!(blocks.len(), 10);

        // All blocks should have the specified size
        for block in &blocks {
            assert_eq!(block.data().len(), 1024);
        }

        // Blocks should be unique (different seeds)
        let cid1 = blocks[0].cid();
        let cid2 = blocks[1].cid();
        assert_ne!(cid1, cid2);
    }

    #[test]
    fn test_generate_compressible_blocks() {
        let blocks = generate_compressible_blocks(5, 1024).unwrap();
        assert_eq!(blocks.len(), 5);

        for block in &blocks {
            assert_eq!(block.data().len(), 1024);
        }
    }

    #[test]
    fn test_generate_mixed_size_blocks() {
        let blocks = generate_mixed_size_blocks(2, 3, 1).unwrap();
        assert_eq!(blocks.len(), 6); // 2 + 3 + 1

        // Verify sizes
        assert_eq!(blocks[0].data().len(), 1024); // small
        assert_eq!(blocks[2].data().len(), 64 * 1024); // medium
        assert_eq!(blocks[5].data().len(), 1024 * 1024); // large
    }

    #[test]
    fn test_generate_dedup_dataset() {
        let blocks = generate_dedup_dataset(10, 3).unwrap();
        assert_eq!(blocks.len(), 30); // 10 unique * 3 duplicates

        // First 10 should match next 10 (duplicates)
        for i in 0..10 {
            assert_eq!(blocks[i].cid(), blocks[i + 10].cid());
        }
    }

    #[test]
    fn test_extract_cids() {
        let blocks = generate_random_blocks(5, 1024, 42).unwrap();
        let cids = extract_cids(&blocks);
        assert_eq!(cids.len(), 5);
        assert_eq!(cids[0], *blocks[0].cid());
    }

    #[test]
    fn test_compute_total_size() {
        let blocks = generate_mixed_size_blocks(2, 2, 1).unwrap();
        let total = compute_total_size(&blocks);

        // 2 * 1KB + 2 * 64KB + 1 * 1MB
        let expected = 2 * 1024 + 2 * 64 * 1024 + 1024 * 1024;
        assert_eq!(total, expected);
    }

    #[test]
    fn test_group_blocks_by_size() {
        let blocks = generate_mixed_size_blocks(3, 2, 1).unwrap();
        let (small, medium, large) = group_blocks_by_size(&blocks);

        assert_eq!(small.len(), 3);
        assert_eq!(medium.len(), 2);
        assert_eq!(large.len(), 1);
    }

    #[test]
    fn test_validate_block_integrity() {
        let data = b"test data".to_vec();
        let block = create_block(data).unwrap();
        assert!(validate_block_integrity(&block));
    }

    #[test]
    fn test_validate_blocks_batch() {
        let blocks = generate_random_blocks(5, 1024, 42).unwrap();
        let results = validate_blocks_batch(&blocks);
        assert_eq!(results.len(), 5);
        for (_, is_valid) in results {
            assert!(is_valid);
        }
    }

    #[test]
    fn test_block_statistics() {
        let blocks = generate_mixed_size_blocks(2, 3, 1).unwrap();
        let stats = BlockStatistics::from_blocks(&blocks);

        assert_eq!(stats.count, 6);
        assert!(stats.avg_size > 0.0);
        assert!(stats.min_size <= stats.max_size);
        assert!(stats.total_memory_footprint() > stats.total_size);
    }

    #[test]
    fn test_filter_blocks_by_size() {
        let blocks = generate_mixed_size_blocks(5, 5, 5).unwrap();
        let filtered = filter_blocks_by_size(&blocks, 2000, 100_000);

        // Should keep only the 64KB blocks; 1KB and 1MB fall outside the range
        for block in &filtered {
            let size = block.data().len();
            assert!(size >= 2000 && size <= 100_000);
        }
    }

    #[test]
    fn test_sort_blocks_by_size() {
        let mut blocks = generate_mixed_size_blocks(2, 2, 2).unwrap();

        sort_blocks_by_size_asc(&mut blocks);
        for i in 1..blocks.len() {
            assert!(blocks[i - 1].data().len() <= blocks[i].data().len());
        }

        sort_blocks_by_size_desc(&mut blocks);
        for i in 1..blocks.len() {
            assert!(blocks[i - 1].data().len() >= blocks[i].data().len());
        }
    }

    #[test]
    fn test_find_duplicates() {
        let unique_blocks = generate_random_blocks(5, 1024, 42).unwrap();
        let mut all_blocks = unique_blocks.clone();
        all_blocks.extend(unique_blocks.clone()); // Add duplicates

        let duplicates = find_duplicates(&all_blocks);
        assert_eq!(duplicates.len(), 5); // All 5 blocks appear twice

        for (_, count) in duplicates {
            assert_eq!(count, 2);
        }
    }

    #[test]
    fn test_deduplicate_blocks() {
        let unique_blocks = generate_random_blocks(5, 1024, 42).unwrap();
        let mut all_blocks = unique_blocks.clone();
        all_blocks.extend(unique_blocks.clone());

        let deduped = deduplicate_blocks(&all_blocks);
        assert_eq!(deduped.len(), 5);
    }

    #[test]
    fn test_estimate_compression_ratio() {
        // Compressible data (repeating pattern)
        let compressible = generate_compressible_data(1000);
        let compressible_ratio = estimate_compression_ratio(&compressible);

        // Random data (incompressible)
        let random = generate_random_block(1000, 42);
        let random_ratio = estimate_compression_ratio(&random);

        // Compressible data should have lower entropy (better compression potential)
        assert!(compressible_ratio < random_ratio);
    }

    #[test]
    fn test_sample_blocks() {
        let blocks = generate_random_blocks(100, 1024, 42).unwrap();

        let sample = sample_blocks(&blocks, 10);
        assert_eq!(sample.len(), 10);

        // Test sampling more than available
        let sample_all = sample_blocks(&blocks, 200);
        assert_eq!(sample_all.len(), 100);
    }

    #[test]
    fn test_generate_pattern_blocks() {
        let sequential = generate_pattern_blocks(5, 1024, "sequential").unwrap();
        assert_eq!(sequential.len(), 5);

        let random = generate_pattern_blocks(5, 1024, "random").unwrap();
        assert_eq!(random.len(), 5);

        let compressible = generate_pattern_blocks(5, 1024, "compressible").unwrap();
        assert_eq!(compressible.len(), 5);

        let sparse = generate_pattern_blocks(5, 1024, "sparse").unwrap();
        assert_eq!(sparse.len(), 5);

        // Test sparse pattern has mostly zeros
        let sparse_data = sparse[0].data();
        let zero_count = sparse_data.iter().filter(|&&b| b == 0).count();
        assert!(zero_count > sparse_data.len() * 8 / 10); // At least 80% zeros
    }
}