use crate::error::Result;
use crate::hash::Hash;
use crate::object::ChunkEntry;
/// Size parameters for content-defined chunking.
///
/// All three values are byte counts; `chunk_file` forwards them to FastCDC
/// as the minimum, average and maximum chunk size.
#[derive(Debug, Clone)]
pub struct ChunkerConfig {
    /// Minimum chunk size in bytes. The final chunk of an input may still be
    /// shorter than this (an input smaller than `min_size` becomes one chunk).
    pub min_size: usize,
    /// Average chunk size in bytes passed to the chunker.
    pub avg_size: usize,
    /// Maximum chunk size in bytes.
    pub max_size: usize,
}
impl Default for ChunkerConfig {
fn default() -> Self {
Self {
min_size: 128 * 1024, avg_size: 512 * 1024, max_size: 1024 * 1024, }
}
}
/// Splits `data` into content-defined chunks using FastCDC (2020 variant).
///
/// Each returned [`ChunkEntry`] holds the hash of one chunk's bytes and that
/// chunk's length in bytes. Entries are produced in the order the chunks
/// occur in `data`. Empty input yields an empty vector.
///
/// # Errors
///
/// This implementation never returns `Err`; the `Result` return type is part
/// of the existing interface.
///
/// # Panics
///
/// Panics if any size in `config` does not fit in a `u32`, which is the type
/// FastCDC takes its size parameters in. `FastCDC::new` may additionally
/// reject sizes outside its supported ranges — NOTE(review): confirm the
/// exact bounds against the fastcdc documentation.
pub fn chunk_file(data: &[u8], config: &ChunkerConfig) -> Result<Vec<ChunkEntry>> {
    use fastcdc::v2020::FastCDC;
    use std::convert::TryFrom;

    // A plain `as u32` cast would silently wrap an oversized `usize`, turning
    // an invalid configuration into a different, valid-looking one. Convert
    // with a range check so a bad config fails loudly instead.
    let to_u32 = |value: usize, field: &str| -> u32 {
        u32::try_from(value).unwrap_or_else(|_| {
            panic!(
                "ChunkerConfig::{} ({}) does not fit in u32",
                field, value
            )
        })
    };
    let min_size = to_u32(config.min_size, "min_size");
    let avg_size = to_u32(config.avg_size, "avg_size");
    let max_size = to_u32(config.max_size, "max_size");

    let chunks = FastCDC::new(data, min_size, avg_size, max_size)
        .map(|chunk| {
            // The chunker reports offsets/lengths relative to `data`.
            let bytes = &data[chunk.offset..chunk.offset + chunk.length];
            ChunkEntry {
                hash: Hash::hash_bytes(bytes),
                // usize -> u64 is a widening conversion on supported targets.
                size: chunk.length as u64,
            }
        })
        .collect();

    Ok(chunks)
}
#[cfg(test)]
mod tests {
    use super::*;
    use proptest::prelude::*;

    /// Input length used by the fixed-size unit tests (2 MiB).
    const TWO_MIB: usize = 2 * 1024 * 1024;

    /// Asserts the chunk-size invariants: every chunk is at most `max_size`,
    /// and every chunk except the final one is at least `min_size`.
    ///
    /// The final chunk is exempt from the lower bound because it holds
    /// whatever bytes remain (see `test_small_file`, where a 100 KiB input
    /// produces a single chunk below the 128 KiB minimum).
    fn assert_chunk_sizes_bounded(chunks: &[ChunkEntry], config: &ChunkerConfig) {
        let last_index = chunks.len().saturating_sub(1);
        for (i, chunk) in chunks.iter().enumerate() {
            assert!(
                chunk.size <= config.max_size as u64,
                "Chunk size {} exceeds maximum {}",
                chunk.size,
                config.max_size
            );
            if i != last_index {
                assert!(
                    chunk.size >= config.min_size as u64,
                    "Chunk size {} is below minimum {}",
                    chunk.size,
                    config.min_size
                );
            }
        }
    }

    /// Counts the chunks of `original` whose hash also occurs in `modified`.
    fn count_reused_chunks(original: &[ChunkEntry], modified: &[ChunkEntry]) -> usize {
        original
            .iter()
            .filter(|orig| modified.iter().any(|candidate| candidate.hash == orig.hash))
            .count()
    }

    /// A 2 MiB input splits into at least two chunks that cover the whole
    /// input and respect the configured size bounds.
    #[test]
    fn test_chunk_file_basic() {
        let data = vec![0u8; TWO_MIB];
        let config = ChunkerConfig::default();
        let chunks = chunk_file(&data, &config).unwrap();

        assert!(
            chunks.len() >= 2,
            "Expected at least 2 chunks, got {}",
            chunks.len()
        );

        let total_size: u64 = chunks.iter().map(|c| c.size).sum();
        assert_eq!(total_size, data.len() as u64);

        assert_chunk_sizes_bounded(&chunks, &config);
    }

    /// Size bounds hold for a repeating 0..=255 byte pattern.
    #[test]
    fn test_chunk_boundaries() {
        let data: Vec<u8> = (0..TWO_MIB).map(|i| (i % 256) as u8).collect();
        let config = ChunkerConfig::default();
        let chunks = chunk_file(&data, &config).unwrap();

        assert_chunk_sizes_bounded(&chunks, &config);
    }

    /// Chunking the same input twice yields identical hashes and sizes.
    #[test]
    fn test_deterministic() {
        let data = vec![42u8; TWO_MIB];
        let config = ChunkerConfig::default();
        let chunks1 = chunk_file(&data, &config).unwrap();
        let chunks2 = chunk_file(&data, &config).unwrap();

        assert_eq!(
            chunks1.len(),
            chunks2.len(),
            "Chunk count should be deterministic"
        );
        for (c1, c2) in chunks1.iter().zip(chunks2.iter()) {
            assert_eq!(c1.hash, c2.hash, "Chunk hashes should be deterministic");
            assert_eq!(c1.size, c2.size, "Chunk sizes should be deterministic");
        }
    }

    /// An input smaller than `min_size` becomes exactly one chunk.
    #[test]
    fn test_small_file() {
        let data = vec![0u8; 100 * 1024];
        let config = ChunkerConfig::default();
        let chunks = chunk_file(&data, &config).unwrap();

        assert_eq!(chunks.len(), 1, "Small file should create single chunk");
        assert_eq!(chunks[0].size, data.len() as u64);
    }

    /// Empty input produces no chunks.
    #[test]
    fn test_empty_file() {
        let data: Vec<u8> = Vec::new();
        let config = ChunkerConfig::default();
        let chunks = chunk_file(&data, &config).unwrap();

        assert_eq!(chunks.len(), 0, "Empty file should create no chunks");
    }

    proptest! {
        #![proptest_config(ProptestConfig {
            cases: 32, // Reduced case count - chunking is expensive
            max_shrink_iters: 5000,
            ..ProptestConfig::default()
        })]

        // Every chunk is at most `max_size`; every chunk except the final one
        // is at least `min_size`.
        #[test]
        fn prop_chunk_sizes_bounded(
            data in prop::collection::vec(any::<u8>(), 0..2_000_000)
        ) {
            let config = ChunkerConfig::default();
            let chunks = chunk_file(&data, &config)?;
            let last_index = chunks.len().saturating_sub(1);

            for (i, chunk) in chunks.iter().enumerate() {
                prop_assert!(
                    chunk.size <= config.max_size as u64,
                    "Chunk {} size {} exceeds max {}",
                    i, chunk.size, config.max_size
                );
                if i != last_index {
                    prop_assert!(
                        chunk.size >= config.min_size as u64,
                        "Chunk {} size {} below min {} (not last chunk)",
                        i, chunk.size, config.min_size
                    );
                }
            }
        }

        // Chunking arbitrary input twice gives the same hashes and sizes.
        #[test]
        fn prop_chunking_deterministic(
            data in prop::collection::vec(any::<u8>(), 0..2_000_000)
        ) {
            let config = ChunkerConfig::default();
            let chunks1 = chunk_file(&data, &config)?;
            let chunks2 = chunk_file(&data, &config)?;

            prop_assert_eq!(chunks1.len(), chunks2.len(), "Chunk count must be deterministic");
            for (i, (c1, c2)) in chunks1.iter().zip(chunks2.iter()).enumerate() {
                prop_assert_eq!(c1.hash, c2.hash, "Chunk {} hash must be deterministic", i);
                prop_assert_eq!(c1.size, c2.size, "Chunk {} size must be deterministic", i);
            }
        }

        // Chunk sizes add up to the input length.
        #[test]
        fn prop_total_size_preserved(
            data in prop::collection::vec(any::<u8>(), 0..2_000_000)
        ) {
            let config = ChunkerConfig::default();
            let chunks = chunk_file(&data, &config)?;

            let total_size: u64 = chunks.iter().map(|c| c.size).sum();
            prop_assert_eq!(
                total_size,
                data.len() as u64,
                "Total chunk size must equal input size"
            );
        }

        // Prepending a small amount of data should leave a meaningful share
        // of the original chunks (matched by hash) intact.
        #[test]
        fn prop_boundary_stability_after_insert(
            original_data in prop::collection::vec(any::<u8>(), 2_000_000..3_000_000),
            insert_data in prop::collection::vec(any::<u8>(), 100..5_000)
        ) {
            let config = ChunkerConfig::default();
            let original_chunks = chunk_file(&original_data, &config)?;

            let mut modified_data = insert_data.clone();
            modified_data.extend_from_slice(&original_data);
            let modified_chunks = chunk_file(&modified_data, &config)?;

            let matching_chunks = count_reused_chunks(&original_chunks, &modified_chunks);
            let reuse_ratio = matching_chunks as f64 / original_chunks.len() as f64;
            prop_assert!(
                reuse_ratio >= 0.30 || original_chunks.len() < 3,
                "Expected at least 30% chunk reuse after small insertion, got {:.1}% ({}/{} chunks)",
                reuse_ratio * 100.0,
                matching_chunks,
                original_chunks.len()
            );
        }

        // Appending data should keep the leading chunks unchanged. The last
        // original chunk is excluded because its end was the end of input,
        // not a content-defined boundary.
        #[test]
        fn prop_boundary_stability_after_append(
            original_data in prop::collection::vec(any::<u8>(), 2_000_000..3_000_000),
            append_data in prop::collection::vec(any::<u8>(), 500_000..1_000_000)
        ) {
            let config = ChunkerConfig::default();
            let original_chunks = chunk_file(&original_data, &config)?;

            let mut modified_data = original_data.clone();
            modified_data.extend_from_slice(&append_data);
            let modified_chunks = chunk_file(&modified_data, &config)?;

            let prefix_len = original_chunks.len().saturating_sub(1);
            let matching_prefix = original_chunks
                .iter()
                .zip(modified_chunks.iter())
                .take(prefix_len)
                .filter(|(orig, modified)| orig.hash == modified.hash)
                .count();

            let match_ratio = if prefix_len > 0 {
                matching_prefix as f64 / prefix_len as f64
            } else {
                1.0
            };
            prop_assert!(
                match_ratio >= 0.80 || prefix_len < 2,
                "Expected at least 80% prefix chunk preservation after append, got {:.1}% ({}/{} chunks)",
                match_ratio * 100.0,
                matching_prefix,
                prefix_len
            );
        }

        // Removing a small span from the middle should leave a meaningful
        // share of the original chunks (matched by hash) intact.
        #[test]
        fn prop_boundary_stability_after_delete(
            original_data in prop::collection::vec(any::<u8>(), 3_000_000..4_000_000),
            delete_offset in 1_000_000usize..2_000_000usize,
            delete_len in 1_000usize..10_000usize
        ) {
            let config = ChunkerConfig::default();
            let original_chunks = chunk_file(&original_data, &config)?;

            // Clamp defensively so the slices below can never go out of range.
            let delete_end = (delete_offset + delete_len).min(original_data.len());
            let mut modified_data = Vec::new();
            modified_data.extend_from_slice(&original_data[..delete_offset]);
            if delete_end < original_data.len() {
                modified_data.extend_from_slice(&original_data[delete_end..]);
            }
            let modified_chunks = chunk_file(&modified_data, &config)?;

            let matching_chunks = count_reused_chunks(&original_chunks, &modified_chunks);
            let reuse_ratio = matching_chunks as f64 / original_chunks.len() as f64;
            prop_assert!(
                reuse_ratio >= 0.40 || original_chunks.len() < 3,
                "Expected at least 40% chunk reuse after small deletion, got {:.1}% ({}/{} chunks)",
                reuse_ratio * 100.0,
                matching_chunks,
                original_chunks.len()
            );
        }
    }
}