mod common;
use common::helpers::{assert_all_valid_utf8, create_temp_file};
use kiru::{BytesChunker, Chunker, ChunkingError, Source, StreamType};
use proptest::prelude::*;
/// Returns the length in bytes of the longest suffix of `prev` that is also a
/// prefix of `next`, considering at most `target_overlap` bytes.
///
/// Only candidates whose cut points fall on UTF-8 character boundaries in both
/// strings are considered. Returns `0` when no such shared run exists.
fn find_byte_overlap(prev: &str, next: &str, target_overlap: usize) -> usize {
    // A shared run can never be longer than either string or the requested cap.
    let longest_candidate = target_overlap.min(next.len()).min(prev.len());

    // Try the longest candidate first so the first hit is the maximal overlap.
    (1..=longest_candidate)
        .rev()
        .find(|&len| {
            // `len <= prev.len()` is guaranteed by `longest_candidate`.
            let tail_start = prev.len() - len;
            // `str::get` yields `None` when a cut point lands inside a
            // multi-byte character, which rules that candidate out.
            match (prev.get(tail_start..), next.get(..len)) {
                (Some(tail), Some(head)) => tail == head,
                _ => false,
            }
        })
        .unwrap_or(0)
}
/// Stitches overlapping `chunks` back into a single string.
///
/// The first chunk is taken verbatim. For every later chunk, the leading bytes
/// that repeat the end of the text rebuilt so far are skipped and only the
/// remainder is appended.
///
/// * `overlap` - nominal number of bytes each chunk shares with its predecessor.
/// * `original` - consulted only for its character boundaries, to work out
///   where an overlap region that would start mid-character actually begins.
fn reconstruct_from_byte_chunks(chunks: &[String], overlap: usize, original: &str) -> String {
    let mut rebuilt = String::new();
    let mut remaining = chunks.iter();

    // Number of bytes of `original` accounted for so far.
    let mut consumed = match remaining.next() {
        Some(first) => {
            rebuilt.push_str(first);
            first.len()
        }
        None => return rebuilt,
    };

    for chunk in remaining {
        // Walk the nominal overlap start back to the nearest character boundary
        // of `original`; offset 0 is always a boundary, so this terminates.
        let mut overlap_start = consumed.saturating_sub(overlap);
        while !original.is_char_boundary(overlap_start) {
            overlap_start -= 1;
        }
        let shared = consumed - overlap_start;

        // Skip the shared prefix of this chunk, moving forward to the next
        // character boundary if needed; `chunk.len()` is always a boundary.
        let mut fresh_from = shared.min(chunk.len());
        while !chunk.is_char_boundary(fresh_from) {
            fresh_from += 1;
        }

        let fresh = &chunk[fresh_from..];
        rebuilt.push_str(fresh);
        consumed += fresh.len();
    }

    rebuilt
}
/// Asserts that every chunk except the last has a byte length within
/// `chunk_size ± tolerance`.
///
/// The final chunk is not checked at all.
///
/// # Panics
///
/// Panics with a message naming the offending chunk when a checked chunk is
/// shorter than `chunk_size - tolerance` (saturating at zero) or longer than
/// `chunk_size + tolerance`.
fn assert_byte_chunk_sizes(chunks: &[String], chunk_size: usize, tolerance: usize) {
    let checked = chunks.len().saturating_sub(1);

    for (index, chunk) in chunks.iter().enumerate().take(checked) {
        let byte_len = chunk.len();

        let lower = chunk_size.saturating_sub(tolerance);
        assert!(
            byte_len >= lower,
            "Chunk {} too small: {} bytes < {} bytes (tolerance: {} bytes)",
            index,
            byte_len,
            lower,
            tolerance
        );

        let upper = chunk_size + tolerance;
        assert!(
            byte_len <= upper,
            "Chunk {} too large: {} bytes > {} bytes (tolerance: {} bytes)",
            index,
            byte_len,
            upper,
            tolerance
        );
    }
}
/// Asserts that each pair of adjacent chunks shares at least
/// `overlap - min_tolerance` bytes (saturating at zero).
///
/// The shared run is measured with `find_byte_overlap`, searching up to
/// `overlap + min_tolerance` bytes.
///
/// # Panics
///
/// Panics with a message naming the two chunks when their measured overlap is
/// below the required minimum.
fn assert_byte_overlaps(chunks: &[String], overlap: usize, min_tolerance: usize) {
    // `windows(2)` yields every adjacent pair and nothing for fewer than two chunks.
    for (index, pair) in chunks.windows(2).enumerate() {
        let (current, next) = (&pair[0], &pair[1]);

        let measured = find_byte_overlap(current, next, overlap + min_tolerance);
        let required = overlap.saturating_sub(min_tolerance);

        assert!(
            measured >= required,
            "Overlap between chunks {} and {} too small: {} bytes < {} bytes (target: {} bytes)",
            index,
            index + 1,
            measured,
            required,
            overlap
        );
    }
}
/// Runs the full set of chunk checks against `chunks`.
///
/// In order: the shared `assert_all_valid_utf8` helper, the per-chunk size
/// bounds, the overlap between neighbours (only when there are at least two
/// chunks), and finally that stitching the chunks back together reproduces
/// `original_text` exactly.
///
/// `tolerance` is used both as the size tolerance and as the overlap tolerance.
///
/// # Panics
///
/// Panics as soon as any of the checks fails.
fn assert_byte_chunks_valid(
    chunks: &[String],
    original_text: &str,
    chunk_size: usize,
    overlap: usize,
    tolerance: usize,
) {
    assert_all_valid_utf8(chunks);
    assert_byte_chunk_sizes(chunks, chunk_size, tolerance);

    // A lone chunk has no neighbour to overlap with.
    if chunks.len() >= 2 {
        assert_byte_overlaps(chunks, overlap, tolerance);
    }

    let rebuilt = reconstruct_from_byte_chunks(chunks, overlap, original_text);
    let expected_len = original_text.len();
    let actual_len = rebuilt.len();
    assert_eq!(
        rebuilt, original_text,
        "Reconstruction failed! Expected length: {}, got: {}",
        expected_len, actual_len
    );
}
proptest! {
    #![proptest_config(ProptestConfig::with_cases(1000))]

    // In-memory path: chunk a generated string (100 to 5000 characters matching
    // `\PC`) and run the full validation with a 7-byte tolerance.
    #[test]
    fn string_chunks_complete_validation(
        text in "\\PC{100,5000}",
        chunk_size in 50usize..500,
        overlap in 10usize..50,
    ) {
        prop_assume!(overlap < chunk_size - 10);
        prop_assume!(!text.is_empty());

        let byte_chunker = BytesChunker::new(chunk_size, overlap)?;
        let pieces: Vec<_> = byte_chunker.chunk_string(text.clone()).collect();

        if !pieces.is_empty() {
            assert_byte_chunks_valid(&pieces, &text, chunk_size, overlap, 7);
        }
    }

    // Same property as above, but the text is written to a temporary file and
    // read back through a stream.
    #[test]
    fn file_chunks_complete_validation(
        text in "\\PC{100,5000}",
        chunk_size in 50usize..500,
        overlap in 10usize..50,
    ) {
        prop_assume!(overlap < chunk_size - 10);
        prop_assume!(!text.is_empty());

        // `_dir` stays bound for the whole test body.
        // NOTE(review): presumably a temp-dir guard that deletes the file on drop - confirm.
        let (_dir, file_path) = create_temp_file(&text);
        let byte_chunker = BytesChunker::new(chunk_size, overlap)?;
        let input = StreamType::from_source(&Source::File(file_path))?;
        let pieces: Vec<_> = byte_chunker.chunk_stream(input).collect();

        if !pieces.is_empty() {
            assert_byte_chunks_valid(&pieces, &text, chunk_size, overlap, 7);
        }
    }

    // Larger file inputs built by repeating a generated pattern 10 to 99 times;
    // validated with a 10-byte tolerance.
    #[test]
    fn file_large_content(
        pattern in "\\PC{100,500}",
        repeats in 10usize..100,
        chunk_size in 100usize..1000,
        overlap in 10usize..100,
    ) {
        prop_assume!(overlap < chunk_size - 10);

        let text = pattern.repeat(repeats);
        let (_dir, file_path) = create_temp_file(&text);
        let byte_chunker = BytesChunker::new(chunk_size, overlap)?;
        let input = StreamType::from_source(&Source::File(file_path))?;
        let pieces: Vec<_> = byte_chunker.chunk_stream(input).collect();

        assert_byte_chunks_valid(&pieces, &text, chunk_size, overlap, 10);
    }

    // File input made only of multi-byte emoji, cycling through a fixed palette.
    #[test]
    fn file_with_multibyte_chars(
        emoji_count in 10usize..100,
        chunk_size in 50usize..200,
        overlap in 10usize..100,
    ) {
        prop_assume!(overlap < chunk_size - 10);

        let emojis = ["🎉", "🎊", "🎈", "🎁", "🎂", "❤️", "🌟", "✨"];
        let text: String = (0..emoji_count)
            .map(|index| emojis[index % emojis.len()])
            .collect();

        let (_dir, file_path) = create_temp_file(&text);
        let byte_chunker = BytesChunker::new(chunk_size, overlap)?;
        let input = StreamType::from_source(&Source::File(file_path))?;
        let pieces: Vec<_> = byte_chunker.chunk_stream(input).collect();

        assert_byte_chunks_valid(&pieces, &text, chunk_size, overlap, 10);
    }
}
/// Chunking an empty string must produce no chunks.
#[test]
fn edge_case_empty_string() {
    let byte_chunker = BytesChunker::new(100, 10).unwrap();
    let pieces: Vec<_> = byte_chunker.chunk_string(String::new()).collect();
    assert!(pieces.is_empty());
}
/// Streaming an empty file must produce no chunks.
#[test]
fn edge_case_empty_file() {
    // `_dir` stays bound until the end of the test.
    let (_dir, file_path) = create_temp_file("");
    let byte_chunker = BytesChunker::new(100, 10).unwrap();
    let input = StreamType::from_source(&Source::File(file_path)).unwrap();
    let pieces = byte_chunker.chunk_stream(input).collect::<Vec<_>>();
    assert!(pieces.is_empty());
}
/// A string shorter than the chunk size comes back as one unmodified chunk.
#[test]
fn edge_case_string_smaller_than_chunk() {
    let byte_chunker = BytesChunker::new(100, 10).unwrap();
    let pieces = byte_chunker
        .chunk_string(String::from("Hi"))
        .collect::<Vec<_>>();
    assert_eq!(pieces.len(), 1);
    assert_eq!(pieces[0], "Hi");
}
/// A file shorter than the chunk size comes back as one unmodified chunk.
#[test]
fn edge_case_file_smaller_than_chunk() {
    // `_dir` stays bound until the end of the test.
    let (_dir, file_path) = create_temp_file("Hi");
    let byte_chunker = BytesChunker::new(100, 10).unwrap();
    let input = StreamType::from_source(&Source::File(file_path)).unwrap();
    let pieces = byte_chunker.chunk_stream(input).collect::<Vec<_>>();
    assert_eq!(pieces.len(), 1);
    assert_eq!(pieces[0], "Hi");
}
/// A string whose byte length equals the chunk size (with zero overlap) yields
/// exactly one chunk equal to the input.
#[test]
fn edge_case_string_exactly_chunk_size() {
    let input_text = "12345";
    let byte_chunker = BytesChunker::new(5, 0).unwrap();
    let pieces = byte_chunker
        .chunk_string(input_text.to_string())
        .collect::<Vec<_>>();
    assert_eq!(pieces.len(), 1);
    assert_eq!(pieces[0], input_text);
}
/// A file whose byte length equals the chunk size (with zero overlap) yields
/// exactly one chunk equal to the file contents.
#[test]
fn edge_case_file_exactly_chunk_size() {
    let input_text = "12345";
    // `_dir` stays bound until the end of the test.
    let (_dir, file_path) = create_temp_file(input_text);
    let byte_chunker = BytesChunker::new(5, 0).unwrap();
    let input = StreamType::from_source(&Source::File(file_path)).unwrap();
    let pieces = byte_chunker.chunk_stream(input).collect::<Vec<_>>();
    assert_eq!(pieces.len(), 1);
    assert_eq!(pieces[0], input_text);
}
/// Constructing a chunker whose overlap equals its chunk size is rejected with
/// `InvalidArguments` carrying both values.
#[test]
fn error_overlap_equals_chunk_size() {
    assert!(matches!(
        BytesChunker::new(5, 5),
        Err(ChunkingError::InvalidArguments {
            chunk_size: 5,
            overlap: 5,
        })
    ));
}
/// Constructing a chunker whose overlap exceeds its chunk size is rejected with
/// `InvalidArguments` carrying both values.
#[test]
fn error_overlap_greater_than_chunk_size() {
    assert!(matches!(
        BytesChunker::new(5, 10),
        Err(ChunkingError::InvalidArguments {
            chunk_size: 5,
            overlap: 10,
        })
    ));
}
/// Opening a stream on a path that does not exist surfaces as `ChunkingError::Io`.
#[test]
fn error_file_not_found() {
    let missing = Source::File(String::from(
        "/path/that/definitely/does/not/exist/file.txt",
    ));
    assert!(matches!(
        StreamType::from_source(&missing),
        Err(ChunkingError::Io(_))
    ));
}