use super::fingerprint::ChunkerConfig;
/// Splits text into overlapping chunks, preferring to cut at one of a list of separators.
#[derive(Debug, Clone)]
pub struct SemanticChunker {
/// Upper bound, in bytes, on the window of text examined for each chunk.
chunk_size: usize,
/// Number of bytes the next chunk's start is pulled back from the previous chunk's end.
chunk_overlap: usize,
/// Candidate split strings, tried in order; the first one found in the window is used.
separators: Vec<String>,
}
impl SemanticChunker {
pub fn new(chunk_size: usize, chunk_overlap: usize, separators: Vec<String>) -> Self {
Self { chunk_size, chunk_overlap, separators }
}
/// Creates a chunker whose size and overlap are copied from `config`.
///
/// NOTE(review): only `chunk_size` and `chunk_overlap` are read from `config`; the
/// separators always come from the built-in default list. Confirm that is intended.
pub fn from_config(config: &ChunkerConfig) -> Self {
    let separators = Self::default_separators();
    Self::new(config.chunk_size, config.chunk_overlap, separators)
}
/// Returns the built-in separator list, in priority order.
///
/// The list goes from markdown headings, through Rust item keywords and code fences,
/// down to blank lines, single newlines, and finally a single space.
fn default_separators() -> Vec<String> {
    const DEFAULTS: [&str; 13] = [
        "\n## ",
        "\n### ",
        "\n#### ",
        "\nfn ",
        "\npub fn ",
        "\nimpl ",
        "\nstruct ",
        "\nenum ",
        "\nmod ",
        "\n```",
        "\n\n",
        "\n",
        " ",
    ];
    DEFAULTS.iter().map(|sep| (*sep).to_owned()).collect()
}
/// Splits `text` into a sequence of overlapping [`Chunk`]s.
///
/// Starting at byte 0, each iteration asks `extract_chunk` for the next piece, records
/// it unless it is whitespace-only, and then advances the cursor by the piece's length
/// minus `chunk_overlap`. When a piece is no longer than the overlap the cursor advances
/// by the full piece instead, and it always advances by at least one byte, so the loop
/// terminates.
///
/// Offsets are byte offsets into `text`. Line numbers are 1-based and count `'\n'`;
/// `end_line` is `start_line` plus the number of `'\n'` inside the chunk, so a chunk that
/// ends with a newline reports the following line as its end.
///
/// Returns an empty vector for empty or whitespace-only input.
pub fn split(&self, text: &str) -> Vec<Chunk> {
    let mut chunks = Vec::new();
    let mut current_pos = 0;
    // Running count of '\n' in `text[..counted_to]`. The cursor only ever moves forward,
    // so extending this count incrementally replaces a rescan of the whole prefix for
    // every chunk (which made line tracking quadratic in the document size).
    let mut newlines_before: usize = 0;
    let mut counted_to: usize = 0;

    while current_pos < text.len() {
        let (chunk_text, end_pos) = self.extract_chunk(text, current_pos);

        if !chunk_text.trim().is_empty() {
            newlines_before += text[counted_to..current_pos].matches('\n').count();
            counted_to = current_pos;

            let start_line = newlines_before + 1;
            let end_line = start_line + chunk_text.matches('\n').count();
            chunks.push(Chunk {
                // `extract_chunk` already returns an owned String: move it, don't copy it.
                content: chunk_text,
                start_offset: current_pos,
                end_offset: end_pos,
                start_line,
                end_line,
            });
        }

        // Step back by the overlap only when the piece is longer than the overlap;
        // otherwise stepping back would stall or move the cursor backwards.
        let span = end_pos - current_pos;
        let advance = if span > self.chunk_overlap {
            span - self.chunk_overlap
        } else {
            span
        };

        // NOTE(review): the step-back also applies after a piece that already reaches the
        // end of `text`, so one extra chunk holding only the overlapping tail is emitted.
        // That looks unintended, but existing callers and tests may rely on the current
        // chunk count; confirm before changing it.
        current_pos = Self::find_next_char_boundary(text, current_pos + advance.max(1));
    }

    chunks
}
/// Returns the next chunk starting at byte offset `start`, together with its end offset.
///
/// The search window is `text[start..start + chunk_size]`, with the upper end clamped to
/// the text length and rounded down to a char boundary. If the window reaches the end of
/// `text`, the whole remainder is returned. Otherwise the separators are tried in order,
/// and the chunk is cut just after the last occurrence of the first separator that occurs
/// at a non-zero offset in the window (the separator text therefore ends the chunk). When
/// no separator qualifies, the chunk is the whole window.
///
/// If `chunk_size` is non-zero but smaller than the multi-byte character at `start`, the
/// window is widened to contain that one character, so the chunk may exceed `chunk_size`
/// by up to three bytes. A `chunk_size` of zero still yields an empty chunk.
///
/// `start` must lie on a char boundary of `text` and be at most `text.len()`; slicing
/// panics otherwise.
fn extract_chunk(&self, text: &str, start: usize) -> (String, usize) {
    let remaining = &text[start..];

    // saturating_add: a very large `chunk_size` (e.g. `usize::MAX` used as "no limit")
    // must not overflow the window computation.
    let window_limit = start.saturating_add(self.chunk_size).min(text.len());
    let mut target_end = Self::find_char_boundary(text, window_limit);

    // Rounding down can collapse the window to nothing when `chunk_size` is smaller than
    // the multi-byte char at `start`. An empty chunk makes the caller step past that
    // char, silently dropping it, so widen the window to hold the char whole instead.
    if target_end <= start && self.chunk_size > 0 {
        if let Some(first) = remaining.chars().next() {
            target_end = start + first.len_utf8();
        }
    }

    if target_end >= text.len() {
        return (remaining.to_string(), text.len());
    }

    let search_region = &text[start..target_end];
    // First separator (in priority order) whose last occurrence is not at the very start
    // of the window; a cut at offset 0 would produce a chunk holding only the separator.
    let cut = self.separators.iter().find_map(|separator| {
        search_region
            .rfind(separator.as_str())
            .filter(|&pos| pos > 0)
            .map(|pos| start + pos + separator.len())
    });

    let end = cut.unwrap_or(target_end);
    (text[start..end].to_string(), end)
}
/// Returns the largest char boundary of `text` that is `<= pos`.
///
/// Positions at or past the end of `text` map to `text.len()`.
fn find_char_boundary(text: &str, pos: usize) -> usize {
    let len = text.len();
    if pos < len {
        // Offset 0 is always a boundary, so the search cannot come up empty.
        (0..=pos)
            .rev()
            .find(|&idx| text.is_char_boundary(idx))
            .unwrap_or(0)
    } else {
        len
    }
}
/// Returns the smallest char boundary of `text` that is `>= pos`.
///
/// Positions at or past the end of `text` map to `text.len()`.
fn find_next_char_boundary(text: &str, pos: usize) -> usize {
    let len = text.len();
    if pos >= len {
        return len;
    }
    // If no boundary is found before the end, the end itself is the answer.
    (pos..len)
        .find(|&idx| text.is_char_boundary(idx))
        .unwrap_or(len)
}
/// Builds a `ChunkerConfig` from this chunker's size, overlap, and separator list.
pub fn config(&self) -> ChunkerConfig {
    // `ChunkerConfig::new` is handed borrowed separators, so collect `&str` views first.
    let mut sep_refs: Vec<&str> = Vec::with_capacity(self.separators.len());
    for separator in &self.separators {
        sep_refs.push(separator.as_str());
    }
    ChunkerConfig::new(self.chunk_size, self.chunk_overlap, &sep_refs)
}
}
/// The default chunker: 512-byte chunks, 64-byte overlap, built-in separator list.
impl Default for SemanticChunker {
    fn default() -> Self {
        Self::new(512, 64, Self::default_separators())
    }
}
/// A piece of text together with where it came from in the original document.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Chunk {
    /// The chunk's text.
    pub content: String,
    /// Byte offset in the source text at which the chunk starts.
    pub start_offset: usize,
    /// Byte offset in the source text at which the chunk ends (exclusive).
    pub end_offset: usize,
    /// 1-based line number on which the chunk starts.
    pub start_line: usize,
    /// Line number on which the chunk ends.
    pub end_line: usize,
}

impl Chunk {
    /// Returns a 32-byte digest that depends only on `content`.
    ///
    /// The content bytes are run through 64-bit FNV-1a. The four 8-byte little-endian
    /// lanes of the result are that state plus 0, 1, 2 and 3 (wrapping), so the digest
    /// carries 64 bits of information and is not a cryptographic hash.
    pub fn content_hash(&self) -> [u8; 32] {
        const FNV_OFFSET_BASIS: u64 = 0xcbf2_9ce4_8422_2325;
        const FNV_PRIME: u64 = 0x0100_0000_01b3;

        let digest = self
            .content
            .bytes()
            .fold(FNV_OFFSET_BASIS, |acc, byte| {
                (acc ^ u64::from(byte)).wrapping_mul(FNV_PRIME)
            });

        let lanes = [
            digest,
            digest.wrapping_add(1),
            digest.wrapping_add(2),
            digest.wrapping_add(3),
        ];
        let mut out = [0u8; 32];
        for (idx, lane) in lanes.iter().enumerate() {
            let begin = idx * 8;
            out[begin..begin + 8].copy_from_slice(&lane.to_le_bytes());
        }
        out
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// The default chunker uses 512-byte chunks, a 64-byte overlap, and some separators.
    #[test]
    fn test_chunker_default() {
        let chunker = SemanticChunker::default();
        assert_eq!(chunker.chunk_size, 512);
        assert_eq!(chunker.chunk_overlap, 64);
        assert!(!chunker.separators.is_empty());
    }

    /// Text shorter than the overlap comes back as exactly one chunk.
    #[test]
    fn test_split_short_text() {
        let chunker = SemanticChunker::default();
        let text = "Short text";
        let chunks = chunker.split(text);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, "Short text");
        assert_eq!(chunks[0].start_line, 1);
        assert_eq!(chunks[0].end_line, 1);
    }

    /// A level-2 markdown header is used as the cut point when the text must be split.
    #[test]
    fn test_split_markdown_headers() {
        // The 40-byte window is smaller than the 77-byte document, so the chunker is
        // actually forced to cut, and the only separator on offer is a level-2 header.
        let chunker = SemanticChunker::new(40, 10, vec!["\n## ".to_string()]);
        let text = "# Title\n\nIntro paragraph.\n\n## Section 1\n\nContent 1.\n\n## Section 2\n\nContent 2.";
        let chunks = chunker.split(text);
        assert!(chunks.len() >= 2);
        // The first cut lands at the first header, before any of the section's text.
        assert!(chunks[0].content.contains("Intro paragraph."));
        assert!(!chunks[0].content.contains("Section 1"));
        for chunk in &chunks {
            assert!(chunk.content.len() <= 40);
        }
    }

    /// Rust-like source splits into non-empty chunks.
    #[test]
    fn test_split_rust_code() {
        let chunker = SemanticChunker::new(200, 20, vec!["\nfn ".to_string(), "\n\n".to_string()]);
        let text = r#"
fn foo() {
println!("foo");
}
fn bar() {
println!("bar");
}
fn baz() {
println!("baz");
}
"#;
        let chunks = chunker.split(text);
        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert!(!chunk.content.is_empty());
        }
    }

    /// Consecutive chunks share text, and no chunk is much larger than the chunk size.
    #[test]
    fn test_chunk_overlap() {
        let chunker = SemanticChunker::new(50, 10, vec![" ".to_string()]);
        let text = "word1 word2 word3 word4 word5 word6 word7 word8 word9 word10 word11 word12";
        let chunks = chunker.split(text);
        // 74 bytes do not fit in one 50-byte window.
        assert!(chunks.len() > 1);
        for chunk in &chunks {
            assert!(chunk.content.len() <= chunker.chunk_size + 20);
        }
        for pair in chunks.windows(2) {
            assert!(
                pair[1].start_offset < pair[0].end_offset,
                "chunk at {} does not overlap the previous chunk ending at {}",
                pair[1].start_offset,
                pair[0].end_offset
            );
        }
    }

    /// Line numbers are 1-based and span every newline inside the chunk.
    #[test]
    fn test_chunk_line_tracking() {
        let chunker = SemanticChunker::new(50, 5, vec!["\n".to_string()]);
        let text = "Line 1\nLine 2\nLine 3\nLine 4\nLine 5";
        let chunks = chunker.split(text);
        assert_eq!(chunks[0].start_line, 1);
        // The whole 34-byte text fits in the first chunk, which holds four newlines.
        assert_eq!(chunks[0].end_line, 5);
    }

    /// The hash depends on content only, not on offsets or line numbers.
    #[test]
    fn test_chunk_content_hash_deterministic() {
        let chunk1 = Chunk {
            content: "test content".to_string(),
            start_offset: 0,
            end_offset: 12,
            start_line: 1,
            end_line: 1,
        };
        let chunk2 = Chunk {
            content: "test content".to_string(),
            start_offset: 100,
            end_offset: 112,
            start_line: 5,
            end_line: 5,
        };
        assert_eq!(chunk1.content_hash(), chunk2.content_hash());
    }

    /// Different content produces a different hash.
    #[test]
    fn test_chunk_content_hash_different() {
        let chunk1 = Chunk {
            content: "content 1".to_string(),
            start_offset: 0,
            end_offset: 9,
            start_line: 1,
            end_line: 1,
        };
        let chunk2 = Chunk {
            content: "content 2".to_string(),
            start_offset: 0,
            end_offset: 9,
            start_line: 1,
            end_line: 1,
        };
        assert_ne!(chunk1.content_hash(), chunk2.content_hash());
    }

    #[test]
    fn test_empty_text() {
        let chunker = SemanticChunker::default();
        let chunks = chunker.split("");
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_whitespace_only() {
        let chunker = SemanticChunker::default();
        let chunks = chunker.split(" \n\n \t ");
        assert!(chunks.is_empty());
    }

    /// `config()` reports the size and overlap the chunker was built with.
    #[test]
    fn test_config_round_trip() {
        let chunker = SemanticChunker::new(256, 32, vec!["\n".to_string()]);
        let config = chunker.config();
        assert_eq!(config.chunk_size, 256);
        assert_eq!(config.chunk_overlap, 32);
    }

    #[test]
    fn test_large_document_chunking() {
        let chunker = SemanticChunker::new(100, 20, SemanticChunker::default_separators());
        let mut text = String::new();
        for i in 0..50 {
            text.push_str(&format!(
                "\n## Section {}\n\nThis is paragraph {} with some content.\n",
                i, i
            ));
        }
        let chunks = chunker.split(&text);
        assert!(chunks.len() > 1);
        for chunk in &chunks {
            assert!(!chunk.content.trim().is_empty());
        }
    }

    mod proptests {
        use super::*;
        use proptest::prelude::*;

        /// Chunk sizes from 32 to 1024 bytes.
        fn chunk_size_strategy() -> impl Strategy<Value = usize> {
            32usize..=1024
        }

        /// Overlaps from zero up to half of `chunk_size`.
        fn overlap_strategy(chunk_size: usize) -> impl Strategy<Value = usize> {
            0..=(chunk_size / 2)
        }

        /// A chunk size paired with an overlap that is valid for that size.
        fn size_and_overlap_strategy() -> impl Strategy<Value = (usize, usize)> {
            chunk_size_strategy().prop_flat_map(|size| (Just(size), overlap_strategy(size)))
        }

        proptest! {
            #![proptest_config(ProptestConfig::with_cases(50))]

            #[test]
            fn prop_chunking_preserves_content(text in ".{0,500}") {
                let chunker = SemanticChunker::default();
                let chunks = chunker.split(&text);
                for chunk in &chunks {
                    prop_assert!(text.contains(chunk.content.trim()) || chunk.content.trim().is_empty());
                }
            }

            #[test]
            fn prop_chunk_lines_valid(text in ".{1,500}\n.{1,500}\n.{1,500}") {
                let chunker = SemanticChunker::default();
                let chunks = chunker.split(&text);
                for chunk in &chunks {
                    prop_assert!(chunk.start_line <= chunk.end_line,
                        "start_line {} > end_line {}", chunk.start_line, chunk.end_line);
                    prop_assert!(chunk.start_line >= 1,
                        "start_line {} should be >= 1", chunk.start_line);
                }
            }

            #[test]
            fn prop_chunk_offsets_valid(text in ".{10,500}") {
                let chunker = SemanticChunker::default();
                let chunks = chunker.split(&text);
                for chunk in &chunks {
                    prop_assert!(chunk.start_offset <= chunk.end_offset,
                        "start_offset {} > end_offset {}", chunk.start_offset, chunk.end_offset);
                    prop_assert!(chunk.end_offset <= text.len(),
                        "end_offset {} > text.len() {}", chunk.end_offset, text.len());
                }
            }

            #[test]
            fn prop_content_hash_deterministic(content in ".{1,100}") {
                let chunk = Chunk {
                    content: content.clone(),
                    start_offset: 0,
                    end_offset: content.len(),
                    start_line: 1,
                    end_line: 1,
                };
                let hash1 = chunk.content_hash();
                let hash2 = chunk.content_hash();
                prop_assert_eq!(hash1, hash2);
            }

            #[test]
            fn prop_content_hash_different(
                content1 in "[a-z]{5,20}",
                content2 in "[A-Z]{5,20}"
            ) {
                if content1 != content2 {
                    let chunk1 = Chunk {
                        content: content1.clone(),
                        start_offset: 0,
                        end_offset: content1.len(),
                        start_line: 1,
                        end_line: 1,
                    };
                    let chunk2 = Chunk {
                        content: content2.clone(),
                        start_offset: 0,
                        end_offset: content2.len(),
                        start_line: 1,
                        end_line: 1,
                    };
                    prop_assert_ne!(chunk1.content_hash(), chunk2.content_hash());
                }
            }

            #[test]
            fn prop_chunk_size_respected(
                (chunk_size, overlap) in size_and_overlap_strategy(),
                text_len in 100usize..2000
            ) {
                let chunker = SemanticChunker::new(chunk_size, overlap, vec![" ".to_string()]);
                let text: String = (0..text_len).map(|i| if i % 10 == 0 { ' ' } else { 'a' }).collect();
                let chunks = chunker.split(&text);
                for chunk in &chunks {
                    prop_assert!(chunk.content.len() <= chunk_size * 2,
                        "chunk len {} > 2 * chunk_size {}", chunk.content.len(), chunk_size * 2);
                }
            }
        }
    }
}