#[cfg(feature = "memory")]
use reasonkit_mem::{Chunk, EmbeddingIds};
use uuid::Uuid;
/// Strategy for splitting raw document text into chunks.
#[derive(Debug, Clone, Copy)]
pub enum ChunkingStrategy {
    /// Fixed-length windows of roughly `target_chars`, with consecutive
    /// windows sharing `overlap_chars` of trailing/leading text.
    // NOTE(review): the chunkers index with byte offsets, so these are
    // effectively byte counts, not character counts — confirm intent.
    FixedSize {
        target_chars: usize,
        overlap_chars: usize,
    },
    /// Paragraph-aware splitting (blank-line delimited); paragraphs longer
    /// than `max_chars` are further cut into overlapping windows.
    Semantic {
        max_chars: usize,
        overlap_chars: usize,
    },
}
impl Default for ChunkingStrategy {
fn default() -> Self {
ChunkingStrategy::FixedSize {
target_chars: 2000,
overlap_chars: 200,
}
}
}
/// Split `text` into chunks using the chunker selected by `strategy`.
///
/// This is a thin dispatcher: each variant's fields are forwarded verbatim
/// to the corresponding implementation.
#[cfg(feature = "memory")]
pub fn chunk_text(text: &str, strategy: ChunkingStrategy) -> Vec<Chunk> {
    use ChunkingStrategy::{FixedSize, Semantic};
    match strategy {
        FixedSize {
            target_chars,
            overlap_chars,
        } => chunk_fixed_size(text, target_chars, overlap_chars),
        Semantic {
            max_chars,
            overlap_chars,
        } => chunk_semantic(text, max_chars, overlap_chars),
    }
}
/// Split `text` into windows of up to `target_chars` bytes, with consecutive
/// windows overlapping by `overlap_chars` bytes.
///
/// Returns chunks in document order, each carrying its byte range into
/// `text` and a fresh random id.
///
// NOTE(review): `start_char`/`end_char` are populated with *byte* offsets,
// despite the names — confirm against `Chunk`'s contract.
#[cfg(feature = "memory")]
fn chunk_fixed_size(text: &str, target_chars: usize, overlap_chars: usize) -> Vec<Chunk> {
    let mut chunks = Vec::new();
    // BUG FIX: when overlap_chars >= target_chars the old step
    // (`target_chars.saturating_sub(overlap_chars)`) was 0 and the loop
    // never terminated. Always advance by at least one byte.
    let step = target_chars.saturating_sub(overlap_chars).max(1);
    let mut start = 0;
    let mut index = 0;
    while start < text.len() {
        let mut end = (start + target_chars).min(text.len());
        // BUG FIX: slicing at an arbitrary byte offset panics if it lands
        // inside a multi-byte UTF-8 character; snap the cut down to the
        // nearest char boundary.
        while !text.is_char_boundary(end) {
            end -= 1;
        }
        // Skip degenerate empty windows (e.g. target_chars smaller than the
        // next character) instead of emitting empty chunks.
        if end > start {
            chunks.push(Chunk {
                id: Uuid::new_v4(),
                text: text[start..end].to_string(),
                index,
                start_char: start,
                end_char: end,
                token_count: None,
                section: None,
                page: None,
                embedding_ids: EmbeddingIds::default(),
            });
            index += 1;
        }
        start += step;
        // Keep the next window start on a char boundary as well.
        while start < text.len() && !text.is_char_boundary(start) {
            start += 1;
        }
    }
    chunks
}
/// Paragraph-aware chunking: split `text` on blank lines (`"\n\n"`); any
/// paragraph longer than `max_chars` bytes is further cut into overlapping
/// windows of up to `max_chars` bytes.
///
/// Offsets in the returned chunks are relative to the start of `text`.
///
// NOTE(review): `start_char`/`end_char` are populated with *byte* offsets,
// despite the names — confirm against `Chunk`'s contract.
#[cfg(feature = "memory")]
fn chunk_semantic(text: &str, max_chars: usize, overlap_chars: usize) -> Vec<Chunk> {
    // BUG FIX: a zero step (overlap_chars >= max_chars) hung the inner
    // window loop; always advance by at least one byte.
    let step = max_chars.saturating_sub(overlap_chars).max(1);
    let mut chunks = Vec::new();
    let mut index = 0;
    let mut char_offset = 0;
    for para in text.split("\n\n") {
        if para.is_empty() {
            // BUG FIX: the old `continue` skipped the offset bump entirely,
            // so every chunk after an empty paragraph (e.g. from "\n\n\n\n")
            // reported shifted start/end offsets. Still account for the
            // two-byte "\n\n" separator.
            char_offset += 2;
            continue;
        }
        if para.len() <= max_chars {
            // Paragraph fits in one chunk: emit it whole.
            chunks.push(Chunk {
                id: Uuid::new_v4(),
                text: para.to_string(),
                index,
                start_char: char_offset,
                end_char: char_offset + para.len(),
                token_count: None,
                section: None,
                page: None,
                embedding_ids: EmbeddingIds::default(),
            });
            index += 1;
        } else {
            // Oversized paragraph: fall back to overlapping windows.
            let mut start = 0;
            while start < para.len() {
                let mut end = (start + max_chars).min(para.len());
                // BUG FIX: snap the cut to a char boundary — byte slicing
                // mid-character panics on non-ASCII text.
                while !para.is_char_boundary(end) {
                    end -= 1;
                }
                if end > start {
                    chunks.push(Chunk {
                        id: Uuid::new_v4(),
                        // BUG FIX: source contained the mojibake
                        // `¶[start..end]` (an `&para;` HTML entity) where
                        // `&para[start..end]` was intended; it did not
                        // compile as written.
                        text: para[start..end].to_string(),
                        index,
                        start_char: char_offset + start,
                        end_char: char_offset + end,
                        token_count: None,
                        section: None,
                        page: None,
                        embedding_ids: EmbeddingIds::default(),
                    });
                    index += 1;
                }
                start += step;
                while start < para.len() && !para.is_char_boundary(start) {
                    start += 1;
                }
            }
        }
        // Advance past this paragraph plus its "\n\n" separator. (Overshoots
        // by 2 after the final paragraph, but nothing reads the offset then.)
        char_offset += para.len() + 2;
    }
    chunks
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixed-size chunking of a long document yields at least one chunk,
    /// and the first chunk respects the target size.
    #[test]
    #[cfg(feature = "memory")]
    fn test_fixed_size_chunking() {
        let text = "This is a sample document. ".repeat(100);
        let chunks = chunk_fixed_size(&text, 100, 20);
        let first = chunks.first().expect("expected at least one chunk");
        assert!(first.text.len() <= 100);
    }

    /// Semantic chunking splits on blank lines: three short paragraphs
    /// become exactly three chunks, each carrying its paragraph text.
    #[test]
    #[cfg(feature = "memory")]
    fn test_semantic_chunking() {
        let text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph.";
        let chunks = chunk_semantic(text, 50, 10);
        assert_eq!(chunks.len(), 3);
        for chunk in &chunks {
            assert!(chunk.text.contains("paragraph"));
        }
    }
}