use crate::error::Result;
use regex::Regex;
/// One piece of text produced by [`FileChunker`], together with the byte
/// offsets and sequence position recorded for it.
///
/// `PartialEq`/`Eq` are derived so chunks can be compared directly (for
/// example with `assert_eq!`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ContentChunk {
    /// Text carried by this chunk.
    pub content: String,
    /// Byte offset recorded as the start of the chunk.
    pub start_byte: usize,
    /// Byte offset recorded as the end of the chunk (exclusive wherever the
    /// chunk is a direct slice of the input).
    pub end_byte: usize,
    /// Zero-based position of the chunk in the produced sequence.
    pub chunk_index: usize,
}
/// Selects how [`FileChunker::chunk_content`] splits its input.
///
/// The enum is fieldless, so it is `Copy` and comparable in addition to the
/// original `Debug`/`Clone`/`Default` derives.
#[derive(Debug, Clone, Copy, Default, PartialEq, Eq)]
pub enum ChunkingStrategy {
    /// Split on sentence terminators (`.`, `!`, `?`) followed by whitespace,
    /// then group the pieces up to the chunk size.
    Sentence,
    /// Split on blank lines (`"\n\n"`), then group the pieces up to the
    /// chunk size.
    Paragraph,
    /// Cut at fenced code blocks, Markdown-style `#` headers and blank lines.
    Semantic,
    /// Fixed-size windows whose ends are moved to a nearby textual boundary,
    /// with overlap between consecutive chunks. This is the default.
    #[default]
    Hybrid,
}
impl std::str::FromStr for ChunkingStrategy {
    type Err = ();

    /// Parses a strategy name, ignoring case.
    ///
    /// Recognised names are `"sentence"`, `"paragraph"`, `"semantic"` and
    /// `"hybrid"`. Parsing never fails: any other input also yields
    /// [`ChunkingStrategy::Hybrid`].
    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
        let normalized = s.to_lowercase();
        let strategy = match normalized.as_str() {
            "sentence" => ChunkingStrategy::Sentence,
            "paragraph" => ChunkingStrategy::Paragraph,
            "semantic" => ChunkingStrategy::Semantic,
            "hybrid" => ChunkingStrategy::Hybrid,
            // Unknown names deliberately fall back instead of erroring.
            _ => ChunkingStrategy::Hybrid,
        };
        Ok(strategy)
    }
}
/// Splits text into [`ContentChunk`]s according to a [`ChunkingStrategy`].
///
/// `Debug` and `Clone` are derived so a configured chunker can be logged and
/// duplicated; all fields are plain values.
#[derive(Debug, Clone)]
pub struct FileChunker {
    /// Target chunk length; compared against byte lengths and byte offsets.
    chunk_size: usize,
    /// Amount of trailing content (in bytes) to repeat at the start of the
    /// next chunk.
    overlap_size: usize,
    /// Splitting algorithm used by `chunk_content`.
    strategy: ChunkingStrategy,
}
impl FileChunker {
pub fn new(chunk_size: usize, overlap_size: usize) -> Self {
Self {
chunk_size,
overlap_size,
strategy: ChunkingStrategy::default(),
}
}
pub fn with_strategy(chunk_size: usize, overlap_size: usize, strategy: ChunkingStrategy) -> Self {
Self {
chunk_size,
overlap_size,
strategy,
}
}
/// Creates a chunker with a chunk size of 8192 and an overlap of 200,
/// using the default strategy.
pub fn with_defaults() -> Self {
    const DEFAULT_CHUNK_SIZE: usize = 8192;
    const DEFAULT_OVERLAP_SIZE: usize = 200;
    Self::new(DEFAULT_CHUNK_SIZE, DEFAULT_OVERLAP_SIZE)
}
pub fn chunk_content(&self, content: &str) -> Result<Vec<ContentChunk>> {
match self.strategy {
ChunkingStrategy::Sentence => self.chunk_by_sentences(content),
ChunkingStrategy::Paragraph => self.chunk_by_paragraphs(content),
ChunkingStrategy::Semantic => self.chunk_semantic(content),
ChunkingStrategy::Hybrid => self.chunk_hybrid(content),
}
}
/// Splits `content` wherever a run of `.`, `!` or `?` is followed by
/// whitespace, discards blank pieces, and groups the rest into chunks.
///
/// The separator text (terminators plus whitespace) is consumed by the
/// split, so it is not part of the pieces handed to the grouping step.
///
/// # Errors
///
/// Returns an internal error if the sentence regex cannot be compiled.
fn chunk_by_sentences(&self, content: &str) -> Result<Vec<ContentChunk>> {
    let splitter = Regex::new(r"[.!?]+\s+").map_err(|e| {
        crate::error::Error::InternalError(format!("Failed to create sentence regex: {}", e))
    })?;

    let mut pieces: Vec<&str> = Vec::new();
    for piece in splitter.split(content) {
        if piece.trim().is_empty() {
            continue;
        }
        pieces.push(piece);
    }

    self.group_sentences_into_chunks(&pieces, content)
}
/// Splits `content` on blank lines (`"\n\n"`), drops pieces that are empty
/// or whitespace-only, and groups the remaining paragraphs into chunks.
///
/// # Errors
///
/// Propagates any error from the grouping step.
fn chunk_by_paragraphs(&self, content: &str) -> Result<Vec<ContentChunk>> {
    let paragraphs: Vec<&str> = content
        .split("\n\n")
        .filter(|p| !p.trim().is_empty())
        .collect();
    // The argument was previously a mis-encoded token and did not compile;
    // it must be a borrow of the `paragraphs` vector.
    self.group_paragraphs_into_chunks(&paragraphs, content)
}
/// Chunks `content` by first locating boundary offsets and then cutting the
/// text between consecutive boundaries.
///
/// # Errors
///
/// Propagates errors from either step.
fn chunk_semantic(&self, content: &str) -> Result<Vec<ContentChunk>> {
    self.find_semantic_boundaries(content)
        .and_then(|offsets| self.create_chunks_from_boundaries(content, &offsets))
}
/// Cuts `content` into windows of roughly `chunk_size` bytes whose ends are
/// moved to a nearby textual boundary, with consecutive chunks overlapping.
///
/// Each chunk's `content` is exactly `content[start_byte..end_byte]`.
///
/// Two safeguards are applied to the offsets returned by the helper methods:
///
/// * Offsets are moved onto UTF-8 character boundaries before slicing, so
///   multi-byte text cannot cause a slicing panic here.
/// * Every iteration is forced to make progress: a chunk is never empty and
///   the next start is always strictly greater than the current one. Without
///   this, an `overlap_size` at least as large as the produced chunk would
///   make the loop repeat the same chunk forever.
fn chunk_hybrid(&self, content: &str) -> Result<Vec<ContentChunk>> {
    let total_len = content.len();
    let mut chunks = Vec::new();
    // Invariant: `start` is always on a character boundary.
    let mut start = 0;

    while start < total_len {
        let initial_end = start.saturating_add(self.chunk_size).min(total_len);
        let mut end = self
            .find_best_semantic_boundary(content, start, initial_end, total_len)
            .min(total_len);

        // A chunk must contain at least one byte, otherwise we cannot advance.
        if end <= start {
            end = start + 1;
        }
        // Move forward to the next character boundary (`total_len` is one).
        while !content.is_char_boundary(end) {
            end += 1;
        }

        let chunk_index = chunks.len();
        chunks.push(ContentChunk {
            content: content[start..end].to_string(),
            start_byte: start,
            end_byte: end,
            chunk_index,
        });

        if end >= total_len {
            break;
        }

        // The overlap start may not lie past the end of the chunk just emitted.
        let mut next_start = self.calculate_semantic_overlap_start(content, end).min(end);
        // `end` is a boundary and `next_start <= end`, so this stops at `end` at the latest.
        while !content.is_char_boundary(next_start) {
            next_start += 1;
        }

        // Drop the overlap entirely when it would not move the window forward.
        start = if next_start > start { next_start } else { end };
    }

    Ok(chunks)
}
/// Greedily packs `sentences` into chunks of at most `chunk_size` bytes,
/// joining sentences with a single space.
///
/// A sentence that is longer than `chunk_size` on its own still becomes a
/// chunk. When a chunk is closed, a tail of its words (see
/// `calculate_sentence_overlap`) is prepended to the next chunk as overlap.
///
/// `start_byte`/`end_byte` are derived from the lengths of the re-joined
/// text, not looked up in `_original`, which is currently unused.
fn group_sentences_into_chunks(&self, sentences: &[&str], _original: &str) -> Result<Vec<ContentChunk>> {
    let mut chunks = Vec::new();
    let mut current_chunk = String::new();
    let mut chunk_start = 0;
    let mut chunk_index = 0;

    for sentence in sentences {
        let potential_chunk = if current_chunk.is_empty() {
            sentence.to_string()
        } else {
            format!("{} {}", current_chunk, sentence)
        };

        // An empty accumulator always accepts the sentence, even if oversized.
        if potential_chunk.len() <= self.chunk_size || current_chunk.is_empty() {
            current_chunk = potential_chunk;
        } else {
            let chunk_end = chunk_start + current_chunk.len();
            chunks.push(ContentChunk {
                content: current_chunk.trim().to_string(),
                start_byte: chunk_start,
                end_byte: chunk_end,
                chunk_index,
            });

            // This argument was previously a mis-encoded token and did not
            // compile; it must be a borrow of the chunk that was just closed.
            let overlap_content = self.calculate_sentence_overlap(&current_chunk);
            current_chunk = if overlap_content.is_empty() {
                sentence.to_string()
            } else {
                format!("{} {}", overlap_content, sentence)
            };
            // The next chunk starts where the overlap tail begins.
            chunk_start = chunk_end - overlap_content.len();
            chunk_index += 1;
        }
    }

    // Flush whatever is left in the accumulator.
    if !current_chunk.trim().is_empty() {
        let chunk_end = chunk_start + current_chunk.len();
        chunks.push(ContentChunk {
            content: current_chunk.trim().to_string(),
            start_byte: chunk_start,
            end_byte: chunk_end,
            chunk_index,
        });
    }

    Ok(chunks)
}
/// Greedily packs `paragraphs` into chunks of at most `chunk_size` bytes,
/// joining paragraphs with a blank line (`"\n\n"`). No overlap is added.
///
/// A paragraph longer than `chunk_size` still becomes a chunk on its own.
/// Offsets are accumulated from the lengths of the joined text; `_original`
/// is not consulted.
fn group_paragraphs_into_chunks(&self, paragraphs: &[&str], _original: &str) -> Result<Vec<ContentChunk>> {
    let mut out: Vec<ContentChunk> = Vec::new();
    let mut buffer = String::new();
    let mut offset: usize = 0;

    for &para in paragraphs {
        // Nothing accumulated yet: take the paragraph regardless of size.
        if buffer.is_empty() {
            buffer = para.to_string();
            continue;
        }

        let joined = format!("{}\n\n{}", buffer, para);
        if joined.len() <= self.chunk_size {
            buffer = joined;
            continue;
        }

        // The paragraph does not fit: close the current chunk and start anew.
        let end = offset + buffer.len();
        let index = out.len();
        out.push(ContentChunk {
            content: buffer.trim().to_string(),
            start_byte: offset,
            end_byte: end,
            chunk_index: index,
        });
        buffer = para.to_string();
        offset = end;
    }

    let remainder = buffer.trim();
    if !remainder.is_empty() {
        let index = out.len();
        out.push(ContentChunk {
            content: remainder.to_string(),
            start_byte: offset,
            end_byte: offset + buffer.len(),
            chunk_index: index,
        });
    }

    Ok(out)
}
/// Collects the byte offsets at which `content` should be cut.
///
/// The result always contains `0` and `content.len()`, plus:
/// the start and end of every triple-backtick fenced block, the start of
/// every line beginning with one to six `#` followed by whitespace, and the
/// end of every blank-line separator. Offsets are sorted and de-duplicated.
///
/// # Errors
///
/// Returns an internal error if any of the patterns fails to compile.
fn find_semantic_boundaries(&self, content: &str) -> Result<Vec<usize>> {
    let fenced_code = Regex::new(r"```[\s\S]*?```").map_err(|e| {
        crate::error::Error::InternalError(format!("Failed to create code block regex: {}", e))
    })?;
    let heading = Regex::new(r"(?m)^#{1,6}\s").map_err(|e| {
        crate::error::Error::InternalError(format!("Failed to create header regex: {}", e))
    })?;
    let blank_line = Regex::new(r"\n\s*\n").map_err(|e| {
        crate::error::Error::InternalError(format!("Failed to create paragraph regex: {}", e))
    })?;

    // Insertion order is irrelevant because the list is sorted afterwards.
    let mut offsets: Vec<usize> = std::iter::once(0)
        .chain(
            fenced_code
                .find_iter(content)
                .flat_map(|block| [block.start(), block.end()]),
        )
        .chain(heading.find_iter(content).map(|line| line.start()))
        .chain(blank_line.find_iter(content).map(|gap| gap.end()))
        .chain(std::iter::once(content.len()))
        .collect();

    offsets.sort_unstable();
    offsets.dedup();
    Ok(offsets)
}
/// Turns each pair of consecutive `boundaries` into a chunk.
///
/// The text between two boundaries is trimmed; segments whose trimmed text
/// is shorter than 10 bytes are skipped and do not consume a chunk index.
/// `start_byte`/`end_byte` are the untrimmed boundary offsets.
fn create_chunks_from_boundaries(&self, content: &str, boundaries: &[usize]) -> Result<Vec<ContentChunk>> {
    let chunks: Vec<ContentChunk> = boundaries
        .windows(2)
        .filter_map(|pair| {
            let (from, to) = (pair[0], pair[1]);
            let text = content[from..to].trim();
            // A length of at least 10 also implies the text is non-empty.
            (text.len() >= 10).then(|| (from, to, text))
        })
        .enumerate()
        .map(|(chunk_index, (from, to, text))| ContentChunk {
            content: text.to_string(),
            start_byte: from,
            end_byte: to,
            chunk_index,
        })
        .collect();

    Ok(chunks)
}
/// Chooses where a chunk starting at `start` should end, given that it would
/// ideally end at `target_end`.
///
/// Looks up to 200 bytes on either side of `target_end` (never before
/// `start`, never past the end of the text) and returns the end of the match
/// closest to `target_end` for the first of these separators that occurs at
/// all: a blank line, a newline, a sentence terminator followed by
/// whitespace, any whitespace. If none occurs, `target_end` itself is used.
///
/// The search window and the fallback offset are moved onto UTF-8 character
/// boundaries, so the returned offset is always safe to slice at. Previously
/// a window edge or fallback inside a multi-byte character caused a panic.
///
/// `content_len` is clamped to `content.len()`; when `target_end` is at or
/// beyond it, that length is returned directly.
fn find_best_semantic_boundary(&self, content: &str, start: usize, target_end: usize, content_len: usize) -> usize {
    // Separators in order of preference, strongest first.
    const BOUNDARY_PATTERNS: [&str; 4] = [r"\n\s*\n", r"\n", r"[.!?]\s+", r"\s+"];
    // Distance searched on each side of the target, in bytes.
    const SEARCH_RADIUS: usize = 200;

    let content_len = content_len.min(content.len());
    if target_end >= content_len {
        return content_len;
    }

    // Lower edge: move backwards to a boundary (offset 0 always is one).
    let mut search_start = target_end.saturating_sub(SEARCH_RADIUS).max(start);
    while !content.is_char_boundary(search_start) {
        search_start -= 1;
    }

    // Upper edge: move forwards to a boundary (`content.len()` always is one).
    let mut search_end = target_end.saturating_add(SEARCH_RADIUS).min(content_len);
    while !content.is_char_boundary(search_end) {
        search_end += 1;
    }

    let search_text = &content[search_start..search_end];
    // `search_start <= target_end` holds by construction, so this cannot underflow.
    let relative_target = target_end - search_start;

    for &pattern in BOUNDARY_PATTERNS.iter() {
        if let Some(pos) = self.find_nearest_match(search_text, pattern, relative_target) {
            return search_start + pos;
        }
    }

    // No separator nearby: cut at the target, nudged forward off any
    // multi-byte character it may fall inside.
    let mut fallback = target_end;
    while !content.is_char_boundary(fallback) {
        fallback += 1;
    }
    fallback
}
/// Returns the end offset (relative to `text`) of the match of `pattern`
/// whose end lies closest to `target`.
///
/// When two matches are equally close, the earlier one wins. Returns `None`
/// if `pattern` does not compile or does not match anywhere in `text`.
fn find_nearest_match(&self, text: &str, pattern: &str, target: usize) -> Option<usize> {
    let matcher = Regex::new(pattern).ok()?;
    matcher
        .find_iter(text)
        .map(|found| found.end())
        // `min_by_key` keeps the first of several equally small keys.
        .min_by_key(|&end| end.abs_diff(target))
}
/// Computes where the chunk following one that ended at `end` should start
/// so that roughly `overlap_size` bytes are repeated.
///
/// The ideal start is `end - overlap_size`. A window beginning 50 bytes
/// before that point, at most 100 bytes long and not extending past `end`,
/// is searched for whitespace; the end of the whitespace run closest to the
/// ideal start is returned. If there is none, the ideal start is returned.
///
/// All offsets are moved onto UTF-8 character boundaries before slicing and
/// before being returned; previously multi-byte text could cause a slicing
/// panic here or hand the caller an offset inside a character. `end` is
/// clamped to `content.len()`.
fn calculate_semantic_overlap_start(&self, content: &str, end: usize) -> usize {
    // Bytes searched before the ideal overlap start.
    const LOOKBEHIND: usize = 50;
    // Maximum length of the search window, in bytes.
    const WINDOW: usize = 100;

    let end = end.min(content.len());

    // Ideal start, moved backwards to a boundary (offset 0 always is one).
    let mut overlap_target = end.saturating_sub(self.overlap_size);
    while !content.is_char_boundary(overlap_target) {
        overlap_target -= 1;
    }

    let mut search_start = overlap_target.saturating_sub(LOOKBEHIND);
    while !content.is_char_boundary(search_start) {
        search_start -= 1;
    }

    // Moved forwards; this stops at `content.len()` at the latest.
    let mut search_end = end.min(search_start.saturating_add(WINDOW));
    while !content.is_char_boundary(search_end) {
        search_end += 1;
    }

    if search_start >= search_end {
        return overlap_target;
    }

    let search_text = &content[search_start..search_end];
    // `search_start <= overlap_target` by construction, so no underflow.
    let relative_target = overlap_target - search_start;

    match self.find_nearest_match(search_text, r"\s+", relative_target) {
        Some(pos) => search_start + pos,
        None => overlap_target,
    }
}
/// Returns the trailing words of `chunk` to repeat at the start of the next
/// chunk, joined by single spaces.
///
/// The number of words is `overlap_size` scaled by the chunk's
/// words-per-byte ratio, capped at a quarter of the chunk's words. An empty
/// string is returned when that number is zero.
///
/// An empty `chunk` yields an empty string instead of a division-by-zero
/// panic, and the scaling multiplication saturates instead of overflowing.
fn calculate_sentence_overlap(&self, chunk: &str) -> String {
    if chunk.is_empty() {
        return String::new();
    }

    let words: Vec<&str> = chunk.split_whitespace().collect();
    let scaled = words.len().saturating_mul(self.overlap_size) / chunk.len();
    let overlap_words = scaled.min(words.len() / 4);

    if overlap_words == 0 {
        return String::new();
    }

    // `overlap_words <= words.len() / 4`, so the subtraction cannot underflow.
    words[words.len() - overlap_words..].join(" ")
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Content longer than the chunk size is split into several chunks that
    /// are numbered consecutively and together span the whole input.
    #[test]
    fn test_basic_chunking() {
        let chunker = FileChunker::new(100, 20);
        let content = "a".repeat(250);
        let chunks = chunker.chunk_content(&content).unwrap();

        assert!(chunks.len() >= 3);
        for (expected_index, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.chunk_index, expected_index);
        }
        assert_eq!(chunks.first().map(|c| c.start_byte), Some(0));
        assert_eq!(chunks.last().map(|c| c.end_byte), Some(content.len()));
    }

    /// Chunk offsets must fall on character boundaries and the chunk text
    /// must be exactly the slice those offsets describe.
    #[test]
    fn test_utf8_boundary_safety() {
        let chunker = FileChunker::new(10, 2);
        let content = "Hello 世界 World";
        let chunks = chunker.chunk_content(content).unwrap();

        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert!(content.is_char_boundary(chunk.start_byte));
            assert!(content.is_char_boundary(chunk.end_byte));
            assert_eq!(chunk.content, &content[chunk.start_byte..chunk.end_byte]);
        }
    }

    /// Consecutive chunks overlap, and each one starts later than the
    /// previous one.
    #[test]
    fn test_overlap() {
        let chunker = FileChunker::new(50, 10);
        let content = "a".repeat(100);
        let chunks = chunker.chunk_content(&content).unwrap();

        // Fail loudly instead of silently skipping the overlap check.
        assert!(chunks.len() > 1);
        for pair in chunks.windows(2) {
            assert!(pair[1].start_byte < pair[0].end_byte);
            assert!(pair[1].start_byte > pair[0].start_byte);
        }
    }
}