use crate::error::Result;
use regex::Regex;

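/// A contiguous chunk of source content, carrying byte offsets back into the
/// original string and a sequential index.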
#[derive(Debug, Clone)]
pub struct ContentChunk {
    pub content: String,
    pub start_byte: usize,
    pub end_byte: usize,
    pub chunk_index: usize,
}

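/// Strategy used to split content into chunks. `Hybrid` (the default) aims
/// for a fixed chunk size but snaps each cut to a nearby semantic break.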
#[derive(Debug, Clone, Default)]
pub enum ChunkingStrategy {
    Sentence,
    Paragraph,
    Semantic,
    #[default]
    Hybrid,
}

impl std::str::FromStr for ChunkingStrategy {
    type Err = ();

    fn from_str(s: &str) -> std::result::Result<Self, Self::Err> {
        match s.to_lowercase().as_str() {
            "sentence" => Ok(ChunkingStrategy::Sentence),
            "paragraph" => Ok(ChunkingStrategy::Paragraph),
            "semantic" => Ok(ChunkingStrategy::Semantic),
            "hybrid" => Ok(ChunkingStrategy::Hybrid),
            // Unrecognized names fall back to the default strategy rather than erroring.
            _ => Ok(ChunkingStrategy::Hybrid),
        }
    }
}

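/// Splits text content into [`ContentChunk`]s of roughly `chunk_size` bytes,
/// with about `overlap_size` bytes of overlap between consecutive chunks.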
pub struct FileChunker {
    chunk_size: usize,
    overlap_size: usize,
    strategy: ChunkingStrategy,
}

impl FileChunker {
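    /// Creates a chunker with the given target chunk size and overlap (both
    /// in bytes), using the default strategy.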
    pub fn new(chunk_size: usize, overlap_size: usize) -> Self {
        Self {
            chunk_size,
            overlap_size,
            strategy: ChunkingStrategy::default(),
        }
    }

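    /// Creates a chunker with an explicit chunking strategy.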
    pub fn with_strategy(chunk_size: usize, overlap_size: usize, strategy: ChunkingStrategy) -> Self {
        Self {
            chunk_size,
            overlap_size,
            strategy,
        }
    }

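    /// Creates a chunker with the default sizes: 8192-byte chunks with a
    /// 200-byte overlap.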
    pub fn with_defaults() -> Self {
        Self::new(8192, 200)
    }

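    /// Splits `content` into chunks according to the configured strategy.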
    pub fn chunk_content(&self, content: &str) -> Result<Vec<ContentChunk>> {
        match self.strategy {
            ChunkingStrategy::Sentence => self.chunk_by_sentences(content),
            ChunkingStrategy::Paragraph => self.chunk_by_paragraphs(content),
            ChunkingStrategy::Semantic => self.chunk_semantic(content),
            ChunkingStrategy::Hybrid => self.chunk_hybrid(content),
        }
    }

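    /// Splits on sentence terminators (`.`, `!`, `?` followed by whitespace)
    /// and groups the resulting sentences into chunks.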
    fn chunk_by_sentences(&self, content: &str) -> Result<Vec<ContentChunk>> {
        let sentence_regex = Regex::new(r"[.!?]+\s+").map_err(|e| {
            crate::error::Error::InternalError(format!("Failed to create sentence regex: {}", e))
        })?;

        let sentences: Vec<&str> = sentence_regex
            .split(content)
            .filter(|s| !s.trim().is_empty())
            .collect();

        self.group_sentences_into_chunks(&sentences, content)
    }

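    /// Splits on blank lines and groups the resulting paragraphs into chunks.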
    fn chunk_by_paragraphs(&self, content: &str) -> Result<Vec<ContentChunk>> {
        let paragraphs: Vec<&str> = content
            .split("\n\n")
            .filter(|p| !p.trim().is_empty())
            .collect();

        self.group_paragraphs_into_chunks(&paragraphs, content)
    }

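    /// Cuts chunks at semantic boundaries (code fences, Markdown headers,
    /// paragraph breaks) regardless of size.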
    fn chunk_semantic(&self, content: &str) -> Result<Vec<ContentChunk>> {
        let semantic_boundaries = self.find_semantic_boundaries(content)?;
        self.create_chunks_from_boundaries(content, &semantic_boundaries)
    }

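    /// Walks the content in `chunk_size` steps, snapping each cut to the best
    /// nearby semantic boundary and backing up by roughly `overlap_size`
    /// bytes between chunks.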
    fn chunk_hybrid(&self, content: &str) -> Result<Vec<ContentChunk>> {
        let mut chunks = Vec::new();
        let content_bytes = content.as_bytes();
        let mut start = 0;
        let mut chunk_index = 0;

        while start < content_bytes.len() {
            // Aim for a fixed-size chunk, then snap the cut to a semantic boundary.
            let initial_end = (start + self.chunk_size).min(content_bytes.len());
            let semantic_end = self.find_best_semantic_boundary(
                content,
                start,
                initial_end,
                content_bytes.len(),
            );

            let chunk_content = content[start..semantic_end].to_string();
            chunks.push(ContentChunk {
                content: chunk_content,
                start_byte: start,
                end_byte: semantic_end,
                chunk_index,
            });

            if semantic_end >= content_bytes.len() {
                break;
            }

            // Back up so consecutive chunks overlap by roughly `overlap_size` bytes.
            start = self.calculate_semantic_overlap_start(content, semantic_end);
            chunk_index += 1;
        }

        Ok(chunks)
    }

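    /// Greedily packs sentences into chunks. Note that the byte offsets are
    /// approximate: the sentence split consumed the terminators, so positions
    /// drift slightly from the original string.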
    fn group_sentences_into_chunks(&self, sentences: &[&str], _original: &str) -> Result<Vec<ContentChunk>> {
        let mut chunks = Vec::new();
        let mut current_chunk = String::new();
        let mut chunk_start = 0;
        let mut chunk_index = 0;

        for sentence in sentences {
            let potential_chunk = if current_chunk.is_empty() {
                sentence.to_string()
            } else {
                format!("{} {}", current_chunk, sentence)
            };

            if potential_chunk.len() <= self.chunk_size || current_chunk.is_empty() {
                current_chunk = potential_chunk;
            } else {
                let chunk_end = chunk_start + current_chunk.len();
                chunks.push(ContentChunk {
                    content: current_chunk.trim().to_string(),
                    start_byte: chunk_start,
                    end_byte: chunk_end,
                    chunk_index,
                });

                let overlap_content = self.calculate_sentence_overlap(&current_chunk);
                current_chunk = if overlap_content.is_empty() {
                    sentence.to_string()
                } else {
                    format!("{} {}", overlap_content, sentence)
                };

                chunk_start = chunk_end - overlap_content.len();
                chunk_index += 1;
            }
        }

        if !current_chunk.trim().is_empty() {
            let chunk_end = chunk_start + current_chunk.len();
            chunks.push(ContentChunk {
                content: current_chunk.trim().to_string(),
                start_byte: chunk_start,
                end_byte: chunk_end,
                chunk_index,
            });
        }

        Ok(chunks)
    }

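    /// Greedily packs paragraphs into chunks. As with sentences, the byte
    /// offsets are approximate because the `\n\n` separators were consumed
    /// by the split.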
    fn group_paragraphs_into_chunks(&self, paragraphs: &[&str], _original: &str) -> Result<Vec<ContentChunk>> {
        let mut chunks = Vec::new();
        let mut current_chunk = String::new();
        let mut chunk_start = 0;
        let mut chunk_index = 0;

        for paragraph in paragraphs {
            let potential_chunk = if current_chunk.is_empty() {
                paragraph.to_string()
            } else {
                format!("{}\n\n{}", current_chunk, paragraph)
            };

            if potential_chunk.len() <= self.chunk_size || current_chunk.is_empty() {
                current_chunk = potential_chunk;
            } else {
                let chunk_end = chunk_start + current_chunk.len();
                chunks.push(ContentChunk {
                    content: current_chunk.trim().to_string(),
                    start_byte: chunk_start,
                    end_byte: chunk_end,
                    chunk_index,
                });

                current_chunk = paragraph.to_string();
                chunk_start = chunk_end;
                chunk_index += 1;
            }
        }

        if !current_chunk.trim().is_empty() {
            let chunk_end = chunk_start + current_chunk.len();
            chunks.push(ContentChunk {
                content: current_chunk.trim().to_string(),
                start_byte: chunk_start,
                end_byte: chunk_end,
                chunk_index,
            });
        }

        Ok(chunks)
    }

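    /// Collects candidate boundaries: the edges of fenced code blocks,
    /// Markdown headers, and paragraph breaks, plus the start and end of the
    /// content. Returned sorted and deduplicated.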
    fn find_semantic_boundaries(&self, content: &str) -> Result<Vec<usize>> {
        let mut boundaries = vec![0];

        let code_block_regex = Regex::new(r"```[\s\S]*?```").map_err(|e| {
            crate::error::Error::InternalError(format!("Failed to create code block regex: {}", e))
        })?;
        for mat in code_block_regex.find_iter(content) {
            boundaries.push(mat.start());
            boundaries.push(mat.end());
        }

        let header_regex = Regex::new(r"(?m)^#{1,6}\s").map_err(|e| {
            crate::error::Error::InternalError(format!("Failed to create header regex: {}", e))
        })?;
        for mat in header_regex.find_iter(content) {
            boundaries.push(mat.start());
        }

        let paragraph_regex = Regex::new(r"\n\s*\n").map_err(|e| {
            crate::error::Error::InternalError(format!("Failed to create paragraph regex: {}", e))
        })?;
        for mat in paragraph_regex.find_iter(content) {
            boundaries.push(mat.end());
        }

        boundaries.push(content.len());
        boundaries.sort_unstable();
        boundaries.dedup();

        Ok(boundaries)
    }

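    /// Turns consecutive boundary pairs into chunks, dropping fragments
    /// shorter than 10 bytes.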
    fn create_chunks_from_boundaries(&self, content: &str, boundaries: &[usize]) -> Result<Vec<ContentChunk>> {
        let mut chunks = Vec::new();
        let mut chunk_index = 0;

        for window in boundaries.windows(2) {
            let start = window[0];
            let end = window[1];
            let chunk_content = content[start..end].trim();

            if !chunk_content.is_empty() && chunk_content.len() >= 10 {
                chunks.push(ContentChunk {
                    content: chunk_content.to_string(),
                    start_byte: start,
                    end_byte: end,
                    chunk_index,
                });
                chunk_index += 1;
            }
        }

        Ok(chunks)
    }

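    /// Finds the best cut point near `target_end`, preferring (in order)
    /// paragraph breaks, newlines, sentence ends, and any whitespace within
    /// a search window of up to 200 bytes on either side.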
    fn find_best_semantic_boundary(&self, content: &str, start: usize, target_end: usize, content_len: usize) -> usize {
        if target_end >= content_len {
            return content_len;
        }

        // Snap the search window to char boundaries so slicing cannot panic
        // on multi-byte UTF-8 sequences.
        let mut search_start = (target_end.saturating_sub(200)).max(start);
        while !content.is_char_boundary(search_start) {
            search_start += 1;
        }
        let mut search_end = (target_end + 200).min(content_len);
        while !content.is_char_boundary(search_end) {
            search_end -= 1;
        }

        let search_text = &content[search_start..search_end];
        let relative_target = target_end.saturating_sub(search_start);

        if let Some(pos) = self.find_nearest_match(search_text, r"\n\s*\n", relative_target) {
            return search_start + pos;
        }

        if let Some(pos) = self.find_nearest_match(search_text, r"\n", relative_target) {
            return search_start + pos;
        }

        if let Some(pos) = self.find_nearest_match(search_text, r"[.!?]\s+", relative_target) {
            return search_start + pos;
        }

        if let Some(pos) = self.find_nearest_match(search_text, r"\s+", relative_target) {
            return search_start + pos;
        }

        // No break found: fall back to the raw target, snapped to a char boundary.
        let mut fallback = target_end;
        while !content.is_char_boundary(fallback) {
            fallback -= 1;
        }
        fallback
    }

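    /// Returns the end position of the regex match closest to `target`
    /// within `text`, if the pattern matches at all.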
    fn find_nearest_match(&self, text: &str, pattern: &str, target: usize) -> Option<usize> {
        let regex = Regex::new(pattern).ok()?;
        let mut closest_pos = None;
        let mut closest_distance = usize::MAX;

        for mat in regex.find_iter(text) {
            let distance = if mat.end() > target {
                mat.end() - target
            } else {
                target - mat.end()
            };

            if distance < closest_distance {
                closest_distance = distance;
                closest_pos = Some(mat.end());
            }
        }

        closest_pos
    }

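    /// Picks the start of the next chunk so that it overlaps the previous one
    /// by roughly `overlap_size` bytes, preferring a whitespace boundary.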
    fn calculate_semantic_overlap_start(&self, content: &str, end: usize) -> usize {
        // Snap the overlap target to a char boundary so callers can slice from it safely.
        let mut overlap_target = end.saturating_sub(self.overlap_size);
        while !content.is_char_boundary(overlap_target) {
            overlap_target -= 1;
        }

        let mut search_start = overlap_target.saturating_sub(50);
        while !content.is_char_boundary(search_start) {
            search_start += 1;
        }
        let mut search_end = end.min(search_start + 100);
        while !content.is_char_boundary(search_end) {
            search_end -= 1;
        }

        if search_start >= search_end {
            return overlap_target;
        }

        let search_text = &content[search_start..search_end];
        let relative_target = overlap_target.saturating_sub(search_start);

        if let Some(pos) = self.find_nearest_match(search_text, r"\s+", relative_target) {
            search_start + pos
        } else {
            overlap_target
        }
    }

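    /// Returns the trailing words of `chunk` to carry into the next chunk,
    /// sized proportionally to `overlap_size` and capped at a quarter of the
    /// words.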
    fn calculate_sentence_overlap(&self, chunk: &str) -> String {
        let words: Vec<&str> = chunk.split_whitespace().collect();
        let overlap_words = (words.len() * self.overlap_size / chunk.len()).min(words.len() / 4);

        if overlap_words > 0 {
            words[words.len().saturating_sub(overlap_words)..].join(" ")
        } else {
            String::new()
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    #[allow(clippy::needless_borrow)]
    fn test_basic_chunking() {
        let chunker = FileChunker::new(100, 20);
        let content = "a".repeat(250);
        let chunks = chunker.chunk_content(&content).unwrap();

        assert!(chunks.len() >= 3);
        assert_eq!(chunks[0].chunk_index, 0);
        assert_eq!(chunks[1].chunk_index, 1);
    }

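    // Exercises the `FromStr` fallback documented above: unrecognized
    // strategy names parse to the default `Hybrid` rather than erroring.
    #[test]
    fn test_strategy_parsing() {
        assert!(matches!(
            "sentence".parse::<ChunkingStrategy>(),
            Ok(ChunkingStrategy::Sentence)
        ));
        assert!(matches!(
            "PARAGRAPH".parse::<ChunkingStrategy>(),
            Ok(ChunkingStrategy::Paragraph)
        ));
        assert!(matches!(
            "unknown".parse::<ChunkingStrategy>(),
            Ok(ChunkingStrategy::Hybrid)
        ));
    }
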
    #[test]
    #[allow(clippy::needless_borrow)]
    fn test_utf8_boundary_safety() {
        let chunker = FileChunker::new(10, 2);
        let content = "Hello 世界 World";
        let chunks = chunker.chunk_content(&content).unwrap();

        // Getting here at all means no slice panicked on a multi-byte char
        // boundary; the assertion confirms the chunks contain intact text.
        for chunk in chunks {
            assert!(
                chunk.content.is_ascii()
                    || chunk
                        .content
                        .chars()
                        .all(|c| c.is_alphabetic() || c.is_whitespace())
            );
        }
    }

    #[test]
    #[allow(clippy::needless_borrow)]
    fn test_overlap() {
        let chunker = FileChunker::new(50, 10);
        let content = "a".repeat(100);
        let chunks = chunker.chunk_content(&content).unwrap();

        if chunks.len() > 1 {
            let overlap_start = chunks[1].start_byte;
            let first_end = chunks[0].end_byte;
            assert!(overlap_start < first_end);
        }
    }
}