1use std::str::FromStr;
10
11use serde::{Deserialize, Serialize};
12use text_splitter::TextSplitter;
13
14use crate::types::{AppError, Result};
15
/// Strategy used to split input text into chunks.
///
/// Serialized in kebab-case (e.g. `"word"`); parsing via `FromStr`
/// additionally accepts a few aliases (see the impl below).
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize, Default)]
#[serde(rename_all = "kebab-case")]
pub enum ChunkingStrategy {
    /// Sliding windows of whitespace-separated words (the default).
    #[default]
    Word,
    /// Splitting delegated to `text_splitter::TextSplitter`.
    Semantic,
    /// Sliding windows of characters.
    Character,
}
32
33impl FromStr for ChunkingStrategy {
34 type Err = AppError;
35
36 fn from_str(s: &str) -> Result<Self> {
37 match s.to_lowercase().as_str() {
38 "word" | "words" => Ok(Self::Word),
39 "semantic" | "sentence" | "paragraph" => Ok(Self::Semantic),
40 "character" | "char" | "chars" => Ok(Self::Character),
41 _ => Err(AppError::Internal(format!(
42 "Unknown chunking strategy: {}. Use: word, semantic, character",
43 s
44 ))),
45 }
46 }
47}
48
49impl std::fmt::Display for ChunkingStrategy {
50 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
51 let name = match self {
52 Self::Word => "word",
53 Self::Semantic => "semantic",
54 Self::Character => "character",
55 };
56 write!(f, "{}", name)
57 }
58}
59
/// Configuration for a `TextChunker`.
///
/// Every field has a serde default, so a partially specified (or empty)
/// config deserializes to the same values as `ChunkerConfig::default()`.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct ChunkerConfig {
    // Which chunking algorithm to use; defaults to word-based.
    #[serde(default)]
    pub strategy: ChunkingStrategy,
    // Target chunk size. Unit depends on strategy: words for `Word`,
    // chars for `Character`, the splitter's size for `Semantic`.
    // Default: 512.
    #[serde(default = "default_chunk_size")]
    pub chunk_size: usize,
    // Units shared between consecutive chunks (ignored by `Semantic`).
    // Default: 50.
    #[serde(default = "default_chunk_overlap")]
    pub chunk_overlap: usize,
    // Chunks whose content byte-length is below this are dropped.
    // Default: 20.
    #[serde(default = "default_min_chunk_size")]
    pub min_chunk_size: usize,
}
80
/// Serde/`Default` fallback for `ChunkerConfig::chunk_size`.
fn default_chunk_size() -> usize { 512 }
84
/// Serde/`Default` fallback for `ChunkerConfig::chunk_overlap`.
fn default_chunk_overlap() -> usize { 50 }
88
/// Serde/`Default` fallback for `ChunkerConfig::min_chunk_size`.
fn default_min_chunk_size() -> usize { 20 }
92
93impl Default for ChunkerConfig {
94 fn default() -> Self {
95 Self {
96 strategy: ChunkingStrategy::default(),
97 chunk_size: default_chunk_size(),
98 chunk_overlap: default_chunk_overlap(),
99 min_chunk_size: default_min_chunk_size(),
100 }
101 }
102}
103
/// A piece of chunked text together with its position in the source.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Chunk {
    // Zero-based position of this chunk in the returned sequence.
    pub index: usize,
    // The chunk text itself.
    pub content: String,
    // Offset of the chunk start in the original text.
    // NOTE(review): the character strategy stores a *char* index here,
    // while the word/semantic strategies store byte-derived offsets —
    // confirm which unit callers expect.
    pub start_offset: usize,
    // Offset just past the chunk end (same unit as `start_offset`).
    pub end_offset: usize,
}
120
/// Splits text into chunks according to a `ChunkerConfig`.
#[derive(Debug, Clone)]
pub struct TextChunker {
    // Active configuration; exposed read-only via `config()`.
    config: ChunkerConfig,
}
130
131impl TextChunker {
132 pub fn new(config: ChunkerConfig) -> Self {
134 Self { config }
135 }
136
137 pub fn with_word_chunking(chunk_size: usize, chunk_overlap: usize) -> Self {
139 Self::new(ChunkerConfig {
140 strategy: ChunkingStrategy::Word,
141 chunk_size,
142 chunk_overlap,
143 min_chunk_size: default_min_chunk_size(),
144 })
145 }
146
147 pub fn with_semantic_chunking(max_chunk_size: usize) -> Self {
149 Self::new(ChunkerConfig {
150 strategy: ChunkingStrategy::Semantic,
151 chunk_size: max_chunk_size,
152 chunk_overlap: 0, min_chunk_size: default_min_chunk_size(),
154 })
155 }
156
157 pub fn with_character_chunking(chunk_size: usize, chunk_overlap: usize) -> Self {
159 Self::new(ChunkerConfig {
160 strategy: ChunkingStrategy::Character,
161 chunk_size,
162 chunk_overlap,
163 min_chunk_size: default_min_chunk_size(),
164 })
165 }
166
167 pub fn chunk(&self, text: &str) -> Vec<String> {
169 self.chunk_with_metadata(text)
170 .into_iter()
171 .map(|c| c.content)
172 .collect()
173 }
174
175 pub fn chunk_with_metadata(&self, text: &str) -> Vec<Chunk> {
177 match self.config.strategy {
178 ChunkingStrategy::Word => self.chunk_by_words(text),
179 ChunkingStrategy::Semantic => self.chunk_semantically(text),
180 ChunkingStrategy::Character => self.chunk_by_characters(text),
181 }
182 }
183
184 fn chunk_by_words(&self, text: &str) -> Vec<Chunk> {
186 let words: Vec<&str> = text.split_whitespace().collect();
187 let mut chunks = Vec::new();
188 let step = self
189 .config
190 .chunk_size
191 .saturating_sub(self.config.chunk_overlap)
192 .max(1);
193
194 let mut chunk_index = 0;
195 let mut word_index = 0;
196
197 while word_index < words.len() {
198 let end = (word_index + self.config.chunk_size).min(words.len());
199 let chunk_words = &words[word_index..end];
200 let content = chunk_words.join(" ");
201
202 if content.len() >= self.config.min_chunk_size {
203 let start_offset = if word_index == 0 {
205 0
206 } else {
207 words[..word_index]
208 .iter()
209 .map(|w| w.len() + 1)
210 .sum::<usize>()
211 };
212 let end_offset = start_offset + content.len();
213
214 chunks.push(Chunk {
215 index: chunk_index,
216 content,
217 start_offset,
218 end_offset,
219 });
220 chunk_index += 1;
221 }
222
223 word_index += step;
224 }
225
226 chunks
227 }
228
229 fn chunk_semantically(&self, text: &str) -> Vec<Chunk> {
231 let splitter = TextSplitter::new(self.config.chunk_size);
232
233 let mut chunks = Vec::new();
234 let mut current_offset = 0;
235
236 for (index, chunk_text) in splitter.chunks(text).enumerate() {
237 let start_offset = text[current_offset..]
239 .find(chunk_text)
240 .map(|pos| current_offset + pos)
241 .unwrap_or(current_offset);
242 let end_offset = start_offset + chunk_text.len();
243
244 if chunk_text.len() >= self.config.min_chunk_size {
245 chunks.push(Chunk {
246 index,
247 content: chunk_text.to_string(),
248 start_offset,
249 end_offset,
250 });
251 }
252
253 current_offset = end_offset;
254 }
255
256 chunks
257 }
258
259 fn chunk_by_characters(&self, text: &str) -> Vec<Chunk> {
261 let chars: Vec<char> = text.chars().collect();
262 let mut chunks = Vec::new();
263 let step = self
264 .config
265 .chunk_size
266 .saturating_sub(self.config.chunk_overlap)
267 .max(1);
268
269 let mut char_index = 0;
270 let mut chunk_index = 0;
271
272 while char_index < chars.len() {
273 let end = (char_index + self.config.chunk_size).min(chars.len());
274 let content: String = chars[char_index..end].iter().collect();
275
276 if content.len() >= self.config.min_chunk_size {
277 chunks.push(Chunk {
278 index: chunk_index,
279 content,
280 start_offset: char_index,
281 end_offset: end,
282 });
283 chunk_index += 1;
284 }
285
286 char_index += step;
287 }
288
289 chunks
290 }
291
292 pub fn config(&self) -> &ChunkerConfig {
294 &self.config
295 }
296}
297
298impl Default for TextChunker {
299 fn default() -> Self {
300 Self::new(ChunkerConfig::default())
301 }
302}
303
#[cfg(test)]
mod tests {
    use super::*;

    // Parsing accepts the canonical name for each variant.
    #[test]
    fn test_chunking_strategy_from_str() {
        assert_eq!(
            "word".parse::<ChunkingStrategy>().unwrap(),
            ChunkingStrategy::Word
        );
        assert_eq!(
            "semantic".parse::<ChunkingStrategy>().unwrap(),
            ChunkingStrategy::Semantic
        );
        assert_eq!(
            "character".parse::<ChunkingStrategy>().unwrap(),
            ChunkingStrategy::Character
        );
    }

    // Word strategy: no chunk exceeds the configured word count.
    #[test]
    fn test_word_chunking_basic() {
        let chunker = TextChunker::with_word_chunking(5, 2);
        let text = "one two three four five six seven eight nine ten";
        let chunks = chunker.chunk(text);

        assert!(!chunks.is_empty());
        assert!(chunks[0].split_whitespace().count() <= 5);
    }

    // Overlapping windows over 10 words must produce more than one chunk.
    #[test]
    fn test_word_chunking_overlap() {
        let config = ChunkerConfig {
            strategy: ChunkingStrategy::Word,
            chunk_size: 4,
            chunk_overlap: 2,
            min_chunk_size: 5,
        };
        let chunker = TextChunker::new(config);
        let text = "alpha bravo charlie delta echo foxtrot golf hotel india juliet";
        let chunks = chunker.chunk(text);

        assert!(
            chunks.len() > 1,
            "Expected multiple chunks, got: {:?}",
            chunks
        );
    }

    // Semantic strategy: smoke test that splitting yields output.
    #[test]
    fn test_semantic_chunking() {
        let chunker = TextChunker::with_semantic_chunking(100);
        let text = "This is the first sentence. This is the second sentence. \
                    And here is a third one that is a bit longer.";
        let chunks = chunker.chunk(text);

        assert!(!chunks.is_empty());
    }

    // Character strategy: chunk contents respect the size limit.
    #[test]
    fn test_character_chunking() {
        let config = ChunkerConfig {
            strategy: ChunkingStrategy::Character,
            chunk_size: 20,
            chunk_overlap: 5,
            min_chunk_size: 10,
        };
        let chunker = TextChunker::new(config);
        let text = "This is a test string that should be chunked by characters.";
        let chunks = chunker.chunk_with_metadata(text);

        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert!(chunk.content.len() <= 20);
        }
    }

    // Metadata invariants: first index is 0 and offsets are ordered.
    #[test]
    fn test_chunk_metadata() {
        let chunker = TextChunker::with_semantic_chunking(50);
        let text = "Hello world. This is a test.";
        let chunks = chunker.chunk_with_metadata(text);

        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].index, 0);
        assert!(chunks[0].start_offset < chunks[0].end_offset);
    }

    // Default config must match the serde default helpers.
    #[test]
    fn test_default_config() {
        let config = ChunkerConfig::default();
        assert_eq!(config.strategy, ChunkingStrategy::Word);
        assert_eq!(config.chunk_size, 512);
        assert_eq!(config.chunk_overlap, 50);
    }

    // The plain `chunk` API (content only) keeps working.
    #[test]
    fn test_backward_compatible_api() {
        let chunker = TextChunker::with_word_chunking(100, 10);
        let text = "Hello world. This is a test with multiple words.";
        let chunks = chunker.chunk(text);
        assert!(!chunks.is_empty());
    }

    // Empty input produces no chunks (rather than panicking).
    #[test]
    fn test_empty_text() {
        let chunker = TextChunker::default();
        let chunks = chunker.chunk("");
        assert!(chunks.is_empty());
    }

    // Input smaller than one window yields a single whole-text chunk.
    #[test]
    fn test_small_text() {
        let config = ChunkerConfig {
            strategy: ChunkingStrategy::Word,
            chunk_size: 100,
            chunk_overlap: 10,
            min_chunk_size: 5,
        };
        let chunker = TextChunker::new(config);
        let text = "Short text";
        let chunks = chunker.chunk(text);

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0], "Short text");
    }
}