1use crate::chunking::traits::{ChunkMetadata, Chunker};
7use crate::chunking::{DEFAULT_CHUNK_SIZE, DEFAULT_OVERLAP, MAX_CHUNK_SIZE};
8use crate::core::Chunk;
9use crate::error::{ChunkingError, Result};
10
/// Chunker that splits text into pieces bounded by a fixed byte budget.
///
/// The values stored here are the chunker's own defaults; `chunk` uses the
/// size and overlap from its `metadata` argument instead when one is supplied.
#[derive(Clone, Debug)]
pub struct FixedChunker {
    /// Target size of each chunk, measured in bytes of the input text.
    chunk_size: usize,
    /// Number of bytes the start of a chunk is moved back from the end of the
    /// previous chunk.
    overlap: usize,
    /// When `true`, a chunk end may be pulled back to just after a nearby
    /// newline instead of cutting through a line.
    line_aware: bool,
}
38
39impl Default for FixedChunker {
40 fn default() -> Self {
41 Self::new()
42 }
43}
44
45impl FixedChunker {
46 #[must_use]
48 pub const fn new() -> Self {
49 Self {
50 chunk_size: DEFAULT_CHUNK_SIZE,
51 overlap: DEFAULT_OVERLAP,
52 line_aware: true,
53 }
54 }
55
56 #[must_use]
62 pub const fn with_size(chunk_size: usize) -> Self {
63 Self {
64 chunk_size,
65 overlap: 0,
66 line_aware: true,
67 }
68 }
69
70 #[must_use]
77 pub const fn with_size_and_overlap(chunk_size: usize, overlap: usize) -> Self {
78 Self {
79 chunk_size,
80 overlap,
81 line_aware: true,
82 }
83 }
84
85 #[must_use]
90 pub const fn line_aware(mut self, enabled: bool) -> Self {
91 self.line_aware = enabled;
92 self
93 }
94
95 fn find_boundary(&self, text: &str, target_pos: usize) -> usize {
97 let mut pos = target_pos.min(text.len());
98
99 while !text.is_char_boundary(pos) && pos > 0 {
101 pos -= 1;
102 }
103
104 if self.line_aware && pos > 0 {
106 let search_start = pos.saturating_sub(self.chunk_size / 10); if let Some(newline_offset) = text[search_start..pos].rfind('\n') {
108 let newline_pos = search_start + newline_offset + 1; if newline_pos > search_start {
110 return newline_pos;
111 }
112 }
113 }
114
115 pos
116 }
117}
118
119impl Chunker for FixedChunker {
120 fn chunk(
121 &self,
122 buffer_id: i64,
123 text: &str,
124 metadata: Option<&ChunkMetadata>,
125 ) -> Result<Vec<Chunk>> {
126 let (chunk_size, overlap) = metadata.map_or((self.chunk_size, self.overlap), |meta| {
128 (meta.chunk_size, meta.overlap)
129 });
130
131 if chunk_size == 0 {
133 return Err(ChunkingError::InvalidConfig {
134 reason: "chunk_size must be > 0".to_string(),
135 }
136 .into());
137 }
138 if chunk_size > MAX_CHUNK_SIZE {
139 return Err(ChunkingError::ChunkTooLarge {
140 size: chunk_size,
141 max: MAX_CHUNK_SIZE,
142 }
143 .into());
144 }
145 if overlap >= chunk_size {
146 return Err(ChunkingError::OverlapTooLarge {
147 overlap,
148 size: chunk_size,
149 }
150 .into());
151 }
152
153 if text.is_empty() {
155 return Ok(vec![]);
156 }
157
158 if text.len() <= chunk_size {
160 return Ok(vec![Chunk::with_strategy(
161 buffer_id,
162 text.to_string(),
163 0..text.len(),
164 0,
165 self.name(),
166 )]);
167 }
168
169 let mut chunks = Vec::new();
170 let mut start = 0;
171 let mut index = 0;
172
173 while start < text.len() {
174 let target_end = (start + chunk_size).min(text.len());
175 let end = if target_end >= text.len() {
176 text.len()
177 } else {
178 self.find_boundary(text, target_end)
179 };
180
181 let end = if end <= start {
183 (start + chunk_size).min(text.len())
184 } else {
185 end
186 };
187
188 let content = text[start..end].to_string();
189 let mut chunk =
190 Chunk::with_strategy(buffer_id, content, start..end, index, self.name());
191
192 if index > 0 && overlap > 0 {
193 chunk.set_has_overlap(true);
194 }
195
196 chunks.push(chunk);
197
198 if let Some(meta) = metadata
200 && meta.max_chunks > 0
201 && chunks.len() >= meta.max_chunks
202 {
203 break;
204 }
205
206 if end >= text.len() {
208 break;
209 }
210
211 start = if overlap > 0 {
212 end.saturating_sub(overlap)
213 } else {
214 end
215 };
216
217 if start <= chunks.last().map_or(0, |c| c.byte_range.start) {
219 start = end;
220 }
221
222 index += 1;
223 }
224
225 Ok(chunks)
226 }
227
228 fn name(&self) -> &'static str {
229 "fixed"
230 }
231
232 fn description(&self) -> &'static str {
233 "Fixed-size chunking with optional line boundary alignment"
234 }
235}
236
#[cfg(test)]
mod tests {
    use super::*;

    /// Concatenates chunk contents in order. For chunking without overlap
    /// this must reproduce the input exactly (no gaps, no duplication).
    fn joined(chunks: &[Chunk]) -> String {
        chunks.iter().map(|chunk| chunk.content.as_str()).collect()
    }

    #[test]
    fn test_fixed_chunker_default() {
        let chunker = FixedChunker::new();
        assert_eq!(chunker.chunk_size, DEFAULT_CHUNK_SIZE);
        assert_eq!(chunker.overlap, DEFAULT_OVERLAP);
    }

    #[test]
    fn test_fixed_chunker_empty_text() {
        let chunker = FixedChunker::with_size(100);
        let chunks = chunker.chunk(1, "", None).unwrap();
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_fixed_chunker_small_text() {
        let chunker = FixedChunker::with_size(100);
        let text = "Hello, world!";
        let chunks = chunker.chunk(1, text, None).unwrap();
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, text);
    }

    #[test]
    fn test_fixed_chunker_exact_size() {
        let chunker = FixedChunker::with_size(10).line_aware(false);
        let text = "0123456789";
        let chunks = chunker.chunk(1, text, None).unwrap();
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, text);
    }

    #[test]
    fn test_fixed_chunker_multiple_chunks() {
        let chunker = FixedChunker::with_size(10).line_aware(false);
        let text = "0123456789ABCDEFGHIJ";
        let chunks = chunker.chunk(1, text, None).unwrap();
        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].byte_range, 0..10);
        assert_eq!(chunks[1].byte_range, 10..20);
    }

    #[test]
    fn test_fixed_chunker_with_overlap() {
        let chunker = FixedChunker::with_size_and_overlap(10, 3).line_aware(false);
        let text = "0123456789ABCDEFGHIJ";
        let chunks = chunker.chunk(1, text, None).unwrap();

        // Each chunk starts 3 bytes before the previous one ended.
        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].byte_range, 0..10);
        assert_eq!(chunks[1].byte_range, 7..17);
        assert_eq!(chunks[2].byte_range, 14..20);
        assert!(chunks[1].metadata.has_overlap);
        assert!(chunks[2].metadata.has_overlap);
    }

    #[test]
    fn test_fixed_chunker_line_aware() {
        let chunker = FixedChunker::with_size(15).line_aware(true);
        let text = "Hello\nWorld\nTest";
        let chunks = chunker.chunk(1, text, None).unwrap();

        assert!(!chunks.is_empty());
        // Without overlap the chunks must tile the input exactly.
        assert_eq!(joined(&chunks), text);
    }

    #[test]
    fn test_fixed_chunker_unicode() {
        let chunker = FixedChunker::with_size(5).line_aware(false);
        let text = "Hello世界Test";
        let chunks = chunker.chunk(1, text, None).unwrap();

        assert!(chunks.len() > 1);
        for chunk in &chunks {
            // No chunk may begin or end inside a multi-byte character, and
            // the recorded range must match the stored content.
            assert!(text.is_char_boundary(chunk.byte_range.start));
            assert!(text.is_char_boundary(chunk.byte_range.end));
            assert_eq!(chunk.content, &text[chunk.byte_range.clone()]);
        }
        assert_eq!(joined(&chunks), text);
    }

    #[test]
    fn test_fixed_chunker_preserves_indices() {
        let chunker = FixedChunker::with_size(10).line_aware(false);
        let text = "0123456789ABCDEFGHIJ";
        let chunks = chunker.chunk(1, text, None).unwrap();

        for (i, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.index, i);
            assert_eq!(chunk.buffer_id, 1);
        }
    }

    #[test]
    fn test_fixed_chunker_invalid_config() {
        let chunker = FixedChunker::with_size(0);
        let result = chunker.chunk(1, "test", None);
        assert!(result.is_err());
    }

    #[test]
    fn test_fixed_chunker_overlap_too_large() {
        let chunker = FixedChunker::with_size_and_overlap(10, 10);
        let result = chunker.chunk(1, "test content here", None);
        assert!(result.is_err());
    }

    #[test]
    fn test_fixed_chunker_max_chunks() {
        let chunker = FixedChunker::with_size(5).line_aware(false);
        let text = "0123456789ABCDEFGHIJ";
        let meta = ChunkMetadata::with_size(5).max_chunks(2);
        let chunks = chunker.chunk(1, text, Some(&meta)).unwrap();
        assert_eq!(chunks.len(), 2);
    }

    #[test]
    fn test_fixed_chunker_strategy_name() {
        let chunker = FixedChunker::new();
        assert_eq!(chunker.name(), "fixed");

        let chunks = chunker.chunk(1, "Hello, world!", None).unwrap();
        assert_eq!(chunks[0].metadata.strategy, Some("fixed".to_string()));
    }

    #[test]
    fn test_fixed_chunker_default_impl() {
        let chunker = FixedChunker::default();
        assert_eq!(chunker.chunk_size, DEFAULT_CHUNK_SIZE);
        assert_eq!(chunker.overlap, DEFAULT_OVERLAP);
        assert!(chunker.line_aware);
    }

    #[test]
    fn test_fixed_chunker_chunk_too_large() {
        let chunker = FixedChunker::with_size(MAX_CHUNK_SIZE + 1);
        let result = chunker.chunk(1, "test", None);
        assert!(result.is_err());
    }

    #[test]
    fn test_fixed_chunker_line_aware_boundary() {
        let chunker = FixedChunker::with_size(20).line_aware(true);
        let text = "Hello world\nSecond line here\nThird line";
        let chunks = chunker.chunk(1, text, None).unwrap();

        assert!(!chunks.is_empty());
        for chunk in &chunks[..chunks.len().saturating_sub(1)] {
            let content = &chunk.content;
            assert!(content.ends_with('\n') || content.len() <= 20);
        }
        assert_eq!(joined(&chunks), text);
    }

    #[test]
    fn test_fixed_chunker_description() {
        let chunker = FixedChunker::new();
        let desc = chunker.description();
        assert!(desc.contains("Fixed"));
        assert!(!desc.is_empty());
    }

    #[test]
    fn test_fixed_chunker_large_overlap() {
        let chunker = FixedChunker::with_size_and_overlap(10, 8).line_aware(false);
        let text = "AAAAAAAAAABBBBBBBBBBCCCCCCCCCC";
        let chunks = chunker.chunk(1, text, None).unwrap();

        assert!(chunks.len() >= 2);
        // Every chunk must begin strictly after the previous one began,
        // otherwise the chunker would be repeating itself.
        for pair in chunks.windows(2) {
            assert!(pair[1].byte_range.start > pair[0].byte_range.start);
        }
        assert_eq!(chunks.last().map(|c| c.byte_range.end), Some(text.len()));
    }

    #[test]
    fn test_fixed_chunker_metadata_override() {
        let chunker = FixedChunker::with_size(1000);
        let text = "A".repeat(50);
        let meta = ChunkMetadata::with_size_and_overlap(20, 5);
        let chunks = chunker.chunk(1, &text, Some(&meta)).unwrap();

        assert!(chunks.len() > 1);
        // The metadata's size (20), not the chunker's (1000), must apply.
        assert!(chunks.iter().all(|chunk| chunk.content.len() <= 20));
    }

    #[test]
    fn test_fixed_chunker_line_aware_newline_found() {
        let chunker = FixedChunker::with_size(25).line_aware(true);
        let text = "Hello world here\nSecond line of text\nThird line";
        let chunks = chunker.chunk(1, text, None).unwrap();

        assert!(!chunks.is_empty());
        assert_eq!(joined(&chunks), text);

        // The input above never has a newline inside the search window
        // (chunk_size / 10 bytes before the target), so exercise the newline
        // path explicitly: with size 20 the window is bytes 18..20 and the
        // newline sits at byte 18, so the first chunk must end right after it.
        let chunker = FixedChunker::with_size(20).line_aware(true);
        let text = "012345678901234567\nABCDEFGHIJKL";
        let chunks = chunker.chunk(1, text, None).unwrap();

        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].byte_range, 0..19);
        assert!(
            chunks[0].content.ends_with('\n'),
            "First chunk content: '{}'",
            chunks[0].content
        );
        assert_eq!(chunks[1].content, "ABCDEFGHIJKL");
    }

    #[test]
    fn test_fixed_chunker_force_progress_edge_case() {
        let chunker = FixedChunker::with_size(3).line_aware(false);
        let text = "ABCDEFGHIJ";
        let chunks = chunker.chunk(1, text, None).unwrap();

        assert!(!chunks.is_empty());
        assert!(chunks.iter().all(|chunk| chunk.content.len() <= 3));
        // Nothing may be dropped or duplicated.
        assert_eq!(joined(&chunks), text);
    }

    #[test]
    fn test_fixed_chunker_no_backward_progress() {
        let chunker = FixedChunker::with_size_and_overlap(10, 9).line_aware(false);
        let text = "ABCDEFGHIJKLMNOPQRST";
        let chunks = chunker.chunk(1, text, None).unwrap();

        for i in 1..chunks.len() {
            assert!(
                chunks[i].byte_range.start >= chunks[i - 1].byte_range.start,
                "Chunk {} starts before chunk {}: {} < {}",
                i,
                i - 1,
                chunks[i].byte_range.start,
                chunks[i - 1].byte_range.start
            );
        }
    }
}