Skip to main content

rlm_rs/chunking/
fixed.rs

//! Fixed-size chunking strategy.
//!
//! Provides simple size-based chunking with configurable chunk size and overlap,
//! both measured in bytes of UTF-8 text. Chunk boundaries are aligned to UTF-8
//! character boundaries to avoid splitting multi-byte characters.
5
6use crate::chunking::traits::{ChunkMetadata, Chunker};
7use crate::chunking::{DEFAULT_CHUNK_SIZE, DEFAULT_OVERLAP, MAX_CHUNK_SIZE};
8use crate::core::Chunk;
9use crate::error::{ChunkingError, Result};
10
/// Fixed-size chunker that splits text on UTF-8 character boundaries.
///
/// This is the simplest chunking strategy: the text is cut into segments of
/// a fixed target size with optional overlap between neighbours. Sizes and
/// overlap are measured in **bytes** of the UTF-8 encoded text (the same unit
/// as `str::len` and the chunk byte ranges), not in Unicode scalar values.
/// Cut points are placed on character boundaries so that a multi-byte
/// character is never split.
///
/// # Examples
///
/// ```
/// use rlm_rs::chunking::{Chunker, FixedChunker};
///
/// let chunker = FixedChunker::with_size(100);
/// let text = "Hello, world! ".repeat(20);
/// let chunks = chunker.chunk(1, &text, None).unwrap();
/// for chunk in &chunks {
///     assert!(chunk.size() <= 100);
/// }
/// ```
#[derive(Debug, Clone)]
pub struct FixedChunker {
    /// Target chunk size in bytes of UTF-8 text.
    chunk_size: usize,
    /// Number of bytes shared between consecutive chunks.
    overlap: usize,
    /// Whether chunk ends are pulled back to a nearby line break.
    line_aware: bool,
}
38
39impl Default for FixedChunker {
40    fn default() -> Self {
41        Self::new()
42    }
43}
44
45impl FixedChunker {
46    /// Creates a new fixed chunker with default settings.
47    #[must_use]
48    pub const fn new() -> Self {
49        Self {
50            chunk_size: DEFAULT_CHUNK_SIZE,
51            overlap: DEFAULT_OVERLAP,
52            line_aware: true,
53        }
54    }
55
56    /// Creates a fixed chunker with custom chunk size and no overlap.
57    ///
58    /// # Arguments
59    ///
60    /// * `chunk_size` - Target size for each chunk in characters.
61    #[must_use]
62    pub const fn with_size(chunk_size: usize) -> Self {
63        Self {
64            chunk_size,
65            overlap: 0,
66            line_aware: true,
67        }
68    }
69
70    /// Creates a fixed chunker with custom size and overlap.
71    ///
72    /// # Arguments
73    ///
74    /// * `chunk_size` - Target size for each chunk in characters.
75    /// * `overlap` - Number of characters to overlap between chunks.
76    #[must_use]
77    pub const fn with_size_and_overlap(chunk_size: usize, overlap: usize) -> Self {
78        Self {
79            chunk_size,
80            overlap,
81            line_aware: true,
82        }
83    }
84
85    /// Sets whether to align chunks to line boundaries.
86    ///
87    /// When enabled, chunks will end at the nearest newline before
88    /// the target size (if one exists within a reasonable range).
89    #[must_use]
90    pub const fn line_aware(mut self, enabled: bool) -> Self {
91        self.line_aware = enabled;
92        self
93    }
94
95    /// Finds a valid chunk boundary respecting UTF-8 and optionally lines.
96    fn find_boundary(&self, text: &str, target_pos: usize) -> usize {
97        let mut pos = target_pos.min(text.len());
98
99        // First, find valid UTF-8 boundary
100        while !text.is_char_boundary(pos) && pos > 0 {
101            pos -= 1;
102        }
103
104        // If line-aware, try to find a newline before this position
105        if self.line_aware && pos > 0 {
106            let search_start = pos.saturating_sub(self.chunk_size / 10); // Look back up to 10%
107            if let Some(newline_offset) = text[search_start..pos].rfind('\n') {
108                let newline_pos = search_start + newline_offset + 1; // Position after newline
109                if newline_pos > search_start {
110                    return newline_pos;
111                }
112            }
113        }
114
115        pos
116    }
117}
118
119impl Chunker for FixedChunker {
120    fn chunk(
121        &self,
122        buffer_id: i64,
123        text: &str,
124        metadata: Option<&ChunkMetadata>,
125    ) -> Result<Vec<Chunk>> {
126        // Get effective chunk size and overlap
127        let (chunk_size, overlap) = metadata.map_or((self.chunk_size, self.overlap), |meta| {
128            (meta.chunk_size, meta.overlap)
129        });
130
131        // Validate configuration
132        if chunk_size == 0 {
133            return Err(ChunkingError::InvalidConfig {
134                reason: "chunk_size must be > 0".to_string(),
135            }
136            .into());
137        }
138        if chunk_size > MAX_CHUNK_SIZE {
139            return Err(ChunkingError::ChunkTooLarge {
140                size: chunk_size,
141                max: MAX_CHUNK_SIZE,
142            }
143            .into());
144        }
145        if overlap >= chunk_size {
146            return Err(ChunkingError::OverlapTooLarge {
147                overlap,
148                size: chunk_size,
149            }
150            .into());
151        }
152
153        // Handle empty text
154        if text.is_empty() {
155            return Ok(vec![]);
156        }
157
158        // Handle text smaller than chunk size
159        if text.len() <= chunk_size {
160            return Ok(vec![Chunk::with_strategy(
161                buffer_id,
162                text.to_string(),
163                0..text.len(),
164                0,
165                self.name(),
166            )]);
167        }
168
169        let mut chunks = Vec::new();
170        let mut start = 0;
171        let mut index = 0;
172
173        while start < text.len() {
174            let target_end = (start + chunk_size).min(text.len());
175            let end = if target_end >= text.len() {
176                text.len()
177            } else {
178                self.find_boundary(text, target_end)
179            };
180
181            // Ensure we make progress
182            let end = if end <= start {
183                (start + chunk_size).min(text.len())
184            } else {
185                end
186            };
187
188            let content = text[start..end].to_string();
189            let mut chunk =
190                Chunk::with_strategy(buffer_id, content, start..end, index, self.name());
191
192            if index > 0 && overlap > 0 {
193                chunk.set_has_overlap(true);
194            }
195
196            chunks.push(chunk);
197
198            // Check max chunks limit
199            if let Some(meta) = metadata
200                && meta.max_chunks > 0
201                && chunks.len() >= meta.max_chunks
202            {
203                break;
204            }
205
206            // Move to next chunk
207            if end >= text.len() {
208                break;
209            }
210
211            start = if overlap > 0 {
212                end.saturating_sub(overlap)
213            } else {
214                end
215            };
216
217            // Ensure we don't go backwards
218            if start <= chunks.last().map_or(0, |c| c.byte_range.start) {
219                start = end;
220            }
221
222            index += 1;
223        }
224
225        Ok(chunks)
226    }
227
228    fn name(&self) -> &'static str {
229        "fixed"
230    }
231
232    fn description(&self) -> &'static str {
233        "Fixed-size chunking with optional line boundary alignment"
234    }
235}
236
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_fixed_chunker_default() {
        let chunker = FixedChunker::new();
        assert_eq!(chunker.chunk_size, DEFAULT_CHUNK_SIZE);
        assert_eq!(chunker.overlap, DEFAULT_OVERLAP);
    }

    #[test]
    fn test_fixed_chunker_empty_text() {
        let chunker = FixedChunker::with_size(100);
        let chunks = chunker.chunk(1, "", None).unwrap();
        assert!(chunks.is_empty());
    }

    #[test]
    fn test_fixed_chunker_small_text() {
        let chunker = FixedChunker::with_size(100);
        let text = "Hello, world!";
        let chunks = chunker.chunk(1, text, None).unwrap();
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, text);
        assert_eq!(chunks[0].byte_range, 0..text.len());
    }

    #[test]
    fn test_fixed_chunker_exact_size() {
        // Text exactly as long as the chunk size must stay in one chunk.
        let chunker = FixedChunker::with_size(10).line_aware(false);
        let text = "0123456789";
        let chunks = chunker.chunk(1, text, None).unwrap();
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, text);
    }

    #[test]
    fn test_fixed_chunker_multiple_chunks() {
        let chunker = FixedChunker::with_size(10).line_aware(false);
        let text = "0123456789ABCDEFGHIJ";
        let chunks = chunker.chunk(1, text, None).unwrap();
        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].byte_range, 0..10);
        assert_eq!(chunks[1].byte_range, 10..20);
        assert_eq!(chunks[0].content, "0123456789");
        assert_eq!(chunks[1].content, "ABCDEFGHIJ");
    }

    #[test]
    fn test_fixed_chunker_with_overlap() {
        let chunker = FixedChunker::with_size_and_overlap(10, 3).line_aware(false);
        let text = "0123456789ABCDEFGHIJ";
        let chunks = chunker.chunk(1, text, None).unwrap();

        // Each chunk starts 3 bytes before the previous chunk's end.
        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].byte_range, 0..10);
        assert_eq!(chunks[1].byte_range, 7..17);
        assert_eq!(chunks[2].byte_range, 14..20);
        assert!(chunks[1].metadata.has_overlap);
        assert!(chunks[2].metadata.has_overlap);
    }

    #[test]
    fn test_fixed_chunker_line_aware() {
        // Size 20 gives a 2-byte look-back window (bytes 18..20); the newline
        // at byte 18 falls inside it, so the first chunk ends right after it.
        let chunker = FixedChunker::with_size(20).line_aware(true);
        let text = "alpha\nbravo\nsierra\ndelta echo";
        let chunks = chunker.chunk(1, text, None).unwrap();

        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].content, "alpha\nbravo\nsierra\n");
        assert_eq!(chunks[1].content, "delta echo");
    }

    #[test]
    fn test_fixed_chunker_unicode() {
        let chunker = FixedChunker::with_size(5).line_aware(false);
        let text = "Hello世界Test";
        let chunks = chunker.chunk(1, text, None).unwrap();

        // The cut that would land inside '界' is moved back to its start.
        let expected = ["Hello", "世", "界Te", "st"];
        assert_eq!(chunks.len(), expected.len());
        for (chunk, want) in chunks.iter().zip(expected.iter()) {
            assert_eq!(chunk.content, *want);
            assert!(text.is_char_boundary(chunk.byte_range.start));
            assert!(text.is_char_boundary(chunk.byte_range.end));
        }
    }

    #[test]
    fn test_fixed_chunker_preserves_indices() {
        let chunker = FixedChunker::with_size(10).line_aware(false);
        let text = "0123456789ABCDEFGHIJ";
        let chunks = chunker.chunk(1, text, None).unwrap();

        for (i, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.index, i);
            assert_eq!(chunk.buffer_id, 1);
        }
    }

    #[test]
    fn test_fixed_chunker_invalid_config() {
        let chunker = FixedChunker::with_size(0);
        let result = chunker.chunk(1, "test", None);
        assert!(result.is_err());
    }

    #[test]
    fn test_fixed_chunker_overlap_too_large() {
        let chunker = FixedChunker::with_size_and_overlap(10, 10);
        let result = chunker.chunk(1, "test content here", None);
        assert!(result.is_err());
    }

    #[test]
    fn test_fixed_chunker_max_chunks() {
        let chunker = FixedChunker::with_size(5).line_aware(false);
        let text = "0123456789ABCDEFGHIJ";
        let meta = ChunkMetadata::with_size(5).max_chunks(2);
        let chunks = chunker.chunk(1, text, Some(&meta)).unwrap();
        assert_eq!(chunks.len(), 2);
    }

    #[test]
    fn test_fixed_chunker_strategy_name() {
        let chunker = FixedChunker::new();
        assert_eq!(chunker.name(), "fixed");

        let chunks = chunker.chunk(1, "Hello, world!", None).unwrap();
        assert_eq!(chunks[0].metadata.strategy, Some("fixed".to_string()));
    }

    #[test]
    fn test_fixed_chunker_default_impl() {
        // `Default` must agree with the documented defaults.
        let chunker = FixedChunker::default();
        assert_eq!(chunker.chunk_size, DEFAULT_CHUNK_SIZE);
        assert_eq!(chunker.overlap, DEFAULT_OVERLAP);
        assert!(chunker.line_aware);
    }

    #[test]
    fn test_fixed_chunker_chunk_too_large() {
        // A chunk size above the crate-wide maximum is rejected.
        let chunker = FixedChunker::with_size(MAX_CHUNK_SIZE + 1);
        let result = chunker.chunk(1, "test", None);
        assert!(result.is_err());
    }

    #[test]
    fn test_fixed_chunker_line_aware_boundary() {
        // Size 50 gives a 5-byte look-back window (bytes 45..50). The second
        // newline sits at byte 47, so the first chunk must end at byte 48.
        let chunker = FixedChunker::with_size(50).line_aware(true);
        let text = format!("{}\n{}\n{}", "x".repeat(30), "y".repeat(16), "z".repeat(30));
        let chunks = chunker.chunk(1, &text, None).unwrap();

        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].byte_range, 0..48);
        assert!(chunks[0].content.ends_with('\n'));
        assert_eq!(chunks[1].byte_range, 48..78);
    }

    #[test]
    fn test_fixed_chunker_description() {
        let chunker = FixedChunker::new();
        let desc = chunker.description();
        assert!(desc.contains("Fixed"));
        assert!(!desc.is_empty());
    }

    #[test]
    fn test_fixed_chunker_large_overlap() {
        // With overlap 8 of 10 each chunk advances by only 2 bytes.
        let chunker = FixedChunker::with_size_and_overlap(10, 8).line_aware(false);
        let text = "AAAAAAAAAABBBBBBBBBBCCCCCCCCCC";
        let chunks = chunker.chunk(1, text, None).unwrap();

        assert!(chunks.len() >= 2);
        // Every chunk must start strictly after the previous one.
        for pair in chunks.windows(2) {
            assert!(pair[1].byte_range.start > pair[0].byte_range.start);
        }
        assert_eq!(chunks[chunks.len() - 1].byte_range.end, text.len());
    }

    #[test]
    fn test_fixed_chunker_metadata_override() {
        // Metadata chunk_size (20) and overlap (5) replace the chunker's own
        // settings (1000 / 0).
        let chunker = FixedChunker::with_size(1000);
        let text = "A".repeat(50);
        let meta = ChunkMetadata::with_size_and_overlap(20, 5);
        let chunks = chunker.chunk(1, &text, Some(&meta)).unwrap();

        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].byte_range, 0..20);
        assert_eq!(chunks[1].byte_range, 15..35);
        assert_eq!(chunks[2].byte_range, 30..50);
    }

    #[test]
    fn test_fixed_chunker_line_aware_newline_found() {
        // Size 25 gives a 2-byte look-back window (bytes 23..25); the newline
        // at byte 23 is inside it, so the cut moves from 25 back to 24.
        let chunker = FixedChunker::with_size(25).line_aware(true);
        let text = "Hello world, how are we\nSecond line";
        let chunks = chunker.chunk(1, text, None).unwrap();

        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].content, "Hello world, how are we\n");
        assert_eq!(chunks[1].content, "Second line");
    }

    #[test]
    fn test_fixed_chunker_force_progress_edge_case() {
        // Tiny chunk size on ASCII input: every target offset is already a
        // valid boundary, so the text must be covered exactly once, in order.
        let chunker = FixedChunker::with_size(3).line_aware(false);
        let text = "ABCDEFGHIJ";
        let chunks = chunker.chunk(1, text, None).unwrap();

        let expected = ["ABC", "DEF", "GHI", "J"];
        assert_eq!(chunks.len(), expected.len());
        for (chunk, want) in chunks.iter().zip(expected.iter()) {
            assert_eq!(chunk.content, *want);
        }
        let total_len: usize = chunks.iter().map(|c| c.content.len()).sum();
        assert_eq!(total_len, text.len());
    }

    #[test]
    fn test_fixed_chunker_no_backward_progress() {
        // Overlap of 9 out of 10 bytes: each chunk advances by a single byte.
        let chunker = FixedChunker::with_size_and_overlap(10, 9).line_aware(false);
        let text = "ABCDEFGHIJKLMNOPQRST";
        let chunks = chunker.chunk(1, text, None).unwrap();

        assert_eq!(chunks.len(), 11);
        for i in 1..chunks.len() {
            assert!(
                chunks[i].byte_range.start > chunks[i - 1].byte_range.start,
                "Chunk {} does not start after chunk {}: {} <= {}",
                i,
                i - 1,
                chunks[i].byte_range.start,
                chunks[i - 1].byte_range.start
            );
        }
    }
}