// rlm_rs/chunking/traits.rs

//! Chunker trait definition.
//!
//! Defines the interface for all chunking strategies, enabling
//! pluggable text segmentation approaches.

use crate::core::Chunk;
use crate::error::Result;

9/// Trait for chunking text into processable segments.
10///
11/// Implementations must be `Send + Sync` to support parallel processing.
12/// Each chunker should produce consistent, deterministic output for the
13/// same input.
14///
15/// # Examples
16///
17/// ```
18/// use rlm_rs::chunking::{Chunker, FixedChunker};
19///
20/// let chunker = FixedChunker::with_size(100);
21/// let text = "Hello, world! ".repeat(20);
22/// let chunks = chunker.chunk(1, &text, None).unwrap();
23/// assert!(!chunks.is_empty());
24/// ```
25pub trait Chunker: Send + Sync {
26    /// Chunks the input text into segments.
27    ///
28    /// # Arguments
29    ///
30    /// * `buffer_id` - ID of the source buffer.
31    /// * `text` - The input text to chunk.
32    /// * `metadata` - Optional metadata for context-aware chunking.
33    ///
34    /// # Returns
35    ///
36    /// A vector of chunks with byte offsets and metadata.
37    ///
38    /// # Errors
39    ///
40    /// Returns an error if chunking fails (e.g., invalid configuration).
41    fn chunk(
42        &self,
43        buffer_id: i64,
44        text: &str,
45        metadata: Option<&ChunkMetadata>,
46    ) -> Result<Vec<Chunk>>;
47
48    /// Returns the name of the chunking strategy.
49    fn name(&self) -> &'static str;
50
51    /// Returns whether this chunker supports parallel processing.
52    ///
53    /// Default is `false`. Chunkers that benefit from parallelization
54    /// should override this to return `true`.
55    fn supports_parallel(&self) -> bool {
56        false
57    }
58
59    /// Returns a description of the chunking strategy.
60    fn description(&self) -> &'static str {
61        "No description available"
62    }
63
64    /// Validates configuration before chunking.
65    ///
66    /// # Arguments
67    ///
68    /// * `metadata` - Optional metadata to validate.
69    ///
70    /// # Returns
71    ///
72    /// `Ok(())` if configuration is valid, error otherwise.
73    ///
74    /// # Errors
75    ///
76    /// Returns an error if chunk size is zero or overlap exceeds chunk size.
77    fn validate(&self, metadata: Option<&ChunkMetadata>) -> Result<()> {
78        if let Some(meta) = metadata {
79            if meta.chunk_size == 0 {
80                return Err(crate::error::ChunkingError::InvalidConfig {
81                    reason: "chunk_size must be > 0".to_string(),
82                }
83                .into());
84            }
85            if meta.overlap >= meta.chunk_size {
86                return Err(crate::error::ChunkingError::OverlapTooLarge {
87                    overlap: meta.overlap,
88                    size: meta.chunk_size,
89                }
90                .into());
91            }
92        }
93        Ok(())
94    }
95}
96
/// Metadata provided to chunkers for context-aware processing.
///
/// This allows callers to customize chunking behavior without
/// modifying the chunker itself.
///
/// Note that the derived [`Default`] zero-initialises every field
/// (`chunk_size == 0`, `overlap == 0`, all flags `false`, no source or
/// content type). A zero `chunk_size` is rejected by the default
/// `Chunker::validate` implementation, so set `chunk_size` explicitly when
/// building a value from `Default::default()`.
#[derive(Debug, Clone, Default)]
pub struct ChunkMetadata {
    /// Source file path (for content-type detection).
    pub source: Option<String>,

    /// File MIME type or extension (e.g., "md", "json", "py").
    pub content_type: Option<String>,

    /// Target chunk size in characters.
    pub chunk_size: usize,

    /// Overlap between consecutive chunks.
    pub overlap: usize,

    /// Whether to preserve line boundaries.
    pub preserve_lines: bool,

    /// Whether to preserve sentence boundaries.
    pub preserve_sentences: bool,

    /// Maximum chunks to produce (0 = unlimited).
    pub max_chunks: usize,
}

125impl ChunkMetadata {
126    /// Creates new metadata with default chunk size.
127    #[must_use]
128    pub fn new() -> Self {
129        Self {
130            chunk_size: super::DEFAULT_CHUNK_SIZE,
131            overlap: super::DEFAULT_OVERLAP,
132            preserve_lines: true,
133            preserve_sentences: false,
134            ..Default::default()
135        }
136    }
137
138    /// Creates metadata with custom chunk size and no overlap.
139    #[must_use]
140    pub fn with_size(chunk_size: usize) -> Self {
141        Self {
142            chunk_size,
143            overlap: 0,
144            ..Self::new()
145        }
146    }
147
148    /// Creates metadata with custom size and overlap.
149    #[must_use]
150    pub fn with_size_and_overlap(chunk_size: usize, overlap: usize) -> Self {
151        Self {
152            chunk_size,
153            overlap,
154            ..Self::new()
155        }
156    }
157
158    /// Sets the source path.
159    #[must_use]
160    pub fn source(mut self, source: &str) -> Self {
161        self.source = Some(source.to_string());
162        self
163    }
164
165    /// Sets the content type.
166    #[must_use]
167    pub fn content_type(mut self, content_type: &str) -> Self {
168        self.content_type = Some(content_type.to_string());
169        self
170    }
171
172    /// Sets whether to preserve line boundaries.
173    #[must_use]
174    pub const fn preserve_lines(mut self, preserve: bool) -> Self {
175        self.preserve_lines = preserve;
176        self
177    }
178
179    /// Sets whether to preserve sentence boundaries.
180    #[must_use]
181    pub const fn preserve_sentences(mut self, preserve: bool) -> Self {
182        self.preserve_sentences = preserve;
183        self
184    }
185
186    /// Sets maximum chunks.
187    #[must_use]
188    pub const fn max_chunks(mut self, max: usize) -> Self {
189        self.max_chunks = max;
190        self
191    }
192}
193
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_chunk_metadata_defaults() {
        let meta = ChunkMetadata::new();
        assert_eq!(meta.chunk_size, super::super::DEFAULT_CHUNK_SIZE);
        assert_eq!(meta.overlap, super::super::DEFAULT_OVERLAP);
        assert!(meta.preserve_lines);
        assert!(!meta.preserve_sentences);
        // Fields not set explicitly by `new()` fall back to `Default`.
        assert!(meta.source.is_none());
        assert!(meta.content_type.is_none());
        assert_eq!(meta.max_chunks, 0);
    }

    #[test]
    fn test_chunk_metadata_builder() {
        let meta = ChunkMetadata::with_size_and_overlap(1000, 100)
            .source("test.txt")
            .content_type("txt")
            .preserve_sentences(true)
            .max_chunks(10);

        assert_eq!(meta.chunk_size, 1000);
        assert_eq!(meta.overlap, 100);
        assert_eq!(meta.source, Some("test.txt".to_string()));
        assert_eq!(meta.content_type, Some("txt".to_string()));
        assert!(meta.preserve_sentences);
        assert_eq!(meta.max_chunks, 10);
    }

    #[test]
    fn test_chunk_metadata_with_size() {
        let meta = ChunkMetadata::with_size(500);
        assert_eq!(meta.chunk_size, 500);
        assert_eq!(meta.overlap, 0);
    }

    #[test]
    fn test_chunk_metadata_preserve_lines() {
        let meta = ChunkMetadata::new().preserve_lines(false);
        assert!(!meta.preserve_lines);

        let meta = ChunkMetadata::new().preserve_lines(true);
        assert!(meta.preserve_lines);
    }

    // Test validation through FixedChunker since trait methods need a concrete impl
    mod validation_tests {
        use crate::chunking::FixedChunker;
        use crate::chunking::traits::{ChunkMetadata, Chunker};

        #[test]
        fn test_chunker_validate_zero_chunk_size() {
            let chunker = FixedChunker::with_size(100);
            let meta = ChunkMetadata {
                chunk_size: 0,
                overlap: 0,
                ..Default::default()
            };
            let result = chunker.validate(Some(&meta));
            assert!(result.is_err());
        }

        #[test]
        fn test_chunker_validate_overlap_too_large() {
            let chunker = FixedChunker::with_size(100);
            let meta = ChunkMetadata {
                chunk_size: 50,
                overlap: 100, // overlap >= chunk_size
                ..Default::default()
            };
            let result = chunker.validate(Some(&meta));
            assert!(result.is_err());
        }

        #[test]
        fn test_chunker_validate_valid() {
            let chunker = FixedChunker::with_size(100);
            let meta = ChunkMetadata {
                chunk_size: 100,
                overlap: 10,
                ..Default::default()
            };
            let result = chunker.validate(Some(&meta));
            assert!(result.is_ok());
        }

        #[test]
        fn test_chunker_validate_none() {
            let chunker = FixedChunker::with_size(100);
            let result = chunker.validate(None);
            assert!(result.is_ok());
        }

        #[test]
        fn test_chunker_supports_parallel() {
            let chunker = FixedChunker::with_size(100);
            // FixedChunker doesn't support parallel by default
            assert!(!chunker.supports_parallel());
        }

        #[test]
        fn test_chunker_description() {
            let chunker = FixedChunker::with_size(100);
            let desc = chunker.description();
            assert!(!desc.is_empty());
        }

        #[test]
        fn test_chunker_name() {
            let chunker = FixedChunker::with_size(100);
            assert_eq!(chunker.name(), "fixed");
        }
    }

    /// A minimal chunker that uses all default trait implementations
    struct MinimalChunker;

    impl Chunker for MinimalChunker {
        fn chunk(
            &self,
            _buffer_id: i64,
            _text: &str,
            _metadata: Option<&ChunkMetadata>,
        ) -> crate::error::Result<Vec<crate::core::Chunk>> {
            Ok(vec![])
        }

        fn name(&self) -> &'static str {
            "minimal"
        }
    }

    #[test]
    fn test_chunker_default_description() {
        // Exercises the trait's default `description()` implementation.
        let chunker = MinimalChunker;
        let desc = chunker.description();
        assert_eq!(desc, "No description available");
    }

    #[test]
    fn test_chunker_default_supports_parallel() {
        // Exercises the trait's default `supports_parallel()` implementation.
        let chunker = MinimalChunker;
        assert!(!chunker.supports_parallel());
    }

    #[test]
    fn test_default_validate_rejects_overlap_equal_to_chunk_size() {
        // Boundary case: the default `validate()` uses `>=`, so equality fails.
        let meta = ChunkMetadata {
            chunk_size: 50,
            overlap: 50,
            ..Default::default()
        };
        assert!(MinimalChunker.validate(Some(&meta)).is_err());
    }

    #[test]
    fn test_default_validate_accepts_overlap_just_below_chunk_size() {
        // Largest overlap the default `validate()` still accepts.
        let meta = ChunkMetadata {
            chunk_size: 50,
            overlap: 49,
            ..Default::default()
        };
        assert!(MinimalChunker.validate(Some(&meta)).is_ok());
    }
}