rlm_rs/chunking/
traits.rs1use crate::core::Chunk;
7use crate::error::Result;
8
9pub trait Chunker: Send + Sync {
26 fn chunk(
42 &self,
43 buffer_id: i64,
44 text: &str,
45 metadata: Option<&ChunkMetadata>,
46 ) -> Result<Vec<Chunk>>;
47
48 fn name(&self) -> &'static str;
50
51 fn supports_parallel(&self) -> bool {
56 false
57 }
58
59 fn description(&self) -> &'static str {
61 "No description available"
62 }
63
64 fn validate(&self, metadata: Option<&ChunkMetadata>) -> Result<()> {
78 if let Some(meta) = metadata {
79 if meta.chunk_size == 0 {
80 return Err(crate::error::ChunkingError::InvalidConfig {
81 reason: "chunk_size must be > 0".to_string(),
82 }
83 .into());
84 }
85 if meta.overlap >= meta.chunk_size {
86 return Err(crate::error::ChunkingError::OverlapTooLarge {
87 overlap: meta.overlap,
88 size: meta.chunk_size,
89 }
90 .into());
91 }
92 }
93 Ok(())
94 }
95}
96
/// Settings that accompany a chunking request.
///
/// The derived [`Default`] zero-initialises every field (`None`, `0`,
/// `false`). A `chunk_size` of `0` is rejected by `Chunker::validate`, so a
/// bare `ChunkMetadata::default()` does not pass validation until
/// `chunk_size` is set.
#[derive(Clone, Debug, Default)]
pub struct ChunkMetadata {
    /// Optional label for the origin of the text.
    // NOTE(review): presumably a file path or identifier; confirm with callers.
    pub source: Option<String>,

    /// Optional content-type tag for the text (the tests use `"txt"`).
    pub content_type: Option<String>,

    /// Requested size of each chunk; must be greater than zero to validate.
    // NOTE(review): unit (bytes vs. characters) is not visible here; confirm.
    pub chunk_size: usize,

    /// Requested overlap between chunks; must be strictly smaller than
    /// `chunk_size` to validate.
    pub overlap: usize,

    /// Line-preservation flag.
    // NOTE(review): presumably asks chunkers not to split inside a line; confirm.
    pub preserve_lines: bool,

    /// Sentence-preservation flag.
    // NOTE(review): presumably asks chunkers not to split inside a sentence; confirm.
    pub preserve_sentences: bool,

    /// Chunk-count setting.
    // NOTE(review): presumably an upper bound, with the default `0` likely
    // meaning "no limit"; confirm against the chunker implementations.
    pub max_chunks: usize,
}
124
125impl ChunkMetadata {
126 #[must_use]
128 pub fn new() -> Self {
129 Self {
130 chunk_size: super::DEFAULT_CHUNK_SIZE,
131 overlap: super::DEFAULT_OVERLAP,
132 preserve_lines: true,
133 preserve_sentences: false,
134 ..Default::default()
135 }
136 }
137
138 #[must_use]
140 pub fn with_size(chunk_size: usize) -> Self {
141 Self {
142 chunk_size,
143 overlap: 0,
144 ..Self::new()
145 }
146 }
147
148 #[must_use]
150 pub fn with_size_and_overlap(chunk_size: usize, overlap: usize) -> Self {
151 Self {
152 chunk_size,
153 overlap,
154 ..Self::new()
155 }
156 }
157
158 #[must_use]
160 pub fn source(mut self, source: &str) -> Self {
161 self.source = Some(source.to_string());
162 self
163 }
164
165 #[must_use]
167 pub fn content_type(mut self, content_type: &str) -> Self {
168 self.content_type = Some(content_type.to_string());
169 self
170 }
171
172 #[must_use]
174 pub const fn preserve_lines(mut self, preserve: bool) -> Self {
175 self.preserve_lines = preserve;
176 self
177 }
178
179 #[must_use]
181 pub const fn preserve_sentences(mut self, preserve: bool) -> Self {
182 self.preserve_sentences = preserve;
183 self
184 }
185
186 #[must_use]
188 pub const fn max_chunks(mut self, max: usize) -> Self {
189 self.max_chunks = max;
190 self
191 }
192}
193
#[cfg(test)]
mod tests {
    use super::*;

    /// Smallest possible implementor: supplies only the required methods so
    /// the trait's default method bodies can be observed.
    struct MinimalChunker;

    impl Chunker for MinimalChunker {
        fn chunk(
            &self,
            _buffer_id: i64,
            _text: &str,
            _metadata: Option<&ChunkMetadata>,
        ) -> crate::error::Result<Vec<crate::core::Chunk>> {
            Ok(Vec::new())
        }

        fn name(&self) -> &'static str {
            "minimal"
        }
    }

    #[test]
    fn test_chunk_metadata_defaults() {
        let ChunkMetadata {
            chunk_size,
            overlap,
            preserve_lines,
            preserve_sentences,
            ..
        } = ChunkMetadata::new();

        assert_eq!(chunk_size, super::super::DEFAULT_CHUNK_SIZE);
        assert_eq!(overlap, super::super::DEFAULT_OVERLAP);
        assert!(preserve_lines);
        assert!(!preserve_sentences);
    }

    #[test]
    fn test_chunk_metadata_builder() {
        let built = ChunkMetadata::with_size_and_overlap(1000, 100)
            .source("test.txt")
            .content_type("txt")
            .preserve_sentences(true)
            .max_chunks(10);

        assert_eq!(built.chunk_size, 1000);
        assert_eq!(built.overlap, 100);
        assert_eq!(built.source.as_deref(), Some("test.txt"));
        assert_eq!(built.content_type.as_deref(), Some("txt"));
        assert!(built.preserve_sentences);
        assert_eq!(built.max_chunks, 10);
    }

    #[test]
    fn test_chunk_metadata_with_size() {
        let sized = ChunkMetadata::with_size(500);
        assert_eq!(sized.chunk_size, 500);
        assert_eq!(sized.overlap, 0);
    }

    #[test]
    fn test_chunk_metadata_preserve_lines() {
        // Same order as before: first switch the flag off, then on.
        for &flag in &[false, true] {
            let meta = ChunkMetadata::new().preserve_lines(flag);
            assert_eq!(meta.preserve_lines, flag);
        }
    }

    #[test]
    fn test_chunker_default_description() {
        assert_eq!(MinimalChunker.description(), "No description available");
    }

    #[test]
    fn test_chunker_default_supports_parallel() {
        assert!(!MinimalChunker.supports_parallel());
    }

    /// Exercises the trait through the concrete `FixedChunker`.
    mod validation_tests {
        use crate::chunking::FixedChunker;
        use crate::chunking::traits::{ChunkMetadata, Chunker};

        /// Runs `validate` on a `FixedChunker` with the given settings and
        /// reports whether they were accepted.
        fn accepts(chunk_size: usize, overlap: usize) -> bool {
            let meta = ChunkMetadata {
                chunk_size,
                overlap,
                ..Default::default()
            };
            FixedChunker::with_size(100).validate(Some(&meta)).is_ok()
        }

        #[test]
        fn test_chunker_validate_zero_chunk_size() {
            assert!(!accepts(0, 0));
        }

        #[test]
        fn test_chunker_validate_overlap_too_large() {
            assert!(!accepts(50, 100));
        }

        #[test]
        fn test_chunker_validate_valid() {
            assert!(accepts(100, 10));
        }

        #[test]
        fn test_chunker_validate_none() {
            let outcome = FixedChunker::with_size(100).validate(None);
            assert!(outcome.is_ok());
        }

        #[test]
        fn test_chunker_supports_parallel() {
            assert!(!FixedChunker::with_size(100).supports_parallel());
        }

        #[test]
        fn test_chunker_description() {
            let text = FixedChunker::with_size(100).description();
            assert!(!text.is_empty());
        }

        #[test]
        fn test_chunker_name() {
            assert_eq!(FixedChunker::with_size(100).name(), "fixed");
        }
    }
}