Skip to main content

rlm_rs/chunking/
mod.rs

1//! Chunking strategies for RLM-RS.
2//!
3//! This module provides a trait-based system for chunking text content
4//! into processable segments. Multiple strategies are available:
5//!
6//! - **Fixed**: Simple character-based chunking with configurable size and overlap
7//! - **Semantic**: Unicode-aware chunking respecting sentence/paragraph boundaries
8//! - **Code**: Language-aware chunking at function/class boundaries
9//! - **Parallel**: Orchestrator for parallel chunk processing
10
11pub mod code;
12pub mod fixed;
13pub mod parallel;
14pub mod semantic;
15pub mod traits;
16
17pub use code::CodeChunker;
18pub use fixed::FixedChunker;
19pub use parallel::ParallelChunker;
20pub use semantic::SemanticChunker;
21pub use traits::{ChunkMetadata as ChunkerMetadata, Chunker};
22
/// Chunk size, in characters, used when the caller does not specify one.
///
/// At roughly 4 characters per token this is about 750 tokens, which keeps
/// chunks granular enough for embedding-based semantic search.
pub const DEFAULT_CHUNK_SIZE: usize = 3_000;

/// Number of characters shared between neighbouring chunks by default, so
/// that context carries over from one chunk to the next.
pub const DEFAULT_OVERLAP: usize = 500;

/// Upper bound on chunk size: 50k characters, about 12.5k tokens at
/// 4 characters per token.
pub const MAX_CHUNK_SIZE: usize = 50_000;
32
33/// Creates the default chunker (semantic).
34#[must_use]
35pub const fn default_chunker() -> SemanticChunker {
36    SemanticChunker::new()
37}
38
39/// Creates a chunker by name.
40///
41/// # Arguments
42///
43/// * `name` - Chunker strategy name: "fixed", "semantic", "code", or "parallel".
44///
45/// # Returns
46///
47/// A boxed chunker trait object, or an error for unknown strategies.
48///
49/// # Errors
50///
51/// Returns [`crate::error::ChunkingError::UnknownStrategy`] if the strategy name is not recognized.
52pub fn create_chunker(name: &str) -> crate::error::Result<Box<dyn Chunker>> {
53    match name.to_lowercase().as_str() {
54        "fixed" => Ok(Box::new(FixedChunker::new())),
55        "semantic" => Ok(Box::new(SemanticChunker::new())),
56        "code" | "ast" => Ok(Box::new(CodeChunker::new())),
57        "parallel" => Ok(Box::new(ParallelChunker::new(SemanticChunker::new()))),
58        _ => Err(crate::error::ChunkingError::UnknownStrategy {
59            name: name.to_string(),
60        }
61        .into()),
62    }
63}
64
/// Returns the canonical names of the supported chunking strategies.
///
/// The names are listed in a fixed order: "fixed", "semantic", "code",
/// "parallel". Aliases (such as "ast" for "code") are not included.
#[must_use]
pub fn available_strategies() -> Vec<&'static str> {
    const STRATEGIES: [&str; 4] = ["fixed", "semantic", "code", "parallel"];
    STRATEGIES.to_vec()
}
70
#[cfg(test)]
mod tests {
    use super::*;

    /// The default chunker reports itself as the semantic strategy.
    #[test]
    fn test_default_chunker() {
        let default = default_chunker();
        assert_eq!(default.name(), "semantic");
    }

    /// "fixed" resolves to the fixed-size chunker.
    #[test]
    fn test_create_chunker_fixed() {
        let built = create_chunker("fixed").unwrap();
        assert_eq!(built.name(), "fixed");
    }

    /// "semantic" resolves to the semantic chunker.
    #[test]
    fn test_create_chunker_semantic() {
        let built = create_chunker("semantic").unwrap();
        assert_eq!(built.name(), "semantic");
    }

    /// "parallel" resolves to the parallel orchestrator.
    #[test]
    fn test_create_chunker_parallel() {
        let built = create_chunker("parallel").unwrap();
        assert_eq!(built.name(), "parallel");
    }

    /// Names that match no strategy are rejected with an error.
    #[test]
    fn test_create_chunker_unknown() {
        assert!(create_chunker("unknown").is_err());
    }

    /// Strategy lookup ignores the case of the supplied name.
    #[test]
    fn test_create_chunker_case_insensitive() {
        let built = create_chunker("FIXED").unwrap();
        assert_eq!(built.name(), "fixed");
    }

    /// The advertised strategy list holds exactly the four canonical names.
    #[test]
    fn test_available_strategies() {
        let names = available_strategies();
        assert_eq!(names.len(), 4);
        for expected in ["fixed", "semantic", "code", "parallel"] {
            assert!(names.contains(&expected));
        }
    }

    /// "code" resolves to the code chunker.
    #[test]
    fn test_create_chunker_code() {
        let built = create_chunker("code").unwrap();
        assert_eq!(built.name(), "code");
    }

    /// "ast" is an alias that also resolves to the code chunker.
    #[test]
    fn test_create_chunker_ast_alias() {
        let built = create_chunker("ast").unwrap();
        assert_eq!(built.name(), "code");
    }
}