// sc/embeddings/chunking.rs
1//! Text chunking for embeddings.
2//!
3//! Splits large text into overlapping chunks for embedding generation.
4//! This ensures full semantic coverage while respecting model token limits.
5//!
6//! # Design Decisions
7//!
//! - **Byte-length chunking**: Simple and predictable; chunk sizes are measured
//!   in bytes of UTF-8. Token-based would be more accurate but requires the model's tokenizer.
10//! - **Word boundary splitting**: Avoids breaking mid-word which can confuse embeddings.
11//! - **Overlapping windows**: Maintains context at chunk boundaries for better retrieval.
12//! - **Configurable parameters**: Different models have different optimal chunk sizes.
13
/// Configuration for text chunking.
///
/// All sizes are measured in bytes of the (UTF-8) input text.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ChunkConfig {
    /// Maximum bytes per chunk.
    /// Default: 2000 (~500 tokens for most models).
    pub max_chars: usize,

    /// Number of bytes to overlap between chunks.
    /// Default: 200 (~50 tokens) for context continuity.
    pub overlap: usize,

    /// Minimum chunk size in bytes (avoids tiny trailing chunks).
    /// Default: 100.
    pub min_chunk_size: usize,
}
29
30impl Default for ChunkConfig {
31    fn default() -> Self {
32        Self {
33            max_chars: 2000,
34            overlap: 200,
35            min_chunk_size: 100,
36        }
37    }
38}
39
40impl ChunkConfig {
41    /// Create a config optimized for Ollama nomic-embed-text.
42    ///
43    /// nomic-embed-text has an 8192 token context window.
44    /// We use conservative chunking to stay well under the limit.
45    #[must_use]
46    pub fn for_ollama() -> Self {
47        Self {
48            max_chars: 2000,
49            overlap: 200,
50            min_chunk_size: 100,
51        }
52    }
53
54    /// Create a config for HuggingFace MiniLM models.
55    ///
56    /// MiniLM models have a 256 token limit, so we use smaller chunks.
57    #[must_use]
58    pub fn for_minilm() -> Self {
59        Self {
60            max_chars: 800,
61            overlap: 100,
62            min_chunk_size: 50,
63        }
64    }
65}
66
/// A text chunk with its index and position in the original text.
///
/// Offsets are byte offsets into the trimmed original text, and always lie
/// on UTF-8 `char` boundaries.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct TextChunk {
    /// The chunk text.
    pub text: String,
    /// Zero-based index of this chunk.
    pub index: usize,
    /// Byte offset in the original text where this chunk starts.
    pub start_offset: usize,
    /// Byte offset in the original text where this chunk ends (exclusive).
    pub end_offset: usize,
}
79
80/// Split text into overlapping chunks.
81///
82/// Uses word boundaries to avoid splitting mid-word.
83///
84/// # Examples
85///
86/// ```rust,ignore
87/// use sc::embeddings::chunking::{chunk_text, ChunkConfig};
88///
89/// let config = ChunkConfig::default();
90/// let chunks = chunk_text("This is a test.", &config);
91/// assert_eq!(chunks.len(), 1);
92/// assert_eq!(chunks[0].text, "This is a test.");
93/// ```
94#[must_use]
95pub fn chunk_text(text: &str, config: &ChunkConfig) -> Vec<TextChunk> {
96    let text = text.trim();
97
98    if text.is_empty() {
99        return vec![];
100    }
101
102    // Small text: return as single chunk
103    if text.len() <= config.max_chars {
104        return vec![TextChunk {
105            text: text.to_string(),
106            index: 0,
107            start_offset: 0,
108            end_offset: text.len(),
109        }];
110    }
111
112    let mut chunks = Vec::new();
113    let mut start = 0;
114    let mut index = 0;
115
116    while start < text.len() {
117        // Calculate end position
118        let mut end = (start + config.max_chars).min(text.len());
119
120        // If we're not at the end, find a word boundary
121        if end < text.len() {
122            end = find_word_boundary(text, end, start + config.min_chunk_size);
123        }
124
125        let chunk_text = &text[start..end];
126
127        // Skip if chunk is too small (unless it's the last one)
128        if chunk_text.len() >= config.min_chunk_size || start + chunk_text.len() >= text.len() {
129            chunks.push(TextChunk {
130                text: chunk_text.to_string(),
131                index,
132                start_offset: start,
133                end_offset: end,
134            });
135            index += 1;
136        }
137
138        // Move start forward, accounting for overlap
139        let next_start = end.saturating_sub(config.overlap);
140
141        // Ensure we make progress
142        if next_start <= start {
143            start = end;
144        } else {
145            start = next_start;
146        }
147
148        // Break if we've processed everything
149        if end >= text.len() {
150            break;
151        }
152    }
153
154    chunks
155}
156
157/// Find a word boundary near the target position.
158///
159/// Searches backward from `target` to find a space or punctuation boundary.
160/// Won't go further back than `min_pos`.
161fn find_word_boundary(text: &str, target: usize, min_pos: usize) -> usize {
162    let bytes = text.as_bytes();
163
164    // Search backward for a word boundary
165    for i in (min_pos..=target).rev() {
166        if i >= bytes.len() {
167            continue;
168        }
169
170        let c = bytes[i] as char;
171        if c.is_whitespace() || matches!(c, '.' | '!' | '?' | ';' | ',' | '\n') {
172            // Include the boundary character
173            return (i + 1).min(text.len());
174        }
175    }
176
177    // No boundary found, just use target
178    target
179}
180
/// Prepare text for embedding by concatenating key and value.
///
/// Produces `"[category] key: value"` when a category is given, and
/// `"key: value"` otherwise — a searchable representation of a context item.
#[must_use]
pub fn prepare_item_text(key: &str, value: &str, category: Option<&str>) -> String {
    match category {
        Some(cat) => format!("[{cat}] {key}: {value}"),
        None => format!("{key}: {value}"),
    }
}
202
#[cfg(test)]
mod tests {
    use super::*;

    // Short input fits in a single chunk, returned verbatim.
    #[test]
    fn test_single_chunk() {
        let cfg = ChunkConfig::default();
        let result = chunk_text("Hello world", &cfg);

        assert_eq!(result.len(), 1);
        assert_eq!(result[0].text, "Hello world");
        assert_eq!(result[0].index, 0);
    }

    // Empty input yields no chunks at all.
    #[test]
    fn test_empty_text() {
        assert!(chunk_text("", &ChunkConfig::default()).is_empty());
    }

    // Input that trims down to nothing also yields no chunks.
    #[test]
    fn test_whitespace_only() {
        assert!(chunk_text("   \n\t  ", &ChunkConfig::default()).is_empty());
    }

    // Long input is split; every chunk honors the size cap and indices
    // count up sequentially from zero.
    #[test]
    fn test_multiple_chunks() {
        let cfg = ChunkConfig {
            max_chars: 50,
            overlap: 10,
            min_chunk_size: 10,
        };
        let text = "The quick brown fox jumps over the lazy dog. This is a test sentence that should be split into multiple chunks.";

        let result = chunk_text(text, &cfg);

        assert!(result.len() > 1);

        for (i, chunk) in result.iter().enumerate() {
            assert!(chunk.text.len() <= cfg.max_chars);
            assert_eq!(chunk.index, i);
        }
    }

    // Overlapping windows: the first chunk must extend past the point
    // where the second chunk begins.
    #[test]
    fn test_overlap() {
        let cfg = ChunkConfig {
            max_chars: 20,
            overlap: 5,
            min_chunk_size: 5,
        };

        let result = chunk_text("one two three four five six seven eight", &cfg);

        if result.len() >= 2 {
            assert!(result[0].end_offset > result[1].start_offset);
        }
    }

    // Category prefix is included when present, omitted when None.
    #[test]
    fn test_prepare_item_text() {
        assert_eq!(
            prepare_item_text("auth-decision", "Use JWT tokens", Some("decision")),
            "[decision] auth-decision: Use JWT tokens"
        );
        assert_eq!(prepare_item_text("key", "value", None), "key: value");
    }
}