// sc/embeddings/chunking.rs
/// Tunable limits that control how [`chunk_text`] splits long text.
///
/// Every size is compared against `str::len()`, i.e. a UTF-8 byte count; for
/// pure-ASCII text that is the same as the number of characters.
#[derive(Debug, Clone)]
pub struct ChunkConfig {
    /// Upper bound on the length of a single chunk.
    pub max_chars: usize,

    /// How far the start of the next chunk is moved back from the end of the
    /// previous one, so that consecutive chunks share some text.
    pub overlap: usize,

    /// Minimum chunk length the splitter aims for: natural break points that
    /// lie closer than this to the start of a chunk are not considered.
    pub min_chunk_size: usize,
}
29
30impl Default for ChunkConfig {
31 fn default() -> Self {
32 Self {
33 max_chars: 2000,
34 overlap: 200,
35 min_chunk_size: 100,
36 }
37 }
38}
39
40impl ChunkConfig {
41 #[must_use]
46 pub fn for_ollama() -> Self {
47 Self {
48 max_chars: 2000,
49 overlap: 200,
50 min_chunk_size: 100,
51 }
52 }
53
54 #[must_use]
58 pub fn for_minilm() -> Self {
59 Self {
60 max_chars: 800,
61 overlap: 100,
62 min_chunk_size: 50,
63 }
64 }
65}
66
/// One piece of a longer text, as produced by [`chunk_text`].
#[derive(Debug, Clone)]
pub struct TextChunk {
    /// The chunk's content, copied out of the input.
    pub text: String,
    /// Zero-based position of this chunk in the returned sequence.
    pub index: usize,
    /// Byte offset of the chunk's first byte, relative to the input *after*
    /// leading and trailing whitespace has been trimmed.
    pub start_offset: usize,
    /// Byte offset one past the chunk's last byte, in the same trimmed
    /// coordinate space as `start_offset`.
    pub end_offset: usize,
}
79
80#[must_use]
95pub fn chunk_text(text: &str, config: &ChunkConfig) -> Vec<TextChunk> {
96 let text = text.trim();
97
98 if text.is_empty() {
99 return vec![];
100 }
101
102 if text.len() <= config.max_chars {
104 return vec![TextChunk {
105 text: text.to_string(),
106 index: 0,
107 start_offset: 0,
108 end_offset: text.len(),
109 }];
110 }
111
112 let mut chunks = Vec::new();
113 let mut start = 0;
114 let mut index = 0;
115
116 while start < text.len() {
117 let mut end = (start + config.max_chars).min(text.len());
119
120 if end < text.len() {
122 end = find_word_boundary(text, end, start + config.min_chunk_size);
123 }
124
125 let chunk_text = &text[start..end];
126
127 if chunk_text.len() >= config.min_chunk_size || start + chunk_text.len() >= text.len() {
129 chunks.push(TextChunk {
130 text: chunk_text.to_string(),
131 index,
132 start_offset: start,
133 end_offset: end,
134 });
135 index += 1;
136 }
137
138 let next_start = end.saturating_sub(config.overlap);
140
141 if next_start <= start {
143 start = end;
144 } else {
145 start = next_start;
146 }
147
148 if end >= text.len() {
150 break;
151 }
152 }
153
154 chunks
155}
156
/// Returns the best position at or before `target` at which to end a chunk.
///
/// Scans backwards from `target` for a whitespace character or one of
/// `. ! ? ; ,` and returns the byte offset just *after* it, so the break
/// character stays with the text that precedes it. Characters starting before
/// `min_pos` are not considered. If no such character is found, `target`
/// itself is returned (clamped to the text length).
///
/// The result is never greater than `target` and always lies on a UTF-8
/// character boundary: the text is inspected as real characters, not as raw
/// bytes, so continuation bytes of multi-byte characters cannot be mistaken
/// for whitespace.
fn find_word_boundary(text: &str, target: usize, min_pos: usize) -> usize {
    // Never look past the text, and never report a position inside a character.
    let mut limit = target.min(text.len());
    while !text.is_char_boundary(limit) {
        limit -= 1;
    }

    text[..limit]
        .char_indices()
        .rev()
        // Offsets decrease while walking backwards, so stop at the first one below `min_pos`.
        .take_while(|&(pos, _)| pos >= min_pos)
        .find(|&(_, ch)| ch.is_whitespace() || matches!(ch, '.' | '!' | '?' | ';' | ','))
        .map_or(limit, |(pos, ch)| pos + ch.len_utf8())
}
180
/// Builds the single-line text representation of an item.
///
/// The result is `"[category] key: value"` when a category is given and
/// `"key: value"` otherwise. Inputs are inserted verbatim; nothing is trimmed
/// or escaped.
#[must_use]
pub fn prepare_item_text(key: &str, value: &str, category: Option<&str>) -> String {
    // One formatting pass per case, with no temporary string for the prefix.
    match category {
        Some(cat) => format!("[{cat}] {key}: {value}"),
        None => format!("{key}: {value}"),
    }
}
202
#[cfg(test)]
mod tests {
    use super::*;

    // Text that fits within the limit comes back as one untouched chunk.
    #[test]
    fn test_single_chunk() {
        let config = ChunkConfig::default();
        let chunks = chunk_text("Hello world", &config);

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].text, "Hello world");
        assert_eq!(chunks[0].index, 0);
    }

    // Empty input produces no chunks at all.
    #[test]
    fn test_empty_text() {
        let config = ChunkConfig::default();
        let chunks = chunk_text("", &config);

        assert!(chunks.is_empty());
    }

    // Input that trims down to nothing is treated like empty input.
    #[test]
    fn test_whitespace_only() {
        let config = ChunkConfig::default();
        let chunks = chunk_text(" \n\t ", &config);

        assert!(chunks.is_empty());
    }

    // Long text is split into several bounded, consecutively numbered chunks.
    #[test]
    fn test_multiple_chunks() {
        let config = ChunkConfig {
            max_chars: 50,
            overlap: 10,
            min_chunk_size: 10,
        };

        let text = "The quick brown fox jumps over the lazy dog. This is a test sentence that should be split into multiple chunks.";
        let chunks = chunk_text(text, &config);

        assert!(chunks.len() > 1);

        for (i, chunk) in chunks.iter().enumerate() {
            assert!(chunk.text.len() <= config.max_chars);
            assert_eq!(chunk.index, i);
            // The recorded offsets must point at exactly the text that was copied out.
            assert_eq!(&text[chunk.start_offset..chunk.end_offset], chunk.text);
        }
    }

    // Consecutive chunks share text: the second starts before the first ends.
    #[test]
    fn test_overlap() {
        let config = ChunkConfig {
            max_chars: 20,
            overlap: 5,
            min_chunk_size: 5,
        };

        let text = "one two three four five six seven eight";
        let chunks = chunk_text(text, &config);

        // Assert the split happened instead of silently skipping the check.
        assert!(chunks.len() >= 2);
        assert!(chunks[0].end_offset > chunks[1].start_offset);
    }

    // The category prefix is optional; key and value are joined by ": ".
    #[test]
    fn test_prepare_item_text() {
        let text = prepare_item_text("auth-decision", "Use JWT tokens", Some("decision"));
        assert_eq!(text, "[decision] auth-decision: Use JWT tokens");

        let text_no_category = prepare_item_text("key", "value", None);
        assert_eq!(text_no_category, "key: value");
    }
}