1use crate::constants::{CHUNK_OVERLAP_TOKENS, CHUNK_SIZE_TOKENS, EMBEDDING_DIM};
10use text_splitter::{ChunkConfig, MarkdownSplitter};
11
12const CHARS_PER_TOKEN: usize = 2;
18
19pub const CHUNK_SIZE_CHARS: usize = CHUNK_SIZE_TOKENS * CHARS_PER_TOKEN;
21
22pub const CHUNK_OVERLAP_CHARS: usize = CHUNK_OVERLAP_TOKENS * CHARS_PER_TOKEN;
24
25#[derive(Debug, Clone)]
27pub struct Chunk {
28 pub start_offset: usize,
30 pub end_offset: usize,
32 pub token_count_approx: usize,
34}
35
36pub fn needs_chunking(body: &str) -> bool {
38 body.len() > CHUNK_SIZE_CHARS
39}
40
41pub fn split_into_chunks(body: &str) -> Vec<Chunk> {
49 if !needs_chunking(body) {
50 return vec![Chunk {
51 token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
52 start_offset: 0,
53 end_offset: body.len(),
54 }];
55 }
56
57 let mut chunks = Vec::with_capacity(body.len() / CHUNK_SIZE_CHARS + 1);
58 let mut start = 0usize;
59
60 while start < body.len() {
61 start = next_char_boundary(body, start);
62 let desired_end = previous_char_boundary(body, (start + CHUNK_SIZE_CHARS).min(body.len()));
63 let end = if desired_end < body.len() {
64 find_split_boundary(body, start, desired_end)
65 } else {
66 desired_end
67 };
68
69 let end = if end <= start {
70 let fallback = previous_char_boundary(body, (start + CHUNK_SIZE_CHARS).min(body.len()));
71 if fallback > start {
72 fallback
73 } else {
74 body.len()
75 }
76 } else {
77 end
78 };
79
80 let token_count_approx = body[start..end].chars().count() / CHARS_PER_TOKEN;
81 chunks.push(Chunk {
82 start_offset: start,
83 end_offset: end,
84 token_count_approx,
85 });
86
87 if end >= body.len() {
88 break;
89 }
90
91 let next_start = next_char_boundary(body, end.saturating_sub(CHUNK_OVERLAP_CHARS));
92 start = if next_start >= end { end } else { next_start };
93 }
94
95 chunks
96}
97
98pub fn split_into_chunks_by_token_offsets(
104 body: &str,
105 token_offsets: &[(usize, usize)],
106) -> Vec<Chunk> {
107 if token_offsets.len() <= CHUNK_SIZE_TOKENS {
108 return vec![Chunk {
109 token_count_approx: token_offsets.len(),
110 start_offset: 0,
111 end_offset: body.len(),
112 }];
113 }
114
115 let mut chunks = Vec::with_capacity(token_offsets.len() / CHUNK_SIZE_TOKENS + 1);
116 let mut start_token = 0usize;
117
118 while start_token < token_offsets.len() {
119 let end_token = (start_token + CHUNK_SIZE_TOKENS).min(token_offsets.len());
120
121 chunks.push(Chunk {
122 start_offset: if start_token == 0 {
123 0
124 } else {
125 token_offsets[start_token].0
126 },
127 end_offset: if end_token == token_offsets.len() {
128 body.len()
129 } else {
130 token_offsets[end_token - 1].1
131 },
132 token_count_approx: end_token - start_token,
133 });
134
135 if end_token == token_offsets.len() {
136 break;
137 }
138
139 let next_start = end_token.saturating_sub(CHUNK_OVERLAP_TOKENS);
140 start_token = if next_start <= start_token {
141 end_token
142 } else {
143 next_start
144 };
145 }
146
147 chunks
148}
149
150pub fn split_into_chunks_hierarchical(body: &str) -> Vec<Chunk> {
158 if body.is_empty() {
159 return Vec::new();
160 }
161
162 let char_chunk_size = CHUNK_SIZE_TOKENS * CHARS_PER_TOKEN;
167 let char_overlap = CHUNK_OVERLAP_TOKENS * CHARS_PER_TOKEN;
168 let config = ChunkConfig::new(char_chunk_size)
169 .with_overlap(char_overlap)
170 .expect("compile-time invariant: CHUNK_OVERLAP must be smaller than chunk size");
171
172 let splitter = MarkdownSplitter::new(config);
173
174 let items: Vec<(usize, &str)> = splitter.chunk_indices(body).collect();
175
176 if items.is_empty() {
177 return vec![Chunk {
178 start_offset: 0,
179 end_offset: body.len(),
180 token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
181 }];
182 }
183
184 items
185 .into_iter()
186 .map(|(start, text)| {
187 let end = start + text.len();
188 Chunk {
189 start_offset: start,
190 end_offset: end,
191 token_count_approx: text.chars().count() / CHARS_PER_TOKEN,
192 }
193 })
194 .collect()
195}
196
197pub fn chunk_text<'a>(body: &'a str, chunk: &Chunk) -> &'a str {
199 &body[chunk.start_offset..chunk.end_offset]
200}
201
202fn find_split_boundary(body: &str, start: usize, desired_end: usize) -> usize {
203 let slice = &body[start..desired_end];
204 if let Some(pos) = slice.rfind("\n\n") {
205 return start + pos + 2;
206 }
207 if let Some(pos) = slice.rfind(". ") {
208 return start + pos + 2;
209 }
210 if let Some(pos) = slice.rfind(' ') {
211 return start + pos + 1;
212 }
213 desired_end
214}
215
216fn previous_char_boundary(body: &str, mut idx: usize) -> usize {
217 idx = idx.min(body.len());
218 while idx > 0 && !body.is_char_boundary(idx) {
219 idx -= 1;
220 }
221 idx
222}
223
224fn next_char_boundary(body: &str, mut idx: usize) -> usize {
225 idx = idx.min(body.len());
226 while idx < body.len() && !body.is_char_boundary(idx) {
227 idx += 1;
228 }
229 idx
230}
231
232pub fn aggregate_embeddings(chunk_embeddings: &[Vec<f32>]) -> Vec<f32> {
237 if chunk_embeddings.is_empty() {
238 return vec![0.0f32; EMBEDDING_DIM];
239 }
240 if chunk_embeddings.len() == 1 {
241 return chunk_embeddings[0].clone();
242 }
243
244 let dim = chunk_embeddings[0].len();
245 let mut mean = vec![0.0f32; dim];
246 for emb in chunk_embeddings {
247 for (i, v) in emb.iter().enumerate() {
248 mean[i] += v;
249 }
250 }
251 let n = chunk_embeddings.len() as f32;
252 for v in &mut mean {
253 *v /= n;
254 }
255
256 let norm: f32 = mean.iter().map(|x| x * x).sum::<f32>().sqrt();
257 if norm > 1e-9 {
258 for v in &mut mean {
259 *v /= norm;
260 }
261 }
262 mean
263}
264
265#[cfg(test)]
266mod tests {
267 use super::*;
268
269 #[test]
270 fn test_short_body_no_chunking() {
271 let body = "short text";
272 assert!(!needs_chunking(body));
273 let chunks = split_into_chunks(body);
274 assert_eq!(chunks.len(), 1);
275 assert_eq!(chunk_text(body, &chunks[0]), body);
276 }
277
278 #[test]
279 fn test_long_body_produces_multiple_chunks() {
280 let body = "word ".repeat(1000);
281 assert!(needs_chunking(&body));
282 let chunks = split_into_chunks(&body);
283 assert!(chunks.len() > 1);
284 assert!(chunks.iter().all(|c| !chunk_text(&body, c).is_empty()));
285 }
286
287 #[test]
288 fn split_by_token_offsets_respeita_limite_e_overlap() {
289 let body = "ab".repeat(460);
290 let offsets: Vec<(usize, usize)> = (0..460)
291 .map(|i| {
292 let start = i * 2;
293 (start, start + 2)
294 })
295 .collect();
296
297 let chunks = split_into_chunks_by_token_offsets(&body, &offsets);
298 assert_eq!(chunks.len(), 2);
299 assert_eq!(chunks[0].token_count_approx, CHUNK_SIZE_TOKENS);
300 assert_eq!(chunks[1].token_count_approx, 110);
301 assert_eq!(chunks[0].start_offset, 0);
302 assert_eq!(
303 chunks[1].start_offset,
304 offsets[CHUNK_SIZE_TOKENS - CHUNK_OVERLAP_TOKENS].0
305 );
306 }
307
308 #[test]
309 fn split_by_token_offsets_returns_one_chunk_when_fits() {
310 let body = "texto curto";
311 let offsets = vec![(0, 5), (6, 11)];
312 let chunks = split_into_chunks_by_token_offsets(body, &offsets);
313 assert_eq!(chunks.len(), 1);
314 assert_eq!(chunks[0].start_offset, 0);
315 assert_eq!(chunks[0].end_offset, body.len());
316 assert_eq!(chunks[0].token_count_approx, 2);
317 }
318
319 #[test]
320 fn test_multibyte_body_preserves_progress_and_boundaries() {
321 let body = "a\u{e7}\u{e3}o \u{fa}til ".repeat(1000);
326 let chunks = split_into_chunks(&body);
327 assert!(chunks.len() > 1);
328 for chunk in &chunks {
329 assert!(!chunk_text(&body, chunk).is_empty());
330 assert!(body.is_char_boundary(chunk.start_offset));
331 assert!(body.is_char_boundary(chunk.end_offset));
332 assert!(chunk.end_offset > chunk.start_offset);
333 }
334 for pair in chunks.windows(2) {
335 assert!(pair[1].start_offset >= pair[0].start_offset);
336 assert!(pair[1].end_offset > pair[0].start_offset);
337 }
338 }
339
340 #[test]
341 fn test_aggregate_embeddings_normalizes() {
342 let embs = vec![vec![1.0f32, 0.0], vec![0.0f32, 1.0]];
343 let agg = aggregate_embeddings(&embs);
344 let norm: f32 = agg.iter().map(|x| x * x).sum::<f32>().sqrt();
345 assert!((norm - 1.0).abs() < 1e-5);
346 }
347
348 fn split_hier_chars(body: &str, size: usize) -> Vec<Chunk> {
349 use text_splitter::{Characters, ChunkConfig, MarkdownSplitter};
350 if body.is_empty() {
351 return Vec::new();
352 }
353 let config = ChunkConfig::new(size)
354 .with_sizer(Characters)
355 .with_overlap(0)
356 .expect("overlap must be smaller than size");
357 let splitter = MarkdownSplitter::new(config);
358 let items: Vec<(usize, &str)> = splitter.chunk_indices(body).collect();
359 if items.is_empty() {
360 return vec![Chunk {
361 start_offset: 0,
362 end_offset: body.len(),
363 token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
364 }];
365 }
366 items
367 .into_iter()
368 .map(|(start, text)| {
369 let end = start + text.len();
370 Chunk {
371 start_offset: start,
372 end_offset: end,
373 token_count_approx: text.chars().count() / CHARS_PER_TOKEN,
374 }
375 })
376 .collect()
377 }
378
379 #[test]
380 fn test_hierarchical_empty_body_returns_empty() {
381 use text_splitter::{Characters, ChunkConfig, MarkdownSplitter};
382 let config = ChunkConfig::new(100)
383 .with_sizer(Characters)
384 .with_overlap(0)
385 .expect("overlap < size");
386 let splitter = MarkdownSplitter::new(config);
387 let result: Vec<_> = splitter.chunk_indices("").collect();
388 assert!(result.is_empty());
389 }
390
391 #[test]
392 fn test_markdown_h1_boundary_yields_two_chunks() {
393 let body = "# Title 1\n\nbody1 body1 body1 body1 body1 body1\n\n# Title 2\n\nbody2 body2 body2 body2 body2 body2";
394 let chunks = split_hier_chars(body, 30);
395 assert!(
396 chunks.len() >= 2,
397 "expected >=2 chunks, got {}",
398 chunks.len()
399 );
400 for c in &chunks {
401 assert!(body.is_char_boundary(c.start_offset));
402 assert!(body.is_char_boundary(c.end_offset));
403 }
404 }
405
406 #[test]
407 fn test_markdown_h2_nested_respects_boundaries() {
408 let body = "# H1\n\n## H2a\n\nParagraph A with enough text to force a split.\n\n## H2b\n\nParagraph B with enough text to force a split as well.";
409 let chunks = split_hier_chars(body, 40);
410 assert!(!chunks.is_empty());
411 for c in &chunks {
412 assert!(body.is_char_boundary(c.start_offset));
413 assert!(body.is_char_boundary(c.end_offset));
414 assert!(c.end_offset > c.start_offset);
415 assert!(c.end_offset <= body.len());
416 }
417 }
418
419 #[test]
420 fn test_markdown_paragraph_soft_boundary() {
421 let para = "Plain text sentence used to fill the paragraph. ";
422 let body = format!(
423 "{}\n\n{}\n\n{}",
424 para.repeat(3),
425 para.repeat(3),
426 para.repeat(3)
427 );
428 let chunks = split_hier_chars(&body, 80);
429 assert!(
430 chunks.len() >= 2,
431 "expected >=2 chunks with a body of {} chars",
432 body.len()
433 );
434 for c in &chunks {
435 assert!(body.is_char_boundary(c.start_offset));
436 assert!(body.is_char_boundary(c.end_offset));
437 }
438 }
439
440 #[test]
441 fn test_markdown_60kb_valid_offsets() {
442 let block = "# Section\n\nBlock content text. ".repeat(1700);
443 assert!(
444 block.len() > 50_000,
445 "body must be >50KB, has {} bytes",
446 block.len()
447 );
448 let chunks = split_hier_chars(&block, 256);
449 assert!(chunks.len() > 1);
450 for c in &chunks {
451 assert!(block.is_char_boundary(c.start_offset));
452 assert!(block.is_char_boundary(c.end_offset));
453 assert!(c.end_offset > c.start_offset);
454 assert!(!chunk_text(&block, c).is_empty());
455 }
456 }
457
458 #[test]
459 fn test_fallback_plain_text_without_markers() {
460 let body = "a ".repeat(1000);
461 let chunks = split_hier_chars(&body, 100);
462 assert!(!chunks.is_empty());
463 for c in &chunks {
464 assert!(body.is_char_boundary(c.start_offset));
465 assert!(body.is_char_boundary(c.end_offset));
466 }
467 }
468}