1use crate::constants::{CHUNK_OVERLAP_TOKENS, CHUNK_SIZE_TOKENS};
10use text_splitter::{ChunkConfig, MarkdownSplitter};
11
/// Assumed number of characters per token, used to convert the token budgets
/// from `crate::constants` into character budgets and to derive approximate
/// token counts from character counts.
const CHARS_PER_TOKEN: usize = 2;

/// Chunk size budget expressed in characters (`CHUNK_SIZE_TOKENS * CHARS_PER_TOKEN`).
///
/// Note that `needs_chunking` and `split_into_chunks` compare and add this
/// value against byte lengths/offsets of the body, not character counts.
pub const CHUNK_SIZE_CHARS: usize = CHUNK_SIZE_TOKENS * CHARS_PER_TOKEN;

/// Overlap between consecutive chunks expressed in characters
/// (`CHUNK_OVERLAP_TOKENS * CHARS_PER_TOKEN`); `split_into_chunks` subtracts it
/// from a chunk's end byte offset to find where the next chunk starts.
pub const CHUNK_OVERLAP_CHARS: usize = CHUNK_OVERLAP_TOKENS * CHARS_PER_TOKEN;
24
/// A contiguous region of a document body, described by offsets into that body
/// rather than by owning a copy of the text (see `chunk_text` to get the slice).
#[derive(Debug, Clone)]
pub struct Chunk {
    /// Byte offset into the body where the chunk begins (inclusive); used as the
    /// lower bound of a `&str` slice by `chunk_text`.
    pub start_offset: usize,
    /// Byte offset into the body where the chunk ends (exclusive); used as the
    /// upper bound of a `&str` slice by `chunk_text`.
    pub end_offset: usize,
    /// Approximate number of tokens in the chunk: either the character count
    /// divided by `CHARS_PER_TOKEN`, or the number of tokenizer offsets covered,
    /// depending on which splitter produced the chunk.
    pub token_count_approx: usize,
}
35
36pub fn needs_chunking(body: &str) -> bool {
38 body.len() > CHUNK_SIZE_CHARS
39}
40
41pub fn split_into_chunks(body: &str) -> Vec<Chunk> {
49 if !needs_chunking(body) {
50 return vec![Chunk {
51 token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
52 start_offset: 0,
53 end_offset: body.len(),
54 }];
55 }
56
57 let mut chunks = Vec::with_capacity(body.len() / CHUNK_SIZE_CHARS + 1);
58 let mut start = 0usize;
59
60 while start < body.len() {
61 start = next_char_boundary(body, start);
62 let desired_end = previous_char_boundary(body, (start + CHUNK_SIZE_CHARS).min(body.len()));
63 let end = if desired_end < body.len() {
64 find_split_boundary(body, start, desired_end)
65 } else {
66 desired_end
67 };
68
69 let end = if end <= start {
70 let fallback = previous_char_boundary(body, (start + CHUNK_SIZE_CHARS).min(body.len()));
71 if fallback > start {
72 fallback
73 } else {
74 body.len()
75 }
76 } else {
77 end
78 };
79
80 let token_count_approx = body[start..end].chars().count() / CHARS_PER_TOKEN;
81 chunks.push(Chunk {
82 start_offset: start,
83 end_offset: end,
84 token_count_approx,
85 });
86
87 if end >= body.len() {
88 break;
89 }
90
91 let next_start = next_char_boundary(body, end.saturating_sub(CHUNK_OVERLAP_CHARS));
92 start = if next_start >= end { end } else { next_start };
93 }
94
95 chunks
96}
97
98pub fn split_into_chunks_by_token_offsets(
104 body: &str,
105 token_offsets: &[(usize, usize)],
106) -> Vec<Chunk> {
107 if token_offsets.len() <= CHUNK_SIZE_TOKENS {
108 return vec![Chunk {
109 token_count_approx: token_offsets.len(),
110 start_offset: 0,
111 end_offset: body.len(),
112 }];
113 }
114
115 let mut chunks = Vec::with_capacity(token_offsets.len() / CHUNK_SIZE_TOKENS + 1);
116 let mut start_token = 0usize;
117
118 while start_token < token_offsets.len() {
119 let end_token = (start_token + CHUNK_SIZE_TOKENS).min(token_offsets.len());
120
121 chunks.push(Chunk {
122 start_offset: if start_token == 0 {
123 0
124 } else {
125 token_offsets[start_token].0
126 },
127 end_offset: if end_token == token_offsets.len() {
128 body.len()
129 } else {
130 token_offsets[end_token - 1].1
131 },
132 token_count_approx: end_token - start_token,
133 });
134
135 if end_token == token_offsets.len() {
136 break;
137 }
138
139 let next_start = end_token.saturating_sub(CHUNK_OVERLAP_TOKENS);
140 start_token = if next_start <= start_token {
141 end_token
142 } else {
143 next_start
144 };
145 }
146
147 chunks
148}
149
150pub fn split_into_chunks_hierarchical(body: &str) -> Vec<Chunk> {
158 if body.is_empty() {
159 return Vec::new();
160 }
161
162 let char_chunk_size = CHUNK_SIZE_TOKENS * CHARS_PER_TOKEN;
167 let char_overlap = CHUNK_OVERLAP_TOKENS * CHARS_PER_TOKEN;
168 let config = ChunkConfig::new(char_chunk_size)
169 .with_overlap(char_overlap)
170 .expect("compile-time invariant: CHUNK_OVERLAP must be smaller than chunk size");
171
172 let splitter = MarkdownSplitter::new(config);
173
174 let items: Vec<(usize, &str)> = splitter.chunk_indices(body).collect();
175
176 if items.is_empty() {
177 return vec![Chunk {
178 start_offset: 0,
179 end_offset: body.len(),
180 token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
181 }];
182 }
183
184 items
185 .into_iter()
186 .map(|(start, text)| {
187 let end = start + text.len();
188 Chunk {
189 start_offset: start,
190 end_offset: end,
191 token_count_approx: text.chars().count() / CHARS_PER_TOKEN,
192 }
193 })
194 .collect()
195}
196
197pub fn chunk_text<'a>(body: &'a str, chunk: &Chunk) -> &'a str {
199 &body[chunk.start_offset..chunk.end_offset]
200}
201
/// Picks the byte offset at which to end a chunk inside `body[start..desired_end]`.
///
/// Separators are tried in priority order: blank line (`"\n\n"`), sentence end
/// (`". "`), then a single space. For the first separator kind that occurs in
/// the window, the offset just past its *last* occurrence is returned. If none
/// occurs, `desired_end` is returned unchanged.
///
/// `start` and `desired_end` must be valid char-boundary offsets with
/// `start <= desired_end`, otherwise the slice operation panics.
fn find_split_boundary(body: &str, start: usize, desired_end: usize) -> usize {
    // Highest-priority separator first.
    const SEPARATORS: [&str; 3] = ["\n\n", ". ", " "];

    let window = &body[start..desired_end];
    SEPARATORS
        .iter()
        .find_map(|sep| window.rfind(*sep).map(|at| start + at + sep.len()))
        .unwrap_or(desired_end)
}
215
/// Returns the largest UTF-8 character boundary of `body` that is `<= idx`.
///
/// `idx` is first clamped to `body.len()`, so out-of-range values yield the
/// end of the string. Offset 0 is always a boundary, so the search always
/// succeeds.
fn previous_char_boundary(body: &str, idx: usize) -> usize {
    let upper = idx.min(body.len());
    (0..=upper)
        .rev()
        .find(|&candidate| body.is_char_boundary(candidate))
        .unwrap_or(0)
}
223
/// Returns the smallest UTF-8 character boundary of `body` that is `>= idx`.
///
/// `idx` is first clamped to `body.len()`; the end of the string is always a
/// boundary, so the search always succeeds.
fn next_char_boundary(body: &str, idx: usize) -> usize {
    let total = body.len();
    (idx.min(total)..=total)
        .find(|&candidate| body.is_char_boundary(candidate))
        .unwrap_or(total)
}
231
232pub fn aggregate_embeddings(chunk_embeddings: &[Vec<f32>]) -> Vec<f32> {
238 if chunk_embeddings.is_empty() {
239 return vec![0.0f32; crate::constants::embedding_dim()];
240 }
241 if chunk_embeddings.len() == 1 {
242 return chunk_embeddings[0].clone();
243 }
244
245 let dim = chunk_embeddings[0].len();
246 let mut mean = vec![0.0f32; dim];
247 for emb in chunk_embeddings {
248 for (i, v) in emb.iter().enumerate() {
249 mean[i] += v;
250 }
251 }
252 let n = chunk_embeddings.len() as f32;
253 for v in &mut mean {
254 *v /= n;
255 }
256
257 let norm: f32 = mean.iter().map(|x| x * x).sum::<f32>().sqrt();
258 if norm > 1e-9 {
259 for v in &mut mean {
260 *v /= norm;
261 }
262 }
263 mean
264}
265
#[cfg(test)]
mod tests {
    use super::*;

    /// A body below the chunking threshold comes back as one chunk covering it.
    #[test]
    fn test_short_body_no_chunking() {
        let body = "short text";
        assert!(!needs_chunking(body));
        let chunks = split_into_chunks(body);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunk_text(body, &chunks[0]), body);
    }

    /// A long body is split into several non-empty chunks.
    #[test]
    fn test_long_body_produces_multiple_chunks() {
        let body = "word ".repeat(1000);
        assert!(needs_chunking(&body));
        let chunks = split_into_chunks(&body);
        assert!(chunks.len() > 1);
        assert!(chunks.iter().all(|c| !chunk_text(&body, c).is_empty()));
    }

    /// Token-offset splitting honours the token limit and the configured overlap.
    #[test]
    fn split_by_token_offsets_respeita_limite_e_overlap() {
        let body = "ab".repeat(460);
        let offsets: Vec<(usize, usize)> = (0..460)
            .map(|i| {
                let start = i * 2;
                (start, start + 2)
            })
            .collect();

        let chunks = split_into_chunks_by_token_offsets(&body, &offsets);
        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].token_count_approx, CHUNK_SIZE_TOKENS);
        assert_eq!(chunks[1].token_count_approx, 110);
        assert_eq!(chunks[0].start_offset, 0);
        assert_eq!(
            chunks[1].start_offset,
            offsets[CHUNK_SIZE_TOKENS - CHUNK_OVERLAP_TOKENS].0
        );
    }

    /// When all tokens fit in one chunk, the single chunk spans the whole body.
    #[test]
    fn split_by_token_offsets_returns_one_chunk_when_fits() {
        let body = "texto curto";
        let offsets = vec![(0, 5), (6, 11)];
        let chunks = split_into_chunks_by_token_offsets(body, &offsets);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].start_offset, 0);
        assert_eq!(chunks[0].end_offset, body.len());
        assert_eq!(chunks[0].token_count_approx, 2);
    }

    /// Multi-byte text must still be cut on char boundaries and keep advancing.
    #[test]
    fn test_multibyte_body_preserves_progress_and_boundaries() {
        let body = "a\u{e7}\u{e3}o \u{fa}til ".repeat(1000);
        let chunks = split_into_chunks(&body);
        assert!(chunks.len() > 1);
        for chunk in &chunks {
            assert!(!chunk_text(&body, chunk).is_empty());
            assert!(body.is_char_boundary(chunk.start_offset));
            assert!(body.is_char_boundary(chunk.end_offset));
            assert!(chunk.end_offset > chunk.start_offset);
        }
        for pair in chunks.windows(2) {
            assert!(pair[1].start_offset >= pair[0].start_offset);
            assert!(pair[1].end_offset > pair[0].start_offset);
        }
    }

    /// The aggregate of several embeddings has unit L2 norm.
    #[test]
    fn test_aggregate_embeddings_normalizes() {
        let embs = vec![vec![1.0f32, 0.0], vec![0.0f32, 1.0]];
        let agg = aggregate_embeddings(&embs);
        let norm: f32 = agg.iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!((norm - 1.0).abs() < 1e-5);
    }

    /// Test-only variant of the hierarchical splitter with an explicit
    /// `Characters` sizer, a caller-chosen capacity and no overlap, so small
    /// fixtures can force splits.
    fn split_hier_chars(body: &str, size: usize) -> Vec<Chunk> {
        use text_splitter::{Characters, ChunkConfig, MarkdownSplitter};
        if body.is_empty() {
            return Vec::new();
        }
        let config = ChunkConfig::new(size)
            .with_sizer(Characters)
            .with_overlap(0)
            .expect("overlap must be smaller than size");
        let splitter = MarkdownSplitter::new(config);
        let items: Vec<(usize, &str)> = splitter.chunk_indices(body).collect();
        if items.is_empty() {
            return vec![Chunk {
                start_offset: 0,
                end_offset: body.len(),
                token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
            }];
        }
        items
            .into_iter()
            .map(|(start, text)| {
                let end = start + text.len();
                Chunk {
                    start_offset: start,
                    end_offset: end,
                    token_count_approx: text.chars().count() / CHARS_PER_TOKEN,
                }
            })
            .collect()
    }

    /// An empty body yields no chunks, both from this crate's functions and
    /// from the underlying splitter.
    #[test]
    fn test_hierarchical_empty_body_returns_empty() {
        use text_splitter::{Characters, ChunkConfig, MarkdownSplitter};

        // Exercise the production entry point and the test helper directly;
        // previously only the upstream splitter was checked.
        assert!(split_into_chunks_hierarchical("").is_empty());
        assert!(split_hier_chars("", 100).is_empty());

        let config = ChunkConfig::new(100)
            .with_sizer(Characters)
            .with_overlap(0)
            .expect("overlap < size");
        let splitter = MarkdownSplitter::new(config);
        let result: Vec<_> = splitter.chunk_indices("").collect();
        assert!(result.is_empty());
    }

    /// Two H1 sections that do not fit in one chunk are split apart.
    #[test]
    fn test_markdown_h1_boundary_yields_two_chunks() {
        let body = "# Title 1\n\nbody1 body1 body1 body1 body1 body1\n\n# Title 2\n\nbody2 body2 body2 body2 body2 body2";
        let chunks = split_hier_chars(body, 30);
        assert!(
            chunks.len() >= 2,
            "expected >=2 chunks, got {}",
            chunks.len()
        );
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
        }
    }

    /// Nested H2 sections produce well-formed, in-range, non-empty chunks.
    #[test]
    fn test_markdown_h2_nested_respects_boundaries() {
        let body = "# H1\n\n## H2a\n\nParagraph A with enough text to force a split.\n\n## H2b\n\nParagraph B with enough text to force a split as well.";
        let chunks = split_hier_chars(body, 40);
        assert!(!chunks.is_empty());
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
            assert!(c.end_offset > c.start_offset);
            assert!(c.end_offset <= body.len());
        }
    }

    /// Plain paragraphs separated by blank lines are split into several chunks.
    #[test]
    fn test_markdown_paragraph_soft_boundary() {
        let para = "Plain text sentence used to fill the paragraph. ";
        let body = format!(
            "{}\n\n{}\n\n{}",
            para.repeat(3),
            para.repeat(3),
            para.repeat(3)
        );
        let chunks = split_hier_chars(&body, 80);
        assert!(
            chunks.len() >= 2,
            "expected >=2 chunks with a body of {} chars",
            body.len()
        );
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
        }
    }

    /// A body larger than 50 KB still yields valid, non-empty chunk offsets.
    #[test]
    fn test_markdown_60kb_valid_offsets() {
        let block = "# Section\n\nBlock content text. ".repeat(1700);
        assert!(
            block.len() > 50_000,
            "body must be >50KB, has {} bytes",
            block.len()
        );
        let chunks = split_hier_chars(&block, 256);
        assert!(chunks.len() > 1);
        for c in &chunks {
            assert!(block.is_char_boundary(c.start_offset));
            assert!(block.is_char_boundary(c.end_offset));
            assert!(c.end_offset > c.start_offset);
            assert!(!chunk_text(&block, c).is_empty());
        }
    }

    /// Text without any Markdown markers is still chunked on char boundaries.
    #[test]
    fn test_fallback_plain_text_without_markers() {
        let body = "a ".repeat(1000);
        let chunks = split_hier_chars(&body, 100);
        assert!(!chunks.is_empty());
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
        }
    }
}