1use crate::constants::{CHUNK_OVERLAP_TOKENS, CHUNK_SIZE_TOKENS};
10use text_splitter::{ChunkConfig, MarkdownSplitter};
11
/// Approximate number of characters per token, used to convert the token
/// budgets from `crate::constants` into character/byte budgets.
const CHARS_PER_TOKEN: usize = 2;

/// Chunk size budget expressed in characters
/// (`CHUNK_SIZE_TOKENS * CHARS_PER_TOKEN`).
pub const CHUNK_SIZE_CHARS: usize = CHUNK_SIZE_TOKENS * CHARS_PER_TOKEN;

/// Overlap between consecutive chunks expressed in characters
/// (`CHUNK_OVERLAP_TOKENS * CHARS_PER_TOKEN`).
pub const CHUNK_OVERLAP_CHARS: usize = CHUNK_OVERLAP_TOKENS * CHARS_PER_TOKEN;
24
/// A contiguous region of a body, described by byte offsets into that body.
///
/// The text of a chunk is `body[start_offset..end_offset]`.
#[derive(Debug, Clone)]
pub struct Chunk {
    /// Byte offset (inclusive) of the first byte of the chunk.
    pub start_offset: usize,
    /// Byte offset (exclusive) just past the last byte of the chunk.
    pub end_offset: usize,
    /// Approximate number of tokens covered by the chunk.
    pub token_count_approx: usize,
}
35
36pub fn needs_chunking(body: &str) -> bool {
38 body.len() > CHUNK_SIZE_CHARS
39}
40
41pub fn split_into_chunks(body: &str) -> Vec<Chunk> {
49 if !needs_chunking(body) {
50 return vec![Chunk {
51 token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
52 start_offset: 0,
53 end_offset: body.len(),
54 }];
55 }
56
57 let mut chunks = Vec::with_capacity(body.len() / CHUNK_SIZE_CHARS + 1);
58 let mut start = 0usize;
59
60 while start < body.len() {
61 start = next_char_boundary(body, start);
62 let desired_end = previous_char_boundary(body, (start + CHUNK_SIZE_CHARS).min(body.len()));
63 let end = if desired_end < body.len() {
64 find_split_boundary(body, start, desired_end)
65 } else {
66 desired_end
67 };
68
69 let end = if end <= start {
70 let fallback = previous_char_boundary(body, (start + CHUNK_SIZE_CHARS).min(body.len()));
71 if fallback > start {
72 fallback
73 } else {
74 body.len()
75 }
76 } else {
77 end
78 };
79
80 let token_count_approx = body[start..end].chars().count() / CHARS_PER_TOKEN;
81 chunks.push(Chunk {
82 start_offset: start,
83 end_offset: end,
84 token_count_approx,
85 });
86
87 if end >= body.len() {
88 break;
89 }
90
91 let next_start = next_char_boundary(body, end.saturating_sub(CHUNK_OVERLAP_CHARS));
92 start = if next_start >= end { end } else { next_start };
93 }
94
95 chunks
96}
97
98pub fn split_into_chunks_by_token_offsets(
104 body: &str,
105 token_offsets: &[(usize, usize)],
106) -> Vec<Chunk> {
107 if token_offsets.len() <= CHUNK_SIZE_TOKENS {
108 return vec![Chunk {
109 token_count_approx: token_offsets.len(),
110 start_offset: 0,
111 end_offset: body.len(),
112 }];
113 }
114
115 let mut chunks = Vec::with_capacity(token_offsets.len() / CHUNK_SIZE_TOKENS + 1);
116 let mut start_token = 0usize;
117
118 while start_token < token_offsets.len() {
119 let end_token = (start_token + CHUNK_SIZE_TOKENS).min(token_offsets.len());
120
121 chunks.push(Chunk {
122 start_offset: if start_token == 0 {
123 0
124 } else {
125 token_offsets[start_token].0
126 },
127 end_offset: if end_token == token_offsets.len() {
128 body.len()
129 } else {
130 token_offsets[end_token - 1].1
131 },
132 token_count_approx: end_token - start_token,
133 });
134
135 if end_token == token_offsets.len() {
136 break;
137 }
138
139 let next_start = end_token.saturating_sub(CHUNK_OVERLAP_TOKENS);
140 start_token = if next_start <= start_token {
141 end_token
142 } else {
143 next_start
144 };
145 }
146
147 chunks
148}
149
150#[allow(clippy::expect_used)]
160pub fn split_into_chunks_hierarchical(body: &str) -> Vec<Chunk> {
161 if body.is_empty() {
162 return Vec::new();
163 }
164
165 let char_chunk_size = CHUNK_SIZE_TOKENS * CHARS_PER_TOKEN;
170 let char_overlap = CHUNK_OVERLAP_TOKENS * CHARS_PER_TOKEN;
171 let config = ChunkConfig::new(char_chunk_size)
172 .with_overlap(char_overlap)
173 .expect("compile-time invariant: CHUNK_OVERLAP must be smaller than chunk size");
174
175 let splitter = MarkdownSplitter::new(config);
176
177 let items: Vec<(usize, &str)> = splitter.chunk_indices(body).collect();
178
179 if items.is_empty() {
180 return vec![Chunk {
181 start_offset: 0,
182 end_offset: body.len(),
183 token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
184 }];
185 }
186
187 items
188 .into_iter()
189 .map(|(start, text)| {
190 let end = start + text.len();
191 Chunk {
192 start_offset: start,
193 end_offset: end,
194 token_count_approx: text.chars().count() / CHARS_PER_TOKEN,
195 }
196 })
197 .collect()
198}
199
200pub fn chunk_text<'a>(body: &'a str, chunk: &Chunk) -> &'a str {
202 &body[chunk.start_offset..chunk.end_offset]
203}
204
/// Picks the best split point inside `body[start..desired_end]`.
///
/// Separators are tried in priority order — blank line (`"\n\n"`), sentence end
/// (`". "`), then a single space — and for the first one present, the offset
/// just past its *last* occurrence is returned. When none is present,
/// `desired_end` is returned unchanged.
///
/// `start` and `desired_end` must be char boundaries with
/// `start <= desired_end <= body.len()`, otherwise the slice panics.
fn find_split_boundary(body: &str, start: usize, desired_end: usize) -> usize {
    // Ordered from most to least preferred.
    const SEPARATORS: [&str; 3] = ["\n\n", ". ", " "];

    let window = &body[start..desired_end];
    SEPARATORS
        .iter()
        .find_map(|sep| window.rfind(*sep).map(|at| start + at + sep.len()))
        .unwrap_or(desired_end)
}
218
/// Returns the largest UTF-8 char boundary of `body` that is `<= idx`.
///
/// `idx` is clamped to `body.len()` first, so out-of-range indices are safe.
fn previous_char_boundary(body: &str, idx: usize) -> usize {
    let clamped = idx.min(body.len());
    // Offset 0 is always a boundary, so the search cannot come up empty.
    (0..=clamped)
        .rev()
        .find(|&at| body.is_char_boundary(at))
        .unwrap_or(0)
}
226
/// Returns the smallest UTF-8 char boundary of `body` that is `>= idx`.
///
/// `idx` is clamped to `body.len()` first; `body.len()` itself is returned when
/// no earlier boundary exists.
fn next_char_boundary(body: &str, idx: usize) -> usize {
    let end = body.len();
    (idx.min(end)..end)
        .find(|&at| body.is_char_boundary(at))
        .unwrap_or(end)
}
234
235pub fn aggregate_embeddings(chunk_embeddings: &[Vec<f32>]) -> Vec<f32> {
241 if chunk_embeddings.is_empty() {
242 return vec![0.0f32; crate::constants::embedding_dim()];
243 }
244 if chunk_embeddings.len() == 1 {
245 return chunk_embeddings[0].clone();
246 }
247
248 let dim = chunk_embeddings[0].len();
249 let mut mean = vec![0.0f32; dim];
250 for emb in chunk_embeddings {
251 for (i, v) in emb.iter().enumerate() {
252 mean[i] += v;
253 }
254 }
255 let n = chunk_embeddings.len() as f32;
256 for v in &mut mean {
257 *v /= n;
258 }
259
260 let norm: f32 = mean.iter().map(|x| x * x).sum::<f32>().sqrt();
261 if norm > 1e-9 {
262 for v in &mut mean {
263 *v /= norm;
264 }
265 }
266 mean
267}
268
/// Size summary of a body, as produced by `assess_body_budget`.
#[derive(Debug, Clone, Copy)]
pub struct BodyBudget {
    /// Length of the body in bytes.
    pub bytes: usize,
    /// Token count reported by `crate::tokenizer::count_tokens`.
    pub approx_tokens: usize,
    /// Number of chunks reported by `estimate_chunk_count`.
    pub chunk_count: usize,
    /// Number of partitions `split_body_by_sections` produces for the body.
    pub partition_count: usize,
    /// `true` when the body needs more than one partition.
    pub exceeds_limits: bool,
}
285
286pub fn estimate_chunk_count(body: &str) -> usize {
289 split_into_chunks_hierarchical(body).len()
290}
291
292fn fits_single_partition(body: &str) -> bool {
296 body.len() <= crate::constants::AUTOSPLIT_PARTITION_MAX_BYTES
297 && estimate_chunk_count(body) <= crate::constants::REMEMBER_MAX_SAFE_MULTI_CHUNKS
298 && crate::tokenizer::count_tokens(body) <= crate::constants::EMBEDDING_REQUEST_MAX_TOKENS
299}
300
301pub fn assess_body_budget(body: &str) -> BodyBudget {
306 let partition_count = split_body_by_sections(body).len();
307 BodyBudget {
308 bytes: body.len(),
309 approx_tokens: crate::tokenizer::count_tokens(body),
310 chunk_count: estimate_chunk_count(body),
311 partition_count,
312 exceeds_limits: partition_count > 1,
313 }
314}
315
316pub fn split_body_by_sections(body: &str) -> Vec<String> {
327 if fits_single_partition(body) {
328 return vec![body.to_string()];
329 }
330
331 let max_bytes = crate::constants::AUTOSPLIT_PARTITION_MAX_BYTES;
332 let sections = split_markdown_sections(body);
333
334 let mut packed: Vec<String> = Vec::new();
335 let mut current = String::new();
336 for section in sections {
337 if !current.is_empty() && current.len() + section.len() > max_bytes {
338 packed.push(std::mem::take(&mut current));
339 }
340 current.push_str(§ion);
341 }
342 if !current.is_empty() {
343 packed.push(current);
344 }
345
346 let mut result = Vec::with_capacity(packed.len());
347 for partition in packed {
348 if fits_single_partition(&partition) {
349 result.push(partition);
350 } else {
351 result.extend(hard_slice_to_budget(&partition));
352 }
353 }
354
355 if result.is_empty() {
356 vec![body.to_string()]
357 } else {
358 result
359 }
360}
361
362fn split_markdown_sections(body: &str) -> Vec<String> {
367 let mut sections: Vec<String> = Vec::new();
368 let mut current = String::new();
369 for line in body.split_inclusive('\n') {
370 if is_atx_header(line) && !current.is_empty() {
371 sections.push(std::mem::take(&mut current));
372 }
373 current.push_str(line);
374 }
375 if !current.is_empty() {
376 sections.push(current);
377 }
378 if sections.is_empty() {
379 sections.push(body.to_string());
380 }
381 sections
382}
383
/// Returns `true` when `line` opens a Markdown ATX heading.
///
/// A heading line has at most three leading spaces, then one to six `#`
/// characters, followed by either the end of the line or a space, tab, `\n` or
/// `\r`. Lines indented by four or more spaces are indented code in CommonMark
/// and are therefore rejected; accepting `\r` lets an empty heading on a CRLF
/// line (`"#\r\n"`) be recognised.
fn is_atx_header(line: &str) -> bool {
    let unindented = line.trim_start_matches(' ');
    if line.len() - unindented.len() > 3 {
        return false;
    }

    // '#' is ASCII, so the byte count is also a valid slice index below.
    let hashes = unindented.bytes().take_while(|&b| b == b'#').count();
    if !(1..=6).contains(&hashes) {
        return false;
    }

    matches!(
        unindented[hashes..].bytes().next(),
        None | Some(b' ' | b'\t' | b'\n' | b'\r')
    )
}
395
396fn hard_slice_to_budget(body: &str) -> Vec<String> {
401 let max_bytes = crate::constants::AUTOSPLIT_PARTITION_MAX_BYTES;
402 let mut pieces: Vec<String> = Vec::new();
403 let mut start = 0usize;
404 while start < body.len() {
405 let mut end = previous_char_boundary(body, (start + max_bytes).min(body.len()));
406 if end <= start {
407 end = next_char_boundary(body, start + 1);
408 }
409 pieces.push(body[start..end].to_string());
410 start = end;
411 }
412 pieces
413}
414
// Unit tests for the chunking, aggregation and partitioning helpers.
#[cfg(test)]
mod tests {
    use super::*;

    // A body under the size budget is returned as one chunk covering it all.
    #[test]
    fn test_short_body_no_chunking() {
        let body = "short text";
        assert!(!needs_chunking(body));
        let chunks = split_into_chunks(body);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunk_text(body, &chunks[0]), body);
    }

    // A body over the size budget is split into several non-empty chunks.
    #[test]
    fn test_long_body_produces_multiple_chunks() {
        let body = "word ".repeat(1000);
        assert!(needs_chunking(&body));
        let chunks = split_into_chunks(&body);
        assert!(chunks.len() > 1);
        assert!(chunks.iter().all(|c| !chunk_text(&body, c).is_empty()));
    }

    // Token-offset splitting honours the chunk size and the overlap.
    // (The test name is Portuguese for "respects limit and overlap".)
    // NOTE(review): the expected 2 chunks / 110 tokens depend on the current
    // values of CHUNK_SIZE_TOKENS and CHUNK_OVERLAP_TOKENS — update if they change.
    #[test]
    fn split_by_token_offsets_respeita_limite_e_overlap() {
        let body = "ab".repeat(460);
        // 460 synthetic two-byte tokens laid out back to back.
        let offsets: Vec<(usize, usize)> = (0..460)
            .map(|i| {
                let start = i * 2;
                (start, start + 2)
            })
            .collect();

        let chunks = split_into_chunks_by_token_offsets(&body, &offsets);
        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].token_count_approx, CHUNK_SIZE_TOKENS);
        assert_eq!(chunks[1].token_count_approx, 110);
        assert_eq!(chunks[0].start_offset, 0);
        // The second chunk starts `CHUNK_OVERLAP_TOKENS` tokens before the
        // end of the first one.
        assert_eq!(
            chunks[1].start_offset,
            offsets[CHUNK_SIZE_TOKENS - CHUNK_OVERLAP_TOKENS].0
        );
    }

    // With few tokens, a single chunk spanning the whole body is returned.
    #[test]
    fn split_by_token_offsets_returns_one_chunk_when_fits() {
        let body = "texto curto";
        let offsets = vec![(0, 5), (6, 11)];
        let chunks = split_into_chunks_by_token_offsets(body, &offsets);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].start_offset, 0);
        assert_eq!(chunks[0].end_offset, body.len());
        assert_eq!(chunks[0].token_count_approx, 2);
    }

    // Multi-byte text: every chunk is non-empty, lies on char boundaries, and
    // chunk starts never move backwards.
    #[test]
    fn test_multibyte_body_preserves_progress_and_boundaries() {
        let body = "a\u{e7}\u{e3}o \u{fa}til ".repeat(1000);
        let chunks = split_into_chunks(&body);
        assert!(chunks.len() > 1);
        for chunk in &chunks {
            assert!(!chunk_text(&body, chunk).is_empty());
            assert!(body.is_char_boundary(chunk.start_offset));
            assert!(body.is_char_boundary(chunk.end_offset));
            assert!(chunk.end_offset > chunk.start_offset);
        }
        for pair in chunks.windows(2) {
            assert!(pair[1].start_offset >= pair[0].start_offset);
            assert!(pair[1].end_offset > pair[0].start_offset);
        }
    }

    // Aggregating several embeddings yields a unit-length vector.
    #[test]
    fn test_aggregate_embeddings_normalizes() {
        let embs = vec![vec![1.0f32, 0.0], vec![0.0f32, 1.0]];
        let agg = aggregate_embeddings(&embs);
        let norm: f32 = agg.iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!((norm - 1.0).abs() < 1e-5);
    }

    // Test-only variant of the hierarchical splitter: same mapping to `Chunk`,
    // but with an explicit character budget `size` and zero overlap so small
    // fixtures can force splits.
    fn split_hier_chars(body: &str, size: usize) -> Vec<Chunk> {
        use text_splitter::{Characters, ChunkConfig, MarkdownSplitter};
        if body.is_empty() {
            return Vec::new();
        }
        let config = ChunkConfig::new(size)
            .with_sizer(Characters)
            .with_overlap(0)
            .expect("overlap must be smaller than size");
        let splitter = MarkdownSplitter::new(config);
        let items: Vec<(usize, &str)> = splitter.chunk_indices(body).collect();
        if items.is_empty() {
            return vec![Chunk {
                start_offset: 0,
                end_offset: body.len(),
                token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
            }];
        }
        items
            .into_iter()
            .map(|(start, text)| {
                let end = start + text.len();
                Chunk {
                    start_offset: start,
                    end_offset: end,
                    token_count_approx: text.chars().count() / CHARS_PER_TOKEN,
                }
            })
            .collect()
    }

    // The underlying Markdown splitter yields no chunks for an empty string.
    #[test]
    fn test_hierarchical_empty_body_returns_empty() {
        use text_splitter::{Characters, ChunkConfig, MarkdownSplitter};
        let config = ChunkConfig::new(100)
            .with_sizer(Characters)
            .with_overlap(0)
            .expect("overlap < size");
        let splitter = MarkdownSplitter::new(config);
        let result: Vec<_> = splitter.chunk_indices("").collect();
        assert!(result.is_empty());
    }

    // Two H1 sections with a small budget produce at least two chunks, all on
    // char boundaries.
    #[test]
    fn test_markdown_h1_boundary_yields_two_chunks() {
        let body = "# Title 1\n\nbody1 body1 body1 body1 body1 body1\n\n# Title 2\n\nbody2 body2 body2 body2 body2 body2";
        let chunks = split_hier_chars(body, 30);
        assert!(
            chunks.len() >= 2,
            "expected >=2 chunks, got {}",
            chunks.len()
        );
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
        }
    }

    // Nested H1/H2 headings: every chunk is non-empty, in range and on char
    // boundaries.
    #[test]
    fn test_markdown_h2_nested_respects_boundaries() {
        let body = "# H1\n\n## H2a\n\nParagraph A with enough text to force a split.\n\n## H2b\n\nParagraph B with enough text to force a split as well.";
        let chunks = split_hier_chars(body, 40);
        assert!(!chunks.is_empty());
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
            assert!(c.end_offset > c.start_offset);
            assert!(c.end_offset <= body.len());
        }
    }

    // Header-less paragraphs separated by blank lines are still split.
    #[test]
    fn test_markdown_paragraph_soft_boundary() {
        let para = "Plain text sentence used to fill the paragraph. ";
        let body = format!(
            "{}\n\n{}\n\n{}",
            para.repeat(3),
            para.repeat(3),
            para.repeat(3)
        );
        let chunks = split_hier_chars(&body, 80);
        assert!(
            chunks.len() >= 2,
            "expected >=2 chunks with a body of {} chars",
            body.len()
        );
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
        }
    }

    // A body larger than 50 KB yields many chunks with valid, non-empty spans.
    #[test]
    fn test_markdown_60kb_valid_offsets() {
        let block = "# Section\n\nBlock content text. ".repeat(1700);
        assert!(
            block.len() > 50_000,
            "body must be >50KB, has {} bytes",
            block.len()
        );
        let chunks = split_hier_chars(&block, 256);
        assert!(chunks.len() > 1);
        for c in &chunks {
            assert!(block.is_char_boundary(c.start_offset));
            assert!(block.is_char_boundary(c.end_offset));
            assert!(c.end_offset > c.start_offset);
            assert!(!chunk_text(&block, c).is_empty());
        }
    }

    // Plain text without any Markdown markers is still chunked on valid
    // boundaries.
    #[test]
    fn test_fallback_plain_text_without_markers() {
        let body = "a ".repeat(1000);
        let chunks = split_hier_chars(&body, 100);
        assert!(!chunks.is_empty());
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
        }
    }

    // Asserts that partition `p` satisfies the byte, chunk-count and token
    // limits — the same three conditions `fits_single_partition` checks.
    fn assert_partition_within_limits(p: &str) {
        assert!(
            p.len() <= crate::constants::AUTOSPLIT_PARTITION_MAX_BYTES,
            "partition {} bytes exceeds byte budget",
            p.len()
        );
        assert!(
            estimate_chunk_count(p) <= crate::constants::REMEMBER_MAX_SAFE_MULTI_CHUNKS,
            "partition exceeds chunk budget"
        );
        assert!(
            crate::tokenizer::count_tokens(p) <= crate::constants::EMBEDDING_REQUEST_MAX_TOKENS,
            "partition exceeds token budget"
        );
    }

    // A small body needs one partition and does not exceed the limits.
    #[test]
    fn assess_body_budget_small_body_fits() {
        let budget = assess_body_budget("# Title\n\nshort body");
        assert_eq!(budget.partition_count, 1);
        assert!(!budget.exceeds_limits);
        assert!(budget.chunk_count >= 1);
        assert!(budget.approx_tokens >= 1);
    }

    // A small body is returned unchanged as the only partition.
    #[test]
    fn split_body_by_sections_returns_single_for_small_body() {
        let body = "# H\n\nsmall";
        let parts = split_body_by_sections(body);
        assert_eq!(parts.len(), 1);
        assert_eq!(parts[0], body);
    }

    // A large Markdown body is split into several partitions that each meet
    // the limits and that concatenate back to the original text.
    #[test]
    fn split_body_by_sections_partitions_large_markdown_below_limits() {
        let mut body = String::new();
        for i in 0..400 {
            body.push_str(&format!("# Section {i}\n\n{}\n\n", "body text ".repeat(50)));
        }
        assert!(body.len() > crate::constants::AUTOSPLIT_PARTITION_MAX_BYTES);

        let parts = split_body_by_sections(&body);
        assert!(
            parts.len() > 1,
            "expected multiple partitions, got {}",
            parts.len()
        );
        for p in &parts {
            assert_partition_within_limits(p);
        }
        assert_eq!(parts.concat(), body);
    }

    // A body with no headers at all is hard-sliced, losslessly, into
    // partitions that meet the limits.
    #[test]
    fn split_body_by_sections_hard_slices_headerless_body() {
        let body = "x".repeat(crate::constants::AUTOSPLIT_PARTITION_MAX_BYTES * 3 + 17);
        let parts = split_body_by_sections(&body);
        assert!(parts.len() > 1);
        for p in &parts {
            assert_partition_within_limits(p);
        }
        assert_eq!(parts.concat(), body);
    }

    // ATX header detection: accepts 1-6 hashes followed by whitespace or end
    // of line, rejects seven hashes, a missing space, and plain text.
    #[test]
    fn is_atx_header_recognizes_headers() {
        assert!(is_atx_header("# Title"));
        assert!(is_atx_header("### Sub\n"));
        assert!(is_atx_header(" ## Indented"));
        assert!(!is_atx_header("####### too many"));
        assert!(!is_atx_header("#nospace"));
        assert!(!is_atx_header("plain text"));
    }
}