1use crate::config::{ChunkPolicy, ChunkingConfig};
2use crate::ingest::extract::{BlockKind, ExtractedBlock, ExtractedDocument};
3use crate::retrieval_context::{render_structural_body, ChunkRetrievalContext};
4use crate::Result;
5use kbolt_types::KboltError;
6
/// Block attribute key under which `attach_table_header_attr` stores the active table
/// header text on a table row; `finalize_chunk` reads it back in source-block mode.
const TABLE_HEADER_ATTR: &str = "__kbolt_table_header";
/// Block attribute key under which `attach_table_retrieval_prefix_attr` stores the table
/// header text that may later be used as a chunk's retrieval prefix.
const TABLE_RETRIEVAL_PREFIX_ATTR: &str = "__kbolt_table_retrieval_prefix";
9
/// A finished chunk assembled from one or more extracted blocks.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct FinalChunk {
    /// Chunk body text.
    pub text: String,
    /// Optional text rendered together with `text` by [`FinalChunk::retrieval_text`].
    pub retrieval_prefix: Option<String>,
    /// Offset of the first block that went into this chunk.
    pub offset: usize,
    /// Distance from `offset` to the end of the last block in this chunk.
    pub length: usize,
    /// Heading path of the chunk joined with `" > "`, when one is available.
    pub heading: Option<String>,
    /// Structural classification of the chunk's content.
    pub kind: FinalChunkKind,
}
19
/// Structural classification of a [`FinalChunk`], as computed by `derive_chunk_kind`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FinalChunkKind {
    /// Narrative blocks that include at least one heading.
    Section,
    /// Only paragraphs, list items, or block quotes.
    Paragraph,
    /// Only code fences.
    Code,
    /// Only table headers and/or table rows.
    Table,
    /// Anything else, including an empty block list.
    Mixed,
}
28
/// Coarse grouping of block kinds used to decide whether adjacent blocks may share a chunk.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum PackClass {
    /// Headings, paragraphs, list items, and block quotes.
    Narrative,
    /// Code fences.
    Code,
    /// Table headers and rows.
    Table,
    /// HTML blocks; these are never packed with a neighbour.
    Opaque,
}
36
/// Kind of cut point considered when splitting an oversized narrative block.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum NarrativeBoundary {
    /// Cut after a token ending in `.`, `!`, or `?`.
    Sentence,
    /// Cut after a token ending in `,`, `;`, or `:`.
    Clause,
    /// Fallback cut at the largest end position that still fits the token limit.
    TokenWindow,
}
43
/// Token counting strategy used to measure chunk candidates against token limits.
pub trait TokenCounter {
    /// Returns the number of tokens in `text`.
    fn count(&self, text: &str) -> Result<usize>;

    /// Cheap pre-check used to skip `count`: implementations may return `true` when text of
    /// `_byte_len` bytes is certain to stay within `_max_tokens` tokens.
    ///
    /// The default returns `false`, so callers always fall back to an exact count.
    fn fits_within_token_limit_by_byte_len(&self, _byte_len: usize, _max_tokens: usize) -> bool {
        false
    }
}
51
/// [`TokenCounter`] that treats every whitespace-separated run of characters as one token.
#[derive(Debug, Default, Clone, Copy)]
pub struct WhitespaceTokenCounter;

impl TokenCounter for WhitespaceTokenCounter {
    /// Counts whitespace-separated tokens; this implementation never returns an error.
    fn count(&self, text: &str) -> Result<usize> {
        Ok(count_whitespace_tokens(text))
    }
}
60
61impl FinalChunkKind {
62 pub fn as_storage_kind(self) -> &'static str {
63 match self {
64 Self::Section => "section",
65 Self::Paragraph => "paragraph",
66 Self::Code => "code",
67 Self::Table => "table",
68 Self::Mixed => "mixed",
69 }
70 }
71}
72
73impl TryFrom<&str> for FinalChunkKind {
74 type Error = KboltError;
75
76 fn try_from(value: &str) -> std::result::Result<Self, Self::Error> {
77 match value {
78 "section" => Ok(Self::Section),
79 "paragraph" => Ok(Self::Paragraph),
80 "code" => Ok(Self::Code),
81 "table" => Ok(Self::Table),
82 "mixed" => Ok(Self::Mixed),
83 other => Err(KboltError::Internal(format!(
84 "invalid stored chunk kind: {other}"
85 ))),
86 }
87 }
88}
89
90impl FinalChunk {
91 pub fn retrieval_text(&self) -> String {
92 chunk_retrieval_body(self.text.as_str(), self.retrieval_prefix.as_deref())
93 }
94}
95
96pub fn chunk_retrieval_body(canonical_text: &str, retrieval_prefix: Option<&str>) -> String {
97 render_structural_body(ChunkRetrievalContext {
98 body: canonical_text,
99 retrieval_prefix,
100 title: None,
101 heading: None,
102 })
103}
104
105pub fn chunk_document(document: &ExtractedDocument, policy: &ChunkPolicy) -> Vec<FinalChunk> {
106 let counter = WhitespaceTokenCounter;
107 chunk_document_with_counter(document, policy, &counter)
108 .expect("whitespace token counter should be infallible")
109}
110
111pub fn chunk_canonical_document(
112 document: &ExtractedDocument,
113 policy: &ChunkPolicy,
114) -> Vec<FinalChunk> {
115 let counter = WhitespaceTokenCounter;
116 chunk_canonical_document_with_counter(document, policy, &counter)
117 .expect("whitespace token counter should be infallible")
118}
119
120pub fn chunk_document_with_counter(
121 document: &ExtractedDocument,
122 policy: &ChunkPolicy,
123 counter: &dyn TokenCounter,
124) -> Result<Vec<FinalChunk>> {
125 chunk_document_with_counter_inner(document, policy, counter, TableHeaderMode::SourceBlocks)
126}
127
128pub fn chunk_canonical_document_with_counter(
129 document: &ExtractedDocument,
130 policy: &ChunkPolicy,
131 counter: &dyn TokenCounter,
132) -> Result<Vec<FinalChunk>> {
133 let chunks = chunk_document_with_counter_inner(
134 document,
135 policy,
136 counter,
137 TableHeaderMode::CanonicalBlocks,
138 )?;
139 hydrate_canonical_chunk_text(document, chunks)
140}
141
142fn chunk_document_with_counter_inner(
143 document: &ExtractedDocument,
144 policy: &ChunkPolicy,
145 counter: &dyn TokenCounter,
146 table_header_mode: TableHeaderMode,
147) -> Result<Vec<FinalChunk>> {
148 if document.blocks.is_empty() {
149 return Ok(Vec::new());
150 }
151
152 debug_assert_valid_blocks(&document.blocks);
153
154 let soft_max = normalized_soft_max(policy);
155 let expanded =
156 expand_blocks_for_hard_max(&document.blocks, policy, counter, table_header_mode)?;
157 let mut chunks = Vec::new();
158 let mut current = Vec::new();
159
160 for block in &expanded {
161 let structurally_compatible = current
162 .last()
163 .is_none_or(|last| can_pack_together(last, block));
164 let candidate_fits = if current.is_empty() || !structurally_compatible {
165 false
166 } else {
167 candidate_chunk_fits_within(¤t, block, soft_max, counter)?
168 };
169
170 if current.is_empty() || (structurally_compatible && candidate_fits) {
171 current.push(block.clone());
172 continue;
173 }
174
175 chunks.push(finalize_chunk(¤t, table_header_mode));
176 current.clear();
177 current.push(block.clone());
178 }
179
180 if !current.is_empty() {
181 chunks.push(finalize_chunk(¤t, table_header_mode));
182 }
183
184 Ok(chunks)
185}
186
/// Selects how a table header is carried onto table-row chunks that lack their header block.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum TableHeaderMode {
    /// Rows are tagged with `TABLE_HEADER_ATTR`; `finalize_chunk` prepends the header to the
    /// chunk text.
    SourceBlocks,
    /// Rows are tagged with `TABLE_RETRIEVAL_PREFIX_ATTR`; the header is exposed as the
    /// chunk's retrieval prefix instead of being added to the text.
    CanonicalBlocks,
}
192
193pub fn resolve_policy(
196 config: &ChunkingConfig,
197 profile: Option<&str>,
198 cli_override: Option<&ChunkPolicy>,
199) -> ChunkPolicy {
200 if let Some(override_policy) = cli_override {
201 return override_policy.clone();
202 }
203
204 if let Some(profile_name) = profile {
205 let key = normalize_profile_key(profile_name);
206 if let Some(policy) = config.profiles.get(&key) {
207 return policy.clone();
208 }
209 }
210
211 config.defaults.clone()
212}
213
/// Normalizes a profile name for lookup: surrounding whitespace is trimmed, leading dots
/// are dropped, and ASCII letters are lowercased.
fn normalize_profile_key(raw: &str) -> String {
    let without_padding = raw.trim();
    let without_leading_dots = without_padding.trim_start_matches('.');
    without_leading_dots.to_ascii_lowercase()
}
217
/// Counts the whitespace-separated tokens in `text`.
fn count_whitespace_tokens(text: &str) -> usize {
    text.split_whitespace().fold(0usize, |seen, _token| seen + 1)
}
221
222fn countable_chunk_text(blocks: &[ExtractedBlock]) -> String {
223 let body = blocks
224 .iter()
225 .map(|block| block.text.as_str())
226 .collect::<Vec<_>>()
227 .join("\n\n");
228
229 let prefix = table_header_prefix_for_body(blocks, body.as_str());
230 chunk_retrieval_body(body.as_str(), prefix)
231}
232
233fn countable_chunk_text_byte_upper_bound(blocks: &[ExtractedBlock]) -> Option<usize> {
234 let body_len = joined_block_text_byte_len(blocks)?;
235 let prefix_len = table_header_prefix_upper_bound_len(blocks);
236 match prefix_len {
237 Some(prefix_len) if body_len == 0 => Some(prefix_len),
238 Some(prefix_len) => prefix_len.checked_add(1)?.checked_add(body_len),
239 None => Some(body_len),
240 }
241}
242
243fn joined_block_text_byte_len(blocks: &[ExtractedBlock]) -> Option<usize> {
244 let text_len = blocks
245 .iter()
246 .try_fold(0usize, |sum, block| sum.checked_add(block.text.len()))?;
247 let separators = blocks.len().saturating_sub(1).checked_mul(2)?;
248 text_len.checked_add(separators)
249}
250
251fn table_header_prefix_upper_bound_len(blocks: &[ExtractedBlock]) -> Option<usize> {
252 if derive_chunk_kind(blocks) != FinalChunkKind::Table {
253 return None;
254 }
255 let has_header = blocks
256 .iter()
257 .any(|block| block.kind == BlockKind::TableHeader);
258 if has_header {
259 return None;
260 }
261 blocks
262 .first()
263 .and_then(|block| block.attrs.get(TABLE_RETRIEVAL_PREFIX_ATTR))
264 .map(String::as_str)
265 .map(str::trim)
266 .filter(|header| !header.is_empty())
267 .map(str::len)
268}
269
270fn chunk_text_guaranteed_fits(
271 blocks: &[ExtractedBlock],
272 max_tokens: usize,
273 counter: &dyn TokenCounter,
274) -> bool {
275 countable_chunk_text_byte_upper_bound(blocks)
276 .is_some_and(|byte_len| counter.fits_within_token_limit_by_byte_len(byte_len, max_tokens))
277}
278
279fn chunk_fits_within(
280 blocks: &[ExtractedBlock],
281 max_tokens: usize,
282 counter: &dyn TokenCounter,
283) -> Result<bool> {
284 if chunk_text_guaranteed_fits(blocks, max_tokens, counter) {
285 return Ok(true);
286 }
287 Ok(count_finalized_chunk_tokens(blocks, counter)? <= max_tokens)
288}
289
290fn count_finalized_chunk_tokens(
291 blocks: &[ExtractedBlock],
292 counter: &dyn TokenCounter,
293) -> Result<usize> {
294 counter.count(countable_chunk_text(blocks).as_str())
295}
296
297fn candidate_chunk_fits_within(
298 current: &[ExtractedBlock],
299 next: &ExtractedBlock,
300 max_tokens: usize,
301 counter: &dyn TokenCounter,
302) -> Result<bool> {
303 let mut candidate = current.to_vec();
304 candidate.push(next.clone());
305 chunk_fits_within(&candidate, max_tokens, counter)
306}
307
308fn single_block_fits_within(
309 block: &ExtractedBlock,
310 max_tokens: usize,
311 counter: &dyn TokenCounter,
312) -> Result<bool> {
313 chunk_fits_within(std::slice::from_ref(block), max_tokens, counter)
314}
315
316fn count_single_block_tokens(block: &ExtractedBlock, counter: &dyn TokenCounter) -> Result<usize> {
317 count_finalized_chunk_tokens(std::slice::from_ref(block), counter)
318}
319
320fn hydrate_canonical_chunk_text(
321 document: &ExtractedDocument,
322 mut chunks: Vec<FinalChunk>,
323) -> Result<Vec<FinalChunk>> {
324 let canonical_text = document
325 .blocks
326 .iter()
327 .map(|block| block.text.as_str())
328 .collect::<Vec<_>>()
329 .join("\n\n");
330
331 for chunk in &mut chunks {
332 let end = chunk.offset.checked_add(chunk.length).ok_or_else(|| {
333 KboltError::Internal("canonical chunk text span overflows usize".to_string())
334 })?;
335 if end > canonical_text.len()
336 || !canonical_text.is_char_boundary(chunk.offset)
337 || !canonical_text.is_char_boundary(end)
338 {
339 return Err(KboltError::Internal(format!(
340 "canonical chunk text span {}..{} is invalid for text length {}",
341 chunk.offset,
342 end,
343 canonical_text.len()
344 ))
345 .into());
346 }
347 chunk.text = canonical_text[chunk.offset..end].to_string();
348 }
349
350 Ok(chunks)
351}
352
353fn normalized_soft_max(policy: &ChunkPolicy) -> usize {
354 let target = policy.target_tokens.max(1);
355 policy.soft_max_tokens.max(target)
356}
357
358fn normalized_target(policy: &ChunkPolicy) -> usize {
359 policy.target_tokens.max(1)
360}
361
362fn normalized_hard_max(policy: &ChunkPolicy) -> usize {
363 let soft_max = normalized_soft_max(policy);
364 policy.hard_max_tokens.max(soft_max)
365}
366
367fn normalized_overlap(policy: &ChunkPolicy, hard_max: usize) -> usize {
368 policy
369 .boundary_overlap_tokens
370 .min(hard_max.saturating_sub(1))
371}
372
373fn expand_blocks_for_hard_max(
374 blocks: &[ExtractedBlock],
375 policy: &ChunkPolicy,
376 counter: &dyn TokenCounter,
377 table_header_mode: TableHeaderMode,
378) -> Result<Vec<ExtractedBlock>> {
379 let hard_max = normalized_hard_max(policy);
380 let target = normalized_target(policy);
381 let overlap = normalized_overlap(policy, hard_max);
382 let mut expanded = Vec::new();
383 let mut active_table_header: Option<String> = None;
384
385 for block in blocks {
386 match block.kind {
387 BlockKind::TableHeader => {
388 active_table_header = Some(block.text.clone());
389 }
390 BlockKind::TableRow => {}
391 _ => {
392 active_table_header = None;
393 }
394 }
395
396 let tagged = match table_header_mode {
397 TableHeaderMode::SourceBlocks => {
398 attach_table_header_attr(block, active_table_header.as_deref())
399 }
400 TableHeaderMode::CanonicalBlocks => attach_table_retrieval_prefix_attr(
401 block,
402 fitting_table_retrieval_prefix(active_table_header.as_deref(), hard_max, counter)?,
403 ),
404 };
405
406 if single_block_fits_within(&tagged, hard_max, counter)? {
407 expanded.push(tagged);
408 continue;
409 }
410
411 if is_narrative_block_kind(&tagged.kind) {
412 if let Some(sentence_splits) =
413 split_block_by_sentence_boundaries(&tagged, target, hard_max, overlap, counter)?
414 {
415 expanded.extend(sentence_splits);
416 continue;
417 }
418 }
419
420 if tagged.kind == BlockKind::CodeFence {
421 if let Some(code_splits) =
422 split_code_block_by_blank_lines(&tagged, hard_max, overlap, counter)?
423 {
424 expanded.extend(code_splits);
425 continue;
426 }
427 }
428
429 expanded.extend(split_block_by_tokens(&tagged, hard_max, overlap, counter)?);
430 }
431
432 Ok(expanded)
433}
434
435fn attach_table_header_attr(block: &ExtractedBlock, table_header: Option<&str>) -> ExtractedBlock {
436 let mut tagged = block.clone();
437 if tagged.kind == BlockKind::TableRow {
438 if let Some(header) = table_header {
439 if !header.trim().is_empty() {
440 tagged
441 .attrs
442 .insert(TABLE_HEADER_ATTR.to_string(), header.to_string());
443 }
444 }
445 }
446 tagged
447}
448
449fn attach_table_retrieval_prefix_attr(
450 block: &ExtractedBlock,
451 table_header: Option<&str>,
452) -> ExtractedBlock {
453 let mut tagged = block.clone();
454 if tagged.kind == BlockKind::TableRow {
455 if let Some(header) = table_header {
456 if !header.trim().is_empty() {
457 tagged
458 .attrs
459 .insert(TABLE_RETRIEVAL_PREFIX_ATTR.to_string(), header.to_string());
460 }
461 }
462 }
463 tagged
464}
465
466fn fitting_table_retrieval_prefix<'a>(
467 table_header: Option<&'a str>,
468 hard_max: usize,
469 counter: &dyn TokenCounter,
470) -> Result<Option<&'a str>> {
471 let Some(header) = table_header.filter(|header| !header.trim().is_empty()) else {
472 return Ok(None);
473 };
474
475 if hard_max > 0 && counter.fits_within_token_limit_by_byte_len(header.len(), hard_max - 1) {
476 return Ok(Some(header));
477 }
478
479 if counter.count(header)? < hard_max {
480 Ok(Some(header))
481 } else {
482 Ok(None)
483 }
484}
485
486fn is_narrative_block_kind(kind: &BlockKind) -> bool {
487 matches!(
488 kind,
489 BlockKind::Paragraph | BlockKind::ListItem | BlockKind::BlockQuote
490 )
491}
492
493fn pack_class_for_kind(kind: &BlockKind) -> PackClass {
494 match kind {
495 BlockKind::Heading | BlockKind::Paragraph | BlockKind::ListItem | BlockKind::BlockQuote => {
496 PackClass::Narrative
497 }
498 BlockKind::CodeFence => PackClass::Code,
499 BlockKind::TableHeader | BlockKind::TableRow => PackClass::Table,
500 BlockKind::HtmlBlock => PackClass::Opaque,
501 }
502}
503
504fn can_pack_together(current: &ExtractedBlock, next: &ExtractedBlock) -> bool {
505 let current_class = pack_class_for_kind(¤t.kind);
506 let next_class = pack_class_for_kind(&next.kind);
507 current_class == next_class
508 && current_class != PackClass::Opaque
509 && heading_scopes_compatible(current, next)
510}
511
512fn heading_scopes_compatible(current: &ExtractedBlock, next: &ExtractedBlock) -> bool {
513 let current_class = pack_class_for_kind(¤t.kind);
514 let next_class = pack_class_for_kind(&next.kind);
515 if current_class != PackClass::Narrative || next_class != PackClass::Narrative {
516 return true;
517 }
518
519 if next.kind == BlockKind::Heading {
521 return false;
522 }
523
524 if current.kind == BlockKind::Heading {
526 return true;
527 }
528
529 current.heading_path == next.heading_path
530}
531
532fn split_block_by_tokens(
533 block: &ExtractedBlock,
534 hard_max: usize,
535 overlap: usize,
536 counter: &dyn TokenCounter,
537) -> Result<Vec<ExtractedBlock>> {
538 if block.text.is_empty() {
539 return Ok(vec![block.clone()]);
540 }
541
542 let mut out = Vec::new();
543 let mut start_byte = 0usize;
544 while start_byte < block.text.len() {
545 let end_byte = find_largest_fitting_end_byte(block, start_byte, hard_max, counter)?;
546 out.push(split_block_range_by_bytes(block, start_byte, end_byte));
547
548 if end_byte == block.text.len() {
549 break;
550 }
551
552 start_byte = next_start_byte(block, start_byte, end_byte, overlap, counter)?;
553 }
554
555 Ok(out)
556}
557
558fn split_code_block_by_blank_lines(
559 block: &ExtractedBlock,
560 hard_max: usize,
561 overlap: usize,
562 counter: &dyn TokenCounter,
563) -> Result<Option<Vec<ExtractedBlock>>> {
564 let groups = code_group_ranges(block.text.as_str());
565 if groups.len() <= 1 {
566 return Ok(None);
567 }
568
569 let mut packed_ranges = Vec::new();
570 let mut current: Option<(usize, usize)> = None;
571 for (start, end) in groups {
572 match current {
573 None => {
574 current = Some((start, end));
575 }
576 Some((current_start, current_end)) => {
577 let candidate = split_block_range_by_bytes(block, current_start, end);
578 if single_block_fits_within(&candidate, hard_max, counter)? {
579 current = Some((current_start, end));
580 } else {
581 packed_ranges.push((current_start, current_end));
582 current = Some((start, end));
583 }
584 }
585 }
586 }
587 if let Some((start, end)) = current {
588 packed_ranges.push((start, end));
589 }
590
591 let mut out = Vec::new();
592 for (start, end) in packed_ranges {
593 let split = split_block_range_by_bytes(block, start, end);
594 if !single_block_fits_within(&split, hard_max, counter)? {
595 out.extend(split_block_by_tokens(&split, hard_max, overlap, counter)?);
596 } else {
597 out.push(split);
598 }
599 }
600
601 Ok((out.len() > 1).then_some(out))
602}
603
604fn split_block_by_sentence_boundaries(
605 block: &ExtractedBlock,
606 target_tokens: usize,
607 hard_max: usize,
608 overlap: usize,
609 counter: &dyn TokenCounter,
610) -> Result<Option<Vec<ExtractedBlock>>> {
611 let spans = token_byte_spans(block.text.as_str());
612 if spans.is_empty() {
613 return Ok(Some(vec![block.clone()]));
614 }
615
616 let sentence_end_tokens = sentence_end_token_indices(block.text.as_str(), &spans)
617 .into_iter()
618 .map(|index| spans[index].1)
619 .collect::<Vec<_>>();
620 let clause_end_tokens = clause_end_token_indices(block.text.as_str(), &spans)
621 .into_iter()
622 .map(|index| spans[index].1)
623 .collect::<Vec<_>>();
624 if sentence_end_tokens.is_empty() && clause_end_tokens.is_empty() {
625 return Ok(None);
626 }
627
628 let mut used_structural_boundary = false;
629 let mut out = Vec::new();
630 let mut start_byte = 0usize;
631 while start_byte < block.text.len() {
632 let mut candidates = sentence_end_tokens
633 .iter()
634 .copied()
635 .filter(|end_byte| *end_byte > start_byte)
636 .map(|end_byte| (end_byte, NarrativeBoundary::Sentence))
637 .collect::<Vec<_>>();
638 candidates.extend(
639 clause_end_tokens
640 .iter()
641 .copied()
642 .filter(|end_byte| *end_byte > start_byte)
643 .map(|end_byte| (end_byte, NarrativeBoundary::Clause)),
644 );
645 let fallback_end = find_largest_fitting_end_byte(block, start_byte, hard_max, counter)?;
646 candidates.push((fallback_end, NarrativeBoundary::TokenWindow));
647
648 let Some((end_byte, boundary)) = choose_best_narrative_boundary(
649 block,
650 start_byte,
651 target_tokens,
652 hard_max,
653 &candidates,
654 counter,
655 )?
656 else {
657 return Err(KboltError::Inference(
658 "failed to choose a fitting narrative split boundary".to_string(),
659 )
660 .into());
661 };
662
663 if matches!(
664 boundary,
665 NarrativeBoundary::Sentence | NarrativeBoundary::Clause
666 ) {
667 used_structural_boundary = true;
668 }
669
670 out.push(split_block_range_by_bytes(block, start_byte, end_byte));
671 if end_byte == block.text.len() {
672 break;
673 }
674
675 start_byte = next_start_byte(block, start_byte, end_byte, overlap, counter)?;
676 }
677
678 Ok(used_structural_boundary.then_some(out))
679}
680
681fn split_block_range_by_bytes(
682 block: &ExtractedBlock,
683 byte_start: usize,
684 byte_end: usize,
685) -> ExtractedBlock {
686 debug_assert!(byte_start < byte_end, "byte range must be non-empty");
687 debug_assert!(
688 byte_end <= block.text.len(),
689 "byte range exceeds block text"
690 );
691
692 let mut split = block.clone();
693 split.offset = block.offset.saturating_add(byte_start);
694 split.length = byte_end.saturating_sub(byte_start);
695 split.text = block.text[byte_start..byte_end].to_string();
696 if let Some(header) = block.attrs.get(TABLE_RETRIEVAL_PREFIX_ATTR) {
697 if canonical_table_row_body_start(block.text.as_str(), header.as_str())
698 .is_some_and(|body_start| byte_start < body_start)
699 {
700 split.attrs.remove(TABLE_RETRIEVAL_PREFIX_ATTR);
701 }
702 }
703 split
704}
705
706fn choose_best_narrative_boundary(
707 block: &ExtractedBlock,
708 start_byte: usize,
709 target_tokens: usize,
710 hard_max: usize,
711 candidates: &[(usize, NarrativeBoundary)],
712 counter: &dyn TokenCounter,
713) -> Result<Option<(usize, NarrativeBoundary)>> {
714 let mut best: Option<(usize, NarrativeBoundary, i64)> = None;
715 for (end_byte, boundary) in candidates {
716 if *end_byte <= start_byte {
717 continue;
718 }
719
720 let candidate = split_block_range_by_bytes(block, start_byte, *end_byte);
721 let token_count = count_single_block_tokens(&candidate, counter)?;
722 if token_count > hard_max {
723 continue;
724 }
725
726 let boundary_score = match boundary {
727 NarrativeBoundary::Sentence => 30,
728 NarrativeBoundary::Clause => 15,
729 NarrativeBoundary::TokenWindow => 0,
730 };
731 let distance = token_count.abs_diff(target_tokens) as i64;
732 let score = boundary_score - (distance * 10);
733 let replace = best
734 .as_ref()
735 .map(|(best_end_byte, _, best_score)| {
736 score > *best_score || (score == *best_score && *end_byte > *best_end_byte)
737 })
738 .unwrap_or(true);
739 if replace {
740 best = Some((*end_byte, *boundary, score));
741 }
742 }
743
744 Ok(best.map(|(end_byte, boundary, _)| (end_byte, boundary)))
745}
746
747fn find_largest_fitting_end_byte(
748 block: &ExtractedBlock,
749 start_byte: usize,
750 hard_max: usize,
751 counter: &dyn TokenCounter,
752) -> Result<usize> {
753 let text = block.text.as_str();
754 let token_spans = token_byte_spans(text);
755 let mut token_boundaries = token_spans
756 .iter()
757 .map(|(_, end)| *end)
758 .filter(|end| *end > start_byte)
759 .collect::<Vec<_>>();
760 token_boundaries.dedup();
761
762 if let Some(end_byte) =
763 largest_fitting_boundary(block, start_byte, hard_max, &token_boundaries, counter)?
764 {
765 return Ok(end_byte);
766 }
767
768 let mut char_boundaries = text
769 .char_indices()
770 .map(|(idx, ch)| idx + ch.len_utf8())
771 .filter(|end| *end > start_byte)
772 .collect::<Vec<_>>();
773 char_boundaries.dedup();
774 largest_fitting_boundary(block, start_byte, hard_max, &char_boundaries, counter)?.ok_or_else(
775 || {
776 KboltError::Inference(format!(
777 "failed to find a fitting split boundary for block at offset {}",
778 block.offset
779 ))
780 .into()
781 },
782 )
783}
784
785fn largest_fitting_boundary(
786 block: &ExtractedBlock,
787 start_byte: usize,
788 hard_max: usize,
789 boundaries: &[usize],
790 counter: &dyn TokenCounter,
791) -> Result<Option<usize>> {
792 if boundaries.is_empty() {
793 return Ok(None);
794 }
795
796 let mut left = 0usize;
797 let mut right = boundaries.len();
798 let mut best = None;
799 while left < right {
800 let mid = left + (right - left) / 2;
801 let end_byte = boundaries[mid];
802 let candidate = split_block_range_by_bytes(block, start_byte, end_byte);
803 if single_block_fits_within(&candidate, hard_max, counter)? {
804 best = Some(end_byte);
805 left = mid + 1;
806 } else {
807 right = mid;
808 }
809 }
810
811 Ok(best)
812}
813
814fn next_start_byte(
815 block: &ExtractedBlock,
816 current_start_byte: usize,
817 end_byte: usize,
818 overlap: usize,
819 counter: &dyn TokenCounter,
820) -> Result<usize> {
821 debug_assert!(
822 end_byte > current_start_byte,
823 "end byte must advance beyond start byte"
824 );
825 if overlap == 0 {
826 return Ok(next_content_start_byte(block.text.as_str(), end_byte));
827 }
828
829 let text = block.text.as_str();
830 let token_starts = token_byte_spans(text)
831 .into_iter()
832 .map(|(start, _)| start)
833 .filter(|start| *start > current_start_byte && *start < end_byte)
834 .collect::<Vec<_>>();
835 if let Some(next_start) =
836 earliest_fitting_overlap_start(block, end_byte, overlap, &token_starts, counter)?
837 {
838 return Ok(next_start);
839 }
840
841 let char_starts = text
842 .char_indices()
843 .map(|(idx, _)| idx)
844 .filter(|idx| *idx > current_start_byte && *idx < end_byte)
845 .collect::<Vec<_>>();
846 Ok(
847 earliest_fitting_overlap_start(block, end_byte, overlap, &char_starts, counter)?
848 .map(|start| next_content_start_byte(text, start))
849 .unwrap_or_else(|| next_content_start_byte(text, end_byte)),
850 )
851}
852
/// Returns the byte index of the first non-whitespace character at or after `start_byte`,
/// or `text.len()` when only whitespace (or nothing) remains.
///
/// # Panics
/// Panics if `start_byte` is inside the text but not on a character boundary.
fn next_content_start_byte(text: &str, start_byte: usize) -> usize {
    if start_byte >= text.len() {
        return text.len();
    }

    text[start_byte..]
        .char_indices()
        .find(|(_, ch)| !ch.is_whitespace())
        .map_or(text.len(), |(relative, _)| start_byte + relative)
}
866
867fn earliest_fitting_overlap_start(
868 block: &ExtractedBlock,
869 end_byte: usize,
870 overlap: usize,
871 candidates: &[usize],
872 counter: &dyn TokenCounter,
873) -> Result<Option<usize>> {
874 if candidates.is_empty() {
875 return Ok(None);
876 }
877
878 let mut left = 0usize;
879 let mut right = candidates.len();
880 let mut best = None;
881 while left < right {
882 let mid = left + (right - left) / 2;
883 let start_byte = candidates[mid];
884 let candidate = split_block_range_by_bytes(block, start_byte, end_byte);
885 if single_block_fits_within(&candidate, overlap, counter)? {
886 best = Some(start_byte);
887 right = mid;
888 } else {
889 left = mid + 1;
890 }
891 }
892
893 Ok(best)
894}
895
896fn sentence_end_token_indices(text: &str, spans: &[(usize, usize)]) -> Vec<usize> {
897 spans
898 .iter()
899 .enumerate()
900 .filter_map(|(index, (start, end))| {
901 token_ends_sentence(&text[*start..*end]).then_some(index)
902 })
903 .collect()
904}
905
906fn clause_end_token_indices(text: &str, spans: &[(usize, usize)]) -> Vec<usize> {
907 spans
908 .iter()
909 .enumerate()
910 .filter_map(|(index, (start, end))| token_ends_clause(&text[*start..*end]).then_some(index))
911 .collect()
912}
913
/// True when `token`, ignoring trailing quotes and closing brackets, ends in `.`, `!`, or `?`.
fn token_ends_sentence(token: &str) -> bool {
    const TRAILING_CLOSERS: [char; 5] = ['"', '\'', ')', ']', '}'];
    const SENTENCE_MARKS: [char; 3] = ['.', '!', '?'];
    token
        .trim_end_matches(TRAILING_CLOSERS)
        .ends_with(SENTENCE_MARKS)
}
918
/// True when `token`, ignoring trailing quotes and closing brackets, ends in `,`, `;`, or `:`.
fn token_ends_clause(token: &str) -> bool {
    const TRAILING_CLOSERS: [char; 5] = ['"', '\'', ')', ']', '}'];
    const CLAUSE_MARKS: [char; 3] = [',', ';', ':'];
    token
        .trim_end_matches(TRAILING_CLOSERS)
        .ends_with(CLAUSE_MARKS)
}
923
/// Test-only helper: returns the end token of the candidate with the highest
/// `score_narrative_cut` score. Ties go to the larger end token, and among fully equal
/// keys to the candidate that appears last.
#[cfg(test)]
fn best_narrative_cut(
    candidates: &[(usize, NarrativeBoundary)],
    target_tokens: usize,
    start_token: usize,
    total_tokens: usize,
) -> Option<usize> {
    let mut best: Option<((i64, i64), usize)> = None;
    for &(end_token, boundary) in candidates {
        let key = (
            score_narrative_cut(target_tokens, start_token, end_token, total_tokens, boundary),
            end_token as i64,
        );
        // `>=` keeps the last of several equal keys.
        let replace = match best {
            None => true,
            Some((best_key, _)) => key >= best_key,
        };
        if replace {
            best = Some((key, end_token));
        }
    }
    best.map(|(_, end_token)| end_token)
}
948
/// Test-only helper: scores a cut at `end_token` for a piece starting at `start_token`.
///
/// The score is the boundary bonus (sentence 30, clause 15, token window 0), minus ten
/// points per token of distance from `target_tokens`, minus 20 when the cut would leave a
/// non-empty tail shorter than a quarter of the target (at least one token).
#[cfg(test)]
fn score_narrative_cut(
    target_tokens: usize,
    start_token: usize,
    end_token: usize,
    total_tokens: usize,
    boundary: NarrativeBoundary,
) -> i64 {
    let boundary_score = match boundary {
        NarrativeBoundary::Sentence => 30,
        NarrativeBoundary::Clause => 15,
        NarrativeBoundary::TokenWindow => 0,
    };

    let chunk_tokens = end_token.saturating_sub(start_token) as i64;
    let distance = (chunk_tokens - target_tokens as i64).abs();

    let remaining = total_tokens.saturating_sub(end_token);
    let tiny_tail_threshold = (target_tokens / 4).max(1);
    let leaves_tiny_tail = remaining > 0 && remaining < tiny_tail_threshold;
    let tiny_tail_penalty = if leaves_tiny_tail { 20 } else { 0 };

    boundary_score - (distance * 10) - tiny_tail_penalty
}
976
977fn debug_assert_valid_blocks(blocks: &[ExtractedBlock]) {
978 for block in blocks {
979 debug_assert_eq!(
980 block.text.len(),
981 block.length,
982 "extractor invariant violated: text byte length and source length differ"
983 );
984 }
985}
986
987fn code_group_ranges(text: &str) -> Vec<(usize, usize)> {
988 let bytes = text.as_bytes();
989 let mut groups = Vec::new();
990 let mut group_start: Option<usize> = None;
991 let mut line_start = 0usize;
992
993 while line_start < bytes.len() {
994 let line_end = next_line_end_bytes(bytes, line_start);
995 let content_end = trim_line_ending_bytes(bytes, line_start, line_end);
996 let is_blank = is_blank_line_bytes(bytes, line_start, content_end);
997
998 match (group_start, is_blank) {
999 (None, false) => {
1000 group_start = Some(line_start);
1001 }
1002 (Some(start), true) => {
1003 let end = trim_trailing_newlines_bytes(bytes, line_start);
1004 if end > start {
1005 groups.push((start, end));
1006 }
1007 group_start = None;
1008 }
1009 _ => {}
1010 }
1011
1012 line_start = line_end;
1013 }
1014
1015 if let Some(start) = group_start {
1016 let end = trim_trailing_newlines_bytes(bytes, bytes.len());
1017 if end > start {
1018 groups.push((start, end));
1019 }
1020 }
1021
1022 groups
1023}
1024
/// Returns the index just past the first `\n` at or after `start`, or `bytes.len()` when
/// there is no further newline (including when `start` is past the end).
fn next_line_end_bytes(bytes: &[u8], start: usize) -> usize {
    bytes
        .get(start..)
        .and_then(|rest| rest.iter().position(|&byte| byte == b'\n'))
        .map_or(bytes.len(), |newline_at| start + newline_at + 1)
}
1035
/// Moves `end` back over trailing `\n` and `\r` bytes, never going below `start`, and
/// returns the resulting index.
fn trim_line_ending_bytes(bytes: &[u8], start: usize, end: usize) -> usize {
    let mut cursor = end;
    loop {
        if cursor <= start {
            break cursor;
        }
        match bytes[cursor - 1] {
            b'\n' | b'\r' => cursor -= 1,
            _ => break cursor,
        }
    }
}
1043
/// True when `bytes[start..end]` contains nothing but spaces and tabs (an empty range
/// counts as blank).
///
/// # Panics
/// Panics if `start..end` is not a valid range of `bytes`.
fn is_blank_line_bytes(bytes: &[u8], start: usize, end: usize) -> bool {
    let line = &bytes[start..end];
    !line.iter().any(|&byte| byte != b' ' && byte != b'\t')
}
1049
/// Moves `end` back over every `\n` and `\r` byte immediately before it and returns the
/// resulting index (0 when only line-ending bytes precede `end`).
///
/// # Panics
/// Panics if `end` is greater than `bytes.len()`.
fn trim_trailing_newlines_bytes(bytes: &[u8], end: usize) -> usize {
    bytes[..end]
        .iter()
        .rposition(|byte| !matches!(byte, b'\n' | b'\r'))
        .map_or(0, |last_content| last_content + 1)
}
1057
/// Returns the `(start, end)` byte span of every whitespace-separated token in `text`,
/// in order of appearance. `end` is exclusive.
fn token_byte_spans(text: &str) -> Vec<(usize, usize)> {
    let mut spans = Vec::new();
    let mut chars = text.char_indices().peekable();

    while let Some((start, first)) = chars.next() {
        if first.is_whitespace() {
            continue;
        }

        // Consume the rest of the token, tracking the byte just past its last character.
        let mut end = start + first.len_utf8();
        while let Some(&(index, ch)) = chars.peek() {
            if ch.is_whitespace() {
                break;
            }
            end = index + ch.len_utf8();
            chars.next();
        }
        spans.push((start, end));
    }

    spans
}
1078
1079fn finalize_chunk(blocks: &[ExtractedBlock], table_header_mode: TableHeaderMode) -> FinalChunk {
1080 let start = blocks.first().map(|block| block.offset).unwrap_or(0);
1081 let end = blocks
1082 .last()
1083 .map(|block| block.offset.saturating_add(block.length))
1084 .unwrap_or(start);
1085 let mut text = blocks
1086 .iter()
1087 .map(|block| block.text.as_str())
1088 .collect::<Vec<_>>()
1089 .join("\n\n");
1090 let heading = resolve_heading(blocks);
1091 let kind = derive_chunk_kind(blocks);
1092 let retrieval_prefix = if matches!(table_header_mode, TableHeaderMode::CanonicalBlocks) {
1093 table_header_prefix_for_body(blocks, text.as_str()).map(ToString::to_string)
1094 } else {
1095 None
1096 };
1097 if kind == FinalChunkKind::Table {
1098 let has_header = blocks
1099 .iter()
1100 .any(|block| block.kind == BlockKind::TableHeader);
1101 if matches!(table_header_mode, TableHeaderMode::SourceBlocks) && !has_header {
1102 if let Some(header) = blocks
1103 .first()
1104 .and_then(|block| block.attrs.get(TABLE_HEADER_ATTR))
1105 .map(String::as_str)
1106 {
1107 text = format!("{header}\n{text}");
1108 }
1109 }
1110 }
1111
1112 FinalChunk {
1113 text,
1114 retrieval_prefix,
1115 offset: start,
1116 length: end.saturating_sub(start),
1117 heading,
1118 kind,
1119 }
1120}
1121
1122fn table_header_prefix_for_body<'a>(blocks: &'a [ExtractedBlock], body: &str) -> Option<&'a str> {
1123 let kind = derive_chunk_kind(blocks);
1124 if kind != FinalChunkKind::Table {
1125 return None;
1126 }
1127
1128 let has_header = blocks
1129 .iter()
1130 .any(|block| block.kind == BlockKind::TableHeader);
1131 if has_header {
1132 return None;
1133 }
1134
1135 let header = blocks
1136 .first()
1137 .and_then(|block| block.attrs.get(TABLE_RETRIEVAL_PREFIX_ATTR))
1138 .map(String::as_str)
1139 .map(str::trim)
1140 .filter(|header| !header.is_empty())?;
1141
1142 let body_trimmed = body.trim_start();
1143 if body_trimmed == header || body_trimmed.starts_with(&format!("{header}\n")) {
1144 None
1145 } else {
1146 Some(header)
1147 }
1148}
1149
/// When `text` begins with `header` immediately followed by a newline, returns the byte
/// index just past that newline; otherwise returns `None`.
fn canonical_table_row_body_start(text: &str, header: &str) -> Option<usize> {
    let after_header = text.strip_prefix(header)?;
    if after_header.starts_with('\n') {
        Some(header.len() + 1)
    } else {
        None
    }
}
1155
1156fn resolve_heading(blocks: &[ExtractedBlock]) -> Option<String> {
1157 blocks
1158 .iter()
1159 .rev()
1160 .find_map(|block| (!block.heading_path.is_empty()).then(|| block.heading_path.join(" > ")))
1161}
1162
1163pub fn derive_chunk_kind(blocks: &[ExtractedBlock]) -> FinalChunkKind {
1164 if blocks.is_empty() {
1165 return FinalChunkKind::Mixed;
1166 }
1167
1168 if blocks
1169 .iter()
1170 .all(|block| block.kind == BlockKind::CodeFence)
1171 {
1172 return FinalChunkKind::Code;
1173 }
1174
1175 if blocks
1176 .iter()
1177 .all(|block| matches!(block.kind, BlockKind::TableHeader | BlockKind::TableRow))
1178 {
1179 return FinalChunkKind::Table;
1180 }
1181
1182 if blocks.iter().all(|block| {
1183 matches!(
1184 block.kind,
1185 BlockKind::Paragraph | BlockKind::ListItem | BlockKind::BlockQuote
1186 )
1187 }) {
1188 return FinalChunkKind::Paragraph;
1189 }
1190
1191 if blocks.iter().any(|block| block.kind == BlockKind::Heading)
1192 && blocks.iter().all(|block| {
1193 matches!(
1194 block.kind,
1195 BlockKind::Heading
1196 | BlockKind::Paragraph
1197 | BlockKind::ListItem
1198 | BlockKind::BlockQuote
1199 )
1200 })
1201 {
1202 return FinalChunkKind::Section;
1203 }
1204
1205 FinalChunkKind::Mixed
1206}
1207
// Unit tests for policy resolution, chunk-kind derivation, token counting,
// block packing, and forced splitting.
#[cfg(test)]
mod tests {
    use std::cell::Cell;
    use std::collections::HashMap;

    use crate::config::{ChunkPolicy, ChunkingConfig};
    use crate::ingest::chunk::{
        best_narrative_cut, can_pack_together, chunk_canonical_document, chunk_document,
        chunk_document_with_counter, derive_chunk_kind, resolve_policy, score_narrative_cut,
        FinalChunkKind, NarrativeBoundary, TokenCounter, WhitespaceTokenCounter,
    };
    use crate::ingest::extract::{BlockKind, ExtractedBlock, ExtractedDocument};

    /// Config fixture with distinct default values and a single `"md"` profile,
    /// so tests can tell which policy was resolved.
    fn baseline_config() -> ChunkingConfig {
        ChunkingConfig {
            defaults: ChunkPolicy {
                target_tokens: 800,
                soft_max_tokens: 950,
                hard_max_tokens: 1200,
                boundary_overlap_tokens: 48,
                neighbor_window: 1,
                contextual_prefix: true,
            },
            profiles: HashMap::from([(
                "md".to_string(),
                ChunkPolicy {
                    target_tokens: 300,
                    soft_max_tokens: 360,
                    hard_max_tokens: 480,
                    boundary_overlap_tokens: 24,
                    neighbor_window: 2,
                    contextual_prefix: false,
                },
            )]),
        }
    }

    // ---- resolve_policy ----

    #[test]
    fn resolve_policy_prefers_cli_override() {
        let config = baseline_config();
        let override_policy = ChunkPolicy {
            target_tokens: 128,
            soft_max_tokens: 160,
            hard_max_tokens: 196,
            boundary_overlap_tokens: 16,
            neighbor_window: 3,
            contextual_prefix: false,
        };

        let resolved = resolve_policy(&config, Some("md"), Some(&override_policy));
        assert_eq!(resolved, override_policy);
    }

    #[test]
    fn resolve_policy_uses_normalized_profile_key() {
        let config = baseline_config();

        // ".MD" must resolve to the "md" profile.
        let resolved = resolve_policy(&config, Some(".MD"), None);
        assert_eq!(resolved.target_tokens, 300);
        assert_eq!(resolved.soft_max_tokens, 360);
        assert_eq!(resolved.hard_max_tokens, 480);
        assert_eq!(resolved.boundary_overlap_tokens, 24);
        assert_eq!(resolved.neighbor_window, 2);
        assert!(!resolved.contextual_prefix);
    }

    #[test]
    fn resolve_policy_falls_back_to_defaults() {
        let config = baseline_config();

        let resolved = resolve_policy(&config, Some("txt"), None);
        assert_eq!(resolved, config.defaults);
    }

    /// Minimal one-character block of the given kind, with no heading path or attrs.
    fn block(kind: BlockKind) -> ExtractedBlock {
        ExtractedBlock {
            text: "x".to_string(),
            offset: 0,
            length: 1,
            kind,
            heading_path: vec![],
            attrs: HashMap::new(),
        }
    }

    /// Block with explicit text, offset and heading path; `length` is the byte
    /// length of `text` and `attrs` is empty.
    fn block_with(
        kind: BlockKind,
        text: &str,
        offset: usize,
        heading_path: &[&str],
    ) -> ExtractedBlock {
        ExtractedBlock {
            text: text.to_string(),
            offset,
            length: text.len(),
            kind,
            heading_path: heading_path.iter().map(|value| value.to_string()).collect(),
            attrs: HashMap::new(),
        }
    }

    // ---- derive_chunk_kind ----

    #[test]
    fn derive_chunk_kind_code_only_is_code() {
        let blocks = vec![block(BlockKind::CodeFence), block(BlockKind::CodeFence)];
        assert_eq!(derive_chunk_kind(&blocks), FinalChunkKind::Code);
    }

    #[test]
    fn derive_chunk_kind_table_only_is_table() {
        let blocks = vec![block(BlockKind::TableHeader), block(BlockKind::TableRow)];
        assert_eq!(derive_chunk_kind(&blocks), FinalChunkKind::Table);
    }

    #[test]
    fn derive_chunk_kind_narrative_without_heading_is_paragraph() {
        let blocks = vec![block(BlockKind::Paragraph), block(BlockKind::ListItem)];
        assert_eq!(derive_chunk_kind(&blocks), FinalChunkKind::Paragraph);
    }

    #[test]
    fn derive_chunk_kind_heading_scoped_narrative_is_section() {
        let blocks = vec![block(BlockKind::Heading), block(BlockKind::Paragraph)];
        assert_eq!(derive_chunk_kind(&blocks), FinalChunkKind::Section);
    }

    #[test]
    fn derive_chunk_kind_mixed_content_is_mixed() {
        let blocks = vec![block(BlockKind::CodeFence), block(BlockKind::Paragraph)];
        assert_eq!(derive_chunk_kind(&blocks), FinalChunkKind::Mixed);
    }

    // ---- FinalChunkKind storage labels ----

    #[test]
    fn chunk_kind_storage_labels_are_stable() {
        assert_eq!(FinalChunkKind::Section.as_storage_kind(), "section");
        assert_eq!(FinalChunkKind::Paragraph.as_storage_kind(), "paragraph");
        assert_eq!(FinalChunkKind::Code.as_storage_kind(), "code");
        assert_eq!(FinalChunkKind::Table.as_storage_kind(), "table");
        assert_eq!(FinalChunkKind::Mixed.as_storage_kind(), "mixed");
    }

    #[test]
    fn chunk_kind_parses_storage_labels() {
        assert_eq!(
            FinalChunkKind::try_from("section").expect("parse section"),
            FinalChunkKind::Section
        );
        assert_eq!(
            FinalChunkKind::try_from("paragraph").expect("parse paragraph"),
            FinalChunkKind::Paragraph
        );
        assert_eq!(
            FinalChunkKind::try_from("code").expect("parse code"),
            FinalChunkKind::Code
        );
        assert_eq!(
            FinalChunkKind::try_from("table").expect("parse table"),
            FinalChunkKind::Table
        );
        assert_eq!(
            FinalChunkKind::try_from("mixed").expect("parse mixed"),
            FinalChunkKind::Mixed
        );
    }

    #[test]
    fn chunk_kind_rejects_unknown_storage_labels() {
        let err = FinalChunkKind::try_from("unknown").expect_err("unknown label should fail");
        assert!(err.to_string().contains("invalid stored chunk kind"));
    }

    // ---- token counters ----

    #[test]
    fn whitespace_token_counter_counts_word_boundaries() {
        let counter = WhitespaceTokenCounter;
        assert_eq!(counter.count("").expect("count empty"), 0);
        assert_eq!(counter.count("alpha").expect("count token"), 1);
        assert_eq!(
            counter
                .count("alpha beta\tgamma\n\ndelta")
                .expect("count whitespace"),
            4
        );
    }

    /// Counter that charges one extra token per `"\n\n"` separator on top of
    /// the whitespace-delimited word count.
    struct SeparatorAwareCounter;

    impl TokenCounter for SeparatorAwareCounter {
        fn count(&self, text: &str) -> crate::Result<usize> {
            Ok(text.split_whitespace().count() + text.matches("\n\n").count())
        }
    }

    /// Counter that treats every `char` as one token.
    struct CharCountCounter;

    impl TokenCounter for CharCountCounter {
        fn count(&self, text: &str) -> crate::Result<usize> {
            Ok(text.chars().count())
        }
    }

    /// Counter whose `count` records each call and reports `usize::MAX`, while
    /// its byte-length shortcut accepts any input with `byte_len + 2 <= max_tokens`.
    struct ByteBoundCounter {
        calls: Cell<usize>,
    }

    impl TokenCounter for ByteBoundCounter {
        fn count(&self, _text: &str) -> crate::Result<usize> {
            self.calls.set(self.calls.get() + 1);
            Ok(usize::MAX)
        }

        fn fits_within_token_limit_by_byte_len(&self, byte_len: usize, max_tokens: usize) -> bool {
            byte_len
                .checked_add(2)
                .is_some_and(|upper_bound| upper_bound <= max_tokens)
        }
    }

    // ---- chunking with custom counters ----

    #[test]
    fn chunk_document_skips_count_when_byte_length_guarantees_fit() {
        let policy = ChunkPolicy {
            target_tokens: 8,
            soft_max_tokens: 8,
            hard_max_tokens: 8,
            boundary_overlap_tokens: 0,
            neighbor_window: 1,
            contextual_prefix: true,
        };
        let document = ExtractedDocument {
            blocks: vec![block_with(BlockKind::Paragraph, "alpha", 0, &[])],
            metadata: HashMap::new(),
            title: None,
        };
        let counter = ByteBoundCounter {
            calls: Cell::new(0),
        };

        let chunks = chunk_document_with_counter(&document, &policy, &counter)
            .expect("chunk with byte-bound counter");

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].text, "alpha");
        assert_eq!(counter.calls.get(), 0);
    }

    #[test]
    fn chunk_document_with_counter_sizes_candidate_chunk_text_not_additive_blocks() {
        let policy = ChunkPolicy {
            target_tokens: 2,
            soft_max_tokens: 2,
            hard_max_tokens: 8,
            boundary_overlap_tokens: 0,
            neighbor_window: 1,
            contextual_prefix: true,
        };
        let document = ExtractedDocument {
            blocks: vec![
                block_with(BlockKind::Paragraph, "alpha", 0, &[]),
                block_with(BlockKind::Paragraph, "beta", 8, &[]),
            ],
            metadata: HashMap::new(),
            title: None,
        };

        let chunks = chunk_document_with_counter(&document, &policy, &SeparatorAwareCounter)
            .expect("chunk with separator-aware counter");
        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].text, "alpha");
        assert_eq!(chunks[1].text, "beta");
    }

    #[test]
    fn canonical_chunk_text_matches_stored_span_after_split_fragments_pack() {
        let policy = ChunkPolicy {
            target_tokens: 2,
            soft_max_tokens: 6,
            hard_max_tokens: 3,
            boundary_overlap_tokens: 0,
            neighbor_window: 1,
            contextual_prefix: true,
        };
        let text = "one two three four five six";
        let document = ExtractedDocument {
            blocks: vec![block_with(BlockKind::Paragraph, text, 0, &[])],
            metadata: HashMap::new(),
            title: None,
        };

        let chunks = chunk_canonical_document(&document, &policy);

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].offset, 0);
        assert_eq!(chunks[0].length, text.len());
        assert_eq!(chunks[0].text, text);
    }

    #[test]
    fn chunk_document_with_counter_falls_back_to_char_boundaries_for_single_oversized_token() {
        let policy = ChunkPolicy {
            target_tokens: 4,
            soft_max_tokens: 4,
            hard_max_tokens: 4,
            boundary_overlap_tokens: 0,
            neighbor_window: 1,
            contextual_prefix: true,
        };
        let document = ExtractedDocument {
            blocks: vec![block_with(BlockKind::CodeFence, "abcdefghij", 0, &[])],
            metadata: HashMap::new(),
            title: None,
        };

        let chunks = chunk_document_with_counter(&document, &policy, &CharCountCounter)
            .expect("chunk oversized single token");
        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].text, "abcd");
        assert_eq!(chunks[1].text, "efgh");
        assert_eq!(chunks[2].text, "ij");
    }

    // ---- packing and heading resolution ----

    #[test]
    fn chunk_document_packs_adjacent_blocks_within_soft_max() {
        let policy = ChunkPolicy {
            target_tokens: 3,
            soft_max_tokens: 4,
            hard_max_tokens: 8,
            boundary_overlap_tokens: 0,
            neighbor_window: 1,
            contextual_prefix: true,
        };
        let document = ExtractedDocument {
            blocks: vec![
                block_with(BlockKind::Paragraph, "alpha beta", 0, &[]),
                block_with(BlockKind::Paragraph, "gamma", 12, &[]),
                block_with(BlockKind::Paragraph, "delta epsilon", 20, &[]),
            ],
            metadata: HashMap::new(),
            title: None,
        };

        let chunks = chunk_document(&document, &policy);
        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].text, "alpha beta\n\ngamma");
        assert_eq!(chunks[0].offset, 0);
        assert_eq!(chunks[0].length, 17);
        assert_eq!(chunks[0].kind, FinalChunkKind::Paragraph);
        assert_eq!(chunks[1].text, "delta epsilon");
    }

    #[test]
    fn chunk_document_resolves_heading_from_structural_path() {
        let policy = ChunkPolicy {
            target_tokens: 2,
            soft_max_tokens: 4,
            hard_max_tokens: 8,
            boundary_overlap_tokens: 0,
            neighbor_window: 1,
            contextual_prefix: true,
        };
        let document = ExtractedDocument {
            blocks: vec![block_with(
                BlockKind::Paragraph,
                "body text",
                100,
                &["Guide", "Intro"],
            )],
            metadata: HashMap::new(),
            title: Some("Doc".to_string()),
        };

        let chunks = chunk_document(&document, &policy);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].heading.as_deref(), Some("Guide > Intro"));
    }

    #[test]
    fn chunk_document_with_counter_handles_empty_input() {
        let policy = ChunkPolicy::default();
        let document = ExtractedDocument {
            blocks: vec![],
            metadata: HashMap::new(),
            title: None,
        };
        let counter = WhitespaceTokenCounter;

        let chunks =
            chunk_document_with_counter(&document, &policy, &counter).expect("chunk empty input");
        assert!(chunks.is_empty());
    }

    // ---- forced splits of oversized blocks ----

    #[test]
    fn chunk_document_splits_oversized_block_at_hard_max_with_overlap() {
        let policy = ChunkPolicy {
            target_tokens: 4,
            soft_max_tokens: 4,
            hard_max_tokens: 4,
            boundary_overlap_tokens: 1,
            neighbor_window: 1,
            contextual_prefix: true,
        };
        let document = ExtractedDocument {
            blocks: vec![block_with(
                BlockKind::Paragraph,
                "one two three four five six seven eight nine ten",
                10,
                &["Doc"],
            )],
            metadata: HashMap::new(),
            title: None,
        };

        let chunks = chunk_document(&document, &policy);
        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].text, "one two three four");
        assert_eq!(chunks[1].text, "four five six seven");
        assert_eq!(chunks[2].text, "seven eight nine ten");

        assert_eq!(chunks[0].offset, 10);
        assert_eq!(chunks[0].length, 18);
        assert_eq!(chunks[1].offset, 24);
        assert_eq!(chunks[1].length, 19);
        assert_eq!(chunks[2].offset, 38);
        assert_eq!(chunks[2].length, 20);
    }

    #[test]
    fn chunk_document_prefers_sentence_boundaries_for_narrative_forced_split() {
        let policy = ChunkPolicy {
            target_tokens: 4,
            soft_max_tokens: 4,
            hard_max_tokens: 4,
            boundary_overlap_tokens: 0,
            neighbor_window: 1,
            contextual_prefix: true,
        };
        let document = ExtractedDocument {
            blocks: vec![block_with(
                BlockKind::Paragraph,
                "alpha one. beta two three. gamma four five. delta six seven.",
                0,
                &["Doc"],
            )],
            metadata: HashMap::new(),
            title: None,
        };

        let chunks = chunk_document(&document, &policy);
        assert_eq!(chunks.len(), 4);
        assert_eq!(chunks[0].text, "alpha one.");
        assert_eq!(chunks[1].text, "beta two three.");
        assert_eq!(chunks[2].text, "gamma four five.");
        assert_eq!(chunks[3].text, "delta six seven.");
    }

    #[test]
    fn chunk_document_keeps_token_window_split_for_non_narrative_blocks() {
        let policy = ChunkPolicy {
            target_tokens: 4,
            soft_max_tokens: 4,
            hard_max_tokens: 4,
            boundary_overlap_tokens: 0,
            neighbor_window: 1,
            contextual_prefix: true,
        };
        let document = ExtractedDocument {
            blocks: vec![block_with(
                BlockKind::CodeFence,
                "alpha. beta gamma delta epsilon zeta",
                0,
                &[],
            )],
            metadata: HashMap::new(),
            title: None,
        };

        let chunks = chunk_document(&document, &policy);
        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].text, "alpha. beta gamma delta");
        assert_eq!(chunks[1].text, "epsilon zeta");
        assert!(chunks
            .iter()
            .all(|chunk| chunk.kind == FinalChunkKind::Code));
    }

    // ---- tables ----

    #[test]
    fn chunk_document_carries_table_header_for_row_only_chunks() {
        let policy = ChunkPolicy {
            target_tokens: 4,
            soft_max_tokens: 4,
            hard_max_tokens: 4,
            boundary_overlap_tokens: 0,
            neighbor_window: 1,
            contextual_prefix: true,
        };
        let document = ExtractedDocument {
            blocks: vec![
                block_with(BlockKind::TableHeader, "h1 h2 h3 h4", 0, &[]),
                block_with(BlockKind::TableRow, "r1a r1b r1c r1d", 20, &[]),
                block_with(BlockKind::TableRow, "r2a r2b r2c r2d", 40, &[]),
            ],
            metadata: HashMap::new(),
            title: None,
        };

        let chunks = chunk_document(&document, &policy);
        assert_eq!(chunks.len(), 3);

        assert_eq!(chunks[0].text, "h1 h2 h3 h4");
        assert_eq!(chunks[0].offset, 0);
        assert_eq!(chunks[0].length, 11);

        // Row-only chunks get the header prepended to their text, while offset
        // and length still describe the row's own span.
        assert_eq!(chunks[1].text, "h1 h2 h3 h4\nr1a r1b r1c r1d");
        assert_eq!(chunks[1].offset, 20);
        assert_eq!(chunks[1].length, 15);

        assert_eq!(chunks[2].text, "h1 h2 h3 h4\nr2a r2b r2c r2d");
        assert_eq!(chunks[2].offset, 40);
        assert_eq!(chunks[2].length, 15);
        assert!(chunks
            .iter()
            .all(|chunk| chunk.kind == FinalChunkKind::Table));
    }

    #[test]
    fn canonical_table_row_splits_keep_exact_text_and_prefix_continuations() {
        let policy = ChunkPolicy {
            target_tokens: 4,
            soft_max_tokens: 4,
            hard_max_tokens: 5,
            boundary_overlap_tokens: 0,
            neighbor_window: 1,
            contextual_prefix: true,
        };
        let header = "sku price status";
        let row = "alpha beta gamma delta epsilon zeta eta theta iota kappa";
        let document = ExtractedDocument {
            blocks: vec![
                block_with(BlockKind::TableHeader, header, 0, &[]),
                block_with(BlockKind::TableRow, row, 20, &[]),
            ],
            metadata: HashMap::new(),
            title: None,
        };
        let canonical = crate::ingest::canonical::build_canonical_document(&document);

        let chunks = chunk_canonical_document(&canonical.document, &policy);

        assert!(
            chunks
                .iter()
                .any(|chunk| chunk.retrieval_prefix.as_deref() == Some(header)),
            "expected at least one split continuation with table header prefix"
        );
        for chunk in &chunks {
            // Chunk text must be the exact canonical span it claims to cover.
            assert_eq!(
                chunk.text,
                canonical.text[chunk.offset..chunk.offset + chunk.length]
            );
            if chunk.retrieval_prefix.as_deref() == Some(header) {
                assert!(!chunk.text.trim_start().starts_with(header));
                assert!(chunk.retrieval_text().starts_with("sku price status\n"));
            }
        }
    }

    #[test]
    fn canonical_table_row_with_oversized_header_still_chunks() {
        let policy = ChunkPolicy {
            target_tokens: 4,
            soft_max_tokens: 4,
            hard_max_tokens: 5,
            boundary_overlap_tokens: 0,
            neighbor_window: 1,
            contextual_prefix: true,
        };
        let header = "h1 h2 h3 h4 h5 h6 h7 h8";
        let document = ExtractedDocument {
            blocks: vec![
                block_with(BlockKind::TableHeader, header, 0, &[]),
                block_with(BlockKind::TableRow, "alpha beta gamma tailneedle", 30, &[]),
            ],
            metadata: HashMap::new(),
            title: None,
        };
        let canonical = crate::ingest::canonical::build_canonical_document(&document);

        let chunks = chunk_canonical_document(&canonical.document, &policy);

        assert!(
            chunks.iter().any(|chunk| chunk.text.contains("tailneedle")),
            "expected oversized-header row body to remain chunkable"
        );
        assert!(
            chunks.iter().all(|chunk| chunk.retrieval_prefix.is_none()),
            "oversized headers should not be persisted as retrieval prefixes: {chunks:?}"
        );
        for chunk in &chunks {
            assert_eq!(
                chunk.text,
                canonical.text[chunk.offset..chunk.offset + chunk.length]
            );
        }
    }

    // ---- structural flush boundaries ----

    #[test]
    fn chunk_document_flushes_on_structural_boundary_even_under_budget() {
        let policy = ChunkPolicy {
            target_tokens: 20,
            soft_max_tokens: 30,
            hard_max_tokens: 30,
            boundary_overlap_tokens: 0,
            neighbor_window: 1,
            contextual_prefix: true,
        };
        let document = ExtractedDocument {
            blocks: vec![
                block_with(BlockKind::Heading, "# Intro", 0, &[]),
                block_with(BlockKind::Paragraph, "alpha beta", 8, &["Intro"]),
                block_with(BlockKind::CodeFence, "fn alpha() {}", 20, &["Intro"]),
                block_with(BlockKind::Paragraph, "gamma delta", 34, &["Intro"]),
            ],
            metadata: HashMap::new(),
            title: None,
        };

        let chunks = chunk_document(&document, &policy);
        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].kind, FinalChunkKind::Section);
        assert_eq!(chunks[1].kind, FinalChunkKind::Code);
        assert_eq!(chunks[2].kind, FinalChunkKind::Paragraph);
    }

    #[test]
    fn chunk_document_flushes_on_heading_transition_even_under_budget() {
        let policy = ChunkPolicy {
            target_tokens: 20,
            soft_max_tokens: 50,
            hard_max_tokens: 50,
            boundary_overlap_tokens: 0,
            neighbor_window: 1,
            contextual_prefix: true,
        };
        let document = ExtractedDocument {
            blocks: vec![
                block_with(BlockKind::Heading, "# Intro", 0, &[]),
                block_with(BlockKind::Paragraph, "alpha beta", 8, &["Intro"]),
                block_with(BlockKind::Heading, "## Setup", 20, &["Intro"]),
                block_with(BlockKind::Paragraph, "gamma delta", 30, &["Intro", "Setup"]),
            ],
            metadata: HashMap::new(),
            title: None,
        };

        let chunks = chunk_document(&document, &policy);
        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].kind, FinalChunkKind::Section);
        assert_eq!(chunks[0].heading.as_deref(), Some("Intro"));
        assert_eq!(chunks[1].kind, FinalChunkKind::Section);
        assert_eq!(chunks[1].heading.as_deref(), Some("Intro > Setup"));
    }

    #[test]
    fn chunk_document_flushes_when_narrative_heading_path_changes() {
        let policy = ChunkPolicy {
            target_tokens: 20,
            soft_max_tokens: 50,
            hard_max_tokens: 50,
            boundary_overlap_tokens: 0,
            neighbor_window: 1,
            contextual_prefix: true,
        };
        let document = ExtractedDocument {
            blocks: vec![
                block_with(BlockKind::Paragraph, "alpha beta", 0, &["Intro"]),
                block_with(BlockKind::Paragraph, "gamma delta", 12, &["Setup"]),
            ],
            metadata: HashMap::new(),
            title: None,
        };

        let chunks = chunk_document(&document, &policy);
        assert_eq!(chunks.len(), 2);
        assert!(chunks
            .iter()
            .all(|chunk| chunk.kind == FinalChunkKind::Paragraph));
    }

    // ---- can_pack_together ----

    #[test]
    fn can_pack_together_allows_same_narrative_family() {
        let heading = block_with(BlockKind::Heading, "# Intro", 0, &[]);
        let paragraph = block_with(BlockKind::Paragraph, "text", 10, &[]);
        assert!(can_pack_together(&heading, &paragraph));
    }

    #[test]
    fn can_pack_together_rejects_cross_family_boundaries() {
        let paragraph = block_with(BlockKind::Paragraph, "text", 0, &[]);
        let code = block_with(BlockKind::CodeFence, "fn a() {}", 10, &[]);
        let table = block_with(BlockKind::TableRow, "|a|b|", 20, &[]);
        assert!(!can_pack_together(&paragraph, &code));
        assert!(!can_pack_together(&paragraph, &table));
        assert!(!can_pack_together(&code, &table));
    }

    #[test]
    fn can_pack_together_rejects_heading_start_when_chunk_is_open() {
        let paragraph = block_with(BlockKind::Paragraph, "text", 0, &["Intro"]);
        let heading = block_with(BlockKind::Heading, "## Setup", 10, &["Intro"]);
        assert!(!can_pack_together(&paragraph, &heading));
    }

    #[test]
    fn can_pack_together_rejects_narrative_heading_path_mismatch() {
        let intro = block_with(BlockKind::Paragraph, "alpha", 0, &["Intro"]);
        let setup = block_with(BlockKind::Paragraph, "beta", 10, &["Setup"]);
        assert!(!can_pack_together(&intro, &setup));
    }

    #[test]
    fn can_pack_together_treats_html_as_opaque() {
        let html = block_with(BlockKind::HtmlBlock, "<div>x</div>", 0, &[]);
        let html2 = block_with(BlockKind::HtmlBlock, "<div>y</div>", 20, &[]);
        let paragraph = block_with(BlockKind::Paragraph, "text", 40, &[]);
        assert!(!can_pack_together(&html, &html2));
        assert!(!can_pack_together(&html, &paragraph));
    }

    // ---- progress guarantees with large overlap ----

    #[test]
    fn token_split_makes_progress_when_overlap_exceeds_window() {
        let policy = ChunkPolicy {
            target_tokens: 2,
            soft_max_tokens: 2,
            hard_max_tokens: 2,
            boundary_overlap_tokens: 10,
            neighbor_window: 1,
            contextual_prefix: true,
        };
        let document = ExtractedDocument {
            blocks: vec![block_with(BlockKind::CodeFence, "a b c d e f", 0, &[])],
            metadata: HashMap::new(),
            title: None,
        };

        let chunks = chunk_document(&document, &policy);
        assert_eq!(chunks.len(), 5);
        assert!(chunks.iter().all(|chunk| !chunk.text.is_empty()));
    }

    #[test]
    fn narrative_sentence_split_makes_progress_with_high_overlap() {
        let policy = ChunkPolicy {
            target_tokens: 4,
            soft_max_tokens: 4,
            hard_max_tokens: 4,
            boundary_overlap_tokens: 10,
            neighbor_window: 1,
            contextual_prefix: true,
        };
        let document = ExtractedDocument {
            blocks: vec![block_with(
                BlockKind::Paragraph,
                "one two. three four. five six. seven eight.",
                0,
                &[],
            )],
            metadata: HashMap::new(),
            title: None,
        };

        let chunks = chunk_document(&document, &policy);
        assert!(chunks.len() >= 2);
        assert!(chunks.iter().all(|chunk| !chunk.text.is_empty()));
    }

    // ---- boundary preferences ----

    #[test]
    fn code_forced_split_prefers_blank_line_boundaries() {
        let policy = ChunkPolicy {
            target_tokens: 4,
            soft_max_tokens: 4,
            hard_max_tokens: 6,
            boundary_overlap_tokens: 0,
            neighbor_window: 1,
            contextual_prefix: true,
        };
        let document = ExtractedDocument {
            blocks: vec![block_with(
                BlockKind::CodeFence,
                "a1 a2 a3 a4\n\na5 a6 a7 a8\n\na9 a10 a11 a12",
                0,
                &[],
            )],
            metadata: HashMap::new(),
            title: None,
        };

        let chunks = chunk_document(&document, &policy);
        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0].text, "a1 a2 a3 a4");
        assert_eq!(chunks[1].text, "a5 a6 a7 a8");
        assert_eq!(chunks[2].text, "a9 a10 a11 a12");
        assert!(chunks
            .iter()
            .all(|chunk| chunk.kind == FinalChunkKind::Code));
    }

    #[test]
    fn narrative_forced_split_uses_clause_boundaries_when_sentences_absent() {
        let policy = ChunkPolicy {
            target_tokens: 3,
            soft_max_tokens: 3,
            hard_max_tokens: 3,
            boundary_overlap_tokens: 0,
            neighbor_window: 1,
            contextual_prefix: true,
        };
        let document = ExtractedDocument {
            blocks: vec![block_with(
                BlockKind::Paragraph,
                "alpha beta, gamma delta, epsilon zeta, eta theta",
                0,
                &[],
            )],
            metadata: HashMap::new(),
            title: None,
        };

        let chunks = chunk_document(&document, &policy);
        assert_eq!(chunks.len(), 4);
        assert_eq!(chunks[0].text, "alpha beta,");
        assert_eq!(chunks[1].text, "gamma delta,");
        assert_eq!(chunks[2].text, "epsilon zeta,");
        assert_eq!(chunks[3].text, "eta theta");
    }

    #[test]
    fn score_narrative_cut_prefers_sentence_boundary_over_clause() {
        let sentence = score_narrative_cut(8, 0, 8, 20, NarrativeBoundary::Sentence);
        let clause = score_narrative_cut(8, 0, 8, 20, NarrativeBoundary::Clause);
        let token = score_narrative_cut(8, 0, 8, 20, NarrativeBoundary::TokenWindow);
        assert!(sentence > clause);
        assert!(clause > token);
    }

    #[test]
    fn score_narrative_cut_prefers_proximity_to_target() {
        let close = score_narrative_cut(8, 0, 8, 20, NarrativeBoundary::Sentence);
        let far = score_narrative_cut(8, 0, 5, 20, NarrativeBoundary::Sentence);
        assert!(close > far);
    }

    #[test]
    fn best_narrative_cut_penalizes_tiny_tail_when_choices_are_similar() {
        let selected = best_narrative_cut(
            &[
                (8, NarrativeBoundary::Sentence),
                (9, NarrativeBoundary::Sentence),
            ],
            8,
            0,
            10,
        )
        .expect("pick best cut");
        assert_eq!(selected, 8);
    }
}