kbolt_core/ingest/
chunk.rs

1use crate::config::{ChunkPolicy, ChunkingConfig};
2use crate::ingest::extract::{BlockKind, ExtractedBlock, ExtractedDocument};
3use crate::retrieval_context::{render_structural_body, ChunkRetrievalContext};
4use crate::Result;
5use kbolt_types::KboltError;
6
7const TABLE_HEADER_ATTR: &str = "__kbolt_table_header";
8const TABLE_RETRIEVAL_PREFIX_ATTR: &str = "__kbolt_table_retrieval_prefix";
9
10#[derive(Debug, Clone, PartialEq, Eq)]
11pub struct FinalChunk {
12    pub text: String,
13    pub retrieval_prefix: Option<String>,
14    pub offset: usize,
15    pub length: usize,
16    pub heading: Option<String>,
17    pub kind: FinalChunkKind,
18}
19
/// Content classification assigned to a [`FinalChunk`].
///
/// Each variant has a stable lowercase storage label; see `as_storage_kind`.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum FinalChunkKind {
    /// Stored as `"section"`.
    Section,
    /// Stored as `"paragraph"`.
    Paragraph,
    /// Stored as `"code"`.
    Code,
    /// Stored as `"table"`.
    Table,
    /// Stored as `"mixed"`.
    Mixed,
}
28
/// Coarse grouping of block kinds used to decide whether adjacent blocks may
/// share a chunk.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum PackClass {
    /// Headings, paragraphs, list items and block quotes.
    Narrative,
    /// Fenced code.
    Code,
    /// Table headers and rows.
    Table,
    /// HTML blocks; never packed with a neighbour.
    Opaque,
}
36
/// Kind of cut point considered when splitting an oversized narrative block.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum NarrativeBoundary {
    /// Cut after a token that ends a sentence.
    Sentence,
    /// Cut after a token that ends a clause.
    Clause,
    /// Cut at the largest token window that still fits (no structural cue).
    TokenWindow,
}
43
44pub trait TokenCounter {
45    fn count(&self, text: &str) -> Result<usize>;
46
47    fn fits_within_token_limit_by_byte_len(&self, _byte_len: usize, _max_tokens: usize) -> bool {
48        false
49    }
50}
51
52#[derive(Debug, Default, Clone, Copy)]
53pub struct WhitespaceTokenCounter;
54
55impl TokenCounter for WhitespaceTokenCounter {
56    fn count(&self, text: &str) -> Result<usize> {
57        Ok(count_whitespace_tokens(text))
58    }
59}
60
61impl FinalChunkKind {
62    pub fn as_storage_kind(self) -> &'static str {
63        match self {
64            Self::Section => "section",
65            Self::Paragraph => "paragraph",
66            Self::Code => "code",
67            Self::Table => "table",
68            Self::Mixed => "mixed",
69        }
70    }
71}
72
73impl TryFrom<&str> for FinalChunkKind {
74    type Error = KboltError;
75
76    fn try_from(value: &str) -> std::result::Result<Self, Self::Error> {
77        match value {
78            "section" => Ok(Self::Section),
79            "paragraph" => Ok(Self::Paragraph),
80            "code" => Ok(Self::Code),
81            "table" => Ok(Self::Table),
82            "mixed" => Ok(Self::Mixed),
83            other => Err(KboltError::Internal(format!(
84                "invalid stored chunk kind: {other}"
85            ))),
86        }
87    }
88}
89
90impl FinalChunk {
91    pub fn retrieval_text(&self) -> String {
92        chunk_retrieval_body(self.text.as_str(), self.retrieval_prefix.as_deref())
93    }
94}
95
96pub fn chunk_retrieval_body(canonical_text: &str, retrieval_prefix: Option<&str>) -> String {
97    render_structural_body(ChunkRetrievalContext {
98        body: canonical_text,
99        retrieval_prefix,
100        title: None,
101        heading: None,
102    })
103}
104
105pub fn chunk_document(document: &ExtractedDocument, policy: &ChunkPolicy) -> Vec<FinalChunk> {
106    let counter = WhitespaceTokenCounter;
107    chunk_document_with_counter(document, policy, &counter)
108        .expect("whitespace token counter should be infallible")
109}
110
111pub fn chunk_canonical_document(
112    document: &ExtractedDocument,
113    policy: &ChunkPolicy,
114) -> Vec<FinalChunk> {
115    let counter = WhitespaceTokenCounter;
116    chunk_canonical_document_with_counter(document, policy, &counter)
117        .expect("whitespace token counter should be infallible")
118}
119
120pub fn chunk_document_with_counter(
121    document: &ExtractedDocument,
122    policy: &ChunkPolicy,
123    counter: &dyn TokenCounter,
124) -> Result<Vec<FinalChunk>> {
125    chunk_document_with_counter_inner(document, policy, counter, TableHeaderMode::SourceBlocks)
126}
127
128pub fn chunk_canonical_document_with_counter(
129    document: &ExtractedDocument,
130    policy: &ChunkPolicy,
131    counter: &dyn TokenCounter,
132) -> Result<Vec<FinalChunk>> {
133    let chunks = chunk_document_with_counter_inner(
134        document,
135        policy,
136        counter,
137        TableHeaderMode::CanonicalBlocks,
138    )?;
139    hydrate_canonical_chunk_text(document, chunks)
140}
141
142fn chunk_document_with_counter_inner(
143    document: &ExtractedDocument,
144    policy: &ChunkPolicy,
145    counter: &dyn TokenCounter,
146    table_header_mode: TableHeaderMode,
147) -> Result<Vec<FinalChunk>> {
148    if document.blocks.is_empty() {
149        return Ok(Vec::new());
150    }
151
152    debug_assert_valid_blocks(&document.blocks);
153
154    let soft_max = normalized_soft_max(policy);
155    let expanded =
156        expand_blocks_for_hard_max(&document.blocks, policy, counter, table_header_mode)?;
157    let mut chunks = Vec::new();
158    let mut current = Vec::new();
159
160    for block in &expanded {
161        let structurally_compatible = current
162            .last()
163            .is_none_or(|last| can_pack_together(last, block));
164        let candidate_fits = if current.is_empty() || !structurally_compatible {
165            false
166        } else {
167            candidate_chunk_fits_within(&current, block, soft_max, counter)?
168        };
169
170        if current.is_empty() || (structurally_compatible && candidate_fits) {
171            current.push(block.clone());
172            continue;
173        }
174
175        chunks.push(finalize_chunk(&current, table_header_mode));
176        current.clear();
177        current.push(block.clone());
178    }
179
180    if !current.is_empty() {
181        chunks.push(finalize_chunk(&current, table_header_mode));
182    }
183
184    Ok(chunks)
185}
186
/// Selects how the active table header is attached to table rows while chunking.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum TableHeaderMode {
    /// Rows are tagged with the table-header attribute.
    SourceBlocks,
    /// Rows are tagged with the table retrieval-prefix attribute, and only
    /// when the header fits.
    CanonicalBlocks,
}
192
193/// Resolves the effective chunk policy for a file profile.
194/// Precedence: CLI override > profile > defaults.
195pub fn resolve_policy(
196    config: &ChunkingConfig,
197    profile: Option<&str>,
198    cli_override: Option<&ChunkPolicy>,
199) -> ChunkPolicy {
200    if let Some(override_policy) = cli_override {
201        return override_policy.clone();
202    }
203
204    if let Some(profile_name) = profile {
205        let key = normalize_profile_key(profile_name);
206        if let Some(policy) = config.profiles.get(&key) {
207            return policy.clone();
208        }
209    }
210
211    config.defaults.clone()
212}
213
/// Canonicalizes a profile name: trims surrounding whitespace, strips any
/// leading dots, and lowercases ASCII letters.
fn normalize_profile_key(raw: &str) -> String {
    let without_dots = raw.trim().trim_start_matches('.');
    without_dots.to_ascii_lowercase()
}
217
/// Counts the whitespace-separated tokens in `text`.
fn count_whitespace_tokens(text: &str) -> usize {
    text.split_whitespace().fold(0usize, |seen, _| seen + 1)
}
221
222fn countable_chunk_text(blocks: &[ExtractedBlock]) -> String {
223    let body = blocks
224        .iter()
225        .map(|block| block.text.as_str())
226        .collect::<Vec<_>>()
227        .join("\n\n");
228
229    let prefix = table_header_prefix_for_body(blocks, body.as_str());
230    chunk_retrieval_body(body.as_str(), prefix)
231}
232
233fn countable_chunk_text_byte_upper_bound(blocks: &[ExtractedBlock]) -> Option<usize> {
234    let body_len = joined_block_text_byte_len(blocks)?;
235    let prefix_len = table_header_prefix_upper_bound_len(blocks);
236    match prefix_len {
237        Some(prefix_len) if body_len == 0 => Some(prefix_len),
238        Some(prefix_len) => prefix_len.checked_add(1)?.checked_add(body_len),
239        None => Some(body_len),
240    }
241}
242
243fn joined_block_text_byte_len(blocks: &[ExtractedBlock]) -> Option<usize> {
244    let text_len = blocks
245        .iter()
246        .try_fold(0usize, |sum, block| sum.checked_add(block.text.len()))?;
247    let separators = blocks.len().saturating_sub(1).checked_mul(2)?;
248    text_len.checked_add(separators)
249}
250
251fn table_header_prefix_upper_bound_len(blocks: &[ExtractedBlock]) -> Option<usize> {
252    if derive_chunk_kind(blocks) != FinalChunkKind::Table {
253        return None;
254    }
255    let has_header = blocks
256        .iter()
257        .any(|block| block.kind == BlockKind::TableHeader);
258    if has_header {
259        return None;
260    }
261    blocks
262        .first()
263        .and_then(|block| block.attrs.get(TABLE_RETRIEVAL_PREFIX_ATTR))
264        .map(String::as_str)
265        .map(str::trim)
266        .filter(|header| !header.is_empty())
267        .map(str::len)
268}
269
270fn chunk_text_guaranteed_fits(
271    blocks: &[ExtractedBlock],
272    max_tokens: usize,
273    counter: &dyn TokenCounter,
274) -> bool {
275    countable_chunk_text_byte_upper_bound(blocks)
276        .is_some_and(|byte_len| counter.fits_within_token_limit_by_byte_len(byte_len, max_tokens))
277}
278
279fn chunk_fits_within(
280    blocks: &[ExtractedBlock],
281    max_tokens: usize,
282    counter: &dyn TokenCounter,
283) -> Result<bool> {
284    if chunk_text_guaranteed_fits(blocks, max_tokens, counter) {
285        return Ok(true);
286    }
287    Ok(count_finalized_chunk_tokens(blocks, counter)? <= max_tokens)
288}
289
290fn count_finalized_chunk_tokens(
291    blocks: &[ExtractedBlock],
292    counter: &dyn TokenCounter,
293) -> Result<usize> {
294    counter.count(countable_chunk_text(blocks).as_str())
295}
296
297fn candidate_chunk_fits_within(
298    current: &[ExtractedBlock],
299    next: &ExtractedBlock,
300    max_tokens: usize,
301    counter: &dyn TokenCounter,
302) -> Result<bool> {
303    let mut candidate = current.to_vec();
304    candidate.push(next.clone());
305    chunk_fits_within(&candidate, max_tokens, counter)
306}
307
308fn single_block_fits_within(
309    block: &ExtractedBlock,
310    max_tokens: usize,
311    counter: &dyn TokenCounter,
312) -> Result<bool> {
313    chunk_fits_within(std::slice::from_ref(block), max_tokens, counter)
314}
315
316fn count_single_block_tokens(block: &ExtractedBlock, counter: &dyn TokenCounter) -> Result<usize> {
317    count_finalized_chunk_tokens(std::slice::from_ref(block), counter)
318}
319
320fn hydrate_canonical_chunk_text(
321    document: &ExtractedDocument,
322    mut chunks: Vec<FinalChunk>,
323) -> Result<Vec<FinalChunk>> {
324    let canonical_text = document
325        .blocks
326        .iter()
327        .map(|block| block.text.as_str())
328        .collect::<Vec<_>>()
329        .join("\n\n");
330
331    for chunk in &mut chunks {
332        let end = chunk.offset.checked_add(chunk.length).ok_or_else(|| {
333            KboltError::Internal("canonical chunk text span overflows usize".to_string())
334        })?;
335        if end > canonical_text.len()
336            || !canonical_text.is_char_boundary(chunk.offset)
337            || !canonical_text.is_char_boundary(end)
338        {
339            return Err(KboltError::Internal(format!(
340                "canonical chunk text span {}..{} is invalid for text length {}",
341                chunk.offset,
342                end,
343                canonical_text.len()
344            ))
345            .into());
346        }
347        chunk.text = canonical_text[chunk.offset..end].to_string();
348    }
349
350    Ok(chunks)
351}
352
353fn normalized_soft_max(policy: &ChunkPolicy) -> usize {
354    let target = policy.target_tokens.max(1);
355    policy.soft_max_tokens.max(target)
356}
357
358fn normalized_target(policy: &ChunkPolicy) -> usize {
359    policy.target_tokens.max(1)
360}
361
362fn normalized_hard_max(policy: &ChunkPolicy) -> usize {
363    let soft_max = normalized_soft_max(policy);
364    policy.hard_max_tokens.max(soft_max)
365}
366
367fn normalized_overlap(policy: &ChunkPolicy, hard_max: usize) -> usize {
368    policy
369        .boundary_overlap_tokens
370        .min(hard_max.saturating_sub(1))
371}
372
373fn expand_blocks_for_hard_max(
374    blocks: &[ExtractedBlock],
375    policy: &ChunkPolicy,
376    counter: &dyn TokenCounter,
377    table_header_mode: TableHeaderMode,
378) -> Result<Vec<ExtractedBlock>> {
379    let hard_max = normalized_hard_max(policy);
380    let target = normalized_target(policy);
381    let overlap = normalized_overlap(policy, hard_max);
382    let mut expanded = Vec::new();
383    let mut active_table_header: Option<String> = None;
384
385    for block in blocks {
386        match block.kind {
387            BlockKind::TableHeader => {
388                active_table_header = Some(block.text.clone());
389            }
390            BlockKind::TableRow => {}
391            _ => {
392                active_table_header = None;
393            }
394        }
395
396        let tagged = match table_header_mode {
397            TableHeaderMode::SourceBlocks => {
398                attach_table_header_attr(block, active_table_header.as_deref())
399            }
400            TableHeaderMode::CanonicalBlocks => attach_table_retrieval_prefix_attr(
401                block,
402                fitting_table_retrieval_prefix(active_table_header.as_deref(), hard_max, counter)?,
403            ),
404        };
405
406        if single_block_fits_within(&tagged, hard_max, counter)? {
407            expanded.push(tagged);
408            continue;
409        }
410
411        if is_narrative_block_kind(&tagged.kind) {
412            if let Some(sentence_splits) =
413                split_block_by_sentence_boundaries(&tagged, target, hard_max, overlap, counter)?
414            {
415                expanded.extend(sentence_splits);
416                continue;
417            }
418        }
419
420        if tagged.kind == BlockKind::CodeFence {
421            if let Some(code_splits) =
422                split_code_block_by_blank_lines(&tagged, hard_max, overlap, counter)?
423            {
424                expanded.extend(code_splits);
425                continue;
426            }
427        }
428
429        expanded.extend(split_block_by_tokens(&tagged, hard_max, overlap, counter)?);
430    }
431
432    Ok(expanded)
433}
434
435fn attach_table_header_attr(block: &ExtractedBlock, table_header: Option<&str>) -> ExtractedBlock {
436    let mut tagged = block.clone();
437    if tagged.kind == BlockKind::TableRow {
438        if let Some(header) = table_header {
439            if !header.trim().is_empty() {
440                tagged
441                    .attrs
442                    .insert(TABLE_HEADER_ATTR.to_string(), header.to_string());
443            }
444        }
445    }
446    tagged
447}
448
449fn attach_table_retrieval_prefix_attr(
450    block: &ExtractedBlock,
451    table_header: Option<&str>,
452) -> ExtractedBlock {
453    let mut tagged = block.clone();
454    if tagged.kind == BlockKind::TableRow {
455        if let Some(header) = table_header {
456            if !header.trim().is_empty() {
457                tagged
458                    .attrs
459                    .insert(TABLE_RETRIEVAL_PREFIX_ATTR.to_string(), header.to_string());
460            }
461        }
462    }
463    tagged
464}
465
466fn fitting_table_retrieval_prefix<'a>(
467    table_header: Option<&'a str>,
468    hard_max: usize,
469    counter: &dyn TokenCounter,
470) -> Result<Option<&'a str>> {
471    let Some(header) = table_header.filter(|header| !header.trim().is_empty()) else {
472        return Ok(None);
473    };
474
475    if hard_max > 0 && counter.fits_within_token_limit_by_byte_len(header.len(), hard_max - 1) {
476        return Ok(Some(header));
477    }
478
479    if counter.count(header)? < hard_max {
480        Ok(Some(header))
481    } else {
482        Ok(None)
483    }
484}
485
486fn is_narrative_block_kind(kind: &BlockKind) -> bool {
487    matches!(
488        kind,
489        BlockKind::Paragraph | BlockKind::ListItem | BlockKind::BlockQuote
490    )
491}
492
493fn pack_class_for_kind(kind: &BlockKind) -> PackClass {
494    match kind {
495        BlockKind::Heading | BlockKind::Paragraph | BlockKind::ListItem | BlockKind::BlockQuote => {
496            PackClass::Narrative
497        }
498        BlockKind::CodeFence => PackClass::Code,
499        BlockKind::TableHeader | BlockKind::TableRow => PackClass::Table,
500        BlockKind::HtmlBlock => PackClass::Opaque,
501    }
502}
503
504fn can_pack_together(current: &ExtractedBlock, next: &ExtractedBlock) -> bool {
505    let current_class = pack_class_for_kind(&current.kind);
506    let next_class = pack_class_for_kind(&next.kind);
507    current_class == next_class
508        && current_class != PackClass::Opaque
509        && heading_scopes_compatible(current, next)
510}
511
512fn heading_scopes_compatible(current: &ExtractedBlock, next: &ExtractedBlock) -> bool {
513    let current_class = pack_class_for_kind(&current.kind);
514    let next_class = pack_class_for_kind(&next.kind);
515    if current_class != PackClass::Narrative || next_class != PackClass::Narrative {
516        return true;
517    }
518
519    // Start each heading in a fresh chunk to keep section-level locality.
520    if next.kind == BlockKind::Heading {
521        return false;
522    }
523
524    // Allow heading + body packing inside the newly started section.
525    if current.kind == BlockKind::Heading {
526        return true;
527    }
528
529    current.heading_path == next.heading_path
530}
531
532fn split_block_by_tokens(
533    block: &ExtractedBlock,
534    hard_max: usize,
535    overlap: usize,
536    counter: &dyn TokenCounter,
537) -> Result<Vec<ExtractedBlock>> {
538    if block.text.is_empty() {
539        return Ok(vec![block.clone()]);
540    }
541
542    let mut out = Vec::new();
543    let mut start_byte = 0usize;
544    while start_byte < block.text.len() {
545        let end_byte = find_largest_fitting_end_byte(block, start_byte, hard_max, counter)?;
546        out.push(split_block_range_by_bytes(block, start_byte, end_byte));
547
548        if end_byte == block.text.len() {
549            break;
550        }
551
552        start_byte = next_start_byte(block, start_byte, end_byte, overlap, counter)?;
553    }
554
555    Ok(out)
556}
557
558fn split_code_block_by_blank_lines(
559    block: &ExtractedBlock,
560    hard_max: usize,
561    overlap: usize,
562    counter: &dyn TokenCounter,
563) -> Result<Option<Vec<ExtractedBlock>>> {
564    let groups = code_group_ranges(block.text.as_str());
565    if groups.len() <= 1 {
566        return Ok(None);
567    }
568
569    let mut packed_ranges = Vec::new();
570    let mut current: Option<(usize, usize)> = None;
571    for (start, end) in groups {
572        match current {
573            None => {
574                current = Some((start, end));
575            }
576            Some((current_start, current_end)) => {
577                let candidate = split_block_range_by_bytes(block, current_start, end);
578                if single_block_fits_within(&candidate, hard_max, counter)? {
579                    current = Some((current_start, end));
580                } else {
581                    packed_ranges.push((current_start, current_end));
582                    current = Some((start, end));
583                }
584            }
585        }
586    }
587    if let Some((start, end)) = current {
588        packed_ranges.push((start, end));
589    }
590
591    let mut out = Vec::new();
592    for (start, end) in packed_ranges {
593        let split = split_block_range_by_bytes(block, start, end);
594        if !single_block_fits_within(&split, hard_max, counter)? {
595            out.extend(split_block_by_tokens(&split, hard_max, overlap, counter)?);
596        } else {
597            out.push(split);
598        }
599    }
600
601    Ok((out.len() > 1).then_some(out))
602}
603
604fn split_block_by_sentence_boundaries(
605    block: &ExtractedBlock,
606    target_tokens: usize,
607    hard_max: usize,
608    overlap: usize,
609    counter: &dyn TokenCounter,
610) -> Result<Option<Vec<ExtractedBlock>>> {
611    let spans = token_byte_spans(block.text.as_str());
612    if spans.is_empty() {
613        return Ok(Some(vec![block.clone()]));
614    }
615
616    let sentence_end_tokens = sentence_end_token_indices(block.text.as_str(), &spans)
617        .into_iter()
618        .map(|index| spans[index].1)
619        .collect::<Vec<_>>();
620    let clause_end_tokens = clause_end_token_indices(block.text.as_str(), &spans)
621        .into_iter()
622        .map(|index| spans[index].1)
623        .collect::<Vec<_>>();
624    if sentence_end_tokens.is_empty() && clause_end_tokens.is_empty() {
625        return Ok(None);
626    }
627
628    let mut used_structural_boundary = false;
629    let mut out = Vec::new();
630    let mut start_byte = 0usize;
631    while start_byte < block.text.len() {
632        let mut candidates = sentence_end_tokens
633            .iter()
634            .copied()
635            .filter(|end_byte| *end_byte > start_byte)
636            .map(|end_byte| (end_byte, NarrativeBoundary::Sentence))
637            .collect::<Vec<_>>();
638        candidates.extend(
639            clause_end_tokens
640                .iter()
641                .copied()
642                .filter(|end_byte| *end_byte > start_byte)
643                .map(|end_byte| (end_byte, NarrativeBoundary::Clause)),
644        );
645        let fallback_end = find_largest_fitting_end_byte(block, start_byte, hard_max, counter)?;
646        candidates.push((fallback_end, NarrativeBoundary::TokenWindow));
647
648        let Some((end_byte, boundary)) = choose_best_narrative_boundary(
649            block,
650            start_byte,
651            target_tokens,
652            hard_max,
653            &candidates,
654            counter,
655        )?
656        else {
657            return Err(KboltError::Inference(
658                "failed to choose a fitting narrative split boundary".to_string(),
659            )
660            .into());
661        };
662
663        if matches!(
664            boundary,
665            NarrativeBoundary::Sentence | NarrativeBoundary::Clause
666        ) {
667            used_structural_boundary = true;
668        }
669
670        out.push(split_block_range_by_bytes(block, start_byte, end_byte));
671        if end_byte == block.text.len() {
672            break;
673        }
674
675        start_byte = next_start_byte(block, start_byte, end_byte, overlap, counter)?;
676    }
677
678    Ok(used_structural_boundary.then_some(out))
679}
680
681fn split_block_range_by_bytes(
682    block: &ExtractedBlock,
683    byte_start: usize,
684    byte_end: usize,
685) -> ExtractedBlock {
686    debug_assert!(byte_start < byte_end, "byte range must be non-empty");
687    debug_assert!(
688        byte_end <= block.text.len(),
689        "byte range exceeds block text"
690    );
691
692    let mut split = block.clone();
693    split.offset = block.offset.saturating_add(byte_start);
694    split.length = byte_end.saturating_sub(byte_start);
695    split.text = block.text[byte_start..byte_end].to_string();
696    if let Some(header) = block.attrs.get(TABLE_RETRIEVAL_PREFIX_ATTR) {
697        if canonical_table_row_body_start(block.text.as_str(), header.as_str())
698            .is_some_and(|body_start| byte_start < body_start)
699        {
700            split.attrs.remove(TABLE_RETRIEVAL_PREFIX_ATTR);
701        }
702    }
703    split
704}
705
706fn choose_best_narrative_boundary(
707    block: &ExtractedBlock,
708    start_byte: usize,
709    target_tokens: usize,
710    hard_max: usize,
711    candidates: &[(usize, NarrativeBoundary)],
712    counter: &dyn TokenCounter,
713) -> Result<Option<(usize, NarrativeBoundary)>> {
714    let mut best: Option<(usize, NarrativeBoundary, i64)> = None;
715    for (end_byte, boundary) in candidates {
716        if *end_byte <= start_byte {
717            continue;
718        }
719
720        let candidate = split_block_range_by_bytes(block, start_byte, *end_byte);
721        let token_count = count_single_block_tokens(&candidate, counter)?;
722        if token_count > hard_max {
723            continue;
724        }
725
726        let boundary_score = match boundary {
727            NarrativeBoundary::Sentence => 30,
728            NarrativeBoundary::Clause => 15,
729            NarrativeBoundary::TokenWindow => 0,
730        };
731        let distance = token_count.abs_diff(target_tokens) as i64;
732        let score = boundary_score - (distance * 10);
733        let replace = best
734            .as_ref()
735            .map(|(best_end_byte, _, best_score)| {
736                score > *best_score || (score == *best_score && *end_byte > *best_end_byte)
737            })
738            .unwrap_or(true);
739        if replace {
740            best = Some((*end_byte, *boundary, score));
741        }
742    }
743
744    Ok(best.map(|(end_byte, boundary, _)| (end_byte, boundary)))
745}
746
747fn find_largest_fitting_end_byte(
748    block: &ExtractedBlock,
749    start_byte: usize,
750    hard_max: usize,
751    counter: &dyn TokenCounter,
752) -> Result<usize> {
753    let text = block.text.as_str();
754    let token_spans = token_byte_spans(text);
755    let mut token_boundaries = token_spans
756        .iter()
757        .map(|(_, end)| *end)
758        .filter(|end| *end > start_byte)
759        .collect::<Vec<_>>();
760    token_boundaries.dedup();
761
762    if let Some(end_byte) =
763        largest_fitting_boundary(block, start_byte, hard_max, &token_boundaries, counter)?
764    {
765        return Ok(end_byte);
766    }
767
768    let mut char_boundaries = text
769        .char_indices()
770        .map(|(idx, ch)| idx + ch.len_utf8())
771        .filter(|end| *end > start_byte)
772        .collect::<Vec<_>>();
773    char_boundaries.dedup();
774    largest_fitting_boundary(block, start_byte, hard_max, &char_boundaries, counter)?.ok_or_else(
775        || {
776            KboltError::Inference(format!(
777                "failed to find a fitting split boundary for block at offset {}",
778                block.offset
779            ))
780            .into()
781        },
782    )
783}
784
785fn largest_fitting_boundary(
786    block: &ExtractedBlock,
787    start_byte: usize,
788    hard_max: usize,
789    boundaries: &[usize],
790    counter: &dyn TokenCounter,
791) -> Result<Option<usize>> {
792    if boundaries.is_empty() {
793        return Ok(None);
794    }
795
796    let mut left = 0usize;
797    let mut right = boundaries.len();
798    let mut best = None;
799    while left < right {
800        let mid = left + (right - left) / 2;
801        let end_byte = boundaries[mid];
802        let candidate = split_block_range_by_bytes(block, start_byte, end_byte);
803        if single_block_fits_within(&candidate, hard_max, counter)? {
804            best = Some(end_byte);
805            left = mid + 1;
806        } else {
807            right = mid;
808        }
809    }
810
811    Ok(best)
812}
813
814fn next_start_byte(
815    block: &ExtractedBlock,
816    current_start_byte: usize,
817    end_byte: usize,
818    overlap: usize,
819    counter: &dyn TokenCounter,
820) -> Result<usize> {
821    debug_assert!(
822        end_byte > current_start_byte,
823        "end byte must advance beyond start byte"
824    );
825    if overlap == 0 {
826        return Ok(next_content_start_byte(block.text.as_str(), end_byte));
827    }
828
829    let text = block.text.as_str();
830    let token_starts = token_byte_spans(text)
831        .into_iter()
832        .map(|(start, _)| start)
833        .filter(|start| *start > current_start_byte && *start < end_byte)
834        .collect::<Vec<_>>();
835    if let Some(next_start) =
836        earliest_fitting_overlap_start(block, end_byte, overlap, &token_starts, counter)?
837    {
838        return Ok(next_start);
839    }
840
841    let char_starts = text
842        .char_indices()
843        .map(|(idx, _)| idx)
844        .filter(|idx| *idx > current_start_byte && *idx < end_byte)
845        .collect::<Vec<_>>();
846    Ok(
847        earliest_fitting_overlap_start(block, end_byte, overlap, &char_starts, counter)?
848            .map(|start| next_content_start_byte(text, start))
849            .unwrap_or_else(|| next_content_start_byte(text, end_byte)),
850    )
851}
852
/// Returns the byte index of the first non-whitespace character at or after
/// `start_byte`, or `text.len()` when only whitespace (or nothing) remains.
///
/// # Panics
/// Panics if `start_byte` is inside the text but not on a character boundary.
fn next_content_start_byte(text: &str, start_byte: usize) -> usize {
    if start_byte >= text.len() {
        return text.len();
    }

    text[start_byte..]
        .find(|ch: char| !ch.is_whitespace())
        .map_or(text.len(), |relative| start_byte + relative)
}
866
867fn earliest_fitting_overlap_start(
868    block: &ExtractedBlock,
869    end_byte: usize,
870    overlap: usize,
871    candidates: &[usize],
872    counter: &dyn TokenCounter,
873) -> Result<Option<usize>> {
874    if candidates.is_empty() {
875        return Ok(None);
876    }
877
878    let mut left = 0usize;
879    let mut right = candidates.len();
880    let mut best = None;
881    while left < right {
882        let mid = left + (right - left) / 2;
883        let start_byte = candidates[mid];
884        let candidate = split_block_range_by_bytes(block, start_byte, end_byte);
885        if single_block_fits_within(&candidate, overlap, counter)? {
886            best = Some(start_byte);
887            right = mid;
888        } else {
889            left = mid + 1;
890        }
891    }
892
893    Ok(best)
894}
895
896fn sentence_end_token_indices(text: &str, spans: &[(usize, usize)]) -> Vec<usize> {
897    spans
898        .iter()
899        .enumerate()
900        .filter_map(|(index, (start, end))| {
901            token_ends_sentence(&text[*start..*end]).then_some(index)
902        })
903        .collect()
904}
905
906fn clause_end_token_indices(text: &str, spans: &[(usize, usize)]) -> Vec<usize> {
907    spans
908        .iter()
909        .enumerate()
910        .filter_map(|(index, (start, end))| token_ends_clause(&text[*start..*end]).then_some(index))
911        .collect()
912}
913
/// Returns `true` when `token`, ignoring trailing quotes and closing brackets,
/// ends with `.`, `!` or `?`.
fn token_ends_sentence(token: &str) -> bool {
    let core = token.trim_end_matches(['"', '\'', ')', ']', '}']);
    matches!(core.chars().next_back(), Some('.' | '!' | '?'))
}
918
/// Returns `true` when `token`, ignoring trailing quotes and closing brackets,
/// ends with `,`, `;` or `:`.
fn token_ends_clause(token: &str) -> bool {
    let core = token.trim_end_matches(['"', '\'', ')', ']', '}']);
    matches!(core.chars().next_back(), Some(',' | ';' | ':'))
}
923
/// Test-only helper: returns the end token of the highest-scoring candidate
/// according to `score_narrative_cut`.
///
/// Ties on score go to the larger end token. Among fully equal keys the last
/// candidate wins.
#[cfg(test)]
fn best_narrative_cut(
    candidates: &[(usize, NarrativeBoundary)],
    target_tokens: usize,
    start_token: usize,
    total_tokens: usize,
) -> Option<usize> {
    let ranked = candidates.iter().map(|&(end_token, boundary)| {
        let score =
            score_narrative_cut(target_tokens, start_token, end_token, total_tokens, boundary);
        ((score, end_token as i64), end_token)
    });

    ranked
        .max_by_key(|(rank, _)| *rank)
        .map(|(_, end_token)| end_token)
}
948
/// Test-only helper: scores a cut ending at `end_token`.
///
/// The score is the boundary bonus (30 sentence, 15 clause, 0 token window)
/// minus ten times the distance from `target_tokens`. A further 20 is
/// subtracted when the cut would leave a non-empty tail shorter than a quarter
/// of the target (at least one token).
#[cfg(test)]
fn score_narrative_cut(
    target_tokens: usize,
    start_token: usize,
    end_token: usize,
    total_tokens: usize,
    boundary: NarrativeBoundary,
) -> i64 {
    let bonus = match boundary {
        NarrativeBoundary::Sentence => 30,
        NarrativeBoundary::Clause => 15,
        NarrativeBoundary::TokenWindow => 0,
    };

    let piece_tokens = end_token.saturating_sub(start_token) as i64;
    let distance = (piece_tokens - target_tokens as i64).abs();

    let tail = total_tokens.saturating_sub(end_token);
    let tiny_tail_limit = (target_tokens / 4).max(1);
    let leaves_tiny_tail = tail > 0 && tail < tiny_tail_limit;
    let tail_penalty = if leaves_tiny_tail { 20 } else { 0 };

    bonus - (distance * 10) - tail_penalty
}
976
977fn debug_assert_valid_blocks(blocks: &[ExtractedBlock]) {
978    for block in blocks {
979        debug_assert_eq!(
980            block.text.len(),
981            block.length,
982            "extractor invariant violated: text byte length and source length differ"
983        );
984    }
985}
986
987fn code_group_ranges(text: &str) -> Vec<(usize, usize)> {
988    let bytes = text.as_bytes();
989    let mut groups = Vec::new();
990    let mut group_start: Option<usize> = None;
991    let mut line_start = 0usize;
992
993    while line_start < bytes.len() {
994        let line_end = next_line_end_bytes(bytes, line_start);
995        let content_end = trim_line_ending_bytes(bytes, line_start, line_end);
996        let is_blank = is_blank_line_bytes(bytes, line_start, content_end);
997
998        match (group_start, is_blank) {
999            (None, false) => {
1000                group_start = Some(line_start);
1001            }
1002            (Some(start), true) => {
1003                let end = trim_trailing_newlines_bytes(bytes, line_start);
1004                if end > start {
1005                    groups.push((start, end));
1006                }
1007                group_start = None;
1008            }
1009            _ => {}
1010        }
1011
1012        line_start = line_end;
1013    }
1014
1015    if let Some(start) = group_start {
1016        let end = trim_trailing_newlines_bytes(bytes, bytes.len());
1017        if end > start {
1018            groups.push((start, end));
1019        }
1020    }
1021
1022    groups
1023}
1024
/// Returns the index just past the first `\n` at or after `start`, or
/// `bytes.len()` when no newline follows (including when `start` is past the
/// end of `bytes`).
fn next_line_end_bytes(bytes: &[u8], start: usize) -> usize {
    bytes
        .get(start..)
        .and_then(|rest| rest.iter().position(|&byte| byte == b'\n'))
        .map_or(bytes.len(), |relative| start + relative + 1)
}
1035
/// Returns the end of the line content within `bytes[start..end]`, i.e. `end`
/// moved left past any trailing `\n` / `\r` bytes but never below `start`.
fn trim_line_ending_bytes(bytes: &[u8], start: usize, end: usize) -> usize {
    (start..end)
        .rev()
        .find(|&index| !matches!(bytes[index], b'\n' | b'\r'))
        // No content byte found: the result collapses to `start`, or stays at
        // `end` when the range was empty to begin with.
        .map_or(start.min(end), |last_content| last_content + 1)
}
1043
/// Returns `true` when `bytes[start..end]` holds nothing but spaces and tabs
/// (an empty range counts as blank).
fn is_blank_line_bytes(bytes: &[u8], start: usize, end: usize) -> bool {
    for &byte in &bytes[start..end] {
        if byte != b' ' && byte != b'\t' {
            return false;
        }
    }
    true
}
1049
/// Returns `end` moved left past every trailing `\n` / `\r` byte of
/// `bytes[..end]`; the result is `0` when that prefix is empty or consists
/// solely of line-ending bytes.
fn trim_trailing_newlines_bytes(bytes: &[u8], end: usize) -> usize {
    bytes[..end]
        .iter()
        .rposition(|&byte| !matches!(byte, b'\n' | b'\r'))
        .map_or(0, |last_content| last_content + 1)
}
1057
/// Returns the `(start, end)` byte span of every whitespace-delimited token in
/// `text`, in order of appearance. `end` is exclusive.
fn token_byte_spans(text: &str) -> Vec<(usize, usize)> {
    let mut spans = Vec::new();
    let mut chars = text.char_indices().peekable();

    while let Some((start, ch)) = chars.next() {
        if ch.is_whitespace() {
            continue;
        }

        // Consume the rest of the token; it ends at the next whitespace
        // character or, failing that, at the end of the text.
        let mut end = text.len();
        while let Some(&(index, upcoming)) = chars.peek() {
            if upcoming.is_whitespace() {
                end = index;
                break;
            }
            chars.next();
        }

        spans.push((start, end));
    }

    spans
}
1078
1079fn finalize_chunk(blocks: &[ExtractedBlock], table_header_mode: TableHeaderMode) -> FinalChunk {
1080    let start = blocks.first().map(|block| block.offset).unwrap_or(0);
1081    let end = blocks
1082        .last()
1083        .map(|block| block.offset.saturating_add(block.length))
1084        .unwrap_or(start);
1085    let mut text = blocks
1086        .iter()
1087        .map(|block| block.text.as_str())
1088        .collect::<Vec<_>>()
1089        .join("\n\n");
1090    let heading = resolve_heading(blocks);
1091    let kind = derive_chunk_kind(blocks);
1092    let retrieval_prefix = if matches!(table_header_mode, TableHeaderMode::CanonicalBlocks) {
1093        table_header_prefix_for_body(blocks, text.as_str()).map(ToString::to_string)
1094    } else {
1095        None
1096    };
1097    if kind == FinalChunkKind::Table {
1098        let has_header = blocks
1099            .iter()
1100            .any(|block| block.kind == BlockKind::TableHeader);
1101        if matches!(table_header_mode, TableHeaderMode::SourceBlocks) && !has_header {
1102            if let Some(header) = blocks
1103                .first()
1104                .and_then(|block| block.attrs.get(TABLE_HEADER_ATTR))
1105                .map(String::as_str)
1106            {
1107                text = format!("{header}\n{text}");
1108            }
1109        }
1110    }
1111
1112    FinalChunk {
1113        text,
1114        retrieval_prefix,
1115        offset: start,
1116        length: end.saturating_sub(start),
1117        heading,
1118        kind,
1119    }
1120}
1121
1122fn table_header_prefix_for_body<'a>(blocks: &'a [ExtractedBlock], body: &str) -> Option<&'a str> {
1123    let kind = derive_chunk_kind(blocks);
1124    if kind != FinalChunkKind::Table {
1125        return None;
1126    }
1127
1128    let has_header = blocks
1129        .iter()
1130        .any(|block| block.kind == BlockKind::TableHeader);
1131    if has_header {
1132        return None;
1133    }
1134
1135    let header = blocks
1136        .first()
1137        .and_then(|block| block.attrs.get(TABLE_RETRIEVAL_PREFIX_ATTR))
1138        .map(String::as_str)
1139        .map(str::trim)
1140        .filter(|header| !header.is_empty())?;
1141
1142    let body_trimmed = body.trim_start();
1143    if body_trimmed == header || body_trimmed.starts_with(&format!("{header}\n")) {
1144        None
1145    } else {
1146        Some(header)
1147    }
1148}
1149
/// Returns the byte offset at which the row body begins when `text` starts
/// with `header` immediately followed by a newline; `None` otherwise.
fn canonical_table_row_body_start(text: &str, header: &str) -> Option<usize> {
    match text.strip_prefix(header) {
        Some(rest) if rest.starts_with('\n') => Some(header.len() + 1),
        _ => None,
    }
}
1155
1156fn resolve_heading(blocks: &[ExtractedBlock]) -> Option<String> {
1157    blocks
1158        .iter()
1159        .rev()
1160        .find_map(|block| (!block.heading_path.is_empty()).then(|| block.heading_path.join(" > ")))
1161}
1162
1163pub fn derive_chunk_kind(blocks: &[ExtractedBlock]) -> FinalChunkKind {
1164    if blocks.is_empty() {
1165        return FinalChunkKind::Mixed;
1166    }
1167
1168    if blocks
1169        .iter()
1170        .all(|block| block.kind == BlockKind::CodeFence)
1171    {
1172        return FinalChunkKind::Code;
1173    }
1174
1175    if blocks
1176        .iter()
1177        .all(|block| matches!(block.kind, BlockKind::TableHeader | BlockKind::TableRow))
1178    {
1179        return FinalChunkKind::Table;
1180    }
1181
1182    if blocks.iter().all(|block| {
1183        matches!(
1184            block.kind,
1185            BlockKind::Paragraph | BlockKind::ListItem | BlockKind::BlockQuote
1186        )
1187    }) {
1188        return FinalChunkKind::Paragraph;
1189    }
1190
1191    if blocks.iter().any(|block| block.kind == BlockKind::Heading)
1192        && blocks.iter().all(|block| {
1193            matches!(
1194                block.kind,
1195                BlockKind::Heading
1196                    | BlockKind::Paragraph
1197                    | BlockKind::ListItem
1198                    | BlockKind::BlockQuote
1199            )
1200        })
1201    {
1202        return FinalChunkKind::Section;
1203    }
1204
1205    FinalChunkKind::Mixed
1206}
1207
1208#[cfg(test)]
1209mod tests {
1210    use std::cell::Cell;
1211    use std::collections::HashMap;
1212
1213    use crate::config::{ChunkPolicy, ChunkingConfig};
1214    use crate::ingest::chunk::{
1215        best_narrative_cut, can_pack_together, chunk_canonical_document, chunk_document,
1216        chunk_document_with_counter, derive_chunk_kind, resolve_policy, score_narrative_cut,
1217        FinalChunkKind, NarrativeBoundary, TokenCounter, WhitespaceTokenCounter,
1218    };
1219    use crate::ingest::extract::{BlockKind, ExtractedBlock, ExtractedDocument};
1220
1221    fn baseline_config() -> ChunkingConfig {
1222        ChunkingConfig {
1223            defaults: ChunkPolicy {
1224                target_tokens: 800,
1225                soft_max_tokens: 950,
1226                hard_max_tokens: 1200,
1227                boundary_overlap_tokens: 48,
1228                neighbor_window: 1,
1229                contextual_prefix: true,
1230            },
1231            profiles: HashMap::from([(
1232                "md".to_string(),
1233                ChunkPolicy {
1234                    target_tokens: 300,
1235                    soft_max_tokens: 360,
1236                    hard_max_tokens: 480,
1237                    boundary_overlap_tokens: 24,
1238                    neighbor_window: 2,
1239                    contextual_prefix: false,
1240                },
1241            )]),
1242        }
1243    }
1244
1245    #[test]
1246    fn resolve_policy_prefers_cli_override() {
1247        let config = baseline_config();
1248        let override_policy = ChunkPolicy {
1249            target_tokens: 128,
1250            soft_max_tokens: 160,
1251            hard_max_tokens: 196,
1252            boundary_overlap_tokens: 16,
1253            neighbor_window: 3,
1254            contextual_prefix: false,
1255        };
1256
1257        let resolved = resolve_policy(&config, Some("md"), Some(&override_policy));
1258        assert_eq!(resolved, override_policy);
1259    }
1260
1261    #[test]
1262    fn resolve_policy_uses_normalized_profile_key() {
1263        let config = baseline_config();
1264
1265        let resolved = resolve_policy(&config, Some(".MD"), None);
1266        assert_eq!(resolved.target_tokens, 300);
1267        assert_eq!(resolved.soft_max_tokens, 360);
1268        assert_eq!(resolved.hard_max_tokens, 480);
1269        assert_eq!(resolved.boundary_overlap_tokens, 24);
1270        assert_eq!(resolved.neighbor_window, 2);
1271        assert!(!resolved.contextual_prefix);
1272    }
1273
1274    #[test]
1275    fn resolve_policy_falls_back_to_defaults() {
1276        let config = baseline_config();
1277
1278        let resolved = resolve_policy(&config, Some("txt"), None);
1279        assert_eq!(resolved, config.defaults);
1280    }
1281
1282    fn block(kind: BlockKind) -> ExtractedBlock {
1283        ExtractedBlock {
1284            text: "x".to_string(),
1285            offset: 0,
1286            length: 1,
1287            kind,
1288            heading_path: vec![],
1289            attrs: HashMap::new(),
1290        }
1291    }
1292
1293    fn block_with(
1294        kind: BlockKind,
1295        text: &str,
1296        offset: usize,
1297        heading_path: &[&str],
1298    ) -> ExtractedBlock {
1299        ExtractedBlock {
1300            text: text.to_string(),
1301            offset,
1302            length: text.len(),
1303            kind,
1304            heading_path: heading_path.iter().map(|value| value.to_string()).collect(),
1305            attrs: HashMap::new(),
1306        }
1307    }
1308
1309    #[test]
1310    fn derive_chunk_kind_code_only_is_code() {
1311        let blocks = vec![block(BlockKind::CodeFence), block(BlockKind::CodeFence)];
1312        assert_eq!(derive_chunk_kind(&blocks), FinalChunkKind::Code);
1313    }
1314
1315    #[test]
1316    fn derive_chunk_kind_table_only_is_table() {
1317        let blocks = vec![block(BlockKind::TableHeader), block(BlockKind::TableRow)];
1318        assert_eq!(derive_chunk_kind(&blocks), FinalChunkKind::Table);
1319    }
1320
1321    #[test]
1322    fn derive_chunk_kind_narrative_without_heading_is_paragraph() {
1323        let blocks = vec![block(BlockKind::Paragraph), block(BlockKind::ListItem)];
1324        assert_eq!(derive_chunk_kind(&blocks), FinalChunkKind::Paragraph);
1325    }
1326
1327    #[test]
1328    fn derive_chunk_kind_heading_scoped_narrative_is_section() {
1329        let blocks = vec![block(BlockKind::Heading), block(BlockKind::Paragraph)];
1330        assert_eq!(derive_chunk_kind(&blocks), FinalChunkKind::Section);
1331    }
1332
1333    #[test]
1334    fn derive_chunk_kind_mixed_content_is_mixed() {
1335        let blocks = vec![block(BlockKind::CodeFence), block(BlockKind::Paragraph)];
1336        assert_eq!(derive_chunk_kind(&blocks), FinalChunkKind::Mixed);
1337    }
1338
1339    #[test]
1340    fn chunk_kind_storage_labels_are_stable() {
1341        assert_eq!(FinalChunkKind::Section.as_storage_kind(), "section");
1342        assert_eq!(FinalChunkKind::Paragraph.as_storage_kind(), "paragraph");
1343        assert_eq!(FinalChunkKind::Code.as_storage_kind(), "code");
1344        assert_eq!(FinalChunkKind::Table.as_storage_kind(), "table");
1345        assert_eq!(FinalChunkKind::Mixed.as_storage_kind(), "mixed");
1346    }
1347
1348    #[test]
1349    fn chunk_kind_parses_storage_labels() {
1350        assert_eq!(
1351            FinalChunkKind::try_from("section").expect("parse section"),
1352            FinalChunkKind::Section
1353        );
1354        assert_eq!(
1355            FinalChunkKind::try_from("paragraph").expect("parse paragraph"),
1356            FinalChunkKind::Paragraph
1357        );
1358        assert_eq!(
1359            FinalChunkKind::try_from("code").expect("parse code"),
1360            FinalChunkKind::Code
1361        );
1362        assert_eq!(
1363            FinalChunkKind::try_from("table").expect("parse table"),
1364            FinalChunkKind::Table
1365        );
1366        assert_eq!(
1367            FinalChunkKind::try_from("mixed").expect("parse mixed"),
1368            FinalChunkKind::Mixed
1369        );
1370    }
1371
1372    #[test]
1373    fn chunk_kind_rejects_unknown_storage_labels() {
1374        let err = FinalChunkKind::try_from("unknown").expect_err("unknown label should fail");
1375        assert!(err.to_string().contains("invalid stored chunk kind"));
1376    }
1377
1378    #[test]
1379    fn whitespace_token_counter_counts_word_boundaries() {
1380        let counter = WhitespaceTokenCounter;
1381        assert_eq!(counter.count("").expect("count empty"), 0);
1382        assert_eq!(counter.count("alpha").expect("count token"), 1);
1383        assert_eq!(
1384            counter
1385                .count("alpha beta\tgamma\n\ndelta")
1386                .expect("count whitespace"),
1387            4
1388        );
1389    }
1390
1391    struct SeparatorAwareCounter;
1392
1393    impl TokenCounter for SeparatorAwareCounter {
1394        fn count(&self, text: &str) -> crate::Result<usize> {
1395            Ok(text.split_whitespace().count() + text.matches("\n\n").count())
1396        }
1397    }
1398
1399    struct CharCountCounter;
1400
1401    impl TokenCounter for CharCountCounter {
1402        fn count(&self, text: &str) -> crate::Result<usize> {
1403            Ok(text.chars().count())
1404        }
1405    }
1406
1407    struct ByteBoundCounter {
1408        calls: Cell<usize>,
1409    }
1410
1411    impl TokenCounter for ByteBoundCounter {
1412        fn count(&self, _text: &str) -> crate::Result<usize> {
1413            self.calls.set(self.calls.get() + 1);
1414            Ok(usize::MAX)
1415        }
1416
1417        fn fits_within_token_limit_by_byte_len(&self, byte_len: usize, max_tokens: usize) -> bool {
1418            byte_len
1419                .checked_add(2)
1420                .is_some_and(|upper_bound| upper_bound <= max_tokens)
1421        }
1422    }
1423
1424    #[test]
1425    fn chunk_document_skips_count_when_byte_length_guarantees_fit() {
1426        let policy = ChunkPolicy {
1427            target_tokens: 8,
1428            soft_max_tokens: 8,
1429            hard_max_tokens: 8,
1430            boundary_overlap_tokens: 0,
1431            neighbor_window: 1,
1432            contextual_prefix: true,
1433        };
1434        let document = ExtractedDocument {
1435            blocks: vec![block_with(BlockKind::Paragraph, "alpha", 0, &[])],
1436            metadata: HashMap::new(),
1437            title: None,
1438        };
1439        let counter = ByteBoundCounter {
1440            calls: Cell::new(0),
1441        };
1442
1443        let chunks = chunk_document_with_counter(&document, &policy, &counter)
1444            .expect("chunk with byte-bound counter");
1445
1446        assert_eq!(chunks.len(), 1);
1447        assert_eq!(chunks[0].text, "alpha");
1448        assert_eq!(counter.calls.get(), 0);
1449    }
1450
1451    #[test]
1452    fn chunk_document_with_counter_sizes_candidate_chunk_text_not_additive_blocks() {
1453        let policy = ChunkPolicy {
1454            target_tokens: 2,
1455            soft_max_tokens: 2,
1456            hard_max_tokens: 8,
1457            boundary_overlap_tokens: 0,
1458            neighbor_window: 1,
1459            contextual_prefix: true,
1460        };
1461        let document = ExtractedDocument {
1462            blocks: vec![
1463                block_with(BlockKind::Paragraph, "alpha", 0, &[]),
1464                block_with(BlockKind::Paragraph, "beta", 8, &[]),
1465            ],
1466            metadata: HashMap::new(),
1467            title: None,
1468        };
1469
1470        let chunks = chunk_document_with_counter(&document, &policy, &SeparatorAwareCounter)
1471            .expect("chunk with separator-aware counter");
1472        assert_eq!(chunks.len(), 2);
1473        assert_eq!(chunks[0].text, "alpha");
1474        assert_eq!(chunks[1].text, "beta");
1475    }
1476
1477    #[test]
1478    fn canonical_chunk_text_matches_stored_span_after_split_fragments_pack() {
1479        let policy = ChunkPolicy {
1480            target_tokens: 2,
1481            soft_max_tokens: 6,
1482            hard_max_tokens: 3,
1483            boundary_overlap_tokens: 0,
1484            neighbor_window: 1,
1485            contextual_prefix: true,
1486        };
1487        let text = "one two three four five six";
1488        let document = ExtractedDocument {
1489            blocks: vec![block_with(BlockKind::Paragraph, text, 0, &[])],
1490            metadata: HashMap::new(),
1491            title: None,
1492        };
1493
1494        let chunks = chunk_canonical_document(&document, &policy);
1495
1496        assert_eq!(chunks.len(), 1);
1497        assert_eq!(chunks[0].offset, 0);
1498        assert_eq!(chunks[0].length, text.len());
1499        assert_eq!(chunks[0].text, text);
1500    }
1501
1502    #[test]
1503    fn chunk_document_with_counter_falls_back_to_char_boundaries_for_single_oversized_token() {
1504        let policy = ChunkPolicy {
1505            target_tokens: 4,
1506            soft_max_tokens: 4,
1507            hard_max_tokens: 4,
1508            boundary_overlap_tokens: 0,
1509            neighbor_window: 1,
1510            contextual_prefix: true,
1511        };
1512        let document = ExtractedDocument {
1513            blocks: vec![block_with(BlockKind::CodeFence, "abcdefghij", 0, &[])],
1514            metadata: HashMap::new(),
1515            title: None,
1516        };
1517
1518        let chunks = chunk_document_with_counter(&document, &policy, &CharCountCounter)
1519            .expect("chunk oversized single token");
1520        assert_eq!(chunks.len(), 3);
1521        assert_eq!(chunks[0].text, "abcd");
1522        assert_eq!(chunks[1].text, "efgh");
1523        assert_eq!(chunks[2].text, "ij");
1524    }
1525
1526    #[test]
1527    fn chunk_document_packs_adjacent_blocks_within_soft_max() {
1528        let policy = ChunkPolicy {
1529            target_tokens: 3,
1530            soft_max_tokens: 4,
1531            hard_max_tokens: 8,
1532            boundary_overlap_tokens: 0,
1533            neighbor_window: 1,
1534            contextual_prefix: true,
1535        };
1536        let document = ExtractedDocument {
1537            blocks: vec![
1538                block_with(BlockKind::Paragraph, "alpha beta", 0, &[]),
1539                block_with(BlockKind::Paragraph, "gamma", 12, &[]),
1540                block_with(BlockKind::Paragraph, "delta epsilon", 20, &[]),
1541            ],
1542            metadata: HashMap::new(),
1543            title: None,
1544        };
1545
1546        let chunks = chunk_document(&document, &policy);
1547        assert_eq!(chunks.len(), 2);
1548        assert_eq!(chunks[0].text, "alpha beta\n\ngamma");
1549        assert_eq!(chunks[0].offset, 0);
1550        assert_eq!(chunks[0].length, 17);
1551        assert_eq!(chunks[0].kind, FinalChunkKind::Paragraph);
1552        assert_eq!(chunks[1].text, "delta epsilon");
1553    }
1554
1555    #[test]
1556    fn chunk_document_resolves_heading_from_structural_path() {
1557        let policy = ChunkPolicy {
1558            target_tokens: 2,
1559            soft_max_tokens: 4,
1560            hard_max_tokens: 8,
1561            boundary_overlap_tokens: 0,
1562            neighbor_window: 1,
1563            contextual_prefix: true,
1564        };
1565        let document = ExtractedDocument {
1566            blocks: vec![block_with(
1567                BlockKind::Paragraph,
1568                "body text",
1569                100,
1570                &["Guide", "Intro"],
1571            )],
1572            metadata: HashMap::new(),
1573            title: Some("Doc".to_string()),
1574        };
1575
1576        let chunks = chunk_document(&document, &policy);
1577        assert_eq!(chunks.len(), 1);
1578        assert_eq!(chunks[0].heading.as_deref(), Some("Guide > Intro"));
1579    }
1580
1581    #[test]
1582    fn chunk_document_with_counter_handles_empty_input() {
1583        let policy = ChunkPolicy::default();
1584        let document = ExtractedDocument {
1585            blocks: vec![],
1586            metadata: HashMap::new(),
1587            title: None,
1588        };
1589        let counter = WhitespaceTokenCounter;
1590
1591        let chunks =
1592            chunk_document_with_counter(&document, &policy, &counter).expect("chunk empty input");
1593        assert!(chunks.is_empty());
1594    }
1595
1596    #[test]
1597    fn chunk_document_splits_oversized_block_at_hard_max_with_overlap() {
1598        let policy = ChunkPolicy {
1599            target_tokens: 4,
1600            soft_max_tokens: 4,
1601            hard_max_tokens: 4,
1602            boundary_overlap_tokens: 1,
1603            neighbor_window: 1,
1604            contextual_prefix: true,
1605        };
1606        let document = ExtractedDocument {
1607            blocks: vec![block_with(
1608                BlockKind::Paragraph,
1609                "one two three four five six seven eight nine ten",
1610                10,
1611                &["Doc"],
1612            )],
1613            metadata: HashMap::new(),
1614            title: None,
1615        };
1616
1617        let chunks = chunk_document(&document, &policy);
1618        assert_eq!(chunks.len(), 3);
1619        assert_eq!(chunks[0].text, "one two three four");
1620        assert_eq!(chunks[1].text, "four five six seven");
1621        assert_eq!(chunks[2].text, "seven eight nine ten");
1622
1623        assert_eq!(chunks[0].offset, 10);
1624        assert_eq!(chunks[0].length, 18);
1625        assert_eq!(chunks[1].offset, 24);
1626        assert_eq!(chunks[1].length, 19);
1627        assert_eq!(chunks[2].offset, 38);
1628        assert_eq!(chunks[2].length, 20);
1629    }
1630
1631    #[test]
1632    fn chunk_document_prefers_sentence_boundaries_for_narrative_forced_split() {
1633        let policy = ChunkPolicy {
1634            target_tokens: 4,
1635            soft_max_tokens: 4,
1636            hard_max_tokens: 4,
1637            boundary_overlap_tokens: 0,
1638            neighbor_window: 1,
1639            contextual_prefix: true,
1640        };
1641        let document = ExtractedDocument {
1642            blocks: vec![block_with(
1643                BlockKind::Paragraph,
1644                "alpha one. beta two three. gamma four five. delta six seven.",
1645                0,
1646                &["Doc"],
1647            )],
1648            metadata: HashMap::new(),
1649            title: None,
1650        };
1651
1652        let chunks = chunk_document(&document, &policy);
1653        assert_eq!(chunks.len(), 4);
1654        assert_eq!(chunks[0].text, "alpha one.");
1655        assert_eq!(chunks[1].text, "beta two three.");
1656        assert_eq!(chunks[2].text, "gamma four five.");
1657        assert_eq!(chunks[3].text, "delta six seven.");
1658    }
1659
1660    #[test]
1661    fn chunk_document_keeps_token_window_split_for_non_narrative_blocks() {
1662        let policy = ChunkPolicy {
1663            target_tokens: 4,
1664            soft_max_tokens: 4,
1665            hard_max_tokens: 4,
1666            boundary_overlap_tokens: 0,
1667            neighbor_window: 1,
1668            contextual_prefix: true,
1669        };
1670        let document = ExtractedDocument {
1671            blocks: vec![block_with(
1672                BlockKind::CodeFence,
1673                "alpha. beta gamma delta epsilon zeta",
1674                0,
1675                &[],
1676            )],
1677            metadata: HashMap::new(),
1678            title: None,
1679        };
1680
1681        let chunks = chunk_document(&document, &policy);
1682        assert_eq!(chunks.len(), 2);
1683        assert_eq!(chunks[0].text, "alpha. beta gamma delta");
1684        assert_eq!(chunks[1].text, "epsilon zeta");
1685        assert!(chunks
1686            .iter()
1687            .all(|chunk| chunk.kind == FinalChunkKind::Code));
1688    }
1689
1690    #[test]
1691    fn chunk_document_carries_table_header_for_row_only_chunks() {
1692        let policy = ChunkPolicy {
1693            target_tokens: 4,
1694            soft_max_tokens: 4,
1695            hard_max_tokens: 4,
1696            boundary_overlap_tokens: 0,
1697            neighbor_window: 1,
1698            contextual_prefix: true,
1699        };
1700        let document = ExtractedDocument {
1701            blocks: vec![
1702                block_with(BlockKind::TableHeader, "h1 h2 h3 h4", 0, &[]),
1703                block_with(BlockKind::TableRow, "r1a r1b r1c r1d", 20, &[]),
1704                block_with(BlockKind::TableRow, "r2a r2b r2c r2d", 40, &[]),
1705            ],
1706            metadata: HashMap::new(),
1707            title: None,
1708        };
1709
1710        let chunks = chunk_document(&document, &policy);
1711        assert_eq!(chunks.len(), 3);
1712
1713        assert_eq!(chunks[0].text, "h1 h2 h3 h4");
1714        assert_eq!(chunks[0].offset, 0);
1715        assert_eq!(chunks[0].length, 11);
1716
1717        assert_eq!(chunks[1].text, "h1 h2 h3 h4\nr1a r1b r1c r1d");
1718        assert_eq!(chunks[1].offset, 20);
1719        assert_eq!(chunks[1].length, 15);
1720
1721        assert_eq!(chunks[2].text, "h1 h2 h3 h4\nr2a r2b r2c r2d");
1722        assert_eq!(chunks[2].offset, 40);
1723        assert_eq!(chunks[2].length, 15);
1724        assert!(chunks
1725            .iter()
1726            .all(|chunk| chunk.kind == FinalChunkKind::Table));
1727    }
1728
1729    #[test]
1730    fn canonical_table_row_splits_keep_exact_text_and_prefix_continuations() {
1731        let policy = ChunkPolicy {
1732            target_tokens: 4,
1733            soft_max_tokens: 4,
1734            hard_max_tokens: 5,
1735            boundary_overlap_tokens: 0,
1736            neighbor_window: 1,
1737            contextual_prefix: true,
1738        };
1739        let header = "sku price status";
1740        let row = "alpha beta gamma delta epsilon zeta eta theta iota kappa";
1741        let document = ExtractedDocument {
1742            blocks: vec![
1743                block_with(BlockKind::TableHeader, header, 0, &[]),
1744                block_with(BlockKind::TableRow, row, 20, &[]),
1745            ],
1746            metadata: HashMap::new(),
1747            title: None,
1748        };
1749        let canonical = crate::ingest::canonical::build_canonical_document(&document);
1750
1751        let chunks = chunk_canonical_document(&canonical.document, &policy);
1752
1753        assert!(
1754            chunks
1755                .iter()
1756                .any(|chunk| chunk.retrieval_prefix.as_deref() == Some(header)),
1757            "expected at least one split continuation with table header prefix"
1758        );
1759        for chunk in &chunks {
1760            assert_eq!(
1761                chunk.text,
1762                canonical.text[chunk.offset..chunk.offset + chunk.length]
1763            );
1764            if chunk.retrieval_prefix.as_deref() == Some(header) {
1765                assert!(!chunk.text.trim_start().starts_with(header));
1766                assert!(chunk.retrieval_text().starts_with("sku price status\n"));
1767            }
1768        }
1769    }
1770
1771    #[test]
1772    fn canonical_table_row_with_oversized_header_still_chunks() {
1773        let policy = ChunkPolicy {
1774            target_tokens: 4,
1775            soft_max_tokens: 4,
1776            hard_max_tokens: 5,
1777            boundary_overlap_tokens: 0,
1778            neighbor_window: 1,
1779            contextual_prefix: true,
1780        };
1781        let header = "h1 h2 h3 h4 h5 h6 h7 h8";
1782        let document = ExtractedDocument {
1783            blocks: vec![
1784                block_with(BlockKind::TableHeader, header, 0, &[]),
1785                block_with(BlockKind::TableRow, "alpha beta gamma tailneedle", 30, &[]),
1786            ],
1787            metadata: HashMap::new(),
1788            title: None,
1789        };
1790        let canonical = crate::ingest::canonical::build_canonical_document(&document);
1791
1792        let chunks = chunk_canonical_document(&canonical.document, &policy);
1793
1794        assert!(
1795            chunks.iter().any(|chunk| chunk.text.contains("tailneedle")),
1796            "expected oversized-header row body to remain chunkable"
1797        );
1798        assert!(
1799            chunks.iter().all(|chunk| chunk.retrieval_prefix.is_none()),
1800            "oversized headers should not be persisted as retrieval prefixes: {chunks:?}"
1801        );
1802        for chunk in &chunks {
1803            assert_eq!(
1804                chunk.text,
1805                canonical.text[chunk.offset..chunk.offset + chunk.length]
1806            );
1807        }
1808    }
1809
1810    #[test]
1811    fn chunk_document_flushes_on_structural_boundary_even_under_budget() {
1812        let policy = ChunkPolicy {
1813            target_tokens: 20,
1814            soft_max_tokens: 30,
1815            hard_max_tokens: 30,
1816            boundary_overlap_tokens: 0,
1817            neighbor_window: 1,
1818            contextual_prefix: true,
1819        };
1820        let document = ExtractedDocument {
1821            blocks: vec![
1822                block_with(BlockKind::Heading, "# Intro", 0, &[]),
1823                block_with(BlockKind::Paragraph, "alpha beta", 8, &["Intro"]),
1824                block_with(BlockKind::CodeFence, "fn alpha() {}", 20, &["Intro"]),
1825                block_with(BlockKind::Paragraph, "gamma delta", 34, &["Intro"]),
1826            ],
1827            metadata: HashMap::new(),
1828            title: None,
1829        };
1830
1831        let chunks = chunk_document(&document, &policy);
1832        assert_eq!(chunks.len(), 3);
1833        assert_eq!(chunks[0].kind, FinalChunkKind::Section);
1834        assert_eq!(chunks[1].kind, FinalChunkKind::Code);
1835        assert_eq!(chunks[2].kind, FinalChunkKind::Paragraph);
1836    }
1837
1838    #[test]
1839    fn chunk_document_flushes_on_heading_transition_even_under_budget() {
1840        let policy = ChunkPolicy {
1841            target_tokens: 20,
1842            soft_max_tokens: 50,
1843            hard_max_tokens: 50,
1844            boundary_overlap_tokens: 0,
1845            neighbor_window: 1,
1846            contextual_prefix: true,
1847        };
1848        let document = ExtractedDocument {
1849            blocks: vec![
1850                block_with(BlockKind::Heading, "# Intro", 0, &[]),
1851                block_with(BlockKind::Paragraph, "alpha beta", 8, &["Intro"]),
1852                block_with(BlockKind::Heading, "## Setup", 20, &["Intro"]),
1853                block_with(BlockKind::Paragraph, "gamma delta", 30, &["Intro", "Setup"]),
1854            ],
1855            metadata: HashMap::new(),
1856            title: None,
1857        };
1858
1859        let chunks = chunk_document(&document, &policy);
1860        assert_eq!(chunks.len(), 2);
1861        assert_eq!(chunks[0].kind, FinalChunkKind::Section);
1862        assert_eq!(chunks[0].heading.as_deref(), Some("Intro"));
1863        assert_eq!(chunks[1].kind, FinalChunkKind::Section);
1864        assert_eq!(chunks[1].heading.as_deref(), Some("Intro > Setup"));
1865    }
1866
1867    #[test]
1868    fn chunk_document_flushes_when_narrative_heading_path_changes() {
1869        let policy = ChunkPolicy {
1870            target_tokens: 20,
1871            soft_max_tokens: 50,
1872            hard_max_tokens: 50,
1873            boundary_overlap_tokens: 0,
1874            neighbor_window: 1,
1875            contextual_prefix: true,
1876        };
1877        let document = ExtractedDocument {
1878            blocks: vec![
1879                block_with(BlockKind::Paragraph, "alpha beta", 0, &["Intro"]),
1880                block_with(BlockKind::Paragraph, "gamma delta", 12, &["Setup"]),
1881            ],
1882            metadata: HashMap::new(),
1883            title: None,
1884        };
1885
1886        let chunks = chunk_document(&document, &policy);
1887        assert_eq!(chunks.len(), 2);
1888        assert!(chunks
1889            .iter()
1890            .all(|chunk| chunk.kind == FinalChunkKind::Paragraph));
1891    }
1892
1893    #[test]
1894    fn can_pack_together_allows_same_narrative_family() {
1895        let heading = block_with(BlockKind::Heading, "# Intro", 0, &[]);
1896        let paragraph = block_with(BlockKind::Paragraph, "text", 10, &[]);
1897        assert!(can_pack_together(&heading, &paragraph));
1898    }
1899
1900    #[test]
1901    fn can_pack_together_rejects_cross_family_boundaries() {
1902        let paragraph = block_with(BlockKind::Paragraph, "text", 0, &[]);
1903        let code = block_with(BlockKind::CodeFence, "fn a() {}", 10, &[]);
1904        let table = block_with(BlockKind::TableRow, "|a|b|", 20, &[]);
1905        assert!(!can_pack_together(&paragraph, &code));
1906        assert!(!can_pack_together(&paragraph, &table));
1907        assert!(!can_pack_together(&code, &table));
1908    }
1909
1910    #[test]
1911    fn can_pack_together_rejects_heading_start_when_chunk_is_open() {
1912        let paragraph = block_with(BlockKind::Paragraph, "text", 0, &["Intro"]);
1913        let heading = block_with(BlockKind::Heading, "## Setup", 10, &["Intro"]);
1914        assert!(!can_pack_together(&paragraph, &heading));
1915    }
1916
1917    #[test]
1918    fn can_pack_together_rejects_narrative_heading_path_mismatch() {
1919        let intro = block_with(BlockKind::Paragraph, "alpha", 0, &["Intro"]);
1920        let setup = block_with(BlockKind::Paragraph, "beta", 10, &["Setup"]);
1921        assert!(!can_pack_together(&intro, &setup));
1922    }
1923
1924    #[test]
1925    fn can_pack_together_treats_html_as_opaque() {
1926        let html = block_with(BlockKind::HtmlBlock, "<div>x</div>", 0, &[]);
1927        let html2 = block_with(BlockKind::HtmlBlock, "<div>y</div>", 20, &[]);
1928        let paragraph = block_with(BlockKind::Paragraph, "text", 40, &[]);
1929        assert!(!can_pack_together(&html, &html2));
1930        assert!(!can_pack_together(&html, &paragraph));
1931    }
1932
1933    #[test]
1934    fn token_split_makes_progress_when_overlap_exceeds_window() {
1935        let policy = ChunkPolicy {
1936            target_tokens: 2,
1937            soft_max_tokens: 2,
1938            hard_max_tokens: 2,
1939            boundary_overlap_tokens: 10,
1940            neighbor_window: 1,
1941            contextual_prefix: true,
1942        };
1943        let document = ExtractedDocument {
1944            blocks: vec![block_with(BlockKind::CodeFence, "a b c d e f", 0, &[])],
1945            metadata: HashMap::new(),
1946            title: None,
1947        };
1948
1949        let chunks = chunk_document(&document, &policy);
1950        assert_eq!(chunks.len(), 5);
1951        assert!(chunks.iter().all(|chunk| !chunk.text.is_empty()));
1952    }
1953
1954    #[test]
1955    fn narrative_sentence_split_makes_progress_with_high_overlap() {
1956        let policy = ChunkPolicy {
1957            target_tokens: 4,
1958            soft_max_tokens: 4,
1959            hard_max_tokens: 4,
1960            boundary_overlap_tokens: 10,
1961            neighbor_window: 1,
1962            contextual_prefix: true,
1963        };
1964        let document = ExtractedDocument {
1965            blocks: vec![block_with(
1966                BlockKind::Paragraph,
1967                "one two. three four. five six. seven eight.",
1968                0,
1969                &[],
1970            )],
1971            metadata: HashMap::new(),
1972            title: None,
1973        };
1974
1975        let chunks = chunk_document(&document, &policy);
1976        assert!(chunks.len() >= 2);
1977        assert!(chunks.iter().all(|chunk| !chunk.text.is_empty()));
1978    }
1979
1980    #[test]
1981    fn code_forced_split_prefers_blank_line_boundaries() {
1982        let policy = ChunkPolicy {
1983            target_tokens: 4,
1984            soft_max_tokens: 4,
1985            hard_max_tokens: 6,
1986            boundary_overlap_tokens: 0,
1987            neighbor_window: 1,
1988            contextual_prefix: true,
1989        };
1990        let document = ExtractedDocument {
1991            blocks: vec![block_with(
1992                BlockKind::CodeFence,
1993                "a1 a2 a3 a4\n\na5 a6 a7 a8\n\na9 a10 a11 a12",
1994                0,
1995                &[],
1996            )],
1997            metadata: HashMap::new(),
1998            title: None,
1999        };
2000
2001        let chunks = chunk_document(&document, &policy);
2002        assert_eq!(chunks.len(), 3);
2003        assert_eq!(chunks[0].text, "a1 a2 a3 a4");
2004        assert_eq!(chunks[1].text, "a5 a6 a7 a8");
2005        assert_eq!(chunks[2].text, "a9 a10 a11 a12");
2006        assert!(chunks
2007            .iter()
2008            .all(|chunk| chunk.kind == FinalChunkKind::Code));
2009    }
2010
2011    #[test]
2012    fn narrative_forced_split_uses_clause_boundaries_when_sentences_absent() {
2013        let policy = ChunkPolicy {
2014            target_tokens: 3,
2015            soft_max_tokens: 3,
2016            hard_max_tokens: 3,
2017            boundary_overlap_tokens: 0,
2018            neighbor_window: 1,
2019            contextual_prefix: true,
2020        };
2021        let document = ExtractedDocument {
2022            blocks: vec![block_with(
2023                BlockKind::Paragraph,
2024                "alpha beta, gamma delta, epsilon zeta, eta theta",
2025                0,
2026                &[],
2027            )],
2028            metadata: HashMap::new(),
2029            title: None,
2030        };
2031
2032        let chunks = chunk_document(&document, &policy);
2033        assert_eq!(chunks.len(), 4);
2034        assert_eq!(chunks[0].text, "alpha beta,");
2035        assert_eq!(chunks[1].text, "gamma delta,");
2036        assert_eq!(chunks[2].text, "epsilon zeta,");
2037        assert_eq!(chunks[3].text, "eta theta");
2038    }
2039
2040    #[test]
2041    fn score_narrative_cut_prefers_sentence_boundary_over_clause() {
2042        let sentence = score_narrative_cut(8, 0, 8, 20, NarrativeBoundary::Sentence);
2043        let clause = score_narrative_cut(8, 0, 8, 20, NarrativeBoundary::Clause);
2044        let token = score_narrative_cut(8, 0, 8, 20, NarrativeBoundary::TokenWindow);
2045        assert!(sentence > clause);
2046        assert!(clause > token);
2047    }
2048
2049    #[test]
2050    fn score_narrative_cut_prefers_proximity_to_target() {
2051        let close = score_narrative_cut(8, 0, 8, 20, NarrativeBoundary::Sentence);
2052        let far = score_narrative_cut(8, 0, 5, 20, NarrativeBoundary::Sentence);
2053        assert!(close > far);
2054    }
2055
2056    #[test]
2057    fn best_narrative_cut_penalizes_tiny_tail_when_choices_are_similar() {
2058        let selected = best_narrative_cut(
2059            &[
2060                (8, NarrativeBoundary::Sentence),
2061                (9, NarrativeBoundary::Sentence),
2062            ],
2063            8,
2064            0,
2065            10,
2066        )
2067        .expect("pick best cut");
2068        assert_eq!(selected, 8);
2069    }
2070}
kbolt_core/ingest/chunk.rs

kbolt_core/ingest/
chunk.rs