infiniloom_engine/document/
chunking.rs

1//! Document chunking for multi-turn LLM conversations.
2//!
3//! Splits large documents into token-budgeted chunks that respect
4//! section boundaries and never split tables or lists.
5
6use serde::{Deserialize, Serialize};
7
8use crate::document::types::*;
9use crate::tokenizer::{TokenModel, Tokenizer};
10
11/// A chunk of a document with metadata for multi-turn use.
12#[derive(Debug, Clone, Serialize, Deserialize)]
13pub struct DocumentChunk {
14    /// Chunk index (0-based)
15    pub index: usize,
16    /// Total number of chunks
17    pub total: usize,
18    /// Section path breadcrumb (e.g., "Chapter 3 > Section 3.2")
19    pub breadcrumb: String,
20    /// The chunk content as sections
21    pub sections: Vec<Section>,
22    /// Approximate token count (claude model)
23    pub token_count: usize,
24}
25
26/// Configuration for document chunking.
27#[derive(Debug, Clone)]
28pub struct ChunkConfig {
29    /// Maximum tokens per chunk (default: 4000)
30    pub max_tokens: usize,
31    /// Overlap tokens between chunks for context continuity (default: 200)
32    pub overlap_tokens: usize,
33}
34
35impl Default for ChunkConfig {
36    fn default() -> Self {
37        Self { max_tokens: 4000, overlap_tokens: 200 }
38    }
39}
40
41/// A flattened section candidate for chunking, with its breadcrumb path
42/// and estimated token count.
43struct SectionCandidate {
44    section: Section,
45    breadcrumb: String,
46    tokens: usize,
47}
48
49/// Estimate the token count for a single section (not including children).
50fn estimate_section_tokens(section: &Section, tokenizer: &Tokenizer) -> usize {
51    let mut text = String::new();
52    if let Some(title) = &section.title {
53        text.push_str(title);
54        text.push('\n');
55    }
56    for block in &section.content {
57        text.push_str(&block.text());
58        text.push('\n');
59    }
60    tokenizer.count(&text, TokenModel::Claude) as usize
61}
62
63/// Flatten a section tree into a list of candidates, each with its breadcrumb path.
64/// Children are recursively flattened, but each section is emitted without its children
65/// (the children become their own candidates).
66fn flatten_sections(sections: &[Section], parent_breadcrumb: &str) -> Vec<SectionCandidate> {
67    let tokenizer = Tokenizer::new();
68    let mut candidates = Vec::new();
69
70    for section in sections {
71        let breadcrumb = if parent_breadcrumb.is_empty() {
72            section.title.clone().unwrap_or_default()
73        } else if let Some(title) = &section.title {
74            format!("{} > {}", parent_breadcrumb, title)
75        } else {
76            parent_breadcrumb.to_owned()
77        };
78
79        // Create a candidate for this section without its children
80        let leaf_section = Section {
81            id: section.id.clone(),
82            level: section.level,
83            title: section.title.clone(),
84            number: section.number.clone(),
85            content: section.content.clone(),
86            children: Vec::new(),
87            importance: section.importance,
88        };
89
90        let tokens = estimate_section_tokens(&leaf_section, &tokenizer);
91        candidates.push(SectionCandidate {
92            section: leaf_section,
93            breadcrumb: breadcrumb.clone(),
94            tokens,
95        });
96
97        // Recursively flatten children
98        if !section.children.is_empty() {
99            candidates.extend(flatten_sections(&section.children, &breadcrumb));
100        }
101    }
102
103    candidates
104}
105
106/// Split a document into token-budgeted chunks.
107///
108/// The algorithm:
109/// 1. Flatten the document's section tree into candidates
110/// 2. Greedily pack candidates into chunks without exceeding `max_tokens`
111/// 3. Tables and lists are never split across chunks
112/// 4. Breaks prefer section boundaries (headings)
113/// 5. For overlap, the last section title from the previous chunk is included as context
114pub fn chunk_document(doc: &Document, config: &ChunkConfig) -> Vec<DocumentChunk> {
115    if doc.sections.is_empty() {
116        return Vec::new();
117    }
118
119    let candidates = flatten_sections(&doc.sections, "");
120    if candidates.is_empty() {
121        return Vec::new();
122    }
123
124    let mut chunks: Vec<(Vec<Section>, String, usize)> = Vec::new(); // (sections, breadcrumb, tokens)
125    let mut current_sections: Vec<Section> = Vec::new();
126    let mut current_tokens: usize = 0;
127    let mut current_breadcrumb = String::new();
128    let mut last_chunk_last_title: Option<String> = None;
129
130    for candidate in candidates {
131        let candidate_tokens = candidate.tokens;
132
133        // If adding this candidate would exceed the budget and we have content,
134        // finalize the current chunk
135        if !current_sections.is_empty() && current_tokens + candidate_tokens > config.max_tokens {
136            // Save the last section title for overlap context
137            let last_title = current_sections.last().and_then(|s| s.title.clone());
138
139            chunks.push((
140                std::mem::take(&mut current_sections),
141                std::mem::take(&mut current_breadcrumb),
142                current_tokens,
143            ));
144            current_tokens = 0;
145            last_chunk_last_title = last_title;
146        }
147
148        // If this is the first section in a new chunk and we have overlap context,
149        // add a breadcrumb hint from the previous chunk
150        if current_sections.is_empty() {
151            if let Some(ref prev_title) = last_chunk_last_title {
152                // Include the previous section title as a context marker
153                if !prev_title.is_empty() {
154                    let overlap_section = Section {
155                        id: None,
156                        level: 0,
157                        title: Some(format!("[continued from: {}]", prev_title)),
158                        number: None,
159                        content: Vec::new(),
160                        children: Vec::new(),
161                        importance: 0.0,
162                    };
163                    let tokenizer = Tokenizer::new();
164                    let overlap_tokens = estimate_section_tokens(&overlap_section, &tokenizer);
165                    if overlap_tokens <= config.overlap_tokens {
166                        current_sections.push(overlap_section);
167                        current_tokens += overlap_tokens;
168                    }
169                }
170            }
171        }
172
173        // Update breadcrumb to the first real section in this chunk
174        if current_breadcrumb.is_empty() && !candidate.breadcrumb.is_empty() {
175            current_breadcrumb = candidate.breadcrumb.clone();
176        }
177
178        // Add the candidate (even if it alone exceeds the budget - it gets its own chunk)
179        current_sections.push(candidate.section);
180        current_tokens += candidate_tokens;
181    }
182
183    // Don't forget the last chunk
184    if !current_sections.is_empty() {
185        chunks.push((current_sections, current_breadcrumb, current_tokens));
186    }
187
188    let total = chunks.len();
189    chunks
190        .into_iter()
191        .enumerate()
192        .map(|(i, (sections, breadcrumb, token_count))| DocumentChunk {
193            index: i,
194            total,
195            breadcrumb,
196            sections,
197            token_count,
198        })
199        .collect()
200}
201
202/// Format chunks as a JSON array.
203pub fn format_chunks_json(chunks: &[DocumentChunk]) -> String {
204    serde_json::to_string_pretty(chunks).unwrap_or_default()
205}
206
207/// Format a single chunk as readable text with header.
208pub fn format_chunk_text(chunk: &DocumentChunk) -> String {
209    let mut out = String::new();
210    out.push_str(&format!("--- Chunk {}/{} ---\n", chunk.index + 1, chunk.total));
211    if !chunk.breadcrumb.is_empty() {
212        out.push_str(&format!("Context: {}\n", chunk.breadcrumb));
213    }
214    out.push_str(&format!("Tokens: ~{}\n\n", chunk.token_count));
215    for section in &chunk.sections {
216        if let Some(title) = &section.title {
217            let level = section.level.min(6).max(1) as usize;
218            let prefix = "#".repeat(level);
219            out.push_str(&format!("{prefix} {title}\n\n"));
220        }
221        for block in &section.content {
222            out.push_str(&block.text());
223            out.push('\n');
224        }
225    }
226    out
227}
228
229#[cfg(test)]
230mod tests {
231    use super::*;
232
233    fn make_section(level: u8, title: &str, text: &str) -> Section {
234        let mut s = Section::new(level, title);
235        if !text.is_empty() {
236            s.content.push(ContentBlock::Paragraph(text.to_string()));
237        }
238        s
239    }
240
241    fn make_table_section(title: &str, rows: usize) -> Section {
242        let mut s = Section::new(1, title);
243        let table = Table {
244            caption: Some("Test Table".to_string()),
245            headers: vec!["Col A".to_string(), "Col B".to_string()],
246            rows: (0..rows)
247                .map(|i| vec![format!("Row {i} A data value"), format!("Row {i} B data value")])
248                .collect(),
249            alignments: vec![],
250        };
251        s.content.push(ContentBlock::Table(table));
252        s
253    }
254
255    #[test]
256    fn test_small_document_single_chunk() {
257        let mut doc = Document::new("/tmp/test.md", DocumentFormat::Markdown);
258        doc.sections.push(make_section(1, "Intro", "Hello world."));
259        doc.sections.push(make_section(1, "Conclusion", "Goodbye."));
260
261        let config = ChunkConfig { max_tokens: 4000, overlap_tokens: 200 };
262        let chunks = chunk_document(&doc, &config);
263
264        assert_eq!(chunks.len(), 1);
265        assert_eq!(chunks[0].index, 0);
266        assert_eq!(chunks[0].total, 1);
267        assert!(!chunks[0].sections.is_empty());
268    }
269
270    #[test]
271    fn test_large_document_multiple_chunks() {
272        let mut doc = Document::new("/tmp/test.md", DocumentFormat::Markdown);
273        // Create many sections with enough text to exceed a small token budget
274        for i in 0..20 {
275            let text = format!(
276                "This is section {i} with enough text to contribute meaningful tokens. \
277                 We need to ensure that the total document exceeds our small budget. \
278                 Adding more words helps ensure proper splitting across chunks."
279            );
280            doc.sections
281                .push(make_section(1, &format!("Section {i}"), &text));
282        }
283
284        let config = ChunkConfig {
285            max_tokens: 100, // Very small budget to force splitting
286            overlap_tokens: 0,
287        };
288        let chunks = chunk_document(&doc, &config);
289
290        assert!(chunks.len() > 1, "Should split into multiple chunks");
291        // All chunks should have correct total
292        for chunk in &chunks {
293            assert_eq!(chunk.total, chunks.len());
294        }
295        // Indices should be sequential
296        for (i, chunk) in chunks.iter().enumerate() {
297            assert_eq!(chunk.index, i);
298        }
299    }
300
301    #[test]
302    fn test_tables_not_split() {
303        let mut doc = Document::new("/tmp/test.md", DocumentFormat::Markdown);
304        // A section with a big table
305        doc.sections.push(make_table_section("Big Table", 50));
306
307        let config = ChunkConfig {
308            max_tokens: 50, // Very small, but table should still be in one chunk
309            overlap_tokens: 0,
310        };
311        let chunks = chunk_document(&doc, &config);
312
313        // The table section should be entirely in one chunk
314        let table_chunks: Vec<_> = chunks
315            .iter()
316            .filter(|c| {
317                c.sections.iter().any(|s| {
318                    s.content
319                        .iter()
320                        .any(|b| matches!(b, ContentBlock::Table(_)))
321                })
322            })
323            .collect();
324
325        assert_eq!(table_chunks.len(), 1, "Table should be entirely within a single chunk");
326    }
327
328    #[test]
329    fn test_breadcrumb_generation() {
330        let mut doc = Document::new("/tmp/test.md", DocumentFormat::Markdown);
331        let mut parent = Section::new(1, "Chapter 1");
332        parent
333            .content
334            .push(ContentBlock::Paragraph("Chapter intro text.".to_string()));
335        let child = make_section(2, "Section 1.1", "Child content.");
336        parent.children.push(child);
337        doc.sections.push(parent);
338
339        let config = ChunkConfig { max_tokens: 4000, overlap_tokens: 200 };
340        let chunks = chunk_document(&doc, &config);
341
342        // The breadcrumb should contain the top-level section title
343        assert!(!chunks.is_empty());
344        assert!(
345            chunks[0].breadcrumb.contains("Chapter 1"),
346            "Breadcrumb should contain parent title, got: {}",
347            chunks[0].breadcrumb
348        );
349    }
350
351    #[test]
352    fn test_chunk_index_total_numbering() {
353        let mut doc = Document::new("/tmp/test.md", DocumentFormat::Markdown);
354        for i in 0..10 {
355            let text = "A".repeat(500); // enough text to force many chunks with small budget
356            doc.sections.push(make_section(1, &format!("S{i}"), &text));
357        }
358
359        let config = ChunkConfig { max_tokens: 50, overlap_tokens: 0 };
360        let chunks = chunk_document(&doc, &config);
361
362        let total = chunks.len();
363        assert!(total > 1);
364        for (i, chunk) in chunks.iter().enumerate() {
365            assert_eq!(chunk.index, i);
366            assert_eq!(chunk.total, total);
367        }
368    }
369
370    #[test]
371    fn test_empty_document_returns_empty() {
372        let doc = Document::new("/tmp/test.md", DocumentFormat::Markdown);
373        let config = ChunkConfig::default();
374        let chunks = chunk_document(&doc, &config);
375        assert!(chunks.is_empty());
376    }
377
378    #[test]
379    fn test_chunk_config_defaults() {
380        let config = ChunkConfig::default();
381        assert_eq!(config.max_tokens, 4000);
382        assert_eq!(config.overlap_tokens, 200);
383    }
384
385    #[test]
386    fn test_format_chunk_text() {
387        let chunk = DocumentChunk {
388            index: 0,
389            total: 2,
390            breadcrumb: "Chapter 1 > Section 1.1".to_string(),
391            sections: vec![make_section(1, "Section 1.1", "Some content here.")],
392            token_count: 42,
393        };
394
395        let text = format_chunk_text(&chunk);
396        assert!(text.contains("--- Chunk 1/2 ---"));
397        assert!(text.contains("Context: Chapter 1 > Section 1.1"));
398        assert!(text.contains("Tokens: ~42"));
399        assert!(text.contains("# Section 1.1"));
400        assert!(text.contains("Some content here."));
401    }
402
403    #[test]
404    fn test_format_chunks_json() {
405        let chunks = vec![DocumentChunk {
406            index: 0,
407            total: 1,
408            breadcrumb: "Intro".to_string(),
409            sections: vec![make_section(1, "Intro", "Hello.")],
410            token_count: 10,
411        }];
412
413        let json = format_chunks_json(&chunks);
414        assert!(json.contains("\"index\": 0"));
415        assert!(json.contains("\"total\": 1"));
416        assert!(json.contains("\"breadcrumb\": \"Intro\""));
417    }
418}
infiniloom_engine/document/chunking.rs

infiniloom_engine/document/
chunking.rs