infiniloom_engine/document/
chunking.rs1use serde::{Deserialize, Serialize};
7
8use crate::document::types::*;
9use crate::tokenizer::{TokenModel, Tokenizer};
10
11#[derive(Debug, Clone, Serialize, Deserialize)]
13pub struct DocumentChunk {
14 pub index: usize,
16 pub total: usize,
18 pub breadcrumb: String,
20 pub sections: Vec<Section>,
22 pub token_count: usize,
24}
25
26#[derive(Debug, Clone)]
28pub struct ChunkConfig {
29 pub max_tokens: usize,
31 pub overlap_tokens: usize,
33}
34
35impl Default for ChunkConfig {
36 fn default() -> Self {
37 Self { max_tokens: 4000, overlap_tokens: 200 }
38 }
39}
40
41struct SectionCandidate {
44 section: Section,
45 breadcrumb: String,
46 tokens: usize,
47}
48
49fn estimate_section_tokens(section: &Section, tokenizer: &Tokenizer) -> usize {
51 let mut text = String::new();
52 if let Some(title) = §ion.title {
53 text.push_str(title);
54 text.push('\n');
55 }
56 for block in §ion.content {
57 text.push_str(&block.text());
58 text.push('\n');
59 }
60 tokenizer.count(&text, TokenModel::Claude) as usize
61}
62
63fn flatten_sections(sections: &[Section], parent_breadcrumb: &str) -> Vec<SectionCandidate> {
67 let tokenizer = Tokenizer::new();
68 let mut candidates = Vec::new();
69
70 for section in sections {
71 let breadcrumb = if parent_breadcrumb.is_empty() {
72 section.title.clone().unwrap_or_default()
73 } else if let Some(title) = §ion.title {
74 format!("{} > {}", parent_breadcrumb, title)
75 } else {
76 parent_breadcrumb.to_owned()
77 };
78
79 let leaf_section = Section {
81 id: section.id.clone(),
82 level: section.level,
83 title: section.title.clone(),
84 number: section.number.clone(),
85 content: section.content.clone(),
86 children: Vec::new(),
87 importance: section.importance,
88 };
89
90 let tokens = estimate_section_tokens(&leaf_section, &tokenizer);
91 candidates.push(SectionCandidate {
92 section: leaf_section,
93 breadcrumb: breadcrumb.clone(),
94 tokens,
95 });
96
97 if !section.children.is_empty() {
99 candidates.extend(flatten_sections(§ion.children, &breadcrumb));
100 }
101 }
102
103 candidates
104}
105
106pub fn chunk_document(doc: &Document, config: &ChunkConfig) -> Vec<DocumentChunk> {
115 if doc.sections.is_empty() {
116 return Vec::new();
117 }
118
119 let candidates = flatten_sections(&doc.sections, "");
120 if candidates.is_empty() {
121 return Vec::new();
122 }
123
124 let mut chunks: Vec<(Vec<Section>, String, usize)> = Vec::new(); let mut current_sections: Vec<Section> = Vec::new();
126 let mut current_tokens: usize = 0;
127 let mut current_breadcrumb = String::new();
128 let mut last_chunk_last_title: Option<String> = None;
129
130 for candidate in candidates {
131 let candidate_tokens = candidate.tokens;
132
133 if !current_sections.is_empty() && current_tokens + candidate_tokens > config.max_tokens {
136 let last_title = current_sections.last().and_then(|s| s.title.clone());
138
139 chunks.push((
140 std::mem::take(&mut current_sections),
141 std::mem::take(&mut current_breadcrumb),
142 current_tokens,
143 ));
144 current_tokens = 0;
145 last_chunk_last_title = last_title;
146 }
147
148 if current_sections.is_empty() {
151 if let Some(ref prev_title) = last_chunk_last_title {
152 if !prev_title.is_empty() {
154 let overlap_section = Section {
155 id: None,
156 level: 0,
157 title: Some(format!("[continued from: {}]", prev_title)),
158 number: None,
159 content: Vec::new(),
160 children: Vec::new(),
161 importance: 0.0,
162 };
163 let tokenizer = Tokenizer::new();
164 let overlap_tokens = estimate_section_tokens(&overlap_section, &tokenizer);
165 if overlap_tokens <= config.overlap_tokens {
166 current_sections.push(overlap_section);
167 current_tokens += overlap_tokens;
168 }
169 }
170 }
171 }
172
173 if current_breadcrumb.is_empty() && !candidate.breadcrumb.is_empty() {
175 current_breadcrumb = candidate.breadcrumb.clone();
176 }
177
178 current_sections.push(candidate.section);
180 current_tokens += candidate_tokens;
181 }
182
183 if !current_sections.is_empty() {
185 chunks.push((current_sections, current_breadcrumb, current_tokens));
186 }
187
188 let total = chunks.len();
189 chunks
190 .into_iter()
191 .enumerate()
192 .map(|(i, (sections, breadcrumb, token_count))| DocumentChunk {
193 index: i,
194 total,
195 breadcrumb,
196 sections,
197 token_count,
198 })
199 .collect()
200}
201
202pub fn format_chunks_json(chunks: &[DocumentChunk]) -> String {
204 serde_json::to_string_pretty(chunks).unwrap_or_default()
205}
206
207pub fn format_chunk_text(chunk: &DocumentChunk) -> String {
209 let mut out = String::new();
210 out.push_str(&format!("--- Chunk {}/{} ---\n", chunk.index + 1, chunk.total));
211 if !chunk.breadcrumb.is_empty() {
212 out.push_str(&format!("Context: {}\n", chunk.breadcrumb));
213 }
214 out.push_str(&format!("Tokens: ~{}\n\n", chunk.token_count));
215 for section in &chunk.sections {
216 if let Some(title) = §ion.title {
217 let level = section.level.min(6).max(1) as usize;
218 let prefix = "#".repeat(level);
219 out.push_str(&format!("{prefix} {title}\n\n"));
220 }
221 for block in §ion.content {
222 out.push_str(&block.text());
223 out.push('\n');
224 }
225 }
226 out
227}
228
229#[cfg(test)]
230mod tests {
231 use super::*;
232
233 fn make_section(level: u8, title: &str, text: &str) -> Section {
234 let mut s = Section::new(level, title);
235 if !text.is_empty() {
236 s.content.push(ContentBlock::Paragraph(text.to_string()));
237 }
238 s
239 }
240
241 fn make_table_section(title: &str, rows: usize) -> Section {
242 let mut s = Section::new(1, title);
243 let table = Table {
244 caption: Some("Test Table".to_string()),
245 headers: vec!["Col A".to_string(), "Col B".to_string()],
246 rows: (0..rows)
247 .map(|i| vec![format!("Row {i} A data value"), format!("Row {i} B data value")])
248 .collect(),
249 alignments: vec![],
250 };
251 s.content.push(ContentBlock::Table(table));
252 s
253 }
254
255 #[test]
256 fn test_small_document_single_chunk() {
257 let mut doc = Document::new("/tmp/test.md", DocumentFormat::Markdown);
258 doc.sections.push(make_section(1, "Intro", "Hello world."));
259 doc.sections.push(make_section(1, "Conclusion", "Goodbye."));
260
261 let config = ChunkConfig { max_tokens: 4000, overlap_tokens: 200 };
262 let chunks = chunk_document(&doc, &config);
263
264 assert_eq!(chunks.len(), 1);
265 assert_eq!(chunks[0].index, 0);
266 assert_eq!(chunks[0].total, 1);
267 assert!(!chunks[0].sections.is_empty());
268 }
269
270 #[test]
271 fn test_large_document_multiple_chunks() {
272 let mut doc = Document::new("/tmp/test.md", DocumentFormat::Markdown);
273 for i in 0..20 {
275 let text = format!(
276 "This is section {i} with enough text to contribute meaningful tokens. \
277 We need to ensure that the total document exceeds our small budget. \
278 Adding more words helps ensure proper splitting across chunks."
279 );
280 doc.sections
281 .push(make_section(1, &format!("Section {i}"), &text));
282 }
283
284 let config = ChunkConfig {
285 max_tokens: 100, overlap_tokens: 0,
287 };
288 let chunks = chunk_document(&doc, &config);
289
290 assert!(chunks.len() > 1, "Should split into multiple chunks");
291 for chunk in &chunks {
293 assert_eq!(chunk.total, chunks.len());
294 }
295 for (i, chunk) in chunks.iter().enumerate() {
297 assert_eq!(chunk.index, i);
298 }
299 }
300
301 #[test]
302 fn test_tables_not_split() {
303 let mut doc = Document::new("/tmp/test.md", DocumentFormat::Markdown);
304 doc.sections.push(make_table_section("Big Table", 50));
306
307 let config = ChunkConfig {
308 max_tokens: 50, overlap_tokens: 0,
310 };
311 let chunks = chunk_document(&doc, &config);
312
313 let table_chunks: Vec<_> = chunks
315 .iter()
316 .filter(|c| {
317 c.sections.iter().any(|s| {
318 s.content
319 .iter()
320 .any(|b| matches!(b, ContentBlock::Table(_)))
321 })
322 })
323 .collect();
324
325 assert_eq!(table_chunks.len(), 1, "Table should be entirely within a single chunk");
326 }
327
328 #[test]
329 fn test_breadcrumb_generation() {
330 let mut doc = Document::new("/tmp/test.md", DocumentFormat::Markdown);
331 let mut parent = Section::new(1, "Chapter 1");
332 parent
333 .content
334 .push(ContentBlock::Paragraph("Chapter intro text.".to_string()));
335 let child = make_section(2, "Section 1.1", "Child content.");
336 parent.children.push(child);
337 doc.sections.push(parent);
338
339 let config = ChunkConfig { max_tokens: 4000, overlap_tokens: 200 };
340 let chunks = chunk_document(&doc, &config);
341
342 assert!(!chunks.is_empty());
344 assert!(
345 chunks[0].breadcrumb.contains("Chapter 1"),
346 "Breadcrumb should contain parent title, got: {}",
347 chunks[0].breadcrumb
348 );
349 }
350
351 #[test]
352 fn test_chunk_index_total_numbering() {
353 let mut doc = Document::new("/tmp/test.md", DocumentFormat::Markdown);
354 for i in 0..10 {
355 let text = "A".repeat(500); doc.sections.push(make_section(1, &format!("S{i}"), &text));
357 }
358
359 let config = ChunkConfig { max_tokens: 50, overlap_tokens: 0 };
360 let chunks = chunk_document(&doc, &config);
361
362 let total = chunks.len();
363 assert!(total > 1);
364 for (i, chunk) in chunks.iter().enumerate() {
365 assert_eq!(chunk.index, i);
366 assert_eq!(chunk.total, total);
367 }
368 }
369
370 #[test]
371 fn test_empty_document_returns_empty() {
372 let doc = Document::new("/tmp/test.md", DocumentFormat::Markdown);
373 let config = ChunkConfig::default();
374 let chunks = chunk_document(&doc, &config);
375 assert!(chunks.is_empty());
376 }
377
378 #[test]
379 fn test_chunk_config_defaults() {
380 let config = ChunkConfig::default();
381 assert_eq!(config.max_tokens, 4000);
382 assert_eq!(config.overlap_tokens, 200);
383 }
384
385 #[test]
386 fn test_format_chunk_text() {
387 let chunk = DocumentChunk {
388 index: 0,
389 total: 2,
390 breadcrumb: "Chapter 1 > Section 1.1".to_string(),
391 sections: vec![make_section(1, "Section 1.1", "Some content here.")],
392 token_count: 42,
393 };
394
395 let text = format_chunk_text(&chunk);
396 assert!(text.contains("--- Chunk 1/2 ---"));
397 assert!(text.contains("Context: Chapter 1 > Section 1.1"));
398 assert!(text.contains("Tokens: ~42"));
399 assert!(text.contains("# Section 1.1"));
400 assert!(text.contains("Some content here."));
401 }
402
403 #[test]
404 fn test_format_chunks_json() {
405 let chunks = vec![DocumentChunk {
406 index: 0,
407 total: 1,
408 breadcrumb: "Intro".to_string(),
409 sections: vec![make_section(1, "Intro", "Hello.")],
410 token_count: 10,
411 }];
412
413 let json = format_chunks_json(&chunks);
414 assert!(json.contains("\"index\": 0"));
415 assert!(json.contains("\"total\": 1"));
416 assert!(json.contains("\"breadcrumb\": \"Intro\""));
417 }
418}