1use crate::types::structure::{
9 ChunkType, ChunkingOptions, ChunkingResult, CodeChunkingStrategy, ElementData,
10 StructuredChunk, StructuredDocument, StructuredTable, TableChunkingStrategy,
11};
12
13pub struct StructuralChunker {
32 options: ChunkingOptions,
33}
34
35impl Default for StructuralChunker {
36 fn default() -> Self {
37 Self::new(ChunkingOptions::default())
38 }
39}
40
41impl StructuralChunker {
42 pub fn new(options: ChunkingOptions) -> Self {
44 Self { options }
45 }
46
47 pub fn with_max_chars(max_chars: usize) -> Self {
49 Self {
50 options: ChunkingOptions {
51 max_chars,
52 ..Default::default()
53 },
54 }
55 }
56
57 pub fn chunk(&self, doc: &StructuredDocument) -> ChunkingResult {
59 let mut result = ChunkingResult::empty();
60 let mut current_text = String::new();
61 let mut current_start = 0;
62 let mut pending_heading: Option<&str> = None;
63
64 for element in &doc.elements {
65 match &element.data {
66 ElementData::Table(table) => {
67 if !current_text.trim().is_empty() {
69 self.emit_text_chunk(
70 &mut result,
71 ¤t_text,
72 current_start,
73 element.char_start,
74 );
75 current_text.clear();
76 }
77
78 self.chunk_table(&mut result, table, element.char_start, element.char_end);
80 current_start = element.char_end;
81 }
82
83 ElementData::CodeBlock(block) => {
84 if !current_text.trim().is_empty() {
86 self.emit_text_chunk(
87 &mut result,
88 ¤t_text,
89 current_start,
90 element.char_start,
91 );
92 current_text.clear();
93 }
94
95 self.chunk_code_block(
97 &mut result,
98 &block.format(),
99 block.language.as_deref(),
100 element.char_start,
101 element.char_end,
102 );
103 current_start = element.char_end;
104 }
105
106 ElementData::Heading(heading) => {
107 if self.options.include_section_headers {
108 pending_heading = Some(heading.format().leak());
110 }
111
112 if !current_text.is_empty() {
114 current_text.push('\n');
115 }
116 current_text.push_str(&heading.format());
117 }
118
119 ElementData::List(list) => {
120 if self.options.preserve_lists {
121 let list_text = list.format();
122 let combined_len = current_text.chars().count() + list_text.chars().count();
123
124 if combined_len > self.options.max_chars && !current_text.trim().is_empty() {
125 self.emit_text_chunk(
127 &mut result,
128 ¤t_text,
129 current_start,
130 element.char_start,
131 );
132 current_text.clear();
133 current_start = element.char_start;
134 }
135
136 if !current_text.is_empty() {
138 current_text.push_str("\n\n");
139 }
140 current_text.push_str(&list_text);
141 } else {
142 let text = element.text();
144 if !current_text.is_empty() {
145 current_text.push_str("\n\n");
146 }
147 current_text.push_str(&text);
148 }
149 }
150
151 ElementData::Paragraph { text } => {
152 let text_len = text.chars().count();
153 let current_len = current_text.chars().count();
154
155 if current_len + text_len > self.options.max_chars && !current_text.trim().is_empty() {
156 self.emit_text_chunk(
158 &mut result,
159 ¤t_text,
160 current_start,
161 element.char_start,
162 );
163 current_text.clear();
164 current_start = element.char_start;
165
166 if let Some(heading) = pending_heading.take() {
168 current_text.push_str(heading);
169 current_text.push_str("\n\n");
170 }
171 }
172
173 if !current_text.is_empty() && !current_text.ends_with('\n') {
174 current_text.push_str("\n\n");
175 }
176 current_text.push_str(text);
177 }
178
179 ElementData::BlockQuote { text } => {
180 if !current_text.is_empty() {
181 current_text.push_str("\n\n");
182 }
183 current_text.push_str("> ");
184 current_text.push_str(text);
185 }
186
187 ElementData::Separator => {
188 if !current_text.trim().is_empty() {
190 self.emit_text_chunk(
191 &mut result,
192 ¤t_text,
193 current_start,
194 element.char_start,
195 );
196 current_text.clear();
197 }
198 current_start = element.char_end;
199 pending_heading = None;
200 }
201
202 ElementData::Raw { text } => {
203 if !current_text.is_empty() {
204 current_text.push_str("\n\n");
205 }
206 current_text.push_str(text);
207 }
208 }
209 }
210
211 if !current_text.trim().is_empty() {
213 self.emit_text_chunk(
214 &mut result,
215 ¤t_text,
216 current_start,
217 doc.total_chars,
218 );
219 }
220
221 result
222 }
223
224 fn emit_text_chunk(
226 &self,
227 result: &mut ChunkingResult,
228 text: &str,
229 char_start: usize,
230 char_end: usize,
231 ) {
232 let index = result.chunks.len();
233 result.chunks.push(StructuredChunk::text(
234 text.trim(),
235 index,
236 char_start,
237 char_end,
238 ));
239 }
240
241 fn chunk_table(
243 &self,
244 result: &mut ChunkingResult,
245 table: &StructuredTable,
246 char_start: usize,
247 char_end: usize,
248 ) {
249 result.tables_processed += 1;
250
251 match self.options.table_handling {
252 TableChunkingStrategy::PreserveWhole => {
253 let index = result.chunks.len();
255 result.chunks.push(StructuredChunk::table(
256 &table.raw_text,
257 index,
258 &table.id,
259 char_start,
260 char_end,
261 ));
262 }
263
264 TableChunkingStrategy::SplitWithHeader => {
265 let header_text = table.format_header();
267 let header_chars = header_text.chars().count();
268
269 if table.char_count() <= self.options.max_chars {
271 let index = result.chunks.len();
272 result.chunks.push(StructuredChunk::table(
273 &table.raw_text,
274 index,
275 &table.id,
276 char_start,
277 char_end,
278 ));
279 return;
280 }
281
282 result.tables_split += 1;
284 let data_rows: Vec<_> = table.data_rows().collect();
285
286 if data_rows.is_empty() {
287 let index = result.chunks.len();
289 result.chunks.push(StructuredChunk::table(
290 &header_text,
291 index,
292 &table.id,
293 char_start,
294 char_end,
295 ));
296 return;
297 }
298
299 let max_rows_per_chunk = self.calculate_rows_per_chunk(table, header_chars);
300 let total_parts = (data_rows.len() + max_rows_per_chunk - 1) / max_rows_per_chunk;
301
302 let mut part = 1;
303 let mut row_idx = 0;
304
305 while row_idx < data_rows.len() {
306 let end_idx = (row_idx + max_rows_per_chunk).min(data_rows.len());
307 let rows_in_chunk = &data_rows[row_idx..end_idx];
308
309 let mut chunk_text = header_text.clone();
311 for row in rows_in_chunk {
312 chunk_text.push('\n');
313 chunk_text.push_str(&table.format_row(row));
314 }
315
316 let index = result.chunks.len();
317 if part == 1 {
318 result.chunks.push(StructuredChunk::table(
320 &chunk_text,
321 index,
322 &table.id,
323 char_start,
324 char_end,
325 ));
326 } else {
327 result.chunks.push(StructuredChunk::table_continuation(
329 &chunk_text,
330 index,
331 &table.id,
332 part as u32,
333 total_parts as u32,
334 &header_text,
335 char_start,
336 char_end,
337 ));
338 }
339
340 row_idx = end_idx;
341 part += 1;
342 }
343 }
344
345 TableChunkingStrategy::Naive => {
346 let index = result.chunks.len();
348 result.chunks.push(StructuredChunk::text(
349 &table.raw_text,
350 index,
351 char_start,
352 char_end,
353 ));
354 }
355 }
356 }
357
358 fn calculate_rows_per_chunk(&self, table: &StructuredTable, header_chars: usize) -> usize {
360 let available = self.options.max_chars.saturating_sub(header_chars + 10);
361 if available == 0 {
362 return 1;
363 }
364
365 let total_row_chars: usize = table
367 .data_rows()
368 .map(|row| {
369 row.cells.iter().map(|c| c.text.chars().count()).sum::<usize>()
370 + row.cells.len() * 3 })
372 .sum();
373
374 let row_count = table.data_row_count();
375 if row_count == 0 {
376 return 1;
377 }
378
379 let avg_row_chars = total_row_chars / row_count;
380 if avg_row_chars == 0 {
381 return row_count;
382 }
383
384 (available / avg_row_chars).max(1)
385 }
386
387 fn chunk_code_block(
389 &self,
390 result: &mut ChunkingResult,
391 formatted_text: &str,
392 language: Option<&str>,
393 char_start: usize,
394 char_end: usize,
395 ) {
396 result.code_blocks_processed += 1;
397
398 match self.options.code_handling {
399 CodeChunkingStrategy::PreserveWhole => {
400 let index = result.chunks.len();
402 result.chunks.push(StructuredChunk {
403 text: formatted_text.to_string(),
404 chunk_type: ChunkType::CodeBlock,
405 index,
406 element_id: None,
407 part: None,
408 total_parts: None,
409 context: language.map(|s| s.to_string()),
410 char_start,
411 char_end,
412 });
413 }
414
415 CodeChunkingStrategy::SplitAtBoundaries => {
416 let block_chars = formatted_text.chars().count();
418 if block_chars <= self.options.max_chars {
419 let index = result.chunks.len();
421 result.chunks.push(StructuredChunk {
422 text: formatted_text.to_string(),
423 chunk_type: ChunkType::CodeBlock,
424 index,
425 element_id: None,
426 part: None,
427 total_parts: None,
428 context: language.map(|s| s.to_string()),
429 char_start,
430 char_end,
431 });
432 } else {
433 self.split_code_at_boundaries(result, formatted_text, language, char_start, char_end);
435 }
436 }
437
438 CodeChunkingStrategy::SplitWithOverlap => {
439 self.split_code_with_overlap(result, formatted_text, language, char_start, char_end);
441 }
442 }
443 }
444
445 fn split_code_at_boundaries(
447 &self,
448 result: &mut ChunkingResult,
449 formatted_text: &str,
450 language: Option<&str>,
451 char_start: usize,
452 char_end: usize,
453 ) {
454 let lines: Vec<&str> = formatted_text.lines().collect();
456 let mut chunks = Vec::new();
457 let mut current_chunk = Vec::new();
458 let mut current_chars = 0;
459
460 let fence_start = lines.first().copied().unwrap_or("```");
462 let fence_end = lines.last().copied().unwrap_or("```");
463 let content_lines = &lines[1..lines.len().saturating_sub(1)];
464
465 for (i, line) in content_lines.iter().enumerate() {
466 let line_chars = line.chars().count() + 1;
467
468 let is_boundary = line.trim().is_empty()
470 || line.trim().starts_with("fn ")
471 || line.trim().starts_with("def ")
472 || line.trim().starts_with("function ")
473 || line.trim().starts_with("class ")
474 || line.trim().starts_with("impl ");
475
476 if is_boundary
477 && current_chars > self.options.max_chars / 2
478 && i > 0
479 {
480 if !current_chunk.is_empty() {
482 chunks.push(current_chunk.join("\n"));
483 current_chunk.clear();
484 current_chars = 0;
485 }
486 }
487
488 current_chunk.push(*line);
489 current_chars += line_chars;
490 }
491
492 if !current_chunk.is_empty() {
494 chunks.push(current_chunk.join("\n"));
495 }
496
497 let total_parts = chunks.len();
499 for (i, chunk_content) in chunks.into_iter().enumerate() {
500 let index = result.chunks.len();
501 let chunk_text = format!("{}{}\n{}\n{}", fence_start, language.unwrap_or(""), chunk_content, fence_end);
502
503 if i == 0 {
504 result.chunks.push(StructuredChunk {
505 text: chunk_text,
506 chunk_type: ChunkType::CodeBlock,
507 index,
508 element_id: None,
509 part: Some(1),
510 total_parts: Some(total_parts as u32),
511 context: language.map(|s| s.to_string()),
512 char_start,
513 char_end,
514 });
515 } else {
516 result.chunks.push(StructuredChunk {
517 text: chunk_text,
518 chunk_type: ChunkType::CodeBlockContinuation,
519 index,
520 element_id: None,
521 part: Some((i + 1) as u32),
522 total_parts: Some(total_parts as u32),
523 context: language.map(|s| s.to_string()),
524 char_start,
525 char_end,
526 });
527 }
528 }
529 }
530
531 fn split_code_with_overlap(
533 &self,
534 result: &mut ChunkingResult,
535 formatted_text: &str,
536 language: Option<&str>,
537 char_start: usize,
538 char_end: usize,
539 ) {
540 let lines: Vec<&str> = formatted_text.lines().collect();
541 let overlap_lines = (self.options.overlap_chars / 40).max(2);
542
543 let fence_start = lines.first().copied().unwrap_or("```");
545 let fence_end = lines.last().copied().unwrap_or("```");
546 let content_lines = &lines[1..lines.len().saturating_sub(1)];
547
548 let mut chunks = Vec::new();
549 let mut start_line = 0;
550
551 while start_line < content_lines.len() {
552 let mut current_chars = 0;
553 let mut end_line = start_line;
554
555 while end_line < content_lines.len() {
556 current_chars += content_lines[end_line].chars().count() + 1;
557 if current_chars > self.options.max_chars {
558 break;
559 }
560 end_line += 1;
561 }
562
563 if end_line == start_line {
564 end_line = start_line + 1;
565 }
566
567 let chunk_lines: Vec<&str> = content_lines[start_line..end_line].to_vec();
568 chunks.push(chunk_lines.join("\n"));
569
570 start_line = if end_line >= content_lines.len() {
572 content_lines.len()
573 } else {
574 end_line.saturating_sub(overlap_lines)
575 };
576 }
577
578 let total_parts = chunks.len();
580 for (i, chunk_content) in chunks.into_iter().enumerate() {
581 let index = result.chunks.len();
582 let chunk_text = format!("{}{}\n{}\n{}", fence_start, language.unwrap_or(""), chunk_content, fence_end);
583
584 let chunk_type = if i == 0 {
585 ChunkType::CodeBlock
586 } else {
587 ChunkType::CodeBlockContinuation
588 };
589
590 result.chunks.push(StructuredChunk {
591 text: chunk_text,
592 chunk_type,
593 index,
594 element_id: None,
595 part: Some((i + 1) as u32),
596 total_parts: Some(total_parts as u32),
597 context: language.map(|s| s.to_string()),
598 char_start,
599 char_end,
600 });
601 }
602 }
603}
604
605pub fn chunk_structured(doc: &StructuredDocument) -> ChunkingResult {
607 StructuralChunker::default().chunk(doc)
608}
609
610pub fn chunk_structured_with_max(doc: &StructuredDocument, max_chars: usize) -> ChunkingResult {
612 StructuralChunker::with_max_chars(max_chars).chunk(doc)
613}
614
615#[cfg(test)]
616mod tests {
617 use super::*;
618 use crate::structure::detect_structure;
619
/// Plain paragraphs produce at least one chunk and no table statistics.
#[test]
fn test_simple_text_chunking() {
    let doc = detect_structure("This is a simple paragraph.\n\nAnother paragraph here.");
    let outcome = chunk_structured(&doc);

    assert!(!outcome.chunks.is_empty());
    assert_eq!(outcome.tables_processed, 0);
}
629
/// A small table between two paragraphs stays in a single table chunk.
#[test]
fn test_table_preserved_when_small() {
    let input = r#"Introduction.

| Name | Age |
|------|-----|
| Alice | 30 |
| Bob | 25 |

Conclusion."#;

    let outcome = chunk_structured(&detect_structure(input));

    let table_chunk_count = outcome.chunks.iter().filter(|c| c.is_table()).count();

    assert_eq!(table_chunk_count, 1);
    assert_eq!(outcome.tables_processed, 1);
    assert_eq!(outcome.tables_split, 0);
}
655
/// A 50-row table chunked with a 500-character limit is split, every part
/// repeats the header, and all parts after the first are continuations.
#[test]
fn test_large_table_split_with_header() {
    let rows: String = (1..=50)
        .map(|i| format!("| Row {} with some data | More data here | Even more |\n", i))
        .collect();

    let text = format!(
        r#"Introduction.

| Column A | Column B | Column C |
|----------|----------|----------|
{}
Conclusion."#,
        rows
    );

    let doc = detect_structure(&text);
    let result = StructuralChunker::with_max_chars(500).chunk(&doc);

    let table_chunks: Vec<_> = result.chunks.iter().filter(|c| c.is_table()).collect();

    assert!(table_chunks.len() > 1, "Large table should be split");
    assert_eq!(result.tables_split, 1);

    for table_chunk in table_chunks.iter() {
        assert!(
            table_chunk.text.contains("| Column A |"),
            "Each table chunk should contain header"
        );
    }

    for continuation in &table_chunks[1..] {
        assert_eq!(continuation.chunk_type, ChunkType::TableContinuation);
        assert!(continuation.context.is_some());
    }
}
702
/// A fenced code block ends up in exactly one `CodeBlock` chunk that still
/// contains its source.
#[test]
fn test_code_block_preserved() {
    let input = r#"Here is code:

```rust
fn main() {
 println!("Hello!");
}
```

Done."#;

    let outcome = chunk_structured(&detect_structure(input));

    let code_only: Vec<_> = outcome
        .chunks
        .iter()
        .filter(|chunk| matches!(chunk.chunk_type, ChunkType::CodeBlock))
        .collect();

    assert_eq!(code_only.len(), 1);
    assert!(code_only[0].text.contains("fn main()"));
}
727
/// A document mixing headings, text, a table and a code block yields table
/// and code statistics and at least three chunks.
#[test]
fn test_mixed_content() {
    let input = r#"# Report

## Summary

This is the summary section.

| Item | Count |
|------|-------|
| A | 10 |
| B | 20 |

## Code

```python
def hello():
 print("Hello")
```

## Conclusion

All done."#;

    let outcome = chunk_structured(&detect_structure(input));

    assert!(outcome.tables_processed >= 1);
    assert!(outcome.code_blocks_processed >= 1);
    assert!(outcome.chunks.len() >= 3);
}
759
/// `format_header` renders the column names and a compact separator row.
#[test]
fn test_table_header_formatting() {
    let input = r#"| Col1 | Col2 | Col3 |
|------|------|------|
| A1 | A2 | A3 |
| B1 | B2 | B3 |"#;

    let doc = detect_structure(input);
    let formatted = doc.tables().next().unwrap().format_header();

    assert!(formatted.contains("| Col1 | Col2 | Col3 |"));
    assert!(formatted.contains("|---|---|---|"));
}
774
/// With `PreserveWhole`, a table is kept in one chunk even under a
/// 500-character limit.
#[test]
fn test_preserve_whole_strategy() {
    let rows: String = (1..=20)
        .map(|i| format!("| Data {} | Value |\n", i))
        .collect();

    let text = format!(
        r#"| Header1 | Header2 |
|---------|---------|
{}"#,
        rows
    );

    let options = ChunkingOptions {
        max_chars: 500,
        table_handling: TableChunkingStrategy::PreserveWhole,
        ..Default::default()
    };
    let doc = detect_structure(&text);
    let result = StructuralChunker::new(options).chunk(&doc);

    let table_chunk_count = result.chunks.iter().filter(|c| c.is_table()).count();

    assert_eq!(table_chunk_count, 1);
    assert_eq!(result.tables_split, 0);
}
807
/// Two tables around one code block are counted as such in the result.
#[test]
fn test_chunking_result_stats() {
    let input = r#"| A | B |
|---|---|
| 1 | 2 |

```python
x = 1
```

| C | D |
|---|---|
| 3 | 4 |"#;

    let outcome = chunk_structured(&detect_structure(input));

    assert_eq!(outcome.tables_processed, 2);
    assert_eq!(outcome.code_blocks_processed, 1);
}
828}