1use crate::types::structure::{
9 ChunkType, ChunkingOptions, ChunkingResult, CodeChunkingStrategy, ElementData, StructuredChunk,
10 StructuredDocument, StructuredTable, TableChunkingStrategy,
11};
12
13pub struct StructuralChunker {
32 options: ChunkingOptions,
33}
34
35impl Default for StructuralChunker {
36 fn default() -> Self {
37 Self::new(ChunkingOptions::default())
38 }
39}
40
41impl StructuralChunker {
42 #[must_use]
44 pub fn new(options: ChunkingOptions) -> Self {
45 Self { options }
46 }
47
48 #[must_use]
50 pub fn with_max_chars(max_chars: usize) -> Self {
51 Self {
52 options: ChunkingOptions {
53 max_chars,
54 ..Default::default()
55 },
56 }
57 }
58
59 #[must_use]
61 pub fn chunk(&self, doc: &StructuredDocument) -> ChunkingResult {
62 let mut result = ChunkingResult::empty();
63 let mut current_text = String::new();
64 let mut current_start = 0;
65 let mut pending_heading: Option<&str> = None;
66
67 for element in &doc.elements {
68 match &element.data {
69 ElementData::Table(table) => {
70 if !current_text.trim().is_empty() {
72 self.emit_text_chunk(
73 &mut result,
74 ¤t_text,
75 current_start,
76 element.char_start,
77 );
78 current_text.clear();
79 }
80
81 self.chunk_table(&mut result, table, element.char_start, element.char_end);
83 current_start = element.char_end;
84 }
85
86 ElementData::CodeBlock(block) => {
87 if !current_text.trim().is_empty() {
89 self.emit_text_chunk(
90 &mut result,
91 ¤t_text,
92 current_start,
93 element.char_start,
94 );
95 current_text.clear();
96 }
97
98 self.chunk_code_block(
100 &mut result,
101 &block.format(),
102 block.language.as_deref(),
103 element.char_start,
104 element.char_end,
105 );
106 current_start = element.char_end;
107 }
108
109 ElementData::Heading(heading) => {
110 if self.options.include_section_headers {
111 pending_heading = Some(heading.format().leak());
113 }
114
115 if !current_text.is_empty() {
117 current_text.push('\n');
118 }
119 current_text.push_str(&heading.format());
120 }
121
122 ElementData::List(list) => {
123 if self.options.preserve_lists {
124 let list_text = list.format();
125 let combined_len = current_text.chars().count() + list_text.chars().count();
126
127 if combined_len > self.options.max_chars && !current_text.trim().is_empty()
128 {
129 self.emit_text_chunk(
131 &mut result,
132 ¤t_text,
133 current_start,
134 element.char_start,
135 );
136 current_text.clear();
137 current_start = element.char_start;
138 }
139
140 if !current_text.is_empty() {
142 current_text.push_str("\n\n");
143 }
144 current_text.push_str(&list_text);
145 } else {
146 let text = element.text();
148 if !current_text.is_empty() {
149 current_text.push_str("\n\n");
150 }
151 current_text.push_str(&text);
152 }
153 }
154
155 ElementData::Paragraph { text } => {
156 let text_len = text.chars().count();
157 let current_len = current_text.chars().count();
158
159 if current_len + text_len > self.options.max_chars
160 && !current_text.trim().is_empty()
161 {
162 self.emit_text_chunk(
164 &mut result,
165 ¤t_text,
166 current_start,
167 element.char_start,
168 );
169 current_text.clear();
170 current_start = element.char_start;
171
172 if let Some(heading) = pending_heading.take() {
174 current_text.push_str(heading);
175 current_text.push_str("\n\n");
176 }
177 }
178
179 if !current_text.is_empty() && !current_text.ends_with('\n') {
180 current_text.push_str("\n\n");
181 }
182 current_text.push_str(text);
183 }
184
185 ElementData::BlockQuote { text } => {
186 if !current_text.is_empty() {
187 current_text.push_str("\n\n");
188 }
189 current_text.push_str("> ");
190 current_text.push_str(text);
191 }
192
193 ElementData::Separator => {
194 if !current_text.trim().is_empty() {
196 self.emit_text_chunk(
197 &mut result,
198 ¤t_text,
199 current_start,
200 element.char_start,
201 );
202 current_text.clear();
203 }
204 current_start = element.char_end;
205 pending_heading = None;
206 }
207
208 ElementData::Raw { text } => {
209 if !current_text.is_empty() {
210 current_text.push_str("\n\n");
211 }
212 current_text.push_str(text);
213 }
214 }
215 }
216
217 if !current_text.trim().is_empty() {
219 self.emit_text_chunk(&mut result, ¤t_text, current_start, doc.total_chars);
220 }
221
222 result
223 }
224
225 fn emit_text_chunk(
227 &self,
228 result: &mut ChunkingResult,
229 text: &str,
230 char_start: usize,
231 char_end: usize,
232 ) {
233 let index = result.chunks.len();
234 result.chunks.push(StructuredChunk::text(
235 text.trim(),
236 index,
237 char_start,
238 char_end,
239 ));
240 }
241
242 fn chunk_table(
244 &self,
245 result: &mut ChunkingResult,
246 table: &StructuredTable,
247 char_start: usize,
248 char_end: usize,
249 ) {
250 result.tables_processed += 1;
251
252 match self.options.table_handling {
253 TableChunkingStrategy::PreserveWhole => {
254 let index = result.chunks.len();
256 result.chunks.push(StructuredChunk::table(
257 &table.raw_text,
258 index,
259 &table.id,
260 char_start,
261 char_end,
262 ));
263 }
264
265 TableChunkingStrategy::SplitWithHeader => {
266 let header_text = table.format_header();
268 let header_chars = header_text.chars().count();
269
270 if table.char_count() <= self.options.max_chars {
272 let index = result.chunks.len();
273 result.chunks.push(StructuredChunk::table(
274 &table.raw_text,
275 index,
276 &table.id,
277 char_start,
278 char_end,
279 ));
280 return;
281 }
282
283 result.tables_split += 1;
285 let data_rows: Vec<_> = table.data_rows().collect();
286
287 if data_rows.is_empty() {
288 let index = result.chunks.len();
290 result.chunks.push(StructuredChunk::table(
291 &header_text,
292 index,
293 &table.id,
294 char_start,
295 char_end,
296 ));
297 return;
298 }
299
300 let max_rows_per_chunk = self.calculate_rows_per_chunk(table, header_chars);
301 let total_parts = data_rows.len().div_ceil(max_rows_per_chunk);
302
303 let mut part = 1;
304 let mut row_idx = 0;
305
306 while row_idx < data_rows.len() {
307 let end_idx = (row_idx + max_rows_per_chunk).min(data_rows.len());
308 let rows_in_chunk = &data_rows[row_idx..end_idx];
309
310 let mut chunk_text = header_text.clone();
312 for row in rows_in_chunk {
313 chunk_text.push('\n');
314 chunk_text.push_str(&table.format_row(row));
315 }
316
317 let index = result.chunks.len();
318 if part == 1 {
319 result.chunks.push(StructuredChunk::table(
321 &chunk_text,
322 index,
323 &table.id,
324 char_start,
325 char_end,
326 ));
327 } else {
328 result.chunks.push(StructuredChunk::table_continuation(
330 &chunk_text,
331 index,
332 &table.id,
333 part as u32,
334 u32::try_from(total_parts).unwrap_or(0),
335 &header_text,
336 char_start,
337 char_end,
338 ));
339 }
340
341 row_idx = end_idx;
342 part += 1;
343 }
344 }
345
346 TableChunkingStrategy::Naive => {
347 let index = result.chunks.len();
349 result.chunks.push(StructuredChunk::text(
350 &table.raw_text,
351 index,
352 char_start,
353 char_end,
354 ));
355 }
356 }
357 }
358
359 fn calculate_rows_per_chunk(&self, table: &StructuredTable, header_chars: usize) -> usize {
361 let available = self.options.max_chars.saturating_sub(header_chars + 10);
362 if available == 0 {
363 return 1;
364 }
365
366 let total_row_chars: usize = table
368 .data_rows()
369 .map(|row| {
370 row.cells
371 .iter()
372 .map(|c| c.text.chars().count())
373 .sum::<usize>()
374 + row.cells.len() * 3 })
376 .sum();
377
378 let row_count = table.data_row_count();
379 if row_count == 0 {
380 return 1;
381 }
382
383 let avg_row_chars = total_row_chars / row_count;
384 if avg_row_chars == 0 {
385 return row_count;
386 }
387
388 (available / avg_row_chars).max(1)
389 }
390
391 fn chunk_code_block(
393 &self,
394 result: &mut ChunkingResult,
395 formatted_text: &str,
396 language: Option<&str>,
397 char_start: usize,
398 char_end: usize,
399 ) {
400 result.code_blocks_processed += 1;
401
402 match self.options.code_handling {
403 CodeChunkingStrategy::PreserveWhole => {
404 let index = result.chunks.len();
406 result.chunks.push(StructuredChunk {
407 text: formatted_text.to_string(),
408 chunk_type: ChunkType::CodeBlock,
409 index,
410 element_id: None,
411 part: None,
412 total_parts: None,
413 context: language.map(std::string::ToString::to_string),
414 char_start,
415 char_end,
416 });
417 }
418
419 CodeChunkingStrategy::SplitAtBoundaries => {
420 let block_chars = formatted_text.chars().count();
422 if block_chars <= self.options.max_chars {
423 let index = result.chunks.len();
425 result.chunks.push(StructuredChunk {
426 text: formatted_text.to_string(),
427 chunk_type: ChunkType::CodeBlock,
428 index,
429 element_id: None,
430 part: None,
431 total_parts: None,
432 context: language.map(std::string::ToString::to_string),
433 char_start,
434 char_end,
435 });
436 } else {
437 self.split_code_at_boundaries(
439 result,
440 formatted_text,
441 language,
442 char_start,
443 char_end,
444 );
445 }
446 }
447
448 CodeChunkingStrategy::SplitWithOverlap => {
449 self.split_code_with_overlap(
451 result,
452 formatted_text,
453 language,
454 char_start,
455 char_end,
456 );
457 }
458 }
459 }
460
461 fn split_code_at_boundaries(
463 &self,
464 result: &mut ChunkingResult,
465 formatted_text: &str,
466 language: Option<&str>,
467 char_start: usize,
468 char_end: usize,
469 ) {
470 let lines: Vec<&str> = formatted_text.lines().collect();
472 let mut chunks = Vec::new();
473 let mut current_chunk = Vec::new();
474 let mut current_chars = 0;
475
476 let fence_start = lines.first().copied().unwrap_or("```");
478 let fence_end = lines.last().copied().unwrap_or("```");
479 let content_lines = &lines[1..lines.len().saturating_sub(1)];
480
481 for (i, line) in content_lines.iter().enumerate() {
482 let line_chars = line.chars().count() + 1;
483
484 let is_boundary = line.trim().is_empty()
486 || line.trim().starts_with("fn ")
487 || line.trim().starts_with("def ")
488 || line.trim().starts_with("function ")
489 || line.trim().starts_with("class ")
490 || line.trim().starts_with("impl ");
491
492 if is_boundary && current_chars > self.options.max_chars / 2 && i > 0 {
493 if !current_chunk.is_empty() {
495 chunks.push(current_chunk.join("\n"));
496 current_chunk.clear();
497 current_chars = 0;
498 }
499 }
500
501 current_chunk.push(*line);
502 current_chars += line_chars;
503 }
504
505 if !current_chunk.is_empty() {
507 chunks.push(current_chunk.join("\n"));
508 }
509
510 let total_parts = chunks.len();
512 for (i, chunk_content) in chunks.into_iter().enumerate() {
513 let index = result.chunks.len();
514 let chunk_text = format!(
515 "{}{}\n{}\n{}",
516 fence_start,
517 language.unwrap_or(""),
518 chunk_content,
519 fence_end
520 );
521
522 if i == 0 {
523 result.chunks.push(StructuredChunk {
524 text: chunk_text,
525 chunk_type: ChunkType::CodeBlock,
526 index,
527 element_id: None,
528 part: Some(1),
529 total_parts: Some(u32::try_from(total_parts).unwrap_or(0)),
530 context: language.map(std::string::ToString::to_string),
531 char_start,
532 char_end,
533 });
534 } else {
535 result.chunks.push(StructuredChunk {
536 text: chunk_text,
537 chunk_type: ChunkType::CodeBlockContinuation,
538 index,
539 element_id: None,
540 part: Some(u32::try_from(i + 1).unwrap_or(0)),
541 total_parts: Some(u32::try_from(total_parts).unwrap_or(0)),
542 context: language.map(std::string::ToString::to_string),
543 char_start,
544 char_end,
545 });
546 }
547 }
548 }
549
550 fn split_code_with_overlap(
552 &self,
553 result: &mut ChunkingResult,
554 formatted_text: &str,
555 language: Option<&str>,
556 char_start: usize,
557 char_end: usize,
558 ) {
559 let lines: Vec<&str> = formatted_text.lines().collect();
560 let overlap_lines = (self.options.overlap_chars / 40).max(2);
561
562 let fence_start = lines.first().copied().unwrap_or("```");
564 let fence_end = lines.last().copied().unwrap_or("```");
565 let content_lines = &lines[1..lines.len().saturating_sub(1)];
566
567 let mut chunks = Vec::new();
568 let mut start_line = 0;
569
570 while start_line < content_lines.len() {
571 let mut current_chars = 0;
572 let mut end_line = start_line;
573
574 while end_line < content_lines.len() {
575 current_chars += content_lines[end_line].chars().count() + 1;
576 if current_chars > self.options.max_chars {
577 break;
578 }
579 end_line += 1;
580 }
581
582 if end_line == start_line {
583 end_line = start_line + 1;
584 }
585
586 let chunk_lines: Vec<&str> = content_lines[start_line..end_line].to_vec();
587 chunks.push(chunk_lines.join("\n"));
588
589 start_line = if end_line >= content_lines.len() {
591 content_lines.len()
592 } else {
593 end_line.saturating_sub(overlap_lines)
594 };
595 }
596
597 let total_parts = chunks.len();
599 for (i, chunk_content) in chunks.into_iter().enumerate() {
600 let index = result.chunks.len();
601 let chunk_text = format!(
602 "{}{}\n{}\n{}",
603 fence_start,
604 language.unwrap_or(""),
605 chunk_content,
606 fence_end
607 );
608
609 let chunk_type = if i == 0 {
610 ChunkType::CodeBlock
611 } else {
612 ChunkType::CodeBlockContinuation
613 };
614
615 result.chunks.push(StructuredChunk {
616 text: chunk_text,
617 chunk_type,
618 index,
619 element_id: None,
620 part: Some(u32::try_from(i + 1).unwrap_or(0)),
621 total_parts: Some(u32::try_from(total_parts).unwrap_or(0)),
622 context: language.map(std::string::ToString::to_string),
623 char_start,
624 char_end,
625 });
626 }
627 }
628}
629
630#[must_use]
632pub fn chunk_structured(doc: &StructuredDocument) -> ChunkingResult {
633 StructuralChunker::default().chunk(doc)
634}
635
636#[must_use]
638pub fn chunk_structured_with_max(doc: &StructuredDocument, max_chars: usize) -> ChunkingResult {
639 StructuralChunker::with_max_chars(max_chars).chunk(doc)
640}
641
642#[cfg(test)]
643mod tests {
644 use super::*;
645 use crate::structure::detect_structure;
646
647 #[test]
648 fn test_simple_text_chunking() {
649 let text = "This is a simple paragraph.\n\nAnother paragraph here.";
650 let doc = detect_structure(text);
651 let result = chunk_structured(&doc);
652
653 assert!(!result.chunks.is_empty());
654 assert_eq!(result.tables_processed, 0);
655 }
656
657 #[test]
658 fn test_table_preserved_when_small() {
659 let text = r"Introduction.
660
661| Name | Age |
662|------|-----|
663| Alice | 30 |
664| Bob | 25 |
665
666Conclusion.";
667
668 let doc = detect_structure(text);
669 let result = chunk_structured(&doc);
670
671 let table_chunks: Vec<_> = result.chunks.iter().filter(|c| c.is_table()).collect();
673
674 assert_eq!(table_chunks.len(), 1);
675 assert_eq!(result.tables_processed, 1);
676 assert_eq!(result.tables_split, 0);
677 }
678
679 #[test]
680 fn test_large_table_split_with_header() {
681 let mut rows = String::new();
683 for i in 1..=50 {
684 rows.push_str(&format!(
685 "| Row {} with some data | More data here | Even more |\n",
686 i
687 ));
688 }
689
690 let text = format!(
691 r"Introduction.
692
693| Column A | Column B | Column C |
694|----------|----------|----------|
695{}
696Conclusion.",
697 rows
698 );
699
700 let doc = detect_structure(&text);
701 let chunker = StructuralChunker::with_max_chars(500);
702 let result = chunker.chunk(&doc);
703
704 let table_chunks: Vec<_> = result.chunks.iter().filter(|c| c.is_table()).collect();
706
707 assert!(table_chunks.len() > 1, "Large table should be split");
708 assert_eq!(result.tables_split, 1);
709
710 for chunk in &table_chunks {
712 assert!(
713 chunk.text.contains("| Column A |"),
714 "Each table chunk should contain header"
715 );
716 }
717
718 for chunk in table_chunks.iter().skip(1) {
720 assert_eq!(chunk.chunk_type, ChunkType::TableContinuation);
721 assert!(chunk.context.is_some());
722 }
723 }
724
725 #[test]
726 fn test_code_block_preserved() {
727 let text = r#"Here is code:
728
729```rust
730fn main() {
731 println!("Hello!");
732}
733```
734
735Done."#;
736
737 let doc = detect_structure(text);
738 let result = chunk_structured(&doc);
739
740 let code_chunks: Vec<_> = result
741 .chunks
742 .iter()
743 .filter(|c| matches!(c.chunk_type, ChunkType::CodeBlock))
744 .collect();
745
746 assert_eq!(code_chunks.len(), 1);
747 assert!(code_chunks[0].text.contains("fn main()"));
748 }
749
750 #[test]
751 fn test_mixed_content() {
752 let text = r#"# Report
753
754## Summary
755
756This is the summary section.
757
758| Item | Count |
759|------|-------|
760| A | 10 |
761| B | 20 |
762
763## Code
764
765```python
766def hello():
767 print("Hello")
768```
769
770## Conclusion
771
772All done."#;
773
774 let doc = detect_structure(text);
775 let result = chunk_structured(&doc);
776
777 assert!(result.tables_processed >= 1);
778 assert!(result.code_blocks_processed >= 1);
779 assert!(result.chunks.len() >= 3);
780 }
781
782 #[test]
783 fn test_table_header_formatting() {
784 let text = r"| Col1 | Col2 | Col3 |
785|------|------|------|
786| A1 | A2 | A3 |
787| B1 | B2 | B3 |";
788
789 let doc = detect_structure(text);
790 let table = doc.tables().next().unwrap();
791
792 let header = table.format_header();
793 assert!(header.contains("| Col1 | Col2 | Col3 |"));
794 assert!(header.contains("|---|---|---|"));
795 }
796
797 #[test]
798 fn test_preserve_whole_strategy() {
799 let mut rows = String::new();
800 for i in 1..=20 {
801 rows.push_str(&format!("| Data {} | Value |\n", i));
802 }
803
804 let text = format!(
805 r"| Header1 | Header2 |
806|---------|---------|
807{}",
808 rows
809 );
810
811 let doc = detect_structure(&text);
812 let chunker = StructuralChunker::new(ChunkingOptions {
813 max_chars: 500,
814 table_handling: TableChunkingStrategy::PreserveWhole,
815 ..Default::default()
816 });
817 let result = chunker.chunk(&doc);
818
819 let table_chunks: Vec<_> = result.chunks.iter().filter(|c| c.is_table()).collect();
821
822 assert_eq!(table_chunks.len(), 1);
823 assert_eq!(result.tables_split, 0);
824 }
825
826 #[test]
827 fn test_chunking_result_stats() {
828 let text = r"| A | B |
829|---|---|
830| 1 | 2 |
831
832```python
833x = 1
834```
835
836| C | D |
837|---|---|
838| 3 | 4 |";
839
840 let doc = detect_structure(text);
841 let result = chunk_structured(&doc);
842
843 assert_eq!(result.tables_processed, 2);
844 assert_eq!(result.code_blocks_processed, 1);
845 }
846}