1use crate::types::structure::{
9 ChunkType, ChunkingOptions, ChunkingResult, CodeChunkingStrategy, ElementData, StructuredChunk,
10 StructuredDocument, StructuredTable, TableChunkingStrategy,
11};
12
13pub struct StructuralChunker {
32 options: ChunkingOptions,
33}
34
35impl Default for StructuralChunker {
36 fn default() -> Self {
37 Self::new(ChunkingOptions::default())
38 }
39}
40
41impl StructuralChunker {
42 pub fn new(options: ChunkingOptions) -> Self {
44 Self { options }
45 }
46
47 pub fn with_max_chars(max_chars: usize) -> Self {
49 Self {
50 options: ChunkingOptions {
51 max_chars,
52 ..Default::default()
53 },
54 }
55 }
56
57 pub fn chunk(&self, doc: &StructuredDocument) -> ChunkingResult {
59 let mut result = ChunkingResult::empty();
60 let mut current_text = String::new();
61 let mut current_start = 0;
62 let mut pending_heading: Option<&str> = None;
63
64 for element in &doc.elements {
65 match &element.data {
66 ElementData::Table(table) => {
67 if !current_text.trim().is_empty() {
69 self.emit_text_chunk(
70 &mut result,
71 ¤t_text,
72 current_start,
73 element.char_start,
74 );
75 current_text.clear();
76 }
77
78 self.chunk_table(&mut result, table, element.char_start, element.char_end);
80 current_start = element.char_end;
81 }
82
83 ElementData::CodeBlock(block) => {
84 if !current_text.trim().is_empty() {
86 self.emit_text_chunk(
87 &mut result,
88 ¤t_text,
89 current_start,
90 element.char_start,
91 );
92 current_text.clear();
93 }
94
95 self.chunk_code_block(
97 &mut result,
98 &block.format(),
99 block.language.as_deref(),
100 element.char_start,
101 element.char_end,
102 );
103 current_start = element.char_end;
104 }
105
106 ElementData::Heading(heading) => {
107 if self.options.include_section_headers {
108 pending_heading = Some(heading.format().leak());
110 }
111
112 if !current_text.is_empty() {
114 current_text.push('\n');
115 }
116 current_text.push_str(&heading.format());
117 }
118
119 ElementData::List(list) => {
120 if self.options.preserve_lists {
121 let list_text = list.format();
122 let combined_len = current_text.chars().count() + list_text.chars().count();
123
124 if combined_len > self.options.max_chars && !current_text.trim().is_empty()
125 {
126 self.emit_text_chunk(
128 &mut result,
129 ¤t_text,
130 current_start,
131 element.char_start,
132 );
133 current_text.clear();
134 current_start = element.char_start;
135 }
136
137 if !current_text.is_empty() {
139 current_text.push_str("\n\n");
140 }
141 current_text.push_str(&list_text);
142 } else {
143 let text = element.text();
145 if !current_text.is_empty() {
146 current_text.push_str("\n\n");
147 }
148 current_text.push_str(&text);
149 }
150 }
151
152 ElementData::Paragraph { text } => {
153 let text_len = text.chars().count();
154 let current_len = current_text.chars().count();
155
156 if current_len + text_len > self.options.max_chars
157 && !current_text.trim().is_empty()
158 {
159 self.emit_text_chunk(
161 &mut result,
162 ¤t_text,
163 current_start,
164 element.char_start,
165 );
166 current_text.clear();
167 current_start = element.char_start;
168
169 if let Some(heading) = pending_heading.take() {
171 current_text.push_str(heading);
172 current_text.push_str("\n\n");
173 }
174 }
175
176 if !current_text.is_empty() && !current_text.ends_with('\n') {
177 current_text.push_str("\n\n");
178 }
179 current_text.push_str(text);
180 }
181
182 ElementData::BlockQuote { text } => {
183 if !current_text.is_empty() {
184 current_text.push_str("\n\n");
185 }
186 current_text.push_str("> ");
187 current_text.push_str(text);
188 }
189
190 ElementData::Separator => {
191 if !current_text.trim().is_empty() {
193 self.emit_text_chunk(
194 &mut result,
195 ¤t_text,
196 current_start,
197 element.char_start,
198 );
199 current_text.clear();
200 }
201 current_start = element.char_end;
202 pending_heading = None;
203 }
204
205 ElementData::Raw { text } => {
206 if !current_text.is_empty() {
207 current_text.push_str("\n\n");
208 }
209 current_text.push_str(text);
210 }
211 }
212 }
213
214 if !current_text.trim().is_empty() {
216 self.emit_text_chunk(&mut result, ¤t_text, current_start, doc.total_chars);
217 }
218
219 result
220 }
221
222 fn emit_text_chunk(
224 &self,
225 result: &mut ChunkingResult,
226 text: &str,
227 char_start: usize,
228 char_end: usize,
229 ) {
230 let index = result.chunks.len();
231 result.chunks.push(StructuredChunk::text(
232 text.trim(),
233 index,
234 char_start,
235 char_end,
236 ));
237 }
238
239 fn chunk_table(
241 &self,
242 result: &mut ChunkingResult,
243 table: &StructuredTable,
244 char_start: usize,
245 char_end: usize,
246 ) {
247 result.tables_processed += 1;
248
249 match self.options.table_handling {
250 TableChunkingStrategy::PreserveWhole => {
251 let index = result.chunks.len();
253 result.chunks.push(StructuredChunk::table(
254 &table.raw_text,
255 index,
256 &table.id,
257 char_start,
258 char_end,
259 ));
260 }
261
262 TableChunkingStrategy::SplitWithHeader => {
263 let header_text = table.format_header();
265 let header_chars = header_text.chars().count();
266
267 if table.char_count() <= self.options.max_chars {
269 let index = result.chunks.len();
270 result.chunks.push(StructuredChunk::table(
271 &table.raw_text,
272 index,
273 &table.id,
274 char_start,
275 char_end,
276 ));
277 return;
278 }
279
280 result.tables_split += 1;
282 let data_rows: Vec<_> = table.data_rows().collect();
283
284 if data_rows.is_empty() {
285 let index = result.chunks.len();
287 result.chunks.push(StructuredChunk::table(
288 &header_text,
289 index,
290 &table.id,
291 char_start,
292 char_end,
293 ));
294 return;
295 }
296
297 let max_rows_per_chunk = self.calculate_rows_per_chunk(table, header_chars);
298 let total_parts = (data_rows.len() + max_rows_per_chunk - 1) / max_rows_per_chunk;
299
300 let mut part = 1;
301 let mut row_idx = 0;
302
303 while row_idx < data_rows.len() {
304 let end_idx = (row_idx + max_rows_per_chunk).min(data_rows.len());
305 let rows_in_chunk = &data_rows[row_idx..end_idx];
306
307 let mut chunk_text = header_text.clone();
309 for row in rows_in_chunk {
310 chunk_text.push('\n');
311 chunk_text.push_str(&table.format_row(row));
312 }
313
314 let index = result.chunks.len();
315 if part == 1 {
316 result.chunks.push(StructuredChunk::table(
318 &chunk_text,
319 index,
320 &table.id,
321 char_start,
322 char_end,
323 ));
324 } else {
325 result.chunks.push(StructuredChunk::table_continuation(
327 &chunk_text,
328 index,
329 &table.id,
330 part as u32,
331 total_parts as u32,
332 &header_text,
333 char_start,
334 char_end,
335 ));
336 }
337
338 row_idx = end_idx;
339 part += 1;
340 }
341 }
342
343 TableChunkingStrategy::Naive => {
344 let index = result.chunks.len();
346 result.chunks.push(StructuredChunk::text(
347 &table.raw_text,
348 index,
349 char_start,
350 char_end,
351 ));
352 }
353 }
354 }
355
356 fn calculate_rows_per_chunk(&self, table: &StructuredTable, header_chars: usize) -> usize {
358 let available = self.options.max_chars.saturating_sub(header_chars + 10);
359 if available == 0 {
360 return 1;
361 }
362
363 let total_row_chars: usize = table
365 .data_rows()
366 .map(|row| {
367 row.cells
368 .iter()
369 .map(|c| c.text.chars().count())
370 .sum::<usize>()
371 + row.cells.len() * 3 })
373 .sum();
374
375 let row_count = table.data_row_count();
376 if row_count == 0 {
377 return 1;
378 }
379
380 let avg_row_chars = total_row_chars / row_count;
381 if avg_row_chars == 0 {
382 return row_count;
383 }
384
385 (available / avg_row_chars).max(1)
386 }
387
388 fn chunk_code_block(
390 &self,
391 result: &mut ChunkingResult,
392 formatted_text: &str,
393 language: Option<&str>,
394 char_start: usize,
395 char_end: usize,
396 ) {
397 result.code_blocks_processed += 1;
398
399 match self.options.code_handling {
400 CodeChunkingStrategy::PreserveWhole => {
401 let index = result.chunks.len();
403 result.chunks.push(StructuredChunk {
404 text: formatted_text.to_string(),
405 chunk_type: ChunkType::CodeBlock,
406 index,
407 element_id: None,
408 part: None,
409 total_parts: None,
410 context: language.map(|s| s.to_string()),
411 char_start,
412 char_end,
413 });
414 }
415
416 CodeChunkingStrategy::SplitAtBoundaries => {
417 let block_chars = formatted_text.chars().count();
419 if block_chars <= self.options.max_chars {
420 let index = result.chunks.len();
422 result.chunks.push(StructuredChunk {
423 text: formatted_text.to_string(),
424 chunk_type: ChunkType::CodeBlock,
425 index,
426 element_id: None,
427 part: None,
428 total_parts: None,
429 context: language.map(|s| s.to_string()),
430 char_start,
431 char_end,
432 });
433 } else {
434 self.split_code_at_boundaries(
436 result,
437 formatted_text,
438 language,
439 char_start,
440 char_end,
441 );
442 }
443 }
444
445 CodeChunkingStrategy::SplitWithOverlap => {
446 self.split_code_with_overlap(
448 result,
449 formatted_text,
450 language,
451 char_start,
452 char_end,
453 );
454 }
455 }
456 }
457
458 fn split_code_at_boundaries(
460 &self,
461 result: &mut ChunkingResult,
462 formatted_text: &str,
463 language: Option<&str>,
464 char_start: usize,
465 char_end: usize,
466 ) {
467 let lines: Vec<&str> = formatted_text.lines().collect();
469 let mut chunks = Vec::new();
470 let mut current_chunk = Vec::new();
471 let mut current_chars = 0;
472
473 let fence_start = lines.first().copied().unwrap_or("```");
475 let fence_end = lines.last().copied().unwrap_or("```");
476 let content_lines = &lines[1..lines.len().saturating_sub(1)];
477
478 for (i, line) in content_lines.iter().enumerate() {
479 let line_chars = line.chars().count() + 1;
480
481 let is_boundary = line.trim().is_empty()
483 || line.trim().starts_with("fn ")
484 || line.trim().starts_with("def ")
485 || line.trim().starts_with("function ")
486 || line.trim().starts_with("class ")
487 || line.trim().starts_with("impl ");
488
489 if is_boundary && current_chars > self.options.max_chars / 2 && i > 0 {
490 if !current_chunk.is_empty() {
492 chunks.push(current_chunk.join("\n"));
493 current_chunk.clear();
494 current_chars = 0;
495 }
496 }
497
498 current_chunk.push(*line);
499 current_chars += line_chars;
500 }
501
502 if !current_chunk.is_empty() {
504 chunks.push(current_chunk.join("\n"));
505 }
506
507 let total_parts = chunks.len();
509 for (i, chunk_content) in chunks.into_iter().enumerate() {
510 let index = result.chunks.len();
511 let chunk_text = format!(
512 "{}{}\n{}\n{}",
513 fence_start,
514 language.unwrap_or(""),
515 chunk_content,
516 fence_end
517 );
518
519 if i == 0 {
520 result.chunks.push(StructuredChunk {
521 text: chunk_text,
522 chunk_type: ChunkType::CodeBlock,
523 index,
524 element_id: None,
525 part: Some(1),
526 total_parts: Some(total_parts as u32),
527 context: language.map(|s| s.to_string()),
528 char_start,
529 char_end,
530 });
531 } else {
532 result.chunks.push(StructuredChunk {
533 text: chunk_text,
534 chunk_type: ChunkType::CodeBlockContinuation,
535 index,
536 element_id: None,
537 part: Some((i + 1) as u32),
538 total_parts: Some(total_parts as u32),
539 context: language.map(|s| s.to_string()),
540 char_start,
541 char_end,
542 });
543 }
544 }
545 }
546
547 fn split_code_with_overlap(
549 &self,
550 result: &mut ChunkingResult,
551 formatted_text: &str,
552 language: Option<&str>,
553 char_start: usize,
554 char_end: usize,
555 ) {
556 let lines: Vec<&str> = formatted_text.lines().collect();
557 let overlap_lines = (self.options.overlap_chars / 40).max(2);
558
559 let fence_start = lines.first().copied().unwrap_or("```");
561 let fence_end = lines.last().copied().unwrap_or("```");
562 let content_lines = &lines[1..lines.len().saturating_sub(1)];
563
564 let mut chunks = Vec::new();
565 let mut start_line = 0;
566
567 while start_line < content_lines.len() {
568 let mut current_chars = 0;
569 let mut end_line = start_line;
570
571 while end_line < content_lines.len() {
572 current_chars += content_lines[end_line].chars().count() + 1;
573 if current_chars > self.options.max_chars {
574 break;
575 }
576 end_line += 1;
577 }
578
579 if end_line == start_line {
580 end_line = start_line + 1;
581 }
582
583 let chunk_lines: Vec<&str> = content_lines[start_line..end_line].to_vec();
584 chunks.push(chunk_lines.join("\n"));
585
586 start_line = if end_line >= content_lines.len() {
588 content_lines.len()
589 } else {
590 end_line.saturating_sub(overlap_lines)
591 };
592 }
593
594 let total_parts = chunks.len();
596 for (i, chunk_content) in chunks.into_iter().enumerate() {
597 let index = result.chunks.len();
598 let chunk_text = format!(
599 "{}{}\n{}\n{}",
600 fence_start,
601 language.unwrap_or(""),
602 chunk_content,
603 fence_end
604 );
605
606 let chunk_type = if i == 0 {
607 ChunkType::CodeBlock
608 } else {
609 ChunkType::CodeBlockContinuation
610 };
611
612 result.chunks.push(StructuredChunk {
613 text: chunk_text,
614 chunk_type,
615 index,
616 element_id: None,
617 part: Some((i + 1) as u32),
618 total_parts: Some(total_parts as u32),
619 context: language.map(|s| s.to_string()),
620 char_start,
621 char_end,
622 });
623 }
624 }
625}
626
627pub fn chunk_structured(doc: &StructuredDocument) -> ChunkingResult {
629 StructuralChunker::default().chunk(doc)
630}
631
632pub fn chunk_structured_with_max(doc: &StructuredDocument, max_chars: usize) -> ChunkingResult {
634 StructuralChunker::with_max_chars(max_chars).chunk(doc)
635}
636
#[cfg(test)]
mod tests {
    use super::*;
    use crate::structure::detect_structure;

    /// Collects the chunks of `result` that report themselves as table chunks.
    fn table_chunks_of(result: &ChunkingResult) -> Vec<&StructuredChunk> {
        result.chunks.iter().filter(|c| c.is_table()).collect()
    }

    /// Plain paragraphs produce at least one chunk and no table statistics.
    #[test]
    fn test_simple_text_chunking() {
        let doc = detect_structure("This is a simple paragraph.\n\nAnother paragraph here.");
        let result = chunk_structured(&doc);

        assert!(!result.chunks.is_empty());
        assert_eq!(result.tables_processed, 0);
    }

    /// A small table stays in one chunk and is not counted as split.
    #[test]
    fn test_table_preserved_when_small() {
        let text = r#"Introduction.

| Name | Age |
|------|-----|
| Alice | 30 |
| Bob | 25 |

Conclusion."#;

        let result = chunk_structured(&detect_structure(text));
        let table_chunks = table_chunks_of(&result);

        assert_eq!(table_chunks.len(), 1);
        assert_eq!(result.tables_processed, 1);
        assert_eq!(result.tables_split, 0);
    }

    /// A table larger than the limit is split, and every part repeats the header.
    #[test]
    fn test_large_table_split_with_header() {
        let rows: String = (1..=50)
            .map(|i| {
                format!(
                    "| Row {} with some data | More data here | Even more |\n",
                    i
                )
            })
            .collect();

        let text = format!(
            r#"Introduction.

| Column A | Column B | Column C |
|----------|----------|----------|
{}
Conclusion."#,
            rows
        );

        let doc = detect_structure(&text);
        let result = StructuralChunker::with_max_chars(500).chunk(&doc);
        let table_chunks = table_chunks_of(&result);

        assert!(table_chunks.len() > 1, "Large table should be split");
        assert_eq!(result.tables_split, 1);

        for chunk in &table_chunks {
            assert!(
                chunk.text.contains("| Column A |"),
                "Each table chunk should contain header"
            );
        }

        for chunk in table_chunks.iter().skip(1) {
            assert_eq!(chunk.chunk_type, ChunkType::TableContinuation);
            assert!(chunk.context.is_some());
        }
    }

    /// A short fenced block ends up in exactly one code chunk.
    #[test]
    fn test_code_block_preserved() {
        let text = r#"Here is code:

```rust
fn main() {
    println!("Hello!");
}
```

Done."#;

        let result = chunk_structured(&detect_structure(text));

        let code_chunks: Vec<_> = result
            .chunks
            .iter()
            .filter(|c| matches!(c.chunk_type, ChunkType::CodeBlock))
            .collect();

        assert_eq!(code_chunks.len(), 1);
        assert!(code_chunks[0].text.contains("fn main()"));
    }

    /// Headings, a table and a code block in one document are all accounted for.
    #[test]
    fn test_mixed_content() {
        let text = r#"# Report

## Summary

This is the summary section.

| Item | Count |
|------|-------|
| A | 10 |
| B | 20 |

## Code

```python
def hello():
    print("Hello")
```

## Conclusion

All done."#;

        let result = chunk_structured(&detect_structure(text));

        assert!(result.tables_processed >= 1);
        assert!(result.code_blocks_processed >= 1);
        assert!(result.chunks.len() >= 3);
    }

    /// The formatted header contains the column names and a compact separator row.
    #[test]
    fn test_table_header_formatting() {
        let text = r#"| Col1 | Col2 | Col3 |
|------|------|------|
| A1 | A2 | A3 |
| B1 | B2 | B3 |"#;

        let doc = detect_structure(text);
        let table = doc.tables().next().unwrap();
        let header = table.format_header();

        assert!(header.contains("| Col1 | Col2 | Col3 |"));
        assert!(header.contains("|---|---|---|"));
    }

    /// `PreserveWhole` keeps even an oversized table in a single chunk.
    #[test]
    fn test_preserve_whole_strategy() {
        let rows: String = (1..=20)
            .map(|i| format!("| Data {} | Value |\n", i))
            .collect();

        let text = format!(
            r#"| Header1 | Header2 |
|---------|---------|
{}"#,
            rows
        );

        let doc = detect_structure(&text);
        let options = ChunkingOptions {
            max_chars: 500,
            table_handling: TableChunkingStrategy::PreserveWhole,
            ..Default::default()
        };
        let result = StructuralChunker::new(options).chunk(&doc);
        let table_chunks = table_chunks_of(&result);

        assert_eq!(table_chunks.len(), 1);
        assert_eq!(result.tables_split, 0);
    }

    /// The result counters reflect the number of tables and code blocks seen.
    #[test]
    fn test_chunking_result_stats() {
        let text = r#"| A | B |
|---|---|
| 1 | 2 |

```python
x = 1
```

| C | D |
|---|---|
| 3 | 4 |"#;

        let result = chunk_structured(&detect_structure(text));

        assert_eq!(result.tables_processed, 2);
        assert_eq!(result.code_blocks_processed, 1);
    }
}