memvid_core/structure/
chunker.rs

1//! Structural chunker that respects document boundaries.
2//!
3//! The chunker takes a `StructuredDocument` and produces `StructuredChunk`s
4//! that preserve semantic units. Tables are split between rows with header
5//! propagation, code blocks are kept whole or split at boundaries, and
6//! sections include their heading context.
7
8use crate::types::structure::{
9    ChunkType, ChunkingOptions, ChunkingResult, CodeChunkingStrategy, ElementData, StructuredChunk,
10    StructuredDocument, StructuredTable, TableChunkingStrategy,
11};
12
13/// Structural chunker that respects document boundaries.
14///
15/// # Example
16///
17/// ```ignore
18/// use memvid_core::structure::{StructuralChunker, ChunkingOptions, detect_structure};
19///
20/// let text = "| A | B |\n|---|---|\n| 1 | 2 |\n| 3 | 4 |";
21/// let doc = detect_structure(text);
22///
23/// let chunker = StructuralChunker::new(ChunkingOptions::default());
24/// let result = chunker.chunk(&doc);
25///
26/// // Each chunk preserves table structure
27/// for chunk in result.chunks {
28///     println!("{}", chunk.text);
29/// }
30/// ```
31pub struct StructuralChunker {
32    options: ChunkingOptions,
33}
34
35impl Default for StructuralChunker {
36    fn default() -> Self {
37        Self::new(ChunkingOptions::default())
38    }
39}
40
41impl StructuralChunker {
42    /// Create a new chunker with the given options.
43    pub fn new(options: ChunkingOptions) -> Self {
44        Self { options }
45    }
46
47    /// Create a chunker with default options and custom max chars.
48    pub fn with_max_chars(max_chars: usize) -> Self {
49        Self {
50            options: ChunkingOptions {
51                max_chars,
52                ..Default::default()
53            },
54        }
55    }
56
57    /// Chunk a structured document.
58    pub fn chunk(&self, doc: &StructuredDocument) -> ChunkingResult {
59        let mut result = ChunkingResult::empty();
60        let mut current_text = String::new();
61        let mut current_start = 0;
62        let mut pending_heading: Option<&str> = None;
63
64        for element in &doc.elements {
65            match &element.data {
66                ElementData::Table(table) => {
67                    // Flush any pending text before table
68                    if !current_text.trim().is_empty() {
69                        self.emit_text_chunk(
70                            &mut result,
71                            &current_text,
72                            current_start,
73                            element.char_start,
74                        );
75                        current_text.clear();
76                    }
77
78                    // Chunk the table
79                    self.chunk_table(&mut result, table, element.char_start, element.char_end);
80                    current_start = element.char_end;
81                }
82
83                ElementData::CodeBlock(block) => {
84                    // Flush pending text
85                    if !current_text.trim().is_empty() {
86                        self.emit_text_chunk(
87                            &mut result,
88                            &current_text,
89                            current_start,
90                            element.char_start,
91                        );
92                        current_text.clear();
93                    }
94
95                    // Chunk the code block
96                    self.chunk_code_block(
97                        &mut result,
98                        &block.format(),
99                        block.language.as_deref(),
100                        element.char_start,
101                        element.char_end,
102                    );
103                    current_start = element.char_end;
104                }
105
106                ElementData::Heading(heading) => {
107                    if self.options.include_section_headers {
108                        // Keep heading with following content
109                        pending_heading = Some(heading.format().leak());
110                    }
111
112                    // Add heading to current text
113                    if !current_text.is_empty() {
114                        current_text.push('\n');
115                    }
116                    current_text.push_str(&heading.format());
117                }
118
119                ElementData::List(list) => {
120                    if self.options.preserve_lists {
121                        let list_text = list.format();
122                        let combined_len = current_text.chars().count() + list_text.chars().count();
123
124                        if combined_len > self.options.max_chars && !current_text.trim().is_empty()
125                        {
126                            // Flush current text before list
127                            self.emit_text_chunk(
128                                &mut result,
129                                &current_text,
130                                current_start,
131                                element.char_start,
132                            );
133                            current_text.clear();
134                            current_start = element.char_start;
135                        }
136
137                        // Add list to current text
138                        if !current_text.is_empty() {
139                            current_text.push_str("\n\n");
140                        }
141                        current_text.push_str(&list_text);
142                    } else {
143                        // Treat list as regular text
144                        let text = element.text();
145                        if !current_text.is_empty() {
146                            current_text.push_str("\n\n");
147                        }
148                        current_text.push_str(&text);
149                    }
150                }
151
152                ElementData::Paragraph { text } => {
153                    let text_len = text.chars().count();
154                    let current_len = current_text.chars().count();
155
156                    if current_len + text_len > self.options.max_chars
157                        && !current_text.trim().is_empty()
158                    {
159                        // Flush current chunk
160                        self.emit_text_chunk(
161                            &mut result,
162                            &current_text,
163                            current_start,
164                            element.char_start,
165                        );
166                        current_text.clear();
167                        current_start = element.char_start;
168
169                        // Add pending heading context if any
170                        if let Some(heading) = pending_heading.take() {
171                            current_text.push_str(heading);
172                            current_text.push_str("\n\n");
173                        }
174                    }
175
176                    if !current_text.is_empty() && !current_text.ends_with('\n') {
177                        current_text.push_str("\n\n");
178                    }
179                    current_text.push_str(text);
180                }
181
182                ElementData::BlockQuote { text } => {
183                    if !current_text.is_empty() {
184                        current_text.push_str("\n\n");
185                    }
186                    current_text.push_str("> ");
187                    current_text.push_str(text);
188                }
189
190                ElementData::Separator => {
191                    // Treat separator as a natural chunk break
192                    if !current_text.trim().is_empty() {
193                        self.emit_text_chunk(
194                            &mut result,
195                            &current_text,
196                            current_start,
197                            element.char_start,
198                        );
199                        current_text.clear();
200                    }
201                    current_start = element.char_end;
202                    pending_heading = None;
203                }
204
205                ElementData::Raw { text } => {
206                    if !current_text.is_empty() {
207                        current_text.push_str("\n\n");
208                    }
209                    current_text.push_str(text);
210                }
211            }
212        }
213
214        // Flush remaining text
215        if !current_text.trim().is_empty() {
216            self.emit_text_chunk(&mut result, &current_text, current_start, doc.total_chars);
217        }
218
219        result
220    }
221
222    /// Emit a text chunk.
223    fn emit_text_chunk(
224        &self,
225        result: &mut ChunkingResult,
226        text: &str,
227        char_start: usize,
228        char_end: usize,
229    ) {
230        let index = result.chunks.len();
231        result.chunks.push(StructuredChunk::text(
232            text.trim(),
233            index,
234            char_start,
235            char_end,
236        ));
237    }
238
239    /// Chunk a table with header propagation.
240    fn chunk_table(
241        &self,
242        result: &mut ChunkingResult,
243        table: &StructuredTable,
244        char_start: usize,
245        char_end: usize,
246    ) {
247        result.tables_processed += 1;
248
249        match self.options.table_handling {
250            TableChunkingStrategy::PreserveWhole => {
251                // Keep entire table as one chunk (may exceed max_chars)
252                let index = result.chunks.len();
253                result.chunks.push(StructuredChunk::table(
254                    &table.raw_text,
255                    index,
256                    &table.id,
257                    char_start,
258                    char_end,
259                ));
260            }
261
262            TableChunkingStrategy::SplitWithHeader => {
263                // Split table between rows, prepend header to each chunk
264                let header_text = table.format_header();
265                let header_chars = header_text.chars().count();
266
267                // If entire table fits, emit as single chunk
268                if table.char_count() <= self.options.max_chars {
269                    let index = result.chunks.len();
270                    result.chunks.push(StructuredChunk::table(
271                        &table.raw_text,
272                        index,
273                        &table.id,
274                        char_start,
275                        char_end,
276                    ));
277                    return;
278                }
279
280                // Split by rows
281                result.tables_split += 1;
282                let data_rows: Vec<_> = table.data_rows().collect();
283
284                if data_rows.is_empty() {
285                    // Only header, emit as-is
286                    let index = result.chunks.len();
287                    result.chunks.push(StructuredChunk::table(
288                        &header_text,
289                        index,
290                        &table.id,
291                        char_start,
292                        char_end,
293                    ));
294                    return;
295                }
296
297                let max_rows_per_chunk = self.calculate_rows_per_chunk(table, header_chars);
298                let total_parts = (data_rows.len() + max_rows_per_chunk - 1) / max_rows_per_chunk;
299
300                let mut part = 1;
301                let mut row_idx = 0;
302
303                while row_idx < data_rows.len() {
304                    let end_idx = (row_idx + max_rows_per_chunk).min(data_rows.len());
305                    let rows_in_chunk = &data_rows[row_idx..end_idx];
306
307                    // Build chunk text: header + rows
308                    let mut chunk_text = header_text.clone();
309                    for row in rows_in_chunk {
310                        chunk_text.push('\n');
311                        chunk_text.push_str(&table.format_row(row));
312                    }
313
314                    let index = result.chunks.len();
315                    if part == 1 {
316                        // First part is a Table chunk
317                        result.chunks.push(StructuredChunk::table(
318                            &chunk_text,
319                            index,
320                            &table.id,
321                            char_start,
322                            char_end,
323                        ));
324                    } else {
325                        // Subsequent parts are TableContinuation chunks
326                        result.chunks.push(StructuredChunk::table_continuation(
327                            &chunk_text,
328                            index,
329                            &table.id,
330                            part as u32,
331                            total_parts as u32,
332                            &header_text,
333                            char_start,
334                            char_end,
335                        ));
336                    }
337
338                    row_idx = end_idx;
339                    part += 1;
340                }
341            }
342
343            TableChunkingStrategy::Naive => {
344                // Just treat table as text (not recommended)
345                let index = result.chunks.len();
346                result.chunks.push(StructuredChunk::text(
347                    &table.raw_text,
348                    index,
349                    char_start,
350                    char_end,
351                ));
352            }
353        }
354    }
355
356    /// Calculate how many rows fit per chunk given header overhead.
357    fn calculate_rows_per_chunk(&self, table: &StructuredTable, header_chars: usize) -> usize {
358        let available = self.options.max_chars.saturating_sub(header_chars + 10);
359        if available == 0 {
360            return 1;
361        }
362
363        // Estimate average row size
364        let total_row_chars: usize = table
365            .data_rows()
366            .map(|row| {
367                row.cells
368                    .iter()
369                    .map(|c| c.text.chars().count())
370                    .sum::<usize>()
371                    + row.cells.len() * 3 // | separators
372            })
373            .sum();
374
375        let row_count = table.data_row_count();
376        if row_count == 0 {
377            return 1;
378        }
379
380        let avg_row_chars = total_row_chars / row_count;
381        if avg_row_chars == 0 {
382            return row_count;
383        }
384
385        (available / avg_row_chars).max(1)
386    }
387
388    /// Chunk a code block.
389    fn chunk_code_block(
390        &self,
391        result: &mut ChunkingResult,
392        formatted_text: &str,
393        language: Option<&str>,
394        char_start: usize,
395        char_end: usize,
396    ) {
397        result.code_blocks_processed += 1;
398
399        match self.options.code_handling {
400            CodeChunkingStrategy::PreserveWhole => {
401                // Keep entire code block as one chunk
402                let index = result.chunks.len();
403                result.chunks.push(StructuredChunk {
404                    text: formatted_text.to_string(),
405                    chunk_type: ChunkType::CodeBlock,
406                    index,
407                    element_id: None,
408                    part: None,
409                    total_parts: None,
410                    context: language.map(|s| s.to_string()),
411                    char_start,
412                    char_end,
413                });
414            }
415
416            CodeChunkingStrategy::SplitAtBoundaries => {
417                // Try to split at function/block boundaries
418                let block_chars = formatted_text.chars().count();
419                if block_chars <= self.options.max_chars {
420                    // Fits in one chunk
421                    let index = result.chunks.len();
422                    result.chunks.push(StructuredChunk {
423                        text: formatted_text.to_string(),
424                        chunk_type: ChunkType::CodeBlock,
425                        index,
426                        element_id: None,
427                        part: None,
428                        total_parts: None,
429                        context: language.map(|s| s.to_string()),
430                        char_start,
431                        char_end,
432                    });
433                } else {
434                    // Split at function boundaries or fall back to line boundaries
435                    self.split_code_at_boundaries(
436                        result,
437                        formatted_text,
438                        language,
439                        char_start,
440                        char_end,
441                    );
442                }
443            }
444
445            CodeChunkingStrategy::SplitWithOverlap => {
446                // Split with overlap for context
447                self.split_code_with_overlap(
448                    result,
449                    formatted_text,
450                    language,
451                    char_start,
452                    char_end,
453                );
454            }
455        }
456    }
457
458    /// Split code at function/block boundaries.
459    fn split_code_at_boundaries(
460        &self,
461        result: &mut ChunkingResult,
462        formatted_text: &str,
463        language: Option<&str>,
464        char_start: usize,
465        char_end: usize,
466    ) {
467        // Simple heuristic: split at empty lines that likely indicate function boundaries
468        let lines: Vec<&str> = formatted_text.lines().collect();
469        let mut chunks = Vec::new();
470        let mut current_chunk = Vec::new();
471        let mut current_chars = 0;
472
473        // Find fence markers to preserve
474        let fence_start = lines.first().copied().unwrap_or("```");
475        let fence_end = lines.last().copied().unwrap_or("```");
476        let content_lines = &lines[1..lines.len().saturating_sub(1)];
477
478        for (i, line) in content_lines.iter().enumerate() {
479            let line_chars = line.chars().count() + 1;
480
481            // Check for good split point (empty line or function start)
482            let is_boundary = line.trim().is_empty()
483                || line.trim().starts_with("fn ")
484                || line.trim().starts_with("def ")
485                || line.trim().starts_with("function ")
486                || line.trim().starts_with("class ")
487                || line.trim().starts_with("impl ");
488
489            if is_boundary && current_chars > self.options.max_chars / 2 && i > 0 {
490                // Emit current chunk
491                if !current_chunk.is_empty() {
492                    chunks.push(current_chunk.join("\n"));
493                    current_chunk.clear();
494                    current_chars = 0;
495                }
496            }
497
498            current_chunk.push(*line);
499            current_chars += line_chars;
500        }
501
502        // Emit remaining
503        if !current_chunk.is_empty() {
504            chunks.push(current_chunk.join("\n"));
505        }
506
507        // Emit as continuation chunks
508        let total_parts = chunks.len();
509        for (i, chunk_content) in chunks.into_iter().enumerate() {
510            let index = result.chunks.len();
511            let chunk_text = format!(
512                "{}{}\n{}\n{}",
513                fence_start,
514                language.unwrap_or(""),
515                chunk_content,
516                fence_end
517            );
518
519            if i == 0 {
520                result.chunks.push(StructuredChunk {
521                    text: chunk_text,
522                    chunk_type: ChunkType::CodeBlock,
523                    index,
524                    element_id: None,
525                    part: Some(1),
526                    total_parts: Some(total_parts as u32),
527                    context: language.map(|s| s.to_string()),
528                    char_start,
529                    char_end,
530                });
531            } else {
532                result.chunks.push(StructuredChunk {
533                    text: chunk_text,
534                    chunk_type: ChunkType::CodeBlockContinuation,
535                    index,
536                    element_id: None,
537                    part: Some((i + 1) as u32),
538                    total_parts: Some(total_parts as u32),
539                    context: language.map(|s| s.to_string()),
540                    char_start,
541                    char_end,
542                });
543            }
544        }
545    }
546
547    /// Split code with overlap for context.
548    fn split_code_with_overlap(
549        &self,
550        result: &mut ChunkingResult,
551        formatted_text: &str,
552        language: Option<&str>,
553        char_start: usize,
554        char_end: usize,
555    ) {
556        let lines: Vec<&str> = formatted_text.lines().collect();
557        let overlap_lines = (self.options.overlap_chars / 40).max(2);
558
559        // Find fence markers
560        let fence_start = lines.first().copied().unwrap_or("```");
561        let fence_end = lines.last().copied().unwrap_or("```");
562        let content_lines = &lines[1..lines.len().saturating_sub(1)];
563
564        let mut chunks = Vec::new();
565        let mut start_line = 0;
566
567        while start_line < content_lines.len() {
568            let mut current_chars = 0;
569            let mut end_line = start_line;
570
571            while end_line < content_lines.len() {
572                current_chars += content_lines[end_line].chars().count() + 1;
573                if current_chars > self.options.max_chars {
574                    break;
575                }
576                end_line += 1;
577            }
578
579            if end_line == start_line {
580                end_line = start_line + 1;
581            }
582
583            let chunk_lines: Vec<&str> = content_lines[start_line..end_line].to_vec();
584            chunks.push(chunk_lines.join("\n"));
585
586            // Move forward with overlap
587            start_line = if end_line >= content_lines.len() {
588                content_lines.len()
589            } else {
590                end_line.saturating_sub(overlap_lines)
591            };
592        }
593
594        // Emit chunks
595        let total_parts = chunks.len();
596        for (i, chunk_content) in chunks.into_iter().enumerate() {
597            let index = result.chunks.len();
598            let chunk_text = format!(
599                "{}{}\n{}\n{}",
600                fence_start,
601                language.unwrap_or(""),
602                chunk_content,
603                fence_end
604            );
605
606            let chunk_type = if i == 0 {
607                ChunkType::CodeBlock
608            } else {
609                ChunkType::CodeBlockContinuation
610            };
611
612            result.chunks.push(StructuredChunk {
613                text: chunk_text,
614                chunk_type,
615                index,
616                element_id: None,
617                part: Some((i + 1) as u32),
618                total_parts: Some(total_parts as u32),
619                context: language.map(|s| s.to_string()),
620                char_start,
621                char_end,
622            });
623        }
624    }
625}
626
627/// Convenience function to chunk text with default options.
628pub fn chunk_structured(doc: &StructuredDocument) -> ChunkingResult {
629    StructuralChunker::default().chunk(doc)
630}
631
632/// Convenience function to chunk text with custom max chars.
633pub fn chunk_structured_with_max(doc: &StructuredDocument, max_chars: usize) -> ChunkingResult {
634    StructuralChunker::with_max_chars(max_chars).chunk(doc)
635}
636
#[cfg(test)]
mod tests {
    use super::*;
    use crate::structure::detect_structure;

    /// Plain paragraphs yield at least one chunk and no table statistics.
    #[test]
    fn test_simple_text_chunking() {
        let doc = detect_structure("This is a simple paragraph.\n\nAnother paragraph here.");
        let outcome = chunk_structured(&doc);

        assert!(!outcome.chunks.is_empty());
        assert_eq!(outcome.tables_processed, 0);
    }

    /// A table that fits within the default limit stays in a single chunk.
    #[test]
    fn test_table_preserved_when_small() {
        let source = r#"Introduction.

| Name | Age |
|------|-----|
| Alice | 30 |
| Bob | 25 |

Conclusion."#;

        let doc = detect_structure(source);
        let outcome = chunk_structured(&doc);

        // Table should be in one chunk
        let table_chunk_count = outcome
            .chunks
            .iter()
            .filter(|chunk| chunk.is_table())
            .count();

        assert_eq!(table_chunk_count, 1);
        assert_eq!(outcome.tables_processed, 1);
        assert_eq!(outcome.tables_split, 0);
    }

    /// A table larger than `max_chars` is split and each part repeats the header.
    #[test]
    fn test_large_table_split_with_header() {
        // Create a table that exceeds max_chars
        let body: String = (1..=50)
            .map(|i| {
                format!(
                    "| Row {} with some data | More data here | Even more |\n",
                    i
                )
            })
            .collect();

        let source = format!(
            r#"Introduction.

| Column A | Column B | Column C |
|----------|----------|----------|
{}
Conclusion."#,
            body
        );

        let doc = detect_structure(&source);
        let outcome = StructuralChunker::with_max_chars(500).chunk(&doc);

        // Table should be split
        let table_parts: Vec<_> = outcome
            .chunks
            .iter()
            .filter(|chunk| chunk.is_table())
            .collect();

        assert!(table_parts.len() > 1, "Large table should be split");
        assert_eq!(outcome.tables_split, 1);

        // Each chunk should contain header
        for part in &table_parts {
            assert!(
                part.text.contains("| Column A |"),
                "Each table chunk should contain header"
            );
        }

        // Continuation chunks should have context
        for part in table_parts.iter().skip(1) {
            assert_eq!(part.chunk_type, ChunkType::TableContinuation);
            assert!(part.context.is_some());
        }
    }

    /// A small fenced code block comes out as exactly one code chunk.
    #[test]
    fn test_code_block_preserved() {
        let source = r#"Here is code:

```rust
fn main() {
    println!("Hello!");
}
```

Done."#;

        let doc = detect_structure(source);
        let outcome = chunk_structured(&doc);

        let code_parts: Vec<_> = outcome
            .chunks
            .iter()
            .filter(|chunk| matches!(chunk.chunk_type, ChunkType::CodeBlock))
            .collect();

        assert_eq!(code_parts.len(), 1);
        assert!(code_parts[0].text.contains("fn main()"));
    }

    /// Headings, a table and a code block in one document are all accounted for.
    #[test]
    fn test_mixed_content() {
        let source = r#"# Report

## Summary

This is the summary section.

| Item | Count |
|------|-------|
| A    | 10    |
| B    | 20    |

## Code

```python
def hello():
    print("Hello")
```

## Conclusion

All done."#;

        let doc = detect_structure(source);
        let outcome = chunk_structured(&doc);

        assert!(outcome.tables_processed >= 1);
        assert!(outcome.code_blocks_processed >= 1);
        assert!(outcome.chunks.len() >= 3);
    }

    /// `format_header` renders the header row plus a compact separator row.
    #[test]
    fn test_table_header_formatting() {
        let source = r#"| Col1 | Col2 | Col3 |
|------|------|------|
| A1   | A2   | A3   |
| B1   | B2   | B3   |"#;

        let doc = detect_structure(source);
        let first_table = doc.tables().next().unwrap();
        let rendered = first_table.format_header();

        assert!(rendered.contains("| Col1 | Col2 | Col3 |"));
        assert!(rendered.contains("|---|---|---|"));
    }

    /// `PreserveWhole` keeps the table in one chunk with a 500-character limit.
    #[test]
    fn test_preserve_whole_strategy() {
        let body: String = (1..=20)
            .map(|i| format!("| Data {} | Value |\n", i))
            .collect();

        let source = format!(
            r#"| Header1 | Header2 |
|---------|---------|
{}"#,
            body
        );

        let doc = detect_structure(&source);
        let options = ChunkingOptions {
            max_chars: 500,
            table_handling: TableChunkingStrategy::PreserveWhole,
            ..Default::default()
        };
        let outcome = StructuralChunker::new(options).chunk(&doc);

        // Table should NOT be split with PreserveWhole
        let table_chunk_count = outcome
            .chunks
            .iter()
            .filter(|chunk| chunk.is_table())
            .count();

        assert_eq!(table_chunk_count, 1);
        assert_eq!(outcome.tables_split, 0);
    }

    /// Counters reflect two tables and one code block.
    #[test]
    fn test_chunking_result_stats() {
        let source = r#"| A | B |
|---|---|
| 1 | 2 |

```python
x = 1
```

| C | D |
|---|---|
| 3 | 4 |"#;

        let doc = detect_structure(source);
        let outcome = chunk_structured(&doc);

        assert_eq!(outcome.tables_processed, 2);
        assert_eq!(outcome.code_blocks_processed, 1);
    }
}