memvid_core/structure/
chunker.rs

//! Structural chunker that respects document boundaries.
//!
//! The chunker takes a `StructuredDocument` and produces `StructuredChunk`s
//! that preserve semantic units. Tables are split between rows with the
//! header repeated in every part, code blocks are kept whole or split at
//! line boundaries, and sections carry their heading as context.
7
8use crate::types::structure::{
9    ChunkType, ChunkingOptions, ChunkingResult, CodeChunkingStrategy, ElementData,
10    StructuredChunk, StructuredDocument, StructuredTable, TableChunkingStrategy,
11};
12
13/// Structural chunker that respects document boundaries.
14///
15/// # Example
16///
17/// ```ignore
18/// use memvid_core::structure::{StructuralChunker, ChunkingOptions, detect_structure};
19///
20/// let text = "| A | B |\n|---|---|\n| 1 | 2 |\n| 3 | 4 |";
21/// let doc = detect_structure(text);
22///
23/// let chunker = StructuralChunker::new(ChunkingOptions::default());
24/// let result = chunker.chunk(&doc);
25///
26/// // Each chunk preserves table structure
27/// for chunk in result.chunks {
28///     println!("{}", chunk.text);
29/// }
30/// ```
31pub struct StructuralChunker {
32    options: ChunkingOptions,
33}
34
35impl Default for StructuralChunker {
36    fn default() -> Self {
37        Self::new(ChunkingOptions::default())
38    }
39}
40
41impl StructuralChunker {
42    /// Create a new chunker with the given options.
43    pub fn new(options: ChunkingOptions) -> Self {
44        Self { options }
45    }
46
47    /// Create a chunker with default options and custom max chars.
48    pub fn with_max_chars(max_chars: usize) -> Self {
49        Self {
50            options: ChunkingOptions {
51                max_chars,
52                ..Default::default()
53            },
54        }
55    }
56
57    /// Chunk a structured document.
58    pub fn chunk(&self, doc: &StructuredDocument) -> ChunkingResult {
59        let mut result = ChunkingResult::empty();
60        let mut current_text = String::new();
61        let mut current_start = 0;
62        let mut pending_heading: Option<&str> = None;
63
64        for element in &doc.elements {
65            match &element.data {
66                ElementData::Table(table) => {
67                    // Flush any pending text before table
68                    if !current_text.trim().is_empty() {
69                        self.emit_text_chunk(
70                            &mut result,
71                            &current_text,
72                            current_start,
73                            element.char_start,
74                        );
75                        current_text.clear();
76                    }
77
78                    // Chunk the table
79                    self.chunk_table(&mut result, table, element.char_start, element.char_end);
80                    current_start = element.char_end;
81                }
82
83                ElementData::CodeBlock(block) => {
84                    // Flush pending text
85                    if !current_text.trim().is_empty() {
86                        self.emit_text_chunk(
87                            &mut result,
88                            &current_text,
89                            current_start,
90                            element.char_start,
91                        );
92                        current_text.clear();
93                    }
94
95                    // Chunk the code block
96                    self.chunk_code_block(
97                        &mut result,
98                        &block.format(),
99                        block.language.as_deref(),
100                        element.char_start,
101                        element.char_end,
102                    );
103                    current_start = element.char_end;
104                }
105
106                ElementData::Heading(heading) => {
107                    if self.options.include_section_headers {
108                        // Keep heading with following content
109                        pending_heading = Some(heading.format().leak());
110                    }
111
112                    // Add heading to current text
113                    if !current_text.is_empty() {
114                        current_text.push('\n');
115                    }
116                    current_text.push_str(&heading.format());
117                }
118
119                ElementData::List(list) => {
120                    if self.options.preserve_lists {
121                        let list_text = list.format();
122                        let combined_len = current_text.chars().count() + list_text.chars().count();
123
124                        if combined_len > self.options.max_chars && !current_text.trim().is_empty() {
125                            // Flush current text before list
126                            self.emit_text_chunk(
127                                &mut result,
128                                &current_text,
129                                current_start,
130                                element.char_start,
131                            );
132                            current_text.clear();
133                            current_start = element.char_start;
134                        }
135
136                        // Add list to current text
137                        if !current_text.is_empty() {
138                            current_text.push_str("\n\n");
139                        }
140                        current_text.push_str(&list_text);
141                    } else {
142                        // Treat list as regular text
143                        let text = element.text();
144                        if !current_text.is_empty() {
145                            current_text.push_str("\n\n");
146                        }
147                        current_text.push_str(&text);
148                    }
149                }
150
151                ElementData::Paragraph { text } => {
152                    let text_len = text.chars().count();
153                    let current_len = current_text.chars().count();
154
155                    if current_len + text_len > self.options.max_chars && !current_text.trim().is_empty() {
156                        // Flush current chunk
157                        self.emit_text_chunk(
158                            &mut result,
159                            &current_text,
160                            current_start,
161                            element.char_start,
162                        );
163                        current_text.clear();
164                        current_start = element.char_start;
165
166                        // Add pending heading context if any
167                        if let Some(heading) = pending_heading.take() {
168                            current_text.push_str(heading);
169                            current_text.push_str("\n\n");
170                        }
171                    }
172
173                    if !current_text.is_empty() && !current_text.ends_with('\n') {
174                        current_text.push_str("\n\n");
175                    }
176                    current_text.push_str(text);
177                }
178
179                ElementData::BlockQuote { text } => {
180                    if !current_text.is_empty() {
181                        current_text.push_str("\n\n");
182                    }
183                    current_text.push_str("> ");
184                    current_text.push_str(text);
185                }
186
187                ElementData::Separator => {
188                    // Treat separator as a natural chunk break
189                    if !current_text.trim().is_empty() {
190                        self.emit_text_chunk(
191                            &mut result,
192                            &current_text,
193                            current_start,
194                            element.char_start,
195                        );
196                        current_text.clear();
197                    }
198                    current_start = element.char_end;
199                    pending_heading = None;
200                }
201
202                ElementData::Raw { text } => {
203                    if !current_text.is_empty() {
204                        current_text.push_str("\n\n");
205                    }
206                    current_text.push_str(text);
207                }
208            }
209        }
210
211        // Flush remaining text
212        if !current_text.trim().is_empty() {
213            self.emit_text_chunk(
214                &mut result,
215                &current_text,
216                current_start,
217                doc.total_chars,
218            );
219        }
220
221        result
222    }
223
224    /// Emit a text chunk.
225    fn emit_text_chunk(
226        &self,
227        result: &mut ChunkingResult,
228        text: &str,
229        char_start: usize,
230        char_end: usize,
231    ) {
232        let index = result.chunks.len();
233        result.chunks.push(StructuredChunk::text(
234            text.trim(),
235            index,
236            char_start,
237            char_end,
238        ));
239    }
240
241    /// Chunk a table with header propagation.
242    fn chunk_table(
243        &self,
244        result: &mut ChunkingResult,
245        table: &StructuredTable,
246        char_start: usize,
247        char_end: usize,
248    ) {
249        result.tables_processed += 1;
250
251        match self.options.table_handling {
252            TableChunkingStrategy::PreserveWhole => {
253                // Keep entire table as one chunk (may exceed max_chars)
254                let index = result.chunks.len();
255                result.chunks.push(StructuredChunk::table(
256                    &table.raw_text,
257                    index,
258                    &table.id,
259                    char_start,
260                    char_end,
261                ));
262            }
263
264            TableChunkingStrategy::SplitWithHeader => {
265                // Split table between rows, prepend header to each chunk
266                let header_text = table.format_header();
267                let header_chars = header_text.chars().count();
268
269                // If entire table fits, emit as single chunk
270                if table.char_count() <= self.options.max_chars {
271                    let index = result.chunks.len();
272                    result.chunks.push(StructuredChunk::table(
273                        &table.raw_text,
274                        index,
275                        &table.id,
276                        char_start,
277                        char_end,
278                    ));
279                    return;
280                }
281
282                // Split by rows
283                result.tables_split += 1;
284                let data_rows: Vec<_> = table.data_rows().collect();
285
286                if data_rows.is_empty() {
287                    // Only header, emit as-is
288                    let index = result.chunks.len();
289                    result.chunks.push(StructuredChunk::table(
290                        &header_text,
291                        index,
292                        &table.id,
293                        char_start,
294                        char_end,
295                    ));
296                    return;
297                }
298
299                let max_rows_per_chunk = self.calculate_rows_per_chunk(table, header_chars);
300                let total_parts = (data_rows.len() + max_rows_per_chunk - 1) / max_rows_per_chunk;
301
302                let mut part = 1;
303                let mut row_idx = 0;
304
305                while row_idx < data_rows.len() {
306                    let end_idx = (row_idx + max_rows_per_chunk).min(data_rows.len());
307                    let rows_in_chunk = &data_rows[row_idx..end_idx];
308
309                    // Build chunk text: header + rows
310                    let mut chunk_text = header_text.clone();
311                    for row in rows_in_chunk {
312                        chunk_text.push('\n');
313                        chunk_text.push_str(&table.format_row(row));
314                    }
315
316                    let index = result.chunks.len();
317                    if part == 1 {
318                        // First part is a Table chunk
319                        result.chunks.push(StructuredChunk::table(
320                            &chunk_text,
321                            index,
322                            &table.id,
323                            char_start,
324                            char_end,
325                        ));
326                    } else {
327                        // Subsequent parts are TableContinuation chunks
328                        result.chunks.push(StructuredChunk::table_continuation(
329                            &chunk_text,
330                            index,
331                            &table.id,
332                            part as u32,
333                            total_parts as u32,
334                            &header_text,
335                            char_start,
336                            char_end,
337                        ));
338                    }
339
340                    row_idx = end_idx;
341                    part += 1;
342                }
343            }
344
345            TableChunkingStrategy::Naive => {
346                // Just treat table as text (not recommended)
347                let index = result.chunks.len();
348                result.chunks.push(StructuredChunk::text(
349                    &table.raw_text,
350                    index,
351                    char_start,
352                    char_end,
353                ));
354            }
355        }
356    }
357
358    /// Calculate how many rows fit per chunk given header overhead.
359    fn calculate_rows_per_chunk(&self, table: &StructuredTable, header_chars: usize) -> usize {
360        let available = self.options.max_chars.saturating_sub(header_chars + 10);
361        if available == 0 {
362            return 1;
363        }
364
365        // Estimate average row size
366        let total_row_chars: usize = table
367            .data_rows()
368            .map(|row| {
369                row.cells.iter().map(|c| c.text.chars().count()).sum::<usize>()
370                    + row.cells.len() * 3 // | separators
371            })
372            .sum();
373
374        let row_count = table.data_row_count();
375        if row_count == 0 {
376            return 1;
377        }
378
379        let avg_row_chars = total_row_chars / row_count;
380        if avg_row_chars == 0 {
381            return row_count;
382        }
383
384        (available / avg_row_chars).max(1)
385    }
386
387    /// Chunk a code block.
388    fn chunk_code_block(
389        &self,
390        result: &mut ChunkingResult,
391        formatted_text: &str,
392        language: Option<&str>,
393        char_start: usize,
394        char_end: usize,
395    ) {
396        result.code_blocks_processed += 1;
397
398        match self.options.code_handling {
399            CodeChunkingStrategy::PreserveWhole => {
400                // Keep entire code block as one chunk
401                let index = result.chunks.len();
402                result.chunks.push(StructuredChunk {
403                    text: formatted_text.to_string(),
404                    chunk_type: ChunkType::CodeBlock,
405                    index,
406                    element_id: None,
407                    part: None,
408                    total_parts: None,
409                    context: language.map(|s| s.to_string()),
410                    char_start,
411                    char_end,
412                });
413            }
414
415            CodeChunkingStrategy::SplitAtBoundaries => {
416                // Try to split at function/block boundaries
417                let block_chars = formatted_text.chars().count();
418                if block_chars <= self.options.max_chars {
419                    // Fits in one chunk
420                    let index = result.chunks.len();
421                    result.chunks.push(StructuredChunk {
422                        text: formatted_text.to_string(),
423                        chunk_type: ChunkType::CodeBlock,
424                        index,
425                        element_id: None,
426                        part: None,
427                        total_parts: None,
428                        context: language.map(|s| s.to_string()),
429                        char_start,
430                        char_end,
431                    });
432                } else {
433                    // Split at function boundaries or fall back to line boundaries
434                    self.split_code_at_boundaries(result, formatted_text, language, char_start, char_end);
435                }
436            }
437
438            CodeChunkingStrategy::SplitWithOverlap => {
439                // Split with overlap for context
440                self.split_code_with_overlap(result, formatted_text, language, char_start, char_end);
441            }
442        }
443    }
444
445    /// Split code at function/block boundaries.
446    fn split_code_at_boundaries(
447        &self,
448        result: &mut ChunkingResult,
449        formatted_text: &str,
450        language: Option<&str>,
451        char_start: usize,
452        char_end: usize,
453    ) {
454        // Simple heuristic: split at empty lines that likely indicate function boundaries
455        let lines: Vec<&str> = formatted_text.lines().collect();
456        let mut chunks = Vec::new();
457        let mut current_chunk = Vec::new();
458        let mut current_chars = 0;
459
460        // Find fence markers to preserve
461        let fence_start = lines.first().copied().unwrap_or("```");
462        let fence_end = lines.last().copied().unwrap_or("```");
463        let content_lines = &lines[1..lines.len().saturating_sub(1)];
464
465        for (i, line) in content_lines.iter().enumerate() {
466            let line_chars = line.chars().count() + 1;
467
468            // Check for good split point (empty line or function start)
469            let is_boundary = line.trim().is_empty()
470                || line.trim().starts_with("fn ")
471                || line.trim().starts_with("def ")
472                || line.trim().starts_with("function ")
473                || line.trim().starts_with("class ")
474                || line.trim().starts_with("impl ");
475
476            if is_boundary
477                && current_chars > self.options.max_chars / 2
478                && i > 0
479            {
480                // Emit current chunk
481                if !current_chunk.is_empty() {
482                    chunks.push(current_chunk.join("\n"));
483                    current_chunk.clear();
484                    current_chars = 0;
485                }
486            }
487
488            current_chunk.push(*line);
489            current_chars += line_chars;
490        }
491
492        // Emit remaining
493        if !current_chunk.is_empty() {
494            chunks.push(current_chunk.join("\n"));
495        }
496
497        // Emit as continuation chunks
498        let total_parts = chunks.len();
499        for (i, chunk_content) in chunks.into_iter().enumerate() {
500            let index = result.chunks.len();
501            let chunk_text = format!("{}{}\n{}\n{}", fence_start, language.unwrap_or(""), chunk_content, fence_end);
502
503            if i == 0 {
504                result.chunks.push(StructuredChunk {
505                    text: chunk_text,
506                    chunk_type: ChunkType::CodeBlock,
507                    index,
508                    element_id: None,
509                    part: Some(1),
510                    total_parts: Some(total_parts as u32),
511                    context: language.map(|s| s.to_string()),
512                    char_start,
513                    char_end,
514                });
515            } else {
516                result.chunks.push(StructuredChunk {
517                    text: chunk_text,
518                    chunk_type: ChunkType::CodeBlockContinuation,
519                    index,
520                    element_id: None,
521                    part: Some((i + 1) as u32),
522                    total_parts: Some(total_parts as u32),
523                    context: language.map(|s| s.to_string()),
524                    char_start,
525                    char_end,
526                });
527            }
528        }
529    }
530
531    /// Split code with overlap for context.
532    fn split_code_with_overlap(
533        &self,
534        result: &mut ChunkingResult,
535        formatted_text: &str,
536        language: Option<&str>,
537        char_start: usize,
538        char_end: usize,
539    ) {
540        let lines: Vec<&str> = formatted_text.lines().collect();
541        let overlap_lines = (self.options.overlap_chars / 40).max(2);
542
543        // Find fence markers
544        let fence_start = lines.first().copied().unwrap_or("```");
545        let fence_end = lines.last().copied().unwrap_or("```");
546        let content_lines = &lines[1..lines.len().saturating_sub(1)];
547
548        let mut chunks = Vec::new();
549        let mut start_line = 0;
550
551        while start_line < content_lines.len() {
552            let mut current_chars = 0;
553            let mut end_line = start_line;
554
555            while end_line < content_lines.len() {
556                current_chars += content_lines[end_line].chars().count() + 1;
557                if current_chars > self.options.max_chars {
558                    break;
559                }
560                end_line += 1;
561            }
562
563            if end_line == start_line {
564                end_line = start_line + 1;
565            }
566
567            let chunk_lines: Vec<&str> = content_lines[start_line..end_line].to_vec();
568            chunks.push(chunk_lines.join("\n"));
569
570            // Move forward with overlap
571            start_line = if end_line >= content_lines.len() {
572                content_lines.len()
573            } else {
574                end_line.saturating_sub(overlap_lines)
575            };
576        }
577
578        // Emit chunks
579        let total_parts = chunks.len();
580        for (i, chunk_content) in chunks.into_iter().enumerate() {
581            let index = result.chunks.len();
582            let chunk_text = format!("{}{}\n{}\n{}", fence_start, language.unwrap_or(""), chunk_content, fence_end);
583
584            let chunk_type = if i == 0 {
585                ChunkType::CodeBlock
586            } else {
587                ChunkType::CodeBlockContinuation
588            };
589
590            result.chunks.push(StructuredChunk {
591                text: chunk_text,
592                chunk_type,
593                index,
594                element_id: None,
595                part: Some((i + 1) as u32),
596                total_parts: Some(total_parts as u32),
597                context: language.map(|s| s.to_string()),
598                char_start,
599                char_end,
600            });
601        }
602    }
603}
604
605/// Convenience function to chunk text with default options.
606pub fn chunk_structured(doc: &StructuredDocument) -> ChunkingResult {
607    StructuralChunker::default().chunk(doc)
608}
609
610/// Convenience function to chunk text with custom max chars.
611pub fn chunk_structured_with_max(doc: &StructuredDocument, max_chars: usize) -> ChunkingResult {
612    StructuralChunker::with_max_chars(max_chars).chunk(doc)
613}
614
#[cfg(test)]
mod tests {
    use super::*;
    use crate::structure::detect_structure;

    /// Collect every chunk of `result` that reports itself as a table chunk.
    fn table_chunks(result: &ChunkingResult) -> Vec<&StructuredChunk> {
        result
            .chunks
            .iter()
            .filter(|chunk| chunk.is_table())
            .collect()
    }

    /// Plain paragraphs produce at least one chunk and no table statistics.
    #[test]
    fn test_simple_text_chunking() {
        let doc = detect_structure("This is a simple paragraph.\n\nAnother paragraph here.");
        let outcome = chunk_structured(&doc);

        assert!(!outcome.chunks.is_empty());
        assert_eq!(outcome.tables_processed, 0);
    }

    /// A small table stays in a single table chunk and is not counted as split.
    #[test]
    fn test_table_preserved_when_small() {
        let source = r#"Introduction.

| Name | Age |
|------|-----|
| Alice | 30 |
| Bob | 25 |

Conclusion."#;

        let outcome = chunk_structured(&detect_structure(source));
        let tables = table_chunks(&outcome);

        assert_eq!(tables.len(), 1);
        assert_eq!(outcome.tables_processed, 1);
        assert_eq!(outcome.tables_split, 0);
    }

    /// A table larger than `max_chars` is split and every part repeats the header.
    #[test]
    fn test_large_table_split_with_header() {
        // Build a table body that exceeds max_chars.
        let body: String = (1..=50)
            .map(|i| format!("| Row {} with some data | More data here | Even more |\n", i))
            .collect();

        let source = format!(
            r#"Introduction.

| Column A | Column B | Column C |
|----------|----------|----------|
{}
Conclusion."#,
            body
        );

        let doc = detect_structure(&source);
        let outcome = StructuralChunker::with_max_chars(500).chunk(&doc);
        let tables = table_chunks(&outcome);

        assert!(tables.len() > 1, "Large table should be split");
        assert_eq!(outcome.tables_split, 1);

        // Every part carries the header row.
        for part in &tables {
            assert!(
                part.text.contains("| Column A |"),
                "Each table chunk should contain header"
            );
        }

        // Every part after the first is a continuation with context attached.
        for part in tables.iter().skip(1) {
            assert_eq!(part.chunk_type, ChunkType::TableContinuation);
            assert!(part.context.is_some());
        }
    }

    /// A short fenced code block ends up in exactly one code chunk.
    #[test]
    fn test_code_block_preserved() {
        let source = r#"Here is code:

```rust
fn main() {
    println!("Hello!");
}
```

Done."#;

        let outcome = chunk_structured(&detect_structure(source));

        let code: Vec<_> = outcome
            .chunks
            .iter()
            .filter(|chunk| matches!(chunk.chunk_type, ChunkType::CodeBlock))
            .collect();

        assert_eq!(code.len(), 1);
        assert!(code[0].text.contains("fn main()"));
    }

    /// Headings, prose, a table and a code block are all accounted for.
    #[test]
    fn test_mixed_content() {
        let source = r#"# Report

## Summary

This is the summary section.

| Item | Count |
|------|-------|
| A    | 10    |
| B    | 20    |

## Code

```python
def hello():
    print("Hello")
```

## Conclusion

All done."#;

        let outcome = chunk_structured(&detect_structure(source));

        assert!(outcome.tables_processed >= 1);
        assert!(outcome.code_blocks_processed >= 1);
        assert!(outcome.chunks.len() >= 3);
    }

    /// The formatted header holds the column names and a compact separator row.
    #[test]
    fn test_table_header_formatting() {
        let source = r#"| Col1 | Col2 | Col3 |
|------|------|------|
| A1   | A2   | A3   |
| B1   | B2   | B3   |"#;

        let doc = detect_structure(source);
        let first_table = doc.tables().next().unwrap();
        let formatted = first_table.format_header();

        assert!(formatted.contains("| Col1 | Col2 | Col3 |"));
        assert!(formatted.contains("|---|---|---|"));
    }

    /// `PreserveWhole` keeps an oversized table in one chunk.
    #[test]
    fn test_preserve_whole_strategy() {
        let body: String = (1..=20)
            .map(|i| format!("| Data {} | Value |\n", i))
            .collect();

        let source = format!(
            r#"| Header1 | Header2 |
|---------|---------|
{}"#,
            body
        );

        let options = ChunkingOptions {
            max_chars: 500,
            table_handling: TableChunkingStrategy::PreserveWhole,
            ..Default::default()
        };
        let outcome = StructuralChunker::new(options).chunk(&detect_structure(&source));
        let tables = table_chunks(&outcome);

        // The table must NOT be split with PreserveWhole.
        assert_eq!(tables.len(), 1);
        assert_eq!(outcome.tables_split, 0);
    }

    /// Counters reflect the number of tables and code blocks seen.
    #[test]
    fn test_chunking_result_stats() {
        let source = r#"| A | B |
|---|---|
| 1 | 2 |

```python
x = 1
```

| C | D |
|---|---|
| 3 | 4 |"#;

        let outcome = chunk_structured(&detect_structure(source));

        assert_eq!(outcome.tables_processed, 2);
        assert_eq!(outcome.code_blocks_processed, 1);
    }
}