Skip to main content

memvid_core/structure/
chunker.rs

1//! Structural chunker that respects document boundaries.
2//!
3//! The chunker takes a `StructuredDocument` and produces `StructuredChunk`s
4//! that preserve semantic units. Tables are split between rows with header
5//! propagation, code blocks are kept whole or split at boundaries, and
6//! sections include their heading context.
7
8use crate::types::structure::{
9    ChunkType, ChunkingOptions, ChunkingResult, CodeChunkingStrategy, ElementData, StructuredChunk,
10    StructuredDocument, StructuredTable, TableChunkingStrategy,
11};
12
13/// Structural chunker that respects document boundaries.
14///
15/// # Example
16///
17/// ```ignore
18/// use memvid_core::structure::{StructuralChunker, ChunkingOptions, detect_structure};
19///
20/// let text = "| A | B |\n|---|---|\n| 1 | 2 |\n| 3 | 4 |";
21/// let doc = detect_structure(text);
22///
23/// let chunker = StructuralChunker::new(ChunkingOptions::default());
24/// let result = chunker.chunk(&doc);
25///
26/// // Each chunk preserves table structure
27/// for chunk in result.chunks {
28///     println!("{}", chunk.text);
29/// }
30/// ```
31pub struct StructuralChunker {
32    options: ChunkingOptions,
33}
34
35impl Default for StructuralChunker {
36    fn default() -> Self {
37        Self::new(ChunkingOptions::default())
38    }
39}
40
41impl StructuralChunker {
42    /// Create a new chunker with the given options.
43    #[must_use]
44    pub fn new(options: ChunkingOptions) -> Self {
45        Self { options }
46    }
47
48    /// Create a chunker with default options and custom max chars.
49    #[must_use]
50    pub fn with_max_chars(max_chars: usize) -> Self {
51        Self {
52            options: ChunkingOptions {
53                max_chars,
54                ..Default::default()
55            },
56        }
57    }
58
59    /// Chunk a structured document.
60    #[must_use]
61    pub fn chunk(&self, doc: &StructuredDocument) -> ChunkingResult {
62        let mut result = ChunkingResult::empty();
63        let mut current_text = String::new();
64        let mut current_start = 0;
65        let mut pending_heading: Option<&str> = None;
66
67        for element in &doc.elements {
68            match &element.data {
69                ElementData::Table(table) => {
70                    // Flush any pending text before table
71                    if !current_text.trim().is_empty() {
72                        self.emit_text_chunk(
73                            &mut result,
74                            &current_text,
75                            current_start,
76                            element.char_start,
77                        );
78                        current_text.clear();
79                    }
80
81                    // Chunk the table
82                    self.chunk_table(&mut result, table, element.char_start, element.char_end);
83                    current_start = element.char_end;
84                }
85
86                ElementData::CodeBlock(block) => {
87                    // Flush pending text
88                    if !current_text.trim().is_empty() {
89                        self.emit_text_chunk(
90                            &mut result,
91                            &current_text,
92                            current_start,
93                            element.char_start,
94                        );
95                        current_text.clear();
96                    }
97
98                    // Chunk the code block
99                    self.chunk_code_block(
100                        &mut result,
101                        &block.format(),
102                        block.language.as_deref(),
103                        element.char_start,
104                        element.char_end,
105                    );
106                    current_start = element.char_end;
107                }
108
109                ElementData::Heading(heading) => {
110                    if self.options.include_section_headers {
111                        // Keep heading with following content
112                        pending_heading = Some(heading.format().leak());
113                    }
114
115                    // Add heading to current text
116                    if !current_text.is_empty() {
117                        current_text.push('\n');
118                    }
119                    current_text.push_str(&heading.format());
120                }
121
122                ElementData::List(list) => {
123                    if self.options.preserve_lists {
124                        let list_text = list.format();
125                        let combined_len = current_text.chars().count() + list_text.chars().count();
126
127                        if combined_len > self.options.max_chars && !current_text.trim().is_empty()
128                        {
129                            // Flush current text before list
130                            self.emit_text_chunk(
131                                &mut result,
132                                &current_text,
133                                current_start,
134                                element.char_start,
135                            );
136                            current_text.clear();
137                            current_start = element.char_start;
138                        }
139
140                        // Add list to current text
141                        if !current_text.is_empty() {
142                            current_text.push_str("\n\n");
143                        }
144                        current_text.push_str(&list_text);
145                    } else {
146                        // Treat list as regular text
147                        let text = element.text();
148                        if !current_text.is_empty() {
149                            current_text.push_str("\n\n");
150                        }
151                        current_text.push_str(&text);
152                    }
153                }
154
155                ElementData::Paragraph { text } => {
156                    let text_len = text.chars().count();
157                    let current_len = current_text.chars().count();
158
159                    if current_len + text_len > self.options.max_chars
160                        && !current_text.trim().is_empty()
161                    {
162                        // Flush current chunk
163                        self.emit_text_chunk(
164                            &mut result,
165                            &current_text,
166                            current_start,
167                            element.char_start,
168                        );
169                        current_text.clear();
170                        current_start = element.char_start;
171
172                        // Add pending heading context if any
173                        if let Some(heading) = pending_heading.take() {
174                            current_text.push_str(heading);
175                            current_text.push_str("\n\n");
176                        }
177                    }
178
179                    if !current_text.is_empty() && !current_text.ends_with('\n') {
180                        current_text.push_str("\n\n");
181                    }
182                    current_text.push_str(text);
183                }
184
185                ElementData::BlockQuote { text } => {
186                    if !current_text.is_empty() {
187                        current_text.push_str("\n\n");
188                    }
189                    current_text.push_str("> ");
190                    current_text.push_str(text);
191                }
192
193                ElementData::Separator => {
194                    // Treat separator as a natural chunk break
195                    if !current_text.trim().is_empty() {
196                        self.emit_text_chunk(
197                            &mut result,
198                            &current_text,
199                            current_start,
200                            element.char_start,
201                        );
202                        current_text.clear();
203                    }
204                    current_start = element.char_end;
205                    pending_heading = None;
206                }
207
208                ElementData::Raw { text } => {
209                    if !current_text.is_empty() {
210                        current_text.push_str("\n\n");
211                    }
212                    current_text.push_str(text);
213                }
214            }
215        }
216
217        // Flush remaining text
218        if !current_text.trim().is_empty() {
219            self.emit_text_chunk(&mut result, &current_text, current_start, doc.total_chars);
220        }
221
222        result
223    }
224
225    /// Emit a text chunk.
226    fn emit_text_chunk(
227        &self,
228        result: &mut ChunkingResult,
229        text: &str,
230        char_start: usize,
231        char_end: usize,
232    ) {
233        let index = result.chunks.len();
234        result.chunks.push(StructuredChunk::text(
235            text.trim(),
236            index,
237            char_start,
238            char_end,
239        ));
240    }
241
242    /// Chunk a table with header propagation.
243    fn chunk_table(
244        &self,
245        result: &mut ChunkingResult,
246        table: &StructuredTable,
247        char_start: usize,
248        char_end: usize,
249    ) {
250        result.tables_processed += 1;
251
252        match self.options.table_handling {
253            TableChunkingStrategy::PreserveWhole => {
254                // Keep entire table as one chunk (may exceed max_chars)
255                let index = result.chunks.len();
256                result.chunks.push(StructuredChunk::table(
257                    &table.raw_text,
258                    index,
259                    &table.id,
260                    char_start,
261                    char_end,
262                ));
263            }
264
265            TableChunkingStrategy::SplitWithHeader => {
266                // Split table between rows, prepend header to each chunk
267                let header_text = table.format_header();
268                let header_chars = header_text.chars().count();
269
270                // If entire table fits, emit as single chunk
271                if table.char_count() <= self.options.max_chars {
272                    let index = result.chunks.len();
273                    result.chunks.push(StructuredChunk::table(
274                        &table.raw_text,
275                        index,
276                        &table.id,
277                        char_start,
278                        char_end,
279                    ));
280                    return;
281                }
282
283                // Split by rows
284                result.tables_split += 1;
285                let data_rows: Vec<_> = table.data_rows().collect();
286
287                if data_rows.is_empty() {
288                    // Only header, emit as-is
289                    let index = result.chunks.len();
290                    result.chunks.push(StructuredChunk::table(
291                        &header_text,
292                        index,
293                        &table.id,
294                        char_start,
295                        char_end,
296                    ));
297                    return;
298                }
299
300                let max_rows_per_chunk = self.calculate_rows_per_chunk(table, header_chars);
301                let total_parts = data_rows.len().div_ceil(max_rows_per_chunk);
302
303                let mut part = 1;
304                let mut row_idx = 0;
305
306                while row_idx < data_rows.len() {
307                    let end_idx = (row_idx + max_rows_per_chunk).min(data_rows.len());
308                    let rows_in_chunk = &data_rows[row_idx..end_idx];
309
310                    // Build chunk text: header + rows
311                    let mut chunk_text = header_text.clone();
312                    for row in rows_in_chunk {
313                        chunk_text.push('\n');
314                        chunk_text.push_str(&table.format_row(row));
315                    }
316
317                    let index = result.chunks.len();
318                    if part == 1 {
319                        // First part is a Table chunk
320                        result.chunks.push(StructuredChunk::table(
321                            &chunk_text,
322                            index,
323                            &table.id,
324                            char_start,
325                            char_end,
326                        ));
327                    } else {
328                        // Subsequent parts are TableContinuation chunks
329                        result.chunks.push(StructuredChunk::table_continuation(
330                            &chunk_text,
331                            index,
332                            &table.id,
333                            part as u32,
334                            u32::try_from(total_parts).unwrap_or(0),
335                            &header_text,
336                            char_start,
337                            char_end,
338                        ));
339                    }
340
341                    row_idx = end_idx;
342                    part += 1;
343                }
344            }
345
346            TableChunkingStrategy::Naive => {
347                // Just treat table as text (not recommended)
348                let index = result.chunks.len();
349                result.chunks.push(StructuredChunk::text(
350                    &table.raw_text,
351                    index,
352                    char_start,
353                    char_end,
354                ));
355            }
356        }
357    }
358
359    /// Calculate how many rows fit per chunk given header overhead.
360    fn calculate_rows_per_chunk(&self, table: &StructuredTable, header_chars: usize) -> usize {
361        let available = self.options.max_chars.saturating_sub(header_chars + 10);
362        if available == 0 {
363            return 1;
364        }
365
366        // Estimate average row size
367        let total_row_chars: usize = table
368            .data_rows()
369            .map(|row| {
370                row.cells
371                    .iter()
372                    .map(|c| c.text.chars().count())
373                    .sum::<usize>()
374                    + row.cells.len() * 3 // | separators
375            })
376            .sum();
377
378        let row_count = table.data_row_count();
379        if row_count == 0 {
380            return 1;
381        }
382
383        let avg_row_chars = total_row_chars / row_count;
384        if avg_row_chars == 0 {
385            return row_count;
386        }
387
388        (available / avg_row_chars).max(1)
389    }
390
391    /// Chunk a code block.
392    fn chunk_code_block(
393        &self,
394        result: &mut ChunkingResult,
395        formatted_text: &str,
396        language: Option<&str>,
397        char_start: usize,
398        char_end: usize,
399    ) {
400        result.code_blocks_processed += 1;
401
402        match self.options.code_handling {
403            CodeChunkingStrategy::PreserveWhole => {
404                // Keep entire code block as one chunk
405                let index = result.chunks.len();
406                result.chunks.push(StructuredChunk {
407                    text: formatted_text.to_string(),
408                    chunk_type: ChunkType::CodeBlock,
409                    index,
410                    element_id: None,
411                    part: None,
412                    total_parts: None,
413                    context: language.map(std::string::ToString::to_string),
414                    char_start,
415                    char_end,
416                });
417            }
418
419            CodeChunkingStrategy::SplitAtBoundaries => {
420                // Try to split at function/block boundaries
421                let block_chars = formatted_text.chars().count();
422                if block_chars <= self.options.max_chars {
423                    // Fits in one chunk
424                    let index = result.chunks.len();
425                    result.chunks.push(StructuredChunk {
426                        text: formatted_text.to_string(),
427                        chunk_type: ChunkType::CodeBlock,
428                        index,
429                        element_id: None,
430                        part: None,
431                        total_parts: None,
432                        context: language.map(std::string::ToString::to_string),
433                        char_start,
434                        char_end,
435                    });
436                } else {
437                    // Split at function boundaries or fall back to line boundaries
438                    self.split_code_at_boundaries(
439                        result,
440                        formatted_text,
441                        language,
442                        char_start,
443                        char_end,
444                    );
445                }
446            }
447
448            CodeChunkingStrategy::SplitWithOverlap => {
449                // Split with overlap for context
450                self.split_code_with_overlap(
451                    result,
452                    formatted_text,
453                    language,
454                    char_start,
455                    char_end,
456                );
457            }
458        }
459    }
460
461    /// Split code at function/block boundaries.
462    fn split_code_at_boundaries(
463        &self,
464        result: &mut ChunkingResult,
465        formatted_text: &str,
466        language: Option<&str>,
467        char_start: usize,
468        char_end: usize,
469    ) {
470        // Simple heuristic: split at empty lines that likely indicate function boundaries
471        let lines: Vec<&str> = formatted_text.lines().collect();
472        let mut chunks = Vec::new();
473        let mut current_chunk = Vec::new();
474        let mut current_chars = 0;
475
476        // Find fence markers to preserve
477        let fence_start = lines.first().copied().unwrap_or("```");
478        let fence_end = lines.last().copied().unwrap_or("```");
479        let content_lines = &lines[1..lines.len().saturating_sub(1)];
480
481        for (i, line) in content_lines.iter().enumerate() {
482            let line_chars = line.chars().count() + 1;
483
484            // Check for good split point (empty line or function start)
485            let is_boundary = line.trim().is_empty()
486                || line.trim().starts_with("fn ")
487                || line.trim().starts_with("def ")
488                || line.trim().starts_with("function ")
489                || line.trim().starts_with("class ")
490                || line.trim().starts_with("impl ");
491
492            if is_boundary && current_chars > self.options.max_chars / 2 && i > 0 {
493                // Emit current chunk
494                if !current_chunk.is_empty() {
495                    chunks.push(current_chunk.join("\n"));
496                    current_chunk.clear();
497                    current_chars = 0;
498                }
499            }
500
501            current_chunk.push(*line);
502            current_chars += line_chars;
503        }
504
505        // Emit remaining
506        if !current_chunk.is_empty() {
507            chunks.push(current_chunk.join("\n"));
508        }
509
510        // Emit as continuation chunks
511        let total_parts = chunks.len();
512        for (i, chunk_content) in chunks.into_iter().enumerate() {
513            let index = result.chunks.len();
514            let chunk_text = format!(
515                "{}{}\n{}\n{}",
516                fence_start,
517                language.unwrap_or(""),
518                chunk_content,
519                fence_end
520            );
521
522            if i == 0 {
523                result.chunks.push(StructuredChunk {
524                    text: chunk_text,
525                    chunk_type: ChunkType::CodeBlock,
526                    index,
527                    element_id: None,
528                    part: Some(1),
529                    total_parts: Some(u32::try_from(total_parts).unwrap_or(0)),
530                    context: language.map(std::string::ToString::to_string),
531                    char_start,
532                    char_end,
533                });
534            } else {
535                result.chunks.push(StructuredChunk {
536                    text: chunk_text,
537                    chunk_type: ChunkType::CodeBlockContinuation,
538                    index,
539                    element_id: None,
540                    part: Some(u32::try_from(i + 1).unwrap_or(0)),
541                    total_parts: Some(u32::try_from(total_parts).unwrap_or(0)),
542                    context: language.map(std::string::ToString::to_string),
543                    char_start,
544                    char_end,
545                });
546            }
547        }
548    }
549
550    /// Split code with overlap for context.
551    fn split_code_with_overlap(
552        &self,
553        result: &mut ChunkingResult,
554        formatted_text: &str,
555        language: Option<&str>,
556        char_start: usize,
557        char_end: usize,
558    ) {
559        let lines: Vec<&str> = formatted_text.lines().collect();
560        let overlap_lines = (self.options.overlap_chars / 40).max(2);
561
562        // Find fence markers
563        let fence_start = lines.first().copied().unwrap_or("```");
564        let fence_end = lines.last().copied().unwrap_or("```");
565        let content_lines = &lines[1..lines.len().saturating_sub(1)];
566
567        let mut chunks = Vec::new();
568        let mut start_line = 0;
569
570        while start_line < content_lines.len() {
571            let mut current_chars = 0;
572            let mut end_line = start_line;
573
574            while end_line < content_lines.len() {
575                current_chars += content_lines[end_line].chars().count() + 1;
576                if current_chars > self.options.max_chars {
577                    break;
578                }
579                end_line += 1;
580            }
581
582            if end_line == start_line {
583                end_line = start_line + 1;
584            }
585
586            let chunk_lines: Vec<&str> = content_lines[start_line..end_line].to_vec();
587            chunks.push(chunk_lines.join("\n"));
588
589            // Move forward with overlap
590            start_line = if end_line >= content_lines.len() {
591                content_lines.len()
592            } else {
593                end_line.saturating_sub(overlap_lines)
594            };
595        }
596
597        // Emit chunks
598        let total_parts = chunks.len();
599        for (i, chunk_content) in chunks.into_iter().enumerate() {
600            let index = result.chunks.len();
601            let chunk_text = format!(
602                "{}{}\n{}\n{}",
603                fence_start,
604                language.unwrap_or(""),
605                chunk_content,
606                fence_end
607            );
608
609            let chunk_type = if i == 0 {
610                ChunkType::CodeBlock
611            } else {
612                ChunkType::CodeBlockContinuation
613            };
614
615            result.chunks.push(StructuredChunk {
616                text: chunk_text,
617                chunk_type,
618                index,
619                element_id: None,
620                part: Some(u32::try_from(i + 1).unwrap_or(0)),
621                total_parts: Some(u32::try_from(total_parts).unwrap_or(0)),
622                context: language.map(std::string::ToString::to_string),
623                char_start,
624                char_end,
625            });
626        }
627    }
628}
629
630/// Convenience function to chunk text with default options.
631#[must_use]
632pub fn chunk_structured(doc: &StructuredDocument) -> ChunkingResult {
633    StructuralChunker::default().chunk(doc)
634}
635
636/// Convenience function to chunk text with custom max chars.
637#[must_use]
638pub fn chunk_structured_with_max(doc: &StructuredDocument, max_chars: usize) -> ChunkingResult {
639    StructuralChunker::with_max_chars(max_chars).chunk(doc)
640}
641
642#[cfg(test)]
643mod tests {
644    use super::*;
645    use crate::structure::detect_structure;
646
647    #[test]
648    fn test_simple_text_chunking() {
649        let text = "This is a simple paragraph.\n\nAnother paragraph here.";
650        let doc = detect_structure(text);
651        let result = chunk_structured(&doc);
652
653        assert!(!result.chunks.is_empty());
654        assert_eq!(result.tables_processed, 0);
655    }
656
657    #[test]
658    fn test_table_preserved_when_small() {
659        let text = r"Introduction.
660
661| Name | Age |
662|------|-----|
663| Alice | 30 |
664| Bob | 25 |
665
666Conclusion.";
667
668        let doc = detect_structure(text);
669        let result = chunk_structured(&doc);
670
671        // Table should be in one chunk
672        let table_chunks: Vec<_> = result.chunks.iter().filter(|c| c.is_table()).collect();
673
674        assert_eq!(table_chunks.len(), 1);
675        assert_eq!(result.tables_processed, 1);
676        assert_eq!(result.tables_split, 0);
677    }
678
679    #[test]
680    fn test_large_table_split_with_header() {
681        // Create a table that exceeds max_chars
682        let mut rows = String::new();
683        for i in 1..=50 {
684            rows.push_str(&format!(
685                "| Row {} with some data | More data here | Even more |\n",
686                i
687            ));
688        }
689
690        let text = format!(
691            r"Introduction.
692
693| Column A | Column B | Column C |
694|----------|----------|----------|
695{}
696Conclusion.",
697            rows
698        );
699
700        let doc = detect_structure(&text);
701        let chunker = StructuralChunker::with_max_chars(500);
702        let result = chunker.chunk(&doc);
703
704        // Table should be split
705        let table_chunks: Vec<_> = result.chunks.iter().filter(|c| c.is_table()).collect();
706
707        assert!(table_chunks.len() > 1, "Large table should be split");
708        assert_eq!(result.tables_split, 1);
709
710        // Each chunk should contain header
711        for chunk in &table_chunks {
712            assert!(
713                chunk.text.contains("| Column A |"),
714                "Each table chunk should contain header"
715            );
716        }
717
718        // Continuation chunks should have context
719        for chunk in table_chunks.iter().skip(1) {
720            assert_eq!(chunk.chunk_type, ChunkType::TableContinuation);
721            assert!(chunk.context.is_some());
722        }
723    }
724
725    #[test]
726    fn test_code_block_preserved() {
727        let text = r#"Here is code:
728
729```rust
730fn main() {
731    println!("Hello!");
732}
733```
734
735Done."#;
736
737        let doc = detect_structure(text);
738        let result = chunk_structured(&doc);
739
740        let code_chunks: Vec<_> = result
741            .chunks
742            .iter()
743            .filter(|c| matches!(c.chunk_type, ChunkType::CodeBlock))
744            .collect();
745
746        assert_eq!(code_chunks.len(), 1);
747        assert!(code_chunks[0].text.contains("fn main()"));
748    }
749
750    #[test]
751    fn test_mixed_content() {
752        let text = r#"# Report
753
754## Summary
755
756This is the summary section.
757
758| Item | Count |
759|------|-------|
760| A    | 10    |
761| B    | 20    |
762
763## Code
764
765```python
766def hello():
767    print("Hello")
768```
769
770## Conclusion
771
772All done."#;
773
774        let doc = detect_structure(text);
775        let result = chunk_structured(&doc);
776
777        assert!(result.tables_processed >= 1);
778        assert!(result.code_blocks_processed >= 1);
779        assert!(result.chunks.len() >= 3);
780    }
781
782    #[test]
783    fn test_table_header_formatting() {
784        let text = r"| Col1 | Col2 | Col3 |
785|------|------|------|
786| A1   | A2   | A3   |
787| B1   | B2   | B3   |";
788
789        let doc = detect_structure(text);
790        let table = doc.tables().next().unwrap();
791
792        let header = table.format_header();
793        assert!(header.contains("| Col1 | Col2 | Col3 |"));
794        assert!(header.contains("|---|---|---|"));
795    }
796
797    #[test]
798    fn test_preserve_whole_strategy() {
799        let mut rows = String::new();
800        for i in 1..=20 {
801            rows.push_str(&format!("| Data {} | Value |\n", i));
802        }
803
804        let text = format!(
805            r"| Header1 | Header2 |
806|---------|---------|
807{}",
808            rows
809        );
810
811        let doc = detect_structure(&text);
812        let chunker = StructuralChunker::new(ChunkingOptions {
813            max_chars: 500,
814            table_handling: TableChunkingStrategy::PreserveWhole,
815            ..Default::default()
816        });
817        let result = chunker.chunk(&doc);
818
819        // Table should NOT be split with PreserveWhole
820        let table_chunks: Vec<_> = result.chunks.iter().filter(|c| c.is_table()).collect();
821
822        assert_eq!(table_chunks.len(), 1);
823        assert_eq!(result.tables_split, 0);
824    }
825
826    #[test]
827    fn test_chunking_result_stats() {
828        let text = r"| A | B |
829|---|---|
830| 1 | 2 |
831
832```python
833x = 1
834```
835
836| C | D |
837|---|---|
838| 3 | 4 |";
839
840        let doc = detect_structure(text);
841        let result = chunk_structured(&doc);
842
843        assert_eq!(result.tables_processed, 2);
844        assert_eq!(result.code_blocks_processed, 1);
845    }
846}