// sediment/chunker.rs
1//! Chunking logic for splitting documents into searchable pieces
2//!
3//! Uses a two-pass approach for all content types:
4//! 1. First pass: Split by semantic boundaries (headers, functions, paragraphs)
5//! 2. Second pass: If any chunk exceeds max size, recursively split further
6//!
7//! Also applies minimum chunk size to avoid tiny fragments by merging small sections.
8
9use crate::document::ContentType;
10
/// Configuration for chunking
///
/// NOTE(review): the per-field "chars" wording reflects intent, but the
/// implementation compares `str::len()`, i.e. UTF-8 *bytes* — the two only
/// coincide for ASCII content. Confirm which unit is intended.
#[derive(Debug, Clone)]
pub struct ChunkingConfig {
    /// Minimum content length before chunking is applied (default: 1000 chars)
    pub min_chunk_threshold: usize,
    /// Maximum chunk size in characters (default: 800 chars)
    pub max_chunk_size: usize,
    /// Minimum chunk size - merge if below (default: 200 chars)
    pub min_chunk_size: usize,
    /// Overlap between chunks in characters (default: 100 chars)
    pub chunk_overlap: usize,
}
23
24impl Default for ChunkingConfig {
25    fn default() -> Self {
26        Self {
27            min_chunk_threshold: 1000,
28            max_chunk_size: 800,
29            min_chunk_size: 200,
30            chunk_overlap: 100,
31        }
32    }
33}
34
35impl ChunkingConfig {
36    /// Create config with legacy field name for backwards compatibility
37    pub fn with_chunk_size(mut self, size: usize) -> Self {
38        self.max_chunk_size = size;
39        self
40    }
41}
42
/// Result of chunking a piece of content
///
/// Offsets are positions into the original content as produced by byte-length
/// arithmetic in the splitters; chunks built by JSON chunking re-serialize
/// the value and always carry 0..0 offsets.
#[derive(Debug, Clone)]
pub struct ChunkResult {
    /// The chunk content
    pub content: String,
    /// Start offset in original content (character position)
    pub start_offset: usize,
    /// End offset in original content (character position)
    pub end_offset: usize,
    /// Optional context (e.g., parent header for markdown)
    pub context: Option<String>,
    /// Whether this chunk represents a major boundary (header, function) - don't merge across
    pub is_boundary: bool,
}
57
58impl ChunkResult {
59    fn new(content: String, start_offset: usize, end_offset: usize) -> Self {
60        Self {
61            content,
62            start_offset,
63            end_offset,
64            context: None,
65            is_boundary: false,
66        }
67    }
68
69    fn with_context(mut self, context: Option<String>) -> Self {
70        self.context = context;
71        self
72    }
73
74    fn with_boundary(mut self, is_boundary: bool) -> Self {
75        self.is_boundary = is_boundary;
76        self
77    }
78}
79
/// Chunk content based on content type
///
/// Three-stage pipeline:
/// 1. Type-specific semantic split (headers / keys / functions / paragraphs).
/// 2. `enforce_max_size` recursively re-splits anything over `max_chunk_size`.
/// 3. `merge_small_chunks` folds fragments under `min_chunk_size` into their
///    predecessor without crossing boundary chunks.
///
/// NOTE(review): all size comparisons use `str::len()` (bytes); the "chars"
/// wording in the config docs only holds for ASCII content.
pub fn chunk_content(
    content: &str,
    content_type: ContentType,
    config: &ChunkingConfig,
) -> Vec<ChunkResult> {
    // Don't chunk if content is below threshold
    if content.len() < config.min_chunk_threshold {
        return vec![ChunkResult::new(content.to_string(), 0, content.len())];
    }

    // First pass: semantic splitting
    let chunks = match content_type {
        ContentType::Markdown => chunk_markdown(content, config),
        ContentType::Json => chunk_json(content, config),
        ContentType::Yaml => chunk_yaml(content, config),
        ContentType::Code => chunk_code(content, config),
        ContentType::Text => chunk_text(content, config),
    };

    // Second pass: enforce max size by recursive splitting
    let chunks = enforce_max_size(chunks, config);

    // Third pass: merge small chunks (respecting boundaries)
    merge_small_chunks(chunks, config.min_chunk_size)
}
106
107// ============================================================================
108// Helper Functions
109// ============================================================================
110
/// Split text at sentence boundaries
///
/// A sentence ends at '.', '?' or '!' that is followed by a space, tab,
/// newline, or end of input. The terminator is kept; the whitespace run
/// after it is dropped. Byte scanning is safe for UTF-8 because the
/// terminators and whitespace are ASCII and never match continuation bytes.
fn split_at_sentences(text: &str) -> Vec<&str> {
    let bytes = text.as_bytes();
    let mut pieces = Vec::new();
    let mut seg_start = 0;
    let mut pos = 0;

    while pos < bytes.len() {
        let is_terminator = matches!(bytes[pos], b'.' | b'?' | b'!');
        let followed_by_break =
            pos + 1 >= bytes.len() || matches!(bytes[pos + 1], b' ' | b'\n' | b'\t');

        if is_terminator && followed_by_break {
            // Emit the sentence including its terminator.
            if seg_start <= pos {
                pieces.push(&text[seg_start..=pos]);
            }
            // Swallow the whitespace run following the terminator.
            pos += 1;
            while pos < bytes.len() && matches!(bytes[pos], b' ' | b'\n' | b'\t') {
                pos += 1;
            }
            seg_start = pos;
        } else {
            pos += 1;
        }
    }

    // Trailing text without a terminator is its own sentence.
    if seg_start < bytes.len() {
        pieces.push(&text[seg_start..]);
    }

    if pieces.is_empty() && !text.is_empty() {
        pieces.push(text);
    }

    pieces
}
157
158/// Recursively split content: paragraphs -> sentences -> chars
159fn recursive_split(
160    text: &str,
161    max_size: usize,
162    offset: usize,
163    context: Option<String>,
164    overlap: usize,
165) -> Vec<ChunkResult> {
166    if text.len() <= max_size {
167        return vec![
168            ChunkResult::new(text.to_string(), offset, offset + text.len()).with_context(context),
169        ];
170    }
171
172    let mut chunks = Vec::new();
173
174    // Try splitting by paragraphs first
175    let paragraphs: Vec<&str> = text.split("\n\n").collect();
176    if paragraphs.len() > 1 {
177        let mut current_chunk = String::new();
178        let mut chunk_start = offset;
179        let mut current_pos = offset;
180
181        for (i, para) in paragraphs.iter().enumerate() {
182            let sep = if i > 0 { "\n\n" } else { "" };
183            let para_with_sep = format!("{}{}", sep, para);
184
185            if !current_chunk.is_empty() && current_chunk.len() + para_with_sep.len() > max_size {
186                // Save current chunk
187                chunks.push(
188                    ChunkResult::new(current_chunk.clone(), chunk_start, current_pos)
189                        .with_context(context.clone()),
190                );
191
192                // Start new chunk with overlap
193                let overlap_text = get_overlap_text(&current_chunk, overlap);
194                chunk_start = current_pos - overlap_text.len();
195                current_chunk = overlap_text;
196            }
197
198            current_chunk.push_str(&para_with_sep);
199            current_pos += para_with_sep.len();
200        }
201
202        if !current_chunk.is_empty() {
203            // If the final chunk is still too large, recursively split by sentences
204            if current_chunk.len() > max_size {
205                chunks.extend(split_by_sentences(
206                    &current_chunk,
207                    max_size,
208                    chunk_start,
209                    context.clone(),
210                    overlap,
211                ));
212            } else {
213                chunks.push(
214                    ChunkResult::new(current_chunk, chunk_start, current_pos)
215                        .with_context(context.clone()),
216                );
217            }
218        }
219
220        return chunks;
221    }
222
223    // No paragraph breaks - split by sentences
224    split_by_sentences(text, max_size, offset, context, overlap)
225}
226
227/// Split text by sentences, falling back to character split
228fn split_by_sentences(
229    text: &str,
230    max_size: usize,
231    offset: usize,
232    context: Option<String>,
233    overlap: usize,
234) -> Vec<ChunkResult> {
235    let sentences = split_at_sentences(text);
236
237    if sentences.len() <= 1 {
238        // No sentence boundaries - split by characters
239        return split_by_chars(text, max_size, offset, context, overlap);
240    }
241
242    let mut chunks = Vec::new();
243    let mut current_chunk = String::new();
244    let mut chunk_start = offset;
245    let mut current_pos = offset;
246
247    for sentence in sentences {
248        let sep = if !current_chunk.is_empty() { " " } else { "" };
249        let sentence_with_sep = format!("{}{}", sep, sentence);
250
251        if !current_chunk.is_empty() && current_chunk.len() + sentence_with_sep.len() > max_size {
252            chunks.push(
253                ChunkResult::new(current_chunk.clone(), chunk_start, current_pos)
254                    .with_context(context.clone()),
255            );
256
257            let overlap_text = get_overlap_text(&current_chunk, overlap);
258            chunk_start = current_pos - overlap_text.len();
259            current_chunk = overlap_text;
260        }
261
262        current_chunk.push_str(&sentence_with_sep);
263        current_pos += sentence_with_sep.len();
264    }
265
266    if !current_chunk.is_empty() {
267        if current_chunk.len() > max_size {
268            chunks.extend(split_by_chars(
269                &current_chunk,
270                max_size,
271                chunk_start,
272                context.clone(),
273                overlap,
274            ));
275        } else {
276            chunks.push(
277                ChunkResult::new(current_chunk, chunk_start, current_pos)
278                    .with_context(context.clone()),
279            );
280        }
281    }
282
283    chunks
284}
285
286/// Split text by characters (last resort)
287fn split_by_chars(
288    text: &str,
289    max_size: usize,
290    offset: usize,
291    context: Option<String>,
292    overlap: usize,
293) -> Vec<ChunkResult> {
294    let mut chunks = Vec::new();
295    let bytes = text.as_bytes();
296    let mut start = 0;
297
298    // Ensure we make progress - overlap must be less than chunk size
299    let effective_overlap = overlap.min(max_size / 2);
300
301    while start < text.len() {
302        let end = (start + max_size).min(text.len());
303
304        // Try to break at a word boundary
305        let actual_end = if end < text.len() {
306            find_word_boundary_bytes(bytes, start, end)
307        } else {
308            end
309        };
310
311        // Ensure we make at least some progress
312        let actual_end = if actual_end <= start {
313            (start + max_size).min(text.len())
314        } else {
315            actual_end
316        };
317
318        chunks.push(
319            ChunkResult::new(
320                text[start..actual_end].to_string(),
321                offset + start,
322                offset + actual_end,
323            )
324            .with_context(context.clone()),
325        );
326
327        // Next chunk starts after this one, minus overlap
328        // But ensure we always make progress
329        let next_start = actual_end.saturating_sub(effective_overlap);
330        start = if next_start <= start {
331            actual_end // No overlap if it would cause no progress
332        } else {
333            next_start
334        };
335    }
336
337    if chunks.is_empty() && !text.is_empty() {
338        chunks.push(
339            ChunkResult::new(text.to_string(), offset, offset + text.len()).with_context(context),
340        );
341    }
342
343    chunks
344}
345
/// Find a word boundary near the target position (byte-based for efficiency)
///
/// Scans back at most 50 bytes (never before `start`) for an ASCII space or
/// newline and returns the position just after it; otherwise returns
/// `target` unchanged.
fn find_word_boundary_bytes(bytes: &[u8], start: usize, target: usize) -> usize {
    let floor = target.saturating_sub(50).max(start);
    match bytes[floor..target]
        .iter()
        .rposition(|&b| b == b' ' || b == b'\n')
    {
        Some(rel) => floor + rel + 1,
        None => target,
    }
}
357
358/// Get overlap text from the end of a chunk
359fn get_overlap_text(text: &str, overlap: usize) -> String {
360    if text.len() <= overlap {
361        return text.to_string();
362    }
363
364    let actual_start = find_overlap_start_bytes(text.as_bytes(), overlap);
365    text[actual_start..].to_string()
366}
367
/// Find a good overlap start position (byte-based for efficiency)
///
/// Searches a window around `len - target_overlap` for a break character
/// (newline, period, space) and returns the position just after it; falls
/// back to `len - target_overlap`, adjusted forward to a UTF-8 boundary.
fn find_overlap_start_bytes(bytes: &[u8], target_overlap: usize) -> usize {
    if bytes.len() <= target_overlap {
        return 0;
    }

    let start_search = bytes.len().saturating_sub(target_overlap + 50);
    let end_search = bytes
        .len()
        .saturating_sub(target_overlap.saturating_sub(50));

    // Look for a good break point (newline, period, space)
    for i in (start_search..end_search).rev() {
        if bytes[i] == b'\n' || bytes[i] == b'.' || bytes[i] == b' ' {
            return i + 1;
        }
    }

    // BUG FIX: the raw fallback position can point into the middle of a
    // multi-byte UTF-8 sequence, which makes callers' slices panic. Skip
    // forward past continuation bytes (10xxxxxx) to the next boundary.
    let mut pos = bytes.len().saturating_sub(target_overlap);
    while pos < bytes.len() && (bytes[pos] & 0xC0) == 0x80 {
        pos += 1;
    }
    pos
}
389
390/// Enforce max size on all chunks by recursive splitting
391fn enforce_max_size(chunks: Vec<ChunkResult>, config: &ChunkingConfig) -> Vec<ChunkResult> {
392    let mut result = Vec::new();
393
394    for chunk in chunks {
395        if chunk.content.len() > config.max_chunk_size {
396            result.extend(recursive_split(
397                &chunk.content,
398                config.max_chunk_size,
399                chunk.start_offset,
400                chunk.context,
401                config.chunk_overlap,
402            ));
403        } else {
404            result.push(chunk);
405        }
406    }
407
408    result
409}
410
/// Merge small chunks with neighbors, respecting boundaries
///
/// Any chunk shorter than `min_size` that is not itself a boundary is
/// appended to the previous chunk (with a blank line between). A small
/// chunk is kept as-is when it is a boundary, follows a boundary chunk, or
/// is the very first chunk.
///
/// NOTE(review): merging inserts "\n\n" that was not in the original text,
/// so `end_offset - start_offset` no longer equals `content.len()` for
/// merged chunks — confirm downstream users only rely on the offsets.
fn merge_small_chunks(chunks: Vec<ChunkResult>, min_size: usize) -> Vec<ChunkResult> {
    if chunks.is_empty() {
        return chunks;
    }

    let mut result: Vec<ChunkResult> = Vec::new();

    for chunk in chunks {
        if chunk.content.len() >= min_size || chunk.is_boundary {
            result.push(chunk);
        } else if let Some(last) = result.last_mut() {
            // Don't merge across boundaries
            if !last.is_boundary {
                // Merge with previous chunk
                last.content.push_str("\n\n");
                last.content.push_str(&chunk.content);
                last.end_offset = chunk.end_offset;
                // Keep the more specific context
                if chunk.context.is_some() {
                    last.context = chunk.context;
                }
            } else {
                result.push(chunk);
            }
        } else {
            // No previous chunk to merge into - keep the fragment as-is.
            result.push(chunk);
        }
    }

    // Final pass: if a trailing small chunk exists after a boundary, keep it
    result
}
444
445// ============================================================================
446// Markdown Chunking
447// ============================================================================
448
/// A parsed markdown section
///
/// Produced by `parse_markdown_sections`: one section per header-delimited
/// region of the document.
struct MarkdownSection {
    /// Full header path (e.g., ["# Main", "## Sub"]); the section's own
    /// header is the last element (it is pushed before the section flushes)
    header_path: Vec<String>,
    /// Section content (including the header line)
    content: String,
    /// Start offset in original content
    start_offset: usize,
    /// End offset in original content
    end_offset: usize,
}
460
/// Parse markdown into sections by headers
///
/// Walks the document line by line, flushing the accumulated section each
/// time an ATX header (see `get_header_level`) is found. A section's
/// `header_path` holds the stack of enclosing headers including its own.
///
/// NOTE(review): `lines()` strips "\r\n" but only "\n" is re-added, so byte
/// offsets drift on CRLF input — confirm inputs are normalized to LF.
/// NOTE(review): for every section but the first, `start_offset` points at
/// the newline *before* the header (one byte early) while `content` begins
/// at the header itself — confirm whether any consumer cares.
fn parse_markdown_sections(content: &str) -> Vec<MarkdownSection> {
    let mut sections = Vec::new();
    let mut current_section = String::new();
    let mut section_start = 0;
    let mut current_pos = 0;
    let mut header_stack: Vec<(usize, String)> = Vec::new(); // (level, header text)

    let lines: Vec<&str> = content.lines().collect();

    for line in lines.iter() {
        // Re-attach the newline stripped by `lines()` so offsets add up.
        let line_with_newline = if current_pos > 0 {
            format!("\n{}", line)
        } else {
            line.to_string()
        };

        // Check if this is a header
        if let Some(level) = get_header_level(line) {
            // If we have content, save current section
            if !current_section.is_empty() {
                sections.push(MarkdownSection {
                    header_path: header_stack.iter().map(|(_, h)| h.clone()).collect(),
                    content: current_section.clone(),
                    start_offset: section_start,
                    end_offset: current_pos,
                });
            }

            // Update header stack
            // Pop headers of equal or lower level
            while !header_stack.is_empty() && header_stack.last().unwrap().0 >= level {
                header_stack.pop();
            }
            header_stack.push((level, line.to_string()));

            // Start new section
            current_section = line_with_newline.trim_start_matches('\n').to_string();
            section_start = current_pos;
        } else {
            current_section.push_str(&line_with_newline);
        }

        current_pos += line_with_newline.len();
    }

    // Add final section
    if !current_section.is_empty() {
        sections.push(MarkdownSection {
            header_path: header_stack.iter().map(|(_, h)| h.clone()).collect(),
            content: current_section,
            start_offset: section_start,
            end_offset: content.len(),
        });
    }

    // If no sections found, create one for entire content
    if sections.is_empty() {
        sections.push(MarkdownSection {
            header_path: vec![],
            content: content.to_string(),
            start_offset: 0,
            end_offset: content.len(),
        });
    }

    sections
}
529
/// Get the header level (1-6) or None if not a header
///
/// A line counts as an ATX header when, after leading whitespace, it has
/// 1-6 '#' characters immediately followed by a space.
fn get_header_level(line: &str) -> Option<usize> {
    let stripped = line.trim_start();
    let hashes = stripped.bytes().take_while(|&b| b == b'#').count();

    // Byte comparison is valid here: the preceding bytes are all ASCII '#'.
    let space_follows = stripped.as_bytes().get(hashes) == Some(&b' ');
    if (1..=6).contains(&hashes) && space_follows {
        Some(hashes)
    } else {
        None
    }
}
546
/// Format header path as context string
///
/// Joins the header lines with " > "; an empty path yields None.
fn format_header_path(path: &[String]) -> Option<String> {
    match path {
        [] => None,
        _ => Some(path.join(" > ")),
    }
}
554
555/// Chunk markdown by headers, preserving header context
556fn chunk_markdown(content: &str, _config: &ChunkingConfig) -> Vec<ChunkResult> {
557    let sections = parse_markdown_sections(content);
558    let mut chunks = Vec::new();
559
560    for section in sections {
561        let context = format_header_path(&section.header_path);
562        let is_boundary = !section.header_path.is_empty();
563
564        chunks.push(
565            ChunkResult::new(section.content, section.start_offset, section.end_offset)
566                .with_context(context)
567                .with_boundary(is_boundary),
568        );
569    }
570
571    // Size enforcement happens in chunk_content
572    chunks
573}
574
575// ============================================================================
576// Text Chunking
577// ============================================================================
578
579/// Chunk plain text by paragraphs, then sentences
580fn chunk_text(content: &str, config: &ChunkingConfig) -> Vec<ChunkResult> {
581    let mut chunks = Vec::new();
582    let mut current_chunk = String::new();
583    let mut chunk_start = 0;
584    let mut current_pos = 0;
585
586    // Split by double newlines (paragraphs)
587    let paragraphs: Vec<&str> = content.split("\n\n").collect();
588
589    for (i, para) in paragraphs.iter().enumerate() {
590        let sep = if i > 0 { "\n\n" } else { "" };
591        let para_with_sep = format!("{}{}", sep, para);
592
593        // If adding this paragraph would exceed chunk size, save current chunk
594        if !current_chunk.is_empty()
595            && current_chunk.len() + para_with_sep.len() > config.max_chunk_size
596        {
597            chunks.push(ChunkResult::new(
598                current_chunk.clone(),
599                chunk_start,
600                current_pos,
601            ));
602
603            // Start new chunk with overlap
604            let overlap_text = get_overlap_text(&current_chunk, config.chunk_overlap);
605            chunk_start = current_pos - overlap_text.len();
606            current_chunk = overlap_text;
607        }
608
609        current_chunk.push_str(&para_with_sep);
610        current_pos += para_with_sep.len();
611    }
612
613    // Add final chunk
614    if !current_chunk.is_empty() {
615        chunks.push(ChunkResult::new(current_chunk, chunk_start, content.len()));
616    }
617
618    // Ensure at least one chunk
619    if chunks.is_empty() {
620        chunks.push(ChunkResult::new(content.to_string(), 0, content.len()));
621    }
622
623    chunks
624}
625
626// ============================================================================
627// Code Chunking
628// ============================================================================
629
/// Common patterns for function/class boundaries
///
/// Matched by prefix against the whitespace-trimmed start of each line (see
/// `is_code_boundary`), so ordering does not matter and overlapping
/// prefixes are harmless.
const CODE_BOUNDARY_PATTERNS: &[&str] = &[
    // Rust
    "fn ",
    "pub fn ",
    "async fn ",
    "pub async fn ",
    "impl ",
    "struct ",
    "enum ",
    "trait ",
    "mod ",
    "static ",
    "type ",
    "#[",
    "//!",
    // Go
    "func ",
    // Python
    "def ",
    "class ",
    "async def ",
    // JavaScript/TypeScript
    "function ",
    "async function ",
    "export ",
    "export default",
    "module.exports",
    // `const ` covers both Rust consts and JS declarations.
    // BUG FIX: it was previously listed twice (once per language section).
    "const ",
    "let ",
    "var ",
    "interface ",
    // C/C++
    "void ",
    "int ",
    "char ",
    "double ",
    "float ",
    "#define ",
    "#include ",
];
672
/// Extract context from a code boundary line
///
/// Produces a short label for a chunk: a truncated function signature
/// ("name..." up to the opening paren), a `keyword Name` pair for
/// struct/class/impl-like lines, or the first 60 chars of the line.
fn extract_code_context(line: &str) -> String {
    let trimmed = line.trim();

    // Function-like lines: keep the signature up to the opening paren,
    // capped at 60 bytes. Only treat it as a signature when a space precedes
    // the paren (i.e. some keyword/return type comes before the name).
    if let Some(paren_pos) = trimmed.find('(') {
        let signature = &trimmed[..paren_pos];
        if signature.contains(' ') {
            // BUG FIX: the 60-byte cap could land inside a multi-byte UTF-8
            // character and panic the slice; back off to a char boundary.
            let mut cut = paren_pos.min(60);
            while !trimmed.is_char_boundary(cut) {
                cut -= 1;
            }
            return format!("{}...", &trimmed[..cut]);
        }
    }

    // Type-like lines: `keyword Name`, name truncated at 50 bytes.
    for keyword in &[
        "struct ",
        "class ",
        "impl ",
        "trait ",
        "interface ",
        "enum ",
    ] {
        if let Some(rest) = trimmed.strip_prefix(keyword) {
            let name_end = rest
                .find(|c: char| !c.is_alphanumeric() && c != '_' && c != '<' && c != '>')
                .unwrap_or(rest.len());
            // Same char-boundary safety for the 50-byte cap (identifiers may
            // contain non-ASCII alphanumerics).
            let mut cut = name_end.min(50);
            while !rest.is_char_boundary(cut) {
                cut -= 1;
            }
            return format!("{}{}", keyword, &rest[..cut]);
        }
    }

    // Default: first 60 chars
    trimmed.chars().take(60).collect()
}
707
708/// Check if a line is a code boundary
709fn is_code_boundary(line: &str) -> bool {
710    let trimmed = line.trim_start();
711    CODE_BOUNDARY_PATTERNS
712        .iter()
713        .any(|p| trimmed.starts_with(p))
714}
715
/// Chunk code by function/class boundaries
///
/// Streams lines, starting a new chunk whenever a boundary line (per
/// `is_code_boundary`) is reached and the current chunk already exceeds
/// 100 bytes. The most recent boundary line's signature (via
/// `extract_code_context`) becomes the chunk context. `_config` is unused
/// here: size limits are enforced later by `enforce_max_size`.
fn chunk_code(content: &str, _config: &ChunkingConfig) -> Vec<ChunkResult> {
    let mut chunks = Vec::new();
    let mut current_chunk = String::new();
    let mut chunk_start = 0;
    let mut current_pos = 0;
    let mut current_context: Option<String> = None;
    // Once set, this stays true: every chunk after the first boundary line
    // begins at a boundary, so it is flagged as one.
    let mut is_at_boundary = false;

    let lines: Vec<&str> = content.lines().collect();

    for line in lines {
        // Re-attach the newline stripped by `lines()` so offsets add up.
        let line_with_newline = if current_pos > 0 {
            format!("\n{}", line)
        } else {
            line.to_string()
        };

        let boundary = is_code_boundary(line);

        // If we hit a boundary and have substantial content, start new chunk
        if boundary && !current_chunk.is_empty() && current_chunk.len() > 100 {
            chunks.push(
                ChunkResult::new(current_chunk.clone(), chunk_start, current_pos)
                    .with_context(current_context.clone())
                    .with_boundary(is_at_boundary),
            );

            current_chunk = String::new();
            chunk_start = current_pos;
            is_at_boundary = true;
        }

        if boundary {
            current_context = Some(extract_code_context(line));
            is_at_boundary = true;
        }

        // Size-based splitting (will be handled by enforce_max_size)
        current_chunk.push_str(&line_with_newline);
        current_pos += line_with_newline.len();
    }

    // Add final chunk
    if !current_chunk.is_empty() {
        chunks.push(
            ChunkResult::new(current_chunk, chunk_start, content.len())
                .with_context(current_context)
                .with_boundary(is_at_boundary),
        );
    }

    if chunks.is_empty() {
        chunks.push(ChunkResult::new(content.to_string(), 0, content.len()));
    }

    chunks
}
774
775// ============================================================================
776// JSON Chunking
777// ============================================================================
778
779/// Chunk JSON by top-level keys/array elements with nested path context
780fn chunk_json(content: &str, config: &ChunkingConfig) -> Vec<ChunkResult> {
781    // Try to parse as JSON
782    if let Ok(value) = serde_json::from_str::<serde_json::Value>(content) {
783        let chunks = chunk_json_value(&value, config, vec![]);
784        if !chunks.is_empty() {
785            return chunks;
786        }
787    }
788
789    // Fall back to text chunking if JSON parsing fails
790    chunk_text(content, config)
791}
792
/// Recursively chunk a JSON value with path context
///
/// Re-serializes the value (pretty-printed) and groups object entries /
/// array elements into brace- or bracket-wrapped chunks of at most
/// `max_chunk_size` bytes, labelling each chunk with its dotted key path
/// (array elements use `[i]`). An object entry whose serialization alone
/// exceeds the limit is recursed into with its extended path.
///
/// NOTE(review): chunks are built from re-serialized JSON, so start/end
/// offsets are always 0,0 and cannot be mapped back to the source text.
/// NOTE(review): the mid-loop size flush uses the *child* path (`path_str`)
/// as context while the other flushes use the parent path — looks
/// inconsistent; confirm intended.
/// NOTE(review): unlike the Object arm, an oversized array element is not
/// recursed here; it ends up in one oversized chunk for `enforce_max_size`
/// to re-split.
fn chunk_json_value(
    value: &serde_json::Value,
    config: &ChunkingConfig,
    path: Vec<String>,
) -> Vec<ChunkResult> {
    let mut chunks = Vec::new();

    match value {
        serde_json::Value::Object(map) => {
            let mut current_chunk = String::from("{\n");
            let entries: Vec<_> = map.iter().collect();

            for (i, (key, val)) in entries.iter().enumerate() {
                let val_str = serde_json::to_string_pretty(val).unwrap_or_default();
                // Trailing comma on every entry except the last.
                let entry = if i < entries.len() - 1 {
                    format!("  \"{}\": {},\n", key, val_str)
                } else {
                    format!("  \"{}\": {}\n", key, val_str)
                };

                let mut new_path = path.clone();
                new_path.push((*key).clone());
                let path_str = new_path.join(".");

                // If this single entry is too large, try to chunk its value
                if entry.len() > config.max_chunk_size {
                    // Save current chunk if not empty
                    // (`> 3` means: more than just the opening "{\n" plus "}")
                    if current_chunk.len() > 3 {
                        current_chunk.push('}');
                        let context = if path.is_empty() {
                            None
                        } else {
                            Some(path.join("."))
                        };
                        chunks.push(
                            ChunkResult::new(current_chunk, 0, 0)
                                .with_context(context)
                                .with_boundary(true),
                        );
                        current_chunk = String::from("{\n");
                    }

                    // Recursively chunk the large value
                    let sub_chunks = chunk_json_value(val, config, new_path);
                    chunks.extend(sub_chunks);
                    continue;
                }

                if current_chunk.len() + entry.len() > config.max_chunk_size
                    && current_chunk.len() > 3
                {
                    current_chunk.push('}');
                    chunks.push(
                        ChunkResult::new(current_chunk, 0, 0)
                            .with_context(Some(path_str.clone()))
                            .with_boundary(true),
                    );
                    current_chunk = String::from("{\n");
                }

                current_chunk.push_str(&entry);
            }

            current_chunk.push('}');
            if current_chunk.len() > 3 {
                let context = if path.is_empty() {
                    None
                } else {
                    Some(path.join("."))
                };
                chunks.push(
                    ChunkResult::new(current_chunk, 0, 0)
                        .with_context(context)
                        .with_boundary(true),
                );
            }
        }
        serde_json::Value::Array(arr) => {
            let mut current_chunk = String::from("[\n");

            for (i, val) in arr.iter().enumerate() {
                let val_str = serde_json::to_string_pretty(val).unwrap_or_default();
                let entry = if i < arr.len() - 1 {
                    format!("  {},\n", val_str)
                } else {
                    format!("  {}\n", val_str)
                };

                let mut new_path = path.clone();
                new_path.push(format!("[{}]", i));
                let path_str = new_path.join(".");

                if current_chunk.len() + entry.len() > config.max_chunk_size
                    && current_chunk.len() > 3
                {
                    current_chunk.push(']');
                    chunks.push(
                        ChunkResult::new(current_chunk, 0, 0)
                            .with_context(Some(path_str.clone()))
                            .with_boundary(true),
                    );
                    current_chunk = String::from("[\n");
                }

                current_chunk.push_str(&entry);
            }

            current_chunk.push(']');
            if current_chunk.len() > 3 {
                let context = if path.is_empty() {
                    None
                } else {
                    Some(path.join("."))
                };
                chunks.push(
                    ChunkResult::new(current_chunk, 0, 0)
                        .with_context(context)
                        .with_boundary(true),
                );
            }
        }
        _ => {
            // Primitive value - just stringify
            let content = serde_json::to_string_pretty(value).unwrap_or_default();
            let context = if path.is_empty() {
                None
            } else {
                Some(path.join("."))
            };
            chunks.push(
                ChunkResult::new(content, 0, 0)
                    .with_context(context)
                    .with_boundary(false),
            );
        }
    }

    chunks
}
933
934// ============================================================================
935// YAML Chunking
936// ============================================================================
937
938/// Chunk YAML by top-level keys with nested path context
939fn chunk_yaml(content: &str, _config: &ChunkingConfig) -> Vec<ChunkResult> {
940    let mut chunks = Vec::new();
941    let mut current_chunk = String::new();
942    let mut chunk_start = 0;
943    let mut current_pos = 0;
944    let mut key_stack: Vec<(usize, String)> = Vec::new(); // (indent level, key)
945
946    let lines: Vec<&str> = content.lines().collect();
947
948    for line in lines {
949        let line_with_newline = if current_pos > 0 {
950            format!("\n{}", line)
951        } else {
952            line.to_string()
953        };
954
955        // Calculate indent level
956        let indent = line.len() - line.trim_start().len();
957        let trimmed = line.trim();
958
959        // Check if this is a key line (contains : not in a string)
960        let is_key_line = !trimmed.starts_with('-')
961            && !trimmed.starts_with('#')
962            && trimmed.contains(':')
963            && !trimmed.starts_with('"')
964            && !trimmed.starts_with('\'');
965
966        if is_key_line {
967            // Extract the key
968            if let Some(key) = trimmed.split(':').next() {
969                let key = key.trim().to_string();
970
971                // Update key stack based on indentation
972                while !key_stack.is_empty() && key_stack.last().unwrap().0 >= indent {
973                    key_stack.pop();
974                }
975                key_stack.push((indent, key));
976            }
977        }
978
979        // Check if this is a top-level key (no leading whitespace)
980        let is_top_level_key = indent == 0 && is_key_line;
981
982        // Start new chunk at top-level keys
983        if is_top_level_key && !current_chunk.is_empty() && current_chunk.len() > 50 {
984            let context = format_yaml_path(&key_stack[..key_stack.len().saturating_sub(1)]);
985            chunks.push(
986                ChunkResult::new(current_chunk.clone(), chunk_start, current_pos)
987                    .with_context(context)
988                    .with_boundary(true),
989            );
990
991            current_chunk = String::new();
992            chunk_start = current_pos;
993        }
994
995        current_chunk.push_str(&line_with_newline);
996        current_pos += line_with_newline.len();
997    }
998
999    // Add final chunk
1000    if !current_chunk.is_empty() {
1001        let context = format_yaml_path(&key_stack);
1002        chunks.push(
1003            ChunkResult::new(current_chunk, chunk_start, content.len())
1004                .with_context(context)
1005                .with_boundary(!key_stack.is_empty()),
1006        );
1007    }
1008
1009    if chunks.is_empty() {
1010        chunks.push(ChunkResult::new(content.to_string(), 0, content.len()));
1011    }
1012
1013    chunks
1014}
1015
/// Render a stack of `(indent, key)` pairs as a dotted key path,
/// e.g. `[(0, "server"), (2, "port")]` becomes `"server.port"`.
/// Returns `None` for an empty stack.
fn format_yaml_path(stack: &[(usize, String)]) -> Option<String> {
    if stack.is_empty() {
        return None;
    }
    let keys: Vec<&str> = stack.iter().map(|(_, key)| key.as_str()).collect();
    Some(keys.join("."))
}
1029
1030// ============================================================================
1031// Tests
1032// ============================================================================
1033
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_small_content_no_chunking() {
        // Content below the threshold must pass through as a single chunk.
        let input = "Small content";
        let cfg = ChunkingConfig::default();
        let result = chunk_content(input, ContentType::Text, &cfg);

        assert_eq!(result.len(), 1);
        assert_eq!(result[0].content, input);
    }

    #[test]
    fn test_text_chunking() {
        // Oversized plain text is split; no piece may wildly exceed the max.
        let input = "a".repeat(2000);
        let cfg = ChunkingConfig {
            min_chunk_threshold: 1000,
            max_chunk_size: 500,
            min_chunk_size: 100,
            chunk_overlap: 100,
        };
        let result = chunk_content(&input, ContentType::Text, &cfg);

        assert!(result.len() > 1);
        // Allow some flexibility for overlap.
        let limit = cfg.max_chunk_size + cfg.chunk_overlap + 100;
        assert!(result.iter().all(|piece| piece.content.len() <= limit));
    }

    #[test]
    fn test_markdown_splits_by_headers_first() {
        // Two short header sections -> exactly two chunks, one per header.
        let input = "# H1\nShort content here.\n\n# H2\nAlso short content.";
        let cfg = ChunkingConfig {
            min_chunk_threshold: 10, // Force chunking
            max_chunk_size: 1000,
            min_chunk_size: 10, // Allow small chunks
            chunk_overlap: 0,
        };
        let result = chunk_content(input, ContentType::Markdown, &cfg);

        assert_eq!(result.len(), 2);
        assert!(result[0].content.contains("# H1"));
        assert!(result[1].content.contains("# H2"));
    }

    #[test]
    fn test_large_section_gets_subsplit() {
        // A single oversized section is split further by the second pass.
        let body = "This is a long sentence. ".repeat(100);
        let input = format!("# Header\n\n{}", body);
        let cfg = ChunkingConfig {
            min_chunk_threshold: 100,
            max_chunk_size: 500,
            min_chunk_size: 100,
            chunk_overlap: 50,
        };
        let result = chunk_content(&input, ContentType::Markdown, &cfg);

        assert!(result.len() > 1);
        // Each piece stays near the limit (overlap grants some slack).
        let limit = cfg.max_chunk_size + cfg.chunk_overlap + 100;
        for piece in &result {
            assert!(piece.content.len() <= limit);
        }
    }

    #[test]
    fn test_small_chunks_merged() {
        // Tiny header sections get merged up toward min_chunk_size, so we
        // never end with more chunks than headers.
        let input = "# A\nx\n\n# B\ny\n\n# C\nz";
        let cfg = ChunkingConfig {
            min_chunk_threshold: 5,
            max_chunk_size: 1000,
            min_chunk_size: 50, // Minimum size that will cause merging
            chunk_overlap: 0,
        };
        let result = chunk_content(input, ContentType::Markdown, &cfg);

        assert!(result.len() <= 3);
    }

    #[test]
    fn test_header_path_context() {
        // A nested section's context carries the full header path.
        let input = "# Main\n\n## Sub\n\nContent here\n\n### Detail\n\nMore content";
        let cfg = ChunkingConfig {
            min_chunk_threshold: 10,
            max_chunk_size: 1000,
            min_chunk_size: 10,
            chunk_overlap: 0,
        };
        let result = chunk_content(input, ContentType::Markdown, &cfg);

        let detail = result.iter().find(|c| c.content.contains("### Detail"));
        assert!(detail.is_some());
        let path = detail.unwrap().context.as_ref().unwrap();
        assert!(path.contains("# Main"));
        assert!(path.contains("## Sub"));
        assert!(path.contains("### Detail"));
    }

    #[test]
    fn test_markdown_chunking_preserves_context() {
        let input = format!(
            "# Header 1\n\n{}\n\n# Header 2\n\n{}",
            "a".repeat(600),
            "b".repeat(600)
        );
        let cfg = ChunkingConfig {
            min_chunk_threshold: 500,
            max_chunk_size: 500,
            min_chunk_size: 100,
            chunk_overlap: 50,
        };
        let result = chunk_content(&input, ContentType::Markdown, &cfg);

        assert!(result.len() >= 2);
        // At least one chunk must carry header context.
        assert!(result.iter().any(|piece| piece.context.is_some()));
    }

    #[test]
    fn test_code_chunking() {
        // Two functions, each padded well past the max chunk size.
        let input = format!(
            "fn foo() {{\n{}\n}}\n\nfn bar() {{\n{}\n}}",
            "    // code\n".repeat(50),
            "    // more code\n".repeat(50)
        );
        let cfg = ChunkingConfig {
            min_chunk_threshold: 500,
            max_chunk_size: 500,
            min_chunk_size: 100,
            chunk_overlap: 50,
        };
        let result = chunk_content(&input, ContentType::Code, &cfg);

        assert!(result.len() >= 2);
    }

    #[test]
    fn test_code_boundary_patterns() {
        // Representative declaration prefixes across Rust and JS/TS.
        let samples = [
            "fn test()",
            "pub fn test()",
            "async fn test()",
            "const FOO",
            "export default",
            "module.exports",
            "interface Foo",
            "type Bar",
        ];

        for sample in samples {
            assert!(
                is_code_boundary(sample),
                "Pattern '{}' should be recognized as boundary",
                sample
            );
        }
    }

    #[test]
    fn test_json_chunking() {
        let input = serde_json::json!({
            "key1": "a".repeat(300),
            "key2": "b".repeat(300),
            "key3": "c".repeat(300),
        })
        .to_string();

        let cfg = ChunkingConfig {
            min_chunk_threshold: 500,
            max_chunk_size: 400,
            min_chunk_size: 100,
            chunk_overlap: 50,
        };
        let result = chunk_content(&input, ContentType::Json, &cfg);

        assert!(!result.is_empty());
    }

    #[test]
    fn test_json_nested_path_context() {
        let input = serde_json::json!({
            "users": {
                "profile": {
                    "settings": "value"
                }
            }
        })
        .to_string();

        let cfg = ChunkingConfig {
            min_chunk_threshold: 10,
            max_chunk_size: 1000,
            min_chunk_size: 10,
            chunk_overlap: 0,
        };
        let result = chunk_content(&input, ContentType::Json, &cfg);

        // Nested objects must still yield at least one chunk.
        assert!(!result.is_empty());
    }

    #[test]
    fn test_sentence_splitting() {
        let text = "First sentence. Second sentence? Third sentence! Fourth.";
        let parts = split_at_sentences(text);

        assert!(parts.len() >= 3);
        assert!(parts[0].contains("First"));
    }

    #[test]
    fn test_yaml_chunking_with_path() {
        let input = r#"
server:
  host: localhost
  port: 8080
database:
  host: db.example.com
  port: 5432
"#;
        let cfg = ChunkingConfig {
            min_chunk_threshold: 10,
            max_chunk_size: 1000,
            min_chunk_size: 10,
            chunk_overlap: 0,
        };
        let result = chunk_content(input, ContentType::Yaml, &cfg);

        // The server and database sections should be represented.
        assert!(!result.is_empty());
    }

    #[test]
    fn test_recursive_split_preserves_context() {
        // Every sub-chunk from the recursive pass keeps the parent context.
        let long_text = "This is a sentence. ".repeat(100);
        let result = recursive_split(&long_text, 200, 0, Some("test context".to_string()), 20);

        assert!(result.len() > 1);
        for piece in &result {
            assert_eq!(piece.context.as_deref(), Some("test context"));
        }
    }
}