Skip to main content

sediment/
chunker.rs

1//! Chunking logic for splitting documents into searchable pieces
2//!
3//! Uses a two-pass approach for all content types:
4//! 1. First pass: Split by semantic boundaries (headers, functions, paragraphs)
5//! 2. Second pass: If any chunk exceeds max size, recursively split further
6//!
7//! Also applies minimum chunk size to avoid tiny fragments by merging small sections.
8
9use crate::document::ContentType;
10
/// Configuration for chunking.
///
/// All sizes are in characters/bytes of content. Derives `Copy`/`PartialEq`/
/// `Eq` since this is a small, plain-data config type — callers may pass it
/// by value and compare configs cheaply.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub struct ChunkingConfig {
    /// Minimum content length before chunking is applied (default: 1000 chars)
    pub min_chunk_threshold: usize,
    /// Maximum chunk size in characters (default: 800 chars)
    pub max_chunk_size: usize,
    /// Minimum chunk size - merge if below (default: 200 chars)
    pub min_chunk_size: usize,
    /// Overlap between chunks in characters (default: 100 chars)
    pub chunk_overlap: usize,
}
23
24impl Default for ChunkingConfig {
25    fn default() -> Self {
26        Self {
27            min_chunk_threshold: 1000,
28            max_chunk_size: 800,
29            min_chunk_size: 200,
30            chunk_overlap: 100,
31        }
32    }
33}
34
/// Result of chunking a piece of content.
///
/// Derives `PartialEq`/`Eq` so chunks can be compared directly in tests and
/// deduplication logic (all fields are themselves `Eq`).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ChunkResult {
    /// The chunk content
    pub content: String,
    /// Start byte offset in original content
    pub start_offset: usize,
    /// End byte offset in original content
    pub end_offset: usize,
    /// Optional context (e.g., parent header for markdown)
    pub context: Option<String>,
    /// Whether this chunk represents a major boundary (header, function) - don't merge across
    pub is_boundary: bool,
}
49
50impl ChunkResult {
51    fn new(content: String, start_offset: usize, end_offset: usize) -> Self {
52        Self {
53            content,
54            start_offset,
55            end_offset,
56            context: None,
57            is_boundary: false,
58        }
59    }
60
61    fn with_context(mut self, context: Option<String>) -> Self {
62        self.context = context;
63        self
64    }
65
66    fn with_boundary(mut self, is_boundary: bool) -> Self {
67        self.is_boundary = is_boundary;
68        self
69    }
70}
71
72/// Chunk content based on content type
73pub fn chunk_content(
74    content: &str,
75    content_type: ContentType,
76    config: &ChunkingConfig,
77) -> Vec<ChunkResult> {
78    // Don't chunk if content is below threshold (use char count, not byte length)
79    let char_count = content.chars().count();
80    if char_count < config.min_chunk_threshold {
81        return vec![ChunkResult::new(content.to_string(), 0, content.len())];
82    }
83
84    // First pass: semantic splitting
85    let chunks = match content_type {
86        ContentType::Markdown => chunk_markdown(content, config),
87        ContentType::Json => chunk_json(content, config),
88        ContentType::Yaml => chunk_yaml(content, config),
89        ContentType::Code => chunk_code(content, config),
90        ContentType::Text => chunk_text(content, config),
91    };
92
93    // Second pass: enforce max size by recursive splitting
94    let chunks = enforce_max_size(chunks, config);
95
96    // Third pass: merge small chunks (respecting boundaries and max size)
97    merge_small_chunks(chunks, config.min_chunk_size, config.max_chunk_size)
98}
99
100// ============================================================================
101// Helper Functions
102// ============================================================================
103
/// Split text at sentence boundaries.
///
/// A sentence ends at `.`, `?`, `!` (ASCII or their full-width CJK forms)
/// when followed by whitespace or end of input. Whitespace between sentences
/// is consumed, so the returned slices do not cover it.
fn split_at_sentences(text: &str) -> Vec<&str> {
    const TERMINATORS: [char; 6] = ['.', '?', '!', '。', '?', '!'];
    let is_ws = |c: char| matches!(c, ' ' | '\n' | '\t');

    let mut out = Vec::new();
    let mut begin = 0;
    let mut iter = text.char_indices().peekable();

    while let Some((pos, c)) = iter.next() {
        if !TERMINATORS.contains(&c) {
            continue;
        }
        let stop = pos + c.len_utf8();
        // Only treat the terminator as a sentence end when followed by
        // whitespace or end of input ("3.14" stays intact).
        let followed_by_ws = iter.peek().map_or(true, |&(_, next)| is_ws(next));
        if !followed_by_ws {
            continue;
        }
        if begin < stop {
            out.push(&text[begin..stop]);
        }
        // Consume the inter-sentence whitespace run.
        while iter.peek().map_or(false, |&(_, next)| is_ws(next)) {
            iter.next();
        }
        begin = iter.peek().map_or(text.len(), |&(idx, _)| idx);
    }

    // Whatever trails the last terminator is its own (partial) sentence.
    if begin < text.len() {
        out.push(&text[begin..]);
    }
    if out.is_empty() && !text.is_empty() {
        out.push(text);
    }

    out
}
150
/// Recursively split content: paragraphs -> sentences -> chars
///
/// `offset` is the byte position of `text` within the original document and
/// is propagated so emitted chunks carry absolute offsets. Up to `overlap`
/// bytes from the tail of each chunk are prepended to the next chunk, so
/// consecutive chunks intentionally share a byte range.
fn recursive_split(
    text: &str,
    max_size: usize,
    offset: usize,
    context: Option<String>,
    overlap: usize,
) -> Vec<ChunkResult> {
    // Base case: already small enough — emit a single chunk.
    if text.len() <= max_size {
        return vec![
            ChunkResult::new(text.to_string(), offset, offset + text.len()).with_context(context),
        ];
    }

    let mut chunks = Vec::new();

    // Try splitting by paragraphs first
    let paragraphs: Vec<&str> = text.split("\n\n").collect();
    if paragraphs.len() > 1 {
        let mut current_chunk = String::new();
        let mut chunk_start = offset;
        let mut current_pos = offset;

        for (i, para) in paragraphs.iter().enumerate() {
            // Re-attach the "\n\n" separator consumed by split() so
            // current_pos stays aligned with byte offsets in `text`.
            let sep = if i > 0 { "\n\n" } else { "" };
            let para_with_sep = format!("{}{}", sep, para);

            if !current_chunk.is_empty() && current_chunk.len() + para_with_sep.len() > max_size {
                // Save current chunk
                // NOTE(review): a single paragraph larger than max_size is
                // appended whole below and can be flushed here oversize; only
                // the *final* accumulated chunk is re-split by sentences —
                // verify oversize mid-loop flushes are acceptable downstream.
                chunks.push(
                    ChunkResult::new(current_chunk.clone(), chunk_start, current_pos)
                        .with_context(context.clone()),
                );

                // Start new chunk with overlap
                let overlap_text = get_overlap_text(&current_chunk, overlap);
                chunk_start = current_pos - overlap_text.len();
                current_chunk = overlap_text;
            }

            current_chunk.push_str(&para_with_sep);
            current_pos += para_with_sep.len();
        }

        if !current_chunk.is_empty() {
            // If the final chunk is still too large, recursively split by sentences
            if current_chunk.len() > max_size {
                chunks.extend(split_by_sentences(
                    &current_chunk,
                    max_size,
                    chunk_start,
                    context.clone(),
                    overlap,
                ));
            } else {
                chunks.push(
                    ChunkResult::new(current_chunk, chunk_start, current_pos)
                        .with_context(context.clone()),
                );
            }
        }

        return chunks;
    }

    // No paragraph breaks - split by sentences
    split_by_sentences(text, max_size, offset, context, overlap)
}
219
/// Split text by sentences, falling back to character split
///
/// Packs whole sentences into chunks of at most `max_size` bytes, seeding
/// each new chunk with up to `overlap` bytes from the previous one. `offset`
/// is the absolute byte position of `text` in the original document.
fn split_by_sentences(
    text: &str,
    max_size: usize,
    offset: usize,
    context: Option<String>,
    overlap: usize,
) -> Vec<ChunkResult> {
    let sentences = split_at_sentences(text);

    if sentences.len() <= 1 {
        // No sentence boundaries - split by characters
        return split_by_chars(text, max_size, offset, context, overlap);
    }

    let mut chunks = Vec::new();
    let mut current_chunk = String::new();
    let mut chunk_start = offset;
    let mut current_pos = offset;

    for sentence in sentences {
        // Sentences are re-joined with a single space. NOTE(review): the
        // original inter-sentence whitespace was consumed by
        // `split_at_sentences`, so current_pos can drift from true byte
        // offsets when the source used runs of spaces/newlines — confirm
        // approximate offsets are acceptable here.
        let sep = if !current_chunk.is_empty() { " " } else { "" };
        let sentence_with_sep = format!("{}{}", sep, sentence);

        if !current_chunk.is_empty() && current_chunk.len() + sentence_with_sep.len() > max_size {
            // Flush the accumulated chunk before it would overflow.
            chunks.push(
                ChunkResult::new(current_chunk.clone(), chunk_start, current_pos)
                    .with_context(context.clone()),
            );

            // Seed the next chunk with trailing overlap for continuity.
            let overlap_text = get_overlap_text(&current_chunk, overlap);
            chunk_start = current_pos - overlap_text.len();
            current_chunk = overlap_text;
        }

        current_chunk.push_str(&sentence_with_sep);
        current_pos += sentence_with_sep.len();
    }

    if !current_chunk.is_empty() {
        // A single very long sentence can still exceed max_size; hard-split it.
        if current_chunk.len() > max_size {
            chunks.extend(split_by_chars(
                &current_chunk,
                max_size,
                chunk_start,
                context.clone(),
                overlap,
            ));
        } else {
            chunks.push(
                ChunkResult::new(current_chunk, chunk_start, current_pos)
                    .with_context(context.clone()),
            );
        }
    }

    chunks
}
278
/// Split text by characters (last resort)
///
/// Emits chunks of at most `max_size` bytes, preferring to cut at a word
/// boundary within the last 50 bytes of each window. Consecutive chunks
/// share up to `overlap` bytes (capped at `max_size / 2` so the cursor
/// always advances). Every cut point is snapped forward to a valid UTF-8
/// char boundary before slicing.
fn split_by_chars(
    text: &str,
    max_size: usize,
    offset: usize,
    context: Option<String>,
    overlap: usize,
) -> Vec<ChunkResult> {
    let mut chunks = Vec::new();
    let bytes = text.as_bytes();
    let mut start = 0;

    // Ensure we make progress - overlap must be less than chunk size
    let effective_overlap = overlap.min(max_size / 2);

    // NOTE(review): max_size == 0 would loop forever emitting empty chunks
    // (start never advances) — assumed unreachable given config defaults;
    // confirm callers never pass 0.
    while start < text.len() {
        let end = (start + max_size).min(text.len());

        // Try to break at a word boundary
        let actual_end = if end < text.len() {
            find_word_boundary_bytes(bytes, start, end)
        } else {
            end
        };

        // Ensure we make at least some progress
        let actual_end = if actual_end <= start {
            (start + max_size).min(text.len())
        } else {
            actual_end
        };

        // Ensure actual_end is on a valid UTF-8 char boundary
        let actual_end = {
            let mut e = actual_end;
            while e < text.len() && !text.is_char_boundary(e) {
                e += 1;
            }
            e
        };

        chunks.push(
            ChunkResult::new(
                text[start..actual_end].to_string(),
                offset + start,
                offset + actual_end,
            )
            .with_context(context.clone()),
        );

        // Next chunk starts after this one, minus overlap
        // But ensure we always make progress and land on a char boundary
        let next_start = actual_end.saturating_sub(effective_overlap);
        let next_start = {
            let mut s = next_start;
            while s < text.len() && !text.is_char_boundary(s) {
                s += 1;
            }
            s
        };
        start = if next_start <= start {
            actual_end // No overlap if it would cause no progress
        } else {
            next_start
        };
    }

    // Defensive: the loop above always pushes at least once for non-empty
    // text, but guarantee a chunk regardless.
    if chunks.is_empty() && !text.is_empty() {
        chunks.push(
            ChunkResult::new(text.to_string(), offset, offset + text.len()).with_context(context),
        );
    }

    chunks
}
354
/// Find a word boundary near the target position (byte-based for efficiency).
///
/// Scans backwards up to 50 bytes (never before `start`) for a space or
/// newline and returns the position just after it; falls back to `target`.
///
/// Safety: only ASCII bytes (0x20, 0x0A) are matched, and those can never be
/// continuation bytes of a multi-byte UTF-8 sequence, so the returned
/// position is always a valid char boundary.
fn find_word_boundary_bytes(bytes: &[u8], start: usize, target: usize) -> usize {
    let floor = target.saturating_sub(50).max(start);
    bytes[floor..target]
        .iter()
        .rposition(|&b| b == b' ' || b == b'\n')
        .map(|rel| floor + rel + 1)
        .unwrap_or(target)
}
371
372/// Get overlap text from the end of a chunk
373fn get_overlap_text(text: &str, overlap: usize) -> String {
374    if text.len() <= overlap {
375        return text.to_string();
376    }
377
378    let actual_start = find_overlap_start_bytes(text.as_bytes(), overlap);
379    text[actual_start..].to_string()
380}
381
/// Find a good overlap start position (byte-based for efficiency).
///
/// Searches a window of roughly `target_overlap ± 50` bytes from the end for
/// a newline, period, or space, returning the position just after it.
/// Falls back to exactly `target_overlap` bytes from the end, snapped
/// forward off any UTF-8 continuation byte so slicing cannot panic.
fn find_overlap_start_bytes(bytes: &[u8], target_overlap: usize) -> usize {
    let len = bytes.len();
    if len <= target_overlap {
        return 0;
    }

    let lo = len.saturating_sub(target_overlap + 50);
    let hi = len.saturating_sub(target_overlap.saturating_sub(50));

    // Prefer a natural break point inside the search window.
    if let Some(rel) = bytes[lo..hi]
        .iter()
        .rposition(|&b| matches!(b, b'\n' | b'.' | b' '))
    {
        return lo + rel + 1;
    }

    // Fallback: snap forward past continuation bytes (10xxxxxx) so the
    // caller slices on a char boundary.
    let mut pos = len - target_overlap;
    while pos < len && bytes[pos] & 0xC0 == 0x80 {
        pos += 1;
    }
    pos
}
408
409/// Enforce max size on all chunks by recursive splitting
410fn enforce_max_size(chunks: Vec<ChunkResult>, config: &ChunkingConfig) -> Vec<ChunkResult> {
411    let mut result = Vec::new();
412
413    for chunk in chunks {
414        if chunk.content.len() > config.max_chunk_size {
415            result.extend(recursive_split(
416                &chunk.content,
417                config.max_chunk_size,
418                chunk.start_offset,
419                chunk.context,
420                config.chunk_overlap,
421            ));
422        } else {
423            result.push(chunk);
424        }
425    }
426
427    result
428}
429
430/// Merge small chunks with neighbors, respecting boundaries and max size
431fn merge_small_chunks(
432    chunks: Vec<ChunkResult>,
433    min_size: usize,
434    max_size: usize,
435) -> Vec<ChunkResult> {
436    if chunks.is_empty() {
437        return chunks;
438    }
439
440    let mut result: Vec<ChunkResult> = Vec::new();
441
442    for chunk in chunks {
443        if chunk.content.len() >= min_size || chunk.is_boundary {
444            result.push(chunk);
445        } else if let Some(last) = result.last_mut() {
446            // Don't merge across boundaries or if result would exceed max size
447            let merged_len = last.content.len() + 2 + chunk.content.len();
448            if !last.is_boundary && merged_len <= max_size {
449                // Merge with previous chunk
450                last.content.push_str("\n\n");
451                last.content.push_str(&chunk.content);
452                last.end_offset = chunk.end_offset;
453                // Keep the more specific context
454                if chunk.context.is_some() {
455                    last.context = chunk.context;
456                }
457            } else {
458                result.push(chunk);
459            }
460        } else {
461            result.push(chunk);
462        }
463    }
464
465    // Final pass: if a trailing small chunk exists after a boundary, keep it
466    result
467}
468
469// ============================================================================
470// Markdown Chunking
471// ============================================================================
472
/// A parsed markdown section
///
/// Produced by `parse_markdown_sections`; offsets index into the original
/// document string so sections can be mapped back to their source bytes.
struct MarkdownSection {
    /// Full header path (e.g., ["# Main", "## Sub"]), outermost header first
    header_path: Vec<String>,
    /// Section content (including the header line)
    content: String,
    /// Start offset in original content
    start_offset: usize,
    /// End offset in original content
    end_offset: usize,
}
484
/// Parse markdown into sections by headers
///
/// Walks the document line by line, maintaining a stack of enclosing headers
/// so each emitted section carries its full header path (including its own
/// header). A new section starts at every header line; non-header lines
/// accumulate into the current section.
fn parse_markdown_sections(content: &str) -> Vec<MarkdownSection> {
    let mut sections = Vec::new();
    let mut current_section = String::new();
    let mut section_start = 0;
    let mut current_pos = 0;
    let mut header_stack: Vec<(usize, String)> = Vec::new(); // (level, header text)

    let lines: Vec<&str> = content.lines().collect();

    for line in lines.iter() {
        // `lines()` strips the terminator, so re-prepend "\n" to every line
        // after the first to keep current_pos aligned with byte offsets.
        // NOTE(review): assumes "\n" line endings; CRLF input would make
        // offsets drift one byte per line — confirm inputs are normalized.
        let line_with_newline = if current_pos > 0 {
            format!("\n{}", line)
        } else {
            line.to_string()
        };

        // Check if this is a header
        if let Some(level) = get_header_level(line) {
            // If we have content, save current section
            if !current_section.is_empty() {
                sections.push(MarkdownSection {
                    header_path: header_stack.iter().map(|(_, h)| h.clone()).collect(),
                    content: current_section.clone(),
                    start_offset: section_start,
                    end_offset: current_pos,
                });
            }

            // Update header stack
            // Pop headers of equal or lower level
            while !header_stack.is_empty() && header_stack.last().unwrap().0 >= level {
                header_stack.pop();
            }
            header_stack.push((level, line.to_string()));

            // Start new section
            // NOTE(review): section_start points at the "\n" *preceding* the
            // header while the stored content strips it, so start_offset can
            // sit one byte before the header text — verify consumers expect
            // this.
            current_section = line_with_newline.trim_start_matches('\n').to_string();
            section_start = current_pos;
        } else {
            current_section.push_str(&line_with_newline);
        }

        current_pos += line_with_newline.len();
    }

    // Add final section
    if !current_section.is_empty() {
        sections.push(MarkdownSection {
            header_path: header_stack.iter().map(|(_, h)| h.clone()).collect(),
            content: current_section,
            start_offset: section_start,
            end_offset: content.len(),
        });
    }

    // If no sections found, create one for entire content
    if sections.is_empty() {
        sections.push(MarkdownSection {
            header_path: vec![],
            content: content.to_string(),
            start_offset: 0,
            end_offset: content.len(),
        });
    }

    sections
}
553
/// Get the header level (1-6) or None if not a header.
///
/// A valid header is 1-6 leading `#` characters (after optional indentation)
/// followed by a space.
fn get_header_level(line: &str) -> Option<usize> {
    let stripped = line.trim_start();
    if !stripped.starts_with('#') {
        return None;
    }

    let hashes = stripped.chars().take_while(|&c| c == '#').count();
    // Require a space after the hashes, per standard markdown headers.
    let valid = (1..=6).contains(&hashes)
        && stripped.len() > hashes
        && stripped.chars().nth(hashes) == Some(' ');

    if valid {
        Some(hashes)
    } else {
        None
    }
}
570
/// Format header path as context string.
///
/// Returns `None` for an empty path; otherwise joins headers with " > ".
fn format_header_path(path: &[String]) -> Option<String> {
    (!path.is_empty()).then(|| path.join(" > "))
}
578
579/// Chunk markdown by headers, preserving header context
580fn chunk_markdown(content: &str, _config: &ChunkingConfig) -> Vec<ChunkResult> {
581    let sections = parse_markdown_sections(content);
582    let mut chunks = Vec::new();
583
584    for section in sections {
585        let context = format_header_path(&section.header_path);
586        let is_boundary = !section.header_path.is_empty();
587
588        chunks.push(
589            ChunkResult::new(section.content, section.start_offset, section.end_offset)
590                .with_context(context)
591                .with_boundary(is_boundary),
592        );
593    }
594
595    // Size enforcement happens in chunk_content
596    chunks
597}
598
599// ============================================================================
600// Text Chunking
601// ============================================================================
602
603/// Chunk plain text by paragraphs, then sentences
604fn chunk_text(content: &str, config: &ChunkingConfig) -> Vec<ChunkResult> {
605    let mut chunks = Vec::new();
606    let mut current_chunk = String::new();
607    let mut chunk_start = 0;
608    let mut current_pos = 0;
609
610    // Split by double newlines (paragraphs)
611    let paragraphs: Vec<&str> = content.split("\n\n").collect();
612
613    for (i, para) in paragraphs.iter().enumerate() {
614        let sep = if i > 0 { "\n\n" } else { "" };
615        let para_with_sep = format!("{}{}", sep, para);
616
617        // If adding this paragraph would exceed chunk size, save current chunk
618        if !current_chunk.is_empty()
619            && current_chunk.len() + para_with_sep.len() > config.max_chunk_size
620        {
621            chunks.push(ChunkResult::new(
622                current_chunk.clone(),
623                chunk_start,
624                current_pos,
625            ));
626
627            // Start new chunk with overlap
628            let overlap_text = get_overlap_text(&current_chunk, config.chunk_overlap);
629            chunk_start = current_pos - overlap_text.len();
630            current_chunk = overlap_text;
631        }
632
633        current_chunk.push_str(&para_with_sep);
634        current_pos += para_with_sep.len();
635    }
636
637    // Add final chunk
638    if !current_chunk.is_empty() {
639        chunks.push(ChunkResult::new(current_chunk, chunk_start, content.len()));
640    }
641
642    // Ensure at least one chunk
643    if chunks.is_empty() {
644        chunks.push(ChunkResult::new(content.to_string(), 0, content.len()));
645    }
646
647    chunks
648}
649
650// ============================================================================
651// Code Chunking
652// ============================================================================
653
/// Common patterns for function/class boundaries.
///
/// Matched with `str::starts_with` against the trimmed start of each line
/// (see `is_code_boundary`), so ordering does not affect results.
/// Fixed: the duplicate `"const "` entry (it appeared in both the Rust and
/// JavaScript sections) has been removed — one entry covers both languages.
const CODE_BOUNDARY_PATTERNS: &[&str] = &[
    // Rust (— `const ` also covers JS/TS const declarations)
    "fn ",
    "pub fn ",
    "async fn ",
    "pub async fn ",
    "impl ",
    "struct ",
    "enum ",
    "trait ",
    "mod ",
    "const ",
    "static ",
    "type ",
    "#[",
    "//!",
    // Go
    "func ",
    // Python
    "def ",
    "class ",
    "async def ",
    // JavaScript/TypeScript
    "function ",
    "async function ",
    "export ",
    "export default",
    "module.exports",
    "let ",
    "var ",
    "interface ",
    // C/C++
    "void ",
    "int ",
    "char ",
    "double ",
    "float ",
    "#define ",
    "#include ",
];
696
/// Extract context from a code boundary line.
///
/// Strategy, in order:
/// 1. A line containing `(` whose prefix contains a space (likely a function
///    signature) yields `"<signature>..."`, truncated to at most 60 bytes.
/// 2. A line starting with a type-introducing keyword yields
///    `"<keyword><name>"`, with the name capped at 50 bytes.
/// 3. Otherwise, the first 60 characters of the trimmed line.
///
/// All byte-index truncations are clamped back to valid UTF-8 char
/// boundaries, so multi-byte identifiers can never cause a slice panic.
fn extract_code_context(line: &str) -> String {
    let trimmed = line.trim();

    // Snap a byte index back to the nearest char boundary at or before it.
    fn floor_boundary(s: &str, mut idx: usize) -> usize {
        while idx > 0 && !s.is_char_boundary(idx) {
            idx -= 1;
        }
        idx
    }

    // Try to extract a meaningful identifier.
    // For function definitions, get up to the opening brace or paren.
    if let Some(paren_pos) = trimmed.find('(') {
        let signature = &trimmed[..paren_pos];
        if signature.contains(' ') {
            // BUGFIX: `&trimmed[..paren_pos.min(60)]` could land inside a
            // multi-byte character and panic; clamp to a char boundary.
            let cut = floor_boundary(trimmed, paren_pos.min(60));
            return format!("{}...", &trimmed[..cut]);
        }
    }

    // For struct/class/impl, get the name.
    for keyword in &[
        "struct ",
        "class ",
        "impl ",
        "trait ",
        "interface ",
        "enum ",
    ] {
        if let Some(rest) = trimmed.strip_prefix(keyword) {
            let name_end = rest
                .find(|c: char| !c.is_alphanumeric() && c != '_' && c != '<' && c != '>')
                .unwrap_or(rest.len());
            // Same char-boundary clamp for very long (possibly multi-byte)
            // names; `name_end` itself is always a boundary, but min(50)
            // might not be.
            let cut = floor_boundary(rest, name_end.min(50));
            return format!("{}{}", keyword, &rest[..cut]);
        }
    }

    // Default: first 60 chars
    trimmed.chars().take(60).collect()
}
730
731/// Check if a line is a code boundary
732fn is_code_boundary(line: &str) -> bool {
733    let trimmed = line.trim_start();
734    CODE_BOUNDARY_PATTERNS
735        .iter()
736        .any(|p| trimmed.starts_with(p))
737}
738
/// Chunk code by function/class boundaries
///
/// Scans line by line; when a boundary line (see `is_code_boundary`) is hit
/// and the accumulated chunk exceeds 100 bytes, the chunk is flushed. Each
/// chunk carries the most recent boundary line's extracted context.
/// Oversized chunks are left intact here — `enforce_max_size` splits them.
fn chunk_code(content: &str, _config: &ChunkingConfig) -> Vec<ChunkResult> {
    let mut chunks = Vec::new();
    let mut current_chunk = String::new();
    let mut chunk_start = 0;
    let mut current_pos = 0;
    // Context (e.g. extracted signature) applied to the chunk being built.
    let mut current_context: Option<String> = None;
    // True once the chunk being built started at, or contains, a boundary line.
    let mut is_at_boundary = false;

    let lines: Vec<&str> = content.lines().collect();

    for line in lines {
        // Re-attach the newline stripped by `lines()` so current_pos tracks
        // byte offsets. NOTE(review): assumes "\n" line endings.
        let line_with_newline = if current_pos > 0 {
            format!("\n{}", line)
        } else {
            line.to_string()
        };

        let boundary = is_code_boundary(line);

        // If we hit a boundary and have substantial content, start new chunk.
        // The 100-byte floor avoids emitting a chunk per short prefix line
        // (e.g. an attribute directly above an `fn`).
        if boundary && !current_chunk.is_empty() && current_chunk.len() > 100 {
            chunks.push(
                ChunkResult::new(current_chunk.clone(), chunk_start, current_pos)
                    .with_context(current_context.clone())
                    .with_boundary(is_at_boundary),
            );

            current_chunk = String::new();
            chunk_start = current_pos;
            is_at_boundary = true;
        }

        if boundary {
            // Latest boundary wins as the context for the current chunk.
            current_context = Some(extract_code_context(line));
            is_at_boundary = true;
        }

        // Size-based splitting (will be handled by enforce_max_size)
        current_chunk.push_str(&line_with_newline);
        current_pos += line_with_newline.len();
    }

    // Add final chunk
    if !current_chunk.is_empty() {
        chunks.push(
            ChunkResult::new(current_chunk, chunk_start, content.len())
                .with_context(current_context)
                .with_boundary(is_at_boundary),
        );
    }

    // Guarantee at least one chunk even for empty input.
    if chunks.is_empty() {
        chunks.push(ChunkResult::new(content.to_string(), 0, content.len()));
    }

    chunks
}
797
798// ============================================================================
799// JSON Chunking
800// ============================================================================
801
802/// Chunk JSON by top-level keys/array elements with nested path context
803fn chunk_json(content: &str, config: &ChunkingConfig) -> Vec<ChunkResult> {
804    // Try to parse as JSON
805    if let Ok(value) = serde_json::from_str::<serde_json::Value>(content) {
806        let chunks = chunk_json_value(&value, config, vec![]);
807        if !chunks.is_empty() {
808            return chunks;
809        }
810    }
811
812    // Fall back to text chunking if JSON parsing fails
813    chunk_text(content, config)
814}
815
/// Recursively chunk a JSON value with path context.
///
/// Objects and arrays are re-rendered entry by entry; entries are packed
/// into `{...}` / `[...]` fragments up to `max_chunk_size`. An entry that is
/// itself too large is recursed into with its key/index appended to `path`.
///
/// Note: JSON chunks are re-serialized from parsed JSON, so `start_offset` and
/// `end_offset` represent positions within the serialized chunk content (0..len),
/// not byte offsets into the original input string.
fn chunk_json_value(
    value: &serde_json::Value,
    config: &ChunkingConfig,
    path: Vec<String>,
) -> Vec<ChunkResult> {
    let mut chunks = Vec::new();

    match value {
        serde_json::Value::Object(map) => {
            // "{\n" is 2 bytes; `len() > 3` below means "has at least one entry".
            let mut current_chunk = String::from("{\n");
            let entries: Vec<_> = map.iter().collect();

            for (i, (key, val)) in entries.iter().enumerate() {
                let val_str = serde_json::to_string_pretty(val).unwrap_or_default();
                // Trailing comma on every entry except the last.
                let entry = if i < entries.len() - 1 {
                    format!("  \"{}\": {},\n", key, val_str)
                } else {
                    format!("  \"{}\": {}\n", key, val_str)
                };

                let mut new_path = path.clone();
                new_path.push((*key).clone());
                let path_str = new_path.join(".");

                // If this single entry is too large, try to chunk its value
                if entry.len() > config.max_chunk_size {
                    // Save current chunk if not empty
                    if current_chunk.len() > 3 {
                        current_chunk.push('}');
                        let len = current_chunk.len();
                        let context = if path.is_empty() {
                            None
                        } else {
                            Some(path.join("."))
                        };
                        chunks.push(
                            ChunkResult::new(current_chunk, 0, len)
                                .with_context(context)
                                .with_boundary(true),
                        );
                        current_chunk = String::from("{\n");
                    }

                    // Recursively chunk the large value
                    let sub_chunks = chunk_json_value(val, config, new_path);
                    chunks.extend(sub_chunks);
                    continue;
                }

                if current_chunk.len() + entry.len() > config.max_chunk_size
                    && current_chunk.len() > 3
                {
                    current_chunk.push('}');
                    let len = current_chunk.len();
                    // NOTE(review): this flushed chunk contains *previous*
                    // entries but is labeled with the path of the entry that
                    // triggered the flush (`path_str`), unlike the other flush
                    // sites which use the object's own `path` — confirm this
                    // asymmetry is intended.
                    chunks.push(
                        ChunkResult::new(current_chunk, 0, len)
                            .with_context(Some(path_str.clone()))
                            .with_boundary(true),
                    );
                    current_chunk = String::from("{\n");
                }

                current_chunk.push_str(&entry);
            }

            // Flush the final fragment; len == 3 here means "{\n}" (empty).
            current_chunk.push('}');
            if current_chunk.len() > 3 {
                let len = current_chunk.len();
                let context = if path.is_empty() {
                    None
                } else {
                    Some(path.join("."))
                };
                chunks.push(
                    ChunkResult::new(current_chunk, 0, len)
                        .with_context(context)
                        .with_boundary(true),
                );
            }
        }
        serde_json::Value::Array(arr) => {
            // Same packing scheme as objects, with "[i]" path segments.
            let mut current_chunk = String::from("[\n");

            for (i, val) in arr.iter().enumerate() {
                let val_str = serde_json::to_string_pretty(val).unwrap_or_default();
                let entry = if i < arr.len() - 1 {
                    format!("  {},\n", val_str)
                } else {
                    format!("  {}\n", val_str)
                };

                let mut new_path = path.clone();
                new_path.push(format!("[{}]", i));
                let path_str = new_path.join(".");

                if current_chunk.len() + entry.len() > config.max_chunk_size
                    && current_chunk.len() > 3
                {
                    current_chunk.push(']');
                    let len = current_chunk.len();
                    chunks.push(
                        ChunkResult::new(current_chunk, 0, len)
                            .with_context(Some(path_str.clone()))
                            .with_boundary(true),
                    );
                    current_chunk = String::from("[\n");
                }

                current_chunk.push_str(&entry);
            }

            current_chunk.push(']');
            if current_chunk.len() > 3 {
                let len = current_chunk.len();
                let context = if path.is_empty() {
                    None
                } else {
                    Some(path.join("."))
                };
                chunks.push(
                    ChunkResult::new(current_chunk, 0, len)
                        .with_context(context)
                        .with_boundary(true),
                );
            }
        }
        _ => {
            // Primitive value - just stringify
            let content = serde_json::to_string_pretty(value).unwrap_or_default();
            let len = content.len();
            let context = if path.is_empty() {
                None
            } else {
                Some(path.join("."))
            };
            chunks.push(
                ChunkResult::new(content, 0, len)
                    .with_context(context)
                    .with_boundary(false),
            );
        }
    }

    chunks
}
966
967// ============================================================================
968// YAML Chunking
969// ============================================================================
970
971/// Chunk YAML by top-level keys with nested path context
972fn chunk_yaml(content: &str, _config: &ChunkingConfig) -> Vec<ChunkResult> {
973    let mut chunks = Vec::new();
974    let mut current_chunk = String::new();
975    let mut chunk_start = 0;
976    let mut current_pos = 0;
977    let mut key_stack: Vec<(usize, String)> = Vec::new(); // (indent level, key)
978
979    let lines: Vec<&str> = content.lines().collect();
980
981    for line in lines {
982        let line_with_newline = if current_pos > 0 {
983            format!("\n{}", line)
984        } else {
985            line.to_string()
986        };
987
988        // Calculate indent level
989        let indent = line.len() - line.trim_start().len();
990        let trimmed = line.trim();
991
992        // Check if this is a YAML key line.
993        // A key line has an unquoted key followed by ':'. We check that ':' is not
994        // inside a URL (contains ://) and the line doesn't start with flow indicators.
995        let is_key_line = !trimmed.starts_with('-')
996            && !trimmed.starts_with('#')
997            && !trimmed.starts_with('"')
998            && !trimmed.starts_with('\'')
999            && !trimmed.starts_with('{')
1000            && !trimmed.starts_with('[')
1001            && trimmed.contains(':')
1002            && !trimmed.contains("://");
1003
1004        if is_key_line {
1005            // Extract the key
1006            if let Some(key) = trimmed.split(':').next() {
1007                let key = key.trim().to_string();
1008
1009                // Update key stack based on indentation
1010                while !key_stack.is_empty() && key_stack.last().unwrap().0 >= indent {
1011                    key_stack.pop();
1012                }
1013                key_stack.push((indent, key));
1014            }
1015        }
1016
1017        // Check if this is a top-level key (no leading whitespace)
1018        let is_top_level_key = indent == 0 && is_key_line;
1019
1020        // Start new chunk at top-level keys
1021        if is_top_level_key && !current_chunk.is_empty() && current_chunk.len() > 50 {
1022            let context = format_yaml_path(&key_stack[..key_stack.len().saturating_sub(1)]);
1023            chunks.push(
1024                ChunkResult::new(current_chunk.clone(), chunk_start, current_pos)
1025                    .with_context(context)
1026                    .with_boundary(true),
1027            );
1028
1029            current_chunk = String::new();
1030            chunk_start = current_pos;
1031        }
1032
1033        current_chunk.push_str(&line_with_newline);
1034        current_pos += line_with_newline.len();
1035    }
1036
1037    // Add final chunk
1038    if !current_chunk.is_empty() {
1039        let context = format_yaml_path(&key_stack);
1040        chunks.push(
1041            ChunkResult::new(current_chunk, chunk_start, content.len())
1042                .with_context(context)
1043                .with_boundary(!key_stack.is_empty()),
1044        );
1045    }
1046
1047    if chunks.is_empty() {
1048        chunks.push(ChunkResult::new(content.to_string(), 0, content.len()));
1049    }
1050
1051    chunks
1052}
1053
/// Join the keys of an indent/key stack into a dotted path string.
///
/// Returns `None` for an empty stack so the result can be stored
/// directly as an optional chunk context.
fn format_yaml_path(stack: &[(usize, String)]) -> Option<String> {
    let mut keys = stack.iter().map(|(_, key)| key.as_str());
    // `?` propagates None for an empty stack.
    let mut path = String::from(keys.next()?);
    for key in keys {
        path.push('.');
        path.push_str(key);
    }
    Some(path)
}
1067
1068// ============================================================================
1069// Tests
1070// ============================================================================
1071
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_small_content_no_chunking() {
        // Content under the threshold passes through as a single chunk.
        let content = "Small content";
        let chunks = chunk_content(content, ContentType::Text, &ChunkingConfig::default());

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, content);
    }

    #[test]
    fn test_text_chunking() {
        let config = ChunkingConfig {
            min_chunk_threshold: 1000,
            max_chunk_size: 500,
            min_chunk_size: 100,
            chunk_overlap: 100,
        };
        let text = "a".repeat(2000);
        let chunks = chunk_content(&text, ContentType::Text, &config);

        assert!(chunks.len() > 1);
        // Overlap may push a chunk past max_chunk_size; allow generous slack.
        let limit = config.max_chunk_size + config.chunk_overlap + 100;
        assert!(chunks.iter().all(|c| c.content.len() <= limit));
    }

    #[test]
    fn test_markdown_splits_by_headers_first() {
        let config = ChunkingConfig {
            min_chunk_threshold: 10, // force chunking for tiny input
            max_chunk_size: 1000,
            min_chunk_size: 10, // permit small chunks
            chunk_overlap: 0,
        };
        let chunks = chunk_content(
            "# H1\nShort content here.\n\n# H2\nAlso short content.",
            ContentType::Markdown,
            &config,
        );

        // One chunk per header section.
        assert_eq!(chunks.len(), 2);
        assert!(chunks[0].content.contains("# H1"));
        assert!(chunks[1].content.contains("# H2"));
    }

    #[test]
    fn test_large_section_gets_subsplit() {
        let body = "This is a long sentence. ".repeat(100);
        let content = format!("# Header\n\n{}", body);
        let config = ChunkingConfig {
            min_chunk_threshold: 100,
            max_chunk_size: 500,
            min_chunk_size: 100,
            chunk_overlap: 50,
        };
        let chunks = chunk_content(&content, ContentType::Markdown, &config);

        // The oversized section must be split by second-pass enforcement.
        assert!(chunks.len() > 1);
        // Every chunk stays under max size, with tolerance for overlap.
        let limit = config.max_chunk_size + config.chunk_overlap + 100;
        assert!(chunks.iter().all(|c| c.content.len() <= limit));
    }

    #[test]
    fn test_small_chunks_merged() {
        let config = ChunkingConfig {
            min_chunk_threshold: 5,
            max_chunk_size: 1000,
            min_chunk_size: 50, // forces merging of the tiny sections
            chunk_overlap: 0,
        };
        let chunks = chunk_content("# A\nx\n\n# B\ny\n\n# C\nz", ContentType::Markdown, &config);

        // Tiny sections merge while header boundaries are respected, so we
        // never exceed one chunk per header.
        assert!(chunks.len() <= 3);
    }

    #[test]
    fn test_header_path_context() {
        let config = ChunkingConfig {
            min_chunk_threshold: 10,
            max_chunk_size: 1000,
            min_chunk_size: 10,
            chunk_overlap: 0,
        };
        let chunks = chunk_content(
            "# Main\n\n## Sub\n\nContent here\n\n### Detail\n\nMore content",
            ContentType::Markdown,
            &config,
        );

        // The deepest section's context should carry the full header chain.
        let detail = chunks.iter().find(|c| c.content.contains("### Detail"));
        assert!(detail.is_some());
        let ctx = detail.unwrap().context.as_ref().unwrap();
        for header in ["# Main", "## Sub", "### Detail"] {
            assert!(ctx.contains(header));
        }
    }

    #[test]
    fn test_markdown_chunking_preserves_context() {
        let content = format!(
            "# Header 1\n\n{}\n\n# Header 2\n\n{}",
            "a".repeat(600),
            "b".repeat(600)
        );
        let config = ChunkingConfig {
            min_chunk_threshold: 500,
            max_chunk_size: 500,
            min_chunk_size: 100,
            chunk_overlap: 50,
        };
        let chunks = chunk_content(&content, ContentType::Markdown, &config);

        assert!(chunks.len() >= 2);
        // At least one chunk must carry header context.
        assert!(chunks.iter().any(|c| c.context.is_some()));
    }

    #[test]
    fn test_code_chunking() {
        let content = format!(
            "fn foo() {{\n{}\n}}\n\nfn bar() {{\n{}\n}}",
            "    // code\n".repeat(50),
            "    // more code\n".repeat(50)
        );
        let config = ChunkingConfig {
            min_chunk_threshold: 500,
            max_chunk_size: 500,
            min_chunk_size: 100,
            chunk_overlap: 50,
        };

        assert!(chunk_content(&content, ContentType::Code, &config).len() >= 2);
    }

    #[test]
    fn test_code_boundary_patterns() {
        for pattern in [
            "fn test()",
            "pub fn test()",
            "async fn test()",
            "const FOO",
            "export default",
            "module.exports",
            "interface Foo",
            "type Bar",
        ] {
            assert!(
                is_code_boundary(pattern),
                "Pattern '{}' should be recognized as boundary",
                pattern
            );
        }
    }

    #[test]
    fn test_json_chunking() {
        let content = serde_json::json!({
            "key1": "a".repeat(300),
            "key2": "b".repeat(300),
            "key3": "c".repeat(300),
        })
        .to_string();
        let config = ChunkingConfig {
            min_chunk_threshold: 500,
            max_chunk_size: 400,
            min_chunk_size: 100,
            chunk_overlap: 50,
        };

        assert!(!chunk_content(&content, ContentType::Json, &config).is_empty());
    }

    #[test]
    fn test_json_nested_path_context() {
        let content = serde_json::json!({
            "users": {
                "profile": {
                    "settings": "value"
                }
            }
        })
        .to_string();
        let config = ChunkingConfig {
            min_chunk_threshold: 10,
            max_chunk_size: 1000,
            min_chunk_size: 10,
            chunk_overlap: 0,
        };

        // Nested objects should produce chunks carrying dotted-path context.
        assert!(!chunk_content(&content, ContentType::Json, &config).is_empty());
    }

    #[test]
    fn test_sentence_splitting() {
        let sentences =
            split_at_sentences("First sentence. Second sentence? Third sentence! Fourth.");

        assert!(sentences.len() >= 3);
        assert!(sentences[0].contains("First"));
    }

    #[test]
    fn test_yaml_chunking_with_path() {
        let content = r#"
server:
  host: localhost
  port: 8080
database:
  host: db.example.com
  port: 5432
"#;
        let config = ChunkingConfig {
            min_chunk_threshold: 10,
            max_chunk_size: 1000,
            min_chunk_size: 10,
            chunk_overlap: 0,
        };

        // Expect chunks covering both the server and database sections.
        assert!(!chunk_content(content, ContentType::Yaml, &config).is_empty());
    }

    #[test]
    fn test_chunk_content_threshold_uses_chars_not_bytes() {
        // Regression for bug #4: the chunking threshold counts characters,
        // not bytes. 999 four-byte emoji = 999 chars but 3996 bytes.
        let content: String = "πŸ˜€".repeat(999);
        assert_eq!(content.chars().count(), 999);
        assert_eq!(content.len(), 3996);

        // Default min_chunk_threshold is 1000 chars; 999 chars stays whole.
        let chunks = chunk_content(&content, ContentType::Text, &ChunkingConfig::default());
        assert_eq!(
            chunks.len(),
            1,
            "Should not chunk content below char threshold"
        );
    }

    #[test]
    fn test_split_by_chars_multibyte_no_panic() {
        // Regression for bug #5: splitting must respect UTF-8 boundaries.
        let content: String = "ζ—₯".repeat(2000); // 3 bytes per char
        let chunks = split_by_chars(&content, 500, 0, None, 50);

        assert!(!chunks.is_empty());
        // Reaching here without a panic proves boundary safety (chunks are
        // Strings, hence valid UTF-8); also ensure none came out empty.
        assert!(chunks.iter().all(|c| !c.content.is_empty()));
    }

    #[test]
    fn test_split_by_chars_mixed_multibyte() {
        // ASCII interleaved with multi-byte characters.
        let content = "Hello δΈ–η•Œ! ".repeat(200);
        let chunks = split_by_chars(&content, 100, 0, None, 20);

        assert!(!chunks.is_empty());
        assert!(chunks.iter().all(|c| !c.content.is_empty()));
    }

    #[test]
    fn test_recursive_split_preserves_context() {
        let text = "This is a sentence. ".repeat(100);
        let chunks = recursive_split(&text, 200, 0, Some("test context".to_string()), 20);

        assert!(chunks.len() > 1);
        // Every sub-chunk keeps the parent context verbatim.
        assert!(chunks
            .iter()
            .all(|c| c.context.as_deref() == Some("test context")));
    }
}