Skip to main content

sediment/
chunker.rs

1//! Chunking logic for splitting documents into searchable pieces
2//!
3//! Uses a two-pass approach for all content types:
4//! 1. First pass: Split by semantic boundaries (headers, functions, paragraphs)
5//! 2. Second pass: If any chunk exceeds max size, recursively split further
6//!
7//! Also applies minimum chunk size to avoid tiny fragments by merging small sections.
8
9use crate::document::ContentType;
10
/// Configuration for chunking.
///
/// NOTE: `min_chunk_threshold` is compared against a *character* count in
/// `chunk_content`, while `max_chunk_size`, `min_chunk_size` and
/// `chunk_overlap` are compared against byte lengths (`str::len`) throughout
/// the splitting code.
#[derive(Debug, Clone)]
pub struct ChunkingConfig {
    /// Minimum content length before chunking is applied, in characters
    /// (default: 1000). Shorter documents are returned as a single chunk.
    pub min_chunk_threshold: usize,
    /// Maximum chunk size in bytes (default: 800). Oversized chunks are
    /// re-split by paragraph, then sentence, then character.
    pub max_chunk_size: usize,
    /// Minimum chunk size in bytes (default: 200) - smaller chunks are merged
    /// into the preceding chunk unless a boundary forbids it.
    pub min_chunk_size: usize,
    /// Overlap between adjacent chunks in bytes (default: 100).
    pub chunk_overlap: usize,
}
23
24impl Default for ChunkingConfig {
25    fn default() -> Self {
26        Self {
27            min_chunk_threshold: 1000,
28            max_chunk_size: 800,
29            min_chunk_size: 200,
30            chunk_overlap: 100,
31        }
32    }
33}
34
35impl ChunkingConfig {
36    /// Create config with legacy field name for backwards compatibility
37    pub fn with_chunk_size(mut self, size: usize) -> Self {
38        self.max_chunk_size = size;
39        self
40    }
41}
42
/// Result of chunking a piece of content
#[derive(Debug, Clone)]
pub struct ChunkResult {
    /// The chunk content
    pub content: String,
    /// Start offset in original content (byte position; for JSON chunks the
    /// offsets refer to the re-serialized chunk, not the original input)
    pub start_offset: usize,
    /// End offset in original content (byte position)
    pub end_offset: usize,
    /// Optional context (e.g., parent header path for markdown, dotted key
    /// path for JSON, signature line for code)
    pub context: Option<String>,
    /// Whether this chunk represents a major boundary (header, function) - don't merge across
    pub is_boundary: bool,
}
57
58impl ChunkResult {
59    fn new(content: String, start_offset: usize, end_offset: usize) -> Self {
60        Self {
61            content,
62            start_offset,
63            end_offset,
64            context: None,
65            is_boundary: false,
66        }
67    }
68
69    fn with_context(mut self, context: Option<String>) -> Self {
70        self.context = context;
71        self
72    }
73
74    fn with_boundary(mut self, is_boundary: bool) -> Self {
75        self.is_boundary = is_boundary;
76        self
77    }
78}
79
80/// Chunk content based on content type
81pub fn chunk_content(
82    content: &str,
83    content_type: ContentType,
84    config: &ChunkingConfig,
85) -> Vec<ChunkResult> {
86    // Don't chunk if content is below threshold (use char count, not byte length)
87    let char_count = content.chars().count();
88    if char_count < config.min_chunk_threshold {
89        return vec![ChunkResult::new(content.to_string(), 0, content.len())];
90    }
91
92    // First pass: semantic splitting
93    let chunks = match content_type {
94        ContentType::Markdown => chunk_markdown(content, config),
95        ContentType::Json => chunk_json(content, config),
96        ContentType::Yaml => chunk_yaml(content, config),
97        ContentType::Code => chunk_code(content, config),
98        ContentType::Text => chunk_text(content, config),
99    };
100
101    // Second pass: enforce max size by recursive splitting
102    let chunks = enforce_max_size(chunks, config);
103
104    // Third pass: merge small chunks (respecting boundaries)
105    merge_small_chunks(chunks, config.min_chunk_size)
106}
107
108// ============================================================================
109// Helper Functions
110// ============================================================================
111
/// Split text at sentence boundaries.
///
/// A sentence ends at `.`, `?`, `!` or the fullwidth/CJK forms `。`, `？`,
/// `！` when followed by whitespace or the end of the text. Whitespace
/// between sentences is consumed (not included in any sentence). Returns the
/// whole text as a single element when no boundary is found; an empty input
/// yields an empty vector.
fn split_at_sentences(text: &str) -> Vec<&str> {
    let mut sentences = Vec::new();
    let mut start = 0;
    let mut char_indices = text.char_indices().peekable();

    while let Some((i, ch)) = char_indices.next() {
        // Sentence-ending punctuation: ASCII plus fullwidth forms.
        // (Bug fix: ASCII '?' and '!' were previously listed twice - an
        // unreachable duplicate - while the fullwidth '？'/'！' were missing
        // even though '。' was already handled.)
        if matches!(ch, '.' | '?' | '!' | '。' | '？' | '！') {
            let end = i + ch.len_utf8();
            // Only a boundary when followed by whitespace or EOF, so that
            // decimals like "3.14" are not split.
            let at_end_or_ws = match char_indices.peek() {
                None => true,
                Some(&(_, next_ch)) => next_ch == ' ' || next_ch == '\n' || next_ch == '\t',
            };
            if at_end_or_ws {
                if start < end {
                    sentences.push(&text[start..end]);
                }
                // Consume the whitespace run after the punctuation.
                while let Some(&(_, next_ch)) = char_indices.peek() {
                    if next_ch == ' ' || next_ch == '\n' || next_ch == '\t' {
                        char_indices.next();
                    } else {
                        break;
                    }
                }
                start = match char_indices.peek() {
                    Some(&(idx, _)) => idx,
                    None => text.len(),
                };
            }
        }
    }

    // Trailing text without a closing punctuation mark.
    if start < text.len() {
        sentences.push(&text[start..]);
    }

    if sentences.is_empty() && !text.is_empty() {
        sentences.push(text);
    }

    sentences
}
158
159/// Recursively split content: paragraphs -> sentences -> chars
160fn recursive_split(
161    text: &str,
162    max_size: usize,
163    offset: usize,
164    context: Option<String>,
165    overlap: usize,
166) -> Vec<ChunkResult> {
167    if text.len() <= max_size {
168        return vec![
169            ChunkResult::new(text.to_string(), offset, offset + text.len()).with_context(context),
170        ];
171    }
172
173    let mut chunks = Vec::new();
174
175    // Try splitting by paragraphs first
176    let paragraphs: Vec<&str> = text.split("\n\n").collect();
177    if paragraphs.len() > 1 {
178        let mut current_chunk = String::new();
179        let mut chunk_start = offset;
180        let mut current_pos = offset;
181
182        for (i, para) in paragraphs.iter().enumerate() {
183            let sep = if i > 0 { "\n\n" } else { "" };
184            let para_with_sep = format!("{}{}", sep, para);
185
186            if !current_chunk.is_empty() && current_chunk.len() + para_with_sep.len() > max_size {
187                // Save current chunk
188                chunks.push(
189                    ChunkResult::new(current_chunk.clone(), chunk_start, current_pos)
190                        .with_context(context.clone()),
191                );
192
193                // Start new chunk with overlap
194                let overlap_text = get_overlap_text(&current_chunk, overlap);
195                chunk_start = current_pos - overlap_text.len();
196                current_chunk = overlap_text;
197            }
198
199            current_chunk.push_str(&para_with_sep);
200            current_pos += para_with_sep.len();
201        }
202
203        if !current_chunk.is_empty() {
204            // If the final chunk is still too large, recursively split by sentences
205            if current_chunk.len() > max_size {
206                chunks.extend(split_by_sentences(
207                    &current_chunk,
208                    max_size,
209                    chunk_start,
210                    context.clone(),
211                    overlap,
212                ));
213            } else {
214                chunks.push(
215                    ChunkResult::new(current_chunk, chunk_start, current_pos)
216                        .with_context(context.clone()),
217                );
218            }
219        }
220
221        return chunks;
222    }
223
224    // No paragraph breaks - split by sentences
225    split_by_sentences(text, max_size, offset, context, overlap)
226}
227
/// Split text by sentences, falling back to character splitting when no
/// sentence boundaries exist.
///
/// Sentences are accumulated into chunks of at most `max_size` bytes, with
/// `overlap` trailing bytes carried into each new chunk. If the final
/// accumulated chunk still exceeds `max_size` (a single huge sentence), it is
/// handed to `split_by_chars`.
///
/// NOTE(review): sentences are re-joined with a single space, while
/// `split_at_sentences` consumed the original inter-sentence whitespace - so
/// when the source used newlines or multiple spaces, `current_pos` (and the
/// resulting offsets) drift from true source positions. Offsets here are
/// approximate; confirm whether consumers need exact positions.
fn split_by_sentences(
    text: &str,
    max_size: usize,
    offset: usize,
    context: Option<String>,
    overlap: usize,
) -> Vec<ChunkResult> {
    let sentences = split_at_sentences(text);

    if sentences.len() <= 1 {
        // No sentence boundaries - split by characters
        return split_by_chars(text, max_size, offset, context, overlap);
    }

    let mut chunks = Vec::new();
    let mut current_chunk = String::new();
    let mut chunk_start = offset;
    let mut current_pos = offset;

    for sentence in sentences {
        // Re-join with a single space (see NOTE above about offset drift).
        let sep = if !current_chunk.is_empty() { " " } else { "" };
        let sentence_with_sep = format!("{}{}", sep, sentence);

        // Flush the accumulated chunk before it would exceed max_size.
        if !current_chunk.is_empty() && current_chunk.len() + sentence_with_sep.len() > max_size {
            chunks.push(
                ChunkResult::new(current_chunk.clone(), chunk_start, current_pos)
                    .with_context(context.clone()),
            );

            // Seed the next chunk with trailing overlap text for continuity.
            let overlap_text = get_overlap_text(&current_chunk, overlap);
            chunk_start = current_pos - overlap_text.len();
            current_chunk = overlap_text;
        }

        current_chunk.push_str(&sentence_with_sep);
        current_pos += sentence_with_sep.len();
    }

    if !current_chunk.is_empty() {
        if current_chunk.len() > max_size {
            // A single sentence longer than max_size - last resort.
            chunks.extend(split_by_chars(
                &current_chunk,
                max_size,
                chunk_start,
                context.clone(),
                overlap,
            ));
        } else {
            chunks.push(
                ChunkResult::new(current_chunk, chunk_start, current_pos)
                    .with_context(context.clone()),
            );
        }
    }

    chunks
}
286
/// Split text by characters (last resort).
///
/// Produces chunks of at most roughly `max_size` bytes, preferring to break
/// at a space/newline within 50 bytes before the limit. Consecutive chunks
/// share up to `overlap` bytes, capped at `max_size / 2` so every iteration
/// advances. Slice positions are nudged forward onto UTF-8 char boundaries,
/// so a chunk may exceed `max_size` by a few bytes on multi-byte text.
fn split_by_chars(
    text: &str,
    max_size: usize,
    offset: usize,
    context: Option<String>,
    overlap: usize,
) -> Vec<ChunkResult> {
    let mut chunks = Vec::new();
    let bytes = text.as_bytes();
    let mut start = 0;

    // Ensure we make progress - overlap must be less than chunk size
    let effective_overlap = overlap.min(max_size / 2);

    while start < text.len() {
        let end = (start + max_size).min(text.len());

        // Try to break at a word boundary (only when not already at the end)
        let actual_end = if end < text.len() {
            find_word_boundary_bytes(bytes, start, end)
        } else {
            end
        };

        // Ensure we make at least some progress
        let actual_end = if actual_end <= start {
            (start + max_size).min(text.len())
        } else {
            actual_end
        };

        // Ensure actual_end is on a valid UTF-8 char boundary
        // (walk forward; never slice mid-character)
        let actual_end = {
            let mut e = actual_end;
            while e < text.len() && !text.is_char_boundary(e) {
                e += 1;
            }
            e
        };

        chunks.push(
            ChunkResult::new(
                text[start..actual_end].to_string(),
                offset + start,
                offset + actual_end,
            )
            .with_context(context.clone()),
        );

        // Next chunk starts after this one, minus overlap
        // But ensure we always make progress and land on a char boundary
        let next_start = actual_end.saturating_sub(effective_overlap);
        let next_start = {
            let mut s = next_start;
            while s < text.len() && !text.is_char_boundary(s) {
                s += 1;
            }
            s
        };
        start = if next_start <= start {
            actual_end // No overlap if it would cause no progress
        } else {
            next_start
        };
    }

    // Defensive: never return an empty result for non-empty input.
    if chunks.is_empty() && !text.is_empty() {
        chunks.push(
            ChunkResult::new(text.to_string(), offset, offset + text.len()).with_context(context),
        );
    }

    chunks
}
362
/// Find a word boundary near the target position (byte-based for efficiency).
///
/// Scans at most 50 bytes backwards from `target` (never before `start`) for
/// a space or newline, returning the position just after it; returns `target`
/// unchanged when none is found.
///
/// Safety: only ASCII bytes (space 0x20, newline 0x0A) are matched, and these
/// can never be continuation bytes of a multi-byte UTF-8 sequence - so the
/// returned position is always a valid char boundary for slicing.
fn find_word_boundary_bytes(bytes: &[u8], start: usize, target: usize) -> usize {
    let window_start = target.saturating_sub(50).max(start);
    match bytes[window_start..target]
        .iter()
        .rposition(|&b| b == b' ' || b == b'\n')
    {
        Some(rel) => window_start + rel + 1,
        None => target,
    }
}
379
380/// Get overlap text from the end of a chunk
381fn get_overlap_text(text: &str, overlap: usize) -> String {
382    if text.len() <= overlap {
383        return text.to_string();
384    }
385
386    let actual_start = find_overlap_start_bytes(text.as_bytes(), overlap);
387    text[actual_start..].to_string()
388}
389
/// Find a good overlap start position (byte-based for efficiency).
///
/// Searches a window of roughly +/-50 bytes around `len - target_overlap`
/// (scanning backwards) for a newline, period, or space, returning the
/// position just after it. Falls back to the raw target position, adjusted
/// forward onto a UTF-8 char boundary so callers can safely slice a `&str`.
fn find_overlap_start_bytes(bytes: &[u8], target_overlap: usize) -> usize {
    if bytes.len() <= target_overlap {
        return 0;
    }

    let start_search = bytes.len().saturating_sub(target_overlap + 50);
    let end_search = bytes
        .len()
        .saturating_sub(target_overlap.saturating_sub(50));

    // Look for a good break point (newline, period, space). These are ASCII
    // bytes, so `i + 1` is always a valid char boundary.
    for i in (start_search..end_search).rev() {
        if bytes[i] == b'\n' || bytes[i] == b'.' || bytes[i] == b' ' {
            return i + 1;
        }
    }

    // Fall back to the target position, skipping any UTF-8 continuation bytes
    // (0b10xxxxxx) so the result is a valid char boundary (bug fix: the raw
    // byte offset could land inside a multi-byte character, making callers
    // panic when slicing a &str at it).
    let mut pos = bytes.len().saturating_sub(target_overlap);
    while pos < bytes.len() && bytes[pos] & 0xC0 == 0x80 {
        pos += 1;
    }
    pos
}
411
412/// Enforce max size on all chunks by recursive splitting
413fn enforce_max_size(chunks: Vec<ChunkResult>, config: &ChunkingConfig) -> Vec<ChunkResult> {
414    let mut result = Vec::new();
415
416    for chunk in chunks {
417        if chunk.content.len() > config.max_chunk_size {
418            result.extend(recursive_split(
419                &chunk.content,
420                config.max_chunk_size,
421                chunk.start_offset,
422                chunk.context,
423                config.chunk_overlap,
424            ));
425        } else {
426            result.push(chunk);
427        }
428    }
429
430    result
431}
432
433/// Merge small chunks with neighbors, respecting boundaries
434fn merge_small_chunks(chunks: Vec<ChunkResult>, min_size: usize) -> Vec<ChunkResult> {
435    if chunks.is_empty() {
436        return chunks;
437    }
438
439    let mut result: Vec<ChunkResult> = Vec::new();
440
441    for chunk in chunks {
442        if chunk.content.len() >= min_size || chunk.is_boundary {
443            result.push(chunk);
444        } else if let Some(last) = result.last_mut() {
445            // Don't merge across boundaries
446            if !last.is_boundary {
447                // Merge with previous chunk
448                last.content.push_str("\n\n");
449                last.content.push_str(&chunk.content);
450                last.end_offset = chunk.end_offset;
451                // Keep the more specific context
452                if chunk.context.is_some() {
453                    last.context = chunk.context;
454                }
455            } else {
456                result.push(chunk);
457            }
458        } else {
459            result.push(chunk);
460        }
461    }
462
463    // Final pass: if a trailing small chunk exists after a boundary, keep it
464    result
465}
466
467// ============================================================================
468// Markdown Chunking
469// ============================================================================
470
/// A parsed markdown section
struct MarkdownSection {
    /// Full header path from the root down to this section
    /// (e.g., ["# Main", "## Sub"]); empty for preamble before any header
    header_path: Vec<String>,
    /// Section content (including the header line itself)
    content: String,
    /// Start offset in original content (byte position)
    /// NOTE(review): for sections after the first, this points at the '\n'
    /// preceding the section while `content` has that newline stripped -
    /// appears off by one; confirm against consumers of these offsets.
    start_offset: usize,
    /// End offset in original content (byte position)
    end_offset: usize,
}
482
/// Parse markdown into sections by headers.
///
/// Walks line by line, starting a new section at every ATX header (`#` to
/// `######` followed by a space). Each section carries the full header path
/// leading into it; headers of equal or lower level pop outer entries from
/// the stack before the new header is pushed. Content before the first
/// header becomes a section with an empty path, and input with no headers at
/// all yields a single section spanning the whole content.
///
/// NOTE(review): `lines()` strips `\r` but only `\n` is re-added when
/// rebuilding positions, so offsets drift on CRLF input - confirm inputs are
/// LF-normalized.
fn parse_markdown_sections(content: &str) -> Vec<MarkdownSection> {
    let mut sections = Vec::new();
    let mut current_section = String::new();
    let mut section_start = 0;
    let mut current_pos = 0;
    let mut header_stack: Vec<(usize, String)> = Vec::new(); // (level, header text)

    let lines: Vec<&str> = content.lines().collect();

    for line in lines.iter() {
        // Re-attach the separating '\n' for every line except the first so
        // current_pos tracks byte positions in the original content.
        let line_with_newline = if current_pos > 0 {
            format!("\n{}", line)
        } else {
            line.to_string()
        };

        // Check if this is a header
        if let Some(level) = get_header_level(line) {
            // If we have content, save current section
            // (its path is the header stack as it stood when it began)
            if !current_section.is_empty() {
                sections.push(MarkdownSection {
                    header_path: header_stack.iter().map(|(_, h)| h.clone()).collect(),
                    content: current_section.clone(),
                    start_offset: section_start,
                    end_offset: current_pos,
                });
            }

            // Update header stack
            // Pop headers of equal or lower level
            while !header_stack.is_empty() && header_stack.last().unwrap().0 >= level {
                header_stack.pop();
            }
            header_stack.push((level, line.to_string()));

            // Start new section (content excludes the separating newline,
            // though section_start still points at it - see MarkdownSection)
            current_section = line_with_newline.trim_start_matches('\n').to_string();
            section_start = current_pos;
        } else {
            current_section.push_str(&line_with_newline);
        }

        current_pos += line_with_newline.len();
    }

    // Add final section
    if !current_section.is_empty() {
        sections.push(MarkdownSection {
            header_path: header_stack.iter().map(|(_, h)| h.clone()).collect(),
            content: current_section,
            start_offset: section_start,
            end_offset: content.len(),
        });
    }

    // If no sections found, create one for entire content
    if sections.is_empty() {
        sections.push(MarkdownSection {
            header_path: vec![],
            content: content.to_string(),
            start_offset: 0,
            end_offset: content.len(),
        });
    }

    sections
}
551
/// Get the ATX header level (1-6) of a line, or `None` if it is not a header.
///
/// Leading whitespace is ignored; a valid header is 1-6 `#` characters
/// immediately followed by a space.
fn get_header_level(line: &str) -> Option<usize> {
    let trimmed = line.trim_start();
    let level = trimmed.chars().take_while(|&c| c == '#').count();
    if (1..=6).contains(&level) && trimmed.chars().nth(level) == Some(' ') {
        Some(level)
    } else {
        None
    }
}
568
/// Format a header path as a context string (`"# A > ## B"`), or `None`
/// for an empty path.
fn format_header_path(path: &[String]) -> Option<String> {
    match path {
        [] => None,
        headers => Some(headers.join(" > ")),
    }
}
576
577/// Chunk markdown by headers, preserving header context
578fn chunk_markdown(content: &str, _config: &ChunkingConfig) -> Vec<ChunkResult> {
579    let sections = parse_markdown_sections(content);
580    let mut chunks = Vec::new();
581
582    for section in sections {
583        let context = format_header_path(&section.header_path);
584        let is_boundary = !section.header_path.is_empty();
585
586        chunks.push(
587            ChunkResult::new(section.content, section.start_offset, section.end_offset)
588                .with_context(context)
589                .with_boundary(is_boundary),
590        );
591    }
592
593    // Size enforcement happens in chunk_content
594    chunks
595}
596
597// ============================================================================
598// Text Chunking
599// ============================================================================
600
601/// Chunk plain text by paragraphs, then sentences
602fn chunk_text(content: &str, config: &ChunkingConfig) -> Vec<ChunkResult> {
603    let mut chunks = Vec::new();
604    let mut current_chunk = String::new();
605    let mut chunk_start = 0;
606    let mut current_pos = 0;
607
608    // Split by double newlines (paragraphs)
609    let paragraphs: Vec<&str> = content.split("\n\n").collect();
610
611    for (i, para) in paragraphs.iter().enumerate() {
612        let sep = if i > 0 { "\n\n" } else { "" };
613        let para_with_sep = format!("{}{}", sep, para);
614
615        // If adding this paragraph would exceed chunk size, save current chunk
616        if !current_chunk.is_empty()
617            && current_chunk.len() + para_with_sep.len() > config.max_chunk_size
618        {
619            chunks.push(ChunkResult::new(
620                current_chunk.clone(),
621                chunk_start,
622                current_pos,
623            ));
624
625            // Start new chunk with overlap
626            let overlap_text = get_overlap_text(&current_chunk, config.chunk_overlap);
627            chunk_start = current_pos - overlap_text.len();
628            current_chunk = overlap_text;
629        }
630
631        current_chunk.push_str(&para_with_sep);
632        current_pos += para_with_sep.len();
633    }
634
635    // Add final chunk
636    if !current_chunk.is_empty() {
637        chunks.push(ChunkResult::new(current_chunk, chunk_start, content.len()));
638    }
639
640    // Ensure at least one chunk
641    if chunks.is_empty() {
642        chunks.push(ChunkResult::new(content.to_string(), 0, content.len()));
643    }
644
645    chunks
646}
647
648// ============================================================================
649// Code Chunking
650// ============================================================================
651
/// Common line-prefix patterns marking function/class boundaries.
///
/// Matched with `starts_with` against the trimmed line in `is_code_boundary`,
/// so ordering does not matter. Some patterns appear in more than one
/// language group (e.g. "const " under both Rust and JavaScript); the
/// duplicate entries are harmless for an any-match prefix scan.
const CODE_BOUNDARY_PATTERNS: &[&str] = &[
    // Rust
    "fn ",
    "pub fn ",
    "async fn ",
    "pub async fn ",
    "impl ",
    "struct ",
    "enum ",
    "trait ",
    "mod ",
    "const ",
    "static ",
    "type ",
    "#[",
    "//!",
    // Go
    "func ",
    // Python
    "def ",
    "class ",
    "async def ",
    // JavaScript/TypeScript
    "function ",
    "async function ",
    "export ",
    "export default",
    "module.exports",
    "const ",
    "let ",
    "var ",
    "interface ",
    // C/C++
    "void ",
    "int ",
    "char ",
    "double ",
    "float ",
    "#define ",
    "#include ",
];
694
/// Extract a short, human-readable context string from a code boundary line.
///
/// For function-like lines (`… name(args)`) returns the text before the
/// opening paren (capped at 60 bytes) with a `...` suffix; for
/// `struct`/`class`/`impl`/`trait`/`interface`/`enum` returns the keyword
/// plus the following identifier (capped at 50 bytes); otherwise the first
/// 60 characters of the trimmed line.
fn extract_code_context(line: &str) -> String {
    let trimmed = line.trim();

    // Function-like: take the signature up to the opening paren.
    if let Some(paren_pos) = trimmed.find('(') {
        let signature = &trimmed[..paren_pos];
        // A space before the paren suggests "keyword name(" - a definition.
        if signature.rfind(' ').is_some() {
            // Cap at 60 bytes, backing up to a char boundary so slicing can't
            // panic (bug fix: the previous raw byte truncation could split a
            // multi-byte UTF-8 character and panic on non-ASCII identifiers).
            let cap = floor_char_boundary_at(trimmed, paren_pos.min(60));
            return format!("{}...", &trimmed[..cap]);
        }
    }

    // For struct/class/impl etc., get the keyword plus the name.
    for keyword in &[
        "struct ",
        "class ",
        "impl ",
        "trait ",
        "interface ",
        "enum ",
    ] {
        if let Some(rest) = trimmed.strip_prefix(keyword) {
            let name_end = rest
                .find(|c: char| !c.is_alphanumeric() && c != '_' && c != '<' && c != '>')
                .unwrap_or(rest.len());
            // Same char-boundary-safe cap as above (identifiers may be
            // non-ASCII: `is_alphanumeric` accepts Unicode letters).
            let cap = floor_char_boundary_at(rest, name_end.min(50));
            return format!("{}{}", keyword, &rest[..cap]);
        }
    }

    // Default: first 60 chars
    trimmed.chars().take(60).collect()
}

/// Largest index `<= target` (clamped to `s.len()`) that is a valid UTF-8
/// char boundary of `s`.
fn floor_char_boundary_at(s: &str, target: usize) -> usize {
    let mut i = target.min(s.len());
    while !s.is_char_boundary(i) {
        i -= 1;
    }
    i
}
729
730/// Check if a line is a code boundary
731fn is_code_boundary(line: &str) -> bool {
732    let trimmed = line.trim_start();
733    CODE_BOUNDARY_PATTERNS
734        .iter()
735        .any(|p| trimmed.starts_with(p))
736}
737
/// Chunk code by function/class boundaries.
///
/// Scans line by line; when a boundary line (see `is_code_boundary`) is hit
/// and at least 100 bytes have accumulated, the current chunk is flushed.
/// Each chunk's context is the most recent boundary signature seen (via
/// `extract_code_context`). Oversized chunks are split later by the caller
/// through `enforce_max_size`.
///
/// NOTE(review): `is_at_boundary` is set true at the first boundary or flush
/// and is never reset, so every chunk after the first boundary is flagged as
/// a boundary chunk (protecting it from merging) - confirm this is intended.
fn chunk_code(content: &str, _config: &ChunkingConfig) -> Vec<ChunkResult> {
    let mut chunks = Vec::new();
    let mut current_chunk = String::new();
    let mut chunk_start = 0;
    let mut current_pos = 0;
    let mut current_context: Option<String> = None;
    let mut is_at_boundary = false;

    let lines: Vec<&str> = content.lines().collect();

    for line in lines {
        // Re-attach the separating '\n' for every line except the first so
        // current_pos tracks byte positions in the original content.
        let line_with_newline = if current_pos > 0 {
            format!("\n{}", line)
        } else {
            line.to_string()
        };

        let boundary = is_code_boundary(line);

        // If we hit a boundary and have substantial content (>100 bytes),
        // flush the accumulated chunk before starting the new construct.
        if boundary && !current_chunk.is_empty() && current_chunk.len() > 100 {
            chunks.push(
                ChunkResult::new(current_chunk.clone(), chunk_start, current_pos)
                    .with_context(current_context.clone())
                    .with_boundary(is_at_boundary),
            );

            current_chunk = String::new();
            chunk_start = current_pos;
            is_at_boundary = true;
        }

        if boundary {
            // Remember the latest signature as context for upcoming lines.
            current_context = Some(extract_code_context(line));
            is_at_boundary = true;
        }

        // Size-based splitting (will be handled by enforce_max_size)
        current_chunk.push_str(&line_with_newline);
        current_pos += line_with_newline.len();
    }

    // Add final chunk
    if !current_chunk.is_empty() {
        chunks.push(
            ChunkResult::new(current_chunk, chunk_start, content.len())
                .with_context(current_context)
                .with_boundary(is_at_boundary),
        );
    }

    // Ensure at least one chunk even for empty input.
    if chunks.is_empty() {
        chunks.push(ChunkResult::new(content.to_string(), 0, content.len()));
    }

    chunks
}
796
797// ============================================================================
798// JSON Chunking
799// ============================================================================
800
801/// Chunk JSON by top-level keys/array elements with nested path context
802fn chunk_json(content: &str, config: &ChunkingConfig) -> Vec<ChunkResult> {
803    // Try to parse as JSON
804    if let Ok(value) = serde_json::from_str::<serde_json::Value>(content) {
805        let chunks = chunk_json_value(&value, config, vec![]);
806        if !chunks.is_empty() {
807            return chunks;
808        }
809    }
810
811    // Fall back to text chunking if JSON parsing fails
812    chunk_text(content, config)
813}
814
/// Recursively chunk a JSON value with path context.
///
/// Objects and arrays are re-serialized entry by entry (pretty-printed) and
/// accumulated into `{…}`/`[…]` wrapped chunks of at most `max_chunk_size`
/// bytes. An object entry that alone exceeds the limit is recursed into with
/// its key appended to the dotted path. All emitted object/array chunks are
/// marked as boundaries.
///
/// Note: JSON chunks are re-serialized from parsed JSON, so `start_offset`
/// and `end_offset` represent positions within the serialized chunk content
/// (0..len), not byte offsets into the original input string.
///
/// NOTE(review): chunk text is JSON-like but not guaranteed to be valid JSON
/// (e.g. a flushed chunk can end with a trailing comma before the closing
/// brace) - confirm consumers treat the text as display/search material only.
fn chunk_json_value(
    value: &serde_json::Value,
    config: &ChunkingConfig,
    path: Vec<String>,
) -> Vec<ChunkResult> {
    let mut chunks = Vec::new();

    match value {
        serde_json::Value::Object(map) => {
            // Accumulate "key: value" entries inside a synthetic "{ ... }".
            let mut current_chunk = String::from("{\n");
            let entries: Vec<_> = map.iter().collect();

            for (i, (key, val)) in entries.iter().enumerate() {
                let val_str = serde_json::to_string_pretty(val).unwrap_or_default();
                // Comma placement follows the entry's position in the source
                // object, not its position within the emitted chunk.
                let entry = if i < entries.len() - 1 {
                    format!("  \"{}\": {},\n", key, val_str)
                } else {
                    format!("  \"{}\": {}\n", key, val_str)
                };

                let mut new_path = path.clone();
                new_path.push((*key).clone());
                let path_str = new_path.join(".");

                // If this single entry is too large, try to chunk its value
                if entry.len() > config.max_chunk_size {
                    // Save current chunk if not empty (len > 3 means more
                    // than just the opening "{\n")
                    if current_chunk.len() > 3 {
                        current_chunk.push('}');
                        let len = current_chunk.len();
                        let context = if path.is_empty() {
                            None
                        } else {
                            Some(path.join("."))
                        };
                        chunks.push(
                            ChunkResult::new(current_chunk, 0, len)
                                .with_context(context)
                                .with_boundary(true),
                        );
                        current_chunk = String::from("{\n");
                    }

                    // Recursively chunk the large value
                    let sub_chunks = chunk_json_value(val, config, new_path);
                    chunks.extend(sub_chunks);
                    continue;
                }

                // Flush before the chunk would exceed the size limit.
                if current_chunk.len() + entry.len() > config.max_chunk_size
                    && current_chunk.len() > 3
                {
                    current_chunk.push('}');
                    let len = current_chunk.len();
                    chunks.push(
                        ChunkResult::new(current_chunk, 0, len)
                            .with_context(Some(path_str.clone()))
                            .with_boundary(true),
                    );
                    current_chunk = String::from("{\n");
                }

                current_chunk.push_str(&entry);
            }

            // Emit whatever remains (skip an empty "{\n}" shell).
            current_chunk.push('}');
            if current_chunk.len() > 3 {
                let len = current_chunk.len();
                let context = if path.is_empty() {
                    None
                } else {
                    Some(path.join("."))
                };
                chunks.push(
                    ChunkResult::new(current_chunk, 0, len)
                        .with_context(context)
                        .with_boundary(true),
                );
            }
        }
        serde_json::Value::Array(arr) => {
            // Same accumulation scheme with "[ ... ]" wrapping and "[i]"
            // index segments in the path.
            let mut current_chunk = String::from("[\n");

            for (i, val) in arr.iter().enumerate() {
                let val_str = serde_json::to_string_pretty(val).unwrap_or_default();
                let entry = if i < arr.len() - 1 {
                    format!("  {},\n", val_str)
                } else {
                    format!("  {}\n", val_str)
                };

                let mut new_path = path.clone();
                new_path.push(format!("[{}]", i));
                let path_str = new_path.join(".");

                if current_chunk.len() + entry.len() > config.max_chunk_size
                    && current_chunk.len() > 3
                {
                    current_chunk.push(']');
                    let len = current_chunk.len();
                    chunks.push(
                        ChunkResult::new(current_chunk, 0, len)
                            .with_context(Some(path_str.clone()))
                            .with_boundary(true),
                    );
                    current_chunk = String::from("[\n");
                }

                current_chunk.push_str(&entry);
            }

            current_chunk.push(']');
            if current_chunk.len() > 3 {
                let len = current_chunk.len();
                let context = if path.is_empty() {
                    None
                } else {
                    Some(path.join("."))
                };
                chunks.push(
                    ChunkResult::new(current_chunk, 0, len)
                        .with_context(context)
                        .with_boundary(true),
                );
            }
        }
        _ => {
            // Primitive value - just stringify (not a boundary chunk, so it
            // may be merged with neighbors later)
            let content = serde_json::to_string_pretty(value).unwrap_or_default();
            let len = content.len();
            let context = if path.is_empty() {
                None
            } else {
                Some(path.join("."))
            };
            chunks.push(
                ChunkResult::new(content, 0, len)
                    .with_context(context)
                    .with_boundary(false),
            );
        }
    }

    chunks
}
965
966// ============================================================================
967// YAML Chunking
968// ============================================================================
969
970/// Chunk YAML by top-level keys with nested path context
971fn chunk_yaml(content: &str, _config: &ChunkingConfig) -> Vec<ChunkResult> {
972    let mut chunks = Vec::new();
973    let mut current_chunk = String::new();
974    let mut chunk_start = 0;
975    let mut current_pos = 0;
976    let mut key_stack: Vec<(usize, String)> = Vec::new(); // (indent level, key)
977
978    let lines: Vec<&str> = content.lines().collect();
979
980    for line in lines {
981        let line_with_newline = if current_pos > 0 {
982            format!("\n{}", line)
983        } else {
984            line.to_string()
985        };
986
987        // Calculate indent level
988        let indent = line.len() - line.trim_start().len();
989        let trimmed = line.trim();
990
991        // Check if this is a YAML key line.
992        // A key line has an unquoted key followed by ':'. We check that ':' is not
993        // inside a URL (contains ://) and the line doesn't start with flow indicators.
994        let is_key_line = !trimmed.starts_with('-')
995            && !trimmed.starts_with('#')
996            && !trimmed.starts_with('"')
997            && !trimmed.starts_with('\'')
998            && !trimmed.starts_with('{')
999            && !trimmed.starts_with('[')
1000            && trimmed.contains(':')
1001            && !trimmed.contains("://");
1002
1003        if is_key_line {
1004            // Extract the key
1005            if let Some(key) = trimmed.split(':').next() {
1006                let key = key.trim().to_string();
1007
1008                // Update key stack based on indentation
1009                while !key_stack.is_empty() && key_stack.last().unwrap().0 >= indent {
1010                    key_stack.pop();
1011                }
1012                key_stack.push((indent, key));
1013            }
1014        }
1015
1016        // Check if this is a top-level key (no leading whitespace)
1017        let is_top_level_key = indent == 0 && is_key_line;
1018
1019        // Start new chunk at top-level keys
1020        if is_top_level_key && !current_chunk.is_empty() && current_chunk.len() > 50 {
1021            let context = format_yaml_path(&key_stack[..key_stack.len().saturating_sub(1)]);
1022            chunks.push(
1023                ChunkResult::new(current_chunk.clone(), chunk_start, current_pos)
1024                    .with_context(context)
1025                    .with_boundary(true),
1026            );
1027
1028            current_chunk = String::new();
1029            chunk_start = current_pos;
1030        }
1031
1032        current_chunk.push_str(&line_with_newline);
1033        current_pos += line_with_newline.len();
1034    }
1035
1036    // Add final chunk
1037    if !current_chunk.is_empty() {
1038        let context = format_yaml_path(&key_stack);
1039        chunks.push(
1040            ChunkResult::new(current_chunk, chunk_start, content.len())
1041                .with_context(context)
1042                .with_boundary(!key_stack.is_empty()),
1043        );
1044    }
1045
1046    if chunks.is_empty() {
1047        chunks.push(ChunkResult::new(content.to_string(), 0, content.len()));
1048    }
1049
1050    chunks
1051}
1052
/// Render a YAML key stack as a dotted path string (e.g. `"server.port"`).
///
/// Returns `None` for an empty stack; the indent component of each entry is
/// ignored — only the key names participate in the path.
fn format_yaml_path(stack: &[(usize, String)]) -> Option<String> {
    if stack.is_empty() {
        return None;
    }

    let mut path = String::new();
    for (position, (_, key)) in stack.iter().enumerate() {
        if position > 0 {
            path.push('.');
        }
        path.push_str(key);
    }
    Some(path)
}
1066
1067// ============================================================================
1068// Tests
1069// ============================================================================
1070
/// Unit tests for the chunking pipeline.
///
/// Covers the chunking threshold, per-content-type strategies (text,
/// markdown, code, JSON, YAML), context propagation, merge behavior, and
/// UTF-8 safety regressions (bugs #4 and #5 noted inline).
#[cfg(test)]
mod tests {
    use super::*;

    // Content shorter than `min_chunk_threshold` must pass through as a
    // single untouched chunk.
    #[test]
    fn test_small_content_no_chunking() {
        let content = "Small content";
        let config = ChunkingConfig::default();
        let chunks = chunk_content(content, ContentType::Text, &config);

        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, content);
    }

    // Plain text over the threshold is split into multiple size-bounded chunks.
    #[test]
    fn test_text_chunking() {
        let content = "a".repeat(2000);
        let config = ChunkingConfig {
            min_chunk_threshold: 1000,
            max_chunk_size: 500,
            min_chunk_size: 100,
            chunk_overlap: 100,
        };
        let chunks = chunk_content(&content, ContentType::Text, &config);

        assert!(chunks.len() > 1);
        for chunk in &chunks {
            // Allow some flexibility for overlap
            assert!(chunk.content.len() <= config.max_chunk_size + config.chunk_overlap + 100);
        }
    }

    // Pass 1 of the two-pass strategy: headers take priority over raw size.
    #[test]
    fn test_markdown_splits_by_headers_first() {
        let content = "# H1\nShort content here.\n\n# H2\nAlso short content.";
        let config = ChunkingConfig {
            min_chunk_threshold: 10, // Force chunking
            max_chunk_size: 1000,
            min_chunk_size: 10, // Allow small chunks
            chunk_overlap: 0,
        };
        let chunks = chunk_content(content, ContentType::Markdown, &config);

        // Should produce 2 chunks (one per header)
        assert_eq!(chunks.len(), 2);
        assert!(chunks[0].content.contains("# H1"));
        assert!(chunks[1].content.contains("# H2"));
    }

    // Pass 2: a section exceeding `max_chunk_size` is recursively subdivided.
    #[test]
    fn test_large_section_gets_subsplit() {
        let long_paragraph = "This is a long sentence. ".repeat(100);
        let content = format!("# Header\n\n{}", long_paragraph);
        let config = ChunkingConfig {
            min_chunk_threshold: 100,
            max_chunk_size: 500,
            min_chunk_size: 100,
            chunk_overlap: 50,
        };
        let chunks = chunk_content(&content, ContentType::Markdown, &config);

        // Should have multiple chunks due to size enforcement
        assert!(chunks.len() > 1);
        // All chunks should be under max size (with some tolerance for overlap)
        for chunk in &chunks {
            assert!(chunk.content.len() <= config.max_chunk_size + config.chunk_overlap + 100);
        }
    }

    // Sections below `min_chunk_size` should be merged with neighbors.
    #[test]
    fn test_small_chunks_merged() {
        let content = "# A\nx\n\n# B\ny\n\n# C\nz";
        let config = ChunkingConfig {
            min_chunk_threshold: 5,
            max_chunk_size: 1000,
            min_chunk_size: 50, // Minimum size that will cause merging
            chunk_overlap: 0,
        };
        let chunks = chunk_content(content, ContentType::Markdown, &config);

        // Small sections should be merged (fewer chunks than headers)
        // But boundaries should be respected
        assert!(chunks.len() <= 3);
    }

    // Nested markdown headers should produce a full breadcrumb path in
    // the chunk's `context` field.
    #[test]
    fn test_header_path_context() {
        let content = "# Main\n\n## Sub\n\nContent here\n\n### Detail\n\nMore content";
        let config = ChunkingConfig {
            min_chunk_threshold: 10,
            max_chunk_size: 1000,
            min_chunk_size: 10,
            chunk_overlap: 0,
        };
        let chunks = chunk_content(content, ContentType::Markdown, &config);

        // Check that context includes full header path
        let detail_chunk = chunks.iter().find(|c| c.content.contains("### Detail"));
        assert!(detail_chunk.is_some());
        let ctx = detail_chunk.unwrap().context.as_ref().unwrap();
        assert!(ctx.contains("# Main"));
        assert!(ctx.contains("## Sub"));
        assert!(ctx.contains("### Detail"));
    }

    // Context must survive the second (size-enforcement) pass.
    #[test]
    fn test_markdown_chunking_preserves_context() {
        let content = format!(
            "# Header 1\n\n{}\n\n# Header 2\n\n{}",
            "a".repeat(600),
            "b".repeat(600)
        );
        let config = ChunkingConfig {
            min_chunk_threshold: 500,
            max_chunk_size: 500,
            min_chunk_size: 100,
            chunk_overlap: 50,
        };
        let chunks = chunk_content(&content, ContentType::Markdown, &config);

        assert!(chunks.len() >= 2);
        // Check that context is preserved
        assert!(chunks.iter().any(|c| c.context.is_some()));
    }

    // Code content splits at function boundaries (two fns -> at least 2 chunks).
    #[test]
    fn test_code_chunking() {
        let content = format!(
            "fn foo() {{\n{}\n}}\n\nfn bar() {{\n{}\n}}",
            "    // code\n".repeat(50),
            "    // more code\n".repeat(50)
        );
        let config = ChunkingConfig {
            min_chunk_threshold: 500,
            max_chunk_size: 500,
            min_chunk_size: 100,
            chunk_overlap: 50,
        };
        let chunks = chunk_content(&content, ContentType::Code, &config);

        assert!(chunks.len() >= 2);
    }

    // Spot-check the declaration patterns (Rust + JS/TS) that
    // `is_code_boundary` must recognize.
    #[test]
    fn test_code_boundary_patterns() {
        let patterns = [
            "fn test()",
            "pub fn test()",
            "async fn test()",
            "const FOO",
            "export default",
            "module.exports",
            "interface Foo",
            "type Bar",
        ];

        for pattern in patterns {
            assert!(
                is_code_boundary(pattern),
                "Pattern '{}' should be recognized as boundary",
                pattern
            );
        }
    }

    // JSON over the threshold should chunk without panicking or dropping content.
    #[test]
    fn test_json_chunking() {
        let content = serde_json::json!({
            "key1": "a".repeat(300),
            "key2": "b".repeat(300),
            "key3": "c".repeat(300),
        })
        .to_string();

        let config = ChunkingConfig {
            min_chunk_threshold: 500,
            max_chunk_size: 400,
            min_chunk_size: 100,
            chunk_overlap: 50,
        };
        let chunks = chunk_content(&content, ContentType::Json, &config);

        assert!(!chunks.is_empty());
    }

    // Deeply nested JSON objects should still produce chunks (path context
    // is exercised here, though only non-emptiness is asserted).
    #[test]
    fn test_json_nested_path_context() {
        let content = serde_json::json!({
            "users": {
                "profile": {
                    "settings": "value"
                }
            }
        })
        .to_string();

        let config = ChunkingConfig {
            min_chunk_threshold: 10,
            max_chunk_size: 1000,
            min_chunk_size: 10,
            chunk_overlap: 0,
        };
        let chunks = chunk_content(&content, ContentType::Json, &config);

        // Should have context with nested path
        assert!(!chunks.is_empty());
    }

    // Sentence splitter should break on '.', '?', and '!' terminators.
    #[test]
    fn test_sentence_splitting() {
        let text = "First sentence. Second sentence? Third sentence! Fourth.";
        let sentences = split_at_sentences(text);

        assert!(sentences.len() >= 3);
        assert!(sentences[0].contains("First"));
    }

    // YAML with two top-level keys should chunk by section.
    #[test]
    fn test_yaml_chunking_with_path() {
        let content = r#"
server:
  host: localhost
  port: 8080
database:
  host: db.example.com
  port: 5432
"#;
        let config = ChunkingConfig {
            min_chunk_threshold: 10,
            max_chunk_size: 1000,
            min_chunk_size: 10,
            chunk_overlap: 0,
        };
        let chunks = chunk_content(content, ContentType::Yaml, &config);

        // Should have chunks for server and database sections
        assert!(!chunks.is_empty());
    }

    #[test]
    fn test_chunk_content_threshold_uses_chars_not_bytes() {
        // Bug #4: chunker threshold should use char count, not byte length
        // 999 4-byte emoji chars = 999 chars but 3996 bytes
        let content: String = "πŸ˜€".repeat(999);
        assert_eq!(content.chars().count(), 999);
        assert_eq!(content.len(), 3996);

        let config = ChunkingConfig::default(); // min_chunk_threshold: 1000
        let chunks = chunk_content(&content, ContentType::Text, &config);
        // 999 chars < 1000 threshold, so should NOT chunk
        assert_eq!(
            chunks.len(),
            1,
            "Should not chunk content below char threshold"
        );
    }

    #[test]
    fn test_split_by_chars_multibyte_no_panic() {
        // Bug #5: split_by_chars must not panic on multi-byte UTF-8
        let content: String = "ζ—₯".repeat(2000); // 3 bytes each
        // Should not panic
        let chunks = split_by_chars(&content, 500, 0, None, 50);
        assert!(!chunks.is_empty());
        for chunk in &chunks {
            // All chunks should be valid UTF-8 (they are Strings, so this is guaranteed
            // if we didn't panic)
            assert!(!chunk.content.is_empty());
        }
    }

    // Same UTF-8 safety property with interleaved ASCII and CJK characters.
    #[test]
    fn test_split_by_chars_mixed_multibyte() {
        // Mix of ASCII and multi-byte chars
        let content = "Hello δΈ–η•Œ! ".repeat(200);
        let chunks = split_by_chars(&content, 100, 0, None, 20);
        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert!(!chunk.content.is_empty());
        }
    }

    // Every sub-chunk produced by recursive splitting must inherit the
    // parent's context string.
    #[test]
    fn test_recursive_split_preserves_context() {
        let long_text = "This is a sentence. ".repeat(100);
        let chunks = recursive_split(&long_text, 200, 0, Some("test context".to_string()), 20);

        assert!(chunks.len() > 1);
        for chunk in &chunks {
            assert!(
                chunk
                    .context
                    .as_ref()
                    .map(|c| c == "test context")
                    .unwrap_or(false)
            );
        }
    }
}