agentroot_core/index/
chunker.rs

//! Document chunking for embedding.
2
/// Target size of a single chunk, measured in tokens.
///
/// NOTE(review): not referenced in this file; presumably consumed by a
/// token-based chunker elsewhere in the crate — confirm.
pub const CHUNK_SIZE_TOKENS: usize = 800;

/// Number of tokens shared between consecutive chunks
/// (15% of [`CHUNK_SIZE_TOKENS`]).
pub const CHUNK_OVERLAP_TOKENS: usize = 120;

/// Target chunk size for the character-based fallback.
///
/// Numerically four times [`CHUNK_SIZE_TOKENS`].
/// NOTE(review): looks like a ~4-characters-per-token estimate — confirm.
pub const CHUNK_SIZE_CHARS: usize = 3_200;

/// Overlap between consecutive chunks for the character-based fallback.
///
/// Numerically four times [`CHUNK_OVERLAP_TOKENS`].
pub const CHUNK_OVERLAP_CHARS: usize = 480;
8
/// A contiguous piece of a document, ready to be embedded.
///
/// Derives `PartialEq`/`Eq` so chunks can be compared directly (for example
/// with `assert_eq!` in tests).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Chunk {
    /// The chunk's text, copied out of the source document.
    pub text: String,
    /// Byte offset in the source document at which `text` begins.
    pub position: usize,
    /// Number of tokens in `text`, when known. The character-based chunker
    /// in this file always leaves it as `None`.
    pub token_count: Option<usize>,
}
16
/// Returns the largest char boundary of `s` that is `<= index`.
///
/// An `index` at or past the end of `s` yields `s.len()`.
fn floor_char_boundary(s: &str, index: usize) -> usize {
    // Clamp to the string length, then walk backwards until a boundary is hit.
    // Both 0 and `s.len()` are always boundaries, so the search cannot fail.
    let upper = index.min(s.len());
    (0..=upper)
        .rev()
        .find(|&pos| s.is_char_boundary(pos))
        .unwrap_or(0)
}
28
/// Returns the smallest char boundary of `s` that is `>= index`.
///
/// An `index` at or past the end of `s` yields `s.len()`.
fn ceil_char_boundary(s: &str, index: usize) -> usize {
    // Scan forward from `index`; if the range is empty or no boundary is found
    // before the end, the end of the string is the answer.
    (index..s.len())
        .find(|&pos| s.is_char_boundary(pos))
        .unwrap_or(s.len())
}
40
41/// Character-based chunking (fallback)
42pub fn chunk_by_chars(content: &str, chunk_size: usize, overlap: usize) -> Vec<Chunk> {
43    if content.len() <= chunk_size {
44        return vec![Chunk {
45            text: content.to_string(),
46            position: 0,
47            token_count: None,
48        }];
49    }
50
51    let mut chunks = Vec::new();
52    let mut start = 0;
53
54    while start < content.len() {
55        let raw_end = (start + chunk_size).min(content.len());
56        let end = floor_char_boundary(content, raw_end);
57        let mut chunk_end = end;
58
59        // Find natural break point in last 30%
60        if end < content.len() {
61            let search_start_raw = start + (chunk_size * 70 / 100);
62            let search_start = ceil_char_boundary(content, search_start_raw);
63
64            if search_start < end {
65                let search_region = &content[search_start..end];
66
67                if let Some(pos) = search_region.rfind("\n\n") {
68                    chunk_end = search_start + pos + 2;
69                } else if let Some(pos) = search_region.rfind(". ") {
70                    chunk_end = search_start + pos + 2;
71                } else if let Some(pos) = search_region.rfind('\n') {
72                    chunk_end = search_start + pos + 1;
73                } else if let Some(pos) = search_region.rfind(' ') {
74                    chunk_end = search_start + pos + 1;
75                }
76            }
77        }
78
79        // Ensure chunk_end is at a char boundary
80        chunk_end = floor_char_boundary(content, chunk_end);
81
82        chunks.push(Chunk {
83            text: content[start..chunk_end].to_string(),
84            position: start,
85            token_count: None,
86        });
87
88        if chunk_end >= content.len() {
89            break;
90        }
91
92        let new_start_raw = chunk_end.saturating_sub(overlap);
93        start = ceil_char_boundary(content, new_start_raw);
94    }
95
96    chunks
97}
98
#[cfg(test)]
mod tests {
    use super::*;

    /// Content shorter than the chunk size comes back as a single chunk.
    #[test]
    fn test_chunk_small_content() {
        let content = "Small content.";
        let chunks = chunk_by_chars(content, 100, 20);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].text, content);
    }

    /// Content longer than the chunk size is split into several chunks.
    #[test]
    fn test_chunk_preserves_paragraphs() {
        let content = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph.";
        let chunks = chunk_by_chars(content, 30, 5);
        assert!(chunks.len() >= 2);
    }

    /// Multi-byte characters must never be split across chunks.
    #[test]
    fn test_chunk_handles_unicode() {
        // Written with escapes so the test survives encoding round-trips:
        // U+4E16 U+754C (3-byte CJK), U+1F389 (4-byte emoji), U+2500 (3-byte box drawing).
        let content = "Hello \u{4e16}\u{754c}! This is a test with emoji \u{1f389} and special chars \u{2500} here.";
        let chunks = chunk_by_chars(content, 20, 5);
        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert!(!chunk.text.is_empty());
            // Every chunk is an exact slice of the input starting at `position`.
            let end = chunk.position + chunk.text.len();
            assert_eq!(&content[chunk.position..end], chunk.text.as_str());
        }
    }

    #[test]
    fn test_floor_char_boundary() {
        // "Hello " is 6 bytes; U+4E16 occupies bytes 6..9 and U+754C bytes 9..12.
        let s = "Hello \u{4e16}\u{754c}";
        assert_eq!(floor_char_boundary(s, 6), 6); // Start of U+4E16
        assert_eq!(floor_char_boundary(s, 7), 6); // Inside U+4E16
        assert_eq!(floor_char_boundary(s, 8), 6); // Inside U+4E16
        assert_eq!(floor_char_boundary(s, 9), 9); // Start of U+754C
    }

    #[test]
    fn test_ceil_char_boundary() {
        let s = "Hello \u{4e16}\u{754c}";
        assert_eq!(ceil_char_boundary(s, 6), 6); // Start of U+4E16
        assert_eq!(ceil_char_boundary(s, 7), 9); // Inside U+4E16
        assert_eq!(ceil_char_boundary(s, 10), 12); // Inside U+754C
        assert_eq!(ceil_char_boundary(s, 99), 12); // Past the end
    }
}
136}