Skip to main content

heartbit_core/knowledge/
chunker.rs

1//! Document chunker — splits large documents into overlapping text chunks for indexing.
2
3use super::{Chunk, DocumentSource};
4
/// Configuration for text chunking.
///
/// Both values are measured in bytes of UTF-8 text, not in characters.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct ChunkConfig {
    /// Target byte length per chunk: paragraphs are packed up to this size
    /// and longer paragraphs are split into windows of this size.
    pub chunk_size: usize,
    /// Number of bytes from the end of one chunk repeated at the start of
    /// the next one.
    pub chunk_overlap: usize,
}

impl Default for ChunkConfig {
    /// Returns a configuration with 1000-byte chunks and a 200-byte overlap.
    fn default() -> Self {
        Self {
            chunk_size: 1000,
            chunk_overlap: 200,
        }
    }
}
22
23/// Generate a deterministic chunk ID from source URI and chunk index.
24///
25/// Uses a simple FNV-1a hash for stability across process restarts and
26/// Rust versions (unlike `DefaultHasher` which uses randomized SipHash).
27fn chunk_id(uri: &str, index: usize) -> String {
28    let hash = crate::util::fnv1a_hash(uri.as_bytes());
29    format!("{hash:016x}-{index}")
30}
31
32/// Split text into overlapping chunks, respecting paragraph boundaries.
33///
34/// Empty text produces no chunks. Paragraphs are split on double newlines.
35/// If a paragraph fits within `chunk_size`, it's kept whole. Large paragraphs
36/// are split at `chunk_size` boundaries with `chunk_overlap` overlap.
37pub fn split_into_chunks(text: &str, source: &DocumentSource, config: &ChunkConfig) -> Vec<Chunk> {
38    let text = text.trim();
39    if text.is_empty() {
40        return vec![];
41    }
42
43    let mut chunks = Vec::new();
44    let mut current = String::new();
45    let mut chunk_index = 0;
46
47    let paragraphs: Vec<&str> = text.split("\n\n").collect();
48
49    for para in &paragraphs {
50        let para = para.trim();
51        if para.is_empty() {
52            continue;
53        }
54
55        // If adding this paragraph would exceed chunk_size, emit current chunk first
56        if !current.is_empty() && current.len() + para.len() + 2 > config.chunk_size {
57            let id = chunk_id(&source.uri, chunk_index);
58            chunks.push(Chunk {
59                id,
60                content: current.clone(),
61                source: source.clone(),
62                chunk_index,
63                tenant_id: None,
64            });
65            chunk_index += 1;
66
67            // Keep overlap from the end of the current chunk
68            if config.chunk_overlap > 0 && current.len() > config.chunk_overlap {
69                let start = current.len() - config.chunk_overlap;
70                // Find a char boundary
71                let start = current.ceil_char_boundary(start);
72                current = current[start..].to_string();
73            } else if config.chunk_overlap == 0 {
74                current.clear();
75            }
76            // If chunk_overlap >= current.len(), keep all of current
77        }
78
79        // Handle paragraphs larger than chunk_size by splitting them
80        if para.len() > config.chunk_size {
81            // First flush current content if any
82            if !current.is_empty() {
83                let id = chunk_id(&source.uri, chunk_index);
84                chunks.push(Chunk {
85                    id,
86                    content: current.clone(),
87                    source: source.clone(),
88                    chunk_index,
89                    tenant_id: None,
90                });
91                chunk_index += 1;
92                current.clear();
93            }
94
95            // Split the large paragraph
96            let mut pos = 0;
97            while pos < para.len() {
98                let end = (pos + config.chunk_size).min(para.len());
99                let end = para.ceil_char_boundary(end);
100                let end = end.min(para.len());
101
102                let id = chunk_id(&source.uri, chunk_index);
103                chunks.push(Chunk {
104                    id,
105                    content: para[pos..end].to_string(),
106                    source: source.clone(),
107                    chunk_index,
108                    tenant_id: None,
109                });
110                chunk_index += 1;
111
112                if end >= para.len() {
113                    break;
114                }
115
116                // Advance with overlap
117                let advance = if config.chunk_overlap < config.chunk_size {
118                    config.chunk_size - config.chunk_overlap
119                } else {
120                    1 // Avoid infinite loop
121                };
122                pos += advance;
123                pos = para.ceil_char_boundary(pos);
124            }
125        } else {
126            // Append paragraph to current chunk
127            if !current.is_empty() {
128                current.push_str("\n\n");
129            }
130            current.push_str(para);
131        }
132    }
133
134    // Emit any remaining content
135    if !current.is_empty() {
136        let id = chunk_id(&source.uri, chunk_index);
137        chunks.push(Chunk {
138            id,
139            content: current,
140            source: source.clone(),
141            chunk_index,
142            tenant_id: None,
143        });
144    }
145
146    chunks
147}
148
#[cfg(test)]
mod tests {
    use super::*;

    /// Fixed document source shared by the tests below.
    fn test_source() -> DocumentSource {
        DocumentSource {
            uri: "test.md".into(),
            title: "Test".into(),
        }
    }

    #[test]
    fn empty_text_produces_no_chunks() {
        let chunks = split_into_chunks("", &test_source(), &ChunkConfig::default());
        assert!(chunks.is_empty());
    }

    #[test]
    fn whitespace_only_produces_no_chunks() {
        let chunks = split_into_chunks("   \n\n  ", &test_source(), &ChunkConfig::default());
        assert!(chunks.is_empty());
    }

    #[test]
    fn single_small_paragraph_is_one_chunk() {
        let text = "Hello, world!";
        let chunks = split_into_chunks(text, &test_source(), &ChunkConfig::default());
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, "Hello, world!");
        assert_eq!(chunks[0].chunk_index, 0);
        assert_eq!(chunks[0].source.uri, "test.md");
    }

    #[test]
    fn multiple_paragraphs_within_limit_are_single_chunk() {
        let text = "First paragraph.\n\nSecond paragraph.";
        let config = ChunkConfig {
            chunk_size: 1000,
            chunk_overlap: 0,
        };
        let chunks = split_into_chunks(text, &test_source(), &config);
        assert_eq!(chunks.len(), 1);
        assert!(chunks[0].content.contains("First paragraph."));
        assert!(chunks[0].content.contains("Second paragraph."));
    }

    #[test]
    fn paragraphs_exceeding_limit_split_into_multiple_chunks() {
        let para1 = "a".repeat(60);
        let para2 = "b".repeat(60);
        let text = format!("{para1}\n\n{para2}");
        let config = ChunkConfig {
            chunk_size: 80,
            chunk_overlap: 0,
        };
        let chunks = split_into_chunks(&text, &test_source(), &config);
        assert!(
            chunks.len() >= 2,
            "expected >= 2 chunks, got {}",
            chunks.len()
        );
        assert!(chunks[0].content.contains('a'));
        assert!(chunks.last().unwrap().content.contains('b'));
    }

    #[test]
    fn overlap_preserves_context() {
        let para1 = "a".repeat(60);
        let para2 = "b".repeat(60);
        let text = format!("{para1}\n\n{para2}");
        let config = ChunkConfig {
            chunk_size: 80,
            chunk_overlap: 20,
        };
        let chunks = split_into_chunks(&text, &test_source(), &config);
        assert!(
            chunks.len() >= 2,
            "expected >= 2 chunks, got {}",
            chunks.len()
        );
        // With overlap=20 the second chunk must start with the trailing
        // 20 bytes ('a's) of the first chunk.
        let first = &chunks[0].content;
        let second = &chunks[1].content;
        let c1_tail = &first[first.len().saturating_sub(20)..];
        let c2_head = &second[..c1_tail.len().min(second.len())];
        assert_eq!(c1_tail, c2_head, "overlap should match");
    }

    #[test]
    fn chunk_indices_are_sequential() {
        let text = (0..10)
            .map(|i| format!("Paragraph {i}"))
            .collect::<Vec<_>>()
            .join("\n\n");
        let config = ChunkConfig {
            chunk_size: 30,
            chunk_overlap: 0,
        };
        let chunks = split_into_chunks(&text, &test_source(), &config);
        for (i, chunk) in chunks.iter().enumerate() {
            assert_eq!(chunk.chunk_index, i, "chunk {i} has wrong index");
        }
    }

    #[test]
    fn deterministic_ids() {
        let text = "Hello world.\n\nSecond paragraph.";
        let config = ChunkConfig {
            chunk_size: 20,
            chunk_overlap: 0,
        };
        let chunks1 = split_into_chunks(text, &test_source(), &config);
        let chunks2 = split_into_chunks(text, &test_source(), &config);
        assert_eq!(chunks1.len(), chunks2.len());
        for (a, b) in chunks1.iter().zip(chunks2.iter()) {
            assert_eq!(a.id, b.id, "chunk IDs should be deterministic");
        }
    }

    #[test]
    fn different_sources_produce_different_ids() {
        let text = "Hello world.";
        let config = ChunkConfig::default();
        let src1 = DocumentSource {
            uri: "file1.md".into(),
            title: "F1".into(),
        };
        let src2 = DocumentSource {
            uri: "file2.md".into(),
            title: "F2".into(),
        };
        let c1 = split_into_chunks(text, &src1, &config);
        let c2 = split_into_chunks(text, &src2, &config);
        assert_ne!(c1[0].id, c2[0].id);
    }

    #[test]
    fn utf8_safe_chunking() {
        // Multi-byte characters must not be split mid-character. Slicing
        // inside a character would panic, and the checks below additionally
        // verify that every chunk consists solely of whole 'é' characters.
        let text = "é".repeat(600); // 2 bytes each = 1200 bytes, 600 chars
        let config = ChunkConfig {
            chunk_size: 100,
            chunk_overlap: 20,
        };
        let chunks = split_into_chunks(&text, &test_source(), &config);
        assert!(!chunks.is_empty());
        for chunk in &chunks {
            assert!(!chunk.content.is_empty(), "chunks must not be empty");
            assert!(
                chunk.content.chars().all(|c| c == 'é'),
                "chunk contains an unexpected character"
            );
            assert!(
                chunk.content.len() <= config.chunk_size,
                "chunk of {} bytes exceeds chunk_size",
                chunk.content.len()
            );
        }
    }

    #[test]
    fn chunk_config_defaults() {
        let config = ChunkConfig::default();
        assert_eq!(config.chunk_size, 1000);
        assert_eq!(config.chunk_overlap, 200);
    }

    #[test]
    fn large_single_paragraph_split() {
        let text = "x".repeat(3000);
        let config = ChunkConfig {
            chunk_size: 1000,
            chunk_overlap: 200,
        };
        let chunks = split_into_chunks(&text, &test_source(), &config);
        assert!(
            chunks.len() >= 3,
            "expected >= 3 chunks, got {}",
            chunks.len()
        );
        // Overlapping windows repeat bytes, so the summed chunk length must
        // be at least the original length if all content is covered.
        let total_len: usize = chunks.iter().map(|c| c.content.len()).sum();
        assert!(total_len >= 3000);
    }
}