Skip to main content

sqlite_graphrag/
chunking.rs

1//! Semantic chunking for embedding inputs (Markdown-aware, 512-token limit).
2//!
3//! Splits bodies using [`text_splitter::MarkdownSplitter`] with overlap so
4//! multi-chunk memories preserve context across chunk boundaries.
5
6// src/chunking.rs
7// Token-based chunking for E5 model (512 token limit)
8
9use crate::constants::{CHUNK_OVERLAP_TOKENS, CHUNK_SIZE_TOKENS, EMBEDDING_DIM};
10use text_splitter::{ChunkConfig, MarkdownSplitter};
11use tokenizers::Tokenizer;
12
13// Conservative heuristic to reduce the risk of underestimating the real token count
14// in Markdown, code, and multilingual text. The previous value (4 chars/token) allowed
15// chunks that were too large for some real documents.
16const CHARS_PER_TOKEN: usize = 2;
17pub const CHUNK_SIZE_CHARS: usize = CHUNK_SIZE_TOKENS * CHARS_PER_TOKEN;
18pub const CHUNK_OVERLAP_CHARS: usize = CHUNK_OVERLAP_TOKENS * CHARS_PER_TOKEN;
19
20#[derive(Debug, Clone)]
21pub struct Chunk {
22    pub start_offset: usize,
23    pub end_offset: usize,
24    pub token_count_approx: usize,
25}
26
27pub fn needs_chunking(body: &str) -> bool {
28    body.len() > CHUNK_SIZE_CHARS
29}
30
31pub fn split_into_chunks(body: &str) -> Vec<Chunk> {
32    if !needs_chunking(body) {
33        return vec![Chunk {
34            token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
35            start_offset: 0,
36            end_offset: body.len(),
37        }];
38    }
39
40    let mut chunks = Vec::new();
41    let mut start = 0usize;
42
43    while start < body.len() {
44        start = next_char_boundary(body, start);
45        let desired_end = previous_char_boundary(body, (start + CHUNK_SIZE_CHARS).min(body.len()));
46        let end = if desired_end < body.len() {
47            find_split_boundary(body, start, desired_end)
48        } else {
49            desired_end
50        };
51
52        let end = if end <= start {
53            let fallback = previous_char_boundary(body, (start + CHUNK_SIZE_CHARS).min(body.len()));
54            if fallback > start {
55                fallback
56            } else {
57                body.len()
58            }
59        } else {
60            end
61        };
62
63        let token_count_approx = body[start..end].chars().count() / CHARS_PER_TOKEN;
64        chunks.push(Chunk {
65            start_offset: start,
66            end_offset: end,
67            token_count_approx,
68        });
69
70        if end >= body.len() {
71            break;
72        }
73
74        let next_start = next_char_boundary(body, end.saturating_sub(CHUNK_OVERLAP_CHARS));
75        start = if next_start >= end { end } else { next_start };
76    }
77
78    chunks
79}
80
81pub fn split_into_chunks_by_token_offsets(
82    body: &str,
83    token_offsets: &[(usize, usize)],
84) -> Vec<Chunk> {
85    if token_offsets.len() <= CHUNK_SIZE_TOKENS {
86        return vec![Chunk {
87            token_count_approx: token_offsets.len(),
88            start_offset: 0,
89            end_offset: body.len(),
90        }];
91    }
92
93    let mut chunks = Vec::new();
94    let mut start_token = 0usize;
95
96    while start_token < token_offsets.len() {
97        let end_token = (start_token + CHUNK_SIZE_TOKENS).min(token_offsets.len());
98
99        chunks.push(Chunk {
100            start_offset: if start_token == 0 {
101                0
102            } else {
103                token_offsets[start_token].0
104            },
105            end_offset: if end_token == token_offsets.len() {
106                body.len()
107            } else {
108                token_offsets[end_token - 1].1
109            },
110            token_count_approx: end_token - start_token,
111        });
112
113        if end_token == token_offsets.len() {
114            break;
115        }
116
117        let next_start = end_token.saturating_sub(CHUNK_OVERLAP_TOKENS);
118        start_token = if next_start <= start_token {
119            end_token
120        } else {
121            next_start
122        };
123    }
124
125    chunks
126}
127
128/// Splits body into chunks using MarkdownSplitter with a real tokenizer.
129/// Respects Markdown semantic boundaries (H1-H6, paragraphs, blocks).
130/// For plain text without Markdown markers, falls back to paragraph and sentence breaks.
131pub fn split_into_chunks_hierarchical(body: &str, tokenizer: &Tokenizer) -> Vec<Chunk> {
132    if body.is_empty() {
133        return Vec::new();
134    }
135
136    let config = ChunkConfig::new(CHUNK_SIZE_TOKENS)
137        .with_sizer(tokenizer)
138        .with_overlap(CHUNK_OVERLAP_TOKENS)
139        .expect(
140            "compile-time invariant: CHUNK_OVERLAP_TOKENS must be smaller than CHUNK_SIZE_TOKENS",
141        );
142
143    let splitter = MarkdownSplitter::new(config);
144
145    let items: Vec<(usize, &str)> = splitter.chunk_indices(body).collect();
146
147    if items.is_empty() {
148        return vec![Chunk {
149            start_offset: 0,
150            end_offset: body.len(),
151            token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
152        }];
153    }
154
155    items
156        .into_iter()
157        .map(|(start, text)| {
158            let end = start + text.len();
159            Chunk {
160                start_offset: start,
161                end_offset: end,
162                token_count_approx: text.chars().count() / CHARS_PER_TOKEN,
163            }
164        })
165        .collect()
166}
167
168pub fn chunk_text<'a>(body: &'a str, chunk: &Chunk) -> &'a str {
169    &body[chunk.start_offset..chunk.end_offset]
170}
171
172fn find_split_boundary(body: &str, start: usize, desired_end: usize) -> usize {
173    let slice = &body[start..desired_end];
174    if let Some(pos) = slice.rfind("\n\n") {
175        return start + pos + 2;
176    }
177    if let Some(pos) = slice.rfind(". ") {
178        return start + pos + 2;
179    }
180    if let Some(pos) = slice.rfind(' ') {
181        return start + pos + 1;
182    }
183    desired_end
184}
185
186fn previous_char_boundary(body: &str, mut idx: usize) -> usize {
187    idx = idx.min(body.len());
188    while idx > 0 && !body.is_char_boundary(idx) {
189        idx -= 1;
190    }
191    idx
192}
193
194fn next_char_boundary(body: &str, mut idx: usize) -> usize {
195    idx = idx.min(body.len());
196    while idx < body.len() && !body.is_char_boundary(idx) {
197        idx += 1;
198    }
199    idx
200}
201
202pub fn aggregate_embeddings(chunk_embeddings: &[Vec<f32>]) -> Vec<f32> {
203    if chunk_embeddings.is_empty() {
204        return vec![0.0f32; EMBEDDING_DIM];
205    }
206    if chunk_embeddings.len() == 1 {
207        return chunk_embeddings[0].clone();
208    }
209
210    let dim = chunk_embeddings[0].len();
211    let mut mean = vec![0.0f32; dim];
212    for emb in chunk_embeddings {
213        for (i, v) in emb.iter().enumerate() {
214            mean[i] += v;
215        }
216    }
217    let n = chunk_embeddings.len() as f32;
218    for v in &mut mean {
219        *v /= n;
220    }
221
222    let norm: f32 = mean.iter().map(|x| x * x).sum::<f32>().sqrt();
223    if norm > 1e-9 {
224        for v in &mut mean {
225            *v /= norm;
226        }
227    }
228    mean
229}
230
231#[cfg(test)]
232mod tests {
233    use super::*;
234
235    #[test]
236    fn test_short_body_no_chunking() {
237        let body = "short text";
238        assert!(!needs_chunking(body));
239        let chunks = split_into_chunks(body);
240        assert_eq!(chunks.len(), 1);
241        assert_eq!(chunk_text(body, &chunks[0]), body);
242    }
243
244    #[test]
245    fn test_long_body_produces_multiple_chunks() {
246        let body = "word ".repeat(1000);
247        assert!(needs_chunking(&body));
248        let chunks = split_into_chunks(&body);
249        assert!(chunks.len() > 1);
250        assert!(chunks.iter().all(|c| !chunk_text(&body, c).is_empty()));
251    }
252
253    #[test]
254    fn split_by_token_offsets_respeita_limite_e_overlap() {
255        let body = "ab".repeat(460);
256        let offsets: Vec<(usize, usize)> = (0..460)
257            .map(|i| {
258                let start = i * 2;
259                (start, start + 2)
260            })
261            .collect();
262
263        let chunks = split_into_chunks_by_token_offsets(&body, &offsets);
264        assert_eq!(chunks.len(), 2);
265        assert_eq!(chunks[0].token_count_approx, CHUNK_SIZE_TOKENS);
266        assert_eq!(chunks[1].token_count_approx, 110);
267        assert_eq!(chunks[0].start_offset, 0);
268        assert_eq!(
269            chunks[1].start_offset,
270            offsets[CHUNK_SIZE_TOKENS - CHUNK_OVERLAP_TOKENS].0
271        );
272    }
273
274    #[test]
275    fn split_by_token_offsets_returns_one_chunk_when_fits() {
276        let body = "texto curto";
277        let offsets = vec![(0, 5), (6, 11)];
278        let chunks = split_into_chunks_by_token_offsets(body, &offsets);
279        assert_eq!(chunks.len(), 1);
280        assert_eq!(chunks[0].start_offset, 0);
281        assert_eq!(chunks[0].end_offset, body.len());
282        assert_eq!(chunks[0].token_count_approx, 2);
283    }
284
285    #[test]
286    fn test_multibyte_body_preserves_progress_and_boundaries() {
287        // Multibyte body intentionally includes 2-byte UTF-8 sequences (Latin-1 supplement)
288        // expressed as Unicode escapes so this source file remains ASCII-only per the
289        // language policy. The original PT-BR phrase "a\u{e7}\u{e3}o \u{fa}til " is preserved
290        // since the test exercises UTF-8 char-boundary handling.
291        let body = "a\u{e7}\u{e3}o \u{fa}til ".repeat(1000);
292        let chunks = split_into_chunks(&body);
293        assert!(chunks.len() > 1);
294        for chunk in &chunks {
295            assert!(!chunk_text(&body, chunk).is_empty());
296            assert!(body.is_char_boundary(chunk.start_offset));
297            assert!(body.is_char_boundary(chunk.end_offset));
298            assert!(chunk.end_offset > chunk.start_offset);
299        }
300        for pair in chunks.windows(2) {
301            assert!(pair[1].start_offset >= pair[0].start_offset);
302            assert!(pair[1].end_offset > pair[0].start_offset);
303        }
304    }
305
306    #[test]
307    fn test_aggregate_embeddings_normalizes() {
308        let embs = vec![vec![1.0f32, 0.0], vec![0.0f32, 1.0]];
309        let agg = aggregate_embeddings(&embs);
310        let norm: f32 = agg.iter().map(|x| x * x).sum::<f32>().sqrt();
311        assert!((norm - 1.0).abs() < 1e-5);
312    }
313
314    fn split_hier_chars(body: &str, size: usize) -> Vec<Chunk> {
315        use text_splitter::{Characters, ChunkConfig, MarkdownSplitter};
316        if body.is_empty() {
317            return Vec::new();
318        }
319        let config = ChunkConfig::new(size)
320            .with_sizer(Characters)
321            .with_overlap(0)
322            .expect("overlap must be smaller than size");
323        let splitter = MarkdownSplitter::new(config);
324        let items: Vec<(usize, &str)> = splitter.chunk_indices(body).collect();
325        if items.is_empty() {
326            return vec![Chunk {
327                start_offset: 0,
328                end_offset: body.len(),
329                token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
330            }];
331        }
332        items
333            .into_iter()
334            .map(|(start, text)| {
335                let end = start + text.len();
336                Chunk {
337                    start_offset: start,
338                    end_offset: end,
339                    token_count_approx: text.chars().count() / CHARS_PER_TOKEN,
340                }
341            })
342            .collect()
343    }
344
345    #[test]
346    fn test_hierarchical_empty_body_returns_empty() {
347        use text_splitter::{Characters, ChunkConfig, MarkdownSplitter};
348        let config = ChunkConfig::new(100)
349            .with_sizer(Characters)
350            .with_overlap(0)
351            .expect("overlap < size");
352        let splitter = MarkdownSplitter::new(config);
353        let result: Vec<_> = splitter.chunk_indices("").collect();
354        assert!(result.is_empty());
355    }
356
357    #[test]
358    fn test_markdown_h1_boundary_yields_two_chunks() {
359        let body = "# Title 1\n\nbody1 body1 body1 body1 body1 body1\n\n# Title 2\n\nbody2 body2 body2 body2 body2 body2";
360        let chunks = split_hier_chars(body, 30);
361        assert!(
362            chunks.len() >= 2,
363            "expected >=2 chunks, got {}",
364            chunks.len()
365        );
366        for c in &chunks {
367            assert!(body.is_char_boundary(c.start_offset));
368            assert!(body.is_char_boundary(c.end_offset));
369        }
370    }
371
372    #[test]
373    fn test_markdown_h2_nested_respects_boundaries() {
374        let body = "# H1\n\n## H2a\n\nParagraph A with enough text to force a split.\n\n## H2b\n\nParagraph B with enough text to force a split as well.";
375        let chunks = split_hier_chars(body, 40);
376        assert!(!chunks.is_empty());
377        for c in &chunks {
378            assert!(body.is_char_boundary(c.start_offset));
379            assert!(body.is_char_boundary(c.end_offset));
380            assert!(c.end_offset > c.start_offset);
381            assert!(c.end_offset <= body.len());
382        }
383    }
384
385    #[test]
386    fn test_markdown_paragraph_soft_boundary() {
387        let para = "Plain text sentence used to fill the paragraph. ";
388        let body = format!(
389            "{}\n\n{}\n\n{}",
390            para.repeat(3),
391            para.repeat(3),
392            para.repeat(3)
393        );
394        let chunks = split_hier_chars(&body, 80);
395        assert!(
396            chunks.len() >= 2,
397            "expected >=2 chunks with a body of {} chars",
398            body.len()
399        );
400        for c in &chunks {
401            assert!(body.is_char_boundary(c.start_offset));
402            assert!(body.is_char_boundary(c.end_offset));
403        }
404    }
405
406    #[test]
407    fn test_markdown_60kb_valid_offsets() {
408        let block = "# Section\n\nBlock content text. ".repeat(1700);
409        assert!(
410            block.len() > 50_000,
411            "body must be >50KB, has {} bytes",
412            block.len()
413        );
414        let chunks = split_hier_chars(&block, 256);
415        assert!(chunks.len() > 1);
416        for c in &chunks {
417            assert!(block.is_char_boundary(c.start_offset));
418            assert!(block.is_char_boundary(c.end_offset));
419            assert!(c.end_offset > c.start_offset);
420            assert!(!chunk_text(&block, c).is_empty());
421        }
422    }
423
424    #[test]
425    fn test_fallback_plain_text_without_markers() {
426        let body = "a ".repeat(1000);
427        let chunks = split_hier_chars(&body, 100);
428        assert!(!chunks.is_empty());
429        for c in &chunks {
430            assert!(body.is_char_boundary(c.start_offset));
431            assert!(body.is_char_boundary(c.end_offset));
432        }
433    }
434}