//! Semantic chunking for embedding inputs (Markdown-aware, 512-token limit).
//!
//! Splits bodies using [`text_splitter::MarkdownSplitter`] with overlap so
//! multi-chunk memories preserve context across chunk boundaries.

// src/chunking.rs
// Token-based chunking for E5 model (512 token limit)

use crate::constants::{CHUNK_OVERLAP_TOKENS, CHUNK_SIZE_TOKENS, EMBEDDING_DIM};
use text_splitter::{ChunkConfig, MarkdownSplitter};
use tokenizers::Tokenizer;

/// Characters-per-token heuristic: 2 chars/token is deliberately conservative
/// to reduce the risk of underestimating real token counts in Markdown, code,
/// and multilingual text (the previous value of 4 chars/token allowed chunks
/// that were too large for some real documents).
const CHARS_PER_TOKEN: usize = 2;

/// Maximum character length of a single chunk (derived from token limit × chars-per-token).
pub const CHUNK_SIZE_CHARS: usize = CHUNK_SIZE_TOKENS * CHARS_PER_TOKEN;

/// Character overlap between consecutive chunks to preserve cross-boundary context.
pub const CHUNK_OVERLAP_CHARS: usize = CHUNK_OVERLAP_TOKENS * CHARS_PER_TOKEN;

/// A contiguous slice of a body string identified by byte offsets.
#[derive(Debug, Clone)]
pub struct Chunk {
    /// Byte offset where the chunk starts (inclusive).
    pub start_offset: usize,
    /// Byte offset where the chunk ends (exclusive).
    pub end_offset: usize,
    /// Approximate token count for this chunk (chars / `CHARS_PER_TOKEN`).
    pub token_count_approx: usize,
}

/// Returns `true` when `body` must be split before embedding.
///
/// Compares byte length against `CHUNK_SIZE_CHARS`; byte length is an upper
/// bound on character count, so multibyte text errs toward chunking.
pub fn needs_chunking(body: &str) -> bool {
    body.len() > CHUNK_SIZE_CHARS
}

/// Splits `body` into overlapping [`Chunk`]s using a character-based heuristic.
///
/// Short bodies (≤ `CHUNK_SIZE_CHARS` bytes) are returned as a single chunk.
/// Splits prefer paragraph breaks, then sentence-end punctuation, then word
/// boundaries, and always land on `char` boundaries. This function is
/// infallible and always returns at least one chunk.
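///
/// # Examples
/// A minimal usage sketch; marked `ignore` because the crate path
/// (`sqlite_graphrag`) and the concrete values of the chunk-size constants
/// are assumptions about the surrounding crate:
/// ```ignore
/// use sqlite_graphrag::chunking::{chunk_text, needs_chunking, split_into_chunks};
///
/// let body = "one sentence here. ".repeat(500);
/// if needs_chunking(&body) {
///     let chunks = split_into_chunks(&body);
///     // Every chunk is non-empty and addressable via its byte offsets.
///     assert!(chunks.len() > 1);
///     assert!(chunks.iter().all(|c| !chunk_text(&body, c).is_empty()));
/// }
/// ```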
pub fn split_into_chunks(body: &str) -> Vec<Chunk> {
    if !needs_chunking(body) {
        return vec![Chunk {
            token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
            start_offset: 0,
            end_offset: body.len(),
        }];
    }

    let mut chunks = Vec::new();
    let mut start = 0usize;

    while start < body.len() {
        start = next_char_boundary(body, start);
        let desired_end = previous_char_boundary(body, (start + CHUNK_SIZE_CHARS).min(body.len()));
        let end = if desired_end < body.len() {
            find_split_boundary(body, start, desired_end)
        } else {
            desired_end
        };

        // Guarantee a non-empty chunk even when no natural boundary is found:
        // fall back to a hard cut at the size limit, or take the rest of `body`.
        let end = if end <= start {
            let fallback = previous_char_boundary(body, (start + CHUNK_SIZE_CHARS).min(body.len()));
            if fallback > start {
                fallback
            } else {
                body.len()
            }
        } else {
            end
        };

        let token_count_approx = body[start..end].chars().count() / CHARS_PER_TOKEN;
        chunks.push(Chunk {
            start_offset: start,
            end_offset: end,
            token_count_approx,
        });

        if end >= body.len() {
            break;
        }

        // Step back by the overlap so consecutive chunks share context. The
        // `next_start <= start` arm guards against a stall when a chunk is no
        // longer than the overlap, so the loop always makes forward progress.
        let next_start = next_char_boundary(body, end.saturating_sub(CHUNK_OVERLAP_CHARS));
        start = if next_start >= end || next_start <= start {
            end
        } else {
            next_start
        };
    }

    chunks
}

/// Splits `body` into [`Chunk`]s using pre-computed token byte-offsets.
///
/// Each element of `token_offsets` is a `(start, end)` byte range for one token.
/// Respects the `CHUNK_SIZE_TOKENS` and `CHUNK_OVERLAP_TOKENS` constants.
/// Short bodies (≤ `CHUNK_SIZE_TOKENS` tokens) are returned as a single chunk.
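///
/// # Examples
/// A sketch of producing offsets with a `tokenizers::Tokenizer`; marked
/// `ignore` because the local `tokenizer.json` path is an assumption, and the
/// offsets must be byte-based to match this function's contract:
/// ```ignore
/// use tokenizers::Tokenizer;
///
/// let body = "A long Markdown body to embed...";
/// let tokenizer = Tokenizer::from_file("tokenizer.json").unwrap();
/// let encoding = tokenizer.encode(body, false).unwrap();
/// let offsets: Vec<(usize, usize)> = encoding.get_offsets().to_vec();
/// let chunks = split_into_chunks_by_token_offsets(body, &offsets);
/// ```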
pub fn split_into_chunks_by_token_offsets(
    body: &str,
    token_offsets: &[(usize, usize)],
) -> Vec<Chunk> {
    if token_offsets.len() <= CHUNK_SIZE_TOKENS {
        return vec![Chunk {
            token_count_approx: token_offsets.len(),
            start_offset: 0,
            end_offset: body.len(),
        }];
    }

    let mut chunks = Vec::new();
    let mut start_token = 0usize;

    while start_token < token_offsets.len() {
        let end_token = (start_token + CHUNK_SIZE_TOKENS).min(token_offsets.len());

        // Pin the first and last chunks to the body boundaries so text outside
        // any token range (e.g. leading/trailing whitespace) is not dropped.
        chunks.push(Chunk {
            start_offset: if start_token == 0 {
                0
            } else {
                token_offsets[start_token].0
            },
            end_offset: if end_token == token_offsets.len() {
                body.len()
            } else {
                token_offsets[end_token - 1].1
            },
            token_count_approx: end_token - start_token,
        });

        if end_token == token_offsets.len() {
            break;
        }

        // Step back by the token overlap, but never behind the current start,
        // so the loop always makes forward progress.
        let next_start = end_token.saturating_sub(CHUNK_OVERLAP_TOKENS);
        start_token = if next_start <= start_token {
            end_token
        } else {
            next_start
        };
    }

    chunks
}

/// Splits `body` into [`Chunk`]s using [`MarkdownSplitter`] with a real tokenizer.
///
/// Respects Markdown semantic boundaries (H1-H6 headings, paragraphs, blocks).
/// For plain text without Markdown markers, it falls back to paragraph and
/// sentence breaks.
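///
/// # Examples
/// A sketch; marked `ignore` because the tokenizer file path is an assumption:
/// ```ignore
/// use tokenizers::Tokenizer;
///
/// let tokenizer = Tokenizer::from_file("models/e5/tokenizer.json").unwrap();
/// let body = "# Title\n\nSome Markdown body...";
/// let chunks = split_into_chunks_hierarchical(body, &tokenizer);
/// // The splitter returns sub-slices of `body`, so offsets always fall on
/// // `char` boundaries.
/// assert!(chunks.iter().all(|c| body.is_char_boundary(c.start_offset)));
/// ```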
pub fn split_into_chunks_hierarchical(body: &str, tokenizer: &Tokenizer) -> Vec<Chunk> {
    if body.is_empty() {
        return Vec::new();
    }

    let config = ChunkConfig::new(CHUNK_SIZE_TOKENS)
        .with_sizer(tokenizer)
        .with_overlap(CHUNK_OVERLAP_TOKENS)
        .expect(
            "compile-time invariant: CHUNK_OVERLAP_TOKENS must be smaller than CHUNK_SIZE_TOKENS",
        );

    let splitter = MarkdownSplitter::new(config);

    let items: Vec<(usize, &str)> = splitter.chunk_indices(body).collect();

    // Degenerate case (e.g. whitespace-only body): fall back to one chunk
    // spanning the whole body so callers always get an embeddable range.
    if items.is_empty() {
        return vec![Chunk {
            start_offset: 0,
            end_offset: body.len(),
            token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
        }];
    }

    items
        .into_iter()
        .map(|(start, text)| {
            let end = start + text.len();
            Chunk {
                start_offset: start,
                end_offset: end,
                token_count_approx: text.chars().count() / CHARS_PER_TOKEN,
            }
        })
        .collect()
}

/// Returns the string slice of `body` described by `chunk`'s byte offsets.
///
/// # Panics
/// Panics if the offsets are out of bounds for `body` or do not fall on
/// `char` boundaries; chunks produced by this module from the same `body`
/// always satisfy both.
pub fn chunk_text<'a>(body: &'a str, chunk: &Chunk) -> &'a str {
    &body[chunk.start_offset..chunk.end_offset]
}

/// Finds the best split point in `body[start..desired_end]`: prefers the last
/// paragraph break, then the last sentence end, then the last space; falls
/// back to `desired_end` when no boundary exists.
fn find_split_boundary(body: &str, start: usize, desired_end: usize) -> usize {
    let slice = &body[start..desired_end];
    if let Some(pos) = slice.rfind("\n\n") {
        return start + pos + 2;
    }
    if let Some(pos) = slice.rfind(". ") {
        return start + pos + 2;
    }
    if let Some(pos) = slice.rfind(' ') {
        return start + pos + 1;
    }
    desired_end
}

/// Clamps `idx` to `body.len()` and walks backwards to the nearest `char` boundary.
fn previous_char_boundary(body: &str, mut idx: usize) -> usize {
    idx = idx.min(body.len());
    while idx > 0 && !body.is_char_boundary(idx) {
        idx -= 1;
    }
    idx
}

/// Clamps `idx` to `body.len()` and walks forwards to the nearest `char` boundary.
fn next_char_boundary(body: &str, mut idx: usize) -> usize {
    idx = idx.min(body.len());
    while idx < body.len() && !body.is_char_boundary(idx) {
        idx += 1;
    }
    idx
}

/// Computes the mean of `chunk_embeddings` and L2-normalizes the result.
///
/// Returns a zero-vector of length `EMBEDDING_DIM` when the input is empty.
/// When a single embedding is provided, a clone of it is returned unchanged
/// (no averaging or re-normalization takes place).
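///
/// # Examples
/// Mean-pooling two orthogonal unit vectors; marked `ignore` because the
/// crate path a doctest would need is an assumption:
/// ```ignore
/// let embs = vec![vec![1.0f32, 0.0], vec![0.0f32, 1.0]];
/// let agg = aggregate_embeddings(&embs);
/// // The mean is [0.5, 0.5]; after L2 normalization each component becomes
/// // 1/sqrt(2) (roughly 0.7071), so the aggregate has unit norm.
/// assert!((agg[0] - 0.5f32.sqrt()).abs() < 1e-5);
/// assert!((agg[1] - 0.5f32.sqrt()).abs() < 1e-5);
/// ```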
pub fn aggregate_embeddings(chunk_embeddings: &[Vec<f32>]) -> Vec<f32> {
    if chunk_embeddings.is_empty() {
        return vec![0.0f32; EMBEDDING_DIM];
    }
    if chunk_embeddings.len() == 1 {
        return chunk_embeddings[0].clone();
    }

    // Element-wise mean across all chunk embeddings.
    let dim = chunk_embeddings[0].len();
    let mut mean = vec![0.0f32; dim];
    for emb in chunk_embeddings {
        for (i, v) in emb.iter().enumerate() {
            mean[i] += v;
        }
    }
    let n = chunk_embeddings.len() as f32;
    for v in &mut mean {
        *v /= n;
    }

    // L2-normalize, guarding against division by (near-)zero.
    let norm: f32 = mean.iter().map(|x| x * x).sum::<f32>().sqrt();
    if norm > 1e-9 {
        for v in &mut mean {
            *v /= norm;
        }
    }
    mean
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_short_body_no_chunking() {
        let body = "short text";
        assert!(!needs_chunking(body));
        let chunks = split_into_chunks(body);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunk_text(body, &chunks[0]), body);
    }

    #[test]
    fn test_long_body_produces_multiple_chunks() {
        let body = "word ".repeat(1000);
        assert!(needs_chunking(&body));
        let chunks = split_into_chunks(&body);
        assert!(chunks.len() > 1);
        assert!(chunks.iter().all(|c| !chunk_text(&body, c).is_empty()));
    }

    #[test]
    fn split_by_token_offsets_respects_limit_and_overlap() {
        let body = "ab".repeat(460);
        let offsets: Vec<(usize, usize)> = (0..460)
            .map(|i| {
                let start = i * 2;
                (start, start + 2)
            })
            .collect();

        let chunks = split_into_chunks_by_token_offsets(&body, &offsets);
        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].token_count_approx, CHUNK_SIZE_TOKENS);
        assert_eq!(chunks[1].token_count_approx, 110);
        assert_eq!(chunks[0].start_offset, 0);
        assert_eq!(
            chunks[1].start_offset,
            offsets[CHUNK_SIZE_TOKENS - CHUNK_OVERLAP_TOKENS].0
        );
    }

    #[test]
    fn split_by_token_offsets_returns_one_chunk_when_fits() {
        let body = "short text";
        let offsets = vec![(0, 5), (6, 10)];
        let chunks = split_into_chunks_by_token_offsets(body, &offsets);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].start_offset, 0);
        assert_eq!(chunks[0].end_offset, body.len());
        assert_eq!(chunks[0].token_count_approx, 2);
    }

    #[test]
    fn test_multibyte_body_preserves_progress_and_boundaries() {
        // Multibyte body intentionally includes 2-byte UTF-8 sequences (Latin-1 supplement)
        // expressed as Unicode escapes so this source file remains ASCII-only per the
        // language policy. The original PT-BR phrase "a\u{e7}\u{e3}o \u{fa}til " is preserved
        // since the test exercises UTF-8 char-boundary handling.
        let body = "a\u{e7}\u{e3}o \u{fa}til ".repeat(1000);
        let chunks = split_into_chunks(&body);
        assert!(chunks.len() > 1);
        for chunk in &chunks {
            assert!(!chunk_text(&body, chunk).is_empty());
            assert!(body.is_char_boundary(chunk.start_offset));
            assert!(body.is_char_boundary(chunk.end_offset));
            assert!(chunk.end_offset > chunk.start_offset);
        }
        for pair in chunks.windows(2) {
            assert!(pair[1].start_offset >= pair[0].start_offset);
            assert!(pair[1].end_offset > pair[0].start_offset);
        }
    }

    #[test]
    fn test_aggregate_embeddings_normalizes() {
        let embs = vec![vec![1.0f32, 0.0], vec![0.0f32, 1.0]];
        let agg = aggregate_embeddings(&embs);
        let norm: f32 = agg.iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!((norm - 1.0).abs() < 1e-5);
    }

    /// Test-local variant of `split_into_chunks_hierarchical` that sizes
    /// chunks by characters, so the Markdown-boundary tests below do not
    /// need a real tokenizer file.
    fn split_hier_chars(body: &str, size: usize) -> Vec<Chunk> {
        use text_splitter::{Characters, ChunkConfig, MarkdownSplitter};
        if body.is_empty() {
            return Vec::new();
        }
        let config = ChunkConfig::new(size)
            .with_sizer(Characters)
            .with_overlap(0)
            .expect("overlap must be smaller than size");
        let splitter = MarkdownSplitter::new(config);
        let items: Vec<(usize, &str)> = splitter.chunk_indices(body).collect();
        if items.is_empty() {
            return vec![Chunk {
                start_offset: 0,
                end_offset: body.len(),
                token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
            }];
        }
        items
            .into_iter()
            .map(|(start, text)| {
                let end = start + text.len();
                Chunk {
                    start_offset: start,
                    end_offset: end,
                    token_count_approx: text.chars().count() / CHARS_PER_TOKEN,
                }
            })
            .collect()
    }

    #[test]
    fn test_hierarchical_empty_body_returns_empty() {
        use text_splitter::{Characters, ChunkConfig, MarkdownSplitter};
        let config = ChunkConfig::new(100)
            .with_sizer(Characters)
            .with_overlap(0)
            .expect("overlap < size");
        let splitter = MarkdownSplitter::new(config);
        let result: Vec<_> = splitter.chunk_indices("").collect();
        assert!(result.is_empty());
    }

    #[test]
    fn test_markdown_h1_boundary_yields_two_chunks() {
        let body = "# Title 1\n\nbody1 body1 body1 body1 body1 body1\n\n# Title 2\n\nbody2 body2 body2 body2 body2 body2";
        let chunks = split_hier_chars(body, 30);
        assert!(
            chunks.len() >= 2,
            "expected >=2 chunks, got {}",
            chunks.len()
        );
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
        }
    }

    #[test]
    fn test_markdown_h2_nested_respects_boundaries() {
        let body = "# H1\n\n## H2a\n\nParagraph A with enough text to force a split.\n\n## H2b\n\nParagraph B with enough text to force a split as well.";
        let chunks = split_hier_chars(body, 40);
        assert!(!chunks.is_empty());
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
            assert!(c.end_offset > c.start_offset);
            assert!(c.end_offset <= body.len());
        }
    }

    #[test]
    fn test_markdown_paragraph_soft_boundary() {
        let para = "Plain text sentence used to fill the paragraph. ";
        let body = format!(
            "{}\n\n{}\n\n{}",
            para.repeat(3),
            para.repeat(3),
            para.repeat(3)
        );
        let chunks = split_hier_chars(&body, 80);
        assert!(
            chunks.len() >= 2,
            "expected >=2 chunks with a body of {} chars",
            body.len()
        );
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
        }
    }

    #[test]
    fn test_markdown_50kb_valid_offsets() {
        let block = "# Section\n\nBlock content text. ".repeat(1700);
        assert!(
            block.len() > 50_000,
            "body must be >50KB, has {} bytes",
            block.len()
        );
        let chunks = split_hier_chars(&block, 256);
        assert!(chunks.len() > 1);
        for c in &chunks {
            assert!(block.is_char_boundary(c.start_offset));
            assert!(block.is_char_boundary(c.end_offset));
            assert!(c.end_offset > c.start_offset);
            assert!(!chunk_text(&block, c).is_empty());
        }
    }

    #[test]
    fn test_fallback_plain_text_without_markers() {
        let body = "a ".repeat(1000);
        let chunks = split_hier_chars(&body, 100);
        assert!(!chunks.is_empty());
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
        }
    }
}