sqlite_graphrag/
chunking.rs

1//! Semantic chunking for embedding inputs (Markdown-aware, 512-token limit).
2//!
3//! Splits bodies using [`text_splitter::MarkdownSplitter`] with overlap so
4//! multi-chunk memories preserve context across chunk boundaries.
5
6// src/chunking.rs
7// Token-based chunking for E5 model (512 token limit)
8
9use crate::constants::{CHUNK_OVERLAP_TOKENS, CHUNK_SIZE_TOKENS, EMBEDDING_DIM};
10use text_splitter::{ChunkConfig, MarkdownSplitter};
11
12// Conservative heuristic to reduce the risk of underestimating the real token count
13// in Markdown, code, and multilingual text. The previous value (4 chars/token) allowed
14// chunks that were too large for some real documents.
15/// Characters per token heuristic: 2 chars/token reduces the risk of underestimating
16/// real token counts in Markdown, code, and multilingual text.
17const CHARS_PER_TOKEN: usize = 2;
18
19/// Maximum character length of a single chunk (derived from token limit × chars-per-token).
20pub const CHUNK_SIZE_CHARS: usize = CHUNK_SIZE_TOKENS * CHARS_PER_TOKEN;
21
22/// Character overlap between consecutive chunks to preserve cross-boundary context.
23pub const CHUNK_OVERLAP_CHARS: usize = CHUNK_OVERLAP_TOKENS * CHARS_PER_TOKEN;
24
25/// A contiguous slice of a body string identified by byte offsets.
26#[derive(Debug, Clone)]
27pub struct Chunk {
28    /// Byte offset of the first character (inclusive).
29    pub start_offset: usize,
30    /// Byte offset past the last character (exclusive).
31    pub end_offset: usize,
32    /// Approximate token count for this chunk (chars / `CHARS_PER_TOKEN`).
33    pub token_count_approx: usize,
34}
35
36/// Returns `true` when `body` exceeds `CHUNK_SIZE_CHARS` and must be split.
37pub fn needs_chunking(body: &str) -> bool {
38    body.len() > CHUNK_SIZE_CHARS
39}
40
41/// Splits `body` into overlapping [`Chunk`]s using a character-based heuristic.
42///
43/// Short bodies (≤ `CHUNK_SIZE_CHARS`) are returned as a single chunk.
44/// Splits prefer paragraph breaks, then sentence-end punctuation, then word boundaries.
45///
46/// # Errors
47/// This function is infallible; it returns a `Vec` directly.
48pub fn split_into_chunks(body: &str) -> Vec<Chunk> {
49    if !needs_chunking(body) {
50        return vec![Chunk {
51            token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
52            start_offset: 0,
53            end_offset: body.len(),
54        }];
55    }
56
57    let mut chunks = Vec::with_capacity(body.len() / CHUNK_SIZE_CHARS + 1);
58    let mut start = 0usize;
59
60    while start < body.len() {
61        start = next_char_boundary(body, start);
62        let desired_end = previous_char_boundary(body, (start + CHUNK_SIZE_CHARS).min(body.len()));
63        let end = if desired_end < body.len() {
64            find_split_boundary(body, start, desired_end)
65        } else {
66            desired_end
67        };
68
69        let end = if end <= start {
70            let fallback = previous_char_boundary(body, (start + CHUNK_SIZE_CHARS).min(body.len()));
71            if fallback > start {
72                fallback
73            } else {
74                body.len()
75            }
76        } else {
77            end
78        };
79
80        let token_count_approx = body[start..end].chars().count() / CHARS_PER_TOKEN;
81        chunks.push(Chunk {
82            start_offset: start,
83            end_offset: end,
84            token_count_approx,
85        });
86
87        if end >= body.len() {
88            break;
89        }
90
91        let next_start = next_char_boundary(body, end.saturating_sub(CHUNK_OVERLAP_CHARS));
92        start = if next_start >= end { end } else { next_start };
93    }
94
95    chunks
96}
97
98/// Splits `body` into [`Chunk`]s using pre-computed token byte-offsets.
99///
100/// Each element of `token_offsets` is a `(start, end)` byte range for one token.
101/// Respects `CHUNK_SIZE_TOKENS` and `CHUNK_OVERLAP_TOKENS` constants.
102/// Short bodies (≤ `CHUNK_SIZE_TOKENS` tokens) are returned as a single chunk.
103pub fn split_into_chunks_by_token_offsets(
104    body: &str,
105    token_offsets: &[(usize, usize)],
106) -> Vec<Chunk> {
107    if token_offsets.len() <= CHUNK_SIZE_TOKENS {
108        return vec![Chunk {
109            token_count_approx: token_offsets.len(),
110            start_offset: 0,
111            end_offset: body.len(),
112        }];
113    }
114
115    let mut chunks = Vec::with_capacity(token_offsets.len() / CHUNK_SIZE_TOKENS + 1);
116    let mut start_token = 0usize;
117
118    while start_token < token_offsets.len() {
119        let end_token = (start_token + CHUNK_SIZE_TOKENS).min(token_offsets.len());
120
121        chunks.push(Chunk {
122            start_offset: if start_token == 0 {
123                0
124            } else {
125                token_offsets[start_token].0
126            },
127            end_offset: if end_token == token_offsets.len() {
128                body.len()
129            } else {
130                token_offsets[end_token - 1].1
131            },
132            token_count_approx: end_token - start_token,
133        });
134
135        if end_token == token_offsets.len() {
136            break;
137        }
138
139        let next_start = end_token.saturating_sub(CHUNK_OVERLAP_TOKENS);
140        start_token = if next_start <= start_token {
141            end_token
142        } else {
143            next_start
144        };
145    }
146
147    chunks
148}
149
150/// Splits body into chunks using MarkdownSplitter with a real tokenizer.
151/// Respects Markdown semantic boundaries (H1-H6, paragraphs, blocks).
152/// For plain text without Markdown markers, falls back to paragraph and sentence breaks.
153///
154/// v1.0.76: the `tokenizer` parameter was removed. The chunker now uses the
155/// char-based heuristic (`CHARS_PER_TOKEN = 2`) which is the same heuristic
156/// the rest of the codebase uses for `Chunk::token_count_approx`.
157pub fn split_into_chunks_hierarchical(body: &str) -> Vec<Chunk> {
158    if body.is_empty() {
159        return Vec::new();
160    }
161
162    // v1.0.76: text_splitter 0.30.1's `ChunkConfig::new` defaults to a
163    // char-count sizer when no explicit sizer is set. The default chunk
164    // size is in characters, not tokens, so we scale `CHUNK_SIZE_TOKENS`
165    // by `CHARS_PER_TOKEN` to keep the chunk size roughly equivalent.
166    let char_chunk_size = CHUNK_SIZE_TOKENS * CHARS_PER_TOKEN;
167    let char_overlap = CHUNK_OVERLAP_TOKENS * CHARS_PER_TOKEN;
168    let config = ChunkConfig::new(char_chunk_size)
169        .with_overlap(char_overlap)
170        .expect("compile-time invariant: CHUNK_OVERLAP must be smaller than chunk size");
171
172    let splitter = MarkdownSplitter::new(config);
173
174    let items: Vec<(usize, &str)> = splitter.chunk_indices(body).collect();
175
176    if items.is_empty() {
177        return vec![Chunk {
178            start_offset: 0,
179            end_offset: body.len(),
180            token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
181        }];
182    }
183
184    items
185        .into_iter()
186        .map(|(start, text)| {
187            let end = start + text.len();
188            Chunk {
189                start_offset: start,
190                end_offset: end,
191                token_count_approx: text.chars().count() / CHARS_PER_TOKEN,
192            }
193        })
194        .collect()
195}
196
197/// Returns the string slice of `body` described by `chunk`'s byte offsets.
198pub fn chunk_text<'a>(body: &'a str, chunk: &Chunk) -> &'a str {
199    &body[chunk.start_offset..chunk.end_offset]
200}
201
202fn find_split_boundary(body: &str, start: usize, desired_end: usize) -> usize {
203    let slice = &body[start..desired_end];
204    if let Some(pos) = slice.rfind("\n\n") {
205        return start + pos + 2;
206    }
207    if let Some(pos) = slice.rfind(". ") {
208        return start + pos + 2;
209    }
210    if let Some(pos) = slice.rfind(' ') {
211        return start + pos + 1;
212    }
213    desired_end
214}
215
216fn previous_char_boundary(body: &str, mut idx: usize) -> usize {
217    idx = idx.min(body.len());
218    while idx > 0 && !body.is_char_boundary(idx) {
219        idx -= 1;
220    }
221    idx
222}
223
224fn next_char_boundary(body: &str, mut idx: usize) -> usize {
225    idx = idx.min(body.len());
226    while idx < body.len() && !body.is_char_boundary(idx) {
227        idx += 1;
228    }
229    idx
230}
231
232/// Computes the mean of `chunk_embeddings` and L2-normalizes the result.
233///
234/// Returns a zero-vector of length `EMBEDDING_DIM` when the input is empty.
235/// When a single embedding is provided it is returned as-is (no copy).
236pub fn aggregate_embeddings(chunk_embeddings: &[Vec<f32>]) -> Vec<f32> {
237    if chunk_embeddings.is_empty() {
238        return vec![0.0f32; EMBEDDING_DIM];
239    }
240    if chunk_embeddings.len() == 1 {
241        return chunk_embeddings[0].clone();
242    }
243
244    let dim = chunk_embeddings[0].len();
245    let mut mean = vec![0.0f32; dim];
246    for emb in chunk_embeddings {
247        for (i, v) in emb.iter().enumerate() {
248            mean[i] += v;
249        }
250    }
251    let n = chunk_embeddings.len() as f32;
252    for v in &mut mean {
253        *v /= n;
254    }
255
256    let norm: f32 = mean.iter().map(|x| x * x).sum::<f32>().sqrt();
257    if norm > 1e-9 {
258        for v in &mut mean {
259            *v /= norm;
260        }
261    }
262    mean
263}
264
265#[cfg(test)]
266mod tests {
267    use super::*;
268
269    #[test]
270    fn test_short_body_no_chunking() {
271        let body = "short text";
272        assert!(!needs_chunking(body));
273        let chunks = split_into_chunks(body);
274        assert_eq!(chunks.len(), 1);
275        assert_eq!(chunk_text(body, &chunks[0]), body);
276    }
277
278    #[test]
279    fn test_long_body_produces_multiple_chunks() {
280        let body = "word ".repeat(1000);
281        assert!(needs_chunking(&body));
282        let chunks = split_into_chunks(&body);
283        assert!(chunks.len() > 1);
284        assert!(chunks.iter().all(|c| !chunk_text(&body, c).is_empty()));
285    }
286
287    #[test]
288    fn split_by_token_offsets_respeita_limite_e_overlap() {
289        let body = "ab".repeat(460);
290        let offsets: Vec<(usize, usize)> = (0..460)
291            .map(|i| {
292                let start = i * 2;
293                (start, start + 2)
294            })
295            .collect();
296
297        let chunks = split_into_chunks_by_token_offsets(&body, &offsets);
298        assert_eq!(chunks.len(), 2);
299        assert_eq!(chunks[0].token_count_approx, CHUNK_SIZE_TOKENS);
300        assert_eq!(chunks[1].token_count_approx, 110);
301        assert_eq!(chunks[0].start_offset, 0);
302        assert_eq!(
303            chunks[1].start_offset,
304            offsets[CHUNK_SIZE_TOKENS - CHUNK_OVERLAP_TOKENS].0
305        );
306    }
307
308    #[test]
309    fn split_by_token_offsets_returns_one_chunk_when_fits() {
310        let body = "texto curto";
311        let offsets = vec![(0, 5), (6, 11)];
312        let chunks = split_into_chunks_by_token_offsets(body, &offsets);
313        assert_eq!(chunks.len(), 1);
314        assert_eq!(chunks[0].start_offset, 0);
315        assert_eq!(chunks[0].end_offset, body.len());
316        assert_eq!(chunks[0].token_count_approx, 2);
317    }
318
319    #[test]
320    fn test_multibyte_body_preserves_progress_and_boundaries() {
321        // Multibyte body intentionally includes 2-byte UTF-8 sequences (Latin-1 supplement)
322        // expressed as Unicode escapes so this source file remains ASCII-only per the
323        // language policy. The original PT-BR phrase "a\u{e7}\u{e3}o \u{fa}til " is preserved
324        // since the test exercises UTF-8 char-boundary handling.
325        let body = "a\u{e7}\u{e3}o \u{fa}til ".repeat(1000);
326        let chunks = split_into_chunks(&body);
327        assert!(chunks.len() > 1);
328        for chunk in &chunks {
329            assert!(!chunk_text(&body, chunk).is_empty());
330            assert!(body.is_char_boundary(chunk.start_offset));
331            assert!(body.is_char_boundary(chunk.end_offset));
332            assert!(chunk.end_offset > chunk.start_offset);
333        }
334        for pair in chunks.windows(2) {
335            assert!(pair[1].start_offset >= pair[0].start_offset);
336            assert!(pair[1].end_offset > pair[0].start_offset);
337        }
338    }
339
340    #[test]
341    fn test_aggregate_embeddings_normalizes() {
342        let embs = vec![vec![1.0f32, 0.0], vec![0.0f32, 1.0]];
343        let agg = aggregate_embeddings(&embs);
344        let norm: f32 = agg.iter().map(|x| x * x).sum::<f32>().sqrt();
345        assert!((norm - 1.0).abs() < 1e-5);
346    }
347
348    fn split_hier_chars(body: &str, size: usize) -> Vec<Chunk> {
349        use text_splitter::{Characters, ChunkConfig, MarkdownSplitter};
350        if body.is_empty() {
351            return Vec::new();
352        }
353        let config = ChunkConfig::new(size)
354            .with_sizer(Characters)
355            .with_overlap(0)
356            .expect("overlap must be smaller than size");
357        let splitter = MarkdownSplitter::new(config);
358        let items: Vec<(usize, &str)> = splitter.chunk_indices(body).collect();
359        if items.is_empty() {
360            return vec![Chunk {
361                start_offset: 0,
362                end_offset: body.len(),
363                token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
364            }];
365        }
366        items
367            .into_iter()
368            .map(|(start, text)| {
369                let end = start + text.len();
370                Chunk {
371                    start_offset: start,
372                    end_offset: end,
373                    token_count_approx: text.chars().count() / CHARS_PER_TOKEN,
374                }
375            })
376            .collect()
377    }
378
379    #[test]
380    fn test_hierarchical_empty_body_returns_empty() {
381        use text_splitter::{Characters, ChunkConfig, MarkdownSplitter};
382        let config = ChunkConfig::new(100)
383            .with_sizer(Characters)
384            .with_overlap(0)
385            .expect("overlap < size");
386        let splitter = MarkdownSplitter::new(config);
387        let result: Vec<_> = splitter.chunk_indices("").collect();
388        assert!(result.is_empty());
389    }
390
391    #[test]
392    fn test_markdown_h1_boundary_yields_two_chunks() {
393        let body = "# Title 1\n\nbody1 body1 body1 body1 body1 body1\n\n# Title 2\n\nbody2 body2 body2 body2 body2 body2";
394        let chunks = split_hier_chars(body, 30);
395        assert!(
396            chunks.len() >= 2,
397            "expected >=2 chunks, got {}",
398            chunks.len()
399        );
400        for c in &chunks {
401            assert!(body.is_char_boundary(c.start_offset));
402            assert!(body.is_char_boundary(c.end_offset));
403        }
404    }
405
406    #[test]
407    fn test_markdown_h2_nested_respects_boundaries() {
408        let body = "# H1\n\n## H2a\n\nParagraph A with enough text to force a split.\n\n## H2b\n\nParagraph B with enough text to force a split as well.";
409        let chunks = split_hier_chars(body, 40);
410        assert!(!chunks.is_empty());
411        for c in &chunks {
412            assert!(body.is_char_boundary(c.start_offset));
413            assert!(body.is_char_boundary(c.end_offset));
414            assert!(c.end_offset > c.start_offset);
415            assert!(c.end_offset <= body.len());
416        }
417    }
418
419    #[test]
420    fn test_markdown_paragraph_soft_boundary() {
421        let para = "Plain text sentence used to fill the paragraph. ";
422        let body = format!(
423            "{}\n\n{}\n\n{}",
424            para.repeat(3),
425            para.repeat(3),
426            para.repeat(3)
427        );
428        let chunks = split_hier_chars(&body, 80);
429        assert!(
430            chunks.len() >= 2,
431            "expected >=2 chunks with a body of {} chars",
432            body.len()
433        );
434        for c in &chunks {
435            assert!(body.is_char_boundary(c.start_offset));
436            assert!(body.is_char_boundary(c.end_offset));
437        }
438    }
439
440    #[test]
441    fn test_markdown_60kb_valid_offsets() {
442        let block = "# Section\n\nBlock content text. ".repeat(1700);
443        assert!(
444            block.len() > 50_000,
445            "body must be >50KB, has {} bytes",
446            block.len()
447        );
448        let chunks = split_hier_chars(&block, 256);
449        assert!(chunks.len() > 1);
450        for c in &chunks {
451            assert!(block.is_char_boundary(c.start_offset));
452            assert!(block.is_char_boundary(c.end_offset));
453            assert!(c.end_offset > c.start_offset);
454            assert!(!chunk_text(&block, c).is_empty());
455        }
456    }
457
458    #[test]
459    fn test_fallback_plain_text_without_markers() {
460        let body = "a ".repeat(1000);
461        let chunks = split_hier_chars(&body, 100);
462        assert!(!chunks.is_empty());
463        for c in &chunks {
464            assert!(body.is_char_boundary(c.start_offset));
465            assert!(body.is_char_boundary(c.end_offset));
466        }
467    }
468}
sqlite_graphrag/chunking.rs

sqlite_graphrag/
chunking.rs