// sqlite_graphrag/chunking.rs

1//! Semantic chunking for embedding inputs (Markdown-aware, 512-token limit).
2//!
3//! Splits bodies using [`text_splitter::MarkdownSplitter`] with overlap so
4//! multi-chunk memories preserve context across chunk boundaries.
5
6// src/chunking.rs
7// Token-based chunking for E5 model (512 token limit)
8
9use crate::constants::{CHUNK_OVERLAP_TOKENS, CHUNK_SIZE_TOKENS, EMBEDDING_DIM};
10use text_splitter::{ChunkConfig, MarkdownSplitter};
11use tokenizers::Tokenizer;
12
// Conservative heuristic to reduce the risk of underestimating the real token
// count for Markdown, code, and multilingual text. The previous value of
// 4 chars/token allowed chunks that were too large for some real documents.
const CHARS_PER_TOKEN: usize = 2;
// Character-based budgets derived from the token budgets in `crate::constants`;
// used by the tokenizer-free splitter below.
pub const CHUNK_SIZE_CHARS: usize = CHUNK_SIZE_TOKENS * CHARS_PER_TOKEN;
pub const CHUNK_OVERLAP_CHARS: usize = CHUNK_OVERLAP_TOKENS * CHARS_PER_TOKEN;
19
/// A contiguous byte span of a source body selected for embedding.
///
/// Offsets are byte indices into the original `&str`, suitable for slicing
/// via [`chunk_text`]. Derives `PartialEq`/`Eq` so chunks can be compared
/// directly instead of field-by-field.
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Chunk {
    // Inclusive byte offset where the chunk starts.
    pub start_offset: usize,
    // Exclusive byte offset where the chunk ends.
    pub end_offset: usize,
    // Rough token count: chars / CHARS_PER_TOKEN for heuristic splitters,
    // or an exact count when produced from tokenizer offsets.
    pub token_count_approx: usize,
}
26
27pub fn needs_chunking(body: &str) -> bool {
28    body.len() > CHUNK_SIZE_CHARS
29}
30
/// Splits `body` into character-budgeted chunks with overlap, without a
/// tokenizer.
///
/// Each chunk covers roughly `CHUNK_SIZE_CHARS` bytes (a defensive tail case
/// may exceed it) and consecutive chunks overlap by about
/// `CHUNK_OVERLAP_CHARS` bytes. All offsets land on UTF-8 char boundaries;
/// splits prefer paragraph, sentence, then word breaks via
/// `find_split_boundary`.
pub fn split_into_chunks(body: &str) -> Vec<Chunk> {
    // Fast path: the whole body fits in a single chunk.
    if !needs_chunking(body) {
        return vec![Chunk {
            token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
            start_offset: 0,
            end_offset: body.len(),
        }];
    }

    let mut chunks = Vec::new();
    let mut start = 0usize;

    while start < body.len() {
        // Keep both edges of the window on char boundaries.
        start = next_char_boundary(body, start);
        let desired_end = previous_char_boundary(body, (start + CHUNK_SIZE_CHARS).min(body.len()));
        // Only hunt for a natural break when this is not the final chunk;
        // the last chunk simply ends at the end of the body.
        let end = if desired_end < body.len() {
            find_split_boundary(body, start, desired_end)
        } else {
            desired_end
        };

        // Defensive: if the boundary search failed to advance past `start`,
        // fall back to the raw window end, or take the whole remainder.
        let end = if end <= start {
            let fallback = previous_char_boundary(body, (start + CHUNK_SIZE_CHARS).min(body.len()));
            if fallback > start {
                fallback
            } else {
                body.len()
            }
        } else {
            end
        };

        let token_count_approx = body[start..end].chars().count() / CHARS_PER_TOKEN;
        chunks.push(Chunk {
            start_offset: start,
            end_offset: end,
            token_count_approx,
        });

        if end >= body.len() {
            break;
        }

        // Back up by the overlap for the next window; if backing up would
        // not advance past the previous end, continue without overlap so
        // forward progress is always guaranteed.
        let next_start = next_char_boundary(body, end.saturating_sub(CHUNK_OVERLAP_CHARS));
        start = if next_start >= end { end } else { next_start };
    }

    chunks
}
80
81pub fn split_into_chunks_by_token_offsets(
82    body: &str,
83    token_offsets: &[(usize, usize)],
84) -> Vec<Chunk> {
85    if token_offsets.len() <= CHUNK_SIZE_TOKENS {
86        return vec![Chunk {
87            token_count_approx: token_offsets.len(),
88            start_offset: 0,
89            end_offset: body.len(),
90        }];
91    }
92
93    let mut chunks = Vec::new();
94    let mut start_token = 0usize;
95
96    while start_token < token_offsets.len() {
97        let end_token = (start_token + CHUNK_SIZE_TOKENS).min(token_offsets.len());
98
99        chunks.push(Chunk {
100            start_offset: if start_token == 0 {
101                0
102            } else {
103                token_offsets[start_token].0
104            },
105            end_offset: if end_token == token_offsets.len() {
106                body.len()
107            } else {
108                token_offsets[end_token - 1].1
109            },
110            token_count_approx: end_token - start_token,
111        });
112
113        if end_token == token_offsets.len() {
114            break;
115        }
116
117        let next_start = end_token.saturating_sub(CHUNK_OVERLAP_TOKENS);
118        start_token = if next_start <= start_token {
119            end_token
120        } else {
121            next_start
122        };
123    }
124
125    chunks
126}
127
/// Splits body into chunks using MarkdownSplitter with a real tokenizer.
/// Respects Markdown semantic boundaries (H1-H6, paragraphs, blocks).
/// For plain text without Markdown markers, falls back to paragraph and sentence breaks.
///
/// Returns an empty `Vec` for an empty body. Note that `token_count_approx`
/// is still the character heuristic, not the tokenizer's exact count.
pub fn split_into_chunks_hierarchical(body: &str, tokenizer: &Tokenizer) -> Vec<Chunk> {
    if body.is_empty() {
        return Vec::new();
    }

    // Size chunks by real token counts, sharing CHUNK_OVERLAP_TOKENS of
    // context between consecutive chunks. `with_overlap` errors when the
    // overlap is not smaller than the chunk size, hence the expect.
    let config = ChunkConfig::new(CHUNK_SIZE_TOKENS)
        .with_sizer(tokenizer)
        .with_overlap(CHUNK_OVERLAP_TOKENS)
        .expect("CHUNK_OVERLAP_TOKENS deve ser menor que CHUNK_SIZE_TOKENS");

    let splitter = MarkdownSplitter::new(config);

    // `chunk_indices` yields (byte_offset, chunk_text) pairs into `body`.
    let items: Vec<(usize, &str)> = splitter.chunk_indices(body).collect();

    // Defensive fallback: if the splitter produced nothing for a non-empty
    // body, treat the whole body as a single chunk.
    if items.is_empty() {
        return vec![Chunk {
            start_offset: 0,
            end_offset: body.len(),
            token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
        }];
    }

    items
        .into_iter()
        .map(|(start, text)| {
            // End offset is the start plus the chunk's byte length.
            let end = start + text.len();
            Chunk {
                start_offset: start,
                end_offset: end,
                token_count_approx: text.chars().count() / CHARS_PER_TOKEN,
            }
        })
        .collect()
}
165
166pub fn chunk_text<'a>(body: &'a str, chunk: &Chunk) -> &'a str {
167    &body[chunk.start_offset..chunk.end_offset]
168}
169
/// Finds the best split point inside `body[start..desired_end]`, preferring
/// paragraph breaks, then sentence ends, then any space. Returns an absolute
/// byte index just past the chosen separator, or `desired_end` when the
/// window contains no separator at all.
fn find_split_boundary(body: &str, start: usize, desired_end: usize) -> usize {
    let window = &body[start..desired_end];
    // Separators in priority order, paired with the byte width to skip so
    // the returned index lands immediately after the separator.
    for (separator, width) in [("\n\n", 2), (". ", 2), (" ", 1)] {
        if let Some(pos) = window.rfind(separator) {
            return start + pos + width;
        }
    }
    desired_end
}
183
/// Clamps `idx` to `body.len()` and walks backwards to the nearest UTF-8
/// char boundary. Index 0 is always a boundary, so this always succeeds.
fn previous_char_boundary(body: &str, idx: usize) -> usize {
    (0..=idx.min(body.len()))
        .rev()
        .find(|&i| body.is_char_boundary(i))
        .unwrap_or(0)
}
191
/// Clamps `idx` to `body.len()` and walks forwards to the nearest UTF-8
/// char boundary. `body.len()` is always a boundary, so this always succeeds.
fn next_char_boundary(body: &str, idx: usize) -> usize {
    let from = idx.min(body.len());
    (from..=body.len())
        .find(|&i| body.is_char_boundary(i))
        .unwrap_or(body.len())
}
199
200pub fn aggregate_embeddings(chunk_embeddings: &[Vec<f32>]) -> Vec<f32> {
201    if chunk_embeddings.is_empty() {
202        return vec![0.0f32; EMBEDDING_DIM];
203    }
204    if chunk_embeddings.len() == 1 {
205        return chunk_embeddings[0].clone();
206    }
207
208    let dim = chunk_embeddings[0].len();
209    let mut mean = vec![0.0f32; dim];
210    for emb in chunk_embeddings {
211        for (i, v) in emb.iter().enumerate() {
212            mean[i] += v;
213        }
214    }
215    let n = chunk_embeddings.len() as f32;
216    for v in &mut mean {
217        *v /= n;
218    }
219
220    let norm: f32 = mean.iter().map(|x| x * x).sum::<f32>().sqrt();
221    if norm > 1e-9 {
222        for v in &mut mean {
223            *v /= norm;
224        }
225    }
226    mean
227}
228
#[cfg(test)]
mod tests {
    use super::*;

    // Bodies under the character budget pass through as one whole-body chunk.
    #[test]
    fn test_short_body_no_chunking() {
        let body = "short text";
        assert!(!needs_chunking(body));
        let chunks = split_into_chunks(body);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunk_text(body, &chunks[0]), body);
    }

    // Oversized bodies split into multiple non-empty chunks.
    #[test]
    fn test_long_body_produces_multiple_chunks() {
        let body = "word ".repeat(1000);
        assert!(needs_chunking(&body));
        let chunks = split_into_chunks(&body);
        assert!(chunks.len() > 1);
        assert!(chunks.iter().all(|c| !chunk_text(&body, c).is_empty()));
    }

    // Token-offset splitting honors the size limit and the overlap step.
    // NOTE: the expected values (2 chunks, 110 trailing tokens) pin the
    // current values of CHUNK_SIZE_TOKENS / CHUNK_OVERLAP_TOKENS.
    #[test]
    fn split_by_token_offsets_respeita_limite_e_overlap() {
        // 460 synthetic two-byte tokens laid out back to back.
        let body = "ab".repeat(460);
        let offsets: Vec<(usize, usize)> = (0..460)
            .map(|i| {
                let start = i * 2;
                (start, start + 2)
            })
            .collect();

        let chunks = split_into_chunks_by_token_offsets(&body, &offsets);
        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].token_count_approx, CHUNK_SIZE_TOKENS);
        assert_eq!(chunks[1].token_count_approx, 110);
        assert_eq!(chunks[0].start_offset, 0);
        assert_eq!(
            chunks[1].start_offset,
            offsets[CHUNK_SIZE_TOKENS - CHUNK_OVERLAP_TOKENS].0
        );
    }

    // When all tokens fit, a single chunk spans the entire body.
    #[test]
    fn split_by_token_offsets_retorna_um_chunk_quando_cabe() {
        let body = "texto curto";
        let offsets = vec![(0, 5), (6, 11)];
        let chunks = split_into_chunks_by_token_offsets(body, &offsets);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].start_offset, 0);
        assert_eq!(chunks[0].end_offset, body.len());
        assert_eq!(chunks[0].token_count_approx, 2);
    }

    // Multi-byte (UTF-8) text must still make forward progress and every
    // chunk offset must land on a char boundary.
    #[test]
    fn test_multibyte_body_preserves_progress_and_boundaries() {
        let body = "ação útil ".repeat(1000);
        let chunks = split_into_chunks(&body);
        assert!(chunks.len() > 1);
        for chunk in &chunks {
            assert!(!chunk_text(&body, chunk).is_empty());
            assert!(body.is_char_boundary(chunk.start_offset));
            assert!(body.is_char_boundary(chunk.end_offset));
            assert!(chunk.end_offset > chunk.start_offset);
        }
        for pair in chunks.windows(2) {
            assert!(pair[1].start_offset >= pair[0].start_offset);
            assert!(pair[1].end_offset > pair[0].start_offset);
        }
    }

    // The pooled embedding of multiple chunks is L2-normalized.
    #[test]
    fn test_aggregate_embeddings_normalizes() {
        let embs = vec![vec![1.0f32, 0.0], vec![0.0f32, 1.0]];
        let agg = aggregate_embeddings(&embs);
        let norm: f32 = agg.iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!((norm - 1.0).abs() < 1e-5);
    }

    // Test helper: mirrors split_into_chunks_hierarchical but sizes chunks
    // by characters instead of a real tokenizer, so tests stay hermetic.
    fn split_hier_chars(body: &str, size: usize) -> Vec<Chunk> {
        use text_splitter::{Characters, ChunkConfig, MarkdownSplitter};
        if body.is_empty() {
            return Vec::new();
        }
        let config = ChunkConfig::new(size)
            .with_sizer(Characters)
            .with_overlap(0)
            .expect("overlap deve ser menor que size");
        let splitter = MarkdownSplitter::new(config);
        let items: Vec<(usize, &str)> = splitter.chunk_indices(body).collect();
        if items.is_empty() {
            return vec![Chunk {
                start_offset: 0,
                end_offset: body.len(),
                token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
            }];
        }
        items
            .into_iter()
            .map(|(start, text)| {
                let end = start + text.len();
                Chunk {
                    start_offset: start,
                    end_offset: end,
                    token_count_approx: text.chars().count() / CHARS_PER_TOKEN,
                }
            })
            .collect()
    }

    // An empty body yields no chunks from the splitter.
    #[test]
    fn test_hierarchical_empty_body_retorna_vazio() {
        use text_splitter::{Characters, ChunkConfig, MarkdownSplitter};
        let config = ChunkConfig::new(100)
            .with_sizer(Characters)
            .with_overlap(0)
            .expect("overlap < size");
        let splitter = MarkdownSplitter::new(config);
        let result: Vec<_> = splitter.chunk_indices("").collect();
        assert!(result.is_empty());
    }

    // Two H1 sections over the size budget split into at least two chunks,
    // each with valid char-boundary offsets.
    #[test]
    fn test_markdown_h1_boundary_gera_dois_chunks() {
        let body = "# Title 1\n\nbody1 body1 body1 body1 body1 body1\n\n# Title 2\n\nbody2 body2 body2 body2 body2 body2";
        let chunks = split_hier_chars(body, 30);
        assert!(
            chunks.len() >= 2,
            "esperado >=2 chunks, obtido {}",
            chunks.len()
        );
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
        }
    }

    // Nested H2 sections produce non-empty, in-bounds chunks on boundaries.
    #[test]
    fn test_markdown_h2_nested_respeita_boundaries() {
        let body = "# H1\n\n## H2a\n\nParágrafo A com texto suficiente para forçar split.\n\n## H2b\n\nParágrafo B com texto suficiente para forçar split também.";
        let chunks = split_hier_chars(body, 40);
        assert!(!chunks.is_empty());
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
            assert!(c.end_offset > c.start_offset);
            assert!(c.end_offset <= body.len());
        }
    }

    // Paragraph breaks act as soft boundaries when no headers are present.
    #[test]
    fn test_markdown_paragrafo_soft_boundary() {
        let para = "Frase de texto simples para preencher o parágrafo. ";
        let body = format!(
            "{}\n\n{}\n\n{}",
            para.repeat(3),
            para.repeat(3),
            para.repeat(3)
        );
        let chunks = split_hier_chars(&body, 80);
        assert!(
            chunks.len() >= 2,
            "esperado >=2 chunks com body de {} chars",
            body.len()
        );
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
        }
    }

    // Large (>50KB) Markdown bodies still produce valid, non-empty chunks.
    #[test]
    fn test_markdown_60kb_offsets_validos() {
        let bloco = "# Seção\n\nTexto de conteúdo do bloco. ".repeat(1500);
        assert!(
            bloco.len() > 50_000,
            "body deve ser >50KB, tem {} bytes",
            bloco.len()
        );
        let chunks = split_hier_chars(&bloco, 256);
        assert!(chunks.len() > 1);
        for c in &chunks {
            assert!(bloco.is_char_boundary(c.start_offset));
            assert!(bloco.is_char_boundary(c.end_offset));
            assert!(c.end_offset > c.start_offset);
            assert!(!chunk_text(&bloco, c).is_empty());
        }
    }

    // Plain text without any Markdown markers still chunks cleanly.
    #[test]
    fn test_fallback_texto_puro_sem_marcadores() {
        let body = "a ".repeat(1000);
        let chunks = split_hier_chars(&body, 100);
        assert!(!chunks.is_empty());
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
        }
    }
}