// sqlite_graphrag — chunking module
1// src/chunking.rs
2// Token-based chunking for E5 model (512 token limit)
3
4use crate::constants::{CHUNK_OVERLAP_TOKENS, CHUNK_SIZE_TOKENS, EMBEDDING_DIM};
5use text_splitter::{ChunkConfig, MarkdownSplitter};
6use tokenizers::Tokenizer;
7
// Conservative heuristic to reduce the risk of underestimating the real token
// count for Markdown, code and multilingual text. The previous value of
// 4 chars/token allowed chunks that were too large for some real documents.
const CHARS_PER_TOKEN: usize = 2;
// Character-budget equivalents of the token budgets, used by the char-based splitter.
pub const CHUNK_SIZE_CHARS: usize = CHUNK_SIZE_TOKENS * CHARS_PER_TOKEN;
pub const CHUNK_OVERLAP_CHARS: usize = CHUNK_OVERLAP_TOKENS * CHARS_PER_TOKEN;
14
/// A contiguous span of a document body, addressed by byte offsets, together
/// with an approximate token count used for embedding-budget bookkeeping.
#[derive(Debug, Clone)]
pub struct Chunk {
    /// Byte offset (inclusive) where the chunk begins in the original body.
    pub start_offset: usize,
    /// Byte offset (exclusive) where the chunk ends in the original body.
    pub end_offset: usize,
    /// Approximate token count: exact when built from real tokenizer offsets,
    /// otherwise chars / CHARS_PER_TOKEN.
    pub token_count_approx: usize,
}
21
22pub fn needs_chunking(body: &str) -> bool {
23    body.len() > CHUNK_SIZE_CHARS
24}
25
/// Splits `body` into overlapping chunks using the character-budget heuristic.
///
/// Chunk ends prefer natural break points (paragraph break, sentence end,
/// whitespace) chosen by `find_split_boundary`; all offsets are clamped to
/// UTF-8 char boundaries. Consecutive chunks overlap by up to
/// `CHUNK_OVERLAP_CHARS` bytes. A body within budget is returned as a single
/// chunk covering the whole body.
pub fn split_into_chunks(body: &str) -> Vec<Chunk> {
    if !needs_chunking(body) {
        // Fast path: the whole body fits in one chunk.
        return vec![Chunk {
            token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
            start_offset: 0,
            end_offset: body.len(),
        }];
    }

    let mut chunks = Vec::new();
    let mut start = 0usize;

    while start < body.len() {
        // Make sure the chunk starts on a valid UTF-8 char boundary.
        start = next_char_boundary(body, start);
        // Largest end allowed by the character budget, clamped to a boundary.
        let desired_end = previous_char_boundary(body, (start + CHUNK_SIZE_CHARS).min(body.len()));
        let end = if desired_end < body.len() {
            // Mid-document: prefer a natural split point inside the window.
            find_split_boundary(body, start, desired_end)
        } else {
            // Final window: take everything that is left.
            desired_end
        };

        // Guarantee forward progress even when no usable split point exists
        // (e.g. a long run with no whitespace, or a degenerate window).
        let end = if end <= start {
            let fallback = previous_char_boundary(body, (start + CHUNK_SIZE_CHARS).min(body.len()));
            if fallback > start {
                fallback
            } else {
                body.len()
            }
        } else {
            end
        };

        let token_count_approx = body[start..end].chars().count() / CHARS_PER_TOKEN;
        chunks.push(Chunk {
            start_offset: start,
            end_offset: end,
            token_count_approx,
        });

        if end >= body.len() {
            break;
        }

        // Step back by the overlap budget for the next chunk, but never so far
        // that the loop would stop advancing (next_start must stay < end).
        let next_start = next_char_boundary(body, end.saturating_sub(CHUNK_OVERLAP_CHARS));
        start = if next_start >= end { end } else { next_start };
    }

    chunks
}
75
76pub fn split_into_chunks_by_token_offsets(
77    body: &str,
78    token_offsets: &[(usize, usize)],
79) -> Vec<Chunk> {
80    if token_offsets.len() <= CHUNK_SIZE_TOKENS {
81        return vec![Chunk {
82            token_count_approx: token_offsets.len(),
83            start_offset: 0,
84            end_offset: body.len(),
85        }];
86    }
87
88    let mut chunks = Vec::new();
89    let mut start_token = 0usize;
90
91    while start_token < token_offsets.len() {
92        let end_token = (start_token + CHUNK_SIZE_TOKENS).min(token_offsets.len());
93
94        chunks.push(Chunk {
95            start_offset: if start_token == 0 {
96                0
97            } else {
98                token_offsets[start_token].0
99            },
100            end_offset: if end_token == token_offsets.len() {
101                body.len()
102            } else {
103                token_offsets[end_token - 1].1
104            },
105            token_count_approx: end_token - start_token,
106        });
107
108        if end_token == token_offsets.len() {
109            break;
110        }
111
112        let next_start = end_token.saturating_sub(CHUNK_OVERLAP_TOKENS);
113        start_token = if next_start <= start_token {
114            end_token
115        } else {
116            next_start
117        };
118    }
119
120    chunks
121}
122
123/// Divide body em chunks usando MarkdownSplitter com tokenizer real.
124/// Respeita limites semânticos de Markdown (H1-H6, parágrafos, blocos).
125/// Para texto puro sem marcadores Markdown, cai sobre quebras de parágrafo e sentenças.
126pub fn split_into_chunks_hierarchical(body: &str, tokenizer: &Tokenizer) -> Vec<Chunk> {
127    if body.is_empty() {
128        return Vec::new();
129    }
130
131    let config = ChunkConfig::new(CHUNK_SIZE_TOKENS)
132        .with_sizer(tokenizer)
133        .with_overlap(CHUNK_OVERLAP_TOKENS)
134        .expect("CHUNK_OVERLAP_TOKENS deve ser menor que CHUNK_SIZE_TOKENS");
135
136    let splitter = MarkdownSplitter::new(config);
137
138    let items: Vec<(usize, &str)> = splitter.chunk_indices(body).collect();
139
140    if items.is_empty() {
141        return vec![Chunk {
142            start_offset: 0,
143            end_offset: body.len(),
144            token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
145        }];
146    }
147
148    items
149        .into_iter()
150        .map(|(start, text)| {
151            let end = start + text.len();
152            Chunk {
153                start_offset: start,
154                end_offset: end,
155                token_count_approx: text.chars().count() / CHARS_PER_TOKEN,
156            }
157        })
158        .collect()
159}
160
161pub fn chunk_text<'a>(body: &'a str, chunk: &Chunk) -> &'a str {
162    &body[chunk.start_offset..chunk.end_offset]
163}
164
/// Picks the best cut position inside `body[start..desired_end]`, preferring a
/// paragraph break, then a sentence end, then any space. Falls back to
/// `desired_end` when the window contains no separator at all. The returned
/// index keeps the separator on the left-hand side of the cut.
fn find_split_boundary(body: &str, start: usize, desired_end: usize) -> usize {
    let window = &body[start..desired_end];
    // Separators in priority order, paired with the separator's byte width.
    let separators: [(&str, usize); 3] = [("\n\n", 2), (". ", 2), (" ", 1)];
    for &(needle, width) in &separators {
        if let Some(hit) = window.rfind(needle) {
            return start + hit + width;
        }
    }
    desired_end
}
178
/// Clamps `idx` to `body.len()` and walks backwards to the nearest UTF-8 char
/// boundary. Terminates because index 0 is always a boundary.
fn previous_char_boundary(body: &str, idx: usize) -> usize {
    let mut cursor = idx.min(body.len());
    while !body.is_char_boundary(cursor) {
        cursor -= 1;
    }
    cursor
}
186
/// Clamps `idx` to `body.len()` and walks forwards to the nearest UTF-8 char
/// boundary. Terminates because `body.len()` is always a boundary.
fn next_char_boundary(body: &str, idx: usize) -> usize {
    let mut cursor = idx.min(body.len());
    while !body.is_char_boundary(cursor) {
        cursor += 1;
    }
    cursor
}
194
195pub fn aggregate_embeddings(chunk_embeddings: &[Vec<f32>]) -> Vec<f32> {
196    if chunk_embeddings.is_empty() {
197        return vec![0.0f32; EMBEDDING_DIM];
198    }
199    if chunk_embeddings.len() == 1 {
200        return chunk_embeddings[0].clone();
201    }
202
203    let dim = chunk_embeddings[0].len();
204    let mut mean = vec![0.0f32; dim];
205    for emb in chunk_embeddings {
206        for (i, v) in emb.iter().enumerate() {
207            mean[i] += v;
208        }
209    }
210    let n = chunk_embeddings.len() as f32;
211    for v in &mut mean {
212        *v /= n;
213    }
214
215    let norm: f32 = mean.iter().map(|x| x * x).sum::<f32>().sqrt();
216    if norm > 1e-9 {
217        for v in &mut mean {
218            *v /= norm;
219        }
220    }
221    mean
222}
223
224#[cfg(test)]
225mod tests {
226    use super::*;
227
    // A body under the chunk budget is returned unchanged as a single chunk.
    #[test]
    fn test_short_body_no_chunking() {
        let body = "short text";
        assert!(!needs_chunking(body));
        let chunks = split_into_chunks(body);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunk_text(body, &chunks[0]), body);
    }
236
    // A body well over the budget splits into more than one non-empty chunk.
    #[test]
    fn test_long_body_produces_multiple_chunks() {
        let body = "word ".repeat(1000);
        assert!(needs_chunking(&body));
        let chunks = split_into_chunks(&body);
        assert!(chunks.len() > 1);
        assert!(chunks.iter().all(|c| !chunk_text(&body, c).is_empty()));
    }
245
    // 460 synthetic 2-byte tokens: the first chunk is capped at
    // CHUNK_SIZE_TOKENS and the second starts CHUNK_OVERLAP_TOKENS tokens
    // before the first one ends.
    #[test]
    fn split_by_token_offsets_respeita_limite_e_overlap() {
        let body = "ab".repeat(460);
        let offsets: Vec<(usize, usize)> = (0..460)
            .map(|i| {
                let start = i * 2;
                (start, start + 2)
            })
            .collect();

        let chunks = split_into_chunks_by_token_offsets(&body, &offsets);
        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].token_count_approx, CHUNK_SIZE_TOKENS);
        assert_eq!(chunks[1].token_count_approx, 110);
        assert_eq!(chunks[0].start_offset, 0);
        assert_eq!(
            chunks[1].start_offset,
            offsets[CHUNK_SIZE_TOKENS - CHUNK_OVERLAP_TOKENS].0
        );
    }
266
    // When all tokens fit within the budget, one chunk spans the entire body.
    #[test]
    fn split_by_token_offsets_retorna_um_chunk_quando_cabe() {
        let body = "texto curto";
        let offsets = vec![(0, 5), (6, 11)];
        let chunks = split_into_chunks_by_token_offsets(body, &offsets);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].start_offset, 0);
        assert_eq!(chunks[0].end_offset, body.len());
        assert_eq!(chunks[0].token_count_approx, 2);
    }
277
    // Multi-byte (UTF-8) input: every chunk must start and end on char
    // boundaries and the splitting walk must make forward progress.
    #[test]
    fn test_multibyte_body_preserves_progress_and_boundaries() {
        let body = "ação útil ".repeat(1000);
        let chunks = split_into_chunks(&body);
        assert!(chunks.len() > 1);
        for chunk in &chunks {
            assert!(!chunk_text(&body, chunk).is_empty());
            assert!(body.is_char_boundary(chunk.start_offset));
            assert!(body.is_char_boundary(chunk.end_offset));
            assert!(chunk.end_offset > chunk.start_offset);
        }
        for pair in chunks.windows(2) {
            assert!(pair[1].start_offset >= pair[0].start_offset);
            assert!(pair[1].end_offset > pair[0].start_offset);
        }
    }
294
    // The aggregated embedding of several vectors must be L2-normalized.
    #[test]
    fn test_aggregate_embeddings_normalizes() {
        let embs = vec![vec![1.0f32, 0.0], vec![0.0f32, 1.0]];
        let agg = aggregate_embeddings(&embs);
        let norm: f32 = agg.iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!((norm - 1.0).abs() < 1e-5);
    }
302
    // Test-only mirror of split_into_chunks_hierarchical that sizes chunks by
    // character count (the Characters sizer) so no real tokenizer is needed.
    fn split_hier_chars(body: &str, size: usize) -> Vec<Chunk> {
        use text_splitter::{Characters, ChunkConfig, MarkdownSplitter};
        if body.is_empty() {
            return Vec::new();
        }
        let config = ChunkConfig::new(size)
            .with_sizer(Characters)
            .with_overlap(0)
            .expect("overlap deve ser menor que size");
        let splitter = MarkdownSplitter::new(config);
        let items: Vec<(usize, &str)> = splitter.chunk_indices(body).collect();
        if items.is_empty() {
            return vec![Chunk {
                start_offset: 0,
                end_offset: body.len(),
                token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
            }];
        }
        items
            .into_iter()
            .map(|(start, text)| {
                let end = start + text.len();
                Chunk {
                    start_offset: start,
                    end_offset: end,
                    token_count_approx: text.chars().count() / CHARS_PER_TOKEN,
                }
            })
            .collect()
    }
333
    // An empty body yields no chunk indices from the Markdown splitter.
    #[test]
    fn test_hierarchical_empty_body_retorna_vazio() {
        use text_splitter::{Characters, ChunkConfig, MarkdownSplitter};
        let config = ChunkConfig::new(100)
            .with_sizer(Characters)
            .with_overlap(0)
            .expect("overlap < size");
        let splitter = MarkdownSplitter::new(config);
        let result: Vec<_> = splitter.chunk_indices("").collect();
        assert!(result.is_empty());
    }
345
    // An H1 heading is a hard boundary: two titled sections must produce at
    // least two chunks, each on valid char boundaries.
    #[test]
    fn test_markdown_h1_boundary_gera_dois_chunks() {
        let body = "# Title 1\n\nbody1 body1 body1 body1 body1 body1\n\n# Title 2\n\nbody2 body2 body2 body2 body2 body2";
        let chunks = split_hier_chars(body, 30);
        assert!(
            chunks.len() >= 2,
            "esperado >=2 chunks, obtido {}",
            chunks.len()
        );
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
        }
    }
360
    // Nested H2 sections: every chunk stays within the body and on valid
    // char boundaries (body contains multi-byte accented characters).
    #[test]
    fn test_markdown_h2_nested_respeita_boundaries() {
        let body = "# H1\n\n## H2a\n\nParágrafo A com texto suficiente para forçar split.\n\n## H2b\n\nParágrafo B com texto suficiente para forçar split também.";
        let chunks = split_hier_chars(body, 40);
        assert!(!chunks.is_empty());
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
            assert!(c.end_offset > c.start_offset);
            assert!(c.end_offset <= body.len());
        }
    }
373
    // Paragraph breaks act as soft boundaries when the size budget forces a
    // split in a document without headings.
    #[test]
    fn test_markdown_paragrafo_soft_boundary() {
        let para = "Frase de texto simples para preencher o parágrafo. ";
        let body = format!(
            "{}\n\n{}\n\n{}",
            para.repeat(3),
            para.repeat(3),
            para.repeat(3)
        );
        let chunks = split_hier_chars(&body, 80);
        assert!(
            chunks.len() >= 2,
            "esperado >=2 chunks com body de {} chars",
            body.len()
        );
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
        }
    }
394
    // A >50KB Markdown body produces many chunks, all with valid, non-empty
    // byte ranges on char boundaries.
    #[test]
    fn test_markdown_60kb_offsets_validos() {
        let bloco = "# Seção\n\nTexto de conteúdo do bloco. ".repeat(1500);
        assert!(
            bloco.len() > 50_000,
            "body deve ser >50KB, tem {} bytes",
            bloco.len()
        );
        let chunks = split_hier_chars(&bloco, 256);
        assert!(chunks.len() > 1);
        for c in &chunks {
            assert!(bloco.is_char_boundary(c.start_offset));
            assert!(bloco.is_char_boundary(c.end_offset));
            assert!(c.end_offset > c.start_offset);
            assert!(!chunk_text(&bloco, c).is_empty());
        }
    }
412
    // Plain text without any Markdown markers still splits on valid boundaries.
    #[test]
    fn test_fallback_texto_puro_sem_marcadores() {
        let body = "a ".repeat(1000);
        let chunks = split_hier_chars(&body, 100);
        assert!(!chunks.is_empty());
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
        }
    }
423}