1use crate::constants::{CHUNK_OVERLAP_TOKENS, CHUNK_SIZE_TOKENS, EMBEDDING_DIM};
5use text_splitter::{ChunkConfig, MarkdownSplitter};
6use tokenizers::Tokenizer;
7
// Rough chars-per-token ratio used for cheap, tokenizer-free size estimates.
// NOTE(review): 2 chars/token is quite conservative (English prose averages
// ~4) — presumably tuned for this corpus/tokenizer; confirm.
const CHARS_PER_TOKEN: usize = 2;
/// Approximate chunk size budget expressed in characters/bytes.
pub const CHUNK_SIZE_CHARS: usize = CHUNK_SIZE_TOKENS * CHARS_PER_TOKEN;
/// Approximate overlap between consecutive chunks, in characters/bytes.
pub const CHUNK_OVERLAP_CHARS: usize = CHUNK_OVERLAP_TOKENS * CHARS_PER_TOKEN;
14
/// A contiguous byte range of a document body selected for embedding.
#[derive(Debug, Clone)]
pub struct Chunk {
    // Inclusive byte offset where the chunk starts.
    pub start_offset: usize,
    // Exclusive byte offset where the chunk ends.
    pub end_offset: usize,
    // Token count: exact when built from tokenizer offsets, otherwise
    // estimated as chars / CHARS_PER_TOKEN.
    pub token_count_approx: usize,
}
21
/// Returns true when `body` exceeds a single chunk's size budget.
///
/// NOTE(review): compares the *byte* length against a character-based budget,
/// so multi-byte UTF-8 text is flagged for chunking slightly early — confirm
/// this conservatism is intended.
pub fn needs_chunking(body: &str) -> bool {
    body.len() > CHUNK_SIZE_CHARS
}
25
26pub fn split_into_chunks(body: &str) -> Vec<Chunk> {
27 if !needs_chunking(body) {
28 return vec![Chunk {
29 token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
30 start_offset: 0,
31 end_offset: body.len(),
32 }];
33 }
34
35 let mut chunks = Vec::new();
36 let mut start = 0usize;
37
38 while start < body.len() {
39 start = next_char_boundary(body, start);
40 let desired_end = previous_char_boundary(body, (start + CHUNK_SIZE_CHARS).min(body.len()));
41 let end = if desired_end < body.len() {
42 find_split_boundary(body, start, desired_end)
43 } else {
44 desired_end
45 };
46
47 let end = if end <= start {
48 let fallback = previous_char_boundary(body, (start + CHUNK_SIZE_CHARS).min(body.len()));
49 if fallback > start {
50 fallback
51 } else {
52 body.len()
53 }
54 } else {
55 end
56 };
57
58 let token_count_approx = body[start..end].chars().count() / CHARS_PER_TOKEN;
59 chunks.push(Chunk {
60 start_offset: start,
61 end_offset: end,
62 token_count_approx,
63 });
64
65 if end >= body.len() {
66 break;
67 }
68
69 let next_start = next_char_boundary(body, end.saturating_sub(CHUNK_OVERLAP_CHARS));
70 start = if next_start >= end { end } else { next_start };
71 }
72
73 chunks
74}
75
76pub fn split_into_chunks_by_token_offsets(
77 body: &str,
78 token_offsets: &[(usize, usize)],
79) -> Vec<Chunk> {
80 if token_offsets.len() <= CHUNK_SIZE_TOKENS {
81 return vec![Chunk {
82 token_count_approx: token_offsets.len(),
83 start_offset: 0,
84 end_offset: body.len(),
85 }];
86 }
87
88 let mut chunks = Vec::new();
89 let mut start_token = 0usize;
90
91 while start_token < token_offsets.len() {
92 let end_token = (start_token + CHUNK_SIZE_TOKENS).min(token_offsets.len());
93
94 chunks.push(Chunk {
95 start_offset: if start_token == 0 {
96 0
97 } else {
98 token_offsets[start_token].0
99 },
100 end_offset: if end_token == token_offsets.len() {
101 body.len()
102 } else {
103 token_offsets[end_token - 1].1
104 },
105 token_count_approx: end_token - start_token,
106 });
107
108 if end_token == token_offsets.len() {
109 break;
110 }
111
112 let next_start = end_token.saturating_sub(CHUNK_OVERLAP_TOKENS);
113 start_token = if next_start <= start_token {
114 end_token
115 } else {
116 next_start
117 };
118 }
119
120 chunks
121}
122
123pub fn split_into_chunks_hierarchical(body: &str, tokenizer: &Tokenizer) -> Vec<Chunk> {
127 if body.is_empty() {
128 return Vec::new();
129 }
130
131 let config = ChunkConfig::new(CHUNK_SIZE_TOKENS)
132 .with_sizer(tokenizer)
133 .with_overlap(CHUNK_OVERLAP_TOKENS)
134 .expect("CHUNK_OVERLAP_TOKENS deve ser menor que CHUNK_SIZE_TOKENS");
135
136 let splitter = MarkdownSplitter::new(config);
137
138 let items: Vec<(usize, &str)> = splitter.chunk_indices(body).collect();
139
140 if items.is_empty() {
141 return vec![Chunk {
142 start_offset: 0,
143 end_offset: body.len(),
144 token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
145 }];
146 }
147
148 items
149 .into_iter()
150 .map(|(start, text)| {
151 let end = start + text.len();
152 Chunk {
153 start_offset: start,
154 end_offset: end,
155 token_count_approx: text.chars().count() / CHARS_PER_TOKEN,
156 }
157 })
158 .collect()
159}
160
/// Returns the slice of `body` covered by `chunk`.
///
/// Panics if the chunk's offsets are out of bounds or not char boundaries —
/// i.e. if the chunk was built for a different body.
pub fn chunk_text<'a>(body: &'a str, chunk: &Chunk) -> &'a str {
    &body[chunk.start_offset..chunk.end_offset]
}
164
/// Finds the best cut point inside `start..desired_end`, preferring (in
/// order) a paragraph break, a sentence end, then any space. Returns the
/// byte index just past the separator so the separator stays in the current
/// chunk, or `desired_end` when the window contains no separator.
fn find_split_boundary(body: &str, start: usize, desired_end: usize) -> usize {
    let window = &body[start..desired_end];
    // Separators in priority order, paired with how many bytes to skip
    // past the start of the match.
    for (separator, skip) in [("\n\n", 2), (". ", 2), (" ", 1)] {
        if let Some(pos) = window.rfind(separator) {
            return start + pos + skip;
        }
    }
    desired_end
}
178
/// Clamps `idx` to `body.len()` and walks backwards to the nearest UTF-8
/// char boundary. Index 0 is always a boundary, so this never underflows.
fn previous_char_boundary(body: &str, idx: usize) -> usize {
    let clamped = idx.min(body.len());
    (0..=clamped)
        .rev()
        .find(|&i| body.is_char_boundary(i))
        .unwrap_or(0)
}
186
/// Clamps `idx` to `body.len()` and walks forward to the nearest UTF-8 char
/// boundary. `body.len()` is always a boundary, so this always terminates.
fn next_char_boundary(body: &str, idx: usize) -> usize {
    let limit = body.len();
    (idx.min(limit)..=limit)
        .find(|&i| body.is_char_boundary(i))
        .unwrap_or(limit)
}
194
195pub fn aggregate_embeddings(chunk_embeddings: &[Vec<f32>]) -> Vec<f32> {
196 if chunk_embeddings.is_empty() {
197 return vec![0.0f32; EMBEDDING_DIM];
198 }
199 if chunk_embeddings.len() == 1 {
200 return chunk_embeddings[0].clone();
201 }
202
203 let dim = chunk_embeddings[0].len();
204 let mut mean = vec![0.0f32; dim];
205 for emb in chunk_embeddings {
206 for (i, v) in emb.iter().enumerate() {
207 mean[i] += v;
208 }
209 }
210 let n = chunk_embeddings.len() as f32;
211 for v in &mut mean {
212 *v /= n;
213 }
214
215 let norm: f32 = mean.iter().map(|x| x * x).sum::<f32>().sqrt();
216 if norm > 1e-9 {
217 for v in &mut mean {
218 *v /= norm;
219 }
220 }
221 mean
222}
223
224#[cfg(test)]
225mod tests {
226 use super::*;
227
228 #[test]
229 fn test_short_body_no_chunking() {
230 let body = "short text";
231 assert!(!needs_chunking(body));
232 let chunks = split_into_chunks(body);
233 assert_eq!(chunks.len(), 1);
234 assert_eq!(chunk_text(body, &chunks[0]), body);
235 }
236
237 #[test]
238 fn test_long_body_produces_multiple_chunks() {
239 let body = "word ".repeat(1000);
240 assert!(needs_chunking(&body));
241 let chunks = split_into_chunks(&body);
242 assert!(chunks.len() > 1);
243 assert!(chunks.iter().all(|c| !chunk_text(&body, c).is_empty()));
244 }
245
    // 460 synthetic 2-byte tokens must split into one full-size chunk plus a
    // remainder starting CHUNK_OVERLAP_TOKENS before the first chunk's end.
    // NOTE(review): the expected remainder of 110 tokens assumes specific
    // constant values (e.g. CHUNK_SIZE_TOKENS=400, CHUNK_OVERLAP_TOKENS=50);
    // this test breaks if the constants change — confirm.
    #[test]
    fn split_by_token_offsets_respeita_limite_e_overlap() {
        let body = "ab".repeat(460);
        // Each token covers exactly two bytes: (0,2), (2,4), ...
        let offsets: Vec<(usize, usize)> = (0..460)
            .map(|i| {
                let start = i * 2;
                (start, start + 2)
            })
            .collect();

        let chunks = split_into_chunks_by_token_offsets(&body, &offsets);
        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].token_count_approx, CHUNK_SIZE_TOKENS);
        assert_eq!(chunks[1].token_count_approx, 110);
        assert_eq!(chunks[0].start_offset, 0);
        assert_eq!(
            chunks[1].start_offset,
            offsets[CHUNK_SIZE_TOKENS - CHUNK_OVERLAP_TOKENS].0
        );
    }
266
    // When the token count fits in one chunk, the single chunk spans the
    // entire body even though the offsets leave a gap (byte 5..6 here).
    #[test]
    fn split_by_token_offsets_retorna_um_chunk_quando_cabe() {
        let body = "texto curto";
        let offsets = vec![(0, 5), (6, 11)];
        let chunks = split_into_chunks_by_token_offsets(body, &offsets);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].start_offset, 0);
        assert_eq!(chunks[0].end_offset, body.len());
        assert_eq!(chunks[0].token_count_approx, 2);
    }
277
    // Multi-byte (UTF-8) input: every chunk must be non-empty, land on char
    // boundaries, and the chunk sequence must make forward progress.
    #[test]
    fn test_multibyte_body_preserves_progress_and_boundaries() {
        let body = "ação útil ".repeat(1000);
        let chunks = split_into_chunks(&body);
        assert!(chunks.len() > 1);
        for chunk in &chunks {
            assert!(!chunk_text(&body, chunk).is_empty());
            assert!(body.is_char_boundary(chunk.start_offset));
            assert!(body.is_char_boundary(chunk.end_offset));
            assert!(chunk.end_offset > chunk.start_offset);
        }
        // Consecutive chunks may overlap but must never move backwards.
        for pair in chunks.windows(2) {
            assert!(pair[1].start_offset >= pair[0].start_offset);
            assert!(pair[1].end_offset > pair[0].start_offset);
        }
    }
294
295 #[test]
296 fn test_aggregate_embeddings_normalizes() {
297 let embs = vec![vec![1.0f32, 0.0], vec![0.0f32, 1.0]];
298 let agg = aggregate_embeddings(&embs);
299 let norm: f32 = agg.iter().map(|x| x * x).sum::<f32>().sqrt();
300 assert!((norm - 1.0).abs() < 1e-5);
301 }
302
    // Test-only analogue of `split_into_chunks_hierarchical` that sizes
    // chunks in characters instead of tokens, so markdown splitting can be
    // exercised without loading a real tokenizer model.
    fn split_hier_chars(body: &str, size: usize) -> Vec<Chunk> {
        use text_splitter::{Characters, ChunkConfig, MarkdownSplitter};
        if body.is_empty() {
            return Vec::new();
        }
        let config = ChunkConfig::new(size)
            .with_sizer(Characters)
            .with_overlap(0)
            .expect("overlap deve ser menor que size");
        let splitter = MarkdownSplitter::new(config);
        let items: Vec<(usize, &str)> = splitter.chunk_indices(body).collect();
        if items.is_empty() {
            // Mirrors the production fallback: whole body as a single chunk.
            return vec![Chunk {
                start_offset: 0,
                end_offset: body.len(),
                token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
            }];
        }
        items
            .into_iter()
            .map(|(start, text)| {
                let end = start + text.len();
                Chunk {
                    start_offset: start,
                    end_offset: end,
                    token_count_approx: text.chars().count() / CHARS_PER_TOKEN,
                }
            })
            .collect()
    }
333
334 #[test]
335 fn test_hierarchical_empty_body_retorna_vazio() {
336 use text_splitter::{Characters, ChunkConfig, MarkdownSplitter};
337 let config = ChunkConfig::new(100)
338 .with_sizer(Characters)
339 .with_overlap(0)
340 .expect("overlap < size");
341 let splitter = MarkdownSplitter::new(config);
342 let result: Vec<_> = splitter.chunk_indices("").collect();
343 assert!(result.is_empty());
344 }
345
    // Two H1 sections with a small character budget must split at the
    // heading boundary into at least two chunks.
    #[test]
    fn test_markdown_h1_boundary_gera_dois_chunks() {
        let body = "# Title 1\n\nbody1 body1 body1 body1 body1 body1\n\n# Title 2\n\nbody2 body2 body2 body2 body2 body2";
        let chunks = split_hier_chars(body, 30);
        assert!(
            chunks.len() >= 2,
            "esperado >=2 chunks, obtido {}",
            chunks.len()
        );
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
        }
    }
360
    // Nested H2 sections with accented (multi-byte) text: chunk offsets must
    // stay on char boundaries, be non-empty, and remain in bounds.
    #[test]
    fn test_markdown_h2_nested_respeita_boundaries() {
        let body = "# H1\n\n## H2a\n\nParágrafo A com texto suficiente para forçar split.\n\n## H2b\n\nParágrafo B com texto suficiente para forçar split também.";
        let chunks = split_hier_chars(body, 40);
        assert!(!chunks.is_empty());
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
            assert!(c.end_offset > c.start_offset);
            assert!(c.end_offset <= body.len());
        }
    }
373
    // Paragraph breaks act as soft boundaries: three paragraphs under an
    // 80-char budget must produce at least two chunks.
    #[test]
    fn test_markdown_paragrafo_soft_boundary() {
        let para = "Frase de texto simples para preencher o parágrafo. ";
        let body = format!(
            "{}\n\n{}\n\n{}",
            para.repeat(3),
            para.repeat(3),
            para.repeat(3)
        );
        let chunks = split_hier_chars(&body, 80);
        assert!(
            chunks.len() >= 2,
            "esperado >=2 chunks com body de {} chars",
            body.len()
        );
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
        }
    }
394
    // Large (>50KB) markdown body with accented headings: all chunk offsets
    // must remain valid char boundaries and every chunk must be non-empty.
    #[test]
    fn test_markdown_60kb_offsets_validos() {
        let bloco = "# Seção\n\nTexto de conteúdo do bloco. ".repeat(1500);
        assert!(
            bloco.len() > 50_000,
            "body deve ser >50KB, tem {} bytes",
            bloco.len()
        );
        let chunks = split_hier_chars(&bloco, 256);
        assert!(chunks.len() > 1);
        for c in &chunks {
            assert!(bloco.is_char_boundary(c.start_offset));
            assert!(bloco.is_char_boundary(c.end_offset));
            assert!(c.end_offset > c.start_offset);
            assert!(!chunk_text(&bloco, c).is_empty());
        }
    }
412
413 #[test]
414 fn test_fallback_texto_puro_sem_marcadores() {
415 let body = "a ".repeat(1000);
416 let chunks = split_hier_chars(&body, 100);
417 assert!(!chunks.is_empty());
418 for c in &chunks {
419 assert!(body.is_char_boundary(c.start_offset));
420 assert!(body.is_char_boundary(c.end_offset));
421 }
422 }
423}