1use crate::constants::{CHUNK_OVERLAP_TOKENS, CHUNK_SIZE_TOKENS, EMBEDDING_DIM};
10use text_splitter::{ChunkConfig, MarkdownSplitter};
11use tokenizers::Tokenizer;
12
// Heuristic ratio used to convert token budgets into character budgets when no
// real tokenizer is available. NOTE(review): 2 chars/token is an assumption
// about the tokenizer/corpus — confirm against the actual embedding model.
const CHARS_PER_TOKEN: usize = 2;
// Approximate maximum chunk size, in characters, derived from the token budget.
pub const CHUNK_SIZE_CHARS: usize = CHUNK_SIZE_TOKENS * CHARS_PER_TOKEN;
// Approximate overlap between consecutive chunks, in characters.
pub const CHUNK_OVERLAP_CHARS: usize = CHUNK_OVERLAP_TOKENS * CHARS_PER_TOKEN;
19
/// A contiguous byte range of a source document produced by chunking.
///
/// Offsets index into the original body string; the text itself is not stored
/// (use `chunk_text` to borrow it back).
#[derive(Debug, Clone)]
pub struct Chunk {
    // Byte offset (inclusive) where the chunk starts in the original body.
    pub start_offset: usize,
    // Byte offset (exclusive) where the chunk ends in the original body.
    pub end_offset: usize,
    // Estimated token count (char count / CHARS_PER_TOKEN, or an exact token
    // count when produced from real token offsets).
    pub token_count_approx: usize,
}
26
27pub fn needs_chunking(body: &str) -> bool {
28 body.len() > CHUNK_SIZE_CHARS
29}
30
31pub fn split_into_chunks(body: &str) -> Vec<Chunk> {
32 if !needs_chunking(body) {
33 return vec![Chunk {
34 token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
35 start_offset: 0,
36 end_offset: body.len(),
37 }];
38 }
39
40 let mut chunks = Vec::new();
41 let mut start = 0usize;
42
43 while start < body.len() {
44 start = next_char_boundary(body, start);
45 let desired_end = previous_char_boundary(body, (start + CHUNK_SIZE_CHARS).min(body.len()));
46 let end = if desired_end < body.len() {
47 find_split_boundary(body, start, desired_end)
48 } else {
49 desired_end
50 };
51
52 let end = if end <= start {
53 let fallback = previous_char_boundary(body, (start + CHUNK_SIZE_CHARS).min(body.len()));
54 if fallback > start {
55 fallback
56 } else {
57 body.len()
58 }
59 } else {
60 end
61 };
62
63 let token_count_approx = body[start..end].chars().count() / CHARS_PER_TOKEN;
64 chunks.push(Chunk {
65 start_offset: start,
66 end_offset: end,
67 token_count_approx,
68 });
69
70 if end >= body.len() {
71 break;
72 }
73
74 let next_start = next_char_boundary(body, end.saturating_sub(CHUNK_OVERLAP_CHARS));
75 start = if next_start >= end { end } else { next_start };
76 }
77
78 chunks
79}
80
81pub fn split_into_chunks_by_token_offsets(
82 body: &str,
83 token_offsets: &[(usize, usize)],
84) -> Vec<Chunk> {
85 if token_offsets.len() <= CHUNK_SIZE_TOKENS {
86 return vec![Chunk {
87 token_count_approx: token_offsets.len(),
88 start_offset: 0,
89 end_offset: body.len(),
90 }];
91 }
92
93 let mut chunks = Vec::new();
94 let mut start_token = 0usize;
95
96 while start_token < token_offsets.len() {
97 let end_token = (start_token + CHUNK_SIZE_TOKENS).min(token_offsets.len());
98
99 chunks.push(Chunk {
100 start_offset: if start_token == 0 {
101 0
102 } else {
103 token_offsets[start_token].0
104 },
105 end_offset: if end_token == token_offsets.len() {
106 body.len()
107 } else {
108 token_offsets[end_token - 1].1
109 },
110 token_count_approx: end_token - start_token,
111 });
112
113 if end_token == token_offsets.len() {
114 break;
115 }
116
117 let next_start = end_token.saturating_sub(CHUNK_OVERLAP_TOKENS);
118 start_token = if next_start <= start_token {
119 end_token
120 } else {
121 next_start
122 };
123 }
124
125 chunks
126}
127
128pub fn split_into_chunks_hierarchical(body: &str, tokenizer: &Tokenizer) -> Vec<Chunk> {
132 if body.is_empty() {
133 return Vec::new();
134 }
135
136 let config = ChunkConfig::new(CHUNK_SIZE_TOKENS)
137 .with_sizer(tokenizer)
138 .with_overlap(CHUNK_OVERLAP_TOKENS)
139 .expect("CHUNK_OVERLAP_TOKENS deve ser menor que CHUNK_SIZE_TOKENS");
140
141 let splitter = MarkdownSplitter::new(config);
142
143 let items: Vec<(usize, &str)> = splitter.chunk_indices(body).collect();
144
145 if items.is_empty() {
146 return vec![Chunk {
147 start_offset: 0,
148 end_offset: body.len(),
149 token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
150 }];
151 }
152
153 items
154 .into_iter()
155 .map(|(start, text)| {
156 let end = start + text.len();
157 Chunk {
158 start_offset: start,
159 end_offset: end,
160 token_count_approx: text.chars().count() / CHARS_PER_TOKEN,
161 }
162 })
163 .collect()
164}
165
166pub fn chunk_text<'a>(body: &'a str, chunk: &Chunk) -> &'a str {
167 &body[chunk.start_offset..chunk.end_offset]
168}
169
/// Finds the best split point inside `body[start..desired_end]`, preferring a
/// paragraph break ("\n\n"), then a sentence break (". "), then any space.
///
/// The returned offset sits just past the matched separator; when no separator
/// exists in the window, `desired_end` itself is returned.
fn find_split_boundary(body: &str, start: usize, desired_end: usize) -> usize {
    let window = &body[start..desired_end];
    // Separators are tried in priority order; rfind picks the last occurrence
    // so chunks are as close to the size cap as possible.
    for separator in ["\n\n", ". ", " "] {
        if let Some(pos) = window.rfind(separator) {
            return start + pos + separator.len();
        }
    }
    desired_end
}
183
/// Clamps `idx` to `body.len()` and walks backwards to the nearest UTF-8 char
/// boundary. Offset 0 is always a boundary, so this never underflows.
fn previous_char_boundary(body: &str, idx: usize) -> usize {
    let mut boundary = idx.min(body.len());
    loop {
        if boundary == 0 || body.is_char_boundary(boundary) {
            return boundary;
        }
        boundary -= 1;
    }
}
191
/// Clamps `idx` to `body.len()` and walks forward to the nearest UTF-8 char
/// boundary. `body.len()` itself always counts as a boundary, so the walk
/// terminates without an explicit length guard.
fn next_char_boundary(body: &str, idx: usize) -> usize {
    let mut boundary = idx.min(body.len());
    while !body.is_char_boundary(boundary) {
        boundary += 1;
    }
    boundary
}
199
200pub fn aggregate_embeddings(chunk_embeddings: &[Vec<f32>]) -> Vec<f32> {
201 if chunk_embeddings.is_empty() {
202 return vec![0.0f32; EMBEDDING_DIM];
203 }
204 if chunk_embeddings.len() == 1 {
205 return chunk_embeddings[0].clone();
206 }
207
208 let dim = chunk_embeddings[0].len();
209 let mut mean = vec![0.0f32; dim];
210 for emb in chunk_embeddings {
211 for (i, v) in emb.iter().enumerate() {
212 mean[i] += v;
213 }
214 }
215 let n = chunk_embeddings.len() as f32;
216 for v in &mut mean {
217 *v /= n;
218 }
219
220 let norm: f32 = mean.iter().map(|x| x * x).sum::<f32>().sqrt();
221 if norm > 1e-9 {
222 for v in &mut mean {
223 *v /= norm;
224 }
225 }
226 mean
227}
228
#[cfg(test)]
mod tests {
    use super::*;

    // Short input: no chunking, one chunk spanning the whole body.
    #[test]
    fn test_short_body_no_chunking() {
        let body = "short text";
        assert!(!needs_chunking(body));
        let chunks = split_into_chunks(body);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunk_text(body, &chunks[0]), body);
    }

    // A 5000-byte body must be split into several non-empty chunks.
    #[test]
    fn test_long_body_produces_multiple_chunks() {
        let body = "word ".repeat(1000);
        assert!(needs_chunking(&body));
        let chunks = split_into_chunks(&body);
        assert!(chunks.len() > 1);
        assert!(chunks.iter().all(|c| !chunk_text(&body, c).is_empty()));
    }

    // 460 synthetic 2-byte tokens should yield exactly two chunks, with the
    // second starting CHUNK_OVERLAP_TOKENS tokens before the end of the first.
    // NOTE(review): the hard-coded 110 implies 460 - (CHUNK_SIZE_TOKENS -
    // CHUNK_OVERLAP_TOKENS) == 110 — this test is coupled to crate::constants.
    #[test]
    fn split_by_token_offsets_respeita_limite_e_overlap() {
        let body = "ab".repeat(460);
        let offsets: Vec<(usize, usize)> = (0..460)
            .map(|i| {
                let start = i * 2;
                (start, start + 2)
            })
            .collect();

        let chunks = split_into_chunks_by_token_offsets(&body, &offsets);
        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].token_count_approx, CHUNK_SIZE_TOKENS);
        assert_eq!(chunks[1].token_count_approx, 110);
        assert_eq!(chunks[0].start_offset, 0);
        assert_eq!(
            chunks[1].start_offset,
            offsets[CHUNK_SIZE_TOKENS - CHUNK_OVERLAP_TOKENS].0
        );
    }

    // When the token count fits the budget, the single chunk must cover the
    // entire body (including bytes outside the token offsets, e.g. the space).
    #[test]
    fn split_by_token_offsets_retorna_um_chunk_quando_cabe() {
        let body = "texto curto";
        let offsets = vec![(0, 5), (6, 11)];
        let chunks = split_into_chunks_by_token_offsets(body, &offsets);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].start_offset, 0);
        assert_eq!(chunks[0].end_offset, body.len());
        assert_eq!(chunks[0].token_count_approx, 2);
    }

    // Multi-byte (accented) input: every chunk offset must land on a UTF-8
    // char boundary and the loop must make forward progress.
    #[test]
    fn test_multibyte_body_preserves_progress_and_boundaries() {
        let body = "ação útil ".repeat(1000);
        let chunks = split_into_chunks(&body);
        assert!(chunks.len() > 1);
        for chunk in &chunks {
            assert!(!chunk_text(&body, chunk).is_empty());
            assert!(body.is_char_boundary(chunk.start_offset));
            assert!(body.is_char_boundary(chunk.end_offset));
            assert!(chunk.end_offset > chunk.start_offset);
        }
        for pair in chunks.windows(2) {
            assert!(pair[1].start_offset >= pair[0].start_offset);
            assert!(pair[1].end_offset > pair[0].start_offset);
        }
    }

    // The aggregated embedding of multiple chunks must be L2-normalized.
    #[test]
    fn test_aggregate_embeddings_normalizes() {
        let embs = vec![vec![1.0f32, 0.0], vec![0.0f32, 1.0]];
        let agg = aggregate_embeddings(&embs);
        let norm: f32 = agg.iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!((norm - 1.0).abs() < 1e-5);
    }

    // Test-local analogue of split_into_chunks_hierarchical that sizes chunks
    // by character count, so no tokenizer model file is needed in tests.
    fn split_hier_chars(body: &str, size: usize) -> Vec<Chunk> {
        use text_splitter::{Characters, ChunkConfig, MarkdownSplitter};
        if body.is_empty() {
            return Vec::new();
        }
        let config = ChunkConfig::new(size)
            .with_sizer(Characters)
            .with_overlap(0)
            .expect("overlap deve ser menor que size");
        let splitter = MarkdownSplitter::new(config);
        let items: Vec<(usize, &str)> = splitter.chunk_indices(body).collect();
        if items.is_empty() {
            return vec![Chunk {
                start_offset: 0,
                end_offset: body.len(),
                token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
            }];
        }
        items
            .into_iter()
            .map(|(start, text)| {
                let end = start + text.len();
                Chunk {
                    start_offset: start,
                    end_offset: end,
                    token_count_approx: text.chars().count() / CHARS_PER_TOKEN,
                }
            })
            .collect()
    }

    // The splitter itself yields nothing for an empty body (mirrors the
    // early-return in split_into_chunks_hierarchical).
    #[test]
    fn test_hierarchical_empty_body_retorna_vazio() {
        use text_splitter::{Characters, ChunkConfig, MarkdownSplitter};
        let config = ChunkConfig::new(100)
            .with_sizer(Characters)
            .with_overlap(0)
            .expect("overlap < size");
        let splitter = MarkdownSplitter::new(config);
        let result: Vec<_> = splitter.chunk_indices("").collect();
        assert!(result.is_empty());
    }

    // Two H1 sections over the size budget should split into >= 2 chunks,
    // all with valid char-boundary offsets.
    #[test]
    fn test_markdown_h1_boundary_gera_dois_chunks() {
        let body = "# Title 1\n\nbody1 body1 body1 body1 body1 body1\n\n# Title 2\n\nbody2 body2 body2 body2 body2 body2";
        let chunks = split_hier_chars(body, 30);
        assert!(
            chunks.len() >= 2,
            "esperado >=2 chunks, obtido {}",
            chunks.len()
        );
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
        }
    }

    // Nested H2 sections with accented text: offsets must stay within bounds
    // and on char boundaries.
    #[test]
    fn test_markdown_h2_nested_respeita_boundaries() {
        let body = "# H1\n\n## H2a\n\nParágrafo A com texto suficiente para forçar split.\n\n## H2b\n\nParágrafo B com texto suficiente para forçar split também.";
        let chunks = split_hier_chars(body, 40);
        assert!(!chunks.is_empty());
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
            assert!(c.end_offset > c.start_offset);
            assert!(c.end_offset <= body.len());
        }
    }

    // Paragraph ("\n\n") breaks act as soft boundaries when headings are absent.
    #[test]
    fn test_markdown_paragrafo_soft_boundary() {
        let para = "Frase de texto simples para preencher o parágrafo. ";
        let body = format!(
            "{}\n\n{}\n\n{}",
            para.repeat(3),
            para.repeat(3),
            para.repeat(3)
        );
        let chunks = split_hier_chars(&body, 80);
        assert!(
            chunks.len() >= 2,
            "esperado >=2 chunks com body de {} chars",
            body.len()
        );
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
        }
    }

    // Large (>50KB) markdown document: all chunk offsets remain valid and
    // every chunk is non-empty.
    #[test]
    fn test_markdown_60kb_offsets_validos() {
        let bloco = "# Seção\n\nTexto de conteúdo do bloco. ".repeat(1500);
        assert!(
            bloco.len() > 50_000,
            "body deve ser >50KB, tem {} bytes",
            bloco.len()
        );
        let chunks = split_hier_chars(&bloco, 256);
        assert!(chunks.len() > 1);
        for c in &chunks {
            assert!(bloco.is_char_boundary(c.start_offset));
            assert!(bloco.is_char_boundary(c.end_offset));
            assert!(c.end_offset > c.start_offset);
            assert!(!chunk_text(&bloco, c).is_empty());
        }
    }

    // Plain text with no markdown markers still chunks via the fallback path.
    #[test]
    fn test_fallback_texto_puro_sem_marcadores() {
        let body = "a ".repeat(1000);
        let chunks = split_hier_chars(&body, 100);
        assert!(!chunks.is_empty());
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
        }
    }
}
428}