1use crate::constants::{CHUNK_OVERLAP_TOKENS, CHUNK_SIZE_TOKENS, EMBEDDING_DIM};
10use text_splitter::{ChunkConfig, MarkdownSplitter};
11use tokenizers::Tokenizer;
12
/// Heuristic characters-per-token ratio used by the character-based splitter.
/// NOTE(review): 2 chars/token is conservative — confirm it matches the
/// tokenizer actually used upstream.
const CHARS_PER_TOKEN: usize = 2;

/// Maximum chunk size in characters; compared against `str::len()` (bytes)
/// in `needs_chunking`.
pub const CHUNK_SIZE_CHARS: usize = CHUNK_SIZE_TOKENS * CHARS_PER_TOKEN;

/// Overlap between consecutive chunks, expressed in characters.
pub const CHUNK_OVERLAP_CHARS: usize = CHUNK_OVERLAP_TOKENS * CHARS_PER_TOKEN;
25
/// Byte range of one chunk within the source body, plus a rough token count.
///
/// Offsets are byte indices into the body string; the splitters in this
/// module keep them on UTF-8 character boundaries (the token-offset variant
/// relies on its caller supplying boundary-aligned offsets — confirm).
#[derive(Debug, Clone, PartialEq, Eq)]
pub struct Chunk {
    // Inclusive byte offset where the chunk starts.
    pub start_offset: usize,
    // Exclusive byte offset where the chunk ends.
    pub end_offset: usize,
    // Approximate number of tokens in the chunk (heuristic, not exact).
    pub token_count_approx: usize,
}
36
37pub fn needs_chunking(body: &str) -> bool {
39 body.len() > CHUNK_SIZE_CHARS
40}
41
/// Splits `body` into character-budget chunks of at most `CHUNK_SIZE_CHARS`
/// bytes, with roughly `CHUNK_OVERLAP_CHARS` bytes of overlap between
/// neighbouring chunks.
///
/// Cut points prefer natural boundaries (paragraph, then sentence, then word
/// gap — see `find_split_boundary`) and are always snapped to UTF-8 char
/// boundaries. A body that already fits in one chunk is returned as a single
/// `Chunk` spanning the whole string.
pub fn split_into_chunks(body: &str) -> Vec<Chunk> {
    // Fast path: the whole body fits in a single chunk.
    if !needs_chunking(body) {
        return vec![Chunk {
            token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
            start_offset: 0,
            end_offset: body.len(),
        }];
    }

    let mut chunks = Vec::new();
    let mut start = 0usize;

    while start < body.len() {
        // Snap the chunk start forward onto a char boundary.
        start = next_char_boundary(body, start);
        // Tentative end: one full chunk budget ahead, snapped back onto a
        // char boundary and clamped to the end of the body.
        let desired_end = previous_char_boundary(body, (start + CHUNK_SIZE_CHARS).min(body.len()));
        // Only search for a natural break when this is not the final chunk;
        // the tail is taken as-is.
        let end = if desired_end < body.len() {
            find_split_boundary(body, start, desired_end)
        } else {
            desired_end
        };

        // Guard against zero-width chunks (the boundary search can land at or
        // before `start`): fall back to a hard cut at the budget, or consume
        // the remainder of the body so the loop always makes progress.
        let end = if end <= start {
            let fallback = previous_char_boundary(body, (start + CHUNK_SIZE_CHARS).min(body.len()));
            if fallback > start {
                fallback
            } else {
                body.len()
            }
        } else {
            end
        };

        let token_count_approx = body[start..end].chars().count() / CHARS_PER_TOKEN;
        chunks.push(Chunk {
            start_offset: start,
            end_offset: end,
            token_count_approx,
        });

        if end >= body.len() {
            break;
        }

        // Step the next chunk back by the overlap budget; if that would not
        // advance past the current end, continue from `end` instead.
        let next_start = next_char_boundary(body, end.saturating_sub(CHUNK_OVERLAP_CHARS));
        start = if next_start >= end { end } else { next_start };
    }

    chunks
}
98
99pub fn split_into_chunks_by_token_offsets(
105 body: &str,
106 token_offsets: &[(usize, usize)],
107) -> Vec<Chunk> {
108 if token_offsets.len() <= CHUNK_SIZE_TOKENS {
109 return vec![Chunk {
110 token_count_approx: token_offsets.len(),
111 start_offset: 0,
112 end_offset: body.len(),
113 }];
114 }
115
116 let mut chunks = Vec::new();
117 let mut start_token = 0usize;
118
119 while start_token < token_offsets.len() {
120 let end_token = (start_token + CHUNK_SIZE_TOKENS).min(token_offsets.len());
121
122 chunks.push(Chunk {
123 start_offset: if start_token == 0 {
124 0
125 } else {
126 token_offsets[start_token].0
127 },
128 end_offset: if end_token == token_offsets.len() {
129 body.len()
130 } else {
131 token_offsets[end_token - 1].1
132 },
133 token_count_approx: end_token - start_token,
134 });
135
136 if end_token == token_offsets.len() {
137 break;
138 }
139
140 let next_start = end_token.saturating_sub(CHUNK_OVERLAP_TOKENS);
141 start_token = if next_start <= start_token {
142 end_token
143 } else {
144 next_start
145 };
146 }
147
148 chunks
149}
150
151pub fn split_into_chunks_hierarchical(body: &str, tokenizer: &Tokenizer) -> Vec<Chunk> {
155 if body.is_empty() {
156 return Vec::new();
157 }
158
159 let config = ChunkConfig::new(CHUNK_SIZE_TOKENS)
160 .with_sizer(tokenizer)
161 .with_overlap(CHUNK_OVERLAP_TOKENS)
162 .expect(
163 "compile-time invariant: CHUNK_OVERLAP_TOKENS must be smaller than CHUNK_SIZE_TOKENS",
164 );
165
166 let splitter = MarkdownSplitter::new(config);
167
168 let items: Vec<(usize, &str)> = splitter.chunk_indices(body).collect();
169
170 if items.is_empty() {
171 return vec![Chunk {
172 start_offset: 0,
173 end_offset: body.len(),
174 token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
175 }];
176 }
177
178 items
179 .into_iter()
180 .map(|(start, text)| {
181 let end = start + text.len();
182 Chunk {
183 start_offset: start,
184 end_offset: end,
185 token_count_approx: text.chars().count() / CHARS_PER_TOKEN,
186 }
187 })
188 .collect()
189}
190
191pub fn chunk_text<'a>(body: &'a str, chunk: &Chunk) -> &'a str {
193 &body[chunk.start_offset..chunk.end_offset]
194}
195
// Finds the best cut point in `body[start..desired_end]`, preferring a
// paragraph break, then a sentence end, then any word gap; returns the byte
// offset just past the separator, or `desired_end` when nothing matches.
fn find_split_boundary(body: &str, start: usize, desired_end: usize) -> usize {
    let window = &body[start..desired_end];
    for (marker, width) in [("\n\n", 2), (". ", 2), (" ", 1)] {
        if let Some(pos) = window.rfind(marker) {
            return start + pos + width;
        }
    }
    desired_end
}
209
// Clamps `idx` to the body length and walks it backwards to the nearest
// UTF-8 char boundary (offset 0 is always a boundary, so this never fails).
fn previous_char_boundary(body: &str, idx: usize) -> usize {
    let capped = idx.min(body.len());
    (0..=capped)
        .rev()
        .find(|&i| body.is_char_boundary(i))
        .unwrap_or(0)
}
217
// Clamps `idx` to the body length and walks it forwards to the nearest UTF-8
// char boundary (`body.len()` is always a boundary, so this never fails).
fn next_char_boundary(body: &str, idx: usize) -> usize {
    let capped = idx.min(body.len());
    (capped..=body.len())
        .find(|&i| body.is_char_boundary(i))
        .unwrap_or_else(|| body.len())
}
225
226pub fn aggregate_embeddings(chunk_embeddings: &[Vec<f32>]) -> Vec<f32> {
231 if chunk_embeddings.is_empty() {
232 return vec![0.0f32; EMBEDDING_DIM];
233 }
234 if chunk_embeddings.len() == 1 {
235 return chunk_embeddings[0].clone();
236 }
237
238 let dim = chunk_embeddings[0].len();
239 let mut mean = vec![0.0f32; dim];
240 for emb in chunk_embeddings {
241 for (i, v) in emb.iter().enumerate() {
242 mean[i] += v;
243 }
244 }
245 let n = chunk_embeddings.len() as f32;
246 for v in &mut mean {
247 *v /= n;
248 }
249
250 let norm: f32 = mean.iter().map(|x| x * x).sum::<f32>().sqrt();
251 if norm > 1e-9 {
252 for v in &mut mean {
253 *v /= norm;
254 }
255 }
256 mean
257}
258
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_short_body_no_chunking() {
        let body = "short text";
        assert!(!needs_chunking(body));
        let chunks = split_into_chunks(body);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunk_text(body, &chunks[0]), body);
    }

    #[test]
    fn test_long_body_produces_multiple_chunks() {
        let body = "word ".repeat(1000);
        assert!(needs_chunking(&body));
        let chunks = split_into_chunks(&body);
        assert!(chunks.len() > 1);
        assert!(chunks.iter().all(|c| !chunk_text(&body, c).is_empty()));
    }

    // Renamed from `split_by_token_offsets_respeita_limite_e_overlap` for
    // consistency with the English `test_`-prefixed names in this file.
    #[test]
    fn test_split_by_token_offsets_respects_limit_and_overlap() {
        // 460 synthetic tokens of two bytes each.
        let body = "ab".repeat(460);
        let offsets: Vec<(usize, usize)> = (0..460)
            .map(|i| {
                let start = i * 2;
                (start, start + 2)
            })
            .collect();

        let chunks = split_into_chunks_by_token_offsets(&body, &offsets);
        assert_eq!(chunks.len(), 2);
        assert_eq!(chunks[0].token_count_approx, CHUNK_SIZE_TOKENS);
        assert_eq!(chunks[1].token_count_approx, 110);
        assert_eq!(chunks[0].start_offset, 0);
        // The second chunk starts CHUNK_OVERLAP_TOKENS before the first ends.
        assert_eq!(
            chunks[1].start_offset,
            offsets[CHUNK_SIZE_TOKENS - CHUNK_OVERLAP_TOKENS].0
        );
    }

    // Renamed from `split_by_token_offsets_returns_one_chunk_when_fits` to
    // carry the same `test_` prefix as its siblings.
    #[test]
    fn test_split_by_token_offsets_returns_one_chunk_when_fits() {
        let body = "texto curto";
        let offsets = vec![(0, 5), (6, 11)];
        let chunks = split_into_chunks_by_token_offsets(body, &offsets);
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].start_offset, 0);
        assert_eq!(chunks[0].end_offset, body.len());
        assert_eq!(chunks[0].token_count_approx, 2);
    }

    #[test]
    fn test_multibyte_body_preserves_progress_and_boundaries() {
        // Accented text: chunk edges must never land inside a UTF-8 sequence.
        let body = "a\u{e7}\u{e3}o \u{fa}til ".repeat(1000);
        let chunks = split_into_chunks(&body);
        assert!(chunks.len() > 1);
        for chunk in &chunks {
            assert!(!chunk_text(&body, chunk).is_empty());
            assert!(body.is_char_boundary(chunk.start_offset));
            assert!(body.is_char_boundary(chunk.end_offset));
            assert!(chunk.end_offset > chunk.start_offset);
        }
        for pair in chunks.windows(2) {
            assert!(pair[1].start_offset >= pair[0].start_offset);
            assert!(pair[1].end_offset > pair[0].start_offset);
        }
    }

    #[test]
    fn test_aggregate_embeddings_normalizes() {
        let embs = vec![vec![1.0f32, 0.0], vec![0.0f32, 1.0]];
        let agg = aggregate_embeddings(&embs);
        let norm: f32 = agg.iter().map(|x| x * x).sum::<f32>().sqrt();
        assert!((norm - 1.0).abs() < 1e-5);
    }

    // Character-sized variant of `split_into_chunks_hierarchical`, so the
    // markdown-boundary tests below do not need a real tokenizer.
    fn split_hier_chars(body: &str, size: usize) -> Vec<Chunk> {
        use text_splitter::{Characters, ChunkConfig, MarkdownSplitter};
        if body.is_empty() {
            return Vec::new();
        }
        let config = ChunkConfig::new(size)
            .with_sizer(Characters)
            .with_overlap(0)
            .expect("overlap must be smaller than size");
        let splitter = MarkdownSplitter::new(config);
        let items: Vec<(usize, &str)> = splitter.chunk_indices(body).collect();
        if items.is_empty() {
            return vec![Chunk {
                start_offset: 0,
                end_offset: body.len(),
                token_count_approx: body.chars().count() / CHARS_PER_TOKEN,
            }];
        }
        items
            .into_iter()
            .map(|(start, text)| {
                let end = start + text.len();
                Chunk {
                    start_offset: start,
                    end_offset: end,
                    token_count_approx: text.chars().count() / CHARS_PER_TOKEN,
                }
            })
            .collect()
    }

    #[test]
    fn test_hierarchical_empty_body_returns_empty() {
        use text_splitter::{Characters, ChunkConfig, MarkdownSplitter};
        let config = ChunkConfig::new(100)
            .with_sizer(Characters)
            .with_overlap(0)
            .expect("overlap < size");
        let splitter = MarkdownSplitter::new(config);
        let result: Vec<_> = splitter.chunk_indices("").collect();
        assert!(result.is_empty());
    }

    #[test]
    fn test_markdown_h1_boundary_yields_two_chunks() {
        let body = "# Title 1\n\nbody1 body1 body1 body1 body1 body1\n\n# Title 2\n\nbody2 body2 body2 body2 body2 body2";
        let chunks = split_hier_chars(body, 30);
        assert!(
            chunks.len() >= 2,
            "expected >=2 chunks, got {}",
            chunks.len()
        );
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
        }
    }

    #[test]
    fn test_markdown_h2_nested_respects_boundaries() {
        let body = "# H1\n\n## H2a\n\nParagraph A with enough text to force a split.\n\n## H2b\n\nParagraph B with enough text to force a split as well.";
        let chunks = split_hier_chars(body, 40);
        assert!(!chunks.is_empty());
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
            assert!(c.end_offset > c.start_offset);
            assert!(c.end_offset <= body.len());
        }
    }

    #[test]
    fn test_markdown_paragraph_soft_boundary() {
        let para = "Plain text sentence used to fill the paragraph. ";
        let body = format!(
            "{}\n\n{}\n\n{}",
            para.repeat(3),
            para.repeat(3),
            para.repeat(3)
        );
        let chunks = split_hier_chars(&body, 80);
        assert!(
            chunks.len() >= 2,
            "expected >=2 chunks with a body of {} chars",
            body.len()
        );
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
        }
    }

    #[test]
    fn test_markdown_60kb_valid_offsets() {
        let block = "# Section\n\nBlock content text. ".repeat(1700);
        assert!(
            block.len() > 50_000,
            "body must be >50KB, has {} bytes",
            block.len()
        );
        let chunks = split_hier_chars(&block, 256);
        assert!(chunks.len() > 1);
        for c in &chunks {
            assert!(block.is_char_boundary(c.start_offset));
            assert!(block.is_char_boundary(c.end_offset));
            assert!(c.end_offset > c.start_offset);
            assert!(!chunk_text(&block, c).is_empty());
        }
    }

    #[test]
    fn test_fallback_plain_text_without_markers() {
        let body = "a ".repeat(1000);
        let chunks = split_hier_chars(&body, 100);
        assert!(!chunks.is_empty());
        for c in &chunks {
            assert!(body.is_char_boundary(c.start_offset));
            assert!(body.is_char_boundary(c.end_offset));
        }
    }
}
462}