1use crate::Result;
27#[cfg(feature = "semantic-chunking")]
28use std::collections::BTreeSet;
29
/// Configuration for semantic chunking.
///
/// All sizes and offsets are measured in characters (Unicode scalar
/// values), not bytes; see `char_to_byte_map` for the conversion used when
/// slicing the underlying `&str`.
#[derive(Debug, Clone)]
pub struct SemanticChunkConfig {
    /// Preferred chunk length; a chunk is cut once it reaches this size.
    pub target_size: usize,
    /// Minimum chunk length; shorter chunks are merged into their predecessor.
    pub min_size: usize,
    /// Soft upper bound on chunk length (a single oversized paragraph or
    /// sentence may still exceed it).
    pub max_size: usize,
    /// Similarity threshold in [0, 1]; adjacent sentences scoring below it
    /// start a new chunk (used by the sentence-similarity chunker only).
    pub similarity_threshold: f32,
    /// Number of trailing characters repeated at the start of the next chunk.
    pub overlap: usize,
    /// NOTE(review): not consulted by either chunker visible in this file —
    /// presumably read elsewhere; verify before relying on it.
    pub fallback_to_sentences: bool,
}
47
48impl Default for SemanticChunkConfig {
49 fn default() -> Self {
50 Self {
51 target_size: 10_000,
52 min_size: 1_000,
53 max_size: 20_000,
54 similarity_threshold: 0.7,
55 overlap: 200,
56 fallback_to_sentences: true,
57 }
58 }
59}
60
61impl SemanticChunkConfig {
62 pub fn long_document() -> Self {
64 Self {
65 target_size: 50_000,
66 min_size: 5_000,
67 max_size: 100_000,
68 similarity_threshold: 0.75,
69 overlap: 500,
70 fallback_to_sentences: true,
71 }
72 }
73
74 pub fn coreference() -> Self {
76 Self {
77 target_size: 5_000,
78 min_size: 500,
79 max_size: 10_000,
80 similarity_threshold: 0.8, overlap: 300,
82 fallback_to_sentences: true,
83 }
84 }
85}
86
/// One chunk of the input text, with character offsets into the original.
#[derive(Debug, Clone)]
pub struct SemanticChunk {
    /// The chunk's text (sliced from the input; may be trimmed).
    pub text: String,
    /// Start offset into the input, in characters (inclusive).
    pub start: usize,
    /// End offset into the input, in characters (exclusive).
    pub end: usize,
    /// Optional topic label; always `None` for the chunkers in this file.
    pub topic: Option<String>,
    /// Jaccard similarity computed at this chunk's boundary, when the
    /// sentence-similarity chunker produced it; `None` otherwise.
    pub similarity_to_prev: Option<f32>,
}
101
/// Splits text into semantically coherent chunks.
pub trait SemanticChunker: Send + Sync {
    /// Chunks `text` into spans, optionally guided by a language hint.
    /// Both implementations in this file currently ignore `language`.
    fn chunk(&self, text: &str, language: Option<&str>) -> Result<Vec<SemanticChunk>>;
}
109
/// Paragraph-based chunker requiring no models or optional features; also
/// used as the fallback by `EmbeddingSemanticChunker` and
/// `create_semantic_chunker`.
#[derive(Debug)]
pub struct RuleBasedSemanticChunker {
    // Size/overlap tuning; the similarity threshold is unused here.
    config: SemanticChunkConfig,
}
118
119impl RuleBasedSemanticChunker {
120 pub fn new(config: SemanticChunkConfig) -> Self {
122 Self { config }
123 }
124}
125
/// Builds a character-index → byte-offset lookup table for `text`.
///
/// `map[i]` is the byte offset of the `i`-th character; one extra sentinel
/// entry equal to `text.len()` lets the exclusive end index `char_count`
/// be mapped as well.
fn char_to_byte_map(text: &str) -> Vec<usize> {
    // The byte length is an upper bound on the char count, so this sizes
    // the Vec without the extra O(n) pass that `chars().count()` costs.
    let mut map = Vec::with_capacity(text.len() + 1);
    for (b, _) in text.char_indices() {
        map.push(b);
    }
    map.push(text.len());
    map
}
137
/// Maps a character index to a byte offset using a table built by
/// `char_to_byte_map`; out-of-range indices clamp to the final (sentinel)
/// entry, and an empty table maps everything to 0.
fn byte_at_char(char_to_byte: &[usize], char_idx: usize) -> usize {
    match char_to_byte.last() {
        None => 0,
        Some(&sentinel) => *char_to_byte.get(char_idx).unwrap_or(&sentinel),
    }
}
146
/// Finds paragraph spans as `(start, end)` character offsets.
///
/// A paragraph is a maximal run of lines each containing at least one
/// non-whitespace character; lines holding only spaces, tabs, or CR act as
/// separators. A paragraph closed by a blank line ends at the start of
/// that blank line, so its span includes the newline terminating its last
/// content line.
fn paragraph_ranges(text: &str) -> Vec<(usize, usize)> {
    let mut ranges = Vec::new();
    let mut open_para: Option<usize> = None; // start of the in-progress paragraph
    let mut line_start = 0usize; // char offset where the current line began
    let mut saw_content = false; // current line has a non-whitespace char
    let mut pos = 0usize; // char offset just past the current char

    for ch in text.chars() {
        pos += 1;
        match ch {
            '\n' => {
                if saw_content {
                    // First content line after a gap opens a paragraph.
                    open_para.get_or_insert(line_start);
                } else if let Some(start) = open_para.take() {
                    // Blank line closes the open paragraph at its own start.
                    ranges.push((start, line_start));
                }
                line_start = pos;
                saw_content = false;
            }
            // CR, space and tab never count as content.
            '\r' | ' ' | '\t' => {}
            _ => saw_content = true,
        }
    }

    // Close any paragraph still open at end of input.
    if saw_content {
        open_para.get_or_insert(line_start);
    }
    if let Some(start) = open_para {
        ranges.push((start, pos));
    }
    ranges
}
201
impl SemanticChunker for RuleBasedSemanticChunker {
    /// Greedy paragraph-based chunking.
    ///
    /// Paragraphs are accumulated into a current range; the range is cut
    /// when it reaches `target_size` (or would exceed `max_size`), with the
    /// last `overlap` characters carried into the next range. Ranges
    /// shorter than `min_size` are then merged into their predecessor.
    /// Returned `start`/`end` are character offsets into `text`.
    fn chunk(&self, text: &str, language: Option<&str>) -> Result<Vec<SemanticChunk>> {
        // Language hint is intentionally unused by the rule-based chunker.
        let _ = language;
        if text.is_empty() {
            return Ok(Vec::new());
        }

        let char_to_byte = char_to_byte_map(text);
        // The map carries one sentinel entry past the last char, so the
        // char count is len - 1.
        let text_len_chars = char_to_byte.len().saturating_sub(1);

        // Whitespace-only or single-block input may yield no paragraphs;
        // treat the whole text as one paragraph so a chunk is still emitted.
        let mut paras = paragraph_ranges(text);
        if paras.is_empty() {
            paras = vec![(0, text_len_chars)];
        }

        // First pass: greedily pack paragraphs into chunk ranges.
        let mut ranges: Vec<(usize, usize)> = Vec::new();
        let mut cur_start: Option<usize> = None;
        let mut cur_end: usize = 0;

        for (p_start, p_end) in paras {
            if p_end <= p_start {
                continue;
            }
            if cur_start.is_none() {
                cur_start = Some(p_start);
                cur_end = p_start;
            }

            // If absorbing this paragraph would push the chunk past
            // max_size, cut before the paragraph — but only when the chunk
            // already has content (cur_end > cs), so a single oversized
            // paragraph still becomes one (oversized) chunk.
            let next_end = cur_end.max(p_end);
            if let Some(cs) = cur_start {
                let cur_len = next_end.saturating_sub(cs);
                if cur_len > self.config.max_size && cur_end > cs {
                    ranges.push((cs, cur_end));
                    let overlap_start = cur_end.saturating_sub(self.config.overlap);
                    // Restart at the overlap position, but never after the
                    // new paragraph's own start.
                    cur_start = Some(overlap_start.min(p_start));
                    cur_end = cur_start.unwrap();
                }
            }

            // Extend the current chunk over this paragraph.
            cur_end = cur_end.max(p_end);

            // Cut as soon as the chunk reaches target_size. (The min_size
            // check is redundant whenever target_size >= min_size, but kept
            // for configs where it is not.)
            if let Some(cs) = cur_start {
                let cur_len = cur_end.saturating_sub(cs);
                if cur_len >= self.config.target_size && cur_len >= self.config.min_size {
                    ranges.push((cs, cur_end));
                    let overlap_start = cur_end.saturating_sub(self.config.overlap);
                    cur_start = Some(overlap_start.min(cur_end));
                    cur_end = cur_start.unwrap();
                }
            }
        }

        // Flush the final partial chunk, if it has any content.
        if let Some(cs) = cur_start {
            if cur_end > cs {
                ranges.push((cs, cur_end));
            }
        }

        // Second pass: absorb too-short ranges into their predecessor.
        // The very first range is kept even when short.
        let mut merged: Vec<(usize, usize)> = Vec::new();
        for (s, e) in ranges {
            let len = e.saturating_sub(s);
            if len < self.config.min_size && !merged.is_empty() {
                let last = merged.last_mut().unwrap();
                last.1 = last.1.max(e);
            } else {
                merged.push((s, e));
            }
        }

        // Materialize chunk texts, converting char offsets to byte offsets
        // and skipping empty/whitespace-only slices.
        let mut out = Vec::new();
        for (s, e) in merged {
            let sb = byte_at_char(&char_to_byte, s);
            let eb = byte_at_char(&char_to_byte, e);
            if eb <= sb {
                continue;
            }
            let chunk_text = text.get(sb..eb).unwrap_or("").to_string();
            if chunk_text.trim().is_empty() {
                continue;
            }
            out.push(SemanticChunk {
                text: chunk_text,
                start: s,
                end: e,
                topic: None,
                similarity_to_prev: None,
            });
        }

        Ok(out)
    }
}
305
/// Sentence-similarity chunker, gated behind the `semantic-chunking`
/// feature.
///
/// NOTE(review): despite the name, the similarity measure implemented in
/// this file is lexical (Jaccard over token sets), not embedding-based —
/// confirm whether a model-backed variant exists elsewhere.
#[cfg(feature = "semantic-chunking")]
#[derive(Debug)]
pub struct EmbeddingSemanticChunker {
    // Size/overlap tuning plus the similarity threshold used at boundaries.
    config: SemanticChunkConfig,
 }
320
#[cfg(feature = "semantic-chunking")]
impl EmbeddingSemanticChunker {
    /// Creates a sentence-similarity chunker with the given configuration.
    ///
    /// Returns `Result` for interface stability even though construction
    /// currently cannot fail.
    pub fn new(config: SemanticChunkConfig) -> Result<Self> {
        Ok(Self { config })
    }

    /// Lowercases `s`, replaces non-alphanumeric characters with spaces,
    /// and returns the set of tokens longer than two characters (very
    /// short tokens are dropped as likely stopwords/noise).
    fn tokenize_for_similarity(s: &str) -> BTreeSet<String> {
        let mut t = String::with_capacity(s.len());
        for c in s.chars() {
            if c.is_alphanumeric() {
                // Full Unicode lowercasing (may expand to multiple chars).
                // `to_ascii_lowercase()` would leave non-ASCII letters such
                // as 'É' untouched, breaking token matching in exactly the
                // non-English text this chunker's terminators target.
                for lc in c.to_lowercase() {
                    t.push(lc);
                }
            } else {
                t.push(' ');
            }
        }
        t.split_whitespace()
            .filter(|w| w.chars().count() > 2)
            .map(|w| w.to_string())
            .collect()
    }

    /// Jaccard similarity |A ∩ B| / |A ∪ B| in [0, 1]; two empty sets are
    /// treated as identical (1.0), one empty set as disjoint (0.0).
    fn jaccard(a: &BTreeSet<String>, b: &BTreeSet<String>) -> f32 {
        if a.is_empty() && b.is_empty() {
            return 1.0;
        }
        if a.is_empty() || b.is_empty() {
            return 0.0;
        }
        let inter = a.intersection(b).count() as f32;
        let uni = a.union(b).count() as f32;
        if uni <= 0.0 {
            0.0
        } else {
            inter / uni
        }
    }

    /// Delegates to the module-level helper. Calling the free function by
    /// its unqualified name (it is in scope here, and an associated fn
    /// would need `Self::`) avoids the previous `super::<module-name>::`
    /// path, which silently broke if the file was renamed.
    fn char_to_byte_map(text: &str) -> Vec<usize> {
        char_to_byte_map(text)
    }

    /// Delegates to the module-level helper; see `char_to_byte_map` above.
    fn byte_at_char(map: &[usize], char_idx: usize) -> usize {
        byte_at_char(map, char_idx)
    }

    /// Splits `text` into sentence spans of `(start, end)` character
    /// offsets, cutting after terminator punctuation from several scripts
    /// (Latin, CJK fullwidth, Arabic, Urdu, Devanagari). The terminator is
    /// included in its span; trailing text without a terminator forms a
    /// final span.
    fn split_sentences_spans(text: &str) -> Vec<(usize, usize)> {
        let terminators = [
            '.', '!', '?', '。', '!', '?', '؟', '۔', '।',
        ];
        let mut out = Vec::new();
        let mut start = 0usize;
        let mut i = 0usize;
        for c in text.chars() {
            i += 1;
            if terminators.contains(&c) {
                if i > start {
                    out.push((start, i));
                }
                start = i;
            }
        }
        if i > start {
            out.push((start, i));
        }
        out
    }
}
399
#[cfg(feature = "semantic-chunking")]
impl SemanticChunker for EmbeddingSemanticChunker {
    /// Sentence-level chunking driven by lexical (Jaccard) similarity
    /// between adjacent sentences.
    ///
    /// A boundary is placed before a sentence when either (a) its
    /// similarity to the previous sentence is below
    /// `similarity_threshold` and the current chunk has at least
    /// `min_size` characters, or (b) appending it would exceed `max_size`
    /// while the chunk already meets `min_size`. The last `overlap`
    /// characters are carried into the next chunk. Falls back to the
    /// rule-based chunker when no sentences are found or no chunks result.
    fn chunk(&self, text: &str, language: Option<&str>) -> Result<Vec<SemanticChunk>> {
        // Language hint is currently unused.
        let _ = language;
        let t = text.trim();
        if t.is_empty() {
            return Ok(vec![]);
        }

        let spans = Self::split_sentences_spans(text);
        if spans.is_empty() {
            // No sentence terminators at all: defer to paragraph chunking.
            let fallback = RuleBasedSemanticChunker::new(self.config.clone());
            return fallback.chunk(text, None);
        }

        let char_to_byte = Self::char_to_byte_map(text);

        let mut chunks: Vec<SemanticChunk> = Vec::new();
        // Current chunk bounds in character offsets, seeded from the first
        // sentence span.
        let mut chunk_start_char = spans[0].0;
        let mut chunk_end_char = spans[0].1;
        let mut prev_sentence_tokens: Option<BTreeSet<String>> = None;
        // Similarity recorded at the previous boundary; attached to the
        // chunk pushed at the *next* boundary (the first chunk gets None).
        let mut prev_chunk_similarity: Option<f32> = None;

        for (idx, (s0, s1)) in spans.iter().copied().enumerate() {
            let sent_start = s0;
            let sent_end = s1;

            // Spans are character offsets; convert to bytes to slice.
            let sent_bytes_start = Self::byte_at_char(&char_to_byte, sent_start);
            let sent_bytes_end = Self::byte_at_char(&char_to_byte, sent_end);
            let sent_text = text
                .get(sent_bytes_start..sent_bytes_end)
                .unwrap_or("")
                .trim();

            // Whitespace-only "sentences" contribute nothing.
            if sent_text.is_empty() {
                continue;
            }

            let tokens = Self::tokenize_for_similarity(sent_text);
            // None for the first non-empty sentence.
            let sim_to_prev_sentence = prev_sentence_tokens
                .as_ref()
                .map(|p| Self::jaccard(p, &tokens));

            if idx > 0 {
                let cur_len = chunk_end_char.saturating_sub(chunk_start_char);
                let would_len = sent_end.saturating_sub(chunk_start_char);
                let similarity_break = sim_to_prev_sentence
                    .map(|s| s < self.config.similarity_threshold)
                    .unwrap_or(false);
                let would_exceed =
                    would_len > self.config.max_size && cur_len >= self.config.min_size;

                if (similarity_break && cur_len >= self.config.min_size) || would_exceed {
                    // Emit the chunk accumulated so far (exclusive of the
                    // current sentence).
                    let start_b = Self::byte_at_char(&char_to_byte, chunk_start_char);
                    let end_b = Self::byte_at_char(&char_to_byte, chunk_end_char);
                    let chunk_text = text.get(start_b..end_b).unwrap_or("").trim().to_string();
                    if !chunk_text.is_empty() {
                        chunks.push(SemanticChunk {
                            text: chunk_text,
                            start: chunk_start_char,
                            end: chunk_end_char,
                            topic: None,
                            similarity_to_prev: prev_chunk_similarity,
                        });
                    }

                    // Start the next chunk at the overlap position, but
                    // never after the current sentence's own start.
                    // chunk_end_char is not reset here; it is overwritten
                    // at the bottom of the loop.
                    let overlap_start_char = chunk_end_char
                        .saturating_sub(self.config.overlap)
                        .min(sent_start);
                    chunk_start_char = overlap_start_char;
                    prev_chunk_similarity = sim_to_prev_sentence;
                }
            }

            // Extend the current chunk to cover this sentence.
            chunk_end_char = sent_end;
            prev_sentence_tokens = Some(tokens);
        }

        // Flush the final chunk, if non-empty.
        if chunk_end_char > chunk_start_char {
            let start_b = Self::byte_at_char(&char_to_byte, chunk_start_char);
            let end_b = Self::byte_at_char(&char_to_byte, chunk_end_char);
            let chunk_text = text.get(start_b..end_b).unwrap_or("").trim().to_string();
            if !chunk_text.is_empty() {
                chunks.push(SemanticChunk {
                    text: chunk_text,
                    start: chunk_start_char,
                    end: chunk_end_char,
                    topic: None,
                    similarity_to_prev: prev_chunk_similarity,
                });
            }
        }

        // Every sentence slice was empty after trimming: fall back rather
        // than returning nothing for non-empty input.
        if chunks.is_empty() {
            let fallback = RuleBasedSemanticChunker::new(self.config.clone());
            return fallback.chunk(text, None);
        }

        Ok(chunks)
    }
}
505
506pub fn create_semantic_chunker(config: SemanticChunkConfig) -> Result<Box<dyn SemanticChunker>> {
508 #[cfg(feature = "semantic-chunking")]
509 {
510 match EmbeddingSemanticChunker::new(config.clone()) {
512 Ok(chunker) => Ok(Box::new(chunker)),
513 Err(_) => Ok(Box::new(RuleBasedSemanticChunker::new(config))),
514 }
515 }
516
517 #[cfg(not(feature = "semantic-chunking"))]
518 {
519 Ok(Box::new(RuleBasedSemanticChunker::new(config)))
521 }
522}
523
#[cfg(test)]
mod tests {
    use super::*;

    /// Builds a test config with the given sizes; threshold and fallback
    /// flag are fixed at the values the tests have always used.
    fn config_with(
        target_size: usize,
        min_size: usize,
        max_size: usize,
        overlap: usize,
    ) -> SemanticChunkConfig {
        SemanticChunkConfig {
            target_size,
            min_size,
            max_size,
            similarity_threshold: 0.7,
            overlap,
            fallback_to_sentences: true,
        }
    }

    #[test]
    fn test_rule_based_chunker() {
        let chunker = RuleBasedSemanticChunker::new(config_with(100, 50, 200, 20));
        let chunks = chunker
            .chunk("Paragraph one.\n\nParagraph two.\n\nParagraph three.", None)
            .unwrap();

        assert!(!chunks.is_empty());
        assert_eq!(chunks[0].start, 0);
    }

    #[test]
    fn test_chunker_respects_min_size() {
        let chunker = RuleBasedSemanticChunker::new(config_with(1000, 100, 2000, 50));
        let chunks = chunker.chunk("Short.\n\nAlso short.", None).unwrap();

        // Either everything merged into a single chunk, or the first chunk
        // meets the configured minimum size.
        assert!(chunks.len() <= 1 || chunks[0].text.chars().count() >= 100);
    }
}
565}