1mod timestamp;
4pub use timestamp::TimestampChunker;
5
6use crate::{Document, DocumentId, Error, Result};
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9
/// Returns the smallest index `>= i` that lies on a UTF-8 char boundary of
/// `s`, clamped to `s.len()`.
///
/// `s.len()` is always a valid boundary, so the search below cannot fail.
fn ceil_char_boundary(s: &str, i: usize) -> usize {
    if i >= s.len() {
        return s.len();
    }
    (i..=s.len())
        .find(|&pos| s.is_char_boundary(pos))
        .unwrap_or_else(|| s.len())
}
23
/// Unique identifier for a [`Chunk`], backed by a random v4 UUID.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct ChunkId(pub uuid::Uuid);
27
28impl ChunkId {
29 #[must_use]
31 pub fn new() -> Self {
32 Self(uuid::Uuid::new_v4())
33 }
34}
35
36impl Default for ChunkId {
37 fn default() -> Self {
38 Self::new()
39 }
40}
41
42impl std::fmt::Display for ChunkId {
43 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
44 write!(f, "{}", self.0)
45 }
46}
47
/// Descriptive metadata attached to a [`Chunk`].
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ChunkMetadata {
    /// Title inherited from the source document, if any.
    pub title: Option<String>,
    /// Heading lines (e.g. Markdown `#` headers) under which the chunk falls.
    pub headers: Vec<String>,
    /// Page number within the source document, when known.
    pub page: Option<usize>,
    /// Arbitrary user-defined key/value metadata.
    pub custom: HashMap<String, serde_json::Value>,
}
60
/// A contiguous piece of a [`Document`], optionally carrying an embedding.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Chunk {
    /// Unique identifier of this chunk.
    pub id: ChunkId,
    /// Identifier of the document this chunk was cut from.
    pub document_id: DocumentId,
    /// The chunk text.
    pub content: String,
    /// Byte offset in the source document where the chunk starts.
    pub start_offset: usize,
    /// Byte offset in the source document where the chunk ends (exclusive).
    pub end_offset: usize,
    /// Descriptive metadata (title, headers, page, custom fields).
    pub metadata: ChunkMetadata,
    /// Embedding vector set via `set_embedding`; `None` until then.
    pub embedding: Option<Vec<f32>>,
}
79
80impl Chunk {
81 #[must_use]
83 pub fn new(
84 document_id: DocumentId,
85 content: String,
86 start_offset: usize,
87 end_offset: usize,
88 ) -> Self {
89 Self {
90 id: ChunkId::new(),
91 document_id,
92 content,
93 start_offset,
94 end_offset,
95 metadata: ChunkMetadata::default(),
96 embedding: None,
97 }
98 }
99
100 #[must_use]
102 pub fn len(&self) -> usize {
103 self.content.len()
104 }
105
106 #[must_use]
108 pub fn is_empty(&self) -> bool {
109 self.content.is_empty()
110 }
111
112 pub fn set_embedding(&mut self, embedding: Vec<f32>) {
114 contract_pre_embedding_lookup!(embedding);
116 self.embedding = Some(embedding);
117 }
118}
119
/// Serializable description of how a document should be chunked.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ChunkingStrategy {
    /// Fixed-size character windows.
    FixedSize {
        /// Window size in characters.
        chunk_size: usize,
        /// Characters shared between consecutive windows.
        overlap: usize,
    },
    /// Groups of sentences.
    Sentence {
        /// Maximum sentences per chunk.
        max_sentences: usize,
        /// Sentences repeated from the end of the previous chunk.
        overlap_sentences: usize,
    },
    /// Groups of blank-line-separated paragraphs.
    Paragraph {
        /// Maximum paragraphs per chunk.
        max_paragraphs: usize,
    },
    /// Recursive splitting on a separator hierarchy.
    Recursive {
        /// Separators tried in order, coarsest first.
        separators: Vec<String>,
        /// Target maximum chunk size in bytes.
        chunk_size: usize,
        /// Bytes of overlap between consecutive chunks.
        overlap: usize,
    },
}
152
153impl Default for ChunkingStrategy {
154 fn default() -> Self {
155 Self::Recursive {
156 separators: vec![
157 "\n\n".to_string(),
158 "\n".to_string(),
159 ". ".to_string(),
160 " ".to_string(),
161 ],
162 chunk_size: 512,
163 overlap: 50,
164 }
165 }
166}
167
/// Splits documents into [`Chunk`]s according to some strategy.
pub trait Chunker: Send + Sync {
    /// Splits `document` into chunks.
    ///
    /// # Errors
    ///
    /// Implementations return an error when the document cannot be chunked
    /// (e.g. it is empty).
    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>>;

    /// Cheap estimate of how many chunks [`Chunker::chunk`] would produce,
    /// without actually chunking.
    fn estimate_chunks(&self, document: &Document) -> usize;
}
176
/// Chunker that recursively splits text on a hierarchy of separators,
/// falling back to character-level windows when no separator applies.
#[derive(Debug, Clone)]
pub struct RecursiveChunker {
    /// Separators tried in order, coarsest first.
    separators: Vec<String>,
    /// Target maximum chunk size in bytes.
    chunk_size: usize,
    /// Overlap between consecutive chunks (bytes when prefixing merged
    /// chunks, characters in the character-window fallback).
    overlap: usize,
}
184
185impl RecursiveChunker {
186 #[must_use]
188 pub fn new(chunk_size: usize, overlap: usize) -> Self {
189 Self {
190 separators: vec![
191 "\n\n".to_string(),
192 "\n".to_string(),
193 ". ".to_string(),
194 " ".to_string(),
195 ],
196 chunk_size,
197 overlap,
198 }
199 }
200
201 #[must_use]
203 pub fn with_separators(mut self, separators: Vec<String>) -> Self {
204 self.separators = separators;
205 self
206 }
207
208 fn split_text(&self, text: &str, separator_idx: usize) -> Vec<String> {
209 if text.len() <= self.chunk_size {
210 return vec![text.to_string()];
211 }
212
213 if separator_idx >= self.separators.len() {
214 return self.split_by_chars(text);
216 }
217
218 let separator = &self.separators[separator_idx];
219 let parts: Vec<&str> = text.split(separator).collect();
220
221 if parts.len() == 1 {
222 return self.split_text(text, separator_idx + 1);
224 }
225
226 self.merge_splits(&parts, separator, separator_idx)
227 }
228
229 fn merge_splits(&self, parts: &[&str], separator: &str, separator_idx: usize) -> Vec<String> {
230 let mut chunks = Vec::new();
231 let mut current = String::new();
232
233 for part in parts {
234 let potential = if current.is_empty() {
235 (*part).to_string()
236 } else {
237 format!("{current}{separator}{part}")
238 };
239
240 if potential.len() <= self.chunk_size {
241 current = potential;
242 } else if current.is_empty() {
243 chunks.extend(self.split_text(part, separator_idx + 1));
245 } else {
246 chunks.push(current);
247 current = (*part).to_string();
248 }
249 }
250
251 if !current.is_empty() {
252 if current.len() <= self.chunk_size {
253 chunks.push(current);
254 } else {
255 chunks.extend(self.split_text(¤t, separator_idx + 1));
256 }
257 }
258
259 chunks
260 }
261
262 fn split_by_chars(&self, text: &str) -> Vec<String> {
263 let chars: Vec<char> = text.chars().collect();
264 let mut chunks = Vec::new();
265 let mut start = 0;
266
267 while start < chars.len() {
268 let end = (start + self.chunk_size).min(chars.len());
269 let chunk: String = chars[start..end].iter().collect();
270 chunks.push(chunk);
271
272 if end >= chars.len() {
273 break;
274 }
275
276 start = if end > self.overlap { end - self.overlap } else { end };
278 }
279
280 chunks
281 }
282
283 fn apply_overlap(&self, chunks: Vec<String>) -> Vec<String> {
284 if self.overlap == 0 || chunks.len() <= 1 {
285 return chunks;
286 }
287
288 let mut result = Vec::with_capacity(chunks.len());
289 for (i, chunk) in chunks.iter().enumerate() {
290 if i == 0 {
291 result.push(chunk.clone());
292 } else {
293 let prev = &chunks[i - 1];
295 let overlap_text = if prev.len() > self.overlap {
296 let start = prev.len() - self.overlap;
297 let start = ceil_char_boundary(prev, start);
298 &prev[start..]
299 } else {
300 prev.as_str()
301 };
302 result.push(format!("{overlap_text}{chunk}"));
303 }
304 }
305 result
306 }
307}
308
impl Chunker for RecursiveChunker {
    /// Splits the document recursively, applies overlap, then locates each
    /// chunk's byte offsets by searching forward through the source text.
    ///
    /// # Errors
    ///
    /// Returns `Error::EmptyDocument` when the document content is empty.
    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
        if document.content.is_empty() {
            return Err(Error::EmptyDocument(
                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
            ));
        }

        let text_chunks = self.split_text(&document.content, 0);
        let overlapped = self.apply_overlap(text_chunks);

        // Running search position: each chunk is searched at or after the
        // start of the previous one, so repeated text resolves in order.
        let mut offset = 0;
        let mut chunks = Vec::new();

        for content in overlapped {
            let safe_offset = ceil_char_boundary(&document.content, offset);
            // NOTE(review): an overlapped chunk may not occur verbatim in the
            // source (separators between merged parts are consumed), in which
            // case `find` fails and the offset falls back to `safe_offset`,
            // making start/end approximate — confirm this is acceptable.
            let start = document.content[safe_offset..]
                .find(&content)
                .map_or(safe_offset, |pos| safe_offset + pos);
            let end = start + content.len();

            let mut chunk = Chunk::new(document.id, content, start, end);
            chunk.metadata.title = document.title.clone();

            chunks.push(chunk);
            // Advance just past this chunk's start so the next search cannot
            // match the same position again.
            offset = ceil_char_boundary(&document.content, start + 1);
        }

        Ok(chunks)
    }

    /// Estimates `ceil(len / (chunk_size - overlap))` chunks; returns 1 when
    /// overlap swallows the whole chunk size.
    fn estimate_chunks(&self, document: &Document) -> usize {
        if document.content.is_empty() {
            return 0;
        }
        let effective_size = self.chunk_size.saturating_sub(self.overlap);
        if effective_size == 0 {
            return 1;
        }
        (document.content.len() + effective_size - 1) / effective_size
    }
}
353
/// Chunker producing fixed-size character windows with optional overlap.
#[derive(Debug, Clone)]
pub struct FixedSizeChunker {
    /// Window size in characters.
    chunk_size: usize,
    /// Characters shared between consecutive windows.
    overlap: usize,
}
360
361impl FixedSizeChunker {
362 #[must_use]
364 pub fn new(chunk_size: usize, overlap: usize) -> Self {
365 Self { chunk_size, overlap }
366 }
367}
368
369impl Chunker for FixedSizeChunker {
370 fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
371 if document.content.is_empty() {
372 return Err(Error::EmptyDocument(
373 document.title.clone().unwrap_or_else(|| "untitled".to_string()),
374 ));
375 }
376
377 let chars: Vec<char> = document.content.chars().collect();
378 let mut chunks = Vec::new();
379 let mut start = 0;
380
381 while start < chars.len() {
382 let end = (start + self.chunk_size).min(chars.len());
383 let content: String = chars[start..end].iter().collect();
384
385 let byte_start = chars[..start].iter().collect::<String>().len();
386 let byte_end = byte_start + content.len();
387
388 let mut chunk = Chunk::new(document.id, content, byte_start, byte_end);
389 chunk.metadata.title = document.title.clone();
390 chunks.push(chunk);
391
392 if end >= chars.len() {
393 break;
394 }
395
396 let step = self.chunk_size.saturating_sub(self.overlap);
397 start += if step == 0 { 1 } else { step };
398 }
399
400 Ok(chunks)
401 }
402
403 fn estimate_chunks(&self, document: &Document) -> usize {
404 if document.content.is_empty() {
405 return 0;
406 }
407 let step = self.chunk_size.saturating_sub(self.overlap);
408 if step == 0 {
409 return document.content.chars().count();
410 }
411 let char_count = document.content.chars().count();
412 (char_count + step - 1) / step
413 }
414}
415
/// Chunker that groups consecutive sentences by embedding similarity: a
/// sentence joins the current group while its cosine similarity to the
/// group's first sentence stays at or above the threshold and the group
/// stays within `max_chunk_size`.
pub struct SemanticChunker<E: crate::embed::Embedder> {
    /// Embedder used to vectorize individual sentences.
    embedder: E,
    /// Minimum cosine similarity for a sentence to join the current chunk.
    pub similarity_threshold: f32,
    /// Maximum chunk size in bytes.
    pub max_chunk_size: usize,
}
424
425impl<E: crate::embed::Embedder> SemanticChunker<E> {
426 pub fn new(embedder: E, similarity_threshold: f32, max_chunk_size: usize) -> Self {
428 Self { embedder, similarity_threshold, max_chunk_size }
429 }
430
431 fn split_sentences(text: &str) -> Vec<&str> {
433 let mut sentences = Vec::new();
434 let mut start = 0;
435
436 for (i, c) in text.char_indices() {
437 if c == '.' || c == '!' || c == '?' {
438 let next_char = text[i + c.len_utf8()..].chars().next();
439 if next_char.map_or(true, |nc| nc.is_whitespace()) {
440 let end = i + c.len_utf8();
441 let sentence = text[start..end].trim();
442 if !sentence.is_empty() {
443 sentences.push(sentence);
444 }
445 start = end;
446 }
447 }
448 }
449
450 let remaining = text[start..].trim();
451 if !remaining.is_empty() {
452 sentences.push(remaining);
453 }
454
455 sentences
456 }
457}
458
impl<E: crate::embed::Embedder> Chunker for SemanticChunker<E> {
    /// Groups sentences into chunks by embedding similarity.
    ///
    /// A new chunk starts when the next sentence's similarity to the current
    /// chunk's *first* sentence drops below `similarity_threshold`, or when
    /// adding it would exceed `max_chunk_size` bytes.
    ///
    /// # Errors
    ///
    /// Returns `Error::EmptyDocument` when the document has no content or no
    /// sentences.
    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
        if document.content.is_empty() {
            return Err(Error::EmptyDocument(
                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
            ));
        }

        let sentences = Self::split_sentences(&document.content);
        if sentences.is_empty() {
            return Err(Error::EmptyDocument(
                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
            ));
        }

        // Single-sentence fast path: no embeddings needed.
        if sentences.len() == 1 {
            let content = sentences[0].to_string();
            let start_offset = document.content.find(&content).unwrap_or(0);
            let end_offset = start_offset + content.len();
            let mut chunk = Chunk::new(document.id, content, start_offset, end_offset);
            chunk.metadata.title = document.title.clone();
            return Ok(vec![chunk]);
        }

        // Best-effort embedding: on failure, log to stderr and substitute a
        // zero vector rather than aborting the whole document.
        let embeddings: Vec<Vec<f32>> = sentences
            .iter()
            .map(|s| {
                self.embedder.embed(s).unwrap_or_else(|e| {
                    eprintln!("Embedding failed for sentence: {e}");
                    vec![0.0; self.embedder.dimension()]
                })
            })
            .collect();

        let mut chunks = Vec::new();
        let mut current_sentences: Vec<&str> = vec![sentences[0]];
        // Seed embedding of the current group; only updated when a new group
        // starts, so similarity is always measured against the group's first
        // sentence rather than a running average.
        let mut current_embedding = &embeddings[0];

        for i in 1..sentences.len() {
            let similarity = crate::embed::cosine_similarity(current_embedding, &embeddings[i]);
            let current_len: usize = current_sentences.iter().map(|s| s.len()).sum();

            if similarity < self.similarity_threshold
                || current_len + sentences[i].len() > self.max_chunk_size
            {
                // Close the current group and start a new one at sentence i.
                // NOTE(review): sentences are trimmed and re-joined with a
                // single space, so `find` can miss and fall back to offset 0
                // — confirm approximate offsets are acceptable.
                let content = current_sentences.join(" ");
                let start_offset = document.content.find(&content).unwrap_or(0);
                let end_offset = start_offset + content.len();
                let mut chunk = Chunk::new(document.id, content, start_offset, end_offset);
                chunk.metadata.title = document.title.clone();
                chunks.push(chunk);

                current_sentences = vec![sentences[i]];
                current_embedding = &embeddings[i];
            } else {
                current_sentences.push(sentences[i]);
            }
        }

        // Flush the final group.
        if !current_sentences.is_empty() {
            let content = current_sentences.join(" ");
            let start_offset = document.content.find(&content).unwrap_or(0);
            let end_offset = start_offset + content.len();
            let mut chunk = Chunk::new(document.id, content, start_offset, end_offset);
            chunk.metadata.title = document.title.clone();
            chunks.push(chunk);
        }

        Ok(chunks)
    }

    /// Rough heuristic: assumes about three sentences per chunk.
    fn estimate_chunks(&self, document: &Document) -> usize {
        if document.content.is_empty() {
            return 0;
        }
        let sentences = Self::split_sentences(&document.content);
        (sentences.len() + 2) / 3
    }
}
542
/// Chunker that splits documents into sections at Markdown-style `#`
/// heading lines.
#[derive(Debug, Clone)]
pub struct StructuralChunker {
    /// When `false`, the whole document is treated as a single section.
    pub respect_headers: bool,
    /// Sections larger than this (in bytes) are re-chunked recursively.
    pub max_section_size: usize,
}
551
552impl StructuralChunker {
553 #[must_use]
555 pub fn new(respect_headers: bool, max_section_size: usize) -> Self {
556 Self { respect_headers, max_section_size }
557 }
558
559 fn extract_header(line: &str) -> Option<String> {
561 let trimmed = line.trim();
562 if trimmed.starts_with('#') {
563 let header = trimmed.trim_start_matches('#').trim();
565 if !header.is_empty() {
566 return Some(header.to_string());
567 }
568 }
569 None
570 }
571
572 fn is_header(line: &str) -> bool {
574 Self::extract_header(line).is_some()
575 }
576
577 fn split_by_headers(text: &str) -> Vec<(Option<String>, String)> {
579 let mut sections = Vec::new();
580 let mut current_header: Option<String> = None;
581 let mut current_content = String::new();
582
583 for line in text.lines() {
584 if Self::is_header(line) {
585 if !current_content.trim().is_empty() || current_header.is_some() {
587 sections.push((current_header.take(), current_content.trim().to_string()));
588 current_content = String::new();
589 }
590 current_header = Self::extract_header(line);
591 current_content.push_str(line);
592 current_content.push('\n');
593 } else {
594 current_content.push_str(line);
595 current_content.push('\n');
596 }
597 }
598
599 if !current_content.trim().is_empty() {
601 sections.push((current_header, current_content.trim().to_string()));
602 }
603
604 sections
605 }
606}
607
impl Chunker for StructuralChunker {
    /// Splits the document into header-delimited sections; sections larger
    /// than `max_section_size` are re-chunked with a [`RecursiveChunker`].
    ///
    /// # Errors
    ///
    /// Returns `Error::EmptyDocument` when the document has no content or no
    /// sections.
    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
        if document.content.is_empty() {
            return Err(Error::EmptyDocument(
                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
            ));
        }

        let sections = if self.respect_headers {
            Self::split_by_headers(&document.content)
        } else {
            // Header handling disabled: one section covering everything.
            vec![(None, document.content.clone())]
        };

        if sections.is_empty() {
            return Err(Error::EmptyDocument(
                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
            ));
        }

        let mut chunks = Vec::new();
        // Cloned once up front so each oversized section can build a
        // sub-document without re-cloning from `document` in the loop.
        let doc_title = document.title.clone();
        let doc_source = document.source.clone();
        let doc_metadata = document.metadata.clone();
        let sub_chunker = RecursiveChunker::new(self.max_section_size, 50);

        for (header, content) in sections {
            if content.is_empty() {
                continue;
            }

            if content.len() > self.max_section_size {
                // Oversized section: wrap it in a synthetic document and
                // delegate to the recursive chunker, tagging each sub-chunk
                // with the section header.
                let sub_doc = Document {
                    id: document.id,
                    content,
                    title: doc_title.clone(),
                    source: doc_source.clone(),
                    metadata: doc_metadata.clone(),
                };
                // NOTE(review): a sub-chunker error is silently dropped here,
                // losing the section entirely — confirm this is intended.
                if let Ok(sub_chunks) = sub_chunker.chunk(&sub_doc) {
                    for mut chunk in sub_chunks {
                        if let Some(ref h) = header {
                            chunk.metadata.headers.push(h.clone());
                        }
                        chunks.push(chunk);
                    }
                }
            } else {
                // Section content is trimmed, so `find` can miss and fall
                // back to offset 0, making offsets approximate.
                let start_offset = document.content.find(&content).unwrap_or(0);
                let end_offset = start_offset + content.len();
                let mut chunk = Chunk::new(document.id, content, start_offset, end_offset);
                chunk.metadata.title = doc_title.clone();
                if let Some(h) = header {
                    chunk.metadata.headers.push(h);
                }
                chunks.push(chunk);
            }
        }

        // Fallback: never return Ok with zero chunks — emit the whole
        // document as a single chunk instead.
        if chunks.is_empty() {
            let content = document.content.clone();
            let mut chunk = Chunk::new(document.id, content, 0, document.content.len());
            chunk.metadata.title = document.title.clone();
            chunks.push(chunk);
        }

        Ok(chunks)
    }

    /// Estimates one chunk per header-delimited section (at least 1).
    fn estimate_chunks(&self, document: &Document) -> usize {
        if document.content.is_empty() {
            return 0;
        }
        let sections = Self::split_by_headers(&document.content);
        sections.len().max(1)
    }
}
688
/// Chunker that groups blank-line-separated paragraphs.
#[derive(Debug, Clone)]
pub struct ParagraphChunker {
    /// Maximum number of paragraphs per chunk.
    max_paragraphs: usize,
}
694
695impl ParagraphChunker {
696 #[must_use]
698 pub fn new(max_paragraphs: usize) -> Self {
699 Self { max_paragraphs }
700 }
701
702 fn split_paragraphs(text: &str) -> Vec<&str> {
704 text.split("\n\n").map(|p| p.trim()).filter(|p| !p.is_empty()).collect()
705 }
706}
707
708impl Chunker for ParagraphChunker {
709 fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
710 if document.content.is_empty() {
711 return Err(Error::EmptyDocument(
712 document.title.clone().unwrap_or_else(|| "untitled".to_string()),
713 ));
714 }
715
716 let paragraphs = Self::split_paragraphs(&document.content);
717 if paragraphs.is_empty() {
718 return Err(Error::EmptyDocument(
719 document.title.clone().unwrap_or_else(|| "untitled".to_string()),
720 ));
721 }
722
723 let mut chunks = Vec::new();
724 let mut i = 0;
725
726 while i < paragraphs.len() {
727 let end = (i + self.max_paragraphs).min(paragraphs.len());
728 let content = paragraphs[i..end].join("\n\n");
729
730 let start_offset = document.content.find(&content).unwrap_or(0);
731 let end_offset = start_offset + content.len();
732
733 let mut chunk = Chunk::new(document.id, content, start_offset, end_offset);
734 chunk.metadata.title = document.title.clone();
735 chunks.push(chunk);
736
737 i = end;
738 }
739
740 Ok(chunks)
741 }
742
743 fn estimate_chunks(&self, document: &Document) -> usize {
744 if document.content.is_empty() {
745 return 0;
746 }
747 let paragraphs = Self::split_paragraphs(&document.content);
748 if self.max_paragraphs == 0 {
749 return paragraphs.len();
750 }
751 (paragraphs.len() + self.max_paragraphs - 1) / self.max_paragraphs
752 }
753}
754
/// Chunker that groups sentences, with optional sentence-level overlap
/// between consecutive chunks.
#[derive(Debug, Clone)]
pub struct SentenceChunker {
    /// Maximum sentences per chunk.
    max_sentences: usize,
    /// Sentences repeated from the end of the previous chunk.
    overlap_sentences: usize,
}
761
762impl SentenceChunker {
763 #[must_use]
765 pub fn new(max_sentences: usize, overlap_sentences: usize) -> Self {
766 Self { max_sentences, overlap_sentences }
767 }
768
769 fn split_sentences(text: &str) -> Vec<&str> {
770 let mut sentences = Vec::new();
771 let mut start = 0;
772
773 for (i, c) in text.char_indices() {
774 if c == '.' || c == '!' || c == '?' {
775 let next_char = text[i + c.len_utf8()..].chars().next();
777 if next_char.map_or(true, |nc| nc.is_whitespace() || nc.is_uppercase()) {
778 let end = i + c.len_utf8();
779 let sentence = text[start..end].trim();
780 if !sentence.is_empty() {
781 sentences.push(sentence);
782 }
783 start = end;
784 }
785 }
786 }
787
788 let remaining = text[start..].trim();
790 if !remaining.is_empty() {
791 sentences.push(remaining);
792 }
793
794 sentences
795 }
796}
797
impl Chunker for SentenceChunker {
    /// Groups up to `max_sentences` sentences per chunk, stepping by
    /// `max_sentences - overlap_sentences` (minimum 1) so consecutive
    /// chunks share `overlap_sentences` sentences.
    ///
    /// # Errors
    ///
    /// Returns `Error::EmptyDocument` when the document content is empty.
    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
        if document.content.is_empty() {
            return Err(Error::EmptyDocument(
                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
            ));
        }

        let sentences = Self::split_sentences(&document.content);
        let mut chunks = Vec::new();
        let mut i = 0;

        while i < sentences.len() {
            let end = (i + self.max_sentences).min(sentences.len());
            let content = sentences[i..end].join(" ");

            // NOTE(review): sentences are trimmed and re-joined with a single
            // space, so `find` can miss and fall back to offset 0 — confirm
            // approximate offsets are acceptable.
            let start_offset = document.content.find(&content).unwrap_or(0);
            let end_offset = start_offset + content.len();

            let mut chunk = Chunk::new(document.id, content, start_offset, end_offset);
            chunk.metadata.title = document.title.clone();
            chunks.push(chunk);

            // Advance by at least one sentence so a degenerate overlap
            // (overlap_sentences >= max_sentences) still terminates.
            let step = self.max_sentences.saturating_sub(self.overlap_sentences);
            i += if step == 0 { 1 } else { step };
        }

        Ok(chunks)
    }

    /// Estimates `ceil(sentence_count / step)`; a zero step counts one chunk
    /// per sentence, matching the loop's minimum advance of 1.
    fn estimate_chunks(&self, document: &Document) -> usize {
        if document.content.is_empty() {
            return 0;
        }
        let sentences = Self::split_sentences(&document.content);
        let step = self.max_sentences.saturating_sub(self.overlap_sentences);
        if step == 0 {
            return sentences.len();
        }
        (sentences.len() + step - 1) / step
    }
}
840
841#[cfg(test)]
842mod tests;