1mod timestamp;
4pub use timestamp::TimestampChunker;
5
6use crate::{Document, DocumentId, Error, Result};
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9
/// Rounds `i` up to the nearest UTF-8 character boundary of `s`.
///
/// Returns `s.len()` when `i` is at or past the end of the string; otherwise
/// the smallest index `>= i` that lies on a char boundary. Stable stand-in
/// for the (unstable) `str::ceil_char_boundary`.
fn ceil_char_boundary(s: &str, i: usize) -> usize {
    if i >= s.len() {
        return s.len();
    }
    // `s.len()` is itself always a valid boundary, so the fallback is sound.
    (i..s.len()).find(|&p| s.is_char_boundary(p)).unwrap_or_else(|| s.len())
}
23
/// Unique identifier for a [`Chunk`], backed by a random (v4) UUID.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
pub struct ChunkId(pub uuid::Uuid);
27
impl ChunkId {
    /// Creates a new random (v4) chunk identifier.
    #[must_use]
    pub fn new() -> Self {
        Self(uuid::Uuid::new_v4())
    }
}
35
impl Default for ChunkId {
    /// Equivalent to [`ChunkId::new`]: a fresh random identifier.
    fn default() -> Self {
        Self::new()
    }
}
41
impl std::fmt::Display for ChunkId {
    /// Delegates to the inner UUID's `Display` implementation.
    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
        write!(f, "{}", self.0)
    }
}
47
/// Descriptive metadata attached to a [`Chunk`].
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ChunkMetadata {
    /// Title inherited from the source document, if any.
    pub title: Option<String>,
    /// Section headers (e.g. Markdown `#` headings) enclosing this chunk.
    pub headers: Vec<String>,
    /// Page number within the source document, if known.
    pub page: Option<usize>,
    /// Arbitrary user-defined key/value metadata.
    pub custom: HashMap<String, serde_json::Value>,
}
60
/// A contiguous piece of a [`Document`] produced by a [`Chunker`].
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Chunk {
    /// Unique identifier of this chunk.
    pub id: ChunkId,
    /// Identifier of the document this chunk was cut from.
    pub document_id: DocumentId,
    /// The chunk text.
    pub content: String,
    /// Byte offset in the source document where this chunk starts.
    pub start_offset: usize,
    /// Byte offset in the source document where this chunk ends (exclusive).
    pub end_offset: usize,
    /// Descriptive metadata (title, headers, page, custom fields).
    pub metadata: ChunkMetadata,
    /// Embedding vector, once computed via [`Chunk::set_embedding`].
    pub embedding: Option<Vec<f32>>,
}
79
impl Chunk {
    /// Creates a chunk with a fresh [`ChunkId`], default metadata, and no
    /// embedding.
    #[must_use]
    pub fn new(
        document_id: DocumentId,
        content: String,
        start_offset: usize,
        end_offset: usize,
    ) -> Self {
        Self {
            id: ChunkId::new(),
            document_id,
            content,
            start_offset,
            end_offset,
            metadata: ChunkMetadata::default(),
            embedding: None,
        }
    }

    /// Length of the chunk content in bytes (not characters).
    #[must_use]
    pub fn len(&self) -> usize {
        self.content.len()
    }

    /// `true` when the chunk content is empty.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.content.is_empty()
    }

    /// Attaches (or replaces) the embedding vector for this chunk.
    pub fn set_embedding(&mut self, embedding: Vec<f32>) {
        self.embedding = Some(embedding);
    }
}
117
/// Serializable configuration describing how a document should be chunked.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ChunkingStrategy {
    /// Fixed-size character windows with overlap between neighbours.
    FixedSize {
        /// Window size, in characters.
        chunk_size: usize,
        /// Characters shared between consecutive windows.
        overlap: usize,
    },
    /// Groups of sentences with sentence-level overlap.
    Sentence {
        /// Maximum sentences per chunk.
        max_sentences: usize,
        /// Sentences repeated from the previous chunk.
        overlap_sentences: usize,
    },
    /// Groups of blank-line-separated paragraphs.
    Paragraph {
        /// Maximum paragraphs per chunk.
        max_paragraphs: usize,
    },
    /// Recursive splitting on a prioritized list of separators.
    Recursive {
        /// Separators tried in order, coarsest first.
        separators: Vec<String>,
        /// Target maximum chunk size, in bytes.
        chunk_size: usize,
        /// Overlap carried between consecutive chunks, in bytes.
        overlap: usize,
    },
}
150
151impl Default for ChunkingStrategy {
152 fn default() -> Self {
153 Self::Recursive {
154 separators: vec![
155 "\n\n".to_string(),
156 "\n".to_string(),
157 ". ".to_string(),
158 " ".to_string(),
159 ],
160 chunk_size: 512,
161 overlap: 50,
162 }
163 }
164}
165
/// Strategy object that splits a [`Document`] into [`Chunk`]s.
pub trait Chunker: Send + Sync {
    /// Splits `document` into chunks.
    ///
    /// # Errors
    /// Implementations in this module return [`Error::EmptyDocument`] when
    /// the document has no usable content.
    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>>;

    /// Cheap estimate of how many chunks [`Chunker::chunk`] would produce.
    fn estimate_chunks(&self, document: &Document) -> usize;
}
174
175#[derive(Debug, Clone)]
177pub struct RecursiveChunker {
178 separators: Vec<String>,
179 chunk_size: usize,
180 overlap: usize,
181}
182
183impl RecursiveChunker {
184 #[must_use]
186 pub fn new(chunk_size: usize, overlap: usize) -> Self {
187 Self {
188 separators: vec![
189 "\n\n".to_string(),
190 "\n".to_string(),
191 ". ".to_string(),
192 " ".to_string(),
193 ],
194 chunk_size,
195 overlap,
196 }
197 }
198
199 #[must_use]
201 pub fn with_separators(mut self, separators: Vec<String>) -> Self {
202 self.separators = separators;
203 self
204 }
205
206 fn split_text(&self, text: &str, separator_idx: usize) -> Vec<String> {
207 if text.len() <= self.chunk_size {
208 return vec![text.to_string()];
209 }
210
211 if separator_idx >= self.separators.len() {
212 return self.split_by_chars(text);
214 }
215
216 let separator = &self.separators[separator_idx];
217 let parts: Vec<&str> = text.split(separator).collect();
218
219 if parts.len() == 1 {
220 return self.split_text(text, separator_idx + 1);
222 }
223
224 self.merge_splits(&parts, separator, separator_idx)
225 }
226
227 fn merge_splits(&self, parts: &[&str], separator: &str, separator_idx: usize) -> Vec<String> {
228 let mut chunks = Vec::new();
229 let mut current = String::new();
230
231 for part in parts {
232 let potential = if current.is_empty() {
233 (*part).to_string()
234 } else {
235 format!("{current}{separator}{part}")
236 };
237
238 if potential.len() <= self.chunk_size {
239 current = potential;
240 } else if current.is_empty() {
241 chunks.extend(self.split_text(part, separator_idx + 1));
243 } else {
244 chunks.push(current);
245 current = (*part).to_string();
246 }
247 }
248
249 if !current.is_empty() {
250 if current.len() <= self.chunk_size {
251 chunks.push(current);
252 } else {
253 chunks.extend(self.split_text(¤t, separator_idx + 1));
254 }
255 }
256
257 chunks
258 }
259
260 fn split_by_chars(&self, text: &str) -> Vec<String> {
261 let chars: Vec<char> = text.chars().collect();
262 let mut chunks = Vec::new();
263 let mut start = 0;
264
265 while start < chars.len() {
266 let end = (start + self.chunk_size).min(chars.len());
267 let chunk: String = chars[start..end].iter().collect();
268 chunks.push(chunk);
269
270 if end >= chars.len() {
271 break;
272 }
273
274 start = if end > self.overlap { end - self.overlap } else { end };
276 }
277
278 chunks
279 }
280
281 fn apply_overlap(&self, chunks: Vec<String>) -> Vec<String> {
282 if self.overlap == 0 || chunks.len() <= 1 {
283 return chunks;
284 }
285
286 let mut result = Vec::with_capacity(chunks.len());
287 for (i, chunk) in chunks.iter().enumerate() {
288 if i == 0 {
289 result.push(chunk.clone());
290 } else {
291 let prev = &chunks[i - 1];
293 let overlap_text = if prev.len() > self.overlap {
294 let start = prev.len() - self.overlap;
295 let start = ceil_char_boundary(prev, start);
296 &prev[start..]
297 } else {
298 prev.as_str()
299 };
300 result.push(format!("{overlap_text}{chunk}"));
301 }
302 }
303 result
304 }
305}
306
impl Chunker for RecursiveChunker {
    /// Splits `document.content` recursively, applies overlap, then locates
    /// each resulting chunk's byte offsets in the original text.
    ///
    /// # Errors
    /// Returns [`Error::EmptyDocument`] when the document has no content.
    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
        if document.content.is_empty() {
            return Err(Error::EmptyDocument(
                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
            ));
        }

        let text_chunks = self.split_text(&document.content, 0);
        let overlapped = self.apply_overlap(text_chunks);

        // `offset` advances past each located chunk so that repeated text
        // resolves to successive occurrences rather than the first one.
        let mut offset = 0;
        let mut chunks = Vec::new();

        for content in overlapped {
            let safe_offset = ceil_char_boundary(&document.content, offset);
            // Fall back to `safe_offset` when the chunk text cannot be found
            // (e.g. overlap stitching produced text absent from the source).
            let start = document.content[safe_offset..]
                .find(&content)
                .map_or(safe_offset, |pos| safe_offset + pos);
            let end = start + content.len();

            let mut chunk = Chunk::new(document.id, content, start, end);
            chunk.metadata.title = document.title.clone();

            chunks.push(chunk);
            // Resume searching just past this chunk's start, snapped to a
            // char boundary.
            offset = ceil_char_boundary(&document.content, start + 1);
        }

        Ok(chunks)
    }

    /// Estimate: content byte length divided by the effective
    /// (post-overlap) chunk size, rounded up.
    fn estimate_chunks(&self, document: &Document) -> usize {
        if document.content.is_empty() {
            return 0;
        }
        let effective_size = self.chunk_size.saturating_sub(self.overlap);
        if effective_size == 0 {
            return 1;
        }
        (document.content.len() + effective_size - 1) / effective_size
    }
}
351
/// Splits documents into fixed-size character windows with overlap.
#[derive(Debug, Clone)]
pub struct FixedSizeChunker {
    /// Window size, in characters.
    chunk_size: usize,
    /// Characters shared between consecutive windows.
    overlap: usize,
}

impl FixedSizeChunker {
    /// Creates a fixed-size chunker.
    #[must_use]
    pub fn new(chunk_size: usize, overlap: usize) -> Self {
        Self { chunk_size, overlap }
    }
}
366
367impl Chunker for FixedSizeChunker {
368 fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
369 if document.content.is_empty() {
370 return Err(Error::EmptyDocument(
371 document.title.clone().unwrap_or_else(|| "untitled".to_string()),
372 ));
373 }
374
375 let chars: Vec<char> = document.content.chars().collect();
376 let mut chunks = Vec::new();
377 let mut start = 0;
378
379 while start < chars.len() {
380 let end = (start + self.chunk_size).min(chars.len());
381 let content: String = chars[start..end].iter().collect();
382
383 let byte_start = chars[..start].iter().collect::<String>().len();
384 let byte_end = byte_start + content.len();
385
386 let mut chunk = Chunk::new(document.id, content, byte_start, byte_end);
387 chunk.metadata.title = document.title.clone();
388 chunks.push(chunk);
389
390 if end >= chars.len() {
391 break;
392 }
393
394 let step = self.chunk_size.saturating_sub(self.overlap);
395 start += if step == 0 { 1 } else { step };
396 }
397
398 Ok(chunks)
399 }
400
401 fn estimate_chunks(&self, document: &Document) -> usize {
402 if document.content.is_empty() {
403 return 0;
404 }
405 let step = self.chunk_size.saturating_sub(self.overlap);
406 if step == 0 {
407 return document.content.chars().count();
408 }
409 let char_count = document.content.chars().count();
410 (char_count + step - 1) / step
411 }
412}
413
/// Groups sentences into chunks by embedding similarity: a new chunk starts
/// when similarity to the running chunk drops below a threshold or the size
/// limit would be exceeded.
pub struct SemanticChunker<E: crate::embed::Embedder> {
    /// Embedder used to vectorize individual sentences.
    embedder: E,
    /// Cosine-similarity threshold below which a new chunk is started.
    pub similarity_threshold: f32,
    /// Maximum chunk size, in bytes.
    pub max_chunk_size: usize,
}
422
423impl<E: crate::embed::Embedder> SemanticChunker<E> {
424 pub fn new(embedder: E, similarity_threshold: f32, max_chunk_size: usize) -> Self {
426 Self { embedder, similarity_threshold, max_chunk_size }
427 }
428
429 fn split_sentences(text: &str) -> Vec<&str> {
431 let mut sentences = Vec::new();
432 let mut start = 0;
433
434 for (i, c) in text.char_indices() {
435 if c == '.' || c == '!' || c == '?' {
436 let next_char = text[i + c.len_utf8()..].chars().next();
437 if next_char.map_or(true, |nc| nc.is_whitespace()) {
438 let end = i + c.len_utf8();
439 let sentence = text[start..end].trim();
440 if !sentence.is_empty() {
441 sentences.push(sentence);
442 }
443 start = end;
444 }
445 }
446 }
447
448 let remaining = text[start..].trim();
449 if !remaining.is_empty() {
450 sentences.push(remaining);
451 }
452
453 sentences
454 }
455}
456
impl<E: crate::embed::Embedder> Chunker for SemanticChunker<E> {
    /// Embeds every sentence, then greedily grows a chunk sentence by
    /// sentence; a new chunk starts when cosine similarity drops below
    /// `similarity_threshold` or the byte-size cap would be exceeded.
    ///
    /// # Errors
    /// Returns [`Error::EmptyDocument`] when the document has no content or
    /// yields no sentences.
    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
        if document.content.is_empty() {
            return Err(Error::EmptyDocument(
                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
            ));
        }

        let sentences = Self::split_sentences(&document.content);
        if sentences.is_empty() {
            return Err(Error::EmptyDocument(
                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
            ));
        }

        // Single sentence: no similarity decisions to make.
        if sentences.len() == 1 {
            let content = sentences[0].to_string();
            let start_offset = document.content.find(&content).unwrap_or(0);
            let end_offset = start_offset + content.len();
            let mut chunk = Chunk::new(document.id, content, start_offset, end_offset);
            chunk.metadata.title = document.title.clone();
            return Ok(vec![chunk]);
        }

        // Best-effort embedding: a failed sentence degrades to a zero vector
        // (logged to stderr) instead of aborting the whole document.
        let embeddings: Vec<Vec<f32>> = sentences
            .iter()
            .map(|s| {
                self.embedder.embed(s).unwrap_or_else(|e| {
                    eprintln!("Embedding failed for sentence: {e}");
                    vec![0.0; self.embedder.dimension()]
                })
            })
            .collect();

        let mut chunks = Vec::new();
        let mut current_sentences: Vec<&str> = vec![sentences[0]];
        // NOTE(review): similarity is always measured against the FIRST
        // sentence of the running chunk, not a running centroid.
        let mut current_embedding = &embeddings[0];

        for i in 1..sentences.len() {
            let similarity = crate::embed::cosine_similarity(current_embedding, &embeddings[i]);
            let current_len: usize = current_sentences.iter().map(|s| s.len()).sum();

            if similarity < self.similarity_threshold
                || current_len + sentences[i].len() > self.max_chunk_size
            {
                // Close the running chunk and start a new one at sentence i.
                let content = current_sentences.join(" ");
                // `find` locates the first occurrence; re-joined text may not
                // appear verbatim in the source, hence the 0 fallback.
                let start_offset = document.content.find(&content).unwrap_or(0);
                let end_offset = start_offset + content.len();
                let mut chunk = Chunk::new(document.id, content, start_offset, end_offset);
                chunk.metadata.title = document.title.clone();
                chunks.push(chunk);

                current_sentences = vec![sentences[i]];
                current_embedding = &embeddings[i];
            } else {
                current_sentences.push(sentences[i]);
            }
        }

        // Flush the trailing chunk.
        if !current_sentences.is_empty() {
            let content = current_sentences.join(" ");
            let start_offset = document.content.find(&content).unwrap_or(0);
            let end_offset = start_offset + content.len();
            let mut chunk = Chunk::new(document.id, content, start_offset, end_offset);
            chunk.metadata.title = document.title.clone();
            chunks.push(chunk);
        }

        Ok(chunks)
    }

    /// Rough heuristic: assumes about three sentences per chunk.
    fn estimate_chunks(&self, document: &Document) -> usize {
        if document.content.is_empty() {
            return 0;
        }
        let sentences = Self::split_sentences(&document.content);
        (sentences.len() + 2) / 3
    }
}
540
/// Splits documents along Markdown-style `#` header lines, delegating
/// oversized sections to a [`RecursiveChunker`].
#[derive(Debug, Clone)]
pub struct StructuralChunker {
    /// When `true`, sections are cut at `#` header lines.
    pub respect_headers: bool,
    /// Sections larger than this (in bytes) are sub-chunked.
    pub max_section_size: usize,
}

impl StructuralChunker {
    /// Creates a structural chunker.
    #[must_use]
    pub fn new(respect_headers: bool, max_section_size: usize) -> Self {
        Self { respect_headers, max_section_size }
    }

    /// Returns the title text of a Markdown-style `#` heading line, or
    /// `None` when the line is not a non-empty heading.
    fn extract_header(line: &str) -> Option<String> {
        let stripped = line.trim();
        if !stripped.starts_with('#') {
            return None;
        }
        let title = stripped.trim_start_matches('#').trim();
        if title.is_empty() { None } else { Some(title.to_string()) }
    }

    /// `true` when `line` is a heading accepted by `extract_header`.
    fn is_header(line: &str) -> bool {
        Self::extract_header(line).is_some()
    }

    /// Cuts `text` into `(header, section_text)` pairs at heading lines.
    /// The heading line itself stays at the top of its section, and each
    /// section's text is trimmed.
    fn split_by_headers(text: &str) -> Vec<(Option<String>, String)> {
        let mut out = Vec::new();
        let mut header: Option<String> = None;
        let mut body = String::new();

        for line in text.lines() {
            if Self::is_header(line) {
                // Flush the previous section (if it held anything) before
                // starting a new one under this heading.
                if !body.trim().is_empty() || header.is_some() {
                    out.push((header.take(), body.trim().to_string()));
                    body = String::new();
                }
                header = Self::extract_header(line);
            }
            // Heading or not, the line belongs to the current section.
            body.push_str(line);
            body.push('\n');
        }

        if !body.trim().is_empty() {
            out.push((header, body.trim().to_string()));
        }

        out
    }
}
605
606impl Chunker for StructuralChunker {
607 fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
608 if document.content.is_empty() {
609 return Err(Error::EmptyDocument(
610 document.title.clone().unwrap_or_else(|| "untitled".to_string()),
611 ));
612 }
613
614 let sections = if self.respect_headers {
615 Self::split_by_headers(&document.content)
616 } else {
617 vec![(None, document.content.clone())]
618 };
619
620 if sections.is_empty() {
621 return Err(Error::EmptyDocument(
622 document.title.clone().unwrap_or_else(|| "untitled".to_string()),
623 ));
624 }
625
626 let mut chunks = Vec::new();
627 let doc_title = document.title.clone();
629 let doc_source = document.source.clone();
630 let doc_metadata = document.metadata.clone();
631 let sub_chunker = RecursiveChunker::new(self.max_section_size, 50);
632
633 for (header, content) in sections {
634 if content.is_empty() {
635 continue;
636 }
637
638 if content.len() > self.max_section_size {
640 let sub_doc = Document {
641 id: document.id,
642 content,
643 title: doc_title.clone(),
644 source: doc_source.clone(),
645 metadata: doc_metadata.clone(),
646 };
647 if let Ok(sub_chunks) = sub_chunker.chunk(&sub_doc) {
648 for mut chunk in sub_chunks {
649 if let Some(ref h) = header {
650 chunk.metadata.headers.push(h.clone());
651 }
652 chunks.push(chunk);
653 }
654 }
655 } else {
656 let start_offset = document.content.find(&content).unwrap_or(0);
657 let end_offset = start_offset + content.len();
658 let mut chunk = Chunk::new(document.id, content, start_offset, end_offset);
659 chunk.metadata.title = doc_title.clone();
660 if let Some(h) = header {
661 chunk.metadata.headers.push(h);
662 }
663 chunks.push(chunk);
664 }
665 }
666
667 if chunks.is_empty() {
668 let content = document.content.clone();
670 let mut chunk = Chunk::new(document.id, content, 0, document.content.len());
671 chunk.metadata.title = document.title.clone();
672 chunks.push(chunk);
673 }
674
675 Ok(chunks)
676 }
677
678 fn estimate_chunks(&self, document: &Document) -> usize {
679 if document.content.is_empty() {
680 return 0;
681 }
682 let sections = Self::split_by_headers(&document.content);
683 sections.len().max(1)
684 }
685}
686
/// Splits documents into groups of blank-line-separated paragraphs.
#[derive(Debug, Clone)]
pub struct ParagraphChunker {
    /// Maximum number of paragraphs per chunk.
    max_paragraphs: usize,
}

impl ParagraphChunker {
    /// Creates a paragraph chunker.
    #[must_use]
    pub fn new(max_paragraphs: usize) -> Self {
        Self { max_paragraphs }
    }

    /// Splits `text` on blank lines (`"\n\n"`), trimming each paragraph and
    /// dropping empty ones.
    fn split_paragraphs(text: &str) -> Vec<&str> {
        text.split("\n\n")
            .filter_map(|raw| {
                let paragraph = raw.trim();
                if paragraph.is_empty() { None } else { Some(paragraph) }
            })
            .collect()
    }
}
705
706impl Chunker for ParagraphChunker {
707 fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
708 if document.content.is_empty() {
709 return Err(Error::EmptyDocument(
710 document.title.clone().unwrap_or_else(|| "untitled".to_string()),
711 ));
712 }
713
714 let paragraphs = Self::split_paragraphs(&document.content);
715 if paragraphs.is_empty() {
716 return Err(Error::EmptyDocument(
717 document.title.clone().unwrap_or_else(|| "untitled".to_string()),
718 ));
719 }
720
721 let mut chunks = Vec::new();
722 let mut i = 0;
723
724 while i < paragraphs.len() {
725 let end = (i + self.max_paragraphs).min(paragraphs.len());
726 let content = paragraphs[i..end].join("\n\n");
727
728 let start_offset = document.content.find(&content).unwrap_or(0);
729 let end_offset = start_offset + content.len();
730
731 let mut chunk = Chunk::new(document.id, content, start_offset, end_offset);
732 chunk.metadata.title = document.title.clone();
733 chunks.push(chunk);
734
735 i = end;
736 }
737
738 Ok(chunks)
739 }
740
741 fn estimate_chunks(&self, document: &Document) -> usize {
742 if document.content.is_empty() {
743 return 0;
744 }
745 let paragraphs = Self::split_paragraphs(&document.content);
746 if self.max_paragraphs == 0 {
747 return paragraphs.len();
748 }
749 (paragraphs.len() + self.max_paragraphs - 1) / self.max_paragraphs
750 }
751}
752
/// Splits documents into sliding groups of sentences.
#[derive(Debug, Clone)]
pub struct SentenceChunker {
    /// Maximum number of sentences per chunk.
    max_sentences: usize,
    /// Sentences repeated from the end of the previous chunk.
    overlap_sentences: usize,
}

impl SentenceChunker {
    /// Creates a sentence chunker.
    #[must_use]
    pub fn new(max_sentences: usize, overlap_sentences: usize) -> Self {
        Self { max_sentences, overlap_sentences }
    }

    /// Splits `text` into trimmed sentences, cutting after `.`, `!`, or `?`
    /// when followed by whitespace, an uppercase letter, or end of input.
    fn split_sentences(text: &str) -> Vec<&str> {
        let mut out = Vec::new();
        let mut from = 0;

        for (idx, ch) in text.char_indices() {
            if !matches!(ch, '.' | '!' | '?') {
                continue;
            }
            let cut = idx + ch.len_utf8();
            // A terminator only ends a sentence at end of input or before
            // whitespace/uppercase, so most abbreviations stay intact.
            let boundary_ok = text[cut..]
                .chars()
                .next()
                .map_or(true, |nc| nc.is_whitespace() || nc.is_uppercase());
            if boundary_ok {
                let piece = text[from..cut].trim();
                if !piece.is_empty() {
                    out.push(piece);
                }
                from = cut;
            }
        }

        // Trailing text with no terminator forms the final sentence.
        let rest = text[from..].trim();
        if !rest.is_empty() {
            out.push(rest);
        }

        out
    }
}
795
impl Chunker for SentenceChunker {
    /// Emits sliding windows of up to `max_sentences` sentences, stepping by
    /// `max_sentences - overlap_sentences` so consecutive chunks share
    /// `overlap_sentences` sentences.
    ///
    /// # Errors
    /// Returns [`Error::EmptyDocument`] when the document has no content.
    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
        if document.content.is_empty() {
            return Err(Error::EmptyDocument(
                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
            ));
        }

        let sentences = Self::split_sentences(&document.content);
        let mut chunks = Vec::new();
        let mut i = 0;

        while i < sentences.len() {
            let end = (i + self.max_sentences).min(sentences.len());
            let content = sentences[i..end].join(" ");

            // Sentences were trimmed before joining, so the joined text may
            // not appear verbatim in the source; `find` then fails and the
            // offset falls back to 0.
            let start_offset = document.content.find(&content).unwrap_or(0);
            let end_offset = start_offset + content.len();

            let mut chunk = Chunk::new(document.id, content, start_offset, end_offset);
            chunk.metadata.title = document.title.clone();
            chunks.push(chunk);

            // Force progress when overlap >= max_sentences would stall.
            let step = self.max_sentences.saturating_sub(self.overlap_sentences);
            i += if step == 0 { 1 } else { step };
        }

        Ok(chunks)
    }

    /// Estimate: sentence count divided by the step size, rounded up; one
    /// chunk per sentence when the step would be zero.
    fn estimate_chunks(&self, document: &Document) -> usize {
        if document.content.is_empty() {
            return 0;
        }
        let sentences = Self::split_sentences(&document.content);
        let step = self.max_sentences.saturating_sub(self.overlap_sentences);
        if step == 0 {
            return sentences.len();
        }
        (sentences.len() + step - 1) / step
    }
}
838
839#[cfg(test)]
840mod tests;