// trueno_rag/chunk/mod.rs
1//! Document chunking strategies for RAG pipelines
2
3mod timestamp;
4pub use timestamp::TimestampChunker;
5
6use crate::{Document, DocumentId, Error, Result};
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9
/// Stable replacement for `str::ceil_char_boundary` (unstable).
/// Returns the smallest index >= `i` that is a valid UTF-8 char boundary.
fn ceil_char_boundary(s: &str, i: usize) -> usize {
    if i >= s.len() {
        return s.len();
    }
    // `s.len()` is always a valid boundary, so the search cannot fail.
    (i..=s.len())
        .find(|&idx| s.is_char_boundary(idx))
        .unwrap_or(s.len())
}
23
24/// Unique chunk identifier
25#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
26pub struct ChunkId(pub uuid::Uuid);
27
28impl ChunkId {
29    /// Create a new random chunk ID
30    #[must_use]
31    pub fn new() -> Self {
32        Self(uuid::Uuid::new_v4())
33    }
34}
35
36impl Default for ChunkId {
37    fn default() -> Self {
38        Self::new()
39    }
40}
41
42impl std::fmt::Display for ChunkId {
43    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
44        write!(f, "{}", self.0)
45    }
46}
47
/// Metadata associated with a chunk
///
/// Populated by the chunkers in this module from the source document
/// (title, markdown headers); serializable so it can travel with the chunk.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ChunkMetadata {
    /// Source document title
    pub title: Option<String>,
    /// Section/header hierarchy (e.g. markdown headers pushed by
    /// `StructuralChunker`)
    pub headers: Vec<String>,
    /// Page number (for PDFs)
    pub page: Option<usize>,
    /// Custom metadata
    pub custom: HashMap<String, serde_json::Value>,
}
60
61/// A chunk of text from a document
62#[derive(Debug, Clone, Serialize, Deserialize)]
63pub struct Chunk {
64    /// Unique chunk identifier
65    pub id: ChunkId,
66    /// Source document reference
67    pub document_id: DocumentId,
68    /// Chunk text content
69    pub content: String,
70    /// Character offset in source document (start)
71    pub start_offset: usize,
72    /// Character offset in source document (end)
73    pub end_offset: usize,
74    /// Metadata inherited from document
75    pub metadata: ChunkMetadata,
76    /// Embedding vector (populated after embedding)
77    pub embedding: Option<Vec<f32>>,
78}
79
80impl Chunk {
81    /// Create a new chunk
82    #[must_use]
83    pub fn new(
84        document_id: DocumentId,
85        content: String,
86        start_offset: usize,
87        end_offset: usize,
88    ) -> Self {
89        Self {
90            id: ChunkId::new(),
91            document_id,
92            content,
93            start_offset,
94            end_offset,
95            metadata: ChunkMetadata::default(),
96            embedding: None,
97        }
98    }
99
100    /// Get the length of the chunk in characters
101    #[must_use]
102    pub fn len(&self) -> usize {
103        self.content.len()
104    }
105
106    /// Check if the chunk is empty
107    #[must_use]
108    pub fn is_empty(&self) -> bool {
109        self.content.is_empty()
110    }
111
112    /// Set the embedding vector
113    pub fn set_embedding(&mut self, embedding: Vec<f32>) {
114        // Contract: embedding-algebra-v1.yaml precondition (pv codegen)
115        contract_pre_embedding_lookup!(embedding);
116        self.embedding = Some(embedding);
117    }
118}
119
/// Chunking strategy configuration
///
/// Serializable description of how a document should be chunked.
/// NOTE(review): the variants appear to mirror `FixedSizeChunker`,
/// `SentenceChunker`, `ParagraphChunker`, and `RecursiveChunker`, but no
/// mapping from this enum to a concrete `Chunker` is visible in this file —
/// confirm where it is consumed.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ChunkingStrategy {
    /// Fixed-size chunks with overlap
    FixedSize {
        /// Target chunk size in characters
        chunk_size: usize,
        /// Overlap between consecutive chunks
        overlap: usize,
    },
    /// Split on sentence boundaries
    Sentence {
        /// Maximum sentences per chunk
        max_sentences: usize,
        /// Overlap sentences between chunks
        overlap_sentences: usize,
    },
    /// Split on paragraph boundaries
    Paragraph {
        /// Maximum paragraphs per chunk
        max_paragraphs: usize,
    },
    /// Recursive character splitting
    Recursive {
        /// Separators to try in order
        separators: Vec<String>,
        /// Target chunk size
        chunk_size: usize,
        /// Overlap between chunks
        overlap: usize,
    },
}
152
153impl Default for ChunkingStrategy {
154    fn default() -> Self {
155        Self::Recursive {
156            separators: vec![
157                "\n\n".to_string(),
158                "\n".to_string(),
159                ". ".to_string(),
160                " ".to_string(),
161            ],
162            chunk_size: 512,
163            overlap: 50,
164        }
165    }
166}
167
/// Trait for document chunkers
///
/// Implementors must be `Send + Sync` so a chunker can be shared across
/// threads.
pub trait Chunker: Send + Sync {
    /// Split document into chunks
    ///
    /// # Errors
    /// Implementations in this module return `Error::EmptyDocument` for
    /// documents with no content.
    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>>;

    /// Estimate chunk count without materializing
    ///
    /// Best-effort heuristic; not guaranteed to equal `chunk(...).len()`.
    fn estimate_chunks(&self, document: &Document) -> usize;
}
176
/// Recursive chunker implementation
///
/// Tries each separator in order (coarsest first); pieces still larger than
/// `chunk_size` bytes fall through to the next separator, and finally to a
/// raw character split.
#[derive(Debug, Clone)]
pub struct RecursiveChunker {
    separators: Vec<String>,
    chunk_size: usize,
    overlap: usize,
}

impl RecursiveChunker {
    /// Create a new recursive chunker with the default separators
    /// (paragraph, line, sentence, word).
    #[must_use]
    pub fn new(chunk_size: usize, overlap: usize) -> Self {
        Self {
            separators: vec![
                "\n\n".to_string(),
                "\n".to_string(),
                ". ".to_string(),
                " ".to_string(),
            ],
            chunk_size,
            overlap,
        }
    }

    /// Create with custom separators
    #[must_use]
    pub fn with_separators(mut self, separators: Vec<String>) -> Self {
        self.separators = separators;
        self
    }

    /// Split `text` using separators starting at `separator_idx`, recursing
    /// into finer separators (and ultimately characters) for oversized pieces.
    fn split_text(&self, text: &str, separator_idx: usize) -> Vec<String> {
        if text.len() <= self.chunk_size {
            return vec![text.to_string()];
        }

        if separator_idx >= self.separators.len() {
            // Fallback: split by characters
            return self.split_by_chars(text);
        }

        let separator = &self.separators[separator_idx];
        let parts: Vec<&str> = text.split(separator).collect();

        if parts.len() == 1 {
            // Separator not found, try next
            return self.split_text(text, separator_idx + 1);
        }

        self.merge_splits(&parts, separator, separator_idx)
    }

    /// Greedily merge adjacent parts (re-joined with `separator`) up to
    /// `chunk_size` bytes; a single oversized part recurses to the next
    /// separator.
    fn merge_splits(&self, parts: &[&str], separator: &str, separator_idx: usize) -> Vec<String> {
        let mut chunks = Vec::new();
        let mut current = String::new();

        for part in parts {
            let potential = if current.is_empty() {
                (*part).to_string()
            } else {
                format!("{current}{separator}{part}")
            };

            if potential.len() <= self.chunk_size {
                current = potential;
            } else if current.is_empty() {
                // Single part too large, recurse
                chunks.extend(self.split_text(part, separator_idx + 1));
            } else {
                chunks.push(current);
                current = (*part).to_string();
            }
        }

        // Flush the trailing accumulator, recursing if it is oversized.
        if !current.is_empty() {
            if current.len() <= self.chunk_size {
                chunks.push(current);
            } else {
                chunks.extend(self.split_text(&current, separator_idx + 1));
            }
        }

        chunks
    }

    /// Last-resort split into windows of `chunk_size` characters, with
    /// `overlap` characters shared between consecutive windows.
    fn split_by_chars(&self, text: &str) -> Vec<String> {
        let chars: Vec<char> = text.chars().collect();
        let mut chunks = Vec::new();

        // Always advance by at least one character per window. The previous
        // `start = end - overlap` stepping never made progress once
        // overlap >= chunk_size, looping (and allocating) forever.
        // For overlap < chunk_size this step is identical to the old logic.
        let step = self.chunk_size.saturating_sub(self.overlap).max(1);
        let mut start = 0;

        while start < chars.len() {
            let end = (start + self.chunk_size).min(chars.len());
            let chunk: String = chars[start..end].iter().collect();
            chunks.push(chunk);

            if end >= chars.len() {
                break;
            }

            start += step;
        }

        chunks
    }

    /// Prefix every chunk after the first with the last `overlap` bytes of
    /// its predecessor (snapped up to a UTF-8 char boundary).
    fn apply_overlap(&self, chunks: Vec<String>) -> Vec<String> {
        if self.overlap == 0 || chunks.len() <= 1 {
            return chunks;
        }

        let mut result = Vec::with_capacity(chunks.len());
        for (i, chunk) in chunks.iter().enumerate() {
            if i == 0 {
                result.push(chunk.clone());
            } else {
                // Add overlap from previous chunk
                let prev = &chunks[i - 1];
                let overlap_text = if prev.len() > self.overlap {
                    // Snap to a char boundary (same policy as the module's
                    // `ceil_char_boundary` helper, inlined here).
                    let mut start = prev.len() - self.overlap;
                    while start < prev.len() && !prev.is_char_boundary(start) {
                        start += 1;
                    }
                    &prev[start..]
                } else {
                    prev.as_str()
                };
                result.push(format!("{overlap_text}{chunk}"));
            }
        }
        result
    }
}
308
impl Chunker for RecursiveChunker {
    /// Split recursively by separators, apply overlap, then locate each
    /// chunk's byte offsets in the source document.
    ///
    /// # Errors
    /// Returns `Error::EmptyDocument` when the document has no content.
    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
        if document.content.is_empty() {
            return Err(Error::EmptyDocument(
                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
            ));
        }

        let text_chunks = self.split_text(&document.content, 0);
        let overlapped = self.apply_overlap(text_chunks);

        // Search cursor into the document; advanced past each match so a
        // repeated chunk text maps to successive occurrences.
        let mut offset = 0;
        let mut chunks = Vec::new();

        for content in overlapped {
            // Snap offset to a valid char boundary
            let safe_offset = ceil_char_boundary(&document.content, offset);
            // Find actual position in document. Overlap-prefixed chunks may
            // not occur verbatim, in which case fall back to the cursor.
            let start = document.content[safe_offset..]
                .find(&content)
                .map_or(safe_offset, |pos| safe_offset + pos);
            let end = start + content.len();

            let mut chunk = Chunk::new(document.id, content, start, end);
            chunk.metadata.title = document.title.clone();

            chunks.push(chunk);
            offset = ceil_char_boundary(&document.content, start + 1);
        }

        Ok(chunks)
    }

    /// Estimate: ceil(byte_len / (chunk_size - overlap)); returns 1 when
    /// overlap >= chunk_size.
    fn estimate_chunks(&self, document: &Document) -> usize {
        if document.content.is_empty() {
            return 0;
        }
        let effective_size = self.chunk_size.saturating_sub(self.overlap);
        if effective_size == 0 {
            return 1;
        }
        (document.content.len() + effective_size - 1) / effective_size
    }
}
353
/// Fixed-size chunker implementation
///
/// Produces character windows of `chunk_size` with `overlap` characters
/// shared between consecutive windows.
#[derive(Debug, Clone)]
pub struct FixedSizeChunker {
    chunk_size: usize,
    overlap: usize,
}

impl FixedSizeChunker {
    /// Create a new fixed-size chunker
    #[must_use]
    pub fn new(chunk_size: usize, overlap: usize) -> Self {
        FixedSizeChunker { chunk_size, overlap }
    }
}
368
369impl Chunker for FixedSizeChunker {
370    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
371        if document.content.is_empty() {
372            return Err(Error::EmptyDocument(
373                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
374            ));
375        }
376
377        let chars: Vec<char> = document.content.chars().collect();
378        let mut chunks = Vec::new();
379        let mut start = 0;
380
381        while start < chars.len() {
382            let end = (start + self.chunk_size).min(chars.len());
383            let content: String = chars[start..end].iter().collect();
384
385            let byte_start = chars[..start].iter().collect::<String>().len();
386            let byte_end = byte_start + content.len();
387
388            let mut chunk = Chunk::new(document.id, content, byte_start, byte_end);
389            chunk.metadata.title = document.title.clone();
390            chunks.push(chunk);
391
392            if end >= chars.len() {
393                break;
394            }
395
396            let step = self.chunk_size.saturating_sub(self.overlap);
397            start += if step == 0 { 1 } else { step };
398        }
399
400        Ok(chunks)
401    }
402
403    fn estimate_chunks(&self, document: &Document) -> usize {
404        if document.content.is_empty() {
405            return 0;
406        }
407        let step = self.chunk_size.saturating_sub(self.overlap);
408        if step == 0 {
409            return document.content.chars().count();
410        }
411        let char_count = document.content.chars().count();
412        (char_count + step - 1) / step
413    }
414}
415
/// Semantic chunker that groups sentences by embedding similarity
///
/// Generic over the embedding backend `E`.
pub struct SemanticChunker<E: crate::embed::Embedder> {
    /// Backend used to embed each sentence.
    embedder: E,
    /// Similarity threshold (0.0 to 1.0) - chunks split when similarity drops below this
    pub similarity_threshold: f32,
    /// Maximum chunk size in bytes (the chunking loop compares `str::len`
    /// byte lengths, not character counts)
    pub max_chunk_size: usize,
}
424
impl<E: crate::embed::Embedder> SemanticChunker<E> {
    /// Create a new semantic chunker
    ///
    /// `similarity_threshold` is compared against the cosine similarity of
    /// sentence embeddings; `max_chunk_size` caps chunk size in bytes.
    pub fn new(embedder: E, similarity_threshold: f32, max_chunk_size: usize) -> Self {
        Self { embedder, similarity_threshold, max_chunk_size }
    }

    /// Split text into sentences
    ///
    /// A sentence ends at `.`, `!`, or `?` followed by whitespace or end of
    /// text; each returned slice is trimmed.
    ///
    /// NOTE(review): `SentenceChunker::split_sentences` additionally splits
    /// before an uppercase letter — confirm whether the two are meant to
    /// agree.
    fn split_sentences(text: &str) -> Vec<&str> {
        let mut sentences = Vec::new();
        let mut start = 0;

        for (i, c) in text.char_indices() {
            if c == '.' || c == '!' || c == '?' {
                // Peek at the char right after the punctuation; indexing at
                // i + len_utf8 is always a valid char boundary.
                let next_char = text[i + c.len_utf8()..].chars().next();
                if next_char.map_or(true, |nc| nc.is_whitespace()) {
                    let end = i + c.len_utf8();
                    let sentence = text[start..end].trim();
                    if !sentence.is_empty() {
                        sentences.push(sentence);
                    }
                    start = end;
                }
            }
        }

        // Trailing text without terminal punctuation becomes the last sentence.
        let remaining = text[start..].trim();
        if !remaining.is_empty() {
            sentences.push(remaining);
        }

        sentences
    }
}
458
impl<E: crate::embed::Embedder> Chunker for SemanticChunker<E> {
    /// Group consecutive sentences into chunks while their embeddings stay
    /// similar; a new chunk starts when similarity drops below
    /// `similarity_threshold` or the next sentence would exceed
    /// `max_chunk_size` bytes.
    ///
    /// # Errors
    /// Returns `Error::EmptyDocument` when the document has no content or
    /// yields no sentences.
    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
        if document.content.is_empty() {
            return Err(Error::EmptyDocument(
                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
            ));
        }

        let sentences = Self::split_sentences(&document.content);
        if sentences.is_empty() {
            return Err(Error::EmptyDocument(
                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
            ));
        }

        // Single sentence: no similarity comparison possible — emit one chunk.
        if sentences.len() == 1 {
            let content = sentences[0].to_string();
            // Best-effort byte offset; defaults to 0 when the trimmed text
            // is not found verbatim.
            let start_offset = document.content.find(&content).unwrap_or(0);
            let end_offset = start_offset + content.len();
            let mut chunk = Chunk::new(document.id, content, start_offset, end_offset);
            chunk.metadata.title = document.title.clone();
            return Ok(vec![chunk]);
        }

        // Embed all sentences up front. On embedder failure, log to stderr
        // and substitute a zero vector of the right dimension (presumably
        // forcing a split there — verify `cosine_similarity`'s handling of
        // zero-norm inputs).
        let embeddings: Vec<Vec<f32>> = sentences
            .iter()
            .map(|s| {
                self.embedder.embed(s).unwrap_or_else(|e| {
                    eprintln!("Embedding failed for sentence: {e}");
                    vec![0.0; self.embedder.dimension()]
                })
            })
            .collect();

        let mut chunks = Vec::new();
        let mut current_sentences: Vec<&str> = vec![sentences[0]];
        // Similarity is measured against the FIRST sentence of the current
        // group, not a running centroid.
        let mut current_embedding = &embeddings[0];

        for i in 1..sentences.len() {
            let similarity = crate::embed::cosine_similarity(current_embedding, &embeddings[i]);
            let current_len: usize = current_sentences.iter().map(|s| s.len()).sum();

            if similarity < self.similarity_threshold
                || current_len + sentences[i].len() > self.max_chunk_size
            {
                // Flush the current group as a chunk.
                let content = current_sentences.join(" ");
                let start_offset = document.content.find(&content).unwrap_or(0);
                let end_offset = start_offset + content.len();
                let mut chunk = Chunk::new(document.id, content, start_offset, end_offset);
                chunk.metadata.title = document.title.clone();
                chunks.push(chunk);

                current_sentences = vec![sentences[i]];
                current_embedding = &embeddings[i];
            } else {
                current_sentences.push(sentences[i]);
            }
        }

        // Flush the final group.
        if !current_sentences.is_empty() {
            let content = current_sentences.join(" ");
            let start_offset = document.content.find(&content).unwrap_or(0);
            let end_offset = start_offset + content.len();
            let mut chunk = Chunk::new(document.id, content, start_offset, end_offset);
            chunk.metadata.title = document.title.clone();
            chunks.push(chunk);
        }

        Ok(chunks)
    }

    /// Rough estimate assuming ~3 sentences per chunk; does not consult the
    /// embedder, so it can diverge substantially from `chunk()`.
    fn estimate_chunks(&self, document: &Document) -> usize {
        if document.content.is_empty() {
            return 0;
        }
        // Rough estimate based on max chunk size
        let sentences = Self::split_sentences(&document.content);
        (sentences.len() + 2) / 3 // Assume average 3 sentences per chunk
    }
}
542
/// Structural chunker that respects document structure (headers, sections)
///
/// Recognizes markdown ATX headers (`#`, `##`, ...).
#[derive(Debug, Clone)]
pub struct StructuralChunker {
    /// Whether to respect headers when chunking
    pub respect_headers: bool,
    /// Maximum section size in characters
    pub max_section_size: usize,
}

impl StructuralChunker {
    /// Create a new structural chunker
    #[must_use]
    pub fn new(respect_headers: bool, max_section_size: usize) -> Self {
        StructuralChunker { respect_headers, max_section_size }
    }

    /// Extract the title text of a markdown header line, if any.
    fn extract_header(line: &str) -> Option<String> {
        let candidate = line.trim();
        if !candidate.starts_with('#') {
            return None;
        }
        let title = candidate.trim_start_matches('#').trim();
        (!title.is_empty()).then(|| title.to_string())
    }

    /// Check if a line is a header
    fn is_header(line: &str) -> bool {
        Self::extract_header(line).is_some()
    }

    /// Split the document into `(header, section-text)` pairs. Each header
    /// line starts a new section and is kept inside that section's text;
    /// section text is trimmed.
    fn split_by_headers(text: &str) -> Vec<(Option<String>, String)> {
        let mut sections: Vec<(Option<String>, String)> = Vec::new();
        let mut header: Option<String> = None;
        let mut body = String::new();

        for line in text.lines() {
            if let Some(title) = Self::extract_header(line) {
                // Close out the previous section before starting a new one.
                if !body.trim().is_empty() || header.is_some() {
                    sections.push((header.take(), body.trim().to_string()));
                    body.clear();
                }
                header = Some(title);
            }
            body.push_str(line);
            body.push('\n');
        }

        // Close the trailing section.
        if !body.trim().is_empty() {
            sections.push((header, body.trim().to_string()));
        }

        sections
    }
}
607
impl Chunker for StructuralChunker {
    /// Chunk the document along markdown header boundaries (when
    /// `respect_headers` is set); sections larger than `max_section_size`
    /// bytes are re-split with a `RecursiveChunker`.
    ///
    /// # Errors
    /// Returns `Error::EmptyDocument` when the document has no content or no
    /// sections could be derived.
    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
        if document.content.is_empty() {
            return Err(Error::EmptyDocument(
                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
            ));
        }

        let sections = if self.respect_headers {
            Self::split_by_headers(&document.content)
        } else {
            // Header handling disabled: whole document is one section.
            vec![(None, document.content.clone())]
        };

        if sections.is_empty() {
            return Err(Error::EmptyDocument(
                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
            ));
        }

        let mut chunks = Vec::new();
        // Hoist clones and constructors outside loop (CB-518, CB-520)
        let doc_title = document.title.clone();
        let doc_source = document.source.clone();
        let doc_metadata = document.metadata.clone();
        let sub_chunker = RecursiveChunker::new(self.max_section_size, 50);

        for (header, content) in sections {
            if content.is_empty() {
                continue;
            }

            // Split large sections if needed
            if content.len() > self.max_section_size {
                // Wrap the section text in a synthetic document so the
                // recursive chunker can process it.
                // NOTE(review): the resulting sub-chunk offsets are computed
                // against the section text, not the full document — confirm
                // downstream consumers of start/end offsets tolerate this.
                let sub_doc = Document {
                    id: document.id,
                    content,
                    title: doc_title.clone(),
                    source: doc_source.clone(),
                    metadata: doc_metadata.clone(),
                };
                // Sub-chunking errors are silently skipped (best-effort).
                if let Ok(sub_chunks) = sub_chunker.chunk(&sub_doc) {
                    for mut chunk in sub_chunks {
                        if let Some(ref h) = header {
                            chunk.metadata.headers.push(h.clone());
                        }
                        chunks.push(chunk);
                    }
                }
            } else {
                // Best-effort byte offset; defaults to 0 when the trimmed
                // section text is not found verbatim.
                let start_offset = document.content.find(&content).unwrap_or(0);
                let end_offset = start_offset + content.len();
                let mut chunk = Chunk::new(document.id, content, start_offset, end_offset);
                chunk.metadata.title = doc_title.clone();
                if let Some(h) = header {
                    chunk.metadata.headers.push(h);
                }
                chunks.push(chunk);
            }
        }

        if chunks.is_empty() {
            // Fallback: return entire document as single chunk
            let content = document.content.clone();
            let mut chunk = Chunk::new(document.id, content, 0, document.content.len());
            chunk.metadata.title = document.title.clone();
            chunks.push(chunk);
        }

        Ok(chunks)
    }

    /// Estimate: one chunk per header section (at least one).
    fn estimate_chunks(&self, document: &Document) -> usize {
        if document.content.is_empty() {
            return 0;
        }
        let sections = Self::split_by_headers(&document.content);
        sections.len().max(1)
    }
}
688
/// Paragraph-based chunker
///
/// Groups blank-line-separated paragraphs into chunks.
#[derive(Debug, Clone)]
pub struct ParagraphChunker {
    max_paragraphs: usize,
}

impl ParagraphChunker {
    /// Create a new paragraph chunker
    #[must_use]
    pub fn new(max_paragraphs: usize) -> Self {
        ParagraphChunker { max_paragraphs }
    }

    /// Split `text` on blank lines, trimming each paragraph and dropping
    /// empty ones.
    fn split_paragraphs(text: &str) -> Vec<&str> {
        text.split("\n\n")
            .filter_map(|raw| {
                let para = raw.trim();
                if para.is_empty() {
                    None
                } else {
                    Some(para)
                }
            })
            .collect()
    }
}
707
708impl Chunker for ParagraphChunker {
709    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
710        if document.content.is_empty() {
711            return Err(Error::EmptyDocument(
712                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
713            ));
714        }
715
716        let paragraphs = Self::split_paragraphs(&document.content);
717        if paragraphs.is_empty() {
718            return Err(Error::EmptyDocument(
719                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
720            ));
721        }
722
723        let mut chunks = Vec::new();
724        let mut i = 0;
725
726        while i < paragraphs.len() {
727            let end = (i + self.max_paragraphs).min(paragraphs.len());
728            let content = paragraphs[i..end].join("\n\n");
729
730            let start_offset = document.content.find(&content).unwrap_or(0);
731            let end_offset = start_offset + content.len();
732
733            let mut chunk = Chunk::new(document.id, content, start_offset, end_offset);
734            chunk.metadata.title = document.title.clone();
735            chunks.push(chunk);
736
737            i = end;
738        }
739
740        Ok(chunks)
741    }
742
743    fn estimate_chunks(&self, document: &Document) -> usize {
744        if document.content.is_empty() {
745            return 0;
746        }
747        let paragraphs = Self::split_paragraphs(&document.content);
748        if self.max_paragraphs == 0 {
749            return paragraphs.len();
750        }
751        (paragraphs.len() + self.max_paragraphs - 1) / self.max_paragraphs
752    }
753}
754
/// Sentence-based chunker
///
/// Splits text into sentences and emits sliding windows of sentences.
#[derive(Debug, Clone)]
pub struct SentenceChunker {
    max_sentences: usize,
    overlap_sentences: usize,
}

impl SentenceChunker {
    /// Create a new sentence chunker
    #[must_use]
    pub fn new(max_sentences: usize, overlap_sentences: usize) -> Self {
        SentenceChunker { max_sentences, overlap_sentences }
    }

    /// Split `text` into trimmed sentences. A sentence ends at `.`, `!`, or
    /// `?` when followed by whitespace, an uppercase letter, or end of text.
    fn split_sentences(text: &str) -> Vec<&str> {
        let mut out = Vec::new();
        let mut begin = 0;

        for (idx, ch) in text.char_indices() {
            if !matches!(ch, '.' | '!' | '?') {
                continue;
            }
            let after = idx + ch.len_utf8();
            let ends_here = match text[after..].chars().next() {
                None => true,
                Some(next) => next.is_whitespace() || next.is_uppercase(),
            };
            if ends_here {
                let piece = text[begin..after].trim();
                if !piece.is_empty() {
                    out.push(piece);
                }
                begin = after;
            }
        }

        // Whatever trails the last terminator is its own sentence.
        let tail = text[begin..].trim();
        if !tail.is_empty() {
            out.push(tail);
        }

        out
    }
}
797
impl Chunker for SentenceChunker {
    /// Group sentences into chunks of `max_sentences`, sliding forward by
    /// `max_sentences - overlap_sentences` so consecutive chunks share
    /// `overlap_sentences` sentences.
    ///
    /// # Errors
    /// Returns `Error::EmptyDocument` when the document has no content.
    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
        if document.content.is_empty() {
            return Err(Error::EmptyDocument(
                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
            ));
        }

        let sentences = Self::split_sentences(&document.content);
        let mut chunks = Vec::new();
        let mut i = 0;

        while i < sentences.len() {
            let end = (i + self.max_sentences).min(sentences.len());
            // Sentences are re-joined with a single space, so the chunk text
            // may differ from the original document whitespace.
            let content = sentences[i..end].join(" ");

            // Best-effort byte offset; defaults to 0 when the re-joined text
            // is not found verbatim.
            let start_offset = document.content.find(&content).unwrap_or(0);
            let end_offset = start_offset + content.len();

            let mut chunk = Chunk::new(document.id, content, start_offset, end_offset);
            chunk.metadata.title = document.title.clone();
            chunks.push(chunk);

            // Advance by at least one sentence even when
            // overlap_sentences >= max_sentences, guaranteeing termination.
            let step = self.max_sentences.saturating_sub(self.overlap_sentences);
            i += if step == 0 { 1 } else { step };
        }

        Ok(chunks)
    }

    /// Estimate: ceil(sentences / step) where step is the slide distance.
    fn estimate_chunks(&self, document: &Document) -> usize {
        if document.content.is_empty() {
            return 0;
        }
        let sentences = Self::split_sentences(&document.content);
        let step = self.max_sentences.saturating_sub(self.overlap_sentences);
        if step == 0 {
            return sentences.len();
        }
        (sentences.len() + step - 1) / step
    }
}
840
841#[cfg(test)]
842mod tests;