// trueno_rag/chunk/mod.rs
1//! Document chunking strategies for RAG pipelines
2
3mod timestamp;
4pub use timestamp::TimestampChunker;
5
6use crate::{Document, DocumentId, Error, Result};
7use serde::{Deserialize, Serialize};
8use std::collections::HashMap;
9
/// Stable replacement for `str::ceil_char_boundary` (unstable).
/// Returns the smallest index >= `i` that is a valid UTF-8 char boundary.
fn ceil_char_boundary(s: &str, i: usize) -> usize {
    // Clamp to the end of the string, then walk forward until we land on a
    // boundary; `s.len()` is itself always a valid boundary, so this terminates.
    let mut idx = i.min(s.len());
    while !s.is_char_boundary(idx) {
        idx += 1;
    }
    idx
}
23
24/// Unique chunk identifier
25#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash, Serialize, Deserialize)]
26pub struct ChunkId(pub uuid::Uuid);
27
28impl ChunkId {
29    /// Create a new random chunk ID
30    #[must_use]
31    pub fn new() -> Self {
32        Self(uuid::Uuid::new_v4())
33    }
34}
35
36impl Default for ChunkId {
37    fn default() -> Self {
38        Self::new()
39    }
40}
41
42impl std::fmt::Display for ChunkId {
43    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
44        write!(f, "{}", self.0)
45    }
46}
47
/// Metadata associated with a chunk.
///
/// Copied from the source document at chunking time; every field defaults
/// to empty via `Default`.
#[derive(Debug, Clone, Default, Serialize, Deserialize)]
pub struct ChunkMetadata {
    /// Source document title, if the document had one.
    pub title: Option<String>,
    /// Section/header hierarchy this chunk belongs to.
    pub headers: Vec<String>,
    /// Page number (for paginated sources such as PDFs).
    pub page: Option<usize>,
    /// Arbitrary custom metadata as JSON values, keyed by name.
    pub custom: HashMap<String, serde_json::Value>,
}
60
/// A chunk of text from a document.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub struct Chunk {
    /// Unique chunk identifier
    pub id: ChunkId,
    /// Source document reference
    pub document_id: DocumentId,
    /// Chunk text content
    pub content: String,
    /// Start offset in the source document. NOTE(review): the chunkers in
    /// this module compute BYTE offsets (`str::len`), not character counts,
    /// despite some comments elsewhere saying "characters".
    pub start_offset: usize,
    /// End offset in the source document (exclusive; `start_offset + content.len()`).
    pub end_offset: usize,
    /// Metadata inherited from document
    pub metadata: ChunkMetadata,
    /// Embedding vector (populated after embedding)
    pub embedding: Option<Vec<f32>>,
}
79
impl Chunk {
    /// Create a new chunk with a random ID, default metadata, and no embedding.
    ///
    /// `start_offset` / `end_offset` are byte offsets into the source document.
    #[must_use]
    pub fn new(
        document_id: DocumentId,
        content: String,
        start_offset: usize,
        end_offset: usize,
    ) -> Self {
        Self {
            id: ChunkId::new(),
            document_id,
            content,
            start_offset,
            end_offset,
            metadata: ChunkMetadata::default(),
            embedding: None,
        }
    }

    /// Get the length of the chunk content in bytes (not characters —
    /// `String::len` is a byte count).
    #[must_use]
    pub fn len(&self) -> usize {
        self.content.len()
    }

    /// Check if the chunk content is empty.
    #[must_use]
    pub fn is_empty(&self) -> bool {
        self.content.is_empty()
    }

    /// Set the embedding vector, replacing any existing one.
    pub fn set_embedding(&mut self, embedding: Vec<f32>) {
        self.embedding = Some(embedding);
    }
}
117
/// Chunking strategy configuration.
///
/// Serializable description of how a document should be split; the variants
/// mirror the concrete chunker types in this module.
#[derive(Debug, Clone, Serialize, Deserialize)]
pub enum ChunkingStrategy {
    /// Fixed-size chunks with overlap
    FixedSize {
        /// Target chunk size in characters
        chunk_size: usize,
        /// Overlap between consecutive chunks
        overlap: usize,
    },
    /// Split on sentence boundaries
    Sentence {
        /// Maximum sentences per chunk
        max_sentences: usize,
        /// Overlap sentences between chunks
        overlap_sentences: usize,
    },
    /// Split on paragraph boundaries
    Paragraph {
        /// Maximum paragraphs per chunk
        max_paragraphs: usize,
    },
    /// Recursive character splitting
    Recursive {
        /// Separators to try in order (coarsest first)
        separators: Vec<String>,
        /// Target chunk size. NOTE(review): `RecursiveChunker` compares byte
        /// lengths (`str::len`), not character counts.
        chunk_size: usize,
        /// Overlap between chunks
        overlap: usize,
    },
}
150
151impl Default for ChunkingStrategy {
152    fn default() -> Self {
153        Self::Recursive {
154            separators: vec![
155                "\n\n".to_string(),
156                "\n".to_string(),
157                ". ".to_string(),
158                " ".to_string(),
159            ],
160            chunk_size: 512,
161            overlap: 50,
162        }
163    }
164}
165
/// Trait for document chunkers.
pub trait Chunker: Send + Sync {
    /// Split document into chunks.
    ///
    /// # Errors
    /// Implementations in this module return [`Error::EmptyDocument`] when
    /// the document has no content to chunk.
    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>>;

    /// Estimate chunk count without materializing the chunks (cheap and
    /// approximate; see each implementation for its heuristic).
    fn estimate_chunks(&self, document: &Document) -> usize;
}
174
175/// Recursive chunker implementation
176#[derive(Debug, Clone)]
177pub struct RecursiveChunker {
178    separators: Vec<String>,
179    chunk_size: usize,
180    overlap: usize,
181}
182
183impl RecursiveChunker {
184    /// Create a new recursive chunker
185    #[must_use]
186    pub fn new(chunk_size: usize, overlap: usize) -> Self {
187        Self {
188            separators: vec![
189                "\n\n".to_string(),
190                "\n".to_string(),
191                ". ".to_string(),
192                " ".to_string(),
193            ],
194            chunk_size,
195            overlap,
196        }
197    }
198
199    /// Create with custom separators
200    #[must_use]
201    pub fn with_separators(mut self, separators: Vec<String>) -> Self {
202        self.separators = separators;
203        self
204    }
205
206    fn split_text(&self, text: &str, separator_idx: usize) -> Vec<String> {
207        if text.len() <= self.chunk_size {
208            return vec![text.to_string()];
209        }
210
211        if separator_idx >= self.separators.len() {
212            // Fallback: split by characters
213            return self.split_by_chars(text);
214        }
215
216        let separator = &self.separators[separator_idx];
217        let parts: Vec<&str> = text.split(separator).collect();
218
219        if parts.len() == 1 {
220            // Separator not found, try next
221            return self.split_text(text, separator_idx + 1);
222        }
223
224        self.merge_splits(&parts, separator, separator_idx)
225    }
226
227    fn merge_splits(&self, parts: &[&str], separator: &str, separator_idx: usize) -> Vec<String> {
228        let mut chunks = Vec::new();
229        let mut current = String::new();
230
231        for part in parts {
232            let potential = if current.is_empty() {
233                (*part).to_string()
234            } else {
235                format!("{current}{separator}{part}")
236            };
237
238            if potential.len() <= self.chunk_size {
239                current = potential;
240            } else if current.is_empty() {
241                // Single part too large, recurse
242                chunks.extend(self.split_text(part, separator_idx + 1));
243            } else {
244                chunks.push(current);
245                current = (*part).to_string();
246            }
247        }
248
249        if !current.is_empty() {
250            if current.len() <= self.chunk_size {
251                chunks.push(current);
252            } else {
253                chunks.extend(self.split_text(&current, separator_idx + 1));
254            }
255        }
256
257        chunks
258    }
259
260    fn split_by_chars(&self, text: &str) -> Vec<String> {
261        let chars: Vec<char> = text.chars().collect();
262        let mut chunks = Vec::new();
263        let mut start = 0;
264
265        while start < chars.len() {
266            let end = (start + self.chunk_size).min(chars.len());
267            let chunk: String = chars[start..end].iter().collect();
268            chunks.push(chunk);
269
270            if end >= chars.len() {
271                break;
272            }
273
274            // Move start, accounting for overlap
275            start = if end > self.overlap { end - self.overlap } else { end };
276        }
277
278        chunks
279    }
280
281    fn apply_overlap(&self, chunks: Vec<String>) -> Vec<String> {
282        if self.overlap == 0 || chunks.len() <= 1 {
283            return chunks;
284        }
285
286        let mut result = Vec::with_capacity(chunks.len());
287        for (i, chunk) in chunks.iter().enumerate() {
288            if i == 0 {
289                result.push(chunk.clone());
290            } else {
291                // Add overlap from previous chunk
292                let prev = &chunks[i - 1];
293                let overlap_text = if prev.len() > self.overlap {
294                    let start = prev.len() - self.overlap;
295                    let start = ceil_char_boundary(prev, start);
296                    &prev[start..]
297                } else {
298                    prev.as_str()
299                };
300                result.push(format!("{overlap_text}{chunk}"));
301            }
302        }
303        result
304    }
305}
306
impl Chunker for RecursiveChunker {
    /// Recursively splits the document, then stitches overlap text onto each
    /// chunk and recovers byte offsets by searching the source text.
    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
        if document.content.is_empty() {
            return Err(Error::EmptyDocument(
                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
            ));
        }

        let text_chunks = self.split_text(&document.content, 0);
        let overlapped = self.apply_overlap(text_chunks);

        let mut offset = 0;
        let mut chunks = Vec::new();

        for content in overlapped {
            // Snap offset to a valid char boundary
            let safe_offset = ceil_char_boundary(&document.content, offset);
            // Find actual position in document. NOTE(review): overlap-prefixed
            // chunks may not occur verbatim in the source (the overlap text and
            // chunk were joined without the original separator); in that case
            // the offset falls back to `safe_offset` and is only approximate.
            let start = document.content[safe_offset..]
                .find(&content)
                .map_or(safe_offset, |pos| safe_offset + pos);
            // Offsets are byte-based (`str::len`).
            let end = start + content.len();

            let mut chunk = Chunk::new(document.id, content, start, end);
            chunk.metadata.title = document.title.clone();

            chunks.push(chunk);
            // Resume the next search just past this chunk's start so repeated
            // text maps to successive occurrences rather than the same one.
            offset = ceil_char_boundary(&document.content, start + 1);
        }

        Ok(chunks)
    }

    /// Ceiling division of content length (bytes) by the effective stride
    /// (`chunk_size - overlap`); returns 1 when the stride would be zero.
    fn estimate_chunks(&self, document: &Document) -> usize {
        if document.content.is_empty() {
            return 0;
        }
        let effective_size = self.chunk_size.saturating_sub(self.overlap);
        if effective_size == 0 {
            return 1;
        }
        (document.content.len() + effective_size - 1) / effective_size
    }
}
351
/// Fixed-size chunker implementation
#[derive(Debug, Clone)]
pub struct FixedSizeChunker {
    /// Number of characters per chunk.
    chunk_size: usize,
    /// Characters shared between consecutive chunks.
    overlap: usize,
}

impl FixedSizeChunker {
    /// Create a new fixed-size chunker; both sizes are in characters.
    #[must_use]
    pub fn new(chunk_size: usize, overlap: usize) -> Self {
        FixedSizeChunker { chunk_size, overlap }
    }
}
366
367impl Chunker for FixedSizeChunker {
368    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
369        if document.content.is_empty() {
370            return Err(Error::EmptyDocument(
371                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
372            ));
373        }
374
375        let chars: Vec<char> = document.content.chars().collect();
376        let mut chunks = Vec::new();
377        let mut start = 0;
378
379        while start < chars.len() {
380            let end = (start + self.chunk_size).min(chars.len());
381            let content: String = chars[start..end].iter().collect();
382
383            let byte_start = chars[..start].iter().collect::<String>().len();
384            let byte_end = byte_start + content.len();
385
386            let mut chunk = Chunk::new(document.id, content, byte_start, byte_end);
387            chunk.metadata.title = document.title.clone();
388            chunks.push(chunk);
389
390            if end >= chars.len() {
391                break;
392            }
393
394            let step = self.chunk_size.saturating_sub(self.overlap);
395            start += if step == 0 { 1 } else { step };
396        }
397
398        Ok(chunks)
399    }
400
401    fn estimate_chunks(&self, document: &Document) -> usize {
402        if document.content.is_empty() {
403            return 0;
404        }
405        let step = self.chunk_size.saturating_sub(self.overlap);
406        if step == 0 {
407            return document.content.chars().count();
408        }
409        let char_count = document.content.chars().count();
410        (char_count + step - 1) / step
411    }
412}
413
414/// Semantic chunker that groups sentences by embedding similarity
415pub struct SemanticChunker<E: crate::embed::Embedder> {
416    embedder: E,
417    /// Similarity threshold (0.0 to 1.0) - chunks split when similarity drops below this
418    pub similarity_threshold: f32,
419    /// Maximum chunk size in characters
420    pub max_chunk_size: usize,
421}
422
423impl<E: crate::embed::Embedder> SemanticChunker<E> {
424    /// Create a new semantic chunker
425    pub fn new(embedder: E, similarity_threshold: f32, max_chunk_size: usize) -> Self {
426        Self { embedder, similarity_threshold, max_chunk_size }
427    }
428
429    /// Split text into sentences
430    fn split_sentences(text: &str) -> Vec<&str> {
431        let mut sentences = Vec::new();
432        let mut start = 0;
433
434        for (i, c) in text.char_indices() {
435            if c == '.' || c == '!' || c == '?' {
436                let next_char = text[i + c.len_utf8()..].chars().next();
437                if next_char.map_or(true, |nc| nc.is_whitespace()) {
438                    let end = i + c.len_utf8();
439                    let sentence = text[start..end].trim();
440                    if !sentence.is_empty() {
441                        sentences.push(sentence);
442                    }
443                    start = end;
444                }
445            }
446        }
447
448        let remaining = text[start..].trim();
449        if !remaining.is_empty() {
450            sentences.push(remaining);
451        }
452
453        sentences
454    }
455}
456
impl<E: crate::embed::Embedder> Chunker for SemanticChunker<E> {
    /// Groups consecutive sentences into chunks; a new chunk starts when the
    /// cosine similarity to the current chunk's first sentence drops below
    /// the threshold or the byte-length budget would be exceeded.
    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
        if document.content.is_empty() {
            return Err(Error::EmptyDocument(
                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
            ));
        }

        let sentences = Self::split_sentences(&document.content);
        if sentences.is_empty() {
            return Err(Error::EmptyDocument(
                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
            ));
        }

        // Single sentence: no similarity comparison possible, emit one chunk.
        if sentences.len() == 1 {
            let content = sentences[0].to_string();
            let start_offset = document.content.find(&content).unwrap_or(0);
            let end_offset = start_offset + content.len();
            let mut chunk = Chunk::new(document.id, content, start_offset, end_offset);
            chunk.metadata.title = document.title.clone();
            return Ok(vec![chunk]);
        }

        // Embed all sentences up front. NOTE(review): embedding failures are
        // only logged and replaced with a zero vector instead of propagated,
        // so a failing embedder silently degrades chunk quality.
        let embeddings: Vec<Vec<f32>> = sentences
            .iter()
            .map(|s| {
                self.embedder.embed(s).unwrap_or_else(|e| {
                    eprintln!("Embedding failed for sentence: {e}");
                    vec![0.0; self.embedder.dimension()]
                })
            })
            .collect();

        let mut chunks = Vec::new();
        let mut current_sentences: Vec<&str> = vec![sentences[0]];
        // Similarity is always measured against the FIRST sentence of the
        // current chunk (not a centroid of accumulated sentences).
        let mut current_embedding = &embeddings[0];

        for i in 1..sentences.len() {
            let similarity = crate::embed::cosine_similarity(current_embedding, &embeddings[i]);
            // Byte length of the accumulated chunk (joins are not counted).
            let current_len: usize = current_sentences.iter().map(|s| s.len()).sum();

            if similarity < self.similarity_threshold
                || current_len + sentences[i].len() > self.max_chunk_size
            {
                // Flush the current chunk. Offsets are recovered via `find`,
                // so repeated text may map to an earlier occurrence.
                let content = current_sentences.join(" ");
                let start_offset = document.content.find(&content).unwrap_or(0);
                let end_offset = start_offset + content.len();
                let mut chunk = Chunk::new(document.id, content, start_offset, end_offset);
                chunk.metadata.title = document.title.clone();
                chunks.push(chunk);

                current_sentences = vec![sentences[i]];
                current_embedding = &embeddings[i];
            } else {
                current_sentences.push(sentences[i]);
            }
        }

        // Flush the final chunk (always non-empty after the loop).
        if !current_sentences.is_empty() {
            let content = current_sentences.join(" ");
            let start_offset = document.content.find(&content).unwrap_or(0);
            let end_offset = start_offset + content.len();
            let mut chunk = Chunk::new(document.id, content, start_offset, end_offset);
            chunk.metadata.title = document.title.clone();
            chunks.push(chunk);
        }

        Ok(chunks)
    }

    /// Rough estimate only: assumes an average of three sentences per chunk.
    fn estimate_chunks(&self, document: &Document) -> usize {
        if document.content.is_empty() {
            return 0;
        }
        // Rough estimate based on max chunk size
        let sentences = Self::split_sentences(&document.content);
        (sentences.len() + 2) / 3 // Assume average 3 sentences per chunk
    }
}
540
/// Structural chunker that respects document structure (headers, sections)
#[derive(Debug, Clone)]
pub struct StructuralChunker {
    /// Whether to respect headers when chunking
    pub respect_headers: bool,
    /// Maximum section size in characters
    pub max_section_size: usize,
}

impl StructuralChunker {
    /// Create a new structural chunker.
    #[must_use]
    pub fn new(respect_headers: bool, max_section_size: usize) -> Self {
        Self { respect_headers, max_section_size }
    }

    /// Extract the text of a Markdown-style header (`# ...`), if `line` is one.
    fn extract_header(line: &str) -> Option<String> {
        let trimmed = line.trim();
        // Strip the leading `#`s, then require some non-empty header text.
        let rest = trimmed.strip_prefix('#')?;
        let header = rest.trim_start_matches('#').trim();
        (!header.is_empty()).then(|| header.to_string())
    }

    /// Check if a line is a header.
    fn is_header(line: &str) -> bool {
        Self::extract_header(line).is_some()
    }

    /// Split text into `(header, section-text)` pairs; the header line itself
    /// remains part of the section text.
    fn split_by_headers(text: &str) -> Vec<(Option<String>, String)> {
        let mut sections = Vec::new();
        let mut current_header: Option<String> = None;
        let mut current_content = String::new();

        for line in text.lines() {
            if let Some(header) = Self::extract_header(line) {
                // A new header closes the previous section (if any).
                if !current_content.trim().is_empty() || current_header.is_some() {
                    sections.push((current_header.take(), current_content.trim().to_string()));
                    current_content.clear();
                }
                current_header = Some(header);
            }
            current_content.push_str(line);
            current_content.push('\n');
        }

        // Flush the trailing section.
        if !current_content.trim().is_empty() {
            sections.push((current_header, current_content.trim().to_string()));
        }

        sections
    }
}
605
606impl Chunker for StructuralChunker {
607    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
608        if document.content.is_empty() {
609            return Err(Error::EmptyDocument(
610                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
611            ));
612        }
613
614        let sections = if self.respect_headers {
615            Self::split_by_headers(&document.content)
616        } else {
617            vec![(None, document.content.clone())]
618        };
619
620        if sections.is_empty() {
621            return Err(Error::EmptyDocument(
622                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
623            ));
624        }
625
626        let mut chunks = Vec::new();
627        // Hoist clones and constructors outside loop (CB-518, CB-520)
628        let doc_title = document.title.clone();
629        let doc_source = document.source.clone();
630        let doc_metadata = document.metadata.clone();
631        let sub_chunker = RecursiveChunker::new(self.max_section_size, 50);
632
633        for (header, content) in sections {
634            if content.is_empty() {
635                continue;
636            }
637
638            // Split large sections if needed
639            if content.len() > self.max_section_size {
640                let sub_doc = Document {
641                    id: document.id,
642                    content,
643                    title: doc_title.clone(),
644                    source: doc_source.clone(),
645                    metadata: doc_metadata.clone(),
646                };
647                if let Ok(sub_chunks) = sub_chunker.chunk(&sub_doc) {
648                    for mut chunk in sub_chunks {
649                        if let Some(ref h) = header {
650                            chunk.metadata.headers.push(h.clone());
651                        }
652                        chunks.push(chunk);
653                    }
654                }
655            } else {
656                let start_offset = document.content.find(&content).unwrap_or(0);
657                let end_offset = start_offset + content.len();
658                let mut chunk = Chunk::new(document.id, content, start_offset, end_offset);
659                chunk.metadata.title = doc_title.clone();
660                if let Some(h) = header {
661                    chunk.metadata.headers.push(h);
662                }
663                chunks.push(chunk);
664            }
665        }
666
667        if chunks.is_empty() {
668            // Fallback: return entire document as single chunk
669            let content = document.content.clone();
670            let mut chunk = Chunk::new(document.id, content, 0, document.content.len());
671            chunk.metadata.title = document.title.clone();
672            chunks.push(chunk);
673        }
674
675        Ok(chunks)
676    }
677
678    fn estimate_chunks(&self, document: &Document) -> usize {
679        if document.content.is_empty() {
680            return 0;
681        }
682        let sections = Self::split_by_headers(&document.content);
683        sections.len().max(1)
684    }
685}
686
/// Paragraph-based chunker
#[derive(Debug, Clone)]
pub struct ParagraphChunker {
    // Maximum number of paragraphs grouped into one chunk.
    max_paragraphs: usize,
}

impl ParagraphChunker {
    /// Create a new paragraph chunker.
    #[must_use]
    pub fn new(max_paragraphs: usize) -> Self {
        Self { max_paragraphs }
    }

    /// Split text on blank lines into trimmed, non-empty paragraphs.
    fn split_paragraphs(text: &str) -> Vec<&str> {
        text.split("\n\n")
            .filter_map(|para| {
                let para = para.trim();
                (!para.is_empty()).then_some(para)
            })
            .collect()
    }
}
705
706impl Chunker for ParagraphChunker {
707    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
708        if document.content.is_empty() {
709            return Err(Error::EmptyDocument(
710                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
711            ));
712        }
713
714        let paragraphs = Self::split_paragraphs(&document.content);
715        if paragraphs.is_empty() {
716            return Err(Error::EmptyDocument(
717                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
718            ));
719        }
720
721        let mut chunks = Vec::new();
722        let mut i = 0;
723
724        while i < paragraphs.len() {
725            let end = (i + self.max_paragraphs).min(paragraphs.len());
726            let content = paragraphs[i..end].join("\n\n");
727
728            let start_offset = document.content.find(&content).unwrap_or(0);
729            let end_offset = start_offset + content.len();
730
731            let mut chunk = Chunk::new(document.id, content, start_offset, end_offset);
732            chunk.metadata.title = document.title.clone();
733            chunks.push(chunk);
734
735            i = end;
736        }
737
738        Ok(chunks)
739    }
740
741    fn estimate_chunks(&self, document: &Document) -> usize {
742        if document.content.is_empty() {
743            return 0;
744        }
745        let paragraphs = Self::split_paragraphs(&document.content);
746        if self.max_paragraphs == 0 {
747            return paragraphs.len();
748        }
749        (paragraphs.len() + self.max_paragraphs - 1) / self.max_paragraphs
750    }
751}
752
/// Sentence-based chunker
#[derive(Debug, Clone)]
pub struct SentenceChunker {
    // Maximum sentences per chunk.
    max_sentences: usize,
    // Sentences repeated from the previous chunk.
    overlap_sentences: usize,
}

impl SentenceChunker {
    /// Create a new sentence chunker.
    #[must_use]
    pub fn new(max_sentences: usize, overlap_sentences: usize) -> Self {
        Self { max_sentences, overlap_sentences }
    }

    /// Split text into sentences: a sentence ends at `.`, `!`, or `?` when
    /// followed by whitespace, an uppercase letter, or the end of the input.
    fn split_sentences(text: &str) -> Vec<&str> {
        let mut out = Vec::new();
        let mut begin = 0;

        for (idx, ch) in text.char_indices() {
            if !matches!(ch, '.' | '!' | '?') {
                continue;
            }
            let after = idx + ch.len_utf8();
            let ends_here = text[after..]
                .chars()
                .next()
                .map_or(true, |nc| nc.is_whitespace() || nc.is_uppercase());
            if ends_here {
                let piece = text[begin..after].trim();
                if !piece.is_empty() {
                    out.push(piece);
                }
                begin = after;
            }
        }

        // Whatever trails the last terminator is its own sentence.
        let tail = text[begin..].trim();
        if !tail.is_empty() {
            out.push(tail);
        }

        out
    }
}
795
796impl Chunker for SentenceChunker {
797    fn chunk(&self, document: &Document) -> Result<Vec<Chunk>> {
798        if document.content.is_empty() {
799            return Err(Error::EmptyDocument(
800                document.title.clone().unwrap_or_else(|| "untitled".to_string()),
801            ));
802        }
803
804        let sentences = Self::split_sentences(&document.content);
805        let mut chunks = Vec::new();
806        let mut i = 0;
807
808        while i < sentences.len() {
809            let end = (i + self.max_sentences).min(sentences.len());
810            let content = sentences[i..end].join(" ");
811
812            let start_offset = document.content.find(&content).unwrap_or(0);
813            let end_offset = start_offset + content.len();
814
815            let mut chunk = Chunk::new(document.id, content, start_offset, end_offset);
816            chunk.metadata.title = document.title.clone();
817            chunks.push(chunk);
818
819            let step = self.max_sentences.saturating_sub(self.overlap_sentences);
820            i += if step == 0 { 1 } else { step };
821        }
822
823        Ok(chunks)
824    }
825
826    fn estimate_chunks(&self, document: &Document) -> usize {
827        if document.content.is_empty() {
828            return 0;
829        }
830        let sentences = Self::split_sentences(&document.content);
831        let step = self.max_sentences.saturating_sub(self.overlap_sentences);
832        if step == 0 {
833            return sentences.len();
834        }
835        (sentences.len() + step - 1) / step
836    }
837}
838
839#[cfg(test)]
840mod tests;