Skip to main content

pdfvec/chunker/
mod.rs

1//! Text chunking strategies for vectorization pipelines.
2//!
3//! This module provides [`Chunker`] for splitting extracted text into
4//! appropriately-sized segments for embedding and RAG pipelines.
5//!
6//! # Strategies
7//!
8//! - [`ChunkStrategy::Fixed`]: Split at character boundaries with optional overlap
9//! - [`ChunkStrategy::Paragraph`]: Split on paragraph boundaries (double newlines)
10//! - [`ChunkStrategy::Sentence`]: Split on sentence boundaries using Unicode segmentation
11//!
12//! # Example
13//!
14//! ```
15//! use pdfvec::{Chunker, ChunkStrategy};
16//!
17//! let text = "First sentence. Second sentence.\n\nNew paragraph here.";
18//!
19//! // Fixed-size chunks with overlap
20//! let chunker = Chunker::new(ChunkStrategy::Fixed)
21//!     .chunk_size(20)
22//!     .overlap(5);
23//!
24//! for chunk in chunker.chunks(text) {
25//!     println!("{}: {}", chunk.index(), chunk.text());
26//! }
27//! ```
28
29mod fixed;
30mod paragraph;
31mod sentence;
32
33use crate::Chunk;
34
/// Selects how input text is divided into chunks.
///
/// [`ChunkStrategy::Fixed`] is the default variant.
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum ChunkStrategy {
    /// Cut the text into fixed-size pieces, optionally overlapping.
    #[default]
    Fixed,
    /// Cut the text at paragraph boundaries (double newlines).
    Paragraph,
    /// Cut the text at sentence boundaries found via Unicode segmentation.
    Sentence,
}
46
47/// Text chunker with configurable strategy and parameters.
48///
49/// # Example
50///
51/// ```
52/// use pdfvec::{Chunker, ChunkStrategy};
53///
54/// let chunker = Chunker::new(ChunkStrategy::Fixed)
55///     .chunk_size(512)
56///     .overlap(50);
57///
58/// let text = "Your document text here...";
59/// let chunks: Vec<_> = chunker.chunks(text).collect();
60/// ```
61#[derive(Debug, Clone)]
62pub struct Chunker {
63    strategy: ChunkStrategy,
64    chunk_size: usize,
65    overlap: usize,
66    min_chunk_size: usize,
67}
68
69impl Default for Chunker {
70    fn default() -> Self {
71        Self::new(ChunkStrategy::default())
72    }
73}
74
75impl Chunker {
76    /// Creates a new chunker with the specified strategy.
77    ///
78    /// # Example
79    ///
80    /// ```
81    /// use pdfvec::{Chunker, ChunkStrategy};
82    ///
83    /// let chunker = Chunker::new(ChunkStrategy::Paragraph);
84    /// ```
85    #[must_use]
86    pub fn new(strategy: ChunkStrategy) -> Self {
87        Self {
88            strategy,
89            chunk_size: 512,
90            overlap: 0,
91            min_chunk_size: 100,
92        }
93    }
94
95    /// Sets the target chunk size in characters.
96    ///
97    /// For [`ChunkStrategy::Fixed`], this is the exact size (except for the last chunk).
98    /// For other strategies, this is the target size for merging small segments.
99    ///
100    /// Default: 512
101    ///
102    /// # Example
103    ///
104    /// ```
105    /// use pdfvec::{Chunker, ChunkStrategy};
106    ///
107    /// let chunker = Chunker::new(ChunkStrategy::Fixed).chunk_size(1024);
108    /// ```
109    #[must_use]
110    pub fn chunk_size(mut self, size: usize) -> Self {
111        self.chunk_size = size.max(1);
112        self
113    }
114
115    /// Sets the overlap between consecutive chunks in characters.
116    ///
117    /// Overlap helps maintain context across chunk boundaries. Only applies
118    /// to [`ChunkStrategy::Fixed`].
119    ///
120    /// Default: 0
121    ///
122    /// # Example
123    ///
124    /// ```
125    /// use pdfvec::{Chunker, ChunkStrategy};
126    ///
127    /// let chunker = Chunker::new(ChunkStrategy::Fixed)
128    ///     .chunk_size(100)
129    ///     .overlap(20);
130    /// ```
131    #[must_use]
132    pub fn overlap(mut self, chars: usize) -> Self {
133        self.overlap = chars;
134        self
135    }
136
137    /// Sets the minimum chunk size for paragraph and sentence strategies.
138    ///
139    /// Chunks smaller than this will be merged with adjacent chunks.
140    ///
141    /// Default: 100
142    ///
143    /// # Example
144    ///
145    /// ```
146    /// use pdfvec::{Chunker, ChunkStrategy};
147    ///
148    /// let chunker = Chunker::new(ChunkStrategy::Paragraph)
149    ///     .min_chunk_size(50);
150    /// ```
151    #[must_use]
152    pub fn min_chunk_size(mut self, size: usize) -> Self {
153        self.min_chunk_size = size;
154        self
155    }
156
157    /// Returns an iterator over chunks of the input text.
158    ///
159    /// # Example
160    ///
161    /// ```
162    /// use pdfvec::{Chunker, ChunkStrategy};
163    ///
164    /// let text = "Hello world. How are you?";
165    /// let chunker = Chunker::new(ChunkStrategy::Sentence);
166    ///
167    /// for chunk in chunker.chunks(text) {
168    ///     println!("Chunk {}: {}", chunk.index(), chunk.text());
169    /// }
170    /// ```
171    #[must_use]
172    pub fn chunks<'a>(&self, text: &'a str) -> Box<dyn Iterator<Item = Chunk<'a>> + 'a> {
173        match self.strategy {
174            ChunkStrategy::Fixed => Box::new(fixed::FixedChunker::new(
175                text,
176                self.chunk_size,
177                self.overlap,
178            )),
179            ChunkStrategy::Paragraph => Box::new(paragraph::ParagraphChunker::new(
180                text,
181                self.chunk_size,
182                self.min_chunk_size,
183            )),
184            ChunkStrategy::Sentence => Box::new(sentence::SentenceChunker::new(
185                text,
186                self.chunk_size,
187                self.min_chunk_size,
188            )),
189        }
190    }
191
192    /// Returns the configured chunk size.
193    #[must_use]
194    pub fn get_chunk_size(&self) -> usize {
195        self.chunk_size
196    }
197
198    /// Returns the configured overlap.
199    #[must_use]
200    pub fn get_overlap(&self) -> usize {
201        self.overlap
202    }
203
204    /// Returns the configured strategy.
205    #[must_use]
206    pub fn strategy(&self) -> ChunkStrategy {
207        self.strategy
208    }
209}
210
#[cfg(test)]
mod tests {
    use super::*;

    /// `Chunker::default()` uses the fixed strategy and the documented defaults.
    #[test]
    fn chunker_default() {
        let chunker = Chunker::default();
        assert_eq!(chunker.strategy(), ChunkStrategy::Fixed);
        assert_eq!(chunker.get_chunk_size(), 512);
        assert_eq!(chunker.get_overlap(), 0);
        // Read the private field directly: this child module is allowed to,
        // and it keeps the default of 100 under test.
        assert_eq!(chunker.min_chunk_size, 100);
    }

    /// Every builder setter stores the value it was given.
    #[test]
    fn chunker_builder() {
        let chunker = Chunker::new(ChunkStrategy::Paragraph)
            .chunk_size(1024)
            .overlap(100)
            .min_chunk_size(50);

        assert_eq!(chunker.strategy(), ChunkStrategy::Paragraph);
        assert_eq!(chunker.get_chunk_size(), 1024);
        assert_eq!(chunker.get_overlap(), 100);
        // Previously configured but never verified.
        assert_eq!(chunker.min_chunk_size, 50);
    }

    /// A requested chunk size of zero is clamped up to one.
    #[test]
    fn chunk_size_minimum_is_one() {
        let chunker = Chunker::new(ChunkStrategy::Fixed).chunk_size(0);
        assert_eq!(chunker.get_chunk_size(), 1);
    }
}