pdfvec/chunker/mod.rs
1//! Text chunking strategies for vectorization pipelines.
2//!
3//! This module provides [`Chunker`] for splitting extracted text into
4//! appropriately-sized segments for embedding and RAG pipelines.
5//!
6//! # Strategies
7//!
8//! - [`ChunkStrategy::Fixed`]: Split at character boundaries with optional overlap
9//! - [`ChunkStrategy::Paragraph`]: Split on paragraph boundaries (double newlines)
10//! - [`ChunkStrategy::Sentence`]: Split on sentence boundaries using Unicode segmentation
11//!
12//! # Example
13//!
14//! ```
15//! use pdfvec::{Chunker, ChunkStrategy};
16//!
17//! let text = "First sentence. Second sentence.\n\nNew paragraph here.";
18//!
19//! // Fixed-size chunks with overlap
20//! let chunker = Chunker::new(ChunkStrategy::Fixed)
21//! .chunk_size(20)
22//! .overlap(5);
23//!
24//! for chunk in chunker.chunks(text) {
25//! println!("{}: {}", chunk.index(), chunk.text());
26//! }
27//! ```
28
29mod fixed;
30mod paragraph;
31mod sentence;
32
33use crate::Chunk;
34
/// Selects how input text is divided into chunks.
///
/// The default strategy is [`ChunkStrategy::Fixed`].
#[derive(Clone, Copy, Debug, Default, PartialEq, Eq)]
pub enum ChunkStrategy {
    /// Chunks of a fixed size, optionally overlapping their neighbours.
    #[default]
    Fixed,
    /// Chunks delimited by paragraph boundaries (double newlines).
    Paragraph,
    /// Chunks delimited by sentence boundaries (Unicode segmentation).
    Sentence,
}
46
47/// Text chunker with configurable strategy and parameters.
48///
49/// # Example
50///
51/// ```
52/// use pdfvec::{Chunker, ChunkStrategy};
53///
54/// let chunker = Chunker::new(ChunkStrategy::Fixed)
55/// .chunk_size(512)
56/// .overlap(50);
57///
58/// let text = "Your document text here...";
59/// let chunks: Vec<_> = chunker.chunks(text).collect();
60/// ```
61#[derive(Debug, Clone)]
62pub struct Chunker {
63 strategy: ChunkStrategy,
64 chunk_size: usize,
65 overlap: usize,
66 min_chunk_size: usize,
67}
68
69impl Default for Chunker {
70 fn default() -> Self {
71 Self::new(ChunkStrategy::default())
72 }
73}
74
75impl Chunker {
76 /// Creates a new chunker with the specified strategy.
77 ///
78 /// # Example
79 ///
80 /// ```
81 /// use pdfvec::{Chunker, ChunkStrategy};
82 ///
83 /// let chunker = Chunker::new(ChunkStrategy::Paragraph);
84 /// ```
85 #[must_use]
86 pub fn new(strategy: ChunkStrategy) -> Self {
87 Self {
88 strategy,
89 chunk_size: 512,
90 overlap: 0,
91 min_chunk_size: 100,
92 }
93 }
94
95 /// Sets the target chunk size in characters.
96 ///
97 /// For [`ChunkStrategy::Fixed`], this is the exact size (except for the last chunk).
98 /// For other strategies, this is the target size for merging small segments.
99 ///
100 /// Default: 512
101 ///
102 /// # Example
103 ///
104 /// ```
105 /// use pdfvec::{Chunker, ChunkStrategy};
106 ///
107 /// let chunker = Chunker::new(ChunkStrategy::Fixed).chunk_size(1024);
108 /// ```
109 #[must_use]
110 pub fn chunk_size(mut self, size: usize) -> Self {
111 self.chunk_size = size.max(1);
112 self
113 }
114
115 /// Sets the overlap between consecutive chunks in characters.
116 ///
117 /// Overlap helps maintain context across chunk boundaries. Only applies
118 /// to [`ChunkStrategy::Fixed`].
119 ///
120 /// Default: 0
121 ///
122 /// # Example
123 ///
124 /// ```
125 /// use pdfvec::{Chunker, ChunkStrategy};
126 ///
127 /// let chunker = Chunker::new(ChunkStrategy::Fixed)
128 /// .chunk_size(100)
129 /// .overlap(20);
130 /// ```
131 #[must_use]
132 pub fn overlap(mut self, chars: usize) -> Self {
133 self.overlap = chars;
134 self
135 }
136
137 /// Sets the minimum chunk size for paragraph and sentence strategies.
138 ///
139 /// Chunks smaller than this will be merged with adjacent chunks.
140 ///
141 /// Default: 100
142 ///
143 /// # Example
144 ///
145 /// ```
146 /// use pdfvec::{Chunker, ChunkStrategy};
147 ///
148 /// let chunker = Chunker::new(ChunkStrategy::Paragraph)
149 /// .min_chunk_size(50);
150 /// ```
151 #[must_use]
152 pub fn min_chunk_size(mut self, size: usize) -> Self {
153 self.min_chunk_size = size;
154 self
155 }
156
157 /// Returns an iterator over chunks of the input text.
158 ///
159 /// # Example
160 ///
161 /// ```
162 /// use pdfvec::{Chunker, ChunkStrategy};
163 ///
164 /// let text = "Hello world. How are you?";
165 /// let chunker = Chunker::new(ChunkStrategy::Sentence);
166 ///
167 /// for chunk in chunker.chunks(text) {
168 /// println!("Chunk {}: {}", chunk.index(), chunk.text());
169 /// }
170 /// ```
171 #[must_use]
172 pub fn chunks<'a>(&self, text: &'a str) -> Box<dyn Iterator<Item = Chunk<'a>> + 'a> {
173 match self.strategy {
174 ChunkStrategy::Fixed => Box::new(fixed::FixedChunker::new(
175 text,
176 self.chunk_size,
177 self.overlap,
178 )),
179 ChunkStrategy::Paragraph => Box::new(paragraph::ParagraphChunker::new(
180 text,
181 self.chunk_size,
182 self.min_chunk_size,
183 )),
184 ChunkStrategy::Sentence => Box::new(sentence::SentenceChunker::new(
185 text,
186 self.chunk_size,
187 self.min_chunk_size,
188 )),
189 }
190 }
191
192 /// Returns the configured chunk size.
193 #[must_use]
194 pub fn get_chunk_size(&self) -> usize {
195 self.chunk_size
196 }
197
198 /// Returns the configured overlap.
199 #[must_use]
200 pub fn get_overlap(&self) -> usize {
201 self.overlap
202 }
203
204 /// Returns the configured strategy.
205 #[must_use]
206 pub fn strategy(&self) -> ChunkStrategy {
207 self.strategy
208 }
209}
210
#[cfg(test)]
mod tests {
    use super::*;

    /// `Chunker::default()` uses the fixed strategy and the default parameters.
    #[test]
    fn chunker_default() {
        let chunker = Chunker::default();
        assert_eq!(chunker.strategy(), ChunkStrategy::Fixed);
        assert_eq!(chunker.get_chunk_size(), 512);
        assert_eq!(chunker.get_overlap(), 0);
        // Private fields are visible to this child module, so the default
        // minimum size can be checked directly.
        assert_eq!(chunker.min_chunk_size, 100);
    }

    /// Every builder method stores the value it was given.
    #[test]
    fn chunker_builder() {
        let chunker = Chunker::new(ChunkStrategy::Paragraph)
            .chunk_size(1024)
            .overlap(100)
            .min_chunk_size(50);

        assert_eq!(chunker.strategy(), ChunkStrategy::Paragraph);
        assert_eq!(chunker.get_chunk_size(), 1024);
        assert_eq!(chunker.get_overlap(), 100);
        // The builder call above sets this; verify it actually took effect.
        assert_eq!(chunker.min_chunk_size, 50);
    }

    /// A requested chunk size of zero is raised to one.
    #[test]
    fn chunk_size_minimum_is_one() {
        let chunker = Chunker::new(ChunkStrategy::Fixed).chunk_size(0);
        assert_eq!(chunker.get_chunk_size(), 1);
    }
}