chunkedrs/lib.rs
1//! # chunkedrs
2//!
3//! AI-native text chunking — split long documents into token-accurate pieces for
4//! embedding and retrieval. Built on [tiktoken](https://crates.io/crates/tiktoken)
5//! for precise token counting.
6//!
//! ## Design: ergonomics first (用就要好用 — "if it's worth using, make it good to use")
8//!
9//! Three strategies, each done right:
10//!
11//! | Strategy | Use case | Speed |
12//! |----------|----------|-------|
13//! | **Recursive** (default) | General text — paragraphs, sentences, words | Fastest |
14//! | **Markdown** | Documents with `#` headers — preserves section metadata | Fast |
15//! | **Semantic** | High-quality RAG — splits at meaning boundaries via embeddings | Slower (API calls) |
16//!
17//! ## Quick start
18//!
19//! ```rust
20//! // split with defaults: recursive, 512 max tokens, no overlap
21//! let chunks = chunkedrs::chunk("your long text here...").split();
22//! for chunk in &chunks {
23//! println!("[{}] {} tokens", chunk.index, chunk.token_count);
24//! }
25//! ```
26//!
27//! ## Token-accurate splitting
28//!
29//! ```rust
30//! let chunks = chunkedrs::chunk("your long text here...")
31//! .max_tokens(256)
32//! .overlap(50)
33//! .model("gpt-4o")
34//! .split();
35//!
36//! // every chunk is guaranteed to have <= 256 tokens
37//! assert!(chunks.iter().all(|c| c.token_count <= 256));
38//! ```
39//!
40//! ## Markdown-aware splitting
41//!
42//! ```rust
43//! let markdown = "# Intro\n\nSome text.\n\n## Details\n\nMore text here.\n";
44//! let chunks = chunkedrs::chunk(markdown).markdown().split();
45//!
46//! // each chunk knows which section it belongs to
47//! assert_eq!(chunks[0].section.as_deref(), Some("# Intro"));
48//! ```
49//!
50//! ## Semantic splitting
51//!
52//! With the `semantic` feature enabled, split at meaning boundaries using embeddings:
53//!
54//! ```rust,ignore
55//! let client = embedrs::openai("sk-...");
56//! let chunks = chunkedrs::chunk("your long text here...")
57//! .semantic(&client)
58//! .split_async()
59//! .await?;
60//! ```
61
62mod chunk;
63mod markdown;
64pub(crate) mod recursive;
65#[cfg(feature = "semantic")]
66mod semantic;
67
68pub use chunk::Chunk;
69
/// Find the byte offset of `sub` within `parent`, assuming `sub` is a
/// subslice borrowed from `parent` (same allocation).
///
/// Uses pointer arithmetic, so this is O(1) and never scans the text.
/// In debug builds, asserts that `sub` lies *entirely* within `parent`'s
/// bounds (start and end); in release builds a misuse saturates to 0
/// instead of wrapping around.
pub(crate) fn byte_offset_of(sub: &str, parent: &str) -> usize {
    let sub_ptr = sub.as_ptr() as usize;
    let parent_ptr = parent.as_ptr() as usize;
    debug_assert!(
        sub_ptr >= parent_ptr && sub_ptr + sub.len() <= parent_ptr + parent.len(),
        "substring pointer is not within parent string bounds"
    );
    sub_ptr.saturating_sub(parent_ptr)
}
80
/// Error types for chunkedrs operations.
///
/// Marked `#[non_exhaustive]` so new variants can be added without a breaking
/// change. Without the `semantic` feature this enum has no variants (it is
/// uninhabited), but the type still exists so the public [`Result`] alias is
/// stable across feature combinations.
#[derive(Debug)]
#[non_exhaustive]
pub enum Error {
    /// Embedding error during semantic chunking.
    #[cfg(feature = "semantic")]
    Embed(embedrs::Error),
}
89
90impl std::fmt::Display for Error {
91 #[allow(unused_variables)]
92 fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
93 match *self {
94 #[cfg(feature = "semantic")]
95 Error::Embed(ref e) => write!(f, "embedding error: {e}"),
96 // without semantic feature, Error is uninhabited but non_exhaustive
97 // keeps the type valid for future expansion
98 #[cfg(not(feature = "semantic"))]
99 _ => unreachable!("Error is uninhabited without semantic feature"),
100 }
101 }
102}
103
// NOTE(review): `source()` could forward to the inner embedding error if
// `embedrs::Error` implements `std::error::Error` — confirm and add if so.
impl std::error::Error for Error {}

/// Result type for chunkedrs operations.
pub type Result<T> = std::result::Result<T, Error>;
108
109/// Create a chunk builder for the given text.
110///
111/// This is the main entry point. Call `.split()` to get chunks with the default
112/// strategy (recursive), or chain builder methods to customize:
113///
114/// ```rust
115/// let chunks = chunkedrs::chunk("hello world").split();
116/// assert_eq!(chunks.len(), 1);
117/// assert_eq!(chunks[0].content, "hello world");
118/// ```
119pub fn chunk(text: &str) -> ChunkBuilder<'_> {
120 ChunkBuilder {
121 text,
122 max_tokens: 512,
123 overlap: 0,
124 model_name: None,
125 encoding_name: None,
126 strategy: Strategy::Recursive,
127 #[cfg(feature = "semantic")]
128 semantic_client: None,
129 #[cfg(feature = "semantic")]
130 semantic_threshold: 0.5,
131 }
132}
133
/// Strategy for splitting text.
///
/// Internal to the builder; selected via `.markdown()` / `.semantic()`,
/// with `Recursive` as the default set by [`chunk()`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Strategy {
    /// General-purpose recursive splitting (default).
    Recursive,
    /// Header-aware splitting for `#`-style markdown documents.
    Markdown,
    /// Embedding-driven splitting at meaning boundaries (async only).
    #[cfg(feature = "semantic")]
    Semantic,
}
142
/// Builder for configuring text chunking.
///
/// Created by [`chunk()`]. Chain methods to configure, then call [`.split()`](ChunkBuilder::split)
/// (sync) or [`.split_async()`](ChunkBuilder::split_async) (semantic).
pub struct ChunkBuilder<'a> {
    /// Input text; borrowed for the builder's lifetime.
    text: &'a str,
    /// Maximum tokens per chunk (default 512; clamped to >= 1 by `max_tokens()`).
    max_tokens: usize,
    /// Overlapping tokens between consecutive chunks (default 0).
    overlap: usize,
    /// Model name used to look up an encoding; `encoding_name` wins if both are set.
    model_name: Option<&'a str>,
    /// Explicit tiktoken encoding name; takes precedence over `model_name`.
    encoding_name: Option<&'a str>,
    /// Which splitting algorithm `split()` / `split_async()` dispatches to.
    strategy: Strategy,
    /// Embedding client required by the semantic strategy.
    #[cfg(feature = "semantic")]
    semantic_client: Option<&'a embedrs::Client>,
    /// Cosine-similarity cutoff for semantic boundaries (default 0.5).
    #[cfg(feature = "semantic")]
    semantic_threshold: f64,
}
159
160impl<'a> ChunkBuilder<'a> {
161 /// Set the maximum number of tokens per chunk. Default: 512.
162 ///
163 /// ```rust
164 /// let chunks = chunkedrs::chunk("hello world").max_tokens(256).split();
165 /// assert!(chunks.iter().all(|c| c.token_count <= 256));
166 /// ```
167 pub fn max_tokens(mut self, n: usize) -> Self {
168 self.max_tokens = n.max(1);
169 self
170 }
171
172 /// Set the number of overlapping tokens between consecutive chunks. Default: 0.
173 ///
174 /// Overlap ensures context is not lost at chunk boundaries — important for
175 /// retrieval quality in RAG pipelines.
176 ///
177 /// ```rust
178 /// let chunks = chunkedrs::chunk("hello world").overlap(50).split();
179 /// ```
180 pub fn overlap(mut self, tokens: usize) -> Self {
181 self.overlap = tokens;
182 self
183 }
184
185 /// Set the model name to auto-select the correct tokenizer encoding.
186 ///
187 /// Uses [`tiktoken::encoding_for_model`] to find the right encoding.
188 /// Default: `o200k_base` (GPT-4o, GPT-4-turbo).
189 ///
190 /// This is independent of [`.encoding()`](ChunkBuilder::encoding). If both are
191 /// set, `encoding` takes precedence.
192 ///
193 /// If the model name is not recognized, falls back to `o200k_base` silently.
194 ///
195 /// ```rust
196 /// let chunks = chunkedrs::chunk("hello world").model("gpt-4o").split();
197 /// ```
198 pub fn model(mut self, model: &'a str) -> Self {
199 self.model_name = Some(model);
200 self
201 }
202
203 /// Set the tiktoken encoding name directly.
204 ///
205 /// Use this when you know the exact encoding (e.g. `"cl100k_base"`, `"o200k_base"`).
206 /// Takes precedence over [`.model()`](ChunkBuilder::model) if both are set.
207 ///
208 /// If the encoding name is not recognized, falls back to `o200k_base` silently.
209 ///
210 /// ```rust
211 /// let chunks = chunkedrs::chunk("hello world").encoding("cl100k_base").split();
212 /// ```
213 pub fn encoding(mut self, encoding: &'a str) -> Self {
214 self.encoding_name = Some(encoding);
215 self
216 }
217
218 /// Use markdown-aware splitting.
219 ///
220 /// Splits at `#` header boundaries first, then applies recursive splitting
221 /// within each section. Each chunk's [`Chunk::section`] field contains the
222 /// header it belongs to.
223 ///
224 /// Note: header lines themselves are stored in `section` metadata, not in
225 /// chunk `content`. This means joining all chunk contents will not reproduce
226 /// the header lines from the original document.
227 ///
228 /// ```rust
229 /// let md = "# Title\n\nContent here.\n";
230 /// let chunks = chunkedrs::chunk(md).markdown().split();
231 /// assert_eq!(chunks[0].section.as_deref(), Some("# Title"));
232 /// ```
233 pub fn markdown(mut self) -> Self {
234 self.strategy = Strategy::Markdown;
235 self
236 }
237
238 /// Use semantic splitting with an embedding client.
239 ///
240 /// Splits at meaning boundaries by computing cosine similarity between
241 /// consecutive sentence embeddings. When similarity drops below the threshold,
242 /// a new chunk begins.
243 ///
244 /// Requires the `semantic` feature and an [`embedrs::Client`].
245 /// Must use [`.split_async()`](ChunkBuilder::split_async) instead of `.split()`.
246 ///
247 /// ```rust,ignore
248 /// let client = embedrs::openai("sk-...");
249 /// let chunks = chunkedrs::chunk(text)
250 /// .semantic(&client)
251 /// .split_async()
252 /// .await?;
253 /// ```
254 #[cfg(feature = "semantic")]
255 pub fn semantic(mut self, client: &'a embedrs::Client) -> Self {
256 self.strategy = Strategy::Semantic;
257 self.semantic_client = Some(client);
258 self
259 }
260
261 /// Set the similarity threshold for semantic splitting. Default: 0.5.
262 ///
263 /// Lower values create fewer, larger chunks. Higher values create more, smaller chunks.
264 /// Only effective when using [`.semantic()`](ChunkBuilder::semantic).
265 #[cfg(feature = "semantic")]
266 pub fn threshold(mut self, t: f64) -> Self {
267 self.semantic_threshold = t;
268 self
269 }
270
271 /// Split the text synchronously. Works with recursive and markdown strategies.
272 ///
273 /// Panics if called with the semantic strategy — use
274 /// [`.split_async()`](ChunkBuilder::split_async) instead.
275 ///
276 /// ```rust
277 /// let chunks = chunkedrs::chunk("hello world").split();
278 /// assert_eq!(chunks[0].content, "hello world");
279 /// ```
280 pub fn split(self) -> Vec<Chunk> {
281 let encoder = self.resolve_encoder();
282 match self.strategy {
283 Strategy::Recursive => recursive::split_recursive(
284 self.text,
285 0,
286 self.max_tokens,
287 self.overlap,
288 encoder,
289 &None,
290 ),
291 Strategy::Markdown => {
292 markdown::split_markdown(self.text, self.max_tokens, self.overlap, encoder)
293 }
294 #[cfg(feature = "semantic")]
295 Strategy::Semantic => {
296 panic!(
297 "semantic strategy requires async: use .split_async().await instead of .split()"
298 )
299 }
300 }
301 }
302
303 /// Split the text asynchronously. Required for semantic splitting.
304 ///
305 /// ```rust,ignore
306 /// let chunks = chunkedrs::chunk(text)
307 /// .semantic(&client)
308 /// .split_async()
309 /// .await?;
310 /// ```
311 #[cfg(feature = "semantic")]
312 pub async fn split_async(self) -> Result<Vec<Chunk>> {
313 let encoder = self.resolve_encoder();
314 match self.strategy {
315 Strategy::Semantic => {
316 let client = self
317 .semantic_client
318 .expect("semantic() must be called before split_async()");
319 semantic::split_semantic(
320 self.text,
321 self.max_tokens,
322 self.overlap,
323 encoder,
324 client,
325 self.semantic_threshold,
326 )
327 .await
328 }
329 _ => Ok(self.split()),
330 }
331 }
332
333 fn resolve_encoder(&self) -> &'static tiktoken::CoreBpe {
334 let default = || tiktoken::get_encoding("o200k_base").expect("o200k_base encoding");
335
336 // encoding name takes precedence over model name
337 if let Some(name) = self.encoding_name {
338 return tiktoken::get_encoding(name).unwrap_or_else(default);
339 }
340
341 // try model name
342 if let Some(model) = self.model_name {
343 return tiktoken::encoding_for_model(model)
344 .or_else(|| tiktoken::get_encoding(model))
345 .unwrap_or_else(default);
346 }
347
348 default()
349 }
350}
351
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn chunk_short_text() {
        let chunks = chunk("hello world").split();
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, "hello world");
        assert_eq!(chunks[0].index, 0);
        assert_eq!(chunks[0].start_byte, 0);
        assert_eq!(chunks[0].end_byte, 11);
        assert!(chunks[0].token_count > 0);
    }

    #[test]
    fn chunk_empty_text() {
        let chunks = chunk("").split();
        assert!(chunks.is_empty());
    }

    #[test]
    fn chunk_respects_max_tokens() {
        let text = "The quick brown fox. ".repeat(100);
        let chunks = chunk(&text).max_tokens(20).split();
        for c in &chunks {
            assert!(
                c.token_count <= 20,
                "chunk {} has {} tokens",
                c.index,
                c.token_count
            );
        }
    }

    #[test]
    fn chunk_with_overlap() {
        let text = "Sentence one. Sentence two. Sentence three. Sentence four. Sentence five. Sentence six.";
        let chunks = chunk(text).max_tokens(10).overlap(3).split();
        assert!(chunks.len() >= 2);
        // overlap must not break the documented max-token invariant
        for c in &chunks {
            assert!(c.token_count <= 10);
        }
    }

    #[test]
    fn chunk_max_tokens_minimum_one() {
        let chunks = chunk("hello").max_tokens(0).split();
        // max_tokens(0) becomes 1
        assert!(!chunks.is_empty());
    }

    #[test]
    fn chunk_with_model() {
        let chunks = chunk("hello world").model("gpt-4o").split();
        assert_eq!(chunks.len(), 1);
    }

    #[test]
    fn chunk_with_encoding() {
        let chunks = chunk("hello world").encoding("cl100k_base").split();
        assert_eq!(chunks.len(), 1);
    }

    #[test]
    fn chunk_markdown_mode() {
        let md = "# Title\n\nSome content.\n\n## Section\n\nMore content.\n";
        let chunks = chunk(md).markdown().split();
        assert!(chunks.len() >= 2);
        assert_eq!(chunks[0].section.as_deref(), Some("# Title"));
    }

    #[test]
    fn chunk_sequential_indices() {
        let text = "Word. ".repeat(200);
        let chunks = chunk(&text).max_tokens(10).split();
        for (i, c) in chunks.iter().enumerate() {
            assert_eq!(c.index, i);
        }
    }

    #[test]
    fn chunk_chinese_text() {
        let text = "这是一段中文文本。它包含多个句子。每个句子都应该被正确分割。更多的内容在这里。还有更多。最后一句话。";
        let chunks = chunk(text).max_tokens(10).split();
        assert!(chunks.len() >= 2);
        for c in &chunks {
            assert!(c.token_count <= 10);
        }
    }

    #[test]
    fn chunk_japanese_text() {
        let text =
            "これは日本語のテキストです。複数の文が含まれています。正しく分割されるべきです。";
        let chunks = chunk(text).max_tokens(10).split();
        // idiomatic form of `len() >= 1` (clippy::len_zero)
        assert!(!chunks.is_empty());
        for c in &chunks {
            assert!(c.token_count <= 10);
        }
    }

    #[test]
    fn chunk_preserves_all_content() {
        let text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph.";
        let chunks = chunk(text).max_tokens(5).split();
        let combined: String = chunks
            .iter()
            .map(|c| c.content.as_str())
            .collect::<Vec<_>>()
            .join("");
        assert!(combined.contains("First"));
        assert!(combined.contains("Second"));
        assert!(combined.contains("Third"));
    }

    #[test]
    fn chunk_large_document() {
        let text = "Lorem ipsum dolor sit amet. ".repeat(1000);
        let chunks = chunk(&text).max_tokens(100).split();
        assert!(chunks.len() >= 10);
        for c in &chunks {
            assert!(c.token_count <= 100);
        }
    }

    #[test]
    fn chunk_single_token_max() {
        let chunks = chunk("hello world foo bar").max_tokens(1).split();
        assert!(chunks.len() >= 4);
        for c in &chunks {
            assert!(c.token_count <= 1);
        }
    }

    #[test]
    fn resolve_encoder_unknown_falls_back() {
        let builder = chunk("test").model("nonexistent-model-xyz");
        let enc = builder.resolve_encoder();
        assert!(enc.count("hello") > 0);
    }

    #[test]
    fn model_and_encoding_are_independent() {
        // encoding takes precedence over model
        // gpt-4o uses o200k_base, but we explicitly set cl100k_base
        let enc_cl100k = chunk("test")
            .model("gpt-4o")
            .encoding("cl100k_base")
            .resolve_encoder();
        let enc_o200k = chunk("test").model("gpt-4o").resolve_encoder();

        // verify they are different encoders by checking that at least one of
        // several test strings produces different token counts
        let test_texts = [
            "hello_world_123_test",
            "foo::bar::baz::qux",
            "αβγδεζηθ",
            "1234567890",
        ];
        let any_different = test_texts
            .iter()
            .any(|t| enc_cl100k.count(t) != enc_o200k.count(t));
        assert!(
            any_different,
            "cl100k_base and o200k_base should produce different token counts for at least one test string"
        );
    }

    #[test]
    fn encoding_only_without_model() {
        let builder = chunk("test").encoding("cl100k_base");
        let enc = builder.resolve_encoder();
        assert!(enc.count("hello") > 0);
    }

    #[test]
    fn model_only_without_encoding() {
        let builder = chunk("test").model("gpt-4o");
        let enc = builder.resolve_encoder();
        assert!(enc.count("hello") > 0);
    }
}
531}