Skip to main content

chunkedrs/
lib.rs

//! # chunkedrs
//!
//! AI-native text chunking — split long documents into token-accurate pieces for
//! embedding and retrieval. Built on [tiktoken](https://crates.io/crates/tiktoken)
//! for precise token counting.
//!
//! ## Design: 用就要好用 ("if it's worth using, it should be good to use")
//!
//! Three strategies, each done right:
//!
//! | Strategy | Use case | Speed |
//! |----------|----------|-------|
//! | **Recursive** (default) | General text — paragraphs, sentences, words | Fastest |
//! | **Markdown** | Documents with `#` headers — preserves section metadata | Fast |
//! | **Semantic** | High-quality RAG — splits at meaning boundaries via embeddings | Slower (API calls) |
//!
//! ## Quick start
//!
//! ```rust
//! // split with defaults: recursive, 512 max tokens, no overlap
//! let chunks = chunkedrs::chunk("your long text here...").split();
//! for chunk in &chunks {
//!     println!("[{}] {} tokens", chunk.index, chunk.token_count);
//! }
//! ```
//!
//! ## Token-accurate splitting
//!
//! ```rust
//! let chunks = chunkedrs::chunk("your long text here...")
//!     .max_tokens(256)
//!     .overlap(50)
//!     .model("gpt-4o")
//!     .split();
//!
//! // every chunk is guaranteed to have <= 256 tokens
//! assert!(chunks.iter().all(|c| c.token_count <= 256));
//! ```
//!
//! ## Markdown-aware splitting
//!
//! ```rust
//! let markdown = "# Intro\n\nSome text.\n\n## Details\n\nMore text here.\n";
//! let chunks = chunkedrs::chunk(markdown).markdown().split();
//!
//! // each chunk knows which section it belongs to
//! assert_eq!(chunks[0].section.as_deref(), Some("# Intro"));
//! ```
//!
//! ## Semantic splitting
//!
//! With the `semantic` feature enabled, split at meaning boundaries using embeddings:
//!
//! ```rust,ignore
//! let client = embedrs::openai("sk-...");
//! let chunks = chunkedrs::chunk("your long text here...")
//!     .semantic(&client)
//!     .split_async()
//!     .await?;
//! ```

62mod chunk;
63mod markdown;
64pub(crate) mod recursive;
65#[cfg(feature = "semantic")]
66mod semantic;
67
68pub use chunk::Chunk;
69
/// Compute the byte offset of `sub` within `parent` using pointer arithmetic.
///
/// `sub` must be a slice borrowed from `parent`; this is checked with a
/// `debug_assert!` in debug builds. In release builds an out-of-bounds
/// pointer saturates to 0 rather than wrapping.
pub(crate) fn byte_offset_of(sub: &str, parent: &str) -> usize {
    let start = parent.as_ptr() as usize;
    let pos = sub.as_ptr() as usize;
    debug_assert!(
        pos >= start && pos <= start + parent.len(),
        "substring pointer is not within parent string bounds"
    );
    pos.saturating_sub(start)
}
80
/// Error types for chunkedrs operations.
///
/// Marked `#[non_exhaustive]` so new variants can be added without a breaking
/// change. Without the `semantic` feature the enum has no variants (it is
/// uninhabited), but the type still exists so the crate's [`Result`] alias
/// remains valid.
#[derive(Debug)]
#[non_exhaustive]
pub enum Error {
    /// Embedding error during semantic chunking.
    #[cfg(feature = "semantic")]
    Embed(embedrs::Error),
}
89
90impl std::fmt::Display for Error {
91    #[allow(unused_variables)]
92    fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
93        match *self {
94            #[cfg(feature = "semantic")]
95            Error::Embed(ref e) => write!(f, "embedding error: {e}"),
96            // without semantic feature, Error is uninhabited but non_exhaustive
97            // keeps the type valid for future expansion
98            #[cfg(not(feature = "semantic"))]
99            _ => unreachable!("Error is uninhabited without semantic feature"),
100        }
101    }
102}
103
// No `source()` override: `Error` carries no nested cause beyond what
// `Display` already reports, so the default implementation is correct.
impl std::error::Error for Error {}

/// Result type for chunkedrs operations.
pub type Result<T> = std::result::Result<T, Error>;
108
109/// Create a chunk builder for the given text.
110///
111/// This is the main entry point. Call `.split()` to get chunks with the default
112/// strategy (recursive), or chain builder methods to customize:
113///
114/// ```rust
115/// let chunks = chunkedrs::chunk("hello world").split();
116/// assert_eq!(chunks.len(), 1);
117/// assert_eq!(chunks[0].content, "hello world");
118/// ```
119pub fn chunk(text: &str) -> ChunkBuilder<'_> {
120    ChunkBuilder {
121        text,
122        max_tokens: 512,
123        overlap: 0,
124        model_name: None,
125        encoding_name: None,
126        strategy: Strategy::Recursive,
127        #[cfg(feature = "semantic")]
128        semantic_client: None,
129        #[cfg(feature = "semantic")]
130        semantic_threshold: 0.5,
131    }
132}
133
/// Strategy for splitting text.
///
/// Internal selector set by the builder methods; `Recursive` is the default
/// chosen by [`chunk()`].
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
enum Strategy {
    /// Paragraph → sentence → word recursive splitting (default).
    Recursive,
    /// `#`-header-aware splitting that records section metadata.
    Markdown,
    /// Embedding-based splitting at meaning boundaries (async only).
    #[cfg(feature = "semantic")]
    Semantic,
}
142
/// Builder for configuring text chunking.
///
/// Created by [`chunk()`]. Chain methods to configure, then call [`.split()`](ChunkBuilder::split)
/// (sync) or [`.split_async()`](ChunkBuilder::split_async) (semantic).
pub struct ChunkBuilder<'a> {
    // Input text, borrowed for the builder's lifetime.
    text: &'a str,
    // Upper bound on tokens per chunk; default 512, clamped to >= 1 by max_tokens().
    max_tokens: usize,
    // Tokens shared between consecutive chunks; default 0.
    overlap: usize,
    // Model name used to look up an encoding; encoding_name wins if both are set.
    model_name: Option<&'a str>,
    // Explicit tiktoken encoding name; takes precedence over model_name.
    encoding_name: Option<&'a str>,
    // Which splitting algorithm split()/split_async() dispatches to.
    strategy: Strategy,
    // Embedding client for semantic splitting; set by semantic().
    #[cfg(feature = "semantic")]
    semantic_client: Option<&'a embedrs::Client>,
    // Cosine-similarity boundary threshold for semantic splitting; default 0.5.
    #[cfg(feature = "semantic")]
    semantic_threshold: f64,
}
159
160impl<'a> ChunkBuilder<'a> {
161    /// Set the maximum number of tokens per chunk. Default: 512.
162    ///
163    /// ```rust
164    /// let chunks = chunkedrs::chunk("hello world").max_tokens(256).split();
165    /// assert!(chunks.iter().all(|c| c.token_count <= 256));
166    /// ```
167    pub fn max_tokens(mut self, n: usize) -> Self {
168        self.max_tokens = n.max(1);
169        self
170    }
171
    /// Set the number of overlapping tokens between consecutive chunks. Default: 0.
    ///
    /// Overlap ensures context is not lost at chunk boundaries — important for
    /// retrieval quality in RAG pipelines.
    ///
    /// NOTE(review): the value is stored unclamped; confirm the splitters
    /// tolerate `overlap >= max_tokens` without looping.
    ///
    /// ```rust
    /// let chunks = chunkedrs::chunk("hello world").overlap(50).split();
    /// ```
    pub fn overlap(mut self, tokens: usize) -> Self {
        self.overlap = tokens;
        self
    }
184
    /// Set the model name to auto-select the correct tokenizer encoding.
    ///
    /// Uses [`tiktoken::encoding_for_model`] to find the right encoding.
    /// Default: `o200k_base` (GPT-4o, GPT-4-turbo).
    ///
    /// This is independent of [`.encoding()`](ChunkBuilder::encoding). If both are
    /// set, `encoding` takes precedence (see `resolve_encoder`).
    ///
    /// If the model name is not recognized, falls back to `o200k_base` silently.
    ///
    /// ```rust
    /// let chunks = chunkedrs::chunk("hello world").model("gpt-4o").split();
    /// ```
    pub fn model(mut self, model: &'a str) -> Self {
        self.model_name = Some(model);
        self
    }
202
    /// Set the tiktoken encoding name directly.
    ///
    /// Use this when you know the exact encoding (e.g. `"cl100k_base"`, `"o200k_base"`).
    /// Takes precedence over [`.model()`](ChunkBuilder::model) if both are set.
    ///
    /// If the encoding name is not recognized, falls back to `o200k_base` silently
    /// (the model name is NOT consulted as a second choice in that case).
    ///
    /// ```rust
    /// let chunks = chunkedrs::chunk("hello world").encoding("cl100k_base").split();
    /// ```
    pub fn encoding(mut self, encoding: &'a str) -> Self {
        self.encoding_name = Some(encoding);
        self
    }
217
    /// Use markdown-aware splitting.
    ///
    /// Splits at `#` header boundaries first, then applies recursive splitting
    /// within each section. Each chunk's [`Chunk::section`] field contains the
    /// header it belongs to.
    ///
    /// Note: header lines themselves are stored in `section` metadata, not in
    /// chunk `content`. This means joining all chunk contents will not reproduce
    /// the header lines from the original document.
    ///
    /// ```rust
    /// let md = "# Title\n\nContent here.\n";
    /// let chunks = chunkedrs::chunk(md).markdown().split();
    /// assert_eq!(chunks[0].section.as_deref(), Some("# Title"));
    /// ```
    pub fn markdown(mut self) -> Self {
        self.strategy = Strategy::Markdown;
        self
    }
237
    /// Use semantic splitting with an embedding client.
    ///
    /// Splits at meaning boundaries by computing cosine similarity between
    /// consecutive sentence embeddings. When similarity drops below the threshold,
    /// a new chunk begins.
    ///
    /// Requires the `semantic` feature and an [`embedrs::Client`].
    /// Must use [`.split_async()`](ChunkBuilder::split_async) instead of `.split()`
    /// (calling `.split()` with this strategy panics).
    ///
    /// ```rust,ignore
    /// let client = embedrs::openai("sk-...");
    /// let chunks = chunkedrs::chunk(text)
    ///     .semantic(&client)
    ///     .split_async()
    ///     .await?;
    /// ```
    #[cfg(feature = "semantic")]
    pub fn semantic(mut self, client: &'a embedrs::Client) -> Self {
        self.strategy = Strategy::Semantic;
        self.semantic_client = Some(client);
        self
    }
260
    /// Set the similarity threshold for semantic splitting. Default: 0.5.
    ///
    /// Lower values create fewer, larger chunks. Higher values create more, smaller chunks.
    /// Only effective when using [`.semantic()`](ChunkBuilder::semantic); stored
    /// unvalidated and passed straight to the semantic splitter.
    #[cfg(feature = "semantic")]
    pub fn threshold(mut self, t: f64) -> Self {
        self.semantic_threshold = t;
        self
    }
270
271    /// Split the text synchronously. Works with recursive and markdown strategies.
272    ///
273    /// Panics if called with the semantic strategy — use
274    /// [`.split_async()`](ChunkBuilder::split_async) instead.
275    ///
276    /// ```rust
277    /// let chunks = chunkedrs::chunk("hello world").split();
278    /// assert_eq!(chunks[0].content, "hello world");
279    /// ```
280    pub fn split(self) -> Vec<Chunk> {
281        let encoder = self.resolve_encoder();
282        match self.strategy {
283            Strategy::Recursive => recursive::split_recursive(
284                self.text,
285                0,
286                self.max_tokens,
287                self.overlap,
288                encoder,
289                &None,
290            ),
291            Strategy::Markdown => {
292                markdown::split_markdown(self.text, self.max_tokens, self.overlap, encoder)
293            }
294            #[cfg(feature = "semantic")]
295            Strategy::Semantic => {
296                panic!(
297                    "semantic strategy requires async: use .split_async().await instead of .split()"
298                )
299            }
300        }
301    }
302
303    /// Split the text asynchronously. Required for semantic splitting.
304    ///
305    /// ```rust,ignore
306    /// let chunks = chunkedrs::chunk(text)
307    ///     .semantic(&client)
308    ///     .split_async()
309    ///     .await?;
310    /// ```
311    #[cfg(feature = "semantic")]
312    pub async fn split_async(self) -> Result<Vec<Chunk>> {
313        let encoder = self.resolve_encoder();
314        match self.strategy {
315            Strategy::Semantic => {
316                let client = self
317                    .semantic_client
318                    .expect("semantic() must be called before split_async()");
319                semantic::split_semantic(
320                    self.text,
321                    self.max_tokens,
322                    self.overlap,
323                    encoder,
324                    client,
325                    self.semantic_threshold,
326                )
327                .await
328            }
329            _ => Ok(self.split()),
330        }
331    }
332
333    fn resolve_encoder(&self) -> &'static tiktoken::CoreBpe {
334        let default = || tiktoken::get_encoding("o200k_base").expect("o200k_base encoding");
335
336        // encoding name takes precedence over model name
337        if let Some(name) = self.encoding_name {
338            return tiktoken::get_encoding(name).unwrap_or_else(default);
339        }
340
341        // try model name
342        if let Some(model) = self.model_name {
343            return tiktoken::encoding_for_model(model)
344                .or_else(|| tiktoken::get_encoding(model))
345                .unwrap_or_else(default);
346        }
347
348        default()
349    }
350}
351
#[cfg(test)]
mod tests {
    use super::*;

    // A short input fits in one chunk spanning the entire string.
    #[test]
    fn chunk_short_text() {
        let chunks = chunk("hello world").split();
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0].content, "hello world");
        assert_eq!(chunks[0].index, 0);
        assert_eq!(chunks[0].start_byte, 0);
        assert_eq!(chunks[0].end_byte, 11);
        assert!(chunks[0].token_count > 0);
    }

    // Empty input produces no chunks rather than one empty chunk.
    #[test]
    fn chunk_empty_text() {
        let chunks = chunk("").split();
        assert!(chunks.is_empty());
    }

    // The max_tokens limit is a hard bound on every chunk.
    #[test]
    fn chunk_respects_max_tokens() {
        let text = "The quick brown fox. ".repeat(100);
        let chunks = chunk(&text).max_tokens(20).split();
        for c in &chunks {
            assert!(
                c.token_count <= 20,
                "chunk {} has {} tokens",
                c.index,
                c.token_count
            );
        }
    }

    // Overlap does not prevent splitting into multiple chunks.
    #[test]
    fn chunk_with_overlap() {
        let text = "Sentence one. Sentence two. Sentence three. Sentence four. Sentence five. Sentence six.";
        let chunks = chunk(text).max_tokens(10).overlap(3).split();
        assert!(chunks.len() >= 2);
    }

    // max_tokens(0) is clamped to 1 rather than producing no output.
    #[test]
    fn chunk_max_tokens_minimum_one() {
        let chunks = chunk("hello").max_tokens(0).split();
        // max_tokens(0) becomes 1
        assert!(!chunks.is_empty());
    }

    // Model-name-based encoder selection still chunks correctly.
    #[test]
    fn chunk_with_model() {
        let chunks = chunk("hello world").model("gpt-4o").split();
        assert_eq!(chunks.len(), 1);
    }

    // Explicit encoding-name selection still chunks correctly.
    #[test]
    fn chunk_with_encoding() {
        let chunks = chunk("hello world").encoding("cl100k_base").split();
        assert_eq!(chunks.len(), 1);
    }

    // Markdown mode splits per header and records the section on each chunk.
    #[test]
    fn chunk_markdown_mode() {
        let md = "# Title\n\nSome content.\n\n## Section\n\nMore content.\n";
        let chunks = chunk(md).markdown().split();
        assert!(chunks.len() >= 2);
        assert_eq!(chunks[0].section.as_deref(), Some("# Title"));
    }

    // Chunk indices are 0-based and sequential.
    #[test]
    fn chunk_sequential_indices() {
        let text = "Word. ".repeat(200);
        let chunks = chunk(&text).max_tokens(10).split();
        for (i, c) in chunks.iter().enumerate() {
            assert_eq!(c.index, i);
        }
    }

    // Multi-byte (CJK) text must split on token count without byte-boundary panics.
    #[test]
    fn chunk_chinese_text() {
        let text = "这是一段中文文本。它包含多个句子。每个句子都应该被正确分割。更多的内容在这里。还有更多。最后一句话。";
        let chunks = chunk(text).max_tokens(10).split();
        assert!(chunks.len() >= 2);
        for c in &chunks {
            assert!(c.token_count <= 10);
        }
    }

    // Same guarantee for Japanese text.
    #[test]
    fn chunk_japanese_text() {
        let text =
            "これは日本語のテキストです。複数の文が含まれています。正しく分割されるべきです。";
        let chunks = chunk(text).max_tokens(10).split();
        assert!(chunks.len() >= 1);
        for c in &chunks {
            assert!(c.token_count <= 10);
        }
    }

    // No text is dropped: every paragraph appears somewhere in the chunks.
    #[test]
    fn chunk_preserves_all_content() {
        let text = "First paragraph.\n\nSecond paragraph.\n\nThird paragraph.";
        let chunks = chunk(text).max_tokens(5).split();
        let combined: String = chunks
            .iter()
            .map(|c| c.content.as_str())
            .collect::<Vec<_>>()
            .join("");
        assert!(combined.contains("First"));
        assert!(combined.contains("Second"));
        assert!(combined.contains("Third"));
    }

    // Sanity check on a larger document: many chunks, all within the limit.
    #[test]
    fn chunk_large_document() {
        let text = "Lorem ipsum dolor sit amet. ".repeat(1000);
        let chunks = chunk(&text).max_tokens(100).split();
        assert!(chunks.len() >= 10);
        for c in &chunks {
            assert!(c.token_count <= 100);
        }
    }

    // Degenerate limit of one token per chunk still terminates and covers input.
    #[test]
    fn chunk_single_token_max() {
        let chunks = chunk("hello world foo bar").max_tokens(1).split();
        assert!(chunks.len() >= 4);
        for c in &chunks {
            assert!(c.token_count <= 1);
        }
    }

    // Unknown model names silently fall back to a working default encoder.
    #[test]
    fn resolve_encoder_unknown_falls_back() {
        let builder = chunk("test").model("nonexistent-model-xyz");
        let enc = builder.resolve_encoder();
        assert!(enc.count("hello") > 0);
    }

    // When both are set, the explicit encoding must win over the model name.
    #[test]
    fn model_and_encoding_are_independent() {
        // encoding takes precedence over model
        // gpt-4o uses o200k_base, but we explicitly set cl100k_base
        let enc_cl100k = chunk("test")
            .model("gpt-4o")
            .encoding("cl100k_base")
            .resolve_encoder();
        let enc_o200k = chunk("test").model("gpt-4o").resolve_encoder();

        // verify they are different encoders by checking that at least one of
        // several test strings produces different token counts
        let test_texts = [
            "hello_world_123_test",
            "foo::bar::baz::qux",
            "αβγδεζηθ",
            "1234567890",
        ];
        let any_different = test_texts
            .iter()
            .any(|t| enc_cl100k.count(t) != enc_o200k.count(t));
        assert!(
            any_different,
            "cl100k_base and o200k_base should produce different token counts for at least one test string"
        );
    }

    // Encoding alone (no model) resolves to a usable encoder.
    #[test]
    fn encoding_only_without_model() {
        let builder = chunk("test").encoding("cl100k_base");
        let enc = builder.resolve_encoder();
        assert!(enc.count("hello") > 0);
    }

    // Model alone (no encoding) resolves to a usable encoder.
    #[test]
    fn model_only_without_encoding() {
        let builder = chunk("test").model("gpt-4o");
        let enc = builder.resolve_encoder();
        assert!(enc.count("hello") > 0);
    }
}