chunk/
lib.rs

1//! The fastest semantic text chunking library — up to 1TB/s chunking throughput.
2//!
3//! This crate provides three main functionalities:
4//!
5//! 1. **Size-based chunking** ([`chunk`] module): Split text into chunks of a target size,
6//!    preferring to break at delimiter boundaries.
7//!
8//! 2. **Delimiter splitting** ([`split`] module): Split text at every delimiter occurrence,
9//!    equivalent to Cython's `split_text` function.
10//!
11//! 3. **Token-aware merging** ([`merge`] module): Merge segments based on token counts,
12//!    equivalent to Cython's `_merge_splits` function.
13//!
14//! # Examples
15//!
16//! ## Size-based chunking
17//!
18//! ```
19//! use chunk::chunk;
20//!
21//! let text = b"Hello world. How are you? I'm fine.\nThanks for asking.";
22//!
23//! // With defaults (4KB chunks, split at \n . ?)
24//! let chunks: Vec<&[u8]> = chunk(text).collect();
25//!
26//! // With custom size and delimiters
27//! let chunks: Vec<&[u8]> = chunk(text).size(1024).delimiters(b"\n.?!").collect();
28//!
29//! // With multi-byte pattern (e.g., metaspace for SentencePiece tokenizers)
30//! let metaspace = "▁".as_bytes(); // [0xE2, 0x96, 0x81]
31//! let chunks: Vec<&[u8]> = chunk(b"Hello\xE2\x96\x81World").pattern(metaspace).collect();
32//! ```
33//!
34//! ## Delimiter splitting
35//!
36//! ```
37//! use chunk::{split, split_at_delimiters, IncludeDelim};
38//!
39//! let text = b"Hello. World. Test.";
40//!
41//! // Using the builder API
42//! let slices = split(text).delimiters(b".").include_prev().collect_slices();
43//! assert_eq!(slices, vec![b"Hello.".as_slice(), b" World.".as_slice(), b" Test.".as_slice()]);
44//!
45//! // Using the function directly
46//! let offsets = split_at_delimiters(text, b".", IncludeDelim::Prev, 0);
47//! assert_eq!(&text[offsets[0].0..offsets[0].1], b"Hello.");
48//! ```
49//!
50//! ## Token-aware merging
51//!
52//! ```
53//! use chunk::merge_splits;
54//!
55//! // Merge text segments based on token counts
56//! let splits = vec!["a", "b", "c", "d", "e", "f", "g"];
57//! let token_counts = vec![1, 1, 1, 1, 1, 1, 1];
58//! let result = merge_splits(&splits, &token_counts, 3);
59//! assert_eq!(result.merged, vec!["abc", "def", "g"]);
60//! assert_eq!(result.token_counts, vec![3, 3, 1]);
61//! ```
62
63mod chunk;
64mod delim;
65mod merge;
66mod savgol;
67mod split;
68
69// Re-export from chunk module
70pub use crate::chunk::{Chunker, OwnedChunker, chunk};
71
72// Re-export from split module
73pub use crate::split::{IncludeDelim, PatternSplitter, Splitter, split, split_at_delimiters, split_at_patterns};
74
75// Re-export from merge module
76pub use crate::merge::{MergeResult, find_merge_indices, merge_splits};
77
78// Re-export constants from delim module
79pub use crate::delim::{DEFAULT_DELIMITERS, DEFAULT_TARGET_SIZE};
80
81// Re-export from savgol module
82pub use crate::savgol::{
83    FilteredIndices, MinimaResult, filter_split_indices, find_local_minima_interpolated,
84    savgol_filter, windowed_cross_similarity,
85};
86
// Integration tests exercising several modules together.
#[cfg(test)]
mod integration_tests {
    use super::*;

    #[test]
    fn test_chunk_and_split_consistency() {
        // Neither API may lose or duplicate input bytes.
        let input = b"Hello. World. Test.";

        let via_chunk = chunk(input)
            .delimiters(b".")
            .size(10)
            .fold(0, |acc, c| acc + c.len());
        let via_split = split_at_delimiters(input, b".", IncludeDelim::Prev, 0)
            .iter()
            .fold(0, |acc, (s, e)| acc + (e - s));

        assert_eq!(via_chunk, input.len());
        assert_eq!(via_split, input.len());
    }

    #[test]
    fn test_consecutive_delimiters_chunk() {
        // Adjacent delimiters must still round-trip every byte.
        let input = b"Hello\n\nWorld";
        let pieces: Vec<_> = chunk(input).size(8).delimiters(b"\n").collect();
        let recovered = pieces.iter().fold(0, |acc, c| acc + c.len());
        assert_eq!(recovered, input.len());
    }

    #[test]
    fn test_prefix_mode_chunk() {
        // Prefix mode attaches each delimiter to the chunk that follows it.
        let pieces: Vec<_> = chunk(b"Hello World Test")
            .size(8)
            .delimiters(b" ")
            .prefix()
            .collect();
        assert_eq!(pieces[0], b"Hello");
        assert_eq!(pieces[1], b" World");
        assert_eq!(pieces[2], b" Test");
    }

    #[test]
    fn test_prefix_preserves_total_bytes() {
        let input = b"Hello World Test More Words Here";
        let pieces: Vec<_> = chunk(input).size(10).delimiters(b" ").prefix().collect();
        let recovered = pieces.iter().fold(0, |acc, c| acc + c.len());
        assert_eq!(recovered, input.len());
    }

    #[test]
    fn test_prefix_mode_delimiter_at_window_start() {
        // A delimiter landing exactly on a window boundary must not drop bytes.
        let input = b"Hello world";
        let pieces: Vec<_> = chunk(input).size(5).delimiters(b" ").prefix().collect();
        let recovered = pieces.iter().fold(0, |acc, c| acc + c.len());
        assert_eq!(recovered, input.len());
        assert_eq!(pieces[0], b"Hello");
    }

    #[test]
    fn test_prefix_mode_small_chunks() {
        // Tiny windows must never produce zero-length chunks.
        let input = b"a b c d e";
        let pieces: Vec<_> = chunk(input).size(2).delimiters(b" ").prefix().collect();
        let recovered = pieces.iter().fold(0, |acc, c| acc + c.len());
        assert_eq!(recovered, input.len());
        assert!(pieces.iter().all(|c| !c.is_empty()), "Found empty chunk!");
    }

    // ============ Multi-byte pattern tests ============

    #[test]
    fn test_pattern_metaspace_suffix() {
        // Default (suffix) mode keeps the matched pattern at the end of a chunk.
        let sep = "▁".as_bytes();
        let input = "Hello▁World▁Test".as_bytes();
        let pieces: Vec<_> = chunk(input).pattern(sep).size(15).collect();
        assert_eq!(pieces[0], "Hello▁".as_bytes());
        assert_eq!(pieces[1], "World▁Test".as_bytes());
        let recovered = pieces.iter().fold(0, |acc, c| acc + c.len());
        assert_eq!(recovered, input.len());
    }

    #[test]
    fn test_pattern_metaspace_prefix() {
        // Prefix mode moves the matched pattern to the start of the next chunk.
        let sep = "▁".as_bytes();
        let input = "Hello▁World▁Test".as_bytes();
        let pieces: Vec<_> = chunk(input).pattern(sep).size(15).prefix().collect();
        assert_eq!(pieces[0], "Hello".as_bytes());
        assert_eq!(pieces[1], "▁World▁Test".as_bytes());
        let recovered = pieces.iter().fold(0, |acc, c| acc + c.len());
        assert_eq!(recovered, input.len());
    }

    #[test]
    fn test_pattern_preserves_bytes() {
        let sep = "▁".as_bytes();
        let input = "The▁quick▁brown▁fox▁jumps▁over▁the▁lazy▁dog".as_bytes();
        let pieces: Vec<_> = chunk(input).pattern(sep).size(20).collect();
        let recovered = pieces.iter().fold(0, |acc, c| acc + c.len());
        assert_eq!(recovered, input.len());
    }

    #[test]
    fn test_pattern_no_match_hard_split() {
        // With no pattern occurrence, chunking degrades to fixed-size cuts.
        let pieces: Vec<_> = chunk(b"abcdefghijklmnop").pattern(b"XYZ").size(5).collect();
        assert_eq!(pieces[0], b"abcde");
        assert_eq!(pieces[1], b"fghij");
    }

    #[test]
    fn test_pattern_single_byte_optimization() {
        // A one-byte pattern should behave exactly like a delimiter.
        let pieces: Vec<_> = chunk(b"Hello World Test")
            .pattern(b" ")
            .size(8)
            .prefix()
            .collect();
        assert_eq!(pieces[0], b"Hello");
        assert_eq!(pieces[1], b" World");
    }

    // ============ Consecutive and Forward Fallback Tests ============

    #[test]
    fn test_consecutive_pattern_basic() {
        // A run of the pattern stays glued to the chunk it prefixes.
        let sep = b"\xE2\x96\x81";
        let input = b"word\xE2\x96\x81\xE2\x96\x81\xE2\x96\x81next";
        let pieces: Vec<_> = chunk(input)
            .size(10)
            .pattern(sep)
            .prefix()
            .consecutive()
            .collect();
        let recovered = pieces.iter().fold(0, |acc, c| acc + c.len());
        assert_eq!(recovered, input.len());
        assert_eq!(pieces[0], b"word");
        assert!(pieces[1].starts_with(sep));
    }

    #[test]
    fn test_forward_fallback_basic() {
        // When no break point fits inside the window, scan forward instead.
        let sep = b"\xE2\x96\x81";
        let input = b"verylongword\xE2\x96\x81short";
        let pieces: Vec<_> = chunk(input)
            .size(6)
            .pattern(sep)
            .prefix()
            .forward_fallback()
            .collect();
        assert_eq!(pieces[0], b"verylongword");
        assert!(pieces[1].starts_with(sep));
    }

    #[test]
    fn test_delimiter_consecutive_basic() {
        let input = b"Hello\n\n\nWorld";
        let pieces: Vec<_> = chunk(input)
            .size(8)
            .delimiters(b"\n")
            .prefix()
            .consecutive()
            .collect();
        let recovered = pieces.iter().fold(0, |acc, c| acc + c.len());
        assert_eq!(recovered, input.len());
        assert_eq!(pieces[0], b"Hello");
        assert_eq!(pieces[1], b"\n\n\nWorld");
    }

    #[test]
    fn test_delimiter_forward_fallback_basic() {
        let input = b"verylongword next";
        let pieces: Vec<_> = chunk(input)
            .size(6)
            .delimiters(b" ")
            .prefix()
            .forward_fallback()
            .collect();
        assert_eq!(pieces[0], b"verylongword");
        assert_eq!(pieces[1], b" next");
    }

    #[test]
    fn test_owned_chunker_pattern() {
        // The owning variant must agree with the borrowing iterator API.
        let sep = "▁".as_bytes();
        let input = "Hello▁World▁Test".as_bytes().to_vec();
        let mut chunker = OwnedChunker::new(input.clone())
            .pattern(sep.to_vec())
            .size(15)
            .prefix();
        let pieces: Vec<_> = std::iter::from_fn(|| chunker.next_chunk()).collect();
        assert_eq!(pieces[0], "Hello".as_bytes());
        let recovered = pieces.iter().fold(0, |acc, c| acc + c.len());
        assert_eq!(recovered, input.len());
    }

    #[test]
    fn test_owned_chunker_collect_offsets() {
        // Offsets reported by the owning chunker index into the original buffer.
        let sep = "▁".as_bytes();
        let input = "Hello▁World▁Test".as_bytes().to_vec();
        let mut chunker = OwnedChunker::new(input.clone())
            .pattern(sep.to_vec())
            .size(15)
            .prefix();
        let offsets = chunker.collect_offsets();
        assert_eq!(offsets[0], (0, 5));
        assert_eq!(&input[offsets[0].0..offsets[0].1], "Hello".as_bytes());
    }
}
291}