chunk/chunk.rs
//! Size-based text chunking at delimiter boundaries.
//!
//! This module provides the [`Chunker`] and [`OwnedChunker`] types for splitting
//! text into chunks of a target size, preferring to break at delimiter boundaries.

use crate::delim::{DEFAULT_DELIMITERS, DEFAULT_TARGET_SIZE, build_table, compute_split_at};

/// Chunk text at delimiter boundaries.
///
/// Returns a builder that can be configured with `.size()` and `.delimiters()`,
/// or used directly as an iterator with defaults (4KB chunks, `\n.?` delimiters).
///
/// - For 1-3 delimiters: uses SIMD-accelerated memchr
/// - For 4+ delimiters: uses a lookup table
///
/// # Example
///
/// ```
/// use chunk::chunk;
///
/// let text = b"First sentence. Second sentence. Third sentence.";
///
/// // With defaults
/// let chunks: Vec<_> = chunk(text).collect();
///
/// // With custom size
/// let chunks: Vec<_> = chunk(text).size(1024).collect();
///
/// // With custom delimiters
/// let chunks: Vec<_> = chunk(text).delimiters(b"\n.?!").collect();
///
/// // With both
/// let chunks: Vec<_> = chunk(text).size(8192).delimiters(b"\n").collect();
/// ```
pub fn chunk(text: &[u8]) -> Chunker<'_> {
    Chunker::new(text)
}

/// Chunker splits text at delimiter boundaries.
///
/// Created via [`chunk()`]; configure it with `.size()` and `.delimiters()`.
/// For multi-byte delimiters, use `.pattern()` instead.
pub struct Chunker<'a> {
    text: &'a [u8],
    target_size: usize,
    delimiters: &'a [u8],
    pattern: Option<&'a [u8]>,
    pos: usize,
    table: Option<[bool; 256]>,
    initialized: bool,
    prefix_mode: bool,
    /// When true, find the START of consecutive pattern runs (not the middle).
    consecutive: bool,
    /// When true, search forward if no pattern is found in the backward window.
    forward_fallback: bool,
}

impl<'a> Chunker<'a> {
    fn new(text: &'a [u8]) -> Self {
        Self {
            text,
            target_size: DEFAULT_TARGET_SIZE,
            delimiters: DEFAULT_DELIMITERS,
            pattern: None,
            pos: 0,
            table: None,
            initialized: false,
            prefix_mode: false,
            consecutive: false,
            forward_fallback: false,
        }
    }

    /// Set the target chunk size in bytes.
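    ///
    /// Illustrative example: when no delimiter falls inside the search window,
    /// the chunker falls back to a hard split at the target size.
    ///
    /// ```
    /// use chunk::chunk;
    /// let chunks: Vec<_> = chunk(b"abcdefghij").size(5).delimiters(b".").collect();
    /// assert_eq!(chunks, vec![b"abcde".as_slice(), b"fghij".as_slice()]);
    /// ```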
    pub fn size(mut self, size: usize) -> Self {
        self.target_size = size;
        self
    }

    /// Set single-byte delimiters to split on.
    ///
    /// Mutually exclusive with `pattern()`; the last one set wins.
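    ///
    /// Illustrative example of "last one set wins": calling `delimiters()` after
    /// `pattern()` switches back to single-byte delimiter mode.
    ///
    /// ```
    /// use chunk::chunk;
    /// let chunks: Vec<_> = chunk(b"Hello. World.")
    ///     .pattern(b"\xE2\x96\x81")
    ///     .delimiters(b".")
    ///     .size(10)
    ///     .collect();
    /// assert_eq!(chunks[0], b"Hello.");
    /// ```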
    pub fn delimiters(mut self, delimiters: &'a [u8]) -> Self {
        self.delimiters = delimiters;
        self.pattern = None; // Clear pattern mode
        self
    }

    /// Set a multi-byte pattern to split on.
    ///
    /// Use this for multi-byte delimiters like UTF-8 characters (e.g., metaspace `▁`).
    /// Mutually exclusive with `delimiters()`; the last one set wins.
    ///
    /// ```
    /// use chunk::chunk;
    /// let metaspace = "▁".as_bytes(); // [0xE2, 0x96, 0x81]
    /// let chunks: Vec<_> = chunk(b"Hello\xE2\x96\x81World\xE2\x96\x81Test")
    ///     .size(15)
    ///     .pattern(metaspace)
    ///     .prefix()
    ///     .collect();
    /// assert_eq!(chunks[0], b"Hello");
    /// assert_eq!(chunks[1], b"\xE2\x96\x81World\xE2\x96\x81Test");
    /// ```
    pub fn pattern(mut self, pattern: &'a [u8]) -> Self {
        self.pattern = Some(pattern);
        self.delimiters = &[]; // Clear single-byte delimiters
        self
    }

    /// Put delimiter at the start of the next chunk (prefix mode).
    ///
    /// ```
    /// use chunk::chunk;
    /// let chunks: Vec<_> = chunk(b"Hello World").size(8).delimiters(b" ").prefix().collect();
    /// assert_eq!(chunks, vec![b"Hello".as_slice(), b" World".as_slice()]);
    /// ```
    pub fn prefix(mut self) -> Self {
        self.prefix_mode = true;
        self
    }

    /// Put delimiter at the end of the current chunk (suffix mode, default).
    ///
    /// ```
    /// use chunk::chunk;
    /// let chunks: Vec<_> = chunk(b"Hello World").size(8).delimiters(b" ").suffix().collect();
    /// assert_eq!(chunks, vec![b"Hello ".as_slice(), b"World".as_slice()]);
    /// ```
    pub fn suffix(mut self) -> Self {
        self.prefix_mode = false;
        self
    }

    /// Enable consecutive delimiter/pattern handling.
    ///
    /// When splitting, ensures we split at the START of a consecutive run
    /// of the same delimiter/pattern, not in the middle. For example:
    /// - With a pattern: `"word▁▁▁next"` splits into `["word", "▁▁▁next"]`
    /// - With a delimiter: `"word\n\n\nnext"` splits into `["word", "\n\n\nnext"]`
    ///
    /// This is useful for patterns that can merge (like BPE tokenization)
    /// or when consecutive delimiters have semantic meaning (like `\n\n`
    /// for paragraph breaks).
    ///
    /// Works with both `.pattern()` and `.delimiters()`.
    ///
    /// ```
    /// use chunk::chunk;
    ///
    /// // With pattern
    /// let text = b"word\xE2\x96\x81\xE2\x96\x81\xE2\x96\x81next"; // word▁▁▁next
    /// let metaspace = b"\xE2\x96\x81";
    /// let chunks: Vec<_> = chunk(text)
    ///     .pattern(metaspace)
    ///     .size(10)
    ///     .prefix()
    ///     .consecutive()
    ///     .collect();
    /// assert_eq!(chunks[0], b"word");
    ///
    /// // With delimiters
    /// let text = b"Hello\n\n\nWorld";
    /// let chunks: Vec<_> = chunk(text)
    ///     .delimiters(b"\n")
    ///     .size(8)
    ///     .prefix()
    ///     .consecutive()
    ///     .collect();
    /// assert_eq!(chunks[0], b"Hello");
    /// assert_eq!(chunks[1], b"\n\n\nWorld");
    /// ```
    pub fn consecutive(mut self) -> Self {
        self.consecutive = true;
        self
    }

    /// Enable forward fallback search.
    ///
    /// When no delimiter/pattern is found in the backward search window,
    /// search forward from target_end instead of doing a hard split.
    ///
    /// This ensures splits always occur at semantic boundaries when possible,
    /// even if the nearest boundary is past the target size.
    ///
    /// Works with both `.pattern()` and `.delimiters()`.
    ///
    /// ```
    /// use chunk::chunk;
    ///
    /// // With pattern
    /// let text = b"verylongword\xE2\x96\x81short"; // verylongword▁short
    /// let metaspace = b"\xE2\x96\x81";
    /// let chunks: Vec<_> = chunk(text)
    ///     .pattern(metaspace)
    ///     .size(6)
    ///     .prefix()
    ///     .forward_fallback()
    ///     .collect();
    /// // Without forward_fallback: hard split at position 6
    /// // With forward_fallback: finds ▁ at position 12
    /// assert_eq!(chunks[0], b"verylongword");
    ///
    /// // With delimiters
    /// let text = b"verylongword next";
    /// let chunks: Vec<_> = chunk(text)
    ///     .delimiters(b" ")
    ///     .size(6)
    ///     .prefix()
    ///     .forward_fallback()
    ///     .collect();
    /// assert_eq!(chunks[0], b"verylongword");
    /// assert_eq!(chunks[1], b" next");
    /// ```
    pub fn forward_fallback(mut self) -> Self {
        self.forward_fallback = true;
        self
    }

    /// Initialize lookup table if needed (called on first iteration).
    fn init(&mut self) {
        if !self.initialized {
            self.table = build_table(self.delimiters);
            self.initialized = true;
        }
    }
}

impl<'a> Iterator for Chunker<'a> {
    type Item = &'a [u8];

    fn next(&mut self) -> Option<Self::Item> {
        self.init();

        if self.pos >= self.text.len() {
            return None;
        }

        let remaining = self.text.len() - self.pos;

        // Last chunk - return remainder
        if remaining <= self.target_size {
            let chunk = &self.text[self.pos..];
            self.pos = self.text.len();
            return Some(chunk);
        }

        let end = self.pos + self.target_size;

        let split_at = compute_split_at(
            self.text,
            self.pos,
            end,
            self.pattern,
            self.delimiters,
            self.table.as_ref(),
            self.prefix_mode,
            self.consecutive,
            self.forward_fallback,
        );

        let chunk = &self.text[self.pos..split_at];
        self.pos = split_at;
        Some(chunk)
    }
}

/// Owned chunker for FFI bindings (Python, WASM).
///
/// Unlike [`Chunker`], this owns its data and returns owned chunks.
/// Use this when you need to cross FFI boundaries where lifetimes can't be tracked.
///
/// # Example
///
/// ```
/// use chunk::OwnedChunker;
///
/// let text = b"Hello world. How are you?".to_vec();
/// let mut chunker = OwnedChunker::new(text)
///     .size(15)
///     .delimiters(b"\n.?".to_vec());
///
/// while let Some(chunk) = chunker.next_chunk() {
///     println!("{:?}", chunk);
/// }
/// ```
pub struct OwnedChunker {
    text: Vec<u8>,
    target_size: usize,
    delimiters: Vec<u8>,
    pattern: Option<Vec<u8>>,
    pos: usize,
    table: Option<[bool; 256]>,
    initialized: bool,
    prefix_mode: bool,
    consecutive: bool,
    forward_fallback: bool,
}

impl OwnedChunker {
    /// Create a new owned chunker with the given text.
    pub fn new(text: Vec<u8>) -> Self {
        Self {
            text,
            target_size: DEFAULT_TARGET_SIZE,
            delimiters: DEFAULT_DELIMITERS.to_vec(),
            pattern: None,
            pos: 0,
            table: None,
            initialized: false,
            prefix_mode: false,
            consecutive: false,
            forward_fallback: false,
        }
    }

    /// Set the target chunk size in bytes.
    pub fn size(mut self, size: usize) -> Self {
        self.target_size = size;
        self
    }

    /// Set single-byte delimiters to split on.
    ///
    /// Mutually exclusive with `pattern()`; the last one set wins.
    pub fn delimiters(mut self, delimiters: Vec<u8>) -> Self {
        self.delimiters = delimiters;
        self.pattern = None; // Clear pattern mode
        self
    }

    /// Set a multi-byte pattern to split on.
    ///
    /// Use this for multi-byte delimiters like UTF-8 characters (e.g., metaspace `▁`).
    /// Mutually exclusive with `delimiters()`; the last one set wins.
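    ///
    /// Illustrative example, mirroring the borrowed [`Chunker::pattern`] one:
    ///
    /// ```
    /// use chunk::OwnedChunker;
    /// let metaspace = "▁".as_bytes().to_vec(); // [0xE2, 0x96, 0x81]
    /// let mut chunker = OwnedChunker::new(b"Hello\xE2\x96\x81World\xE2\x96\x81Test".to_vec())
    ///     .size(15)
    ///     .pattern(metaspace)
    ///     .prefix();
    /// assert_eq!(chunker.next_chunk(), Some(b"Hello".to_vec()));
    /// ```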
    pub fn pattern(mut self, pattern: Vec<u8>) -> Self {
        self.pattern = Some(pattern);
        self.delimiters = vec![]; // Clear single-byte delimiters
        self
    }

    /// Put delimiter at the start of the next chunk (prefix mode).
    pub fn prefix(mut self) -> Self {
        self.prefix_mode = true;
        self
    }

    /// Put delimiter at the end of the current chunk (suffix mode, default).
    pub fn suffix(mut self) -> Self {
        self.prefix_mode = false;
        self
    }

    /// Enable consecutive delimiter/pattern handling.
    ///
    /// When splitting, ensures we split at the START of a consecutive run
    /// of the same delimiter/pattern, not in the middle.
    /// Works with both `.pattern()` and `.delimiters()`.
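    ///
    /// Illustrative example (same input as the [`Chunker::consecutive`] docs):
    ///
    /// ```
    /// use chunk::OwnedChunker;
    /// let mut chunker = OwnedChunker::new(b"Hello\n\n\nWorld".to_vec())
    ///     .delimiters(b"\n".to_vec())
    ///     .size(8)
    ///     .prefix()
    ///     .consecutive();
    /// assert_eq!(chunker.next_chunk(), Some(b"Hello".to_vec()));
    /// assert_eq!(chunker.next_chunk(), Some(b"\n\n\nWorld".to_vec()));
    /// ```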
    pub fn consecutive(mut self) -> Self {
        self.consecutive = true;
        self
    }

    /// Enable forward fallback search.
    ///
    /// When no delimiter/pattern is found in the backward search window,
    /// search forward from target_end instead of doing a hard split.
    /// Works with both `.pattern()` and `.delimiters()`.
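    ///
    /// Illustrative example (same input as the [`Chunker::forward_fallback`] docs):
    ///
    /// ```
    /// use chunk::OwnedChunker;
    /// let mut chunker = OwnedChunker::new(b"verylongword next".to_vec())
    ///     .delimiters(b" ".to_vec())
    ///     .size(6)
    ///     .prefix()
    ///     .forward_fallback();
    /// assert_eq!(chunker.next_chunk(), Some(b"verylongword".to_vec()));
    /// assert_eq!(chunker.next_chunk(), Some(b" next".to_vec()));
    /// ```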
    pub fn forward_fallback(mut self) -> Self {
        self.forward_fallback = true;
        self
    }

    /// Initialize lookup table if needed.
    fn init(&mut self) {
        if !self.initialized {
            self.table = build_table(&self.delimiters);
            self.initialized = true;
        }
    }

    /// Get the next chunk, or None if exhausted.
    pub fn next_chunk(&mut self) -> Option<Vec<u8>> {
        self.init();

        if self.pos >= self.text.len() {
            return None;
        }

        let remaining = self.text.len() - self.pos;

        // Last chunk - return remainder
        if remaining <= self.target_size {
            let chunk = self.text[self.pos..].to_vec();
            self.pos = self.text.len();
            return Some(chunk);
        }

        let end = self.pos + self.target_size;

        let split_at = compute_split_at(
            &self.text,
            self.pos,
            end,
            self.pattern.as_deref(),
            &self.delimiters,
            self.table.as_ref(),
            self.prefix_mode,
            self.consecutive,
            self.forward_fallback,
        );

        let chunk = self.text[self.pos..split_at].to_vec();
        self.pos = split_at;
        Some(chunk)
    }

    /// Reset the chunker to start from the beginning.
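    ///
    /// Illustrative example: after `reset()`, iteration starts over from the
    /// first chunk.
    ///
    /// ```
    /// use chunk::OwnedChunker;
    /// let mut chunker = OwnedChunker::new(b"Hello. World.".to_vec())
    ///     .size(10)
    ///     .delimiters(b".".to_vec());
    /// let first = chunker.next_chunk();
    /// while chunker.next_chunk().is_some() {}
    /// chunker.reset();
    /// assert_eq!(chunker.next_chunk(), first);
    /// ```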
    pub fn reset(&mut self) {
        self.pos = 0;
    }

    /// Get a reference to the underlying text.
    pub fn text(&self) -> &[u8] {
        &self.text
    }

    /// Collect all chunk offsets as (start, end) pairs.
    /// This is more efficient for FFI as it returns all offsets in one call.
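    ///
    /// Illustrative example: the offsets index into the underlying text and,
    /// taken together, cover the whole input.
    ///
    /// ```
    /// use chunk::OwnedChunker;
    /// let text = b"Hello. World. Test.".to_vec();
    /// let mut chunker = OwnedChunker::new(text.clone())
    ///     .size(10)
    ///     .delimiters(b".".to_vec());
    /// let offsets = chunker.collect_offsets();
    /// assert_eq!(&text[offsets[0].0..offsets[0].1], b"Hello.");
    /// assert_eq!(offsets.last().unwrap().1, text.len());
    /// ```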
    pub fn collect_offsets(&mut self) -> Vec<(usize, usize)> {
        self.init();

        let mut offsets = Vec::new();
        let mut pos = 0;

        while pos < self.text.len() {
            let remaining = self.text.len() - pos;

            if remaining <= self.target_size {
                offsets.push((pos, self.text.len()));
                break;
            }

            let end = pos + self.target_size;

            let split_at = compute_split_at(
                &self.text,
                pos,
                end,
                self.pattern.as_deref(),
                &self.delimiters,
                self.table.as_ref(),
                self.prefix_mode,
                self.consecutive,
                self.forward_fallback,
            );

            offsets.push((pos, split_at));
            pos = split_at;
        }

        offsets
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_basic_chunking() {
        let text = b"Hello. World. Test.";
        let chunks: Vec<_> = chunk(text).size(10).delimiters(b".").collect();
        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0], b"Hello.");
        assert_eq!(chunks[1], b" World.");
        assert_eq!(chunks[2], b" Test.");
    }

    #[test]
    fn test_newline_delimiter() {
        let text = b"Line one\nLine two\nLine three";
        let chunks: Vec<_> = chunk(text).size(15).delimiters(b"\n").collect();
        assert_eq!(chunks[0], b"Line one\n");
        assert_eq!(chunks[1], b"Line two\n");
        assert_eq!(chunks[2], b"Line three");
    }

    #[test]
    fn test_multiple_delimiters() {
        let text = b"Hello? World. Yes!";
        let chunks: Vec<_> = chunk(text).size(10).delimiters(b".?!").collect();
        assert_eq!(chunks[0], b"Hello?");
    }

    #[test]
    fn test_four_delimiters_uses_table() {
        let text = b"A. B? C! D; E";
        let chunks: Vec<_> = chunk(text).size(5).delimiters(b".?!;").collect();
        assert!(chunks.len() >= 2);
    }

    #[test]
    fn test_no_delimiter_hard_split() {
        let text = b"abcdefghij";
        let chunks: Vec<_> = chunk(text).size(5).delimiters(b".").collect();
        assert_eq!(chunks[0], b"abcde");
        assert_eq!(chunks[1], b"fghij");
    }

    #[test]
    fn test_empty_text() {
        let text = b"";
        let chunks: Vec<_> = chunk(text).size(10).delimiters(b".").collect();
        assert_eq!(chunks.len(), 0);
    }

    #[test]
    fn test_text_smaller_than_target() {
        let text = b"Small";
        let chunks: Vec<_> = chunk(text).size(100).delimiters(b".").collect();
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0], b"Small");
    }

    #[test]
    fn test_total_bytes_preserved() {
        let text = b"The quick brown fox jumps over the lazy dog. How vexingly quick!";
        let chunks: Vec<_> = chunk(text).size(20).delimiters(b"\n.?!").collect();
        let total: usize = chunks.iter().map(|c| c.len()).sum();
        assert_eq!(total, text.len());
    }
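
    // Illustrative addition: prefix mode should also partition the input
    // without dropping bytes, since each chunk starts where the previous
    // one ended.
    #[test]
    fn test_total_bytes_preserved_prefix_mode() {
        let text = b"Hello World Test";
        let chunks: Vec<_> = chunk(text).size(8).delimiters(b" ").prefix().collect();
        let total: usize = chunks.iter().map(|c| c.len()).sum();
        assert_eq!(total, text.len());
    }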

    #[test]
    fn test_defaults() {
        let text = b"Hello world. This is a test.";
        let chunks: Vec<_> = chunk(text).collect();
        assert!(!chunks.is_empty());
    }

    #[test]
    fn test_prefix_mode() {
        let text = b"Hello World Test";
        let chunks: Vec<_> = chunk(text).size(8).delimiters(b" ").prefix().collect();
        assert_eq!(chunks[0], b"Hello");
        assert_eq!(chunks[1], b" World");
        assert_eq!(chunks[2], b" Test");
    }

    #[test]
    fn test_suffix_mode() {
        let text = b"Hello World Test";
        let chunks: Vec<_> = chunk(text).size(8).delimiters(b" ").suffix().collect();
        assert_eq!(chunks[0], b"Hello ");
        assert_eq!(chunks[1], b"World ");
        assert_eq!(chunks[2], b"Test");
    }

    #[test]
    fn test_consecutive_delimiters() {
        let text = b"Hello\n\n\nWorld";
        let chunks: Vec<_> = chunk(text)
            .delimiters(b"\n")
            .size(8)
            .prefix()
            .consecutive()
            .collect();
        assert_eq!(chunks[0], b"Hello");
        assert_eq!(chunks[1], b"\n\n\nWorld");
    }

    #[test]
    fn test_forward_fallback() {
        let text = b"verylongword next";
        let chunks: Vec<_> = chunk(text)
            .delimiters(b" ")
            .size(6)
            .prefix()
            .forward_fallback()
            .collect();
        assert_eq!(chunks[0], b"verylongword");
        assert_eq!(chunks[1], b" next");
    }

    #[test]
    fn test_pattern_metaspace() {
        let metaspace = "▁".as_bytes();
        let text = "Hello▁World▁Test".as_bytes();
        let chunks: Vec<_> = chunk(text).size(15).pattern(metaspace).prefix().collect();
        assert_eq!(chunks[0], "Hello".as_bytes());
        assert_eq!(chunks[1], "▁World▁Test".as_bytes());
    }

    #[test]
    fn test_owned_chunker() {
        let text = b"Hello. World. Test.".to_vec();
        let mut chunker = OwnedChunker::new(text).size(10).delimiters(b".".to_vec());

        let mut chunks = Vec::new();
        while let Some(c) = chunker.next_chunk() {
            chunks.push(c);
        }

        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0], b"Hello.");
    }

    #[test]
    fn test_owned_chunker_collect_offsets() {
        let text = b"Hello. World. Test.".to_vec();
        let mut chunker = OwnedChunker::new(text.clone())
            .size(10)
            .delimiters(b".".to_vec());

        let offsets = chunker.collect_offsets();
        assert_eq!(offsets.len(), 3);
        assert_eq!(&text[offsets[0].0..offsets[0].1], b"Hello.");
    }
}
619}