chunk/
chunk.rs

//! Size-based text chunking at delimiter boundaries.
//!
//! This module provides the [`Chunker`] and [`OwnedChunker`] types for splitting
//! text into chunks of a target size, preferring to break at delimiter boundaries.

use crate::delim::{DEFAULT_DELIMITERS, DEFAULT_TARGET_SIZE, build_table, compute_split_at};

/// Chunk text at delimiter boundaries.
///
/// Returns a builder that can be configured with `.size()` and `.delimiters()`,
/// or used directly as an iterator with defaults (4KB chunks, `\n.?` delimiters).
///
/// - For 1-3 delimiters: uses SIMD-accelerated memchr
/// - For 4+ delimiters: uses lookup table
///
/// # Example
///
/// ```
/// use chunk::chunk;
///
/// let text = b"First sentence. Second sentence. Third sentence.";
///
/// // With defaults
/// let chunks: Vec<_> = chunk(text).collect();
///
/// // With custom size
/// let chunks: Vec<_> = chunk(text).size(1024).collect();
///
/// // With custom delimiters
/// let chunks: Vec<_> = chunk(text).delimiters(b"\n.?!").collect();
///
/// // With both
/// let chunks: Vec<_> = chunk(text).size(8192).delimiters(b"\n").collect();
/// ```
pub fn chunk(text: &[u8]) -> Chunker<'_> {
    Chunker::new(text)
}

/// Chunker splits text at delimiter boundaries.
///
/// Created via [`chunk()`]; it can be configured with `.size()` and `.delimiters()`.
/// For multi-byte delimiters, use `.pattern()` instead.
pub struct Chunker<'a> {
    text: &'a [u8],
    target_size: usize,
    delimiters: &'a [u8],
    pattern: Option<&'a [u8]>,
    pos: usize,
    table: Option<[bool; 256]>,
    initialized: bool,
    prefix_mode: bool,
    /// When true, find the START of consecutive pattern runs (not middle)
    consecutive: bool,
    /// When true, search forward if no pattern found in backward window
    forward_fallback: bool,
}

impl<'a> Chunker<'a> {
    fn new(text: &'a [u8]) -> Self {
        Self {
            text,
            target_size: DEFAULT_TARGET_SIZE,
            delimiters: DEFAULT_DELIMITERS,
            pattern: None,
            pos: 0,
            table: None,
            initialized: false,
            prefix_mode: false,
            consecutive: false,
            forward_fallback: false,
        }
    }

    /// Set the target chunk size in bytes.
    pub fn size(mut self, size: usize) -> Self {
        self.target_size = size;
        self
    }

    /// Set single-byte delimiters to split on.
    ///
    /// Mutually exclusive with `pattern()` - last one set wins.
    pub fn delimiters(mut self, delimiters: &'a [u8]) -> Self {
        self.delimiters = delimiters;
        self.pattern = None; // Clear pattern mode
        self
    }

    /// Set a multi-byte pattern to split on.
    ///
    /// Use this for multi-byte delimiters like UTF-8 characters (e.g., metaspace `▁`).
    /// Mutually exclusive with `delimiters()` - last one set wins.
    ///
    /// ```
    /// use chunk::chunk;
    /// let metaspace = "▁".as_bytes(); // [0xE2, 0x96, 0x81]
    /// let chunks: Vec<_> = chunk(b"Hello\xE2\x96\x81World\xE2\x96\x81Test")
    ///     .size(15)
    ///     .pattern(metaspace)
    ///     .prefix()
    ///     .collect();
    /// assert_eq!(chunks[0], b"Hello");
    /// assert_eq!(chunks[1], b"\xE2\x96\x81World\xE2\x96\x81Test");
    /// ```
    pub fn pattern(mut self, pattern: &'a [u8]) -> Self {
        self.pattern = Some(pattern);
        self.delimiters = &[]; // Clear single-byte delimiters
        self
    }

    /// Put delimiter at the start of the next chunk (prefix mode).
    ///
    /// ```
    /// use chunk::chunk;
    /// let chunks: Vec<_> = chunk(b"Hello World").size(8).delimiters(b" ").prefix().collect();
    /// assert_eq!(chunks, vec![b"Hello".as_slice(), b" World".as_slice()]);
    /// ```
    pub fn prefix(mut self) -> Self {
        self.prefix_mode = true;
        self
    }

    /// Put delimiter at the end of the current chunk (suffix mode, default).
    ///
    /// ```
    /// use chunk::chunk;
    /// let chunks: Vec<_> = chunk(b"Hello World").size(8).delimiters(b" ").suffix().collect();
    /// assert_eq!(chunks, vec![b"Hello ".as_slice(), b"World".as_slice()]);
    /// ```
    pub fn suffix(mut self) -> Self {
        self.prefix_mode = false;
        self
    }

    /// Enable consecutive delimiter/pattern handling.
    ///
    /// When splitting, ensures we split at the START of a consecutive run
    /// of the same delimiter/pattern, not in the middle. For example:
    /// - With pattern: "word▁▁▁next" splits as ["word"]["▁▁▁next"]
    /// - With delimiter: "word\n\n\nnext" splits as ["word"]["\n\n\nnext"]
    ///
    /// This is useful for patterns that can merge (like BPE tokenization)
    /// or when consecutive delimiters have semantic meaning (like `\n\n`
    /// for paragraph breaks).
    ///
    /// Works with both `.pattern()` and `.delimiters()`.
    ///
    /// ```
    /// use chunk::chunk;
    ///
    /// // With pattern
    /// let text = b"word\xE2\x96\x81\xE2\x96\x81\xE2\x96\x81next"; // word▁▁▁next
    /// let metaspace = b"\xE2\x96\x81";
    /// let chunks: Vec<_> = chunk(text)
    ///     .pattern(metaspace)
    ///     .size(10)
    ///     .prefix()
    ///     .consecutive()
    ///     .collect();
    /// assert_eq!(chunks[0], b"word");
    ///
    /// // With delimiters
    /// let text = b"Hello\n\n\nWorld";
    /// let chunks: Vec<_> = chunk(text)
    ///     .delimiters(b"\n")
    ///     .size(8)
    ///     .prefix()
    ///     .consecutive()
    ///     .collect();
    /// assert_eq!(chunks[0], b"Hello");
    /// assert_eq!(chunks[1], b"\n\n\nWorld");
    /// ```
    pub fn consecutive(mut self) -> Self {
        self.consecutive = true;
        self
    }

    /// Enable forward fallback search.
    ///
    /// When no delimiter/pattern is found in the backward search window,
    /// search forward from target_end instead of doing a hard split.
    ///
    /// This ensures splits always occur at semantic boundaries when possible,
    /// even if the nearest boundary is past the target size.
    ///
    /// Works with both `.pattern()` and `.delimiters()`.
    ///
    /// ```
    /// use chunk::chunk;
    ///
    /// // With pattern
    /// let text = b"verylongword\xE2\x96\x81short"; // verylongword▁short
    /// let metaspace = b"\xE2\x96\x81";
    /// let chunks: Vec<_> = chunk(text)
    ///     .pattern(metaspace)
    ///     .size(6)
    ///     .prefix()
    ///     .forward_fallback()
    ///     .collect();
    /// // Without forward_fallback: hard split at position 6
    /// // With forward_fallback: finds ▁ at position 12
    /// assert_eq!(chunks[0], b"verylongword");
    ///
    /// // With delimiters
    /// let text = b"verylongword next";
    /// let chunks: Vec<_> = chunk(text)
    ///     .delimiters(b" ")
    ///     .size(6)
    ///     .prefix()
    ///     .forward_fallback()
    ///     .collect();
    /// assert_eq!(chunks[0], b"verylongword");
    /// assert_eq!(chunks[1], b" next");
    /// ```
    pub fn forward_fallback(mut self) -> Self {
        self.forward_fallback = true;
        self
    }

    /// Initialize lookup table if needed (called on first iteration).
    fn init(&mut self) {
        if !self.initialized {
            self.table = build_table(self.delimiters);
            self.initialized = true;
        }
    }
}

impl<'a> Iterator for Chunker<'a> {
    type Item = &'a [u8];

    fn next(&mut self) -> Option<Self::Item> {
        self.init();

        if self.pos >= self.text.len() {
            return None;
        }

        let remaining = self.text.len() - self.pos;

        // Last chunk - return remainder
        if remaining <= self.target_size {
            let chunk = &self.text[self.pos..];
            self.pos = self.text.len();
            return Some(chunk);
        }

        let end = self.pos + self.target_size;

        let split_at = compute_split_at(
            self.text,
            self.pos,
            end,
            self.pattern,
            self.delimiters,
            self.table.as_ref(),
            self.prefix_mode,
            self.consecutive,
            self.forward_fallback,
        );

        let chunk = &self.text[self.pos..split_at];
        self.pos = split_at;
        Some(chunk)
    }
}

/// Owned chunker for FFI bindings (Python, WASM).
///
/// Unlike [`Chunker`], this owns its data and returns owned chunks.
/// Use this when you need to cross FFI boundaries where lifetimes can't be tracked.
///
/// # Example
///
/// ```
/// use chunk::OwnedChunker;
///
/// let text = b"Hello world. How are you?".to_vec();
/// let mut chunker = OwnedChunker::new(text)
///     .size(15)
///     .delimiters(b"\n.?".to_vec());
///
/// while let Some(chunk) = chunker.next_chunk() {
///     println!("{:?}", chunk);
/// }
/// ```
pub struct OwnedChunker {
    text: Vec<u8>,
    target_size: usize,
    delimiters: Vec<u8>,
    pattern: Option<Vec<u8>>,
    pos: usize,
    table: Option<[bool; 256]>,
    initialized: bool,
    prefix_mode: bool,
    consecutive: bool,
    forward_fallback: bool,
}

impl OwnedChunker {
    /// Create a new owned chunker with the given text.
    pub fn new(text: Vec<u8>) -> Self {
        Self {
            text,
            target_size: DEFAULT_TARGET_SIZE,
            delimiters: DEFAULT_DELIMITERS.to_vec(),
            pattern: None,
            pos: 0,
            table: None,
            initialized: false,
            prefix_mode: false,
            consecutive: false,
            forward_fallback: false,
        }
    }

    /// Set the target chunk size in bytes.
    pub fn size(mut self, size: usize) -> Self {
        self.target_size = size;
        self
    }

    /// Set single-byte delimiters to split on.
    ///
    /// Mutually exclusive with `pattern()` - last one set wins.
    pub fn delimiters(mut self, delimiters: Vec<u8>) -> Self {
        self.delimiters = delimiters;
        self.pattern = None; // Clear pattern mode
        self
    }

    /// Set a multi-byte pattern to split on.
    ///
    /// Use this for multi-byte delimiters like UTF-8 characters (e.g., metaspace `▁`).
    /// Mutually exclusive with `delimiters()` - last one set wins.
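    ///
    /// A usage sketch mirroring the borrowed [`Chunker::pattern`] example above,
    /// through the owned API:
    ///
    /// ```
    /// use chunk::OwnedChunker;
    ///
    /// let metaspace = "▁".as_bytes().to_vec(); // [0xE2, 0x96, 0x81]
    /// let mut chunker = OwnedChunker::new(b"Hello\xE2\x96\x81World\xE2\x96\x81Test".to_vec())
    ///     .size(15)
    ///     .pattern(metaspace)
    ///     .prefix();
    /// assert_eq!(chunker.next_chunk().unwrap(), b"Hello");
    /// assert_eq!(chunker.next_chunk().unwrap(), b"\xE2\x96\x81World\xE2\x96\x81Test");
    /// ```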
    pub fn pattern(mut self, pattern: Vec<u8>) -> Self {
        self.pattern = Some(pattern);
        self.delimiters = vec![]; // Clear single-byte delimiters
        self
    }

    /// Put delimiter at the start of the next chunk (prefix mode).
    pub fn prefix(mut self) -> Self {
        self.prefix_mode = true;
        self
    }

    /// Put delimiter at the end of the current chunk (suffix mode, default).
    pub fn suffix(mut self) -> Self {
        self.prefix_mode = false;
        self
    }

    /// Enable consecutive delimiter/pattern handling.
    ///
    /// When splitting, ensures we split at the START of a consecutive run
    /// of the same delimiter/pattern, not in the middle.
    /// Works with both `.pattern()` and `.delimiters()`.
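    ///
    /// A sketch mirroring the delimiter example on [`Chunker::consecutive`], via the owned API:
    ///
    /// ```
    /// use chunk::OwnedChunker;
    ///
    /// let mut chunker = OwnedChunker::new(b"Hello\n\n\nWorld".to_vec())
    ///     .delimiters(b"\n".to_vec())
    ///     .size(8)
    ///     .prefix()
    ///     .consecutive();
    /// assert_eq!(chunker.next_chunk().unwrap(), b"Hello");
    /// assert_eq!(chunker.next_chunk().unwrap(), b"\n\n\nWorld");
    /// ```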
    pub fn consecutive(mut self) -> Self {
        self.consecutive = true;
        self
    }

    /// Enable forward fallback search.
    ///
    /// When no delimiter/pattern is found in the backward search window,
    /// search forward from target_end instead of doing a hard split.
    /// Works with both `.pattern()` and `.delimiters()`.
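    ///
    /// A sketch mirroring the delimiter example on [`Chunker::forward_fallback`], via the owned API:
    ///
    /// ```
    /// use chunk::OwnedChunker;
    ///
    /// let mut chunker = OwnedChunker::new(b"verylongword next".to_vec())
    ///     .delimiters(b" ".to_vec())
    ///     .size(6)
    ///     .prefix()
    ///     .forward_fallback();
    /// assert_eq!(chunker.next_chunk().unwrap(), b"verylongword");
    /// assert_eq!(chunker.next_chunk().unwrap(), b" next");
    /// ```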
    pub fn forward_fallback(mut self) -> Self {
        self.forward_fallback = true;
        self
    }

    /// Initialize lookup table if needed.
    fn init(&mut self) {
        if !self.initialized {
            self.table = build_table(&self.delimiters);
            self.initialized = true;
        }
    }

    /// Get the next chunk, or None if exhausted.
    pub fn next_chunk(&mut self) -> Option<Vec<u8>> {
        self.init();

        if self.pos >= self.text.len() {
            return None;
        }

        let remaining = self.text.len() - self.pos;

        // Last chunk - return remainder
        if remaining <= self.target_size {
            let chunk = self.text[self.pos..].to_vec();
            self.pos = self.text.len();
            return Some(chunk);
        }

        let end = self.pos + self.target_size;

        let split_at = compute_split_at(
            &self.text,
            self.pos,
            end,
            self.pattern.as_deref(),
            &self.delimiters,
            self.table.as_ref(),
            self.prefix_mode,
            self.consecutive,
            self.forward_fallback,
        );

        let chunk = self.text[self.pos..split_at].to_vec();
        self.pos = split_at;
        Some(chunk)
    }

    /// Reset the chunker to start from the beginning.
    pub fn reset(&mut self) {
        self.pos = 0;
    }

    /// Get a reference to the underlying text.
    pub fn text(&self) -> &[u8] {
        &self.text
    }

    /// Collect all chunk offsets as (start, end) pairs.
    /// This is more efficient for FFI as it returns all offsets in one call.
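    ///
    /// A small usage sketch (offsets index into the original text):
    ///
    /// ```
    /// use chunk::OwnedChunker;
    ///
    /// let text = b"Hello. World. Test.".to_vec();
    /// let mut chunker = OwnedChunker::new(text.clone()).size(10).delimiters(b".".to_vec());
    /// let offsets = chunker.collect_offsets();
    /// assert_eq!(offsets.len(), 3);
    /// assert_eq!(&text[offsets[0].0..offsets[0].1], b"Hello.");
    /// ```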
    pub fn collect_offsets(&mut self) -> Vec<(usize, usize)> {
        self.init();

        let mut offsets = Vec::new();
        let mut pos = 0;

        while pos < self.text.len() {
            let remaining = self.text.len() - pos;

            if remaining <= self.target_size {
                offsets.push((pos, self.text.len()));
                break;
            }

            let end = pos + self.target_size;

            let split_at = compute_split_at(
                &self.text,
                pos,
                end,
                self.pattern.as_deref(),
                &self.delimiters,
                self.table.as_ref(),
                self.prefix_mode,
                self.consecutive,
                self.forward_fallback,
            );

            offsets.push((pos, split_at));
            pos = split_at;
        }

        offsets
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_basic_chunking() {
        let text = b"Hello. World. Test.";
        let chunks: Vec<_> = chunk(text).size(10).delimiters(b".").collect();
        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0], b"Hello.");
        assert_eq!(chunks[1], b" World.");
        assert_eq!(chunks[2], b" Test.");
    }

    #[test]
    fn test_newline_delimiter() {
        let text = b"Line one\nLine two\nLine three";
        let chunks: Vec<_> = chunk(text).size(15).delimiters(b"\n").collect();
        assert_eq!(chunks[0], b"Line one\n");
        assert_eq!(chunks[1], b"Line two\n");
        assert_eq!(chunks[2], b"Line three");
    }

    #[test]
    fn test_multiple_delimiters() {
        let text = b"Hello? World. Yes!";
        let chunks: Vec<_> = chunk(text).size(10).delimiters(b".?!").collect();
        assert_eq!(chunks[0], b"Hello?");
    }

    #[test]
    fn test_four_delimiters_uses_table() {
        let text = b"A. B? C! D; E";
        let chunks: Vec<_> = chunk(text).size(5).delimiters(b".?!;").collect();
        assert!(chunks.len() >= 2);
    }

    #[test]
    fn test_no_delimiter_hard_split() {
        let text = b"abcdefghij";
        let chunks: Vec<_> = chunk(text).size(5).delimiters(b".").collect();
        assert_eq!(chunks[0], b"abcde");
        assert_eq!(chunks[1], b"fghij");
    }

    #[test]
    fn test_empty_text() {
        let text = b"";
        let chunks: Vec<_> = chunk(text).size(10).delimiters(b".").collect();
        assert_eq!(chunks.len(), 0);
    }

    #[test]
    fn test_text_smaller_than_target() {
        let text = b"Small";
        let chunks: Vec<_> = chunk(text).size(100).delimiters(b".").collect();
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunks[0], b"Small");
    }

    #[test]
    fn test_total_bytes_preserved() {
        let text = b"The quick brown fox jumps over the lazy dog. How vexingly quick!";
        let chunks: Vec<_> = chunk(text).size(20).delimiters(b"\n.?!").collect();
        let total: usize = chunks.iter().map(|c| c.len()).sum();
        assert_eq!(total, text.len());
    }

    #[test]
    fn test_defaults() {
        let text = b"Hello world. This is a test.";
        let chunks: Vec<_> = chunk(text).collect();
        assert!(!chunks.is_empty());
    }

    #[test]
    fn test_prefix_mode() {
        let text = b"Hello World Test";
        let chunks: Vec<_> = chunk(text).size(8).delimiters(b" ").prefix().collect();
        assert_eq!(chunks[0], b"Hello");
        assert_eq!(chunks[1], b" World");
        assert_eq!(chunks[2], b" Test");
    }

    #[test]
    fn test_suffix_mode() {
        let text = b"Hello World Test";
        let chunks: Vec<_> = chunk(text).size(8).delimiters(b" ").suffix().collect();
        assert_eq!(chunks[0], b"Hello ");
        assert_eq!(chunks[1], b"World ");
        assert_eq!(chunks[2], b"Test");
    }

    #[test]
    fn test_consecutive_delimiters() {
        let text = b"Hello\n\n\nWorld";
        let chunks: Vec<_> = chunk(text)
            .delimiters(b"\n")
            .size(8)
            .prefix()
            .consecutive()
            .collect();
        assert_eq!(chunks[0], b"Hello");
        assert_eq!(chunks[1], b"\n\n\nWorld");
    }

    #[test]
    fn test_forward_fallback() {
        let text = b"verylongword next";
        let chunks: Vec<_> = chunk(text)
            .delimiters(b" ")
            .size(6)
            .prefix()
            .forward_fallback()
            .collect();
        assert_eq!(chunks[0], b"verylongword");
        assert_eq!(chunks[1], b" next");
    }

    #[test]
    fn test_pattern_metaspace() {
        let metaspace = "▁".as_bytes();
        let text = "Hello▁World▁Test".as_bytes();
        let chunks: Vec<_> = chunk(text).size(15).pattern(metaspace).prefix().collect();
        assert_eq!(chunks[0], "Hello".as_bytes());
        assert_eq!(chunks[1], "▁World▁Test".as_bytes());
    }

    #[test]
    fn test_owned_chunker() {
        let text = b"Hello. World. Test.".to_vec();
        let mut chunker = OwnedChunker::new(text).size(10).delimiters(b".".to_vec());

        let mut chunks = Vec::new();
        while let Some(c) = chunker.next_chunk() {
            chunks.push(c);
        }

        assert_eq!(chunks.len(), 3);
        assert_eq!(chunks[0], b"Hello.");
    }

    #[test]
    fn test_owned_chunker_collect_offsets() {
        let text = b"Hello. World. Test.".to_vec();
        let mut chunker = OwnedChunker::new(text.clone())
            .size(10)
            .delimiters(b".".to_vec());

        let offsets = chunker.collect_offsets();
        assert_eq!(offsets.len(), 3);
        assert_eq!(&text[offsets[0].0..offsets[0].1], b"Hello.");
    }
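
    // A small additional check (a sketch): reset() should replay the same chunks
    // from the start, and text() should expose the original bytes unchanged.
    #[test]
    fn test_owned_chunker_reset_replays() {
        let text = b"Hello. World. Test.".to_vec();
        let mut chunker = OwnedChunker::new(text.clone()).size(10).delimiters(b".".to_vec());

        let first: Vec<_> = std::iter::from_fn(|| chunker.next_chunk()).collect();
        chunker.reset();
        let second: Vec<_> = std::iter::from_fn(|| chunker.next_chunk()).collect();

        assert_eq!(first, second);
        assert_eq!(chunker.text(), text.as_slice());
    }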
}