use crate::delim::{
DEFAULT_DELIMITERS, DEFAULT_TARGET_SIZE, MultiPatternSearcher, build_table, compute_split_at,
compute_split_at_combined,
};
/// Creates a [`Chunker`] over `text` using the default configuration.
///
/// The returned builder starts with `DEFAULT_TARGET_SIZE` as its target chunk
/// size and `DEFAULT_DELIMITERS` as its delimiter set; both can be overridden
/// with the builder methods before iterating. The chunker borrows `text`, and
/// the slices it yields borrow from `text` as well.
pub fn chunk(text: &[u8]) -> Chunker<'_> {
Chunker::new(text)
}
/// Borrowing chunker over a byte slice.
///
/// Configured through builder-style methods and consumed through its
/// `Iterator` implementation, which yields consecutive sub-slices of `text`.
pub struct Chunker<'a> {
/// Input bytes being chunked; yielded chunks are sub-slices of this.
text: &'a [u8],
/// Target chunk size in bytes. A tail of at most this many bytes is emitted
/// as one final chunk; otherwise it bounds the window handed to the split
/// helper.
target_size: usize,
/// Delimiter bytes passed to the split helpers and used to build `table`.
delimiters: &'a [u8],
/// Optional single byte pattern, forwarded to `compute_split_at`.
pattern: Option<&'a [u8]>,
/// Optional multi-pattern searcher; when present the iterator calls
/// `compute_split_at_combined` instead of `compute_split_at`.
multi_searcher: Option<MultiPatternSearcher>,
/// Offset in `text` at which the next chunk starts.
pos: usize,
/// Result of `build_table(delimiters)`, filled in lazily on first use.
/// Presumably a per-byte-value membership table — confirm in `delim`.
table: Option<[bool; 256]>,
/// Whether `table` has already been computed.
initialized: bool,
/// Forwarded to the split helpers. Per the unit tests in this file, `true`
/// keeps the delimiter at the start of the following chunk and `false`
/// keeps it at the end of the current chunk.
prefix_mode: bool,
/// Forwarded to the split helpers. The unit tests show a run of repeated
/// delimiters being kept together when set — exact rules live in `delim`.
consecutive: bool,
/// Forwarded to the split helpers. The unit tests show a chunk growing past
/// the target size up to the next delimiter when set — exact rules live in
/// `delim`.
forward_fallback: bool,
}
impl<'a> Chunker<'a> {
    /// Creates a chunker over `text` with `DEFAULT_TARGET_SIZE`,
    /// `DEFAULT_DELIMITERS`, no pattern(s) and every mode flag off.
    fn new(text: &'a [u8]) -> Self {
        Self {
            text,
            target_size: DEFAULT_TARGET_SIZE,
            delimiters: DEFAULT_DELIMITERS,
            pattern: None,
            multi_searcher: None,
            pos: 0,
            table: None,
            initialized: false,
            prefix_mode: false,
            consecutive: false,
            forward_fallback: false,
        }
    }

    /// Sets the target chunk size in bytes.
    ///
    /// A remaining tail of at most `size` bytes is emitted as a single final
    /// chunk.
    pub fn size(mut self, size: usize) -> Self {
        self.target_size = size;
        self
    }

    /// Sets the delimiter bytes to split on and clears any single `pattern`.
    ///
    /// A multi-pattern searcher installed with [`Chunker::patterns`] is left
    /// untouched.
    pub fn delimiters(mut self, delimiters: &'a [u8]) -> Self {
        self.delimiters = delimiters;
        self.pattern = None;
        // The cached lookup table was derived from the previous delimiter set.
        // Builder methods can be called after iteration has started, so force
        // a rebuild on the next use instead of searching with a stale table.
        self.initialized = false;
        self
    }

    /// Sets a single byte pattern to split on and clears the delimiter set.
    ///
    /// Note that a multi-pattern searcher installed with
    /// [`Chunker::patterns`] is not cleared here, and the iterator prefers it
    /// over this single pattern when both are present.
    pub fn pattern(mut self, pattern: &'a [u8]) -> Self {
        self.pattern = Some(pattern);
        self.delimiters = &[];
        // Delimiters changed (to none): drop the table built from the old set.
        self.initialized = false;
        self
    }

    /// Installs a multi-pattern searcher built from `patterns`.
    ///
    /// An empty slice removes any previously installed searcher. The current
    /// delimiter set is kept.
    pub fn patterns(mut self, patterns: &[&str]) -> Self {
        self.multi_searcher = if patterns.is_empty() {
            None
        } else {
            Some(MultiPatternSearcher::from_strs(patterns))
        };
        self
    }

    /// Enables prefix mode (the unit tests show the delimiter starting the
    /// following chunk).
    pub fn prefix(mut self) -> Self {
        self.prefix_mode = true;
        self
    }

    /// Disables prefix mode, which is the default (the unit tests show the
    /// delimiter ending the current chunk).
    pub fn suffix(mut self) -> Self {
        self.prefix_mode = false;
        self
    }

    /// Sets the `consecutive` flag that is forwarded to the split helpers.
    pub fn consecutive(mut self) -> Self {
        self.consecutive = true;
        self
    }

    /// Sets the `forward_fallback` flag that is forwarded to the split helpers.
    pub fn forward_fallback(mut self) -> Self {
        self.forward_fallback = true;
        self
    }

    /// Builds the delimiter lookup table on first use (or after the delimiter
    /// configuration changed); later calls are no-ops.
    fn init(&mut self) {
        if !self.initialized {
            self.table = build_table(self.delimiters);
            self.initialized = true;
        }
    }
}
impl<'a> Iterator for Chunker<'a> {
    type Item = &'a [u8];

    /// Returns the next chunk of the input, or `None` once all bytes have
    /// been yielded.
    ///
    /// Chunks are contiguous and non-overlapping, and every `Some` chunk is
    /// non-empty. A tail of at most `target_size` bytes is returned whole;
    /// otherwise the split point is chosen by `compute_split_at_combined`
    /// (when a multi-pattern searcher is installed) or `compute_split_at`.
    fn next(&mut self) -> Option<Self::Item> {
        self.init();

        let len = self.text.len();
        if self.pos >= len {
            return None;
        }

        // Final chunk: whatever is left fits within the target size.
        let remaining = len - self.pos;
        if remaining <= self.target_size {
            let chunk = &self.text[self.pos..];
            self.pos = len;
            return Some(chunk);
        }

        // `remaining > target_size`, so `end` is strictly inside the text and
        // the addition cannot overflow.
        let end = self.pos + self.target_size;
        let candidate = if self.multi_searcher.is_some() {
            compute_split_at_combined(
                self.text,
                self.pos,
                end,
                self.delimiters,
                self.table.as_ref(),
                self.multi_searcher.as_ref(),
                self.prefix_mode,
                self.consecutive,
                self.forward_fallback,
            )
        } else {
            compute_split_at(
                self.text,
                self.pos,
                end,
                self.pattern,
                self.delimiters,
                self.table.as_ref(),
                self.prefix_mode,
                self.consecutive,
                self.forward_fallback,
            )
        };

        // Guarantee forward progress. A split point that does not advance
        // (e.g. a target size of 0 yields an empty search window) would make
        // this iterator return empty slices forever. Fall back to a hard split
        // of at least one byte; `remaining > target_size` keeps it in bounds.
        let split_at = if candidate > self.pos {
            candidate
        } else {
            self.pos + self.target_size.max(1)
        };

        let chunk = &self.text[self.pos..split_at];
        self.pos = split_at;
        Some(chunk)
    }
}
/// Chunker that owns its input and configuration buffers.
///
/// Unlike the borrowing `Chunker`, chunks are handed out as freshly allocated
/// `Vec<u8>` copies (or as offset pairs), and the chunker can be rewound.
pub struct OwnedChunker {
/// Owned input bytes being chunked.
text: Vec<u8>,
/// Target chunk size in bytes. A tail of at most this many bytes is emitted
/// as one final chunk; otherwise it bounds the window handed to the split
/// helper.
target_size: usize,
/// Owned delimiter bytes passed to the split helpers and used to build
/// `table`.
delimiters: Vec<u8>,
/// Optional single byte pattern, forwarded to `compute_split_at`.
pattern: Option<Vec<u8>>,
/// Optional multi-pattern searcher; when present `compute_split_at_combined`
/// is used instead of `compute_split_at`.
multi_searcher: Option<MultiPatternSearcher>,
/// Offset in `text` at which the next chunk returned by `next_chunk` starts.
pos: usize,
/// Result of `build_table(&delimiters)`, filled in lazily on first use.
/// Presumably a per-byte-value membership table — confirm in `delim`.
table: Option<[bool; 256]>,
/// Whether `table` has already been computed.
initialized: bool,
/// Forwarded to the split helpers; selects whether a delimiter is attached
/// to the following chunk (`true`) or the current one (`false`), as the
/// unit tests for the borrowing chunker show.
prefix_mode: bool,
/// Forwarded unchanged to the split helpers — exact rules live in `delim`.
consecutive: bool,
/// Forwarded unchanged to the split helpers — exact rules live in `delim`.
forward_fallback: bool,
}
impl OwnedChunker {
    /// Creates a chunker that owns `text`, with `DEFAULT_TARGET_SIZE`, a copy
    /// of `DEFAULT_DELIMITERS`, no pattern(s) and every mode flag off.
    pub fn new(text: Vec<u8>) -> Self {
        Self {
            text,
            target_size: DEFAULT_TARGET_SIZE,
            delimiters: DEFAULT_DELIMITERS.to_vec(),
            pattern: None,
            multi_searcher: None,
            pos: 0,
            table: None,
            initialized: false,
            prefix_mode: false,
            consecutive: false,
            forward_fallback: false,
        }
    }

    /// Sets the target chunk size in bytes.
    ///
    /// A remaining tail of at most `size` bytes is emitted as a single final
    /// chunk.
    pub fn size(mut self, size: usize) -> Self {
        self.target_size = size;
        self
    }

    /// Sets the delimiter bytes to split on and clears any single `pattern`.
    ///
    /// A multi-pattern searcher installed with [`OwnedChunker::patterns`] is
    /// left untouched.
    pub fn delimiters(mut self, delimiters: Vec<u8>) -> Self {
        self.delimiters = delimiters;
        self.pattern = None;
        // The cached lookup table was derived from the previous delimiter set.
        // The chunker may already have been used (and `reset`), so force a
        // rebuild on the next use instead of searching with a stale table.
        self.initialized = false;
        self
    }

    /// Sets a single byte pattern to split on and clears the delimiter set.
    ///
    /// Note that a multi-pattern searcher installed with
    /// [`OwnedChunker::patterns`] is not cleared here, and it is preferred
    /// over this single pattern when both are present.
    pub fn pattern(mut self, pattern: Vec<u8>) -> Self {
        self.pattern = Some(pattern);
        self.delimiters = Vec::new();
        // Delimiters changed (to none): drop the table built from the old set.
        self.initialized = false;
        self
    }

    /// Installs a multi-pattern searcher built from `patterns`.
    ///
    /// An empty slice removes any previously installed searcher. The current
    /// delimiter set is kept.
    pub fn patterns(mut self, patterns: &[&str]) -> Self {
        self.multi_searcher = if patterns.is_empty() {
            None
        } else {
            Some(MultiPatternSearcher::from_strs(patterns))
        };
        self
    }

    /// Enables prefix mode (delimiter is attached to the following chunk).
    pub fn prefix(mut self) -> Self {
        self.prefix_mode = true;
        self
    }

    /// Disables prefix mode, which is the default (delimiter is attached to
    /// the current chunk).
    pub fn suffix(mut self) -> Self {
        self.prefix_mode = false;
        self
    }

    /// Sets the `consecutive` flag that is forwarded to the split helpers.
    pub fn consecutive(mut self) -> Self {
        self.consecutive = true;
        self
    }

    /// Sets the `forward_fallback` flag that is forwarded to the split helpers.
    pub fn forward_fallback(mut self) -> Self {
        self.forward_fallback = true;
        self
    }

    /// Builds the delimiter lookup table on first use (or after the delimiter
    /// configuration changed); later calls are no-ops.
    fn init(&mut self) {
        if !self.initialized {
            self.table = build_table(&self.delimiters);
            self.initialized = true;
        }
    }

    /// Returns the exclusive end offset of the chunk that starts at `start`.
    ///
    /// The caller must have run `init` and must pass `start < self.text.len()`.
    /// The result is always greater than `start`, so callers that advance to
    /// the returned offset are guaranteed to terminate.
    fn chunk_end(&self, start: usize) -> usize {
        let len = self.text.len();

        // Final chunk: whatever is left fits within the target size.
        if len - start <= self.target_size {
            return len;
        }

        // More than `target_size` bytes remain, so `window_end` is strictly
        // inside the text and the addition cannot overflow.
        let window_end = start + self.target_size;
        let candidate = if self.multi_searcher.is_some() {
            compute_split_at_combined(
                &self.text,
                start,
                window_end,
                &self.delimiters,
                self.table.as_ref(),
                self.multi_searcher.as_ref(),
                self.prefix_mode,
                self.consecutive,
                self.forward_fallback,
            )
        } else {
            compute_split_at(
                &self.text,
                start,
                window_end,
                self.pattern.as_deref(),
                &self.delimiters,
                self.table.as_ref(),
                self.prefix_mode,
                self.consecutive,
                self.forward_fallback,
            )
        };

        // Guarantee forward progress. A split point that does not advance
        // (e.g. a target size of 0 yields an empty search window) would loop
        // forever; fall back to a hard split of at least one byte, which is in
        // bounds because more than `target_size` bytes remain.
        if candidate > start {
            candidate
        } else {
            start + self.target_size.max(1)
        }
    }

    /// Returns a copy of the next chunk and advances the read position, or
    /// `None` once the whole text has been consumed.
    pub fn next_chunk(&mut self) -> Option<Vec<u8>> {
        self.init();
        if self.pos >= self.text.len() {
            return None;
        }
        let end = self.chunk_end(self.pos);
        let chunk = self.text[self.pos..end].to_vec();
        self.pos = end;
        Some(chunk)
    }

    /// Rewinds the read position to the start of the text so that
    /// `next_chunk` starts over. Configuration is left unchanged.
    pub fn reset(&mut self) {
        self.pos = 0;
    }

    /// Returns the full owned input text.
    pub fn text(&self) -> &[u8] {
        &self.text
    }

    /// Computes the `(start, end)` byte offsets of every chunk of the text.
    ///
    /// Always scans from offset 0, independent of how far `next_chunk` has
    /// advanced, and does not move the read position. `&mut self` is only
    /// needed to build the lookup table lazily.
    pub fn collect_offsets(&mut self) -> Vec<(usize, usize)> {
        self.init();
        let mut offsets = Vec::new();
        let mut pos = 0;
        while pos < self.text.len() {
            let end = self.chunk_end(pos);
            offsets.push((pos, end));
            pos = end;
        }
        offsets
    }
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Suffix-mode splitting on `.` keeps each period with the preceding text.
    #[test]
    fn test_basic_chunking() {
        let input = b"Hello. World. Test.";
        let pieces = chunk(input).size(10).delimiters(b".").collect::<Vec<_>>();
        assert_eq!(pieces.len(), 3);
        assert_eq!(pieces[0], b"Hello.");
        assert_eq!(pieces[1], b" World.");
        assert_eq!(pieces[2], b" Test.");
    }

    /// Newlines act as delimiters and stay at the end of each line chunk.
    #[test]
    fn test_newline_delimiter() {
        let input = b"Line one\nLine two\nLine three";
        let pieces = chunk(input).size(15).delimiters(b"\n").collect::<Vec<_>>();
        assert_eq!(pieces[0], b"Line one\n");
        assert_eq!(pieces[1], b"Line two\n");
        assert_eq!(pieces[2], b"Line three");
    }

    /// Any byte of a multi-byte delimiter set can end a chunk.
    #[test]
    fn test_multiple_delimiters() {
        let input = b"Hello? World. Yes!";
        let pieces = chunk(input).size(10).delimiters(b".?!").collect::<Vec<_>>();
        assert_eq!(pieces[0], b"Hello?");
    }

    /// A four-byte delimiter set still produces more than one chunk.
    #[test]
    fn test_four_delimiters_uses_table() {
        let input = b"A. B? C! D; E";
        let pieces = chunk(input).size(5).delimiters(b".?!;").collect::<Vec<_>>();
        assert!(pieces.len() >= 2);
    }

    /// Without any delimiter in range the text is cut at the target size.
    #[test]
    fn test_no_delimiter_hard_split() {
        let input = b"abcdefghij";
        let pieces = chunk(input).size(5).delimiters(b".").collect::<Vec<_>>();
        assert_eq!(pieces[0], b"abcde");
        assert_eq!(pieces[1], b"fghij");
    }

    /// Empty input yields no chunks at all.
    #[test]
    fn test_empty_text() {
        let input = b"";
        let pieces = chunk(input).size(10).delimiters(b".").collect::<Vec<_>>();
        assert!(pieces.is_empty());
    }

    /// Input shorter than the target size comes back as one chunk.
    #[test]
    fn test_text_smaller_than_target() {
        let input = b"Small";
        let pieces = chunk(input).size(100).delimiters(b".").collect::<Vec<_>>();
        assert_eq!(pieces.len(), 1);
        assert_eq!(pieces[0], b"Small");
    }

    /// Chunk lengths add up to the input length: nothing lost or duplicated.
    #[test]
    fn test_total_bytes_preserved() {
        let input = b"The quick brown fox jumps over the lazy dog. How vexingly quick!";
        let pieces = chunk(input).size(20).delimiters(b"\n.?!").collect::<Vec<_>>();
        let byte_count = pieces.iter().map(|piece| piece.len()).sum::<usize>();
        assert_eq!(byte_count, input.len());
    }

    /// The default configuration produces at least one chunk.
    #[test]
    fn test_defaults() {
        let input = b"Hello world. This is a test.";
        let pieces = chunk(input).collect::<Vec<_>>();
        assert!(!pieces.is_empty());
    }

    /// Prefix mode moves the delimiter to the start of the following chunk.
    #[test]
    fn test_prefix_mode() {
        let input = b"Hello World Test";
        let pieces = chunk(input)
            .size(8)
            .delimiters(b" ")
            .prefix()
            .collect::<Vec<_>>();
        assert_eq!(pieces[0], b"Hello");
        assert_eq!(pieces[1], b" World");
        assert_eq!(pieces[2], b" Test");
    }

    /// Suffix mode keeps the delimiter at the end of the current chunk.
    #[test]
    fn test_suffix_mode() {
        let input = b"Hello World Test";
        let pieces = chunk(input)
            .size(8)
            .delimiters(b" ")
            .suffix()
            .collect::<Vec<_>>();
        assert_eq!(pieces[0], b"Hello ");
        assert_eq!(pieces[1], b"World ");
        assert_eq!(pieces[2], b"Test");
    }

    /// With `consecutive`, a run of delimiters is not split apart.
    #[test]
    fn test_consecutive_delimiters() {
        let input = b"Hello\n\n\nWorld";
        let pieces = chunk(input)
            .delimiters(b"\n")
            .size(8)
            .prefix()
            .consecutive()
            .collect::<Vec<_>>();
        assert_eq!(pieces[0], b"Hello");
        assert_eq!(pieces[1], b"\n\n\nWorld");
    }

    /// With `forward_fallback`, an over-long word runs on to the next delimiter.
    #[test]
    fn test_forward_fallback() {
        let input = b"verylongword next";
        let pieces = chunk(input)
            .delimiters(b" ")
            .size(6)
            .prefix()
            .forward_fallback()
            .collect::<Vec<_>>();
        assert_eq!(pieces[0], b"verylongword");
        assert_eq!(pieces[1], b" next");
    }

    /// A multi-byte pattern (the metaspace character) works as a split point.
    #[test]
    fn test_pattern_metaspace() {
        let marker = "▁".as_bytes();
        let input = "Hello▁World▁Test".as_bytes();
        let pieces = chunk(input)
            .size(15)
            .pattern(marker)
            .prefix()
            .collect::<Vec<_>>();
        assert_eq!(pieces[0], "Hello".as_bytes());
        assert_eq!(pieces[1], "▁World▁Test".as_bytes());
    }

    /// The owning chunker hands out the same chunks as owned vectors.
    #[test]
    fn test_owned_chunker() {
        let input = b"Hello. World. Test.".to_vec();
        let mut owned = OwnedChunker::new(input).size(10).delimiters(b".".to_vec());
        let pieces: Vec<Vec<u8>> = std::iter::from_fn(|| owned.next_chunk()).collect();
        assert_eq!(pieces.len(), 3);
        assert_eq!(pieces[0], b"Hello.");
    }

    /// `collect_offsets` reports ranges that index back into the input.
    #[test]
    fn test_owned_chunker_collect_offsets() {
        let input = b"Hello. World. Test.".to_vec();
        let mut owned = OwnedChunker::new(input.clone())
            .size(10)
            .delimiters(b".".to_vec());
        let ranges = owned.collect_offsets();
        assert_eq!(ranges.len(), 3);
        let (start, end) = ranges[0];
        assert_eq!(&input[start..end], b"Hello.");
    }
}