// Private implementation modules. None of them is `pub`; the items callers are
// meant to use are surfaced through the `pub use` re-exports that follow.
mod chunk;
mod delim;
mod merge;
// NOTE(review): `savgol` presumably stands for Savitzky-Golay (it exports
// `savgol_filter`) — confirm against the module itself.
mod savgol;
mod split;
// Chunking API: the `chunk` entry point plus the borrowed and owned chunker types.
pub use crate::chunk::{Chunker, OwnedChunker, chunk};
// Splitting API: splitter types, the `IncludeDelim` option and the free
// `split*` functions.
pub use crate::split::{
IncludeDelim, PatternSplitter, Splitter, split, split_at_delimiters, split_at_patterns,
};
// Merge helpers operating on previously produced splits.
pub use crate::merge::{MergeResult, find_merge_indices, merge_splits};
// Default configuration constants and the multi-pattern searcher type.
pub use crate::delim::{DEFAULT_DELIMITERS, DEFAULT_TARGET_SIZE, MultiPatternSearcher};
// Filtering, local-minima and windowed-similarity helpers from `savgol`.
pub use crate::savgol::{
FilteredIndices, MinimaResult, filter_split_indices, find_local_minima_interpolated,
savgol_filter, windowed_cross_similarity,
};
#[cfg(test)]
mod integration_tests {
    //! End-to-end tests that drive the crate's re-exported chunking and
    //! splitting entry points (`chunk`, `OwnedChunker`, `split_at_delimiters`).

    use super::*;

    /// Both `chunk` and `split_at_delimiters` must account for every input byte.
    #[test]
    fn test_chunk_and_split_consistency() {
        let input = b"Hello. World. Test.";

        let chunked_bytes = chunk(input)
            .size(10)
            .delimiters(b".")
            .map(|piece| piece.len())
            .sum::<usize>();
        let spans = split_at_delimiters(input, b".", IncludeDelim::Prev, 0);
        let split_bytes = spans.iter().map(|(start, end)| end - start).sum::<usize>();

        assert_eq!(chunked_bytes, input.len());
        assert_eq!(split_bytes, input.len());
    }

    /// Back-to-back newline delimiters must not cause bytes to be lost.
    #[test]
    fn test_consecutive_delimiters_chunk() {
        let input = b"Hello\n\nWorld";
        let pieces: Vec<_> = chunk(input).size(8).delimiters(b"\n").collect();

        let byte_count = pieces.iter().map(|piece| piece.len()).sum::<usize>();
        assert_eq!(byte_count, input.len());
    }

    /// In prefix mode the space delimiter opens the following chunk.
    #[test]
    fn test_prefix_mode_chunk() {
        let input = b"Hello World Test";
        let pieces: Vec<_> = chunk(input).size(8).delimiters(b" ").prefix().collect();

        assert_eq!(pieces[0], b"Hello");
        assert_eq!(pieces[1], b" World");
        assert_eq!(pieces[2], b" Test");
    }

    /// Prefix mode must still cover the whole input.
    #[test]
    fn test_prefix_preserves_total_bytes() {
        let input = b"Hello World Test More Words Here";
        let pieces: Vec<_> = chunk(input).size(10).delimiters(b" ").prefix().collect();

        let byte_count = pieces.iter().map(|piece| piece.len()).sum::<usize>();
        assert_eq!(byte_count, input.len());
    }

    /// With size 5 the delimiter sits right after the first five bytes; the first
    /// chunk must be exactly `Hello` and no bytes may be dropped.
    #[test]
    fn test_prefix_mode_delimiter_at_window_start() {
        let input = b"Hello world";
        let pieces: Vec<_> = chunk(input).size(5).delimiters(b" ").prefix().collect();

        let byte_count = pieces.iter().map(|piece| piece.len()).sum::<usize>();
        assert_eq!(byte_count, input.len());
        assert_eq!(pieces[0], b"Hello");
    }

    /// A very small target size in prefix mode must never yield an empty chunk.
    #[test]
    fn test_prefix_mode_small_chunks() {
        let input = b"a b c d e";
        let pieces: Vec<_> = chunk(input).size(2).delimiters(b" ").prefix().collect();

        let byte_count = pieces.iter().map(|piece| piece.len()).sum::<usize>();
        assert_eq!(byte_count, input.len());
        assert!(
            pieces.iter().all(|piece| !piece.is_empty()),
            "Found empty chunk!"
        );
    }

    /// A multi-byte pattern in the default mode stays at the end of the chunk it closes.
    #[test]
    fn test_pattern_metaspace_suffix() {
        let marker = "▁".as_bytes();
        let input = "Hello▁World▁Test".as_bytes();
        let pieces: Vec<_> = chunk(input).size(15).pattern(marker).collect();

        assert_eq!(pieces[0], "Hello▁".as_bytes());
        assert_eq!(pieces[1], "World▁Test".as_bytes());
        let byte_count = pieces.iter().map(|piece| piece.len()).sum::<usize>();
        assert_eq!(byte_count, input.len());
    }

    /// The same multi-byte pattern in prefix mode opens the following chunk instead.
    #[test]
    fn test_pattern_metaspace_prefix() {
        let marker = "▁".as_bytes();
        let input = "Hello▁World▁Test".as_bytes();
        let pieces: Vec<_> = chunk(input).size(15).pattern(marker).prefix().collect();

        assert_eq!(pieces[0], "Hello".as_bytes());
        assert_eq!(pieces[1], "▁World▁Test".as_bytes());
        let byte_count = pieces.iter().map(|piece| piece.len()).sum::<usize>();
        assert_eq!(byte_count, input.len());
    }

    /// Pattern-based chunking over a longer sentence must keep every byte.
    #[test]
    fn test_pattern_preserves_bytes() {
        let marker = "▁".as_bytes();
        let input = "The▁quick▁brown▁fox▁jumps▁over▁the▁lazy▁dog".as_bytes();
        let pieces: Vec<_> = chunk(input).size(20).pattern(marker).collect();

        let byte_count = pieces.iter().map(|piece| piece.len()).sum::<usize>();
        assert_eq!(byte_count, input.len());
    }

    /// When the pattern never occurs, chunks are cut at exactly the target size.
    #[test]
    fn test_pattern_no_match_hard_split() {
        let input = b"abcdefghijklmnop";
        let pieces: Vec<_> = chunk(input).size(5).pattern(b"XYZ").collect();

        assert_eq!(pieces[0], b"abcde");
        assert_eq!(pieces[1], b"fghij");
    }

    /// A one-byte pattern in prefix mode yields the same leading chunks as the
    /// equivalent single-delimiter configuration.
    // NOTE(review): the test name suggests a single-byte fast path inside the
    // chunker; that path is not visible from this module.
    #[test]
    fn test_pattern_single_byte_optimization() {
        let input = b"Hello World Test";
        let pieces: Vec<_> = chunk(input).size(8).pattern(b" ").prefix().collect();

        assert_eq!(pieces[0], b"Hello");
        assert_eq!(pieces[1], b" World");
    }

    /// With `consecutive()`, a run of repeated patterns is not split away from the
    /// word before it: the first chunk is just `word` and the next starts with the
    /// pattern.
    #[test]
    fn test_consecutive_pattern_basic() {
        let marker = b"\xE2\x96\x81";
        let input = b"word\xE2\x96\x81\xE2\x96\x81\xE2\x96\x81next";
        let pieces: Vec<_> = chunk(input)
            .pattern(marker)
            .size(10)
            .prefix()
            .consecutive()
            .collect();

        let byte_count = pieces.iter().map(|piece| piece.len()).sum::<usize>();
        assert_eq!(byte_count, input.len());
        assert_eq!(pieces[0], b"word");
        assert!(pieces[1].starts_with(marker));
    }

    /// With `forward_fallback()`, a word longer than the target size is kept whole
    /// up to the next pattern occurrence.
    #[test]
    fn test_forward_fallback_basic() {
        let marker = b"\xE2\x96\x81";
        let input = b"verylongword\xE2\x96\x81short";
        let pieces: Vec<_> = chunk(input)
            .pattern(marker)
            .size(6)
            .prefix()
            .forward_fallback()
            .collect();

        assert_eq!(pieces[0], b"verylongword");
        assert!(pieces[1].starts_with(marker));
    }

    /// `consecutive()` with byte delimiters keeps the whole newline run on the
    /// second chunk.
    #[test]
    fn test_delimiter_consecutive_basic() {
        let input = b"Hello\n\n\nWorld";
        let pieces: Vec<_> = chunk(input)
            .delimiters(b"\n")
            .size(8)
            .prefix()
            .consecutive()
            .collect();

        let byte_count = pieces.iter().map(|piece| piece.len()).sum::<usize>();
        assert_eq!(byte_count, input.len());
        assert_eq!(pieces[0], b"Hello");
        assert_eq!(pieces[1], b"\n\n\nWorld");
    }

    /// `forward_fallback()` with byte delimiters extends past the target size to
    /// the next delimiter.
    #[test]
    fn test_delimiter_forward_fallback_basic() {
        let input = b"verylongword next";
        let pieces: Vec<_> = chunk(input)
            .delimiters(b" ")
            .size(6)
            .prefix()
            .forward_fallback()
            .collect();

        assert_eq!(pieces[0], b"verylongword");
        assert_eq!(pieces[1], b" next");
    }

    /// `OwnedChunker` drained through `next_chunk` honours pattern + prefix mode
    /// and returns every byte of the buffer it was given.
    #[test]
    fn test_owned_chunker_pattern() {
        let marker = "▁".as_bytes();
        let buffer = "Hello▁World▁Test".as_bytes().to_vec();
        let mut chunker = OwnedChunker::new(buffer.clone())
            .size(15)
            .pattern(marker.to_vec())
            .prefix();

        // Pull chunks until the chunker reports `None`.
        let pieces: Vec<_> = std::iter::from_fn(|| chunker.next_chunk()).collect();

        assert_eq!(pieces[0], "Hello".as_bytes());
        let byte_count = pieces.iter().map(|piece| piece.len()).sum::<usize>();
        assert_eq!(byte_count, buffer.len());
    }

    /// `collect_offsets` reports byte ranges that index back into the source buffer.
    #[test]
    fn test_owned_chunker_collect_offsets() {
        let marker = "▁".as_bytes();
        let buffer = "Hello▁World▁Test".as_bytes().to_vec();
        let mut chunker = OwnedChunker::new(buffer.clone())
            .size(15)
            .pattern(marker.to_vec())
            .prefix();

        let offsets = chunker.collect_offsets();
        assert_eq!(offsets[0], (0, 5));

        let (start, end) = offsets[0];
        assert_eq!(&buffer[start..end], "Hello".as_bytes());
    }

    /// Combining byte delimiters with string patterns keeps every byte and only
    /// produces chunks that are valid UTF-8.
    #[test]
    fn test_patterns_cjk_basic() {
        let input = "Hello。World,Test!Done".as_bytes();
        let pieces: Vec<_> = chunk(input)
            .size(20)
            .delimiters(b"\n.?!")
            .patterns(&["。", ",", "!"])
            .collect();

        let byte_count = pieces.iter().map(|piece| piece.len()).sum::<usize>();
        assert_eq!(byte_count, input.len());
        for piece in &pieces {
            let decoded = std::str::from_utf8(piece);
            assert!(decoded.is_ok(), "Chunk is not valid UTF-8: {:?}", piece);
        }
    }

    /// A longer mixed input chunked with `patterns` must keep every byte.
    #[test]
    fn test_patterns_preserves_all_bytes() {
        let input = "First sentence。Second part,Third section!Final".as_bytes();
        let pieces: Vec<_> = chunk(input)
            .size(25)
            .delimiters(b".")
            .patterns(&["。", ",", "!"])
            .collect();

        let byte_count = pieces.iter().map(|piece| piece.len()).sum::<usize>();
        assert_eq!(byte_count, input.len());
    }

    /// An ASCII delimiter and a multi-byte pattern used together split the input
    /// into at least two chunks without losing bytes.
    #[test]
    fn test_patterns_mixed_ascii_and_cjk() {
        let input = "Hello. World。Test".as_bytes();
        let pieces: Vec<_> = chunk(input)
            .size(12)
            .delimiters(b".")
            .patterns(&["。"])
            .collect();

        let byte_count = pieces.iter().map(|piece| piece.len()).sum::<usize>();
        assert_eq!(byte_count, input.len());
        assert!(pieces.len() >= 2);
    }

    /// `patterns` with an empty delimiter set in prefix mode: the first chunk
    /// starts with `Hello` and the total is preserved.
    #[test]
    fn test_patterns_prefix_mode() {
        let input = "Hello。World。Test".as_bytes();
        let pieces: Vec<_> = chunk(input)
            .size(15)
            .delimiters(b"")
            .patterns(&["。"])
            .prefix()
            .collect();

        let byte_count = pieces.iter().map(|piece| piece.len()).sum::<usize>();
        assert_eq!(byte_count, input.len());
        assert!(pieces[0].starts_with(b"Hello"));
    }

    /// `patterns` in the default mode: the first chunk ends with the pattern.
    #[test]
    fn test_patterns_suffix_mode() {
        let input = "Hello。World。Test".as_bytes();
        let pieces: Vec<_> = chunk(input)
            .size(15)
            .delimiters(b"")
            .patterns(&["。"])
            .collect();

        let byte_count = pieces.iter().map(|piece| piece.len()).sum::<usize>();
        assert_eq!(byte_count, input.len());
        assert!(pieces[0].ends_with("。".as_bytes()));
    }

    /// `forward_fallback()` combined with `patterns` keeps the long leading word
    /// intact and starts the next chunk at the pattern.
    #[test]
    fn test_patterns_with_forward_fallback() {
        let input = "verylongwordwithnodelimiters。short".as_bytes();
        let pieces: Vec<_> = chunk(input)
            .size(10)
            .delimiters(b"")
            .patterns(&["。"])
            .prefix()
            .forward_fallback()
            .collect();

        assert_eq!(pieces[0], "verylongwordwithnodelimiters".as_bytes());
        assert!(pieces[1].starts_with("。".as_bytes()));
    }

    /// When none of the configured patterns occur, chunks are cut at the target size.
    #[test]
    fn test_patterns_no_match_hard_split() {
        let input = b"abcdefghijklmnop";
        let pieces: Vec<_> = chunk(input)
            .size(5)
            .delimiters(b"")
            .patterns(&["。"])
            .collect();

        assert_eq!(pieces[0], b"abcde");
        assert_eq!(pieces[1], b"fghij");
    }

    /// Input containing a multi-byte apostrophe (U+2019) must never be cut inside
    /// a code point: every chunk decodes as UTF-8 and no bytes are lost.
    #[test]
    fn test_patterns_utf8_boundary_safety() {
        let input = "It\u{2019}s a test。Done".as_bytes();
        let pieces: Vec<_> = chunk(input)
            .size(15)
            .delimiters(b".")
            .patterns(&["。"])
            .forward_fallback()
            .collect();

        let byte_count = pieces.iter().map(|piece| piece.len()).sum::<usize>();
        assert_eq!(byte_count, input.len());
        for piece in &pieces {
            let decoded = std::str::from_utf8(piece);
            assert!(decoded.is_ok(), "Chunk is not valid UTF-8: {:?}", piece);
        }
    }

    /// Five patterns at once: bytes are preserved and every chunk is valid UTF-8.
    // NOTE(review): per the test name this pattern count is presumably meant to
    // select an Aho-Corasick based searcher — confirm in the chunker implementation.
    #[test]
    fn test_patterns_many_triggers_aho_corasick() {
        let input = "A。B,C!D?E;F".as_bytes();
        let pieces: Vec<_> = chunk(input)
            .size(8)
            .patterns(&["。", ",", "!", "?", ";"])
            .collect();

        let byte_count = pieces.iter().map(|piece| piece.len()).sum::<usize>();
        assert_eq!(byte_count, input.len());
        for piece in &pieces {
            let decoded = std::str::from_utf8(piece);
            assert!(decoded.is_ok(), "Chunk is not valid UTF-8: {:?}", piece);
        }
    }

    /// `OwnedChunker` configured with delimiters plus patterns returns every byte
    /// when drained through `next_chunk`.
    #[test]
    fn test_patterns_owned_chunker() {
        let buffer = "Hello。World,Test".as_bytes().to_vec();
        let mut chunker = OwnedChunker::new(buffer.clone())
            .size(15)
            .delimiters(b".".to_vec())
            .patterns(&["。", ","]);

        // Pull chunks until the chunker reports `None`.
        let pieces: Vec<_> = std::iter::from_fn(|| chunker.next_chunk()).collect();

        let byte_count = pieces.iter().map(|piece| piece.len()).sum::<usize>();
        assert_eq!(byte_count, buffer.len());
    }

    /// Offsets from `collect_offsets` must cover the whole buffer and be
    /// contiguous: each range starts where the previous one ended.
    #[test]
    fn test_patterns_owned_chunker_collect_offsets() {
        let buffer = "Hello。World,Test".as_bytes().to_vec();
        let mut chunker = OwnedChunker::new(buffer.clone())
            .size(15)
            .delimiters(b".".to_vec())
            .patterns(&["。", ","]);

        let offsets = chunker.collect_offsets();
        let covered = offsets.iter().map(|(start, end)| end - start).sum::<usize>();
        assert_eq!(covered, buffer.len());

        // Pair every range with its successor to check adjacency.
        for (prev, next) in offsets.iter().zip(offsets.iter().skip(1)) {
            assert_eq!(prev.1, next.0);
        }
    }
}