mod chunk;
mod delim;
mod merge;
mod savgol;
mod split;

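// High-level chunking: the `chunk` entry point plus the `Chunker`/`OwnedChunker` types.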
pub use crate::chunk::{Chunker, OwnedChunker, chunk};

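// Lower-level splitting by delimiter bytes or multi-byte patterns.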
pub use crate::split::{
    IncludeDelim, PatternSplitter, Splitter, split, split_at_delimiters, split_at_patterns,
};

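// Helpers for merging splits back into larger pieces.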
pub use crate::merge::{MergeResult, find_merge_indices, merge_splits};

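// Default delimiter set and default target chunk size.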
pub use crate::delim::{DEFAULT_DELIMITERS, DEFAULT_TARGET_SIZE};

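// Savitzky-Golay filtering and local-minima utilities for scoring and filtering
// candidate split indices.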
pub use crate::savgol::{
    FilteredIndices, MinimaResult, filter_split_indices, find_local_minima_interpolated,
    savgol_filter, windowed_cross_similarity,
};

#[cfg(test)]
mod integration_tests {
    use super::*;

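    // Both the high-level chunk iterator and the raw delimiter splitter should
    // account for every byte of the input.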
    #[test]
    fn test_chunk_and_split_consistency() {
        let text = b"Hello. World. Test.";

        let chunk_total: usize = chunk(text).size(10).delimiters(b".").map(|c| c.len()).sum();
        let split_total: usize = split_at_delimiters(text, b".", IncludeDelim::Prev, 0)
            .iter()
            .map(|(s, e)| e - s)
            .sum();

        assert_eq!(chunk_total, text.len());
        assert_eq!(split_total, text.len());
    }
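
    // A minimal additional sketch of calling `split_at_delimiters` directly, on
    // input with adjacent and trailing delimiters. It assumes the same signature
    // and (start, end) offset output as used in `test_chunk_and_split_consistency`
    // above, and only checks the byte-preservation invariant.
    #[test]
    fn test_split_at_delimiters_adjacent_delims() {
        let text = b"a..b.";
        let total: usize = split_at_delimiters(text, b".", IncludeDelim::Prev, 0)
            .iter()
            .map(|(s, e)| e - s)
            .sum();
        assert_eq!(total, text.len());
    }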

    #[test]
    fn test_consecutive_delimiters_chunk() {
        let text = b"Hello\n\nWorld";
        let chunks: Vec<_> = chunk(text).size(8).delimiters(b"\n").collect();
        let total: usize = chunks.iter().map(|c| c.len()).sum();
        assert_eq!(total, text.len());
    }

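    // Prefix mode attaches the delimiter to the start of the following chunk
    // instead of the end of the preceding one.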
    #[test]
    fn test_prefix_mode_chunk() {
        let text = b"Hello World Test";
        let chunks: Vec<_> = chunk(text).size(8).delimiters(b" ").prefix().collect();
        assert_eq!(chunks[0], b"Hello");
        assert_eq!(chunks[1], b" World");
        assert_eq!(chunks[2], b" Test");
    }

    #[test]
    fn test_prefix_preserves_total_bytes() {
        let text = b"Hello World Test More Words Here";
        let chunks: Vec<_> = chunk(text).size(10).delimiters(b" ").prefix().collect();
        let total: usize = chunks.iter().map(|c| c.len()).sum();
        assert_eq!(total, text.len());
    }

    #[test]
    fn test_prefix_mode_delimiter_at_window_start() {
        let text = b"Hello world";
        let chunks: Vec<_> = chunk(text).size(5).delimiters(b" ").prefix().collect();
        let total: usize = chunks.iter().map(|c| c.len()).sum();
        assert_eq!(total, text.len());
        assert_eq!(chunks[0], b"Hello");
    }

    #[test]
    fn test_prefix_mode_small_chunks() {
        let text = b"a b c d e";
        let chunks: Vec<_> = chunk(text).size(2).delimiters(b" ").prefix().collect();
        let total: usize = chunks.iter().map(|c| c.len()).sum();
        assert_eq!(total, text.len());
        for c in &chunks {
            assert!(!c.is_empty(), "Found empty chunk!");
        }
    }

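    // "▁" (U+2581) is the SentencePiece metaspace marker; it encodes to the three
    // UTF-8 bytes E2 96 81, so these tests exercise a multi-byte pattern.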
    #[test]
    fn test_pattern_metaspace_suffix() {
        let metaspace = "▁".as_bytes();
        let text = "Hello▁World▁Test".as_bytes();
        let chunks: Vec<_> = chunk(text).size(15).pattern(metaspace).collect();
        assert_eq!(chunks[0], "Hello▁".as_bytes());
        assert_eq!(chunks[1], "World▁Test".as_bytes());
        let total: usize = chunks.iter().map(|c| c.len()).sum();
        assert_eq!(total, text.len());
    }

    #[test]
    fn test_pattern_metaspace_prefix() {
        let metaspace = "▁".as_bytes();
        let text = "Hello▁World▁Test".as_bytes();
        let chunks: Vec<_> = chunk(text).size(15).pattern(metaspace).prefix().collect();
        assert_eq!(chunks[0], "Hello".as_bytes());
        assert_eq!(chunks[1], "▁World▁Test".as_bytes());
        let total: usize = chunks.iter().map(|c| c.len()).sum();
        assert_eq!(total, text.len());
    }

    #[test]
    fn test_pattern_preserves_bytes() {
        let metaspace = "▁".as_bytes();
        let text = "The▁quick▁brown▁fox▁jumps▁over▁the▁lazy▁dog".as_bytes();
        let chunks: Vec<_> = chunk(text).size(20).pattern(metaspace).collect();
        let total: usize = chunks.iter().map(|c| c.len()).sum();
        assert_eq!(total, text.len());
    }

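    // When the pattern never occurs, chunking falls back to fixed-size hard splits.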
    #[test]
    fn test_pattern_no_match_hard_split() {
        let pattern = b"XYZ";
        let text = b"abcdefghijklmnop";
        let chunks: Vec<_> = chunk(text).size(5).pattern(pattern).collect();
        assert_eq!(chunks[0], b"abcde");
        assert_eq!(chunks[1], b"fghij");
    }

    #[test]
    fn test_pattern_single_byte_optimization() {
        let text = b"Hello World Test";
        let chunks: Vec<_> = chunk(text).size(8).pattern(b" ").prefix().collect();
        assert_eq!(chunks[0], b"Hello");
        assert_eq!(chunks[1], b" World");
    }

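    // `consecutive()` groups back-to-back pattern matches so the split is placed
    // around the whole run rather than inside it; with prefix mode the run starts
    // the next chunk, as the assertions below expect.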
    #[test]
    fn test_consecutive_pattern_basic() {
        let metaspace = b"\xE2\x96\x81";
        let text = b"word\xE2\x96\x81\xE2\x96\x81\xE2\x96\x81next";
        let chunks: Vec<_> = chunk(text)
            .pattern(metaspace)
            .size(10)
            .prefix()
            .consecutive()
            .collect();
        let total: usize = chunks.iter().map(|c| c.len()).sum();
        assert_eq!(total, text.len());
        assert_eq!(chunks[0], b"word");
        assert!(chunks[1].starts_with(metaspace));
    }

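    // `forward_fallback()` emits a token longer than the size budget as one chunk
    // instead of hard-splitting it, deferring the split to the next match, as the
    // assertions below expect.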
    #[test]
    fn test_forward_fallback_basic() {
        let metaspace = b"\xE2\x96\x81";
        let text = b"verylongword\xE2\x96\x81short";
        let chunks: Vec<_> = chunk(text)
            .pattern(metaspace)
            .size(6)
            .prefix()
            .forward_fallback()
            .collect();
        assert_eq!(chunks[0], b"verylongword");
        assert!(chunks[1].starts_with(metaspace));
    }

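    // The same consecutive-run and forward-fallback behavior, exercised with
    // single-byte delimiters instead of a multi-byte pattern.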
    #[test]
    fn test_delimiter_consecutive_basic() {
        let text = b"Hello\n\n\nWorld";
        let chunks: Vec<_> = chunk(text)
            .delimiters(b"\n")
            .size(8)
            .prefix()
            .consecutive()
            .collect();
        let total: usize = chunks.iter().map(|c| c.len()).sum();
        assert_eq!(total, text.len());
        assert_eq!(chunks[0], b"Hello");
        assert_eq!(chunks[1], b"\n\n\nWorld");
    }

    #[test]
    fn test_delimiter_forward_fallback_basic() {
        let text = b"verylongword next";
        let chunks: Vec<_> = chunk(text)
            .delimiters(b" ")
            .size(6)
            .prefix()
            .forward_fallback()
            .collect();
        assert_eq!(chunks[0], b"verylongword");
        assert_eq!(chunks[1], b" next");
    }

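    // `OwnedChunker` owns its input buffer; here it is driven manually via
    // `next_chunk()` and `collect_offsets()`.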
    #[test]
    fn test_owned_chunker_pattern() {
        let metaspace = "▁".as_bytes();
        let text = "Hello▁World▁Test".as_bytes().to_vec();
        let mut chunker = OwnedChunker::new(text.clone())
            .size(15)
            .pattern(metaspace.to_vec())
            .prefix();
        let mut chunks = Vec::new();
        while let Some(c) = chunker.next_chunk() {
            chunks.push(c);
        }
        assert_eq!(chunks[0], "Hello".as_bytes());
        let total: usize = chunks.iter().map(|c| c.len()).sum();
        assert_eq!(total, text.len());
    }

    #[test]
    fn test_owned_chunker_collect_offsets() {
        let metaspace = "▁".as_bytes();
        let text = "Hello▁World▁Test".as_bytes().to_vec();
        let mut chunker = OwnedChunker::new(text.clone())
            .size(15)
            .pattern(metaspace.to_vec())
            .prefix();
        let offsets = chunker.collect_offsets();
        assert_eq!(offsets[0], (0, 5));
        assert_eq!(&text[offsets[0].0..offsets[0].1], "Hello".as_bytes());
    }
}