1use crate::delim::{DEFAULT_DELIMITERS, build_table, find_first_delimiter};
8
/// Selects where a matched delimiter byte is placed relative to the segments
/// produced by [`split_at_delimiters`].
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
pub enum IncludeDelim {
    /// The delimiter closes the segment that precedes it. This is the default.
    #[default]
    Prev,
    /// The delimiter opens the segment that follows it.
    Next,
    /// The delimiter is not part of the segment on either side of it.
    None,
}
20
21pub fn split_at_delimiters(
53 text: &[u8],
54 delimiters: &[u8],
55 include_delim: IncludeDelim,
56 min_chars: usize,
57) -> Vec<(usize, usize)> {
58 if text.is_empty() {
59 return vec![];
60 }
61
62 if delimiters.is_empty() {
63 return vec![(0, text.len())];
65 }
66
67 let table = build_table(delimiters);
69
70 let estimated_segments = (text.len() / 40).max(4);
73 let mut splits: Vec<(usize, usize)> = Vec::with_capacity(estimated_segments);
74
75 let mut segment_start = 0;
76 let mut pos = 0;
77
78 let mut accum_start: usize = 0;
81 let mut accum_end: usize = 0;
82
83 macro_rules! emit_segment {
85 ($seg_start:expr, $seg_end:expr) => {
86 let seg_start = $seg_start;
87 let seg_end = $seg_end;
88
89 if min_chars == 0 {
90 splits.push((seg_start, seg_end));
92 } else if accum_start == accum_end {
93 accum_start = seg_start;
95 accum_end = seg_end;
96 } else {
97 let accum_len = accum_end - accum_start;
98 let seg_len = seg_end - seg_start;
99
100 if accum_len < min_chars || seg_len < min_chars {
101 accum_end = seg_end;
103 } else {
104 splits.push((accum_start, accum_end));
106 accum_start = seg_start;
107 accum_end = seg_end;
108 }
109 }
110 };
111 }
112
113 while pos < text.len() {
114 let delim_pos = find_first_delimiter(&text[pos..], delimiters, table.as_ref());
116
117 match delim_pos {
118 Some(rel_pos) => {
119 let abs_pos = pos + rel_pos;
120
121 match include_delim {
122 IncludeDelim::Prev => {
123 let seg_end = abs_pos + 1;
125 if segment_start < seg_end {
126 emit_segment!(segment_start, seg_end);
127 }
128 segment_start = seg_end;
129 }
130 IncludeDelim::Next => {
131 if segment_start < abs_pos {
133 emit_segment!(segment_start, abs_pos);
134 }
135 segment_start = abs_pos;
136 }
137 IncludeDelim::None => {
138 if segment_start < abs_pos {
140 emit_segment!(segment_start, abs_pos);
141 }
142 segment_start = abs_pos + 1;
143 }
144 }
145 pos = abs_pos + 1;
146 }
147 None => {
148 if segment_start < text.len() {
150 emit_segment!(segment_start, text.len());
151 }
152 break;
153 }
154 }
155 }
156
157 if segment_start < text.len()
159 && (splits.is_empty() || splits.last().is_none_or(|&(_, e)| e < text.len()))
160 {
161 if min_chars == 0 || accum_end < text.len() {
163 emit_segment!(segment_start, text.len());
164 }
165 }
166
167 if min_chars > 0 && accum_start < accum_end {
169 splits.push((accum_start, accum_end));
170 }
171
172 splits
173}
174
175pub fn split(text: &[u8]) -> Splitter<'_> {
192 Splitter::new(text)
193}
194
195pub struct Splitter<'a> {
199 text: &'a [u8],
200 delimiters: &'a [u8],
201 include_delim: IncludeDelim,
202 min_chars: usize,
203}
204
205impl<'a> Splitter<'a> {
206 fn new(text: &'a [u8]) -> Self {
207 Self {
208 text,
209 delimiters: DEFAULT_DELIMITERS,
210 include_delim: IncludeDelim::Prev,
211 min_chars: 0,
212 }
213 }
214
215 pub fn delimiters(mut self, delimiters: &'a [u8]) -> Self {
217 self.delimiters = delimiters;
218 self
219 }
220
221 pub fn include_prev(mut self) -> Self {
223 self.include_delim = IncludeDelim::Prev;
224 self
225 }
226
227 pub fn include_next(mut self) -> Self {
229 self.include_delim = IncludeDelim::Next;
230 self
231 }
232
233 pub fn include_none(mut self) -> Self {
235 self.include_delim = IncludeDelim::None;
236 self
237 }
238
239 pub fn min_chars(mut self, min: usize) -> Self {
241 self.min_chars = min;
242 self
243 }
244
245 pub fn collect(self) -> Vec<(usize, usize)> {
247 split_at_delimiters(
248 self.text,
249 self.delimiters,
250 self.include_delim,
251 self.min_chars,
252 )
253 }
254
255 pub fn collect_slices(self) -> Vec<&'a [u8]> {
257 let text = self.text;
258 let offsets =
259 split_at_delimiters(text, self.delimiters, self.include_delim, self.min_chars);
260 offsets
261 .into_iter()
262 .map(|(start, end)| &text[start..end])
263 .collect()
264 }
265}
266
#[cfg(test)]
mod tests {
    use super::*;

    /// Resolves every `(start, end)` pair against `text` and checks that the
    /// resulting slices equal `expected`, in order and in number.
    fn assert_pieces(text: &[u8], offsets: &[(usize, usize)], expected: &[&[u8]]) {
        let actual: Vec<&[u8]> = offsets
            .iter()
            .map(|&(start, end)| &text[start..end])
            .collect();
        assert_eq!(actual, expected);
    }

    #[test]
    fn test_split_basic() {
        let text = b"Hello. World. Test.";
        let offsets = split_at_delimiters(text, b".", IncludeDelim::Prev, 0);
        assert_pieces(text, &offsets, &[b"Hello.", b" World.", b" Test."]);
    }

    #[test]
    fn test_split_include_next() {
        let text = b"Hello. World. Test.";
        let offsets = split_at_delimiters(text, b".", IncludeDelim::Next, 0);
        // The trailing delimiter has no following text and forms its own segment.
        assert_pieces(text, &offsets, &[b"Hello", b". World", b". Test", b"."]);
    }

    #[test]
    fn test_split_include_next_no_trailing() {
        let text = b"Hello. World. Test";
        let offsets = split_at_delimiters(text, b".", IncludeDelim::Next, 0);
        assert_pieces(text, &offsets, &[b"Hello", b". World", b". Test"]);
    }

    #[test]
    fn test_split_include_none() {
        let text = b"Hello. World. Test.";
        let offsets = split_at_delimiters(text, b".", IncludeDelim::None, 0);
        assert_pieces(text, &offsets, &[b"Hello", b" World", b" Test"]);
    }

    #[test]
    fn test_split_multiple_delimiters() {
        let text = b"Hello. World? Test!";
        let offsets = split_at_delimiters(text, b".?!", IncludeDelim::Prev, 0);
        assert_pieces(text, &offsets, &[b"Hello.", b" World?", b" Test!"]);
    }

    #[test]
    fn test_split_min_chars() {
        let text = b"A. B. C. D. E.";

        // Without a minimum length every sentence is its own segment.
        let unmerged = split_at_delimiters(text, b".", IncludeDelim::Prev, 0);
        assert_eq!(unmerged.len(), 5);

        // With a minimum length the short segments are merged together.
        let merged = split_at_delimiters(text, b".", IncludeDelim::Prev, 4);
        assert!(merged.len() < 5);
    }

    #[test]
    fn test_split_empty_text() {
        let text = b"";
        let offsets = split_at_delimiters(text, b".", IncludeDelim::Prev, 0);
        assert!(offsets.is_empty());
    }

    #[test]
    fn test_split_no_delimiters() {
        let text = b"Hello World";
        let offsets = split_at_delimiters(text, b".", IncludeDelim::Prev, 0);
        assert_pieces(text, &offsets, &[b"Hello World"]);
    }

    #[test]
    fn test_split_empty_delimiters() {
        let text = b"Hello World";
        let offsets = split_at_delimiters(text, b"", IncludeDelim::Prev, 0);
        assert_pieces(text, &offsets, &[b"Hello World"]);
    }

    #[test]
    fn test_split_newlines() {
        let text = b"Line 1\nLine 2\nLine 3";
        let offsets = split_at_delimiters(text, b"\n", IncludeDelim::Prev, 0);
        assert_pieces(text, &offsets, &[b"Line 1\n", b"Line 2\n", b"Line 3"]);
    }

    #[test]
    fn test_split_builder() {
        let text = b"Hello. World? Test!";
        let slices = split(text)
            .delimiters(b".?!")
            .include_prev()
            .collect_slices();
        let expected: &[&[u8]] = &[b"Hello.", b" World?", b" Test!"];
        assert_eq!(slices, expected);
    }

    #[test]
    fn test_split_builder_include_next() {
        let text = b"Hello. World.";
        let slices = split(text).delimiters(b".").include_next().collect_slices();
        // The trailing delimiter ends up as a segment of its own.
        let expected: &[&[u8]] = &[b"Hello", b". World", b"."];
        assert_eq!(slices, expected);
    }

    #[test]
    fn test_split_preserves_all_bytes() {
        let text = b"The quick brown fox. Jumps over? The lazy dog!";
        let offsets = split_at_delimiters(text, b".?!", IncludeDelim::Prev, 0);

        // The segment lengths add up to the full input length.
        let covered: usize = offsets.iter().map(|&(start, end)| end - start).sum();
        assert_eq!(covered, text.len());

        // Each segment starts exactly where the previous one ended.
        for pair in offsets.windows(2) {
            assert_eq!(pair[0].1, pair[1].0);
        }
    }

    #[test]
    fn test_split_four_delimiters() {
        let text = b"A. B? C! D; E";
        let offsets = split_at_delimiters(text, b".?!;", IncludeDelim::Prev, 0);
        assert_eq!(offsets.len(), 5);
    }
}