Skip to main content

recoco_splitters/
by_separators.rs

// Recoco is a Rust-only fork of CocoIndex, by [CocoIndex](https://CocoIndex)
// Original code from CocoIndex is copyrighted by CocoIndex
// SPDX-FileCopyrightText: 2025-2026 CocoIndex (upstream)
// SPDX-FileContributor: CocoIndex Contributors
//
// All modifications from the upstream for Recoco are copyrighted by Knitli Inc.
// SPDX-FileCopyrightText: 2026 Knitli Inc. (Recoco)
// SPDX-FileContributor: Adam Poulemanos <adam@knit.li>
//
// Both the upstream CocoIndex code and the Recoco modifications are licensed under the Apache-2.0 License.
// SPDX-License-Identifier: Apache-2.0
12
//! Split text by regex separators.
14
15use regex::Regex;
16
17use crate::output_positions::{Position, set_output_positions};
18use crate::split::{Chunk, TextRange};
19
/// Which neighbouring chunk a matched separator is attached to.
///
/// Used through `Option<KeepSeparator>`: `None` drops the separator text
/// from the output entirely.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum KeepSeparator {
    /// The separator stays at the end of the chunk that precedes it.
    Left,
    /// The separator stays at the start of the chunk that follows it.
    Right,
}
28
29/// Configuration for separator-based text splitting.
30#[derive(Debug, Clone)]
31pub struct SeparatorSplitConfig {
32    /// Regex patterns for separators. They are OR-joined into a single pattern.
33    pub separators_regex: Vec<String>,
34    /// How to handle separators (None means discard them).
35    pub keep_separator: Option<KeepSeparator>,
36    /// Whether to include empty chunks in the output.
37    pub include_empty: bool,
38    /// Whether to trim whitespace from chunks.
39    pub trim: bool,
40}
41
42impl Default for SeparatorSplitConfig {
43    fn default() -> Self {
44        Self {
45            separators_regex: vec![],
46            keep_separator: None,
47            include_empty: false,
48            trim: true,
49        }
50    }
51}
52
53/// A text splitter that splits by regex separators.
54pub struct SeparatorSplitter {
55    config: SeparatorSplitConfig,
56    regex: Option<Regex>,
57}
58
59impl SeparatorSplitter {
60    /// Create a new separator splitter with the given configuration.
61    ///
62    /// Returns an error if the regex patterns are invalid.
63    pub fn new(config: SeparatorSplitConfig) -> Result<Self, regex::Error> {
64        let regex = if config.separators_regex.is_empty() {
65            None
66        } else {
67            // OR-join all separators with multiline mode
68            let pattern = format!(
69                "(?m){}",
70                config
71                    .separators_regex
72                    .iter()
73                    .map(|s| format!("(?:{s})"))
74                    .collect::<Vec<_>>()
75                    .join("|")
76            );
77            Some(Regex::new(&pattern)?)
78        };
79        Ok(Self { config, regex })
80    }
81
82    /// Split the text and return chunks with position information.
83    pub fn split(&self, text: &str) -> Vec<Chunk> {
84        let bytes = text.as_bytes();
85
86        // Collect raw chunks (byte ranges)
87        struct RawChunk {
88            start: usize,
89            end: usize,
90        }
91
92        let mut raw_chunks: Vec<RawChunk> = Vec::new();
93
94        let mut add_range = |mut s: usize, mut e: usize| {
95            if self.config.trim {
96                while s < e && bytes[s].is_ascii_whitespace() {
97                    s += 1;
98                }
99                while e > s && bytes[e - 1].is_ascii_whitespace() {
100                    e -= 1;
101                }
102            }
103            if self.config.include_empty || e > s {
104                raw_chunks.push(RawChunk { start: s, end: e });
105            }
106        };
107
108        if let Some(re) = &self.regex {
109            let mut start = 0usize;
110            for m in re.find_iter(text) {
111                let end = match self.config.keep_separator {
112                    Some(KeepSeparator::Left) => m.end(),
113                    Some(KeepSeparator::Right) | None => m.start(),
114                };
115                add_range(start, end);
116                start = match self.config.keep_separator {
117                    Some(KeepSeparator::Right) => m.start(),
118                    _ => m.end(),
119                };
120            }
121            add_range(start, text.len());
122        } else {
123            // No separators: emit whole text
124            add_range(0, text.len());
125        }
126
127        // Compute positions for all chunks
128        let mut positions: Vec<Position> = raw_chunks
129            .iter()
130            .flat_map(|c| vec![Position::new(c.start), Position::new(c.end)])
131            .collect();
132
133        set_output_positions(text, positions.iter_mut());
134
135        // Build final chunks
136        raw_chunks
137            .into_iter()
138            .enumerate()
139            .map(|(i, raw)| {
140                let start_pos = positions[i * 2].output.unwrap();
141                let end_pos = positions[i * 2 + 1].output.unwrap();
142                Chunk {
143                    range: TextRange::new(raw.start, raw.end),
144                    start: start_pos,
145                    end: end_pos,
146                }
147            })
148            .collect()
149    }
150}
151
#[cfg(test)]
mod tests {
    use super::*;

    /// Build a splitter from literal patterns; the test patterns are expected to compile.
    fn splitter(
        separators: &[&str],
        keep_separator: Option<KeepSeparator>,
        include_empty: bool,
        trim: bool,
    ) -> SeparatorSplitter {
        let config = SeparatorSplitConfig {
            separators_regex: separators.iter().map(|s| s.to_string()).collect(),
            keep_separator,
            include_empty,
            trim,
        };
        SeparatorSplitter::new(config).expect("test separator patterns must be valid regexes")
    }

    /// Slice `text` by the byte range of every chunk.
    fn chunk_texts<'a>(text: &'a str, chunks: &[Chunk]) -> Vec<&'a str> {
        chunks
            .iter()
            .map(|c| &text[c.range.start..c.range.end])
            .collect()
    }

    #[test]
    fn test_split_by_paragraphs() {
        let text = "Para1\n\nPara2\n\n\nPara3";
        let chunks = splitter(&[r"\n\n+"], None, false, true).split(text);

        assert_eq!(chunk_texts(text, &chunks), ["Para1", "Para2", "Para3"]);
    }

    #[test]
    fn test_split_keep_separator_left() {
        let text = "A. B. C.";
        let chunks = splitter(&[r"\."], Some(KeepSeparator::Left), false, true).split(text);

        assert_eq!(chunk_texts(text, &chunks), ["A.", "B.", "C."]);
    }

    #[test]
    fn test_split_keep_separator_right() {
        let text = "A. B. C";
        let chunks = splitter(&[r"\."], Some(KeepSeparator::Right), false, true).split(text);

        assert_eq!(chunk_texts(text, &chunks), ["A", ". B", ". C"]);
    }

    #[test]
    fn test_split_no_separators() {
        let text = "Hello World";
        let chunks = splitter(&[], None, false, true).split(text);

        assert_eq!(chunk_texts(text, &chunks), ["Hello World"]);
    }

    #[test]
    fn test_split_with_trim() {
        let text = "  A  |  B  |  C  ";
        let chunks = splitter(&[r"\|"], None, false, true).split(text);

        assert_eq!(chunk_texts(text, &chunks), ["A", "B", "C"]);
    }

    #[test]
    fn test_split_without_trim_keeps_whitespace() {
        let text = "A |B";
        let chunks = splitter(&[r"\|"], None, false, false).split(text);

        assert_eq!(chunk_texts(text, &chunks), ["A ", "B"]);
    }

    #[test]
    fn test_split_include_empty() {
        let text = "A||B";
        let chunks = splitter(&[r"\|"], None, true, true).split(text);

        assert_eq!(chunk_texts(text, &chunks), ["A", "", "B"]);
    }

    #[test]
    fn test_split_drops_whitespace_only_chunks() {
        // With trimming on and empty chunks off, a blank piece disappears.
        let text = "A| |B";
        let chunks = splitter(&[r"\|"], None, false, true).split(text);

        assert_eq!(chunk_texts(text, &chunks), ["A", "B"]);
    }

    #[test]
    fn test_split_multiple_separators() {
        // Separate patterns are OR-joined, so either one splits the text.
        let text = "a,b;c";
        let chunks = splitter(&[",", ";"], None, false, true).split(text);

        assert_eq!(chunk_texts(text, &chunks), ["a", "b", "c"]);
    }

    #[test]
    fn test_invalid_regex_is_rejected() {
        let config = SeparatorSplitConfig {
            separators_regex: vec!["(".to_string()],
            ..SeparatorSplitConfig::default()
        };

        assert!(SeparatorSplitter::new(config).is_err());
    }

    #[test]
    fn test_split_positions() {
        let text = "Line1\nLine2\nLine3";
        let chunks = splitter(&[r"\n"], None, false, false).split(text);

        assert_eq!(chunks.len(), 3);

        // Check positions
        assert_eq!(chunks[0].start.line, 1);
        assert_eq!(chunks[0].start.column, 1);
        assert_eq!(chunks[0].end.line, 1);
        assert_eq!(chunks[0].end.column, 6);

        assert_eq!(chunks[1].start.line, 2);
        assert_eq!(chunks[1].start.column, 1);

        assert_eq!(chunks[2].start.line, 3);
        assert_eq!(chunks[2].start.column, 1);
    }
}