1use regex::Regex;
16
17use crate::output_positions::{Position, set_output_positions};
18use crate::split::{Chunk, TextRange};
19
/// Selects which neighbouring chunk keeps the text matched by a separator.
///
/// The splitter stores this as `Option<KeepSeparator>`; with `None` the matched
/// separator text is excluded from both neighbouring chunks.
#[derive(Debug, Clone, Copy, PartialEq, Eq, Hash)]
pub enum KeepSeparator {
    /// The separator stays at the end of the chunk that precedes it.
    Left,
    /// The separator stays at the start of the chunk that follows it.
    Right,
}
28
29#[derive(Debug, Clone)]
31pub struct SeparatorSplitConfig {
32 pub separators_regex: Vec<String>,
34 pub keep_separator: Option<KeepSeparator>,
36 pub include_empty: bool,
38 pub trim: bool,
40}
41
42impl Default for SeparatorSplitConfig {
43 fn default() -> Self {
44 Self {
45 separators_regex: vec![],
46 keep_separator: None,
47 include_empty: false,
48 trim: true,
49 }
50 }
51}
52
53pub struct SeparatorSplitter {
55 config: SeparatorSplitConfig,
56 regex: Option<Regex>,
57}
58
59impl SeparatorSplitter {
60 pub fn new(config: SeparatorSplitConfig) -> Result<Self, regex::Error> {
64 let regex = if config.separators_regex.is_empty() {
65 None
66 } else {
67 let pattern = format!(
69 "(?m){}",
70 config
71 .separators_regex
72 .iter()
73 .map(|s| format!("(?:{s})"))
74 .collect::<Vec<_>>()
75 .join("|")
76 );
77 Some(Regex::new(&pattern)?)
78 };
79 Ok(Self { config, regex })
80 }
81
82 pub fn split(&self, text: &str) -> Vec<Chunk> {
84 let bytes = text.as_bytes();
85
86 struct RawChunk {
88 start: usize,
89 end: usize,
90 }
91
92 let mut raw_chunks: Vec<RawChunk> = Vec::new();
93
94 let mut add_range = |mut s: usize, mut e: usize| {
95 if self.config.trim {
96 while s < e && bytes[s].is_ascii_whitespace() {
97 s += 1;
98 }
99 while e > s && bytes[e - 1].is_ascii_whitespace() {
100 e -= 1;
101 }
102 }
103 if self.config.include_empty || e > s {
104 raw_chunks.push(RawChunk { start: s, end: e });
105 }
106 };
107
108 if let Some(re) = &self.regex {
109 let mut start = 0usize;
110 for m in re.find_iter(text) {
111 let end = match self.config.keep_separator {
112 Some(KeepSeparator::Left) => m.end(),
113 Some(KeepSeparator::Right) | None => m.start(),
114 };
115 add_range(start, end);
116 start = match self.config.keep_separator {
117 Some(KeepSeparator::Right) => m.start(),
118 _ => m.end(),
119 };
120 }
121 add_range(start, text.len());
122 } else {
123 add_range(0, text.len());
125 }
126
127 let mut positions: Vec<Position> = raw_chunks
129 .iter()
130 .flat_map(|c| vec![Position::new(c.start), Position::new(c.end)])
131 .collect();
132
133 set_output_positions(text, positions.iter_mut());
134
135 raw_chunks
137 .into_iter()
138 .enumerate()
139 .map(|(i, raw)| {
140 let start_pos = positions[i * 2].output.unwrap();
141 let end_pos = positions[i * 2 + 1].output.unwrap();
142 Chunk {
143 range: TextRange::new(raw.start, raw.end),
144 start: start_pos,
145 end: end_pos,
146 }
147 })
148 .collect()
149 }
150}
151
#[cfg(test)]
mod tests {
    use super::*;

    /// Returns the slice of `text` covered by each chunk's byte range, in order.
    fn chunk_texts<'a>(text: &'a str, chunks: &[Chunk]) -> Vec<&'a str> {
        chunks
            .iter()
            .map(|chunk| &text[chunk.range.start..chunk.range.end])
            .collect()
    }

    #[test]
    fn test_split_by_paragraphs() {
        let config = SeparatorSplitConfig {
            separators_regex: vec![r"\n\n+".to_string()],
            keep_separator: None,
            include_empty: false,
            trim: true,
        };
        let splitter = SeparatorSplitter::new(config).unwrap();
        let text = "Para1\n\nPara2\n\n\nPara3";
        let chunks = splitter.split(text);

        assert_eq!(chunks.len(), 3);
        assert_eq!(chunk_texts(text, &chunks), ["Para1", "Para2", "Para3"]);
    }

    #[test]
    fn test_split_keep_separator_left() {
        let config = SeparatorSplitConfig {
            separators_regex: vec![r"\.".to_string()],
            keep_separator: Some(KeepSeparator::Left),
            include_empty: false,
            trim: true,
        };
        let splitter = SeparatorSplitter::new(config).unwrap();
        let text = "A. B. C.";
        let chunks = splitter.split(text);

        // Each period stays attached to the chunk before it.
        assert_eq!(chunks.len(), 3);
        assert_eq!(chunk_texts(text, &chunks), ["A.", "B.", "C."]);
    }

    #[test]
    fn test_split_keep_separator_right() {
        let config = SeparatorSplitConfig {
            separators_regex: vec![r"\.".to_string()],
            keep_separator: Some(KeepSeparator::Right),
            include_empty: false,
            trim: true,
        };
        let splitter = SeparatorSplitter::new(config).unwrap();
        let text = "A. B. C";
        let chunks = splitter.split(text);

        // Each period starts the chunk after it.
        assert_eq!(chunks.len(), 3);
        assert_eq!(chunk_texts(text, &chunks), ["A", ". B", ". C"]);
    }

    #[test]
    fn test_split_no_separators() {
        let config = SeparatorSplitConfig {
            separators_regex: vec![],
            keep_separator: None,
            include_empty: false,
            trim: true,
        };
        let splitter = SeparatorSplitter::new(config).unwrap();
        let text = "Hello World";
        let chunks = splitter.split(text);

        // Without separators the whole input comes back as one chunk.
        assert_eq!(chunks.len(), 1);
        assert_eq!(chunk_texts(text, &chunks), ["Hello World"]);
    }

    #[test]
    fn test_split_with_trim() {
        let config = SeparatorSplitConfig {
            separators_regex: vec![r"\|".to_string()],
            keep_separator: None,
            include_empty: false,
            trim: true,
        };
        let splitter = SeparatorSplitter::new(config).unwrap();
        let text = "  A  |  B  |  C  ";
        let chunks = splitter.split(text);

        assert_eq!(chunks.len(), 3);
        assert_eq!(chunk_texts(text, &chunks), ["A", "B", "C"]);
    }

    #[test]
    fn test_split_include_empty() {
        let config = SeparatorSplitConfig {
            separators_regex: vec![r"\|".to_string()],
            keep_separator: None,
            include_empty: true,
            trim: true,
        };
        let splitter = SeparatorSplitter::new(config).unwrap();
        let text = "A||B";
        let chunks = splitter.split(text);

        // The empty span between the two adjacent separators is kept.
        assert_eq!(chunks.len(), 3);
        assert_eq!(chunk_texts(text, &chunks), ["A", "", "B"]);
    }

    #[test]
    fn test_split_positions() {
        let config = SeparatorSplitConfig {
            separators_regex: vec![r"\n".to_string()],
            keep_separator: None,
            include_empty: false,
            trim: false,
        };
        let splitter = SeparatorSplitter::new(config).unwrap();
        let text = "Line1\nLine2\nLine3";
        let chunks = splitter.split(text);

        assert_eq!(chunks.len(), 3);

        // First chunk: starts at line 1, column 1 and ends on the same line at column 6.
        assert_eq!(chunks[0].start.line, 1);
        assert_eq!(chunks[0].start.column, 1);
        assert_eq!(chunks[0].end.line, 1);
        assert_eq!(chunks[0].end.column, 6);

        // Later chunks each start at column 1 of the next line.
        assert_eq!(chunks[1].start.line, 2);
        assert_eq!(chunks[1].start.column, 1);

        assert_eq!(chunks[2].start.line, 3);
        assert_eq!(chunks[2].start.column, 1);
    }
}