use regex::Regex;
use crate::chunk::TextSpan;
/// Selects which neighbouring piece a matched separator is attached to when
/// separators are kept in the split output.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum KeepSeparator {
/// Each separator begins the piece that follows it.
Start,
/// Each separator ends the piece that precedes it.
End,
}
/// Splits `text` on every match of the regular expression `separator_pattern`.
///
/// An empty `separator_pattern` yields one `String` per character of `text`.
/// Any other pattern is compiled and the split is delegated to
/// `split_text_with_compiled_regex`, with `keep_separator` passed through
/// unchanged.
///
/// # Panics
///
/// Panics if `separator_pattern` is not a valid regular expression.
pub fn split_text_with_regex(
    text: &str,
    separator_pattern: &str,
    keep_separator: Option<KeepSeparator>,
) -> Vec<String> {
    if !separator_pattern.is_empty() {
        let compiled = Regex::new(separator_pattern).expect("invalid regex pattern");
        return split_text_with_compiled_regex(text, &compiled, keep_separator);
    }

    // No separator at all: fall back to character-by-character splitting.
    let mut characters = Vec::new();
    for ch in text.chars() {
        characters.push(String::from(ch));
    }
    characters
}
/// Splits `text` on the matches of an already compiled regex.
///
/// * `keep_separator == None`: separators are dropped and empty pieces are
///   filtered out.
/// * `Some(KeepSeparator::End)`: each piece runs up to and including the
///   separator that terminates it; trailing text forms a final piece.
/// * `Some(KeepSeparator::Start)`: text before the first separator forms its
///   own piece, then each piece starts at a separator and runs up to the start
///   of the next one (or the end of `text`).
///
/// Empty pieces are never returned. In particular, an empty `text` produces an
/// empty vector in every mode. If the regex does not match a non-empty `text`,
/// the whole text is returned as a single piece.
pub(crate) fn split_text_with_compiled_regex(
    text: &str,
    re: &Regex,
    keep_separator: Option<KeepSeparator>,
) -> Vec<String> {
    // Empty input has no pieces; without this guard the keep-separator modes
    // would return a single empty string while the plain mode returns nothing.
    if text.is_empty() {
        return Vec::new();
    }

    let Some(position) = keep_separator else {
        return re
            .split(text)
            .filter(|piece| !piece.is_empty())
            .map(String::from)
            .collect();
    };

    let matches: Vec<_> = re.find_iter(text).collect();
    let Some(first) = matches.first() else {
        return vec![text.to_string()];
    };

    // At most one piece per separator plus one for leading/trailing text.
    let mut pieces = Vec::with_capacity(matches.len() + 1);
    match position {
        KeepSeparator::End => {
            let mut start = 0usize;
            for sep in &matches {
                if start < sep.end() {
                    pieces.push(text[start..sep.end()].to_string());
                }
                start = sep.end();
            }
            if start < text.len() {
                pieces.push(text[start..].to_string());
            }
        }
        KeepSeparator::Start => {
            if first.start() > 0 {
                pieces.push(text[..first.start()].to_string());
            }
            // Pair every separator start with the start of the next separator;
            // the last separator is paired with the end of the text.
            let starts = matches.iter().map(|sep| sep.start());
            let ends = matches
                .iter()
                .skip(1)
                .map(|sep| sep.start())
                .chain(std::iter::once(text.len()));
            for (from, to) in starts.zip(ends) {
                if from < to {
                    pieces.push(text[from..to].to_string());
                }
            }
        }
    }
    pieces
}
/// Computes the byte spans of the pieces obtained by splitting `text`.
///
/// * Empty `text` yields no spans.
/// * `re == None` yields one span per character.
/// * A regex without any match yields a single span covering all of `text`.
/// * Otherwise `keep_separator` decides whether separators are dropped
///   (`None`), kept at the end of the preceding span (`End`) or kept at the
///   start of the following span (`Start`). Empty spans are never emitted.
pub(crate) fn split_spans_with_compiled_regex(
    text: &str,
    re: Option<&Regex>,
    keep_separator: Option<KeepSeparator>,
) -> Vec<TextSpan> {
    if text.is_empty() {
        return Vec::new();
    }

    let re = match re {
        Some(re) => re,
        None => {
            // No separator: every character becomes its own span.
            let mut per_char = Vec::new();
            for (offset, ch) in text.char_indices() {
                per_char.push(TextSpan::new(offset, offset + ch.len_utf8()));
            }
            return per_char;
        }
    };

    let separators: Vec<_> = re.find_iter(text).collect();
    let total = text.len();
    if separators.is_empty() {
        return vec![TextSpan::new(0, total)];
    }

    let mut out = Vec::new();
    match keep_separator {
        Some(KeepSeparator::Start) => {
            // Text in front of the first separator stands alone.
            let leading_end = separators[0].start();
            if leading_end > 0 {
                out.push(TextSpan::new(0, leading_end));
            }
            let mut remaining = separators.iter().peekable();
            while let Some(sep) = remaining.next() {
                let stop = remaining.peek().map_or(total, |next| next.start());
                if sep.start() < stop {
                    out.push(TextSpan::new(sep.start(), stop));
                }
            }
        }
        None | Some(KeepSeparator::End) => {
            // Both modes walk the separators left to right; they differ only in
            // whether a span stops before or after the separator.
            let include_separator = keep_separator.is_some();
            let mut from = 0usize;
            for sep in &separators {
                let to = if include_separator {
                    sep.end()
                } else {
                    sep.start()
                };
                if from < to {
                    out.push(TextSpan::new(from, to));
                }
                from = sep.end();
            }
            if from < total {
                out.push(TextSpan::new(from, total));
            }
        }
    }
    out
}
/// Iterator that produces the spans of a regex split one at a time instead of
/// collecting them up front.
pub(crate) struct RegexSpanIter<'a> {
// Text being split; spans are byte ranges into it.
text: &'a str,
// Separator regex; `None` makes the iterator emit one span per character.
regex: Option<Regex>,
// Whether and where matched separators are kept in the emitted spans.
keep_separator: Option<KeepSeparator>,
// Byte offset in `text` from which the next call resumes.
cursor: usize,
// Span computed ahead of time that the next call returns before anything else.
pending: Option<TextSpan>,
// Set once no further searching is needed; only `pending` may still be emitted.
done: bool,
}
impl<'a> RegexSpanIter<'a> {
pub(crate) fn from_regex(
text: &'a str,
regex: Option<Regex>,
keep_separator: Option<KeepSeparator>,
) -> Self {
Self {
text,
regex,
keep_separator,
cursor: 0,
pending: None,
done: text.is_empty(),
}
}
}
impl Iterator for RegexSpanIter<'_> {
type Item = TextSpan;
fn next(&mut self) -> Option<Self::Item> {
if let Some(span) = self.pending.take() {
return Some(span);
}
if self.done {
return None;
}
let Some(regex) = &self.regex else {
if let Some((start, ch)) = self.text[self.cursor..].char_indices().next() {
let start = self.cursor + start;
self.cursor = start + ch.len_utf8();
if self.cursor >= self.text.len() {
self.done = true;
}
return Some(TextSpan::new(start, self.cursor));
}
self.done = true;
return None;
};
let mat = regex.find_at(self.text, self.cursor);
match (self.keep_separator, mat) {
(None, Some(mat)) => {
if self.cursor < mat.start() {
let span = TextSpan::new(self.cursor, mat.start());
self.cursor = mat.end();
Some(span)
} else {
self.cursor = mat.end();
self.next()
}
}
(None, None) => {
self.done = true;
if self.cursor < self.text.len() {
Some(TextSpan::new(self.cursor, self.text.len()))
} else {
None
}
}
(Some(KeepSeparator::End), Some(mat)) => {
if self.cursor < mat.end() {
let span = TextSpan::new(self.cursor, mat.end());
self.cursor = mat.end();
Some(span)
} else {
self.cursor = mat.end();
self.next()
}
}
(Some(KeepSeparator::End), None) => {
self.done = true;
if self.cursor < self.text.len() {
Some(TextSpan::new(self.cursor, self.text.len()))
} else {
None
}
}
(Some(KeepSeparator::Start), Some(mat)) => {
if self.cursor < mat.start() {
let next = regex.find_at(self.text, mat.end());
let span = TextSpan::new(self.cursor, mat.start());
let end = next.map(|next| next.start()).unwrap_or(self.text.len());
self.pending = (mat.start() < end).then(|| TextSpan::new(mat.start(), end));
self.cursor = end;
if self.cursor >= self.text.len() {
self.done = true;
}
Some(span)
} else {
let next = regex.find_at(self.text, mat.end());
let end = next.map(|next| next.start()).unwrap_or(self.text.len());
self.cursor = end;
if self.cursor >= self.text.len() {
self.done = true;
}
(mat.start() < end).then(|| TextSpan::new(mat.start(), end))
}
}
(Some(KeepSeparator::Start), None) => {
self.done = true;
if self.cursor < self.text.len() {
Some(TextSpan::new(self.cursor, self.text.len()))
} else {
None
}
}
}
}
}
#[cfg(test)]
mod tests {
    use super::*;

    /// Separators are dropped when no keep mode is requested.
    #[test]
    fn test_split_no_keep() {
        let result = split_text_with_regex("hello world foo", " ", None);
        assert_eq!(result, vec!["hello", "world", "foo"]);
    }

    /// `Start` attaches each separator to the piece that follows it.
    #[test]
    fn test_split_keep_start() {
        let result = split_text_with_regex("hello world foo", " ", Some(KeepSeparator::Start));
        assert_eq!(result, vec!["hello", " world", " foo"]);
    }

    /// `End` attaches each separator to the piece that precedes it.
    #[test]
    fn test_split_keep_end() {
        let result = split_text_with_regex("hello world foo", " ", Some(KeepSeparator::End));
        assert_eq!(result, vec!["hello ", "world ", "foo"]);
    }

    /// An empty pattern splits into individual characters.
    #[test]
    fn test_split_empty_separator() {
        let result = split_text_with_regex("abc", "", None);
        assert_eq!(result, vec!["a", "b", "c"]);
    }

    /// Text without any separator comes back as a single piece.
    #[test]
    fn test_split_no_match() {
        let result = split_text_with_regex("hello", "X", None);
        assert_eq!(result, vec!["hello"]);
    }

    /// Two adjacent separators enclose an empty piece, which must be dropped.
    #[test]
    fn test_split_filters_empty() {
        // The doubled space is what creates the empty piece; a single space
        // would never exercise the filter.
        let result = split_text_with_regex("a  b", " ", None);
        assert_eq!(result, vec!["a", "b"]);
    }

    /// The separator is a real regex, not a literal string.
    #[test]
    fn test_split_regex_pattern() {
        let result = split_text_with_regex("foo123bar456baz", r"\d+", None);
        assert_eq!(result, vec!["foo", "bar", "baz"]);
    }
}